Example #1
def crawl_debian_security(local, parallel=True):
    # Works faster with parallelism.
    cve_re = re.compile(r"^(CVE\S+)")
    list_url = "https://salsa.debian.org/security-tracker-team/security-tracker/raw/master/data/CVE/list"

    if not local:
        raw = net.get_raw_resource(list_url)
    else:
        path = misc.repo_path("security-tracker-team", "security-tracker")
        path = os.path.join(path, "raw", "master", "data", "CVE", "list")
        with open(path, "r") as f:
            raw = f.read()

    indices = [x.start() for x in re.finditer(r"^CVE", raw, re.MULTILINE)]
    sub_strings = [raw[s:f] for s, f in zip(indices, indices[1:] + [len(raw)])]
    cve_index = _index_cves()

    bar = misc.KnownLengthBar(maxval=len(sub_strings), parallel=parallel)

    def worker(sub_string):
        cve_string = re.findall(cve_re, sub_string)[0]
        cve = cve_index.get(cve_string)
        queries = []
        if cve:
            url_strings = misc.flatten_list([
                re.findall(url_re, sub_string) for url_re in urls_re_whitelist
            ])
            queries = _get_queries_for_cve_url(cve_string, url_strings,
                                               ["DebianSec"])
        bar.update()
        return queries

    return _process_queries_from_workers(worker, sub_strings, parallel, bar)
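
Every crawler on this page funnels its worker through `_process_queries_from_workers`, which is not itself shown here. A minimal sketch of what such a helper might look like, assuming it simply maps the worker over the items (optionally on a thread pool), closes the progress bar, and flattens the per-item query lists; only the argument names come from the call sites above, the rest is an assumption:

from multiprocessing.pool import ThreadPool

def _process_queries_from_workers(worker, items, parallel, bar):
    # Hypothetical sketch: run the worker over every item, serially or on a
    # thread pool, and return a single flat list of DB queries.
    if parallel:
        with ThreadPool() as pool:  # default pool size
            per_item_queries = pool.map(worker, items)
    else:
        per_item_queries = [worker(item) for item in items]
    bar.finish()
    # Flatten [[q1, q2], [q3], ...] into [q1, q2, q3, ...].
    return [q for queries in per_item_queries for q in queries]
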
Example #2
def crawl_django():
    cve_re = re.compile(r":cve:`(\S+)`")
    commit_re = re.compile(r"(https://github.com/\S+/[a-f0-9]+)")
    cve_index = _index_cves()
    raw = net.get_raw_resource(
        "https://raw.githubusercontent.com/django/django/master/docs/releases/security.txt"
    )

    indices = [x.start() for x in re.finditer(r":cve:", raw)]
    sub_strings = [raw[s:f] for s, f in zip(indices, indices[1:] + [len(raw)])]

    bar = misc.KnownLengthBar(maxval=len(indices), parallel=False)

    def worker(sub_string):
        queries = []
        cve_string = "CVE-" + re.findall(cve_re, sub_string)[0]
        cve = cve_index.get(cve_string)
        if not cve:
            print("CVE not found?!: " + cve_string)
            return []

        # Find the URLs
        url_strings = re.findall(commit_re, sub_string)
        if url_strings:
            queries = _get_queries_for_cve_url(cve_string, url_strings,
                                               ["Python", "Django"])
        return queries

    return _process_queries_from_workers(worker, sub_strings, False, bar)
Example #3
def find_objects(table, list_of_queries):
    def comp_func(obj, query):
        for k, v in query.params.items():
            if getattr(obj, k) != v:
                return False
        return True

    # Get all rows from db.
    all_objects_list = global_session.query(table).all()
    all_objects_dict = {}
    for x in all_objects_list:
        field = getattr(x, table.unique_field)
        # Group rows into a dict keyed by the value of the table's unique_field.
        all_objects_dict.setdefault(field, []).append(x)

    debug_print("Searching for objects in table: {0}".format(table.__tablename__))
    if config.DB_VERBOSE:
        bar = misc.KnownLengthBar(maxval=len(list_of_queries), parallel=False)
        list_of_queries = bar(list_of_queries)
    res = {}
    found_count = 0
    for query in list_of_queries:
        res[query] = False
        field = query.params[table.unique_field]
        if all_objects_dict.get(field) is None:
            continue
        for x in all_objects_dict[field]:
            if comp_func(x, query):
                found_count += 1
                res[query] = x
                break
    debug_print("Search done. Found {0}/{1} objects".format(found_count, len(list_of_queries)))
    return res
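
A quick illustration of how `find_objects` is meant to be called: the query objects only need to expose a `params` dict, and the table class must carry a `unique_field` attribute naming its lookup column. `LookupQuery` below is a hypothetical stand-in, and the assumption that `models.CVE.unique_field` is `"cve_string"` is inferred from Example #6, not confirmed by it:

# Hypothetical usage sketch for find_objects.
class LookupQuery(object):
    def __init__(self, **params):
        self.params = params

queries = [LookupQuery(cve_string="CVE-2019-0001"),
           LookupQuery(cve_string="CVE-2019-0002")]
# Assumes models.CVE.unique_field == "cve_string".
found = find_objects(models.CVE, queries)
for query, row in found.items():
    if row is not False:
        print("already in the DB:", query.params["cve_string"])
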
Example #4
def search_not_found_hashes():
    not_found = get_not_found_hashes()
    try:
        cached = get_cached_commits()
    except Exception:  # e.g. the cache file does not exist yet
        cached = {}

    not_found = [x for x in not_found if x and cached.get(x) is None]
    if not not_found:
        print("No hashes to search!")
        return
    bar = misc.KnownLengthBar(maxval=len(not_found), parallel=False)
    start_time = bar.start_time
    # Hashes that hit an exception or a timeout are appended back onto
    # not_found below, so the loop keeps retrying them until they succeed.
    for i, h in enumerate(not_found):
        bar.update(1)
        try:
            code, reply = net.github_search(h)
        except Exception as e:
            print("Got exception: {0} for hash: {1}".format(e, h))
            not_found.append(h)
            continue
        if code == net.CODE_OK:
            cached[h] = misc.unique(
                [x["repository"]["full_name"] for x in reply["items"]])
        elif code == net.CODE_TIMEOUT:
            not_found.append(h)
            with open(config.HASH_CACHE_FILE, "w") as f:
                json.dump(cached, f, indent=2)
            # bar.finish()
            time.sleep(60)  # back off before retrying the timed-out hash
            bar = misc.KnownLengthBar(maxval=len(not_found), parallel=False)
            bar.start_time = start_time
            bar.update(i)
        else:
            print("Got code {0} for hash: {1}".format(code, h))

    with open(config.HASH_CACHE_FILE, "w") as f:
        json.dump(cached, f, indent=2)
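
`get_cached_commits` is not shown on this page; given that the cache is written with `json.dump` to `config.HASH_CACHE_FILE` above, its counterpart is presumably little more than the following (an assumption, not the project's actual code):

import json

def get_cached_commits():
    # Presumed reader for the hash -> [repository full_name, ...] cache
    # that search_not_found_hashes() writes above.
    with open(config.HASH_CACHE_FILE, "r") as f:
        return json.load(f)
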
Example #5
def github_search_loop():
    logger = logging.getLogger("github_search")
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler('github_search.log')
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    queries = db.global_session.query(models.GithubSearchQuery).filter(
        models.GithubSearchQuery.state ==
        models.GithubSearchQuery.NOT_SEEN).all()
    bar = misc.KnownLengthBar(maxval=len(queries), parallel=False)
    for i, q in bar(enumerate(queries)):
        logger.info("({0}/{1}) searching: '{2}'".format(
            i, len(queries), q.query))
        _do_github_search_query(q)
        logger.info("-" * 80)
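
All of these examples lean on `misc.KnownLengthBar`. Judging only from how it is used here (a constructor taking `maxval` and `parallel`, `update()`, `finish()`, a writable `start_time`, and wrapping an iterable while keeping `len()` working), a minimal sketch of such a class might look like this; the real implementation presumably renders a proper terminal bar, which is omitted:

import threading
import time

class KnownLengthBar(object):
    # Hypothetical sketch of a progress bar whose total length is known up
    # front. Only the behaviour the examples on this page rely on is kept:
    # update()/finish(), a start_time attribute, and wrapping an iterable.
    def __init__(self, maxval, parallel=False):
        self.maxval = maxval
        self.parallel = parallel
        self.value = 0
        self.start_time = time.time()
        self._lock = threading.Lock()  # update() may be called from workers

    def update(self, delta=1):
        with self._lock:
            self.value += delta
            current = self.value
        elapsed = time.time() - self.start_time
        print("\r{0}/{1} done, {2:.0f}s elapsed".format(
            current, self.maxval, elapsed), end="")

    def finish(self):
        print("")

    def __call__(self, iterable):
        # Wrap an iterable so that consuming it advances the bar.
        self._iterable = iterable
        return self

    def __iter__(self):
        for item in self._iterable:
            yield item
            self.update()

    def __len__(self):
        # Keeps len() usable on the wrapped object (see Example #3).
        return self.maxval
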
Example #6
def crawl_android_security_bulletin(parallel=True):
    raw = net.get_raw_resource("https://source.android.com/security/bulletin/")
    urls = re.findall(
        r"https?://source.android.com/security/bulletin/[0-9-]{10}", raw)
    urls = misc.unique(urls)
    bar = misc.KnownLengthBar(maxval=len(urls), parallel=parallel)

    def worker(url_string):
        raw = net.get_raw_resource(url_string)
        results = re.findall(
            r"<td>(CVE[0-9-]+)</td>\s+<td><a href=\"(\S+?)\">", raw, re.DOTALL)
        results = [r for r in results if _should_keep_url(r[1])]
        queries = [
            db.InsertQuery(models.CVE, cve_string=r[0]) for r in results
        ]
        queries += [db.InsertQuery(models.URL, url=r[1]) for r in results]
        queries += [
            db.ConnectQuery(models.cve_url_table, r[0], r[1]) for r in results
        ]
        bar.update()
        return queries

    return _process_queries_from_workers(worker, urls, parallel, bar)
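
Examples #6, #7 and #9 filter links through `_should_keep_url`, which is also not shown on this page. A plausible sketch, assuming it just matches the URL against a whitelist of code-hosting sites; the concrete domain list below is an assumption based on the hosts mentioned in Example #9:

import re

# Hypothetical whitelist; the real project keeps its own list of hosts
# whose links are worth following up.
_KEEP_URL_RES = [
    re.compile(r"https?://github\.com/"),
    re.compile(r"https?://git\.kernel\.org/"),
    re.compile(r"https?://svn\.apache\.org/"),
    re.compile(r"https?://[^/]*\.googlesource\.com/"),
]

def _should_keep_url(url_string):
    return any(r.match(url_string) for r in _KEEP_URL_RES)
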
Example #7
def crawl_debian_fake_names(parallel=True):
    # Works faster with parallelism.
    main_page = net.get_raw_resource(
        "https://security-tracker.debian.org/tracker/data/fake-names")
    temp_re = re.compile(r"/tracker/(TEMP-[0-9A-F-]+)")
    href_re = re.compile(r'href="(\S+?)"')

    temps = re.findall(temp_re, main_page)
    bar = misc.KnownLengthBar(maxval=len(temps), parallel=parallel)

    def worker(temp):
        raw = net.get_raw_resource(
            "https://security-tracker.debian.org/tracker/{0}".format(temp))
        url_strings = [
            x for x in re.findall(href_re, raw) if _should_keep_url(x)
        ]
        queries = [db.InsertQuery(models.NonCVE, hash_id=temp)]
        queries += _get_queries_for_noncve_url(temp, url_strings,
                                               ["DebianFake"])
        bar.update()
        return queries

    return _process_queries_from_workers(worker, temps, parallel, bar)
Example #8
def crawl_linux_kernel_cves():
    # Works fast enough without parallelism.
    url = "https://raw.githubusercontent.com/nluedtke/linux_kernel_cves/master/stream_fixes.json"
    http_url_t = "https://github.com/torvalds/linux/commit/{0}"
    patches = net.get_json_resource(url)
    bar = misc.KnownLengthBar(maxval=len(patches), parallel=False)

    cve_index = _index_cves()

    def worker(item):
        cve_string = item[0]
        cve = cve_index.get(cve_string)
        queries = []
        if cve:
            url_strings = [
                http_url_t.format(v["cmt_id"]) for v in item[1].values()
            ]
            queries = _get_queries_for_cve_url(cve_string, url_strings,
                                               ["C", "Linux", "github"])
        bar.update()
        return queries

    return _process_queries_from_workers(worker, patches.items(), False, bar)
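
Examples #1, #2 and #8 consult `_index_cves()` before building queries. Based on how the result is used (a `.get(cve_string)` that returns a truthy row when the CVE is already known), the helper is presumably something close to the sketch below; it assumes the `models.CVE` table and `cve_string` column seen in Example #6 and the `db.global_session` seen in Examples #3 and #5:

def _index_cves():
    # Presumed sketch: map "CVE-YYYY-NNNN" strings to their DB rows so the
    # crawlers can test membership without one query per CVE.
    return {
        cve.cve_string: cve
        for cve in db.global_session.query(models.CVE).all()
    }
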
Example #9
def dump_commits(parallel=True):
    black_list_res = [
        # This list can be used to temporarily skip some URL patterns so that
        # no time is wasted processing them.

        # re.compile(r".+git.kernel.org.+"),
        # re.compile(r".+github.com.+"),
        # re.compile(r".+svn.apache.org.+"),
        # re.compile(".+github.+(linux).+"),
    ]

    def black_list(url_string):
        for black_list_re in black_list_res:
            if re.match(black_list_re, url_string):
                return True
        return False

    def extract(url_string):
        extractors = [
            extract_from_github_commit,
            extract_from_github_issue,
            extract_from_github_pull,
            extract_from_apache_svn,
            extract_from_commit_urls,
            extract_from_googlesource,
            extract_from_moodle,
            extract_from_chromium_codereview,
            # TODO: extract from git kernel org
        ]
        queries = []
        for e in extractors:
            queries = e(url_string)
            if queries:
                break
        bar.update()
        return queries

    print("Parsing URLs")
    url_strings = [
        x.url for x in db.global_session.query(models.URL).all()
        if not black_list(x.url) and not x.hashes and not x.queries
    ]
    bar = misc.KnownLengthBar(maxval=len(url_strings), parallel=parallel)
    queries = misc.map_func(extract, url_strings, parallel)
    queries = misc.unique(misc.flatten_list(queries))
    print("Parsing URLs done")

    if not queries:
        print("No new hashes :(")
    else:
        print("Storing results")
        db.process_queries(queries)
        print("Storing results done")

    print("Writing bad urls to {0}".format(config.BAD_URLS_FILE))
    good_urls_set = {
        q.right_unique_value for q in queries
        if q.__class__ == db.ConnectQuery
        and q.table in [models.hash_url_table, models.query_url_table]
    }
    bad_urls = [x for x in url_strings if x not in good_urls_set]

    with open(config.BAD_URLS_FILE, "w") as f:
        f.write("\n".join(sorted(bad_urls)))
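
The extractors listed in `dump_commits` are not reproduced on this page. For illustration only, here is a sketch of what one of them, `extract_from_github_commit`, might return, mirroring the `InsertQuery`/`ConnectQuery` shapes from Example #6; the `models.Hash` class and its `hash_id` column are hypothetical, while `models.hash_url_table` and the argument order (hash first, URL second) follow the `right_unique_value` usage above:

import re

_GITHUB_COMMIT_RE = re.compile(
    r"https?://github\.com/[^/]+/[^/]+/commit/([a-f0-9]{7,40})")

def extract_from_github_commit(url_string):
    # Hypothetical sketch: pull the commit hash out of a GitHub commit URL
    # and emit queries that store it and connect it to the URL row.
    match = _GITHUB_COMMIT_RE.match(url_string)
    if not match:
        return []
    commit_hash = match.group(1)
    return [
        db.InsertQuery(models.Hash, hash_id=commit_hash),  # models.Hash is assumed
        db.ConnectQuery(models.hash_url_table, commit_hash, url_string),
    ]
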