Example #1
def _do_github_search_query(search_query):
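    """Run a single GitHub code-search query, retrying on timeouts.

    Tries a few mutations of the query string, records the matching commit
    hashes in the DB, and updates the query's state (NON_EMPTY, EMPTY or
    ERROR) accordingly.
    """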
    logger = logging.getLogger("github_search")

    def remove_hrefs(s):
        s = re.sub(r"<a href.+?>", "", s)
        s = s.replace("</a>", "")
        return s

    mutations = [
        lambda x: [x],
        lambda x: [remove_hrefs(x)],
        lambda x: remove_hrefs(x).split("\n"),
    ]
    mutants = misc.unique(
        misc.flatten_list([m(search_query.query) for m in mutations]))
    for query_str in mutants:
        if not query_str:
            continue
        logger.info("trying {0}".format(query_str))
        code, answer = net.github_search(query_str)
        if code == net.CODE_TIMEOUT:
            logger.info("sleeping...")
            mutants.append(query_str)  # Try again.
            time.sleep(60)
        elif code == net.CODE_VALIDATION:
            logger.info("got 422: " + answer)
            search_query.state = models.GithubSearchQuery.ERROR
            db.global_session.commit()
        elif code == net.CODE_OK:
            if len(answer["items"]) > 5:
                answer["items"] = [
                    item for item in answer["items"] if _messages_match(
                        search_query.query, item["commit"]["message"])
                ]
            hash_strings = misc.unique(
                [item["sha"] for item in answer["items"]])
            logger.info("got results: {0}".format(hash_strings))
            queries = []
            if hash_strings:
                search_query.state = models.GithubSearchQuery.NON_EMPTY
                for h in hash_strings:
                    queries += [
                        db.InsertQuery(models.CommitHash, hash=h),
                        db.ConnectQuery(models.query_hash_table,
                                        search_query.query, h)
                    ]
                    queries += [
                        db.ConnectQuery(models.hash_url_table, h, url.url)
                        for url in search_query.urls
                    ]
                db.process_queries(queries)
                db.global_session.commit()  # Commit state update.
                return
            search_query.state = models.GithubSearchQuery.EMPTY
            db.global_session.commit()  # Commit state update.
        else:
            raise "got something unexpected: {0} {1}".format(code, answer)
Example #2
def crawl_jenkins():
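    """Crawl the Jenkins repository log for security-related commits.

    Every commit whose message mentions "SECURITY" is stored as a NonCVE
    entry linked to its GitHub commit URL and tagged Jenkins/Java.
    """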
    url_t = "https://github.com/jenkinsci/jenkins/commit/{0}"
    cve_t = "JENKINS-{0}"
    commits = misc.get_repo_log("jenkinsci", "jenkins")
    hashes = [c["hash"] for c in commits if "SECURITY" in c["message"]]
    url_strings = [url_t.format(h) for h in hashes]
    queries = []
    for i, url_string in enumerate(url_strings):
        hash_id = cve_t.format(i)
        queries += [
            db.InsertQuery(models.NonCVE, hash_id=hash_id),
            db.InsertQuery(models.URL, url=url_string),
            db.ConnectQuery(models.non_cve_url_table, hash_id, url_string),
            db.UpdateTagQuery(models.NonCVE, hash_id, ["Jenkins", "Java"])
        ]
    db.process_queries(queries)
Example #3
def crawl_nist_feed(parallel=False):
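    """Crawl the NIST CVE feed and store CVEs, reference URLs and CWEs."""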
    # Works faster without parallelism.
    def items_handler(cve_items):
        get_cve = lambda item: item["cve"]["CVE_data_meta"]["ID"]
        get_urls = lambda item: item["cve"]["references"]["reference_data"]

        def get_cwe_strings(item):
            return [
                desc["value"]
                for data in item["cve"]["problemtype"]["problemtype_data"]
                for desc in data["description"]
            ]

        queries = []
        for item in cve_items:
            queries.append(db.InsertQuery(models.CVE,
                                          cve_string=get_cve(item)))

            for url_string in filter(_should_keep_url,
                                     [url["url"] for url in get_urls(item)]):
                queries.append(db.InsertQuery(models.URL, url=url_string))
                queries.append(
                    db.ConnectQuery(models.cve_url_table, get_cve(item),
                                    url_string))

            for cwe_string in get_cwe_strings(item):
                queries.append(
                    db.InsertQuery(models.CWE, cwe_string=cwe_string))
                queries.append(
                    db.ConnectQuery(models.cve_cwe_table, get_cve(item),
                                    cwe_string))

        return queries

    result = misc.crawl_nist_files(items_handler, parallel=parallel)
    queries = misc.flatten_list(result)

    print "Done parsing files. Storing results into the DB"
    # TODO: populate CWEs with descriptions

    db.process_queries(queries)
    misc.update_cwe_descriptions()
Example #4
def crawl_android_cve_checker(local):
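    """Crawl the android-cve-checker repository for upstream Linux fixes.

    For each .patch file that matches a known CVE, extract 40-character
    commit hashes and keep only those that resolve to commits in the
    upstream torvalds/linux repository.
    """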
    # Works faster with parallelism.
    api_url_t = "https://api.github.com/repos/torvalds/linux/commits/{0}"
    http_url_t = "https://github.com/torvalds/linux/commit/{0}"
    hash_re = re.compile(r"([a-f0-9]{40})")

    cve_index = _index_cves()

    def file_func(file_obj):
        name, ext = splitext(basename(file_obj["path"]))
        queries = []
        if ext == ".patch" and cve_index.get(name):
            if local:
                raw = fs.read_file(file_obj["path"])
            else:
                raw = net.get_raw_resource(file_obj["download_url"],
                                           auth=net.github_auth)
            hashes = re.findall(hash_re, raw)
            for h in hashes:
                api_url = api_url_t.format(h)
                http_url = http_url_t.format(h)
                try:
                    net.get_json_resource(api_url, auth=net.github_auth)
                except Exception:
                    # Not an upstream Linux commit; skip it.
                    # print "bad response for resource: " + api_url
                    pass
                else:
                    queries += _get_queries_for_cve_url(
                        name, [http_url], ["C", "Linux", "github"])
        bar.update()
        return ([], queries)

    def dir_func(dir_obj):
        if "patches" in dir_obj["path"]:
            return True
        return False

    bar = misc.UnknownLengthBar(parallel=True)
    queries = crawl_github_repo(user="******",
                                repo="android-cve-checker",
                                file_callback=file_func,
                                dir_callback=dir_func,
                                local=local)
    bar.finish()
    return db.process_queries(queries)
Example #5
def dump_commits(parallel=True):
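    """Extract commit hashes from all stored URLs that have none yet.

    Runs every available extractor over each URL, stores the resulting
    queries, and writes URLs that yielded nothing to config.BAD_URLS_FILE.
    """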
    black_list_res = [
        # This list is used to temporarily disable some of the URLs so that
        # no time is wasted processing them.

        # re.compile(r".+git.kernel.org.+"),
        # re.compile(r".+github.com.+"),
        # re.compile(r".+svn.apache.org.+"),
        # re.compile(".+github.+(linux).+"),
    ]

    def black_list(url_string):
        for black_list_re in black_list_res:
            if re.match(black_list_re, url_string):
                return True
        return False

    def extract(url_string):
        extractors = [
            extract_from_github_commit,
            extract_from_github_issue,
            extract_from_github_pull,
            extract_from_apache_svn,
            extract_from_commit_urls,
            extract_from_googlesource,
            extract_from_moodle,
            extract_from_chromium_codereview,
            # TODO: extract from git kernel org
        ]
        queries = []
        for e in extractors:
            queries = e(url_string)
            if queries:
                break
        bar.update()
        return queries

    print "Parsing URLs"
    url_strings = [
        x.url for x in db.global_session.query(models.URL).all()
        if not black_list(x.url) and not x.hashes and not x.queries
    ]
    bar = misc.KnownLengthBar(maxval=len(url_strings), parallel=parallel)
    queries = misc.map_func(extract, url_strings, parallel)
    queries = misc.unique(misc.flatten_list(queries))
    print "Parsing URLs done"

    if not queries:
        print "No new hashes :("
    else:
        print "Storing results"
        db.process_queries(queries)
        print "Storing results done"

    print "Writing bad urls to {0}".format(config.BAD_URLS_FILE)
    good_urls_set = set([
        q.right_unique_value for q in queries if q.__class__ == db.ConnectQuery
        and q.table in [models.hash_url_table, models.query_url_table]
    ])
    bad_urls = [x for x in url_strings if x not in good_urls_set]

    with open(config.BAD_URLS_FILE, "w") as f:
        f.write("\n".join(sorted(bad_urls)))
Example #6
def _process_queries_from_workers(worker_func, sequence, parallel, bar=None):
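    """Map worker_func over sequence and apply the resulting DB queries."""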
    results = misc.map_func(worker_func, sequence, parallel)
    if bar:
        bar.finish()
    queries = misc.unique(misc.flatten_list(results))
    return db.process_queries(queries)
Example #7
def crawl_vuln_db(local, parallel=True):
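    """Crawl the vulnerabilitydb repository and link CVEs/NonCVEs to commits.

    Each data file is scanned for GitHub commit URLs and CVE identifiers;
    the resulting entries are tagged with the package ecosystem's language.
    """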
    commit_re = re.compile("(https://github.com/\S+/commits?/[a-f0-9]+)")
    cve_re = re.compile("(CVE-\d+-\d+)")
    tag_dict = {
        "composer": "PHP",
        "golang": "Go",
        "maven": "Java",
        "npm": "JS",
        "nuget": "nuget",  #has different languages
        "pip": "Python",
        "rubygems": "Ruby",
    }

    def file_func(file_obj):
        name = basename(file_obj["path"])
        if local:
            raw = fs.read_file(file_obj["path"])
        else:
            raw = net.get_raw_resource(file_obj["download_url"],
                                       auth=net.github_auth)
        url_strings = misc.unique(re.findall(commit_re, raw))
        queries = []
        if url_strings:
            # Figure out the tag list.
            tags = ["vulndb"]
            for k, v in tag_dict.iteritems():
                if k in file_obj["path"]:
                    tags.append(v)
                    break

            # Insert CVEs/NonCVEs and connect them to urls.
            cve_strings = misc.unique(re.findall(cve_re, raw))
            if not cve_strings:
                hash_id = models.NonCVE.hash_id_for_urls(url_strings)
                queries = [db.InsertQuery(models.NonCVE, hash_id=hash_id)]
                queries += _get_queries_for_noncve_url(hash_id, url_strings,
                                                       tags)
            else:
                # Surprisingly, there are some CVEs in this DB which are marked
                # as reserved in the NIST feed. We need to add them here.
                queries = [
                    db.InsertQuery(models.CVE, cve_string=cve_string)
                    for cve_string in cve_strings
                ]
                for cve_string in cve_strings:
                    queries += _get_queries_for_cve_url(
                        cve_string, url_strings, tags)
        bar.update()
        return ([], queries)

    def dir_func(dir_obj):
        if "data" in dir_obj["path"]:
            return True
        return False

    bar = misc.UnknownLengthBar(parallel=parallel)
    queries = crawl_github_repo(user="******",
                                repo="vulnerabilitydb",
                                file_callback=file_func,
                                dir_callback=dir_func,
                                local=local)
    bar.finish()

    return db.process_queries(queries)