def items_handler(cve_items):
    """Translate raw NVD CVE JSON items into DB insert/connect queries.

    For each item this emits: an insert for the CVE record, insert+connect
    pairs for every kept reference URL, and insert+connect pairs for every
    CWE description value.  Returns the flat list of queries in that order.
    """

    def _cve_id(entry):
        return entry["cve"]["CVE_data_meta"]["ID"]

    def _reference_urls(entry):
        return [ref["url"] for ref in entry["cve"]["references"]["reference_data"]]

    def _cwe_values(entry):
        values = []
        for block in entry["cve"]["problemtype"]["problemtype_data"]:
            for desc in block["description"]:
                values.append(desc["value"])
        return values

    result = []
    for entry in cve_items:
        cve_id = _cve_id(entry)
        result.append(db.InsertQuery(models.CVE, cve_string=cve_id))
        for link in _reference_urls(entry):
            # Skip reference URLs the project-level filter rejects.
            if not _should_keep_url(link):
                continue
            result.append(db.InsertQuery(models.URL, url=link))
            result.append(db.ConnectQuery(models.cve_url_table, cve_id, link))
        for cwe in _cwe_values(entry):
            result.append(db.InsertQuery(models.CWE, cwe_string=cwe))
            result.append(db.ConnectQuery(models.cve_cwe_table, cve_id, cwe))
    return result
def _do_github_search_query(search_query):
    """Run a GithubSearchQuery against the GitHub search API and persist results.

    Tries several mutations of the raw query text (as-is, with hrefs stripped,
    split per line).  Timeouts re-queue the same mutant for another attempt;
    422 validation errors mark the query ERROR; a non-empty result set stores
    commit hashes and links them to the query and its URLs, marking the query
    NON_EMPTY; otherwise the query ends up EMPTY.

    Raises:
        RuntimeError: on an unrecognized response code from net.github_search.
    """
    logger = logging.getLogger("github_search")

    def remove_hrefs(s):
        s = re.sub(r"<a href.+?>", "", s)
        s = s.replace("</a>", "")
        return s

    mutations = [
        lambda x: [x],
        lambda x: [remove_hrefs(x)],
        lambda x: remove_hrefs(x).split("\n")
    ]
    mutants = misc.unique(
        misc.flatten_list([m(search_query.query) for m in mutations]))
    for query_str in mutants:
        if not query_str:
            continue
        logger.info("trying {0}".format(query_str))
        code, answer = net.github_search(query_str)
        if code == net.CODE_TIMEOUT:
            logger.info("sleeping...")
            # Re-append so the for-loop visits this mutant again after the
            # sleep (appending during iteration is deliberate here).
            mutants.append(query_str)  # Try again.
            time.sleep(60)
        elif code == net.CODE_VALIDATION:
            logger.info("got 422: " + answer)
            search_query.state = models.GithubSearchQuery.ERROR
            db.global_session.commit()
        elif code == net.CODE_OK:
            # With many hits, keep only commits whose message matches the
            # query to cut down false positives.
            if len(answer["items"]) > 5:
                answer["items"] = [
                    item for item in answer["items"] if _messages_match(
                        search_query.query, item["commit"]["message"])
                ]
            hash_strings = misc.unique(
                [item["sha"] for item in answer["items"]])
            logger.info("got results: {0}".format(hash_strings))
            queries = []
            if hash_strings:
                search_query.state = models.GithubSearchQuery.NON_EMPTY
                for h in hash_strings:
                    queries += [
                        db.InsertQuery(models.CommitHash, hash=h),
                        db.ConnectQuery(models.query_hash_table,
                                        search_query.query, h)
                    ]
                    queries += [
                        db.ConnectQuery(models.hash_url_table, h, url.url)
                        for url in search_query.urls
                    ]
                db.process_queries(queries)
                db.global_session.commit()  # Commit state update.
                return
            search_query.state = models.GithubSearchQuery.EMPTY
            db.global_session.commit()  # Commit state update.
        else:
            # BUG FIX: raising a plain string is a TypeError in Python 3;
            # raise a real exception instead.
            raise RuntimeError(
                "got something unexpected: {0} {1}".format(code, answer))
def _get_queries_for_search_url(url_string, search_query):
    """Return queries inserting a (truncated) search query and linking it to a URL.

    The query text is clipped to config.MAX_GITHUB_QUERY_LEN before use.
    """
    clipped = search_query[:config.MAX_GITHUB_QUERY_LEN]
    return [
        db.InsertQuery(models.GithubSearchQuery, query=clipped),
        db.ConnectQuery(models.query_url_table, clipped, url_string),
    ]
def _get_queries_for_hash_url(url_string, hashes):
    """Return insert queries for the given commit hashes plus hash→URL connects.

    Duplicate hashes are dropped; when config.IGNORE_SHORT_HASHES is set,
    only full 40-character SHA-1 hashes are kept.
    """
    unique_hashes = misc.unique(hashes)
    if config.IGNORE_SHORT_HASHES:
        unique_hashes = [h for h in unique_hashes if len(h) == 40]
    inserts = [db.InsertQuery(models.CommitHash, hash=h)
               for h in unique_hashes]
    connects = [db.ConnectQuery(models.hash_url_table, h, url_string)
                for h in unique_hashes]
    return inserts + connects
def _get_queries_for_noncve_url(hash_id, url_strings, tags=None):
    """Return queries linking URLs to a non-CVE record, optionally tagging it.

    Args:
        hash_id: identifier of the NonCVE record.
        url_strings: iterable of URL strings to insert and connect.
        tags: optional list of tag strings; applied only when at least one
            URL is present.

    BUG FIX: the original used a mutable default argument (``tags=[]``),
    a classic Python pitfall — the list is shared across calls.  Using a
    ``None`` sentinel is backward-compatible.
    """
    tags = [] if tags is None else tags
    queries = [db.InsertQuery(models.URL, url=url_string)
               for url_string in url_strings]
    queries += [db.ConnectQuery(models.non_cve_url_table, hash_id, url_string)
                for url_string in url_strings]
    if url_strings and tags:
        queries += [db.UpdateTagQuery(models.NonCVE, hash_id, tags)]
    return queries
def _get_queries_for_cve_url(cve_string, url_strings, tags=None):
    """Return queries linking URLs to a CVE record, optionally tagging it.

    Args:
        cve_string: CVE identifier the URLs belong to.
        url_strings: iterable of URL strings to insert and connect.
        tags: optional list of tag strings; applied only when at least one
            URL is present.

    BUG FIX: the original used a mutable default argument (``tags=[]``),
    which is shared across calls; a ``None`` sentinel is the safe,
    backward-compatible idiom.
    """
    tags = [] if tags is None else tags
    queries = [db.InsertQuery(models.URL, url=url_string)
               for url_string in url_strings]
    queries += [db.ConnectQuery(models.cve_url_table, cve_string, url_string)
                for url_string in url_strings]
    if url_strings and tags:
        queries += [db.UpdateTagQuery(models.CVE, cve_string, tags)]
    return queries
def worker(url_string):
    """Scrape CVE/URL pairs from one page and return DB queries for them.

    Fetches the raw page, extracts ``(CVE-id, href)`` table cells, drops
    pairs whose URL fails the project filter, and emits CVE inserts, then
    URL inserts, then connect queries — in that order.  Updates the shared
    progress ``bar`` (defined in an enclosing scope) once per call.
    """
    page = net.get_raw_resource(url_string)
    pairs = re.findall(
        r"<td>(CVE[0-9-]+)</td>\s+<td><a href=\"(\S+?)\">", page, re.DOTALL)
    pairs = [(cve, link) for cve, link in pairs if _should_keep_url(link)]
    queries = [db.InsertQuery(models.CVE, cve_string=cve)
               for cve, _ in pairs]
    queries += [db.InsertQuery(models.URL, url=link)
                for _, link in pairs]
    queries += [db.ConnectQuery(models.cve_url_table, cve, link)
                for cve, link in pairs]
    bar.update()
    return queries
def crawl_jenkins():
    """Crawl the Jenkins repo log for security commits and store them.

    Every commit whose message mentions "SECURITY" becomes a NonCVE record
    (ids ``JENKINS-0``, ``JENKINS-1``, …) connected to its GitHub commit
    URL and tagged ``["Jenkins", "Java"]``.
    """
    url_template = "https://github.com/jenkinsci/jenkins/commit/{0}"
    id_template = "JENKINS-{0}"
    log_entries = misc.get_repo_log("jenkinsci", "jenkins")
    security_hashes = [entry["hash"] for entry in log_entries
                       if "SECURITY" in entry["message"]]
    queries = []
    for index, commit_hash in enumerate(security_hashes):
        noncve_id = id_template.format(index)
        commit_url = url_template.format(commit_hash)
        queries.append(db.InsertQuery(models.NonCVE, hash_id=noncve_id))
        queries.append(db.InsertQuery(models.URL, url=commit_url))
        queries.append(
            db.ConnectQuery(models.non_cve_url_table, noncve_id, commit_url))
        queries.append(
            db.UpdateTagQuery(models.NonCVE, noncve_id, ["Jenkins", "Java"]))
    db.process_queries(queries)