def crawl_debian_security(local, parallel=True):
    # Works faster with parallelism.
    cve_re = re.compile(r"^(CVE\S+)")
    list_url = "https://salsa.debian.org/security-tracker-team/security-tracker/raw/master/data/CVE/list"
    if not local:
        raw = net.get_raw_resource(list_url)
    else:
        path = misc.repo_path("security-tracker-team", "security-tracker")
        path = os.path.join(path, "raw", "master", "data", "CVE", "list")
        with open(path, "r") as f:
            raw = f.read()
    # Split the list file into one sub-string per CVE entry.
    indices = [x.start() for x in re.finditer(r"^CVE", raw, re.MULTILINE)]
    sub_strings = [
        raw[start:end]
        for start, end in zip(indices, indices[1:] + [len(raw)])
    ]
    cve_index = _index_cves()
    bar = misc.KnownLengthBar(maxval=len(sub_strings), parallel=parallel)

    def worker(sub_string):
        cve_string = re.findall(cve_re, sub_string)[0]
        cve = cve_index.get(cve_string)
        queries = []
        if cve:
            url_strings = misc.flatten_list([
                re.findall(url_re, sub_string)
                for url_re in urls_re_whitelist
            ])
            queries = _get_queries_for_cve_url(cve_string, url_strings,
                                               ["DebianSec"])
        bar.update()
        return queries

    return _process_queries_from_workers(worker, sub_strings, parallel, bar)
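# Every crawler in this module funnels its results through
# _process_queries_from_workers, which is defined elsewhere. A minimal sketch
# of its presumed contract, assuming it maps the worker over the items
# (optionally on a thread pool) and flattens the returned query lists; the
# "_sketch" name, pool size, and error handling here are illustrative, not
# the real implementation:
def _process_queries_from_workers_sketch(worker, items, parallel, bar):
    from multiprocessing.pool import ThreadPool
    if parallel:
        pool = ThreadPool(processes=8)  # Illustrative worker count.
        results = pool.map(worker, items)
        pool.close()
        pool.join()
    else:
        results = [worker(item) for item in items]
    # Flatten the per-item lists into one list of DB queries.
    return [query for queries in results for query in queries]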
def crawl_django():
    cve_re = re.compile(r":cve:`(\S+)`")
    commit_re = re.compile(r"(https://github.com/\S+/[a-f0-9]+)")
    cve_index = _index_cves()
    raw = net.get_raw_resource(
        "https://raw.githubusercontent.com/django/django/master/docs/releases/security.txt"
    )
    # One sub-string per :cve:`...` entry in the release notes.
    indices = [x.start() for x in re.finditer(r":cve:", raw)]
    sub_strings = [
        raw[start:end]
        for start, end in zip(indices, indices[1:] + [len(raw)])
    ]
    bar = misc.KnownLengthBar(maxval=len(indices), parallel=False)

    def worker(sub_string):
        queries = []
        cve_string = "CVE-" + re.findall(cve_re, sub_string)[0]
        cve = cve_index.get(cve_string)
        if not cve:
            print "CVE not found?!: " + cve_string
        else:
            # Find the commit URLs referenced by this entry.
            url_strings = re.findall(commit_re, sub_string)
            if url_strings:
                queries = _get_queries_for_cve_url(
                    cve_string, url_strings, ["Python", "Django"])
        bar.update()
        return queries

    return _process_queries_from_workers(worker, sub_strings, False, bar)
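# _index_cves is shared by several crawlers here but defined elsewhere in
# this module. Presumably it loads all known CVE rows once and keys them by
# identifier for O(1) lookups; a sketch under that assumption:
def _index_cves_sketch():
    return {
        cve.cve_string: cve
        for cve in db.global_session.query(models.CVE).all()
    }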
def extract_from_chromium_codereview(url_string):
    if "codereview.chromium.org" not in url_string:
        return []
    try:
        raw = net.get_raw_resource(url_string)
    except:
        print "got exception for: " + url_string
        return []
    # The review page links the landed commit as "Committed: <a href=...>".
    hashes = re.findall(r"Committed: <a href=\S+/([0-9a-f]{40})\">", raw)
    return _get_queries_for_hash_url(url_string, hashes)
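# Illustration of the "Committed:" pattern against a synthetic Rietveld
# fragment (the URL and hash are made up):
# >>> raw = 'Committed: <a href="https://crrev.com/' + "ab" * 20 + '">r12345</a>'
# >>> re.findall(r"Committed: <a href=\S+/([0-9a-f]{40})\">", raw)
# ['abababababababababababababababababababab']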
def extract_from_github_issue(url_string):
    queries = []
    if re.match(github_issue_re, url_string):
        try:
            raw = net.get_raw_resource(url_string, auth=None)
        except:
            return []
        # Collect both absolute and relative commit links from the issue page.
        hashes = re.findall(github_commit_re, raw)
        hashes += re.findall(github_commit_relative_re, raw)
        queries = _get_queries_for_hash_url(url_string, hashes)
    return queries
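# The module-level regexes used above are defined elsewhere. Plausible
# shapes, for orientation only (assumptions, not the real patterns):
# github_issue_re           ~ r"https://github\.com/[^/]+/[^/]+/issues/\d+"
# github_commit_re          ~ r"https://github\.com/[^/]+/[^/]+/commit/([0-9a-f]{6,40})"
# github_commit_relative_re ~ r"href=\"/[^/]+/[^/]+/commit/([0-9a-f]{6,40})\""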
def extract_from_moodle(url_string):
    if "git.moodle.org" not in url_string or "MDL" not in url_string:
        # Links without "MDL" and with "commit" are processed in
        # extract_from_commit_urls.
        return []
    try:
        raw = net.get_raw_resource(url_string)
    except:
        return []
    hashes = re.findall(r"<a href=.+?h=([0-9a-f]{40})\">commit</a>", raw)
    return _get_queries_for_hash_url(url_string, hashes)
def update_cwe_descriptions():
    url = "https://nvd.nist.gov/vuln/categories"
    cwes = db.global_session.query(
        models.CWE).filter(models.CWE.description == None).all()
    if not cwes:
        return
    raw = net.get_raw_resource(url)
    regex = r"<span.+?>CWE-(\d+)</span>.+?<a href.+?>(.+?)</a>"
    descriptions = {x[0]: x[1] for x in re.findall(regex, raw, re.DOTALL)}
    for cwe in cwes:
        # cwe_string looks like "CWE-79"; strip the "CWE-" prefix for lookup.
        description = descriptions.get(cwe.cwe_string[4:])
        cwe.description = description
    db.global_session.commit()
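# On the NVD categories page the regex above yields pairs like
# ("79", "Cross-Site Scripting (XSS)"), so the lookup table looks roughly
# like this (illustrative; the page markup and names may change):
# descriptions = {"79": "Cross-Site Scripting (XSS)", "89": "SQL Injection"}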
def extract_from_github_pull(url_string):
    queries = []
    if re.match(github_pull_re, url_string):
        # Direct link to a single commit inside the pull request.
        commit_match = re.match(r".+/commits?/([0-9a-f]+)/?$", url_string)
        if commit_match:
            return _get_queries_for_hash_url(url_string,
                                             [commit_match.group(1)])
        # Strip any "/files..." suffix ("*" rather than "+" so a bare
        # trailing "/files" is stripped as well) to get the base PR URL.
        normalized_url = re.sub(r"/?files.*$", "", url_string)
        try:
            raw = net.get_raw_resource(normalized_url + "/commits/",
                                       auth=None)
        except:
            return []
        hashes = re.findall(github_commit_re, raw)
        hashes += re.findall(github_commit_relative_re, raw)
        queries = _get_queries_for_hash_url(url_string, hashes)
    return queries
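# Normalization example for the pull-request branch above (synthetic URL):
# >>> re.sub(r"/?files.*$", "", "https://github.com/user/repo/pull/42/files#r1")
# 'https://github.com/user/repo/pull/42'
# The base PR's "/commits/" page is then scraped for the individual commit
# hashes that make up the pull request.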
def extract_from_apache_svn(url_string):
    # TODO: search other svn urls.
    if not re.match(svn_apache_re, url_string):
        return []
    db_queries = []
    query_t = "org:apache {0}"
    params = urlparse.parse_qs(urlparse.urlparse(url_string).query)
    # Search by revision id.
    r1 = r2 = revision_id = None
    for k, v in params.iteritems():
        if k in ["rev", "revision"]:
            revision_id = v[0]
        elif k == "r1":
            r1 = v[0]
        elif k == "r2":
            r2 = v[0]
    if not revision_id:
        if r1 and r2:
            # Diff links carry two revisions; take the numerically newer
            # one (key=int avoids a string comparison, where "9" > "10").
            revision_id = max(r1, r2, key=int)
        else:
            match = re.match(svn_apache_revision_re, url_string)
            if match:
                revision_id = match.group(1)
    if revision_id:
        query_string = query_t.format(revision_id)
        db_queries += _get_queries_for_search_url(url_string, query_string)
    # Search by commit message.
    try:
        raw = net.get_raw_resource(url_string)
        messages = re.findall(svn_apache_message_re, raw)
        if messages:
            query_string = query_t.format(messages[0])
            db_queries += _get_queries_for_search_url(url_string,
                                                      query_string)
    except Exception:
        print "got exception for: {0}".format(url_string)
    return db_queries
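# Worked example for the revision extraction above (synthetic ViewVC URLs):
#   .../viewvc?view=revision&revision=1741864
#       -> revision_id = "1741864"     -> search "org:apache 1741864"
#   .../viewvc/.../Foo.java?r1=1741863&r2=1741864
#       -> diff link, newer side wins  -> search "org:apache 1741864"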
def crawl_android_security_bulletin(parallel=True):
    raw = net.get_raw_resource("https://source.android.com/security/bulletin/")
    # Bulletin pages are named by date, e.g. .../bulletin/2018-01-01.
    urls = re.findall(
        r"https?://source\.android\.com/security/bulletin/[0-9-]{10}", raw)
    urls = misc.unique(urls)
    bar = misc.KnownLengthBar(maxval=len(urls), parallel=parallel)

    def worker(url_string):
        raw = net.get_raw_resource(url_string)
        # Each bulletin table row pairs a CVE id with a reference link.
        results = re.findall(
            r"<td>(CVE[0-9-]+)</td>\s+<td><a href=\"(\S+?)\">", raw,
            re.DOTALL)
        results = [r for r in results if _should_keep_url(r[1])]
        queries = [
            db.InsertQuery(models.CVE, cve_string=r[0]) for r in results
        ]
        queries += [db.InsertQuery(models.URL, url=r[1]) for r in results]
        queries += [
            db.ConnectQuery(models.cve_url_table, r[0], r[1])
            for r in results
        ]
        bar.update()
        return queries

    return _process_queries_from_workers(worker, urls, parallel, bar)
def crawl_debian_fake_names(parallel=True):
    main_page = net.get_raw_resource(
        "https://security-tracker.debian.org/tracker/data/fake-names")
    temp_re = re.compile(r"/tracker/(TEMP-[0-9A-F-]+)")
    href_re = re.compile(r'href="(\S+?)"')
    temps = re.findall(temp_re, main_page)
    bar = misc.KnownLengthBar(maxval=len(temps), parallel=parallel)

    def worker(temp):
        raw = net.get_raw_resource(
            "https://security-tracker.debian.org/tracker/{0}".format(temp))
        url_strings = [
            x for x in re.findall(href_re, raw) if _should_keep_url(x)
        ]
        # TEMP-* entries have no CVE id, so they are stored as NonCVE rows.
        queries = [db.InsertQuery(models.NonCVE, hash_id=temp)]
        queries += _get_queries_for_noncve_url(temp, url_strings,
                                               ["DebianFake"])
        bar.update()
        return queries

    return _process_queries_from_workers(worker, temps, parallel, bar)
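# _get_queries_for_noncve_url is not shown in this excerpt. By analogy with
# the CVE variant used throughout, a plausible sketch (the noncve_url_table
# join table and the omitted tag handling are assumptions):
def _get_queries_for_noncve_url_sketch(hash_id, url_strings, tags):
    queries = []
    for url in url_strings:
        queries.append(db.InsertQuery(models.URL, url=url))
        queries.append(db.ConnectQuery(models.noncve_url_table, hash_id, url))
    return queries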
def file_func(file_obj):
    # Closure: local, tag_dict, cve_re, commit_re and bar come from the
    # enclosing crawler.
    name = basename(file_obj["path"])
    if local:
        raw = fs.read_file(file_obj["path"])
    else:
        raw = net.get_raw_resource(file_obj["download_url"],
                                   auth=net.github_auth)
    url_strings = misc.unique(re.findall(commit_re, raw))
    queries = []
    if url_strings:
        # Figure out the tag list.
        tags = ["vulndb"]
        for k, v in tag_dict.iteritems():
            if k in file_obj["path"]:
                tags.append(v)
                break
        # Insert CVEs/NonCVEs and connect them to urls.
        cve_strings = misc.unique(re.findall(cve_re, raw))
        if not cve_strings:
            hash_id = models.NonCVE.hash_id_for_urls(url_strings)
            queries = [db.InsertQuery(models.NonCVE, hash_id=hash_id)]
            queries += _get_queries_for_noncve_url(hash_id, url_strings,
                                                   tags)
        else:
            # Surprisingly, there are some CVEs in this db which are marked
            # as reserved in the NIST feed. We need to add them here.
            queries = [
                db.InsertQuery(models.CVE, cve_string=cve_string)
                for cve_string in cve_strings
            ]
            for cve_string in cve_strings:
                queries += _get_queries_for_cve_url(cve_string, url_strings,
                                                    tags)
    bar.update()
    return ([], queries)
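# models.NonCVE.hash_id_for_urls is assumed to derive a stable identifier
# from the URL set, so re-crawling the same advisory maps back to the same
# NonCVE row. A sketch of the idea only; the real prefix, hash function, and
# URL normalization are unknown:
import hashlib

def hash_id_for_urls_sketch(url_strings):
    digest = hashlib.sha1("".join(sorted(url_strings))).hexdigest()
    return "NOCVE-" + digest[:20]  # The "NOCVE-" prefix is an assumption.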
def file_func(file_obj):
    # Closure: local, cve_index, hash_re, api_url_t, http_url_t and bar come
    # from the enclosing crawler.
    name, ext = splitext(basename(file_obj["path"]))
    queries = []
    if ext == ".patch" and cve_index.get(name):
        if local:
            raw = fs.read_file(file_obj["path"])
        else:
            raw = net.get_raw_resource(file_obj["download_url"],
                                       auth=net.github_auth)
        hashes = re.findall(hash_re, raw)
        for h in hashes:
            api_url = api_url_t.format(h)
            http_url = http_url_t.format(h)
            try:
                net.get_json_resource(api_url, auth=net.github_auth)
            except:
                # Not an upstream commit; skip it.
                # print "bad response for resource: " + api_url
                pass
            else:
                queries += _get_queries_for_cve_url(
                    name, [http_url], ["C", "Linux", "github"])
    bar.update()
    return ([], queries)
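# api_url_t and http_url_t above are defined elsewhere in this module. Given
# the ["C", "Linux", "github"] tags, they presumably target the upstream
# kernel tree; illustrative shapes only (assumptions, not the real values):
# api_url_t  ~ "https://api.github.com/repos/torvalds/linux/commits/{0}"
# http_url_t ~ "https://github.com/torvalds/linux/commit/{0}"
# The API call is used purely as an existence probe: an exception (e.g. a
# 404) means the hash is not an upstream commit and the patch is skipped.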