def crawl_debian_security(local, parallel=True):
    # Works faster with parallelism.
    cve_re = re.compile(r"^(CVE\S+)")
    list_url = "https://salsa.debian.org/security-tracker-team/security-tracker/raw/master/data/CVE/list"
    if not local:
        raw = net.get_raw_resource(list_url)
    else:
        # Read the same file from a local checkout of the security tracker.
        path = misc.repo_path("security-tracker-team", "security-tracker")
        path = os.path.join(path, "raw", "master", "data", "CVE", "list")
        with open(path, "r") as f:
            raw = f.read()

    # Split the flat list into one chunk per CVE entry.
    indices = [x.start() for x in re.finditer(r"^CVE", raw, re.MULTILINE)]
    sub_strings = [raw[s:e] for s, e in zip(indices, indices[1:] + [len(raw)])]
    cve_index = _index_cves()
    bar = misc.KnownLengthBar(maxval=len(sub_strings), parallel=parallel)

    def worker(sub_string):
        cve_string = re.findall(cve_re, sub_string)[0]
        cve = cve_index.get(cve_string)
        queries = []
        if cve:
            url_strings = misc.flatten_list([
                re.findall(url_re, sub_string) for url_re in urls_re_whitelist
            ])
            queries = _get_queries_for_cve_url(cve_string, url_strings,
                                               ["DebianSec"])
        bar.update()
        return queries

    return _process_queries_from_workers(worker, sub_strings, parallel, bar)
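
# Illustrative sketch (not used by the crawler): the index-slicing idiom above
# splits the flat Debian "list" file into one chunk per CVE entry by pairing
# each "^CVE" offset with the next one. The sample text here is made up.
def _example_split_cve_list():
    raw = ("CVE-2020-0001 some package\n\tNOTE: fixed in 1.2\n"
           "CVE-2020-0002 another package\n\tNOTE: unfixed\n")
    indices = [m.start() for m in re.finditer(r"^CVE", raw, re.MULTILINE)]
    # Each chunk covers everything up to (but not including) the next header.
    return [raw[s:e] for s, e in zip(indices, indices[1:] + [len(raw)])]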
def crawl_django():
    cve_re = re.compile(r":cve:`(\S+)`")
    commit_re = re.compile(r"(https://github.com/\S+/[a-f0-9]+)")
    cve_index = _index_cves()
    raw = net.get_raw_resource(
        "https://raw.githubusercontent.com/django/django/master/docs/releases/security.txt")
    indices = [x.start() for x in re.finditer(r":cve:", raw)]
    sub_strings = [raw[s:e] for s, e in zip(indices, indices[1:] + [len(raw)])]
    bar = misc.KnownLengthBar(maxval=len(indices), parallel=False)

    def worker(sub_string):
        queries = []
        cve_string = "CVE-" + re.findall(cve_re, sub_string)[0]
        cve = cve_index.get(cve_string)
        if not cve:
            print "CVE not found?!: " + cve_string
            return []
        # Find the URLs.
        url_strings = re.findall(commit_re, sub_string)
        if url_strings:
            queries = _get_queries_for_cve_url(cve_string, url_strings,
                                               ["Python", "Django"])
        return queries

    return _process_queries_from_workers(worker, sub_strings, False, bar)
def find_objects(table, list_of_queries):
    def comp_func(obj, query):
        for k, v in query.params.iteritems():
            if getattr(obj, k) != v:
                return False
        return True

    # Get all rows from the db and arrange them in a map keyed by the value of
    # the table's unique_field.
    all_objects_list = global_session.query(table).all()
    all_objects_dict = {}
    for x in all_objects_list:
        field = getattr(x, table.unique_field)
        all_objects_dict.setdefault(field, []).append(x)

    debug_print("Searching for objects in table: {0}".format(table.__tablename__))
    if config.DB_VERBOSE:
        bar = misc.KnownLengthBar(maxval=len(list_of_queries), parallel=False)
        list_of_queries = bar(list_of_queries)

    res = {}
    found_count = 0
    for query in list_of_queries:
        res[query] = False
        field = query.params[table.unique_field]
        if all_objects_dict.get(field) is None:
            continue
        for x in all_objects_dict[field]:
            if comp_func(x, query):
                found_count += 1
                res[query] = x
                break

    debug_print("Search done. Found {0}/{1} objects".format(
        found_count, len(list_of_queries)))
    return res
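
# Illustrative sketch (hypothetical row objects, not the real models):
# find_objects avoids one SELECT per query by loading the table once and
# grouping rows by table.unique_field, then comparing the remaining query
# params in Python, as comp_func does above.
def _example_group_by_unique_field():
    class Row(object):
        def __init__(self, cve_string, year):
            self.cve_string = cve_string
            self.year = year

    rows = [Row("CVE-2020-0001", 2020), Row("CVE-2020-0002", 2020)]
    index = {}
    for r in rows:
        index.setdefault(r.cve_string, []).append(r)
    # A query with params {"cve_string": ..., "year": ...} is then resolved by
    # a dict lookup followed by attribute comparison.
    candidates = index.get("CVE-2020-0001", [])
    return [r for r in candidates if r.year == 2020]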
def search_not_found_hashes():
    not_found = get_not_found_hashes()
    try:
        cached = get_cached_commits()
    except Exception:
        # No usable cache yet; start with an empty one.
        cached = {}
    not_found = [x for x in not_found if x and cached.get(x) is None]
    if not not_found:
        print "No hashes to search!"
        return

    bar = misc.KnownLengthBar(maxval=len(not_found), parallel=False)
    start_time = bar.start_time
    # Note: hashes that fail or hit the rate limit are re-appended to
    # not_found, so the list may grow while we iterate over it.
    for i, h in enumerate(not_found):
        bar.update(1)
        try:
            code, reply = net.github_search(h)
        except Exception as e:
            print "Got exception: {0} for hash: {1}".format(e, h)
            not_found.append(h)
            continue
        if code == net.CODE_OK:
            cached[h] = misc.unique(
                [x["repository"]["full_name"] for x in reply["items"]])
        elif code == net.CODE_TIMEOUT:
            not_found.append(h)
            with open(config.HASH_CACHE_FILE, "w") as f:
                json.dump(cached, f, indent=2)
            # bar.finish()
            # Back off, then restart the progress bar where we left off.
            time.sleep(60)
            bar = misc.KnownLengthBar(maxval=len(not_found), parallel=False)
            bar.start_time = start_time
            bar.update(i)
        else:
            print "Got code {0} for hash: {1}".format(code, h)

    with open(config.HASH_CACHE_FILE, "w") as f:
        json.dump(cached, f, indent=2)
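
# Illustrative sketch (hypothetical search callable, not net.github_search):
# the loop above retries a hash by re-appending it to the list it is
# iterating over, which is why the enumerate() loop can grow while it runs.
# A stripped-down version of that retry-until-success pattern:
def _example_retry_by_reappending(hashes, search):
    results = {}
    for h in hashes:          # hashes may grow during iteration
        ok, value = search(h)
        if ok:
            results[h] = value
        else:
            hashes.append(h)  # try this hash again later
            # In the real loop this is where the cache is flushed to disk and
            # the process sleeps to let the GitHub rate limit recover.
    return results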
def github_search_loop():
    logger = logging.getLogger("github_search")
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler("github_search.log")
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    queries = db.global_session.query(models.GithubSearchQuery).filter(
        models.GithubSearchQuery.state == models.GithubSearchQuery.NOT_SEEN).all()
    bar = misc.KnownLengthBar(maxval=len(queries), parallel=False)
    for i, q in bar(enumerate(queries)):
        logger.info("({0}/{1}) searching: '{2}'".format(i, len(queries), q.query))
        _do_github_search_query(q)
        logger.info("-" * 80)
def crawl_android_security_bulletin(parallel=True):
    raw = net.get_raw_resource("https://source.android.com/security/bulletin/")
    urls = re.findall(
        r"https?://source\.android\.com/security/bulletin/[0-9-]{10}", raw)
    urls = misc.unique(urls)
    bar = misc.KnownLengthBar(maxval=len(urls), parallel=parallel)

    def worker(url_string):
        raw = net.get_raw_resource(url_string)
        # Each bulletin lists CVE ids next to their reference links in a table.
        results = re.findall(
            r"<td>(CVE[0-9-]+)</td>\s+<td><a href=\"(\S+?)\">", raw, re.DOTALL)
        results = [r for r in results if _should_keep_url(r[1])]
        queries = [db.InsertQuery(models.CVE, cve_string=r[0]) for r in results]
        queries += [db.InsertQuery(models.URL, url=r[1]) for r in results]
        queries += [
            db.ConnectQuery(models.cve_url_table, r[0], r[1]) for r in results
        ]
        bar.update()
        return queries

    return _process_queries_from_workers(worker, urls, parallel, bar)
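
# Illustrative sketch (made-up HTML fragment, not a real bulletin page): the
# worker above pairs each CVE id with the reference link from the adjacent
# table cell using a single regex with two capture groups.
def _example_extract_cve_url_pairs():
    html = ('<td>CVE-2020-0001</td>\n        '
            '<td><a href="https://android.googlesource.com/platform/abc/+/123">ref</a></td>')
    # Returns [("CVE-2020-0001", "https://android.googlesource.com/platform/abc/+/123")].
    return re.findall(
        r"<td>(CVE[0-9-]+)</td>\s+<td><a href=\"(\S+?)\">", html, re.DOTALL)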
def crawl_debian_fake_names(parallel=True):
    # Works faster with parallelism.
    main_page = net.get_raw_resource(
        "https://security-tracker.debian.org/tracker/data/fake-names")
    temp_re = re.compile(r"/tracker/(TEMP-[0-9A-F-]+)")
    href_re = re.compile(r'href="(\S+?)"')
    temps = re.findall(temp_re, main_page)
    bar = misc.KnownLengthBar(maxval=len(temps), parallel=parallel)

    def worker(temp):
        raw = net.get_raw_resource(
            "https://security-tracker.debian.org/tracker/{0}".format(temp))
        url_strings = [
            x for x in re.findall(href_re, raw) if _should_keep_url(x)
        ]
        queries = [db.InsertQuery(models.NonCVE, hash_id=temp)]
        queries += _get_queries_for_noncve_url(temp, url_strings, ["DebianFake"])
        bar.update()
        return queries

    return _process_queries_from_workers(worker, temps, parallel, bar)
def crawl_linux_kernel_cves():
    # Works fast enough without parallelism.
    url = "https://raw.githubusercontent.com/nluedtke/linux_kernel_cves/master/stream_fixes.json"
    http_url_t = "https://github.com/torvalds/linux/commit/{0}"
    patches = net.get_json_resource(url)
    bar = misc.KnownLengthBar(maxval=len(patches), parallel=False)
    cve_index = _index_cves()

    def worker(item):
        cve_string = item[0]
        cve = cve_index.get(cve_string)
        queries = []
        if cve:
            url_strings = [
                http_url_t.format(v["cmt_id"]) for v in item[1].values()
            ]
            queries = _get_queries_for_cve_url(cve_string, url_strings,
                                               ["C", "Linux", "github"])
        bar.update()
        return queries

    return _process_queries_from_workers(worker, patches.items(), False, bar)
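
# Illustrative sketch (data shape inferred from the worker above, not verified
# against the upstream file): each entry in stream_fixes.json maps a CVE id to
# per-stream records carrying a "cmt_id", which is turned into a
# github.com/torvalds/linux commit URL.
def _example_stream_fixes_to_urls():
    http_url_t = "https://github.com/torvalds/linux/commit/{0}"
    item = ("CVE-2019-0001", {
        "4.19": {"cmt_id": "deadbeef"},
        "5.4": {"cmt_id": "cafebabe"},
    })
    return [http_url_t.format(v["cmt_id"]) for v in item[1].values()]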
def dump_commits(parallel=True):
    black_list_res = [
        # Uncomment a pattern here to temporarily skip matching URLs so we do
        # not waste time processing them.
        # re.compile(r".+git.kernel.org.+"),
        # re.compile(r".+github.com.+"),
        # re.compile(r".+svn.apache.org.+"),
        # re.compile(".+github.+(linux).+"),
    ]

    def black_list(url_string):
        for black_list_re in black_list_res:
            if re.match(black_list_re, url_string):
                return True
        return False

    def extract(url_string):
        extractors = [
            extract_from_github_commit,
            extract_from_github_issue,
            extract_from_github_pull,
            extract_from_apache_svn,
            extract_from_commit_urls,
            extract_from_googlesource,
            extract_from_moodle,
            extract_from_chromium_codereview,
            # TODO: extract from git kernel org
        ]
        # First extractor that yields queries wins.
        queries = []
        for e in extractors:
            queries = e(url_string)
            if queries:
                break
        bar.update()
        return queries

    print "Parsing URLs"
    # Only look at URLs that are not blacklisted and have no hashes or queries
    # attached yet.
    url_strings = [
        x.url for x in db.global_session.query(models.URL).all()
        if not black_list(x.url) and not x.hashes and not x.queries
    ]
    bar = misc.KnownLengthBar(maxval=len(url_strings), parallel=parallel)
    queries = misc.map_func(extract, url_strings, parallel)
    queries = misc.unique(misc.flatten_list(queries))
    print "Parsing URLs done"

    if not queries:
        print "No new hashes :("
    else:
        print "Storing results"
        db.process_queries(queries)
        print "Storing results done"

    print "Writing bad urls to {0}".format(config.BAD_URLS_FILE)
    good_urls_set = set([
        q.right_unique_value for q in queries
        if q.__class__ == db.ConnectQuery and
        q.table in [models.hash_url_table, models.query_url_table]
    ])
    bad_urls = [x for x in url_strings if x not in good_urls_set]
    with open(config.BAD_URLS_FILE, "w") as f:
        f.write("\n".join(sorted(bad_urls)))
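
# Illustrative sketch (toy extractors, not the real extract_from_* functions):
# dump_commits runs its extractors in order and keeps the output of the first
# one that returns anything, i.e. first-match-wins.
def _example_first_extractor_wins(url_string):
    def from_github(url):
        return ["github"] if "github.com" in url else []

    def from_googlesource(url):
        return ["googlesource"] if "googlesource.com" in url else []

    for extractor in (from_github, from_googlesource):
        queries = extractor(url_string)
        if queries:
            return queries
    return []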