def matchPackageNames(self, pkgspecs):
    matched = []
    exactmatch = []
    unmatched = None
    for sack in self.sacks.values():
        if hasattr(sack, "matchPackageNames"):
            e, m, u = [], [], []
            try:
                e, m, u = sack.matchPackageNames(pkgspecs)
            except PackageSackError:
                continue

            exactmatch.extend(e)
            matched.extend(m)
            if unmatched is None:
                unmatched = set(u)
            else:
                unmatched = unmatched.intersection(set(u))

    matched = misc.unique(matched)
    exactmatch = misc.unique(exactmatch)
    if unmatched is None:
        unmatched = []
    else:
        unmatched = list(unmatched)
    return exactmatch, matched, unmatched

def parsePackages(pkgs, usercommands, casematch=0,
                  unique='repo-epoch-name-version-release-arch'):
    """matches up the user request versus a pkg list:
       for installs/updates available pkgs should be the 'others list'
       for removes it should be the installed list of pkgs
       takes an optional casematch option to determine if case should be
       matched exactly. Defaults to not matching."""

    pkgdict = buildPkgRefDict(pkgs, bool(casematch))
    exactmatch = []
    matched = []
    unmatched = []
    for command in usercommands:
        if not casematch:
            command = command.lower()
        if command in pkgdict:
            exactmatch.extend(pkgdict[command])
            del pkgdict[command]
        else:
            # anything we couldn't find a match for
            # could mean it's not there, could mean it's a wildcard
            if misc.re_glob(command):
                trylist = pkgdict.keys()
                # command and pkgdict are already lowered if not casematch
                # so case sensitive is always fine
                restring = fnmatch.translate(command)
                regex = re.compile(restring)
                foundit = 0
                for item in trylist:
                    if regex.match(item):
                        matched.extend(pkgdict[item])
                        del pkgdict[item]
                        foundit = 1

                if not foundit:
                    unmatched.append(command)
            else:
                unmatched.append(command)

    unmatched = misc.unique(unmatched)
    if unique == 'repo-epoch-name-version-release-arch':  # pkg.__hash__
        matched = misc.unique(matched)
        exactmatch = misc.unique(exactmatch)
    elif unique == 'repo-pkgkey':  # So we get all pkg entries from a repo
        def pkgunique(pkgs):
            u = {}
            for pkg in pkgs:
                mark = "%s%s" % (pkg.repo.id, pkg.pkgKey)
                u[mark] = pkg
            return u.values()
        matched = pkgunique(matched)
        exactmatch = pkgunique(exactmatch)
    else:
        raise ValueError("Bad value for unique: %s" % unique)
    return exactmatch, matched, unmatched

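def _parse_packages_example(yb):
    # Hedged usage sketch, not part of the original module: `yb` is assumed
    # to be a configured yum.YumBase instance and the package specs below are
    # illustrative.
    pkgs = yb.pkgSack.returnPackages()
    exact, matched, unmatched = parsePackages(pkgs, ['bash', 'kernel*'])
    # 'bash' lands in `exact` when an exact key match exists, expansions of
    # the 'kernel*' glob land in `matched`, and any spec that hit nothing is
    # echoed back in `unmatched`.
    return exact, matched, unmatched
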
def _do_github_search_query(search_query):
    logger = logging.getLogger("github_search")

    def remove_hrefs(s):
        s = re.sub(r"<a href.+?>", "", s)
        s = s.replace("</a>", "")
        return s

    mutations = [
        lambda x: [x],
        lambda x: [remove_hrefs(x)],
        lambda x: remove_hrefs(x).split("\n")
    ]
    mutants = misc.unique(
        misc.flatten_list([m(search_query.query) for m in mutations]))
    for query_str in mutants:
        if not query_str:
            continue
        logger.info("trying {0}".format(query_str))
        code, answer = net.github_search(query_str)
        if code == net.CODE_TIMEOUT:
            logger.info("sleeping...")
            mutants.append(query_str)  # Try again.
            time.sleep(60)
        elif code == net.CODE_VALIDATION:
            logger.info("got 422: " + answer)
            search_query.state = models.GithubSearchQuery.ERROR
            db.global_session.commit()
        elif code == net.CODE_OK:
            if len(answer["items"]) > 5:
                answer["items"] = [
                    item for item in answer["items"] if _messages_match(
                        search_query.query, item["commit"]["message"])
                ]
            hash_strings = misc.unique(
                [item["sha"] for item in answer["items"]])
            logger.info("got results: {0}".format(hash_strings))
            queries = []
            if hash_strings:
                search_query.state = models.GithubSearchQuery.NON_EMPTY
                for h in hash_strings:
                    queries += [
                        db.InsertQuery(models.CommitHash, hash=h),
                        db.ConnectQuery(models.query_hash_table,
                                        search_query.query, h)
                    ]
                    queries += [
                        db.ConnectQuery(models.hash_url_table, h, url.url)
                        for url in search_query.urls
                    ]
                db.process_queries(queries)
                db.global_session.commit()  # Commit state update.
                return
            search_query.state = models.GithubSearchQuery.EMPTY
            db.global_session.commit()  # Commit state update.
        else:
            # Raising a plain string is invalid; raise a real exception.
            raise RuntimeError(
                "got something unexpected: {0} {1}".format(code, answer))

def matchPackageNames(self, pkgspecs):
    """take a list of strings and match the packages in the sack against it
       this will match against:
       name
       name.arch
       name-ver-rel.arch
       name-ver
       name-ver-rel
       epoch:name-ver-rel.arch
       name-epoch:ver-rel.arch

       return [exact matches], [glob matches], [unmatched search terms]
       """

    # Setup match() for the search we're doing
    matched = []
    exactmatch = []
    unmatched = set(pkgspecs)

    specs = {}
    for p in pkgspecs:
        if misc.re_glob(p):
            restring = fnmatch.translate(p)
            specs[p] = re.compile(restring)
        else:
            specs[p] = p

    # We don't use simplePkgList() here because that loads all of the
    # rpmdb, if we are e.g. doing a "remove PackageKit".
    pkgs = self.returnPackages(patterns=unmatched)
    for pkgtup in [pkg.pkgtup for pkg in pkgs]:
        (n, a, e, v, r) = pkgtup
        names = set((
            n,
            '%s.%s' % (n, a),
            '%s-%s-%s.%s' % (n, v, r, a),
            '%s-%s' % (n, v),
            '%s-%s-%s' % (n, v, r),
            '%s:%s-%s-%s.%s' % (e, n, v, r, a),
            '%s-%s:%s-%s.%s' % (n, e, v, r, a),
        ))

        for (term, query) in specs.items():
            if term == query:  # a plain string: exact match only
                if query in names:
                    exactmatch.append(self.searchPkgTuple(pkgtup)[0])
                    unmatched.discard(term)
            else:  # a compiled glob regex
                for n in names:
                    if query.match(n):
                        matched.append(self.searchPkgTuple(pkgtup)[0])
                        unmatched.discard(term)
    return misc.unique(exactmatch), misc.unique(matched), list(unmatched)

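def _match_package_names_example(sack):
    # Hedged usage sketch, not part of the original module: `sack` is assumed
    # to be a populated yum package sack and the specs are illustrative.
    exact, globbed, missing = sack.matchPackageNames(
        ['bash', 'kernel-3.*', 'no-such-package'])
    # 'bash' is compared literally against the name/EVR/arch forms listed in
    # the docstring, 'kernel-3.*' is treated as a glob against those same
    # forms, and 'no-such-package' comes back in the unmatched list.
    return exact, globbed, missing
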
def _get_queries_for_hash_url(url_string, hashes):
    hashes = misc.unique(hashes)
    if config.IGNORE_SHORT_HASHES:
        hashes = [h for h in hashes if len(h) == 40]
    queries = \
        [db.InsertQuery(models.CommitHash, hash=h) for h in hashes] + \
        [db.ConnectQuery(models.hash_url_table, h, url_string) for h in hashes]
    return queries

def search_tags(self, tagname):
    res = {}
    for ptd in self.db_objs.values():
        for (name, taglist) in ptd.search_tags(tagname).items():
            if name not in res:
                res[name] = []
            res[name].extend(taglist)

    out = {}
    for (name, taglist) in res.items():
        out[name] = misc.unique(taglist)
    return out

def process_queries(all_queries):
    query_classes = [
        # Order of classes here is important.
        InsertQuery,
        ConnectQuery,
        # UpdateQuery,
        UpdateTagQuery,
    ]
    for C in query_classes:
        queries = misc.unique([q for q in all_queries if q.__class__ == C])
        C.process_func(queries)

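def _process_queries_example():
    # Hedged usage sketch, not part of the original module: it assumes the
    # `models` module used elsewhere in this codebase is importable here, and
    # the CVE/URL values are made up. Because InsertQuery batches are
    # processed before ConnectQuery batches, the rows a connect refers to
    # already exist even when the connect appears first in the input list.
    queries = [
        ConnectQuery(models.cve_url_table, "CVE-2017-0001",
                     "https://example.com/fix"),
        InsertQuery(models.CVE, cve_string="CVE-2017-0001"),
        InsertQuery(models.URL, url="https://example.com/fix"),
    ]
    process_queries(queries)
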
def get_subdirs_recursively(dir):
    """ get all subdirectories recursively in the given directory """
    all_files = []
    os.path.walk(dir, match_files_recursively_helper, all_files)
    matches = misc.unique([f for f in all_files if os.path.isdir(f)])
    matches.sort(lambda l, o: cmp(l.upper(), o.upper()))
    return matches

def get_subdirs_recursively(dir, follow_symlinks=False):
    """ get all subdirectories recursively in the given directory """
    all_files = []
    os.path.walk(dir, match_files_recursively_helper,
                 (all_files, [], follow_symlinks))
    matches = misc.unique([f for f in all_files if os.path.isdir(f)])
    matches.sort(lambda l, o: cmp(l.upper(), o.upper()))
    return matches

def match_files_recursively(dir, suffix_list):
    """ get all files matching suffix_list in the dir and in its
        subdirectories """
    all_files = []
    os.path.walk(dir, match_files_recursively_helper, all_files)
    matches = misc.unique(
        [f for f in all_files if match_suffix(f, suffix_list)])
    matches.sort(lambda l, o: cmp(l.upper(), o.upper()))
    return matches

def file_func(file_obj):
    name = basename(file_obj["path"])
    if local:
        raw = fs.read_file(file_obj["path"])
    else:
        raw = net.get_raw_resource(file_obj["download_url"],
                                   auth=net.github_auth)
    url_strings = misc.unique(re.findall(commit_re, raw))
    queries = []
    if url_strings:
        # Figure out the tag list.
        tags = ["vulndb"]
        for k, v in tag_dict.iteritems():
            if k in file_obj["path"]:
                tags.append(v)
                break

        # Insert CVEs/NonCVEs and connect them to urls.
        cve_strings = misc.unique(re.findall(cve_re, raw))
        if not cve_strings:
            hash_id = models.NonCVE.hash_id_for_urls(url_strings)
            queries = [db.InsertQuery(models.NonCVE, hash_id=hash_id)]
            queries += _get_queries_for_noncve_url(hash_id, url_strings, tags)
        else:
            # Surprisingly, there are some CVEs in this db which are marked
            # as reserved in the nist feed. We need to add them here.
            queries = [
                db.InsertQuery(models.CVE, cve_string=cve_string)
                for cve_string in cve_strings
            ]
            for cve_string in cve_strings:
                queries += _get_queries_for_cve_url(
                    cve_string, url_strings, tags)
    bar.update()
    return ([], queries)

def match_files_recursively(dir, suffix_list, skip_password=False,
                            follow_symlinks=False):
    """ get all files matching suffix_list in the dir and in its
        subdirectories """
    all_files = []
    if skip_password:
        os.path.walk(dir, match_files_recursively_skip_protected,
                     (all_files, [], follow_symlinks))
    else:
        os.path.walk(dir, match_files_recursively_helper,
                     (all_files, [], follow_symlinks))
    matches = misc.unique(
        [f for f in all_files if match_suffix(f, suffix_list)])
    matches.sort(lambda l, o: cmp(l.upper(), o.upper()))
    return matches

def check_hashes_for_urls_like(template):
    urls = db.global_session.query(models.URL).all()
    urls = [url for url in urls if template in url.url]
    all_hashes = misc.unique(
        [hash.hash for url in urls for hash in url.hashes])
    good = []
    bad = []
    for h in all_hashes:
        matches = filter(lambda x: x.startswith(h), get_found_hashes())
        if matches:
            # We want the longer hash.
            good.append(matches[0])
        else:
            bad.append(h)
    print "\n".join(good)

def crawl_github_repo(user, repo, file_callback, dir_callback, local):
    # TODO: add a non-parallel version.
    # TODO: adapt to the new db format. Maybe make a second queue for
    # collecting return data from workers.
    task_queue = Queue()
    results_queue = Queue()

    if local:
        path = misc.repo_path(user, repo)
        crawl_list = fs.crawl_dir(path)
    else:
        base_url = net.get_api_url(user, repo)
        crawl_list = net.get_json_resource(base_url, auth=net.github_auth)

    def worker(task_queue, results_queue):
        while True:
            item = task_queue.get()
            crawl_list = []
            if item["type"] == "file":
                crawl_list, queries = file_callback(item)
                results_queue.put(queries)
            elif dir_callback(item):
                if local:
                    crawl_list = fs.crawl_dir(item["path"])
                else:
                    crawl_list = net.get_json_resource(item["url"],
                                                       auth=net.github_auth)
            for item in crawl_list:
                task_queue.put(item)
            task_queue.task_done()

    # Seed the queue with the top-level listing before starting the workers.
    for item in crawl_list:
        task_queue.put(item)

    for i in xrange(0, config.THREADS_COUNT):
        t = Thread(target=worker, args=(task_queue, results_queue))
        # TODO: can't make the threads stop properly when an event is fired.
        # So for now we'll do it with daemon threads, which will get blocked
        # after the queue gets empty.
        t.daemon = True
        t.start()

    task_queue.join()
    return misc.unique(misc.flatten_list(results_queue.queue))

def match_files_recursively(dir, suffix_list, skip_password=False,
                            follow_symlinks=False):
    """ get all files matching suffix_list in the dir and in its
        subdirectories """
    all_files = []
    if skip_password:
        os.path.walk(dir, match_files_recursively_skip_protected,
                     (all_files, [], follow_symlinks))
    else:
        os.path.walk(dir, match_files_recursively_helper,
                     (all_files, [], follow_symlinks))
    matches = misc.unique(
        [f for f in all_files if match_suffix(f, suffix_list)])
    matches.sort(lambda l, o: cmp(l.upper(), o.upper()))
    return matches

def searchPrco(self, name, prcotype):
    self._checkIndexes(failure='build')
    prcodict = getattr(self, prcotype)
    (n, f, (e, v, r)) = misc.string_to_prco_tuple(name)

    basic_results = []
    results = []
    if n in prcodict:
        basic_results.extend(prcodict[n])

    for po in basic_results:
        if po.checkPrco(prcotype, (n, f, (e, v, r))):
            results.append(po)

    if prcotype != "provides":
        return results

    if not misc.re_filename(n):
        return results

    results.extend(self.searchFiles(n))
    return misc.unique(results)

def searchPrco(self, name, prcotype):
    self._checkIndexes(failure='build')
    prcodict = getattr(self, prcotype)
    (n, f, (e, v, r)) = misc.string_to_prco_tuple(name)

    basic_results = []
    results = []
    if n in prcodict:
        basic_results.extend(prcodict[n])

    for po in basic_results:
        if po.checkPrco(prcotype, (n, f, (e, v, r))):
            results.append(po)

    if prcotype != "provides":
        return results

    if not misc.re_filename(n):
        return results

    results.extend(self.searchFiles(n))
    return misc.unique(results)

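def _search_prco_example(sack):
    # Hedged usage sketch, not part of the original module: `sack` is assumed
    # to be a populated yum package sack and the capability names are
    # illustrative. A plain capability and a file path both go through the
    # same entry point; file paths only fall through to searchFiles() when
    # prcotype is "provides".
    providers = sack.searchPrco('webserver', 'provides')
    file_providers = sack.searchPrco('/usr/sbin/httpd', 'provides')
    return providers, file_providers
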
def search_not_found_hashes():
    not_found = get_not_found_hashes()
    try:
        cached = get_cached_commits()
    except:
        cached = {}
    not_found = [x for x in not_found if x and cached.get(x) is None]
    if not not_found:
        print "No hashes to search!"
        return

    bar = misc.KnownLengthBar(maxval=len(not_found), parallel=False)
    start_time = bar.start_time
    for i, h in enumerate(not_found):
        bar.update(1)
        try:
            code, reply = net.github_search(h)
        except Exception as e:
            print "Got exception: {0} for hash: {1}".format(e, h)
            not_found.append(h)
            continue
        if code == net.CODE_OK:
            cached[h] = misc.unique(
                [x["repository"]["full_name"] for x in reply["items"]])
        elif code == net.CODE_TIMEOUT:
            not_found.append(h)
            with open(config.HASH_CACHE_FILE, "w") as f:
                json.dump(cached, f, indent=2)
            # bar.finish()
            time.sleep(60)
            bar = misc.KnownLengthBar(maxval=len(not_found), parallel=False)
            bar.start_time = start_time
            bar.update(i)
        else:
            print "Got code {0} for hash: {1}".format(code, h)

    with open(config.HASH_CACHE_FILE, "w") as f:
        json.dump(cached, f, indent=2)

def crawl_android_security_bulletin(parallel=True):
    raw = net.get_raw_resource("https://source.android.com/security/bulletin/")
    urls = re.findall(
        r"https?://source.android.com/security/bulletin/[0-9-]{10}", raw)
    urls = misc.unique(urls)
    bar = misc.KnownLengthBar(maxval=len(urls), parallel=parallel)

    def worker(url_string):
        raw = net.get_raw_resource(url_string)
        results = re.findall(
            r"<td>(CVE[0-9-]+)</td>\s+<td><a href=\"(\S+?)\">", raw, re.DOTALL)
        results = [r for r in results if _should_keep_url(r[1])]
        queries = [
            db.InsertQuery(models.CVE, cve_string=r[0]) for r in results
        ]
        queries += [db.InsertQuery(models.URL, url=r[1]) for r in results]
        queries += [
            db.ConnectQuery(models.cve_url_table, r[0], r[1]) for r in results
        ]
        bar.update()
        return queries

    return _process_queries_from_workers(worker, urls, parallel, bar)

def __add__(self, other):
    """Sums flights: assigns all positions of both flights to a single
       flight, but only if they are consecutive."""
    points = np.vstack((self.UTM, other.UTM))
    return Flight(misc.unique(points))

def _process_queries_from_workers(worker_func, sequence, parallel, bar=None):
    results = misc.map_func(worker_func, sequence, parallel)
    if bar:
        bar.finish()
    queries = misc.unique(misc.flatten_list(results))
    return db.process_queries(queries)

def dump_commits(parallel=True):
    black_list_res = [
        # This list is used to temporarily disable some of the urls so we
        # don't waste time on processing them.
        # re.compile(r".+git.kernel.org.+"),
        # re.compile(r".+github.com.+"),
        # re.compile(r".+svn.apache.org.+"),
        # re.compile(".+github.+(linux).+"),
    ]

    def black_list(url_string):
        for black_list_re in black_list_res:
            if re.match(black_list_re, url_string):
                return True
        return False

    def extract(url_string):
        extractors = [
            extract_from_github_commit,
            extract_from_github_issue,
            extract_from_github_pull,
            extract_from_apache_svn,
            extract_from_commit_urls,
            extract_from_googlesource,
            extract_from_moodle,
            extract_from_chromium_codereview,
            # TODO: extract from git kernel org
        ]
        queries = []
        for e in extractors:
            queries = e(url_string)
            if queries:
                break
        bar.update()
        return queries

    print "Parsing URLs"
    url_strings = [
        x.url for x in db.global_session.query(models.URL).all()
        if not black_list(x.url) and not x.hashes and not x.queries
    ]
    bar = misc.KnownLengthBar(maxval=len(url_strings), parallel=parallel)
    queries = misc.map_func(extract, url_strings, parallel)
    queries = misc.unique(misc.flatten_list(queries))
    print "Parsing URLs done"

    if not queries:
        print "No new hashes :("
    else:
        print "Storing results"
        db.process_queries(queries)
        print "Storing results done"

    print "Writing bad urls to {0}".format(config.BAD_URLS_FILE)
    good_urls_set = set([
        q.right_unique_value for q in queries
        if q.__class__ == db.ConnectQuery and
        q.table in [models.hash_url_table, models.query_url_table]
    ])
    bad_urls = [x for x in url_strings if x not in good_urls_set]
    with open(config.BAD_URLS_FILE, "w") as f:
        f.write("\n".join(sorted(bad_urls)))

#!/usr/bin/python
assert __name__ == '__main__'

import argparse
from datetime import datetime as DT

from misc import KEYBOOKS, cdar, unique, car, FORMAT0, FORMAT1, KEEP

parser = argparse.ArgumentParser(
    description='catenate a processed version of an exported libre file.')
parser.add_argument('--test', action='store_true')
parser.add_argument('filename')
args = parser.parse_args()

if True:
    target = args.filename
    with open(target) as fd:
        _owner = fd.readline().strip()
        _keys = fd.readline().strip().split('\t')
        _agree = lambda book: _keys == map(car, book)
        keys = map(cdar, unique(filter(_agree, KEYBOOKS)))
        for line in fd.readlines():
            vals = line.strip().split('\t')
            items = zip(keys, vals)
            items.sort()
            stamp = DT.strptime(dict(items)['Time'], FORMAT0).strftime(FORMAT1)
            print(stamp + ':libre:' + repr(filter(KEEP, items)))