def setUp(self):
    docs = 1
    freq = 1
    default_weight = 1
    docfreqs = []
    self.extractors = {}
    self.extractor_args = {}
    for i in range(2):
        namespace = "extractor%s." % i
        feat_freq = {}
        for j in range(2):
            feat_freq[namespace + str(j)] = freq
        docfreqs.append(feat_freq)
        self.extractors[namespace] = self.FakeExtractor(
            NAME=namespace, NAMESPACE=namespace)
        self.extractor_args["%s_weight" % namespace] = default_weight
    # Create tmp file and save OrderedDocumentFrequencies there
    self.tmp_file = tempfile.NamedTemporaryFile(prefix="test_weighting", delete=False)
    model = OrderedDocumentFrequencies().construct(docs, docfreqs)
    model.save(self.tmp_file.name)
    # arguments.docfreq
    self.docfreq_args = {"docfreq": self.tmp_file.name}
    # batches
    self.batches = []  # [BagsBatch(keys=None, matrix=csr_matrix(numpy.eye(4)))]
def create_or_load_ordered_df(args, ndocs: int = None, bag_features: Uast2BagFeatures = None):
    """
    Returns a preexisting OrderedDocumentFrequencies model from docfreq_in, or generates one
    from the flattened bags of features using args and saves it to docfreq_out.

    :param args: Instance of `argparse.Namespace` that contains docfreq_in, docfreq_out,
                 min_docfreq and vocabulary_size.
    :param ndocs: Number of documents (can be repositories, files or functions).
    :param bag_features: Transformer containing the bags of features extracted from the data.
                         The call instantiates an RDD of [(key, doc), val] items, where key is
                         a specific feature that appeared val times in the document doc.
    :return: OrderedDocumentFrequencies model.
    """
    log = logging.getLogger("create_or_load_ordered_df")
    if args.docfreq_in:
        log.info("Loading ordered docfreq model from %s ...", args.docfreq_in)
        return OrderedDocumentFrequencies().load(args.docfreq_in)
    elif ndocs is None or bag_features is None:
        log.error("[IN] only mode, please supply an ordered docfreq model")
        raise ValueError
    log.info("Calculating the document frequencies, hold tight ...")
    df = bag_features \
        .link(BagFeatures2DocFreq()) \
        .execute()
    log.info("Writing ordered docfreq model to %s ...", args.docfreq_out)
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq_out)
    return df_model
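# A minimal usage sketch for create_or_load_ordered_df. The file names and argument values
# below are hypothetical; it assumes an OrderedDocumentFrequencies model already saved at
# "docfreq_in.asdf", so the function takes the "load" branch and never needs ndocs or
# bag_features.
from argparse import Namespace

args = Namespace(docfreq_in="docfreq_in.asdf", docfreq_out="docfreq_out.asdf",
                 min_docfreq=1, vocabulary_size=10000000)
df_model = create_or_load_ordered_df(args)  # simply loads and returns the preexisting model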
def setUp(self):
    docs = 1
    freq = 1
    default_weight = 1
    docfreqs = []
    self.extractors = {}
    self.extractor_args = {}
    for i in range(2):
        namespace = "extractor%s." % i
        feat_freq = {}
        for j in range(2):
            feat_freq[namespace + str(j)] = freq
        docfreqs.append(feat_freq)
        self.extractors[namespace] = self.FakeExtractor(NAME=namespace, NAMESPACE=namespace)
        self.extractor_args["%s_weight" % namespace] = default_weight
    # Create tmp file and save OrderedDocumentFrequencies there
    self.tmp_file = tempfile.NamedTemporaryFile(prefix="test_weighting", delete=False)
    model = OrderedDocumentFrequencies().construct(docs, docfreqs)
    model.save(self.tmp_file.name)
    # arguments.docfreq
    self.docfreq_args = {"docfreq": self.tmp_file.name}
    # batches
    self.batches = [BagsBatch(keys=None, matrix=csr_matrix(numpy.eye(4)))]
def hash_file(args):
    if not args.feature:
        raise ValueError("extractors must not be empty")
    log = logging.getLogger("hash_file")
    vocab = OrderedDocumentFrequencies().load(args.docfreq)
    params = WeightedMinHashParameters().load(args.params)
    log.info("Extracting UAST from %s", args.file)
    uast = BblfshClient(args.bblfsh).parse(args.file).uast
    log.info("Populating the bag")
    extractors = [__extractors__[s](args.min_docfreq,
                                    **__extractors__[s].get_kwargs_fromcmdline(args))
                  for s in args.feature]
    bag = numpy.zeros(len(vocab), dtype=numpy.float32)
    for ex in extractors:
        ex.ndocs = vocab.docs
        ex.docfreq = vocab
        for k, v in ex.extract(uast):
            try:
                i = vocab.order[k]
                bag[i] = log_tf_log_idf(df=vocab[k], tf=v, ndocs=vocab.docs)
            except KeyError:
                continue
    log.info("Bag size: %d", len(bag.nonzero()[0]))
    log.info("Hashing")
    return weighted_minhash(bag, params.rs.shape[0], params.rs, params.ln_cs, params.betas), bag
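# The bag above is weighted with log_tf_log_idf before hashing. Below is a stand-alone sketch
# of a log-scaled TF-IDF weighting in the same spirit; the helper name and the exact formula
# are illustrative assumptions, not the library's implementation.
import math

def log_tf_log_idf_sketch(df: float, tf: float, ndocs: int) -> float:
    # dampen both the raw term frequency and the inverse document frequency with logarithms
    return math.log(1 + tf) * math.log(1 + ndocs / df)

print(log_tf_log_idf_sketch(df=2, tf=3, ndocs=10))  # rare features frequent in the doc score higher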
def modify_feature_weights(batches, arguments, **kwargs):
    extractors = {}
    for ex in __extractors__.values():
        if "%s_weight" % ex.NAME in dir(arguments) and \
                getattr(arguments, "%s_weight" % ex.NAME) != 1:
            extractors[ex.NAME] = (ex.NAMESPACE, getattr(arguments, "%s_weight" % ex.NAME))
    if not extractors:
        return batches
    err = "You must specify location of docfreq file to modify weights of features"
    assert arguments.docfreq is not None, err
    assert os.path.isfile(arguments.docfreq), "docfreq should be a file"
    model = OrderedDocumentFrequencies().load(arguments.docfreq)
    feature_mapping = model.order
    voc_size = batches[0].matrix.shape[-1]
    weights = numpy.ones((voc_size,))
    for ext in extractors:
        namespace = extractors[ext][0]
        ind = [feature_mapping[k] for k in feature_mapping if k.startswith(namespace)]
        weights[ind] = extractors[ext][1]
    for batch in batches:
        # hack to modify attribute in namedtuple
        batch.matrix.data = batch.matrix.multiply(weights).tocsr().data.astype(numpy.float32)
    return batches
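# Stand-alone sketch of the column re-weighting trick used in modify_feature_weights, with
# plain numpy/scipy only; the feature indices 1 and 3 and the 2x boost are made-up values.
import numpy
from scipy.sparse import csr_matrix

matrix = csr_matrix(numpy.eye(4, dtype=numpy.float32))
weights = numpy.ones(4)
weights[[1, 3]] = 2.0  # boost every feature that belongs to the chosen namespace
# multiply() broadcasts the 1-D weight vector over the columns; rewriting .data in place
# keeps the csr_matrix object (and the namedtuple holding it) intact
matrix.data = matrix.multiply(weights).tocsr().data.astype(numpy.float32)
print(matrix.toarray())  # diagonal becomes 1, 2, 1, 2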
def __init__(self, arguments):
    log = logging.getLogger("reweighter")
    self.extractors = {}
    for ex in __extractors__.values():
        if "%s_weight" % ex.NAME in dir(arguments) and \
                getattr(arguments, "%s_weight" % ex.NAME) != 1:
            self.extractors[ex.NAME] = (ex.NAMESPACE,
                                        getattr(arguments, "%s_weight" % ex.NAME))
    if not self.extractors:
        log.info("No extractors found, reweighting will be skipped")
        return
    err = "You must specify location of docfreq file to modify weights of features"
    assert arguments.docfreq is not None, err
    assert os.path.isfile(arguments.docfreq), "docfreq should be a file"
    model = OrderedDocumentFrequencies().load(arguments.docfreq)
    self.feature_mapping = model.order
    self.weights = numpy.ones((len(self.feature_mapping),))
    for ext in self.extractors:
        namespace = self.extractors[ext][0]
        ind = [self.feature_mapping[k] for k in self.feature_mapping
               if k.startswith(namespace)]
        self.weights[ind] = self.extractors[ext][1]
def repos2df(args):
    log = logging.getLogger("repos2df")
    extractors = create_extractors_from_args(args)
    session_name = "repos2df-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    quant = Uast2Quant(extractors)
    uast_extractor.link(quant).execute()
    if quant.levels:
        log.info("Writing quantization levels to %s", args.quant)
        QuantizationLevels().construct(quant.levels).save(args.quant)
    df = uast_extractor \
        .link(Uast2BagFeatures(extractors)) \
        .link(BagFeatures2DocFreq()) \
        .execute()
    log.info("Writing docfreq model to %s", args.docfreq_out)
    OrderedDocumentFrequencies().construct(ndocs, df).save(args.docfreq_out)
    pipeline_graph(args, log, root)
def merge_coocc(args):
    log = logging.getLogger("merge_coocc")
    log.setLevel(args.log_level)
    filepaths = list(handle_input_arg(args.input, log))
    log.info("Will merge %d files", len(filepaths))
    df = OrderedDocumentFrequencies().load(args.docfreq)
    if args.no_spark:
        merge_coocc_no_spark(df, filepaths, log, args)
    else:
        merge_coocc_spark(df, filepaths, log, args)
def repos2bow_entry_template(args, select=HeadFiles, cache_hook=None, save_hook=None):
    log = logging.getLogger("repos2bow")
    extractors = create_extractors_from_args(args)
    session_name = "repos2bow-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name, select=select)
    uast_extractor = start_point.link(Moder(args.mode)) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    if cache_hook is not None:
        uast_extractor.link(cache_hook()).execute()
    # We link UastRow2Document after Cacher here because cache_hook() may want to have
    # all possible Row items.
    uast_extractor = uast_extractor.link(UastRow2Document())
    log.info("Extracting UASTs and indexing documents...")
    document_indexer = Indexer(Uast2BagFeatures.Columns.document)
    uast_extractor.link(document_indexer).execute()
    ndocs = len(document_indexer)
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    quant = Uast2Quant(extractors)
    uast_extractor.link(quant).execute()
    if quant.levels:
        log.info("Writing quantization levels to %s", args.quant)
        QuantizationLevels().construct(quant.levels).save(args.quant)
    uast_extractor = uast_extractor \
        .link(Uast2BagFeatures(extractors))
    log.info("Calculating the document frequencies...")
    df = uast_extractor.link(BagFeatures2DocFreq()).execute()
    log.info("Writing docfreq to %s", args.docfreq)
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq)
    bags_writer = uast_extractor \
        .link(BagFeatures2TermFreq()) \
        .link(TFIDF(df_model)) \
        .link(document_indexer) \
        .link(Indexer(Uast2BagFeatures.Columns.token, df_model.order))
    if save_hook is not None:
        bags_writer = bags_writer \
            .link(Repartitioner.maybe(args.partitions * 10, args.shuffle)) \
            .link(save_hook())
    bags_writer.link(BOWWriter(document_indexer, df_model, args.bow, args.batch)) \
        .execute()
    pipeline_graph(args, log, root)
def repos2coocc_entry(args):
    log = logging.getLogger("repos2coocc")
    id_extractor = IdentifiersBagExtractor(docfreq_threshold=args.min_docfreq,
                                           split_stem=args.split)
    session_name = "repos2coocc-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    df = uast_extractor \
        .link(Uast2BagFeatures([id_extractor])) \
        .link(BagFeatures2DocFreq()) \
        .execute()
    log.info("Writing document frequency model to %s...", args.docfreq)
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq)
    token2index = root.session.sparkContext.broadcast(df_model.order)
    uast_extractor \
        .link(CooccConstructor(token2index=token2index,
                               token_parser=id_extractor.id2bag.token_parser,
                               namespace=id_extractor.NAMESPACE)) \
        .link(CooccModelSaver(args.output, df_model)) \
        .execute()
    pipeline_graph(args, log, root)
def test_preprocess(self):
    import tensorflow as tf
    with tempfile.TemporaryDirectory() as tmpdir:
        args = default_preprocess_params(tmpdir, VOCAB)
        with captured_output() as (out, err, log):
            id2vec_preprocess(args)
        self.assertFalse(out.getvalue())
        self.assertFalse(err.getvalue())
        self.assertEqual(sorted(os.listdir(tmpdir)), [
            "col_sums.txt", "col_vocab.txt", "row_sums.txt", "row_vocab.txt",
            "shard-000-000.pb"])
        df = OrderedDocumentFrequencies().load(source=args.docfreq_in)
        self.assertEqual(len(df), VOCAB)
        with open(os.path.join(tmpdir, "col_sums.txt")) as fin:
            col_sums = fin.read()
        with open(os.path.join(tmpdir, "row_sums.txt")) as fin:
            row_sums = fin.read()
        self.assertEqual(col_sums, row_sums)
        with open(os.path.join(tmpdir, "col_vocab.txt")) as fin:
            col_vocab = fin.read()
        with open(os.path.join(tmpdir, "row_vocab.txt")) as fin:
            row_vocab = fin.read()
        self.assertEqual(col_vocab, row_vocab)
        self.assertEqual(row_vocab.split("\n"), df.tokens())
        for word in row_vocab.split("\n"):
            self.assertGreater(df[word], 0)
        with open(os.path.join(tmpdir, "shard-000-000.pb"), "rb") as fin:
            features = tf.parse_single_example(
                fin.read(),
                features={
                    "global_row": tf.FixedLenFeature([VOCAB], dtype=tf.int64),
                    "global_col": tf.FixedLenFeature([VOCAB], dtype=tf.int64),
                    "sparse_local_row": tf.VarLenFeature(dtype=tf.int64),
                    "sparse_local_col": tf.VarLenFeature(dtype=tf.int64),
                    "sparse_value": tf.VarLenFeature(dtype=tf.float32)})
        with tf.Session() as session:
            global_row, global_col, local_row, local_col, value = session.run(
                [features[n] for n in ("global_row", "global_col", "sparse_local_row",
                                       "sparse_local_col", "sparse_value")])
        self.assertEqual(set(range(VOCAB)), set(global_row))
        self.assertEqual(set(range(VOCAB)), set(global_col))
        nnz = 16001
        self.assertEqual(value.values.shape, (nnz,))
        self.assertEqual(local_row.values.shape, (nnz,))
        self.assertEqual(local_col.values.shape, (nnz,))
        numpy.random.seed(0)
        all_tokens = row_vocab.split("\n")
        chosen_indices = numpy.random.choice(list(range(VOCAB)), 128, replace=False)
        chosen = [all_tokens[i] for i in chosen_indices]
        freqs = numpy.zeros((len(chosen),) * 2, dtype=int)
        index = {w: i for i, w in enumerate(chosen)}
        chosen = set(chosen)
        with asdf.open(args.input) as model:
            matrix = assemble_sparse_matrix(model.tree["matrix"]).tocsr()
            tokens = split_strings(model.tree["tokens"])
            interesting = {i for i, t in enumerate(tokens) if t in chosen}
            for y in interesting:
                row = matrix[y]
                yi = index[tokens[y]]
                for x, v in zip(row.indices, row.data):
                    if x in interesting:
                        freqs[yi, index[tokens[x]]] += v
        matrix = coo_matrix(
            (value.values,
             ([global_row[row] for row in local_row.values],
              [global_col[col] for col in local_col.values])),
            shape=(VOCAB, VOCAB))
        matrix = matrix.tocsr()[chosen_indices][:, chosen_indices].todense().astype(int)
        self.assertTrue((matrix == freqs).all())
def __init__(self, output, df_model: OrderedDocumentFrequencies, **kwargs):
    super().__init__(**kwargs)
    self.tokens_list = df_model.tokens()
    self.output = output
    self.df_model = df_model
def query(args):
    log = logging.getLogger("query")
    session = get_db(args)
    tables = args.tables
    if args.id:
        rows = session.execute(
            "SELECT hashtable, value FROM %s WHERE sha1='%s'"
            % (tables["hashtables2"], args.id))
        bands = [(r.hashtable, r.value) for r in rows]
    else:  # args.file
        if not args.feature:
            log.critical("-f / --feature must be specified at least once in file query mode")
            return 1
        if not args.params:
            log.critical("-p / --params must be specified in file query mode")
            return 1
        wmh, bag = hash_file(args)
        htnum, band_size = calc_hashtable_params(
            args.threshold, len(wmh), args.false_positive_weight, args.false_negative_weight)
        log.info("Number of hash tables: %d", htnum)
        log.info("Band size: %d", band_size)
        bands = [(i, bytearray(wmh[i * band_size:(i + 1) * band_size].data))
                 for i in range(htnum)]
    similar = set()
    log.info("Looking for similar items")
    for i, band in bands:
        rows = session.execute(
            "SELECT sha1 FROM %s WHERE hashtable=%d AND value=0x%s"
            % (tables["hashtables"], i, codecs.encode(band, "hex").decode()))
        similar.update(r.sha1 for r in rows)
    log.info("Fetched %d items", len(similar))
    if args.precise:
        # Precise bags
        vocab = OrderedDocumentFrequencies().load(args.docfreq)
        log.info("Calculating the precise result")
        if args.id:
            rows = session.execute("SELECT item, value FROM %s WHERE sha1='%s'"
                                   % (tables["bags"], args.id))
            bag = numpy.zeros(len(vocab), dtype=numpy.float32)
            for row in rows:
                bag[vocab.order[row.item]] = row.value
        # Fetch other bags from the DB
        precise = []
        for x in similar:
            rows = session.execute("SELECT item, value FROM %s WHERE sha1='%s'"
                                   % (tables["bags"], x))
            other_bag = numpy.zeros(len(vocab), dtype=numpy.float32)
            for row in rows:
                other_bag[vocab.order[row.item]] = row.value
            if weighted_jaccard(bag, other_bag) >= args.threshold:
                precise.append(x)
        log.info("Survived: %.2f", len(precise) / len(similar))
        similar = precise
    if args.id:
        try:
            similar.remove(args.id)
        except KeyError:
            # o_O
            pass
    similar = [s.split("@")[1] for s in similar]
    stream_template(args.template, sys.stdout,
                    size=len(similar),
                    origin=args.id if args.id else os.path.abspath(args.file),
                    items=BatchedHashResolver(similar, args.batch, session, tables["meta"]))
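# Sketch of the weighted Jaccard similarity that the --precise re-check above relies on:
# intersection and union of weighted sets generalize to element-wise min and max. The helper
# name and the example vectors are made up; the library's weighted_jaccard may be implemented
# differently.
import numpy

def weighted_jaccard_sketch(a: numpy.ndarray, b: numpy.ndarray) -> float:
    # ratio of the "overlap" weight to the total weight of the two bags
    return numpy.minimum(a, b).sum() / numpy.maximum(a, b).sum()

bag = numpy.array([0.0, 1.0, 2.0], dtype=numpy.float32)
other_bag = numpy.array([1.0, 1.0, 1.0], dtype=numpy.float32)
print(weighted_jaccard_sketch(bag, other_bag))  # 0.5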