Example #1
    def setUp(self):
        docs = 1
        freq = 1
        default_weight = 1
        docfreqs = []
        self.extractors = {}
        self.extractor_args = {}
        for i in range(2):
            namespace = "extractor%s." % i
            feat_freq = {}
            for j in range(2):
                feat_freq[namespace + str(j)] = freq
            docfreqs.append(feat_freq)

            self.extractors[namespace] = self.FakeExtractor(
                NAME=namespace, NAMESPACE=namespace)
            self.extractor_args["%s_weight" % namespace] = default_weight

        # Create tmp file and save OrderedDocumentFrequencies there
        self.tmp_file = tempfile.NamedTemporaryFile(prefix="test_weighting",
                                                    delete=False)
        model = OrderedDocumentFrequencies().construct(docs, docfreqs)
        model.save(self.tmp_file.name)

        # arguments.docfreq
        self.docfreq_args = {"docfreq": self.tmp_file.name}

        # batches
        self.batches = []  # [BagsBatch(keys=None, matrix=csr_matrix(numpy.eye(4)))]
Example #2
File: docfreq.py  Project: y1026/ml
def create_or_load_ordered_df(args,
                              ndocs: int = None,
                              bag_features: Uast2BagFeatures = None):
    """
    Returns a preexisting OrderedDocumentFrequencies model from docfreq_in, or generates one
    from the flattened bags of features using args and saves it to docfreq_out.

    :param args: Instance of `argparse.Namespace` that contains docfreq_in, docfreq_out,
                 min_docfreq, and vocabulary_size.
    :param ndocs: Number of documents (can be repos, files or functions)
    :param bag_features: Transformer containing the bags of features extracted from the data
                         (calling it instantiates an RDD of [(key, doc), val] records, where key
                         is a feature that appeared val times in the document doc).
    :return: OrderedDocumentFrequencies model
    """
    log = logging.getLogger("create_or_load_ordered_df")
    if args.docfreq_in:
        log.info("Loading ordered docfreq model from %s ...", args.docfreq_in)
        return OrderedDocumentFrequencies().load(args.docfreq_in)
    elif ndocs is None or bag_features is None:
        log.error("[IN] only mode, please supply an ordered docfreq model")
        raise ValueError
    log.info("Calculating the document frequencies, hold tight ...")
    df = bag_features \
        .link(BagFeatures2DocFreq()) \
        .execute()
    log.info("Writing ordered docfreq model to %s ...", args.docfreq_out)
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq_out)
    return df_model
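Example #1 and Example #2 together show the whole life cycle of the model: construct it from per-extractor document-frequency dictionaries, prune rare features, cap the vocabulary, save it, and load it back. The standalone sketch below condenses that cycle; the import path, file name, and feature counts are assumptions for illustration, not taken from the projects above.

from sourced.ml.models import OrderedDocumentFrequencies  # assumed import path

ndocs = 100                               # illustrative document count
docfreqs = [{"id.foo": 42, "id.bar": 7}]  # one dict per extractor namespace
model = OrderedDocumentFrequencies() \
    .construct(ndocs, docfreqs) \
    .prune(2) \
    .greatest(10000) \
    .save("/tmp/docfreq.asdf")            # save() returns the model, as Example #2 relies on

reloaded = OrderedDocumentFrequencies().load("/tmp/docfreq.asdf")
print(len(reloaded), reloaded.docs, reloaded.order["id.foo"], reloaded["id.foo"])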
Example #3
    def setUp(self):
        docs = 1
        freq = 1
        default_weight = 1
        docfreqs = []
        self.extractors = {}
        self.extractor_args = {}
        for i in range(2):
            namespace = "extractor%s." % i
            feat_freq = {}
            for j in range(2):
                feat_freq[namespace + str(j)] = freq
            docfreqs.append(feat_freq)

            self.extractors[namespace] = self.FakeExtractor(NAME=namespace, NAMESPACE=namespace)
            self.extractor_args["%s_weight" % namespace] = default_weight

        # Create tmp file and save OrderedDocumentFrequencies there
        self.tmp_file = tempfile.NamedTemporaryFile(prefix="test_weighting", delete=False)
        model = OrderedDocumentFrequencies().construct(docs, docfreqs)
        model.save(self.tmp_file.name)

        # arguments.docfreq
        self.docfreq_args = {"docfreq": self.tmp_file.name}

        # batches
        self.batches = [BagsBatch(keys=None, matrix=csr_matrix(numpy.eye(4)))]
Example #4
File: hasher.py  Project: fulaphex/apollo
def hash_file(args):
    if not args.feature:
        raise ValueError("extractors must not be empty")
    log = logging.getLogger("hash_file")
    vocab = OrderedDocumentFrequencies().load(args.docfreq)
    params = WeightedMinHashParameters().load(args.params)
    log.info("Extracting UAST from %s", args.file)
    uast = BblfshClient(args.bblfsh).parse(args.file).uast
    log.info("Populating the bag")
    extractors = [
        __extractors__[s](args.min_docfreq,
                          **__extractors__[s].get_kwargs_fromcmdline(args))
        for s in args.feature
    ]
    bag = numpy.zeros(len(vocab), dtype=numpy.float32)
    for ex in extractors:
        ex.ndocs = vocab.docs
        ex.docfreq = vocab
        for k, v in ex.extract(uast):
            try:
                i = vocab.order[k]
                bag[i] = log_tf_log_idf(df=vocab[k], tf=v, ndocs=vocab.docs)
            except KeyError:
                continue

    log.info("Bag size: %d", len(bag.nonzero()[0]))
    log.info("Hashing")

    return weighted_minhash(bag, params.rs.shape[0], params.rs, params.ln_cs,
                            params.betas), bag
Example #5
def modify_feature_weights(batches, arguments, **kwargs):
    extractors = {}
    for ex in __extractors__.values():
        if "%s_weight" % ex.NAME in dir(arguments) and \
                        getattr(arguments, "%s_weight" % ex.NAME) != 1:
            extractors[ex.NAME] = (ex.NAMESPACE,
                                   getattr(arguments, "%s_weight" % ex.NAME))

    if not extractors:
        return batches

    err = "You must specify location of docfreq file to modify weights of features"
    assert arguments.docfreq is not None, err
    assert os.path.isfile(arguments.docfreq), "docfreq should be a file"

    model = OrderedDocumentFrequencies().load(arguments.docfreq)
    feature_mapping = model.order

    voc_size = batches[0].matrix.shape[-1]
    weights = numpy.ones((voc_size, ))

    for ext in extractors:
        namespace = extractors[ext][0]
        ind = [
            feature_mapping[k] for k in feature_mapping
            if k.startswith(namespace)
        ]
        weights[ind] = extractors[ext][1]

    for batch in batches:
        # hack to modify attribute in namedtuple
        batch.matrix.data = batch.matrix.multiply(weights).tocsr().data.astype(
            numpy.float32)

    return batches
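The column re-weighting above works because multiplying a CSR matrix by a 1-D weights vector scales each feature column, and the result has to be written back through .data since the BagsBatch namedtuple field itself cannot be reassigned. A tiny self-contained sketch of that trick, with a toy matrix and made-up weights rather than real batches:

import numpy
from scipy.sparse import csr_matrix

matrix = csr_matrix(numpy.eye(4, dtype=numpy.float32))
weights = numpy.ones((4,))
weights[[1, 2]] = 2.0  # e.g. the columns belonging to one extractor namespace
# multiply() broadcasts the vector over columns; copying only the data array
# back preserves the sparsity structure, as modify_feature_weights does above
matrix.data = matrix.multiply(weights).tocsr().data.astype(numpy.float32)
print(matrix.toarray())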
Example #6
    def __init__(self, arguments):
        log = logging.getLogger("reweighter")
        self.extractors = {}
        for ex in __extractors__.values():
            if "%s_weight" % ex.NAME in dir(arguments) and \
                            getattr(arguments, "%s_weight" % ex.NAME) != 1:
                self.extractors[ex.NAME] = (ex.NAMESPACE,
                                            getattr(arguments,
                                                    "%s_weight" % ex.NAME))
        if not self.extractors:
            log.info("No extractors found, reweighting will be skipped")
            return

        err = "You must specify location of docfreq file to modify weights of features"
        assert arguments.docfreq is not None, err
        assert os.path.isfile(arguments.docfreq), "docfreq should be a file"

        model = OrderedDocumentFrequencies().load(arguments.docfreq)
        self.feature_mapping = model.order

        self.weights = numpy.ones((len(self.feature_mapping), ))

        for ext in self.extractors:
            namespace = self.extractors[ext][0]
            ind = [
                self.feature_mapping[k] for k in self.feature_mapping
                if k.startswith(namespace)
            ]
            self.weights[ind] = self.extractors[ext][1]
Example #7
def repos2df(args):
    log = logging.getLogger("repos2df")
    extractors = create_extractors_from_args(args)
    session_name = "repos2df-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    quant = Uast2Quant(extractors)
    uast_extractor.link(quant).execute()
    if quant.levels:
        log.info("Writing quantization levels to %s", args.quant)
        QuantizationLevels().construct(quant.levels).save(args.quant)
    df = uast_extractor \
        .link(Uast2BagFeatures(extractors)) \
        .link(BagFeatures2DocFreq()) \
        .execute()
    log.info("Writing docfreq model to %s", args.docfreq_out)
    OrderedDocumentFrequencies().construct(ndocs, df).save(args.docfreq_out)
    pipeline_graph(args, log, root)
Example #8
File: merge_coocc.py  Project: sniperkit/ml
def merge_coocc(args):
    log = logging.getLogger("merge_coocc")
    log.setLevel(args.log_level)
    filepaths = list(handle_input_arg(args.input, log))
    log.info("Will merge %d files", len(filepaths))
    df = OrderedDocumentFrequencies().load(args.docfreq)
    if args.no_spark:
        merge_coocc_no_spark(df, filepaths, log, args)
    else:
        merge_coocc_spark(df, filepaths, log, args)
Example #9
def repos2bow_entry_template(args,
                             select=HeadFiles,
                             cache_hook=None,
                             save_hook=None):
    log = logging.getLogger("repos2bow")
    extractors = create_extractors_from_args(args)
    session_name = "repos2bow-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name, select=select)
    uast_extractor = start_point.link(Moder(args.mode)) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    if cache_hook is not None:
        uast_extractor.link(cache_hook()).execute()
    # We link UastRow2Document after Cacher here because cache_hook() may need access to all
    # the original Row fields.
    uast_extractor = uast_extractor.link(UastRow2Document())
    log.info("Extracting UASTs and indexing documents...")
    document_indexer = Indexer(Uast2BagFeatures.Columns.document)
    uast_extractor.link(document_indexer).execute()
    ndocs = len(document_indexer)
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    quant = Uast2Quant(extractors)
    uast_extractor.link(quant).execute()
    if quant.levels:
        log.info("Writing quantization levels to %s", args.quant)
        QuantizationLevels().construct(quant.levels).save(args.quant)
    uast_extractor = uast_extractor \
        .link(Uast2BagFeatures(extractors))
    log.info("Calculating the document frequencies...")
    df = uast_extractor.link(BagFeatures2DocFreq()).execute()
    log.info("Writing docfreq to %s", args.docfreq)
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq)
    bags_writer = uast_extractor \
        .link(BagFeatures2TermFreq()) \
        .link(TFIDF(df_model)) \
        .link(document_indexer) \
        .link(Indexer(Uast2BagFeatures.Columns.token, df_model.order))
    if save_hook is not None:
        bags_writer = bags_writer \
            .link(Repartitioner.maybe(args.partitions * 10, args.shuffle)) \
            .link(save_hook())
    bags_writer.link(BOWWriter(document_indexer, df_model, args.bow, args.batch)) \
        .execute()
    pipeline_graph(args, log, root)
Example #10
File: repos2coocc.py  Project: absognety/ml
def repos2coocc_entry(args):
    log = logging.getLogger("repos2coocc")
    id_extractor = IdentifiersBagExtractor(docfreq_threshold=args.min_docfreq,
                                           split_stem=args.split)
    session_name = "repos2coocc-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())

    df = uast_extractor \
        .link(Uast2BagFeatures([id_extractor])) \
        .link(BagFeatures2DocFreq()) \
        .execute()

    log.info("Writing document frequency model to %s...", args.docfreq)
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq)

    token2index = root.session.sparkContext.broadcast(df_model.order)
    uast_extractor \
        .link(CooccConstructor(token2index=token2index,
                               token_parser=id_extractor.id2bag.token_parser,
                               namespace=id_extractor.NAMESPACE)) \
        .link(CooccModelSaver(args.output, df_model)) \
        .execute()
    pipeline_graph(args, log, root)
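For context on what CooccConstructor hands to CooccModelSaver: the broadcast token2index mapping turns identifier names into matrix indices. The toy sketch below only illustrates accumulating (token, token, count) records into a sparse co-occurrence matrix with scipy; the record shape is an assumption for illustration, not the sourced.ml implementation.

import numpy
from scipy.sparse import coo_matrix

token2index = {"i.foo": 0, "i.bar": 1, "i.baz": 2}  # made-up identifiers
records = [("i.foo", "i.bar", 3), ("i.bar", "i.baz", 1), ("i.foo", "i.baz", 2)]
rows = [token2index[a] for a, _, _ in records]
cols = [token2index[b] for _, b, _ in records]
vals = [v for _, _, v in records]
cooccurrences = coo_matrix((vals, (rows, cols)),
                           shape=(len(token2index),) * 2, dtype=numpy.int64)
print(cooccurrences.toarray())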
Example #11
 def test_preprocess(self):
     import tensorflow as tf
     with tempfile.TemporaryDirectory() as tmpdir:
         args = default_preprocess_params(tmpdir, VOCAB)
         with captured_output() as (out, err, log):
             id2vec_preprocess(args)
         self.assertFalse(out.getvalue())
         self.assertFalse(err.getvalue())
         self.assertEqual(sorted(os.listdir(tmpdir)), [
             "col_sums.txt", "col_vocab.txt", "row_sums.txt",
             "row_vocab.txt", "shard-000-000.pb"
         ])
         df = OrderedDocumentFrequencies().load(source=args.docfreq_in)
         self.assertEqual(len(df), VOCAB)
         with open(os.path.join(tmpdir, "col_sums.txt")) as fin:
             col_sums = fin.read()
         with open(os.path.join(tmpdir, "row_sums.txt")) as fin:
             row_sums = fin.read()
         self.assertEqual(col_sums, row_sums)
         with open(os.path.join(tmpdir, "col_vocab.txt")) as fin:
             col_vocab = fin.read()
         with open(os.path.join(tmpdir, "row_vocab.txt")) as fin:
             row_vocab = fin.read()
         self.assertEqual(col_vocab, row_vocab)
         self.assertEqual(row_vocab.split("\n"), df.tokens())
         for word in row_vocab.split("\n"):
             self.assertGreater(df[word], 0)
         with open(os.path.join(tmpdir, "shard-000-000.pb"), "rb") as fin:
             features = tf.parse_single_example(
                 fin.read(),
                 features={
                     "global_row": tf.FixedLenFeature([VOCAB],
                                                      dtype=tf.int64),
                     "global_col": tf.FixedLenFeature([VOCAB],
                                                      dtype=tf.int64),
                     "sparse_local_row": tf.VarLenFeature(dtype=tf.int64),
                     "sparse_local_col": tf.VarLenFeature(dtype=tf.int64),
                     "sparse_value": tf.VarLenFeature(dtype=tf.float32)
                 })
         with tf.Session() as session:
             global_row, global_col, local_row, local_col, value = session.run(
                 [
                     features[n]
                     for n in ("global_row", "global_col",
                               "sparse_local_row", "sparse_local_col",
                               "sparse_value")
                 ])
         self.assertEqual(set(range(VOCAB)), set(global_row))
         self.assertEqual(set(range(VOCAB)), set(global_col))
         nnz = 16001
         self.assertEqual(value.values.shape, (nnz, ))
         self.assertEqual(local_row.values.shape, (nnz, ))
         self.assertEqual(local_col.values.shape, (nnz, ))
         numpy.random.seed(0)
         all_tokens = row_vocab.split("\n")
         chosen_indices = numpy.random.choice(list(range(VOCAB)),
                                              128,
                                              replace=False)
         chosen = [all_tokens[i] for i in chosen_indices]
         freqs = numpy.zeros((len(chosen), ) * 2, dtype=int)
         index = {w: i for i, w in enumerate(chosen)}
         chosen = set(chosen)
         with asdf.open(args.input) as model:
             matrix = assemble_sparse_matrix(model.tree["matrix"]).tocsr()
             tokens = split_strings(model.tree["tokens"])
             interesting = {i for i, t in enumerate(tokens) if t in chosen}
             for y in interesting:
                 row = matrix[y]
                 yi = index[tokens[y]]
                 for x, v in zip(row.indices, row.data):
                     if x in interesting:
                         freqs[yi, index[tokens[x]]] += v
         matrix = coo_matrix(
             (value.values,
              ([global_row[row] for row in local_row.values],
               [global_col[col] for col in local_col.values])),
             shape=(VOCAB, VOCAB))
         matrix = matrix.tocsr()[chosen_indices][:, chosen_indices].todense().astype(int)
         self.assertTrue((matrix == freqs).all())
Example #12
File: coocc.py  Project: qakart/ml
 def __init__(self, output, df_model: OrderedDocumentFrequencies, **kwargs):
     super().__init__(**kwargs)
     self.tokens_list = df_model.tokens()
     self.output = output
     self.df_model = df_model
Example #13
def query(args):
    log = logging.getLogger("query")
    session = get_db(args)
    tables = args.tables
    if args.id:
        rows = session.execute(
            "SELECT hashtable, value FROM %s WHERE sha1='%s'" %
            (tables["hashtables2"], args.id))
        bands = [(r.hashtable, r.value) for r in rows]
    else:
        # args.file
        if not args.feature:
            log.critical(
                "-f / --feature must be specified at least once in file query mode"
            )
            return 1
        if not args.params:
            log.critical("-p / --params must be specified in file query mode")
            return 1
        wmh, bag = hash_file(args)
        htnum, band_size = calc_hashtable_params(args.threshold, len(wmh),
                                                 args.false_positive_weight,
                                                 args.false_negative_weight)
        log.info("Number of hash tables: %d", htnum)
        log.info("Band size: %d", band_size)
        bands = [(i, bytearray(wmh[i * band_size:(i + 1) * band_size].data))
                 for i in range(htnum)]
    similar = set()
    log.info("Looking for similar items")
    for i, band in bands:
        rows = session.execute(
            "SELECT sha1 FROM %s WHERE hashtable=%d AND value=0x%s" %
            (tables["hashtables"], i, codecs.encode(band, "hex").decode()))
        similar.update(r.sha1 for r in rows)
    log.info("Fetched %d items", len(similar))
    if args.precise:
        # Precise bags
        vocab = OrderedDocumentFrequencies().load(args.docfreq)
        log.info("Calculating the precise result")
        if args.id:
            rows = session.execute(
                "SELECT item, value FROM %s WHERE sha1='%s'" %
                (tables["bags"], args.id))
            bag = numpy.zeros(len(vocab), dtype=numpy.float32)
            for row in rows:
                bag[vocab.order[row.item]] = row.value
        # Fetch other bags from the DB
        precise = []
        for x in similar:
            rows = session.execute(
                "SELECT item, value FROM %s WHERE sha1='%s'" %
                (tables["bags"], x))
            other_bag = numpy.zeros(len(vocab), dtype=numpy.float32)
            for row in rows:
                other_bag[vocab.order[row.item]] = row.value
            if weighted_jaccard(bag, other_bag) >= args.threshold:
                precise.append(x)
            log.info("Survived: %.2f", len(precise) / len(similar))
        similar = precise
    if args.id:
        try:
            similar.remove(args.id)
        except KeyError:
            # o_O
            pass

    similar = [s.split("@")[1] for s in similar]
    stream_template(args.template,
                    sys.stdout,
                    size=len(similar),
                    origin=args.id if args.id else os.path.abspath(args.file),
                    items=BatchedHashResolver(similar, args.batch, session,
                                              tables["meta"]))