Example #1
File: test_df.py  Project: sniperkit/ml
import unittest
from io import BytesIO

# DocumentFrequencies and the `paths` test-fixture module come from the project under
# test; the exact import paths below depend on the repository layout and are assumed.


class DocumentFrequenciesTests(unittest.TestCase):
    def setUp(self):
        self.model = DocumentFrequencies().load(source=paths.DOCFREQ)

    def test_docs(self):
        docs = self.model.docs
        self.assertIsInstance(docs, int)
        self.assertEqual(docs, 1000)

    def test_get(self):
        self.assertEqual(self.model["aaaaaaa"], 341)
        with self.assertRaises(KeyError):
            print(self.model["xaaaaaa"])
        self.assertEqual(self.model.get("aaaaaaa", 0), 341)
        self.assertEqual(self.model.get("xaaaaaa", 100500), 100500)

    def test_tokens(self):
        self.assertEqual(list(self.model._df), self.model.tokens())

    def test_len(self):
        # the remaining 18 are not unique - the model was generated badly
        self.assertEqual(len(self.model), 982)

    def test_iter(self):
        aaa = False
        for tok, freq in self.model:
            if "aaaaaaa" in tok:
                aaa = True
                int(freq)
                break
        self.assertTrue(aaa)

    def test_prune(self):
        pruned = self.model.prune(4)
        for tok, freq in pruned:
            self.assertGreaterEqual(freq, 4)
        self.assertEqual(len(pruned), 346)

    def test_prune_self(self):
        pruned = self.model.prune(1)
        self.assertIs(self.model, pruned)

    def test_greatest(self):
        pruned = self.model.greatest(100)
        freqs = sorted(self.model._df.values(), reverse=True)
        border = freqs[100]
        for v in pruned._df.values():
            self.assertGreaterEqual(v, border)
        df1 = pruned._df
        df2 = self.model.greatest(100)._df
        self.assertEqual(df1, df2)

    def test_write(self):
        buffer = BytesIO()
        self.model.save(buffer)
        buffer.seek(0)
        new_model = DocumentFrequencies().load(buffer)
        self.assertEqual(self.model._df, new_model._df)
        self.assertEqual(self.model.docs, new_model.docs)
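
The tests above exercise the public surface of DocumentFrequencies: indexing and get() for look-ups, tokens(), len(), iteration, prune(), greatest() and save()/load(). A minimal usage sketch inferred from those tests follows; the import path and the model file name are assumptions, not taken from the original file.

from io import BytesIO

from ml.models import DocumentFrequencies  # assumed import path

model = DocumentFrequencies().load(source="docfreq.asdf")  # hypothetical model file
print(model.docs)                  # number of documents behind the statistics
print(model.get("sometoken", 0))   # document frequency, 0 if the token is unknown
top100 = model.greatest(100)       # keep only the 100 most frequent tokens
pruned = model.prune(4)            # drop tokens seen in fewer than 4 documents

buffer = BytesIO()                 # the model round-trips through save()/load()
model.save(buffer)
buffer.seek(0)
restored = DocumentFrequencies().load(buffer)
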
Example #2
import logging
import os
import shutil

import numpy
import tensorflow as tf

# DocumentFrequencies, Cooccurrences, extract_coocc_matrix, progress_bar, _int64s and
# _floats are provided by the surrounding project; their import paths are not shown here.


def id2vec_preprocess(args):
    """
    Loads co-occurrence matrices for several repositories and generates the
    document frequencies and the Swivel protobuf dataset.

    :param args: :class:`argparse.Namespace` with "input", "vocabulary_size", \
                 "shard_size", "docfreq_in", "log_level" and "output".
    :return: None
    """
    log = logging.getLogger("preproc")
    log.info("Loading docfreq model from %s", args.docfreq_in)
    df_model = DocumentFrequencies(log_level=args.log_level).load(
        source=args.docfreq_in)
    coocc_model = Cooccurrences().load(args.input)
    if numpy.any(coocc_model.matrix.data < 0):
        raise ValueError(
            ("Co-occurrence matrix %s contains negative elements. "
             "Please check its correctness.") % args.input)
    if numpy.any(numpy.isnan(coocc_model.matrix.data)):
        raise ValueError(("Co-occurrence matrix %s contains NaN elements. "
                          "Please check its correctness.") % args.input)

    try:
        df_meta = coocc_model.get_dep(DocumentFrequencies.NAME)
        if df_model.meta != df_meta:
            raise ValueError((
                "Document frequency model you provided does not match dependency inside "
                "Cooccurrences model:\nargs.docfreq.meta:\n%s\ncoocc_model.get_dep"
                "(\"docfreq\")\n%s\n") % (df_model.meta, df_meta))
    except KeyError:
        pass  # There is no docfreq dependency

    vs = args.vocabulary_size
    if len(df_model) < vs:
        vs = len(df_model)
    sz = args.shard_size
    if vs < sz:
        raise ValueError(
            "vocabulary_size=%s is less than shard_size=%s. You should specify a smaller "
            "shard_size (e.g. shard_size=%s)." % (vs, sz, vs))
    vs -= vs % sz
    log.info("Effective vocabulary size: %d", vs)
    df_model = df_model.greatest(vs)
    log.info("Sorting the vocabulary...")
    chosen_words = sorted(df_model.tokens())
    word_indices = {w: i for i, w in enumerate(chosen_words)}

    os.makedirs(args.output, exist_ok=True)
    with open(os.path.join(args.output, "row_vocab.txt"), "w") as out:
        out.write('\n'.join(chosen_words))
    log.info("Saved row_vocab.txt")
    shutil.copyfile(os.path.join(args.output, "row_vocab.txt"),
                    os.path.join(args.output, "col_vocab.txt"))
    log.info("Saved col_vocab.txt")
    del chosen_words

    ccmatrix = extract_coocc_matrix((vs, vs), word_indices, coocc_model)

    log.info("Planning the sharding...")
    # Number of non-zero entries in each row of the co-occurrence matrix.
    bool_sums = ccmatrix.indptr[1:] - ccmatrix.indptr[:-1]
    # Order the rows from densest to sparsest.
    reorder = numpy.argsort(-bool_sums)
    with open(os.path.join(args.output, "row_sums.txt"), "w") as out:
        out.write('\n'.join(map(str, bool_sums.tolist())))
    log.info("Saved row_sums.txt")
    shutil.copyfile(os.path.join(args.output, "row_sums.txt"),
                    os.path.join(args.output, "col_sums.txt"))
    log.info("Saved col_sums.txt")

    log.info("Writing the shards...")
    os.makedirs(args.output, exist_ok=True)
    nshards = vs // args.shard_size
    for row in progress_bar(range(nshards), log, expected_size=nshards):
        for col in range(nshards):
            # Interleave the reordered indices so that every shard receives a mix of
            # dense and sparse rows/columns.
            indices_row = reorder[row::nshards]
            indices_col = reorder[col::nshards]
            shard = ccmatrix[indices_row][:, indices_col].tocoo()

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)
                }))

            with open(
                    os.path.join(args.output,
                                 "shard-%03d-%03d.pb" % (row, col)),
                    "wb") as out:
                out.write(example.SerializeToString())
    log.info("Success")