import unittest
from io import BytesIO

# DocumentFrequencies and the `paths` test fixtures are assumed to be imported
# from the surrounding project package.


class DocumentFrequenciesTests(unittest.TestCase):
    def setUp(self):
        self.model = DocumentFrequencies().load(source=paths.DOCFREQ)

    def test_docs(self):
        docs = self.model.docs
        self.assertIsInstance(docs, int)
        self.assertEqual(docs, 1000)

    def test_get(self):
        self.assertEqual(self.model["aaaaaaa"], 341)
        with self.assertRaises(KeyError):
            print(self.model["xaaaaaa"])
        self.assertEqual(self.model.get("aaaaaaa", 0), 341)
        self.assertEqual(self.model.get("xaaaaaa", 100500), 100500)

    def test_tokens(self):
        self.assertEqual(list(self.model._df), self.model.tokens())

    def test_len(self):
        # the remaining 18 tokens are not unique - the model was generated badly
        self.assertEqual(len(self.model), 982)

    def test_iter(self):
        aaa = False
        for tok, freq in self.model:
            if "aaaaaaa" in tok:
                aaa = True
                int(freq)  # the frequency must be convertible to int
                break
        self.assertTrue(aaa)

    def test_prune(self):
        pruned = self.model.prune(4)
        for tok, freq in pruned:
            self.assertGreaterEqual(freq, 4)
        self.assertEqual(len(pruned), 346)

    def test_prune_self(self):
        # prune(1) does not remove anything, so the very same object must be returned
        pruned = self.model.prune(1)
        self.assertIs(self.model, pruned)

    def test_greatest(self):
        pruned = self.model.greatest(100)
        freqs = sorted(self.model._df.values(), reverse=True)
        border = freqs[100]
        for v in pruned._df.values():
            self.assertGreaterEqual(v, border)
        df1 = pruned._df
        df2 = self.model.greatest(100)._df
        self.assertEqual(df1, df2)

    def test_write(self):
        buffer = BytesIO()
        self.model.save(buffer)
        buffer.seek(0)
        new_model = DocumentFrequencies().load(buffer)
        self.assertEqual(self.model._df, new_model._df)
        self.assertEqual(self.model.docs, new_model.docs)
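# The tests above pin down two DocumentFrequencies operations: prune(n) keeps
# only the tokens whose document frequency is at least n, and greatest(k) keeps
# the k most frequent tokens. Below is a minimal sketch of the same semantics
# over a plain dict, assuming the model stores {token: frequency} in _df;
# prune_df and greatest_df are hypothetical names, not part of the project API.
import heapq


def prune_df(df, threshold):
    # Drop every token seen in fewer than `threshold` documents.
    return {tok: freq for tok, freq in df.items() if freq >= threshold}


def greatest_df(df, k):
    # Keep the k tokens with the highest document frequency.
    return dict(heapq.nlargest(k, df.items(), key=lambda item: item[1]))


# For example: greatest_df({"a": 5, "b": 2, "c": 9}, 2) == {"c": 9, "a": 5}.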
import logging
import os
import shutil

import numpy
import tensorflow as tf

# Cooccurrences, DocumentFrequencies, extract_coocc_matrix, progress_bar,
# _int64s and _floats are assumed to come from the surrounding project modules.


def id2vec_preprocess(args):
    """
    Loads co-occurrence matrices for several repositories and generates the
    document frequencies and the Swivel protobuf dataset.

    :param args: :class:`argparse.Namespace` with "input", "vocabulary_size", \
        "shard_size", "docfreq_in", "log_level" and "output".
    :return: None
    """
    log = logging.getLogger("preproc")
    log.info("Loading docfreq model from %s", args.docfreq_in)
    df_model = DocumentFrequencies(log_level=args.log_level).load(source=args.docfreq_in)
    coocc_model = Cooccurrences().load(args.input)
    if numpy.any(coocc_model.matrix.data < 0):
        raise ValueError("Co-occurrence matrix %s contains negative elements. "
                         "Please check its correctness." % args.input)
    if numpy.any(numpy.isnan(coocc_model.matrix.data)):
        raise ValueError("Co-occurrence matrix %s contains NaN elements. "
                         "Please check its correctness." % args.input)
    try:
        df_meta = coocc_model.get_dep(DocumentFrequencies.NAME)
        if df_model.meta != df_meta:
            raise ValueError(
                "The document frequencies model you provided does not match the dependency "
                "inside the Cooccurrences model:\nargs.docfreq.meta:\n%s\n"
                "coocc_model.get_dep(\"docfreq\"):\n%s\n" % (df_model.meta, df_meta))
    except KeyError:
        pass  # there is no docfreq dependency
    # Clip the vocabulary size to the docfreq size and align it to shard_size.
    vs = args.vocabulary_size
    if len(df_model) < vs:
        vs = len(df_model)
    sz = args.shard_size
    if vs < sz:
        raise ValueError(
            "vocabulary_size=%s is less than shard_size=%s. You should specify a smaller "
            "shard_size (e.g. shard_size=%s)." % (vs, sz, vs))
    vs -= vs % sz
    log.info("Effective vocabulary size: %d", vs)
    df_model = df_model.greatest(vs)
    log.info("Sorting the vocabulary...")
    chosen_words = sorted(df_model.tokens())
    word_indices = {w: i for i, w in enumerate(chosen_words)}
    os.makedirs(args.output, exist_ok=True)
    with open(os.path.join(args.output, "row_vocab.txt"), "w") as out:
        out.write('\n'.join(chosen_words))
    log.info("Saved row_vocab.txt")
    # The row and column vocabularies of the co-occurrence matrix are identical.
    shutil.copyfile(os.path.join(args.output, "row_vocab.txt"),
                    os.path.join(args.output, "col_vocab.txt"))
    log.info("Saved col_vocab.txt")
    del chosen_words
    ccmatrix = extract_coocc_matrix((vs, vs), word_indices, coocc_model)
    log.info("Planning the sharding...")
    # Number of nonzeros in each CSR row; rows are reordered by decreasing density.
    bool_sums = ccmatrix.indptr[1:] - ccmatrix.indptr[:-1]
    reorder = numpy.argsort(-bool_sums)
    with open(os.path.join(args.output, "row_sums.txt"), "w") as out:
        out.write('\n'.join(map(str, bool_sums.tolist())))
    log.info("Saved row_sums.txt")
    shutil.copyfile(os.path.join(args.output, "row_sums.txt"),
                    os.path.join(args.output, "col_sums.txt"))
    log.info("Saved col_sums.txt")
    log.info("Writing the shards...")
    nshards = vs // args.shard_size
    for row in progress_bar(range(nshards), log, expected_size=nshards):
        for col in range(nshards):
            # Strided (round-robin) sharding: shard (row, col) takes every
            # nshards-th row and column of the density-sorted matrix.
            indices_row = reorder[row::nshards]
            indices_col = reorder[col::nshards]
            shard = ccmatrix[indices_row][:, indices_col].tocoo()
            example = tf.train.Example(features=tf.train.Features(feature={
                "global_row": _int64s(indices_row),
                "global_col": _int64s(indices_col),
                "sparse_local_row": _int64s(shard.row),
                "sparse_local_col": _int64s(shard.col),
                "sparse_value": _floats(shard.data)}))
            with open(os.path.join(args.output,
                                   "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
                out.write(example.SerializeToString())
    log.info("Success")
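# A usage sketch: id2vec_preprocess() takes an argparse.Namespace, normally
# filled in by the CLI parser, but one can be constructed by hand as well.
# The file names and sizes below are illustrative assumptions, not project
# defaults.
import argparse
import logging

args = argparse.Namespace(
    input="coocc.asdf",          # Cooccurrences model produced upstream
    docfreq_in="docfreq.asdf",   # DocumentFrequencies model
    vocabulary_size=10000,       # clipped and aligned to shard_size inside
    shard_size=2500,             # 10000 % 2500 == 0, so nothing is clipped
    log_level=logging.INFO,
    output="swivel_dataset",     # directory for the vocabularies, sums and shards
)
id2vec_preprocess(args)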