"""
Reads txt files of all papers and computes tfidf vectors for all papers.
Dumps results to file tfidf.p
"""
import pickle
from pathlib import Path
from random import shuffle, seed

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

import console
from utils import Config, safe_pickle_dump

seed(1337)  # fixed seed so the training-document shuffle below is reproducible
max_train = 5000  # max number of tfidf training documents (chosen randomly), for memory efficiency
max_features = 1000  # cap on the size of the tfidf vocabulary
def get_valid_papers(root="data/txt", ext=".pdf.txt", min_size_bytes=1000, max_size_bytes=200000):
    """Get a list of valid papers / pids.

    Args:
        root: directory to search for paper text files
        ext: filename suffix to filter by
        min_size_bytes: skip files smaller than this
        max_size_bytes: skip files larger than this (filters out e.g. theses)

    Returns a 2-tuple: list of valid paper text paths, along with a list of their pids.
    """
    txt_paths, pids = [], []
    txt_paths_all = sorted(Path(root).glob("*{}".format(ext)))
    progress_bar = tqdm(txt_paths_all)
    for txt_path in progress_bar:
        # the pid is the filename minus the extension (an arxiv "idvv" string: id plus version)
        paper_id = txt_path.name.replace(ext, "")
        txt_size_bytes = txt_path.stat().st_size
        if min_size_bytes < txt_size_bytes < max_size_bytes:  # filter out the theses
            txt_paths.append(str(txt_path))
            pids.append(paper_id)
        else:
            progress_bar.set_description(
                "skipped %s with %d bytes" % (paper_id, txt_size_bytes)
            )
    print(
        "in total read in %d text files out of %d possible." % (len(txt_paths), len(txt_paths_all))
    )
    return txt_paths, pids
print("getting valid papers")
console.time("get valid papers")
txt_paths, pids = get_valid_papers()
console.time_end("get valid papers")
# compute tfidf vectors with scikit-learn
v = TfidfVectorizer(
    input="content",
    encoding="utf-8",
    decode_error="replace",
    strip_accents="ascii",  # DO NOT USE "unicode"; it is very slow
    lowercase=True,
    analyzer="word",
    stop_words="english",
    token_pattern=r"(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b",
    ngram_range=(1, 2),
    max_features=max_features,
    norm="l2",
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_df=0.5,
    min_df=1,
    dtype=np.float32,
)
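# with max_features=1000 and ngram_range=(1, 2) the vocabulary is capped at the 1000 most
# frequent unigrams/bigrams seen during fit; sublinear_tf replaces raw counts with 1 + log(tf),
# and max_df=0.5 drops any term that appears in more than half of the training documents.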
# generator over the document texts (read lazily, one file at a time) to conserve memory
def make_corpus(paths, max_chars=None):
    total = 0
    for p in paths:
        with open(p, "r") as f:
            txt = f.read()
        total += len(txt)
        if max_chars is not None and total > max_chars:
            print("stopping corpus generation; we have enough")
            break
        # print("corpus has", total, "chars")
        yield txt
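# note: make_corpus returns a one-shot generator; each call re-opens the files, so we build one
# generator for fitting the vectorizer and a separate one for the full transform below.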
# train
train_txt_paths = list(txt_paths) # duplicate
shuffle(train_txt_paths) # shuffle
train_txt_paths = train_txt_paths[: min(len(train_txt_paths), max_train)] # crop
print("training on %d documents..." % (len(train_txt_paths),))
train_corpus = make_corpus(train_txt_paths, max_chars=1e6)
print("created train corpus")
# (the OOM killer used to trigger around here; hence the max_chars cap on the train corpus above)
console.time("fitting vectorizer")
v.fit(train_corpus)
console.time_end("fitting vectorizer")
# transform
print("transforming %d documents..." % (len(txt_paths),))
corpus = make_corpus(txt_paths)
print("created full corpus")
print("vectorizing full corpus")
# NOTE: this transform is very slow and CPU-bound.
# ideas to speed it up:
# * compress the text on disk; right now it is stored raw
# * make reading and processing concurrent (e.g. a torch DataLoader or the thread-based sketch
#   below); pure reads only take ~2 min instead of ~20
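# a minimal sketch of that concurrency idea (an assumption, not wired in): read files in a few
# background threads with a bounded prefetch buffer, while transform() keeps the main thread busy.
#
#   from collections import deque
#   from concurrent.futures import ThreadPoolExecutor
#
#   def make_corpus_threaded(paths, workers=4, prefetch=16):
#       def read_one(p):
#           with open(p, "r") as f:
#               return f.read()
#       # keep at most `prefetch` texts in flight so memory stays bounded
#       with ThreadPoolExecutor(max_workers=workers) as ex:
#           pending = deque()
#           for p in paths:
#               pending.append(ex.submit(read_one, p))
#               if len(pending) >= prefetch:
#                   yield pending.popleft().result()
#           while pending:
#               yield pending.popleft().result()
#
# usage would then look like: X = v.transform(tqdm(make_corpus_threaded(txt_paths), total=len(txt_paths)))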
recompute_transform = True
if recompute_transform:
    console.time("vectorize full corpus")
    X = v.transform(tqdm(corpus, total=len(txt_paths)))
    console.time_end("vectorize full corpus")
    print(X.shape)
    # write the full (sparse) matrix out
    out = {}
    out["X"] = X  # this one is heavy!
    print("writing", Config.tfidf_path)
    safe_pickle_dump(out, Config.tfidf_path)
else:
    print("loading cached sparse matrix")
    with open(Config.tfidf_path, "rb") as f:
        X = pickle.load(f)["X"]
# write the lighter metadata into a separate (smaller) file
out = {}
out["vocab"] = v.vocabulary_
out["idf"] = v._tfidf.idf_
out["pids"] = pids # a full idvv string (id and version number)
out["ptoi"] = {x: i for i, x in enumerate(pids)} # pid to ix in X mapping
print("writing", Config.meta_path)
safe_pickle_dump(out, Config.meta_path)
print("precomputing nearest neighbor queries in batches...")
def precompute_sim_dict(X, pids, batch_size=128, k=50):
    """For every paper, precompute the pids of its k most similar papers (by tfidf dot product)."""
    sim_dict = {}
    # TODO: try ordering here
    X = X.todense()  # originally it's a sparse matrix; this materializes an N x max_features dense array
    for i in tqdm(range(0, len(pids), batch_size)):
        i1 = min(len(pids), i + batch_size)
        xquery = X[i:i1]  # BxD
        # negate the dot products so that the most similar papers have the smallest values
        ds = -np.asarray(np.dot(X, xquery.T))  # NxD * DxB => NxB
        # take the k most similar papers per query (unordered), then sort those k by similarity
        IX = np.argpartition(ds, k, 0)[:k]
        IX = np.take_along_axis(IX, np.argsort(np.take_along_axis(ds, IX, 0), 0), 0)
        for j in range(i1 - i):
            sim_dict[pids[i + j]] = [pids[q] for q in IX[:, j]]
        del ds
        del IX
    return sim_dict
sim_dict = precompute_sim_dict(X, pids)
print("writing", Config.sim_path)
safe_pickle_dump(sim_dict, Config.sim_path)
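# quick sanity check (a hypothetical snippet, not part of the pipeline): reload the sim file and
# print the neighbors of the first paper:
#   sim = pickle.load(open(Config.sim_path, "rb"))
#   print(pids[0], "->", sim[pids[0]][:5])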