"""
Reads txt files of all papers and computes tfidf vectors for all papers.
Dumps results to file tfidf.p
"""
import pickle
from pathlib import Path
from random import shuffle, seed

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

import console
from utils import Config, safe_pickle_dump

seed(1337)  # fixed seed so the training-document shuffle below is reproducible
max_train = 5000  # max number of tfidf training documents (chosen randomly), for memory efficiency
max_features = 1000  # cap on the size of the tfidf vocabulary
def get_valid_papers(root="data/txt", ext=".pdf.txt", min_size_bytes=1000, max_size_bytes=200000):
    """Get a list of valid papers / pids.

    Args:
        root: directory to search for paper text files
        ext: filename suffix to filter by
        min_size_bytes: skip files smaller than this
        max_size_bytes: skip files larger than this (filters out e.g. theses)

    Returns a 2-tuple: list of valid paper text paths, along with a list of their pids.
    """
    txt_paths, pids = [], []
    txt_paths_all = sorted(Path(root).glob("*{}".format(ext)))
    progress_bar = tqdm(txt_paths_all)
    for txt_path in progress_bar:
        # the pid is the filename minus the extension (an arxiv "idvv" string: id plus version)
        paper_id = txt_path.name.replace(ext, "")
        txt_size_bytes = txt_path.stat().st_size
        if min_size_bytes < txt_size_bytes < max_size_bytes:  # filter out the theses
            txt_paths.append(str(txt_path))
            pids.append(paper_id)
        else:
            progress_bar.set_description(
                "skipped %s with %d bytes" % (paper_id, txt_size_bytes)
            )
    print(
        "in total read in %d text files out of %d possible." % (len(txt_paths), len(txt_paths_all))
    )
    return txt_paths, pids
print("getting valid papers")
console.time("get valid papers")
txt_paths, pids = get_valid_papers()
console.time_end("get valid papers")
# compute tfidf vectors with scikit-learn
v = TfidfVectorizer(
    input="content",
    encoding="utf-8",
    decode_error="replace",
    strip_accents="ascii",  # DO NOT USE "unicode"; it is very slow
    lowercase=True,
    analyzer="word",
    stop_words="english",
    token_pattern=r"(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b",
    ngram_range=(1, 2),
    max_features=max_features,
    norm="l2",
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_df=0.5,
    min_df=1,
    dtype=np.float32,
)
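# with max_features=1000 and ngram_range=(1, 2) the vocabulary is capped at the 1000 most
# frequent unigrams/bigrams seen during fit; sublinear_tf replaces raw counts with 1 + log(tf),
# and max_df=0.5 drops any term that appears in more than half of the training documents.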
# generator over the document texts (read lazily, one file at a time) to conserve memory
def make_corpus(paths, max_chars=None):
    total = 0
    for p in paths:
        with open(p, "r") as f:
            txt = f.read()
        total += len(txt)
        if max_chars is not None and total > max_chars:
            print("stopping corpus generation; we have enough")
            break
        # print("corpus has", total, "chars")
        yield txt
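# note: make_corpus returns a one-shot generator; each call re-opens the files, so we build one
# generator for fitting the vectorizer and a separate one for the full transform below.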
# train
train_txt_paths = list(txt_paths) # duplicate
shuffle(train_txt_paths) # shuffle
train_txt_paths = train_txt_paths[: min(len(train_txt_paths), max_train)] # crop
print("training on %d documents..." % (len(train_txt_paths),))
train_corpus = make_corpus(train_txt_paths, max_chars=1e6)
print("created train corpus")
# (the OOM killer used to trigger around here; hence the max_chars cap on the train corpus above)
console.time("fitting vectorizer")
v.fit(train_corpus)
console.time_end("fitting vectorizer")
# transform
print("transforming %d documents..." % (len(txt_paths),))
corpus = make_corpus(txt_paths)
print("created full corpus")
print("vectorizing full corpus")
# NOTE: this transform is very slow and CPU-bound.
# ideas to speed it up:
# * compress the text on disk; right now it is stored raw
# * make reading and processing concurrent (e.g. a torch DataLoader or the thread-based sketch
#   below); pure reads only take ~2 min instead of ~20
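# a minimal sketch of that concurrency idea (an assumption, not wired in): read files in a few
# background threads with a bounded prefetch buffer, while transform() keeps the main thread busy.
#
#   from collections import deque
#   from concurrent.futures import ThreadPoolExecutor
#
#   def make_corpus_threaded(paths, workers=4, prefetch=16):
#       def read_one(p):
#           with open(p, "r") as f:
#               return f.read()
#       # keep at most `prefetch` texts in flight so memory stays bounded
#       with ThreadPoolExecutor(max_workers=workers) as ex:
#           pending = deque()
#           for p in paths:
#               pending.append(ex.submit(read_one, p))
#               if len(pending) >= prefetch:
#                   yield pending.popleft().result()
#           while pending:
#               yield pending.popleft().result()
#
# usage would then look like: X = v.transform(tqdm(make_corpus_threaded(txt_paths), total=len(txt_paths)))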
recompute_transform = True
if recompute_transform:
    console.time("vectorize full corpus")
    X = v.transform(tqdm(corpus, total=len(txt_paths)))
    console.time_end("vectorize full corpus")
    print(X.shape)
    # write the full (sparse) matrix out
    out = {}
    out["X"] = X  # this one is heavy!
    print("writing", Config.tfidf_path)
    safe_pickle_dump(out, Config.tfidf_path)
else:
    print("loading cached sparse matrix")
    with open(Config.tfidf_path, "rb") as f:
        X = pickle.load(f)["X"]
# write the lighter metadata into a separate (smaller) file
out = {}
out["vocab"] = v.vocabulary_
out["idf"] = v._tfidf.idf_
out["pids"] = pids # a full idvv string (id and version number)
out["ptoi"] = {x: i for i, x in enumerate(pids)} # pid to ix in X mapping
print("writing", Config.meta_path)
safe_pickle_dump(out, Config.meta_path)
print("precomputing nearest neighbor queries in batches...")
def precompute_sim_dict(X, pids, batch_size=128, k=50):
    """For every paper, precompute the pids of its k most similar papers (by tfidf dot product)."""
    sim_dict = {}
    # TODO: try ordering here
    X = X.todense()  # originally it's a sparse matrix; this materializes an N x max_features dense array
    for i in tqdm(range(0, len(pids), batch_size)):
        i1 = min(len(pids), i + batch_size)
        xquery = X[i:i1]  # BxD
        # negate the dot products so that the most similar papers have the smallest values
        ds = -np.asarray(np.dot(X, xquery.T))  # NxD * DxB => NxB
        # take the k most similar papers per query (unordered), then sort those k by similarity
        IX = np.argpartition(ds, k, 0)[:k]
        IX = np.take_along_axis(IX, np.argsort(np.take_along_axis(ds, IX, 0), 0), 0)
        for j in range(i1 - i):
            sim_dict[pids[i + j]] = [pids[q] for q in IX[:, j]]
        del ds
        del IX
    return sim_dict
sim_dict = precompute_sim_dict(X, pids)
print("writing", Config.sim_path)
safe_pickle_dump(sim_dict, Config.sim_path)
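# quick sanity check (a hypothetical snippet, not part of the pipeline): reload the sim file and
# print the neighbors of the first paper:
#   sim = pickle.load(open(Config.sim_path, "rb"))
#   print(pids[0], "->", sim[pids[0]][:5])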