import pickle
from collections import Counter, defaultdict
from glob import glob
import numpy as np
from tqdm import tqdm

from utils import stop_words, flatten, load_docs

doc_collections = glob('../data/data/wiki-pages/*.jsonl')

with open("../pickle_jar/idf.pckl", 'rb') as f:
    idf = pickle.load(f)

with open("../pickle_jar/claims_vocab.pckl", 'rb') as f:
    claims_vocab = pickle.load(f)

# For each wiki-pages shard, compute per-document term frequencies,
# restricted to words that appear in the claims vocabulary.
for doc_collection in tqdm(doc_collections):
    tfs = {}
    for document in load_docs([doc_collection]):
        tf = defaultdict(float)
        doc_words = flatten(document.sentences)
        num_words = 0
        for word in doc_words:
            if word in claims_vocab:
                num_words += 1
                tf[word] += 1
        # Normalize raw counts to relative frequencies, guarding against
        # documents with no in-vocabulary words.
        if num_words > 0:
            for word in tf:
                tf[word] /= num_words
        tfs[document.id] = (tf, num_words)
    with open(doc_collection + "_tf.pckl", 'wb') as f:
        pickle.dump(tfs, f)
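
# A minimal follow-up sketch, assuming the pickles above: combine each shard's
# "*_tf.pckl" file with the idf table to TF-IDF-score documents against a
# tokenized claim. The claim tokens below are hypothetical placeholders.
claim_tokens = ["hypothetical", "claim", "tokens"]

scores = {}
for tf_path in glob('../data/data/wiki-pages/*_tf.pckl'):
    with open(tf_path, 'rb') as f:
        tfs = pickle.load(f)
    # Score each document by summing tf * idf over the claim's tokens.
    for doc_id, (tf, num_words) in tfs.items():
        scores[doc_id] = sum(tf.get(tok, 0.0) * idf.get(tok, 0.0) for tok in claim_tokens)

# Top-5 candidate documents for the claim.
top_docs = sorted(scores, key=scores.get, reverse=True)[:5]
print(top_docs)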
Example #2
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import config.config as cfg
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from utils import load_docs

docs = load_docs(cfg.vals["clean_data_dir"] + "doc-list.txt")
# Wrap each document in a TaggedDocument so Doc2Vec can look its vector up by tag.
documents = [TaggedDocument(doc, [i]) for i, doc in docs.items()]
print("training model...")
model = Doc2Vec(documents,
                vector_size=cfg.vals['hidden_size'],
                window=2,
                min_count=1,
                workers=4)

print("writing output...")
# Write one line per document: the tag, a tab, then the vector components separated by spaces.
with open(cfg.vals["output_dir"] + "doc2vec.txt", 'w') as f:
    for d in documents:
        idx = d.tags[0]
        vec = model.docvecs[idx]
        f.write("{}\t".format(idx))
        for component in vec:
            f.write("{} ".format(component))
        f.write("\n")

print("done")
Example #3
from utils import load_doc_embedding, load_docs

# change the filepaths to reference your local files
path_to_embedding = "/Volumes/Porter's Data/penn-state/data-sets/nlp-earnings-calls/output/starspace_docs.txt"
path_to_docs = "/Volumes/Porter's Data/penn-state/data-sets/nlp-earnings-calls/clean/raw-doc-list.txt"
limit = 10

# read embedding vectors and raw documents from disk;
# nrows limits loading to the first `limit` rows, just for quick inspection
embedding = load_doc_embedding(path_to_embedding, nrows=limit)
documents = load_docs(path_to_docs, nrows=limit)

for doc_id, embed_vec in embedding.items():
    raw_text = documents.get(doc_id, None)
    if raw_text is None:
        continue

    print(f"Document {doc_id}: {raw_text} {embed_vec}")
Example #4
import pickle
import numpy as np
from glob import glob
from collections import Counter

from utils import load_docs, stop_words, flatten

with open("./pickle_jar/claims_vocab.pckl", 'rb') as f:
    claims_vocab = pickle.load(f)

doc_collections = glob('./data/data/wiki-pages/*.jsonl')

frequencies = Counter()

# Count, for each word, the number of documents it appears in (document frequency),
# ignoring stop words.
number_of_documents = 0
for document in load_docs(doc_collections):
    number_of_documents += 1
    doc_words = set(flatten(document.sentences)) - stop_words
    frequencies.update(doc_words)

# Inverse document frequency: idf(w) = log(N / df(w)).
idf = {}
for word in frequencies:
    idf[word] = np.log(number_of_documents / frequencies[word])

with open("./pickle_jar/idf.pckl", 'wb') as f:
    pickle.dump(idf, f)
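
# A minimal follow-up sketch: reload the pickled idf table and inspect its
# extremes. Words that appear in nearly every document get idf near zero,
# while rare words get the largest values.
with open("./pickle_jar/idf.pckl", 'rb') as f:
    idf = pickle.load(f)

by_idf = sorted(idf.items(), key=lambda kv: kv[1])
print("most common words (lowest idf):", by_idf[:5])
print("rarest words (highest idf):", by_idf[-5:])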