import pickle
from collections import defaultdict
from glob import glob

from tqdm import tqdm

from utils import flatten, load_docs

doc_collections = glob('../data/data/wiki-pages/*.jsonl')

# inverse document frequencies (loaded here, but not used in this script)
with open("../pickle_jar/idf.pckl", 'rb') as f:
    idf = pickle.load(f)

# vocabulary of words appearing in the claims; only these words are counted
with open("../pickle_jar/claims_vocab.pckl", 'rb') as f:
    claims_vocab = pickle.load(f)

for doc_collection in tqdm(doc_collections):
    tfs = {}
    for document in load_docs([doc_collection]):
        # raw term counts, restricted to the claims vocabulary
        tf = defaultdict(float)
        doc_words = flatten(document.sentences)
        num_words = 0
        for word in doc_words:
            if word in claims_vocab:
                num_words += 1
                tf[word] += 1
        # normalize counts to relative term frequencies,
        # guarding against documents with no in-vocabulary words
        if num_words > 0:
            for word in tf:
                tf[word] /= num_words
        tfs[document.id] = (tf, num_words)

    # one term-frequency pickle per wiki-pages collection
    with open(doc_collection + "_tf.pckl", 'wb') as f:
        pickle.dump(tfs, f)
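# --- Sketch: combining the TF pickles with the IDF pickle at retrieval time.
# This is a minimal, hypothetical example assuming the usual tf * idf product;
# the collection filename and claim tokens below are placeholders, and
# idf.pckl is produced by the IDF script at the end of this section.
import pickle

with open("../pickle_jar/idf.pckl", 'rb') as f:
    idf = pickle.load(f)

# one of the *_tf.pckl files written above (placeholder filename)
with open("../data/data/wiki-pages/wiki-001.jsonl_tf.pckl", 'rb') as f:
    tfs = pickle.load(f)

claim_words = {"example", "claim", "tokens"}  # placeholder claim tokens

# score each document by summing tf * idf over the claim's words
scores = {doc_id: sum(tf[w] * idf.get(w, 0.0) for w in claim_words if w in tf)
          for doc_id, (tf, num_words) in tfs.items()}

best_doc = max(scores, key=scores.get)
print(best_doc, scores[best_doc])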
import os
import sys

# make the project root importable so config/ and utils resolve
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import config.config as cfg
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from utils import load_docs

# load_docs returns a mapping of document id -> token list
docs = load_docs(cfg.vals["clean_data_dir"] + "doc-list.txt")
documents = [TaggedDocument(doc, [i]) for i, doc in docs.items()]

print("training model...")
model = Doc2Vec(documents, vector_size=cfg.vals['hidden_size'],
                window=2, min_count=1, workers=4)

print("writing output...")
with open(cfg.vals["output_dir"] + "doc2vec.txt", 'w') as f:
    # one line per document: id, tab, space-separated vector components
    for d in documents:
        idx = d.tags[0]
        vec = model.docvecs[idx]
        f.write("{}\t".format(idx))
        for component in vec:
            f.write("{} ".format(component))
        f.write("\n")
print("done")
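# --- Sketch: using the trained model on an unseen document. infer_vector is
# gensim's API for embedding new text; the token list here is a made-up
# example, and docvecs.most_similar follows the gensim 3.x interface used above.
new_doc = ["revenue", "grew", "in", "the", "third", "quarter"]
new_vec = model.infer_vector(new_doc)

# nearest training documents by cosine similarity of document vectors
print(model.docvecs.most_similar([new_vec], topn=5))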
from utils import load_doc_embedding, load_docs

# change the file paths to reference your local files
path_to_embedding = "/Volumes/Porter's Data/penn-state/data-sets/nlp-earnings-calls/output/starspace_docs.txt"
path_to_docs = "/Volumes/Porter's Data/penn-state/data-sets/nlp-earnings-calls/clean/raw-doc-list.txt"
limit = 10

# read embedding vectors from disk
# nrows limits loading to the first k rows; just for quick inspection
embedding = load_doc_embedding(path_to_embedding, nrows=limit)
documents = load_docs(path_to_docs, nrows=limit)

# print each embedded document alongside its raw text
for doc_id, embed_vec in embedding.items():
    raw_text = documents.get(doc_id, None)
    if raw_text is None:
        continue
    print(f"Document {doc_id}: {raw_text} {embed_vec}")
import pickle
from collections import Counter
from glob import glob

import numpy as np

from utils import flatten, load_docs, stop_words

# claims vocabulary (loaded here, but not used in this script)
with open("./pickle_jar/claims_vocab.pckl", 'rb') as f:
    claims_vocab = pickle.load(f)

doc_collections = glob('./data/data/wiki-pages/*.jsonl')

# document frequency: the number of documents each word appears in
frequencies = Counter()
number_of_documents = 0
for document in load_docs(doc_collections):
    number_of_documents += 1
    # count each word at most once per document and drop stop words
    doc_words = set(flatten(document.sentences)) - stop_words
    frequencies.update(doc_words)

# inverse document frequency: idf(w) = log(N / df(w))
idf = {}
for word in frequencies:
    idf[word] = np.log(number_of_documents / frequencies[word])

with open("./pickle_jar/idf.pckl", 'wb') as f:
    pickle.dump(idf, f)