def get_word_count(input_file, filter_stopwords=True, ngrams=1, bigram_dict=None, words_func=None):
    """Count token occurrences over every document in *input_file*.

    Each record's "text" field is tokenized via *words_func* (which
    receives the ngrams / stopword / bigram options), and a single
    frequency table over the whole corpus is returned as a
    defaultdict(int) mapping token -> count.
    """
    counts = collections.defaultdict(int)
    for record in utils.read_json_list(input_file):
        # One tokenizer call per document; tally tokens into the shared table.
        for token in words_func(record["text"],
                                ngrams=ngrams,
                                filter_stopwords=filter_stopwords,
                                bigram_dict=bigram_dict):
            counts[token] += 1
    return counts
def load_doc_topics(input_file, doc_topic_file, threshold=0.01):
    """Load topics in each document.

    Reads documents from *input_file* in lockstep with lines from
    *doc_topic_file*; columns 2+ of each topic line are per-topic
    weights, and a topic index is kept when its weight exceeds
    *threshold*. Stops early if the topic file runs out of lines.
    Returns a list of utils.IdeaArticle records.
    """
    articles = []
    with open(doc_topic_file) as tfin:
        for record in utils.read_json_list(input_file):
            line = tfin.readline()
            if not line:
                # Topic file exhausted before the article list.
                break
            weights = line.strip().split()[2:]
            active = {idx for idx, w in enumerate(weights) if float(w) > threshold}
            articles.append(
                utils.IdeaArticle(fulldate=int(record["date"]), ideas=active))
    return articles
def convert_word_count_mallet(word_dict, input_file, output_file, words_func=None):
    """Write documents to *output_file* in MALLET-ready form.

    Each output line is "<doc_id> <date> <wid wid ...>" where every
    word id from *word_dict* is repeated once per occurrence in the
    document, in ascending id order. Tokens absent from *word_dict*
    are dropped. Document ids start at 1.
    """
    with open(output_file, "w") as fout:
        for doc_id, record in enumerate(utils.read_json_list(input_file), start=1):
            freq = collections.Counter(words_func(record["text"]))
            # Map surviving tokens to (word_id, count), ordered by id.
            indexed = sorted(
                (word_dict[tok], cnt) for tok, cnt in freq.items() if tok in word_dict)
            repeated = [" ".join([str(wid)] * cnt) for wid, cnt in indexed]
            fout.write("%s %s %s\n" % (doc_id, record["date"], " ".join(repeated)))
def load_word_articles(input_file, vocab_file, data_dir, vocab_size=100):
    """Load each article as the set of vocabulary word ids it contains.

    Builds a vocab of size *vocab_size* from *vocab_file*, tokenizes
    every document with the bigram-aware tokenizer (bigram phrases are
    read from "<data_dir>/bigram_phrases.txt"), and keeps only tokens
    present in the vocab. Returns (articles, word_set, word_map) where
    word_set maps word -> id and word_map is the reverse mapping.
    """
    word_map = utils.read_word_dict(vocab_file, vocab_size=vocab_size)
    word_set = utils.get_reverse_dict(word_map)
    bigram_dict = wc.load_bigrams("%s/bigram_phrases.txt" % data_dir)
    tokenize_doc = functools.partial(wc.get_mixed_tokens, bigram_dict=bigram_dict)
    articles = []
    for record in utils.read_json_list(input_file):
        # Keep only in-vocabulary tokens, deduplicated as a set of ids.
        idea_ids = {word_set[tok] for tok in tokenize_doc(record["text"]) if tok in word_set}
        articles.append(
            utils.IdeaArticle(fulldate=int(record["date"]), ideas=idea_ids))
    return articles, word_set, word_map
def convert_word_count_mallet(word_dict, input_file, output_file, words_func=None):
    """Write documents to *output_file* in MALLET-ready form, skipping
    the conversion entirely when the output file already exists.

    Each output line is "<doc_id> <date> <wid wid ...>" with every word
    id from *word_dict* repeated once per occurrence, sorted by id.
    Document ids start at 1; out-of-vocabulary tokens are dropped.
    """
    # Guard clause: don't redo work if a previous run produced the file.
    if os.path.exists(output_file):
        print("convert_word_count_mallet: output file found at: {}, skipping".format(output_file))
        return
    with open(output_file, "w") as fout:
        for doc_id, record in enumerate(utils.read_json_list(input_file), start=1):
            freq = collections.Counter(words_func(record["text"]))
            indexed = sorted(
                (word_dict[tok], cnt) for tok, cnt in freq.items() if tok in word_dict)
            repeated = [" ".join([str(wid)] * cnt) for wid, cnt in indexed]
            fout.write("%s %s %s\n" % (doc_id, record["date"], " ".join(repeated)))
def load_doc_topics(input_file, doc_topic_file, threshold=0.01):
    """Load topics in each document.

    Reads documents from *input_file* in lockstep with lines from
    *doc_topic_file* (one topic line per document). Columns 2+ of a
    topic line are per-topic weights; a topic's index is kept when its
    weight exceeds *threshold*.

    Args:
        input_file: JSON-list file of documents with a "date" field.
        doc_topic_file: whitespace-separated doc-topic weight file.
        threshold: minimum weight for a topic to count as present.

    Returns:
        List of utils.IdeaArticle(fulldate, ideas) records, where
        ideas is the set of active topic indices.
    """
    articles = []
    with open(doc_topic_file) as tfin:
        for data in utils.read_json_list(input_file):
            topic_line = tfin.readline()
            if not topic_line:
                # Topic file ran out before the documents did; stop here.
                break
            ideas = topic_line.strip().split()[2:]
            ideas = set(
                [i for (i, v) in enumerate(ideas) if float(v) > threshold])
            articles.append(
                utils.IdeaArticle(fulldate=int(data["date"]), ideas=ideas))
    return articles
def preprocess_input(input_file, output_file, func=tokenize):
    """Re-tokenize every document's text and write the corpus back out.

    Applies *func* to each record's "text" field, joins the resulting
    tokens with single spaces, and writes the updated records to
    *output_file* as a JSON list.
    """
    processed = []
    for record in utils.read_json_list(input_file):
        tokens = func(record["text"])
        record["text"] = " ".join(tokens)
        processed.append(record)
    utils.write_json_list(output_file, processed)