def create_dictionary(self): """ Utility method to generate gensim-style Dictionary directly from the corpus and vocabulary data. """ dictionary = Dictionary() # replace dfs with defaultdict to avoid downstream KeyErrors # uci vocabularies may contain terms that are not used in the document data dictionary.dfs = defaultdict(int) dictionary.id2token = self.id2word dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word)) dictionary.num_docs = self.num_docs dictionary.num_nnz = self.num_nnz for docno, doc in enumerate(self): if docno % 10000 == 0: logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs)) for word, count in doc: dictionary.dfs[word] += 1 dictionary.num_pos += count return dictionary
def create_dictionary(self): """Generate :class:`gensim.corpora.dictionary.Dictionary` directly from the corpus and vocabulary data. Return ------ :class:`gensim.corpora.dictionary.Dictionary` Dictionary, based on corpus. Examples -------- .. sourcecode:: pycon >>> from gensim.corpora.ucicorpus import UciCorpus >>> from gensim.test.utils import datapath >>> ucc = UciCorpus(datapath('testcorpus.uci')) >>> dictionary = ucc.create_dictionary() """ dictionary = Dictionary() # replace dfs with defaultdict to avoid downstream KeyErrors # uci vocabularies may contain terms that are not used in the document data dictionary.dfs = defaultdict(int) dictionary.id2token = self.id2word dictionary.token2id = utils.revdict(self.id2word) dictionary.num_docs = self.num_docs dictionary.num_nnz = self.num_nnz for docno, doc in enumerate(self): if docno % 10000 == 0: logger.info('PROGRESS: processing document %i of %i', docno, self.num_docs) for word, count in doc: dictionary.dfs[word] += 1 dictionary.num_pos += count return dictionary
f"{args.config}.{sp_key}.dic")) else: special_token_dict = {PADDING: 0, UNKNOWN: 1, START: 2, END: 3} word_dic = Dictionary() word_dic.token2id = special_token_dict #char_dic = Dictionary() #char_dic.token2id = special_token_dict sw_dicts = {} for sp_key, sp in sps.items(): _dic = Dictionary() _dic.token2id = special_token_dict sw_dicts[sp_key] = _dic label_dic = Dictionary(train_label_documents) label_dic.patch_with_special_tokens({PADDING: 0}) label_dic.id2token = { _id: label for label, _id in label_dic.token2id.items() } # add vocabulary word_dic.add_documents(train_word_documents) #char_dic.add_documents(list(chain.from_iterable(train_char_documents))) for sp_key, train_sw_documents in train_sw_documents_dicts.items(): sw_dicts[sp_key].add_documents(train_sw_documents) # load GloVe if config_dic.get("glove_path"): print("========= Load Pretrain Word Embeddings ==========") word2vec = load_pretrain_embeddings( config_dic.get("glove_path"), emb_dim=config_dic.get("word_emb_dim")) pretrain_embeddings = build_pretrain_embeddings(
import json
import string
from collections import Counter

import pandas as pd
import scipy.spatial.distance
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel
from nltk.corpus import stopwords
from tqdm import tqdm

# CustomTweetTokenizer is assumed to be this project's own tokenizer class


def abandon():
    stopWords = set(stopwords.words('english'))
    for w in string.punctuation:
        stopWords.add(w)
    stops_words = [
        "rt", "…", "...", "URL", "http", "https", "“", "”", "‘", "’",
        "get", "2", "new", "one", "i'm", "make", "go", "good", "say", "says",
        "know", "day", "..", "take", "got", "1", "going", "4", "3", "two",
        "n", "like", "via", "u", "would", "still", "first", "really", "watch",
        "see", "even", "that's", "look", "way", "last", "said", "let",
        "twitter", "ever", "always", "another", "many", "things", "may",
        "big", "come", "keep", "5", "time", "much", "want", "think", "us",
        "love", "people", "need"
    ]
    for w in stops_words:
        stopWords.add(w)

    tokenizer = CustomTweetTokenizer(preserve_case=False,
                                     reduce_len=True,
                                     strip_handles=False,
                                     normalize_usernames=False,
                                     normalize_urls=True,
                                     keep_allupper=False)
    cnt = Counter()
    texts = []
    # comm = json.load(open("data/louvain_rst.json"))
    # users_comm = {str(u) for u in comm if comm[u] == 0}
    # print(len(users_comm))

    # loading data
    data = pd.read_csv("data/ira-tweets-ele.csv",
                       usecols=["tweet_text", "userid"])
    for i, row in tqdm(data.iterrows()):
        # if row["userid"] not in users_comm:
        #     continue
        words = tokenizer.tokenize(row["tweet_text"])
        words = [w for w in words if w not in stopWords and w]
        # if words[0] == "RT":
        #     continue
        for w in words:
            cnt[w] += 1
        texts.append(words)
    print(len(texts))
    json.dump(cnt.most_common(), open("data/word_cloud.json", "w"), indent=2)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]

    def average_distance(v_tops):
        _sum = 0
        _cnt = 0
        for i in range(len(v_tops)):
            for j in range(i + 1, len(v_tops)):
                _sum += scipy.spatial.distance.cosine(v_tops[i], v_tops[j])
                _cnt += 1
        return _sum / _cnt

    with open("data/IRA_topics.txt", "w") as f:
        for n in range(2, 12):
            print(f"N = {n}")
            lda = LdaModel(corpus, num_topics=n, random_state=42)
            v_topics = lda.get_topics()
            lda.save(f"model/lda-ira-{n}.mod")
            # pprint(lda.print_topics())

            # a measure of how good the model is: lower is better
            f.write(f"Perplexity: {lda.log_perplexity(corpus)}\n")

            # compute the c_v coherence score; it needs the tokenized texts
            # and the dictionary, not the BoW corpus
            coherence_model_lda = CoherenceModel(model=lda,
                                                 texts=texts,
                                                 dictionary=dictionary,
                                                 coherence='c_v')
            coherence_lda = coherence_model_lda.get_coherence()
            f.write(f"Coherence Score: {coherence_lda}\n")
            f.write(f"~Average distance: {average_distance(v_topics)}\n")

            # show
            x = lda.show_topics(num_topics=n, num_words=20, formatted=False)
            topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
            dictionary.id2token = {
                v: k for k, v in dictionary.token2id.items()
            }

            # print topics and words
            for topic, words in topics_words:
                f.write(
                    str(topic) + " :: " +
                    str([dictionary.id2token[int(w)] for w in words]) + "\n")
            f.write("\n")
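# --- Hedged sketch (toy data, invented for illustration): the corrected coherence
# call from abandon() in isolation. 'c_v' coherence wants tokenized texts plus a
# dictionary, not the BoW corpus; scores on data this small are not meaningful.
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel

texts = [
    ["election", "vote", "poll", "campaign"],
    ["tweet", "retweet", "vote", "election"],
    ["poll", "campaign", "tweet", "vote"],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

lda = LdaModel(corpus, id2word=dictionary, num_topics=2, random_state=42)
cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
print(cm.get_coherence())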
f"{args.config}.{sp_key}.dic")) else: special_token_dict = {PADDING: 0, UNKNOWN: 1, START: 2, END: 3} word_dic = Dictionary() word_dic.token2id = special_token_dict char_dic = Dictionary() char_dic.token2id = special_token_dict sw_dicts = {} for sp_key, sp in sps.items(): _dic = Dictionary() _dic.token2id = special_token_dict sw_dicts[sp_key] = _dic label_dic = Dictionary(train_label_documents) label_dic.patch_with_special_tokens({PADDING: 0}) label_dic.id2token = { _id: label for label, _id in label_dic.token2id.items() } # add vocabulary word_dic.add_documents(train_word_documents) char_dic.add_documents(list(chain.from_iterable(train_char_documents))) for sp_key, train_sw_documents in train_sw_documents_dicts.items(): sw_dicts[sp_key].add_documents(train_sw_documents) # load GloVe if config_dic.get("glove_path"): print("============== Load Pretrain Word Embeddings ================") word2vec = load_pretrain_embeddings( config_dic.get("glove_path"), emb_dim=config_dic.get("word_emb_dim")) pretrain_embeddings = build_pretrain_embeddings(
import numpy as np
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer

## Vectorize the corpus
cv = CountVectorizer(stop_words="english",
                     min_df=5,
                     max_df=0.4,
                     max_features=5000,
                     ngram_range=(1, 1))
dtm = cv.fit_transform(corpus)
# note: scikit-learn >= 1.0 renames this to cv.get_feature_names_out()
features = np.array(cv.get_feature_names())
id2token = dict(zip(range(len(features)), features))
token2id = dict(zip(features, range(len(features))))

## Create a gensim dictionary
dictionary = Dictionary()
dictionary.id2token = id2token
dictionary.token2id = token2id

## Train LDA models with different count of topics
topic_counts = [20, 30, 40, 50, 70, 100, 120, 150]


def get_topn_words(lda_model, features, topn=20):
    topics = lda_model.components_
    topic_words = []
    for topic_num, topic_weights in enumerate(topics):
        top_words = topic_weights.argsort()[::-1][:topn]
        topic_words.append(list(features[top_words]))
    return topic_words
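# --- Hedged sketch continuing the fragment above (reuses its dtm and id2token): one
# way to feed the sklearn document-term matrix to gensim is
# gensim.matutils.Sparse2Corpus, which streams the scipy sparse matrix as a BoW
# corpus; num_topics=20 is an illustrative choice, not from the original.
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel

# CountVectorizer produces documents in rows, hence documents_columns=False
bow_corpus = Sparse2Corpus(dtm, documents_columns=False)
lda = LdaModel(bow_corpus, id2word=id2token, num_topics=20, random_state=42)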
        words = tokenizer.tokenize(line.strip())
        # if words[0] == "RT":
        #     continue
        texts.append(words)
print("loaded!")
# conn.close()

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
lda = LdaModel(corpus, num_topics=10)

x = lda.show_topics(num_topics=10, num_words=20, formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
dictionary.id2token = {v: k for k, v in dictionary.token2id.items()}

# Below Code Prints Topics and Words
for topic, words in topics_words:
    print(
        str(topic) + "::" +
        str([dictionary.id2token[int(w)] for w in words]))
print()

# Below Code Prints Only Words
# for topic, words in topics_words:
#     print(" ".join(words))

# conn = sqlite3.connect(
#     "/home/alex/network_workdir/elections/databases_ssd/complete_trump_vs_hillary_sep-nov_db.sqlite")
# c = conn.cursor()
# c.execute('''SELECT text FROM tweet''')
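# --- Hedged sketch (toy texts invented for illustration): passing the Dictionary as
# id2word makes show_topics() return token strings directly, avoiding the manual
# {v: k ...} inversion above; without id2word, LdaModel falls back to stringified
# ids, which is why the snippets need dictionary.id2token[int(w)].
from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["apple", "banana", "apple"], ["banana", "cherry"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

lda = LdaModel(corpus, id2word=dictionary, num_topics=2, random_state=42)
for topic_id, words in lda.show_topics(num_topics=2, num_words=3, formatted=False):
    print(topic_id, "::", [w for w, _ in words])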