import codecs
import os

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel


class sohu_corpus(object):
    # NOTE: the original snippet begins mid-class; this header and __init__
    # signature are reconstructed from how the class is instantiated below.
    # The original handling of `dic` inside the class is not shown.
    def __init__(self, fname, dic=None):
        self.fname = fname
        self.dic = dic

    def __iter__(self):
        # Stream the UTF-8 corpus file one line (document) at a time.
        for line in codecs.open(filename=self.fname, mode='r', encoding='utf-8'):
            yield line


if __name__ == '__main__':
    training_file_path = 'E:/2017_Deep_learning/text similarity'
    #training_file_path = './'

    # Lsi model
    dictionary = Dictionary()
    corpus = sohu_corpus(fname=os.path.join(
        training_file_path, 'sohu_text_similarity_training.corpus'),
        dic=dictionary)

    # save dictionary
    #dictionary.save(os.path.join(training_file_path, '07_11_dictionary.dict'))
    MmCorpus.serialize(os.path.join(training_file_path, '01_16_corpus_12.mm'),
                       corpus)
    #dictionary = Dictionary.load(os.path.join(training_file_path, '07_11_dictionary.dict'))

    corpus_tfidf_mm = MmCorpus(
        os.path.join(training_file_path, '01_16_corpus_12.mm'))

    # convert counts to tfidf
    tfidf = TfidfModel(corpus=corpus_tfidf_mm)
    corpus_tfidf = tfidf[corpus_tfidf_mm]
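    # Follow-up sketch (not part of the original snippet): the "# Lsi model"
    # comment above suggests an LSI model is trained on the tf-idf corpus.
    # A minimal version of that step might look as follows; the num_topics
    # value and the output filename are assumptions, and `dictionary` is
    # assumed to have been populated while the corpus was streamed.
    from gensim.models import LsiModel

    lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=300)
    lsi.save(os.path.join(training_file_path, '01_16_lsi.model'))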
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore


def get_people(txtstream, my_nlp):
    people = []
    for txt in txtstream:
        doc = my_nlp(txt)
        ppl = [ent for ent in doc.ents if ent.label_ == "PERSON"]
        people.append(ppl)
    return people  # set(people)


if __name__ == "__main__":

    #
    # GENSIM TOPIC APPROACH
    #

    dictionary = Dictionary(token_stream(NOVELS_DIRPATH))
    dictionary.filter_extremes(no_below=10, no_above=0.66)  # excludes terms like "the", "to", "and", "of", "i", etc.
    print("-------------")
    print("TOKENS", len(dictionary.token2id), list(dictionary.token2id.items())[0:4], "...")

    bags_of_words = [dictionary.doc2bow(tokens) for tokens in token_stream(NOVELS_DIRPATH)]
    print("-------------")
    print("BAGS OF WORDS (CORPUS)", len(bags_of_words), bags_of_words[0])

    lda = LdaMulticore(corpus=bags_of_words, id2word=dictionary, random_state=723812,
                       num_topics=15, passes=10, workers=4)
    print("-------------")
    print("LDA MODEL", type(lda))

    results = lda.print_topics()
    print("-------------")
    print("TOPICS (RAW RESULTS)...")
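# Assumed helper sketch (not shown in the original module): token_stream() and
# NOVELS_DIRPATH are referenced above but defined elsewhere. A plausible minimal
# version streams simple_preprocess'ed tokens from each .txt file in the novels
# directory; the path below is hypothetical.
import os
from gensim.utils import simple_preprocess

NOVELS_DIRPATH = "data/novels"  # hypothetical location

def token_stream(dirpath):
    for fname in sorted(os.listdir(dirpath)):
        if not fname.endswith(".txt"):
            continue
        with open(os.path.join(dirpath, fname), encoding="utf-8") as f:
            yield simple_preprocess(f.read())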
def run_tm(topics, below, above, chunksize, passes, iterations):
    m, valid = arevalid(topics, below, above, chunksize, passes, iterations)
    if not valid:
        fehlerfenster = Toplevel()
        fehlerfenster.title('Fehler')
        fehlerfenster.geometry('300x300')
        # Label holding the error message
        labelfehler = Label(master=fehlerfenster, text=m)
        labelfehler.place(x=10, y=10, width=300, height=300)
    else:
        with open('../data/docs', 'rb') as f:
            docs = pickle.load(f)

        tweet_dictionary = Dictionary(docs)
        tweet_dictionary.filter_extremes(no_below=int(below), no_above=float(above))
        tweet_dictionary.save('../data/tweet_dictionary')

        ngram_docs = ngrams(input_docs=docs)
        corpus = make_bow_corpus(tweet_dictionary, ngram_docs)
        with open('../data/bow_corpus', 'wb') as f:
            pickle.dump(corpus, f)

        print('Number of unique tokens: %d' % len(tweet_dictionary))
        print('Number of documents: %d' % len(corpus))

        """Training parameters."""
        num_topics = int(topics)  # Number of topics, kept relatively low so they are easier to interpret -> can be set higher
        chunk_size = int(chunksize)  # Number of documents fed into the training algorithm at once (we have 7)
        passes = int(passes)  # Number of passes over the entire corpus
        iterations = int(iterations)  # Number of loops over each document
        eval_every = None  # Don't evaluate model perplexity; it takes too much time.

        """Make an index-to-word dictionary."""
        temp = tweet_dictionary[0]  # This is only to "load" the dictionary.
        id2word = tweet_dictionary.id2token

        """Create the model.
        We set alpha = 'auto' and eta = 'auto'. Again this is somewhat technical, but
        essentially the model learns these two parameters automatically instead of us
        having to specify them explicitly."""
        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         chunksize=chunk_size,
                         alpha='auto',
                         eta='auto',
                         iterations=iterations,
                         num_topics=num_topics,
                         passes=passes,
                         eval_every=eval_every)

        model_file = '../data/model/LDA_model_v1'
        model.save(model_file)

        """Tests"""
        # Top topics
        top_topics = model.top_topics(corpus)  # num_words=20 by default; input is our corpus in BoW format

        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        """Topic coherence measures score a single topic by measuring the degree of semantic
        similarity between high-scoring words in the topic. These measurements help distinguish
        between topics that are semantically interpretable and topics that are artifacts of
        statistical inference."""
        avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)
        pprint(top_topics)
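# Assumed helper sketches (defined elsewhere in the project): ngrams() and
# make_bow_corpus() are called above but not shown. Plausible minimal versions
# follow the standard gensim workflow: Phrases appends detected bigrams to each
# tokenised document, and doc2bow turns each document into a BoW vector. The
# min_count value is an assumption.
from gensim.models import Phrases

def ngrams(input_docs, min_count=20):
    bigram = Phrases(input_docs, min_count=min_count)
    return [doc + [tok for tok in bigram[doc] if '_' in tok] for doc in input_docs]

def make_bow_corpus(dictionary, docs):
    return [dictionary.doc2bow(doc) for doc in docs]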
from util.TextSimilarity import TextSimilarity
from util.TaskReader import TaskReader
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

print("LDA Output: ")
first_num = 244

task = TaskReader.read("text.txt")
similarity = TextSimilarity('french')
doc_set = similarity.get_modified_text(task.text)
edu_set = similarity.get_modified_text(task.education)

# Build the vocabulary from the education texts.
dictionary = Dictionary([i.split() for i in edu_set])

# Train a single-topic LDA model on each document and print its top words.
for i in range(len(doc_set)):
    num = i + first_num
    corp = doc_set[i].split()
    corpus = [dictionary.doc2bow(corp)]
    ldamodel = LdaModel(corpus, num_topics=1, id2word=dictionary, passes=50)
    for x in ldamodel.print_topics(num_topics=1, num_words=6):
        print("Topic № " + str(num) + " : " + x[1])
labels.append(label_id)

print('Found %s texts.' % len(texts))

# Tokenize the texts using gensim.
tokens = list()
for text in texts:
    tokens.append(simple_preprocess(text))

# Vectorize the text samples into a 2D integer tensor.
MAX_NUM_WORDS = 10000  # 2 words reserved: 0=pad, 1=oov
MAX_SEQUENCE_LENGTH = 1000

dictionary = Dictionary(tokens)
dictionary.filter_extremes(no_below=0, no_above=1.0, keep_n=MAX_NUM_WORDS-2)

word_index = dictionary.token2id
print('Found %s unique tokens.' % len(word_index))

data = [dictionary.doc2idx(t) for t in tokens]

# Truncate and pad sequences.
data = [i[:MAX_SEQUENCE_LENGTH] for i in data]
data = np.array([np.pad(i, (0, MAX_SEQUENCE_LENGTH-len(i)),
                        mode='constant', constant_values=-2)
                 for i in data], dtype=int)

# Shift all indices by 2 so padding (-2) becomes 0 and OOV (-1) becomes 1.
data = data + 2
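# Illustration sketch (not part of the original script): doc2idx() returns -1
# for out-of-vocabulary tokens and the padding value is -2, so the "+ 2" shift
# maps padding to 0 and OOV to 1, matching the reserved indices noted next to
# MAX_NUM_WORDS. Toy example with a hypothetical two-word dictionary:
toy_dictionary = Dictionary([["hello", "world"]])
toy_ids = np.array(toy_dictionary.doc2idx(["hello", "unseen"]) + [-2])  # known word, oov, pad
print(toy_ids + 2)  # known word -> its id + 2, oov -> 1, pad -> 0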
    def lda(self, cat_list: list, below: int = 100, above: float = 0.1, eta: float = 0.9):
        assert set(cat_list).issubset(set(self.table.category.unique()))
        df_topic2 = self.table[self.table.category.isin(cat_list)].reset_index().iloc[:, 1:]
        instances = df_topic2.clean_text.apply(str.split)

        d = Dictionary(instances)
        print("Dictionary is:", d)
        d.filter_extremes(no_below=below, no_above=above)
        print("Dictionary after filtering:", d)

        ldacorpus = [d.doc2bow(text) for text in instances]
        tfidfmodel = TfidfModel(ldacorpus)
        model_corpus = tfidfmodel[ldacorpus]

        num_topics = len(df_topic2.groupby(['category']).count())
        temp = df_topic2.groupby(['category']).count()
        prior_probabilities = temp["app"] / temp["app"].sum()
        alpha = prior_probabilities.values
        print("Prior probabilities of the topics -alpha- are:", alpha)

        num_passes = 10
        chunk_size = int(len(model_corpus) * num_passes / 200)  # must be an integer
        print("Preliminary steps to prepare the model done")

        model = LdaMulticore(
            num_topics=num_topics,   # number of topics
            corpus=model_corpus,     # what to train on
            id2word=d,               # mapping from IDs to words
            workers=min(10, multiprocessing.cpu_count() - 1),  # use 10 cores, or whatever the machine has
            passes=num_passes,       # make this many passes over the data
            chunksize=chunk_size,    # update after this many instances
            alpha=alpha,
            eta=eta,
            random_state=5)
        print("Model is ready")

        topic_corpus = model[model_corpus]
        topic_sep = re.compile(r"0\.[0-9]{3}\*")
        model_topics = [(topic_no, re.sub(topic_sep, '', model_topic).split(' + '))
                        for topic_no, model_topic in model.print_topics(num_topics=num_topics, num_words=5)]

        descriptors = []
        for i, m in model_topics:
            print(i + 1, ", ".join(m[:3]))
            descriptors.append(", ".join(m[:2]).replace('"', ''))
        print(descriptors)

        scores = [[t[1] for t in topic_corpus[entry]] for entry in range(len(instances))]
        topic_distros = pd.DataFrame(data=scores, columns=descriptors)
        topic_distros['category'] = df_topic2['category']

        #%matplotlib inline
        print("Preparing graph")
        sns.set_context('poster')
        fig, ax = plt.subplots(figsize=(20, 10))
        aggregate_by_category = topic_distros.groupby(topic_distros.category).mean()
        aggregate_by_category[descriptors].plot.bar(ax=ax)
        fig.set_size_inches(30, 30)
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), prop={'size': 25})
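    # Usage sketch (hypothetical; the enclosing class is not shown above).
    # Assuming this method lives on an analyzer class whose `table` DataFrame
    # has `category`, `clean_text`, and `app` columns, a call might look like:
    #
    #     analyzer.lda(cat_list=['GAME', 'SOCIAL'], below=100, above=0.1, eta=0.9)
    #
    # Note that alpha is derived from the per-category document counts, so each
    # topic's prior is proportional to how often its category occurs in `table`.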
import pickle
import timeit

from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

# INITIAL_DOC_SIZE is assumed to be defined earlier in the original script.
STEP_SIZE = 200

# Load Data - corp.pkl contains data_lemmatized, id2word, corpus
with open('corp.pkl', 'rb') as f:
    data_lemmatized, _, _ = pickle.load(f)

# Initialize Parameters
total_time = 0
coherence_arr = []
time_arr = []

# Set Data State to that of existing model in simulation
data = data_lemmatized[:INITIAL_DOC_SIZE]

# When updating Online LDA, if I use a normal dictionary I keep getting key errors.
# That's why for online lda alone I use Hash Dictionary
id2word = Dictionary(documents=data)
corpus = [id2word.doc2bow(doc) for doc in data]

# Building for the first time - To be considered as the starting/existing model in simulation.
start = timeit.default_timer()
lda = LdaMulticore(corpus,
                   num_topics=35,
                   id2word=id2word,
                   workers=3,
                   chunksize=2000,
                   passes=10,
                   batch=False)
end = timeit.default_timer()
time_taken = end - start
total_time += time_taken
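# Simulation-loop sketch (not shown in the original snippet): the setup above
# suggests the remaining documents are then fed to the model in chunks of
# STEP_SIZE, timing each online update and recording coherence. The loop below
# is one plausible way to finish that simulation; the CoherenceModel settings
# are assumptions.
from gensim.models import CoherenceModel

for pos in range(INITIAL_DOC_SIZE, len(data_lemmatized), STEP_SIZE):
    chunk = data_lemmatized[pos:pos + STEP_SIZE]
    # doc2bow with the existing dictionary silently drops unseen tokens, which
    # sidesteps the key errors mentioned in the comment above.
    chunk_bow = [id2word.doc2bow(doc) for doc in chunk]

    start = timeit.default_timer()
    lda.update(chunk_bow)  # online update of the existing model
    end = timeit.default_timer()

    time_arr.append(end - start)
    total_time += end - start

    cm = CoherenceModel(model=lda, texts=data_lemmatized[:pos + STEP_SIZE],
                        dictionary=id2word, coherence='c_v')
    coherence_arr.append(cm.get_coherence())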
FLAGS, unparsed = parser.parse_known_args()

print('Reading data...')
data = load_data(FLAGS.trainfile)
comments_text = data['comment_text']
comments_text = comments_text.tolist()

print('Finding tokens with embeddings...')
ft_model = load_embedding(FLAGS.embedfile)
docs = [c.split(' ') for c in comments_text]
for i in range(len(docs)):
    docs[i] = [t for t in docs[i] if t in ft_model.vocab]

print('Building dictionary...')
comments_dictionary = Dictionary(docs)
comments_corpus = [comments_dictionary.doc2bow(d) for d in docs]

print("Creating tfidf model...")
model_tfidf = TfidfModel(comments_corpus)

print("Converting to tfidf vectors...")
comments_tfidf = model_tfidf[comments_corpus]
comments_vecs = corpus2csc(comments_tfidf).T

print('Finding important terms...')
labelcols = data.columns.tolist()[2:]
terms = Counter()
for l in labelcols:
    cl = data[l]
    model_fdr = SelectFdr(chi2, alpha=0.025)
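    # Plausible continuation sketch (the original snippet ends mid-loop): one
    # way the chi2/SelectFdr step could finish is to fit the selector on the
    # tf-idf vectors against this label column, map the selected feature
    # indices back to dictionary tokens, and tally them in `terms`. This is an
    # assumption, not the author's exact code.
    model_fdr.fit(comments_vecs, cl)
    selected = model_fdr.get_support(indices=True)
    terms.update(comments_dictionary[i] for i in selected)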