def build_dictionary(db):
    """Build a ``Dictionary`` from the ``clean_text`` of every article.

    :param db: object whose ``articles.find()`` yields records that have a
        ``'clean_text'`` entry
    :return: the ``Dictionary`` populated from all records
    """
    vocabulary = Dictionary()
    for record in db.articles.find():
        tokens = record['clean_text']
        # The bag-of-words result is discarded; only the vocabulary update
        # triggered by allow_update=True matters here.
        vocabulary.doc2bow(tokens, allow_update=True)
    return vocabulary
class TermFrequency(object):
    """Term-frequency view of a collection of tokenised documents.

    Attributes set by the constructor:
      id2Word          -- ``Dictionary`` built from the documents
      num_unique_words -- ``len(id2Word)``
      distance_matrix  -- one ``doc2bow`` result per input document
    """

    def __init__(self, documents):
        logging.log(logging.INFO, "Creating Term Frequency")
        self.id2Word = Dictionary(documents)
        self.num_unique_words = len(self.id2Word)
        self.distance_matrix = self.to_term_frequency_matrix(documents)

    def to_term_frequency_vector(self, document):
        """Return the ``doc2bow`` encoding of a single document."""
        bow = self.id2Word.doc2bow(document)
        return bow

    def to_binary_vector(self, document):
        """Return a dense integer array holding 1 where a term count is positive."""
        vocabulary_size = len(self.id2Word.keys())
        dense = sparse2full(self.id2Word.doc2bow(document), vocabulary_size)
        is_present = dense > 0  # converts counts to a presence mask
        return np.array(is_present, dtype=int)

    def to_term_frequency_matrix(self, documents):
        """Return the list of ``doc2bow`` encodings, one per document."""
        rows = []
        for document in documents:
            rows.append(self.to_term_frequency_vector(document))
        return rows

    def binary_matrix(self):
        """Turn the regular term-frequency matrix into a 0/1 matrix."""
        full_matrix = MatrixHelper.gensim_to_python_mdarray(
            self.distance_matrix, self.num_unique_words)
        # Any non-positive cell becomes 0, everything else becomes 1.
        return [[0 if cell <= 0 else 1 for cell in row] for row in full_matrix]
class LDA(Step):
    """Step that fits an ``LdaModel`` and emits dense per-topic vectors."""

    def __init__(self, num_topics):
        self._n_topics = num_topics
        self._dictionary = None
        self._model = None

    def fit(self, filename):
        """Build the dictionary and train the model from ``Reader(filename)``."""
        contents = []
        for _, tokens in Reader(filename):
            contents.append(tokens)
        self._dictionary = Dictionary(contents)
        bow_corpus = [self._dictionary.doc2bow(tokens) for tokens in contents]
        self._model = LdaModel(bow_corpus, num_topics=self._n_topics)

    def transform(self, filename):
        """Return ``(uuids, vectors)`` for the documents in ``filename``."""
        return self._transform(filename)

    def _transform(self, filename):
        uuids, rows = [], []
        topic_ids = range(self._n_topics)
        for uuid, tokens in Reader(filename):
            bow = self._dictionary.doc2bow(tokens)
            topic_probs = dict(self._model[bow])
            # Topics the model does not report for this document get 0.
            rows.append([topic_probs.get(topic, 0) for topic in topic_ids])
            uuids.append(uuid)
        return uuids, np.array(rows)

    @classmethod
    def _read(cls, filename):
        """Yield each document's tokens joined by single spaces."""
        for _, tokens in Reader(filename):
            yield ' '.join(tokens)
def do_ir2(db, param):
    """Cluster project titles by bigram presence and store the clusters.

    Titles from the ``cordis.projects`` collection are turned into binary
    vectors over the bigram vocabulary read from ``cordis.bi_grams``, clustered
    with ``KMeans`` and written to the ``g8.ir2`` collection as
    ``{'cluster': <label>, 'projects': [<reference>, ...]}`` documents.

    :param db: only echoed in the progress message; the database and
        collection names used below are hard-coded
    :param param: number of clusters (converted with ``int``)
    """
    print('Computazione di IR2 %s %s ...' % (db, param))

    # Built once instead of on every call to words().
    english_stopwords = set(nltk.corpus.stopwords.words('english'))

    def words(text):
        return [w for w in nltk.word_tokenize(text.lower())
                if w not in string.punctuation and w not in english_stopwords]

    class BigramsCorpus:
        """Iterates the bigram collection as one single-token document per record."""

        def __init__(self, db, collection):
            self.client = MongoClient()[db][collection]

        def __iter__(self):
            for doc in self.client.find():
                yield [doc['_id']]

        def __len__(self):
            return self.client.count()

    bigrams = Dictionary(BigramsCorpus('cordis', 'bi_grams'))
    pipeline = [{'$project': {'_id': 0, 'title': 1, 'reference': 1}}]
    project_corpus = MongoCorpus('cordis', 'projects', aggregate=pipeline)
    n = max(bigrams.keys())

    def to_vector(title):
        # Binary presence vector over all known bigram ids.
        vector = [0] * (n + 1)
        title_bigrams = [' '.join(pair) for pair in nltk.bigrams(words(title))]
        for bigram_id, _ in bigrams.doc2bow(title_bigrams):
            vector[bigram_id] = 1
        return vector

    # Single pass: keep each project's reference next to its vector so the
    # titles do not have to be re-tokenised for prediction.
    dataset = []
    references = []
    for doc in project_corpus:
        dataset.append(to_vector(doc['title']))
        references.append(doc['reference'])

    alg = KMeans(n_clusters=int(param))
    alg.fit(dataset)

    clusters = defaultdict(list)
    for label, reference in zip(alg.predict(dataset), references):
        # predict() returns numpy integers, which Mongo's BSON encoder rejects
        # ("InvalidDocument: Cannot encode object: 0"); store plain ints.
        clusters[int(label)].append(reference)

    mongo_clusters = [{'cluster': label, 'projects': members}
                      for label, members in clusters.items()]
    print(mongo_clusters)

    # Save to the Mongo collection.
    mongo = MongoClient()['g8']['ir2']
    mongo.insert_many(mongo_clusters)
    print('Fatto!')
def test_doc2bow(self):
    """Check ``doc2bow`` on a utf8 token, a bare string and a unicode token."""
    d = Dictionary([["žluťoučký"], ["žluťoučký"]])
    expected_bow = [(0, 1)]

    # a utf8 string inside a list is accepted
    self.assertEqual(d.doc2bow(["žluťoučký"]), expected_bow)

    # a string passed by accident instead of an array of strings must
    # raise a TypeError
    with self.assertRaises(TypeError):
        d.doc2bow("žluťoučký")

    # unicode must be converted to utf8
    unicode_token = u'\u017elu\u0165ou\u010dk\xfd'
    self.assertEqual(d.doc2bow([unicode_token]), expected_bow)
def build_corpora(db):
    """Build, filter and persist a dictionary and bag-of-words corpus.

    The articles are scanned twice: once to grow the dictionary and, after
    ``filter_extremes()``, once more to encode every article.  The corpus is
    serialised to ``data/corpus.mm`` and the dictionary to ``data/cnn.dict``.

    :param db: object whose ``articles.find()`` yields records that have a
        ``'clean_text'`` entry
    :return: ``(corpus, dictionary)``
    """
    dictionary = Dictionary()
    for record in db.articles.find():
        dictionary.doc2bow(record['clean_text'], allow_update=True)
    dictionary.filter_extremes()

    # Encode only after filtering so ids refer to the pruned vocabulary.
    corpus = [dictionary.doc2bow(record['clean_text'])
              for record in db.articles.find()]

    gensim.corpora.MmCorpus.serialize('data/corpus.mm', corpus)
    dictionary.save('data/cnn.dict')
    return corpus, dictionary
class MyCorpus(object):
    """Streamed bag-of-words corpus over a one-document-per-line text file.

    :param input_file: path of the text file; each line is split on
        whitespace into tokens
    :param K: ``keep_n`` passed to ``Dictionary.filter_extremes``
    """

    def __init__(self, input_file, K):
        self.K = K
        self.input_file = input_file
        self.dictionary = Dictionary()
        with open(input_file, "rt") as f:
            for line in f:
                self.dictionary.add_documents([line.split()])
        self.dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=K)

    def __iter__(self):
        """Yield the ``doc2bow`` encoding of every line, re-reading the file."""
        with open(self.input_file, "rt") as f:
            for line in f:
                yield self.dictionary.doc2bow(line.rstrip().split())

    def __str__(self):
        # Reads this instance's own dictionary; the previous code referenced
        # an undefined global named ``corpus`` for the non-zero count.
        return "MyCorpus(%s documents, %s features, %s non-zero entries)" % (
            self.dictionary.num_docs,
            len(self.dictionary.keys()),
            self.dictionary.num_nnz,
        )

    def __repr__(self):
        return "MyCorpus('%s', %s)" % (self.input_file, self.K)
def similarity_matrix(self):
    """Test similarity_matrix returns expected results."""
    # NOTE(review): this method has the same name as the
    # ``self.similarity_matrix`` callable it exercises — confirm which
    # attribute is actually resolved at runtime.
    texts = [["government", "denied", "holiday"], ["holiday", "slowing", "hollingworth"]]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(document) for document in texts]

    # checking symmetry and the existence of ones on the diagonal
    matrix = self.similarity_matrix(corpus, dictionary).todense()
    self.assertTrue((matrix.T == matrix).all())
    # Compare the diagonal with 1; comparing it with the whole matrix (as
    # before) does not test for ones on the diagonal.
    self.assertTrue((np.diag(matrix) == 1).all())

    # checking that thresholding works as expected
    matrix = self.similarity_matrix(corpus, dictionary, threshold=0.45).todense()
    self.assertEqual(18, np.sum(matrix == 0))

    # checking that exponent works as expected
    matrix = self.similarity_matrix(corpus, dictionary, exponent=1.0).todense()
    self.assertAlmostEqual(9.5788956, np.sum(matrix))

    # checking that nonzero_limit works as expected
    matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=4).todense()
    self.assertEqual(4, np.sum(matrix == 0))

    matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=3).todense()
    self.assertEqual(20, np.sum(matrix == 0))
def main():
    """Train an LDA model on the yearly ``nips`` corpora and print its topics.

    Reads ``data/nips-<year>.dat`` for 2010 through 2014 with
    ``load_line_corpus``, runs each through ``proc_corpus``, and fits a
    10-topic ``LdaModel`` over the concatenation of all years.
    """
    collection_name = "nips"
    years = range(2010, 2015)  # 2010 through 2014
    n_topics = 10

    documents = []
    for year in years:
        path = "data/{}-{}.dat".format(collection_name, year)
        yearly_corpus = list(load_line_corpus(path))
        documents.extend(proc_corpus(yearly_corpus))

    dictionary = Dictionary(documents)
    bow_corpus = [dictionary.doc2bow(doc) for doc in documents]

    # The leftover pdb.set_trace() breakpoint was removed so the script runs
    # unattended.
    model = LdaModel(bow_corpus,
                     num_topics=n_topics,
                     id2word=dictionary,
                     eval_every=10,
                     passes=100)
    print(model.show_topics())
def process_text(corpus, stoplist=None, bigrams=None, trigrams=None, keep_all=False, no_below=10, no_above=0.8):
    """
    Extracts text data from the corpus
    Cleans and tokenizes text data
    Computes most frequent phrases, creates a dictionary and converts the corpus to a BOW model

    :param corpus: raw corpus handed to ``clean_and_tokenize``
    :param stoplist: forwarded to ``clean_and_tokenize``
    :param bigrams: ``Phrases`` threshold for bigram detection; when given,
        only bigrams are detected and ``trigrams`` is ignored
    :param trigrams: ``Phrases`` threshold for trigram detection, used only
        when ``bigrams`` is None
    :param keep_all: forwarded to ``clean_and_tokenize``
    :param no_below: ``min_count`` for the bigram model and ``no_below`` for
        ``Dictionary.filter_extremes``
    :param no_above: ``no_above`` for ``Dictionary.filter_extremes``
    :return: processed corpus with phrases, dictionary and BOW corpus
    """
    logging.info("Cleaned and tokenzed dataset")
    text_dataset = clean_and_tokenize(corpus, stoplist=stoplist, keep_all=keep_all)
    if bigrams is not None:
        bi_grams = Phrases(text_dataset, threshold=bigrams, min_count=no_below)
        text_dataset = bi_grams[text_dataset]
    elif trigrams is not None:
        # ``bigrams`` is always None in this branch, so it must not be passed
        # as the threshold; the bigram stage uses the Phrases default instead.
        bi_grams = Phrases(text_dataset)
        tri_grams = Phrases(bi_grams[text_dataset], threshold=trigrams)
        text_dataset = tri_grams[bi_grams[text_dataset]]

    dictionary = Dictionary(text_dataset)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    bow_corpus = [dictionary.doc2bow(text) for text in text_dataset]
    return text_dataset, dictionary, bow_corpus
def test_from_corpus(self):
    """build `Dictionary` from an existing corpus"""
    documents = [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    stoplist = set('for a of the and to in'.split())
    texts = []
    for document in documents:
        texts.append([word for word in document.lower().split() if word not in stoplist])

    # remove words that appear only once
    occurrences = {}
    for text in texts:
        for word in text:
            occurrences[word] = occurrences.get(word, 0) + 1
    tokens_once = set(word for word, seen in occurrences.items() if seen == 1)
    texts = [[word for word in text if word not in tokens_once] for text in texts]

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    dictionary_from_corpus = Dictionary.from_corpus(corpus)

    # Only the id values can be compared: the words themselves are lost when
    # a dictionary is rebuilt from a corpus.
    self.assertEqual(sorted(dictionary.token2id.values()),
                     sorted(dictionary_from_corpus.token2id.values()))
    self.assertEqual(dictionary.dfs, dictionary_from_corpus.dfs)
    self.assertEqual(dictionary.num_docs, dictionary_from_corpus.num_docs)
    self.assertEqual(dictionary.num_pos, dictionary_from_corpus.num_pos)
    self.assertEqual(dictionary.num_nnz, dictionary_from_corpus.num_nnz)
def produce(self):
    """Tokenise the source documents and write WORD, DOC and DOCWORD tables.

    Documents come from ``self.src_doc_generator()`` as
    ``(doc_id, doc_label, doc_str)`` tuples.  English stopwords are dropped,
    a ``Dictionary`` is built (and saved as text when ``self.dictfile`` is
    set), and the vocabulary, documents and per-document word counts are
    inserted through ``self.dbi``.
    """
    print('Getting src docs')
    docs = []
    doctokens = []  # aka Gensim's "text"
    # A set makes the per-token stopword test constant-time.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    for doc in self.src_doc_generator():
        (doc_id, doc_label, doc_str) = doc
        docs.append(doc)
        doctokens.append([token for token in nltk.word_tokenize(doc_str) if token not in stopwords])
        if len(docs) % 1000 == 0:
            print(len(docs))  # progress indicator

    print('Creating the dictionary')
    dictionary = Dictionary(doctokens)
    if self.dictfile:
        dictionary.save_as_text(self.dictfile + '.dict', sort_by_word=True)

    with self.dbi as db:

        print('Creating WORD')  # aka Gensim's "dictionary"
        db.create_table('word')
        for word_id, word_str in dictionary.iteritems():
            db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)', (word_id, word_str))

        print('Creating DOC and DOCWORD')
        db.create_table('doc')
        db.create_table('docword')
        for doc_idx, doc in enumerate(docs):
            db.cur.execute('INSERT INTO doc (doc_index,doc_id,doc_label,doc_str ) VALUES (?,?,?,?)', (doc_idx, doc[0], doc[1], doc[2]))
            doc_id = doc[0]
            for word_id, word_count in dictionary.doc2bow(doctokens[doc_idx]):
                word_str = dictionary.get(word_id)  # id -> token lookup
                db.cur.execute('INSERT INTO docword (doc_index,doc_id,word_id,word_str,word_count) VALUES (?,?,?,?,?)', (doc_idx, doc_id, word_id, word_str, word_count))
def load_data(fname):
    """Load a labelled text file into dense bag-of-words vectors.

    Every non-blank line is ``<label> <text>``.  The text is run through
    ``preprocess_string``, a dictionary is built and pruned with
    ``filter_extremes(no_below=3, no_above=0.6)``, and each document is
    converted to a dense vector of length ``len(dictionary)``.

    :param fname: path of the input file
    :return: dict with ``'target'`` (array of one-element label lists) and
        ``'source'`` (array of dense document vectors)
    """
    print('input file name: %s' % fname)

    target = []  # labels
    document_list = []  # per-document word lists

    # Build the document list; the context manager closes the file handle.
    with open(fname, 'r') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue  # a blank line carries neither label nor text
            # A label-only line yields an empty text instead of an IndexError.
            label, _, text = line.partition(' ')
            target.append([label])
            # stopword removal and stemming
            document_list.append(preprocess_string(text))

    # Build the dictionary, dropping low- and high-frequency words.
    dct = Dictionary(document_list)
    dct.filter_extremes(no_below=3, no_above=0.6)

    # Vectorise each document as a bag of words.
    source = []  # document vectors
    for doc in document_list:
        bow = dct.doc2bow(doc)  # e.g. [(4, 1), (23, 1), ..., (119, 2)]
        dense = list(matutils.corpus2dense([bow], num_terms=len(dct)).T[0])
        source.append(dense)

    dataset = {}
    dataset['target'] = np.array(target)
    dataset['source'] = np.array(source)
    return dataset
def test_corpus_summarization(self):
    """Every document selected by ``summarize_corpus`` must appear in the reference summary."""
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
        text = f.read()

    # Generate the corpus.
    sentences = text.split("\n")
    tokens = [sentence.split() for sentence in sentences]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

    # Extract the most important documents.
    selected_documents = summarize_corpus(corpus)

    # They are compared to the method reference.
    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
        summary = f.read()
        summary = summary.split('\n')

    # Each sentence in the document selection has to be in the model summary.
    for document in selected_documents:
        # Retrieves all words from the document.
        words = [dictionary[token_id] for (token_id, count) in document]

        # Asserts that all of them are in a sentence from the model reference.
        # any() must wrap the whole generator; previously assertTrue received
        # an unconsumed generator object and therefore always passed.
        self.assertTrue(any(all(word in sentence for word in words) for sentence in summary))
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    """Build a pruned ``Dictionary`` and the matching bag-of-words corpus.

    Pruning steps, in order: stopwords, short words, words that occur in
    exactly one document, then ``filter_extremes``.

    :param docs: tokenised documents; iterated twice, so this must not be a
        one-shot generator
    :param additional_stopwords: extra stopwords merged with
        ``nltk_stopwords()``; the default set is never mutated here
    :param no_below: forwarded to ``Dictionary.filter_extremes``
    :param no_above: forwarded to ``Dictionary.filter_extremes``
    :return: ``(dictionary, corpus)``
    """
    print('Building dictionary...')
    dictionary = Dictionary(docs)

    # remove stopwords; words missing from the vocabulary are skipped rather
    # than contributing None ids
    stopwords = nltk_stopwords().union(additional_stopwords)
    token2id = dictionary.token2id
    stopword_ids = [token2id[word] for word in stopwords if word in token2id]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()

    # get ids for short words: the part before the first '/' has length <= 3
    shortword_ids = [tokenid for tokenid, word in dictionary.iteritems() if len(word.split('/')[0]) <= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()

    # remove words that appear in only one document; dict.items() works on
    # both Python 2 and 3, unlike iteritems()
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()

    # filter extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus
def bag_of_words(lemma):
    """Build a dictionary and bag-of-words corpus from lemmatised documents.

    The dictionary is also saved to ``text.dict`` in the working directory.

    :param lemma: iterable of token lists; iterated twice
    :return: ``(bow, dictionary)``
    """
    vocabulary = Dictionary(lemma)
    vocabulary.save('text.dict')

    # Encode every document against the vocabulary just built.
    bow_corpus = []
    for tokens in lemma:
        bow_corpus.append(vocabulary.doc2bow(tokens))
    return (bow_corpus, vocabulary)
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):
    """Estimator that wraps a ``Dictionary`` and a ``TfidfModel``."""

    def __init__(self, dirpath=".", tofull=False):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).

        Set tofull = True if the next thing is a Scikit-Learn estimator
        otherwise keep False if the next thing is a Gensim model.
        """
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")

        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

        self.load()

    def load(self):
        """Load the lexicon and the TF-IDF model from disk when the files exist."""
        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            # Called on the class so no throwaway model is built first.
            self.tfidf = TfidfModel.load(self._tfidf_path)

    def save(self):
        """Persist the lexicon and the TF-IDF model to their configured paths."""
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        """Build lexicon and TF-IDF model from ``documents``, save both, return self."""
        self.lexicon = Dictionary(documents)
        self.tfidf = TfidfModel([self.lexicon.doc2bow(doc) for doc in documents], id2word=self.lexicon)
        self.save()
        return self

    def transform(self, documents):
        """Return one TF-IDF vector per document (dense when ``tofull`` is set)."""
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    # sparse2full needs the output length: the vocabulary size.
                    yield sparse2full(vec, len(self.lexicon))
                else:
                    yield vec
        return list(generator())
def test_run(self, data):
    """Fit a quick 30-topic LDA model (2 passes) on ``data`` and assemble its topics.

    :param data: iterable of token lists; iterated twice
    :return: whatever ``self.assemble_topics`` returns for the fitted model
    """
    vocabulary = Dictionary(data)
    vocabulary.filter_extremes(no_above=0.5)

    bow_corpus = []
    for tokens in data:
        bow_corpus.append(vocabulary.doc2bow(tokens))

    # This can take a while to run:
    topic_model = LdaModel(bow_corpus, id2word=vocabulary, num_topics=30, passes=2)
    return self.assemble_topics(topic_model)
def run(self, data):
    """Fit a 30-topic LDA model (10 passes) on the corpora and assemble its topics.

    :param data: iterable of objects exposing ``tokenized_contents``
    :return: whatever ``self.assemble_topics`` returns for the fitted model
    """
    wordlists = [corpus.tokenized_contents for corpus in data]
    dictionary = Dictionary(wordlists)
    bags_of_words = [dictionary.doc2bow(t) for t in wordlists]

    # This can take a while to run:
    lda = LdaModel(bags_of_words, id2word=dictionary, num_topics=30, passes=10)

    return self.assemble_topics(lda)
def to_corpus(documents):
    """
    Make into a corpus

    @documents: iterable of documents, each passed unchanged to ``doc2bow``
        (presumably lists of tokens; the previous docstring described them
        as list[list[tuple[str,int]]] — TODO confirm against callers)
    @returns Dictionary, Corpus
    """
    vocabulary = Dictionary()
    corpus = []
    for doc in documents:
        # allow_update=True grows the vocabulary while encoding.
        corpus.append(vocabulary.doc2bow(doc, allow_update=True))
    return vocabulary, corpus
def test_low_distinct_words_corpus_summarization_is_empty_list(self):
    """``summarize_corpus`` must return ``[]`` for the low-distinct-words fixture."""
    text = self._get_text_from_test_data("testlowdistinctwords.txt")

    # Generate the corpus: one whitespace-tokenised document per line.
    tokenized_sentences = []
    for sentence in text.split("\n"):
        tokenized_sentences.append(sentence.split())
    dictionary = Dictionary(tokenized_sentences)
    corpus = [dictionary.doc2bow(words) for words in tokenized_sentences]

    self.assertEqual(summarize_corpus(corpus), [])
class JsonCorpus(object):
    """Bag-of-words corpus over the dataset descriptions in ``data/nasa.json``."""

    def __iter__(self):
        """Yield one ``doc2bow`` vector per dataset description.

        Each iteration re-reads the JSON file and rebuilds ``self.dictionary``.
        """
        # The context manager closes the file handle once parsing is done.
        with open('data/nasa.json') as handle:
            data = json.load(handle)
        desc = [TextBlob(dataset['description'].lower()).tokens for dataset in data['dataset']]
        self.dictionary = Dictionary(desc)
        for d in desc:
            yield self.dictionary.doc2bow(d)
def setUp(self):
    """Build the ``LdaSeqModel`` fixture stored on ``self.ldaseq``.

    The fixture corpus holds 31 tokenised documents, which ``time_slice``
    splits into slices of 10, 10 and 11.  The model is initialised from
    sufficient statistics loaded from ``sstats_test.txt``.
    """
    # Fixture documents: edit with care, the model below is initialised from
    # pre-computed statistics loaded from a file.
    texts = [
        [u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'],
        [u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', u'hiring', u'conducting', u'interviews'],
        [u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', u'participating'],
        [ u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'],
        [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality', u'engineering'],
        [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'],
        [u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'],
        [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', u'institutional', u'utilities', u'technical', u'experts', u'relationships', u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'],
        [u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'],
        [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'],
        [u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', u'automated', u'participate', u'ongoing'],
        [u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', u'attach'],
        [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'],
        [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'],
        [u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'],
        [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'],
        [u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'],
        [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'],
        [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', u'databases', u'repositories', u'recommendations', u'improving', u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', u'analyzing', u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'],
        [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'],
        ['bank','river','shore','water'],['river','water','flow','fast','tree'],['bank','water','fall','flow'],['bank','bank','water','rain','river'],
        ['river','water','mud','tree'],['money','transaction','bank','finance'],
        ['bank','borrow','money'],
        ['bank','finance'],
        ['finance','money','sell','bank'],['borrow','sell'],['bank','loan','sell']]
    # initializing using own LDA sufficient statistics so that we get same results each time.
    sstats = numpy.loadtxt(datapath('sstats_test.txt'))
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # time_slice sums to 31, the number of documents in ``texts``.
    self.ldaseq = ldaseqmodel.LdaSeqModel(corpus = corpus , id2word= dictionary, num_topics=2, time_slice=[10, 10, 11], initialize='own', sstats=sstats)
def testMallet2ModelOn20NewsGroups(self):
    """Check that a Mallet LDA model and its gensim conversion list identical topics."""
    # Tokenize every 20-newsgroups document before building the vocabulary.
    tokenized_docs = []
    for doc in api.load("20-newsgroups"):
        tokenized_docs.append(simple_preprocess(doc["data"]))

    vocabulary = Dictionary(tokenized_docs)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in tokenized_docs]

    mallet_model = ldamallet.LdaMallet(
        self.mallet_path,
        corpus=bow_corpus,
        num_topics=20,
        id2word=vocabulary,
        iterations=500,
    )
    converted_model = ldamallet.malletmodel2ldamodel(mallet_model, iterations=1000)

    # Compare the top 50 words of all 20 topics.
    mallet_topics = mallet_model.show_topics(20, 50)
    converted_topics = converted_model.show_topics(20, 50)
    self.assertEqual(mallet_topics, converted_topics)
def test_corpus_summarization_is_not_empty_list_on_short_input_text(self):
    """Summarizing a short (8-sentence) corpus must not return an empty list."""
    raw_text = self._get_text_from_test_data("testsummarization_unrelated.txt")

    # Only the first 8 lines are kept so that the input is short.
    short_input = raw_text.split('\n')[:8]

    # Build the bag-of-words corpus, one document per sentence.
    tokenized = []
    for line in short_input:
        tokenized.append(line.split())
    vocabulary = Dictionary(tokenized)
    bow_corpus = [vocabulary.doc2bow(words) for words in tokenized]

    summary = summarize_corpus(bow_corpus)
    self.assertNotEqual(summary, [])
def test_patch_with_special_tokens(self):
    """Exercise `Dictionary.patch_with_special_tokens` with and without id clashes."""
    special_tokens = {'pad': 0, 'space': 1, 'quake': 3}

    # Case 1: none of the special tokens occurs in the corpus.
    plain_docs = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
    plain_dict = Dictionary(plain_docs)
    self.assertEqual(len(plain_dict.token2id), 5)

    plain_dict.patch_with_special_tokens(special_tokens)
    for token, expected_id in special_tokens.items():
        self.assertEqual(plain_dict.token2id[token], expected_id)
    self.assertEqual(len(plain_dict.token2id), 8)

    first_doc = plain_docs[0]
    self.assertNotIn((0, 1), plain_dict.doc2bow(first_doc))
    self.assertIn((0, 1), plain_dict.doc2bow(['pad'] + first_doc))

    # Case 2: one special token ("space") is already present in the corpus.
    corpus_with_special_tokens = [["máma", "mele", "maso"], ["ema", "má", "máma", "space"]]
    clashing_dict = Dictionary(corpus_with_special_tokens)
    self.assertEqual(len(clashing_dict.token2id), 6)
    self.assertNotEqual(clashing_dict.token2id['space'], 1)

    clashing_dict.patch_with_special_tokens(special_tokens)
    self.assertEqual(len(clashing_dict.token2id), 8)
    self.assertEqual(max(clashing_dict.token2id.values()), 7)
    self.assertEqual(clashing_dict.token2id['space'], 1)

    without_space, with_space = corpus_with_special_tokens
    self.assertNotIn((1, 1), clashing_dict.doc2bow(without_space))
    self.assertIn((1, 1), clashing_dict.doc2bow(with_space))
def test_low_distinct_words_corpus_summarization_is_none(self):
    """A corpus with very few distinct words must summarize to ``None``."""
    data_dir = os.path.join(os.path.dirname(__file__), 'test_data')
    fixture = os.path.join(data_dir, "testlowdistinctwords.txt")
    with utils.smart_open(fixture, mode="r") as handle:
        raw_text = handle.read()

    # One document per line of the fixture file.
    tokenized = [line.split() for line in raw_text.split("\n")]
    vocabulary = Dictionary(tokenized)
    bow_corpus = []
    for words in tokenized:
        bow_corpus.append(vocabulary.doc2bow(words))

    summary = summarize_corpus(bow_corpus)
    self.assertTrue(summary is None)
def test_from_corpus(self):
    """build `Dictionary` from an existing corpus"""
    documents = [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    stoplist = set('for a of the and to in'.split())

    # Lower-case, whitespace-tokenize and drop stop words.
    texts = []
    for document in documents:
        texts.append([word for word in document.lower().split() if word not in stoplist])

    # Drop words that occur exactly once in the whole collection.
    flat_tokens = [word for text in texts for word in text]
    singletons = {word for word in set(flat_tokens) if flat_tokens.count(word) == 1}
    texts = [[word for word in text if word not in singletons] for text in texts]

    reference = Dictionary(texts)
    bow_corpus = [reference.doc2bow(text) for text in texts]
    statistics = ('dfs', 'num_docs', 'num_pos', 'num_nnz')

    # Dictionary rebuilt from the corpus alone (no token map): ids must match.
    rebuilt = Dictionary.from_corpus(bow_corpus)
    self.assertEqual(
        sorted(reference.token2id.values()),
        sorted(rebuilt.token2id.values()),
    )
    for name in statistics:
        self.assertEqual(getattr(reference, name), getattr(rebuilt, name))

    # Dictionary rebuilt with an id=>token map: the full mapping must match.
    rebuilt_with_map = Dictionary.from_corpus(bow_corpus, id2word=reference)
    self.assertEqual(reference.token2id, rebuilt_with_map.token2id)
    for name in statistics:
        self.assertEqual(getattr(reference, name), getattr(rebuilt_with_map, name))

    # Ensure Sparse2Corpus is compatible with from_corpus
    sparse_corpus = gensim.matutils.Sparse2Corpus(scipy.sparse.rand(10, 100))
    from_sparse = Dictionary.from_corpus(sparse_corpus)
    self.assertEqual(from_sparse.num_docs, 100)
def score_messages_by_text(self, deviation_threshold=2):
    """
    Calculate the ``token_score`` entry for every message in ``self.messages``.

    Each message's ``'tokens'`` list is treated as a document. The score is the
    similarity between that document (TF-IDF weighted) and the core tokens
    ``self.cores[deviation_threshold]``. When no message has tokens, or all
    messages share the same token set, every score is set to 0.

    Args:
        deviation_threshold (int): number of standard deviations that
            separates core tokens from average tokens; used as the key
            into ``self.cores``.

    Returns:
        None. ``self.messages`` is updated in place.
    """
    # Snapshot the values once: dict views cannot be indexed on Python 3, and
    # a single list keeps messages aligned with the score vector below.
    messages = list(self.messages.values())
    texts = [message['tokens'] for message in messages]

    nothing_to_compare = (
        not any(texts)
        or len({frozenset(tokens) for tokens in texts}) == 1
    )
    if nothing_to_compare:
        for message in messages:
            message['token_score'] = 0
        return

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = TfidfModel(corpus, id2word=dictionary)
    index = MatrixSimilarity(tfidf[corpus])
    try:
        scores = index[dictionary.doc2bow(self.cores[deviation_threshold])]
    except IndexError:
        error('Index error in token scoring for event {}'.format(self.id))
        scores = [0] * len(messages)

    for message, score in zip(messages, scores):
        message['token_score'] = float(score)
def prep_corpus(docs, additional_stopwords=None, no_below=5, no_above=0.5):
    """
    Build a filtered gensim dictionary and the matching bag-of-words corpus.

    Args:
        docs: re-iterable collection of tokenized documents (lists of str).
            It is traversed twice, so a one-shot generator will not work.
        additional_stopwords: optional iterable of extra stop words merged
            with the set returned by ``nltk_stopwords()``.
        no_below: forwarded to ``Dictionary.filter_extremes``.
        no_above: forwarded to ``Dictionary.filter_extremes``.

    Returns:
        tuple: ``(dictionary, corpus)`` where ``corpus`` is a list holding
        one ``doc2bow`` vector per document.
    """
    print('Building dictionary...')
    dictionary = Dictionary(docs)

    # ``None`` replaces the former mutable ``set()`` default.
    stopwords = nltk_stopwords().union(additional_stopwords or ())
    # Only ids of stop words that actually occur in the vocabulary are passed
    # on; a plain ``token2id.get`` would yield ``None`` for every absent word.
    token2id = dictionary.token2id
    stopword_ids = [token2id[word] for word in stopwords if word in token2id]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()

    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return dictionary, corpus
stemmed_tokens = [stemmer.stem(token) for token in tokens] return [token for token in stemmed_tokens if len(token) > 2] # skip short tokens dataset = [text2tokens(txt) for txt in newsgroups['data'] ] # convert a documents to list of tokens from gensim.corpora import Dictionary dictionary = Dictionary(documents=dataset, prune_at=None) dictionary.filter_extremes( no_below=5, no_above=0.3, keep_n=None) # use Dictionary to remove un-relevant tokens dictionary.compactify() d2b_dataset = [dictionary.doc2bow(doc) for doc in dataset ] # convert list of tokens to bag of word representation ############################################################################### # # Second, fit two LDA models. # --------------------------- # from gensim.models import LdaMulticore num_topics = 15 lda_fst = LdaMulticore( corpus=d2b_dataset, num_topics=num_topics, id2word=dictionary,
elif preDictTag != None: dct = Dictionary.load('{}{}.dict'.format(corpora_path, preDictTag)) #### Step 2, apply Tf-IDF representation #### bow_corpus = [] meta_wf = open("{}{}-Meta.csv".format(corpora_path, fileTag), "w") meta_wf.write("position_index,id_str,created_time\n") # use Timer to print elapsed time with Timer(): for each_collection in collections: print("Transforming the corpus for {}".format(each_collection)) file_path = f"{corpora_path}{each_collection}-raw-corpus.tsv" for i, a_tweet in enumerate(TweetRawCorpusStream(file_path)): # gensim's Dictionary.doc2bow will ignore words that are not in dictionary by default bow_per_doc = dct.doc2bow(a_tweet.tokens_str.split(",")) if len(bow_per_doc) > 4: timestamp = a_tweet.created_at id_str = a_tweet.id_str meta_wf.write("{},{},{}\n".format(len(bow_corpus), id_str, timestamp)) bow_corpus.append(bow_per_doc) meta_wf.close() tfidf_model = TfidfModel(bow_corpus) # fit model tfidf_corpus = tfidf_model[bow_corpus] #### Step 3, export model #### if preDictTag == None: dct.save('{}{}.dict'.format(corpora_path, fileTag)) MmCorpus.serialize('{}{}-tf-idf.mm'.format(corpora_path, fileTag),
all_questions_dict[group_id][ 'questions_and_choice_tok'].append(question_tok) num_question_groups = len(all_questions_dict) all_questions = [] for question_group_id in range(num_question_groups): all_questions.append([]) for each_question in all_questions_dict[question_group_id][ 'questions_and_choice_tok']: all_questions[question_group_id] += each_question #print(all_questions) dct = Dictionary(all_questions) corpus = [dct.doc2bow(line) for line in all_questions] ##print(corpus) model = TfidfModel(corpus) with open(output_tsv, 'w') as fo: output_line = 'question\ttop3words\tchoices\ttop3wordsIncuChoice\n' fo.write(output_line) for question_group_id, each_question_group in enumerate(corpus): vector = model[each_question_group] #print(all_questions[question_group_id]) sorted_by_second = sorted(vector, key=lambda tup: tup[1], reverse=True) sorted_by_second = [[dct[word], score] for word, score in sorted_by_second] #print(sorted_by_second) for question_id in range( len(all_questions_dict[question_group_id]['questions'])):
def compute_keywords(X_train_raw, n_keywords=20, k_topics=50, alpha=0.3, eta=1, niter=200, ismain=False, news_index=[], news_name=[]):
    """
    Extract weighted keywords for every document using an LDA topic model.

    An LDA model is fitted on the processed documents. For each raw document
    the per-word weight is the sum, over the topics assigned to that
    document, of (topic weight in the document) * (word weight in the topic).
    ``get_keywords`` then selects the keywords from those weights.

    Args:
        X_train_raw: sequence of raw documents; indexed by position and passed
            to ``doc_processing`` and ``tokenize``.
        n_keywords: forwarded to ``get_keywords``.
        k_topics: number of LDA topics.
        alpha, eta: forwarded to ``LdaModel``.
        niter: forwarded to ``LdaModel`` as ``iterations``.
        ismain: when true, results are saved with ``save_obj`` per newspaper
            instead of being returned.
        news_index: document indices at which the accumulated results are
            saved (only read when ``ismain`` is true).
        news_name: names used to build the output file names (only read when
            ``ismain`` is true).

    Returns:
        ``(all_keywords, all_weigths, word2id, id2word.id2token)`` when
        ``ismain`` is false; ``None`` (implicitly) otherwise.
    """
    print('compute_keywords\n')
    stopwordsFilePath = '../stopwords_es_ES_enh.txt'
    my_punctuation = '!"#$%&\'()*+,-./:;<=>?@[]^_`{|}~'
    # X_train_raw = [article.content for article in newspaper]
    processed_docs, stopwords = doc_processing(X_train_raw, stopwordsFilePath, doc=True)
    id2word = Dictionary(processed_docs)
    # dictionary.filter_extremes(no_below=1, no_above=0.6, keep_n=None)
    corpus = [id2word.doc2bow(text) for text in processed_docs]
    # Topic modeling using LDA
    ldamodel = LdaModel(corpus=corpus, num_topics=k_topics, alpha=alpha, eta=eta, id2word=id2word, iterations=niter)
    num_words = 50
    ldatopics = ldamodel.top_topics(corpus, num_words=num_words)
    # Indexed below as [topic, word id]; presumably the topic-word weight
    # matrix of the fitted model -- TODO confirm against the gensim docs.
    topic_word_matrix = ldamodel.expElogbeta
    word2id = id2word.token2id
    # NOTE(review): top_words is filled here but not read again in this function.
    top_words = []
    for topic in ldatopics:
        word_in_topic = []
        for wid in range(num_words):
            word_in_topic.append(topic[0][wid][1])
        top_words.append(word_in_topic)
    #### top_words == ldatopics
    # Create a topic-document matrix:
    d = ldamodel.get_document_topics(corpus)
    topic_doc_matrix = np.zeros((k_topics, len(d)), dtype=np.float16)
    # Number of topics assigned to each document (one column per document).
    topic_x_doc = np.zeros((1, len(d)), dtype=np.int8)
    list_topic_x_doc = []
    for n, doc in enumerate(d):
        # One row per entry of doc, two columns; column 0 is used as the topic
        # id and column 1 as its weight in the document.
        aux = np.reshape(doc, (len(doc), 2))
        topics = aux[:, 0]
        list_topic_x_doc.append(topics)
        if len(topics) > 0:
            topic_x_doc[0, n] = len(topics)
            for i in topics:
                # NOTE(review): `aux == int(i)` is evaluated on both columns of
                # aux, so a weight equal to the topic id would also match and
                # int(...) would then receive more than one index -- confirm.
                topic_doc_matrix[int(i), n] = aux[int(np.nonzero(aux == int(i))[0])][1]
    all_keywords = []
    all_weigths = []
    idnewspaper = 0
    for n in range(len(X_train_raw)):
        tokens_in_file = tokenize(X_train_raw[n], deacc=False)
        aux = []
        for word in tokens_in_file:
            aux.append(word)
        # Per-document vocabulary: maps each token of this file to a column.
        aux = Dictionary([aux])
        word2id_in_file = aux.token2id
        topic_filewords_matrix = np.zeros((topic_x_doc[0, n], len(word2id_in_file)), dtype=float)
        for k_top in range(topic_x_doc[0, n]):
            topic = int(list_topic_x_doc[n][k_top])
            for word in word2id_in_file:
                if len(word) > 1 and word not in stopwords and word not in my_punctuation and word in word2id:
                    topic_filewords_matrix[k_top, word2id_in_file[word]] += topic_word_matrix[topic, word2id[word]]
                    # topic_sentence_matrix[k_top, s] += topic_doc_matrix[topic, n] * topic_word_matrix[topic, word2id[word]]
            # Scale the whole topic row by the weight of that topic in document n.
            topic_filewords_matrix[k_top, :] *= topic_doc_matrix[topic, n]
        # Sum over the topic rows (axis 0) to get the accumulated weight of
        # each word across all topics of the document.
        weight_words = np.sum(topic_filewords_matrix, axis=0)
        doc_keywords, keyword_weight = get_keywords(n_keywords, weight_words, word2id_in_file)
        all_keywords.append(doc_keywords)
        all_weigths.append(keyword_weight / np.sum(keyword_weight))  # normalize the weights
        if ismain:
            # NOTE(review): news_index[idnewspaper] is read for every n, also
            # after idnewspaper was incremented past the last saved newspaper;
            # verify news_index cannot be exhausted before the loop ends.
            if n == news_index[idnewspaper]:
                save_obj(all_keywords, 'data/%s.keyw.pkl' % news_name[idnewspaper])
                save_obj(all_weigths, 'data/%s.weight.pkl' % news_name[idnewspaper])
                # Start accumulating afresh for the next newspaper.
                all_keywords = []
                all_weigths = []
                idnewspaper += 1
    if not ismain:
        return all_keywords, all_weigths, word2id, id2word.id2token
# remov%
#all_tokens = sum(texts, [])
#tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) < (len(dictionary))*0.02)
#texts = [[word for word in text if word not in tokens_once]
#         for text in texts]
#stemmed_all=texts

# One CSV dialect (everything quoted) is shared by both exports below;
# registering it once is enough.
csv.register_dialect('myDialect', delimiter=',', quoting=csv.QUOTE_ALL)

# Export the tokenized documents, one document per row.
with open('Lemmatized_documents.csv', 'w', newline='') as file:
    writer = csv.writer(file, dialect='myDialect')
    writer.writerows(stemmed_all)

# Empty dictionary that is grown on the fly by doc2bow(..., allow_update=True).
mydict = corpora.Dictionary()

#dtm
# Document-term matrix built with the pre-existing `dictionary`.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in stemmed_all]
mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in stemmed_all]

# Export the document-term matrix, one bag-of-words vector per row.
with open('Document_term_matrix.csv', 'w', newline='') as file:
    writer = csv.writer(file, dialect='myDialect')
    writer.writerows(doc_term_matrix)

#print(mycorpus)
# Human-readable view of mycorpus: (token, count) instead of (token id, count).
word_counts = [[(mydict[token_id], count) for token_id, count in line] for line in mycorpus]

# Aliases of the same token lists kept for the code that follows.
texts = stemmed_all
data_lemmatized = stemmed_all
#Dont add the new tweets because we want to recreate the same dictionary # Remove rare and common tokens. from gensim.corpora import Dictionary # Create a dictionary representation of all the documents from all the years. dictionaryAll = Dictionary(justTweetsAll) # Filter out words that occur less than 20 documents, or more than 50% of the documents # The last part was removed since I want them in there because theyre search terms, but it was no_above=0.5 dictionaryAll.filter_extremes(no_below=20) ################################################################################################################# # Bag-of-words representation of the documents. corpusAll = [dictionaryAll.doc2bow(doc) for doc in justTweetsAll] corpus2018 = [dictionaryAll.doc2bow(doc) for doc in justTweets2018] corpus2019 = [dictionaryAll.doc2bow(doc) for doc in justTweets2019] corpus2020 = [dictionaryAll.doc2bow(doc) for doc in justTweets2020] corpusNEW = [dictionaryAll.doc2bow(doc) for doc in justTweetsNEW] # Let's see how many tokens and documents we have to train on print('Number of unique tokens: %d' % len(dictionaryAll)) print('Number of documents: %d' % len(corpusAll)) ################################################################################################################## #monte carlo simulation to identify number of topics from gensim.models import LdaModel from gensim.models import CoherenceModel
sentences = df_sentences['Article_sentence_nouns_cleaned'].to_list() # Read in list in list (=1 sentences 1 doc) # sentences = MakeListInLists(sentences) # Create a dictionary representation of the documents dict_nouns = Dictionary(sentences) # Display # pp.pprint(dict_nouns.token2id) # Filter out words that occur less than 20 documents, or more than 50% of the documents dict_nouns.filter_extremes(no_below=4, no_above=0.4) # Bag-of-words representation of the documents corpus_nouns = [dict_nouns.doc2bow(doc) for doc in sentences] # Make a index to word dictionary temp = dict_nouns[0] # This is only to "load" the dictionary id2word_nouns = dict_nouns.id2token # Display pp.pprint(id2word_nouns) # Display results of Corpus # print(corpus_nouns) # print('Number of unique tokens: {}'.format(len(dict_nouns))) # print('Number of documents: {}'.format(len(corpus_nouns))) # TODO: save corpus and dctionary to disk and load them back # save to path_lda_data
minimum_count_for_link = 4 word_window = 5 corpus = [] for x in d: thing = d[x]["text"].lower().translate( str.maketrans('', '', string.punctuation)) tfiltered = list(filter(lambda w: not w in s, thing.split())) #tfiltered = map(lambda x: lemmatizer.lemmatize(x), tfiltered) #tfiltered = list(tfiltered) corpus.append(tfiltered) dct = Dictionary(corpus) bow_corpus = [dct.doc2bow(line) for line in corpus] term_doc_mat = corpus2csc(bow_corpus) from collections import OrderedDict document = corpus names = dct.values() occurrences = OrderedDict( (name, OrderedDict((name, 0) for name in names)) for name in names) # Find the co-occurrences: for l in document: for i in range(len(l)): print(l[i - word_window:i] + l[i + word_window:]) for item in l[i - word_window:i] + l[i + word_window:]:
def prepare():
    """
    Filter the dataset with TF-IDF and write the result to ``out_put``.

    Reads every line of ``dataset_path``, processes the lines in chunks of
    ``step`` and, per chunk, keeps only the tokens whose TF-IDF score exceeds
    0.01. Output depends on ``type_``: for anything but ``'test'`` the lines
    carry a ``__label__`` suffix; for ``'test'`` they are ``label,text`` lines.

    ``dataset_path``, ``out_put`` and ``type_`` are not defined in this
    function; presumably they are module-level settings -- TODO confirm.
    """
    with open(dataset_path) as file:
        lines = file.readlines()
    # Number of input lines handled per TF-IDF model.
    step = 65000
    with open(out_put, 'w') as f:
        for i in range(0, len(lines), step):
            count_line = 0
            dataset = []
            label = []
            print(count_line, len(lines))
            # Every chunk rescans all lines and keeps those at positions
            # [i, i + step).
            for line in lines:
                if i <= count_line < i + step:
                    # Comma-separated line: field 1 is the space-separated
                    # token list; the label is field 2, or field 0 for 'test'.
                    if type_ != 'test':
                        dataset.append(line.split(',')[1].split(' '))
                        label.append(line.split(',')[2])
                    else:
                        dataset.append(line.split(',')[1].split(' '))
                        label.append(line.split(',')[0])
                count_line += 1
            from gensim.models import TfidfModel
            from gensim.corpora import Dictionary
            dct = Dictionary(dataset)
            corpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format
            model = TfidfModel(corpus)  # fit model
            # vec = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)
            # Inverse of dct.token2id: token id -> token.
            id2token = {}
            for (k, v) in dct.token2id.items():
                id2token[v] = k
            # ver_rs=[]
            # vs=['520477','816903','995362','920327','1226448','1025743','990423',
            #     '133940','1071452','876555','323159','572782','105283','166959',
            #     '235896','554251','','1267351','1224594','201789','824446','263278']
            # for v in vs:
            #     print(v,dct.token2id[v])
            dataset_after_tfidf = []
            # This inner `i` shadows the chunk offset; the outer range iterator
            # is unaffected by the reassignment.
            for i in range(len(dataset)):
                vector = model[corpus[i]]  # apply model to the i-th corpus document
                # Tokens of this document whose TF-IDF score is above 0.01.
                ver_rs = []
                for v in vector:
                    (id2, score) = v
                    if score > 0.01:
                        ver_rs.append(id2token[id2])
                d_temp = []
                for d in dataset[i]:
                    if type_ != 'test':
                        # '816903' is always kept here; it is counted below
                        # when the document is split into output lines.
                        if d in ver_rs or d == '816903':
                            d_temp.append(d)
                    else:
                        if d in ver_rs:
                            d_temp.append(d)
                # print(len(dataset[i]),len(d_temp))
                dataset_after_tfidf.append(' '.join(d_temp))
            if type_ != 'test':
                # f.writelines(dataset_after_tfidf[i]+'__label__'+label[i]+'\n')
                for i in range(len(dataset_after_tfidf)):
                    count = 0
                    new_line = []
                    for t in dataset_after_tfidf[i].split(' '):
                        if t != '816903' or count < 15:
                            if t == '816903':
                                count += 1
                            # Tokens in this list are never written out.
                            if t not in ['520477', '816903', '995362', '920327', '1226448', '1025743', '990423', '133940', '1071452', '876555', '323159', '572782', '105283', '166959', '235896', '554251', '', '1267351', '1224594', '201789', '824446', '263278']:
                                new_line.append(t)
                        else:
                            # Sixteenth '816903' since the last flush: write the
                            # tokens collected so far as one labelled line.
                            # NOTE(review): tokens collected after the last
                            # flush are never written, so a document with fewer
                            # than 16 '816903' tokens produces no output --
                            # confirm this is intended.
                            # print(count)
                            count = 0
                            f.writelines(' '.join(new_line) + ' __label__' + str(label[i]) + '\n')
                            new_line = []
            else:
                for i in range(len(dataset_after_tfidf)):
                    f.writelines(label[i] + ',' + dataset_after_tfidf[i] + '\n')
    print('dd')
lines = f.readlines() for sentence in lines: words = sentence.decode('utf8').split(" ") sentence_segment = [] for word in words: if word.strip() != '': sentence_segment.append(word.strip()) corpus_list.append(sentence_segment) return corpus_list code_dataset = getCorpus("frcorpus/text%d.dat" % REPO_ID) text_dataset = getCorpus("frcorpus/code%d.dat" % REPO_ID) code_dct = Dictionary(code_dataset) text_dct = Dictionary(text_dataset) code_corpus = [code_dct.doc2bow(line) for line in code_dataset] # convert corpus to BoW format text_corpus = [text_dct.doc2bow(line) for line in text_dataset] # convert corpus to BoW format code_model = TfidfModel(code_corpus) code_model.save("frcorpus/code%d.model" % REPO_ID) text_model = TfidfModel(text_corpus) text_model.save("frcorpus/text%d.model" % REPO_ID) def read_data(path): res = [] filelist = os.listdir(path) for i in range(0, len(filelist)): filepath = os.path.join(path, filelist[i]) logging.info("Loaded the file:" + filepath)
total_examples = model_2.corpus_count

# Updating the model on our data based on the pretrained model
model_2.build_vocab([list(glove_model.vocab.keys())], update=True)
model_2.intersect_word2vec_format("word2vec.txt", binary=False, lockf=1.0)
model_2.train(data['content'], total_examples=total_examples, epochs=model_2.iter)

# Getting sentence vector through weighted average of word vector and tf idf score
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

# Basically a vocabulary of the words in our dataset
dct = Dictionary(data['content'])

# Creating corpus for every tweet
corpus = [dct.doc2bow(line) for line in data['content']]  # convert corpus to BoW format

# Fitting the tfidf model on the corpus
model_tfidf = TfidfModel(corpus)

# Sent_vec = sum over the tweet's terms (tf_idf * word vector) / sum of tf_idf
sent_vec = []
for bow in corpus:
    weighted = np.zeros(200)
    sum_tfidf = 0
    # The TF-IDF vector is a list of (token id, weight) pairs, so the token is
    # looked up through the dictionary. Pairing the j-th weight with the j-th
    # element of list(set(tokens)) mixed up words and weights, and the two
    # sequences need not even have the same length.
    for token_id, weight in model_tfidf[bow]:
        weighted = weighted + weight * model_2[dct[token_id]]
        sum_tfidf += weight
    # A tweet without any weighted term keeps the zero vector instead of
    # producing a NaN vector through a division by zero.
    sent_vec.append(weighted / sum_tfidf if sum_tfidf else weighted)
) # needed because sample data files are located in the same folder def datapath(fname): return os.path.join(module_path, 'test_data', fname) # set up vars used in testing ("Deerwester" from the web tutorial) texts = [['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] def testfile(): # temporary data will be stored to this file return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') class TestLsiModel(unittest.TestCase, basetmtests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.model = lsimodel.LsiModel(self.corpus, num_topics=2) def testTransform(self): """Test lsi[vector] transformation.""" # create the transformation model
class Parent(object):
    """
    Shared preprocessing, similarity and evaluation helpers for matching
    user queries against hotel descriptions.

    The constructor preprocesses the four input files and builds a gensim
    ``Dictionary`` from the cleaned hotel descriptions. The evaluation helpers
    exchange data through text files under ``./datasets/pp/``.
    """

    def __init__(self, hotel_description, user_description, hotel_attributes,
                 user_attributes, incremental=False, num_incremental=4):
        '''hotel_description = path of file
        user_description = path of file
        hotel_attributes = path of file
        user_attributes = path of file

        After preprocessing, the ``clean_*`` attributes are lists of lists,
        where each inner list corresponds to one preprocessed, tokenized
        element.

        incremental indicates the use of the incremental user method
        num_incremental indicates the number of sentences per user query'''
        self.hotel_description = hotel_description
        self.user_description = user_description
        self.hotel_attributes = hotel_attributes
        self.user_attributes = user_attributes

        # Preprocessed versions of the four inputs.
        self.clean_hotel_description = self.preprocess(self.hotel_description, 'testorcone')
        self.clean_user_description = self.preprocess(self.user_description, 'richiestautente')
        self.clean_hotel_attributes = self.load_hotel_attributes()
        self.clean_user_attributes = self.load_user_attributes(self.user_attributes, 'richiestautente')

        self.dictionary = Dictionary(self.clean_hotel_description)
        # Plain-dict copy of the token -> id mapping.
        self.vocab_new = dict(self.dictionary.token2id)

        self.incremental = incremental
        if incremental:
            self.incremental_user = self.write_user_queriesXsentence(num_incremental)
            # Switch the user queries to the truncated ones written above.
            self.clean_user_description = self.preprocess(
                "./datasets/incremental_user/queries" + str(num_incremental) + ".txt",
                'richiestautente')

    # ------------------------------------------------------------------
    # Functions for loading and preprocessing files
    # ------------------------------------------------------------------

    def loader_docs(self, file, stop_word):
        """Load a text file and split it into documents at every ``stop_word``.

        Lines are stripped and joined with single spaces before splitting.
        The piece after the last ``stop_word`` is discarded.
        """
        with open(file, 'r') as f:
            lines = f.readlines()
        txt = ' '.join(line.strip() for line in lines)
        txt = txt.split(stop_word)
        txt.pop()
        return txt

    def description_to_words(self, raw_review):
        """Turn one raw string into a list of preprocessed tokens.

        Steps: lower-case, regexp-tokenize, drop English stop words, drop
        one-character tokens, Snowball-stem.
        """
        tokenizer = RegexpTokenizer(r'\w+')
        doc = tokenizer.tokenize(raw_review.lower())
        # Remove stop words
        stops = set(stopwords.words("english"))
        doc = [w for w in doc if w not in stops]
        # Remove words of only one character
        doc = [token for token in doc if len(token) > 1]
        # Stem words
        snowball = SnowballStemmer('english')
        return [snowball.stem(token) for token in doc]

    def get_bigrams(self, clean_docs):
        """Append detected bigrams to every document, in place.

        Only bigrams that appear 20 times or more are detected. Returns the
        same (mutated) ``clean_docs`` list.
        """
        bigram = Phrases(clean_docs, min_count=20)
        for doc in clean_docs:
            for token in bigram[doc]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    doc.append(token)
        return clean_docs

    def preprocess(self, file, stop_word, activate_grams='Yes'):
        """Load ``file``, split it at ``stop_word`` and clean every document.

        Bigrams are added only when ``activate_grams`` equals ``'Yes'``.
        """
        docs = self.loader_docs(file, stop_word)
        clean_docs = [self.description_to_words(doc) for doc in docs]
        if activate_grams == 'Yes':
            clean_docs = self.get_bigrams(clean_docs)
        return clean_docs

    def load_hotel_attributes(self):
        """Preprocess the hotel attribute file (no bigrams, marker token removed)."""
        clean_hotel_attributes = self.preprocess(self.hotel_attributes,
                                                 'nuovohotelinarrivo', 'NO')
        return [[token for token in doc if token != 'attributinuovi']
                for doc in clean_hotel_attributes]

    def load_user_attributes(self, file, stop_word):
        """Preprocess the user attribute file.

        Each non-empty line is one attribute; a line equal to ``stop_word``
        closes the current user. Returns a list (one entry per user) of lists
        of cleaned attribute strings. Attributes after the last ``stop_word``
        are not returned.
        """
        with open(file, 'r') as f:
            lines = f.readlines()
        snowball = SnowballStemmer('english')
        stops = set(stopwords.words("english"))
        # Removing unnecessary parts of user_attributes
        stops.update(['go', 'play', 'center', 'centre', 'nearby', 'service',
                      'do', 'spot'])
        tokenizer = RegexpTokenizer(r'\w+')

        stripped = (line.strip() for line in lines)
        entries = [entry for entry in stripped
                   if entry != '' and entry != 'new_sentence']

        users = []
        user = []
        for entry in entries:
            if entry == stop_word:
                users.append(user)
                user = []
                continue
            attribute = tokenizer.tokenize(entry.lower())
            attribute = [w for w in attribute if w not in stops]
            attribute = [token for token in attribute if len(token) > 1]
            attribute = [snowball.stem(token) for token in attribute]
            user.append(' '.join(attribute))
        return users

    @staticmethod
    def _write_token_docs(path, docs):
        """Write one space-joined line of stripped tokens per document."""
        with open(path, 'w') as f:
            for doc in docs:
                f.write(' '.join(token.strip() for token in doc) + '\n')

    @staticmethod
    def _read_token_docs(path):
        """Read a file written by ``_write_token_docs`` back into token lists."""
        with open(path, 'r') as f:
            # The newline is removed before splitting; otherwise the last
            # token of every line would carry a trailing '\n'.
            return [line.rstrip('\n').split(' ') for line in f]

    def preprocess_special(self):
        'creates files of preprocessed hotel description and attributes used for evaluation'
        self._write_token_docs('./datasets/pp/hotel_attributes.txt',
                               self.clean_hotel_attributes)
        self._write_token_docs('./datasets/pp/hotel_descriptions.txt',
                               self.clean_hotel_description)
        self._write_token_docs('./datasets/pp/user_queries.txt',
                               self.clean_user_description)

    def incremental_loader_docs(self, file):
        """Load a text file as a list of queries, each a list of stripped lines.

        A line equal to ``'richiestautente'`` closes the current query; lines
        after the last separator are not returned.
        """
        with open(file, 'r') as f:
            lines = f.readlines()
        hotels = []
        hotel = []
        for raw in lines:
            line = raw.strip()
            if line == 'richiestautente':
                hotels.append(hotel)
                hotel = []
            else:
                hotel.append(line)
        return hotels

    # ------------------------------------------------------------------
    # General functions
    # ------------------------------------------------------------------

    def get_corpus(self):
        'get the Bag Of Words representation for the hotel_description and user_queries'
        corpus = [self.dictionary.doc2bow(doc)
                  for doc in self.clean_hotel_description]
        BOW_user_queries = [self.dictionary.doc2bow(doc)
                            for doc in self.clean_user_description]
        return corpus, BOW_user_queries

    def accuracy_query2hotel(self, hotel, user):
        """Fraction of a user's attributes that a hotel satisfies.

        An attribute counts as satisfied when every one of its tokens occurs
        in the hotel's preprocessed description or attributes, as stored in
        the files written by ``preprocess_special``.

        Args:
            hotel: index of the hotel.
            user: index into ``self.clean_user_attributes``.

        Returns:
            Number of satisfied attributes divided by the user's total
            number of attributes.
        """
        tokenizer = RegexpTokenizer(r'\w+')
        clean_hotel_description = self._read_token_docs(
            "./datasets/pp/hotel_descriptions.txt")
        clean_hotel_attributes = self._read_token_docs(
            "./datasets/pp/hotel_attributes.txt")

        count = 0
        for user_attribute in self.clean_user_attributes[user]:
            parts = tokenizer.tokenize(user_attribute)
            part_count = 0
            for part_attribute in parts:
                if (part_attribute in clean_hotel_description[hotel]
                        or part_attribute in clean_hotel_attributes[hotel]):
                    part_count += 1
            if part_count == len(parts):
                count += 1
        return count / len(self.clean_user_attributes[user])

    def make_accuracy_array(self, queryXhotel, num_best, bol=True):
        """For each user query, compute the accuracy of its ``num_best`` top hotels.

        Args:
            queryXhotel: query-by-hotel score matrix.
            num_best: number of hotels evaluated per query.
            bol: passed to ``matutils.argsort`` as ``reverse``.
        """
        self.preprocess_special()
        accuracy_array = np.zeros((len(self.clean_user_description), num_best))
        for i in range(np.shape(queryXhotel)[0]):
            # Ask for exactly as many hotels as are evaluated; a fixed topn of
            # 5 made every num_best above 5 fail with an IndexError.
            ordered = matutils.argsort(queryXhotel[i], topn=num_best, reverse=bol)
            # In incremental mode row i belongs to original user incremental_user[i].
            user = self.incremental_user[i] if self.incremental else i
            for j in range(num_best):
                accuracy_array[i][j] = self.accuracy_query2hotel(ordered[j], user)
        return accuracy_array

    def get_overall_accuracy(self, accuracy_array, num_best=5):
        """Print the mean accuracy of each rank and return the sum of those means."""
        count = 0
        for i in range(num_best):
            overall_accuracy = np.sum(accuracy_array[:, i]) / accuracy_array.shape[0]
            print('recall ' + str(i + 1) + ' ' + str(overall_accuracy))
            count += overall_accuracy
        return count

    def get_accuracy_array(self, hotel_match_X_query, num_best):
        '''for each user query it computes the accuracy of the num_best most similar
        hotels, works with gensim module cosine similarity and word's mover distance'''
        self.preprocess_special()
        accuracy_array = np.zeros((len(hotel_match_X_query), num_best))
        for i, matches in enumerate(hotel_match_X_query):
            user = self.incremental_user[i] if self.incremental else i
            for j in range(num_best):
                # Each match is indexed as (hotel index, score).
                accuracy_array[i][j] = self.accuracy_query2hotel(matches[j][0], user)
        return accuracy_array

    # ------------------------------------------------------------------
    # Similarity functions
    # A series of functions that interact with the gensim modules
    # ------------------------------------------------------------------

    def Jaccard_similiarity(self, corpus, corpus_model_user_description, num_best=5):
        'for each user query it computes the Jaccard coefficient with respect to each hotel'
        length = len(corpus_model_user_description)
        queryXhotel = np.zeros((length, len(corpus)))
        for i in range(length):
            for j in range(len(corpus)):
                queryXhotel[i][j] = jaccard(corpus_model_user_description[i], corpus[j])
        #np.save('jaccard_similiarity', queryXhotel)
        return self.make_accuracy_array(queryXhotel, num_best, bol=False)

    def cosine_similarity(self, corpus, corpus_model_user_description, num_best=5):
        """Accuracy of the ``num_best`` hotels ranked by gensim cosine similarity.

        Args:
            corpus: hotel vectors, for example corpus_tfidf.
            corpus_model_user_description: query vectors, for example
                tfidf_user_queries.
            num_best: number of best hotels that will be considered.
        """
        index = similarities.MatrixSimilarity(corpus, num_best=num_best)
        hotel_match_X_query = [index[query] for query in corpus_model_user_description]
        return self.get_accuracy_array(hotel_match_X_query, num_best)

    def Hellinger_similiarity(self, corpus, corpus_model_user_description, num_best=5):
        'implements Hellinger similarity using gensim modules'
        length = len(corpus_model_user_description)
        queryXhotel = np.zeros((length, len(corpus)))
        print('It takes some time')
        for i in range(length):
            for j in range(len(corpus)):
                queryXhotel[i][j] = hellinger(corpus_model_user_description[i], corpus[j])
            print(i)  # progress: one line per processed query
        #np.save('hellinger_similiarity', queryXhotel)
        return self.make_accuracy_array(queryXhotel, num_best, bol=False)  #true?

    def WMD_similiarity(self, corpus, w2v_model, corpus_model_user_description,
                        num_best=5):
        'Word mover distance similarity'
        index = similarities.WmdSimilarity(corpus, w2v_model, num_best)
        hotel_match_X_query = [index[query] for query in corpus_model_user_description]
        return self.get_accuracy_array(hotel_match_X_query, num_best)

    # ------------------------------------------------------------------
    # Second experiment
    # Analyzes the performance of the recommendation system on user queries
    # with a different amount of attributes.
    # ------------------------------------------------------------------

    def divide_query_per_num_attribute(self):
        """Group query indices by their number of attributes (4 to 9).

        Returns:
            tuple of six lists: indices of the queries with 4, 5, 6, 7, 8 and
            9 attributes. Queries with any other count are reported on stdout
            and left out.
        """
        buckets = {size: [] for size in range(4, 10)}
        for i, attributes in enumerate(self.clean_user_attributes):
            length = len(attributes)
            if length in buckets:
                buckets[length].append(i)
            else:
                # Report the offending length; the previous len(i) was applied
                # to the integer index and raised a TypeError.
                print('Error,missing list of len = ' + str(length))
        return tuple(buckets[size] for size in range(4, 10))

    def get_accuracy_based_attributes(self, accuracy_array, length_array, num_best=1):
        """Print the mean accuracy of each attribute-count group.

        Args:
            accuracy_array: query-by-rank accuracy matrix.
            length_array: groups of query indices, as returned by
                ``divide_query_per_num_attribute`` (group i is reported as
                having i + 4 attributes).
            num_best: number of ranks averaged.
        """
        for i, query_indices in enumerate(length_array):
            if len(query_indices) == 0:
                # An empty group cannot be averaged (and an empty index array
                # would not be a valid integer index).
                print('accuracy for ' + str(i + 4) + ' attributes n/a (no queries)')
                continue
            sliced_array = accuracy_array[np.array(query_indices, dtype=int)]
            accuracy = 0
            for j in range(num_best):
                accuracy += np.sum(sliced_array[:, j]) / sliced_array.shape[0]
            accuracy = accuracy / float(num_best)
            print('accuracy for ' + str(i + 4) + ' attributes ' + str(accuracy))

    # ------------------------------------------------------------------
    # Incremental user
    # ------------------------------------------------------------------

    def write_user_queriesXsentence(self, numb_sentences):
        """Write the first ``numb_sentences`` sentences of every long-enough query.

        Queries with fewer than ``numb_sentences`` sentences are skipped. The
        output goes to ``./datasets/incremental_user/queries<N>.txt`` with a
        ``richiestautente`` line after every query.

        Returns:
            list of the indices of the user queries that were written.
        """
        hotels = self.incremental_loader_docs(self.user_description)
        indexes = []
        out_path = "./datasets/incremental_user/queries" + str(numb_sentences) + ".txt"
        with open(out_path, 'w') as f:
            for i, sentences in enumerate(hotels):
                if len(sentences) > (numb_sentences - 1):
                    indexes.append(i)
                    for sentence in sentences[:numb_sentences]:
                        f.write(sentence + '\n')
                    f.write('richiestautente\n')
        return indexes
============================================================================= tri-grammed tokenized article: {} """.format(docs[1], docs_tokens[1], docs_phrased[1])) # %% get corpus & dictionary to use for further nlp analysis """ I suggest to prepare the dictionary and the corpus `once for all' -- that is, dumping the files that, eventually, will be loaded for further analysis. """ # get dictionary and write it to a file """ a dictionary is a mapping between words and their integer ids. See Gensim documentation here: https://radimrehurek.com/gensim/corpora/dictionary.html """ pr_dictionary = Dictionary(docs_phrased) pr_dictionary.save("/tmp/pr_dictionary.dict") # get corpus and write it to a file """ as per the Gensim documentation, it possible to convert document into the bag-of-words (format = list of (token_id, token_count) tuples) via doc2bow """ pr_corpus = [pr_dictionary.doc2bow(doc) for doc in docs_phrased] """ Gensim offers several utilities to write a corpus of text to a file. Personally, I prefer the Matrix Market format [1] [1]: https://math.nist.gov/MatrixMarket/formats.html """ corpora.MmCorpus.serialize("/tmp/pr_corpus.mm", pr_corpus)
# Command-line script: fit a sequential LDA model on a text file (one
# whitespace-tokenised document per line) and print the ten strongest terms of
# every topic as CSV rows.
#
# Usage: <script> DATA_FILE TOPIC_NUM
import sys
from gensim.corpora import Dictionary
from gensim.models.ldaseqmodel import LdaSeqModel
from gensim.models import word2vec

data_file, topic_num = sys.argv[1], int(sys.argv[2])

# Materialise the corpus so it can be traversed twice (dictionary + BoW).
sentences = [tokens for tokens in word2vec.LineSentence(data_file)]
dic = Dictionary(sentences)
corpus = list(map(dic.doc2bow, sentences))

# All documents are placed in one single time slice.
ldaseq = LdaSeqModel(
    corpus=corpus,
    id2word=dic,
    num_topics=topic_num,
    time_slice=[len(corpus)],
)

print('topic,item,prob')
for i, ts in enumerate(ldaseq.print_topics(top_terms=10)):
    # Each entry of a topic is indexed as (term, weight).
    for t in ts:
        print(f'{i},{t[0]},{t[1]}')
lines = f.readlines() for sentence in lines: words = sentence.decode('utf8').split(" ") sentence_segment = [] for word in words: if word.strip() != '': sentence_segment.append(word.strip()) corpus_list.append(sentence_segment) return corpus_list code_dataset = getCorpus("frcorpus/text%d.dat" % REPO_ID) text_dataset = getCorpus("frcorpus/code%d.dat" % REPO_ID) code_dct = Dictionary(code_dataset) text_dct = Dictionary(text_dataset) code_corpus = [code_dct.doc2bow(line) for line in code_dataset] # convert corpus to BoW format text_corpus = [text_dct.doc2bow(line) for line in text_dataset] # convert corpus to BoW format code_model = TfidfModel(code_corpus) code_model.save("frcorpus/code%d.model" % REPO_ID) text_model = TfidfModel(text_corpus) text_model.save("frcorpus/text%d.model" % REPO_ID) def read_data(path): res = [] filelist = os.listdir(path) for i in range(0, len(filelist)): filepath = os.path.join(path, filelist[i]) logging.info("Loaded the file:"+filepath) if os.path.isfile(filepath): file = open(filepath, 'rb')
def preprocess(sentence):
    """Lower-case *sentence*, split it on whitespace and drop stop words."""
    tokens = sentence.lower().split()
    return [token for token in tokens if token not in stop_words]


sentence_obama, sentence_president, sentence_orange = [
    preprocess(text)
    for text in (sentence_obama, sentence_president, sentence_orange)
]

###############################################################################
# Next, we will build a dictionary and a TF-IDF model, and we will convert the
# sentences to the bag-of-words format.
#
from gensim.corpora import Dictionary
documents = [sentence_obama, sentence_president, sentence_orange]
dictionary = Dictionary(documents)

# Token lists -> bag-of-words vectors.
sentence_obama, sentence_president, sentence_orange = [
    dictionary.doc2bow(tokens) for tokens in documents
]

from gensim.models import TfidfModel
documents = [sentence_obama, sentence_president, sentence_orange]
tfidf = TfidfModel(documents)

# Bag-of-words vectors -> TF-IDF weighted vectors.
sentence_obama, sentence_president, sentence_orange = [
    tfidf[bow] for bow in documents
]

###############################################################################
# Now, as mentioned earlier, we will be using some downloaded pre-trained
# embeddings. We load these into a Gensim Word2Vec model class and we build
# a term similarity matrix using the embeddings.
""" Automated tests for checking transformation algorithms (the models package). """ import logging import unittest from gensim.corpora import mmcorpus, Dictionary from gensim.models import hdpmodel from gensim.test import basetmtests from gensim.test.utils import datapath, common_texts import numpy as np dictionary = Dictionary(common_texts) corpus = [dictionary.doc2bow(text) for text in common_texts] class TestHdpModel(unittest.TestCase, basetmtests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.class_ = hdpmodel.HdpModel self.model = self.class_(corpus, id2word=dictionary, random_state=np.random.seed(0)) def testTopicValues(self): """ Check show topics method """ results = self.model.show_topics()[0]
def create_bow(data):
    """Build a pruned dictionary and the matching bag-of-words corpus.

    A ``Dictionary`` is fitted on ``data`` and pruned with
    ``filter_extremes(no_below=20)``; every other pruning argument keeps its
    library default. ``data`` is traversed twice, so it must be re-iterable.

    Returns:
        ``(dictionary, bow)`` where ``bow`` holds one ``doc2bow`` vector per
        document in ``data``, in the same order.
    """
    vocabulary = Dictionary(data)
    vocabulary.filter_extremes(no_below=20)
    vectors = list(map(vocabulary.doc2bow, data))
    return vocabulary, vectors
# Build the bag-of-words corpus for the negative samples, persist it in
# Matrix Market format and read it back. `dictionary` and `training_file_path`
# are expected to be defined earlier in the file.
neg_samples_path = os.path.join(training_file_path, 'neg_1.txt')
corpus_mm_path = os.path.join(training_file_path, '07_11_corpus_12.mm')

corpus = sohu_corpus(fname=neg_samples_path, dic=dictionary)
MmCorpus.serialize(corpus_mm_path, corpus)
corpus_tfidf_mm = MmCorpus(corpus_mm_path)

# Keep the raw source documents so matches can be printed next to their score.
training_src_data = sogou_corpus_file(neg_samples_path)
training_src = list(training_src_data)

# convert counts to tfidf
tfidf = TfidfModel(corpus=corpus_tfidf_mm)
index = MatrixSimilarity(tfidf[corpus_tfidf_mm])

# Similarity of a two-word query against every document of the corpus.
query_bow = dictionary.doc2bow(['阳台', '打死'])
sims = index[tfidf[query_bow]]

# Show the intermediate representations for a single-word query.
single_term_bow = dictionary.doc2bow(['阳台'])
print('doc2bow:')
print(single_term_bow)
print('tfidf:')
print(tfidf[single_term_bow])

# Print the three best matches as "index<TAB>score<TAB>source text".
print(u'相似文档:\n')
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for num, value in sims[:3]:
    print('\t'.join([str(num), str(value), training_src[num]]) + '\n')
import gensim
import json
from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc


def read_list_corpus(list_corp, tokens_only=False):
    """Yield the pre-processed form of every paragraph in ``list_corp``.

    Args:
        list_corp: iterable of sequences whose first element is the paragraph
            text.
        tokens_only: when true, yield plain token lists; otherwise yield
            ``TaggedDocument`` objects tagged with the paragraph's position.
    """
    for i, paragraph in enumerate(list_corp):
        tokens = gensim.utils.simple_preprocess(paragraph[0])
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])


model = gensim.models.TfidfModel.load("./models/tfidf/sentences/tfidf")

# Use a context manager so the dataset file is closed deterministically.
with open("./datasets/dataset_paragraphs.json") as dataset_file:
    dataset = json.load(dataset_file)
dataset = list(read_list_corpus(dataset, tokens_only=True))

dct = Dictionary(dataset)
bow_corpus = [dct.doc2bow(line) for line in dataset]
term_doc_mat = corpus2csc(bow_corpus)

# Single-argument print() calls behave the same on Python 2 and Python 3,
# unlike the print statement they replace.
print(dir(term_doc_mat))
print(term_doc_mat.get_shape())
class Word2VecWmdRelaxSimilarity(Word2VecSimilarityBase):
    """Similarity model that feeds TF-IDF weighted documents to ``WMD``.

    Documents are expressed over a shared vocabulary of words that are known
    both to the corpus dictionary and to the word2vec model, and
    ``WMD(...).nearest_neighbors`` ranks them against the query.
    """

    def __init__(
        self,
        cut_off=0.2,
        cleanup_urls=True,
        nltk_tokenizer=False,
        confidence_threshold=0.8,
    ):
        # NOTE(review): `cut_off` is accepted but not used anywhere in this
        # class; confirm whether the base class was meant to receive it.
        super().__init__(
            cleanup_urls=cleanup_urls,
            nltk_tokenizer=nltk_tokenizer,
            confidence_threshold=confidence_threshold,
        )
        self.dictionary = Dictionary(self.corpus)
        self.tfidf = TfidfModel(dictionary=self.dictionary)

    def _embedded_vocabulary(self, *queries):
        """Return ``(indices, words)``: dictionary ids and tokens, sorted by id, of every word that has an embedding.

        The id of each word is looked up directly in ``token2id``.
        ``doc2bow`` returns its pairs sorted by id and silently drops unknown
        tokens, so zipping its result with an arbitrarily ordered word list
        pairs ids with the wrong words.

        Raises:
            ValueError: when no word qualifies (nothing to unpack).
        """
        token2id = self.dictionary.token2id
        candidates = set(chain(*queries, *self.corpus))
        pairs = sorted(
            (token2id[word], word)
            for word in candidates
            if word in token2id and word in self.w2vmodel.wv
        )
        indices, words = zip(*pairs)
        return indices, words

    def _reindexed_tfidf(self, tokens, indices):
        """Return the TF-IDF weights of ``tokens`` as ``(position in indices, weight)`` pairs."""
        weights = dict(self.tfidf[self.dictionary.doc2bow(tokens)])
        return [
            (new_index, weights[dict_index])
            for new_index, dict_index in enumerate(indices)
            if dict_index in weights
        ]

    def _embeddings(self, words):
        """Stack the word2vec vectors of ``words`` into a float32 matrix."""
        return np.array(
            [self.w2vmodel.wv[word] for word in words], dtype=np.float32
        )

    def search_similar_bugs(self, query):
        """Return the ids of the corpus bugs nearest to ``query``, excluding ``query`` itself.

        Args:
            query: bug mapping; must provide ``"id"`` plus whatever
                ``get_text`` reads.
        """
        # Remember the id first: `query` is rebound to its token/weight
        # representation below and can no longer be indexed by "id".
        query_id = query["id"]

        query = self.text_preprocess(self.get_text(query))
        indices, words = self._embedded_vocabulary(query)

        query = self._reindexed_tfidf(query, indices)
        documents = [
            self._reindexed_tfidf(document, indices) for document in self.corpus
        ]

        embeddings = self._embeddings(words)

        # One entry per non-empty document, keyed by its corpus position;
        # the first slot of every entry is left as None.
        nbow = {
            index: list(chain([None], zip(*document)))
            for index, document in enumerate(documents)
            if document != []
        }
        nbow["query"] = tuple([None] + list(zip(*query)))

        distances = WMD(embeddings, nbow, vocabulary_min=1).nearest_neighbors("query")

        return [
            self.bug_ids[distance[0]]
            for distance in distances
            if self.bug_ids[distance[0]] != query_id
        ]

    def get_distance(self, query1, query2):
        """Return the distance that ``WMD`` reports from ``query1`` to its nearest neighbour, with ``query2`` as the only candidate."""
        query1 = self.text_preprocess(self.get_text(query1))
        query2 = self.text_preprocess(self.get_text(query2))
        indices, words = self._embedded_vocabulary(query1, query2)

        query1 = self._reindexed_tfidf(query1, indices)
        query2 = self._reindexed_tfidf(query2, indices)

        embeddings = self._embeddings(words)

        nbow = {
            "query1": tuple([None] + list(zip(*query1))),
            "query2": tuple([None] + list(zip(*query2))),
        }

        distances = WMD(embeddings, nbow, vocabulary_min=1).nearest_neighbors("query1")

        return distances[0][1]
print(time.time() - start) # Create the term similarity matrix. similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf) print(time.time() - start) output_file = open('semantic_comparison.json', 'w+') output = [] for x in range(0, 300): query_string = qs[(int)(len(qs) * random.random())] # pick a random question query = preprocess(query_string) # Compute Soft Cosine Measure between the query and the documents. # From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb query_tf = tfidf[dictionary.doc2bow(query)] index = SoftCosineSimilarity( tfidf[[dictionary.doc2bow(document) for document in corpus]], similarity_matrix) doc_similarity_scores = index[query_tf] # Output the sorted similarity scores and documents sorted_indexes = np.argsort(doc_similarity_scores)[::-1] output_obj = {'question': query_string} answer_array = [] ticks = 0 for idx in sorted_indexes: ticks += 1 if ticks == 5:
import time

time_start = time.time()

# Data preprocessing: read the raw file, keep only the text fields of every
# line and tokenise them with jieba.
with open("toutiao_cat_data.txt", "r", encoding="utf-8") as f:
    # with open("test.txt","r",encoding="utf-8") as f:
    data = []
    for line in f.readlines():
        line = line.strip()  # strip leading/trailing whitespace
        line = ','.join(line.split("_!_")[3:])  # split on the "_!_" delimiter and drop the first three fields, which are not text content
        data.append(jieba.lcut(line))

# Text vectorisation
dictionary = Dictionary(data)  # build the dictionary from the tokenised documents
dictionary.filter_n_most_frequent(200)  # filter out the most frequent words
corpus = [dictionary.doc2bow(text) for text in data]  # convert to bag-of-words vectors

# Train the model
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)  # 10 topics requested

# Get the topic-word distributions
# NOTE(review): the positional argument of print_topics is presumably the
# number of topics to show, not the number of words per topic; confirm
# against the gensim API.
topic_list = lda.print_topics(20)
# print(topic_list)
for i in topic_list:
    print(i)


def pre(data):
    'Get the topic distribution of a document'
    print(data)
    doc_bow = dictionary.doc2bow(data)  # convert the document to bag-of-words
class DocDataset(Dataset):
    """Dataset of (tokenised text, dense bag-of-words tensor) pairs.

    Documents are read from a one-document-per-line text file. The vocabulary
    is read from ``./data/topic_model_vocab.txt`` (one token per line).
    Processed artefacts (``corpus.mm``, ``dict.txt``, ``docs.pkl`` and
    optionally ``tfidf.mm``) are cached under ``<cwd>/data/<taskname>/`` and
    reused on later runs unless ``rebuild`` is set.
    """

    def __init__(self, taskname, txtPath=None, lang="zh", tokenizer=None, stopwords=None,
                 no_below=5, no_above=0.0134, hasLable=False, rebuild=False, use_tfidf=False):
        """Load the cached corpus, or tokenise the raw text and build it.

        Args:
            taskname: name of the task; selects the cache directory and the
                default input file ``data/<taskname>_lines.txt``.
            txtPath: explicit path of the input file; overrides the default.
            lang: key into ``LANG_CLS`` used to pick a tokenizer class when
                ``tokenizer`` is None.
            tokenizer: object exposing ``tokenize(list_of_lines)``.
            stopwords: collection of stop words; read from
                ``data/stopwords.txt`` when None (rebuild path only).
            no_below, no_above, hasLable: accepted for interface
                compatibility; not used by the current implementation.
            rebuild: ignore an existing cache and recompute everything.
            use_tfidf: serve TF-IDF weights instead of raw counts.
        """
        cwd = os.getcwd()
        if txtPath is None:
            txtPath = os.path.join(cwd, 'data', f'{taskname}_lines.txt')
        tmpDir = os.path.join(cwd, 'data', taskname)

        # Context managers make sure every input file is closed again.
        with open(txtPath, 'r', encoding='utf-8') as fin:
            self.txtLines = [line.strip('\n') for line in fin]
        with open("./data/topic_model_vocab.txt", 'r', encoding='utf-8') as fin:
            # One single-token "document" per vocabulary entry.
            self.vob = [[line.strip('\n')] for line in fin]

        self.dictionary = None
        self.bows, self.docs = None, None
        self.use_tfidf = use_tfidf
        self.tfidf, self.tfidf_model = None, None

        # makedirs also creates a missing parent and tolerates an existing dir.
        os.makedirs(tmpDir, exist_ok=True)

        corpus_path = os.path.join(tmpDir, 'corpus.mm')
        dict_path = os.path.join(tmpDir, 'dict.txt')
        docs_path = os.path.join(tmpDir, 'docs.pkl')
        tfidf_path = os.path.join(tmpDir, 'tfidf.mm')

        if not rebuild and os.path.exists(corpus_path):
            # Cached artefacts are available: load them instead of recomputing.
            self.bows = gensim.corpora.MmCorpus(corpus_path)
            if self.use_tfidf:
                self.tfidf = gensim.corpora.MmCorpus(tfidf_path)
            self.dictionary = Dictionary.load_from_text(dict_path)
            # NOTE: pickle must only be used on trusted files; this one is the
            # cache written by the rebuild branch below.
            with open(docs_path, 'rb') as fin:
                self.docs = pickle.load(fin)
            # id2token is not filled in at this point, so derive it explicitly.
            self.dictionary.id2token = {v: k for k, v in self.dictionary.token2id.items()}
        else:
            if stopwords is None:
                stopwords_path = os.path.join(cwd, 'data', 'stopwords.txt')
                with open(stopwords_path, 'r', encoding='utf-8') as fin:
                    stopwords = set(l.strip('\n').strip() for l in fin)

            # self.txtLines is the list of raw strings, without preprocessing;
            # self.docs is the list of token lists.
            print('Tokenizing ...')
            if tokenizer is None:
                tokenizer = globals()[LANG_CLS[lang]](stopwords=stopwords)
            self.docs = tokenizer.tokenize(self.txtLines)
            self.docs = [line for line in self.docs if line != []]

            # Build the dictionary from the fixed vocabulary file.
            print("self.vob", len(self.vob), self.vob[0])
            self.dictionary = Dictionary(self.vob)
            # id2token is not filled in at this point, so derive it explicitly.
            self.dictionary.id2token = {v: k for k, v in self.dictionary.token2id.items()}

            # Convert to BOW, dropping documents with no in-vocabulary token.
            self.bows, kept_docs = [], []
            for doc in self.docs:
                bow = self.dictionary.doc2bow(doc)
                if bow != []:
                    kept_docs.append(list(doc))
                    self.bows.append(bow)
            self.docs = kept_docs
            if self.bows:
                print("bow", len(self.bows), self.bows[0], len(self.bows[0]))
            else:
                # No document survived: report it instead of raising IndexError.
                print("bow", 0)

            if self.use_tfidf:
                self.tfidf_model = TfidfModel(self.bows)
                self.tfidf = [self.tfidf_model[bow] for bow in self.bows]

            # Persist everything needed by the cached branch above.
            gensim.corpora.MmCorpus.serialize(corpus_path, self.bows)
            self.dictionary.save_as_text(dict_path)
            with open(docs_path, 'wb') as fout:
                pickle.dump(self.docs, fout)
            if self.use_tfidf:
                gensim.corpora.MmCorpus.serialize(tfidf_path, self.tfidf)

        self.vocabsize = len(self.dictionary)
        self.numDocs = len(self.bows)
        print(f'Processed {len(self.bows)} documents.')

    def __getitem__(self, idx):
        """Return ``(tokens, bow)`` for document ``idx``.

        ``bow`` is a float tensor of length ``vocabsize`` holding the count
        (or TF-IDF weight when ``use_tfidf`` is set) at each token id.
        """
        bow = torch.zeros(self.vocabsize)
        vector = self.tfidf[idx] if self.use_tfidf else self.bows[idx]
        # item = [(token_id1, token_id2, ...), (freq1, freq2, ...)]
        item = list(zip(*vector))
        if item:
            # An empty sparse vector leaves `bow` all zeros instead of
            # raising IndexError.
            bow[list(item[0])] = torch.tensor(list(item[1])).float()
        txt = self.docs[idx]
        return txt, bow

    def __len__(self):
        """Return the number of documents."""
        return self.numDocs

    def collate_fn(self, batch_data):
        """Collate ``(txt, bow)`` samples into ``(tuple of texts, stacked bow tensor)``."""
        texts, bows = list(zip(*batch_data))
        return texts, torch.stack(bows, dim=0)

    def __iter__(self):
        """Iterate over the tokenised documents."""
        for doc in self.docs:
            yield doc

    def show_dfs_topk(self, topk=20):
        """Print and return the ``topk`` tokens with the highest document frequency."""
        ndoc = len(self.docs)
        dfs_topk = sorted(
            [(self.dictionary.id2token[k], fq) for k, fq in self.dictionary.dfs.items()],
            key=lambda x: x[1],
            reverse=True,
        )[:topk]
        for i, (word, freq) in enumerate(dfs_topk):
            print(f'{i+1}:{word} --> {freq}/{ndoc} = {(1.0*freq/ndoc):>.13f}')
        return dfs_topk

    def show_cfs_topk(self, topk=20):
        """Print the ``topk`` tokens with the highest collection frequency."""
        ntokens = sum(self.dictionary.cfs.values())
        cfs_topk = sorted(
            [(self.dictionary.id2token[k], fq) for k, fq in self.dictionary.cfs.items()],
            key=lambda x: x[1],
            reverse=True,
        )[:topk]
        for i, (word, freq) in enumerate(cfs_topk):
            print(f'{i+1}:{word} --> {freq}/{ntokens} = {(1.0*freq/ntokens):>.13f}')

    def topk_dfs(self, topk=20):
        """Return the document-frequency ratio of the ``topk``-th most frequent token.

        The ranking is printed as a side effect of ``show_dfs_topk``.
        """
        ndoc = len(self.docs)
        dfs_topk = self.show_dfs_topk(topk=topk)
        return 1.0 * dfs_topk[-1][-1] / ndoc
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Both CSV files are read without a header row; the first column is the index.
pos_com = pd.read_csv('./data/pos_com.csv', header=None, index_col=0)
neg_com = pd.read_csv('./data/neg_com.csv', header=None, index_col=0)

# Positive reviews: split every comment on single spaces, build the
# dictionary and the bag-of-words corpus, then fit a 3-topic LDA model.
pos_com.columns = ['comment']
mid = list(pos_com['comment'].str.split(' '))
dictionary = Dictionary(mid)
bow = list(map(dictionary.doc2bow, mid))
# Model construction
pos_model = LdaModel(corpus=bow, id2word=dictionary, num_topics=3)
# The topic strings are requested but their return values are not kept.
for topic_id in range(3):
    pos_model.print_topic(topic_id)

# Negative reviews: same pipeline; `mid`, `dictionary` and `bow` are rebound.
neg_com.columns = ['comment']
mid = list(neg_com['comment'].str.split(' '))
dictionary = Dictionary(mid)
bow = list(map(dictionary.doc2bow, mid))
# Model construction
neg_model = LdaModel(corpus=bow, id2word=dictionary, num_topics=3)
for topic_id in range(3):
    neg_model.print_topic(topic_id)
def start(num_topics, kind,
          output_root="/home/marco/Scrivania/tirocinio-unicredit/lda-html"):
    """Run the LDA topic-modelling pipeline for one data set.

    Steps: load and clean the data, remove stop words and lemmatise with the
    per-language spaCy pipelines, train a gensim ``LdaMulticore`` model, print
    its topics, perplexity and coherence, write a pyLDAvis HTML report, and
    finally grid-search a scikit-learn ``LatentDirichletAllocation`` model.

    Args:
        num_topics: number of topics of the gensim model; also names the
            report file.
        kind: data set selector passed to ``loader.load_data``; also a
            component of the report path.
        output_root: directory under which ``<kind>/<date>/<num_topics>.html``
            is written.
    """
    data = loader.load_data(kind)
    df = pd.DataFrame(data)
    cleaner.clean(df)

    nlps = {
        'it': spacy.load('it_core_news_lg'),
        'en': spacy.load('en_core_web_lg'),
        'fr': spacy.load('fr'),
        'de': spacy.load('de'),
    }
    tokenizers = {lang: Tokenizer(nlp.vocab) for lang, nlp in nlps.items()}

    # Customize stop words: spaCy defaults of every language + project list.
    stop_words = set()
    for nlp in nlps.values():
        stop_words.update(nlp.Defaults.stop_words)
    stop_words.update(s.ALL_STOPWORDS)

    # ALL_STOP_WORDS = spacy + gensim + wordcloud
    ALL_STOP_WORDS = stop_words.union(SW).union(stopwords)

    cleaner.remove_stopwords(df, tokenizers, ALL_STOP_WORDS)
    cleaner.lemmas(df, nlps)
    tok.tokenize_text(df)

    # Create an id2word dictionary and print its size before/after pruning.
    id2word = Dictionary(df['lemma_tokens'])
    print(len(id2word))
    id2word.filter_extremes(no_below=2, no_above=.99)
    print(len(id2word))

    # Bag-of-words corpus.
    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

    # Base LDA model.
    base_model = LdaMulticore(corpus=corpus,
                              num_topics=num_topics,
                              id2word=id2word,
                              workers=12,
                              passes=5)

    # print_topics() yields (topic_id, description) pairs; the words are the
    # double-quoted parts of the description string.
    words = [re.findall(r'"([^"]*)"', t[1]) for t in base_model.print_topics()]

    # Keep at most ten words per topic.
    topics = [' '.join(t[0:10]) for t in words]

    for topic_id, t in enumerate(topics):
        print(f"------ Topic {topic_id} ------")
        print(t, end="\n\n")

    # Compute Perplexity
    # a measure of how good the model is. lower the better
    base_perplexity = base_model.log_perplexity(corpus)
    print('\nPerplexity: ', base_perplexity)

    # Compute Coherence Score
    coherence_model = CoherenceModel(model=base_model,
                                     texts=df['lemma_tokens'],
                                     dictionary=id2word,
                                     coherence='c_v')
    coherence_lda_model_base = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model_base)

    lda_display = pyLDAvis.gensim.prepare(base_model, corpus, id2word)
    d = pyLDAvis.display(lda_display)

    # Write the interactive report to <output_root>/<kind>/<today>/<num_topics>.html
    today = date.today()
    directory_path = f"{output_root}/{kind}/{today}/"
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
    with open(f"{directory_path}{num_topics}.html", 'w') as f:
        f.write(d.data)

    vectorizer = CountVectorizer()
    data_vectorized = vectorizer.fit_transform(df['lemmas_back_to_text'])

    # Define Search Param
    search_params = {
        'n_components': [10, 15, 20, 25, 30],
        'learning_decay': [.5, .7, .9],
    }

    # Grid search over the scikit-learn LDA implementation.
    # (A pasted GridSearchCV(...) repr used to follow the fit call. It built a
    # throw-away object with keyword arguments that newer scikit-learn
    # versions reject, so it has been removed.)
    lda = LatentDirichletAllocation()
    model = GridSearchCV(lda, param_grid=search_params)
    model.fit(data_vectorized)

    # Best Model
    best_lda_model = model.best_estimator_

    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score
    print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
def lda_topic_model(input_filename, keyword, size, *, num_topics,
                    iterations=50, passes=1, chunksize=2000, eval_every=10,
                    verbose=False, gamma_threshold=0.001, filter_no_below=5,
                    filter_no_above=0.5, filter_keep_n=100000,
                    open_browser=True):
    """Train an LDA model on pre-processed texts and save every artefact.

    The run logs its settings, loads the texts from ``input_filename``, builds
    and prunes the dictionary, writes the bag-of-words corpus as JSON, trains
    ``LdaMulticore``, saves the model, measures topic coherence and writes a
    pyLDAvis HTML report, which is opened in a browser when ``open_browser``
    is true. Every stage is wrapped in a ``TimeMeasure`` block.

    ``keyword`` becomes part of the output file names and must consist only of
    ``-``, ``_``, ``+``, digits and ASCII letters.
    """
    cl.section('LDA Topic Model Training')

    # (template, value) pairs, formatted and logged in this order.
    settings = (
        ('Keyword: %s', keyword),
        ('Data size: %d', size),
        ('Number of topics: %d', num_topics),
        ('Iterations: %d', iterations),
        ('Passes: %d', passes),
        ('Chunk size: %d', chunksize),
        ('Eval every: %s', eval_every),
        ('Verbose: %s', verbose),
        ('Gamma Threshold: %f', gamma_threshold),
        ('Filter no below: %d', filter_no_below),
        ('Filter no above: %f', filter_no_above),
        ('Filter keep n: %d', filter_keep_n),
    )
    for template, value in settings:
        cl.info(template % value)

    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', keyword)

    input_filename = data_source_file(input_filename)

    # Unique suffix shared by all files produced by this run.
    timestamp = time.strftime('%Y%m%d%H%M%S')
    description = '%s-%d-%d-%dx%d-%s' % (keyword, size, num_topics,
                                         iterations, passes, timestamp)

    if verbose:
        log_filename = log_file('ldalog-%s.log' % description)
        logging.basicConfig(
            format='%(asctime)s : %(levelname)s : %(message)s',
            level=logging.DEBUG,
            filename=log_filename,
        )
        cl.info('Writing logs into file: %s' % log_filename)

    with TimeMeasure('load_preprocessed_text'):
        # Only the second field of every record is used as the text.
        preprocessed_texts = [
            record[1] for record in file_read_json(input_filename)
        ]

    with TimeMeasure('gen_dict_corpus'):
        cl.progress('Generating dictionary and corpus...')

        dictionary = Dictionary(preprocessed_texts, prune_at=None)
        dictionary.filter_extremes(no_below=filter_no_below,
                                   no_above=filter_no_above,
                                   keep_n=filter_keep_n)
        dictionary.compactify()

        corpus = list(map(dictionary.doc2bow, preprocessed_texts))

        corpusfilename = model_file('ldacorpus-%s.json' % description)
        file_write_json(corpusfilename, corpus)
        cl.success('Corpus saved as: %s' % corpusfilename)

    with TimeMeasure('training'):
        cl.progress('Performing training...')

        training_options = dict(
            workers=N_WORKERS,
            id2word=dictionary,
            num_topics=num_topics,
            iterations=iterations,
            passes=passes,
            chunksize=chunksize,
            eval_every=eval_every,
            gamma_threshold=gamma_threshold,
            alpha='symmetric',
            eta='auto',
        )
        with NoConsoleOutput():
            ldamodel = LdaMulticore(corpus, **training_options)

        cl.success('Training finished.')

    with TimeMeasure('save_model'):
        modelfilename = 'ldamodel-%s' % description
        ldamodel.save(model_file(modelfilename))
        cl.success('Model saved as: %s' % modelfilename)

    with TimeMeasure('measure_coherence'):
        cl.progress('Measuring topic coherence...')
        measure_coherence(ldamodel, preprocessed_texts, corpus, dictionary)

    with TimeMeasure('vis_save'):
        cl.progress('Preparing visualization...')
        vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
        htmlfilename = report_file('ldavis-%s.html' % description)
        pyLDAvis.save_html(vis, htmlfilename)
        cl.success('Visualized result saved in file: %s' % htmlfilename)

    if open_browser:
        open_html_in_browser(htmlfilename)
def _build_corpus(sentences):
    """Return the bag-of-words vectors of ``sentences``.

    Every sentence's ``token`` attribute is split on whitespace. A dictionary
    is fitted on the resulting token lists and used to convert each of them
    with ``doc2bow``. The dictionary itself is not returned.
    """
    token_lists = [item.token.split() for item in sentences]
    vocabulary = Dictionary(token_lists)
    return list(map(vocabulary.doc2bow, token_lists))
# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# (The bare `data_lemmatized[:10]` preview expression was removed: outside a
# notebook it has no effect.)

data_words2 = list(sent_to_words(data_lemmatized))

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(data_words2)

# First pruning pass: drop words that occur in fewer than 10 documents or in
# more than 60% of the documents.
dictionary.filter_extremes(no_below=10, no_above=0.6)
# Second, stricter pass: drop words that occur in fewer than 20 documents or
# in more than 50% of the documents.
# NOTE(review): the first pass looks redundant given this one; confirm and
# keep only the thresholds that are actually intended.
dictionary.filter_extremes(no_below=20, no_above=0.5)

corpus = [dictionary.doc2bow(doc) for doc in data_words2]

vectorizer = CountVectorizer(analyzer='word',
                             min_df=10,                        # minimum required occurrences of a word
                             max_df=0.6,
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # tokens of at least 3 alphanumeric chars
                             # max_features=50000,             # max number of unique words
                             )

data_vectorized = vectorizer.fit_transform(data_lemmatized)

# Materialize the sparse data
data_dense = data_vectorized.todense()