class EnronCorpus(TextCorpus): def __init__(self, root_name, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None): """ Initialize the corpus. This scans through all the emails once, to determine the corpus vocabulary. (only the first `keep_words` most frequent words that appear in at least `no_below` documents are kept). """ self.root_name = root_name if dictionary is None: self.dictionary = Dictionary(self.get_texts()) self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words) else: self.dictionary = dictionary def get_texts(self, return_raw=False): """ Walk the file system, strip punctuation, normalize all numbers to be '2'. """ filenames = walk_os(self.root_name) opened_files = gen_open(filenames) stripped_files = strip_punct(opened_files) length = 0 for email in stripped_files: if len(email) > ARTICLE_MIN_CHARS: length += 1 print "Iteration: %i" % length yield tokenize(email) self.length = length # cache corpus length
def loadDictionary(fname, mapping_only=True): """ Load previously stored mapping between words and their ids. The result can be used as the `id2word` parameter for input to transformations. """ if mapping_only: result = {} for lineNo, line in enumerate(open(fname)): cols = line[:-1].split('\t') if len(cols) == 2: wordId, word = cols elif len(cols) == 3: wordId, word, dfs = cols else: raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip())) result[int(wordId)] = word # dfs not used else: result = Dictionary() for lineNo, line in enumerate(open(fname)): cols = line[:-1].split('\t') if len(cols) == 3: wordId, word, dfs = cols else: raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip())) wordId = int(wordId) result.token2id[word] = wordId result.dfs[wordId] = int(dfs) return result
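# A minimal sketch of the tab-separated format that loadDictionary() above expects:
# each line is "wordId<TAB>word" or "wordId<TAB>word<TAB>docFrequency". The file name
# and its contents here are made up purely for illustration.
with open('example_wordids.txt', 'w') as f:
    f.write("0\thuman\t2\n")
    f.write("1\tcomputer\t3\n")
    f.write("2\tsurvey\t1\n")

id2word = loadDictionary('example_wordids.txt')                        # {0: 'human', 1: 'computer', 2: 'survey'}
full_dict = loadDictionary('example_wordids.txt', mapping_only=False)  # gensim Dictionary with token2id and dfs filled in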
class WordCorpus(BaseCorpus): """\ Wrapper around a `gensim.corpora.dictionary.Dictionary`. This is a light-weight alternative to `CableCorpus` to create an initial word dictionary:: wd = WordCorpus() wd.add_text('ref-1', 'bla bla') # add more texts wd.dct.filter_extremes() corpus = CableCorpus('/my/directory/', wd.dct) corpus.add_text('ref-1', 'bla bla') # add more texts corpus.close() """ def __init__(self, dct=None, tokenizer=None): """\ Initializes the wrapper. `dct` An existing Dictionary or ``None`` if a new Dictionary should be created (default) `tokenizer` A tokenizer function or ``None``, see `BaseCorpus` """ super(WordCorpus, self).__init__(tokenizer) self.dct = Dictionary() if dct is None else dct def add_words(self, reference_id, words): self.dct.doc2bow(words, True)
def get_corpus_dictionary(): """Crafts a toy corpus and the dictionary associated.""" # Toy corpus. corpus = [ ['carrot', 'salad', 'tomato'], ['carrot', 'salad', 'dish'], ['tomato', 'dish'], ['tomato', 'salad'], ['car', 'break', 'highway'], ['highway', 'accident', 'car'], ['moto', 'break'], ['accident', 'moto', 'car'] ] dictionary = Dictionary(corpus) # Transforming corpus with dictionary. corpus = [dictionary.doc2bow(doc) for doc in corpus] # Building reverse index. for (token, uid) in dictionary.token2id.items(): dictionary.id2token[uid] = token return corpus, dictionary
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\
    Builds the word dictionary, bag-of-words corpus, and TF-IDF model for the
    cables found in `src` and saves them under `out_dir`.
    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
def create_dictionaries(train=None, test=None, model=None):
    '''
    Function does a number of jobs:
    1- Creates a word to index mapping
    2- Creates a word to vector mapping
    3- Transforms the Training and Testing Dictionaries
    '''
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            ''' Words become integers '''
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data[key] = new_txt
            return data

        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
def doc_to_gensim(doc, lemmatize=True, filter_stops=True, filter_punct=True, filter_nums=False): """ Convert a single ``spacy.Doc`` into a gensim dictionary and bag-of-words document. Args: doc (``spacy.Doc``) lemmatize (bool): if True, use lemmatized strings for words; otherwise, use the original form of the string as it appears in ``doc`` filter_stops (bool): if True, remove stop words from word list filter_punct (bool): if True, remove punctuation from word list filter_nums (bool): if True, remove numbers from word list Returns: :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`: integer word ID to word string mapping list((int, int)): bag-of-words document, a list of (integer word ID, word count) 2-tuples """ gdict = Dictionary() words = extract.words(doc, filter_stops=filter_stops, filter_punct=filter_punct, filter_nums=filter_nums) if lemmatize is True: gdoc = gdict.doc2bow((word.lemma_ for word in words), allow_update=True) else: gdoc = gdict.doc2bow((word.orth_ for word in words), allow_update=True) return (gdict, gdoc)
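# A usage sketch for doc_to_gensim() above, assuming spaCy is installed and the
# 'en_core_web_sm' model has been downloaded; the sample text is arbitrary.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("Machine learning is fun. Machine learning is also hard.")
gdict, gdoc = doc_to_gensim(doc, lemmatize=True, filter_stops=True)
print(gdict.token2id)   # word -> integer id mapping
print(gdoc)             # [(word_id, count), ...] bag-of-words for this document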
class CorpusOfMethodContents(TextCorpus):

    def __init__(self):
        self.mapMethodFQNtoIndex = {}
        self.methodFqns = []
        self.methodContents = []
        TextCorpus.__init__(self)

    def addDocument(self, methodFqn, words):
        if methodFqn not in self.mapMethodFQNtoIndex:
            self.methodFqns.append(methodFqn)
            # index of the entry just appended to self.methodFqns / self.methodContents
            self.mapMethodFQNtoIndex[methodFqn] = len(self.methodFqns) - 1
            self.methodContents.append(words)
            self.dictionary.doc2bow(words, allow_update=True)
        else:
            self.methodContents[self.mapMethodFQNtoIndex[methodFqn]] = words
            self.dictionary = Dictionary()
            self.dictionary.add_documents(self.get_texts())

    def getMethodContentsForFqn(self, fqn):
        if fqn in self.mapMethodFQNtoIndex.keys():
            return self.methodContents[self.mapMethodFQNtoIndex[fqn]]
        return None

    def get_texts(self):
        for content in self.methodContents:
            yield content
def create_dictionaries(model=None, combined=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}  # indices of all words with frequency above 10
        w2vec = {word: model[word] for word in w2indx.keys()}  # word vectors of all words with frequency above 10

        def parse_dataset(combined):
            ''' Words become integers '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)  # indices for the words in each sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print 'No data provided...'
def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None): ''' Builds all 3 dictionaries from splits. If provided, `save_pickle_tup` must be a 3-tuple of the picklefile names in the following order: (title, body, tags) If `save_pickle_tup[i]` is None, the corresponding dictionary will not be saved. ''' utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary() for eid in xrange(n): for row in row_stream(splits_template % eid): ID, title, body, tags = row utitledict.doc2bow(title.split(), allow_update=True) ubodydict.doc2bow(body.split(), allow_update=True) utagdict.doc2bow(tags.split(), allow_update=True) assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs print "Before filtering..." print "utitledict:", utitledict print "ubodydict:", ubodydict print "utagdict:", utagdict if save_pickle_tup: assert len(save_pickle_tup) == 3 if save_pickle_tup[0]: print "saving utitledict..." utitledict.save(save_pickle_tup[0]) if save_pickle_tup[1]: print "saving ubodydict..." ubodydict.save(save_pickle_tup[1]) if save_pickle_tup[2]: print "saving utagdict..." utagdict.save(save_pickle_tup[2]) return (utitledict, ubodydict, utagdict)
def create_dictionary(analyzed_items_path, dictionary_path=None): dictionary = Dictionary(iter_docs(analyzed_items_path)) if dictionary_path: dictionary.save(dictionary_path) return dictionary
def build_dictionary(self): documents = ReadThreads( self.board, input_dir=self.input_dir, file_type='phrases', return_func=lambda x, y: y.split()) dictionary = Dictionary(documents) dictionary.save(f'{self.board}.dictionary') return dictionary
def docs_to_gensim(spacy_docs, spacy_vocab, lemmatize=True, filter_stops=True, filter_punct=True, filter_nums=False): """ Convert multiple ``spacy.Doc`` s into a gensim dictionary and bag-of-words corpus. Args: spacy_docs (list(``spacy.Doc``)) spacy_vocab (``spacy.Vocab``) lemmatize (bool): if True, use lemmatized strings for words; otherwise, use the original form of the string as it appears in ``doc`` filter_stops (bool): if True, remove stop words from word list filter_punct (bool): if True, remove punctuation from word list filter_nums (bool): if True, remove numbers from word list Returns: :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`: integer word ID to word string mapping list(list((int, int))): list of bag-of-words documents, where each doc is a list of (integer word ID, word count) 2-tuples """ gdict = Dictionary() gcorpus = [] stringstore = StringStore() doc_freqs = Counter() for spacy_doc in spacy_docs: if lemmatize is True: bow = ((spacy_vocab[tok_id], count) for tok_id, count in spacy_doc.count_by(attrs.LEMMA).items()) else: bow = ((spacy_vocab[tok_id], count) for tok_id, count in spacy_doc.count_by(attrs.ORTH).items()) if filter_stops is True: bow = ((lex, count) for lex, count in bow if not lex.is_stop) if filter_punct is True: bow = ((lex, count) for lex, count in bow if not lex.is_punct) if filter_nums is True: bow = ((lex, count) for lex, count in bow if not lex.like_num) bow = sorted(((stringstore[lex.orth_], count) for lex, count in bow), key=itemgetter(0)) doc_freqs.update(tok_id for tok_id, _ in bow) gdict.num_docs += 1 gdict.num_pos += sum(count for _, count in bow) gdict.num_nnz += len(bow) gcorpus.append(bow) gdict.token2id = {s: i for i, s in enumerate(stringstore)} gdict.dfs = dict(doc_freqs) return (gdict, gcorpus)
class SublexicalizedCorpus(TextCorpus): def __init__(self, base_corpus, order=3, word_limit=None, clean_func=mahoney_clean, create_dictionary=True, n_proc=1): self.order = order self.clean_func = clean_func self.base_corpus = base_corpus self.word_limit = word_limit self.n_proc = n_proc super(SublexicalizedCorpus, self).__init__() self.dictionary = Dictionary() if create_dictionary: self.dictionary.add_documents(self.get_texts()) def get_texts(self): a_count = 0 t_count = 0 texts = ((text, self.clean_func, self.order) for text in self.base_corpus.get_texts()) pool = multiprocessing.Pool(self.n_proc) start = time.clock() prev = start for group in chunkize(texts, chunksize=10 * self.n_proc, maxsize=100): for tokens in pool.imap_unordered(process, group): a_count += 1 cur = time.clock() if cur - prev > 60: logging.info("Sublexicalized %d in %d seconds, %.0f t/s" % (t_count, cur - start, t_count*1. / (cur - start))) prev = cur t_count += len(tokens) yield tokens if self.word_limit and t_count > self.word_limit: break pool.terminate() end = time.clock() logging.info("Sublexicalizing %d finished in %d seconds, %.0f t/s" % (t_count, end - start, t_count*1. / (end - start))) self.length = t_count
def getDictionary(word_corpus, useSavedTill): if useSavedTill >= USESAVED.dictionary: common_logger.info("loading dictionary from file") dictionary = Dictionary.load(file_lda_gensim_dictionary) return dictionary else: common_logger.info("Creating dictionary from corpus") dictionary = Dictionary(word_corpus.values()) common_logger.info("saving dictionary") dictionary.save(file_lda_gensim_dictionary) return dictionary
def build_dictionary_from_splits(splits_template, column, n, save_pickle=None):
    ''' Build dictionary from splits. If `save_pickle` is provided, then save. '''
    unfiltered_dict = Dictionary()
    for eid in xrange(n):
        # read each split file from the provided template
        unfiltered_dict.add_documents(csv_isolator(splits_template % eid, column))
    print "Before filtering,", unfiltered_dict
    if save_pickle:
        print "\nsaving..."
        unfiltered_dict.save(save_pickle)

    return unfiltered_dict
class tip_rec:
    def __init__(self, num_topics = 15):
        self.numtopics = num_topics
        self.topic_dict = dict(enumerate(np.zeros(num_topics)))
        self.user_dict = {}
        self.model = None
        self.worddict = {}
        self.mydict = None

    def train(self, df):
        self.user_dict = {el: self.topic_dict.copy() for el in df.sender.unique()}
        cv = CV(stop_words='english')
        X = cv.fit_transform(df['context'])
        vocab = cv.vocabulary_.keys()
        self.worddict = dict([(i, s) for i, s in enumerate(vocab)])
        self.mydict = Dictionary()
        self.mydict = self.mydict.from_corpus(matutils.Sparse2Corpus(X, documents_columns=False), id2word=self.worddict)
        self.model = LatentDA.LdaModel(matutils.Sparse2Corpus(X, documents_columns=False), num_topics=self.numtopics, passes=20, id2word=self.worddict)
        for i in df.iterrows():
            if i[1]['context'] == '':
                continue
            else:
                # score each tip's text against the trained topic model
                values = self.model[self.mydict.doc2bow(i[1]['context'].split())]
                for val in values:
                    if val[0] in self.user_dict[i[1].sender].keys():
                        if i[1].amt == '':
                            continue
                        self.user_dict[i[1].sender][val[0]] += val[1] * float(i[1].amt)
                        continue
                    self.user_dict[i[1].sender][val[0]] = val[1]
        # normalize each user's topic affinities so they sum to 1
        for i in self.user_dict.keys():
            norm_const = sum(self.user_dict[i].values())
            for j in self.user_dict[i].keys():
                self.user_dict[i][j] = self.user_dict[i][j] / norm_const

    def predict(self, text, username = ''):
        topics = self.model[self.mydict.doc2bow(text.split())]
        doc_aff = np.zeros(self.numtopics)
        for i in topics:
            doc_aff[i[0]] = i[1]
        if username == '':
            returndict = {}
            for user in self.user_dict.keys():
                user_aff = np.array(self.user_dict[user].values())
                score = np.linalg.norm(user_aff - doc_aff)
                returndict[user] = score
            return returndict
        else:
            user_aff = np.array(self.user_dict[username].values())
            score = np.linalg.norm(user_aff - doc_aff)
            return (username, score)
def _load_vocab(self,fname): logging.info("loading plain-text file:{}".format(fname)) src_file = codecs.open(fname, 'rb', 'utf-8') dictionary = Dictionary() num_instances = 0 for term in src_file: dictionary.doc2bow(term.strip().lower().encode('utf-8').split(), allow_update=True) num_instances += 1 logging.info("processed {} instances".format(num_instances)) self.dictionary = dictionary
def __init__(self, fname, dictionary=None): """ Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary. """ self.fname = fname self.metadata = False if dictionary is None: dictionary = Dictionary() for text in self.get_texts(): dictionary.add_documents([text]) self.dictionary = dictionary
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None): """ :rtype : gensim.corpora.dictionary.Dictionary :param corpora: :param stopwords: :param allowed_pos: :param max_doc: :return: """ logging.info('Lemmatizing the corpora...') count = 0 corpus_num = len(corpora) processed_corpora = [] corpus_id2orig_id = [] for index, corpus in corpora.items(): count += 1 if count > max_doc: break if corpus is None: # skip if corpus is None continue print '\r', count, '/', corpus_num, cleaned_corpus = clean_text(corpus) # delete irrelevant characters corpus = [] tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos) for token in tokens: word, pos = token.split('/') corpus.append(word) # convert compound word into one token corpus = convert_compound(corpus) # filter stop words, long words, and non-english words corpus = [w for w in corpus if not w in stopwords and 2 <= len(w) <= 15 and w.islower()] processed_corpora.append(corpus) corpus_id2orig_id.append(index) print '\n' logging.info('Creating dictionary and corpus...') dictionary = Dictionary(processed_corpora) dictionary.corpus_id2orig_id = corpus_id2orig_id logging.info('Filtering unimportant terms...') dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n) dictionary.compactify() logging.info('Generating corpus...') dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora] dictionary.id2token = revdict(dictionary.token2id) return dictionary
def merge_dictionaries(dictionaries_path, merged_dictionary_path=None): dict_paths = list(iglob(dictionaries_path)) final_dictionary = Dictionary.load(dict_paths[0]) for dict_path in dict_paths[1:]: dictionary = Dictionary.load(dict_path) final_dictionary.merge_with(dictionary) if merged_dictionary_path: final_dictionary.save(merged_dictionary_path) return final_dictionary
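# The same merge can be sketched in memory without the glob/load machinery above;
# Dictionary.merge_with() maps the second dictionary's ids onto the first one's.
# The toy documents here are assumptions for illustration only.
from gensim.corpora.dictionary import Dictionary

dict_a = Dictionary([['cat', 'dog'], ['dog', 'mouse']])
dict_b = Dictionary([['dog', 'bird']])
transformer = dict_a.merge_with(dict_b)   # dict_a now also contains 'bird'
print(dict_a.token2id)
# `transformer` remaps a bag-of-words built with dict_b into dict_a's id space:
print(transformer[dict_b.doc2bow(['dog', 'bird'])])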
def analyze_top_dfs(tokendict, tagdict, cutoff_factor=1):
    ''' Provided gensim-dicts `tokendict` and `tagdict`, show the top word frequencies. '''
    if type(tokendict) == str:
        tokendict = Dictionary.load(tokendict)
    if type(tagdict) == str:
        tagdict = Dictionary.load(tagdict)

    max_tag_df = max(tagdict.dfs.iteritems(), key=operator.itemgetter(1))
    sorted_dfs = sorted(tokendict.dfs.iteritems(), key=operator.itemgetter(1), reverse=True)
    print "count threshold: %-15s\t%d" % (tagdict[max_tag_df[0]], max_tag_df[1])
    print "----------------------------------------------"
    for tup in sorted_dfs[:100]:
        if tup[1] > max_tag_df[1] * cutoff_factor:
            print "%-15s\t%d" % (tokendict[tup[0]][:15], tup[1])
        else:
            break
def build_dictionary(self):
    logging.debug('=' * 20)
    logging.debug('First, build the training-set dictionary, then map each word to an index, and map every sentence to a list of indices')
    # Build the training-set dictionary
    # Split every sentence in the training set into a list of tokens, forming a 2D training document where each unit is a token,
    # e.g.: [['今年','你','多少岁'],['你', '二十四','小时','在线','吗'],...]
    train_document = map(lambda x: x.split(), self.__seg_sentence__)
    gensim_dict = Dictionary.from_documents(train_document)
    # Update the dictionary by adding a special symbol to it, where
    # UNKOWN stands for unknown characters, i.e. OOV words
    gensim_dict.add_documents([[u'UNKOWN']])
    logging.debug('Updated the dictionary with the special symbol (UNKOWN); dictionary size is now: %d' % (len(gensim_dict.keys())))
    # print 'Updated the dictionary with the special symbol; dictionary size is now: %d' % (len(gensim_dict.keys()))
    self.__gensim_dict__ = gensim_dict
    self.__vocabulary_size__ = len(gensim_dict.keys())
    logging.debug('Training-set dictionary size: %d' % (self.__vocabulary_size__))
    print 'Training-set dictionary size: %d' % self.__vocabulary_size__
    logging.debug(u'Dictionary contains: %s' % (','.join(gensim_dict.token2id.keys())))
    print u'Dictionary contains: %s' % (','.join(gensim_dict.token2id.keys()))
    # word2embedding = {}
    # unknow_token_index = self.__gensim_dict__.token2id[u'UNKOWN']
    embedding_weights = np.zeros((self.__vocabulary_size__ + 1, self.__word_embedding_length__))
    for key, value in gensim_dict.token2id.items():
        embedding_weights[value, :] = self.get_w2vEmbedding(key)  # todo: build the word-vector dictionary
    self.__embedding_weights__ = embedding_weights
def main(args): if args.corpus_type != "wiki": if args.processed_corpus_save_path is not None: raise ValueError("Processed corpus saving only supported " "for 'wiki' corpus type") kwargs = {} if args.dictionary_path is not None: kwargs["dictionary"] = Dictionary.load(args.dictionary_path) if args.dictionary_out_path is not None: kwargs["dictionary_save_path"] = args.dictionary_out_path if args.corpus_type == "wiki" and args.processed_corpus_save_path is not None: kwargs["sentences_save_path"] = args.processed_corpus_save_path logging.debug("Building corpus") corpus = CORPUS_TYPES[args.corpus_type](args.corpus_path, **kwargs) documents = corpus.get_texts() logging.debug("Now beginning VSM construction with Word2Vec") model = Word2Vec( sentences=documents, vocab_path=args.vocab_path, window=args.window_size, drop_capitals=args.drop_capitals, min_count=args.minimum_token_count, size=args.vector_dimensions, workers=multiprocessing.cpu_count(), ) model.save(args.out_path) if args.vocab_out_path is not None: model.save_vocab(args.vocab_out_path)
def user_lda(lda, dictionary_path, textyielder):
    id2word = Dictionary.load_from_text(dictionary_path)
    ret = {}
    for user, text in textyielder():
        bow = id2word.doc2bow(UserCorpus.text2tokens(text))
        ret[user] = lda[bow]
    return ret
def __init__(self, corpus_file):
    """
    Args:
        corpus_file -- corpus file; the first column is the category, the remaining columns are all tags
    """
    corpus = []
    categories = []
    self._category_distribution = {}  # number of samples per category
    self._words_cate = {}  # for each word (tag/feature), the number of samples per category
    self._words_sample_count = {}
    self._info_gain = {}
    with open(corpus_file, 'r') as documents:
        for line in documents:
            words = line.strip().split()
            if len(words) <= 1:
                continue
            categories.append(words[0])
            corpus.append(words[1:])
            if words[0] not in self._category_distribution:
                self._category_distribution[words[0]] = 0
            self._category_distribution[words[0]] += 1
            # count co-occurrences of words (tags/features) and categories, used to compute the conditional entropy
            for word in set(words[1:]):
                if word not in self._words_cate:
                    self._words_cate[word] = {}
                    self._words_sample_count[word] = 0
                if words[0] not in self._words_cate[word]:
                    self._words_cate[word][words[0]] = 0
                self._words_cate[word][words[0]] += 1
                self._words_sample_count[word] += 1
    self._common_dictionary = Dictionary(corpus)
    self._corpus = corpus
    self._categories = categories
def __init__(self, topics = 10, worker = 3, pretrained_model = None, dictionary = None):
    """
    Initialize LDA model training.

    Args:
        topics -- number of topics
        worker -- parallelism parameter, usually the number of cores minus one
        pretrained_model -- pre-trained model; online updates are supported, so the model from a previous training run can be loaded
        dictionary -- words have to be converted to IDs for training, so the model is paired with a dictionary holding the ID mapping
    Example:
        >>> lda = LDA(topics = 20, worker = 2, pretrained_model = model_file, dictionary = dictionary_file)
        >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
        >>> lda.update(corpus)
        >>> lda.save(model_file, dictionary_file)
        >>> topics = lda.inference(['word5', 'word6'])
    """
    self._topics = topics
    self._workers = worker
    self._model = None
    self._common_dictionary = None
    # load a previously trained model and its dictionary when both are provided
    if pretrained_model and dictionary:
        self._model = LdaModel.load(pretrained_model)
        self._common_dictionary = Dictionary.load(dictionary)
def train(self, df):
    self.user_dict = {el: self.topic_dict.copy() for el in df.sender.unique()}
    cv = CV(stop_words='english')
    X = cv.fit_transform(df['context'])
    vocab = cv.vocabulary_.keys()
    self.worddict = dict([(i, s) for i, s in enumerate(vocab)])
    self.mydict = Dictionary()
    self.mydict = self.mydict.from_corpus(matutils.Sparse2Corpus(X, documents_columns=False), id2word=self.worddict)
    self.model = LatentDA.LdaModel(matutils.Sparse2Corpus(X, documents_columns=False), num_topics=self.numtopics, passes=20, id2word=self.worddict)
    for i in df.iterrows():
        if i[1]['context'] == '':
            continue
        else:
            # score each tip's text against the trained topic model
            values = self.model[self.mydict.doc2bow(i[1]['context'].split())]
            for val in values:
                if val[0] in self.user_dict[i[1].sender].keys():
                    if i[1].amt == '':
                        continue
                    self.user_dict[i[1].sender][val[0]] += val[1] * float(i[1].amt)
                    continue
                self.user_dict[i[1].sender][val[0]] = val[1]
    # normalize each user's topic affinities so they sum to 1
    for i in self.user_dict.keys():
        norm_const = sum(self.user_dict[i].values())
        for j in self.user_dict[i].keys():
            self.user_dict[i][j] = self.user_dict[i][j] / norm_const
def plot_dict_hist(gdict): ''' Provided gensim-dict `gdict`, plot hist statistics ''' if type(gdict) == str: gdict = Dictionary.load(gdict) sorted_dfs = sorted(gdict.dfs.iteritems(), key=operator.itemgetter(1), reverse=True) y = [tup[1] for tup in sorted_dfs] x = arange(0, len(y)) plt.figure(figsize=(8,5)); plt.loglog(x, y); plt.grid(); plt.xlabel("Token rank"); plt.ylabel("Document count"); cdf = np.empty(len(y)) delta(y, cdf) cdf /= np.max(cdf) # normalize x50 = x[cdf > 0.50][0] x80 = x[cdf > 0.80][0] x90 = x[cdf > 0.90][0] x95 = x[cdf > 0.95][0] plt.axvline(x50, color='c'); plt.axvline(x80, color='g'); plt.axvline(x90, color='r'); plt.axvline(x95, color='k'); print "50%\t", x50 print "80%\t", x80 print "90%\t", x90 print "95%\t", x95
def __init__(self, input=None,create_dictionary=True): super(DefaultJsonCorpus, self).__init__() self.input = input self.dictionary = Dictionary() self.metadata = False if create_dictionary: self.dictionary.add_documents(self.get_texts())
def create_dictionaries(model=None, combined=None):
    '''
    Function does a number of jobs:
    1- Creates a word to index mapping
    2- Creates a word to vector mapping
    3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        # words with frequency below 10 map to 0, hence k+1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # indices of all words with frequency above 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vectors of all words with frequency above 10, (word->model(word))

        def parse_dataset(combined):  # closure --> used temporarily
            ''' Words become integers '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # words with frequency below 10 -> 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined,
            maxlen=maxlen)  # indices for the words in each sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def create_dictionaries(model=None, combined=None):
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        # words with frequency below 10 map to 0, hence k+1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # indices of all words with frequency above 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vectors of all words with frequency above 10, (word->model(word))

        def parse_dataset(combined):  # closure --> used temporarily
            ''' Words become integers '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # words with frequency below 10 -> 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined,
            maxlen=maxlen)  # indices for the words in each sentence; words with frequency below 10 get index 0
        f12.write(str(combined))
        f12.write('\n')
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def main(): articles_path = '/texts_corrected/*.txt' stopword_path = '/stopwords.txt' resultspath = '/results/' location_path = '/locations.txt' tot_topic_vectors_path = resultspath + 'time200msc_topic_vectors_beta0_1.csv' tot_topic_mixtures_path = resultspath + 'time200msc_topic_mixtures_beta0_1.csv' tot_topic_shapes_path = resultspath + 'time200msc_topic_shapes_beta0_1.csv' tot_pickle_path = resultspath + 'time200iter_beta0_1.pickle' coherence_pickle_path = resultspath + 'coherence.pickle' seed_file = resultspath + '/seedwords.txt' tot = stot_model() articles,date,vocab = tot.initDataset(articles_path, stopword_path, location_path) ##save variable for coherence measures dictionary = Dictionary(articles) corpus = [dictionary.doc2bow(article) for article in articles] coherence_pickle = open(coherence_pickle_path, 'wb') pickle.dump(dictionary, coherence_pickle) pickle.dump(corpus, coherence_pickle) coherence_pickle.close() #resume with modelling process tot.init_seedwords(seed_file, vocab) param = tot.initParam(articles, date, vocab) theta,phi,psi = tot.TopicsOverTimeGibbsSampling(param) np.savetxt(tot_topic_vectors_path, phi, delimiter=',') np.savetxt(tot_topic_mixtures_path, theta, delimiter=',') np.savetxt(tot_topic_shapes_path, psi, delimiter=',') tot_pickle = open(tot_pickle_path, 'wb') pickle.dump(param, tot_pickle) tot_pickle.close()
def create_dictionaries(model=None, combined=None):
    '''
    This function does 3 things:
    1- Creates a word to index mapping
    2- Creates a word to word-vector mapping
    3- Transforms the training set and test set dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        # words with frequency below 10 -> 0, hence v -> k+1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # indices of all words with frequency above 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vectors of all words with frequency above 10, (word->model(word))

        def parse_dataset(combined):  # closure --> used temporarily; converts the words in combined to their indices
            ''' Words become integers '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # words with frequency below 10 -> 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined,
            maxlen=maxlen)  # indices for the words in each sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def buildDic(self, model=None, words=None):
    '''
    Build the dictionary.
    :param model: the word2vec model
    :param words: all text content after jieba word segmentation
    :return: the index of every word (word -> index), the word vectors (word -> vector),
             and the word indices for each sentence (positional indices)
    '''
    if (model is not None) and (words is not None):
        # initialize a dictionary
        dict = Dictionary()
        # model.vocab.keys() holds all words in the word2vec model; with allow_update=True each occurrence of a word bumps its frequency by one
        # convert to a bag-of-words model
        dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # rebuild the mapping: key is the word, value is the word's index; here k is the index and v is a word contained in the dictionary
        w2indx = {v: k + 1 for k, v in dict.items()}
        # key is the word, value is the corresponding word vector
        w2vec = {word: model[word] for word in w2indx.keys()}

        # get the word indices for a sentence
        def parseDataset(words):
            data = []
            for sentence in words:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parseDataset(words)
        # pad the variable-length sequences to a uniform dimension
        combined = sequence.pad_sequences(combined, maxlen=self.maxlen)
        return w2indx, w2vec, combined
    else:
        print("Failed to load the model or the data")
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): """Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary. Parameters ---------- fname : str Path to file with wikipedia dump. processes : int, optional Number of processes to run, defaults to **number of cpu - 1**. lemmatize : bool Whether to use lemmatization instead of simple regexp tokenization. Defaults to `True` if *pattern* package installed. dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional Dictionary, if not provided, this scans the corpus once, to determine its vocabulary (this needs **really long time**). filter_namespaces : tuple of str Namespaces to consider. tokenizer_func : function, optional Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`. Need to support interface: tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str. article_min_tokens : int, optional Minimum tokens in article. Article will be ignored if number of tokens is less. token_min_len : int, optional Minimal token length. token_max_len : int, optional Maximal token length. lower : bool, optional If True - convert all text to lower case. """ self.fname = fname self.filter_namespaces = filter_namespaces self.metadata = False if processes is None: processes = max(1, multiprocessing.cpu_count() - 1) self.processes = processes self.lemmatize = lemmatize self.tokenizer_func = tokenizer_func self.article_min_tokens = article_min_tokens self.token_min_len = token_min_len self.token_max_len = token_max_len self.lower = lower self.dictionary = dictionary or Dictionary(self.get_texts())
def _load(self, tfidf_path, dictionary_path): """ If specified, attempts to load gensim TfidfModel from `tfidf_path` and gensim Dictionary from `dictionary_path`. Parameters ---------- tfidf_path: str File-path designating where self.tfidf should be saved. dictionary_path: str File-path designating where self.dictionary should be saved. """ from gensim.models import TfidfModel from gensim.corpora.dictionary import Dictionary if not os.path.exists(tfidf_path): raise IOError( 'The provided file path to the TfidfModel was not found.' 'Please ensure that the argument is the correct path.') if not os.path.exists(dictionary_path): raise IOError( 'The provided file path to the Dictionary was not found.' 'Please ensure that the argument is the correct path.') self.tfidf = TfidfModel().load(tfidf_path) self.dictionary = Dictionary().load(dictionary_path)
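# For completeness, a hedged sketch of the matching save step that would produce the
# files _load() above reads back; the paths and toy documents are placeholders.
from gensim.models import TfidfModel
from gensim.corpora.dictionary import Dictionary

docs = [['sample', 'tokens'], ['more', 'sample', 'tokens']]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
tfidf = TfidfModel(corpus, id2word=dictionary)
tfidf.save('tfidf.model')           # later read back via TfidfModel().load('tfidf.model')
dictionary.save('dictionary.dict')  # later read back via Dictionary().load('dictionary.dict')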
def train(docs): num_topics = lda_cfg("topics") epochs = lda_cfg("epochs") label = f'{datetime.now().isoformat(".", timespec="minutes")}({num_topics}-topics,{epochs}-epochs)' log_path = config("path.lda-log").format(label) os.makedirs(os.path.dirname(log_path), exist_ok=True) logging.basicConfig(filename=log_path, format='%(asctime)s : %(levelname)s : %(message)s', datefmt='%H:%M:%S', level=logging.INFO) dictionary = Dictionary(docs) dictionary.filter_extremes(no_below=lda_cfg("word-extremes.min-count"), no_above=lda_cfg("word-extremes.max-freq")) corpus = [dictionary.doc2bow(doc) for doc in docs] model = LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics, passes=epochs, eval_every=lda_cfg.dict_like.get("eval-every"), chunksize=lda_cfg("chunk-size")) return label, model, dictionary, corpus
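# A minimal sketch of the same pipeline without the lda_cfg/config helpers used above,
# with hypothetical toy documents and parameter values, to show the core gensim calls
# (train() above additionally applies Dictionary.filter_extremes and logging setup).
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore

docs = [['topic', 'model', 'text'], ['text', 'mining', 'corpus'], ['topic', 'corpus', 'words']]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
model = LdaMulticore(corpus, id2word=dictionary, num_topics=2, passes=5)
for topic_id, words in model.print_topics():
    print(topic_id, words)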
def __init__(self, pages_gen, processes=None, lemmatize=utils.has_pattern(), dictionary=None): self.pages_gen = pages_gen self.metadata = False if processes is None: processes = max(1, multiprocessing.cpu_count() - 1) self.processes = processes self.lemmatize = lemmatize if dictionary is None: self.dictionary = Dictionary(self.get_texts()) else: self.dictionary = dictionary
def vect2gensim(vectorizer, dtmatrix):
    # transform sparse matrix into gensim corpus and dictionary
    start = time()
    corpus_vect_gensim = gensim.matutils.Sparse2Corpus(dtmatrix, documents_columns=False)
    dictionary = Dictionary.from_corpus(
        corpus_vect_gensim,
        id2word=dict((id, word) for word, id in vectorizer.vocabulary_.items()))
    end = time()
    print(
        "Transform vector model to gensim format ... done in {0:0.3f} milliseconds"
        .format((end - start) * 1000))

    return (corpus_vect_gensim, dictionary)
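# A usage sketch for vect2gensim() above with scikit-learn's CountVectorizer; the
# sample documents are assumptions for illustration.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat", "the dog sat on the log", "cats and dogs"]
vectorizer = CountVectorizer()
dtmatrix = vectorizer.fit_transform(docs)    # sparse document-term matrix
corpus, dictionary = vect2gensim(vectorizer, dtmatrix)
print(dictionary)                            # gensim Dictionary built from the sklearn vocabulary
print(list(corpus)[0])                       # first document as [(word_id, count), ...]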
def main(): logformat = '%(asctime)s %(name)-12s: %(message)s' logging.basicConfig(level=logging.DEBUG, format=logformat) kera = NOB_kera() es = Elasticsearch(port=9201) mod = LdaModel.load(modelfile) vocab = Dictionary.load(vocabulary) tfidf = TfidfModel(dictionary=vocab) results = [] for (topics, topicid) in get_doc_topics(mod, mod.num_topics, num_words_from_topic, vocab, tfidf): res = es.search(index='wiki4', body={"query": {"match": {"_all": topics}}}, size=num_results_from_es) results.append({'topics': topics, 'result': res, 'topicid': topicid}) results = add_keywords(results, kera) df = pd.DataFrame(results) df.to_csv('nowiki_4_with_kera_250_topics.csv', encoding='utf-8')
def load_data(): '''this function loads up the already processed data with all of the nested lists properly reformatted as lists, and loads up the dictionaries''' df = pd.read_csv('data/processed_full.tsv', sep='\t') df['english_tokens'] = df['english_tokens'].apply( lambda x: x.strip("['']").split("', '")) df['french_tokens'] = df['french_tokens'].apply( lambda x: x.strip("['']").split("', '")) df['english_bow'] = df['english_bow'].apply(str_to_int) df['french_bow'] = df['french_bow'].apply(str_to_int) df['english_padded'] = df['english_padded'].apply(str_to_int) df['french_padded'] = df['french_padded'].apply(str_to_int) df = df.drop('Unnamed: 0', axis=1) eng = Dictionary.load('data/Dictionaries/eng') fren = Dictionary.load('data/Dictionaries/fren') # create ML data X_eng = np.vstack(df['english_padded'].values) y_fren = np.vstack(df['french_padded'].values) y_fren = y_fren.reshape(*y_fren.shape, 1) X_eng = X_eng.reshape(*X_eng.shape, 1) return df, eng, fren, X_eng, y_fren
def create_dictionaries(model=None, combined=None): if (combined is not None) and (model is not None): gensim_dict = Dictionary() gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True) w2indx = {v: k + 1 for k, v in gensim_dict.items()} w2vec = {word: model[word] for word in w2indx.keys()} def parse_dataset(combined): data = [] for sentence in combined: new_txt = [] for word in sentence: try: new_txt.append(w2indx[word]) except: new_txt.append(0) data.append(new_txt) return data combined = parse_dataset(combined) combined = sequence.pad_sequences(combined, maxlen=maxlen) return w2indx, w2vec, combined else: print('No data provided...')
def create_dictionaries(cls, model=None, combined=None):
    """
    Function does a number of jobs:
    1- Creates a word to index mapping
    2- Creates a word to vector mapping
    3- Transforms the Training and Testing Dictionaries
    """

    def _parse_dataset(sentences):
        """Words become integers
        Represent every word in each sentence by the index of a word that has a
        word vector; if a word does not appear in the index, mark it as 0.
        """
        data = []
        for sentence in sentences:
            new_txt = []
            for word in sentence:
                try:
                    new_txt.append(w2indx[word])
                except KeyError:
                    new_txt.append(0)
            data.append(new_txt)
        return data

    if combined is not None and model is not None:
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # indices of all words with frequency above 5
        w2vec = {word: model[word] for word in w2indx.keys()}  # word vectors of all words with frequency above 5
        combined = _parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=cls.maxlen)  # indices for the words in each sentence; words with frequency below 5 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def create_dictionaries(model=None, combined=None):
    # Build the dictionaries: 1- create a word to index mapping 2- create a word to vector mapping 3- transform the training and testing dictionaries
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word => index
        f = open("../model/word2index.txt", 'w', encoding='utf8')  # this is where the word2index.txt file gets generated
        for key in w2indx:
            f.write(str(key))
            f.write(' ')
            f.write(str(w2indx[key]))
            f.write('\n')
        f.close()
        w2vec = {word: model[word] for word in w2indx.keys()}  # word => vector

        def parse_dataset(combined):  # parse the dataset; a closure (function inside a function) for temporary use
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data  # word => index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)  # words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def create_dictionaries(model=None, combined=None):
    '''
    Function does a number of jobs:
    1- Creates a word to index mapping
    2- Creates a word to vector mapping
    3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # freq < 10 -> 0, so k+1
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # all indices of words with freq > 10, (k->v)=>(v->k)
        w2vec = {word: model[word] for word in w2indx.keys()}  # all word vectors of words with freq > 10, (word->model(word))

        def parse_dataset(combined):
            ''' Words become integers '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # freq < 10 -> 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)  # index for every word in every sentence; when freq < 10, index = 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
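# A hedged sketch of how the w2indx/w2vec mappings returned above are typically turned
# into an embedding weight matrix for a Keras Embedding layer; the helper name,
# `vocab_dim`, and the variable names are assumptions, not part of the original code.
import numpy as np

def build_embedding_weights(w2indx, w2vec, vocab_dim):
    n_symbols = len(w2indx) + 1                    # +1 because index 0 is reserved for out-of-vocabulary words
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    for word, index in w2indx.items():
        embedding_weights[index, :] = w2vec[word]  # row `index` holds the vector for `word`
    return embedding_weights

# The resulting matrix is usually handed to an embedding layer, e.g.
# keras.layers.Embedding(input_dim=n_symbols, output_dim=vocab_dim,
#                        weights=[embedding_weights], input_length=maxlen)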
def create_dictionaries(model=None, combined=None):
    '''
    Function does a number of jobs:
    1- Creates a word to index mapping
    2- Creates a word to vector mapping
    3- Transforms the Training and Testing Dictionaries
    :param model:
    :param combined:
    :return:
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(list(model.wv.vocab.keys()), allow_update=True)
        w2indx = {v: k + 1 for k, v in list(gensim_dict.items())}  # indices of all words with frequency above 10
        w2vec = {word: model[word] for word in list(w2indx.keys())}  # word vectors of all words with frequency above 10

        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # indices for the words in each sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def transform_data(model, x_train, y_train, x_test, y_test): gensim_dict = Dictionary() gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True) w2indx = {v: k + 1 for k, v in gensim_dict.items()} w2vec = {word: model[word] for word in w2indx.keys()} def parse_data(x, y): for key in range(len(y)): txt = x[key].lower().replace('\n', '').split() new_txt = [] for word in txt: try: new_txt.append(w2indx[word]) except: new_txt.append(0) x[key] = new_txt return x, y x_train, y_train = parse_data(x_train, y_train) x_test, y_test = parse_data(x_test, y_test) return w2indx, w2vec, x_train, y_train, x_test, y_test
def get_vocab(tweets=None): if 'vocab_sentiment' in os.listdir('.'): if not tweets: print("Loading vocabulary...") vocab = Dictionary.load('vocab_sentiment') print("Loaded vocabulary") return vocab response = input('Vocabulary found. Do you want to load it? (Y/n)'\ ': ') if response.lower() in ['n', 'no', 'nah', 'nono', 'nahi', 'nein']: if not tweets: tweets, labels = export() del labels return create_vocab(tweets) else: print("Loading vocabulary...") vocab = Dictionary.load('vocab_sentiment') print("Loaded vocabulary") return vocab else: if not tweets: tweets, labels = export() del labels return create_vocab(tweets)
def main(): texts = [ ['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey'] ] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] goodLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=2) badLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=1, num_topics=2) goodcm = CoherenceModel(model=goodLdaModel, texts=texts, corpus=corpus, dictionary=dictionary, coherence='c_v') badcm = CoherenceModel(model=badLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass') print(goodcm.get_coherence()) print(badcm.get_coherence())
def create_dictionaries(train=None, test=None, model=None): if (train is not None) and (model is not None) and (test is not None): gensim_dict = Dictionary() gensim_dict.doc2bow(model.vocab.keys(), allow_update=True) w2indx = {v: k + 1 for k, v in gensim_dict.items()} w2vec = {word: model[word] for word in w2indx.keys()} def parse_dataset(data): for key in data.keys(): txt = data[key].lower().replace('\n', '').split() new_txt = [] for word in txt: try: new_txt.append(w2indx[word]) except: new_txt.append(0) data[key] = new_txt return data train = parse_dataset(train) test = parse_dataset(test) return w2indx, w2vec, train, test else: print('No data provided...')
def create_dictionaries(model=None, combined=None):
    '''
    Function does a number of jobs:
    1- Creates a word to index mapping
    2- Creates a word to vector mapping
    3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # the index of a word which has a word vector is not 0
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # integrate all the corresponding word vectors into the word vector matrix
        w2vec = {word: model[word] for word in w2indx.keys()}

        # a word without a word vector is indexed 0; return the index of each word
        def parse_dataset(combined):
            ''' Words become integers '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in list(sentence):
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # unify the length of the sentences with the pad_sequences function of keras
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        # return the index mapping, the word vector matrix, and the indexed sentences of unified length
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def pre_processing(): global vocab,model; try: model = load_model('SentimentAnalysis/model_nn.h5') except IOError: if 'model_nn.tar.gz' not in os.listdir('SentimentAnalysis'): raise IOError("Could not find Sentiment Analysis model. Ensure model "\ "is present in: ./SentimentAnalysis") else: process = subprocess.Popen("cd SentimentAnalysis/; "\ "tar -zxf model_nn.tar.gz; cd ..", shell=True, stdout=subprocess.PIPE) process.wait() model = load_model('/content/PClub-Project-master/SentimentAnalysis/model_nn.h5') vocab = Dictionary.load('SentimentAnalysis/vocab_sentiment')
def __init__(self, data=None, dictionary=None): """ initialize, data should be provided, only when unpickling class object it is not needed!""" self.data = data self.model = None self.num_topics = None self.iterations = None self.random_state = None self.dictionary = dictionary if self.data is not None: if self.dictionary is None: self.dictionary = Dictionary(self.data) self.corpus = [self.dictionary.doc2bow(text) for text in self.data] else: self.dictionary = None self.corpus = None self.distributed = None self.chuncksize = None self.passes = None self.update_every = None self.alpha = None self.eta = None self.decay = None self.offset = None self.eval_every = None self.gamma_threshold = None self.minimum_probability = None self.ns_conf = None self.minimum_phi_value = None self.per_word_topics = None self.num_topics = None self.iterations = None self.random_state = None self.model = None self.coherence_model = None self.coherence = None self.coherence_type = None
def create_dictionaries(model=None, combined=None):
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # indices of all words with frequency above 10, (k->v)=>(v->k)
        f = open("word2index.txt", 'w', encoding='utf8')
        for key in w2indx:
            f.write(str(key))
            f.write(' ')
            f.write(str(w2indx[key]))
            f.write('\n')
        f.close()
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vectors of all words with frequency above 10, (word->model(word))

        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)  # [[1,2,3...],[]]
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # indices for the words in each sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def get_corpus_and_dict(df, tokens_column, seen=True):
    """
    A Corpus is an iterable collection of Documents that your model is trained on.
     - e.g. all news articles since 2018
    Dictionary is the vocabulary found in your corpus
     - e.g. Merriam Webster's dictionary

    We represent these tokens/words in bag-of-words format to optimize processing.
    """
    dictionary = Dictionary(documents=df[tokens_column])
    df['bow'] = df[tokens_column].apply(dictionary.doc2bow)
    corpus = list(df['bow'])
    return df, corpus, dictionary
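# A usage sketch for get_corpus_and_dict() above with a tiny pandas DataFrame; the
# column name and sample rows are assumptions for illustration.
import pandas as pd

df = pd.DataFrame({'tokens': [['economy', 'stocks', 'rise'],
                              ['stocks', 'fall', 'today'],
                              ['economy', 'growth', 'slows']]})
df, corpus, dictionary = get_corpus_and_dict(df, 'tokens')
print(dictionary.token2id)   # vocabulary discovered in the corpus
print(corpus[0])             # first document in bag-of-words form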
class Vocab(): def __init__(self): self.dic = Dictionary() self.dic.add_documents([[u'<UNK>']]) def construct(self, input_file): f = codecs.open(input_file, 'r', 'utf-8') sentences = [] for line in f: line = line.strip().split() sentences.append(line) self.dic.add_documents(sentences) f.close() self.dic.id2token = {v: k for k, v in self.dic.token2id.items()} def word2id(self, input_file, output_file): f = codecs.open(input_file, 'r', 'utf-8') g = open(output_file, 'w') for line in f: line = line.strip().split() line = map(lambda x: str(self.dic.token2id[x]), line) line = u" ".join(line) + u"\n" g.write(line) f.close() g.close() def id2word(self, input_file, output_file): f = open(input_file, 'r') g = codecs.open(output_file, 'w', 'utf-8') for line in f: line = line.strip().split() line = map(lambda x: self.dic.id2token.get(int(x), u'#'), line) line = u" ".join(line) + u"\n" g.write(line) f.close() g.close()
def word2vec_train(tokenizedtalkfile, vocabularyfile): wordlist = [] for line in open(tokenizedtalkfile, 'r'): talkwords = [] for word in line.split(' '): if word.find('\n') != -1: word = word.replace('\n', '') talkwords.append(word) wordlist.append(talkwords) print('Start Training ...') start = time.time() model = Word2Vec(size=50, min_count=1, window=7, workers=4, sg=1, iter=5) model.build_vocab(wordlist) model.train(wordlist) model.save('corpus_word2vec_model.pkl') end = time.time() print('Training Time: %.5f' % (end - start)) model = Word2Vec.load('corpus_word2vec_model.pkl') gensim_dict = Dictionary() gensim_dict.doc2bow(model.vocab.keys(), allow_update=True) word2index = {v: k for k, v in gensim_dict.items()} with open(vocabularyfile, 'w') as vocabFile: for item in word2index.keys(): vocabFile.write(item + '\t' + str(word2index[item]) + '\n')
def further_preprocessing_phase(temp_data_frame):
    temp_data_frame['text'] = temp_data_frame['text'].apply(lambda text: th.tokenize_text(text) if text is not None else '')
    # textlist = temp_data_frame['text'].to_numpy()
    textlist = temp_data_frame['text'].tolist()
    # if this raises an exception, it may be caused by empty texts
    patent_dictionary = Dictionary(textlist)
    corpus = [patent_dictionary.doc2bow(text) for text in textlist]

    print('original dictionary size: ', len(patent_dictionary))

    vocab_tf = {}
    for i in corpus:
        for item, count in dict(i).items():
            if item in vocab_tf:
                vocab_tf[item] += int(count)
            else:
                vocab_tf[item] = int(count)

    remove_ids = []
    no_of_ids_below_limit = 0
    for id, count in vocab_tf.items():
        if count <= 5:
            remove_ids.append(id)
    patent_dictionary.filter_tokens(bad_ids=remove_ids)

    patent_dictionary.filter_extremes(no_below=0)
    patent_dictionary.filter_n_most_frequent(30)

    print('parsed dictionary size: ', len(patent_dictionary))

    vocabulary = list(patent_dictionary.token2id.keys())

    ids_list = []
    data_frame = pd.DataFrame(columns=['patent_id', 'text', 'classification'])
    temp_data_frame.apply(lambda row: shrink_vocabulary(row, vocabulary, data_frame, ids_list), axis=1)
    print(len(ids_list))
    data_frame.set_index(data_frame['patent_id'], inplace=True)
    data_frame.drop(ids_list, axis=0, inplace=True)
    return data_frame
def score(self, X, y=None, sample_weight=None) -> float: # TODO this needs further testing for correctness, WIP if self.autoencoder is None: raise NotFittedError self.autoencoder.eval() corpus = Sparse2Corpus(X, documents_columns=False) decoder_weight = self.autoencoder.decoder.linear.weight.detach().cpu() id2word = {index: str(index) for index in range(X.shape[1])} topics = [[str(item.item()) for item in topic] for topic in decoder_weight.topk( min(self.score_num, X.shape[1]), dim=0)[1].t()] cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=Dictionary.from_corpus(corpus, id2word), coherence='u_mass') return cm.get_coherence()