def label(self):
    # len(...) / 2 yields a float in Python 3; range() needs an int, so use //
    for i in range(len(self.sents) // 2):
        self.labelledSents.append(
            doc2vec.LabeledSentence(
                words=self.sents[2 * i].split(),
                tags=['title_%s' % self.articleId[i]]))
        self.labelledSents.append(
            doc2vec.LabeledSentence(
                words=self.sents[2 * i + 1].split(),
                tags=['question_%s' % self.articleId[i]]))
def iter_docs_queries():
    df = pd.read_csv('./queries_norm.tsv', sep='\t', header=None, index_col=0)
    for idx, row in tqdm(df.iterrows()):
        yield doc2vec.LabeledSentence(str(row[1]).split(), ['QUERY_%d' % idx])
    for filename in tqdm(os.listdir('./docs/')):
        path = os.path.join('./docs/', filename)
        df = pd.read_csv(path, sep='\t', index_col=0, header=None)
        for idx, row in df.iterrows():
            yield doc2vec.LabeledSentence(
                (str(row[1]) + ' ' + str(row[2])).split(), ['DOC_%d' % idx])
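# A generator like iter_docs_queries() is exhausted after a single pass, but
# Doc2Vec needs at least two passes (build_vocab, then train). A minimal
# consumption sketch, materializing the generator into a list first; the
# hyperparameter values are illustrative assumptions, not from the source:
corpus = list(iter_docs_queries())
model = doc2vec.Doc2Vec(size=100, min_count=2, workers=4)
model.build_vocab(corpus)
model.train(corpus, total_examples=len(corpus), epochs=10)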
def label(self):
    # Integer division (//) so range() receives an int under Python 3
    for i in range(len(self.sents) // 3):
        self.labelledSents.append(
            doc2vec.LabeledSentence(
                words=self.sents[3 * i].split(),
                tags=['postText_%s' % self.articleId[i]]))
        self.labelledSents.append(
            doc2vec.LabeledSentence(
                words=self.sents[3 * i + 1].split(),
                tags=['targetTitle_%s' % self.articleId[i]]))
        self.labelledSents.append(
            doc2vec.LabeledSentence(
                words=self.sents[3 * i + 2].split(),
                tags=['targetDescription_%s' % self.articleId[i]]))
def train_doc2vec(vec_size, min_count_of_each_word, window_size, n_epoch):
    # Renamed from `doc2vec`: the original name shadowed the
    # gensim.models.doc2vec module referenced below, breaking the
    # doc2vec.LabeledSentence call.
    # load 'article_cutted'
    with open('article_cutted', 'rb') as file:
        data = pickle.load(file)

    # create a document id map
    sentence_list = []
    for i, l in enumerate(data):
        sentence_list.append(doc2vec.LabeledSentence(words=l, tags=[str(i)]))

    # define doc2vec model
    model = Doc2Vec(size=vec_size, min_count=min_count_of_each_word,
                    window=window_size)

    # build vocabulary
    model.build_vocab(sentence_list)

    # train doc2vec model; shuffle data every epoch
    for i in range(n_epoch):
        random.shuffle(sentence_list)
        model.train(sentence_list, total_examples=len(data), epochs=1)

    # print result
    print(model.docvecs['0'])

    # save result
    model.save('word2vec_model/doc2vec.wv.syn0.npy')
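# A minimal follow-up sketch: reload the saved model, look up a trained
# document vector by tag, and infer a vector for unseen text. The token list
# passed to infer_vector is an illustrative assumption, not from the source.
model = Doc2Vec.load('word2vec_model/doc2vec.wv.syn0.npy')
vec = model.docvecs['0']  # vector trained for document 0
new_vec = model.infer_vector(['some', 'unseen', 'tokens'])
print(model.docvecs.most_similar([new_vec], topn=5))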
def label_sentences(corpus, label_type):
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.LabeledSentence(v.split(), [label]))
    return labeled
def doc_to_labeled_sentences(doc, tokenizer, sent_num_start,
                             remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words.
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(doc.strip())
    # 2. Loop over each sentence
    sentences = []
    sent_num = sent_num_start
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call doc_to_wordlist to get a list of words
            words = doc_to_wordlist(raw_sentence, remove_stopwords)
            # LabeledSentence takes a `tags` keyword, not `labels`
            labeled_sentence = doc2vec.LabeledSentence(
                words=words, tags=['SENT_%s' % sent_num])
            sentences.append(labeled_sentence)
            sent_num = sent_num + 1
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return (sentences, sent_num)
def __iter__(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                yield d2v.LabeledSentence(
                    utils.to_unicode(line).split(),
                    [prefix + '_%s' % item_no])
def label_text(corpus, label_type):
    func_name = sys._getframe().f_code.co_name
    logging.info("d2vModel :: " + str(func_name))
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.LabeledSentence(v.split(), [label]))
    return labeled
def get_sentenses(posts, type_prefix):
    sentences = []
    #analyzedDocument = namedtuple('AnalyzedDocument', 'ID')
    for k, v in posts.items():  # iteritems() is Python 2 only
        words = v.lower().split()
        tags = ["".join((type_prefix, str(k)))]
        sentences.append(doc2vec.LabeledSentence(words=words, tags=tags))
    #print(sentences)
    return sentences
def __iter__(self):
    for content, (page_id, title) in self.wiki.get_texts():
        yield doc2vec.LabeledSentence(
            # 1. For each element c in content,
            # 2. convert it to simplified Chinese and segment it with jieba,
            # 3. then add the tokens to the words list
            words=[w for c in content
                   for w in jieba.cut(Converter('zh-hans').convert(c))],
            tags=[title])
def __iter__(self):
    for content, (page_id, title) in self.wiki.get_texts():
        yield doc2vec.LabeledSentence(
            # 1. For each element c in content,
            # 2. convert it to simplified Chinese and segment it with jieba,
            # 3. then add the tokens to the words list
            words=[w for c in content
                   for w in jieba.cut(HanziConv.toSimplified(c))],
            tags=[title])
def to_array(self):
    self.sentences = []
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                self.sentences.append(
                    d2v.LabeledSentence(
                        utils.to_unicode(line).split(),
                        [prefix + '_%s' % item_no]))
    return self.sentences
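# In the tutorial this class pattern comes from, to_array() is usually paired
# with a per-epoch shuffle method so the training order varies between epochs.
# A plausible companion sketch; the method name sentences_perm is an
# assumption, not from the source:
def sentences_perm(self):
    from random import shuffle
    shuffle(self.sentences)
    return self.sentences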
def __init__(self):
    sens = self._unpickle_doc()
    if sens:
        data = [
            doc2vec.LabeledSentence(words=words, tags=["SENT_%d" % i])
            for i, words in enumerate(sens)
        ]
        self._d2v = Doc2Vec(data, size=config.STATE_D2V_DIM, min_count=1)
    else:
        self._d2v = Doc2Vec(size=config.STATE_D2V_DIM, min_count=1)
def _load_d2v(self):
    sens = self._unpickle_doc()
    if sens:
        data = [
            doc2vec.LabeledSentence(words=words, tags=["SENT_%d" % i])
            for i, words in enumerate(sens)
        ]
        self._d2v = Doc2Vec(data, size=2, min_count=1)
    else:
        self._d2v = Doc2Vec(size=2, min_count=1)
def initParagraph(self, sentences):
    # enumerate() replaces the manual count = -1 / count += 1 bookkeeping
    sen = []
    for count, sentence in enumerate(sentences):
        temp = doc2vec.LabeledSentence(words=sentence,
                                       tags=['id' + str(count)])
        sen.append(temp)
    return sen
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have
    a label associated with it. We do this by using the LabeledSentence
    method. The format will be "TRAIN_i" or "TEST_i" where "i" is a dummy
    index of the review.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.LabeledSentence(v.split(), [label]))
    return labeled
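# A minimal usage sketch for label_sentences: label the train and test splits
# separately so their tags stay distinct. X_train and X_test are assumed to
# be lists of raw text strings; they are not defined in the source, and the
# hyperparameters are illustrative.
train_docs = label_sentences(X_train, 'TRAIN')
test_docs = label_sentences(X_test, 'TEST')
model = doc2vec.Doc2Vec(train_docs, size=100, min_count=2)
train_vec_0 = model.docvecs['TRAIN_0']  # vector for the first training doc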
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have
    a label associated with it. We do this using the LabeledSentence method.
    The format will be "TRAIN_i" or "TEST_i", where "i" is a dummy index of
    the review.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.LabeledSentence(v.split(), [label]))
    return labeled
def train_doc2vec(dataframe):
    '''INPUT: pandas DataFrame
    OUTPUT: trained Doc2Vec model'''
    # Renamed from `Doc2Vec` to avoid confusion with the gensim class of the
    # same name. Clean all the posts in the dataframe and build a list (not a
    # generator, as the original comment claimed) of LabeledSentence objects
    # from the post and tag data.
    docs = []
    for i in range(len(dataframe)):
        post = cleanText(dataframe['ttl_ctxt'].values[i])
        tags = dataframe['tags'].values[i]
        labeledsent = doc2vec.LabeledSentence(words=post, tags=tags)
        docs.append(labeledsent)
    # Train the Doc2Vec model on the list of labeled documents.
    model = doc2vec.Doc2Vec(docs)
    return model
def df2labeled_sentence(df):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have
    a label associated with it. We do this by using the LabeledSentence
    method. Here the label is "Class_ID", built from each row's Class and ID
    columns.
    """
    logging.info("Create labeled sentence")
    labeled = []
    for index, row in df.iterrows():
        label = row.Class + '_' + str(row['ID'])
        text = row.Text.split()
        labeled.append(doc2vec.LabeledSentence(text, [label]))
    return labeled
def buildRawSents(self, myfile):
    for txtfile in glob.glob(devdata + myfile):
        xmldoc = minidom.parse(txtfile)
        itemlist0 = xmldoc.getElementsByTagName('document')
        count = 0
        for it0 in itemlist0:
            parag = ""
            itemlist = it0.getElementsByTagName('text')
            for item in itemlist:
                if '.' in item.firstChild.data:
                    parag = parag + " " + item.firstChild.data
            toks = self.tokenizeAbs(parag.encode("utf-8").decode('utf-8'))
            # Python 2 backtick repr (`count`) replaced with str(count)
            lab = [txtfile + '_' + str(count)]
            self.pars.append(doc2vec.LabeledSentence(words=toks, tags=lab))
            count = count + 1
def label_sentences2(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have
    a label associated with it. We do this by using the LabeledSentence
    method. The format will be "TRAIN_i" or "TEST_i" where "i" is a dummy
    index of the review.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        #labeled.append(doc2vec.LabeledSentence(v.split(), [label]))
        # Tokenize the individual document v, not the whole corpus; the loop
        # variable was otherwise unused in the original.
        labeled.append(
            doc2vec.LabeledSentence(
                words=Tokenization(v, concept=False, stem=True,
                                   removeStopwords=True),
                tags=[label]))
    return labeled
def train():
    documents = []
    with open('/home/ycw/tax_data.csv', 'r') as f:
        reader = csv.reader(f, dialect='excel', delimiter=',')
        for line in reader:
            print(line)
            word_list = transform_text(line[1].strip(), strip=False)
            # word_list = eval(line[2])
            documents.append(doc2vec.LabeledSentence(word_list, [line[0]]))
    model = Doc2Vec(documents, dm=1, size=DIMENSION, window=5, negative=5,
                    min_count=2, workers=4)
    model.save('../models/doc2vec.model')
    indexer = AnnoyIndexer(model, 2)
    # _, tem_fn = mkstemp()
    indexer.save('../models/dv_index')
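# A follow-up sketch: reload the model and the Annoy index and run an
# approximate similarity query. Assumes gensim's AnnoyIndexer from
# gensim.similarities.index; 'some_doc_id' stands in for a tag that exists
# in the training CSV and is an illustrative assumption.
model = Doc2Vec.load('../models/doc2vec.model')
indexer = AnnoyIndexer()
indexer.load('../models/dv_index')
indexer.model = model  # re-attach the model after loading
results = model.docvecs.most_similar('some_doc_id', topn=5, indexer=indexer)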
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):") tfidf_feature_names = tfidf_vectorizer.get_feature_names() print_top_words(nmf, tfidf_feature_names, n_top_words) #%%Embedd the paragraphs # create word2vec model print("starting paragraphe vectorization") from gensim.models import word2vec, doc2vec list_all_paragraphe_split = [sentence.split() for sentence in list_all_paragraphe_filtered] labels = ["paragraph_"+str(i) for i in range(len(list_all_paragraphe_split))] model = word2vec.Word2Vec(list_all_paragraphe_split, size=100) print("done with word2vec") # create doc2vec model # [str(list_all_paragraphe_split.index(sentence))] sentences = [doc2vec.LabeledSentence( words=sentence, tags=labels) for sentence in list_all_paragraphe_split] model_doc = doc2vec.Doc2Vec(sentences, size=100) print("done with doc2vec") # store the model to files model_doc.save('my_model_complete_data.doc2vec') #%%Do a dimension reduction on the data to see if it influences the clustering from sklearn.cluster import KMeans, SpectralClustering from sklearn.decomposition import PCA from sklearn.metrics import silhouette_score from sklearn.preprocessing import normalize sentences_vec = [model_doc.docvecs[label] for label in labels] sentences_vec = normalize(sentences_vec) pca = PCA(n_components="mle", svd_solver='full') pca.fit(sentences_vec)
def flatten(lol):
    # Flatten a list of lists into a single list; the function header was
    # missing from the fragment and is reconstructed from the call below.
    output = []
    for sublist in lol:  # renamed from `list` to avoid shadowing the builtin
        output.extend(sublist)
    return output


if __name__ == '__main__':
    assert doc2vec.FAST_VERSION > -1  # Apparently it is unusably slow otherwise.
    sentences = []
    for city in args.city:
        print("%s\tLoading tweets from city: %s" % (time.asctime(), city))
        nghd_tweets = ujson.load(open('data/%s/%s' % (city, NGHD_TWEETS_FILE)))
        print("%s\tDone loading tweets for city: %s" % (time.asctime(), city))
        for nghd, tweet_words in nghd_tweets.items():
            words_flattened = flatten(tweet_words)
            sentences.append(
                doc2vec.LabeledSentence(words=words_flattened,
                                        tags=['CITY_' + city, 'NGHD_' + nghd]))

    # This below is all from https://rare-technologies.com/doc2vec-tutorial/
    print("%s\tbuilding vocab" % time.asctime())
    model = doc2vec.Doc2Vec(size=100, min_count=2, alpha=0.025,
                            min_alpha=0.025, max_vocab_size=30000, workers=4)
    model.build_vocab(sentences)
    # print "%s\tdone building vocab, this many words: %s" % (time.asctime(), model.
    for epoch in range(10):
        print("%s\ttraining epoch %s" % (time.asctime(), epoch))
        model.train(sentences, total_examples=model.corpus_count, epochs=1)
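        # The tutorial linked above also decays the learning rate between
        # epochs; a sketch of that pattern as a continuation of the loop
        # (0.002 is the tutorial's step, not a value from this snippet):
        model.alpha -= 0.002
        model.min_alpha = model.alpha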
def labelizeReviews(reviews):
    labelized = []
    for i, v in enumerate(reviews):
        label = '%s' % (i)
        labelized.append(doc2vec.LabeledSentence(v, [label]))
    return labelized
def sentences(wakati_list):
    return [
        doc2vec.LabeledSentence(tokens, tags=[category])
        for category, tokens in wakati_list
    ]
def __iter__(self):
    for i, text in enumerate(self.doc1):
        yield doc2vec.LabeledSentence(words=split_sentence(text),
                                      tags=['%s' % i])
def __iter__(self):
    for idx, doc in enumerate(self.doc_list):
        labels = [self.labels_list[idx]]
        words = doc.split()
        # LabeledSentence lives in the gensim.models.doc2vec module, not on
        # the Doc2Vec class, so it is qualified via the module here.
        yield doc2vec.LabeledSentence(words, labels)
def label(self):
    for uid, line in enumerate(self.sents):
        self.labelledSents.append(
            doc2vec.LabeledSentence(words=line.split(),
                                    tags=['SENT_%s' % uid]))
    print(str(round(num * 100 / len(sentences2), 3)) + '%', end='\r')
    futures.append(executor.submit(label_Sentences, item))

concurrent.futures.wait(futures)
tags = []
for i in futures:
    tags.append(i.result())

# Pickle/save list of tags for sentences
with open('/home/lanna/Dropbox/Insight/tags', 'wb') as f:
    pickle.dump(tags, f)

LabeledSentences = []
for i in range(0, len(sentences2)):
    LabeledSentences.append(
        doc2vec.LabeledSentence(sentences2[i].split(), tags[i]))

# https://linanqiu.github.io/2015/05/20/word2vec-sentiment/
nfeatures = 300
model = gensim.models.doc2vec.Doc2Vec(workers=10, size=nfeatures, window=10,
                                      min_count=1, alpha=0.025,
                                      min_alpha=0.025)

# Build the vocabulary table: digest all the words, filter out the unique
# words, and do some basic counts on them
model.build_vocab(LabeledSentences)

# Train Doc2Vec
from random import shuffle
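# The snippet stops right after importing shuffle; a plausible continuation
# following the linked linanqiu tutorial -- shuffle the documents each epoch
# and train. The epoch count and save path are illustrative assumptions.
for epoch in range(10):
    shuffle(LabeledSentences)
    model.train(LabeledSentences, total_examples=model.corpus_count, epochs=1)
model.save('doc2vec_model.d2v')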