from elasticsearch import Elasticsearch
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

es = Elasticsearch("http://localhost:9200")

# Pull the pre-tokenised documents out of Elasticsearch.
docs = []
for id in range(0, 1005):
    res = es.get(index="kdc", doc_type="arts", id=id)
    #print res['_source']['words']
    docs.append(res['_source']['words'])

print "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
#raw_input("docs: ")
#print docs

# LabeledSentence takes a flat list of words and a flat list of tags.
#docs=[['word1', 'word2', 'word3', 'lastword'], ['label1']]
sentence = LabeledSentence([u'some', u'words', u'here'], [u'SENT_1'])


class LabeledLineSentence(object):
    def __init__(self, doc_list):
        self.doc_list = doc_list

    def __iter__(self):
        for uid, line in enumerate(self.doc_list):
            yield LabeledSentence(line, ['SENT_%s' % uid])


doc_itr = LabeledLineSentence(docs)
doc_model = Doc2Vec(alpha=0.025, min_alpha=0.025)  # use fixed learning rate
doc_model.build_vocab(doc_itr)
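
# The snippet above builds the vocabulary but never calls train(). Under the
# old gensim API used throughout these snippets, training is usually driven by
# an explicit epoch loop; a minimal sketch follows. The epoch count, the manual
# alpha decay, and the save path are assumptions, not part of the original code.
for epoch in range(10):
    doc_model.train(doc_itr)                 # one full pass per call (old API)
    doc_model.alpha -= 0.002                 # optionally decay the learning rate
    doc_model.min_alpha = doc_model.alpha    # pin min_alpha to the current rate

doc_model.save('kdc_arts.doc2vec')           # hypothetical output filename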
def __iter__(self):
    for uid, wordlist in enumerate(self.sentences):
        yield LabeledSentence(words=wordlist, tags=['SENT_%s' % uid])
def labeled_sentence(x, y):
    sentence = []
    for i in range(len(x)):
        tag = y + str(i)
        sentence.append(LabeledSentence(words=x[i], tags=[tag]))
    return sentence
def __getitem__(self, phraseno):
    """Return phrase as LabeledSentence that Doc2Vec expects"""
    phrase = self.phrases[phraseno]
    words = phrase.text.split()
    return LabeledSentence(words, ['PHR_%s' % phrase.id])
def __iter__(self):
    for idx, doc in enumerate(self.doc_list):
        words = [line.rstrip('\n') for line in doc]
        if args.PreProcess:
            words = process(words)
        yield LabeledSentence(words, [self.labels_list[idx]])
def __iter__(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                yield LabeledSentence(utils.to_unicode(line).split(),
                                      [prefix + '_%s' % item_no])
def labelReviews(reviews, label_type):
    labelled = []
    for i in range(len(reviews)):
        label = label_type + str(i)
        labelled.append(LabeledSentence(reviews[i], [label]))
    return labelled
x = 0
v = 0
book = xlrd.open_workbook('/Users/stuti/Desktop/data.xlsx')
hindi = book.sheet_by_index(0)
model = load_model('model_file.h5')
output_model = Doc2Vec.load('outputvector.doc2vec')

for i in range(499):
    op.append(output_model[hindi.row_values(i)])

"""test=input("Enter Text")
print(test)
ip.append(test.split())
testmodel=LabeledSentence(words=['I','am','fine.'], tags=500)
model= Doc2Vec([testmodel], min_count = 1)"""

sentences = LabeledSentence(
    words=['The', 'heavy', 'rain', 'brought', 'the', 'flood', 'causing',
           'a', 'lot', 'of', 'damage', 'around.'],
    tags=['SENT_1'])
model1 = Doc2Vec([sentences], size=100, window=1, min_count=1, workers=1)
ip.append(model1['The', 'heavy', 'rain', 'brought', 'the', 'flood', 'causing',
                 'a', 'lot', 'of', 'damage', 'around.'])
ip = numpy.asarray(ip).reshape(1, 12, 100)
ip = model.predict(ip).reshape(12, 100)
op = numpy.asarray(op).reshape(5988, 100)
print(ip.shape)

for i in range(10):
    x = 0
    for j in range(12):
        #v=v+2*(j-1)
        #v=v+2*(1-abs(cosine_similarity(ip,op[i])[j]))
        x = x + abs(cosine_similarity(ip, op[i])[j])
    x = x / 12
    #array[i]=2(1-x)
    array.append(x)
def labelize_text(text, label):
    result = []
    prefix = label
    for i, t in zip(text.index, text):
        result.append(LabeledSentence(t.split(), [prefix + '_%s' % i]))
    return result
print ("Book "+filename+" Length: "+str(len(s)))
temp = []
for line in s:
    t_line = re.sub(r"[^ a-zA-Z\n]", " ", line)
    temp = temp + [t_line.split(" ")]
s = temp

for i in range(len(s)):
    if(i % 1000 == 0):
        print i
    document = []
    for j in range(window_size):
        if(i + j >= len(s)):
            break
        document = document + s[i + j]
    label = label + 1
    sentences = sentences + [LabeledSentence(words=document, tags=[u"SENT_" + str(label)])]
fin.close()

answer = "Encapsulation is a strategy used as part of abstraction. Encapsulation refers to the state of objects - objects encapsulate their state and hide it from the outside; outside users of the class interact with it through its methods, but cannot access the classes state directly. So the class abstracts away the implementation details related to its state. Abstraction is a more generic term, it can also be achieved by (amongst others) subclassing. For example, the interface List in the standard library is an abstraction for a sequence of items, indexed by their position, concrete examples of a List are an ArrayList or a LinkedList. Code that interacts with a List abstracts over the detail of which kind of a list it is using. Abstraction is often not possible without hiding underlying state by encapsulation - if a class exposes its internal state, it can't change its inner workings, and thus cannot be abstracted."

# sentences = sentences + [LabeledSentence(words=fix_line(answer).split(" "), tags = [u"ANSWER"])]
sentences = sentences + [LabeledSentence(words=answer.split(" "), tags=[u"ANSWER"])]

# Import the built-in logging module and configure it so that Word2Vec
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Set values for various parameters
num_features = 500    # Word vector dimensionality
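
# The snippet above stops after setting num_features. A minimal sketch of the
# step it appears to be building toward: training on the windowed book sentences
# and ranking them against the tagged "ANSWER" document. Everything here except
# sentences and num_features is an assumption, not recovered from the original.
book_model = Doc2Vec(sentences, size=num_features, min_count=1, workers=4)
print(book_model.docvecs.most_similar(u"ANSWER", topn=5))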
def __iter__(self):
    for source, prefix in self.sources.items():
        with open(path.join(mypath + '/tutorial', source), 'r', encoding="UTF-8") as fin:
            for item_no, line in enumerate(fin):
                yield LabeledSentence(utils.to_unicode(line).split(),
                                      [prefix + '_%s' % item_no])
def format_labeled_sentences(data):
    sentences = []
    for i in range(len(data)):
        sentences.append(LabeledSentence(data[i].split(), [str(i)]))
    return sentences
def label_sent(case_num, sent_num, sent_vec):
    return LabeledSentence(
        sent_vec, ["CASE_" + str(case_num) + "_SENT_" + str(sent_num)])
def __iter__(self):
    for uid, line in enumerate(self.doc_list):
        yield LabeledSentence(line, ['SENT_%s' % uid])
def feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Doc2Vec requires LabeledSentence objects as input.
    # Turn the datasets from lists of words to lists of LabeledSentence objects.
    # YOUR CODE HERE
    labeled_train_pos = []
    i = 0
    for line in train_pos:
        labeled_train_pos.append(LabeledSentence(line, ['train_pos_{}'.format(i)]))
        i = i + 1
    labeled_train_neg = []
    i = 0
    for line in train_neg:
        labeled_train_neg.append(LabeledSentence(line, ['train_neg_{}'.format(i)]))
        i = i + 1
    labeled_test_pos = []
    i = 0
    for line in test_pos:
        labeled_test_pos.append(LabeledSentence(line, ['test_pos_{}'.format(i)]))
        i = i + 1
    labeled_test_neg = []
    i = 0
    for line in test_neg:
        labeled_test_neg.append(LabeledSentence(line, ['test_neg_{}'.format(i)]))
        i = i + 1

    # Initialize model
    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)
    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg
    model.build_vocab(sentences)

    # Train the model
    # This may take a bit to run
    for i in range(5):
        print "Training iteration %d" % (i)
        random.shuffle(sentences)
        model.train(sentences)

    # Use the docvecs function to extract the feature vectors for the training and test data
    # YOUR CODE HERE
    train_pos_vec = []
    train_neg_vec = []
    test_pos_vec = []
    test_neg_vec = []
    for line in labeled_train_pos:
        train_pos_vec.append(model.docvecs[line.tags[0]])
    for line in labeled_train_neg:
        train_neg_vec.append(model.docvecs[line.tags[0]])
    for line in labeled_test_pos:
        test_pos_vec.append(model.docvecs[line.tags[0]])
    for line in labeled_test_neg:
        test_neg_vec.append(model.docvecs[line.tags[0]])

    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
def labelizeReviews(reviews, label_type):
    labelized = []
    for i, v in enumerate(reviews):
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized
    if type(my_str) is str:
        if lengthRet:
            return len(my_regx.findall(my_str))
        else:
            return my_regx.findall(my_str)
    else:
        return 0


def sub_string(my_str, my_regx, my_replace):
    if type(my_str) is not float:
        return my_regx.sub(my_replace, my_str)
    else:
        return ""


def clean_column(col):
    col = col.apply(sub_string, my_regx=rx_to_space, my_replace=" ")
    col = col.apply(sub_string, my_regx=rx_to_blank, my_replace="")
    col = col.apply(sub_string, my_regx=rx_to_period, my_replace=".")
    return col


sentences = clean_column(df.product_description)

test_sent = []
for index, value in sentences.iteritems():
    test_sent.append(
        LabeledSentence(words=value.split(), labels=['SENT_%s' % index]))

model = Doc2Vec(test_sent)
from gensim.models import Doc2Vec, Phrases, KeyedVectors
from gensim.models.doc2vec import LabeledSentence

###
articles_path = home + 'data/articles_final.csv'
word_embed_path = home + 'results/word_embeddings.bin'
article_embed_path = home + 'results/article_embeddings.pkl'
###

# Pull in raw dataset for cleaning
df = pd.read_csv(articles_path)

# Create bigrams to feed into doc2vec model
bigram = Phrases(df.text.str.split().tolist())

# Prepare docs for model
article_docs = [LabeledSentence(bigram[text], url)
                for text, url in zip(df.text.str.split().tolist(), df.url.tolist())]

# Create, train, and save doc2vec model
model = Doc2Vec(dm=0, dbow_words=1, min_count=3, negative=5, hs=0, sample=1e-5,
                window=10, vector_size=100, workers=8)
model.build_vocab(article_docs)
model.train(article_docs, total_examples=model.corpus_count, epochs=10)
model.wv.save_word2vec_format(word_embed_path, binary=True)

# Create full set of article embeddings
embeddings_df = pd.DataFrame(index=range(len(article_docs)),
                             columns=['url', 'source', 'embedding'])
for i in range(len(article_docs)):
    embeddings_df.url[i] = article_docs[i].tags
    embeddings_df.source[i] = df.source[i]
    embeddings_df.embedding[i] = model.infer_vector(article_docs[i].words)

# Store embeddings to pickle file
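# The snippet ends on the "Store embeddings to pickle file" comment without the
# actual write. A plausible completion using the article_embed_path defined
# above; this single call is an assumption, not recovered from the original.
embeddings_df.to_pickle(article_embed_path)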
def __iter__(self):
    for index, doc in enumerate(self.pos_doc_list):
        yield LabeledSentence(words=doc.split(), labels=[self.pos_label_list[index]])
def add_label(twt):
    output = []
    for i, s in zip(twt.index, twt):
        output.append(LabeledSentence(s, ["tweet_" + str(i)]))
    return output
def feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Doc2Vec requires TaggedDocument objects as input.
    # Turn the datasets from lists of words to lists of TaggedDocument objects.
    # YOUR CODE HERE
    labeled_train_pos = []
    labeled_train_neg = []
    labeled_test_pos = []
    labeled_test_neg = []

    line_nbr = 0
    for review in train_pos:
        labeled_train_pos.append(LabeledSentence(words=review, tags=['TRAIN_POS_' + str(line_nbr)]))
        line_nbr += 1
    line_nbr = 0
    for review in train_neg:
        labeled_train_neg.append(LabeledSentence(words=review, tags=['TRAIN_NEG_' + str(line_nbr)]))
        line_nbr += 1
    line_nbr = 0
    for review in test_pos:
        labeled_test_pos.append(LabeledSentence(words=review, tags=['TEST_POS_' + str(line_nbr)]))
        line_nbr += 1
    line_nbr = 0
    for review in test_neg:
        labeled_test_neg.append(LabeledSentence(words=review, tags=['TEST_NEG_' + str(line_nbr)]))
        line_nbr += 1

    # Initialize model
    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)
    print("Doc2Vec")
    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg
    model.build_vocab(sentences)

    # Train the model
    # This may take a bit to run
    for i in range(5):
        print("Training iteration %d" % (i))
        random.shuffle(sentences)
        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    print("end of training")

    # Use the docvecs function to extract the feature vectors for the training and test data
    # YOUR CODE HERE
    train_pos_vec = []
    train_neg_vec = []
    test_pos_vec = []
    test_neg_vec = []
    for tag in model.docvecs.doctags:
        if 'TRAIN_POS' in tag:
            train_pos_vec.append(model.docvecs[tag])
        elif 'TEST_POS' in tag:
            test_pos_vec.append(model.docvecs[tag])
        elif 'TRAIN_NEG' in tag:
            train_neg_vec.append(model.docvecs[tag])
        elif 'TEST_NEG' in tag:
            test_neg_vec.append(model.docvecs[tag])

    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
def labelize_news(news, label):
    result = []
    prefix = label
    for i, t in zip(news.index, news):
        result.append(LabeledSentence(t.split(), [prefix + '_%s' % i]))
    return result
def preprocessing(self, filename):
    df = pd.read_csv(filename, sep='\s+', header=None, names=["idx", "cited"])
    df["values"] = 1

    if self._is_network is True and self._model != 'glove':
        df = df.groupby(["idx", "cited"])["values"].sum().unstack().fillna(0)
        citation = np.array(df.values, int)
        r_citation = [[str(idx2) for idx2, word2 in enumerate(word1) if word2 > 0]
                      for idx1, word1 in enumerate(citation)]
    elif self._is_network is True and self._model == 'glove':
        df = df.groupby(["idx", "cited"])["values"].sum().unstack().fillna(0)
        citation = np.array(df.values, int)
    elif self._is_network is False and self._model != 'glove':
        df['cited'] = df['cited'].astype(np.str)
        df = df.groupby(['idx'])['cited'].apply(lambda x: list(x))
        r_citation = df.tolist()
    elif self._is_network is False and self._model == 'glove':
        df = df.groupby(["idx", "cited"])["values"].sum().unstack().fillna(0).astype(int)
        X = df.values
        Xc = np.dot(X, X.T)
        Xc[np.diag_indices_from(Xc)] = 0
        citation = Xc

    df_idx = list(df.index.unique().values)
    missing_idx = list(set(self._labels.values) - set(df_idx))

    if self._model == 'glove':
        processed_citation = {
            idx1: {idx2: word2 for idx2, word2 in enumerate(word1) if word2 > 0}
            for idx1, word1 in zip(df_idx, citation)
        }
        for idx in missing_idx:
            processed_citation[idx] = {}
    elif self._model == 'doc2vec':
        processed_citation = [
            LabeledSentence(" ".join(r_citation[idx]), [str(df_idx[idx])])
            for idx in range(len(df_idx))
        ]
        for idx in missing_idx:
            sentence = LabeledSentence(' ', [str(idx)])
            processed_citation.append(sentence)
        processed_citation = sorted(processed_citation, key=lambda ipc: int(ipc.tags[0]))
    elif self._model == 'word2vec':
        processed_citation = [
            list(set([str(idx)] + cited))
            for i, (idx, cited) in enumerate(zip(df_idx, r_citation))
        ]
        for idx in missing_idx:
            processed_citation.append([str(idx)])

    return processed_citation
def __iter__(self):
    for idx, words in enumerate(self.words_list):
        yield LabeledSentence(words=words, tags=[self.labels_list[idx]])
def feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Doc2Vec requires LabeledSentence objects as input.
    # Turn the datasets from lists of words to lists of LabeledSentence objects.
    # YOUR CODE HERE
    labeled_train_pos = [None] * len(train_pos)
    labeled_train_neg = [None] * len(train_neg)
    labeled_test_pos = [None] * len(test_pos)
    labeled_test_neg = [None] * len(test_neg)

    i = 0
    for s in train_pos:
        labeled_train_pos[i] = LabeledSentence(words=s, tags=["TRAIN_POS_" + str(i)])
        i = i + 1
    i = 0
    for s in train_neg:
        labeled_train_neg[i] = LabeledSentence(words=s, tags=["TRAIN_NEG_" + str(i)])
        i = i + 1
    i = 0
    for s in test_pos:
        labeled_test_pos[i] = LabeledSentence(words=s, tags=["TEST_POS_" + str(i)])
        i = i + 1
    i = 0
    for s in test_neg:
        labeled_test_neg[i] = LabeledSentence(words=s, tags=["TEST_NEG_" + str(i)])
        i = i + 1

    # Initialize model
    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)
    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg
    model.build_vocab(sentences)

    # Train the model
    # This may take a bit to run
    for i in range(5):
        print "Training iteration %d" % (i)
        random.shuffle(sentences)
        model.train(sentences)

    # Use the docvecs function to extract the feature vectors for the training and test data
    # YOUR CODE HERE
    train_pos_vec = []
    train_neg_vec = []
    test_pos_vec = []
    test_neg_vec = []
    for tag in model.docvecs.doctags.keys():
        if "TRAIN_POS_" in tag:
            train_pos_vec.append(model.docvecs[tag])
        elif "TRAIN_NEG_" in tag:
            train_neg_vec.append(model.docvecs[tag])
        elif "TEST_POS_" in tag:
            test_pos_vec.append(model.docvecs[tag])
        elif "TEST_NEG_" in tag:
            test_neg_vec.append(model.docvecs[tag])

    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
def constructLabeledSentences(data):
    sentences = []
    for index, row in data.iteritems():
        sentences.append(LabeledSentence(utils.to_unicode(row).split(),
                                         ['Text' + '_%s' % str(index)]))
    return sentences
# In[37]:

import re

sentences = []
sentiments = []
for fname in ["yelp", "amazon_cells", "imdb"]:
    with open("sentiment labelled sentences/%s_labelled.txt" % fname) as f:
        for item_no, line in enumerate(f):
            line_split = line.strip().split('\t')
            sent = line_split[0].lower()
            sent = re.sub(r'\'', '', sent)
            sent = re.sub(r'\W', ' ', sent)
            sent = re.sub(r'\s+', ' ', sent).strip()
            sentences.append(LabeledSentence(sent.split(), ["%s_%d" % (fname, item_no)]))
            sentiments.append(int(line_split[1]))


# In[38]:

sentences


# In[43]:

import random


class PermuteSentences(object):
    def __iter__(self):
def __iter__(self):
    for label, line in enumerate(open(self.filename)):
        print line
        print label
        print type(line)
        yield LabeledSentence(words=line.split(), tags=['TXT_%s' % label])
def label_Reviews(rev, type):
    labeled = []
    for i in range(1, len(rev) + 1):
        label = '%s_%s' % (type, i)
        labeled.append(LabeledSentence(rev[i - 1], [label]))
    return labeled
def to_array(self):
    self.sentences = []
    for tag, line in self.sources:
        self.sentences.append(LabeledSentence(words=line, tags=[tag]))
    return self.sentences