Example #1
from elasticsearch import Elasticsearch
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

es = Elasticsearch("http://localhost:9200")
docs = []
for doc_id in range(0, 1005):  # avoid shadowing the built-in id()
    res = es.get(index="kdc", doc_type="arts", id=doc_id)
    #print res['_source']['words']
    docs.append(res['_source']['words'])
print "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
#raw_input("docs: ")
#print docs
#docs=[['word1', 'word2', 'word3', 'lastword'], ['label1']]
sentence = LabeledSentence([u'some', u'words', u'here'], [u'SENT_1'])  # words and tags are flat lists, not nested


class LabeledLineSentence(object):
    def __init__(self, doc_list):
        self.doc_list = doc_list

    def __iter__(self):
        for uid, line in enumerate(self.doc_list):
            yield LabeledSentence(line, ['SENT_%s' % uid])


doc_itr = LabeledLineSentence(docs)

doc_model = Doc2Vec(alpha=0.025, min_alpha=0.025)  # use fixed learning rate
doc_model.build_vocab(doc_itr)
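With the vocabulary built, the fixed-learning-rate setup noted in the comment above is typically paired with a manual alpha-decay loop. A minimal sketch, assuming the pre-1.0 gensim API in which train() takes only the corpus:

for epoch in range(10):
    doc_model.train(doc_itr)
    doc_model.alpha -= 0.002               # decay the learning rate manually
    doc_model.min_alpha = doc_model.alpha  # keep it fixed within each pass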
Example #2
def __iter__(self):
    for uid, wordlist in enumerate(self.sentences):
        yield LabeledSentence(words=wordlist, tags=['SENT_%s' % uid])
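Note that LabeledSentence was deprecated in gensim 1.0 in favor of TaggedDocument, which exposes the same words/tags fields. The same iterator against the current API would look roughly like this (a sketch, not from the original source):

from gensim.models.doc2vec import TaggedDocument

def __iter__(self):
    for uid, wordlist in enumerate(self.sentences):
        yield TaggedDocument(words=wordlist, tags=['SENT_%s' % uid])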
Example #3
def labeled_sentence(x, y):
    sentence = []
    for i in range(len(x)):
        tag = y + str(i)
        sentence.append(LabeledSentence(words=x[i], tags=[tag]))
    return sentence
Example #4
def __getitem__(self, phraseno):
    """Return phrase as LabeledSentence that Doc2Vec expects"""
    phrase = self.phrases[phraseno]
    words = phrase.text.split()
    return LabeledSentence(words, ['PHR_%s' % phrase.id])
Example #5
def __iter__(self):
    for idx, doc in enumerate(self.doc_list):
        words = [line.rstrip('\n') for line in doc]
        if args.PreProcess:
            words = process(words)
        yield LabeledSentence(words, [self.labels_list[idx]])
Example #6
def __iter__(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
Example #7
def labelReviews(reviews, label_type):
    labelled = []
    for i in range(len(reviews)):
        label = label_type + str(i)
        labelled.append(LabeledSentence(reviews[i], [label]))
    return labelled
Example #8
import numpy
import xlrd
from keras.models import load_model  # assumed to be Keras, given the .h5 model file
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

x = 0
v = 0
ip = []     # used below but never initialised in the original
op = []
array = []
book = xlrd.open_workbook('/Users/stuti/Desktop/data.xlsx')
hindi = book.sheet_by_index(0)

model = load_model('model_file.h5')
output_model = Doc2Vec.load('outputvector.doc2vec')
for i in range(499):
    op.append(output_model[hindi.row_values(i)])

"""test=input("Enter Text")
print(test)
ip.append(test.split())
testmodel=LabeledSentence(words=['I','am','fine.'], tags=500)
model= Doc2Vec([testmodel], min_count = 1)"""
sentences = LabeledSentence(words=['The', 'heavy', 'rain', 'brought', 'the', 'flood', 'causing', 'a', 'lot', 'of', 'damage', 'around.'], tags=['SENT_1'])
model1 = Doc2Vec([sentences], size=100, window=1, min_count=1, workers=1)
ip.append(model1['The', 'heavy', 'rain', 'brought', 'the', 'flood', 'causing', 'a', 'lot', 'of', 'damage', 'around.'])
ip=numpy.asarray(ip).reshape(1,12,100)
ip=model.predict(ip).reshape(12,100)
op=numpy.asarray(op).reshape(5988,100)
print(ip.shape)
for i in range(10):
    x = 0
    for j in range(12):
        #v=v+2*(j-1)
        #v=v+2*(1-abs(cosine_similarity(ip,op[i])[j]))
        x = x + abs(cosine_similarity(ip, op[i].reshape(1, -1))[j])
    # average over the 12 positions; the original divided inside the loop,
    # which repeatedly shrank the running sum
    x = x / 12
    #array[i]=2(1-x)
    array.append(x)
Example #9
def labelize_text(text, label):
    result = []
    prefix = label
    for i, t in zip(text.index, text):
        result.append(LabeledSentence(t.split(), [prefix + '_%s' % i]))
    return result
Example #10
	print ("Book "+filename+" Length: "+str(len(s)))
	temp = []
	for line in s:
		t_line = re.sub(r"[^ a-zA-Z\n]"," ",line)
		temp = temp + [t_line.split(" ")]
	s = temp
	for i in range(len(s)):
		if(i%1000 == 0):
			print i
		document = []
		for j in range(window_size):
			if(i+j>=len(s)):
				break
			document = document + s[i+j]
		label = label + 1
		sentences = sentences + [LabeledSentence(words=document, tags = [u"SENT_"+str(label)])]
	fin.close()


answer = "Encapsulation is a strategy used as part of abstraction. Encapsulation refers to the state of objects - objects encapsulate their state and hide it from the outside; outside users of the class interact with it through its methods, but cannot access the classes state directly. So the class abstracts away the implementation details related to its state. Abstraction is a more generic term, it can also be achieved by (amongst others) subclassing. For example, the interface List in the standard library is an abstraction for a sequence of items, indexed by their position, concrete examples of a List are an ArrayList or a LinkedList. Code that interacts with a List abstracts over the detail of which kind of a list it is using. Abstraction is often not possible without hiding underlying state by encapsulation - if a class exposes its internal state, it can't change its inner workings, and thus cannot be abstracted."
# sentences = sentences + [LabeledSentence(words=fix_line(answer).split(" "), tags = [u"ANSWER"])]
sentences = sentences + [LabeledSentence(words=answer.split(" "), tags = [u"ANSWER"])]

# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 500    # Word vector dimensionality                      
Example #11
def __iter__(self):
    for source, prefix in self.sources.items():
        with open(path.join(mypath + '/tutorial', source), 'r', encoding="UTF-8") as fin:
            for item_no, line in enumerate(fin):
                yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
Example #12
def format_labeled_sentences(data):
    sentences = []
    for i in range(len(data)):
        sentences.append(LabeledSentence(data[i].split(), [str(i)]))
    return sentences
Example #13
def label_sent(case_num, sent_num, sent_vec):
    return LabeledSentence(
        sent_vec, ["CASE_" + str(case_num) + "_SENT_" + str(sent_num)])
Example #14
def __iter__(self):
    for uid, line in enumerate(self.doc_list):
        yield LabeledSentence(line, ['SENT_%s' % uid])
Example #15
def feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Doc2Vec requires LabeledSentence objects as input.
    # Turn the datasets from lists of words to lists of LabeledSentence objects.
    # YOUR CODE HERE

    labeled_train_pos = []
    for i, line in enumerate(train_pos):
        labeled_train_pos.append(LabeledSentence(line, ['train_pos_{}'.format(i)]))

    labeled_train_neg = []
    for i, line in enumerate(train_neg):
        labeled_train_neg.append(LabeledSentence(line, ['train_neg_{}'.format(i)]))

    labeled_test_pos = []
    for i, line in enumerate(test_pos):
        labeled_test_pos.append(LabeledSentence(line, ['test_pos_{}'.format(i)]))

    labeled_test_neg = []
    for i, line in enumerate(test_neg):
        labeled_test_neg.append(LabeledSentence(line, ['test_neg_{}'.format(i)]))

    # Initialize model
    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)
    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg
    model.build_vocab(sentences)
    
    # Train the model
    # This may take a bit to run 
    for i in range(5):
        print "Training iteration %d" % (i)
        random.shuffle(sentences)
        model.train(sentences)

    # Use the docvecs function to extract the feature vectors for the training and test data
    # YOUR CODE HERE
    
    train_pos_vec=[]
    train_neg_vec=[]
    test_pos_vec=[]
    test_neg_vec=[]

    for line in labeled_train_pos:
        train_pos_vec.append(model.docvecs[line.tags[0]])

    for line in labeled_train_neg:
        train_neg_vec.append(model.docvecs[line.tags[0]])

    for line in labeled_test_pos:
        test_pos_vec.append(model.docvecs[line.tags[0]])

    for line in labeled_test_neg:
        test_neg_vec.append(model.docvecs[line.tags[0]])

    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
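A usage sketch with toy data (hypothetical token lists; real datasets would be far larger, so training on four documents is only illustrative):

train_pos = [['great', 'movie'], ['loved', 'it']]
train_neg = [['terrible'], ['waste', 'of', 'time']]
vecs = feature_vecs_DOC(train_pos, train_neg, [['good']], [['bad']])
# each returned element is a 100-dimensional document vector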
Example #16
def labelizeReviews(reviews, label_type):
    labelized = []
    for i, v in enumerate(reviews):
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized
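A quick usage sketch with made-up reviews:

labeled = labelizeReviews([['good', 'food'], ['slow', 'service']], 'TRAIN')
# labeled[0].tags == ['TRAIN_0'], labeled[1].tags == ['TRAIN_1']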
Example #17
# the enclosing "def" line was lost in extraction; a plausible reconstruction (the name is hypothetical):
def find_matches(my_str, my_regx, lengthRet=False):
    if type(my_str) is str:
        if lengthRet:
            return len(my_regx.findall(my_str))
        else:
            return my_regx.findall(my_str)
    else:
        return 0


def sub_string(my_str, my_regx, my_replace):
    if type(my_str) is not float:
        return my_regx.sub(my_replace, my_str)
    else:
        return ""


def clean_column(col):
    col = col.apply(sub_string, my_regx=rx_to_space, my_replace=" ")
    col = col.apply(sub_string, my_regx=rx_to_blank, my_replace="")
    col = col.apply(sub_string, my_regx=rx_to_period, my_replace=".")
    return col


sentences = clean_column(df.product_description)
test_sent = []

for index, value in sentences.iteritems():
    test_sent.append(
        LabeledSentence(words=value.split(), labels=['SENT_%s' % index]))

model = Doc2Vec(test_sent)
Example #18
import pandas as pd
from gensim.models import Doc2Vec, Phrases, KeyedVectors
from gensim.models.doc2vec import LabeledSentence
###
articles_path = home + 'data/articles_final.csv'
word_embed_path = home + 'results/word_embeddings.bin'
article_embed_path = home + 'results/article_embeddings.pkl'
###

# Pull in raw dataset for cleaning
df = pd.read_csv(articles_path)

# Create bigrams to feed into doc2vec model
bigram = Phrases(df.text.str.split().tolist())

# Prepare docs for model
article_docs = [LabeledSentence(bigram[text], [url]) for text, url in zip(df.text.str.split().tolist(), df.url.tolist())]  # tags must be a list

# Create, train, and save doc2vec model
model = Doc2Vec(dm=0, dbow_words=1, min_count=3, negative=5, hs=0, sample=1e-5, window=10, vector_size=100, workers=8)
model.build_vocab(article_docs)
model.train(article_docs, total_examples=model.corpus_count, epochs=10)
model.wv.save_word2vec_format(word_embed_path, binary=True)

# Create full set of article embeddings
embeddings_df = pd.DataFrame(index=range(len(article_docs)), columns=['url','source','embedding'])
for i in range(len(article_docs)):
    embeddings_df.at[i, 'url'] = article_docs[i].tags[0]
    embeddings_df.at[i, 'source'] = df.source[i]
    embeddings_df.at[i, 'embedding'] = model.infer_vector(article_docs[i].words)

# Store embeddings to pickle file
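The snippet cuts off at this final comment; given the article_embed_path defined earlier, the missing line is presumably just:

embeddings_df.to_pickle(article_embed_path)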
Example #19
def __iter__(self):
    for index, doc in enumerate(self.pos_doc_list):
        yield LabeledSentence(words=doc.split(),
                              labels=[self.pos_label_list[index]])
Example #20
def add_label(twt):
    output = []
    for i, s in zip(twt.index, twt):
        output.append(LabeledSentence(s, ["tweet_" + str(i)]))
    return output
Example #21
def feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Doc2Vec requires TaggedDocument objects as input.
    # Turn the datasets from lists of words to lists of TaggedDocument objects.
    # YOUR CODE HERE
    labeled_train_pos = []
    labeled_train_neg = []
    labeled_test_pos = []
    labeled_test_neg = []

    line_nbr = 0
    for review in train_pos:
        labeled_train_pos.append(LabeledSentence(words=review, tags=['TRAIN_POS_'+str(line_nbr)]))
        line_nbr += 1

    line_nbr = 0
    for review in train_neg:
        labeled_train_neg.append(LabeledSentence(words=review, tags=['TRAIN_NEG_' + str(line_nbr)]))
        line_nbr += 1

    line_nbr = 0
    for review in test_pos:
        labeled_test_pos.append(LabeledSentence(words=review, tags=['TEST_POS_' + str(line_nbr)]))
        line_nbr += 1

    line_nbr = 0
    for review in test_neg:
        labeled_test_neg.append(LabeledSentence(words=review, tags=['TEST_NEG_' + str(line_nbr)]))
        line_nbr += 1

    # Initialize model
    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)
    print("Doc2Vec")
    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg
    model.build_vocab(sentences)

    # Train the model
    # This may take a bit to run
    for i in range(5):
        print("Training iteration %d" % (i))
        random.shuffle(sentences)
        model.train(sentences,total_examples=model.corpus_count, epochs=model.iter)
    print("end of training")

    # Use the docvecs function to extract the feature vectors for the training and test data
    # YOUR CODE HERE
    train_pos_vec = []
    train_neg_vec = []
    test_pos_vec = []
    test_neg_vec = []

    for tag in model.docvecs.doctags:
        if 'TRAIN_POS' in tag:
            train_pos_vec.append(model.docvecs[tag])
        elif 'TEST_POS' in tag:
            test_pos_vec.append(model.docvecs[tag])
        elif 'TRAIN_NEG' in tag:
            train_neg_vec.append(model.docvecs[tag])
        elif 'TEST_NEG' in tag:
            test_neg_vec.append(model.docvecs[tag])

    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
Example #22
def labelize_news(news, label):
    result = []
    prefix = label
    for i, t in zip(news.index, news):
        result.append(LabeledSentence(t.split(), [prefix + '_%s' % i]))
    return result
Example #23
    def preprocessing(self, filename):

        df = pd.read_csv(filename,
                         sep=r'\s+',
                         header=None,
                         names=["idx", "cited"])
        df["values"] = 1

        if self._is_network is True and self._model != 'glove':
            df = df.groupby(["idx",
                             "cited"])["values"].sum().unstack().fillna(0)
            citation = np.array(df.values, int)
            r_citation = [[
                str(idx2) for idx2, word2 in enumerate(word1) if word2 > 0
            ] for idx1, word1 in enumerate(citation)]

        elif self._is_network is True and self._model == 'glove':
            df = df.groupby(["idx",
                             "cited"])["values"].sum().unstack().fillna(0)
            citation = np.array(df.values, int)

        elif self._is_network is False and self._model != 'glove':
            df['cited'] = df['cited'].astype(str)
            df = df.groupby(['idx'])['cited'].apply(list)
            r_citation = df.tolist()

        elif self._is_network is False and self._model == 'glove':
            df = df.groupby(["idx", "cited"
                             ])["values"].sum().unstack().fillna(0).astype(int)
            X = df.values
            Xc = np.dot(X, X.T)
            Xc[np.diag_indices_from(Xc)] = 0
            citation = Xc

        df_idx = list(df.index.unique().values)
        missing_idx = list(set(self._labels.values) - set(df_idx))

        if self._model == 'glove':
            processed_citation = {
                idx1:
                {idx2: word2
                 for idx2, word2 in enumerate(word1) if word2 > 0}
                for idx1, word1 in zip(df_idx, citation)
            }

            for idx in missing_idx:
                processed_citation[idx] = {}

        elif self._model == 'doc2vec':
            # words must be a list of tokens; the original passed a joined
            # string, which doc2vec would iterate character by character
            processed_citation = [
                LabeledSentence(r_citation[idx], [str(df_idx[idx])])
                for idx in range(len(df_idx))
            ]

            for idx in missing_idx:
                sentence = LabeledSentence([' '], [str(idx)])  # placeholder token list for missing ids
                processed_citation.append(sentence)

            processed_citation = sorted(processed_citation,
                                        key=lambda ipc: int(ipc.tags[0]))

        elif self._model == 'word2vec':
            processed_citation = [
                list(set([str(idx)] + cited))
                for i, (idx, cited) in enumerate(zip(df_idx, r_citation))
            ]

            for idx in missing_idx:
                processed_citation.append([str(idx)])

        return processed_citation
Example #24
def __iter__(self):
    for idx, words in enumerate(self.words_list):
        yield LabeledSentence(words=words, tags=[self.labels_list[idx]])
Example #25
def feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Doc2Vec requires LabeledSentence objects as input.
    # Turn the datasets from lists of words to lists of LabeledSentence objects.
    # YOUR CODE HERE
    labeled_train_pos = [None] * len(train_pos)
    labeled_train_neg = [None] * len(train_neg)
    labeled_test_pos = [None] * len(test_pos)
    labeled_test_neg = [None] * len(test_neg)

    i = 0
    for s in train_pos:
        labeled_train_pos[i] = LabeledSentence(words=s,
                                               tags=["TRAIN_POS_" + str(i)])
        i = i + 1

    i = 0
    for s in train_neg:
        labeled_train_neg[i] = LabeledSentence(words=s,
                                               tags=["TRAIN_NEG_" + str(i)])
        i = i + 1

    i = 0
    for s in test_pos:
        labeled_test_pos[i] = LabeledSentence(words=s,
                                              tags=["TEST_POS_" + str(i)])
        i = i + 1

    i = 0
    for s in test_neg:
        labeled_test_neg[i] = LabeledSentence(words=s,
                                              tags=["TEST_NEG_" + str(i)])
        i = i + 1

    # Initialize model
    model = Doc2Vec(min_count=1,
                    window=10,
                    size=100,
                    sample=1e-4,
                    negative=5,
                    workers=4)
    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg
    model.build_vocab(sentences)

    # Train the model
    # This may take a bit to run
    for i in range(5):
        print "Training iteration %d" % (i)
        random.shuffle(sentences)
        model.train(sentences)

    # Use the docvecs function to extract the feature vectors for the training and test data
    # YOUR CODE HERE
    train_pos_vec = []
    train_neg_vec = []
    test_pos_vec = []
    test_neg_vec = []

    for tag in model.docvecs.doctags.keys():
        if "TRAIN_POS_" in tag:
            train_pos_vec.append(model.docvecs[tag])
        elif "TRAIN_NEG_" in tag:
            train_neg_vec.append(model.docvecs[tag])
        elif "TEST_POS_" in tag:
            test_pos_vec.append(model.docvecs[tag])
        elif "TEST_NEG_" in tag:
            test_neg_vec.append(model.docvecs[tag])

    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
Example #26
def constructLabeledSentences(data):
    sentences = []
    for index, row in data.iteritems():
        sentences.append(LabeledSentence(utils.to_unicode(row).split(), ['Text' + '_%s' % str(index)]))
    return sentences
Example #27
# In[37]:


import re
sentences = []
sentiments = []
for fname in ["yelp", "amazon_cells", "imdb"]:
    with open("sentiment labelled sentences/%s_labelled.txt" % fname) as f:
        for item_no, line in enumerate(f):
            line_split = line.strip().split('\t')
            sent = line_split[0].lower()
            sent = re.sub(r'\'', '', sent)
            sent = re.sub(r'\W', ' ', sent)
            sent = re.sub(r'\s+', ' ', sent).strip()
            sentences.append(LabeledSentence(sent.split(), ["%s_%d" % (fname, item_no)]))
            sentiments.append(int(line_split[1]))


# In[38]:


sentences


# In[43]:


import random
class PermuteSentences(object):
    def __iter__(self):
        # the body is cut off in the original; a plausible completion that
        # reshuffles the labelled sentences on every pass over the corpus:
        shuffled = list(sentences)
        random.shuffle(shuffled)
        for sent in shuffled:
            yield sent
Example #28
def __iter__(self):
    for label, line in enumerate(open(self.filename)):
        print(line)
        print(label)
        print(type(line))
        yield LabeledSentence(words=line.split(), tags=['TXT_%s' % label])
Example #29
def label_Reviews(rev, type):
    labeled = []
    for i in range(1, len(rev) + 1):
        label = '%s_%s' % (type, i)
        labeled.append(LabeledSentence(rev[i - 1], [label]))
    return labeled
def to_array(self):
    self.sentences = []
    for tag, line in self.sources:
        self.sentences.append(LabeledSentence(words=line, tags=[tag]))

    return self.sentences