Example #1
 def __iter__(self):
     """
     Iterate over the documents, assigning a unique tag to each one.
     :rtype: gensim.models.doc2vec.LabeledSentence
     """
     for uid, line in enumerate(self.documents):
         yield LabeledSentence(line, ['SENT_%s' % uid])
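Note: LabeledSentence is the older gensim name for what current releases call TaggedDocument. As a rough sketch (the iterable class name and parameter values below are illustrative, assuming a gensim 3.x-style API), an iterator like the one above is typically consumed as follows:

# Minimal training sketch; the names and parameters here are assumptions, not part of the example above.
from gensim.models.doc2vec import Doc2Vec

corpus = LabeledDocs(documents)  # hypothetical iterable wrapping an __iter__ like the one above
model = Doc2Vec(vector_size=100, min_count=2, epochs=20)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
new_vec = model.infer_vector(['an', 'unseen', 'document'])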
Example #2
 def __iter__(self):
     for user in self.messages_dic:
         if self.is_sample:
             messages_list = random.sample(self.messages_dic[user], min(100, len(self.messages_dic[user])))
         else:
             messages_list = self.messages_dic[user]
         yield LabeledSentence((' '.join(messages_list)).split(), [user])
Example #3
 def __iter__(self):
     for source, prefix in self.sources.items():
         print "Iter over " + prefix
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 yield LabeledSentence(word_tokenize(line.strip()),
                                       [prefix])  #prefix + '_%s' % item_no
Example #4
 def __iter__(self):
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 yield LabeledSentence(
                     utils.to_unicode(line).split(),
                     [prefix + '_%s' % item_no])
Example #5
def getDoc2Vec(content, nbDim):
    # formatting
    dic = {}
    for (k, v) in content:
        if k in dic:
            dic[k].append(v)
        else:
            dic[k] = [v]
    vec = []
    for (key, val) in dic.items():
        vec.append(LabeledSentence(words=val, tags=['u' + key]))

    # Window size: length of the longest document
    w = max(len(val) for val in dic.values())
    # Model (inc. training (CBOW))
    model = gensim.models.Doc2Vec(vec,
                                  window=w,
                                  min_count=0,
                                  size=nbDim,
                                  dbow_words=1)
    # Locations
    # Some places are never visited --> don't appear in the vocabulary, so we take directly what's been counted
    locations = model.wv.syn0
    # Users
    users = [model.docvecs[u.tags[0]] for u in vec]
    return (model, users, locations, vec, dic)
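A brief, hedged usage note for the function above (`content` and the 'u' tag prefix follow getDoc2Vec; everything else here is illustrative): the returned model can be queried through the pre-4.0 gensim docvecs API.

# Illustrative only: assumes `content` is a list of (user_key, item) pairs as consumed by getDoc2Vec.
model, users, locations, vec, dic = getDoc2Vec(content, nbDim=50)
some_user = next(iter(dic))
# nearest user documents by cosine similarity (pre-4.0 docvecs API)
print(model.docvecs.most_similar('u' + some_user, topn=5))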
Example #6
 def __iter__(self):
     InputDatasets = self.datasets
     #iteration over a dictionary in which every value is a DataFrame
     for key, singleDataset in InputDatasets.items():
         for item_no, line, label in zip(range(0, len(singleDataset.index)), singleDataset["Text"],singleDataset["Informativeness"]):
             #yield is like return but for generators (for more details see: https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do)
             yield LabeledSentence(utils.to_unicode(line).split(), [key + '_%s' % item_no + '_' + label])
Example #7
def constructLabeledSentences(data):
    sentences = []
    for index, row in data.iteritems():
        sentences.append(
            LabeledSentence(
                utils.to_unicode(row).split(), ['Text' + '_%s' % str(index)]))
    return sentences
Example #8
 def to_array(self):
     self.sentences = []
     InputDatasets = self.datasets
     for key, singleDataset in InputDatasets.items():
         for item_no, line, label in zip(range(0, len(singleDataset.index)), singleDataset["Text"], singleDataset["Informativeness"]):
             self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [key + '_%s' % item_no + '_' + label]))
     return self.sentences
Example #9
 def __iter__(self):
     for name in self.ted_l1:
         with open(self.ted_l1[name]) as l1s, open(self.ted_l2[name]) as l2s:
             for l1, l2 in izip(l1s, l2s):
                 # ted corpus has lang suffixes
                 ws = l1.split() + l2.split()
                 yield LabeledSentence(words=ws, labels=[l1])
Example #10
 def to_array(self):
     for idx, line in enumerate(self.file.data):
         tags_doc = map_doc_to_topic(self.lda_model, self.id2word, idx,
                                     line)
         self.sentences.append(
             LabeledSentence(words=tokenize(line), tags=tags_doc))
     return self.sentences
Example #11
 def to_array(self):
     self.sentences = []
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
     return self.sentences
Example #12
 def generateListofTaggedDocuments(wordList,s):
     c = 0
     labeledList = []
     for tweet in wordList:
         labeledList.append(LabeledSentence(words=tweet, tags=[s+str(c)]))
         c = c + 1
     return labeledList
Example #13
def label_sentences(df):
    labeled_sentences = []
    for index, datapoint in df.iterrows():
        # `w` is a word-tokenizing regex pattern defined elsewhere (e.g. r"\w+")
        tokenized_words = re.findall(w, datapoint["reviewText"].lower())
        labeled_sentences.append(
            LabeledSentence(words=tokenized_words, tags=['SENT_%s' % index]))
    return labeled_sentences
Example #14
    def doc_vector_train(self, sentences):
        """Build labeled sentences from `sentences` and train a Doc2Vec model."""
        sen_sig_pre = "sen_"
        sen_cur = 0
        doc_sens = []

        len_of_sen = len(sentences)
        for i in xrange(len_of_sen):
            sen_sig = sen_sig_pre + str(sen_cur)
            labeled_sentence = LabeledSentence(words=sentences[i], labels=[sen_sig])
            doc_sens.append(labeled_sentence)
            sen_cur += 1

        self.doc2vec_model = Doc2Vec(alpha=0.025,
                                     min_alpha=0.025,
                                     size=self.size)
        self.doc2vec_model.build_vocab(doc_sens)

        # train: decay the learning rate by hand each epoch (old gensim pattern)
        for epoch in xrange(10):
            self.doc2vec_model.train(doc_sens)
            self.doc2vec_model.alpha -= 0.002
            self.doc2vec_model.min_alpha = self.doc2vec_model.alpha
        return
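The epoch loop above, with a manually decayed learning rate, is the pre-1.0 gensim idiom. A hedged sketch of the equivalent in gensim 3.x (parameter values illustrative), where train() manages epochs and learning-rate decay itself:

# Sketch only; assumes doc_sens is the list of tagged documents built above.
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec(vector_size=100, alpha=0.025, min_alpha=0.0001, min_count=1)
model.build_vocab(doc_sens)
model.train(doc_sens, total_examples=model.corpus_count, epochs=10)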
Example #15
def generate_document(ind_seq, patient_df):
    document = []
    patient_day_groups = patient_df.groupby(by=["ENTRY_DATE"])
    for _, day_df in patient_day_groups:
        day_codes = np.unique(day_df["CODE"].astype(str).values)
        day_codes.sort()
        document += day_codes.tolist()
    return LabeledSentence(words=document, tags=[str(ind_seq)])
Example #16
def get_journal_concat(journal):
    current_ngrams = []
    for doc in journal:
        uid = doc[0]
        for ngram in doc[1]:
            current_ngrams.append(ngram[0].lower())
    unique_ngrams = " ".join(list(set(current_ngrams)))
    # note: `uid` here holds the id of the last document seen in the journal
    return LabeledSentence(words=unique_ngrams.split(), tags=['%s' % (uid)])
Example #17
def createLabeledSentence(datasets, label):
	labeledSentObjList = []
	count  = 0
	for line in datasets:
		labeledSentObjList.append(LabeledSentence(line, [label+'%s' % count]))
		count = count + 1
	#print labeledSentObjList
	return labeledSentObjList
Example #18
 def __iter__(self):
     with open(self.filename) as file_obj:
         for line in file_obj:
             if not line.strip(): continue
             (doc_id, text) = eval(line)
             if doc_id in self.training_docs_list:
                 yield LabeledSentence(words=stemtokenizer(text),
                                       tags=[doc_id])
Example #19
def _doc2vec_doc_stream(docs):
    """
    Generator to feed sentences to the doc2vec model.
    """
    for i, doc in enumerate(docs):
        doc = doc.lower()
        tokens = word_tokenize(doc)
        # `phrases` is a collocation model (e.g. gensim Phrases) defined elsewhere
        yield LabeledSentence(phrases[tokens], ['SENT_{}'.format(i+1)])
Example #20
def corpus_to_sentences(corpus):
    sentences = []
    for idx, (name, doc) in enumerate(corpus.items()):
        sys.stdout.write('\rPreprocessing {}/{}'.format(idx, len(corpus)))
        sentence = LabeledSentence(words=doc, tags=[name])
        sentences.append(sentence)

    return sentences
Example #21
 def __iter__(self):
     f = self.filename
     with open(f + 'en') as ens, open(f + 'de') as des:
         for i, (en, de) in enumerate(islice(izip(ens, des), self.size)):
             pen = preprocess(en)
             en = ['%s_en' % w for w in pen.split()]
             de = ['%s_de' % w for w in preprocess(de).split()]
             yield LabeledSentence(words=en + de, labels=[pen])
Example #22
 def label_data(self, data):
     labelized = []
     for filename in data.keys():
         for i in range(len(data[filename])):
             sentence = data[filename][i]
             labelized.append(
                 LabeledSentence(sentence, [filename + "_" + str(i)]))
     return labelized
Example #23
 def __iter__(self):
     with nested(*(open(f) for f in self.filenames)) as texts:
         pe = {os.path.splitext(t.name)[1][1:]:t for t in texts }
         pe = [mappend(t,e) for e,t in pe.items()]
         for i, line in enumerate(islice(izip(*pe), self.size)):
             ws = ['%s_%s'%(w,s) for s,l in line for w in preprocess(l).split()]
             lbl = preprocess(dict(line)['en'])
             yield LabeledSentence(words=ws, labels=[lbl])
Example #24
File: doc2vec.py  Project: nyuvis/ela
 def scan_data():
     for doc_id, text in client.scan(query=query, field=field):
         tokens = [
             t for t in tokenize(clean_text(text))
             if t not in STOP_WORDS
         ]
         if len(tokens) > 0:
             yield LabeledSentence(words=tokens, tags=[doc_id])
Example #25
 def labelize_tweets_ug(self,tweets, label):
     result = []
     prefix = label
     i = 0
     for t in tweets:
         result.append(LabeledSentence(t, [prefix + '_%s' % i]))
         i = i + 1
     return result
Example #26
 def to_array(self):
     self.sentences = []
     for source, fname in self.sources.items():
         with utils.smart_open(fname) as fin:
             for item_no, line in enumerate(fin):
                 self.sentences.append(LabeledSentence( \
                       utils.to_unicode(line).split(), ['%s_PARA_%d'%(source, item_no)]))
     return self.sentences
Example #27
 def __iter__(self):
     for dataset in self.datasets:
         print dataset
         with open(dataset) as fid:
             for i, l in enumerate(fid):
                 if i > self.max_sent: break
                 txt = l.decode("utf-8").split()
                 yield LabeledSentence(words=txt, tags=[i])
Example #28
 def __iter__(self):
     with open(self.sents_fname, 'r') as sents_file:
         sents_csv = csv.reader(sents_file)
         next(sents_csv)  # Ignore header
         for rid, sent_index, tokens, overall, helpful in sents_csv:
             labels = [rid + '_' + str(sent_index)]
             tokens = tokens.lower().split(' ')
             yield LabeledSentence(words=tokens, tags=labels)
Example #29
	def to_array(self):
		self.sentences = []
		for fileName in os.listdir(dataPath):
			userTexts = pickle.load( open( dataPath + fileName, "rb" ) )
			for label in userTexts:
				for sentence in userTexts[label]:
					self.sentences.append(LabeledSentence(utils.to_unicode(sentence + " " + fileName[:-2] + '_%s' % label).split(), [fileName[:-2] + '_%s' % label]))
		return self.sentences
Example #30
def convert_headlines_to_vectors(stock, create_model=True):

    def read_headline_file():
        with open(os.path.join('..', 'data', stock + '-headlines.csv'), 'r', encoding="utf-8") as headline_file:
            for line in headline_file:
                if len(line) > 6:
                    date, headlines = line.split(',')
                    yield date, map(lambda x: x.strip(), headlines.split("@@"))

    if create_model:
        i = 0
        headlines_corpus = []
        for date, headlines in read_headline_file():
            for headline in headlines:
                if headline not in headlines_corpus:
                    headlines_corpus.append(LabeledSentence(process_raw_text(headline), tags=['headline_' + str(i)]))
                    i += 1

        doc_model = Doc2Vec(headlines_corpus, size=100, window=5, min_count=3, workers=4)
        doc_model.save(os.path.join('..', 'models', stock + '-headlines-doc2vec.model'))

    doc_model = Doc2Vec.load(os.path.join('..', 'models', stock + '-headlines-doc2vec.model'))

    with open(os.path.join('..', 'data', stock + '-headlines-vectors.csv'), 'w', encoding="utf-8") as headline_vectors:
        i = 0
        used = []
        for date, headlines in read_headline_file():  # TODO: file read not needed
            for headline in headlines:
                if headline not in used:
                    used.append(headline)
                    vector = doc_model.docvecs[i]
                    vector = str(list(vector))
                    headline_vectors.write("{},{}\n".format(date, vector))
                i += 1

    return doc_model