def __iter__(self): """ Iterator of documents. Assign unique tag number to each document. :rtype: gensim.models.doc2vec.LabeledSentence """ for uid, line in enumerate(self.documents): yield LabeledSentence(self.documents[uid], ['SENT_%s' % uid])
def __iter__(self):
    for user in self.messages_dic:
        # Optionally subsample to at most 100 messages per user.
        if self.is_sample:
            messages_list = random.sample(self.messages_dic[user], min(100, len(self.messages_dic[user])))
        else:
            messages_list = self.messages_dic[user]
        # Concatenate all of the user's messages into one document tagged with the user id.
        yield LabeledSentence((' '.join(messages_list)).split(), [user])
def __iter__(self):
    for source, prefix in self.sources.items():
        print("Iter over " + prefix)
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                # All lines from one source share a single tag; use
                # prefix + '_%s' % item_no instead for per-line tags.
                yield LabeledSentence(word_tokenize(line.strip()), [prefix])
def __iter__(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
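# Hedged usage sketch (not from the original sources): assuming a hypothetical
# host class `LabeledLineSentence` whose __iter__ is the method above, this is
# how such a streaming corpus is typically fed to Doc2Vec under the old gensim
# API that still shipped LabeledSentence. File names are placeholders.
from gensim.models import Doc2Vec

sources = {'train-pos.txt': 'TRAIN_POS', 'train-neg.txt': 'TRAIN_NEG'}
corpus = LabeledLineSentence(sources)  # hypothetical wrapper class

model = Doc2Vec(min_count=1, window=10, size=100, workers=4)
model.build_vocab(corpus)
model.train(corpus)  # old-gensim signature; newer versions also need total_examples and epochs
vector = model.docvecs['TRAIN_POS_0']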
def getDoc2Vec(content, nbDim):
    # Group values by key: dic maps each key to the list of its values.
    dic = {}
    for (k, v) in content:
        if k in dic:
            dic[k].append(v)
        else:
            dic[k] = [v]
    vec = []
    for (key, val) in dic.items():
        vec.append(LabeledSentence(words=val, tags=['u' + key]))
    # Window size: length of the longest document. (len(x) over dic.items()
    # would always be 2, since each item is a (key, value) pair.)
    w = max(len(v) for v in dic.values())
    # Model (training happens in the constructor).
    model = gensim.models.Doc2Vec(vec, window=w, min_count=0, size=nbDim, dbow_words=1)
    # Locations: some places are never visited and so never appear in the
    # vocabulary, so take the word vectors directly from what has been counted.
    locations = model.wv.syn0
    # Users
    users = [model.docvecs[u.tags[0]] for u in vec]
    return (model, users, locations, vec, dic)
def __iter__(self):
    InputDatasets = self.datasets
    # Iterate over a dictionary in which every value is a DataFrame.
    for key, singleDataset in InputDatasets.items():
        for item_no, line, label in zip(range(len(singleDataset.index)),
                                        singleDataset["Text"],
                                        singleDataset["Informativeness"]):
            # yield is like return, but for generators (see
            # https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do).
            yield LabeledSentence(utils.to_unicode(line).split(), [key + '_%s' % item_no + '_' + label])
def constructLabeledSentences(data):
    sentences = []
    for index, row in data.iteritems():  # pandas Series iteration
        sentences.append(LabeledSentence(utils.to_unicode(row).split(), ['Text' + '_%s' % str(index)]))
    return sentences
def to_array(self):
    self.sentences = []
    InputDatasets = self.datasets
    for key, singleDataset in InputDatasets.items():
        for item_no, line, label in zip(range(len(singleDataset.index)),
                                        singleDataset["Text"],
                                        singleDataset["Informativeness"]):
            self.sentences.append(LabeledSentence(utils.to_unicode(line).split(),
                                                  [key + '_%s' % item_no + '_' + label]))
    return self.sentences
def __iter__(self):
    for name in self.ted_l1:
        with open(self.ted_l1[name]) as l1s, open(self.ted_l2[name]) as l2s:
            for l1, l2 in zip(l1s, l2s):  # was itertools.izip under Python 2
                # The TED corpus carries language suffixes on its tokens.
                ws = l1.split() + l2.split()
                # `labels=` is the old gensim keyword; newer versions use `tags=`.
                yield LabeledSentence(words=ws, labels=[l1])
def to_array(self):
    for idx, line in enumerate(self.file.data):
        # Tag each document with the LDA topics it maps to.
        tags_doc = map_doc_to_topic(self.lda_model, self.id2word, idx, line)
        self.sentences.append(LabeledSentence(words=tokenize(line), tags=tags_doc))
    return self.sentences
def to_array(self):
    self.sentences = []
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
    return self.sentences
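# Hedged sketch (not from the original sources): the to_array() pattern above is
# usually paired with per-epoch shuffling. `corpus` is assumed to be an instance
# of a hypothetical host class that stores `self.sentences` as in the method above.
import random
from gensim.models import Doc2Vec

model = Doc2Vec(min_count=1, window=10, size=100, workers=4)
model.build_vocab(corpus.to_array())
for epoch in range(10):
    random.shuffle(corpus.sentences)
    model.train(corpus.sentences)  # old-gensim signature that trained one pass per call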
def generateListofTaggedDocuments(wordList, s):
    labeledList = []
    for c, tweet in enumerate(wordList):
        labeledList.append(LabeledSentence(words=tweet, tags=[s + str(c)]))
    return labeledList
def label_sentences(df):
    labeled_sentences = []
    for index, datapoint in df.iterrows():
        # `w` is a module-level token regex defined elsewhere in the source.
        tokenized_words = re.findall(w, datapoint["reviewText"].lower())
        labeled_sentences.append(LabeledSentence(words=tokenized_words, tags=['SENT_%s' % index]))
    return labeled_sentences
def doc_vector_train(self, sentences):
    """Build and train a Doc2Vec model over `sentences`, decaying the learning rate each epoch."""
    doc_sens = []
    for i, sen in enumerate(sentences):
        # `labels=` is the old gensim keyword; newer versions use `tags=`.
        doc_sens.append(LabeledSentence(words=sen, labels=["sen_" + str(i)]))
    self.doc2vec_model = Doc2Vec(alpha=0.025, min_alpha=0.025, size=self.size)
    self.doc2vec_model.build_vocab(doc_sens)
    # Train for 10 epochs, manually lowering the learning rate after each pass.
    for epoch in range(10):
        self.doc2vec_model.train(doc_sens)
        self.doc2vec_model.alpha -= 0.002
        self.doc2vec_model.min_alpha = self.doc2vec_model.alpha
    return
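# Hedged sketch (not from the original sources): a rough equivalent of the loop
# above on newer gensim, where LabeledSentence has become TaggedDocument,
# `vector_size` replaces `size`, and train() takes an explicit corpus size and
# epoch count. `sentences` is assumed to be the same list of token lists.
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

doc_sens = [TaggedDocument(words=sen, tags=["sen_" + str(i)]) for i, sen in enumerate(sentences)]
model = Doc2Vec(alpha=0.025, min_alpha=0.025, vector_size=100)
model.build_vocab(doc_sens)
for epoch in range(10):
    model.train(doc_sens, total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.002
    model.min_alpha = model.alpha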
def generate_document(ind_seq, patient_df):
    document = []
    patient_day_groups = patient_df.groupby(by=["ENTRY_DATE"])
    for _, day_df in patient_day_groups:
        # np.unique returns the day's codes already sorted.
        day_codes = np.unique(day_df["CODE"].astype(str).values)
        document += day_codes.tolist()
    return LabeledSentence(words=document, tags=[str(ind_seq)])
def get_journal_concat(journal):
    current_ngrams = []
    for doc in journal:
        uid = doc[0]
        for ngram in doc[1]:
            current_ngrams.append(ngram[0].lower())
    # Deduplicate the n-grams; the tag is the uid of the journal's last document.
    unique_ngrams = " ".join(set(current_ngrams))
    return LabeledSentence(words=unique_ngrams.split(), tags=['%s' % uid])
def createLabeledSentence(datasets, label):
    labeledSentObjList = []
    for count, line in enumerate(datasets):
        labeledSentObjList.append(LabeledSentence(line, [label + '%s' % count]))
    return labeledSentObjList
def __iter__(self):
    with open(self.filename) as file_obj:
        for line in file_obj:
            if not line.strip():
                continue
            # Each non-empty line is a literal "(doc_id, text)" tuple;
            # ast.literal_eval (requires `import ast`) parses it without
            # eval()'s arbitrary-code-execution risk.
            (doc_id, text) = ast.literal_eval(line)
            if doc_id in self.training_docs_list:
                yield LabeledSentence(words=stemtokenizer(text), tags=[doc_id])
def _doc2vec_doc_stream(docs):
    """ Generator to feed sentences to the doc2vec model. """
    for i, doc in enumerate(docs):
        doc = doc.lower()
        tokens = word_tokenize(doc)
        # `phrases` is a pre-trained gensim Phrases model defined elsewhere;
        # indexing it merges learned collocations into single tokens.
        yield LabeledSentence(phrases[tokens], ['SENT_{}'.format(i + 1)])
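# Hedged sketch (not from the original sources): one way the `phrases` model
# used above could be built, assuming `token_stream` is an iterable of token lists.
from gensim.models.phrases import Phrases

phrases = Phrases(token_stream, min_count=5, threshold=10.0)
merged = phrases[['new', 'york', 'city']]  # e.g. ['new_york', 'city'] once the bigram is learned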
def corpus_to_sentences(corpus):
    sentences = []
    for idx, (name, doc) in enumerate(corpus.items()):
        sys.stdout.write('\rPreprocessing {}/{}'.format(idx, len(corpus)))
        sentence = LabeledSentence(words=doc, tags=[name])
        sentences.append(sentence)
    return sentences
def __iter__(self):
    f = self.filename
    with open(f + 'en') as ens, open(f + 'de') as des:
        for i, (en, de) in enumerate(islice(zip(ens, des), self.size)):  # was itertools.izip under Python 2
            pen = preprocess(en)
            # Suffix every token with its language code.
            en = ['%s_en' % w for w in pen.split()]
            de = ['%s_de' % w for w in preprocess(de).split()]
            # The preprocessed English sentence serves as the label
            # (`labels=` is the old gensim keyword; newer versions use `tags=`).
            yield LabeledSentence(words=en + de, labels=[pen])
def label_data(self, data):
    labelized = []
    for filename in data.keys():
        for i, sentence in enumerate(data[filename]):
            labelized.append(LabeledSentence(sentence, [filename + "_" + str(i)]))
    return labelized
def __iter__(self):
    # Python 2 idioms: `nested` is contextlib.nested and `izip` is
    # itertools.izip; `mappend` is a helper defined elsewhere in the source.
    with nested(*(open(f) for f in self.filenames)) as texts:
        # Key each open file by its extension, which doubles as the language code.
        pe = {os.path.splitext(t.name)[1][1:]: t for t in texts}
        pe = [mappend(t, e) for e, t in pe.items()]
        for i, line in enumerate(islice(izip(*pe), self.size)):
            # Suffix every token with its language code.
            ws = ['%s_%s' % (w, s) for s, l in line for w in preprocess(l).split()]
            # The preprocessed English line serves as the label.
            lbl = preprocess(dict(line)['en'])
            yield LabeledSentence(words=ws, labels=[lbl])
def scan_data():
    for doc_id, text in client.scan(query=query, field=field):
        tokens = [t for t in tokenize(clean_text(text)) if t not in STOP_WORDS]
        if tokens:
            yield LabeledSentence(words=tokens, tags=[doc_id])
def labelize_tweets_ug(self, tweets, label):
    result = []
    for i, t in enumerate(tweets):
        result.append(LabeledSentence(t, [label + '_%s' % i]))
    return result
def to_array(self):
    self.sentences = []
    for source, fname in self.sources.items():
        with utils.smart_open(fname) as fin:
            for item_no, line in enumerate(fin):
                self.sentences.append(LabeledSentence(utils.to_unicode(line).split(),
                                                      ['%s_PARA_%d' % (source, item_no)]))
    return self.sentences
def __iter__(self):
    for dataset in self.datasets:
        print(dataset)
        # Open in binary mode so the explicit utf-8 decode below works.
        with open(dataset, 'rb') as fid:
            for i, l in enumerate(fid):
                if i > self.max_sent:
                    break
                txt = l.decode("utf-8").split()
                # Note: tags are plain line numbers, so documents from
                # different datasets share tags.
                yield LabeledSentence(words=txt, tags=[i])
def __iter__(self):
    with open(self.sents_fname, 'r') as sents_file:
        sents_csv = csv.reader(sents_file)
        next(sents_csv)  # Skip the header row.
        for rid, sent_index, tokens, overall, helpful in sents_csv:
            labels = [rid + '_' + str(sent_index)]
            tokens = tokens.lower().split(' ')
            yield LabeledSentence(words=tokens, tags=labels)
def to_array(self):
    self.sentences = []
    for fileName in os.listdir(dataPath):
        with open(dataPath + fileName, "rb") as f:
            userTexts = pickle.load(f)
        for label in userTexts:
            for sentence in userTexts[label]:
                # The tag token is also appended to the sentence's words.
                tag = fileName[:-2] + '_%s' % label
                self.sentences.append(LabeledSentence(utils.to_unicode(sentence + " " + tag).split(), [tag]))
    return self.sentences
def convert_headlines_to_vectors(stock, create_model=True):
    def read_headline_file():
        with open(os.path.join('..', 'data', stock + '-headlines.csv'), 'r', encoding="utf-8") as headline_file:
            for line in headline_file:
                if len(line) > 6:
                    # Split on the first comma only, so commas inside headlines survive.
                    date, headlines = line.split(',', 1)
                    yield date, [h.strip() for h in headlines.split("@@")]

    if create_model:
        i = 0
        seen = []
        headlines_corpus = []
        for date, headlines in read_headline_file():
            for headline in headlines:
                # Deduplicate with the same rule as the writing pass below,
                # so tag indices stay aligned between the two passes.
                if headline not in seen:
                    seen.append(headline)
                    headlines_corpus.append(LabeledSentence(process_raw_text(headline), tags=['headline_' + str(i)]))
                    i += 1
        doc_model = Doc2Vec(headlines_corpus, size=100, window=5, min_count=3, workers=4)
        doc_model.save(os.path.join('..', 'models', stock + '-headlines-doc2vec.model'))

    doc_model = Doc2Vec.load(os.path.join('..', 'models', stock + '-headlines-doc2vec.model'))

    with open(os.path.join('..', 'data', stock + '-headlines-vectors.csv'), 'w', encoding="utf-8") as headline_vectors:
        i = 0
        used = []
        for date, headlines in read_headline_file():  # TODO: file read not needed
            for headline in headlines:
                if headline not in used:
                    used.append(headline)
                    vector = str(list(doc_model.docvecs[i]))
                    headline_vectors.write("{},{}\n".format(date, vector))
                    i += 1
    return doc_model
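# Hedged usage sketch (not from the original sources): a possible call site for
# the function above. The stock symbol is a placeholder, and the ../data and
# ../models layout is the one implied by the paths the function builds.
model = convert_headlines_to_vectors('AAPL', create_model=True)
print(model.docvecs['headline_0'][:5])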