def to_array(self):
    """Materialize every labeled sentence from the configured sources.

    Iterates over the (file path, tag prefix) pairs in ``self.sources``,
    tokenizes each line, stores the resulting LabeledSentence objects on
    ``self.sentences``, and returns that list.
    """
    self.sentences = []
    for path, tag_prefix in self.sources.items():
        with utils.smart_open(path) as handle:
            for line_no, raw_line in enumerate(handle):
                tokens = utils.to_unicode(raw_line).split()
                # Tag format matches the streaming __iter__: "<prefix>_<line>".
                self.sentences.append(
                    LabeledSentence(tokens, ['%s_%s' % (tag_prefix, line_no)]))
    return self.sentences
def gatherSentences(gatheredWords):
    """Append one LabeledSentence per non-empty task row to *gatheredWords*.

    Fetches task names from the Task model, tokenizes and cleanses each one
    in place, and tags it "SENT_<running index>".
    """
    rows = Task.objects.values_list("task_name")
    sentence_index = 0
    for entry in rows:
        # Guard clause: skip empty result tuples.
        if not entry:
            continue
        # task_name is the first (and only) element of the values_list tuple.
        tokens = entry[0].split(" ")
        # Strip punctuation / lowercase the tokens in place.
        cleanseWords(tokens)
        gatheredWords.append(
            LabeledSentence(words=tokens, tags=["SENT_" + str(sentence_index)]))
        sentence_index += 1
def feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg):
    """Build Doc2Vec feature vectors for the train/test pos/neg datasets.

    Each argument is a list of token lists.  Returns four lists of document
    vectors, in the order (train_pos, train_neg, test_pos, test_neg).
    """
    def _label(docs, prefix):
        # Wrap each token list in a LabeledSentence tagged "<prefix>_<index>".
        return [LabeledSentence(words=doc, tags=["%s_%d" % (prefix, i)])
                for i, doc in enumerate(docs)]

    labeled_train_pos = _label(train_pos, "TRAIN_POS")
    labeled_train_neg = _label(train_neg, "TRAIN_NEG")
    labeled_test_pos = _label(test_pos, "TEST_POS")
    labeled_test_neg = _label(test_neg, "TEST_NEG")

    # Hyperparameters kept exactly as in the original run.
    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4,
                    negative=5, workers=4)
    print("Doc2Vec")
    sentences = (labeled_train_pos + labeled_train_neg +
                 labeled_test_pos + labeled_test_neg)
    model.build_vocab(sentences)

    # Train for 5 passes, reshuffling the corpus between passes.
    # This may take a while.
    for i in range(5):
        print("Training iteration %d" % i)
        random.shuffle(sentences)
        model.train(sentences, total_examples=model.corpus_count,
                    epochs=model.iter)
    print("end of training")

    # Pull the learned vectors back out, bucketed by tag prefix.
    # startswith() makes the prefix test explicit (substring `in` would also
    # match a prefix appearing mid-tag, which these tags never do but the
    # intent is clearer this way).
    # NOTE(review): vectors are collected in doctags iteration order, which is
    # not guaranteed to match document order — confirm callers don't rely on
    # positional alignment with the input lists.
    train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec = [], [], [], []
    for tag in model.docvecs.doctags.keys():
        if tag.startswith("TRAIN_POS_"):
            train_pos_vec.append(model.docvecs[tag])
        elif tag.startswith("TRAIN_NEG_"):
            train_neg_vec.append(model.docvecs[tag])
        elif tag.startswith("TEST_POS_"):
            test_pos_vec.append(model.docvecs[tag])
        elif tag.startswith("TEST_NEG_"):
            test_neg_vec.append(model.docvecs[tag])

    # Return the four feature vectors on a single line (the original return
    # statement was broken across two physical lines).
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
def labelizeTweets(tweets, label_type):
    """Wrap each tweet in a LabeledSentence tagged '<label_type>_<index>'.

    Progress is reported through tqdm while iterating.
    """
    return [LabeledSentence(tweet, ['%s_%s' % (label_type, idx)])
            for idx, tweet in tqdm(enumerate(tweets))]
def gettab(reviews, label_type):
    """Return one LabeledSentence per review, tagged '<label_type>_<index>'."""
    return [LabeledSentence(review, ['%s_%s' % (label_type, idx)])
            for idx, review in enumerate(reviews)]
def __iter__(self):
    """Yield each stored sentence as a LabeledSentence tagged 'SENT_<i>'."""
    # enumerate() replaces the range(len(...)) anti-idiom: same indices and
    # order, without positional indexing into self.sentences.
    for i, sentence in enumerate(self.sentences):
        yield LabeledSentence(words=sentence, tags=['SENT_%s' % i])
def __iter__(self):
    """Yield one LabeledSentence per paragraph, tagged with its list index.

    Note: tags are bare integer indices, not strings.
    """
    position = 0
    for paragraph in self.para_list:
        yield LabeledSentence(words=paragraph.split(), tags=[position])
        position += 1
def __iter__(self):
    """Stream labeled sentences from every configured source file.

    Each line of each source becomes a LabeledSentence tagged
    "<prefix>_<line number>".
    """
    for path, tag_prefix in self.sources.items():
        with utils.smart_open(path) as handle:
            line_no = 0
            for raw_line in handle:
                tokens = utils.to_unicode(raw_line).split()
                yield LabeledSentence(tokens, ['%s_%s' % (tag_prefix, line_no)])
                line_no += 1
def __iter__(self):
    """Yield each document paired with its label from labels_list.

    Indexing into self.labels_list (rather than zipping) preserves the
    original behavior of raising IndexError if labels_list is shorter
    than doc_list.
    """
    position = 0
    for document in self.doc_list:
        yield LabeledSentence(document, [self.labels_list[position]])
        position += 1
'project_subject_subcategories': str, 'project_title': str, 'project_essay_1': str, 'project_essay_2': str, 'project_essay_3': str, 'project_essay_4': str, 'project_resource_summary': str, 'teacher_number_of_previously_posted_projects': int, 'project_is_approved': np.uint8, } # Read data and store in DataFrame. train_data = pd.read_csv(train_file_path, sep=',', dtype=dtype, low_memory=True).sample(10000) essay1 = train_data['project_essay_1'] ids = train_data['id'] ess1_list = [] for index, row in train_data.iterrows(): ess1_list.append( LabeledSentence(row['project_essay_1'].split(" "), [row['id']])) #size is the vector length, window means how many words are included in one paragraph model = models.Doc2Vec(size=100, window=200, min_count=3, workers=1) vocab = model.build_vocab(ess1_list) model.train(ess1_list, epochs=10, total_words=100) model.save("ess1_model.doc2vec") # model_loaded = models.Doc2Vec.load('ess1_model.doc2vec') # print "the first vector is: " # print model.docvecs[0]