Example #1
 def to_array(self):
     self.sentences = []
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
     return self.sentences
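For context, this method typically belongs to a small corpus wrapper that maps file paths to tag prefixes. Below is a minimal sketch of such a class; the class name and constructor are assumptions, only to_array comes from the snippet above.

from gensim import utils
from gensim.models.doc2vec import LabeledSentence

class LabeledLineSentence(object):
    def __init__(self, sources):
        # Maps a file path to a tag prefix, e.g. {'train_pos.txt': 'TRAIN_POS'}
        self.sources = sources
        self.sentences = []

    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(
                        utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences

# Usage: LabeledLineSentence({'train_pos.txt': 'TRAIN_POS'}).to_array()
# yields one LabeledSentence per line, tagged 'TRAIN_POS_0', 'TRAIN_POS_1', ...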
Example #2
def gatherSentences(gatheredWords):
    # Task is a Django model assumed to be imported in the surrounding module.
    tasksList = Task.objects.values_list("task_name")
    rowNumber = 0
    for row in tasksList:
        # Skip empty rows.
        if len(row) != 0:
            currentSentence = row[0]  # values_list returns tuples; unpack task_name
            separatedWords = currentSentence.split(" ")
            # Strip punctuation and lowercase the words in place (see the sketch below).
            cleanseWords(separatedWords)
            gatheredWords.append(LabeledSentence(words=separatedWords, tags=["SENT_" + str(rowNumber)]))
            rowNumber += 1
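cleanseWords is not shown in the snippet. Below is one plausible in-place implementation matching the comment above; the name comes from the snippet, but the body is an assumption.

import string

def cleanseWords(words):
    # Lowercase and strip surrounding punctuation, mutating the list in place,
    # since the caller above ignores any return value.
    for i, word in enumerate(words):
        words[i] = word.lower().strip(string.punctuation)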
Example #3
import random

from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

def feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Doc2Vec requires TaggedDocument objects as input; LabeledSentence is the
    # older alias for TaggedDocument.
    # Turn the datasets from lists of words into lists of tagged objects.
    # YOUR CODE HERE
    # Wrap each tokenized document in a LabeledSentence with a unique tag.
    labeled_train_pos = [LabeledSentence(words=words, tags=["TRAIN_POS_%d" % i])
                         for i, words in enumerate(train_pos)]
    labeled_train_neg = [LabeledSentence(words=words, tags=["TRAIN_NEG_%d" % i])
                         for i, words in enumerate(train_neg)]
    labeled_test_pos = [LabeledSentence(words=words, tags=["TEST_POS_%d" % i])
                        for i, words in enumerate(test_pos)]
    labeled_test_neg = [LabeledSentence(words=words, tags=["TEST_NEG_%d" % i])
                        for i, words in enumerate(test_neg)]

    # Initialize the model ('size' is the vector dimensionality in the old gensim API)
    model = Doc2Vec(min_count=1,
                    window=10,
                    size=100,
                    sample=1e-4,
                    negative=5,
                    workers=4)
    print("Doc2Vec")
    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg
    model.build_vocab(sentences)

    # Train the model, reshuffling between passes; each model.train call runs
    # model.iter epochs, so this may take a bit to run
    for i in range(5):
        print("Training iteration %d" % (i))
        random.shuffle(sentences)
        model.train(sentences,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
    print("end of training")

    # Use model.docvecs to extract the feature vectors for the training and test data
    # YOUR CODE HERE
    # doctags is a dict, so iterating over its keys is not guaranteed to follow
    # the numeric tag order; index by tag explicitly to keep vectors aligned.
    train_pos_vec = [model.docvecs["TRAIN_POS_%d" % i] for i in range(len(train_pos))]
    train_neg_vec = [model.docvecs["TRAIN_NEG_%d" % i] for i in range(len(train_neg))]
    test_pos_vec = [model.docvecs["TEST_POS_%d" % i] for i in range(len(test_pos))]
    test_neg_vec = [model.docvecs["TEST_NEG_%d" % i] for i in range(len(test_neg))]

    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
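A hypothetical call with made-up, hand-tokenized inputs, just to show the expected shapes; min_count=1 above lets even this tiny corpus train.

train_pos = [["great", "movie"], ["loved", "it"]]
train_neg = [["boring", "plot"], ["waste", "of", "time"]]
test_pos = [["enjoyable", "film"]]
test_neg = [["terrible", "acting"]]

train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec = feature_vecs_DOC(
    train_pos, train_neg, test_pos, test_neg)
print(len(train_pos_vec), len(train_pos_vec[0]))  # 2 documents, 100 dimensions each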
Example #4
from gensim.models.doc2vec import LabeledSentence
from tqdm import tqdm

def labelizeTweets(tweets, label_type):
    labelized = []
    for i, v in tqdm(enumerate(tweets)):
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized
Example #5
def gettab(reviews, label_type):
    labelized = []
    for i, v in enumerate(reviews):
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized
Example #6
 def __iter__(self):
     for i, sentence in enumerate(self.sentences):
         yield LabeledSentence(words=sentence, tags=['SENT_%s' % i])
Example #7
 def __iter__(self):
     for idx, doc in enumerate(self.para_list):
         yield LabeledSentence(words=doc.split(), tags=[idx])
Example #8
 def __iter__(self):
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
Example #9
 def __iter__(self):
     for idx, doc in enumerate(self.doc_list):
         yield LabeledSentence(doc, [self.labels_list[idx]])
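A minimal corpus class this iterator could live in, pairing each pre-tokenized document with its own label; everything except __iter__ is an assumption.

from gensim.models.doc2vec import LabeledSentence

class LabeledDocs(object):
    def __init__(self, doc_list, labels_list):
        # doc_list: one token list per document; labels_list: one tag per document
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield LabeledSentence(doc, [self.labels_list[idx]])

# Usage with Doc2Vec's streaming interface:
# corpus = LabeledDocs(docs, labels); model.build_vocab(corpus)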
Example #10
import numpy as np
import pandas as pd
from gensim import models
from gensim.models.doc2vec import LabeledSentence

# Column dtypes for read_csv; the entries before this point are truncated in
# the original snippet.
dtype = {
    'project_subject_subcategories': str,
    'project_title': str,
    'project_essay_1': str,
    'project_essay_2': str,
    'project_essay_3': str,
    'project_essay_4': str,
    'project_resource_summary': str,
    'teacher_number_of_previously_posted_projects': int,
    'project_is_approved': np.uint8,
}
# Read the data into a DataFrame; train_file_path is assumed to be defined
# elsewhere. Sample 10,000 rows to keep training fast.
train_data = pd.read_csv(train_file_path,
                         sep=',',
                         dtype=dtype,
                         low_memory=True).sample(10000)
essay1 = train_data['project_essay_1']
ids = train_data['id']

ess1_list = []
for index, row in train_data.iterrows():
    # One LabeledSentence per essay, tagged with the row's id.
    ess1_list.append(
        LabeledSentence(row['project_essay_1'].split(" "), [row['id']]))
# size is the vector length; window is the maximum distance between the current
# and predicted word, not the number of words in a paragraph
model = models.Doc2Vec(size=100, window=200, min_count=3, workers=1)
model.build_vocab(ess1_list)  # build_vocab returns None, so nothing to assign
model.train(ess1_list, total_examples=model.corpus_count, epochs=10)
model.save("ess1_model.doc2vec")
# model_loaded = models.Doc2Vec.load('ess1_model.doc2vec')
# print("the first vector is:")
# print(model_loaded.docvecs[0])
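The saved model can also embed unseen text via infer_vector. A minimal sketch, assuming the file saved above and simple whitespace tokenization; the example essay is made up.

from gensim import models

model_loaded = models.Doc2Vec.load("ess1_model.doc2vec")
new_essay = "My students need notebooks for their science projects"
# infer_vector expects a list of tokens and returns a 100-dim vector (size=100 above)
vector = model_loaded.infer_vector(new_essay.split(" "))
print(vector[:5])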