Example No. 1
def train(x_train, size=200, epoch_num=1):  ## size: dimensionality of the final trained sentence vectors
    model_dm = Doc2Vec(x_train, min_count=3, window=5, vector_size=size, sample=1e-3, negative=5, workers=4)
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=10)
    model_dm.save('model/model_dm')  ## where the model is saved
    return model_dm
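
# A minimal follow-up sketch, assuming the 'model/model_dm' path used above:
# load the saved model and infer a vector for a new, pre-tokenized sentence.
from gensim.models.doc2vec import Doc2Vec

model_dm = Doc2Vec.load('model/model_dm')
new_doc = ['this', 'is', 'an', 'unseen', 'sentence']  # infer_vector expects a token list
vector = model_dm.infer_vector(new_doc)
print(vector.shape)  # (200,) with the default size=200 above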
Example No. 2

if __name__ == '__main__':
    print('Read data...')
    df_file_records, nw_file_records = \
        read_file_info_records(train_ere_dir, train_entity_info_dir, train_relation_info_dir, train_event_info_dir,
                               train_em_args_dir)
    test_df_file_records = \
        read_file_info_records(test_df_ere_dir, test_df_entity_info_dir, test_df_relation_info_dir,
                               test_df_event_info_dir, test_df_em_args_dir, False)
    test_nw_file_records = \
        read_file_info_records(test_nw_ere_dir, test_nw_entity_info_dir, test_nw_relation_info_dir,
                               test_nw_event_info_dir, test_nw_em_args_dir, False)
    file_records = df_file_records + nw_file_records + test_df_file_records + test_nw_file_records
    contexts = get_contexts(file_records)

    print('Write doctext...')
    texts = get_doc2vec_dataform(contexts)
    write_doc2vec_input(texts, doctext_path)

    print('Doc2vec...')
    docslist = doc2vec.TaggedLineDocument(doctext_path)
    model = Doc2Vec(docslist,
                    workers=multiprocessing.cpu_count(),
                    min_count=1,
                    size=200)
    model.save(docmodel_path)
    model = Doc2Vec.load(docmodel_path)
    doc2vec_model = model.docvecs
    print(doc2vec_model[0])
Example No. 3
    def _make_model(self, embedding_size=300):
        model = Doc2Vec(self.documents, vector_size=embedding_size, window=5, min_count=1, epochs=400, seed=42)
        model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
        return model
    def main_query(query):
        actual_query = query
        query = query.replace('?', '')
        new_text = ""
        new_sentences = ""
        new1 = ""

        if ext == "docx":
            passage = docx2txt.process(file_name)
            sentences = re.split('\n', passage)
            new_text = ""
            for i in sentences:
                if i != "":
                    j = i.lstrip('0123456789. ')
                    if (len(j) != len(i)):
                        if new_text != "":
                            new_text = new_text + " " + j
                        else:
                            new_text = new_text + j
            new1 = new_text
            new_sentences = sent_tokenize(new_text)
            print('inside docx')

        elif ext == 'txt':
            passage = ""
            with open(file_name) as f:
                for line in f:
                    passage = passage + line
            sentences = re.split('\n', passage)
            new_text = ""
            print("Length of sentences generated :", len(sentences))
            for i in sentences:
                if i != "":
                    j = i.lstrip('0123456789. ')
                    if (len(j) != len(i)):
                        if new_text != "":
                            new_text = new_text + " " + j
                        else:
                            new_text = new_text + j

            new_sentences = sent_tokenize(new_text)
            print('inside txt')

        elif ext == 'pdf':
            text = ""
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(file_name, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            converter.close()
            fake_file_handle.close()
            passage1 = text
            print("PDF")
            text_split = passage1.split()
            pdf_sent = ""
            for t in text_split:
                t = t.lstrip('0123456789. ')
                if t != "":
                    if pdf_sent == "":
                        pdf_sent = t + " " + pdf_sent
                    else:
                        pdf_sent = pdf_sent + " " + t

            print(pdf_sent)

            new_sentences = sent_tokenize(pdf_sent)
            print("PDF tokenize: ", len(new_sentences))

            new_text = ""
            for sent in new_sentences:
                new_text = sent + new_text

            print('inside pdf')

        elif ext == "xlsx":
            text = ""
            f = pd.ExcelFile(file_name)
            for names in f.sheet_names:
                sheet = pd.read_excel(f, names, header=None)
                for row in sheet.values:
                    for w in row:
                        w = w.lstrip('0123456789. ')
                        if text == "":
                            text = text + str(w)
                        else:
                            text = text + " " + str(w)

            new_text = text
            new_sentences = sent_tokenize(new_text)
            print("xlsx tokenize: ", len(new_sentences))
            print('inside excel')

        new2 = new_text
        print(new1 == new2)
        print(new_text)
        print(len(new_text))

        if query.startswith('is') or query.startswith('does'):

            result = predictor.predict(passage=new_text, question=query)
            answer = result['best_span_str']

            tokenized_doc = []

            for d in final_df["final_sentences"]:
                tokenized_doc.append(word_tokenize(d.lower()))

            tagged_data = [
                TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)
            ]

            model = Doc2Vec(tagged_data,
                            vector_size=20,
                            window=2,
                            min_count=1,
                            workers=4,
                            epochs=100)
            model.save("test_doc2vec.model")
            model = Doc2Vec.load("test_doc2vec.model")

            q_tokens = word_tokenize(query)
            q_tokens_pos = nltk.pos_tag(q_tokens)
            exclude_tag = [
                "RBR", "JJS", "IN", "CD", "JJR", "NNP", "VBG", "MD", "CC",
                "VBD", "DT", "VBN", "VBZ"
            ]
            q_tagged_list = []
            [
                q_tagged_list.append(x[0]) for x in q_tokens_pos
                if x[1] not in exclude_tag
            ]

            a_tokens = word_tokenize(answer)
            a_tokens_pos = nltk.pos_tag(a_tokens)
            exclude_tag = [
                "RBR", "JJS", "IN", "CD", "JJR", "NNP", "VBG", "MD", "CC",
                "VBD", "DT", "VBN", "VBZ"
            ]
            a_tagged_list = []
            [
                a_tagged_list.append(x[0]) for x in a_tokens_pos
                if x[1] not in exclude_tag
            ]

            query_final = ""
            for i in q_tagged_list:
                query_final += i + " "

            answer_final = ""
            for i in a_tagged_list:
                answer_final += i + " "

            vec1 = model.infer_vector(query_final.split())
            vec2 = model.infer_vector(answer_final.split())

            similarity = spatial.distance.cosine(vec1, vec2)

            if ((0.005 <= similarity <= 0.006)
                    or (0.012 <= similarity <= 0.022)
                    or (0.0561 <= similarity <= 0.0568)):
                return "No"

            else:
                return "Yes"

        else:

            if actual_query.endswith("?"):
                actual_query = actual_query
            else:
                actual_query = actual_query + "?"

            result = predictor.predict(passage=new_text, question=actual_query)
            answer = result['best_span_str']
            similarity_value = []
            print(len(new_sentences))
            print('inside what questions : ')
            print(answer)
            for k in new_sentences:

                output_tokenize = word_tokenize(answer)
                k_tokenize = word_tokenize(k)

                sw = stopwords.words('english')
                l1 = []
                l2 = []

                output_set = {w for w in output_tokenize if not w in sw}
                k_set = {w for w in k_tokenize if not w in sw}

                rvector = output_set.union(k_set)
                for w in rvector:
                    if w in output_set: l1.append(1)  # create a vector
                    else: l1.append(0)
                    if w in k_set: l2.append(1)
                    else: l2.append(0)
                c = 0

                for i in range(len(rvector)):
                    c += l1[i] * l2[i]
                    cosine = c / float((sum(l1) * sum(l2))**0.5)

                similarity_value.append(cosine)

            print("Result : ")

            print(max(similarity_value))
            print(new_sentences[similarity_value.index(max(similarity_value))])

            answer = new_sentences[similarity_value.index(
                max(similarity_value))]

            return answer
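
# A hedged aside on the yes/no branch above: spatial.distance.cosine returns a
# distance, so one simpler variant converts it to a similarity and applies a
# single threshold (the 0.5 value here is illustrative, not from the original).
from scipy import spatial

def is_affirmative(model, query_final, answer_final, threshold=0.5):
    vec1 = model.infer_vector(query_final.split())
    vec2 = model.infer_vector(answer_final.split())
    similarity = 1.0 - spatial.distance.cosine(vec1, vec2)  # cosine similarity
    return "Yes" if similarity >= threshold else "No"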
Example No. 5
                indices.write(userid + '\n')

    documents = TaggedLineDocument(args.d2v_dir + 'docs_songs.txt.gz')

    pathname = "{}-{}-{}-{}".format(args.size, args.window, args.min_count,
                                    args.sample)
    if os.path.exists(args.d2v_dir + pathname):
        raise Exception("It appears this model has already been run.")
    else:
        os.mkdir(args.d2v_dir + pathname)

    with timed('Running Doc2Vec'):
        model = Doc2Vec(documents,
                        dm=1,
                        sample=args.sample,
                        size=args.size,
                        window=args.window,
                        min_count=args.min_count,
                        workers=args.workers)

    if args.norm:
        with timed('Norming vectors'):
            from sklearn.preprocessing import Normalizer
            nrm = Normalizer('l2')
            normed = nrm.fit_transform(model.docvecs.doctag_syn0)
            words_normed = nrm.fit_transform(model.wv.syn0)

    with timed('Saving data'):
        if args.norm:
            np.save(
                '{0}{1}/user_features_normed_{1}.npy'.format(
Example No. 6
logger.info("Writing to	 " + dataPath)

# %% Import Data

logger.info("Reading Parquet File")
# data = pq.read_table(dataPath + '/data.clean.parquet').to_pandas()  # not used; sentences.csv is read below

# %% initialize
documents = gensim.models.doc2vec.TaggedLineDocument(dataPath + "sentences.csv")
model = Doc2Vec(
    dm=1,
    vector_size=600,
    negative=0,
    window=5,
    min_count=5,
    alpha=0.2,
    min_alpha=0.025,
    max_vocab_size=None,
    sample=2000,
    epochs=500,
    workers=(multiprocessing.cpu_count() - 1) * 3,
)
# %% build
model.build_vocab(documents=documents, progress_per=10000)
# %% train
model.train(
    documents=documents,
    epochs=2000,
    total_examples=model.corpus_count,
    start_alpha=0.2,
    end_alpha=0.025,
Example No. 7
#be closer to each other in vector space

model1 = gensim.models.Word2Vec(X_train, min_count=1, size=100, window=5)
model1.wv.vectors
model1.wv.vectors.shape

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_data = [
    TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
    for i, _d in enumerate(X_train)
]

model_ug_dbow = Doc2Vec(dm=0,
                        size=100,
                        negative=5,
                        min_count=2,
                        alpha=0.065,
                        min_alpha=0.065)
model_ug_dbow.build_vocab(tagged_data)

similar_doc = model_ug_dbow.docvecs.most_similar('1')

len(model_ug_dbow.docvecs)

a = np.matrix(model_ug_dbow.docvecs[0])

for i in range(1, len(model_ug_dbow.docvecs)):
    a = np.vstack((a, model_ug_dbow.docvecs[i]))

a.shape
Example No. 8
def feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Doc2Vec requires TaggedDocument objects as input.
    # Turn the datasets from lists of words to lists of TaggedDocument objects.
    # YOUR CODE HERE
    labeled_train_pos = [
        TaggedDocument(words, ["TRAIN_POS_" + str(i)])
        for i, words in enumerate(train_pos)
    ]
    labeled_train_neg = [
        TaggedDocument(words, ["TRAIN_NEG_" + str(i)])
        for i, words in enumerate(train_neg)
    ]
    labeled_test_pos = [
        TaggedDocument(words, ["TEST_POS_" + str(i)])
        for i, words in enumerate(test_pos)
    ]
    labeled_test_neg = [
        TaggedDocument(words, ["TEST_NEG_" + str(i)])
        for i, words in enumerate(test_neg)
    ]

    # Initialize model
    model = Doc2Vec(min_count=1,
                    window=10,
                    size=100,
                    sample=1e-4,
                    negative=5,
                    workers=4)
    print("Doc2Vec")
    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg
    model.build_vocab(sentences)

    # Train the model
    # This may take a bit to run
    for i in range(5):
        print("Training iteration %d" % (i))
        random.shuffle(sentences)
        model.train(sentences,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
    print("end of training")

    # Use the docvecs function to extract the feature vectors for the training and test data
    # YOUR CODE HERE
    train_pos_vec = [
        model.docvecs["TRAIN_POS_" + str(i)]
        for i in range(len(labeled_train_pos))
    ]
    train_neg_vec = [
        model.docvecs["TRAIN_NEG_" + str(i)]
        for i in range(len(labeled_train_neg))
    ]
    test_pos_vec = [
        model.docvecs["TEST_POS_" + str(i)]
        for i in range(len(labeled_test_pos))
    ]
    test_neg_vec = [
        model.docvecs["TEST_NEG_" + str(i)]
        for i in range(len(labeled_test_neg))
    ]

    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
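
# A hedged usage sketch for feature_vecs_DOC above; the inputs are lists of
# token lists, and the LogisticRegression classifier is an assumption, not part
# of the original code.
from sklearn.linear_model import LogisticRegression

train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec = \
    feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg)

X_train = train_pos_vec + train_neg_vec
y_train = ["pos"] * len(train_pos_vec) + ["neg"] * len(train_neg_vec)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

X_test = test_pos_vec + test_neg_vec
y_test = ["pos"] * len(test_pos_vec) + ["neg"] * len(test_neg_vec)
print(clf.score(X_test, y_test))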
Example No. 9
    def run(self):
        print('app started')

        cores = multiprocessing.cpu_count()
        print('num of cores is %s' % cores)
        gc.collect()
        if load_existing:
            print('loading an exiting model')
            model = Doc2Vec.load(doc2vec_model)
        else:
            print('reading training corpus from %s' % self.train_corpus)
            train_data = [self.train_corpus]
            if include_groups_in_train:
                train_data += self.groups
            # corpus_data = DocumentsIterable(train_data)
            corpus_data = CorpusReader(train_data).read_corpus()
            model = Doc2Vec(size=model_dimensions, window=10, min_count=3, sample=1e-4, negative=5, workers=cores, dm=1)
            print('building vocabulary...')
            model.build_vocab(corpus_data)

            # start training the model
            for epoch in range(epochs):
                print ('Now training epoch %s' % epoch)
                shuffle(corpus_data)
                model.train(corpus_data, total_examples=model.corpus_count, epochs=model.iter)
                # model.alpha -= 0.002  # decrease the learning rate
                # model.min_alpha = model.alpha  # fix the learning rate, no decay

            model.save(doc2vec_model)
            model.save_word2vec_format(word2vec_model)

        print('total docs learned %s' % (len(model.docvecs)))

        groups_vectors = []
        ids = []
        labels = []
        groups_sizes = {}

        # add the groups vectors
        for i, group in enumerate(self.groups):
            print('inferring group of documents from %s' % group)
            group_data = DocumentsIterable([group])
            if i == 0:
                groups_sizes[i] = 0
            else:
                groups_sizes[i] = groups_sizes[i-1]

            for vec in group_data:
                vec_data = model.infer_vector(vec.words)
                groups_vectors.append(vec_data)
                ids.append(vec.tags)
                labels.append(i)
                groups_sizes[i] += 1

        print('writing meta data to file in tensorflow format')
        with open(os.path.join(output_path, meta_file + '.tsv'), 'wb') as file_metadata:
            file_metadata.write(b'doc_id\tgroup' + b'\n')
            for i, id_val in enumerate(ids):
                file_metadata.write((id_val[0] + '\t' + str(groupsNames[labels[i]]) + '\n').encode('utf-8'))

        print('writing vectors to file')
        with open(os.path.join(output_path, meta_file + '-vecs.tsv'), 'wb') as file_metadata:
            for i, vec in enumerate(groups_vectors):
                file_metadata.write((",".join(["{}".format(number) for number in vec]) + '\n').encode('utf-8'))

        # create a new tensor board visualizer
        visualizer = TF_visualizer(groups_sizes, model_dimensions)

        # visualize the data using tensor board
        visualizer.visualize()
Example No. 10
l = []
for s in text:
    input = nltk.word_tokenize(s.lower())
    l.append(input)

print(l)
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(l)]

print(documents)
#documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]

#print(len(text))
model = Doc2Vec(documents,
                vector_size=2,
                window=2,
                min_count=1,
                workers=4,
                epochs=10)
#, vector_size=5, window=2, min_count=1, workers=4

#model.save("models/doc2vec_model")
#model = Doc2Vec.load("models/doc2vec_model")  # you can continue training with the loaded model!

input = nltk.word_tokenize("Proleukin".lower())
input_v = model.infer_vector(input)

#sentences = [ " ".join(w) for w in text]

llista = []
for sen in text:
    v = model.infer_vector([sen.lower()])
Example No. 11
def feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Doc2Vec requires TaggedDocument objects as input.
    # Turn the datasets from lists of words to lists of TaggedDocument objects.
    # YOUR CODE HERE

    labeled_train_pos = [None] * len(train_pos)
    labeled_train_neg = [None] * len(train_neg)
    labeled_test_pos = [None] * len(test_pos)
    labeled_test_neg = [None] * len(test_neg)

    index = 0
    for s in train_pos:
        labeled_train_pos[index] = LabeledSentence(
            words=s, tags=["TRAIN_POS_" + str(index)])
        index += 1

    index = 0
    for s in train_neg:
        labeled_train_neg[index] = LabeledSentence(
            words=s, tags=["TRAIN_NEG_" + str(index)])
        index += 1

    index = 0
    for s in test_pos:
        labeled_test_pos[index] = LabeledSentence(
            words=s, tags=["TEST_POS_" + str(index)])
        index += 1

    index = 0
    for s in test_neg:
        labeled_test_neg[index] = LabeledSentence(
            words=s, tags=["TEST_NEG_" + str(index)])
        index += 1

    # Initialize model
    model = Doc2Vec(min_count=1,
                    window=10,
                    size=100,
                    sample=1e-4,
                    negative=5,
                    workers=4)
    print("Doc2Vec")
    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg
    model.build_vocab(sentences)

    # Train the model
    # This may take a bit to run
    for i in range(5):
        print("Training iteration %d" % (i))
        random.shuffle(sentences)
        model.train(sentences,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)
        #model.train(sentences, total_examples=model.corpus_count)
    print("end of training")

    # Use the docvecs function to extract the feature vectors for the training and test data
    # YOUR CODE HERE

    train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec = [], [], [], []

    for i in model.docvecs.doctags.keys():
        if "TRAIN_POS_" in i:
            train_pos_vec.append(model.docvecs[i])
        elif "TRAIN_NEG_" in i:
            train_neg_vec.append(model.docvecs[i])
        elif "TEST_POS_" in i:
            test_pos_vec.append(model.docvecs[i])
        elif "TEST_NEG_" in i:
            test_neg_vec.append(model.docvecs[i])

    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
def feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Doc2Vec requires LabeledSentence objects as input.
    # Turn the datasets from lists of words to lists of LabeledSentence objects.
    # YOUR CODE HERE

    labeled_train_pos = []
    #print train_pos[0:2]
    for i, list_of_words in enumerate(train_pos):
        so = LabeledSentence(words=list_of_words, tags=['train_pos_' + str(i)])
        labeled_train_pos.append(so)

    labeled_train_neg = []
    for i, list_of_words in enumerate(train_neg):
        labeled_train_neg.append(
            LabeledSentence(words=list_of_words, tags=["train_neg_" + str(i)]))

    labeled_test_pos = []
    for i, list_of_words in enumerate(test_pos):
        labeled_test_pos.append(
            LabeledSentence(words=list_of_words, tags=["test_pos_" + str(i)]))

    labeled_test_neg = []
    for i, list_of_words in enumerate(test_neg):
        labeled_test_neg.append(
            LabeledSentence(words=list_of_words, tags=["test_neg_" + str(i)]))

    #print labeled_train_pos[0:3]
    #sys.exit(0)

    # Initialize model
    model = Doc2Vec(min_count=1,
                    window=10,
                    size=100,
                    sample=1e-4,
                    negative=5,
                    workers=4)
    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg
    model.build_vocab(sentences)

    # Train the model
    # This may take a bit to run
    for i in range(5):
        print "Training iteration %d" % (i)
        random.shuffle(sentences)
        model.train(sentences)

    # Use the docvecs function to extract the feature vectors for the training and test data
    # YOUR CODE HERE
    #train_pos_vec=model.docvecs[train_pos_
    train_pos_vec = []
    for i, line in enumerate(train_pos):
        train_pos_vec.append(model.docvecs['train_pos_' + str(i)])

    train_neg_vec = []
    for i, line in enumerate(train_neg):
        train_neg_vec.append(model.docvecs['train_neg_' + str(i)])

    test_pos_vec = []
    for i, line in enumerate(test_pos):
        test_pos_vec.append(model.docvecs['test_pos_' + str(i)])

    test_neg_vec = []
    for i, line in enumerate(test_neg):
        test_neg_vec.append(model.docvecs['test_neg_' + str(i)])

    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
Example No. 13
    def doc2vec(self, args):
        #training doc2vec on training set
        # https://radimrehurek.com/gensim/models/doc2vec.html
        token_pattern = RegexpTokenizer(self.token_pattern)

        modelname = 'doc2vec_win_' + str(args.d2vwindow) + '_dm_' + str(
            args.d2vdm)
        address = os.path.join(self.data_home, modelname)
        if os.path.exists(address):
            model = Doc2Vec.load(address)
        else:
            logging.info('doc2vec training started')
            documents_train = [
                TaggedDocument(token_pattern.tokenize(doc), [i])
                for i, doc in enumerate(self.df_train.text)
            ]
            model = Doc2Vec(documents_train,
                            vector_size=args.d2vec,
                            window=args.d2vwindow,
                            min_count=1,
                            workers=4,
                            dm=args.d2vdm)

            model.save(address)
            logging.info('doc2vec model saved')

        self.X_train_doc2vec = np.array([
            model.infer_vector(token_pattern.tokenize(row.text))
            for user, row in self.df_train.iterrows()
        ])
        self.X_test_doc2vec = np.array([
            model.infer_vector(token_pattern.tokenize(row.text))
            for user, row in self.df_test.iterrows()
        ])
        self.X_dev_doc2vec = np.array([
            model.infer_vector(token_pattern.tokenize(row.text))
            for user, row in self.df_dev.iterrows()
        ])
        # finding the 10 similar documents for each node
        all_dfs = pd.concat([self.df_train, self.df_dev, self.df_test])

        l = len(all_dfs)

        weight_matrix = np.zeros((l, l))
        predicted_similarities = []
        i = 0
        for row_id, row in all_dfs.iterrows():
            new_vector = model.infer_vector(token_pattern.tokenize(row.text))
            sims = np.array(model.docvecs.most_similar([new_vector]))
            column_id = sample_mask(sims[:, 0].astype(int), l)
            values = sims[:, 1]
            predicted_similarities.append(sims[:, 0])
            weight_matrix[i, column_id] = values
            i = i + 1
        predicted_similarities = np.array(predicted_similarities)
        predicted_similarities = predicted_similarities.astype(int)

        ## creating the adjacency matrix

        adj_doc2vec = np.zeros((l, l))
        # adj_doc2vec[np.arange(l), predicted_similarities] = 1

        for row, i in enumerate(predicted_similarities):
            adj_doc2vec[row, i] = 1

        self.adj_doc2vec = adj_doc2vec.astype(int)
        self.adj_weight_d2v = weight_matrix.astype(int)
Example No. 14
from collections import defaultdict

all_user_comments = defaultdict(list)
count = 0
for e in data:
    curr_list = []
    count += 1
    curr_list = all_user_comments[data[e]['author']]
    for word in (data[e]['body']):
        curr_list.append(word)
    all_user_comments[data[e]['author']] = curr_list

from nltk.tokenize import word_tokenize
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# reviews_train_list = getReviewsList(reviews_train)

tagged_data = [
    TaggedDocument(words=(_d), tags=[str(i)])
    for i, _d in all_user_comments.items()
]
model = Doc2Vec(tagged_data,
                vector_size=50,
                window=3,
                min_count=5,
                epochs=4,
                workers=8)

sims = model.docvecs.most_similar('YoungModern')
Example No. 15

test['wv_ft'] = test.cleaned_text.progress_apply(
    lambda x: wv_average(ft_model, x))

x_test4 = pd.DataFrame(test.wv_ft.to_list(), columns=wv_cols)

predict4 = lgbm4.predict(x_test4)

accuracy_score(test['category'], predict4)

############################ DBOW #######################

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

model = Doc2Vec(dm=0)

train['sentences1'] = [' '.join(i) for i in train['sentences']]

# tag the document

documents = [
    TaggedDocument(doc, [i]) for i, doc in enumerate(train['sentences'])
]
#doc_train = [documents[doc].words.split(' ') for doc in range(len(documents))]
model.build_vocab(documents)

# train model
model.train(documents, total_examples=model.corpus_count, epochs=30)

x_train5 = [model.docvecs[i] for i in range(train['sentences1'].shape[0])]
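
# The DBOW block above only reads back training vectors from model.docvecs;
# a minimal sketch for the held-out split, assuming test['sentences'] holds
# token lists prepared the same way as train['sentences']:
x_test5 = [model.infer_vector(tokens) for tokens in test['sentences']]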
Example No. 16
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

sentences = []
for review in reviews:
    sentences.append(review.split())

model_name = '4-1.300features.doc2vec'
model_saved = True

if model_saved:
    model = Doc2Vec.load(DATA_IN_PATH + model_name)
else:
    # Vectorize the sentences using the gensim package (Doc2Vec)
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]
    model = Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.00025,
                    min_count=10, workers=4, dm=1)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=10)
    model.save(DATA_IN_PATH + model_name)



# Inspect the model.
keys = list(model.wv.vocab.keys())[:20]
print(keys)

# Check the vector for the word 'stuff' (length = 300).
model.wv['stuff']

# Measure word similarity.
model.wv.similarity("dog", "cat")
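
# A small additional sketch: the documents above are tagged with their integer
# index, so document-level lookups work by position; the example review text
# below is illustrative only.
print(model.docvecs[0][:5])                   # first review's vector (length 300)
print(model.docvecs.most_similar(0, topn=5))  # nearest reviews by tag

new_review = "this movie was surprisingly good".split()
print(model.infer_vector(new_review)[:5])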
Example No. 17
    while node:
        word_type = node.feature.split(",")[0]
        if word_type in ["名詞"]:
            out_word.append(node.surface)
        node = node.next
    return out_word

# Each document used as training data
training_docs = []
for i, text in enumerate(texts):
    training_docs.append(TaggedDocument(words=tokenizer(text), tags=['doc' +str(i + 1)]))

#print(training_docs)
# min_count=1: use words that appear at least once for training
# dm=0: training model = DBOW
model = Doc2Vec(documents=training_docs, min_count=1, dm=0)

# Save the model
# model.save("model/doc2vec.model")

# Load the model
# model = Doc2Vec.load("model/doc2vec.model")

print(len(training_docs))

print(model.docvecs.most_similar('doc1'))
print(model.docvecs.most_similar('doc2'))

#for v, k in training_docs:
#    print(k)
#    print(v)
Example No. 18
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

file = [["I have a pet"], ["They have a pet"], ["she has no pet"], ["no pet"],
        ["many have pet"], ["Some have no pet"], ["they no pet"], ["no pet"],
        ["We have no pet"]]

total = [word.split() for sentence in file for word in sentence]

# Tokenize the sentences so they can be fed into the Doc2Vec model for training
totalTagged = [
    TaggedDocument(sentence, [i]) for i, sentence in enumerate(total)
]

#Create the model, build the vocabulary and finally train it
model = Doc2Vec(totalTagged, min_count=1, workers=1, vector_size=3)
model.build_vocab(totalTagged, update=True)
model.train(totalTagged, total_examples=model.corpus_count, epochs=1000000)
# Print all the words that are similar to these words
print(model.wv.most_similar("have"))
print(model.wv.most_similar("Some"))

#Print all the sentences that are similar to the labels
print(model.docvecs.most_similar(0))
print(model.docvecs.most_similar(8))

#print(model.wv)
Example No. 19
    "I love holidays", "I love shopping", "I love Indonesia",
    "This is a good day", "This is a good Things", "This is a good Handphone"
]

tokenized_doc = ['love']
tokenized_doc

print(doc)

#%%
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
tagged_data
## Train doc2vec model
model = Doc2Vec(tagged_data,
                vector_size=20,
                window=2,
                min_count=1,
                workers=4,
                epochs=100)
# Save trained doc2vec model
model.save("test_doc2vec.model")
## Load saved doc2vec model
model = Doc2Vec.load("test_doc2vec.model")
## Print model vocabulary
model.wv.vocab

#%% Question no. 4
import re
import os
unsup_sentences = []

# source: http://ai.stanford.edu/~amaas/data/sentiment/, data from IMDB
Example No. 20
            }
        }
        return result


if __name__ == '__main__':
    warnings.filterwarnings('ignore', category=FutureWarning)

    norm_train = list(read_train_dataset('./static/norm-train.jsonl'))
    anom_train = list(read_train_dataset('./static/anom-train.jsonl'))

    model = Doc2Vec(norm_train + anom_train,
                    dm=1,
                    window=2,
                    min_count=13,
                    vector_size=200,
                    alpha=0.08,
                    min_alpha=0.01,
                    epochs=160,
                    workers=6)
    model.save('./tmp/a.model')
    # model = Doc2Vec.load('./tmp/a.model')

    norm_train_vecs = [
        model.docvecs['norm' + str(i)] for i in range(len(norm_train))
    ]
    anom_train_vecs = [
        model.docvecs['anom' + str(i)] for i in range(len(anom_train))
    ]

    x_train = norm_train_vecs + anom_train_vecs
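
# The 'norm' + str(i) / 'anom' + str(i) keys above imply read_train_dataset()
# tags each TaggedDocument that way; a purely illustrative record showing the
# assumed tag scheme (the real parser reads the JSONL files):
from gensim.models.doc2vec import TaggedDocument
example_doc = TaggedDocument(words=['example', 'tokens'], tags=['norm0'])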
Example No. 21
                de = ['%s_de' % w for w in preprocess(de).split()]
                langs = [en, de]
                shuffle(langs)
                for l in langs:
                    yield LabeledSentence(words=l, labels=[pen] * self.repeat)


print('Simply training German and English words with the bitext sentence id as label')
start = timeit.default_timer()

f = sys.argv[1] + '/europarl-v7.de-en.'
n = 50000
sentences = BitextDoubleSentence(f, n)
print('%s sentences' % n)

model = Doc2Vec(alpha=0.025, min_alpha=0.025, size=256, workers=8)
model.build_vocab(sentences)
print('%s words in vocab' % (len(model.vocab) - n))

print('epochs')
for epoch in range(10):
    model.train(sentences)
    print(epoch)
    model.alpha -= 0.002  # decrease the learning rate

inspect_sentences(model)
model.train_lbls = False  # stop training sentences
model.alpha = 0.025  # reset learning rate
# scale sentence vectors by repeating
sentences = BitextDoubleSentence(f, n, repeat=5)
Example No. 22
def execute_interview_request(ir_object):
    LOGGER.info(
        f'[tag:INTRUNTER10] tasks.execute_interview_request: received execute request for ir_id: {ir_object.id}'
    )

    alphabet_list = ['A', 'B', 'C', 'D', 'E', 'F']

    tag_dict = {
        'funding': {
            'keywords': [
                'funding', 'investor', 'valuation', 'term sheet',
                'venture capital', 'venture debt'
            ],
            'mail_tag_line':
            'Type {}: Funding'
        },
        'acquisition': {
            'keywords': ['acquisition', 'acquired'],
            'mail_tag_line': 'Type {}: Acquisition'
        },
        'collabaration': {
            'keywords': ['collabarate', 'collabaration'],
            'mail_tag_line': 'Type {}: Collabaration'
        },
        'social good': {
            'keywords': ['donate'],
            'mail_tag_line': 'Type {}: Strategic initiative'
        },
        'covid': {
            'keywords': ['covid'],
            'mail_tag_line': 'Type {}: Covid'
        },
    }

    # ir_object = InterviewRequest.objects.get(id=ir_id)
    irr_object = get_object_or_None(InterviewRequestResult,
                                    type_form_id=ir_object.type_form_id,
                                    interview_request_id=ir_object.id,
                                    company_id=ir_object.company.id,
                                    user=ir_object.user.id)
    if not irr_object:
        irr_object = InterviewRequestResult(
            type_form_id=ir_object.type_form_id,
            is_published=False,
            interview_request_id=ir_object.id,
            company_id=ir_object.company.id,
            user_id=ir_object.user.id,
        )
        irr_object.save()

    user_name = ir_object.user.first_name
    company_name = ir_object.company.name
    result_data = dict()
    try:
        post_log("Getting news from google", 'STARTED')
        attachment_file_list = []
        user_tag_list = []
        from pygooglenews import GoogleNews
        gn = GoogleNews()
        s = gn.search(company_name.lower())
        final_data = []
        for news in s['entries']:
            new_dict = {
                'title': news['title'],
                'link': news['link'],
                'published': news['published']
            }
            summary_texts = []
            tags = []
            try:
                soup = BeautifulSoup(
                    requests.get(news['link'], timeout=300).content,
                    "html.parser")
                for p in soup.findAll('p'):
                    # print(p.text)
                    dummy_text = p.text
                    tags.extend(get_tag(dummy_text))
                    if "“" in dummy_text:
                        summary_texts.append(dummy_text)
                        # break
                if summary_texts:
                    new_dict['summary'] = summary_texts
                    new_dict['tags'] = list(set(tags))
                    user_tag_list.extend(new_dict['tags'])
                    final_data.append(new_dict)
            except Exception as e:
                print(f"{e} : {news}")
        result_data['news_data'] = final_data
        user_email = ir_object.user.email
        post_log(f"Getting news from google for {user_email}", 'COMPLETED')
        # creating a Dataframe object
        news_df = pd.DataFrame(final_data)
        news_df['Date'] = pd.to_datetime(news_df['published'], errors='coerce')
        news_df.sort_values(by=['Date'], inplace=True, ascending=False)
        del news_df['Date']
        file_name = f'{company_name}_Scrapped News.csv'
        news_df.to_csv(f'{DEFAULT_PATH}/{file_name}')
        post_log(f"File creation for the scrapped news for {user_email}",
                 'COMPLETED')
        attachment_file_list.append(file_name)
        google_play_app_id = ir_object.company.google_play_app_id
        if google_play_app_id:
            post_log(f"Srapping reviews for the app for {user_email}",
                     'STARTED')
            result = reviews_all(
                google_play_app_id,
                sleep_milliseconds=0,  # defaults to 0
                lang='en',  # defaults to 'en'
                country='us',  # defaults to 'us'
                sort=Sort.NEWEST  # defaults to Sort.MOST_RELEVANT
                # filter_score_with=5 # defaults to None(means all score)
            )
            post_log(f"Srapping reviews for the app for {user_email}",
                     'COMPLETED')

            df = pd.DataFrame(result)
            # df = pd.read_csv('{DEFAULT_PATH}/Netflix_all_reviews.csv')
            # print(df.head())
            # Product Scores
            # post_log(f"Histogram creation for the app reviews for {user_email}", 'STARTED')
            # fig = px.histogram(df, x="score")
            # fig.update_traces(marker_color="turquoise", marker_line_color='rgb(8,48,107)',
            #                   marker_line_width=1.5)
            # fig.update_layout(title_text='Product Score')
            # HTML(fig.to_html())
            # fig.write_image(f"{DEFAULT_PATH}/{company_name}_playstore_ratings.png")
            # plt.show()
            # plt.savefig(f'{DEFAULT_PATH}/{company_name}_playstore_ratings.png')
            # attachment_file_list.append(f"{company_name}_playstore_ratings.png")
            # post_log(f"Histogram creation for the app reviews for {user_email}", 'COMPLETED')
            reviews_df = df
            # reviews_df["review"] = reviews_df["content"].apply(lambda x: x.replace("No Negative", "").replace("No Positive", ""))
            reviews_df["is_bad_review"] = reviews_df["score"].apply(
                lambda x: 1 if x < 3 else 0)
            # select only the relevant columns
            reviews_df = reviews_df[[
                "content", "reviewCreatedVersion", "at", "is_bad_review"
            ]]
            # reviews_df.head()
            reviews_df["review"] = reviews_df["content"]
            # reviews_df
            post_log(f"Sentiment analysis for {user_email}", 'STARTED')
            # return the wordnet object value corresponding to the POS tag

            # clean text data
            reviews_df["review_clean"] = reviews_df["review"].apply(
                lambda x: clean_text(x))
            # add sentiment anaylsis columns

            sid = SentimentIntensityAnalyzer()
            reviews_df["sentiments"] = reviews_df["review"].apply(
                lambda x: sid.polarity_scores(str(x)))
            reviews_df = pd.concat([
                reviews_df.drop(['sentiments'], axis=1),
                reviews_df['sentiments'].apply(pd.Series)
            ],
                                   axis=1)
            # add number of characters column
            reviews_df["nb_chars"] = reviews_df["review"].apply(
                lambda x: len(str(x)))

            # add number of words column
            reviews_df["nb_words"] = reviews_df["review"].apply(
                lambda x: len(str(x).split(" ")))
            # create doc2vec vector columns

            documents = [
                TaggedDocument(doc, [i])
                for i, doc in enumerate(reviews_df["review_clean"].apply(
                    lambda x: str(x).split(" ")))
            ]

            # train a Doc2Vec model with our text data
            model = Doc2Vec(documents,
                            vector_size=5,
                            window=2,
                            min_count=1,
                            workers=4)

            # transform each document into a vector data
            doc2vec_df = reviews_df["review_clean"].apply(
                lambda x: model.infer_vector(str(x).split(" "))).apply(
                    pd.Series)
            doc2vec_df.columns = [
                "doc2vec_vector_" + str(x) for x in doc2vec_df.columns
            ]
            reviews_df = pd.concat([reviews_df, doc2vec_df], axis=1)
            # add tf-idfs columns
            tfidf = TfidfVectorizer(min_df=10)
            tfidf_result = tfidf.fit_transform(
                reviews_df["review_clean"]).toarray()
            tfidf_df = pd.DataFrame(tfidf_result,
                                    columns=tfidf.get_feature_names())
            tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
            tfidf_df.index = reviews_df.index
            reviews_df = pd.concat([reviews_df, tfidf_df], axis=1)
            # show is_bad_review distribution
            # reviews_df["sentiment"].value_counts(normalize = True)
            post_log(f"Sentiment analysis for {user_name}", 'COMPLETED')

            # print wordcloud
            post_log(f"Creating word cloud for {user_name}", 'STARTED')
            wc_name = show_wordcloud(reviews_df["review"], company_name)
            attachment_file_list.append(wc_name)
            post_log(f"Creating word cloud for {user_name}", 'COMPLETED')
            # highest positive sentiment reviews (with more than 5 words)
            reviews_df[reviews_df["nb_words"] >= 5].sort_values(
                "pos", ascending=False)[["review", "pos"]].head(10)

            # show is_bad_review distribution
            reviews_df["is_bad_review"].value_counts(normalize=True)

            # lowest negative sentiment reviews (with more than 5 words)
            post_log(f"Creating negative reviews csv for {user_name}",
                     'STARTED')
            negative_df = reviews_df[reviews_df["nb_words"] >= 5].sort_values(
                "neg", ascending=False)[["content", "neg"]].head(50)
            negative_df.to_csv(
                f'{DEFAULT_PATH}/{company_name}_negative_reviews.csv',
                columns=["content"])
            attachment_file_list.append(f'{company_name}_negative_reviews.csv')
            negative_reviews_data = negative_df.to_json(orient="split")
            parsed = json.loads(negative_reviews_data)
            result_data['negative_reviews'] = parsed
            post_log(f"Creating negative reviews csv for {user_name}",
                     'COMPLETED')
        else:
            attachment_file_list.extend(
                ['app_playstore.png', 'app_word_cloud.png'])
        # gbrowniepoint
        post_log(f"Creation of email body for {user_name}", 'STARTED')

        # Set Global Variables
        gmail_user = '******'
        gmail_password = GMAIL_PASSWORD

        fromaddr = "*****@*****.**"
        toaddr = "*****@*****.**"

        # instance of MIMEMultipart
        msg = MIMEMultipart()

        # storing the senders email address
        msg['From'] = fromaddr

        # storing the receivers email address
        msg['To'] = toaddr

        # storing the subject
        msg['Subject'] = f"Interview Brownie : {user_name}'s report"

        # string to store the body of the mail
        body = f'''
                <p>Hi {user_name},</p>
                <div dir="ltr"><br />Here is your report<br /><br /><strong><u>1. PR synthesis</u></strong>&nbsp;<br /><br />
              '''
        # print(f'body before adding tags : {body}')
        # print(user_tag_list)
        for index, tag in enumerate(list(set(user_tag_list))):
            tag_data = get_first_tag_quotes(tag, final_data)
            type_str = tag_dict[tag]['mail_tag_line'].format(
                alphabet_list[index])
            summary = '<br />'.join(map(str, tag_data["summary"]))
            body = body + f'<u>{type_str}</u><br /><br />Quote:<br />&nbsp;&ldquo;{summary}<br />Source: <a href="{tag_data["link"]}" target="_blank">{tag_data["title"]}</a><br /><br />'

        # print(f'body after adding tags : {body}')

        body = body + f'''
              <p><strong><u><em>How do you use these insights in your interview?<br /></em></u></strong><br />
              Interviewer - Do you have any questions for us?<br />{user_name} - Yes, I read about the launch of ASAP - how do people get assigned to such projects internally?<br /><br />
              From Type A.<br /><br />Another one,<br />{user_name} - I also read about the platform for data collaboration for covid - amazing to see the pace of execution on that one, how is that going?<br /><br />
              From type B<br /><br />{user_name} - There were 40 million raised for the clinical analysis, do we raise money for specific projects / verticals or was this a covid specific development?<br /><br />
              From type C.<br /><br />Now remember, these are just examples and you should be able to come up with genuine talking points, questions, things that you can relate to now with minimal effort of going through the links <br /><br />You can also find a consolidated list of all public mentions of {company_name} in the past year attached.<br /><u></u></p><div dir="ltr">&nbsp;</div>
              '''
        if google_play_app_id:
            body = body + f'''<div dir="ltr"><strong><u>2. End user understanding</u></strong></div>
                  <ul>
                  <li>A significant chunk of the bad ratings of the app are generic bad reviews, investing in talking to these consumers might uncover issues yet unknown</li>
                  <li>1 peculiar thing was the mention of cbse in a cluster of reviews, the CBSE learning experience might have some issues in particular</li>
                  </ul>
                  &nbsp;</div>
                  <div>This is a word cloud from all the positive reviews,<br />
                  <br><img src="cid:1"><br>
                  <ul>
                  <li>The trend of generic reviews continues here as well, 1 suggestion could be to request reviewers to write a few lines describing what they loved about their experience</li>
                  </ul>
                  <div>Thanks for trying out the beta, please feel free to revert with any questions, suggestions/ feedback etc and it will be super helpful to us if you can share this in your network - a linkedin post talking about your experience will help us reach more people<br /><br />If you don't have anything to ask or say, please revert with your rating out of 5 on how useful you found this tool; it will help us gauge its efficacy&nbsp;<br /><br />Cheers,</div>
                  </div>
                  <p>--</p>
                  <div dir="ltr" data-smartmail="gmail_signature">
                  <div dir="ltr">
                  <div>
                  <div dir="ltr">
                  <div dir="ltr">
                  <div dir="ltr">
                  <div>Gaurav Dagde and Gagan Gehani</div>
                  </div>
                  </div>
                  </div>
                  </div>
                  </div>
                  </div>'''
        else:
            body = body + f'''
                <div><strong><u>2. End user understanding<br /></u></strong></div>
                <div><br />Playstore reviews - Our system couldn't find {company_name} app on the playstore.
                 Nonetheless, I am attaching screenshots of the output of another beta tester to give you a taste of what you can expect from this section</div>
                <br><img src="cid:0"><br>
                <br><img src="cid:1"><br>
                <p>If you don't have anything to ask or say, please revert with your rating out of 5 on how useful you found this tool; it will help us gauge its efficacy
                <br /><br />All the best for your interview!</p>
                <div>Thanks for trying out the beta, please feel free to revert with any questions, suggestions/ feedback etc and it will be super helpful to us if you can share this in your network - a linkedin post talking about your experience will help us reach more people<br /><br />If you don't have anything to ask or say, please revert with your rating out of 5 on how useful you found this tool; it will help us gauge its efficacy&nbsp;<br /><br />Cheers,</div>
                </div>
                <p>--</p>
                <div dir="ltr" data-smartmail="gmail_signature">
                <div dir="ltr">
                <div>
                <div dir="ltr">
                <div dir="ltr">
                <div dir="ltr">
                <div>Gaurav Dagde and Gagan Gehani</div>
                </div>
                </div>
                </div>
                </div>
                </div>
                </div>
                '''

        result_data['mail_body'] = body
        # attach the body with the msg instance
        msg.attach(MIMEText(body, 'html', 'utf-8'))
        # file_list = ['ps_image.png',file_name]
        img_count = 0
        for attach_file in attachment_file_list:
            # open the file to be sent
            # filename = file_name
            attachment = open(f'/app/mail_content/{attach_file}', "rb")
            # to add an attachment is just add a MIMEBase object to read a picture locally.
            post_log(f"filename : {attach_file}", "IN_PROGRESS")
            if '.png' in attach_file:
                # post_log(f"In PNG block", "IN_PROGRESS")
                # with open(f'/app/mail_content/{attach_file}', 'rb') as attachment:
                # set attachment mime and file name, the image type is png
                mime = MIMEBase('image', 'png', filename=attach_file)
                # add required header data:
                mime.add_header('Content-Disposition',
                                'attachment',
                                filename=attach_file)
                mime.add_header('X-Attachment-Id', '{}'.format(img_count))
                mime.add_header('Content-ID', '<{}>'.format(img_count))
                # read attachment file content into the MIMEBase object
                mime.set_payload(attachment.read())
                # encode with base64
                encoders.encode_base64(mime)
                # add MIMEBase object to MIMEMultipart object
                msg.attach(mime)
                img_count += 1
            else:
                # post_log(f"In else block", "IN_PROGRESS")
                # instance of MIMEBase and named as p
                p = MIMEBase('application', 'octet-stream')

                # To change the payload into encoded form
                p.set_payload(attachment.read())

                # encode into base64
                encoders.encode_base64(p)

                p.add_header('Content-Disposition',
                             "attachment; filename= %s" % attach_file)

                # attach the instance 'p' to instance 'msg'
                msg.attach(p)

        # creates SMTP session
        s = smtplib.SMTP('smtp.gmail.com', 587)

        # start TLS for security
        s.starttls()

        # Authentication
        s.login(fromaddr, gmail_password)

        # Converts the Multipart msg into a string
        text = msg.as_string()
        post_log(f"Creation of email body for {user_name}", 'COMPLETED')
        # sending the mail
        s.sendmail(fromaddr, toaddr, text)
        post_log(f"Email sending for the user : {user_name}", 'COMPLETED')
        # terminating the session
        s.quit()

        # updating object value
        ir_object.is_visited_by_cron = True
        ir_object.save()
        # updating result
        irr_object.is_published = True
        irr_object.data = result_data
        irr_object.save()
    except Exception as e:
        # df.to_csv(f'{company_name}_all_reviews.csv')
        # traceback.print_exc()
        post_log(f"{e} : for user : {user_name}", "ERROR")
        irr_object.data = result_data
        irr_object.save()

    LOGGER.info(
        f'[tag:INTRUNTER20] tasks.execute_interview_request: finished execution for ir_id: {ir_object.id}'
    )
Example No. 23
tagged_data = [
    TaggedDocument(words=word_tokenize(sentence.lower()),
                   tags=[sentence.lower()]) for sentence in df['content']
]
df['TAGGED_DATA'] = tagged_data

# tagged_data contains TaggedDocuments whose words are the tokens and whose tag is the cleaned sentence itself, so we can identify it later

max_epochs = 100
vec_size = 100
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)

model.build_vocab(tagged_data)
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

# to get a sentence vector, use the cleaned form of the sentence as an index into model.docvecs
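
# A short illustration of that lookup: the tag is the lowercased sentence
# itself, so it doubles as the key into model.docvecs (first row used here).
example_sentence = df['content'].iloc[0].lower()
sentence_vector = model.docvecs[example_sentence]
print(sentence_vector.shape)  # (100,) given vec_size above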
Example No. 24
'''
Created on Apr 1, 2018

@author: aldoj
'''
import pickle

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

with open("8Ks_&_prices.p", "rb") as f:
    docs = pickle.load(f)

taggedDocs = []
for i, x in enumerate(docs):
    taggedDocs += [TaggedDocument(x, [i])]

model = Doc2Vec(taggedDocs, vector_size=100, window=5, min_count=5, workers=4)

doc1 = model.docvecs[0]

print("done")
Example No. 25
    tagged_documents.append(TaggedDocument(sentence, [i]))


# In[39]:


# Print one to check
print(tagged_documents[36])


# In[300]:


model = Doc2Vec(documents=tagged_documents,
                vector_size=100,
                min_count=5,
                window=5,
                epochs=20,
                dm= 0)


# In[301]:


print(question_words[66])
print(model.docvecs[66])


# In[37]:


print(model.docvecs.most_similar(66))
Example No. 26
    def train_model(self, tagged_docs):
        max_epochs = self.epochs
        vec_size = self.vec_size
        no_of_workers = multiprocessing.cpu_count() // 2
        logging.basicConfig(filename=self.log_path + 'training/' +
                            str(datetime.now()) + '_log (' +
                            self.model_name[:-6] + ').log',
                            level=logging.INFO)
        # alpha = 0.025

        model = Doc2Vec(vector_size=vec_size,
                        min_count=2,
                        dm=1,
                        workers=no_of_workers,
                        epochs=max_epochs)

        # BUILD VOCABULARY
        print("\nBuilding vocabulary started:", str(datetime.now()))
        logging.info('.. Build vocabulary ' + str(datetime.now()))
        vocab_start_time = time.monotonic()

        model.build_vocab(tagged_docs, progress_per=250000)

        vocab_end_time = time.monotonic()
        print("Building vocabulary ended:",
              str(datetime.now()) + ".", "Time taken:",
              timedelta(seconds=vocab_end_time - vocab_start_time), "Size:",
              len(model.wv.vocab))
        logging.info('.. Build vocabulary ended ' + str(datetime.now()) +
                     " Time taken: " +
                     str(timedelta(seconds=vocab_end_time -
                                   vocab_start_time)) + "Size: " +
                     str(len(model.wv.vocab)))

        # TRAIN MODEL
        print("Training began:", str(datetime.now()), "Vector Size:", vec_size,
              "Epochs:", max_epochs)
        logging.info('.. Train model ' + str(datetime.now()) +
                     "Vector Size: " + str(vec_size) + ", Epochs:" +
                     str(max_epochs))
        start_time = time.monotonic()

        model.train(tagged_docs,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)

        end_time = time.monotonic()
        print("Training Ended:",
              str(datetime.now()) + ".", "Time taken:",
              str(timedelta(seconds=end_time - start_time)))
        logging.info('.. Train model ended ' + str(datetime.now()) +
                     ' Time taken: ' +
                     str(timedelta(seconds=end_time - start_time)))

        # SAVE MODEL
        if not exists(self.path_models + self.model_name[:-6] + '/'):
            makedirs(self.path_models + self.model_name[:-6] + '/')
        model.save(self.path_models + self.model_name[:-6] + '/' +
                   self.model_name)
        print("Model Saved")

        # CLEAR LOG CONFIGS
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)
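
# A hedged usage sketch for train_model() above: tagged_docs is a list of
# TaggedDocument objects, one per text; the corpus, tokenization and the
# 'trainer' instance name are placeholders, not part of the original class.
from gensim.models.doc2vec import TaggedDocument

corpus = ["first training text", "second training text"]
tagged_docs = [TaggedDocument(text.lower().split(), [i]) for i, text in enumerate(corpus)]
# trainer.train_model(tagged_docs)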
Example No. 27
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]

import aaa

documents = []
for i in aaa.read("temp"):
    documents.append(TaggedDocument(i[0], i[1]))
    print("here")
print(documents[2])
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

fname = get_tmpfile("test.model")

model.save(fname)
model = Doc2Vec.load(fname)

vector = model.infer_vector(["system", "response"])
tokens = ["system", "response"]
print(model.docvecs.index_to_doctag(0))
new_vector = model.infer_vector(tokens)
sims = model.docvecs.most_similar([new_vector])
print(sims)
from gensim.test.utils import common_corpus, common_dictionary
from gensim.similarities import MatrixSimilarity
query = [(1, 2), (5, 4)]
index = MatrixSimilarity(common_corpus, num_features=len(common_dictionary))
sims = index[query]
"""
Things to try first:
1) Get vector embeddings for description and transcript. (or title?)
2) Cluster them and see if we can make any sense out of the clusters.
3) Try to find patterns in the transcript talks.  
"""

# TED data
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

df = pd.read_csv('datasets/ted_main.csv')
df_transcript = pd.read_csv('datasets/transcripts.csv')
"""
Use Doc2Vec to vectorize, visualize the vectors.  Cluster and reduce dimension, then plot clusters.
"""
descriptions = [x for x in df['description']]
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(descriptions)]
model = Doc2Vec(documents, vector_size=8, window=2, min_count=1, workers=4)
tags = list(range(0, len(documents)))
vector_list = model.docvecs[tags]

# reduce dim of description vectors...
data_embed1 = TSNE(n_components=2,
                   perplexity=50,
                   verbose=2,
                   method='barnes_hut').fit_transform(vector_list)

# plot the reduced vectors
x_axis = data_embed1[:, 0]
y_axis = data_embed1[:, 1]
plt.scatter(x_axis, y_axis, s=5)
plt.show()
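
# Step 2 of the notes above ("Cluster them") is not shown; a minimal sketch
# with scikit-learn KMeans on the same description vectors (k=10 is arbitrary).
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=10, random_state=0).fit(vector_list)
df['cluster'] = kmeans.labels_

# colour the t-SNE scatter by cluster to eyeball the separation
plt.scatter(x_axis, y_axis, c=kmeans.labels_, s=5, cmap='tab10')
plt.show()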
Example No. 29
    def __init__(
        self,
        training_text,
        training_tags,
        tokenize_f=None,
        tags_to_train=None,
        tags_to_test=None,
        vector_size=300,
        window_size=15,
        min_count=1,
        sampling_threshold=1e-4,
        negative_size=5,
        train_epoch=40,
        dm=0,
        worker_count=7,
    ):
        training_data = list(zip(training_text, training_tags))

        if tokenize_f is None:
            tokenizer = TweetTokenizer(preserve_case=False)
            stopwords = frozenset(nltk.corpus.stopwords.words("english"))
            stemmer = SnowballStemmer("english")

            tokenize_f = lambda s: [
                stemmer.stem(w) for w in tokenizer.tokenize(s)
                if w not in stopwords
            ]

            #tokenize_f = re.compile("\w+").findall

        if tags_to_train is None:
            tags_to_train = {tag for _, tags in training_data for tag in tags}

        if tags_to_test is None:
            tags_to_test = [tags_to_train]

        model = Doc2Vec(vector_size=vector_size,
                        window=window_size,
                        min_count=min_count,
                        sample=sampling_threshold,
                        negative=negative_size,
                        epochs=train_epoch,
                        dm=dm,
                        workers=worker_count)

        training_data_docs = []

        for text, tags in training_data:
            # Only add data points whose tags overlap with ys_to_train to the
            # training data set
            tags = list(set(tags) & tags_to_train)

            if tags:
                training_data_docs.append(
                    TaggedDocument(tokenize_f(text), tags))

        model.build_vocab(training_data_docs)
        model.train(training_data_docs,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)

        self.model = model
        self.tags = tags_to_test
        self.tokenize = tokenize_f
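
# A hedged sketch of using the trained model stored on the instance to suggest
# tags for new text; 'tagger' (the instance name) and the top-5 cut-off are
# assumptions, not part of the original class.
new_text = "example input to tag"
tokens = tagger.tokenize(new_text)
inferred = tagger.model.infer_vector(tokens)
for tag, score in tagger.model.docvecs.most_similar([inferred], topn=5):
    print(tag, round(score, 3))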
Example No. 30
# train original word2vec
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.doc2vec import TaggedLineDocument
import fasttext

sentences = LineSentence(filename)
model = Word2Vec(sentences, size=300, window=3, min_count=5, workers=6, sg=1)
model.save(word2vec_model)

# train fasttext
model = fasttext.skipgram(filename,
                          fasttext_model,
                          dim=300,
                          ws=5,
                          word_ngrams=3)

# train doc2vec
# build TaggedLineDocument object
documents = TaggedLineDocument(filename)
dm_model = Doc2Vec(size=300, window=4, min_count=5, workers=6)

# build vocab for model
dm_model.build_vocab(documents)

# train model using corpus
dm_model.train(documents, total_examples=dm_model.corpus_count, epochs=10)

# save model
dm_model.save(doc2vec_model)

# # learn from corpus + titles data
# # build list of TaggedDocument objects from corpus
# alldocs = []
# with open(filename) as alldata:
#     for line_no, line in enumerate(alldata):