Example #1
def test_dmc_neg(self):
    """Test DM/concatenate doc2vec training."""
    model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_concat=1, size=24, window=4, hs=0, negative=10,
                            alpha=0.05, min_count=2, iter=20)
    self.model_sanity(model)
Example #2
def test_deterministic_hs(self):
    """Test doc2vec results identical with identical RNG seed."""
    # hs
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), seed=42, workers=1)
    model2 = doc2vec.Doc2Vec(DocsLeeCorpus(), seed=42, workers=1)
    self.models_equal(model, model2)
Example #3
    # if not os.path.exists(model_path):
    # sentencess =  LabeledLineSentence
    # train_list = get_namelist(train_path)
    # sentences = d2v.TaggedLineDocument(path_main+doc_labeled)
    sentences = list(read_corpus(output))
    sentences_test = list(read_corpus(output_test))
    sentences.extend(sentences_test)
    print(sentences_test)
    # testset_tranning = get_testset_update_trainning(True)
    # sentences.extend(testset_tranning)
    # print(testset_tranning)
    model = d2v.Doc2Vec(min_count=1,
                        alpha=0.02,
                        min_alpha=0.015,
                        window=100,
                        size=150,
                        sample=1e-4,
                        negative=6,
                        workers=8)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count,
                epochs=120)  #,start_alpha=0.02,end_alpha=0.015
    """
    documents (iterable of iterables) – The documents iterable can be simply a list of TaggedDocument elements, but for 
    larger corpora, consider an iterable that streams the documents directly from disk/network. If you don’t supply 
    documents, the model is left uninitialized – use if you plan to initialize it in some other way.
    """

    # model = d2v.Doc2Vec(sentences,min_count=2,alpha=0.02, min_alpha=0.015, window=60,
    #                     size=180, sample=1e-4, negative=5,iter=5, workers=8)
    model.save(model_path)
Example #4
config = {
    'size': 100,
    'dm_concat': 1,
    'dm': 1,
    'min_count': 2
}  # embed in a 100-dimensional space; dm=1 (use PV-DM); dm_concat=1 (concatenate the word vectors and the paragraph vector when building the hidden-layer input; the paper reports concatenation works better than averaging); min_count=2 (keep only words that appear at least twice)

# The documents must be wrapped in namedtuples before doc2vec can run on them.
economic_tagged_document = namedtuple('economic_tagged_document',
                                      ['words', 'tags'])
# Build the namedtuples
economic_tagged_tr_document = [
    economic_tagged_document(words, [tags])
    for tags, words in enumerate(economic_stop)
]
# Create the doc2vec object and set its hyperparameters
economic_doc_vectorizer = doc2vec.Doc2Vec(**config)
# Build the vocabulary
economic_doc_vectorizer.build_vocab(economic_tagged_tr_document)
# 100 epochs
for epoch in range(100):
    print(epoch)
    # Train on the training documents
    economic_doc_vectorizer.train(economic_tagged_tr_document)
    # Learning rate decay
    economic_doc_vectorizer.alpha -= 0.002
    # Minimum learning rate pinned to the decayed value
    economic_doc_vectorizer.min_alpha = economic_doc_vectorizer.alpha

oshumed_tagged_document = namedtuple('oshumed_tagged_document',
                                     ['words', 'tags'])
oshumed_tagged_tr_document = [
Example #5
def test_persistence(self):
    """Test storing/loading the entire model."""
    tmpf = get_tmpfile('gensim_doc2vec.tst')
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
    model.save(tmpf)
    self.models_equal(model, doc2vec.Doc2Vec.load(tmpf))
Example #6
num_features = 100
model_name = "sec_filings_model_{}.d2v".format(num_features)

# prepare training data
sentences = filings_iterator(
    tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),
    N=10000,
    useDB=True,
    year_start=year_start,
    year_end=year_end)

model = doc2vec.Doc2Vec(
    sentences,
    size=num_features,
    min_count=1,
    seed=5,
    window=20,
    sample=1e-3,
    #hashfxn = analyze.hash32
    workers=4)

#model.init_sims(replace=True)
#model.save(model_name)

# ==============================================================
# retrieve features for each document and store them in a file
# ==============================================================
sentences = filings_iterator(
    tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),
    N=26761,
    useDB=True,
Example #7
def test_dbow_neg(self):
    """Test DBOW doc2vec training."""
    model = doc2vec.Doc2Vec(list_corpus, dm=0, hs=0, negative=10, min_count=2, epochs=20)
    self.model_sanity(model)
Example #8
for d in rawDoc.keys():
	docs.append(analyzedDocument(rawDoc[d], [d]))

alpha_val = 0.025
min_alpha_val = 1e-4
passes = 40

alpha_delta = (alpha_val - min_alpha_val) / (passes - 1)

# model = doc2vec.Doc2Vec(vector_size=FLAGS.embedding_size, dm=1, 
# 						sample=1e-3, negative=5, 
# 						window=2, min_count=0, 
# 						workers=4, epochs=10)

model = doc2vec.Doc2Vec(vector_size=FLAGS.embedding_size,
						window=FLAGS.window_size, min_count=2, 
						workers=4, epochs=10)

model.build_vocab(docs) # Building vocabulary

for epoch in range(passes):
    random.shuffle(docs)

    model.alpha, model.min_alpha = alpha_val, alpha_val
    model.train(docs, total_examples=len(docs), epochs=model.epochs)

    alpha_val -= alpha_delta

#Save model
model.save(os.path.join(ROOT_DIR, FLAGS.dataset, 'doc2vecFile'))
Example #9
#!usr/bin/env python
# -*- coding:utf-8 -*-

from gensim.models import doc2vec
import numpy as np

sentences = doc2vec.TaggedLineDocument("result.txt")

model = doc2vec.Doc2Vec(sentences,
                        size=20,
                        window=3,
                        min_count=3,
                        workers=4,
                        iter=60)
# model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
print('#########', model.vector_size)

corpus = model.docvecs
np.save("d2v.corpus.arff", corpus)
Example #10
questions = processed_questions
del processed_questions


######################################### Create Doc2vec #########################################

import multiprocessing

from gensim.models import doc2vec

#Hyper Parameters for doc2vec Model
num_workers = multiprocessing.cpu_count()
context_size = 5
num_features = 50
min_word_Count = 1

model = doc2vec.Doc2Vec(questions, size = num_features, window = context_size, min_count = min_word_Count, workers = num_workers)

############################# create ques Vector ###################################

#questions_vec = np.array((len(questions), 50))
questions_vec = []

#storing all vectors of questions
for i in range(len(questions)):
	vec = model.docvecs[i]
	questions_vec.append(vec)


############################# create labels ###################################

question_type = ["who", "what", "when", "affirmation", "unknown"]
Example #11
def Updates():
    try:
        print("updating Doc2Vec")
        print(updating)
        a = stem.snowball.ArabicStemmer()
        stopwords_list = stopwords.words('arabic')
        df = pd.read_csv('textc-Copy1.csv', encoding='utf-8')
        df["contenu"].fillna("محتوى فارغ", inplace=True)
        df["article"].fillna("محتوى فارغ", inplace=True)
        y = df['ToF']
        df = df.drop('ToF', axis=1)
        text = []
        for i in range(df.shape[0]):
            x = nltk.tokenize.wordpunct_tokenize(df.contenu[i])
            text1 = [a.stem(word) for word in x]
            text.append(text1)
        titre = [
            a.stem(word) for word in df.article if word not in stopwords_list
        ]
        #doc2vec
        docs = []
        analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
        for i, te in enumerate(text):
            tags = [i]
            docs.append(analyzedDocument(te, tags))
        model = doc2vec.Doc2Vec(docs,
                                vector_size=300,
                                non_negative=True,
                                window=8,
                                min_count=1,
                                workers=4,
                                dm=1)
        from gensim.test.utils import get_tmpfile
        fname = get_tmpfile("doc2vec.model")
        model.save(fname)
        model = doc2vec.Doc2Vec.load(fname)
        print("updating fastext")

        class MyItera(object):
            def __iter__(self):
                for line in Corpus.article:
                    filtered_sentence = []
                    for w in tokenize(line):
                        if w not in stop_words:
                            filtered_sentence.append(w)
                    yield filtered_sentence

        class MyIter(object):
            def __iter__(self):
                for line in Corpus.contenu:
                    filtered_sentence = []
                    for w in tokenize(line):
                        if w not in stop_words:
                            filtered_sentence.append(w)
                    yield filtered_sentence

        model = FastText(size=150, window=3, min_count=1)
        model.build_vocab(sentences=MyIter())
        total_examples = model.corpus_count
        model.train(sentences=MyIter(),
                    total_examples=total_examples,
                    epochs=5)

    except:
        Updates()
Example #12
def prepare_doc2vec(sentences, embedding_size:int, model_path:str):
    documents = [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]
    model = doc2vec.Doc2Vec(documents, vector_size=embedding_size, min_count=1, workers=4)
    model.save(model_path)
Example #13
def apply_pipeline(train_from_scratch=True, avoid_inference=False, shuffle_corpus=True,
                   include_genres=False, include_categories=True, include_app_ids=True,
                   verbose=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    game_names, game_tags = load_game_names(include_genres, include_categories)

    steam_tokens = load_tokens()

    documents = list(read_corpus(steam_tokens, game_tags, include_app_ids))

    if shuffle_corpus:
        # « Only if the training data has some existing clumping – like all the examples with certain words/topics are
        # stuck together at the top or bottom of the ordering – is native ordering likely to cause training problems.
        # And in that case, a single shuffle, before any training, should be enough to remove the clumping. »
        # Reference: https://stackoverflow.com/a/48080869
        random.shuffle(documents)

    if train_from_scratch:
        print('Creating a new Doc2Vec model from scratch.')
        model = doc2vec.Doc2Vec(documents,
                                vector_size=100,
                                window=5,
                                min_count=5,
                                epochs=20,
                                workers=multiprocessing.cpu_count())

        # NB: Do not follow the piece of advice given in https://rare-technologies.com/doc2vec-tutorial/
        # « I have obtained better results by iterating over the data several times and either:
        #     1. randomizing the order of input sentences, or
        #     2. manually controlling the learning rate over the course of several iterations. »
        # Indeed, in my experience, this leads to buggy results. Moreover, this approach is not recommended according to
        # https://stackoverflow.com/a/48080869

        model.save(get_doc_model_file_name())
    else:
        print('Loading previous Doc2Vec model.')
        model = doc2vec.Doc2Vec.load(get_doc_model_file_name())

    # Test doc2vec

    if verbose:

        try:
            # Spelunky + (Slay the Spire) - (Dream Quest)
            check_analogy(model, pos=['239350', '646570'], neg=['557410'])
        except TypeError:
            pass

        try:
            # Half-Life + (Witcher 2) - (Witcher)
            check_analogy(model, pos=['70', '20920'], neg=['20900'])
        except TypeError:
            pass

        query_app_ids = ['620', '364470', '504230', '583950', '646570', '863550', '794600']

        for query_app_id in query_app_ids:
            print('Query appID: {} ({})'.format(query_app_id, game_names[query_app_id]))
            compute_similarity_using_doc2vec_model(query_app_id, steam_tokens, model,
                                                   avoid_inference=avoid_inference,
                                                   num_items_displayed=10)

        # Check the relevance of the corresponding word2vec
        for query_word in ['anime', 'fun', 'violent']:
            compute_similarity_using_word2vec_model(query_word, steam_tokens, model)

        entity = get_doc_model_entity(model)
        tag_entity = set(tag for tag in entity if 'appID_' not in tag)

        print(tag_entity)

        query_tags = ['In-App Purchases', 'Free to Play', 'Violent', 'Early Access']

        for query_tag in tag_entity.intersection(query_tags):
            for query_app_id in query_app_ids:
                try:
                    sim = model.docvecs.similarity(get_tag_prefix() + query_app_id, query_tag)
                    print('Similarity = {:.0%} for tag {} vs. appID {} ({})'.format(sim, query_tag, query_app_id,
                                                                                    game_names[query_app_id]))
                except KeyError:
                    pass

        num_items_displayed = 3
        for query_tag in tag_entity:
            print('\nTag: {}'.format(query_tag))
            similarity_scores_as_tuples = model.docvecs.most_similar(positive=query_tag, topn=num_items_displayed)
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
            print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed)

    # Top 100

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    num_neighbors = 10
    only_print_banners = True
    use_cosine_similarity = True

    label_database = np.array(model.docvecs.vectors_docs)
    doc_tags = list(model.docvecs.doctags.keys())

    init_indices = np.array(range(len(doc_tags)))
    bool_indices_to_remove = list(map(lambda x: not x.startswith(get_tag_prefix()), doc_tags))
    indices_to_remove = init_indices[bool_indices_to_remove]
    label_database = np.delete(label_database, indices_to_remove, axis=0)

    app_ids = [int(doc_tag[len(get_tag_prefix()):]) for doc_tag in doc_tags
               if doc_tag.startswith(get_tag_prefix())]

    knn = prepare_knn_search(label_database, use_cosine_similarity=use_cosine_similarity)

    query_des = None
    for query_app_id in query_app_ids:
        if avoid_inference:
            inferred_vector = label_database[app_ids.index(query_app_id)]
        else:
            # From query appID to query feature vector
            query = steam_tokens[str(query_app_id)]
            # Caveat: « Subsequent calls to this function may infer different representations for the same document. »
            # Reference: https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.Doc2Vec.infer_vector
            inferred_vector = model.infer_vector(query)
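            # infer_vector is stochastic; one common way to reduce this run-to-run variance
            # (an option, not used here) is to average several inference runs, e.g.
            #   inferred_vector = np.mean([model.infer_vector(query) for _ in range(5)], axis=0)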

        if query_des is None:
            query_des = inferred_vector
        else:
            query_des = np.vstack((query_des, inferred_vector))

    # Matching of feature vectors
    matches = perform_knn_search_with_vectors_as_input(query_des, knn, num_neighbors)

    # From feature matches to appID matches
    matches_as_app_ids = transform_matches_to_app_ids(matches, app_ids)

    print_ranking(query_app_ids,
                  matches_as_app_ids,
                  num_elements_displayed=num_neighbors,
                  only_print_banners=only_print_banners)

    return
Example #14
#%%
from gensim.models import doc2vec
import pandas as pd

#%%
all_data = pd.read_csv('all_data.csv')

#%% # Build the corpus
documents = []
for i, sentence in enumerate(all_data['clean_data']):
    documents.append(doc2vec.TaggedDocument(sentence, [i]))

#%% # Build the model
model = doc2vec.Doc2Vec(documents,
                        min_count=1,
                        window=15,
                        size=100,
                        sample=1e-3,
                        negative=5,
                        workers=4)
model.train(documents, total_examples=model.corpus_count, epochs=10)
model.save('doc2vec.model')

# print(len(model.docvecs))
Example #15
def train_doc2vec():
    doc2vec_model = doc2vec.Doc2Vec(size=100, window=8, min_count=5, workers=8, iter=30, alpha=1e-2, min_alpha=1e-2)
    doc2vec_model.build_vocab(iter_docs_queries())
    doc2vec_model.train(iter_docs_queries(), total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.iter)
    doc2vec_model.save('doc2vec_weigths')
Example #16
def main():

    #Set up logging configurations
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    
    model = doc2vec.Doc2Vec(hashfxn=myhash)

    #Load the trained model
    model = doc2vec.Doc2Vec.load("../../classifier/Doc2VectforNLPTraining")

    word_vectors = model.syn0
      
    num_clusters = int(word_vectors.shape[0] / 5)
    # print("number of clusters: {}".format(num_clusters))
   
    print("Clustering...")
    startTime = time.time()
    cluster_index = cfun.kmeans(num_clusters, word_vectors)
    endTime = time.time()

    print("Time taken for clustering: {} minutes".format((endTime - startTime)/60))
    
    clusterf = open("../../classifier/doc2vec/clusterIndex.pickle","wb") 
    
    #Save clusters
    pickle.dump(cluster_index,clusterf)
    
    # create a word/index dictionary, mapping each vocabulary word to a cluster number
    # zip(): make an iterator that aggregates elements from each of the iterables
    index_word_map = dict(zip(model.index2word, cluster_index))
    
    train = pd.read_csv("../../data/labeledTrainData.tsv",
                    header=0, delimiter="\t", quoting=3)
    test = pd.read_csv("../../data/testData.tsv",
                   header=0, delimiter="\t", quoting=3)
    
    #Create feature vectors for training data
    trainingDataFV = np.zeros((train["review"].size, num_clusters), dtype=np.float)
    
    #Create feature vectors for test data
    testDataFV = np.zeros((test["review"].size, num_clusters), dtype=np.float)
    
    #Populate feature vectors after cleaning the data
    
    print("Processing training data...")
    counter = 0
    cleaned_training_data = preProc.clean_data(train)
    for review in cleaned_training_data:
        trainingDataFV[counter] = cfun.create_bag_of_centroids(review,num_clusters,index_word_map)
        counter += 1

    print("Processing test data...")
    counter = 0
    cleaned_test_data = preProc.clean_data(test)
    for review in cleaned_test_data:
        testDataFV[counter] = cfun.create_bag_of_centroids(review,num_clusters,index_word_map)
        counter += 1

    n_estimators = 100
    result = cfun.rfClassifer(n_estimators, trainingDataFV, train["sentiment"],testDataFV)
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv("Doc2Vec_Clustering.csv", index=False, quoting=3)
Example #17
import os

from cleaner import clean
from gensim.models import doc2vec


def load_docs(filepath, clean_text=True):
    ret = []
    for f_name in os.listdir(filepath):
        if clean_text:
            ret.append(
                clean(
                    open(filepath + "/" + f_name,
                         'rb').read().decode('UTF-8')))
            continue
        ret.append(open(filepath + "/" + f_name, 'rb').read().decode('UTF-8'))
    return ret


def format_loaded_input_docs(docs):
    ret = []
    for id_, doc in enumerate(docs):
        ret.append(doc2vec.TaggedDocument(words=doc.split(), tags=[id_]))
    return ret


if __name__ == '__main__':
    docs = load_docs("samples/")
    formatted_docs = format_loaded_input_docs(docs)
    model = doc2vec.Doc2Vec(formatted_docs, size=5, window=10, workers=2)
    for e in model.docvecs:
        print e
Example #18
    def structural_embedding(self, inputFile, outputFile):
        indexToName = self.generateWalkFile(inputFile, args.walkLength)
        sentences = doc.TaggedLineDocument(inputFile+'.walk')
        self.model = doc.Doc2Vec(sentences, size = dimensions, iter = iterations, window = window )

        saveVectors(list(self.model.docvecs), outputFile, indexToName)
Example #19
# Tokenize and lemmatize
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(
                token) > 3:
            result.append(lemmatize_stemming(token))

    return result


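# 32-bit hash used as gensim's hashfxn; presumably redefined here so the saved model,
# which was trained with this custom hash function, can be loaded and used consistently.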
def myhash(obj):
    return hash(obj) % (2**32)


model = doc2vec.Doc2Vec(hashfxn=myhash)

#Load the model we trained earlier
model = doc2vec.Doc2Vec.load("../classifier/doc2vec/Doc2VecTaggedDocs")

sentence1 = preprocess("Why do rockets look white?")
sentence2 = preprocess("Why are rockets and boosters painted white?")

inferred_embedding_2 = numpy.array(
    model.infer_vector(doc_words=sentence2, steps=30, alpha=0.025))
inferred_embedding_1 = numpy.array(
    model.infer_vector(doc_words=sentence1, steps=30, alpha=0.025))

print(
    numpy.dot(inferred_embedding_1, inferred_embedding_2) /
    (numpy.linalg.norm(inferred_embedding_1, ord=2) *
     numpy.linalg.norm(inferred_embedding_2, ord=2)))
Example #20
File: tool.py Project: jiaxy/EHAdvisor
def preprocess(path,readMePath, test_id, train_id, with_args_num=True, with_pac_depth=True):


    data = pd.read_csv(path)

    columns = ['Method', 'Exception', 'ProjectId', 'ProjectName', 'MethodTop', 'MethodBottom',
               'ClassTop', 'ClassBottom', 'PacTop', 'PacBottom', 'Doc', 'Throw', 'Catch']
    data.reindex(columns=columns)

    data.drop_duplicates(subset=columns[:-2], keep=False, inplace=True)


    data = data[data.Exception.str.contains('java')]

    data.index = range(len(data))

    exs = list(data['Exception'].drop_duplicates())
    exs_to_id = {}
    for e in exs:
        exs_to_id[e] = len(exs_to_id)

    project_id_ = data['ProjectId']
    # data.drop(columns='ProjectId', inplace=True)
    # columns.remove('ProjectId')

    if with_args_num:
        columns.insert(2, 'ArgsNum')
        data = data.reindex(columns=columns, fill_value=0)
        data['ArgsNum'] = [int(s.count('@')) for s in data['Method']]
    if with_pac_depth:
        columns.insert(2, 'PacDepth')
        data = data.reindex(columns=columns, fill_value=0)
        data['PacDepth'] = [int(s.count('.')) for s in data['Method']]

    columns.insert(2, 'ExceptionType')
    data = data.reindex(columns=columns, fill_value=0)
    data['ExceptionType'] = [exs_to_id[e] for e in data['Exception']]

    zxqcommentPath = "comments.csv"
    zxqcomments = pd.read_csv(zxqcommentPath, header=None, error_bad_lines=False, encoding="gbk")
    zxqmethod = zxqcomments[0]
    zxqdict = {zxqcomments[0][0]: zxqcomments[2][0]}
    for i in range(len(zxqmethod)):
        zxqdict[zxqcomments[0][i]] = zxqcomments[2][i]
    doc = []
    zxqindex = 0
    for s in data['Doc']:
        tmp = []
        tmp.extend([p for p in s.split('|')])
        zxqmethod = data['Method'][zxqindex]
        zxqmethod = zxqmethod.split('$')[1].split('@')[0]

        zxqcom = zxqdict.get(zxqmethod)
        zxqcom = str(zxqcom)
        if (zxqcom == 'nan'):
            zxqcom = ""
        tmp.extend([p for p in zxqcom.split()])

        zxqindex = zxqindex + 1
        doc.append(tmp)


    doc_size = 128
    res = doc2vec_abstract(readMePath,size=doc_size)
    abstract = np.zeros(shape=(data.shape[0], doc_size))
    j = 0
    for i in project_id_:
        abstract[j] = res["project"]
        j += 1


    d = get_refer('depend_all.csv', 'depend')
    refers = np.zeros(shape=(data.shape[0], d["project"].shape[0]))
    j = 0
    for i in project_id_:
        refers[j] = d["project"]
        j += 1
    # exceptions = np.array(data.iloc[:, 1], dtype=np.str)

    data.index = range(len(data))



    train_id = data[data['ProjectId'].isin(train_id)].index.tolist()
    test_id = data[data['ProjectId'].isin(test_id)].index.tolist()


    data.drop(columns='ProjectId', inplace=True)
    data.drop(columns='ProjectName', inplace=True)
    columns.remove('ProjectId')
    columns.remove('ProjectName')

    features = np.array(data.iloc[:, 2:-3], dtype=np.float)
    labels = np.array([0 if int(x) == 1 else 1 for x in data['Throw']], dtype=np.int)

    _doc = doc2vec_generate(doc)

    vector_size = 128
    d2v = doc2vec.Doc2Vec(documents=_doc, min_count=1, vector_size=vector_size)
    # from gensim.test.utils import get_tmpfile
    # fname = get_tmpfile("doc2vec_sentences_model")
    # d2v.save(fname)
    # d2v = Doc2Vec.load(fname)
    sentences = np.array([list(d2v.infer_vector(n)) for n in doc], dtype=np.float)
    # sentences = np.zeros(shape=(data.shape[0], vector_size))
    # j = 0
    # for i in project_id_:
    #     sentences[j] = sen[j]
    #     j += 1

    # split into train and test sets

    method = data['Method']

    exception = data['Exception']

    sentences_train, sentences_test = sentences[train_id], sentences[test_id]
    features_train, features_test = features[train_id], features[test_id]
    refers_train, refers_test = refers[train_id], refers[test_id]
    abstract_train, abstract_test = abstract[train_id], abstract[test_id]
    labels_train, labels_test = labels[train_id], labels[test_id]

    # return doc, features, labels, refers, abstract

    return method[test_id], exception[
        test_id], sentences_train, sentences_test, features_train, features_test, refers_train, refers_test, \
           abstract_train, abstract_test, labels_train, labels_test
Example #21
def test_dbow_neg_fromfile(self):
    """Test DBOW doc2vec training."""
    with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
        save_lee_corpus_as_line_sentence(corpus_file)
        model = doc2vec.Doc2Vec(corpus_file=corpus_file, dm=0, hs=0, negative=10, min_count=2, epochs=20)
        self.model_sanity(model)
Example #22
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 500  # Word vector dimensionality
min_word_count = 100  # Minimum word count
num_workers = 4  # Number of threads to run in parallel
context = 15  # Context window size
downsampling = 1e-3  # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import doc2vec
print "Training model..."
model = doc2vec.Doc2Vec(workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling) #, alpha = 0.025, min_alpha = 0.025
model.build_vocab(sentences)
for epoch in range(3):
    model.train(sentences)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

model.docvecs.most_similar("ANSWER")

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Doc2Vec.load()
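# For example (a sketch; the file name is just illustrative):
model_name = "doc2vec_%sfeatures.model" % num_features
model.save(model_name)
# ... and later:
# model = doc2vec.Doc2Vec.load(model_name)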
Example #23
File: model.py Project: HeliWang/doc2vec
	for i,v in enumerate(reviews):
		label = '%s_%s'%(label_type,i)
		labelized.append(LabeledSentence(v, [label]))
	return labelized


allXs = x_train + x_test
x_train = labelizeReviews(x_train, 'Train')
x_test = labelizeReviews(x_test, 'Test')
allXs = labelizeReviews(allXs, 'All')
for i in range(10):
	print x_train[i]
	print x_test[i]

# Instantiate Doc2Vec model and build vocab
model = doc2vec.Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
model.build_vocab(allXs)
# model = doc2vec.Doc2Vec.load('doc2vec_model')
#Pass through the data set multiple times, shuffling the training reviews each time to improve accuracy
for epoch in range(20):
    model.train(utils.shuffle(x_train))

model.save('Model_after_train')
model = doc2vec.Doc2Vec.load('Model_after_train')
#print model.docvecs['All_0']


# get training set vectors from our models
def getVecs(model, corpus, size, vecs_type):
	vecs = np.zeros((len(corpus), size))
	for i in range(0 , len(corpus)):
Example #24
def grid_search_doc2vec(X_texto, X_materia, y_train):
    lr_acc = []
    sizes = [500, 1000, 1500]  # size of the vector
    counts = [1, 5]  #min_count for words to be included in vocab
    dms = [0]  # if dm=1, 'distributed memory' (PV-DM) is used; otherwise, distributed bag of words (PV-DBOW)
    epoch = 10

    docs_token = [doc.split() for doc in X_texto]
    documents = [
        doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(docs_token)
    ]

    for size in sizes:
        for count in counts:
            for dm in dms:

                # define model, vocab and train it
                model = doc2vec.Doc2Vec(vector_size=size,
                                        min_count=count,
                                        epochs=epoch,
                                        workers=cores - 1,
                                        dm=dm)
                model.build_vocab(documents)
                model.train(documents,
                            total_examples=model.corpus_count,
                            epochs=model.epochs)

                # get vector representation of docs
                X_vector = []
                for doc in docs_token:
                    vector = model.infer_vector(doc)
                    X_vector.append(vector)

                X = pd.concat(
                    [X_materia.reset_index(drop=True),
                     pd.DataFrame(X_vector)],
                    axis=1)

                #oversampling
                sm = SMOTE(sampling_strategy=0.4)
                X_train_sm, y_train_sm = sm.fit_resample(X, y_train)

                # classifier: CV score
                lrg = LogisticRegression(solver="lbfgs",
                                         class_weight='balanced')
                scores = cross_val_score(lrg,
                                         X=X_train_sm,
                                         y=y_train_sm,
                                         cv=5,
                                         n_jobs=-1,
                                         verbose=1,
                                         scoring='recall')

                lr_acc.append((scores.mean(), size, count, dm))

                results = pd.DataFrame(lr_acc)

                parameters = {
                    'size': results.sort_values(0).iloc[-1, :][1],
                    'count': results.sort_values(0).iloc[-1, :][2],
                    'dm': results.sort_values(0).iloc[-1, :][3]
                }

    return results, parameters
Example #25
print (len(w2v_train_doc2))

print ('now train word2vec')


import time
a=time.time()
class sentences_generator():
    def __init__(self, doc):
        self.doc = doc
    def __iter__(self):
        for i,line in enumerate(self.doc):
            sentence = TaggededDocument(line.split(), tags=[i])
            yield sentence
sents=sentences_generator(w2v_train_doc2)
model = doc2vec.Doc2Vec(sents,dm=1,vector_size=300,window=13,min_count=1,hs=1,workers=12,epochs=30)
b=time.time()
model.save("../data/pv.txt")
print (b-a)

import operator
def get_doc2vec_score(data,tf):
    DS=[]
    tag_num=0
    hit_num=0
    for row in tqdm(range(data.shape[0])):
        d2v_score={}
        doc_vec=model[train_idx[row]]
        doc_vec/=np.sqrt(sum(doc_vec**2))
        for w in tf[row]:
            try:
Example #26
from gensim.models import doc2vec
from collections import namedtuple

# Load data

doc1 = ["This is a sentence", "This is another sentence"]
# Transform data (you can add more data preprocessing steps)
docs = []
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
for i, text in enumerate(doc1):
    words = text.lower().split()
    print words
    tags = ['test']
    #     tags = ['hello'+str(i)]
    print tags
    docs.append(analyzedDocument(words, tags))
# Train model (set min_count = 1, if you want the model to work with the provided example data set)
model = doc2vec.Doc2Vec(docs, size=100, window=300, min_count=1, workers=4)

# Get the vectors
print model.docvecs['test']
# print model.docvecs['hello0']
print model.docvecs[0]
print model.docvecs[1]  # these two ways of indexing give the same result
# must tags be unique?
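# Tags do not have to be unique: documents that share a tag all train that tag's
# single vector, so here both sentences contribute to the one 'test' vector.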
'''
sentences=doc2vec.TaggedLineDocument(file_path)
model = doc2vec.Doc2Vec(sentences,size = 100, window = 300, min_count = 10, workers=4)
docvec = model.docvecs[99] 
'''
Example #27
def test_persistence(self):
    """Test storing/loading the entire model."""
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
    model.save(testfile())
    self.models_equal(model, doc2vec.Doc2Vec.load(testfile()))
Example #28
cores = multiprocessing.cpu_count()

vector_size = 300
window_size= 15
word_min_count = 2
sampling_threashold = 1e-5
negative_size = 5
train_epoch = 100
dm = 1
worker_count = cores

train_data = read_data('data/ratings_train.txt')

train_docs = [(tokenize(row[1]), row[2]) for row in train_data[1:]]

TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d,c in train_docs]

doc_vectorizer = doc2vec.Doc2Vec(size=300, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)

for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002
    doc_vectorizer.min_alpha = doc_vectorizer.alpha

doc_vectorizer.save('model/doc2vec.model')

pprint(doc_vectorizer.most_similar('공포/Noun'))
pprint(doc_vectorizer.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle'))
Example #29
if len(tag) == 0:
	sys.exit(1)

class MyTagDocument(object):
	def __init__(self, filename):
		self.filename = filename

	def __iter__(self):
		for line in open(self.filename):
			items = line.split('\t', 1)
			yield doc2vec.TaggedDocument(to_unicode(items[1]).split(), [items[0]])


documents=MyTagDocument('./data/articles.txt.' + tag)

model=doc2vec.Doc2Vec(documents, size=100, window=8, min_count=5, workers=8)
model.save('doc2vec')

output = open('output_doc2vec.txt.' + tag, 'w')
count = model.docvecs.count
idx = 0
while idx < count:
	tag = model.docvecs.index_to_doctag(idx)
	idx = idx + 1
	res=[]
	for f in model.docvecs[tag]:
		res.append(str(f))
	output.write(tag + "\tdoc2vec\t" + " ".join(res) + "\n")

output.close()
Example #30
def test_dms_hs(self):
    """Test DM/sum doc2vec training."""
    model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=0, size=24, window=4, hs=1, negative=0,
                            alpha=0.05, min_count=2, iter=20)
    self.model_sanity(model)