Exemplo n.º 1
0
    def __init__(self, directory=None, train_size=0.3, textweight=0.8, size=300, seed=1, workers=1, passes=10, dm=0, min_count=3):
        """Train a Doc2Vec model and a DeepWalk (Word2Vec) model, then co-train them.

        Args:
            directory: Dataset location; forwarded to ``net.readNetworkData``
                and ``net.getdeepwalks``.
            train_size: Passed to ``train_test_split``. When greater than 0,
                every document in the training split gets a
                ``'Label_<labels>'`` entry appended to its ``tags`` (the
                document objects are modified in place). When 0, no label
                tags are added and all documents are used as-is.
            textweight: Forwarded to ``self.train`` as ``weight``; also picks
                ``self.model`` (``d2v`` when above 0.5, otherwise ``w2v``).
            size: Forwarded as ``size`` to both trainers (presumably the
                embedding dimensionality -- confirm in ``net``).
            seed: ``random_state`` for the train/test split.
            workers: Forwarded as ``workers`` to both trainers.
            passes: Forwarded as ``passes`` to both trainers and ``self.train``.
            dm: Forwarded to ``net.trainDoc2Vec``.
            min_count: Forwarded to ``net.trainDoc2Vec``.
        """
        # Load the corpus and report its size.
        alldocs, docindex, classlabels = net.readNetworkData(directory)
        n_classes = len(classlabels)
        print('%d documents, %d classes, training ratio=%f' % (len(alldocs), n_classes, train_size))
        print('%d classes' % n_classes)

        # Label information is only usable when a training split is requested.
        use_labels = train_size > 0

        if use_labels:
            print('Adding Label Information')
            train, test = train_test_split(alldocs, train_size=train_size, random_state=seed)

            # Inject supervision: doc2vec treats each tag as a document ID and
            # learns one vector per tag, so appending the class label as an
            # extra tag makes every class label an ID with its own learned
            # representation. Only training documents receive the label tag.
            for doc in train:
                doc.tags.append('Label_' + doc.labels)
            corpus = [*train, *test]
        else:
            # Purely unsupervised: use every document without label tags.
            corpus = alldocs[:]

        # Initial text model.
        d2v = net.trainDoc2Vec(corpus, workers=workers, size=size, dm=dm, passes=passes, min_count=min_count)

        # Initial structure model, trained on random walks over the network.
        raw_walks, _ = net.getdeepwalks(directory, number_walks=20, walk_length=8)
        w2v = net.trainWord2Vec(raw_walks, buildvoc=1, passes=passes, size=size, workers=workers)

        if use_labels:
            # Report how each model performs before the coupled training.
            initial_models = (
                ('Initialize Doc2Vec Model With Supervised Information...', d2v),
                ('Initialize Deep Walk Model', w2v),
            )
            for banner, model in initial_models:
                print(banner)
                Evaluation.evaluationEmbedModelFromTrainTest(model, train, test, classifierStr='SVM')

        self.d2v, self.w2v = d2v, w2v

        self.train(d2v, w2v, directory, corpus, passes=passes, weight=textweight)

        # Expose whichever model the text weight favours.
        self.model = d2v if textweight > 0.5 else w2v
Exemplo n.º 2
0
    def train(self, d2v, w2v, directory, alldata, passes=10, weight=0.9):
        """Alternately update the Word2Vec and Doc2Vec models for ``passes`` rounds.

        Each round calls ``self.setWeights(d2v, w2v, weight=weight)``, trains
        ``w2v`` on the shuffled random walks, calls
        ``self.setWeights(w2v, d2v, weight=1 - weight)``, and then trains
        ``d2v`` on the shuffled documents.

        Args:
            d2v: Doc2Vec model; trained via its ``train`` method.
            w2v: Word2Vec model; trained via its ``train`` method.
            directory: Forwarded to ``net.getdeepwalks`` to generate walks.
            alldata: Document collection used to train ``d2v``; it is passed
                to ``shuffle`` every round, so the caller's list is reordered.
            passes: Number of alternating rounds.
            weight: Weight for the d2v -> w2v ``setWeights`` call; the
                w2v -> d2v call uses ``1 - weight``.
        """
        raw_walks, _walks = net.getdeepwalks(directory, number_walks=20, walk_length=10)
        reverse_weight = 1 - weight

        for round_no in range(passes):
            print('Iterative Runing %d' % round_no)

            # Structure step: sync from d2v into w2v, then train on the walks.
            self.setWeights(d2v, w2v, weight=weight)
            shuffle(raw_walks)
            print("Update W2V...")
            w2v.train(raw_walks, total_examples=w2v.corpus_count, epochs=w2v.epochs)

            # Text step: sync from w2v into d2v, then train on the documents.
            self.setWeights(w2v, d2v, weight=reverse_weight)
            print("Update D2V...")
            shuffle(alldata)  # shuffling gets best results
            d2v.train(alldata, total_examples=d2v.corpus_count, epochs=d2v.epochs)
Exemplo n.º 3
0
                                 size=numFea,
                                 dm=dm,
                                 passes=passes,
                                 min_count=3)

# Evaluate doc2vec_model on the train/test split with classifierStr='SVM';
# the call returns three values, unpacked as accuracy, macro-F1 and micro-F1.
print('Classification Performance on Doc2Vec Model')
doc2vec_acc, doc2vec_macro_f1, doc_2vec_micro_f1 = \
     Evaluation.evaluationEmbedModelFromTrainTest(doc2vec_model, train, test, classifierStr='SVM')

print("##################")

#### Baseline 2, Deep Walk Model
print("##################")
print("Baseline 2, Deep Walk Model")
# Generate random walks (20 walks, length 8) from the data in `directory`.
# Only raw_walks is used in this section; netwalks is not referenced here.
raw_walks, netwalks = net.getdeepwalks(directory,
                                       number_walks=20,
                                       walk_length=8)
# Train a Word2Vec model on the walks (sg=1 -- presumably skip-gram, as in
# gensim; confirm in net.trainWord2Vec).
deepwalk_model = net.trainWord2Vec(raw_walks,
                                   buildvoc=1,
                                   sg=1,
                                   passes=passes,
                                   size=numFea,
                                   workers=cores)
print('Classification Performance on DeepWalk Model')
# NOTE(review): the DeepWalk scores are stored in the same doc2vec_* variables
# used for Baseline 1, overwriting the Doc2Vec results -- confirm nothing later
# in the script expects the Doc2Vec values to still be available.
doc2vec_acc, doc2vec_macro_f1, doc_2vec_micro_f1 = \
    Evaluation.evaluationEmbedModelFromTrainTest(deepwalk_model, train, test, classifierStr='SVM')

print("##################")

### Baseline 3, D2V+DW
print("##################")