def __init__(self, directory=None, train_size=0.3, textweight=0.8, size=300,
             seed=1, workers=1, passes=10, dm=0, min_count=3):
    """Build and jointly initialize the text (Doc2Vec) and network (DeepWalk)
    embedding models for the documents found in *directory*.

    When ``train_size > 0`` a train/test split is made and each training
    document's class label is injected into its tag list, so the label acts
    as a shared document ID whose representation is learned alongside the
    documents (semi-supervised). With ``train_size == 0`` all documents are
    used unsupervised. Both models are then refined together via
    ``self.train`` and the final ``self.model`` is whichever side the
    ``textweight`` favors.
    """
    # Load documents, a document index, and the set of class labels.
    alldocs, docindex, classlabels = net.readNetworkData(directory)
    print('%d documents, %d classes, training ratio=%f' % (len(alldocs), len(classlabels), train_size))
    print('%d classes' % len(classlabels))

    # --- Assemble the Doc2Vec training corpus ---
    if train_size > 0:
        # Label information is available for learning.
        print('Adding Label Information')
        train, test = train_test_split(alldocs, train_size=train_size,
                                       random_state=seed)
        # Doc2vec learns one vector per tag (tags act as document IDs).
        # Appending 'Label_<class>' to each training doc's tags makes every
        # class label behave like an extra shared ID with its own learned
        # latent representation.
        alldata = train[:]
        for doc in alldata:
            doc.tags.append('Label_' + doc.labels)
        alldata.extend(test)
    else:
        # No label information available: pure unsupervised learning.
        alldata = alldocs[:]

    # --- Train both embedding models ---
    d2v = net.trainDoc2Vec(alldata, workers=workers, size=size, dm=dm,
                           passes=passes, min_count=min_count)
    raw_walks, netwalks = net.getdeepwalks(directory, number_walks=20,
                                           walk_length=8)
    w2v = net.trainWord2Vec(raw_walks, buildvoc=1, passes=passes, size=size,
                            workers=workers)

    if train_size > 0:
        # Report the quality of each embedding before joint refinement.
        print('Initialize Doc2Vec Model With Supervised Information...')
        Evaluation.evaluationEmbedModelFromTrainTest(d2v, train, test,
                                                     classifierStr='SVM')
        print('Initialize Deep Walk Model')
        Evaluation.evaluationEmbedModelFromTrainTest(w2v, train, test,
                                                     classifierStr='SVM')

    self.d2v = d2v
    self.w2v = w2v
    # Alternate weight transfer + retraining between the two models.
    self.train(d2v, w2v, directory, alldata, passes=passes, weight=textweight)
    # Expose whichever modality the interpolation weight favors.
    self.model = d2v if textweight > 0.5 else w2v
def train(self, d2v, w2v, directory, alldata, passes=10, weight=0.9):
    """Jointly refine the Doc2Vec and Word2Vec models by alternating updates.

    Each pass blends one model's weights into the other via
    ``self.setWeights`` and then retrains the receiving model on a freshly
    shuffled copy of its own corpus.

    Parameters
    ----------
    d2v : trained Doc2Vec model (text view), updated in place.
    w2v : trained Word2Vec model (network/DeepWalk view), updated in place.
    directory : data directory used to regenerate the random walks.
    alldata : tagged documents used for the Doc2Vec updates.
    passes : number of alternating refinement iterations.
    weight : interpolation weight for the text (d2v) side; the network
        side receives ``1 - weight``.
    """
    raw_walks, walks = net.getdeepwalks(directory, number_walks=20,
                                        walk_length=10)
    for i in range(passes):
        # Fixed message typo: 'Runing' -> 'Running'.
        print('Iterative Running %d' % i)
        # Transfer d2v -> w2v, then retrain Word2Vec on the walks.
        self.setWeights(d2v, w2v, weight=weight)
        shuffle(raw_walks)  # reshuffling the corpus each pass improves results
        print("Update W2V...")
        w2v.train(raw_walks, total_examples=w2v.corpus_count,
                  epochs=w2v.epochs)
        # Transfer w2v -> d2v, then retrain Doc2Vec on the documents.
        self.setWeights(w2v, d2v, weight=(1 - weight))
        print("Update D2V...")
        shuffle(alldata)  # shuffling gets best results
        d2v.train(alldata, total_examples=d2v.corpus_count,
                  epochs=d2v.epochs)
size=numFea, dm=dm, passes=passes, min_count=3) print('Classification Performance on Doc2Vec Model') doc2vec_acc, doc2vec_macro_f1, doc_2vec_micro_f1 = \ Evaluation.evaluationEmbedModelFromTrainTest(doc2vec_model, train, test, classifierStr='SVM') print("##################") #### Baseline 2, Deep Walk Model print("##################") print("Baseline 2, Deep Walk Model") raw_walks, netwalks = net.getdeepwalks(directory, number_walks=20, walk_length=8) deepwalk_model = net.trainWord2Vec(raw_walks, buildvoc=1, sg=1, passes=passes, size=numFea, workers=cores) print('Classification Performance on DeepWalk Model') doc2vec_acc, doc2vec_macro_f1, doc_2vec_micro_f1 = \ Evaluation.evaluationEmbedModelFromTrainTest(deepwalk_model, train, test, classifierStr='SVM') print("##################") ### Baseline 3, D2V+DW print("##################")