def __init__(self, directory=None, train_size=0.3, textweight=0.8, size=300, seed=1, workers=1, passes=10, dm=0, min_count=3):
    """Load the network data, pre-train both embedding models, then train them jointly.

    The constructor reads the documents under ``directory``, builds a
    Doc2Vec model from the document corpus and a Word2Vec model from
    random walks, optionally prints an SVM evaluation of both initial
    models, and finally hands everything to ``self.train``.

    Args:
        directory: Passed to ``net.readNetworkData`` and
            ``net.getdeepwalks``, and forwarded to ``self.train``.
        train_size: Forwarded to ``train_test_split``. When it is not
            greater than 0 no split is made and no label tags are added.
        textweight: Forwarded to ``self.train`` as ``weight``. Values
            above 0.5 make the Doc2Vec model the one stored in
            ``self.model``; otherwise the Word2Vec model is stored.
        size: Forwarded to both model trainers.
        seed: Used as ``random_state`` for the train/test split.
        workers: Forwarded to both model trainers.
        passes: Forwarded to both model trainers and to ``self.train``.
        dm: Forwarded to ``net.trainDoc2Vec``.
        min_count: Forwarded to ``net.trainDoc2Vec``.
    """
    documents, _docindex, class_labels = net.readNetworkData(directory)
    num_classes = len(class_labels)
    print('%d documents, %d classes, training ratio=%f' % (len(documents), num_classes, train_size))
    print('%d classes' % num_classes)

    # Labels are only usable when part of the data is set aside for training.
    use_labels = train_size > 0

    if not use_labels:
        # Purely unsupervised: every document goes in untouched.
        corpus = documents[:]
    else:
        print('Adding Label Information')
        train, test = train_test_split(documents, train_size=train_size, random_state=seed)
        # Doc2Vec treats each tag as a document ID and learns one vector per
        # tag. Appending the class label as an extra tag on the training
        # documents makes every label an ID of its own, so a latent
        # representation is learned for it as well. Test documents keep
        # their original tags.
        corpus = train[:]
        for doc in train:
            doc.tags.append('Label_' + doc.labels)
        corpus.extend(test)

    doc_model = net.trainDoc2Vec(corpus, workers=workers, size=size, dm=dm, passes=passes, min_count=min_count)

    walks, _netwalks = net.getdeepwalks(directory, number_walks=20, walk_length=8)
    walk_model = net.trainWord2Vec(walks, buildvoc=1, passes=passes, size=size, workers=workers)

    if use_labels:
        # Report how each freshly initialised model performs before joint training.
        initial_models = (
            ('Initialize Doc2Vec Model With Supervised Information...', doc_model),
            ('Initialize Deep Walk Model', walk_model),
        )
        for banner, model in initial_models:
            print(banner)
            Evaluation.evaluationEmbedModelFromTrainTest(model, train, test, classifierStr='SVM')

    self.d2v = doc_model
    self.w2v = walk_model

    self.train(doc_model, walk_model, directory, corpus, passes=passes, weight=textweight)

    # Expose whichever model the text weight favours as the primary model.
    self.model = doc_model if textweight > 0.5 else walk_model
print('Classification Performance on Doc2Vec Model') doc2vec_acc, doc2vec_macro_f1, doc_2vec_micro_f1 = \ Evaluation.evaluationEmbedModelFromTrainTest(doc2vec_model, train, test, classifierStr='SVM') print("##################") #### Baseline 2, Deep Walk Model print("##################") print("Baseline 2, Deep Walk Model") raw_walks, netwalks = net.getdeepwalks(directory, number_walks=20, walk_length=8) deepwalk_model = net.trainWord2Vec(raw_walks, buildvoc=1, sg=1, passes=passes, size=numFea, workers=cores) print('Classification Performance on DeepWalk Model') doc2vec_acc, doc2vec_macro_f1, doc_2vec_micro_f1 = \ Evaluation.evaluationEmbedModelFromTrainTest(deepwalk_model, train, test, classifierStr='SVM') print("##################") ### Baseline 3, D2V+DW print("##################") print("Baseline 3, Simple Combination of DeepWalk + Doc2Vec") d2v_train_vecs = [doc2vec_model.docvecs[doc.tags[0]] for doc in train] d2v_test_vecs = [doc2vec_model.docvecs[doc.tags[0]] for doc in test]