def MergeAndWriteTrainTest():
    """Merge external cluster labels with the true-label text file, write the
    combined instances to ``traintestFile``, and return them.

    Relies on module-level globals: ``extClustFile``, ``dataFileTrueTxt``,
    ``traintestFile`` and the helpers ``readClustLabel``,
    ``combinePredTrueText`` and ``WriteTrainTestInstances``.

    Returns:
        The list of (predicted_label, true_label, text) tuples produced by
        ``combinePredTrueText``.
    """
    print(extClustFile)
    # Predicted cluster label per line of the external clustering output.
    pred_labels = readClustLabel(extClustFile)
    # NOTE(review): this reads the global `dataFileTrueTxt`, while the script
    # section of this module defines `dataFileTxtTrue` — confirm the intended
    # global is actually defined elsewhere in the file.
    merged_tuples, _unique_terms = combinePredTrueText(pred_labels, dataFileTrueTxt)
    WriteTrainTestInstances(traintestFile, merged_tuples)
    return merged_tuples
# Imports grouped stdlib / third-party / local; the duplicated
# `from nltk.tokenize import word_tokenize` has been removed (it appeared twice).
from time import time

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression

from generate_TrainTestTxtsTfIdf import comPrehensive_GenerateTrainTestTxtsByOutliersTfIDf
from generate_TrainTestVectorsTfIdf import generateTrainTestVectorsTfIDf
from groupTxt_ByClass import groupTxtByClass
from sent_vecgenerator import generate_sent_vecs_toktextdata
from txt_process_util import processTxtRemoveStopWordTokenized
from word_vec_extractor import populateTermVecs

# NOTE(review): `readClustLabel` and `combinePredTrueText` are called below
# but are not imported in this chunk — presumably defined elsewhere in this
# file; confirm before running standalone.

# External clustering output: one predicted cluster label per line.
extClustFile = "/home/owner/PhD/dr.norbert/dataset/shorttext/biomedical/2n-biomedical-w2vec-sparse-alpha-20000-0-labels"
clustlabels = readClustLabel(extClustFile)

# Raw biomedical short texts with their true labels.
dataFileTxtTrue = "/home/owner/PhD/dr.norbert/dataset/shorttext/biomedical/biomedicalraw"

# Pair each text with its predicted cluster label and true label.
listtuple_pred_true_text, uniqueTerms = combinePredTrueText(clustlabels, dataFileTxtTrue)

for itr in range(5):
    print("itr=" + str(itr))
    # Sweep the outlier threshold parameter from 700 to 950 in steps of 50.
    for items in range(700, 1000, 50):
        trainTup_pred_true_txt, testTup_pred_true_txt = (
            comPrehensive_GenerateTrainTestTxtsByOutliersTfIDf(listtuple_pred_true_text, items)
        )
        # Fraction of all instances that landed in the train split.
        perct_train_inst = len(trainTup_pred_true_txt) / len(listtuple_pred_true_text)
        print("perct_train_inst=" + str(perct_train_inst))
        # Discard splits that put too many instances into training.
        # NOTE(review): the original chunk ends here — any statements after
        # this `del` may have been cut off by the chunk boundary.
        if perct_train_inst > 0.85:
            del trainTup_pred_true_txt