def MergeAndWriteTrainTest():
    """Merge external cluster labels with true-labelled text and persist them.

    Reads the cluster-label file named by the module-level ``extClustFile``,
    pairs each predicted cluster label with the true label/text loaded from
    ``dataFileTxtTrue``, writes the combined instances to ``traintestFile``,
    and returns the list of (pred, true, text) tuples.

    Returns:
        The list of (predicted_label, true_label, text) tuples produced by
        ``combinePredTrueText``.
    """
    print(extClustFile)
    clustlabels = readClustLabel(extClustFile)
    # Bug fix: the original referenced the undefined global
    # ``dataFileTrueTxt`` (a transposition of ``dataFileTxtTrue``, which is
    # the name this module actually defines), so any call raised NameError.
    # The second return value (unique terms) is unused here.
    listtuple_pred_true_text, _unique_terms = combinePredTrueText(
        clustlabels, dataFileTxtTrue)
    # NOTE(review): ``traintestFile`` is not defined anywhere in this file —
    # confirm it is supplied before this function is called.
    WriteTrainTestInstances(traintestFile, listtuple_pred_true_text)
    return listtuple_pred_true_text
from groupTxt_ByClass import groupTxtByClass
from word_vec_extractor import populateTermVecs
from nltk.tokenize import word_tokenize
from sent_vecgenerator import generate_sent_vecs_toktextdata
from sklearn.ensemble import IsolationForest
from generate_TrainTestTxtsTfIdf import comPrehensive_GenerateTrainTestTxtsByOutliersTfIDf
from generate_TrainTestVectorsTfIdf import generateTrainTestVectorsTfIDf
from sklearn.linear_model import LogisticRegression
from time import time
from sklearn import metrics
from nltk.corpus import stopwords
from txt_process_util import processTxtRemoveStopWordTokenized
from nltk.tokenize import word_tokenize

# Path to the externally produced cluster-label file (presumably one
# predicted cluster id per line, aligned with the raw-text file below —
# TODO confirm against readClustLabel).
extClustFile = "/home/owner/PhD/dr.norbert/dataset/shorttext/biomedical/2n-biomedical-w2vec-sparse-alpha-20000-0-labels"
# NOTE(review): readClustLabel is not imported in this file — verify it is
# brought into scope elsewhere (e.g. a star-import) before module load.
clustlabels = readClustLabel(extClustFile)

# Raw biomedical short-text dataset with true labels; assumed to be
# line-aligned with clustlabels (no length check is performed here).
dataFileTxtTrue = "/home/owner/PhD/dr.norbert/dataset/shorttext/biomedical/biomedicalraw"
# Pair each predicted cluster label with its true label and text.  The
# second value is named uniqueTerms — presumably the vocabulary; it is not
# used in the visible portion of this file.
listtuple_pred_true_text, uniqueTerms = combinePredTrueText(
    clustlabels, dataFileTxtTrue)

# Repeat the split experiment 5 times (randomness, if any, lives inside
# the TF-IDF outlier split — TODO confirm).
for itr in range(5):
    print("itr=" + str(itr))
    # Sweep the split parameter from 700 to 950 in steps of 50; the exact
    # semantics of `items` are defined by the helper — TODO confirm.
    for items in range(700, 1000, 50):
        # Split the labelled instances into train/test subsets via
        # TF-IDF-based outlier detection.
        trainTup_pred_true_txt, testTup_pred_true_txt = comPrehensive_GenerateTrainTestTxtsByOutliersTfIDf(
            listtuple_pred_true_text, items)
        # Fraction of all instances that ended up in the training split.
        perct_train_inst = len(trainTup_pred_true_txt) / len(
            listtuple_pred_true_text)
        print("perct_train_inst=" + str(perct_train_inst))
        # Discard overly large training splits (> 85% of the data).
        # NOTE(review): `del` only unbinds the name — it does not skip or
        # end the iteration; the loop body may continue past the lines
        # visible here, so confirm intended control flow (continue/break?).
        if perct_train_inst > 0.85:
            del trainTup_pred_true_txt