def genAnalysis(modelfile, testfile, outputfile):
    maxParagraphLength = 20
    maxParagraphs = 5
    filterSizes = [1]
    num_filters = 64
    wordEmbeddingDimension = 30
    lrate = float(1e-3)
    labels = 30938
    vocabularySize = 101939

    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate)

    testing = DataParser(maxParagraphs, maxParagraphLength, labels, vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)
    print("loading done")
    print("no of test examples: " + str(testing.totalPages))

    batchSize = 1
    testing.restore()

    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    # Build a labelId -> labelName dictionary (one "id<TAB>name" entry per line);
    # strip() avoids a trailing empty line breaking the split.
    labelIDName = open("../labelId-labelName-full.txt").read().strip().split("\n")
    labelIDName = [[int(x.split("\t")[0]), x.split("\t")[1].rstrip()] for x in labelIDName]
    labelName = dict(labelIDName)
    # print(labelName[9026])

    f = open(outputfile, "w")
    for i, v in enumerate(pred):
        temp = [(labId, labProb) for labId, labProb in enumerate(v)]
        temp = sorted(temp, key=lambda x: x[1], reverse=True)  # sort by label probability to get top k
        predLabel = [0] * len(temp)
        output = ""
        for itr in range(11):
            predLabel[temp[itr][0]] = 1
            if truePre[i][0][temp[itr][0]] == 1:
                output = output + "," + labelName[temp[itr][0]]
        f.write(str(i) + "," + output + "\n")
    f.close()
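# A minimal invocation sketch for genAnalysis, assuming hypothetical checkpoint,
# test-split, and output paths (none of these file names come from the source).
# It writes one line per test page listing the correctly predicted label names
# among the top 11 predictions.
if __name__ == "__main__":
    genAnalysis("models/cnn_epoch50/model.ckpt",
                "../../dataset/wiki10_miml_test.txt",
                "results/genAnalysis_top11_hits.csv")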
def analyse(modelfile, testfile, outputfile):
    maxParagraphLength = 20
    maxParagraphs = 10
    filterSizes = [2, 3, 4]
    num_filters = 64
    wordEmbeddingDimension = 100
    lrate = float(0.001)
    poolLength = 2
    labels = 30938
    vocabularySize = 101939

    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate, poolLength)

    testing = DataParser(maxParagraphs, maxParagraphLength, labels, vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)
    print("loading done")
    print("no of test examples: " + str(testing.totalPages))

    batchSize = 1
    testing.restore()

    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    # Label ids sorted by document frequency; a label's position in this list
    # decides which frequency partition (decile) it falls into.
    labelids = open("../../dataset/sorted_labelid_sans5toplabels.txt", "r").read().strip().split("\n")
    labelids = [int(x) for x in labelids]

    no_of_partition = 10
    partition_size = labels // no_of_partition  # integer division, used as a list index

    rank1 = [0] * no_of_partition
    rank3 = [0] * no_of_partition
    rank5 = [0] * no_of_partition

    for i, v in enumerate(pred):
        temp = [(labId, labProb) for labId, labProb in enumerate(v)]
        temp = sorted(temp, key=lambda x: x[1], reverse=True)  # sort by label probability to get top k

        rank1[labelids.index(temp[0][0]) // partition_size] += 1

        rank3[labelids.index(temp[0][0]) // partition_size] += 1
        rank5[labelids.index(temp[0][0]) // partition_size] += 1

        rank3[labelids.index(temp[1][0]) // partition_size] += 1
        rank5[labelids.index(temp[1][0]) // partition_size] += 1

        rank3[labelids.index(temp[2][0]) // partition_size] += 1
        rank5[labelids.index(temp[2][0]) // partition_size] += 1

        rank5[labelids.index(temp[3][0]) // partition_size] += 1
        rank5[labelids.index(temp[4][0]) // partition_size] += 1

    rank1 = [(float(x) / testing.totalPages) * 100 for x in rank1]
    rank3 = [(float(x) / (3 * testing.totalPages)) * 100 for x in rank3]
    rank5 = [(float(x) / (5 * testing.totalPages)) * 100 for x in rank5]

    print(rank1)
    print(rank3)
    print(rank5)

    filePtr = open(outputfile, "w")
    for i in rank1:
        filePtr.write(str(i) + ",")
    filePtr.write("\n")
    for i in rank3:
        filePtr.write(str(i) + ",")
    filePtr.write("\n")
    for i in rank5:
        filePtr.write(str(i) + ",")
    filePtr.close()
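# Standalone toy illustration (invented numbers) of the bucketing idiom used in
# analyse() above: a label's rank in the frequency-sorted id list, integer-divided
# by the partition size, gives the index of the bucket whose counter is incremented.
labelids = [7, 3, 9, 1, 5, 0, 8, 2, 6, 4]    # label ids sorted by frequency (most frequent first)
no_of_partition = 5
partition_size = len(labelids) // no_of_partition   # 2 labels per bucket

rank1 = [0] * no_of_partition
predicted_label = 5                           # top-ranked label for some document
rank1[labelids.index(predicted_label) // partition_size] += 1
print(rank1)                                  # [0, 0, 1, 0, 0] -- label 5 sits in the third bucket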
def analyse(modelfile, testfile, outputfile):
    maxParagraphLength = 20
    maxParagraphs = 10
    filterSizes = [2, 3, 4]
    num_filters = 64
    wordEmbeddingDimension = 100
    lrate = float(0.001)
    poolLength = 2
    labels = 30938
    vocabularySize = 101939

    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate, poolLength)

    testing = DataParser(maxParagraphs, maxParagraphLength, labels, vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)
    print("loading done")
    print("no of test examples: " + str(testing.totalPages))

    batchSize = 1
    testing.restore()

    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    # Each line of the file is "labelid<TAB>documentcount"; keep only the counts.
    labelids = open("../../dataset/labelid_labelcount_sans5toplabels.txt", "r").read().strip().split("\n")
    labelcounts = [int((x.split("\t"))[1]) for x in labelids]
    print(labelcounts)

    totalNoofDocuments = 19406  # 6137 (test instances ) +

    rangelist = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    no_of_partition = len(rangelist)

    rank1 = [0] * no_of_partition
    rank3 = [0] * no_of_partition
    rank5 = [0] * no_of_partition

    for i, v in enumerate(pred):
        temp = [(labId, labProb) for labId, labProb in enumerate(v)]
        temp = sorted(temp, key=lambda x: x[1], reverse=True)  # sort by label probability to get top k

        for ind, count in enumerate(rangelist):
            if labelcounts[temp[0][0]] <= (count * totalNoofDocuments) / 100:
                rank1[ind] += 1
                rank3[ind] += 1
                rank5[ind] += 1
            if labelcounts[temp[1][0]] <= (count * totalNoofDocuments) / 100:
                rank3[ind] += 1
                rank5[ind] += 1
            if labelcounts[temp[2][0]] <= (count * totalNoofDocuments) / 100:
                rank3[ind] += 1
                rank5[ind] += 1
            if labelcounts[temp[3][0]] <= (count * totalNoofDocuments) / 100:
                rank5[ind] += 1
            if labelcounts[temp[4][0]] <= (count * totalNoofDocuments) / 100:
                rank5[ind] += 1

    rank1 = [(float(x) / testing.totalPages) * 100 for x in rank1]
    rank3 = [(float(x) / (3 * testing.totalPages)) * 100 for x in rank3]
    rank5 = [(float(x) / (5 * testing.totalPages)) * 100 for x in rank5]

    print(rank1)
    print(rank3)
    print(rank5)

    filePtr = open(outputfile, "w")
    for i in rank1:
        filePtr.write(str(i) + ",")
    filePtr.write("\n")
    for i in rank3:
        filePtr.write(str(i) + ",")
    filePtr.write("\n")
    for i in rank5:
        filePtr.write(str(i) + ",")
    filePtr.close()
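# Toy illustration (invented numbers) of the cumulative frequency buckets above:
# a top-ranked label is credited to every bucket whose cutoff (a percentage of
# totalNoofDocuments) its document count does not exceed.
labelcounts = [5, 120, 2000]          # documents per label (hypothetical)
totalNoofDocuments = 1000
rangelist = [1, 10, 50, 100]          # cutoffs in percent of the corpus
rank1 = [0] * len(rangelist)

predicted_label = 1                   # top prediction occurs in 120 documents
for ind, count in enumerate(rangelist):
    if labelcounts[predicted_label] <= (count * totalNoofDocuments) / 100:
        rank1[ind] += 1
print(rank1)                          # [0, 0, 1, 1] -- 120 docs clears the 50% and 100% cutoffs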
paragraphLength = int(sys.argv[1])
maxParagraphs = int(sys.argv[2])
filterSizes = [int(i) for i in sys.argv[3].split("-")]
print(filterSizes)
num_filters = int(sys.argv[4])
wordEmbeddingDimension = int(sys.argv[5])
batchSize = int(sys.argv[6])
epochEnd = int(sys.argv[7])
folder_name = sys.argv[8]

nlabels = 8
vocabularySize = 244

training = DataParser(maxParagraphs, paragraphLength, nlabels, vocabularySize)
training.getDataFromfile("../../Reuter_dataset/reuters_sparse_training.txt")

model = Model(maxParagraphs, paragraphLength, nlabels, vocabularySize,
              filterSizes, num_filters, wordEmbeddingDimension)

costfile = open("results/costfile.txt", "a")
output = folder_name

epoch = 0
# epochEnd=400
costepochs = []
for e in range(epoch, epochEnd):
    cost = 0
    for itr in range(int(training.totalPages / batchSize)):
        cost += model.train(training.nextBatch(batchSize))
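# Hypothetical launch of this training script (the script name "train_reuters.py"
# and the argument values are assumptions, not from the source). Positional args:
#   1 paragraphLength, 2 maxParagraphs, 3 filterSizes ("-"-separated), 4 num_filters,
#   5 wordEmbeddingDimension, 6 batchSize, 7 epochEnd, 8 folder_name
#
#   python train_reuters.py 20 10 2-3-4 64 100 32 400 run1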
def ComputeFscore(modelfile, testfile, outputfile):
    maxParagraphLength = int(sys.argv[1])
    maxParagraphs = int(sys.argv[2])
    filterSizes = [int(i) for i in sys.argv[3].split("-")]
    num_filters = int(sys.argv[4])
    wordEmbeddingDimension = int(sys.argv[5])
    # batchSize = int(sys.argv[6])
    # epochs = int(sys.argv[7])
    # folder_name = sys.argv[8]

    labels = 8
    vocabularySize = 244

    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension)

    testing = DataParser(maxParagraphs, maxParagraphLength, labels, vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)
    print("loading done")

    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    thres = 0.5
    valid = int(len(truePre) * 0.5)  # first 50% of the data for threshold tuning - test and cv files are merged

    labelsCount = {}
    ConfusionMa = {}
    fScr = {}
    thresLab = {}
    for la in range(labels):
        if la % 25 == 0:
            print("Current label", la)

        # tune the per-label threshold on the first half
        t = []
        p = []
        for i in range(valid):
            t.append(truePre[i][0][la])
            p.append(pred[i][la])
        bestF, bestThre = thresholdTuning(t, p)

        # evaluate with the tuned threshold on the second half
        t = []
        p = []
        for i in range(valid, len(truePre)):
            t.append(truePre[i][0][la])
            p.append(pred[i][la])
        p = np.array(p)

        fScr[la] = f1_score(t, p >= bestThre)
        ConfusionMa[la] = confusion_matrix(t, p > bestThre)
        thresLab[la] = bestThre

    f = open(outputfile, "a")
    output = sys.argv[9]
    sum_fscore = 0.0
    for i in range(labels):
        sum_fscore = sum_fscore + fScr[i]
        output = output + " , " + str(fScr[i])

    output += " , " + str(sum_fscore / float(labels - 1))
    print("Fscore at " + sys.argv[7] + " epochs: " + str(sum_fscore / float(labels - 1)))
    f.write(output + "\n")
    f.close()
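# thresholdTuning is called above but not defined in this snippet; a minimal
# sketch of its assumed behaviour: sweep candidate decision thresholds over the
# predicted probabilities and return (best F1, threshold that achieved it).
import numpy as np
from sklearn.metrics import f1_score

def thresholdTuning(t, p):
    t = np.array(t)
    p = np.array(p)
    bestF, bestThre = 0.0, 0.5
    for thre in np.arange(0.05, 1.0, 0.05):        # candidate thresholds
        f = f1_score(t, (p >= thre).astype(int))
        if f > bestF:
            bestF, bestThre = f, thre
    return bestF, bestThre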
def ComputePrecisionK(modelfile, testfile, outputfile):
    maxParagraphLength = int(sys.argv[1])
    maxParagraphs = int(sys.argv[2])
    filterSizes = [int(i) for i in sys.argv[3].split("-")]
    num_filters = int(sys.argv[4])
    wordEmbeddingDimension = int(sys.argv[5])
    lrate = float(sys.argv[10])
    poolLength = int(sys.argv[11])
    labels = 30938
    vocabularySize = 101939

    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate, poolLength)

    testing = DataParser(maxParagraphs, maxParagraphLength, labels, vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)
    print("loading done")
    print("no of test examples: " + str(testing.totalPages))
    print("Computing Prec@k")

    # check if batchsize needs to be taken by parameter
    batchSize = 1
    testing.restore()

    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    k = 5
    for i in [1, 3, 5]:
        val = ndcg_score(truePre, pred, i)
        print(val)


def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank K.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array, shape = [n_samples, n_classes]
        Predicted scores.
    k : int
        Rank.

    Returns
    -------
    score : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gain = 2**y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represented as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.

    Returns
    -------
    score : float

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    lb = LabelBinarizer()
    lb.fit(range(len(predictions) + 1))
    T = lb.transform(ground_truth)

    scores = []
    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)
def ComputePrecisionK(modelfile, testfile, outputfile):
    maxParagraphLength = int(sys.argv[1])
    maxParagraphs = int(sys.argv[2])
    filterSizes = [int(i) for i in sys.argv[3].split("-")]
    num_filters = int(sys.argv[4])
    wordEmbeddingDimension = int(sys.argv[5])
    lrate = float(sys.argv[10])
    labels = 30938
    vocabularySize = 101939

    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate)

    testing = DataParser(maxParagraphs, maxParagraphLength, labels, vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)
    print("loading done")
    print("no of test examples: " + str(testing.totalPages))
    print("Computing Prec@k")

    # check if batchsize needs to be taken by parameter
    batchSize = 1
    testing.restore()

    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    K_list = [1, 3, 5, 10, 15]  # prec@1 ... prec@NoofLabels
    precAtK = [0.0] * 16

    # # As need to get Prec only on last 50% of test data as first 50% is for cross validation
    # valid = int(len(truePre) * 0.5)
    # pred = pred[valid:]
    # truePre = truePre[valid:]

    for i, v in enumerate(pred):
        temp = [(labId, labProb) for labId, labProb in enumerate(v)]
        temp = sorted(temp, key=lambda x: x[1], reverse=True)  # sort by label probability to get top k
        for ele in K_list:
            # count true positives among the top `ele` predictions for this instance
            for itr in range(ele):
                if truePre[i][0][temp[itr][0]] == 1:
                    precAtK[ele] += 1

    f = open(outputfile, "a")
    output = sys.argv[9]
    for k in K_list:
        precAtK[k] /= (k * len(pred))
        print("Prec@" + str(k) + " = " + str(precAtK[k]))
        output = output + "," + "Prec@" + str(k) + "=," + str(precAtK[k])
    f.write(output + "\n")
    f.close()
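# Self-contained toy check of the Prec@k accumulation above (numbers invented
# for illustration): for each document the true positives among the top k
# predictions are counted, and the total is divided by k times the number of documents.
import numpy as np

pred = np.array([[0.9, 0.1, 0.8, 0.2],     # predicted probabilities, two documents, four labels
                 [0.2, 0.7, 0.1, 0.6]])
true = np.array([[1, 0, 0, 1],             # binary ground-truth label vectors
                 [0, 1, 0, 0]])

k = 2
hits = 0
for scores, gold in zip(pred, true):
    topk = np.argsort(scores)[::-1][:k]    # indices of the k highest-scoring labels
    hits += gold[topk].sum()               # true positives among the top k

prec_at_k = hits / (k * len(pred))
print(prec_at_k)                           # 0.5 -> (1 + 1) / (2 * 2)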
def ComputePrecisionK(modelfile, testfile, param, ep, outputfile):
    param = param.split("_")
    maxParagraphLength = int(param[1])
    maxParagraphs = int(param[2])
    filterSizes = [int(x) for x in param[3].split("-")]
    num_filters = int(param[4])
    wordEmbeddingDimension = int(param[5])
    lrate = float(param[8])
    poolLength = int(param[9])
    labels = 30938
    vocabularySize = 101939

    # print(filterSizes)
    # print(maxParagraphLength)
    # print(maxParagraphs)
    # print(lrate)

    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate, poolLength)

    testing = DataParser(maxParagraphs, maxParagraphLength, labels, vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)
    print("loading done")
    print("no of test examples: " + str(testing.totalPages))
    print("Computing Prec@k")

    # check if batchsize needs to be taken by parameter
    batchSize = 1
    testing.restore()

    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    K_list = [1, 3, 5]  # prec@1 ... prec@NoofLabels
    precAtK = [0.0] * 6

    # # As need to get Prec only on last 50% of test data as first 50% is for cross validation
    # valid = int(len(truePre) * 0.5)
    # pred = pred[valid:]
    # truePre = truePre[valid:]

    for i, v in enumerate(pred):
        temp = [(labId, labProb) for labId, labProb in enumerate(v)]
        temp = sorted(temp, key=lambda x: x[1], reverse=True)  # sort by label probability to get top k
        for ele in K_list:
            # count true positives among the top `ele` predictions for this instance
            for itr in range(ele):
                if truePre[i][0][temp[itr][0]] == 1:
                    precAtK[ele] += 1

    f = open(outputfile, "a")
    # folder prefix assumed from the parameter string (the original referenced an undefined name `m`)
    output = "_".join(param) + "/cnn_dynamicmaxpool_" + str(ep)
    for k in K_list:
        precAtK[k] /= (k * len(pred))
        print("Prec@" + str(k) + " = " + str(precAtK[k]))
        # output = output + "," + "Prec@" + str(k) + "=," + str(precAtK[k])
    f.write(output + "\n")
    f.close()
def genAnalysis(modelfile, testfile, confusionFile):
    maxParagraphLength = 20
    maxParagraphs = 5
    filterSizes = [1]
    num_filters = 64
    wordEmbeddingDimension = 30
    lrate = float(1e-3)
    labels = 30938
    vocabularySize = 101939

    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate)

    testing = DataParser(maxParagraphs, maxParagraphLength, labels, vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)
    print("loading done")

    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    valid = int(len(truePre) * 0.5)  # first 50% of the data for threshold tuning - test and cv files are merged

    thresLab = {}
    for la in range(labels):
        t = []
        p = []
        for i in range(valid):
            t.append(truePre[i][0][la])
            p.append(pred[i][la])
        bestF, bestThre = thresholdTuning(t, p)
        thresLab[la] = bestThre
    print(thresLab)

    # Build a labelId -> labelName dictionary (one "id<TAB>name" entry per line).
    labelIDName = open("../labelId-labelName-full.txt").read().strip().split("\n")
    labelIDName = [[int(x.split("\t")[0]), x.split("\t")[1].rstrip()] for x in labelIDName]
    labelname = dict(labelIDName)

    f = open(confusionFile, "w")
    for itr in range(valid, testing.totalPages):  # analysis on the remaining 50%
        predLabel = [pred[itr][i] > thresLab[i] for i in range(labels)]
        output = ""
        for i in range(labels):
            if predLabel[i] == 1:
                output = output + "," + labelname[i]
        tn, fp, fn, tp = confusion_matrix(truePre[itr][0], predLabel).ravel()
        f.write(str(itr) + "," + str(tn) + "," + str(fp) + "," + str(fn) + "," + str(tp) + "," + output + "\n")
    f.close()
def ComputePrecisionK(modelfile, testfile):
    maxParagraphLength = 20
    maxParagraphs = 10
    filterSizes = [2, 3]
    num_filters = 16
    wordEmbeddingDimension = 50
    lrate = float(0.001)
    poolLength = 5
    labels = 30938
    vocabularySize = 101939

    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate, poolLength)

    testing = DataParser(maxParagraphs, maxParagraphLength, labels, vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)
    print("loading done")
    print("no of test examples: " + str(testing.totalPages))

    batchSize = 1
    testing.restore()

    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    # Label ids sorted by document frequency; a label's position in this list
    # decides which frequency partition (decile) it falls into.
    labelids = open("../../dataset/sorted_labelid.txt", "r").read().strip().split("\n")
    labelids = [int(x) for x in labelids]

    no_of_partition = 10
    partition_size = labels // no_of_partition  # integer division, used as a list index

    prec1 = [0] * no_of_partition
    prec3 = [0] * no_of_partition
    prec5 = [0] * no_of_partition

    for i, v in enumerate(pred):
        temp = [(labId, labProb) for labId, labProb in enumerate(v)]
        temp = sorted(temp, key=lambda x: x[1], reverse=True)  # sort by label probability to get top k

        # find how many of these top predictions were true
        if truePre[i][0][temp[0][0]] == 1:
            prec1[labelids.index(temp[0][0]) // partition_size] += 1
            prec3[labelids.index(temp[0][0]) // partition_size] += 1
            prec5[labelids.index(temp[0][0]) // partition_size] += 1
        if truePre[i][0][temp[1][0]] == 1:
            prec3[labelids.index(temp[1][0]) // partition_size] += 1
            prec5[labelids.index(temp[1][0]) // partition_size] += 1
        if truePre[i][0][temp[2][0]] == 1:
            prec3[labelids.index(temp[2][0]) // partition_size] += 1
            prec5[labelids.index(temp[2][0]) // partition_size] += 1
        if truePre[i][0][temp[3][0]] == 1:
            prec5[labelids.index(temp[3][0]) // partition_size] += 1
        if truePre[i][0][temp[4][0]] == 1:
            prec5[labelids.index(temp[4][0]) // partition_size] += 1

    print(prec1)
    print(prec3)
    print(prec5)

    prec1 = [(float(x) / testing.totalPages) * 100 for x in prec1]
    prec3 = [(float(x) / (3 * testing.totalPages)) * 100 for x in prec3]
    prec5 = [(float(x) / (5 * testing.totalPages)) * 100 for x in prec5]

    print(prec1)
    print(prec3)
    print(prec5)
folder_name = sys.argv[8]
lrate = float(sys.argv[9])
poolLength = int(sys.argv[10])

nlabels = 30938
# nlabels = 968 for parabel cluster 16
vocabularySize = 101939

training = DataParser(maxParagraphs, paragraphLength, nlabels, vocabularySize)
# training.getDataFromfile("../../dataset/minusTop5Labels/wiki10_minusTop5labels_train.txt")
# training.getDataFromfile("/home/khushboo/wiki10/dataset/original_split/wiki10_train_cluster_16.txt")
training.getDataFromfile("/home/khushboo/wiki10/dataset/original_split/wiki10_miml_train.txt")

model = Model(maxParagraphs, paragraphLength, nlabels, vocabularySize, filterSizes,
              num_filters, wordEmbeddingDimension, lrate, poolLength)

costfile = open("results/costfile.txt", "a")
output = folder_name

epoch = 0
# epochEnd=400
costepochs = []
for e in range(epoch, epochEnd):
    cost = 0
    for itr in range(int(training.totalPages / batchSize)):
        cost += model.train(training.nextBatch(batchSize))