def getspamtokens(challenge=None, duplicates=False):
    """Load the spam-token data for the complex (5-word) Bayes filter.

    Args:
        challenge: challenge name whose token file should be read; ``None``
            loads the mixed (cross-challenge) token file.
        duplicates: if True, read the token file that was trained on data
            with duplicates removed ("duplicate..." variant).

    Returns:
        dict merging every CSV row into a single token -> value mapping.
    """
    # Compute the path from the two flags instead of four copy-pasted branches.
    if challenge is None:
        base = variables.complexbayesmixedpath
    else:
        base = variables.complexbayeschallengebasedpath + challenge + '/'
    name = 'duplicateBayesSpamToken.csv' if duplicates else 'bayesSpamToken.csv'
    spamdict = {}
    for row in importDataHelper.readcsvdata(base + name):
        spamdict.update(row)
    return spamdict
def gettokenprobs(challenge=None, duplicates=False):
    """Load precomputed token probabilities for the complex (5-word) Bayes filter.

    Args:
        challenge: challenge name; ``None`` loads the mixed (cross-challenge) file.
        duplicates: if True, load the "duplicate..." variant trained on
            duplicate-filtered data.

    Returns:
        dict merging every CSV row into a single token -> probability mapping.
    """
    # Single path computation replaces the original four identical branches.
    if challenge is None:
        base = variables.complexbayesmixedpath
    else:
        base = variables.complexbayeschallengebasedpath + challenge + '/'
    name = 'duplicateBayesTokenProbs.csv' if duplicates else 'bayesTokenProbs.csv'
    probdict = {}
    for row in importDataHelper.readcsvdata(base + name):
        probdict.update(row)
    return probdict
def gethamtokens(challenge=None, duplicates=False):
    """Load the ham-token data from the old ideas.

    NOTE(review): this reads from the ``simplebayes*`` paths while the sibling
    ``getspamtokens`` reads from ``complexbayes*`` — confirm this asymmetry is
    intended and not a copy/paste slip.

    Args:
        challenge: challenge name; ``None`` loads the mixed (cross-challenge) file.
        duplicates: if True, load the "duplicate..." variant.

    Returns:
        dict merging every CSV row into a single token -> value mapping.
    """
    if challenge is None:
        base = variables.simplebayesmixedpath
    else:
        base = variables.simplebayeschallengebasedpath + challenge + '/'
    name = 'duplicateBayesHamToken.csv' if duplicates else 'bayesHamToken.csv'
    # Merge all CSV rows into one dict.
    hamdict = {}
    for row in importDataHelper.readcsvdata(base + name):
        hamdict.update(row)
    return hamdict
def train_linear_classificator(challenge, new=False):
    """Train a RidgeClassifier on spam-filter features for *challenge*.

    When ``new`` is True the features are extracted fresh from the idea DB via
    the spam filter; otherwise previously saved feature CSVs are reloaded.
    Returns the fitted classifier and its coefficient array.
    """
    if new:
        # Fresh feature extraction: run every idea through the filter system.
        unigram_tagger, st = spamFilter.prepare_tagger()
        idealist = list(
            importDataHelper.readcsvdata(variables.ideadbpath + challenge + '.csv'))
        featurelist = {}
        for idea in idealist:
            idea['TRIGGERED'] = []
            idea['PREDICTION'] = "Ham"
            idea, ideafeatures = spamFilter.classify_and_get_idea(idea, unigram_tagger, st)
            # Label: spam if marked unusable or explicitly flagged as spam.
            if "unusable" in idea["STATUS"] or 'spam' in idea.get("SPAM", ""):
                ideafeatures["Spam"] = 1
            else:
                ideafeatures["Spam"] = 0
            # Collect each feature value into a per-feature column list.
            for key in ideafeatures.keys():
                featurelist[key] = featurelist.get(key, [])
                featurelist[key].append(ideafeatures[key])
    else:
        # Reload previously saved training features.
        if challenge == "all":
            idealist = []
            for file in listdir(variables.linclasstrainingsdatapath):
                if isfile(join(variables.linclasstrainingsdatapath, file)):
                    idealist += list(importDataHelper.readcsvdata(join(variables.linclasstrainingsdatapath, file)))
        else:
            idealist = list(importDataHelper.readcsvdata(variables.linclasstrainingsdatapath + challenge + ".csv"))
        featurelist = {}
        # NOTE(review): only idealist[0] is parsed here, whereas train_and_test
        # iterates every row — confirm using just the first row is intended.
        for key in idealist[0].keys():
            # Stored values look like "[1, 0, 1]"; strip brackets and split.
            featurelist[key] = [int(x) for x in idealist[0][key].replace('[', '').replace(']', '').split(',')]
    testdata = pd.DataFrame(featurelist)
    X = testdata.drop('Spam', axis=1)
    y = testdata['Spam']
    # Persist the feature table for later reuse (the not-new branch above).
    importDataHelper.writecsvfiledict(variables.linclasstrainingsdatapath + challenge + ".csv", featurelist.keys(), featurelist)
    clf = RidgeClassifier().fit(X, y)
    print(clf.score(X, y))
    return clf, clf.coef_
def load_data(challenge):
    """Build a DataFrame of idea descriptions with binary spam labels.

    ``challenge == "all"`` pools every CSV in the idea DB directory; any other
    value loads that single challenge file. Labels come from STATUS first
    (unusable=1, usable=0), falling back to the SPAM column.
    """
    print("USE loaded")
    if challenge == "all":
        idealist = []
        for entry in listdir(variables.ideadbpath):
            full_path = join(variables.ideadbpath, entry)
            if isfile(full_path):
                idealist.extend(importDataHelper.readcsvdata(full_path))
    else:
        idealist = list(
            importDataHelper.readcsvdata(variables.ideadbpath + challenge + ".csv"))
    descriptions = []
    labels = []
    for idea in idealist:
        descriptions.append(idea["DESCRIPTION"])
        status = idea.get("STATUS", "")
        # "unusable" must be checked before "usable" ("usable" is a substring).
        if "unusable" in status:
            labels.append(1)
        elif "usable" in status:
            labels.append(0)
        elif "spam" in idea.get("SPAM", ""):
            labels.append(1)
        else:
            labels.append(0)
    return pd.DataFrame({"DESCRIPTION": descriptions, "SPAM": labels})
def train_and_test(challenge):
    """Train/test a RidgeClassifier on saved feature CSVs and plot ROC + CM.

    Loads the feature CSVs for *challenge* ("all" pools every file), does a
    67/33 split, fits a RidgeClassifier, then saves an ROC curve and a
    normalized confusion matrix plot and prints evaluation stats.
    """
    idealist = []
    if challenge == "all":
        for file in listdir(variables.linclasstrainingsdatapath):
            if isfile(join(variables.linclasstrainingsdatapath, file)):
                idealist += list(importDataHelper.readcsvdata(join(variables.linclasstrainingsdatapath, file)))
    else:
        idealist = list(importDataHelper.readcsvdata(variables.linclasstrainingsdatapath + challenge + ".csv"))
    featurelist = {}
    for row in idealist:
        for key in row.keys():
            # Stored values look like "[1, 0, 1]"; strip brackets and split.
            featurelist[key] = featurelist.get(key, [])
            featurelist[key] += [int(x) for x in row[key].replace('[', '').replace(']', '').split(',')]
    testdata = pd.DataFrame(featurelist)
    X = testdata.drop('Spam', axis=1)
    y = testdata['Spam']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    clf = RidgeClassifier()
    y_score = clf.fit(X_train, y_train).decision_function(X_test)
    testres = clf.predict(X_test)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in [0, 1]:
        fpr[i], tpr[i], _ = roc_curve(y_test, y_score)
        roc_auc[i] = auc(fpr[i], tpr[i])
    plt.figure()
    lw = 2
    # BUG FIX: the original used label="ROC" % roc_auc[1], which raises
    # TypeError ("not all arguments converted") because the format string has
    # no conversion specifier. Include the AUC in the label properly.
    plt.plot(fpr[1], tpr[1], color="darkorange", lw=lw,
             label="ROC curve (area = %0.2f)" % roc_auc[1])
    plt.plot([0, 1], [0, 1], color="cornflowerblue", lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(challenge)
    plt.legend(loc="lower right")
    plt.savefig(variables.plotspath + "ROC_linClass_" + challenge + ".png")
    plt.show()
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    plt.show()
    confusion_matrix = ConfusionMatrix(y_test, testres)
    confusion_matrix.plot(normalized=True)
    plt.title(challenge)
    plt.savefig(variables.plotspath + "CM_linClass_" + challenge + ".png")
    plt.show()
    print(clf.coef_)
    print(classification_report(y_test, testres))
    print(confusion_matrix.stats())
def eval_all():
    """Run the full cross-challenge evaluation.

    For every challenge: train on the other two challenges plus a 67% split of
    its own data, evaluate on the held-out 33%, with and without duplicate
    filtering, and write each result set plus duplicate counts to CSV.
    """
    challengedict = {
        "TCO": list(importDataHelper.readcsvdata("Data/DBs/ideaDB/TCO.csv")),
        "bionicRadar": list(importDataHelper.readcsvdata("Data/DBs/ideaDB/bionicRadar.csv")),
        "fabricDisplay": list(importDataHelper.readcsvdata("Data/DBs/ideaDB/fabricDisplay.csv"))
    }
    dupdict = {}  # challenge -> number of duplicates removed
    for key in challengedict.keys():
        # Pool the ideas of the *other* challenges as extra training data.
        idealist = []
        for key2 in challengedict.keys():
            # NOTE(review): identity comparison on strings; works for these
            # literal dict keys but "!=" would be the safe spelling.
            if key2 is not key:
                idealist += challengedict[key2].copy()
        X_train, X_test = train_test_split(challengedict[key], test_size=0.33)
        idealist += X_train.copy()
        # Duplicate-filter only the challenge's own training split.
        X_ndtrain = duplicateDetection.filterduplikates(
            X_train, variables.resultpath + "eval2" + key + ".csv")
        dupdict[key] = len(X_train) - len(X_ndtrain)
        X_ndtest = X_test.copy()
        idealist_test = X_test.copy()
        idealist_nodups_test = X_test.copy()
        # 1) challenge-only training, with duplicates.
        results = evaluate_system(X_train, X_test, key)
        importDataHelper.writecsvfiledict(
            "Data/ResultsAllNew/evaluation" + key + ".csv",
            results.keys(), results)
        print("Done first set")
        # 2) challenge-only training, duplicates removed.
        results2 = evaluate_system(X_ndtrain, X_ndtest, key, dups=True)
        importDataHelper.writecsvfiledict(
            "Data/ResultsAllNew/evaluationResultsNoDups" + key + ".csv",
            results2.keys(), results2)
        print("Challenge training done", key)
        # 3) mixed training (all challenges), with duplicates.
        idealist_nodups = duplicateDetection.filterduplikates(
            idealist, variables.resultpath + "eval" + key + ".csv")
        dupdict[key + " All"] = len(idealist) - len(idealist_nodups)
        results = evaluate_system(idealist, idealist_test)
        importDataHelper.writecsvfiledict(
            "Data/ResultsAllNew/evaluationAll" + key + ".csv",
            results.keys(), results)
        print("Done first set")
        # 4) mixed training, duplicates removed.
        results2 = evaluate_system(idealist_nodups, idealist_nodups_test, dups=True)
        importDataHelper.writecsvfiledict(
            "Data/ResultsAllNew/evaluationResultsNoDupsAll" + key + ".csv",
            results2.keys(), results2)
        print("All training done", key)
    print(dupdict)
    importDataHelper.writecsvfiledict("Data/ResultsAllNew/dupNums.csv",
                                      dupdict.keys(), dupdict)
def extend_challenge_db(idealist):
    """Merge *idealist* into the per-challenge idea DB CSV files.

    Ideas are routed to a challenge bucket by their CHALLENGE string
    (cscw19-1/chi19s1 -> TCO, "bionic" -> bionicRadar, "fabric" ->
    fabricDisplay); an idea is skipped if its ID already exists in the bucket.
    Every bucket is then rewritten to disk.
    """
    challengelist = {}
    for file in listdir(variables.ideadbpath):
        if isfile(join(variables.ideadbpath, file)):
            filename = file.split(".")[0]
            challengelist[filename] = list(
                importDataHelper.readcsvdata(join(variables.ideadbpath, file)))

    def _add(key, idea):
        # Append to the bucket unless an entry with the same ID exists.
        bucket = challengelist.setdefault(key, [])
        if not any(e['ID'] == idea['ID'] for e in bucket):
            bucket.append(idea)

    for idea in idealist:
        challenge = idea.get("CHALLENGE", "")
        idea["CHALLENGE"] = challenge
        # Both cscw19-1 and chi19s1 exports belong to the TCO challenge.
        if "cscw19-1" in challenge or "chi19s1" in challenge:
            _add("TCO", idea)
        elif "bionic" in challenge.lower():
            _add("bionicRadar", idea)
        elif "fabric" in challenge.lower():
            _add("fabricDisplay", idea)
    for key in challengelist.keys():
        # NOTE(review): assumes every bucket is non-empty (uses [0].keys()),
        # as the original did — an empty DB file would raise IndexError.
        importDataHelper.writecsvfile(join(variables.ideadbpath, key + ".csv"),
                                      challengelist[key][0].keys(),
                                      challengelist[key])
        print("saved " + key)
def read_noun_corpus():
    """Load the noun DB CSV and merge all rows into one noun -> class dict."""
    corpus = {}
    rows = importDataHelper.readcsvdata(variables.dbpath + 'NLPdata/NounDB.csv')
    for entry in rows:
        corpus.update(entry)
    return corpus
def plot_Evaluation(dataset):
    """Plot stacked TP/FP bars for every filter evaluated on *dataset*."""
    N = 0  # number of matching filter rows (bars)
    filterlistall = list(
        importDataHelper.readcsvdata(variables.evaluationpresultpath))
    tplist = []
    fplist = []
    filterlistdataset = []  # legend labels "i: Filter[: Variable]"
    fig, ax = plt.subplots(figsize=(10, 10))
    num = 0   # NOTE(review): stays int 0 if no row matches; the title below
    pnum = 0  # concatenates these with strings and would raise TypeError then.
    nnum = 0
    maxnum = 0  # largest TP+FP stack, used to scale the y ticks
    temp = 0
    gotdata = False
    i = 0
    for filter in filterlistall:
        if dataset in filter["Dataset"]:
            i += 1
            # NOTE(review): `x not in "None"` is a substring test, not
            # inequality — e.g. Variable "" or "No" would hit the else branch.
            if filter["Variable"] not in "None":
                filterlistdataset.append(
                    str(i) + ": " + filter["Filter"] + ": " + filter["Variable"])
            else:
                filterlistdataset.append(str(i) + ": " + filter["Filter"])
            if not gotdata:
                # Population stats are taken from the first matching row only.
                num = filter["population"]
                pnum = filter["P"]
                nnum = filter["N"]
                gotdata = True
            N += 1
            # `x in ''` is only True for the empty string: missing counts -> 0.
            if filter["TP"] in '':
                tplist.append(0)
            else:
                temp = int(filter["TP"])
                tplist.append(int(filter["TP"]))
            if filter["FP"] in '':
                fplist.append(0)
            else:
                # NOTE(review): if TP was empty, `temp` still holds the value
                # from a previous row here — suspected stale-accumulator bug.
                temp += int(filter["FP"])
                fplist.append(int(filter["FP"]))
            if temp > maxnum:
                maxnum = temp
    ind = np.arange(N)
    p1 = plt.bar(ind, tplist)
    p2 = plt.bar(ind, fplist, bottom=tplist)  # FP stacked on top of TP
    plt.ylabel('Amount')
    plt.title(('Filter evaluation for ' + dataset + ' with ' + num +
               ' ideas (' + pnum + ' positives and ' + nnum + ' negatives)'))
    # NOTE(review): range(1, len(...)) yields one label fewer than there are
    # bars — confirm whether range(1, len(...) + 1) was intended.
    plt.xticks(ind, range(1, len(filterlistdataset)))
    plt.yticks(np.arange(0, maxnum, 20))
    plt.legend((p1[0], p2[0]), ('TP', 'FP'))
    print(filterlistdataset)
    plt.show()
def add_all_ideas_toDB():
    """Import every classified CSV file and merge it into the challenge DBs."""
    for entry in listdir(variables.importpathclassified):
        full_path = join(variables.importpathclassified, entry)
        if not isfile(full_path):
            continue
        if ".csv" in entry:
            extend_challenge_db(list(importDataHelper.readcsvdata(full_path)))
            print("finished: " + entry)
        else:
            print("just csv supported right now")
def save_confusionmatrix(cm, path, applied_filters=None, description="", dataset=""):
    """Append one confusion-matrix stats row to the CSV at *path*.

    Args:
        cm: confusion-matrix object providing ``stats()``.
        path: CSV file that is read, extended by one row, and rewritten.
        applied_filters: list of filter names recorded with the row.
        description: free-text note for the row.
        dataset: name of the evaluated dataset.

    Returns:
        0 on completion.
    """
    # BUG FIX: the original used a mutable default (applied_filters=[]),
    # which is shared across calls; use None as the sentinel instead.
    if applied_filters is None:
        applied_filters = []
    cmdict = list(importDataHelper.readcsvdata(path))
    cmdict.append(cm.stats())
    cmdict[-1]["applied Filter"] = applied_filters
    cmdict[-1]["Description"] = description
    cmdict[-1]["Dataset"] = dataset
    importDataHelper.writecsvfile(path, cmdict[0].keys(), cmdict)
    return 0
def classify_unreviewed():
    """Interactively review unreviewed fabricDisplay ideas.

    Ideas already present in the classified file are dropped; the rest are
    shown one by one and labeled via y/n input, moving them to the classified
    list. Any other input saves the current progress of both files. Both CSVs
    are rewritten at the end (skipped when a list is empty).
    """
    idealist = list(
        importDataHelper.readcsvdata("Data/Results/fabricDisplayunreviewed.csv"))
    idealist2 = list(
        importDataHelper.readcsvdata("Data/Results/fabricDisplayClassified.csv"))
    print("bionic Radar:")
    # BUG FIX: the original removed items from `idealist` while iterating it,
    # which skips the element after each removal, and rebuilt the ID list on
    # every iteration (O(n^2)). Build the ID set once and filter instead.
    classified_ids = {entry["ID"] for entry in idealist2}
    idealist = [idea for idea in idealist if idea["ID"] not in classified_ids]
    print(len(idealist))
    # Iterate over a snapshot; the y/n branches mutate `idealist` below.
    for idea in list(idealist):
        print(" ")
        if "usable" not in idea.get("STATUS", ""):
            print("Content: " + idea["DESCRIPTION"])
            print("Prediction: " + idea["PREDICTION"])
            print("Bayes: " + idea["OTHERBayes"])
            print("Others: " + idea["OTHERS"])
            print("Filter: " + idea["TRIGGERED"])
            x = input("Spam? (y/n)")
            if 'y' in x:
                idea["STATUS"] = "unusable"
                idealist2.append(idea)
                idealist.remove(idea)
            elif 'n' in x:
                idea["STATUS"] = "usable"
                idealist2.append(idea)
                idealist.remove(idea)
            else:
                # Any other answer: checkpoint current progress to disk.
                # NOTE(review): the original also kept looping here rather
                # than exiting — behavior preserved.
                importDataHelper.writecsvfile(
                    "Data/Results/fabricDisplayClassified.csv",
                    idealist2[0].keys(), idealist2)
                importDataHelper.writecsvfile(
                    "Data/Results/fabricDisplayunreviewed.csv",
                    idealist[0].keys(), idealist)
    # Final save; guard against empty lists ([0].keys() would raise).
    if idealist2:
        importDataHelper.writecsvfile("Data/Results/fabricDisplayClassified.csv",
                                      idealist2[0].keys(), idealist2)
    if idealist:
        importDataHelper.writecsvfile("Data/Results/fabricDisplayunreviewed.csv",
                                      idealist[0].keys(), idealist)
def extend_noun_corpus():
    """Add unseen nouns from the cscw19 import file to the noun corpus.

    New nouns are stored with the value "unclassified"; existing entries are
    left untouched. The updated corpus is written back to the noun DB CSV.
    """
    ideas = list(
        importDataHelper.readcsvdata(variables.importpathclassified +
                                     'cscw19-unapproved-ideas_import.csv'))
    corpus = read_noun_corpus()
    tagger = prepare_tagger()
    for idea in ideas:
        for noun in get_Nouns(idea['DESCRIPTION'], tagger):
            # Only insert nouns that are not yet in the corpus.
            corpus.setdefault(noun, "unclassified")
    importDataHelper.writecsvfiledict(variables.dbpath + 'NLPdata/NounDB.csv',
                                      corpus.keys(), corpus)
def traincomplexbayes(dataset=None):
    """Train the complex (5-word) Bayes filter on a chosen or given dataset.

    When *dataset* is None the user picks a file interactively from the
    classified/unclassified import directories; otherwise ``dataset[0]`` is
    used as the idea list. The user chooses whether to overwrite the existing
    token data or to extend it. Prints the training duration.
    """
    if dataset is None:
        print("Select a dataset: ")
        i = 0
        print("Classified datasets")
        filesclass = []
        for file in listdir(variables.importpathclassified):
            if isfile(join(variables.importpathclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathclassified, file))
                i += 1
        print("Unclassified datasets")
        for file in listdir(variables.importpathunclassified):
            if isfile(join(variables.importpathunclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathunclassified, file))
                i += 1
        selected = int(input("Which dataset do you want to use? "))
        path = filesclass[selected][0]
        # BUG FIX: the original `.replace(".", ' ').split()` unpacking broke
        # for names containing extra dots or spaces; split on the last dot.
        filename, fileformat = filesclass[selected][1].rsplit('.', 1)
        if 'csv' in fileformat:
            idealist = list(
                importDataHelper.readcsvdata(
                    join(path, filename + '.' + fileformat)))
        else:
            idealist = list(
                importDataHelper.readxmldata(
                    join(path, filename + '.' + fileformat)))
    else:
        idealist = dataset[0]
    # BUG FIX: the original loop condition ('y' not in delete or 'n' in
    # delete) never terminated on "n"; ask again until y or n appears.
    delete = ""
    while 'y' not in delete and 'n' not in delete:
        delete = input(
            "Do you want to override old 5-word bayes results (y/n): ").lower()
    start = time.process_time_ns()
    if 'y' in delete:
        # Start from scratch.
        spamdictcom = {}
        hamdictcom = {}
    else:
        spamdictcom = complexBayes.getspamtokens()  # load data this time to get data from both datasets
        hamdictcom = complexBayes.gethamtokens()  # load data this time to get data from both datasets
    complexBayes.trainbayes(idealist, spamdictcom, hamdictcom)
    duration = time.process_time_ns() - start
    print("Duration (complex) bayestraining: ", duration / 1000000000, "seconds")
    return None
def match_iui_challenges():
    """Attach CHALLENGE info to the iui idea export by matching IDs.

    For every unmatched idea the first entry of the challenge list whose ID
    contains the idea's ID provides the CHALLENGE value. Prints the match
    statistics and stores the result in the challenge DB.
    """
    unmatchedlist = list(
        importDataHelper.readcsvdata(
            "Data/ImportsClassified/iui-export-ideas.csv"))
    print(len(unmatchedlist))
    challengelist = list(
        importDataHelper.readcsvdata(
            "Data/ImportsClassified/ideas-with-challenges.csv"))
    print(len(challengelist))
    count_unmatched = 0
    count_matched = 0
    for idea in unmatchedlist:
        # NOTE(review): substring containment ("in"), not equality — an ID
        # like "1" would match "12"; confirm this is intended.
        match = next(
            (other for other in challengelist if idea["ID"] in other["ID"]),
            None)
        if match is None:
            count_unmatched += 1
        else:
            idea["CHALLENGE"] = match["CHALLENGE"]
            count_matched += 1
    print(count_unmatched)
    print(count_matched)
    extend_challenge_db(unmatchedlist)
def spamdetection():
    """Command-line entry point: train on or classify a csv/xml idea file.

    Usage: ``spamdetection <path> [-t] [--challenge NAME]``. Ideas are grouped
    by their CHALLENGE column unless --challenge forces a single group. With
    -t the system is trained; otherwise ideas are classified and written next
    to the input file as ``<name>_classified.<ext>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="Path to a csv or xml file with ideas")
    parser.add_argument("-t", "--train",
                        help="to train the system. Requires classified ideas.",
                        action="store_true")
    parser.add_argument(
        "--challenge",
        help="give a challenge to use instead of the challenges given in an idea")
    args = parser.parse_args()
    # BUG FIX: split('.') produced too many values for names containing extra
    # dots (e.g. "my.ideas.csv"); split only on the last dot.
    filename, fileformat = os.path.basename(args.path).rsplit('.', 1)
    if fileformat == 'csv':
        idealist = list(importDataHelper.readcsvdata(args.path))
    elif fileformat == 'xml':
        idealist = importDataHelper.readxmldata(args.path)
    else:
        print("Can not read the file, please use csv or xml files")
        return 1
    challengelists = {}
    # Divide idea in challenges or use the given challenge
    if args.challenge is None:
        for idea in idealist:
            challenge = idea.get("CHALLENGE", "Cross-Domain")
            challengelists[challenge] = challengelists.get(challenge, [])
            challengelists[challenge].append(idea)
    else:
        challengelists[args.challenge] = idealist
    if args.train:
        for elem in challengelists:
            train(challengelists[elem], elem)
    else:
        classifiedlist = []
        for elem in challengelists:
            if fileformat == "csv":
                classifiedlist += classify(challengelists[elem], elem, fileformat)
                # NOTE(review): the csv output is rewritten after every
                # challenge group (growing each time) — preserved as-is.
                importDataHelper.writecsvfile(
                    os.path.dirname(args.path) + "/" + filename + "_classified.csv",
                    classifiedlist[0].keys(), classifiedlist)
            else:
                idealist = classify(idealist, elem, fileformat)
                idealist.write(
                    os.path.dirname(args.path) + "/" + filename + "_classified.xml")
def evaluate_fun():
    """Evaluate the system on the bionicRadar challenge with a 67/33 split.

    Runs one evaluation on the raw split and one on the duplicate-filtered
    training data, writing both result sets to CSV.
    """
    idealist = list(
        importDataHelper.readcsvdata("Data/DBs/ideaDB/bionicRadar.csv"))
    X_train, X_test = train_test_split(idealist, test_size=0.33)
    # Duplicate-filter only the training portion; the test set stays as-is.
    X_ndtrain = duplicateDetection.filterduplikates(
        X_train, variables.resultpath + "evalbionicRadar.csv")
    X_ndtest = X_test.copy()
    first_results = evaluate_system(X_train, X_test, "bionicRadar")
    importDataHelper.writecsvfiledict(
        "Data/ResultsNew/evaluationResultsbionicRadar.csv",
        first_results.keys(), first_results)
    print("Done first set")
    nodup_results = evaluate_system(X_ndtrain, X_ndtest, "bionicRadar", dups=True)
    importDataHelper.writecsvfiledict(
        "Data/ResultsNew/evaluationResultsNoDupsbionicRadar.csv",
        nodup_results.keys(), nodup_results)
    print("Done")
def evaluationData_table(dataset):
    """Render the TP/FP numbers of every filter run on *dataset* as a table.

    Reads the filter evaluation CSV, collects one (label, TP, FP) row per
    matching entry and shows them in a matplotlib table with a summary title.
    """
    filterlistall = list(importDataHelper.readcsvdata(variables.evaluationpresultpath))
    data = []
    columns = ("Filter", "TP", "FP")
    # BUG FIX: these were ints (0) but are string-concatenated into the title
    # below, raising TypeError whenever no row matched; default to "0".
    num = "0"
    pnum = "0"
    nnum = "0"
    gotdata = False
    fig, ax = plt.subplots(figsize=(10, 10))
    # hide axes
    fig.patch.set_visible(False)
    ax.axis('off')
    ax.axis('tight')
    for filter in filterlistall:
        if dataset not in filter["Dataset"]:
            continue
        if not gotdata:
            # Population stats come from the first matching row.
            num = filter["population"]
            pnum = filter["P"]
            nnum = filter["N"]
            gotdata = True
        # NOTE(review): `x not in "None"` is a substring test, not inequality
        # — preserved from the original.
        if filter["Variable"] not in "None":
            label = filter["Filter"] + ": " + filter["Variable"]
        else:
            label = filter["Filter"]
        # `x in ''` is only True for the empty string: missing counts -> 0/0.
        if filter["TP"] in '' or filter["FP"] in '':
            data.append([label, 0, 0])
        else:
            data.append([label, int(filter["TP"]), int(filter["FP"])])
    df = pd.DataFrame(data, columns=columns)
    ax.table(cellText=df.values, colLabels=df.columns, loc='center')
    fig.tight_layout()
    plt.title('Filter evaluation for ' + dataset + ' with ' + num +
              ' ideas (' + pnum + ' positives and ' + nnum + ' negatives)')
    plt.show()
def duplicatefilter(dataset=None):
    """Remove duplicate ideas from a chosen or given dataset.

    When *dataset* is None the user picks a file interactively from the
    classified/unclassified import directories; otherwise *dataset* is the
    tuple (idealist, path, filename, fileformat) produced by a previous step.

    Returns:
        Tuple (filtered idealist, path, filename, fileformat).
    """
    if dataset is None:
        print("Select a dataset: ")
        i = 0
        print("Classified datasets")
        filesclass = []
        for file in listdir(variables.importpathclassified):
            if isfile(join(variables.importpathclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathclassified, file))
                i += 1
        print("Unclassified datasets")
        for file in listdir(variables.importpathunclassified):
            if isfile(join(variables.importpathunclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathunclassified, file))
                i += 1
        selected = int(input("Which dataset do you want to use? "))
        path = filesclass[selected][0]
        # BUG FIX: the original `.replace(".", ' ').split()` unpacking broke
        # for names containing extra dots or spaces; split on the last dot.
        filename, fileformat = filesclass[selected][1].rsplit('.', 1)
        if 'csv' in fileformat:
            idealist = list(
                importDataHelper.readcsvdata(
                    join(path, filename + '.' + fileformat)))
        else:
            idealist = list(
                importDataHelper.readxmldata(
                    join(path, filename + '.' + fileformat)))
    else:
        fileformat = dataset[3]
        filename = dataset[2]
        path = dataset[1]
        idealist = dataset[0]
    idealist = duplicateDetection.filterduplikates(
        idealist, variables.duplicateresultpath + filename + 'Duplicates.csv')
    return idealist, path, filename, fileformat
def classifyideas(dataset=None):
    """Classify a dataset of ideas with the chosen combination of classifiers.

    When *dataset* is None the user picks a file interactively; otherwise
    *dataset* is the tuple (idealist, path, filename, fileformat). The user
    selects which classifiers to run (single-word Bayes, 5-word Bayes, filter
    system); results, a confusion matrix and FP/FN lists are written to CSV.
    """
    if dataset is None:
        # Interactive dataset selection from both import directories.
        print("Select a dataset: ")
        i = 0
        print("Classified datasets")
        filesclass = []
        for file in listdir(variables.importpathclassified):
            if isfile(join(variables.importpathclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathclassified, file))
                i += 1
        print("Unclassified datasets")
        for file in listdir(variables.importpathunclassified):
            if isfile(join(variables.importpathunclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathunclassified, file))
                i += 1
        selected = int(input("Which dataset do you want to use? "))
        path = filesclass[selected][0]
        filename, fileformat = filesclass[selected][1].replace(".", ' ').split()
        if 'csv' in fileformat:
            idealist = list(
                importDataHelper.readcsvdata(
                    join(path, filename + '.' + fileformat)))
        else:
            idealist = list(
                importDataHelper.readxmldata(
                    join(path, filename + '.' + fileformat)))
    else:
        # Dataset tuple handed over from a previous pipeline step.
        fileformat = dataset[3]
        filename = dataset[2]
        path = dataset[1]
        idealist = dataset[0]
    # Ask which classifiers to apply.
    bayesbool = 'y' in input(
        "Do you want to use single word bayes to classify? (y/n) ").lower()
    complbayesbool = 'y' in input(
        "Do you want to use 5-word bayes to classify? (y/n) ").lower()
    filtersystembool = 'y' in input(
        "Do you want to use the Filtersystem to classify? (y/n) ").lower()
    # Load models only for the classifiers that were requested.
    if bayesbool:
        wordprobs = bayes.gettokenprobs()
    if complbayesbool:
        wordprobscom = complexBayes.gettokenprobs()
    if filtersystembool:
        unigram_tagger, st = prepare_tagger()
    spamlist = []
    applied_filters = {}  # filter name -> trigger count
    pred = []    # predicted spam flags (bool) per idea
    actual = []  # ground-truth spam flags (bool) per idea
    fplist = []  # false positives
    fnlist = []  # false negatives
    start1 = time.time()
    for row in idealist:
        row['TRIGGERED'] = []
        row['PREDICTION'] = "Ham"
        if bayesbool:
            # Single-word Bayes: spam if probability exceeds 0.8.
            bayesprob = bayes.classify(row['DESCRIPTION'], wordprobs)
            if bayesprob > 0.8:
                row['TRIGGERED'].append("bayes")
                applied_filters["bayes"] = int(applied_filters.get("bayes", 0)) + 1
                row['PREDICTION'] = "Spam"
        if complbayesbool:
            # 5-word Bayes: same 0.8 threshold.
            combayesprob = complexBayes.classify(row['DESCRIPTION'], wordprobscom)
            if combayesprob > 0.8:
                row['TRIGGERED'].append("complex bayes: " + str(combayesprob))
                applied_filters["complex bayes"] = int(
                    applied_filters.get("complex bayes", 0)) + 1
                row['PREDICTION'] = "Spam"
        if filtersystembool:
            row = spamFilter.classifyidea(row, unigram_tagger, st)
        # Ground truth: explicit spam flag or "unusable" status.
        actual.append("spam" in row.get('SPAM', "") or "unusable" in row.get("STATUS", ""))
        pred.append(row['PREDICTION'] == "Spam")
        # Count filter-system triggers (bayes triggers were counted above).
        for filter in row['TRIGGERED']:
            if 'bayes' not in filter:
                applied_filters[filter] = int(applied_filters.get(filter, 0)) + 1
        spamlist.append(row)
        # Collect misclassifications for later inspection.
        if row['PREDICTION'] == "Spam" and ("ham" in row.get('SPAM', "") or row.get("STATUS", "") == "usable"):
            fplist.append(row)
        elif row['PREDICTION'] == "Ham" and ("spam" in row.get(
                'SPAM', "") or "unusable" in row.get("STATUS", "")):
            fnlist.append(row)
    cm = confusionMatrix.create_confusionmatrix(actual, pred)
    confusionMatrix.print_confusionmatrix(cm, True)
    description = "just filtersystem, Test enumeration fix with iui dataset"
    confusionMatrix.save_confusionmatrix(
        cm, variables.resultpath + "ConfusionMatrices.csv", applied_filters,
        description, filename)
    duration1 = time.time() - start1
    print("Duration1: ", duration1, "seconds")
    print(applied_filters)
    ###################### Save results ######################
    # importDataHelper.writecsvfile(variables.resultpath + 'IdeaDataSpam2.csv', spamlist[0].keys(), spamlist)
    if len(fplist) > 0:
        importDataHelper.writecsvfile(
            variables.filterresults + filename + '_fp.csv', fplist[0].keys(),
            fplist)
    if len(fnlist) > 0:
        importDataHelper.writecsvfile(
            variables.filterresults + filename + '_fn.csv', fnlist[0].keys(),
            fnlist)
    return None
def test():
    """Classify unreviewed bionicRadar/fabricDisplay ideas with every model.

    Trains challenge-specific and mixed variants of single-word Bayes, 5-word
    Bayes, the linear classifier and USE, runs all of them on each unreviewed
    idea and stores the per-model outputs in the idea's PREDICTION /
    OTHERBayes / OTHERS columns, written to one CSV per challenge.
    """
    # idealist = list(importDataHelper.readxmldata(variables.importpathunclassified + 'IdeaData.xml'))
    idealist = list(
        importDataHelper.readcsvdata(variables.importpathclassified +
                                     "ideas-with-challenges.csv"))
    idealistchallenge = {"bionicRadar": [], "fabricDisplay": []}
    print(len(idealist))
    # Bucket unreviewed ideas by challenge; i/j/k only count them for logging.
    i = 0
    j = 0
    k = 0
    for idea in idealist:
        if idea["STATUS"] == "unreviewed":
            if "bionic" in idea["CHALLENGE"].lower():
                i += 1
                idealistchallenge["bionicRadar"].append(idea)
            elif "fabric" in idea["CHALLENGE"].lower():
                j += 1
                idealistchallenge["fabricDisplay"].append(idea)
            else:
                k += 1
    print("unreviewed bionic: ", i)
    print("unreviewed fabric: ", j)
    print("unreviewed others: ", k)
    # Training data per challenge, with and without duplicates.
    idealisttrainingschallenge = {}
    idealisttrainingschallenge["fabricDisplay"] = list(
        importDataHelper.readcsvdata(variables.ideadbpath + 'fabricDisplay.csv'))
    idealisttrainingschallenge["bionicRadar"] = list(
        importDataHelper.readcsvdata(variables.ideadbpath + 'bionicRadar.csv'))
    idealisttrainingschallenge["TCO"] = list(
        importDataHelper.readcsvdata(variables.ideadbpath + 'TCO.csv'))
    idealisttrainingschallengewodups = {}
    idealisttrainingschallengewodups["fabricDisplay"] = list(
        importDataHelper.readcsvdata(variables.ideadbwithoutduppath + "fabricDisplay.csv"))
    idealisttrainingschallengewodups["bionicRadar"] = list(
        importDataHelper.readcsvdata(variables.ideadbwithoutduppath + "bionicRadar.csv"))
    idealisttrainingschallengewodups["TCO"] = list(
        importDataHelper.readcsvdata(variables.ideadbwithoutduppath + "TCO.csv"))
    # Mixed (cross-challenge) training pools.
    idealistmixedtraining = idealisttrainingschallenge[
        "fabricDisplay"] + idealisttrainingschallenge[
            "bionicRadar"] + idealisttrainingschallenge["TCO"]
    idealistmixedtrainingwithoutdups = idealisttrainingschallengewodups[
        "fabricDisplay"] + idealisttrainingschallengewodups[
            "bionicRadar"] + idealisttrainingschallengewodups["TCO"]
    for key in idealistchallenge.keys():
        idealisttraining = idealisttrainingschallenge[key]
        idealisttrainingwithoutdups = list(
            importDataHelper.readcsvdata(variables.ideadbwithoutduppath + key + ".csv"))
        # idealistchallengewithoutdups = duplicateDetection.filterduplikates(idealistchallenge[key], variables.resultpath + "test3.csv", idealisttrainingwithoutdups)
        print("duplicate detection done")
        # Train the four single-word Bayes variants (challenge/mixed x dups).
        bayes.trainbayes(idealisttraining, challenge=key, delete=True)
        bayes.trainbayes(idealisttrainingwithoutdups, challenge=key,
                         delete=True, duplicates=True)
        print("bayes training TCO complete")
        bayes.trainbayes(idealistmixedtraining, delete=True)
        bayes.trainbayes(idealistmixedtrainingwithoutdups, delete=True,
                         duplicates=True)
        print("bayes training mixed complete")
        wordprobs = bayes.gettokenprobs(challenge=key)
        wordprobswithoutdups = bayes.gettokenprobs(challenge=key, duplicates=True)
        wordprobsmixed = bayes.gettokenprobs()
        wordprobsmixedwithoutdups = bayes.gettokenprobs(duplicates=True)
        print("loaded probs")
        # Same four variants for the complex (5-word) Bayes.
        complexBayes.trainbayes(idealisttraining, challenge=key, delete=True)
        complexBayes.trainbayes(idealisttrainingwithoutdups, challenge=key,
                                delete=True, duplicates=True)
        print("complex bayes training TCO complete")
        complexBayes.trainbayes(idealistmixedtraining, delete=True)
        complexBayes.trainbayes(idealistmixedtrainingwithoutdups, delete=True,
                                duplicates=True)
        print("complex bayes training mixed complete")
        comwordprobs = complexBayes.gettokenprobs(challenge=key)
        comwordprobswithoutdups = complexBayes.gettokenprobs(challenge=key,
                                                             duplicates=True)
        comwordprobsmixed = complexBayes.gettokenprobs()
        comwordprobsmixedwithoutdups = complexBayes.gettokenprobs(
            duplicates=True)
        print("loaded probs complex")
        # Linear classifier and USE, challenge-specific and mixed.
        linclass, lincoeff = linearClassifier.train_linear_classificator(key)
        print(lincoeff)
        linclassmixed, lincoeffmixed = linearClassifier.train_linear_classificator(
            "all")
        print(lincoeffmixed)
        useest = USEClassifier.train_classifier(key)
        useestmixed = USEClassifier.train_classifier("all")
        print("trained USE")
        unigram_tagger, st = spamFilter.prepare_tagger()
        i = 1  # progress counter (reuses the earlier logging variable)
        for idea in idealistchallenge[key]:
            print(i)
            idea["TRIGGERED"] = [""]
            # classify with challenge bayes with duplicates
            bayesprob = bayes.classify(idea["DESCRIPTION"], wordprobs)
            # classify with challenge bayes without duplicates
            bayesprobdup = bayes.classify(idea["DESCRIPTION"],
                                          wordprobswithoutdups)
            # classify with mixed challenge bayes with duplicates
            bayesprobmixed = bayes.classify(idea["DESCRIPTION"], wordprobsmixed)
            # classify with mixed challenge bayes without duplicates
            bayesprobmixedwithoutdup = bayes.classify(
                idea["DESCRIPTION"], wordprobsmixedwithoutdups)
            combayesprob = complexBayes.classify(idea["DESCRIPTION"], comwordprobs)
            # classify with challenge bayes without duplicates
            combayesprobdup = complexBayes.classify(idea["DESCRIPTION"],
                                                    comwordprobswithoutdups)
            # classify with mixed challenge bayes with duplicates
            combayesprobmixed = complexBayes.classify(idea["DESCRIPTION"],
                                                      comwordprobsmixed)
            # classify with mixed challenge bayes without duplicates
            combayesprobmixedwithoutdup = complexBayes.classify(
                idea["DESCRIPTION"], comwordprobsmixedwithoutdups)
            # classify with challenge USE:
            useclass, useclassprob = USEClassifier.classify(useest, idea)
            # classify with mixed challenge USE:
            usemixedclass, usemixedclassprob = USEClassifier.classify(
                useestmixed, idea)
            idea, ideadata = spamFilter.classify_and_get_idea(
                idea, unigram_tagger, st)
            # Wrap each feature in a list for the DataFrame-based classifier.
            allnull = True
            for keytest in ideadata.keys():
                ideadata[keytest] = [ideadata[keytest]]
                # NOTE(review): after wrapping, ideadata[keytest] is a list,
                # so `== 1` is always False and allnull stays True — the
                # linear classifiers below may never run. Suspected bug
                # (compare before wrapping, or against [1]); confirm intent.
                if ideadata[keytest] == 1:
                    allnull = False
            if not allnull:
                linclasspred, linclassprob = linearClassifier.classify(
                    ideadata, linclass)
                linmixedclasspred, linmixedclassprob = linearClassifier.classify(
                    ideadata, linclassmixed)
            else:
                linclasspred, linclassprob = 0, 0
                linmixedclasspred, linmixedclassprob = 0, 0
            # Store all model outputs as readable strings on the idea.
            idea["PREDICTION"] = "Bayes: " + str(
                bayesprobdup) + ", complexBayes " + str(
                    combayesprobdup) + ", linClass: " + str(
                        linmixedclasspred) + " " + str(
                            linmixedclassprob) + ", USE: " + str(
                                useclass) + " " + str(useclassprob)
            idea["OTHERBayes"] = "BayesTCO: " + str(
                bayesprob) + ", BayesMixed " + str(
                    bayesprobmixed) + ", BayesMixed w/o dups " + str(
                        bayesprobmixedwithoutdup) + ", compl BayesTCO: " + str(
                            combayesprob) + ", compl BayesMixed: " + str(
                                combayesprobmixed
                            ) + ", compl BayesMixed w/o dups: " + str(
                                combayesprobmixedwithoutdup)
            idea["OTHERS"] = "Lin Class: " + str(linclasspred) + " " + str(
                linclassprob) + ", USE mixed: " + str(
                    usemixedclass) + " " + str(usemixedclassprob)
            i += 1
        importDataHelper.writecsvfile(
            variables.resultpath + key + "unreviewed.csv",
            idealistchallenge[key][0].keys(), idealistchallenge[key])
def evaluate_filtersystem():
    """Evaluate every text-data and text-content filter on each classified file.

    For each dataset file, runs every filter (count-based filters across their
    parameter ranges, tagger-based filters with the prepared taggers) through
    ``evaluate_filter`` and collects one stats row per run; all rows are
    written to FilterEvaluation.csv.

    Returns:
        1 when an unsupported file type is encountered, otherwise None.
    """
    resultlist = []
    unigram, st = prepare_tagger()
    for file in listdir(variables.importpathclassified):
        if not isfile(join(variables.importpathclassified, file)):
            continue
        if ".csv" in file:
            idealist = list(
                importDataHelper.readcsvdata(
                    join(variables.importpathclassified, file)))
        elif ".xml" in file:
            idealist = list(
                importDataHelper.readxmldata(
                    join(variables.importpathclassified, file)))
        else:
            print("Not able to read all files (just csv and xml are supported)")
            return 1

        def _record(filterfunc, variable, *extra):
            # Run one evaluation and append its stats row (decomposition of
            # the eight copy-pasted result-building blocks in the original).
            cm = evaluate_filter(filterfunc, idealist, *extra)
            result = {
                "Dataset": file,
                "Filter": filterfunc.__name__,
                "Variable": variable
            }
            if cm is not None:
                result.update(cm.stats())
            resultlist.append(result)

        for filter in textDataFilter.textDataFilterList:
            if "count" in str(filter):
                # Count-based filters are swept over their parameter range.
                if "more" in filter.__name__:
                    for count in countmore:
                        _record(filter, count, count)
                elif "less" in filter.__name__:
                    for count in countless:
                        _record(filter, count, count)
                elif "word" in filter.__name__:
                    for count in countwords:
                        _record(filter, count, count)
            else:
                _record(filter, "None")
        for filter in textContentFilter.textContentFilterlist:
            # Tagger-based filters get the matching pre-built tagger.
            if "unigram" in filter.__name__:
                _record(filter, "UnigramTagger", unigram)
            elif "containsnames" in filter.__name__:
                _record(filter, "StanfordNERTagger", st)
            else:
                _record(filter, "None")
            print(filter.__name__)
    importDataHelper.writecsvfile(
        variables.resultpath + "FilterEvaluation.csv", resultlist[0].keys(),
        resultlist)
def import_results():
    """Load all evaluation result CSVs and return them keyed by scenario.

    Returns:
        dict mapping scenario labels (challenge x mixed/own x with/without
        duplicates) to converted result dicts.
    """
    def _load(path):
        # Each results CSV holds a single row; convert it via evaluationHelper.
        return evaluationHelper.convertResults(
            list(importDataHelper.readcsvdata(path))[0])

    base = "Data/ResultsAllNew/"
    return {
        "fabricresults": _load(base + "evaluationfabricDisplay.csv"),
        "fabricresultsNoDups": _load(base + "evaluationResultsNoDupsfabricDisplay.csv"),
        "fabricresults All": _load(base + "evaluationAllfabricDisplay.csv"),
        "fabricresults AllNoDups": _load(base + "evaluationResultsNoDupsAllfabricDisplay.csv"),
        "tcoresults": _load(base + "evaluationTCO.csv"),
        "tcoresultsNoDups": _load(base + "evaluationResultsNoDupsTCO.csv"),
        "tcoresults All": _load(base + "evaluationAllTCO.csv"),
        "tcoresults AllNoDups": _load(base + "evaluationResultsNoDupsAllTCO.csv"),
        "bionicresults": _load(base + "evaluationbionicRadar.csv"),
        "bionicresultsNoDups": _load(base + "evaluationResultsNoDupsbionicRadar.csv"),
        "bionicresults All": _load(base + "evaluationAllbionicRadar.csv"),
        "bionicresults AllNoDups": _load(base + "evaluationResultsNoDupsAllbionicRadar.csv"),
    }
def load_confusionmatrices(path):
    """Read all stored confusion-matrix rows from the CSV at *path*."""
    return list(importDataHelper.readcsvdata(path))