def extend_challenge_db(idealist):
    """Merge new ideas into the per-challenge idea databases on disk.

    Loads every ``<challenge>.csv`` found in ``variables.ideadbpath``, sorts
    each idea from ``idealist`` into its challenge bucket (skipping ideas whose
    ID is already present in that bucket), then writes every bucket back.

    :param idealist: iterable of idea dicts; each may carry a ``CHALLENGE``
        key naming the challenge it belongs to (missing key treated as "").
    """
    challengelist = {}
    # Load the existing database: one CSV per challenge, keyed by file stem.
    for file in listdir(variables.ideadbpath):
        if isfile(join(variables.ideadbpath, file)):
            filename = file.split(".")[0]
            challengelist[filename] = list(
                importDataHelper.readcsvdata(join(variables.ideadbpath, file)))
    for idea in idealist:
        idea["CHALLENGE"] = idea.get("CHALLENGE", "")
        # Map the raw challenge tag to a database bucket.  The first two tags
        # are matched case-sensitively (as the original data uses them); the
        # keyword matches are case-insensitive.
        if "cscw19-1" in idea["CHALLENGE"] or "chi19s1" in idea["CHALLENGE"]:
            bucket_key = "TCO"
        elif "bionic" in idea["CHALLENGE"].lower():
            bucket_key = "bionicRadar"
        elif "fabric" in idea["CHALLENGE"].lower():
            bucket_key = "fabricDisplay"
        else:
            continue  # unknown challenge: idea is not stored anywhere
        bucket = challengelist.setdefault(bucket_key, [])
        # De-duplicate by idea ID within the bucket.
        if not any(e['ID'] == idea['ID'] for e in bucket):
            bucket.append(idea)
    for key in challengelist.keys():
        if not challengelist[key]:
            # Nothing to write; indexing [0] below would raise IndexError.
            continue
        importDataHelper.writecsvfile(join(variables.ideadbpath, key + ".csv"),
                                      challengelist[key][0].keys(),
                                      challengelist[key])
        print("saved " + key)
def save_confusionmatrix(cm, path, applied_filters=None, description="", dataset=""):
    """Append one confusion-matrix result row to the CSV at *path*.

    Reads the existing rows, appends ``cm.stats()`` annotated with the filter
    list, a free-text description and the dataset name, and rewrites the file.

    :param cm: confusion-matrix object exposing ``stats() -> dict``.
    :param path: CSV file to read and rewrite.
    :param applied_filters: list describing which filters were active
        (defaults to an empty list; ``None`` sentinel avoids the
        mutable-default-argument pitfall of the previous ``=[]``).
    :param description: free-text note stored with the row.
    :param dataset: name of the dataset the matrix was computed on.
    :returns: 0 on success.
    """
    if applied_filters is None:
        applied_filters = []
    cmdict = list(importDataHelper.readcsvdata(path))
    # Annotate the stats row before appending — clearer than mutating
    # cmdict[len(cmdict) - 1] after the fact.
    row = cm.stats()
    row["applied Filter"] = applied_filters
    row["Description"] = description
    row["Dataset"] = dataset
    cmdict.append(row)
    importDataHelper.writecsvfile(path, cmdict[0].keys(), cmdict)
    return 0
def spamdetection():
    """Command-line entry point: train on or classify a file of ideas.

    Parses ``sys.argv`` for a path to a csv/xml idea file, an optional
    ``--train`` flag and an optional ``--challenge`` override, splits the
    ideas per challenge, and either trains the system or writes a
    ``<name>_classified.<ext>`` file next to the input.

    :returns: 1 when the input file format is unsupported, otherwise None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="Path to a csv or xml file with ideas")
    parser.add_argument("-t", "--train",
                        help="to train the system. Requires classified ideas.",
                        action="store_true")
    parser.add_argument(
        "--challenge",
        help="give a challenge to use instead of the challenges given in an idea")
    args = parser.parse_args()
    # rsplit on the LAST dot: plain split('.') raised ValueError for file
    # names containing more than one dot (e.g. "ideas.v2.csv").
    filename, fileformat = os.path.basename(args.path).rsplit('.', 1)
    if fileformat == 'csv':
        idealist = list(importDataHelper.readcsvdata(args.path))
    elif fileformat == 'xml':
        idealist = importDataHelper.readxmldata(args.path)
    else:
        print("Can not read the file, please use csv or xml files")
        return 1
    challengelists = {}
    # Divide ideas into challenges, or use the explicitly given challenge.
    if args.challenge is None:
        for idea in idealist:
            challenge = idea.get("CHALLENGE", "Cross-Domain")
            challengelists[challenge] = challengelists.get(challenge, [])
            challengelists[challenge].append(idea)
    else:
        challengelists[args.challenge] = idealist
    if args.train:
        for elem in challengelists:
            train(challengelists[elem], elem)
    else:
        classifiedlist = []
        for elem in challengelists:
            if fileformat == "csv":
                classifiedlist += classify(challengelists[elem], elem, fileformat)
                importDataHelper.writecsvfile(
                    os.path.dirname(args.path) + "/" + filename + "_classified.csv",
                    classifiedlist[0].keys(), classifiedlist)
            else:
                # xml path: classify() returns the (ElementTree-like) object,
                # which writes itself back out.
                idealist = classify(idealist, elem, fileformat)
                idealist.write(
                    os.path.dirname(args.path) + "/" + filename + "_classified.xml")
def classify_unreviewed():
    """Interactively review the not-yet-classified fabricDisplay ideas.

    Loads the unreviewed and already-classified result CSVs, drops ideas that
    were classified before, then asks the user per idea whether it is spam
    ('y') or ham ('n').  Any other answer checkpoints both CSVs without moving
    the idea.  Both files are written once more at the end.
    """
    idealist = list(
        importDataHelper.readcsvdata(
            "Data/Results/fabricDisplayunreviewed.csv"))
    idealist2 = list(
        importDataHelper.readcsvdata(
            "Data/Results/fabricDisplayClassified.csv"))
    # NOTE(review): label says "bionic Radar" but the files above are the
    # fabricDisplay ones — looks like a stale copy-paste; string kept as-is.
    print("bionic Radar:")
    # Drop ideas that were already classified.  The original removed items
    # from idealist while iterating over it, which silently skips the element
    # following each removal; build a filtered list instead.  A set makes the
    # membership test O(1) rather than rebuilding the ID list per idea.
    classified_ids = {ideas["ID"] for ideas in idealist2}
    idealist = [idea for idea in idealist if idea["ID"] not in classified_ids]
    print(len(idealist))
    # Iterate a copy: reviewed ideas are moved out of idealist as we go.
    for idea in list(idealist):
        print(" ")
        # "usable" is a substring of "unusable", so any idea already marked
        # either way is skipped — only blank/other statuses are prompted.
        if "usable" not in idea.get("STATUS", ""):
            print("Content: " + idea["DESCRIPTION"])
            print("Prediction: " + idea["PREDICTION"])
            print("Bayes: " + idea["OTHERBayes"])
            print("Others: " + idea["OTHERS"])
            print("Filter: " + idea["TRIGGERED"])
            x = input("Spam? (y/n)")
            if 'y' in x:
                idea["STATUS"] = "unusable"
                idealist2.append(idea)
                idealist.remove(idea)
            elif 'n' in x:
                idea["STATUS"] = "usable"
                idealist2.append(idea)
                idealist.remove(idea)
            else:
                # Any other answer: checkpoint both files so progress is not
                # lost, then keep going.
                importDataHelper.writecsvfile(
                    "Data/Results/fabricDisplayClassified.csv",
                    idealist2[0].keys(), idealist2)
                importDataHelper.writecsvfile(
                    "Data/Results/fabricDisplayunreviewed.csv",
                    idealist[0].keys(), idealist)
    importDataHelper.writecsvfile("Data/Results/fabricDisplayClassified.csv",
                                  idealist2[0].keys(), idealist2)
    importDataHelper.writecsvfile("Data/Results/fabricDisplayunreviewed.csv",
                                  idealist[0].keys(), idealist)
def classifyideas(dataset=None):
    """Classify a dataset of ideas with a user-selected mix of classifiers.

    When *dataset* is None the user picks a file from the classified /
    unclassified import directories; otherwise *dataset* is expected to be a
    4-tuple-like of ``(idealist, path, filename, fileformat)``.  The user is
    then asked (via stdin) which of the three classifiers to run: single-word
    bayes, 5-word ("complex") bayes, and the rule-based filter system.  Each
    idea gets ``TRIGGERED`` / ``PREDICTION`` fields; a confusion matrix is
    computed against the SPAM/STATUS ground-truth fields, saved to
    ``ConfusionMatrices.csv``, and false positives / negatives are written to
    per-dataset ``_fp.csv`` / ``_fn.csv`` files.

    :param dataset: optional pre-loaded dataset; prompts for one when None.
    :returns: None.
    """
    if dataset is None:
        print("Select a dataset: ")
        i = 0
        print("Classified datasets")
        filesclass = []
        # Offer every file from both import directories as one numbered list.
        for file in listdir(variables.importpathclassified):
            if isfile(join(variables.importpathclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathclassified, file))
                i += 1
        print("Unclassified datasets")
        for file in listdir(variables.importpathunclassified):
            if isfile(join(variables.importpathunclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathunclassified, file))
                i += 1
        selected = int(input("Which dataset do you want to use? "))
        path = filesclass[selected][0]
        # Split "name.ext" on the dot by turning it into whitespace first.
        filename, fileformat = filesclass[selected][1].replace(".", ' ').split()
        if 'csv' in fileformat:
            idealist = list(
                importDataHelper.readcsvdata(
                    join(path, filename + '.' + fileformat)))
        else:
            idealist = list(
                importDataHelper.readxmldata(
                    join(path, filename + '.' + fileformat)))
    else:
        # Caller supplied the dataset: (idealist, path, filename, fileformat).
        fileformat = dataset[3]
        filename = dataset[2]
        path = dataset[1]
        idealist = dataset[0]
    # Ask which classifiers to apply to each idea.
    bayesbool = 'y' in input(
        "Do you want to use single word bayes to classify? (y/n) ").lower()
    complbayesbool = 'y' in input(
        "Do you want to use 5-word bayes to classify? (y/n) ").lower()
    filtersystembool = 'y' in input(
        "Do you want to use the Filtersystem to classify? (y/n) ").lower()
    # Load models only for the classifiers that were requested.
    if bayesbool:
        wordprobs = bayes.gettokenprobs()
    if complbayesbool:
        wordprobscom = complexBayes.gettokenprobs()
    if filtersystembool:
        unigram_tagger, st = prepare_tagger()
    spamlist = []
    applied_filters = {}  # filter name -> how often it fired
    pred = []    # per-idea predicted spam flag
    actual = []  # per-idea ground-truth spam flag
    fplist = []  # false positives (predicted Spam, actually usable)
    fnlist = []  # false negatives (predicted Ham, actually spam)
    start1 = time.time()
    for row in idealist:
        row['TRIGGERED'] = []
        row['PREDICTION'] = "Ham"
        if bayesbool:
            bayesprob = bayes.classify(row['DESCRIPTION'], wordprobs)
            # 0.8 spam-probability threshold for the single-word bayes.
            if bayesprob > 0.8:
                row['TRIGGERED'].append("bayes")
                applied_filters["bayes"] = int(applied_filters.get("bayes", 0)) + 1
                row['PREDICTION'] = "Spam"
        if complbayesbool:
            combayesprob = complexBayes.classify(row['DESCRIPTION'], wordprobscom)
            if combayesprob > 0.8:
                row['TRIGGERED'].append("complex bayes: " + str(combayesprob))
                applied_filters["complex bayes"] = int(
                    applied_filters.get("complex bayes", 0)) + 1
                row['PREDICTION'] = "Spam"
        if filtersystembool:
            # The filter system sets TRIGGERED/PREDICTION on the row itself.
            row = spamFilter.classifyidea(row, unigram_tagger, st)
        # Ground truth comes from either the SPAM or the STATUS column.
        actual.append("spam" in row.get('SPAM', "")
                      or "unusable" in row.get("STATUS", ""))
        pred.append(row['PREDICTION'] == "Spam")
        # Count filter-system triggers; the bayes triggers were counted above.
        for filter in row['TRIGGERED']:
            if 'bayes' not in filter:
                applied_filters[filter] = int(applied_filters.get(filter, 0)) + 1
        spamlist.append(row)
        if row['PREDICTION'] == "Spam" and ("ham" in row.get('SPAM', "")
                                            or row.get("STATUS", "") == "usable"):
            fplist.append(row)
        elif row['PREDICTION'] == "Ham" and ("spam" in row.get(
                'SPAM', "") or "unusable" in row.get("STATUS", "")):
            fnlist.append(row)
    cm = confusionMatrix.create_confusionmatrix(actual, pred)
    confusionMatrix.print_confusionmatrix(cm, True)
    description = "just filtersystem, Test enumeration fix with iui dataset"
    confusionMatrix.save_confusionmatrix(
        cm, variables.resultpath + "ConfusionMatrices.csv", applied_filters,
        description, filename)
    duration1 = time.time() - start1
    print("Duration1: ", duration1, "seconds")
    print(applied_filters)
    ###################### Save results ######################
    # importDataHelper.writecsvfile(variables.resultpath + 'IdeaDataSpam2.csv', spamlist[0].keys(), spamlist)
    if len(fplist) > 0:
        importDataHelper.writecsvfile(
            variables.filterresults + filename + '_fp.csv', fplist[0].keys(),
            fplist)
    if len(fnlist) > 0:
        importDataHelper.writecsvfile(
            variables.filterresults + filename + '_fn.csv', fnlist[0].keys(),
            fnlist)
    return None
def evaluate_filtersystem():
    """Evaluate every registered filter on every classified dataset.

    For each csv/xml file in ``variables.importpathclassified``, runs every
    filter from ``textDataFilter.textDataFilterList`` (count-based filters are
    swept over their parameter tables ``countmore`` / ``countless`` /
    ``countwords``) and every filter from
    ``textContentFilter.textContentFilterlist`` (tagger-based filters get the
    prepared taggers), collecting one stats row per run into
    ``FilterEvaluation.csv``.

    :returns: 1 when an unsupported file type is encountered, otherwise None.
    """
    def _record(resultlist, dataset, filt, variable, cm):
        # One result row per (dataset, filter, parameter) combination.
        result = {"Dataset": dataset, "Filter": filt.__name__,
                  "Variable": variable}
        if cm is not None:
            result.update(cm.stats())
        resultlist.append(result)

    resultlist = []
    unigram, st = prepare_tagger()
    for file in listdir(variables.importpathclassified):
        if not isfile(join(variables.importpathclassified, file)):
            continue
        if ".csv" in file:
            idealist = list(
                importDataHelper.readcsvdata(
                    join(variables.importpathclassified, file)))
        elif ".xml" in file:
            idealist = list(
                importDataHelper.readxmldata(
                    join(variables.importpathclassified, file)))
        else:
            print("Not able to read all files (just csv and xml are supported)")
            return 1
        for filt in textDataFilter.textDataFilterList:
            if "count" in str(filt):
                # Pick the parameter sweep matching the filter's name; a
                # count filter matching none of the names runs nothing, as
                # in the original branch structure.
                if "more" in filt.__name__:
                    counts = countmore
                elif "less" in filt.__name__:
                    counts = countless
                elif "word" in filt.__name__:
                    counts = countwords
                else:
                    counts = []
                for count in counts:
                    _record(resultlist, file, filt, count,
                            evaluate_filter(filt, idealist, count))
            else:
                _record(resultlist, file, filt, "None",
                        evaluate_filter(filt, idealist))
        for filt in textContentFilter.textContentFilterlist:
            if "unigram" in filt.__name__:
                _record(resultlist, file, filt, "UnigramTagger",
                        evaluate_filter(filt, idealist, unigram))
            elif "containsnames" in filt.__name__:
                _record(resultlist, file, filt, "StanfordNERTagger",
                        evaluate_filter(filt, idealist, st))
            else:
                _record(resultlist, file, filt, "None",
                        evaluate_filter(filt, idealist))
            print(filt.__name__)
    if resultlist:
        # Guard: indexing resultlist[0] with no results would raise.
        importDataHelper.writecsvfile(
            variables.resultpath + "FilterEvaluation.csv",
            resultlist[0].keys(), resultlist)
def evaluate_results():
    """Compute per-dataset confusion matrices for all classifier combinations.

    Loads cached classifier outputs via ``import_results()`` and, per dataset
    key, evaluates: each single classifier (bayes, complex bayes, linear
    classifier, filter system, USE), the averaged-probability ensembles at
    0.5 / 0.8 thresholds, and the or / at-least-two / at-least-three voting
    ensembles.  Stats rows are printed and collected, then written to
    ``extendNewResultDicts.csv``.

    Bug fixed vs. the previous version: the classifier counter ``y`` was
    initialized once before the per-idea loop, so ``probs[i] / y`` divided
    idea *i*'s probability sum by the cumulative count over all previous
    ideas, shrinking every average after the first toward zero.  ``y`` is now
    reset for every idea.
    """
    resultdict = import_results()
    safelist = []
    for key in resultdict.keys():
        print(key)
        print("Ideas: ", len(resultdict[key]["actual"]))
        print("Spam: ", resultdict[key]["actual"].count(True))
        print("Ham: ", resultdict[key]["actual"].count(False))
        # Per-classifier boolean predictions and raw probabilities.
        bayespred = [x >= 0.9 for x in resultdict[key]["bayes"]]
        bayesprob = list(resultdict[key]["bayes"])
        combayespred = [x >= 0.9 for x in resultdict[key]["complexbayes"]]
        combayesprob = list(resultdict[key]["complexbayes"])
        linclasspred = [x[0] == 1 for x in resultdict[key]["linCLassifier"]]
        linclassprob = [x[1] for x in resultdict[key]["linCLassifier"]]
        filterpred = [
            x == 1 for x in evaluationHelper.get_filter_results(
                resultdict[key]["Filter"])
        ]
        usepred = [x[0] == 1 for x in resultdict[key]["USE"]]
        useprob = [x[1] for x in resultdict[key]["USE"]]
        # Flags: which classifiers produced both classes and therefore take
        # part in the probability-averaging ensemble below.
        lin = False
        com = False
        use = False
        bay = False
        if True in bayespred and False in bayespred:
            cmbay = confusionMatrix.create_confusionmatrix(
                resultdict[key]["actual"], bayespred)
            safelist.append({"Data": key, "Filter": "Bayes", **cmbay.stats()})
            bay = True
            print("Bayes")
            print("Precision: ", cmbay.PPV)
            print("Recall: ", cmbay.TPR, "\n")
        if True in combayespred and False in combayespred:
            cmcombay = confusionMatrix.create_confusionmatrix(
                resultdict[key]["actual"], combayespred)
            safelist.append({
                "Data": key,
                "Filter": "complex Bayes",
                **cmcombay.stats()
            })
            com = True
            print("Complex Bayes")
            print("Precision: ", cmcombay.PPV)
            print("Recall: ", cmcombay.TPR, "\n")
        if True in linclasspred and False in linclasspred:
            cmlinclass = confusionMatrix.create_confusionmatrix(
                resultdict[key]["actual"], linclasspred)
            safelist.append({
                "Data": key,
                "Filter": "lin Classifier",
                **cmlinclass.stats()
            })
            lin = True
            print("lin Classifier")
            print("Precision: ", cmlinclass.PPV)
            print("Recall: ", cmlinclass.TPR, "\n")
        if True in filterpred and False in filterpred:
            # The filter system is reported but intentionally takes no part
            # in the averaged-probability ensemble (no flag is set).
            cmfilter = confusionMatrix.create_confusionmatrix(
                resultdict[key]["actual"], filterpred)
            safelist.append({
                "Data": key,
                "Filter": "Filtersystem",
                **cmfilter.stats()
            })
            print("Filtersystem")
            print("Precision: ", cmfilter.PPV)
            print("Recall: ", cmfilter.TPR, "\n")
        if True in usepred and False in usepred:
            cmuse = confusionMatrix.create_confusionmatrix(
                resultdict[key]["actual"], usepred)
            safelist.append({"Data": key, "Filter": "USE", **cmuse.stats()})
            use = True
            print("USE Classifier")
            print("Precision: ", cmuse.PPV)
            print("Recall: ", cmuse.TPR, "\n")
        probs = []       # averaged spam probability per idea
        classor = []     # any classifier says spam
        classtwo = []    # at least two classifiers say spam
        classthree = []  # at least three classifiers say spam
        countbayesdiff = 0
        for i in range(0, len(bayesprob)):
            classor.append(bayesprob[i] >= 0.9 or combayesprob[i] >= 0.9
                           or linclasspred[i] or usepred[i])
            classtwo.append(
                (bayesprob[i] >= 0.9 and
                 (combayesprob[i] >= 0.9 or linclasspred[i] or usepred[i]))
                or (combayesprob[i] >= 0.9 and
                    (linclasspred[i] or usepred[i]))
                or (linclasspred[i] and usepred[i]))
            classthree.append(
                (bayesprob[i] >= 0.9 and combayesprob[i] >= 0.9 and
                 (linclasspred[i] or usepred[i]))
                or (combayesprob[i] >= 0.9 and linclasspred[i] and usepred[i])
                or (bayesprob[i] >= 0.9 and linclasspred[i] and usepred[i]))
            probs.append(0.0)
            # FIX: reset the active-classifier count for every idea (it used
            # to accumulate across the loop, corrupting the average).
            y = 0
            if bay:
                probs[i] += bayesprob[i]
                y += 1
            if com:
                probs[i] += combayesprob[i]
                y += 1
            if lin:
                # Convert the classifier's confidence into a spam probability:
                # confidence in "ham" becomes (1 - prob).
                if linclasspred[i]:
                    probs[i] += linclassprob[i]
                else:
                    probs[i] += 1 - linclassprob[i]
                y += 1
            if use:
                if usepred[i]:
                    probs[i] += useprob[i]
                else:
                    probs[i] += 1 - useprob[i]
                y += 1
            if y > 0:
                probs[i] = probs[i] / y
            if bayesprob[i] >= 0.9 and combayesprob[i] < 0.9:
                countbayesdiff += 1
        print("Bayes difference: ", countbayesdiff, "\n\n")
        avglow = [x >= 0.5 for x in probs]
        avghigh = [x >= 0.8 for x in probs]
        if True in avglow and False in avglow:
            cmavglow = confusionMatrix.create_confusionmatrix(
                resultdict[key]["actual"], avglow)
            safelist.append({
                "Data": key,
                "Filter": "low avg",
                **cmavglow.stats()
            })
            print("low Average")
            print("Precision: ", cmavglow.PPV)
            print("Recall: ", cmavglow.TPR, "\n")
        if True in avghigh and False in avghigh:
            cmavghigh = confusionMatrix.create_confusionmatrix(
                resultdict[key]["actual"], avghigh)
            safelist.append({
                "Data": key,
                "Filter": "high avg",
                **cmavghigh.stats()
            })
            print("high Average")
            print("Precision: ", cmavghigh.PPV)
            print("Recall: ", cmavghigh.TPR, "\n")
        if True in classor and False in classor:
            cmor = confusionMatrix.create_confusionmatrix(
                resultdict[key]["actual"], classor)
            safelist.append({
                "Data": key,
                "Filter": "Or Classifiers",
                **cmor.stats()
            })
            print("Classifier or")
            print("Precision: ", cmor.PPV)
            print("Recall: ", cmor.TPR, "\n")
        if True in classtwo and False in classtwo:
            cmtwo = confusionMatrix.create_confusionmatrix(
                resultdict[key]["actual"], classtwo)
            safelist.append({
                "Data": key,
                "Filter": "Two Classifiers",
                **cmtwo.stats()
            })
            print("Two Classifier")
            print("Precision: ", cmtwo.PPV)
            print("Recall: ", cmtwo.TPR, "\n")
        if True in classthree and False in classthree:
            cmthree = confusionMatrix.create_confusionmatrix(
                resultdict[key]["actual"], classthree)
            safelist.append({
                "Data": key,
                "Filter": "Three Classifiers",
                **cmthree.stats()
            })
            print("Three Classifier")
            print("Precision: ", cmthree.PPV)
            print("Recall: ", cmthree.TPR, "\n")
    importDataHelper.writecsvfile(
        "Data/Results/Evaluation/extendNewResultDicts.csv",
        safelist[0].keys(), safelist)
def test():
    """Train all classifier variants and classify the unreviewed ideas.

    Loads the classified idea export, splits its unreviewed ideas into the
    bionicRadar / fabricDisplay challenges, trains bayes, complex bayes,
    linear and USE classifiers (per-challenge and mixed, with and without
    duplicates), classifies every unreviewed idea with all variants, and
    writes ``<challenge>unreviewed.csv`` result files.

    Bug fixed vs. the previous version: the ``allnull`` check compared
    ``ideadata[keytest] == 1`` AFTER the value had been wrapped in a list, so
    the comparison was always False, ``allnull`` stayed True for every idea
    and the linear-classifier predictions were unconditionally forced to the
    ``(0, 0)`` fallback.  The raw value is now tested before wrapping.
    """
    # idealist = list(importDataHelper.readxmldata(variables.importpathunclassified + 'IdeaData.xml'))
    idealist = list(
        importDataHelper.readcsvdata(variables.importpathclassified +
                                     "ideas-with-challenges.csv"))
    idealistchallenge = {"bionicRadar": [], "fabricDisplay": []}
    print(len(idealist))
    # Count unreviewed ideas per challenge while bucketing them.
    i = 0
    j = 0
    k = 0
    for idea in idealist:
        if idea["STATUS"] == "unreviewed":
            if "bionic" in idea["CHALLENGE"].lower():
                i += 1
                idealistchallenge["bionicRadar"].append(idea)
            elif "fabric" in idea["CHALLENGE"].lower():
                j += 1
                idealistchallenge["fabricDisplay"].append(idea)
            else:
                k += 1
    print("unreviewed bionic: ", i)
    print("unreviewed fabric: ", j)
    print("unreviewed others: ", k)
    # Training data: per-challenge databases, with and without duplicates.
    idealisttrainingschallenge = {}
    idealisttrainingschallenge["fabricDisplay"] = list(
        importDataHelper.readcsvdata(variables.ideadbpath +
                                     'fabricDisplay.csv'))
    idealisttrainingschallenge["bionicRadar"] = list(
        importDataHelper.readcsvdata(variables.ideadbpath + 'bionicRadar.csv'))
    idealisttrainingschallenge["TCO"] = list(
        importDataHelper.readcsvdata(variables.ideadbpath + 'TCO.csv'))
    idealisttrainingschallengewodups = {}
    idealisttrainingschallengewodups["fabricDisplay"] = list(
        importDataHelper.readcsvdata(variables.ideadbwithoutduppath +
                                     "fabricDisplay.csv"))
    idealisttrainingschallengewodups["bionicRadar"] = list(
        importDataHelper.readcsvdata(variables.ideadbwithoutduppath +
                                     "bionicRadar.csv"))
    idealisttrainingschallengewodups["TCO"] = list(
        importDataHelper.readcsvdata(variables.ideadbwithoutduppath +
                                     "TCO.csv"))
    # Cross-challenge ("mixed") training sets.
    idealistmixedtraining = idealisttrainingschallenge[
        "fabricDisplay"] + idealisttrainingschallenge[
            "bionicRadar"] + idealisttrainingschallenge["TCO"]
    idealistmixedtrainingwithoutdups = idealisttrainingschallengewodups[
        "fabricDisplay"] + idealisttrainingschallengewodups[
            "bionicRadar"] + idealisttrainingschallengewodups["TCO"]
    for key in idealistchallenge.keys():
        idealisttraining = idealisttrainingschallenge[key]
        idealisttrainingwithoutdups = list(
            importDataHelper.readcsvdata(variables.ideadbwithoutduppath + key +
                                         ".csv"))
        # idealistchallengewithoutdups = duplicateDetection.filterduplikates(idealistchallenge[key], variables.resultpath + "test3.csv", idealisttrainingwithoutdups)
        print("duplicate detection done")
        # --- single-word bayes: per-challenge + mixed, with/without dups ---
        bayes.trainbayes(idealisttraining, challenge=key, delete=True)
        bayes.trainbayes(idealisttrainingwithoutdups,
                         challenge=key,
                         delete=True,
                         duplicates=True)
        print("bayes training TCO complete")
        bayes.trainbayes(idealistmixedtraining, delete=True)
        bayes.trainbayes(idealistmixedtrainingwithoutdups,
                         delete=True,
                         duplicates=True)
        print("bayes training mixed complete")
        wordprobs = bayes.gettokenprobs(challenge=key)
        wordprobswithoutdups = bayes.gettokenprobs(challenge=key,
                                                   duplicates=True)
        wordprobsmixed = bayes.gettokenprobs()
        wordprobsmixedwithoutdups = bayes.gettokenprobs(duplicates=True)
        print("loaded probs")
        # --- 5-word ("complex") bayes: same four variants ---
        complexBayes.trainbayes(idealisttraining, challenge=key, delete=True)
        complexBayes.trainbayes(idealisttrainingwithoutdups,
                                challenge=key,
                                delete=True,
                                duplicates=True)
        print("complex bayes training TCO complete")
        complexBayes.trainbayes(idealistmixedtraining, delete=True)
        complexBayes.trainbayes(idealistmixedtrainingwithoutdups,
                                delete=True,
                                duplicates=True)
        print("complex bayes training mixed complete")
        comwordprobs = complexBayes.gettokenprobs(challenge=key)
        comwordprobswithoutdups = complexBayes.gettokenprobs(challenge=key,
                                                             duplicates=True)
        comwordprobsmixed = complexBayes.gettokenprobs()
        comwordprobsmixedwithoutdups = complexBayes.gettokenprobs(
            duplicates=True)
        print("loaded probs complex")
        # --- linear and USE classifiers, per-challenge and mixed ---
        linclass, lincoeff = linearClassifier.train_linear_classificator(key)
        print(lincoeff)
        linclassmixed, lincoeffmixed = linearClassifier.train_linear_classificator(
            "all")
        print(lincoeffmixed)
        useest = USEClassifier.train_classifier(key)
        useestmixed = USEClassifier.train_classifier("all")
        print("trained USE")
        unigram_tagger, st = spamFilter.prepare_tagger()
        i = 1
        for idea in idealistchallenge[key]:
            print(i)
            idea["TRIGGERED"] = [""]
            # classify with challenge bayes with duplicates
            bayesprob = bayes.classify(idea["DESCRIPTION"], wordprobs)
            # classify with challenge bayes without duplicates
            bayesprobdup = bayes.classify(idea["DESCRIPTION"],
                                          wordprobswithoutdups)
            # classify with mixed challenge bayes with duplicates
            bayesprobmixed = bayes.classify(idea["DESCRIPTION"],
                                            wordprobsmixed)
            # classify with mixed challenge bayes without duplicates
            bayesprobmixedwithoutdup = bayes.classify(
                idea["DESCRIPTION"], wordprobsmixedwithoutdups)
            combayesprob = complexBayes.classify(idea["DESCRIPTION"],
                                                 comwordprobs)
            # classify with challenge bayes without duplicates
            combayesprobdup = complexBayes.classify(idea["DESCRIPTION"],
                                                    comwordprobswithoutdups)
            # classify with mixed challenge bayes with duplicates
            combayesprobmixed = complexBayes.classify(idea["DESCRIPTION"],
                                                      comwordprobsmixed)
            # classify with mixed challenge bayes without duplicates
            combayesprobmixedwithoutdup = complexBayes.classify(
                idea["DESCRIPTION"], comwordprobsmixedwithoutdups)
            # classify with challenge USE:
            useclass, useclassprob = USEClassifier.classify(useest, idea)
            # classify with mixed challenge USE:
            usemixedclass, usemixedclassprob = USEClassifier.classify(
                useestmixed, idea)
            idea, ideadata = spamFilter.classify_and_get_idea(
                idea, unigram_tagger, st)
            # The linear classifier only runs when at least one feature
            # fired.  Test the RAW value before wrapping it in a list for
            # the classifier (the old code compared the list to 1, which is
            # never true).
            allnull = True
            for keytest in ideadata.keys():
                if ideadata[keytest] == 1:
                    allnull = False
                ideadata[keytest] = [ideadata[keytest]]
            if not allnull:
                linclasspred, linclassprob = linearClassifier.classify(
                    ideadata, linclass)
                linmixedclasspred, linmixedclassprob = linearClassifier.classify(
                    ideadata, linclassmixed)
            else:
                linclasspred, linclassprob = 0, 0
                linmixedclasspred, linmixedclassprob = 0, 0
            idea["PREDICTION"] = "Bayes: " + str(
                bayesprobdup) + ", complexBayes " + str(
                    combayesprobdup) + ", linClass: " + str(
                        linmixedclasspred) + " " + str(
                            linmixedclassprob) + ", USE: " + str(
                                useclass) + " " + str(useclassprob)
            idea["OTHERBayes"] = "BayesTCO: " + str(
                bayesprob) + ", BayesMixed " + str(
                    bayesprobmixed) + ", BayesMixed w/o dups " + str(
                        bayesprobmixedwithoutdup) + ", compl BayesTCO: " + str(
                            combayesprob) + ", compl BayesMixed: " + str(
                                combayesprobmixed
                            ) + ", compl BayesMixed w/o dups: " + str(
                                combayesprobmixedwithoutdup)
            idea["OTHERS"] = "Lin Class: " + str(linclasspred) + " " + str(
                linclassprob) + ", USE mixed: " + str(
                    usemixedclass) + " " + str(usemixedclassprob)
            i += 1
        importDataHelper.writecsvfile(
            variables.resultpath + key + "unreviewed.csv",
            idealistchallenge[key][0].keys(), idealistchallenge[key])