# Imports assumed by the functions in this module; importDataHelper, variables,
# spamFilter, duplicateDetection, bayes, complexBayes, linearClassifier and
# USEClassifier are project-local modules.
import os
from os import listdir
from os.path import isfile, join

import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split


def train_linear_classificator(challenge, new=False):
    """Train a ridge classifier on the spam-filter features of a challenge.

    With new=True the features are extracted from the idea DB and cached as a
    training CSV; otherwise the cached CSV (or, for challenge == "all", every
    cached CSV) is loaded.
    """
    if new:
        unigram_tagger, st = spamFilter.prepare_tagger()
        idealist = list(importDataHelper.readcsvdata(variables.ideadbpath + challenge + '.csv'))
        featurelist = {}
        for idea in idealist:
            idea['TRIGGERED'] = []
            idea['PREDICTION'] = "Ham"
            idea, ideafeatures = spamFilter.classify_and_get_idea(idea, unigram_tagger, st)
            # Label: an unusable status or an explicit spam mark counts as spam.
            if "unusable" in idea["STATUS"] or 'spam' in idea.get("SPAM", ""):
                ideafeatures["Spam"] = 1
            else:
                ideafeatures["Spam"] = 0
            for key in ideafeatures.keys():
                featurelist[key] = featurelist.get(key, [])
                featurelist[key].append(ideafeatures[key])
    else:
        if challenge == "all":
            idealist = []
            for file in listdir(variables.linclasstrainingsdatapath):
                if isfile(join(variables.linclasstrainingsdatapath, file)):
                    idealist += list(importDataHelper.readcsvdata(join(variables.linclasstrainingsdatapath, file)))
        else:
            idealist = list(importDataHelper.readcsvdata(variables.linclasstrainingsdatapath + challenge + ".csv"))
        # The cached CSV stores each feature column as a stringified list.
        featurelist = {}
        for key in idealist[0].keys():
            featurelist[key] = [int(x) for x in idealist[0][key].replace('[', '').replace(']', '').split(',')]
    testdata = pd.DataFrame(featurelist)
    X = testdata.drop('Spam', axis=1)
    y = testdata['Spam']
    importDataHelper.writecsvfiledict(variables.linclasstrainingsdatapath + challenge + ".csv",
                                      featurelist.keys(), featurelist)
    clf = RidgeClassifier().fit(X, y)
    print(clf.score(X, y))
    return clf, clf.coef_
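# Usage sketch (hedged): "bionicRadar" is one of the challenges used in
# eval_all() below. The first call extracts and caches features; later calls
# can reuse the cached CSVs under variables.linclasstrainingsdatapath.
#
#   clf, coefficients = train_linear_classificator("bionicRadar", new=True)
#   clf, coefficients = train_linear_classificator("bionicRadar")  # cached run
#   clf, coefficients = train_linear_classificator("all")          # all cached challenges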
def eval_all():
    """Evaluate all classifiers per challenge and across challenges, with and without duplicates."""
    challengedict = {
        "TCO": list(importDataHelper.readcsvdata("Data/DBs/ideaDB/TCO.csv")),
        "bionicRadar": list(importDataHelper.readcsvdata("Data/DBs/ideaDB/bionicRadar.csv")),
        "fabricDisplay": list(importDataHelper.readcsvdata("Data/DBs/ideaDB/fabricDisplay.csv"))
    }
    dupdict = {}
    for key in challengedict.keys():
        # Cross-challenge training pool: all ideas from the other challenges.
        idealist = []
        for key2 in challengedict.keys():
            if key2 != key:  # string equality, not identity
                idealist += challengedict[key2].copy()
        X_train, X_test = train_test_split(challengedict[key], test_size=0.33)
        idealist += X_train.copy()
        X_ndtrain = duplicateDetection.filterduplikates(X_train, variables.resultpath + "eval2" + key + ".csv")
        dupdict[key] = len(X_train) - len(X_ndtrain)
        X_ndtest = X_test.copy()
        idealist_test = X_test.copy()
        idealist_nodups_test = X_test.copy()
        # Evaluation on challenge-only training data.
        results = evaluate_system(X_train, X_test, key)
        importDataHelper.writecsvfiledict("Data/ResultsAllNew/evaluation" + key + ".csv",
                                          results.keys(), results)
        print("Done first set")
        results2 = evaluate_system(X_ndtrain, X_ndtest, key, dups=True)
        importDataHelper.writecsvfiledict("Data/ResultsAllNew/evaluationResultsNoDups" + key + ".csv",
                                          results2.keys(), results2)
        print("Challenge training done", key)
        # Evaluation on the combined cross-challenge training data.
        idealist_nodups = duplicateDetection.filterduplikates(idealist, variables.resultpath + "eval" + key + ".csv")
        dupdict[key + " All"] = len(idealist) - len(idealist_nodups)
        results = evaluate_system(idealist, idealist_test)
        importDataHelper.writecsvfiledict("Data/ResultsAllNew/evaluationAll" + key + ".csv",
                                          results.keys(), results)
        print("Done second set")
        results2 = evaluate_system(idealist_nodups, idealist_nodups_test, dups=True)
        importDataHelper.writecsvfiledict("Data/ResultsAllNew/evaluationResultsNoDupsAll" + key + ".csv",
                                          results2.keys(), results2)
        print("All training done", key)
    print(dupdict)
    importDataHelper.writecsvfiledict("Data/ResultsAllNew/dupNums.csv", dupdict.keys(), dupdict)
def extend_noun_corpus():
    """Add all nouns found in the imported ideas to the noun corpus as 'unclassified'."""
    idealist = list(importDataHelper.readcsvdata(variables.importpathclassified + 'cscw19-unapproved-ideas_import.csv'))
    nouncorpus = read_noun_corpus()
    # prepare_tagger() returns (unigram_tagger, stanford_tagger); only the
    # unigram tagger is needed for noun extraction.
    unigram_tagger, st = prepare_tagger()
    for idea in idealist:
        nouns = get_Nouns(idea['DESCRIPTION'], unigram_tagger)
        for noun in nouns:
            if noun not in nouncorpus:
                nouncorpus[noun] = "unclassified"
    importDataHelper.writecsvfiledict(variables.dbpath + 'NLPdata/NounDB.csv',
                                      nouncorpus.keys(), nouncorpus)
def evaluate_fun():
    """Run the evaluation once for the bionicRadar challenge."""
    idealist = list(importDataHelper.readcsvdata("Data/DBs/ideaDB/bionicRadar.csv"))
    X_train, X_test = train_test_split(idealist, test_size=0.33)
    X_ndtrain = duplicateDetection.filterduplikates(X_train, variables.resultpath + "evalbionicRadar.csv")
    X_ndtest = X_test.copy()
    results = evaluate_system(X_train, X_test, "bionicRadar")
    importDataHelper.writecsvfiledict("Data/ResultsNew/evaluationResultsbionicRadar.csv",
                                      results.keys(), results)
    print("Done first set")
    results2 = evaluate_system(X_ndtrain, X_ndtest, "bionicRadar", dups=True)
    importDataHelper.writecsvfiledict("Data/ResultsNew/evaluationResultsNoDupsbionicRadar.csv",
                                      results2.keys(), results2)
    print("Done")
def classify_noun_corpus():
    """Interactively label unclassified nouns as concrete ('C') or abstract ('A')."""
    nouncorpus = read_noun_corpus()
    print("Enter 'y' if the noun is concrete, 'n' if it is abstract, "
          "'skip' to skip the word or 'stop' to save results and stop classifying")
    for noun in nouncorpus.keys():
        if "unclassified" in nouncorpus[noun]:
            answer = input(noun + ": ")
            if 'y' in answer:
                nouncorpus[noun] = "C"
            elif 'n' in answer:
                nouncorpus[noun] = "A"
            elif 'skip' in answer:
                pass
            else:  # any other input (e.g. 'stop') stops classifying
                break
    # Write results both when stopping early and when the whole corpus
    # has been classified, so finished sessions are not lost.
    importDataHelper.writecsvfiledict(variables.dbpath + 'NLPdata/NounDB.csv',
                                      nouncorpus.keys(), nouncorpus)
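# Typical noun-corpus workflow: extend_noun_corpus() collects new nouns as
# "unclassified", classify_noun_corpus() then labels them interactively.
# Illustrative session (the nouns and answers are hypothetical):
#
#   >>> extend_noun_corpus()
#   >>> classify_noun_corpus()
#   Enter 'y' if the noun is concrete, 'n' if it is abstract, ...
#   radar: y      -> stored as "C" (concrete)
#   freedom: n    -> stored as "A" (abstract)
#   stop          -> saves NounDB.csv and exits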
def trainbayes(idealist, challenge=None, delete=False, duplicates=False):
    """Update (or, with delete=True, rebuild) the Bayes token counts and probabilities."""
    if delete:
        bayesspamwords = {}
        bayeshamwords = {}
    else:
        bayesspamwords = getspamtokens(challenge, duplicates)
        bayeshamwords = gethamtokens(challenge, duplicates)
    nspam = int(bayesspamwords.pop("<IdeaCount>", 0))
    nham = int(bayeshamwords.pop("<IdeaCount>", 0))
    for idea in idealist:
        status = idea.get("STATUS", "")
        if status == "unusable":
            bayesspamwords = updatedb(idea['DESCRIPTION'], bayesspamwords)
            nspam += 1
        elif status == "usable":
            nham += 1
            bayeshamwords = updatedb(idea['DESCRIPTION'], bayeshamwords)
        elif status == "unreviewed" and "spam" in idea.get('SPAM', ""):
            bayesspamwords = updatedb(idea['DESCRIPTION'], bayesspamwords)
            nspam += 1
        elif status == "unreviewed" and "ham" in idea.get('SPAM', ""):
            nham += 1
            bayeshamwords = updatedb(idea['DESCRIPTION'], bayeshamwords)
        # Ideas without a matching STATUS/SPAM combination are skipped.
    bayesspamwords["<IdeaCount>"] = nspam
    bayeshamwords["<IdeaCount>"] = nham
    # Resolve the output directory and file-name prefix once instead of
    # duplicating the three write calls in every branch.
    if challenge is None:
        basepath = variables.complexbayesmixedpath
    else:
        basepath = variables.complexbayeschallengebasedpath + challenge + '/'
        if not os.path.exists(basepath):
            try:
                os.mkdir(basepath)
            except OSError:
                print("Path for challenge does not exist and could not be created")
    prefix = 'duplicateBayes' if duplicates else 'bayes'
    importDataHelper.writecsvfiledict(basepath + prefix + 'SpamToken.csv',
                                      bayesspamwords.keys(), bayesspamwords)
    importDataHelper.writecsvfiledict(basepath + prefix + 'HamToken.csv',
                                      bayeshamwords.keys(), bayeshamwords)
    probslist = calculateprobs(bayesspamwords, bayeshamwords, nspam, nham)
    importDataHelper.writecsvfiledict(basepath + prefix + 'TokenProbs.csv',
                                      probslist.keys(), probslist)
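# Usage sketch for the Bayes pipeline, mirroring the calls evaluate_system()
# makes below; gettokenprobs() and classify() are the companion functions of
# this module, and the idea list and challenge name are placeholders:
#
#   trainbayes(idealist, challenge="bionicRadar", delete=True)
#   wordprobs = gettokenprobs(challenge="bionicRadar")
#   prediction = classify(idea["DESCRIPTION"], wordprobs)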
def evaluate_system(X_train, X_test, challenge=None, dups=False):
    """Train all four classifiers on X_train and collect their predictions on X_test."""
    unigram_tagger, st = spamFilter.prepare_tagger()
    features = {}
    data = {"DESCRIPTION": [], "Spam": []}
    for idea in X_train:
        idea, feature = spamFilter.classify_and_get_idea(idea, unigram_tagger, st)
        data["DESCRIPTION"].append(idea["DESCRIPTION"])
        for key in feature.keys():
            features[key] = features.get(key, [])
            features[key].append(feature[key])
        # Label: a reviewed STATUS wins; unreviewed ideas fall back to the SPAM mark.
        if idea["STATUS"] == "unusable":
            label = 1
        elif idea["STATUS"] == "usable":
            label = 0
        elif "spam" in idea["SPAM"]:
            label = 1
        else:
            label = 0
        features["Spam"] = features.get("Spam", [])
        features["Spam"].append(label)
        data["Spam"].append(label)
    print("prepared")
    bayes.trainbayes(X_train, challenge=challenge, delete=True, duplicates=dups)
    complexBayes.trainbayes(X_train, challenge=challenge, delete=True, duplicates=dups)
    wordprobs = bayes.gettokenprobs(challenge=challenge, duplicates=dups)
    comwordprobs = complexBayes.gettokenprobs(challenge=challenge, duplicates=dups)
    linClass, coeff = linearClassifier.train_linear_classifier(features)
    useest = USEClassifier.train_classifier_idealist(pd.DataFrame(data))
    print(coeff)
    actual = []
    predbay = []
    predcombay = []
    predUSE = []
    predLin = []
    features = {}
    for idea in X_test:
        if idea["STATUS"] != "unreviewed":
            actual.append(idea["STATUS"] == "unusable")
        else:
            actual.append("spam" in idea["SPAM"])
        predbay.append(bayes.classify(idea["DESCRIPTION"], wordprobs))
        predcombay.append(complexBayes.classify(idea["DESCRIPTION"], comwordprobs))
        if idea["DESCRIPTION"] == "" or idea["DESCRIPTION"] == []:
            predUSE.append((0, 0.0))
        else:
            predUSE.append(USEClassifier.classify(useest, {"DESCRIPTION": idea["DESCRIPTION"]}))
        idea["TRIGGERED"] = idea.get("TRIGGERED", [])
        idea, ideadata = spamFilter.classify_and_get_idea(idea, unigram_tagger, st)
        test = False
        for ideakey in ideadata.keys():
            features[ideakey] = features.get(ideakey, [])
            if ideadata[ideakey] == 1:
                features[ideakey].append(1)
                test = True
            else:
                features[ideakey].append(0)
            # Wrap the scalar so the single-row dict can feed pd.DataFrame.
            ideadata[ideakey] = [ideadata[ideakey]]
        # Only run the linear classifier if at least one feature triggered.
        if test:
            predLin.append(linearClassifier.classify(pd.DataFrame(ideadata), linClass))
        else:
            predLin.append((0, 0.0))
    results = {
        "actual": actual,
        "bayes": predbay,
        "complexbayes": predcombay,
        "USE": predUSE,
        "linClassifier": predLin,
        "linClassCo": coeff,
        "Filter": features
    }
    importDataHelper.writecsvfiledict(variables.resultpath + "evaluationResults.csv",
                                      results.keys(), results)
    return results
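# Consumption sketch (hedged): evaluate_system() returns parallel per-idea
# lists, so each classifier's predictions line up index-by-index with
# results["actual"]; e.g. to eyeball the first few predictions:
#
#   results = evaluate_system(X_train, X_test, "bionicRadar")
#   for name in ("bayes", "complexbayes", "USE", "linClassifier"):
#       print(name, results[name][:5], "vs actual", results["actual"][:5])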