def run(dataset):
    """Benchmark bigram features for several ``inf_count`` settings.

    For every ``[bigram, inf_count]`` pair in the local ``array`` one CSV
    file per enabled back-end is opened in append mode, named
    ``bigram15-comb-<dataset>-<bigram><inf_count>-nlt.csv`` (or
    ``-skl.csv``), and stamped with the current date and ``COUNT``.
    Five rounds follow; each round requests a corpus with ``shuffle=True``,
    builds the features for every pair, calls ``cls.train`` and appends the
    returned result text to the matching file.

    The module-level flags ``NLTK`` and ``SKLEARN`` select which back-ends
    are trained and logged.

    Args:
        dataset: Dataset name; forwarded to ``crp.Corpora`` and embedded in
            the output file names.
    """
    COUNT = 40000
    # The first `cut` feature sets of each class are used for training, the
    # rest for testing (3/4 of COUNT / 2).
    cut = int((COUNT / 2) * 3 / 4)
    array = [[True, 7000], [True, 9000], [True, 11000], [True, 14000]]

    nlt = dict()
    skl = dict()

    try:
        # file
        for variable in array:
            var_name = str(variable[0]) + str(variable[1])
            header = (str(datetime.datetime.today()) + " COUNT= " + str(COUNT)
                      + "\n")

            if NLTK:
                nlt_file = ("bigram15-comb-" + dataset + "-" + var_name
                            + "-nlt.csv")
                nlt[var_name] = open(nlt_file, 'a')
                nlt[var_name].write(header)

            if SKLEARN:
                skl_file = ("bigram15-comb-" + dataset + "-" + var_name
                            + "-skl.csv")
                skl[var_name] = open(skl_file, 'a')
                skl[var_name].write(header)

        # cycle
        for x in range(0, 5):
            print(x)
            corpora = crp.Corpora(dataset, count=COUNT, shuffle=True)

            for variable in array:
                print(str(variable[1]))
                var_name = str(variable[0]) + str(variable[1])

                features = ftr.Features(corpora, total=COUNT,
                                        bigram=variable[0], bigram_count=15,
                                        inf_count=variable[1])

                posfeats = features.get_features_pos()
                negfeats = features.get_fearures_neg()

                trainfeats = negfeats[:cut] + posfeats[:cut]
                testfeats = negfeats[cut:] + posfeats[cut:]

                nlt_output, skl_output = cls.train(trainfeats, testfeats,
                                                   nlt=NLTK, skl=SKLEARN)

                # Flush after every result so partial runs are kept on disk.
                if NLTK:
                    print(str(nlt_output))
                    nlt[var_name].write(nlt_output)
                    nlt[var_name].flush()
                if SKLEARN:
                    skl[var_name].write(skl_output)
                    skl[var_name].flush()
    finally:
        # Close every handle that was opened, even when a round raised.
        for handle in list(nlt.values()) + list(skl.values()):
            handle.close()
def run(dataset):
    """Benchmark the classifiers for every corpus size listed in ``array``.

    Result files live in ``output/<dataset>/<type>/`` (created if missing),
    one per entry of the module-level ``array`` and per enabled back-end,
    opened in append mode and stamped with the current date.  Ten rounds
    follow; in each round every entry of ``array`` is used as the corpus
    size (``COUNT``), features are built with ``inf_count=-1``, and the text
    returned by ``cls.train`` is appended to the matching file.

    Relies on module-level names: ``array``, ``type`` (concatenated into
    the path, so it is expected to be a string that shadows the builtin),
    ``nltk_run`` and ``sklearn_run``.

    Args:
        dataset: Dataset name; forwarded to ``crp.Corpora`` and used in the
            output directory and file names.
    """
    out_dir = "output/" + dataset + "/" + type + "/"
    os.makedirs(out_dir, exist_ok=True)

    nlt = dict()
    skl = dict()

    try:
        # file
        for variable in array:
            var_name = str(variable)
            prefix = out_dir + dataset + "-" + type + "-" + var_name

            if nltk_run:
                nlt[var_name] = open(prefix + "-nlt.csv", 'a')
                nlt[var_name].write(str(datetime.datetime.today()) + "\n")

            if sklearn_run:
                skl[var_name] = open(prefix + "-skl.csv", 'a')
                skl[var_name].write(str(datetime.datetime.today()) + "\n")

        # cycle
        for x in range(0, 10):
            print(x)
            for variable in array:
                print(str(variable))
                var_name = str(variable)

                # Each entry of `array` is the corpus size for this run; the
                # first `cut` feature sets of each class train, the rest test.
                COUNT = variable
                cut = int((COUNT / 2) * 4 / 5)

                corpora = crp.Corpora(dataset, count=COUNT, shuffle=True)
                features = ftr.Features(corpora, total=COUNT, inf_count=-1)

                posfeats = features.get_features_pos()
                negfeats = features.get_fearures_neg()

                trainfeats = negfeats[:cut] + posfeats[:cut]
                testfeats = negfeats[cut:] + posfeats[cut:]

                try:
                    nlt_output, skl_output = cls.train(trainfeats, testfeats,
                                                       nlt=nltk_run,
                                                       skl=sklearn_run)
                except Exception as e:
                    # Best effort: report the failing configuration and carry
                    # on with the remaining ones.
                    print(e)
                    continue

                if nltk_run:
                    print(str(nlt_output))
                    nlt[var_name].write(nlt_output)
                    nlt[var_name].flush()
                if sklearn_run:
                    print(str(skl_output))
                    skl[var_name].write(skl_output)
                    skl[var_name].flush()
    finally:
        # Close every handle that was opened, even when a round raised.
        for handle in list(nlt.values()) + list(skl.values()):
            handle.close()
def run(dataset):
    """Sweep ``inf_count`` over the module-level ``array`` for ten rounds.

    Result files live in ``output/<dataset>/<type>/`` (created if missing),
    one per entry of ``array`` and per enabled back-end, opened in append
    mode and stamped with the current date.  Each round requests a corpus
    with ``shuffle=True`` and, for every entry of ``array``, builds features
    (bigrams, stop-word option, Porter stemming, lower-casing enabled via
    keyword arguments) with ``inf_count`` set to that entry before calling
    ``cls.train``.

    Relies on module-level names: ``array``, ``type`` (concatenated into
    the path, so it is expected to be a string that shadows the builtin),
    ``COUNT``, ``cut``, ``nltk_run`` and ``sklearn_run``.

    Args:
        dataset: Dataset name; forwarded to ``crp.Corpora`` and
            ``cls.train`` and used in the output directory and file names.
    """
    out_dir = "output/" + dataset + "/" + type + "/"
    os.makedirs(out_dir, exist_ok=True)

    nlt = dict()
    skl = dict()

    try:
        # file
        for variable in array:
            var_name = str(variable)
            prefix = out_dir + dataset + "-" + type + "-" + var_name

            if nltk_run:
                nlt[var_name] = open(prefix + "-nlt.csv", 'a')
                nlt[var_name].write(str(datetime.datetime.today()) + "\n")

            if sklearn_run:
                skl[var_name] = open(prefix + "-skl.csv", 'a')
                skl[var_name].write(str(datetime.datetime.today()) + "\n")

        # cycle
        for x in range(0, 10):
            print(x)
            corpora = crp.Corpora(dataset, count=COUNT, shuffle=True)

            for variable in array:
                print(str(variable))

                features = ftr.Features(corpora, total=COUNT, bigram=True,
                                        stop=True, stem="porter", lower=True,
                                        inf_count=variable)

                posfeats = features.get_features_pos()
                negfeats = features.get_fearures_neg()

                # The first `cut` feature sets of each class train, the rest
                # test.
                trainfeats = negfeats[:cut] + posfeats[:cut]
                testfeats = negfeats[cut:] + posfeats[cut:]

                # NOTE(review): the returned texts are not written to the
                # files opened above; presumably cls.train records results
                # itself when given `dataset` -- confirm.
                nlt_output, skl_output = cls.train(trainfeats, testfeats,
                                                   dataset, nlt=nltk_run,
                                                   skl=sklearn_run)
    finally:
        # Close every handle that was opened, even when a round raised.
        for handle in list(nlt.values()) + list(skl.values()):
            handle.close()
def run(dataset):
    """Compare part-of-speech filters for the NLTK classifier.

    Ten rounds are run.  Each round requests a corpus with ``shuffle=True``
    and, for each of four POS filters, builds features, trains through
    ``cls.train`` with ``skl=False`` and appends the returned NLTK result
    text to ``output/<dataset>/pos/<dataset>-pos-<label>-nlt.csv``.  The
    output directory is created if missing.

    Args:
        dataset: Dataset name; forwarded to ``crp.Corpora`` and used in the
            output directory and file names.
    """
    COUNT = 5000
    # The first `cut` feature sets of each class train, the rest test
    # (3/4 of COUNT / 2).
    cut = int((COUNT / 2) * 3 / 4)

    out_dir = "output/" + dataset + "/pos/"
    os.makedirs(out_dir, exist_ok=True)

    # (file label, value for the `pos` argument of ftr.Features); None is
    # passed through unchanged.
    pos_filters = (
        ("NONE", None),
        ("JVNR", ("J", "V", "N", "R")),
        ("EUJVNR", ("E", "U", "J", "V", "N", "R")),
        ("FEUPJVNR", ("F", "E", "U", "P", "J", "V", "N", "R")),
    )

    # cycle
    for x in range(0, 10):
        print(x)
        corpora = crp.Corpora(dataset, count=COUNT, shuffle=True)

        for label, tags in pos_filters:
            nlt_file = out_dir + dataset + "-" + "pos" + "-" + label + "-nlt.csv"

            # Hand over a fresh list each time so the callee can never alter
            # the shared table.
            pos = None if tags is None else list(tags)
            features = ftr.Features(corpora, total=COUNT, pos=pos)

            posfeats = features.get_features_pos()
            negfeats = features.get_fearures_neg()

            trainfeats = negfeats[:cut] + posfeats[:cut]
            testfeats = negfeats[cut:] + posfeats[cut:]

            nlt_output, _skl_output = cls.train(trainfeats, testfeats,
                                                skl=False)

            # Open, append and close per result: the data reaches the disk
            # after every run and no handle outlives the write.
            with open(nlt_file, 'a') as handle:
                handle.write(nlt_output)
            print(str(nlt_output))
def run_test(dataset, type, iter=10, count=5000, shuffle=False, nltk_run=True,
             sklearn_run=True, inf_count=-1, bigram_count=50, pos=None,
             stop=False, stem="none", bigram=False, lower=True):
    """Run a parameterised classification experiment and log the results.

    One CSV file per entry of the module-level ``array`` and per enabled
    back-end is opened in append mode under ``output/<dataset>/`` (created
    if missing) and stamped with the current date.  ``iter`` rounds follow;
    each round builds a corpus, then for every entry of ``array`` builds
    features, calls ``cls.train`` and appends the returned result text to
    the matching file.

    Args:
        dataset: Dataset name; forwarded to ``crp.Corpora`` and used in the
            output path.
        type: Experiment label embedded in the output file names.
        iter: Number of rounds.
        count: Corpus size; also determines the train/test split point.
        shuffle: Forwarded positionally to ``crp.Corpora``.
        nltk_run: Train and log the NLTK back-end.
        sklearn_run: Train and log the scikit-learn back-end.
        inf_count, bigram_count, pos, stop, stem, bigram, lower: Forwarded
            positionally, in that order, to ``ftr.Features``.
    """
    import os  # local import: only needed to create the output directory

    # The first `cut` feature sets of each class train, the rest test
    # (3/4 of count / 2).
    cut = int((count / 2) * 3 / 4)

    out_dir = "output/" + dataset + "/"
    os.makedirs(out_dir, exist_ok=True)

    nlt = dict()
    skl = dict()

    try:
        # file
        for variable in array:
            var_name = str(variable)
            prefix = out_dir + dataset + "-" + type + "-" + var_name

            if nltk_run:
                nlt[var_name] = open(prefix + "-nlt.csv", 'a')
                nlt[var_name].write(str(datetime.datetime.today()) + "\n")

            if sklearn_run:
                skl[var_name] = open(prefix + "-skl.csv", 'a')
                skl[var_name].write(str(datetime.datetime.today()) + "\n")

        # cycle
        for x in range(0, iter):
            print(x)
            corpora = crp.Corpora(dataset, count, shuffle)

            for variable in array:
                # Use the same key as when the files were opened; the
                # previous `str(variable[0]) + str(variable[1])` did not
                # match the `str(variable)` keys.
                var_name = str(variable)
                print(var_name)

                # NOTE(review): the features do not depend on `variable`;
                # every entry of `array` trains on the same settings.
                features = ftr.Features(corpora, count, inf_count,
                                        bigram_count, pos, stop, stem, bigram,
                                        lower)

                posfeats = features.get_features_pos()
                negfeats = features.get_fearures_neg()

                trainfeats = negfeats[:cut] + posfeats[:cut]
                testfeats = negfeats[cut:] + posfeats[cut:]

                nlt_output, skl_output = cls.train(trainfeats, testfeats,
                                                   nlt=nltk_run,
                                                   skl=sklearn_run)

                if nltk_run:
                    print(str(nlt_output))
                    nlt[var_name].write(nlt_output)
                    nlt[var_name].flush()
                if sklearn_run:
                    # Print and flush the sklearn result and file (the NLTK
                    # ones were used here before).
                    print(str(skl_output))
                    skl[var_name].write(skl_output)
                    skl[var_name].flush()
    finally:
        # Close every handle that was opened, even when a round raised.
        for handle in list(nlt.values()) + list(skl.values()):
            handle.close()


# dataset_array = ["stwits"]
#
# for dataset in dataset_array:
#     run(dataset)
sklearn_output = "skl, " + str(mnb) + ", " + str(bnb) + ", " + str( lr) + ", " + str(lsvc) + ", " + str(nsvc) + ", " + str(voted) + "\n" return (nltk_output, sklearn_output) if __name__ == '__main__': COUNT = 5000 cut = int((COUNT / 2) * 4 / 5) corpora = crp.Corpora("stwits", count=COUNT, shuffle=True) features = ftr.Features(corpora, total=COUNT, stem="porter", bigram=True, stop=True, inf_count=-1, lower=True) # features = ftr.Features(corpora, total=COUNT, bigram=True, stem="porter") posfeats = features.get_features_pos() negfeats = features.get_fearures_neg() trainfeats = negfeats[:cut] + posfeats[:cut] testfeats = negfeats[cut:] + posfeats[cut:] print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))) nlt, skl = train(trainfeats, testfeats, skl=False, most=50) print(nlt, skl)
neg_prec) + ", " + str(pos_rec) + ", " + str(neg_rec) + "\n" sklearn_output = "skl, " + str(mnb) + ", " + str(bnb) + ", " + str( lr) + ", " + str(lsvc) + ", " + str(nsvc) + ", " + str(voted) + "\n" return (nltk_output, sklearn_output) if __name__ == '__main__': COUNT = 25000 cut = int((COUNT / 2) * 4 / 5) corpora = crp.Corpora("stanford", count=COUNT, shuffle=True) features = ftr.Features(corpora, total=COUNT, bigram=True, stem="porter", stop=False, lower=True, inf_count=5000) features_f = open("pickled/features.pickle", "wb") pickle.dump(features, features_f) features_f.close() posfeats = features.get_features_pos() negfeats = features.get_fearures_neg() trainfeats = negfeats[:cut] + posfeats[:cut] testfeats = negfeats[cut:] + posfeats[cut:] print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))