def SVM_Final_DataProcess(data):
    # Preprocess the input data with the shared dictionary file
    data = prep.preprocess("dict.txt", str(data), 1, 1)
    # Generate labels and feature vectors, then convert the vectors to LIBSVM format
    (data_res, data_vec) = data.vector_generator()
    data_vec_libsvm = Libsvm_format_generator(data_vec)
    return (data_res, data_vec_libsvm)
def SVM_DataProcess(dictionary, pos_data, pos_label, pos_num,
                    neg_data, neg_label, neg_num,
                    neu_data, neu_label, neu_num):
    # Preprocess each polarity class with the shared dictionary
    pos = prep.preprocess(str(dictionary), str(pos_data), pos_num, pos_label)
    neg = prep.preprocess(str(dictionary), str(neg_data), neg_num, neg_label)
    neu = prep.preprocess(str(dictionary), str(neu_data), neu_num, neu_label)
    # Generate (label, feature-vector) pairs for each class
    (pos_res, pos_vec) = pos.vector_generator()
    (neg_res, neg_vec) = neg.vector_generator()
    (neu_res, neu_vec) = neu.vector_generator()
    # Concatenate the classes and convert the vectors to LIBSVM format
    train_result = pos_res + neg_res + neu_res
    train_vec = pos_vec + neg_vec + neu_vec
    train_vec_libsvm = Libsvm_format_generator(train_vec)
    print("Preprocess Finished")
    return (train_result, train_vec_libsvm)
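# Hedged sketch, not from the original source: Libsvm_format_generator is called above
# but not shown here, so this is one plausible implementation. It assumes each feature
# vector is a list of numeric values and that the expected output is the sparse LIBSVM
# text form "index:value" with 1-based indices and zero-valued features omitted.
def Libsvm_format_generator_sketch(vectors):
    formatted = []
    for vec in vectors:
        # LIBSVM format is sparse, so keep only non-zero features
        entries = ["%d:%g" % (i + 1, v) for i, v in enumerate(vec) if v != 0]
        formatted.append(" ".join(entries))
    return formatted

# Example: Libsvm_format_generator_sketch([[0, 2.5, 1]]) returns ["2:2.5 3:1"]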
import os
import pickle

import analyze_competition as a
import prep as p


def create_mhs(dir):
    # Collect a (path, model tag, mode) tuple for every pickled model found under dir
    mhs = []
    for root, dirs, files in os.walk(dir):
        for file in files:
            file_name = root + "/" + file
            model = file.split("pickle")[1]
            mhs.append((file_name, model, 'a'))
    return mhs


if __name__ == "__main__":
    preprocess = p.preprocess()
    analyze = a.analysis()
    meta_mhs = []
    # Map internal model tags to the display names used in the results table
    name_dict = {"pos_plus": "POS/NEG Max", "pos_plus_big": "POS/NEG Max",
                 "pos_minus_big": "POS/NEG Min", "pos_minus": "POS/NEG Min",
                 "squared_minus_big": "Squared Min", "squared_plus_big": "Squared Max",
                 "squared_minus": "Squared Min", "squared_plus": "Squared Max",
                 "regular": "SVM", "L1": "L1 regularization",
                 "doubly": "Doubly regularization", "minmax": "Min Max",
                 "maxmin": "Max Min", "test": "L1"}
    reverse = {name_dict[key]: key for key in name_dict}
    dirs = ["test"]
    for dir in dirs:
        meta_mhs.append(create_mhs(dir))
    meta_model_objects = []
    for mhs in meta_mhs:
        meta_model_objects.append(preprocess.load_model_handlers(mhs))
    cd = preprocess.extract_features_by_epoch("features_asr_modified")
    analyze.create_table(meta_model_objects, cd, name_dict)
    # analyze.get_metrices_for_table(meta_model_objects, cd)
    # analyze.append_data_retrieval(pickle.load(open("maxmin.pickle", "rb")),
    #                               pickle.load(open("minmax.pickle", "rb")), reverse)
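# Hedged demo with hypothetical file names (not taken from the repository): shows how
# create_mhs derives the model tag from a pickle file name via file.split("pickle")[1].
def _demo_model_tag(file):
    return file.split("pickle")[1]

assert _demo_model_tag("svm.pickle_maxmin") == "_maxmin"
assert _demo_model_tag("svm.pickle_minmax") == "_minmax"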
import argparse

from training_config import *
from keras_conf import *
from prep import preprocess
from keras_lb import keras_train

parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tags', nargs='+', help='tags list', required=True)
parser.add_argument('-o', '--object', help='tag for model object')
parser.add_argument('-b', '--balance', action='store_true', help='balance multiple dumps')
parser.add_argument('-d', '--dump', action='store_true')
args = parser.parse_args()

tags = args.tags
model_tag = args.object

# Preprocess the tagged datasets, then train the Keras model on the resulting dump
preprocess(tags, balance=args.balance, out_tag=model_tag)
keras_train(model_tag=model_tag, dump=args.dump)
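# Hedged usage example: the script name and tag values below are hypothetical; the line
# only illustrates the argparse interface defined above.
#
#   python train_keras.py -t pos neg neu -o sentiment_model -b -d
#
# -t/--tags takes one or more dataset tags, -o/--object names the model object,
# -b/--balance balances multiple dumps, and -d/--dump is forwarded to keras_train
# as dump=args.dump.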
## Skip anything that is not a text file
if 'txt' not in path:
    continue
## Windows correction
path = path.replace("\\", "/")
## Trim path to match database key
filename = path[path.rfind('/') + 1:]
print("Counting ngrams for " + filename)
## Read the raw file text (preprocessing is applied during tokenization below)
with open(path, encoding='utf-8') as file:
    fulltext = file.read()
## Tokenize the preprocessed text, then build n-grams to check against the term sets
tokens = word_tokenize(preprocess(fulltext.lower(), names=False))
tokens = [t.lower() for t in tokens if t.isalpha()]  ## Filter punctuation
phrases = list(ngrams(tokens, gramsize))
toSkip = 0
for p in phrases:
    ## Skip counter. Set to gramsize - 1
    if toSkip > 0:
        toSkip -= 1
        continue
    ## Check to see if a base term is present. Skip if none are found.
    bfound = False
    for word in p:
        if word in baseTerms:
            bfound = True
            break
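# Hedged sketch of the assumed continuation (the original loop body is cut off above):
# a common pattern is to test the remaining words against the search-term set once a
# base term is found, count the hit, and set toSkip = gramsize - 1 so overlapping
# n-grams are not double-counted. The helper below is self-contained and only
# illustrates that skip-counter idea; baseTerms and searchTerms are assumed to be sets.
from nltk.util import ngrams


def count_nonoverlapping_matches(tokens, baseTerms, searchTerms, gramsize):
    count, toSkip = 0, 0
    for p in ngrams(tokens, gramsize):
        if toSkip > 0:
            toSkip -= 1
            continue
        # Count the phrase if it contains both a base term and a search term
        if any(w in baseTerms for w in p) and any(w in searchTerms for w in p):
            count += 1
            toSkip = gramsize - 1  # skip the n-grams that overlap this match
    return count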
filelist = glob("**", recursive=True)
results = dict()
for path in filelist:
    counter = 0
    ## Skip anything that is not a text file
    if 'txt' not in path:
        continue
    ## Windows correction
    path = path.replace("\\", "/")
    ## Trim path to match database key
    filename = path[path.rfind('/') + 1:]
    print("Counting tokens for " + filename)
    ## Read file via preprocessing script
    with open(path, encoding='utf-8') as file:
        fulltext = preprocess(file.read())
    ## Tokenize and check each token against search entry set
    tokens = word_tokenize(preprocess(fulltext.lower()))
    for t in tokens:
        if t in searchTerms:
            counter += 1
    ## DEBUG
    if counter > 0:
        print("FOUND: " + str(counter))
    ## Store count in results data object
    results[filename] = counter
print("\nCounting complete. Writing to disk.")
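# Hedged sketch: the "Writing to disk" step is announced above but the write-out code is
# not shown, so this is one plausible way to persist the results dict; the output file
# name token_counts.csv is an assumption.
import csv

with open("token_counts.csv", "w", newline="", encoding="utf-8") as out:
    writer = csv.writer(out)
    writer.writerow(["filename", "count"])
    for filename, count in sorted(results.items()):
        writer.writerow([filename, count])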