def SVM_Final_DataProcess(data):
    # Preprocess the raw input with the shared dictionary, then convert the
    # resulting feature vectors to libsvm format.
    data = prep.preprocess("dict.txt", str(data), 1, 1)
    (data_res, data_vec) = data.vector_generator()

    data_vec_libsvm = Libsvm_format_generator(data_vec)

    return (data_res, data_vec_libsvm)
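Libsvm_format_generator is not defined anywhere in these examples, so the sketch below is only a guess at its role: turning dense feature vectors into the sparse {index: value} mappings that libsvm's Python bindings accept. The function body and the dense-list input are assumptions, not the original implementation.

# A minimal sketch, assuming Libsvm_format_generator converts dense vectors
# into 1-based sparse index -> value dicts; NOT the original implementation.
def Libsvm_format_generator(vectors):
    libsvm_vectors = []
    for vec in vectors:
        # libsvm feature indices start at 1; zero-valued entries are dropped
        libsvm_vectors.append({i + 1: v for i, v in enumerate(vec) if v != 0})
    return libsvm_vectors

# Example: two dense vectors become sparse libsvm-style dicts
print(Libsvm_format_generator([[0.0, 2.5, 0.0], [1.0, 0.0, 3.0]]))
# [{2: 2.5}, {1: 1.0, 3: 3.0}]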
Example #2
def SVM_DataProcess(dictionary, pos_data, pos_label, pos_num, neg_data,
                    neg_label, neg_num, neu_data, neu_label, neu_num):
    # Preprocess the positive, negative and neutral samples with the same dictionary
    pos = prep.preprocess(str(dictionary), str(pos_data), pos_num, pos_label)
    neg = prep.preprocess(str(dictionary), str(neg_data), neg_num, neg_label)
    neu = prep.preprocess(str(dictionary), str(neu_data), neu_num, neu_label)

    (pos_res, pos_vec) = pos.vector_generator()
    (neg_res, neg_vec) = neg.vector_generator()
    (neu_res, neu_vec) = neu.vector_generator()
    # Concatenate the three classes into one training set
    train_result = pos_res + neg_res + neu_res
    train_vec = pos_vec + neg_vec + neu_vec

    train_vec_libsvm = Libsvm_format_generator(train_vec)

    print("Preprocess Finished")
    return (train_result, train_vec_libsvm)
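Assuming train_vec_libsvm is a list of sparse {index: value} dicts (as in the sketch above), the pair returned by SVM_DataProcess could be fed straight into the libsvm Python bindings. Every file name, label value and parameter string below is a placeholder, not taken from the original project.

# Hedged usage sketch, not from the original repo: train a linear C-SVC on
# the output of SVM_DataProcess. Data files, labels and sample counts are
# invented; the import path depends on how the libsvm bindings are installed
# (classic builds expose "from svmutil import ..." instead).
from libsvm.svmutil import svm_train, svm_predict

labels, features = SVM_DataProcess("dict.txt",
                                   "pos.txt", 1, 1000,    # (data, label, count) per class
                                   "neg.txt", -1, 1000,
                                   "neu.txt", 0, 1000)
model = svm_train(labels, features, "-s 0 -t 0 -c 1")     # linear C-SVC
p_labels, p_acc, p_vals = svm_predict(labels, features, model)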
Example #3
import os
import analyze_competition as a
import prep as p
import pickle
def create_mhs(dir):
    # Collect a (file path, model name, 'a') tuple for every model file under dir;
    # the model name is whatever follows "pickle" in the file name.
    mhs = []
    for root, dirs, files in os.walk(dir):
        for file in files:
            file_name = root + "/" + file
            model = file.split("pickle")[1]
            mhs.append((file_name, model, 'a'))
    return mhs


if __name__=="__main__":
    preprocess = p.preprocess()
    analyze = a.analysis()
    meta_mhs = []
    name_dict = {"pos_plus":"POS/NEG Max","pos_plus_big":"POS/NEG Max","pos_minus_big":"POS/NEG Min",'pos_minus':"POS/NEG Min",'squared_minus_big':"Squared Min",'squared_plus_big':"Squared Max",'squared_minus':"Squared Min",'squared_plus':"Squared Max","regular":"SVM","L1":"L1 regularization","doubly":"Doubly regularization","minmax":"Min Max","maxmin":"Max Min","test":"L1"}
    reverse={name_dict[a]:a for a in name_dict}
    dirs = ["test"]
    for dir in dirs:
        meta_mhs.append(create_mhs(dir))
    meta_model_objects = []
    for mhs in meta_mhs:
        meta_model_objects.append(preprocess.load_model_handlers(mhs))
    cd = preprocess.extract_features_by_epoch("features_asr_modified")
    analyze.create_table(meta_model_objects,cd,name_dict)
    # analyze.get_metrices_for_table(meta_model_objects,cd)
    # analyze.append_data_retrieval(pickle.load(open("maxmin.pickle","rb")),pickle.load(open("minmax.pickle","rb")),reverse)
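The directory layout below is invented purely to show the tuple structure that create_mhs produces and load_model_handlers presumably expects.

# Hypothetical illustration of create_mhs output; the file names are made up
# and only need to contain "pickle" so the split above finds a model name.
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    for name in ("model_pickle_maxmin", "model_pickle_minmax"):
        open(os.path.join(tmp, name), "w").close()
    print(create_mhs(tmp))
    # e.g. [('/tmp/.../model_pickle_maxmin', '_maxmin', 'a'),
    #       ('/tmp/.../model_pickle_minmax', '_minmax', 'a')]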
Example #4
import sys
from training_config import *
from keras_conf import *
from prep import preprocess
from keras_lb import keras_train
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tags', nargs='+', help='tags list', required=True)
parser.add_argument('-o', '--object', help='tag for model object')
parser.add_argument('-b',
                    '--balance',
                    action='store_true',
                    help='balance multiple dumps')
parser.add_argument('-d', '--dump', action='store_true')

args = parser.parse_args()

tags = args.tags
model_tag = args.object

# Build the dataset for the requested tags, then train the Keras model
preprocess(tags, balance=args.balance, out_tag=model_tag)
keras_train(model_tag=model_tag, dump=args.dump)
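The parser above can be exercised directly with an explicit argument list; the tag and model names in this check are placeholders, not values from the original project.

# Self-contained sketch of the argument handling above; parse_args is given
# an explicit list instead of reading sys.argv, and all values are invented.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tags', nargs='+', help='tags list', required=True)
parser.add_argument('-o', '--object', help='tag for model object')
parser.add_argument('-b', '--balance', action='store_true',
                    help='balance multiple dumps')
parser.add_argument('-d', '--dump', action='store_true')

args = parser.parse_args(['-t', 'news', 'tweets', '-o', 'model_v1', '-b'])
print(args.tags)     # ['news', 'tweets']
print(args.object)   # model_v1
print(args.balance)  # True
print(args.dump)     # False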
Example #5
    if 'txt' not in path:
        continue

    ## Windows correction
    path = path.replace("\\", "/")

    ## Trim path to match database key
    filename = path[path.rfind('/') + 1:]
    print("Counting ngrams for " + filename)

    ## Read file via preprocessing script
    with open(path, encoding='utf-8') as file:
        fulltext = file.read()

    ## Tokenize and check each token against search entry set
    tokens = word_tokenize(preprocess(fulltext.lower(), names=False))
    tokens = [t.lower() for t in tokens if t.isalpha()]  ## Filter punctuation
    phrases = list(ngrams(tokens, gramsize))
    toSkip = 0
    for p in phrases:
        ## Skip counter. Set to gramsize - 1
        if toSkip > 0:
            toSkip -= 1
            continue

        ## Check to see if a base term is present. Skip if none are found.
        bfound = False
        for word in p:
            if word in baseTerms:
                bfound = True
                break
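The loop above is cut off before the counting step, so the toy snippet below replays the same skip-counter idea on a hard-coded token list; the tokens, base terms and the "count then skip" step after a match are assumptions, not the original code.

# Toy illustration of the n-gram scan with a skip counter; tokens and
# baseTerms are invented, and the counting after a match is assumed because
# the original excerpt ends before it.
from nltk.util import ngrams

tokens = ["the", "solar", "panel", "array", "powers", "the", "lab"]
baseTerms = {"solar", "panel"}
gramsize = 2

count = 0
toSkip = 0
for p in ngrams(tokens, gramsize):
    if toSkip > 0:              # still inside a previously matched window
        toSkip -= 1
        continue
    if any(word in baseTerms for word in p):
        count += 1
        toSkip = gramsize - 1   # skip the overlapping n-grams that follow
print(count)  # 2: ("the", "solar") and ("panel", "array"); ("solar", "panel") is skipped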
Example #6
filelist = glob("**", recursive=True)

results = dict()
for path in filelist:
    counter = 0
    if 'txt' not in path:
        continue

    ## Windows correction
    path = path.replace("\\", "/")

    ## Trim path to match database key
    filename = path[path.rfind('/') + 1:]
    print("Counting tokens for " + filename)
    with open(path, encoding='utf-8') as file:
        fulltext = preprocess(file.read())

    ## Tokenize and check each token against search entry set
    tokens = word_tokenize(preprocess(fulltext.lower()))
    for t in tokens:
        if t in searchTerms:
            counter += 1

    ## DEBUG
    if counter > 0:
        print("FOUND: " + str(counter))

    ## Store count in results data object
    results[filename] = counter

print("\nCounting complete. Writing to disk.")