# Plot learning curves for one profiling task, using either a recipe given
# on the command line or the one configured for the task in the dataset.
# NOTE(review): relies on `parser`, `ProfilingDataset`, `from_recipe`,
# `cross_validation`, `plot_learning_curve` and `plt` being defined/imported
# elsewhere in this file.
# FIX: the original mixed Python-2 print statements with print() calls
# (a SyntaxError under Python 3) - normalized to print() throughout.
args = parser.parse_args()
infolder = args.infolder
task = args.feature
recipe = args.recipe

print('Loading dataset...')
data = ProfilingDataset(infolder)
print('Loaded {} users...\n'.format(len(data.entries)))
config = data.config
tasks = config.tasks
if task in tasks:
    print('Creating learning curves for %s task..' % task)
    if not recipe:
        # No recipe passed on the command line - fall back to the recipe
        # configured for this task in the dataset config.
        recipe = config.recipes[task]
        clf = from_recipe(recipe)
    else:
        clf = from_recipe(recipe)
        print('Loading recipe from file %s..' % recipe)
    X, y = data.get_data(task)
    # 5-fold cross validation with a fixed seed for reproducible curves.
    # (The ShuffleSplit alternative below would use 20 iterations with 20%
    # of the data as a validation set for smoother curves.)
    cv = cross_validation.KFold(len(X), n_folds=5, random_state=0)
    # cv = cross_validation.ShuffleSplit(len(X), n_iter=20, test_size=0.2,
    #                                    random_state=0)
    title = 'Learning Curves from recipe %s' % recipe
    plot_learning_curve(clf, title, X, y, ylim=(0.3, 1.01), cv=cv, n_jobs=-1)
    plt.show()
else:
    print('task "%s" does not exist - try one of the'
          ' following: %s' % (task, tasks))
# Run k-fold cross validation for every task configured in a pan dataset.
# NOTE(review): relies on `parser`, `ProfilingDataset`, `from_recipe`,
# `cross_val` and `log` being defined/imported elsewhere in this file.
parser.add_argument('-i', '--input', type=str, required=True,
                    dest='infolder',
                    help='path to folder with pan dataset for a language')
parser.add_argument('-n', '--numfolds', type=int,
                    dest='num_folds', default=4,
                    help='Number of folds to use in cross validation')
args = parser.parse_args()
infolder = args.infolder
num_folds = args.num_folds

print('Loading dataset...')
dataset = ProfilingDataset(infolder)
print('Loaded %s users...\n' % len(dataset.entries))
config = dataset.config
tasks = config.tasks

print('\n--------------- Thy time of Running ---------------')
for task in tasks:
    # Build the estimator for this task from its configured recipe,
    # then cross-validate it; results accumulate in the global `log`.
    recipe_path = config.recipes[task]
    tictac = from_recipe(recipe_path)
    cross_val(dataset, task, tictac, num_folds)

# All tasks done - dump the collected result messages.
print('\n--------------- Thy time of Judgement ---------------')
for message in log:
    print(message)
if __name__ == '__main__':
    # Train one model per configured task and persist them all in a single
    # joblib archive named after the dataset language.
    parser = ArgumentParser(description='Train pan model on pan dataset')
    parser.add_argument('-i', '--input', type=str, required=True,
                        dest='infolder',
                        help='path to folder with pan dataset for a language')
    parser.add_argument('-o', '--output', type=str, required=True,
                        dest='outfolder',
                        help='path to folder where model should be written')
    args = parser.parse_args()

    dataset = ProfilingDataset(args.infolder)
    print('Loaded {} users...\n'.format(len(dataset.entries)))
    # dataset config tells us which tasks exist and which recipe each uses
    config = dataset.config

    print('\n--------------- Thy time of Running ---------------')
    trained = {}
    for task in config.tasks:
        print('Learning to judge %s..' % task)
        # fit this task's recipe-built estimator on the task's data
        X, y = dataset.get_data(task)
        estimator = from_recipe(config.recipes[task])
        trained[task] = estimator.fit(X, y)

    # one archive holds every task's fitted model, keyed by task name
    modelfile = os.path.join(args.outfolder, '%s.bin' % dataset.lang)
    print('Writing model to {}'.format(modelfile))
    joblib.dump(trained, modelfile, compress=3)
# Plot learning curves for one profiling task, using either a recipe given
# on the command line or the one configured for the task in the dataset.
# NOTE(review): relies on `parser`, `ProfilingDataset`, `from_recipe`,
# `cross_validation`, `plot_learning_curve` and `plt` being defined/imported
# elsewhere in this file.
# FIX: the original mixed Python-2 print statements with print() calls
# (a SyntaxError under Python 3) - normalized to print() throughout. The
# chunk was also truncated mid-call; completed to match the full copy of
# this script found earlier in the file.
args = parser.parse_args()
infolder = args.infolder
task = args.feature
recipe = args.recipe

print('Loading dataset...')
data = ProfilingDataset(infolder)
print('Loaded {} users...\n'.format(len(data.entries)))
config = data.config
tasks = config.tasks
if task in tasks:
    print('Creating learning curves for %s task..' % task)
    if not recipe:
        # No recipe passed on the command line - fall back to the recipe
        # configured for this task in the dataset config.
        recipe = config.recipes[task]
        clf = from_recipe(recipe)
    else:
        clf = from_recipe(recipe)
        print('Loading recipe from file %s..' % recipe)
    X, y = data.get_data(task)
    # 5-fold cross validation with a fixed seed for reproducible curves.
    # (The ShuffleSplit alternative below would use 20 iterations with 20%
    # of the data as a validation set for smoother curves.)
    cv = cross_validation.KFold(len(X), n_folds=5, random_state=0)
    # cv = cross_validation.ShuffleSplit(len(X), n_iter=20, test_size=0.2,
    #                                    random_state=0)
    title = 'Learning Curves from recipe %s' % recipe
    plot_learning_curve(clf, title, X, y, ylim=(0.3, 1.01), cv=cv, n_jobs=-1)
    plt.show()
else:
    print('task "%s" does not exist - try one of the'
          ' following: %s' % (task, tasks))
# Train each configured task with several model flavours (per-model recipes
# named '<task>-<model>'). This chunk is a fragment; the loop body continues
# past what is visible here.
# NOTE(review): relies on `dataset`, `from_recipe`, `train_test_split` and
# `split` being defined/imported elsewhere in this file.
# get config
config = dataset.config
tasks = config.tasks
print('\n--------------- Thy time of Running ---------------')
list_model_names = ['tictac', 'lda', 'voting']
total_model = {}
for model_name in list_model_names:
    all_models = {}
    if model_name != 'voting':
        for task in tasks:
            print('Learning to judge %s with %s' % (task, model_name))
            # load data
            X, y = dataset.get_data(task)
            if 'meta' in list_model_names:
                # Hold out a stratified validation split for meta/stacking.
                # NOTE(review): 'meta' is never in list_model_names as
                # defined above, so this branch looks dead unless the list
                # is edited - confirm intent.
                X, X_cv, y, y_cv = train_test_split(X, y, test_size=split,
                                                    random_state=42,
                                                    stratify=y)
            tictac = from_recipe(config.recipes[task + '-' + model_name])
            # Build a human-readable outline of the pipeline: feature-union
            # members with their params, other steps by name, '+'-joined.
            # (join replaces the original quadratic += string building;
            # output is byte-identical.)
            parts = []
            for step in tictac.steps:
                if step[0] == "features":
                    for tf in step[1].transformer_list:
                        parts.append(tf[0] + " with Params:[" +
                                     str(tf[1].get_params()) + "]")
                else:
                    parts.append(step[0])
            outline = "+".join(parts) + "\n"
from argparse import ArgumentParser

from tictacs import from_recipe

if __name__ == '__main__':
    # Smoke-test a tictac recipe: build it, fit it on a tiny toy corpus of
    # tweets, and predict on two held-out strings.
    parser = ArgumentParser(description='A tester of recipes for tic tacs')
    # FIX: the original also passed default='recipes/example.py', but with
    # required=True argparse never applies a default (and the .py default
    # contradicted the yaml-format help text), so it was dropped.
    parser.add_argument('--recipe', '-r', required=True, dest='recipe',
                        help='Path to the file where the recipe to create '
                             'delicious tictacs resides. All recipes must be '
                             'written in yaml format.')
    args = parser.parse_args()
    recipe = args.recipe
    print('Using recipe from file: %s' % recipe)

    # toy corpus: label 0 for the first tweet, 1 for the rest
    texts = ['@sly_pedantic_octopus @glorified_ml I walked on ice yesterday,'
             ' but no one laughed when it broke #lol',
             '@blue_world I hate java #java #programming #hell',
             '@BarnieTheDinosaur raaawwwwr.',
             'omg that just happened. #omg #rofl #yolo'
             ]
    print('Creating model...')
    tictac = from_recipe(recipe)
    print('Fitting model...')
    tictac.fit(texts, [0, 1, 1, 1])
    print('Predicting with model...')
    res = tictac.predict(['#dog wtf omg java', '@blue yes broke #lol'])
    print('Predicted %s' % res)
dest='num_folds', default=4, help='Number of folds to use in cross validation') args = parser.parse_args() X_path = args.x_path y_path = args.y_path num_folds = args.num_folds # This part for tira-io with open(X_path, 'r') as xin: X = pickle.load(xin) with open(y_path, 'r') as yin: y = pickle.load(yin) ###### print('Number of docs: %d,%d' % (len(X), len(y))) for task in ['gender']: tictac = from_recipe("./config/recipes/gender.yml") outline = "" for step in tictac.steps: if step[0] == "features": # print type(step[1]) for tf in step[1].transformer_list: # print type(tf[1]) # print type(tf[1].get_params()) outline += tf[0] + " with Params:[" + str(tf[1].get_params()) + "]+" else: # if hasattr(step[1], 'get_params'): # outline += step[0] + " with Params:[" + str(step[1].get_params()) + "]+" # else: # outline += step[0]+ "+" outline += step[0] + "+" outline = outline[:-1] + "\n"