def get_methods_singletask(header, random_restarts=-1):
    """Build the single-task method factories together with their names.

    Args:
        header: Passed to ``extract_feature_indices``; the first two values
            it returns are used as the BOW and BROWN feature indices.
        random_restarts: Forwarded to every ``MCGP`` model as its
            ``random_restarts`` argument.

    Returns:
        A ``(factories, names)`` pair.  ``factories`` is a list of
        zero-argument callables, each creating a fresh method object.
        ``names`` is a list holding the ``name`` attribute of one instance
        built from each factory, in the same order.
    """
    bow_features, brown_features, _, _ = extract_feature_indices(header)

    def make_gp(kernel_constructor, name):
        # Every GP model shares the label set and the restart count.
        return MCGP(kernel_constructor=kernel_constructor, labels=LABELS,
                    name=name, random_restarts=random_restarts)

    methods = [
        # strategy is passed by keyword: recent scikit-learn releases make
        # the DummyClassifier arguments keyword-only.
        # NOTE(review): [0] is presumably the feature column list used by
        # the baseline -- confirm against SklearnBaseline.
        lambda: SklearnBaseline(
            lambda: DummyClassifier(strategy="most_frequent"),
            "MostFrequent", [0]),
        lambda: make_gp(
            kernel_constructor=lambda: single_task_kernel(
                bow_features, False, "BOW"),
            name="BOWGPjoinedfeatures"),
        lambda: make_gp(
            kernel_constructor=lambda: single_task_kernel(
                brown_features, False, "BROWN"),
            name="BROWNGPjoinedfeatures"),
    ]

    # Each factory is instantiated once just to read its name.  A real list
    # is returned (not a lazy ``map`` object) so callers can concatenate,
    # index and iterate the names more than once.
    names = [factory().name for factory in methods]
    return methods, names
def __init__(self, X, y, train_set_ratios, foldtorun, splitter,
             evaluation_measures, methodnamesmultitask, methodsmultitask,
             methodnamessingletask, methodssingletask, print_metrics, header,
             random_restarts=-1, results=None, filter_retweets=True):
    """Store the experiment configuration on the instance.

    Args:
        X: Stored as ``self.X``.
        y: Stored as ``self.y``.
        train_set_ratios: Accepted for interface compatibility; this
            constructor does not store it.
        foldtorun: Stored as ``self.foldtorun``.
        splitter: Stored as ``self.splitter``.
        evaluation_measures: Stored as ``self.evaluation_measures``.
        methodnamesmultitask: Names of the multi-task methods; must support
            ``+`` with ``methodnamessingletask``.
        methodsmultitask: Stored as ``self.methodsmultitask``.
        methodnamessingletask: Names of the single-task methods.
        methodssingletask: Stored as ``self.methodssingletask``.
        print_metrics: Stored as ``self.print_metrics``.
        header: Stored as ``self.header`` and passed to
            ``extract_feature_indices`` to obtain the task column id and the
            rttype column id.
        random_restarts: Accepted for interface compatibility; this
            constructor does not store it.
        results: Object stored as ``self.results``.  When omitted (``None``)
            a fresh empty dict is created for this instance.
        filter_retweets: Stored as ``self.filter_retweets``.
    """
    # A ``{}`` default would be created once and shared by every instance
    # constructed without ``results``; build a new dict per instance instead.
    if results is None:
        results = {}

    self.X = X
    self.y = y
    self.header = header

    self.methodnamesmultitask = methodnamesmultitask
    self.methodsmultitask = methodsmultitask
    self.methodnamessingletask = methodnamessingletask
    self.methodssingletask = methodssingletask
    # Multi-task names first, then single-task names.
    self.methodnames_all = (self.methodnamesmultitask
                            + self.methodnamessingletask)

    self.foldtorun = foldtorun
    self.splitter = splitter
    self.evaluation_measures = evaluation_measures
    self.print_metrics = print_metrics
    self.results = results
    self.filter_retweets = filter_retweets

    # Only the last two values returned for the header are needed here.
    (_, _, self.postprocessed_task_column_id,
     self.rttypecol_processed_column_id) = extract_feature_indices(header)
# Positional command-line arguments:
#   1: fold to run (int)           2: method name to keep
#   3: training set ratio (int)    4: data file name
#   5: random restarts (int)       6: filter retweets flag (0 or 1)
#   7: optional random number generator seed (int)
foldtorun = int(sys.argv[1])
methodname = sys.argv[2]
train_set_ratios = [int(sys.argv[3])]
fname = sys.argv[4]
random_restarts = int(sys.argv[5])
filter_retweets = bool(int(sys.argv[6]))

if len(sys.argv) >= 8:
    # A random number generator seed has been passed explicitly.
    seed = int(sys.argv[7])
    import numpy as np
    np.random.seed(seed)
else:
    initialize_seed_with_currtime()

# Load the data and locate the column holding the task id.
X, y, header = load_data(fname, labels_to_keep=LABELS)
_, _, postprocessed_task_column_id, _ = extract_feature_indices(header)
splitter = foldsplitter(X, postprocessed_task_column_id, train_set_ratios)
evaluation_measures = [sklearn.metrics.accuracy_score]
# Number of distinct values found in the task column.
tasks_number = len(set(X[:, postprocessed_task_column_id]))

methodsmultitask, methodnamesmultitask = get_methods_multitask(
    tasks_number, header, random_restarts=random_restarts)
methodssingletask, methodnamessingletask = get_methods_singletask(
    header, random_restarts=random_restarts)

# Keep only the requested method.  methodname is read from sys.argv, so it
# is always a string and this branch is always taken.
if methodname is not None:
    methodnamesmultitask, methodsmultitask = filter_methods(
        methodnamesmultitask, methodsmultitask, methodname)
    methodnamessingletask, methodssingletask = filter_methods(
        methodnamessingletask, methodssingletask, methodname)

experiment = Experiment(
    X, y, train_set_ratios, foldtorun, splitter, evaluation_measures,
    methodnamesmultitask, methodsmultitask,
    methodnamessingletask, methodssingletask,
    print_metrics=print_metrics_multiclass,
    random_restarts=random_restarts,
    results={},
    header=header,
    filter_retweets=filter_retweets)
# Positional command-line arguments:
#   1: fold to run (int)           2: method name to keep
#   3: training set ratio (int)    4: data file name
#   5: random restarts (int)       6: filter retweets flag (0 or 1)
#   7: optional random number generator seed (int)
foldtorun = int(sys.argv[1])
methodname = sys.argv[2]
train_set_ratios = [int(sys.argv[3])]
fname = sys.argv[4]
random_restarts = int(sys.argv[5])
filter_retweets = bool(int(sys.argv[6]))

if len(sys.argv) >= 8:
    # A random number generator seed has been passed explicitly.
    seed = int(sys.argv[7])
    import numpy as np
    np.random.seed(seed)
else:
    initialize_seed_with_currtime()

# Load the data and locate the column holding the task id.
X, y, header = load_data(fname, labels_to_keep=LABELS)
_, _, postprocessed_task_column_id, _ = extract_feature_indices(header)
splitter = foldsplitter(X, postprocessed_task_column_id, train_set_ratios)
evaluation_measures = [sklearn.metrics.accuracy_score]
# Number of distinct values found in the task column.
tasks_number = len(set(X[:, postprocessed_task_column_id]))

methodsmultitask, methodnamesmultitask = get_methods_multitask(
    tasks_number, header, random_restarts=random_restarts)
methodssingletask, methodnamessingletask = get_methods_singletask(
    header, random_restarts=random_restarts)

# Keep only the requested method.  methodname is read from sys.argv, so it
# is always a string and this branch is always taken.
if methodname is not None:
    methodnamesmultitask, methodsmultitask = filter_methods(
        methodnamesmultitask, methodsmultitask, methodname)
    methodnamessingletask, methodssingletask = filter_methods(
        methodnamessingletask, methodssingletask, methodname)