def simple_model_test(data_filter=DocFilter):
    """Run a per-domain 5-fold baseline experiment.

    For each core risk-of-bias domain, trains a hinge-loss SGDClassifier
    (alpha tuned by grid search, f1 scoring) on the per-domain documents,
    restricted to the same study ids and fold layout as the multitask
    model so results are directly comparable.  Also records a trivial
    "always predict 1" baseline.

    data_filter: class used to select per-domain documents
        (defaults to DocFilter).

    Side effects: writes 'simple_acc.csv' and 'stupid_output.csv'.
    """
    dat = RoBData(test_mode=False)
    dat.generate_data(doc_level_only=True)

    metrics = BinaryMetricsRecorder(domains=dat.CORE_DOMAINS)
    stupid_metrics = BinaryMetricsRecorder(domains=dat.CORE_DOMAINS)

    # Use the same ids (and therefore the same folds) as the multitask
    # model so the two experiments are comparable.
    multitask_docs = MultiTaskDocFilter(dat)
    multitask_uids = np.array(multitask_docs.available_ids)
    no_studies = len(multitask_uids)
    kf = KFold(no_studies, n_folds=5, shuffle=False)

    for domain in dat.CORE_DOMAINS:
        docs = data_filter(dat, domain=domain)
        uids = np.array(docs.available_ids)
        print("%d docs obtained for domain: %s" % (len(uids), domain))

        tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters, scoring='f1')

        # NOTE: a dead `no_studies = len(uids)` reassignment was removed
        # here; the folds are (intentionally) built from the multitask ids
        # above, and the local value was never read.
        for train, test in kf:
            # Restrict each multitask fold to the ids available for this
            # domain.
            X_train_d, y_train = docs.Xy(np.intersect1d(uids, multitask_uids[train]))
            X_test_d, y_test = docs.Xy(np.intersect1d(uids, multitask_uids[test]))

            vec = InteractionHashingVectorizer(norm=None, non_negative=True, binary=True)
            X_train = vec.fit_transform(X_train_d, low=2)
            X_test = vec.transform(X_test_d)

            clf.fit(X_train, y_train)
            y_preds = clf.predict(X_test)

            metrics.add_preds_test(y_preds, y_test, domain=domain)
            # Baseline: always predict the positive class.
            stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    metrics.save_csv('simple_acc.csv')
    stupid_metrics.save_csv('stupid_output.csv')
def multitask_test(fold=None, n_folds_total=5, pickle_metrics=False, metrics_out_dir=None):
    """Run the multitask experiment.

    A single SGD model is trained across all domains at once, with
    domain-specific interaction features added on top of the shared base
    features, then evaluated per-domain on the held-out studies.

    fold: if given, run only that fold (index into the n_folds_total
        folds); otherwise run all folds.
    n_folds_total: number of cross-validation folds.
    pickle_metrics: if True, pickle the metrics recorder after each fold
        (requires metrics_out_dir).
    metrics_out_dir: directory for the per-fold pickle and, when a single
        fold is run, the fold-specific CSV output.

    Side effects: writes 'multitask_acc.csv' (all folds) or
    '<metrics_out_dir>/multitask.csv' (single fold), plus optional pickles.
    """
    logging.info('loading data into memory')
    dat = RoBData(test_mode=False)
    dat.generate_data(doc_level_only=True)

    logging.info('loading metric recorder')
    metrics = BinaryMetricsRecorder(domains=dat.CORE_DOMAINS)

    logging.info('generating training documents')
    train_docs = MultiTaskDocFilter(dat)
    logging.info('generating training ids')
    train_uids = np.array(train_docs.available_ids)

    logging.info('setting model parameters')
    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters, scoring='f1')

    no_studies = len(train_uids)
    logging.info('calculating folds')
    kf = KFold(no_studies, n_folds=n_folds_total, shuffle=False)

    if fold is not None:
        kf = [list(kf)[fold]]

    # BUG FIX: metrics_out_path used to be defined only when a single fold
    # was requested, so pickle_metrics=True with fold=None raised a
    # NameError.  Define it whenever an output directory is supplied.
    metrics_out_path = None
    if metrics_out_dir is not None:
        metrics_out_path = os.path.join(
            metrics_out_dir, "metrics_%s.pickle" % fold)

    for train, test in kf:
        logging.info('new fold starting!')
        X_train_d, y_train, i_train = train_docs.Xyi(train_uids[train])

        logging.info('building up test data')
        # One boolean mask per domain marking which training docs belong
        # to that domain (doc text itself is not needed here).
        interactions = {domain: [] for domain in dat.CORE_DOMAINS}
        for doc_domain in i_train:
            for domain in dat.CORE_DOMAINS:
                interactions[domain].append(domain == doc_domain)

        logging.info('adding test data to vectorizer')
        vec = ModularCountVectorizer()
        vec.builder_clear()

        logging.info('adding base features')
        vec.builder_add_docs(X_train_d, low=10)  # add base features

        for domain in dat.CORE_DOMAINS:
            logging.info('adding interactions for domain %s' % (domain,))
            print("%s / %s added for %s" % (np.sum(interactions[domain]),
                                            len(interactions[domain]), domain))
            # then add the per-domain interaction copies of the features
            vec.builder_add_interaction_features(
                X_train_d, interactions=interactions[domain],
                prefix=domain + "-i-", low=2)

        logging.info('fitting vectorizer')
        X_train = vec.builder_fit_transform()

        logging.info('fitting model')
        clf.fit(X_train, y_train)

        for domain in dat.CORE_DOMAINS:
            test_docs = DocFilter(dat, domain=domain)  # test on regular doc model
            domain_uids = np.array(test_docs.available_ids)
            test_uids = np.intersect1d(train_uids[test], domain_uids)
            X_test_d, y_test = test_docs.Xy(test_uids)

            # build up test vector: base features plus this domain's
            # interaction features
            vec.builder_clear()
            vec.builder_add_docs(X_test_d)  # add base features
            vec.builder_add_docs(X_test_d, prefix=domain + '-i-')  # add interactions
            X_test = vec.builder_transform()

            y_preds = clf.predict(X_test)
            metrics.add_preds_test(y_preds, y_test, domain=domain)

        if pickle_metrics:
            if metrics_out_path is None:
                raise ValueError(
                    "pickle_metrics=True requires metrics_out_dir")
            with open(metrics_out_path, 'wb') as out_f:
                pickle.dump(metrics, out_f)

    if fold is None:
        metrics.save_csv('multitask_acc.csv')
    else:
        # BUG FIX: previously joined the CSV name onto the *pickle file*
        # path, producing an invalid nested path; join the output
        # directory instead.
        metrics.save_csv(os.path.join(metrics_out_dir, 'multitask.csv'))