# NOTE(review): this chunk opens mid-way through a `features` list literal; the
# `features = [` opener and the feats_* component lists are defined above this
# view. Each entry below is one feature-space combination to evaluate.
    ('nltkword_unigram',),
    feats_content,
    feats_content + ['headingord', 'headingvec'],
    feats_local,
    ['headingord', 'headingvec'] + feats_struct,
    feats_struct,
    feats_local + feats_prev,
    feats_local + feats_post,
    feats_local + feats_window,
    feats_local + feats_prev + feats_post,
    feats_local + feats_prev + feats_post + feats_window,
]
# Canonicalize each combination into a sorted tuple so combinations are
# hashable and compare equal regardless of the order they were listed in.
features = [tuple(sorted(x)) for x in features]

if __name__ == "__main__":
    # Results store: derive from the caller, with an on-disk HDF5 file given as
    # fallback (presumably used when no caller store exists — confirm against
    # Store.from_caller's contract).
    fallback = Store('store/features.h5')
    store = Store.from_caller(fallback=fallback)
    # liblinear learner (svm_type=0) with probability outputs, wrapped so the
    # experiment is repeated across runs.
    learner = repeat.RepeatLearner(liblinear.liblinearL(svm_type=0, output_probability=True))
    ds = ALTA2012Full()
    proxy = StackingProxy(ds, learner, store=store)
    proxy.class_space = 'ebmcat'
    # Run one experiment per feature-space combination, persisting each result.
    for feats in features:
        print "DOING:", len(feats), feats  # Python 2 print statement
        proxy.feature_spaces = feats
        e = Experiment(proxy, learner)
        proxy.store.new_TaskSetResult(e)
# Sentence-position feature spaces (absolute, relative, relative-by-byte).
feats_position = ['positionabs', 'positionrel', 'positionrelbyte']

# Feature spaces shared by both best-known combinations below.
core = (
    'nltkword_unigram',
    'treetaggerpos_bigram',
    'treetaggerlemmapos_bigram',
    'headingprev',
    'headingvec',
    'positionrel',
)

# Best-performing additions for structured vs. unstructured abstracts.
struct_best = core + ('bowpost1', 'bowprev3', 'headingpost', 'isstructured', 'sentlenrel')
unstruct_best = core + ('bowprev', 'treetaggerpos_trigram', 'ttbprev')

# Union of every feature space we may need, in canonical sorted order.
_all_spaces = set()
for _group in (struct_best, unstruct_best, feats_bow, feats_pos,
               feats_struct, feats_heading, feats_position):
    _all_spaces.update(_group)
feats_all = tuple(sorted(_all_spaces))

datasets = [ALTA2012Full()]

if __name__ == "__main__":
    store = Store.from_caller()
    for dataset in datasets:
        proxy = DataProxy(dataset, store=store)
        # Induce all feature maps for the 'ebmcat' class space over abstracts.
        proxy.inducer.process(
            proxy.dataset,
            fms=feats_all,
            cms=['ebmcat'],
            sqs=['abstract'],
        )
        # Derived token streams: bigrams over lemma+POS; bigrams and trigrams
        # over POS tags (same call order as before).
        for stream_name, extractors in (
            ('treetaggerlemmapos', (ext.bigram,)),
            ('treetaggerpos', (ext.bigram, ext.trigram)),
        ):
            proxy.tokenstream_name = stream_name
            for extractor in extractors:
                proxy.tokenize(extractor)