def tokenize(ds, features, store_path):
    """
    Compute feature values and save them in a hydrat store.

    @param ds dataset to read from
    @param features names of features to read
    @param store_path path of store to write to
    """
    class_space = 'ebmcat'
    #print >>sys.stderr, "=== opening store at {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        #print >>sys.stderr, "=== inducing features ({0}) ===".format(features)
        # Induce all the features for the new test data
        proxy = DataProxy(ds, store=store)
        proxy.inducer.process(
            proxy.dataset,
            fms=features,
            sqs=['abstract',],
        )
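# Usage sketch, not part of the original scripts: tokenize() is driven with a
# dataset instance, a tuple of feature-space names, and a store path. The
# feature selection and the 'abstracts.h5' path below are hypothetical
# placeholders, and ALTA2012Full is assumed to be importable here.
def _example_tokenize():
    ds = ALTA2012Full()
    feats = ('nltkword_unigram', 'headingvec', 'positionrel')  # hypothetical subset
    tokenize(ds, feats, 'abstracts.h5')  # opens the store in append mode and induces feats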
core = (
    'nltkword_unigram',
    'treetaggerpos_bigram',
    'treetaggerlemmapos_bigram',
)
core += ('headingprev', 'headingvec', 'positionrel')
struct_best = core + ('bowpost1', 'bowprev3', 'headingpost', 'isstructured', 'sentlenrel',)
unstruct_best = core + ('bowprev', 'treetaggerpos_trigram', 'ttbprev')
# Union of all feature groups, deduplicated and sorted into a single tuple.
feats_all = tuple(sorted(set(sum(
    map(tuple, [struct_best, unstruct_best, feats_bow, feats_pos,
                feats_struct, feats_heading, feats_position]),
    tuple()))))

datasets = [
    ALTA2012Full(),
]

if __name__ == "__main__":
    store = Store.from_caller()

    for ds in datasets:
        proxy = DataProxy(ds, store=store)
        proxy.inducer.process(proxy.dataset,
            fms=feats_all,
            cms=['ebmcat',],
            sqs=['abstract',],
        )

        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.bigram)
        proxy.tokenize(ext.trigram)
class_space = 'ebmcat'

try:
    features = features.feature_sets[args.feats]
except KeyError:
    parser.error("unknown feature group: {0}".format(args.feats))

l = repeat.RepeatLearner(
    liblinear.liblinearL(svm_type=0, output_probability=True))

store = Store(args.feat_store, 'r')  # TODO: Do we want this read-only?

for feature in features:
    spaces[feature] = store.get_Space(feature)
spaces['ebmcat'] = store.get_Space('ebmcat')

proxy = DataProxy(ALTA2012Full(), store=store)
proxy.class_space = class_space

L0_cl = []    # level-0 classifiers, one per feature group
L1_fv = []    # level-1 feature vectors built from L0 cross-validation output
L1_gs = None  # level-1 gold standard
for feat in features:
    proxy.feature_spaces = feat
    proxy.split_name = 'crossvalidation'

    with Timer() as L0_timer:
        L0_cl.append(l(proxy.featuremap.raw, proxy.classmap.raw))
    print >> sys.stderr, "== training L0 for {0} took {1:.2f}s ==".format(
        feat, L0_timer.elapsed)

    with Timer() as L1_cv_timer:
    'core': core,
    'dev': ('headingprev', 'headingvec', 'positionrel'),
}

datasets = [
    ALTA2012Full(),
]

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('output', help='write output to PATH', metavar='PATH')
    args = parser.parse_args()

    store = Store(args.output, 'a')
    for ds in datasets:
        proxy = DataProxy(ds, store=store)
        proxy.inducer.process(proxy.dataset,
            fms=feats_all,
            cms=['ebmcat',],
            sqs=['abstract',],
            sps=['crossvalidation'],
        )

        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.bigram)
        proxy.tokenize(ext.trigram)
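# Plausible shape of the feature-group mapping that the dict fragment above
# closes. Only the 'core' and 'dev' entries appear in the original; the other
# keys are assumptions, named after the tuples defined earlier. The training
# scripts below select a group by name via features.feature_sets[args.feats].
feature_sets = {
    'core': core,
    'dev': ('headingprev', 'headingvec', 'positionrel'),
    'struct_best': struct_best,      # assumed key for the structured-abstract set
    'unstruct_best': unstruct_best,  # assumed key for the unstructured-abstract set
    'all': feats_all,                # assumed key for the union of all feature spaces
}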
class_space = 'ebmcat'

try:
    features = features.feature_sets[args.feats]
except KeyError:
    parser.error("unknown feature group: {0}".format(args.feats))

l = repeat.RepeatLearner(
    liblinear.liblinearL(svm_type=0, output_probability=True))

store = Store(args.feat_store, 'r')  # TODO: Do we want this read-only?

for feature in features:
    spaces[feature] = store.get_Space(feature)
spaces['ebmcat'] = store.get_Space('ebmcat')

proxy = DataProxy(ALTA2012Full(), store=store)
proxy.class_space = class_space

L0_cl = []
L1_fv = []
L1_gs = None
for feat in features:
    proxy.feature_spaces = feat
    proxy.split_name = 'crossvalidation'

    with Timer() as L0_timer:
        L0_cl.append(l(proxy.featuremap.raw, proxy.classmap.raw))
    print >>sys.stderr, "== training L0 for {0} took {1:.2f}s ==".format(
        feat, L0_timer.elapsed)

    with Timer() as L1_cv_timer:
        e = Experiment(proxy, l)
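# Sketch of where the L0/L1 bookkeeping above is heading, using plain numpy and
# hypothetical variable names -- this is NOT the original continuation of the
# truncated loop: cross-validation predictions from each L0 classifier are
# concatenated column-wise into a level-1 feature matrix, and the same learner
# factory is then applied to it against the gold-standard class map.
import numpy as np

def stack_l1_features(l0_cv_predictions):
    """Stack per-feature-group CV prediction arrays (each n_instances x n_classes,
    aligned on the same instance ordering) into one level-1 feature matrix."""
    return np.hstack(l0_cv_predictions)

# l1_fv = stack_l1_features(per_group_cv_outputs)  # hypothetical inputs
# l1_cl = l(l1_fv, l1_gold_classmap)               # same call pattern as L0 training above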
def tokenize_extra(ds, store_path):
    """
    Additional feature extraction for features that are not provided by
    the dataset implementation.

    @param ds dataset to read from
    @param store_path path of store to write to
    """
    class_space = 'ebmcat'
    #print >>sys.stderr, "=== tokenize_extra for {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)

        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.trigram)

    # Hackish workaround for store being unexpectedly closed
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)
        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.bigram)
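# Usage sketch, not part of the original scripts: tokenize_extra() presumably
# runs against the same store that tokenize() has already populated, adding the
# derived TreeTagger bigram/trigram token streams. The store path is a
# hypothetical placeholder.
def _example_tokenize_extra():
    ds = ALTA2012Full()
    tokenize(ds, ('nltkword_unigram',), 'abstracts.h5')  # base feature induction
    tokenize_extra(ds, 'abstracts.h5')                   # extra n-gram streams on top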