def tokenize_extra(ds, store_path):
    """
    Additional feature extraction for features that are not provided
    by the dataset implementation.

    @param ds dataset to read from
    @param store_path path of store to write to
    """
    class_space = 'ebmcat'
    #print >>sys.stderr, "=== tokenize_extra for {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)
        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.trigram)

    # Hackish workaround for store being unexpectedly closed
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)
        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.bigram)
def induce(ts, store_path, features, spaces):
    """
    Induce features for a list of abstracts.

    @param ts TokenStream (mapping from id to line)
    """
    ds = NewDocuments(ts)

    # Merge feature spaces into the store
    with closing(Store(store_path, 'a')) as store:
        for space in spaces:
            md = {'name': space, 'type': 'feature'}
            store.add_Space(spaces[space], md)

    # Feature induction was meant to run in a subprocess so that Python
    # does not hold on to the memory; the multiprocessing variant is
    # currently disabled and induction runs in-process.
    for feature in features:
        tokenize(ds, [feature], store_path)
        #p = mp.Process(target=tokenize, args=(ds, [feature], store_path))
        #p.start()
        #p.join()
        #p.terminate()

    tokenize_extra(ds, store_path)
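# A hedged sketch of the disabled multiprocessing variant referred to
# above: running tokenize() in a throwaway child process means all the
# memory it allocates is returned to the OS when the child exits.
# induce_in_subprocess is illustrative and not part of the original code.
import multiprocessing as mp

def induce_in_subprocess(ds, feature, store_path):
    p = mp.Process(target=tokenize, args=(ds, [feature], store_path))
    p.start()
    p.join()
    # A non-zero exit code means the child failed (e.g. was OOM-killed);
    # surface that instead of silently continuing.
    if p.exitcode != 0:
        raise RuntimeError("tokenize failed for feature {0}".format(feature))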
def tokenize(ds, features, store_path):
    """
    Compute feature values and save them in a hydrat store.

    @param ds dataset to read from
    @param features names of features to read
    @param store_path path of store to write to
    """
    class_space = 'ebmcat'
    #print >>sys.stderr, "=== opening store at {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        #print >>sys.stderr, "=== inducing features ({0}) ===".format(features)
        # Induce all the features for the new test data
        proxy = DataProxy(ds, store=store)
        proxy.inducer.process(
            proxy.dataset,
            fms=features,
            sqs=['abstract'],
        )
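# Hedged usage sketch tying induce()/tokenize()/tokenize_extra() together
# (token stream contents, feature name and store path are illustrative):
#
#   ts = {'doc1': 'Aims: to assess ...'}
#   induce(ts, 'newdocs.h5', ['bag_of_words'], spaces)
#
# induce() first merges the training-time feature spaces into the new
# store, then runs tokenize() once per feature, and finally calls
# tokenize_extra() for the tagged n-gram features.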
    help='existing hydrat Store generated by features.py')
parser.add_argument("output", help="produce output in PATH", metavar="PATH")
args = parser.parse_args()

class_space = 'ebmcat'

try:
    features = features.feature_sets[args.feats]
except KeyError:
    parser.error("unknown feature group: {0}".format(args.feats))

l = repeat.RepeatLearner(
    liblinear.liblinearL(svm_type=0, output_probability=True))

store = Store(args.feat_store, 'r')  # TODO: Do we want this read-only?

spaces = {}  # assumed initialization; not visible in this fragment
for feature in features:
    spaces[feature] = store.get_Space(feature)
spaces['ebmcat'] = store.get_Space('ebmcat')

proxy = DataProxy(ALTA2012Full(), store=store)
proxy.class_space = class_space

# Level-0 classifiers, level-1 feature vectors, level-1 gold standard.
L0_cl = []
L1_fv = []
L1_gs = None

for feat in features:
    proxy.feature_spaces = feat
    proxy.split_name = 'crossvalidation'
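# The L0_cl / L1_fv / L1_gs naming follows stacked generalization: one
# level-0 classifier is trained per feature set, and their cross-validated
# probability outputs are concatenated into the level-1 feature matrix.
# The helper below is a hypothetical sketch of that combination step, not
# the project's own API.
import numpy as np

def build_l1_features(l0_prob_outputs):
    # l0_prob_outputs: one (n_instances, n_classes) probability array per
    # L0 classifier; column-wise concatenation yields the
    # (n_instances, n_classifiers * n_classes) L1 feature matrix.
    return np.hstack(l0_prob_outputs)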
feature_sets = {
    'all': feats_all,
    'core': core,
    'dev': ('headingprev', 'headingvec', 'positionrel'),
}

datasets = [
    ALTA2012Full(),
]

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('output', help='write output to PATH', metavar='PATH')
    args = parser.parse_args()

    store = Store(args.output, 'a')

    for ds in datasets:
        proxy = DataProxy(ds, store=store)
        proxy.inducer.process(
            proxy.dataset,
            fms=feats_all,
            cms=['ebmcat'],
            sqs=['abstract'],
            sps=['crossvalidation'],
        )

        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.bigram)
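# Usage sketch (assuming this script is the features.py the other scripts
# refer to; the store filename is illustrative):
#
#   python features.py store.h5
#
# This populates an appendable hydrat Store with the induced feature,
# class, and split spaces plus the two tagged-bigram tokenstreams, which
# the training and classification scripts then open read-only.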
import sys
import argparse
from cPickle import load  # assumed: the model file is a pickled tuple

import numpy as np
import scipy.sparse as sp

from hydrat.store import Store  # assumed import path for hydrat's Store
from common import Timer

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="read model from PATH", metavar="PATH")
    parser.add_argument("data", help="store containing pre-tokenized data")
    parser.add_argument("feat", help="store containing feature data")
    parser.add_argument("output", help="write output to PATH", metavar="PATH")
    args = parser.parse_args()

    features, L0_cl, L1_cl = load(open(args.model))

    fallback = Store(args.feat, 'r')
    store = Store(args.data, 'r', fallback=fallback)

    with Timer() as overall_timer:
        L0_preds = []
        for feat, cl in zip(features, L0_cl):
            fm = store.get_FeatureMap('NewDocuments', feat)
            # We need to trim the fv as the feature space may have grown
            # when we tokenized more documents. Hydrat's design is such
            # that new features are appended to the end of a feature
            # space, so we can safely truncate the feature map.
            train_feat_count = cl.metadata['train_feat_count']
            assert train_feat_count <= fm.raw.shape[1]
            fv = fm.raw[:, :train_feat_count]

            with Timer() as cl_timer:
                pred = cl(fv)
            print >>sys.stderr, "== L1 feat for {0} took {1:.2f}s ({2:.2f} inst/s) ==".format(
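# The truncation above works because hydrat only ever appends newly seen
# features to the end of a feature space, so column indices learned at
# training time remain valid. A hypothetical toy illustration (values and
# names are illustrative, not project data):
#
#   import scipy.sparse as sp
#
#   train_fm = sp.csr_matrix([[1, 0, 2, 0]])       # 4 features at training time
#   test_fm = sp.csr_matrix([[0, 1, 0, 3, 5, 7]])  # space grew to 6 at test time
#
#   # The first 4 columns still mean the same thing, so slicing them off
#   # yields vectors the trained classifier can consume directly.
#   fv = test_fm[:, :train_fm.shape[1]]
#   assert fv.shape == (1, 4)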