Example #1
def tokenize_extra(ds, store_path):
    """
  Additional feature extraction for features that are not provided by the dataset
  implementation.

  @param ds dataset to read from
  @param store_path path of store to write to
  """
    class_space = 'ebmcat'

    #print >>sys.stderr,  "=== tokenize_extra for {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)

        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.trigram)

    # Hackish workaround for store being unexpectedly closed
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.bigram)
Example #2
def induce(ts, store_path, features, spaces):
    """
  Induce features for a list of abstracts.

  @param ts TokenStream (mapping from id to line)
  """
    ds = NewDocuments(ts)

    # Merge feature spaces into the store
    with closing(Store(store_path, 'a')) as store:
        for space in spaces:
            md = {'name': space, 'type': 'feature'}
            store.add_Space(spaces[space], md)

    # Feature induction can be done in a subprocess so Python releases the memory
    # afterwards; here tokenize is called directly, with the multiprocessing
    # variant left commented out (see the sketch after this example).
    for feature in features:
        tokenize(ds, [feature], store_path)
        #p = mp.Process(target=tokenize, args=(ds, [feature], store_path))
        #p.start()
        #p.join()
        #p.terminate()

    tokenize_extra(ds, store_path)
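The commented-out multiprocessing calls above are the memory-saving variant the comment refers to: each feature is induced in a short-lived child process, so memory allocated during tokenization is released when the child exits. A minimal sketch of that pattern, assuming the tokenize function and the ds/features/store_path values from this example (induce_in_subprocesses is a hypothetical name):

import multiprocessing as mp

def induce_in_subprocesses(ds, features, store_path):
    # One child process per feature: the child writes its features to the
    # hydrat store on disk, and its memory is freed when it exits.
    for feature in features:
        p = mp.Process(target=tokenize, args=(ds, [feature], store_path))
        p.start()
        p.join()  # wait until the child has finished writing to the store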
Example #3
def tokenize(ds, features, store_path):
    """
  Compute feature values and save them in a hydrat store.

  @param ds dataset to read from
  @param features names of features to read
  @param store_path path of store to write to
  """
    class_space = 'ebmcat'

    #print >>sys.stderr,  "=== opening store at {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:

        #print >>sys.stderr,  "=== inducing features ({0}) ===".format(features)
        # Induce all the features for the new test data
        proxy = DataProxy(ds, store=store)
        proxy.inducer.process(
            proxy.dataset,
            fms=features,
            sqs=[
                'abstract',
            ],
        )
Example #4
                        help='existing hydrat Store generated by features.py')
    parser.add_argument("output",
                        help="produce output in PATH",
                        metavar="PATH")
    args = parser.parse_args()

    class_space = 'ebmcat'

    try:
        features = features.feature_sets[args.feats]
    except KeyError:
        parser.error("unknown feature group: {0}".format(args.feats))

    l = repeat.RepeatLearner(
        liblinear.liblinearL(svm_type=0, output_probability=True))
    store = Store(args.feat_store, 'r')  # TODO: Do we want this read-only?

    spaces = {}
    for feature in features:
        spaces[feature] = store.get_Space(feature)
    spaces['ebmcat'] = store.get_Space('ebmcat')

    proxy = DataProxy(ALTA2012Full(), store=store)
    proxy.class_space = class_space

    L0_cl = []
    L1_fv = []
    L1_gs = None
    for feat in features:
        proxy.feature_spaces = feat
        proxy.split_name = 'crossvalidation'
Example #5
feature_sets = {
  'all': feats_all,
  'core': core,
  'dev': ('headingprev', 'headingvec', 'positionrel'),
}

datasets = [
  ALTA2012Full(),
  ]

if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument('output', help='write output to PATH', metavar='PATH')
  args = parser.parse_args()

  store = Store(args.output, 'a')

  for ds in datasets:
    proxy = DataProxy(ds, store=store)
    proxy.inducer.process(proxy.dataset,
      fms=feats_all,            # feature spaces
      cms=['ebmcat',],          # class space
      sqs=['abstract',],        # token sequence
      sps=['crossvalidation'],  # split
    )

    proxy.tokenstream_name = 'treetaggerlemmapos'
    proxy.tokenize(ext.bigram)

    proxy.tokenstream_name = 'treetaggerpos'
    proxy.tokenize(ext.bigram)
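The feature_sets mapping defined at the top of this example is what Example #4 resolves the command-line feature-group name against. A minimal sketch of that lookup, with 'dev' used purely as an illustrative group name and resolve_feature_group a hypothetical helper:

def resolve_feature_group(name):
    # Map a group name (e.g. from the command line) to its tuple of feature names.
    try:
        return feature_sets[name]
    except KeyError:
        raise ValueError("unknown feature group: {0}".format(name))

# resolve_feature_group('dev') -> ('headingprev', 'headingvec', 'positionrel')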
Example #6
import argparse
import sys

import numpy as np
import scipy.sparse as sp

from common import Timer

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="read model from")
    parser.add_argument("data", help="store containing pre-tokenized data")
    parser.add_argument("feat", help="store containing feature data")
    parser.add_argument("output", help="write output to PATH", metavar="PATH")
    args = parser.parse_args()

    features, L0_cl, L1_cl = load(open(args.model))
    fallback = Store(args.feat, 'r')
    store = Store(args.data, 'r', fallback=fallback)

    with Timer() as overall_timer:
        L0_preds = []
        for feat, cl in zip(features, L0_cl):
            fm = store.get_FeatureMap('NewDocuments', feat)
            # We need to trim the fv as the feature space may have grown when we tokenized more documents.
            # Hydrat's design is such that new features are appended to the end of a feature space, so
            # we can safely truncate the feature map (see the sketch after this example).
            train_feat_count = cl.metadata['train_feat_count']
            assert (train_feat_count <= fm.raw.shape[1])
            fv = fm.raw[:, :train_feat_count]
            with Timer() as cl_timer:
                pred = cl(fv)
                print >> sys.stderr, "== L1 feat for {0} took {1:.2f}s ({2:.2f} inst/s) ==".format(
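The truncation step above relies on hydrat appending newly seen features to the end of a feature space, so columns beyond the training-time feature count can be dropped without disturbing the original feature indices. A minimal sketch of that slice on a scipy CSR matrix, with the shapes chosen purely for illustration:

import numpy as np
import scipy.sparse as sp

# Hypothetical shapes: 3 documents, 6 features at prediction time,
# but the classifier was trained on only the first 4 features.
fm_raw = sp.csr_matrix(np.arange(18).reshape(3, 6))
train_feat_count = 4
assert train_feat_count <= fm_raw.shape[1]
fv = fm_raw[:, :train_feat_count]  # keep only the columns seen in training
# fv.shape == (3, 4); the surviving feature indices are unchanged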