예제 #1
0
def tokenize_extra(ds, store_path):
    """
  Additional feature extraction for features that are not provided by the dataset
  implementation.

  @param ds dataset to read from
  @param store_path path of store to write to
  """
    class_space = 'ebmcat'

    #print >>sys.stderr,  "=== tokenize_extra for {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)

        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.trigram)

    # Hackish workaround for store being unexpectedly closed
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.bigram)
예제 #2
0
def tokenize(ds, features, store_path):
    """
  Compute feature values and save them in a hydrat store.

  @param ds dataset to read from
  @param features names of features to read
  @param store_path path of store to write to
  """
    class_space = 'ebmcat'

    #print >>sys.stderr,  "=== opening store at {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:

        #print >>sys.stderr,  "=== inducing features ({0}) ===".format(features)
        # Induce all the features for the new test data
        proxy = DataProxy(ds, store=store)
        proxy.inducer.process(
            proxy.dataset,
            fms=features,
            sqs=[
                'abstract',
            ],
        )
예제 #3
0
    class_space = 'ebmcat'

    try:
        features = features.feature_sets[args.feats]
    except KeyError:
        parser.error("unknown feature group: {0}".format(args.feats))

    l = repeat.RepeatLearner(
        liblinear.liblinearL(svm_type=0, output_probability=True))
    store = Store(args.feat_store, 'r')  # TODO: Do we want this read-only?

    for feature in features:
        spaces[feature] = store.get_Space(feature)
    spaces['ebmcat'] = store.get_Space('ebmcat')

    proxy = DataProxy(ALTA2012Full(), store=store)
    proxy.class_space = class_space

    L0_cl = []
    L1_fv = []
    L1_gs = None
    for feat in features:
        proxy.feature_spaces = feat
        proxy.split_name = 'crossvalidation'

        with Timer() as L0_timer:
            L0_cl.append(l(proxy.featuremap.raw, proxy.classmap.raw))
            print >> sys.stderr, "== training L0 for {0} took {1:.2f}s ==".format(
                feat, L0_timer.elapsed)

        with Timer() as L1_cv_timer:
예제 #4
0
  'core': core,
  'dev': ('headingprev', 'headingvec', 'positionrel'),
}

datasets = [
  ALTA2012Full(),
  ]

if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument('output', help='write output to PATH', metavar='PATH')
  args = parser.parse_args()

  store = Store(args.output, 'a')

  for ds in datasets:
    proxy = DataProxy(ds, store=store)
    proxy.inducer.process(proxy.dataset, 
      fms=feats_all,
      cms=['ebmcat',], 
      sqs=['abstract',],
      sps=['crossvalidation'],
    )

    proxy.tokenstream_name = 'treetaggerlemmapos'
    proxy.tokenize(ext.bigram)

    proxy.tokenstream_name = 'treetaggerpos'
    proxy.tokenize(ext.bigram)
    proxy.tokenize(ext.trigram)