Example #1
# Imports needed to run this snippet; the hydrat module paths are assumed,
# not confirmed by the original source.
import sys
from contextlib import closing

from hydrat.store import Store      # assumed import path
from hydrat.proxy import DataProxy  # assumed import path


def tokenize(ds, features, store_path):
    """
    Compute feature values and save them in a hydrat store.

    @param ds dataset to read from
    @param features names of features to read
    @param store_path path of store to write to
    """
    class_space = 'ebmcat'

    #print >>sys.stderr,  "=== opening store at {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:

        #print >>sys.stderr,  "=== inducing features ({0}) ===".format(features)
        # Induce all the features for the new test data
        proxy = DataProxy(ds, store=store)
        proxy.inducer.process(
            proxy.dataset,
            fms=features,
            sqs=[
                'abstract',
            ],
        )
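For context, a minimal sketch of how this helper might be invoked. The store filename is illustrative (hydrat stores are HDF5 files), and ALTA2012Full and the feature names are borrowed from the later examples:

# Hypothetical invocation; 'features.h5' is an illustrative path,
# not one used by the original scripts.
if __name__ == "__main__":
    tokenize(ALTA2012Full(), ('nltkword_unigram', 'treetaggerpos_bigram'),
             'features.h5')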
Example #2

# Excerpt; Store, DataProxy, ALTA2012Full and the n-gram extractor module
# `ext` are imported earlier in the file, and the feats_* tuples referenced
# below are defined there too (see the sketch after this example).
core = ('nltkword_unigram', 'treetaggerpos_bigram', 'treetaggerlemmapos_bigram')
core += ('headingprev', 'headingvec', 'positionrel')
struct_best = core + ('bowpost1', 'bowprev3', 'headingpost', 'isstructured', 'sentlenrel')
unstruct_best = core + ('bowprev', 'treetaggerpos_trigram', 'ttbprev')

feats_all = tuple(sorted(set(sum(
    map(tuple, [struct_best, unstruct_best, feats_bow, feats_pos,
                feats_struct, feats_heading, feats_position]),
    tuple(),
))))

datasets = [
  ALTA2012Full(),
  ]

if __name__ == "__main__":
  store = Store.from_caller()

  for ds in datasets:
    proxy = DataProxy(ds, store=store)
    proxy.inducer.process(proxy.dataset,
      fms=feats_all,
      cms=['ebmcat',],
      sqs=['abstract',],
    )

    proxy.tokenstream_name = 'treetaggerlemmapos'
    proxy.tokenize(ext.bigram)

    proxy.tokenstream_name = 'treetaggerpos'
    proxy.tokenize(ext.bigram)
    proxy.tokenize(ext.trigram)
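The feature-name tuples feats_bow, feats_pos, feats_struct, feats_heading and feats_position are elided from this snippet. A hypothetical sketch of the shape they take, using only feature names that appear elsewhere in these examples:

# Hypothetical stand-ins; the real definitions are not shown in the snippet.
feats_bow      = ('bowprev', 'bowprev3', 'bowpost1')
feats_pos      = ('treetaggerpos_bigram', 'treetaggerpos_trigram')
feats_struct   = ('isstructured',)
feats_heading  = ('headingprev', 'headingvec', 'headingpost')
feats_position = ('positionrel',)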
Example #3
    # Excerpt from the classification script: `args`, `parser` and `spaces`
    # are set up earlier in the file.
    class_space = 'ebmcat'

    try:
        features = features.feature_sets[args.feats]
    except KeyError:
        parser.error("unknown feature group: {0}".format(args.feats))

    l = repeat.RepeatLearner(
        liblinear.liblinearL(svm_type=0, output_probability=True))
    store = Store(args.feat_store, 'r')  # TODO: Do we want this read-only?

    for feature in features:
        spaces[feature] = store.get_Space(feature)
    spaces['ebmcat'] = store.get_Space('ebmcat')

    proxy = DataProxy(ALTA2012Full(), store=store)
    proxy.class_space = class_space

    L0_cl = []
    L1_fv = []
    L1_gs = None
    for feat in features:
        proxy.feature_spaces = feat
        proxy.split_name = 'crossvalidation'

        with Timer() as L0_timer:
            L0_cl.append(l(proxy.featuremap.raw, proxy.classmap.raw))
            print >> sys.stderr, "== training L0 for {0} took {1:.2f}s ==".format(
                feat, L0_timer.elapsed)

        with Timer() as L1_cv_timer:
            e = Experiment(proxy, l)
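This loop is the first level of a stacked-generalization (stacking) setup: one L0 classifier is trained per feature group, and cross-validated L0 predictions, accumulated in L1_fv, become the feature vectors for an L1 meta-learner. A minimal, self-contained sketch of that pattern, with a dummy majority-class learner standing in for hydrat's liblinear wrapper; all names here are illustrative:

# Schematic of stacking; learners and data structures are illustrative only.
def majority_learner(labels):
    # Dummy L0 "learner": always predicts the majority class.
    majority = max(set(labels), key=labels.count)
    return lambda instance: majority

def stack_l0(feature_groups, labels):
    # Each element of feature_groups is the full instance list rendered
    # in one feature space.
    l0_cl, l1_fv = [], []
    for fg in feature_groups:
        clf = majority_learner(labels)
        l0_cl.append(clf)
        # The real script gathers *cross-validated* predictions here, so
        # the L1 meta-learner never sees outputs of an L0 model that was
        # trained on the same instances.
        l1_fv.append([clf(x) for x in fg])
    return l0_cl, list(zip(*l1_fv))  # one row of L0 outputs per instance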
Example #4
# The opening of this dict is truncated in the snippet; given the lookup
# features.feature_sets[args.feats] in Example #3, it presumably reads:
feature_sets = {
  'core': core,
  'dev': ('headingprev', 'headingvec', 'positionrel'),
}

datasets = [
  ALTA2012Full(),
  ]

if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument('output', help='write output to PATH', metavar='PATH')
  args = parser.parse_args()

  store = Store(args.output, 'a')

  for ds in datasets:
    proxy = DataProxy(ds, store=store)
    proxy.inducer.process(proxy.dataset,
      fms=feats_all,
      cms=['ebmcat',],
      sqs=['abstract',],
      sps=['crossvalidation'],
    )

    proxy.tokenstream_name = 'treetaggerlemmapos'
    proxy.tokenize(ext.bigram)

    proxy.tokenstream_name = 'treetaggerpos'
    proxy.tokenize(ext.bigram)
    proxy.tokenize(ext.trigram)
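This variant differs from Example #2 in two ways: the store path comes from the command line instead of Store.from_caller(), and the inducer additionally materializes the 'crossvalidation' split (sps=['crossvalidation']), which is the split the classification snippet in Example #3 later selects with proxy.split_name = 'crossvalidation'.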
Example #5
def tokenize_extra(ds, store_path):
    """
    Additional feature extraction for features that are not provided by the
    dataset implementation.

    @param ds dataset to read from
    @param store_path path of store to write to
    """
    class_space = 'ebmcat'

    #print >>sys.stderr,  "=== tokenize_extra for {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)

        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.trigram)

    # Hackish workaround for store being unexpectedly closed
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.bigram)
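The two consecutive with closing(...) blocks in Example #5 implement the workaround the comment mentions: the store ends up closed after the first round of tokenize() calls despite recursive_close=False, so it is reopened in append mode for the final bigram pass. For comparison, a sketch of what the function would look like without the workaround, assuming the premature close did not occur:

# Workaround-free version of tokenize_extra(); assumes the store stays
# open across all three tokenize() calls. Logic is otherwise identical.
def tokenize_extra(ds, store_path):
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)

        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.trigram)
        proxy.tokenize(ext.bigram)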