Exemplo n.º 1
0
def main(args):
    """Run feature selection for a model directory and write the result.

    The stripped lines of ``<args.model>/bucketlist`` are passed to
    ``tally_lf``; the size of its result is logged as the number of unique
    features.  Exactly one selector then runs: ``prager_select`` when
    ``args.k`` is not None, otherwise ``tfilf_select`` when ``args.count`` is
    not None.  The selected features are handed to ``write_features``.

    Args:
        args: Parsed command-line namespace.  Attributes read here:
            ``features`` (output path; falls back to
            ``<args.model>/PragerFeats`` when falsy), ``model``, ``jobs``,
            ``k`` and ``count``.

    Raises:
        ValueError: If both ``args.k`` and ``args.count`` are None.
    """
    feature_path = args.features or os.path.join(args.model, 'PragerFeats')
    bucketlist_path = os.path.join(args.model, 'bucketlist')

    # display paths
    logger.info("buckets path: %s", bucketlist_path)
    logger.info("features output path: %s", feature_path)

    with open(bucketlist_path) as f:
        # Build a real list while the file is still open.  The entries are
        # consumed twice below (tally, then selection); a lazy ``map`` object
        # (Python 3) would be read after the file is closed and could only
        # be iterated once.
        bucketlist = [line.strip() for line in f]

    lang_count = tally_lf(bucketlist, args.jobs)
    total_feats = len(lang_count)
    logger.info("unique features: %s", total_feats)

    if args.k is not None:
        logger.info("Prager-style feature selection, k = %s", args.k)
        feats = prager_select(bucketlist, lang_count, args.k, args.jobs)
    elif args.count is not None:
        logger.info("Top-N feature selection using TF-ILF, N = %s", args.count)
        feats = tfilf_select(bucketlist, lang_count, args.count, args.jobs)
    else:
        raise ValueError("no feature selection type specified")

    # An empty tally would otherwise raise ZeroDivisionError here.
    percent = 100. * len(feats) / total_feats if total_feats else 0.
    logger.info("selected features: %d / %d (%.2f%%)",
                len(feats), total_feats, percent)

    write_features(feats, feature_path)
    logger.info('wrote features to "%s"', feature_path)
Exemplo n.º 2
0
def main(args):
  """Select features for the model in ``args.model`` and write them out.

  Lines of ``<args.model>/bucketlist`` are stripped and given to ``tally_lf``.
  Depending on which option is set, ``prager_select`` (``args.k``) or
  ``tfilf_select`` (``args.count``) picks the features, which are then written
  with ``write_features`` to ``args.features`` or, when that is falsy, to
  ``<args.model>/PragerFeats``.

  Args:
    args: Parsed command-line namespace; the attributes ``features``,
      ``model``, ``jobs``, ``k`` and ``count`` are read.

  Raises:
    ValueError: If neither ``args.k`` nor ``args.count`` is set (both None).
  """
  feature_path = args.features or os.path.join(args.model, 'PragerFeats')
  bucketlist_path = os.path.join(args.model, 'bucketlist')

  # display paths
  logger.info("buckets path: %s", bucketlist_path)
  logger.info("features output path: %s", feature_path)

  with open(bucketlist_path) as f:
    # Read everything into a list before the file closes: the list is
    # iterated by both the tally and the selector, which a one-shot lazy
    # ``map`` iterator (Python 3) over a closed file cannot support.
    bucketlist = [line.strip() for line in f]

  lang_count = tally_lf(bucketlist, args.jobs)
  total_feats = len(lang_count)
  logger.info("unique features: %s", total_feats)

  if args.k is not None:
    logger.info("Prager-style feature selection, k = %s", args.k)
    feats = prager_select(bucketlist, lang_count, args.k, args.jobs)
  elif args.count is not None:
    logger.info("Top-N feature selection using TF-ILF, N = %s", args.count)
    feats = tfilf_select(bucketlist, lang_count, args.count, args.jobs)
  else:
    raise ValueError("no feature selection type specified")

  # Report 0% instead of dividing by zero when no features were tallied.
  percent = 100. * len(feats) / total_feats if total_feats else 0.
  logger.info("selected features: %d / %d (%.2f%%)",
              len(feats), total_feats, percent)

  write_features(feats, feature_path)
  logger.info('wrote features to "%s"', feature_path)
Exemplo n.º 3
0
  # Both weight files are looked up inside the model directory.  The output
  # path is args.output when given, otherwise 'LDfeats' in the model directory.
  lang_w_path = os.path.join(args.model, 'IGweights.lang.bin')
  domain_w_path = os.path.join(args.model, 'IGweights.domain')
  feature_path = args.output if args.output else os.path.join(args.model, 'LDfeats')

  # display paths
  if not SILENT:
    print "model path:", args.model
    print "lang weights path:", lang_w_path
    print "domain weights path:", domain_w_path
    print "feature output path:", feature_path

  # The domain weights are not read at all when args.no_domain_ig is set.
  lang_w = read_weights(lang_w_path)
  domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None

  features_per_lang = select_LD_features(lang_w, domain_w, args.feats_per_lang, ignore_domain=args.no_domain_ig)
  if args.per_lang:
    # Optionally dump the per-language selection as CSV to
    # '<feature_path>.perlang': one row per index, each feature as its repr().
    with open(feature_path + '.perlang', 'w') as f:
      writer = csv.writer(f)
      # NOTE(review): indexing with 0..len-1 presumes features_per_lang is
      # keyed by consecutive integers from 0 (it is treated as a mapping via
      # .values() below) -- confirm against select_LD_features.
      for i in range(len(features_per_lang)):
        writer.writerow(map(repr,features_per_lang[i]))
      

  # Union of the features selected for every entry of features_per_lang.
  final_feature_set = reduce(set.union, map(set, features_per_lang.values()))
  if not SILENT:
    print 'selected %d features' % len(final_feature_set)

  # The features are written in sorted order.
  write_features(sorted(final_feature_set), feature_path)
  if not SILENT:
    print 'wrote features to "%s"' % feature_path 

Exemplo n.º 4
0
    print "features output path:", feature_path
    # Report the settings of whichever selection mode is used further down.
    if args.tokens_per_order:
        print "max ngram order:", args.max_order
        print "tokens per order:", args.tokens_per_order
    else:
        print "tokens:", args.tokens

    # Read the bucket list, stripping surrounding whitespace from each line.
    # NOTE(review): this relies on map() returning a list (Python 2, in line
    # with the print statements used here); a lazy map would only be read
    # after the file has been closed.
    with open(bucketlist_path) as f:
        bucketlist = map(str.strip, f)

    doc_count = tally(bucketlist, args.jobs)
    print "unique features:", len(doc_count)
    if args.doc_count:
        # When args.doc_count equals True the counts go to the default
        # location ('DF_all' in the model directory); any other truthy value
        # is used as the output path itself.
        doc_count_path = os.path.join(
            args.model, 'DF_all') if args.doc_count == True else args.doc_count
        write_weights(doc_count, doc_count_path)
        print "wrote DF counts for all features to:", doc_count_path

    if args.tokens_per_order:
        # Choose a number of features for each length of token
        feats = ngram_select(doc_count, args.max_order, args.tokens_per_order)
    else:
        # Choose a number of features overall: take the args.tokens keys with
        # the largest doc_count values, then sort those keys.
        feats = sorted(
            sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens])
    print "selected features: ", len(feats)

    write_features(feats, feature_path)
    print 'wrote features to "%s"' % feature_path
Exemplo n.º 5
0
  print "features output path:", feature_path
  # Echo the parameters of the selection mode chosen by args.tokens_per_order.
  if args.tokens_per_order:
    print "max ngram order:", args.max_order
    print "tokens per order:", args.tokens_per_order
  else:
    print "tokens:", args.tokens

  # Each line of the bucket list file is stripped of surrounding whitespace.
  # NOTE(review): bucketlist is used after the file is closed, which needs
  # map() to return a list (Python 2 behaviour, matching the print statements).
  with open(bucketlist_path) as f:
    bucketlist = map(str.strip, f)

  doc_count = tally(bucketlist, args.jobs)
  print "unique features:", len(doc_count)
  if args.doc_count:
    # args.doc_count == True means "write to the default location", i.e.
    # 'DF_all' in the model directory; otherwise its value is taken as the path.
    doc_count_path = os.path.join(args.model, 'DF_all') if args.doc_count == True else args.doc_count
    write_weights(doc_count, doc_count_path)
    print "wrote DF counts for all features to:", doc_count_path

  if args.tokens_per_order:
    # Choose a number of features for each length of token
    feats = ngram_select(doc_count, args.max_order, args.tokens_per_order)
  else:
    # Choose a number of features overall: the args.tokens keys ranked highest
    # by their doc_count value, returned in sorted key order.
    feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] )
  print "selected features: ", len(feats)

  write_features(feats, feature_path)
  print 'wrote features to "%s"' % feature_path 

  
Exemplo n.º 6
0
        # we want full debug output.
        if DFfeats is None or args.debug:
            # Compute DF per-feature
            doc_count = tally(b_dirs, args.jobs)
            if args.debug:
                # Debug mode: also save the counts as 'DF_all' in the model
                # directory.
                doc_count_path = os.path.join(model_dir, "DF_all")
                write_weights(doc_count, doc_count_path)
                print "wrote DF counts for all features to:", doc_count_path

        if DFfeats is None:
            # Choose the first-stage features
            DFfeats = ngram_select(doc_count, args.max_order, args.df_tokens)

        if args.debug:
            # Debug mode: also save the first-stage features as 'DFfeats' in
            # the model directory.
            feature_path = os.path.join(model_dir, "DFfeats")
            write_features(DFfeats, feature_path)
            print 'wrote features to "%s"' % feature_path

        # Dispose of the first-pass tokenize output as it is no longer
        # needed.  In debug mode the directories are kept.
        if not args.debug:
            for b in b_dirs:
                shutil.rmtree(b)

        # Second-pass tokenization to only obtain counts for the selected features.
        # As the first-pass set is typically much larger than the second pass, it often
        # works out to be faster to retokenize the raw documents rather than iterate
        # over the first-pass counts.
        DF_scanner = Scanner(DFfeats)
        # b_dirs is rebound to the result of this second build_index run.
        b_dirs = build_index(items, DF_scanner, buckets_dir, args.buckets, args.jobs, args.chunksize)
"""
Select the N most highly-weighted features from each of any number of files
and write the union of those selections.

Marco Lui, February 2013
"""

import argparse

from common import read_weights, write_features

if __name__ == "__main__":
  # Command line: an output path followed by zero or more weight files.
  parser = argparse.ArgumentParser()
  parser.add_argument("-n","--number", type=int, default=200, metavar='N',
    help="keep top N features per file")
  parser.add_argument("output", metavar='PATH', help="output to PATH")
  parser.add_argument("files", metavar="FILE", nargs='*', help="read weighted features from FILE")
  args = parser.parse_args()

  # No file handle is opened here: write_features is given the output path
  # itself.  (A handle on args.output used to be opened at this point but was
  # never used or closed, and its fallback referenced the unimported `sys`.)
  feats = set()

  for path in args.files:
    w = read_weights(path)
    # Add the args.number keys with the highest weights from this file.
    feats.update(sorted(w, key=w.get, reverse=True)[:args.number])

  # The union over all files is written in sorted order.
  write_features(sorted(feats), args.output)
Exemplo n.º 8
0
    # Weight files are read from the model directory; features are written to
    # args.output if it is set, else to 'LDfeats' inside the model directory.
    lang_w_path = os.path.join(args.model, 'IGweights.lang.bin')
    domain_w_path = os.path.join(args.model, 'IGweights.domain')
    feature_path = args.output if args.output else os.path.join(
        args.model, 'LDfeats')

    # display paths
    print "model path:", args.model
    print "lang weights path:", lang_w_path
    print "domain weights path:", domain_w_path
    print "feature output path:", feature_path

    # With args.no_domain_ig set, the domain weights file is never opened and
    # None is passed on instead.
    lang_w = read_weights(lang_w_path)
    domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None

    features_per_lang = select_LD_features(lang_w,
                                           domain_w,
                                           args.feats_per_lang,
                                           ignore_domain=args.no_domain_ig)
    if args.per_lang:
        # Write the per-language selection as CSV to '<feature_path>.perlang',
        # one row per index with each feature stored as its repr().
        with open(feature_path + '.perlang', 'w') as f:
            writer = csv.writer(f)
            # NOTE(review): assumes the keys of features_per_lang are the
            # integers 0..len-1 (it is used as a mapping via .values() below)
            # -- confirm against select_LD_features.
            for i in range(len(features_per_lang)):
                writer.writerow(map(repr, features_per_lang[i]))

    # Merge the selections of all entries into one set of features.
    final_feature_set = reduce(set.union, map(set, features_per_lang.values()))
    print 'selected %d features' % len(final_feature_set)

    # Output is written in sorted order.
    write_features(sorted(final_feature_set), feature_path)
    print 'wrote features to "%s"' % feature_path
Exemplo n.º 9
0
        # we want full debug output.
        if DFfeats is None or args.debug:
            # Compute DF per-feature
            doc_count = tally(b_dirs, args.jobs)
            if args.debug:
                # With debugging on, the counts are also written to 'DF_all'
                # under model_dir.
                doc_count_path = os.path.join(model_dir, 'DF_all')
                write_weights(doc_count, doc_count_path)
                print "wrote DF counts for all features to:", doc_count_path

        if DFfeats is None:
            # Choose the first-stage features
            DFfeats = ngram_select(doc_count, args.max_order, args.df_tokens)

        if args.debug:
            # With debugging on, the first-stage features are also written to
            # 'DFfeats' under model_dir.
            feature_path = os.path.join(model_dir, 'DFfeats')
            write_features(DFfeats, feature_path)
            print 'wrote features to "%s"' % feature_path

        # Dispose of the first-pass tokenize output as it is no longer
        # needed.  The directories are left in place when debugging.
        if not args.debug:
            for b in b_dirs:
                shutil.rmtree(b)

        # Second-pass tokenization to only obtain counts for the selected features.
        # As the first-pass set is typically much larger than the second pass, it often
        # works out to be faster to retokenize the raw documents rather than iterate
        # over the first-pass counts.
        DF_scanner = Scanner(DFfeats)
        # The second build_index result replaces the first-pass b_dirs.
        b_dirs = build_index(items, DF_scanner, buckets_dir, args.buckets,
                             args.jobs, args.chunksize)