def main(args):
    """Select features from the bucketed counts and write them to disk.

    The bucket list is read from ``<args.model>/bucketlist`` and passed to
    ``tally_lf``.  Features are then chosen with ``prager_select`` when
    ``args.k`` is set, or with ``tfilf_select`` when ``args.count`` is set,
    and written with ``write_features`` to ``args.features`` (falling back
    to ``<args.model>/PragerFeats``).

    Raises:
        ValueError: if neither ``args.k`` nor ``args.count`` is set.
    """
    if args.features:
        feature_path = args.features
    else:
        feature_path = os.path.join(args.model, 'PragerFeats')
    bucketlist_path = os.path.join(args.model, 'bucketlist')

    # display paths
    logger.info("buckets path: %s", bucketlist_path)
    logger.info("features output path: %s", feature_path)

    # Build a real list while the file is still open: the bucket list is
    # iterated twice below, and a lazy map() would be exhausted (or would
    # read from a closed file) on Python 3.
    with open(bucketlist_path) as f:
        bucketlist = [line.strip() for line in f]

    lang_count = tally_lf(bucketlist, args.jobs)
    total_feats = len(lang_count)
    logger.info("unique features: %s", total_feats)

    if args.k is not None:
        logger.info("Prager-style feature selection, k = %s", args.k)
        feats = prager_select(bucketlist, lang_count, args.k, args.jobs)
    elif args.count is not None:
        logger.info("Top-N feature selection using TF-ILF, N = %s", args.count)
        feats = tfilf_select(bucketlist, lang_count, args.count, args.jobs)
    else:
        raise ValueError("no feature selection type specified")

    # Guard the ratio so an empty tally does not abort before the write.
    if total_feats:
        selected_pct = 100. * len(feats) / total_feats
    else:
        selected_pct = 0.
    logger.info("selected features: %d / %d (%.2f%%)",
                len(feats), total_feats, selected_pct)

    write_features(feats, feature_path)
    logger.info('wrote features to "%s"', feature_path)
def main(args):
    """Run feature selection over the model's buckets and save the result.

    Reads ``<args.model>/bucketlist``, calls ``tally_lf`` on it, and selects
    features via ``prager_select`` (``args.k`` set) or ``tfilf_select``
    (``args.count`` set).  The selected features are written with
    ``write_features`` to ``args.features`` or, when that is unset, to
    ``<args.model>/PragerFeats``.

    Raises:
        ValueError: if neither ``args.k`` nor ``args.count`` is set.
    """
    if args.features:
        feature_path = args.features
    else:
        feature_path = os.path.join(args.model, 'PragerFeats')
    bucketlist_path = os.path.join(args.model, 'bucketlist')

    # display paths
    logger.info("buckets path: %s", bucketlist_path)
    logger.info("features output path: %s", feature_path)

    # The bucket list is used by both the tally and the selector, so it must
    # be a concrete list read before the file is closed (map() is lazy on
    # Python 3).
    with open(bucketlist_path) as f:
        bucketlist = [line.strip() for line in f]

    lang_count = tally_lf(bucketlist, args.jobs)
    total_feats = len(lang_count)
    logger.info("unique features: %s", total_feats)

    if args.k is not None:
        logger.info("Prager-style feature selection, k = %s", args.k)
        feats = prager_select(bucketlist, lang_count, args.k, args.jobs)
    elif args.count is not None:
        logger.info("Top-N feature selection using TF-ILF, N = %s", args.count)
        feats = tfilf_select(bucketlist, lang_count, args.count, args.jobs)
    else:
        raise ValueError("no feature selection type specified")

    # Avoid ZeroDivisionError in the summary line when nothing was tallied.
    selected_pct = 100. * len(feats) / total_feats if total_feats else 0.
    logger.info("selected features: %d / %d (%.2f%%)",
                len(feats), total_feats, selected_pct)

    write_features(feats, feature_path)
    logger.info('wrote features to "%s"', feature_path)
lang_w_path = os.path.join(args.model, 'IGweights.lang.bin') domain_w_path = os.path.join(args.model, 'IGweights.domain') feature_path = args.output if args.output else os.path.join(args.model, 'LDfeats') # display paths if not SILENT: print "model path:", args.model print "lang weights path:", lang_w_path print "domain weights path:", domain_w_path print "feature output path:", feature_path lang_w = read_weights(lang_w_path) domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None features_per_lang = select_LD_features(lang_w, domain_w, args.feats_per_lang, ignore_domain=args.no_domain_ig) if args.per_lang: with open(feature_path + '.perlang', 'w') as f: writer = csv.writer(f) for i in range(len(features_per_lang)): writer.writerow(map(repr,features_per_lang[i])) final_feature_set = reduce(set.union, map(set, features_per_lang.values())) if not SILENT: print 'selected %d features' % len(final_feature_set) write_features(sorted(final_feature_set), feature_path) if not SILENT: print 'wrote features to "%s"' % feature_path
print "features output path:", feature_path if args.tokens_per_order: print "max ngram order:", args.max_order print "tokens per order:", args.tokens_per_order else: print "tokens:", args.tokens with open(bucketlist_path) as f: bucketlist = map(str.strip, f) doc_count = tally(bucketlist, args.jobs) print "unique features:", len(doc_count) if args.doc_count: # The constant true is used to indicate output to default location doc_count_path = os.path.join( args.model, 'DF_all') if args.doc_count == True else args.doc_count write_weights(doc_count, doc_count_path) print "wrote DF counts for all features to:", doc_count_path if args.tokens_per_order: # Choose a number of features for each length of token feats = ngram_select(doc_count, args.max_order, args.tokens_per_order) else: # Choose a number of features overall feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens]) print "selected features: ", len(feats) write_features(feats, feature_path) print 'wrote features to "%s"' % feature_path
print "features output path:", feature_path if args.tokens_per_order: print "max ngram order:", args.max_order print "tokens per order:", args.tokens_per_order else: print "tokens:", args.tokens with open(bucketlist_path) as f: bucketlist = map(str.strip, f) doc_count = tally(bucketlist, args.jobs) print "unique features:", len(doc_count) if args.doc_count: # The constant true is used to indicate output to default location doc_count_path = os.path.join(args.model, 'DF_all') if args.doc_count == True else args.doc_count write_weights(doc_count, doc_count_path) print "wrote DF counts for all features to:", doc_count_path if args.tokens_per_order: # Choose a number of features for each length of token feats = ngram_select(doc_count, args.max_order, args.tokens_per_order) else: # Choose a number of features overall feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] ) print "selected features: ", len(feats) write_features(feats, feature_path) print 'wrote features to "%s"' % feature_path
# we want full debug output. if DFfeats is None or args.debug: # Compute DF per-feature doc_count = tally(b_dirs, args.jobs) if args.debug: doc_count_path = os.path.join(model_dir, "DF_all") write_weights(doc_count, doc_count_path) print "wrote DF counts for all features to:", doc_count_path if DFfeats is None: # Choose the first-stage features DFfeats = ngram_select(doc_count, args.max_order, args.df_tokens) if args.debug: feature_path = os.path.join(model_dir, "DFfeats") write_features(DFfeats, feature_path) print 'wrote features to "%s"' % feature_path # Dispose of the first-pass tokenize output as it is no longer # needed. if not args.debug: for b in b_dirs: shutil.rmtree(b) # Second-pass tokenization to only obtain counts for the selected features. # As the first-pass set is typically much larger than the second pass, it often # works out to be faster to retokenize the raw documents rather than iterate # over the first-pass counts. DF_scanner = Scanner(DFfeats) b_dirs = build_index(items, DF_scanner, buckets_dir, args.buckets, args.jobs, args.chunksize)
"""
Select the most highly-weighted N features across any number of files.

Marco Lui, February 2013
"""

import argparse

from common import read_weights, write_features

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--number", type=int, default=200, metavar='N', help="keep top N features per file")
    parser.add_argument("output", metavar='PATH', help="output to PATH")
    parser.add_argument("files", metavar="FILE", nargs='*', help="read weighted features from FILE")
    args = parser.parse_args()

    # write_features() is handed the output path itself, so no file handle
    # is opened here; an unused handle would only leak a descriptor.
    feats = set()
    for path in args.files:
        w = read_weights(path)
        # Keep the args.number highest-weighted features of this file.
        feats.update(sorted(w, key=w.get, reverse=True)[:args.number])

    # The union over all files is written in sorted order.
    write_features(sorted(feats), args.output)
lang_w_path = os.path.join(args.model, 'IGweights.lang.bin') domain_w_path = os.path.join(args.model, 'IGweights.domain') feature_path = args.output if args.output else os.path.join( args.model, 'LDfeats') # display paths print "model path:", args.model print "lang weights path:", lang_w_path print "domain weights path:", domain_w_path print "feature output path:", feature_path lang_w = read_weights(lang_w_path) domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None features_per_lang = select_LD_features(lang_w, domain_w, args.feats_per_lang, ignore_domain=args.no_domain_ig) if args.per_lang: with open(feature_path + '.perlang', 'w') as f: writer = csv.writer(f) for i in range(len(features_per_lang)): writer.writerow(map(repr, features_per_lang[i])) final_feature_set = reduce(set.union, map(set, features_per_lang.values())) print 'selected %d features' % len(final_feature_set) write_features(sorted(final_feature_set), feature_path) print 'wrote features to "%s"' % feature_path
# we want full debug output. if DFfeats is None or args.debug: # Compute DF per-feature doc_count = tally(b_dirs, args.jobs) if args.debug: doc_count_path = os.path.join(model_dir, 'DF_all') write_weights(doc_count, doc_count_path) print "wrote DF counts for all features to:", doc_count_path if DFfeats is None: # Choose the first-stage features DFfeats = ngram_select(doc_count, args.max_order, args.df_tokens) if args.debug: feature_path = os.path.join(model_dir, 'DFfeats') write_features(DFfeats, feature_path) print 'wrote features to "%s"' % feature_path # Dispose of the first-pass tokenize output as it is no longer # needed. if not args.debug: for b in b_dirs: shutil.rmtree(b) # Second-pass tokenization to only obtain counts for the selected features. # As the first-pass set is typically much larger than the second pass, it often # works out to be faster to retokenize the raw documents rather than iterate # over the first-pass counts. DF_scanner = Scanner(DFfeats) b_dirs = build_index(items, DF_scanner, buckets_dir, args.buckets, args.jobs, args.chunksize)