def main():
    """CLI entry point: build feature-generation pipelines over instances.

    Sub-commands:
      serialize -- pull earmark and entity ids from the DB, build matching
                   instances, run Jaccard then ranking/difference feature
                   generators, and pickle the result to --data.
      add       -- load pickled instances from --data, run the infix
                   feature generator, and re-pickle them in place.
    """
    parser = argparse.ArgumentParser(description='get pickled instances')
    subparsers = parser.add_subparsers(dest='subparser_name', help='sub-command help')

    parser_serialize = subparsers.add_parser('serialize', help='pickle instances')
    parser_serialize.add_argument('--data', required=True, help='path to output pickled files')
    parser_serialize.add_argument('--threads', type=int, default=1,
                                  help='number of threads to run in parallel')

    parser_add = subparsers.add_parser('add', help='add to pickled instances')
    parser_add.add_argument('--data', required=True, help='path to output pickled files')
    parser_add.add_argument('--threads', type=int, default=1,
                            help='number of threads to run in parallel')

    args = parser.parse_args()
    logging.info("pid: " + str(os.getpid()))

    if args.subparser_name == "serialize":
        earmark_ids = list(get_earmarks_from_db())
        logging.info("Got %d earmarks" % len(earmark_ids))
        entity_ids = list(get_entities_from_db())
        logging.info("Got %d entities" % len(entity_ids))
        instances = get_matching_instances(entity_ids, earmark_ids,
                                           get_earmark_entity_tuples(), args.threads)
        logging.info("Got %d instances" % len(instances))

        logging.info("Creating pipe")
        fgs = [
            JaccardFeatureGenerator(),
        ]
        # Fix: the original hard-coded num_processes=1 here, silently ignoring
        # the --threads flag this sub-command declares; honor it like the
        # 'add' sub-command does.
        pipe = Pipe(fgs, instances, num_processes=args.threads)
        logging.info("Pushing into pipe")
        pipe.push_all_parallel()

        # Group by earmark and document, then rank/difference the Jaccard scores.
        pairs = [("JACCARD_FG", "JACCARD_FG_max_inferred_name_jaccard"),
                 ("JACCARD_FG", "JACCARD_FG_max_cell_jaccard")]
        fgs = [
            RankingFeatureGenerator(pairs=pairs),
            DifferenceFeatureGenerator(pairs=pairs),
        ]
        grouper = InstancesGrouper(['earmark_id', 'document_id'])
        # NOTE(review): kept num_processes=1 for BlocksPipe as in the original;
        # confirm whether grouped pushes are safe to parallelize before
        # wiring --threads through here as well.
        pipe = BlocksPipe(grouper, fgs, pipe.instances, num_processes=1)
        pipe.push_all_parallel()

        # Serialize
        logging.info("Start Serializing")
        serialize_instances(pipe.instances, args.data)
        logging.info("Done!")
    elif args.subparser_name == "add":
        instances = load_instances(args.data)
        logging.info("Creating pipe")
        fgs = [
            InfixFeatureGenerator(),
        ]
        pipe = Pipe(fgs, instances, num_processes=args.threads)
        logging.info("Pushing into pipe")
        pipe.push_all_parallel()

        # Serialize
        logging.info("Start Serializing")
        serialize_instances(pipe.instances, args.data)
        logging.info("Done!")
def folder_to_scipy(folder_path, feature_space=None):
    """Load pickled instances from *folder_path* and convert them to a matrix.

    Returns the ``(x, y, feature_space)`` triple produced by
    ``pipe.instances_to_matrix`` with dense output. Pass an existing
    *feature_space* to reuse a previously built feature mapping.

    NOTE(review): relies on a module-level ``pipe`` name (presumably an
    imported module) that is not visible in this chunk -- confirm it is in
    scope at file top.
    """
    loaded = load_instances(folder_path)
    return pipe.instances_to_matrix(loaded, feature_space=feature_space, dense=True)