def serialize_student_group(students, data_folder):
    """Create a comparison instance for every unordered pair of students,
    push the instances through the feature-generation pipe, and serialize
    the results to *data_folder*.

    Args:
        students: sequence of student records; every pair (i, j) with i < j
            is turned into one instance via ``get_instance``.
        data_folder: output path handed to ``serialize_instances``.

    Returns:
        None. Logs a warning and returns early when no instances were
        produced (i.e. fewer than two students).
    """
    # All unordered pairs (i < j) — same ordering the original nested
    # range loops produced.
    instances = [
        get_instance(students[i], students[j])
        for i in range(len(students))
        for j in range(i + 1, len(students))
    ]
    # Lazy %-args: the message is only formatted if INFO is enabled.
    logging.info("Created %d instances", len(instances))
    if not instances:
        # logging.warn() is a deprecated alias; warning() is the
        # supported spelling.
        logging.warning("FAILED TO GENERATE INSTANCES!")
        return
    fgs = [
        IsSameFeatureGenerator(fields=['ZipCode', 'Gender', 'Language',
                                       'HomeLanguage', 'BirthCountry',
                                       'Race', 'Food', 'ESL', 'LEP',
                                       'SpecialED', 'CatchmentSchool',
                                       'ThisGradeSchoolKey']),
        AbsoluteDifferenceFeatureGenerator(fields=['GPA', 'EighthMathISAT',
                                                   'EighthReadingISAT',
                                                   'AttendanceRate']),
        DistanceFeatureGenerator(),
        OtherFeaturesFeatureGenerator(),
    ]
    # Fix: the original had a duplicated assignment (`pipe = pipe = Pipe(...)`)
    # and a stray trailing `"""` literal after the final call.
    pipe = Pipe(fgs, instances, num_processes=1)
    pipe.push_all_parallel()
    serialize_instances(pipe.instances, data_folder)
def main():
    """Command-line entry point.

    Sub-commands:
        serialize -- build earmark/entity matching instances from the DB,
                     run the jaccard + ranking feature generators, and
                     pickle the result to --data.
        add       -- load previously pickled instances from --data, run the
                     infix feature generator, and pickle them back.
    """
    parser = argparse.ArgumentParser(description='get pickeled instances')
    subparsers = parser.add_subparsers(dest='subparser_name',
                                       help='sub-command help')

    serialize_cmd = subparsers.add_parser('serialize',
                                          help='pickle instances')
    serialize_cmd.add_argument('--data', required=True,
                               help='path to output pickled files')
    serialize_cmd.add_argument('--threads', type=int, default=1,
                               help='number of threads to run in parallel')

    add_cmd = subparsers.add_parser('add', help='add to pickled instances')
    add_cmd.add_argument('--data', required=True,
                         help='path to output pickled files')
    add_cmd.add_argument('--threads', type=int, default=1,
                         help='number of threads to run in parallel')

    args = parser.parse_args()
    logging.info("pid: " + str(os.getpid()))

    if args.subparser_name == "serialize":
        earmark_ids = list(get_earmarks_from_db())
        logging.info("Got %d earmarks" % len(earmark_ids))
        entity_ids = list(get_entities_from_db())
        logging.info("Got %d entities" % len(entity_ids))
        instances = get_matching_instances(entity_ids, earmark_ids,
                                           get_earmark_entity_tuples(),
                                           args.threads)
        logging.info("Got %d instances" % len(instances))

        logging.info("Creating pipe")
        pipe = Pipe([JaccardFeatureGenerator()], instances, num_processes=1)
        logging.info("Pushing into pipe")
        pipe.push_all_parallel()

        # Second pass: group instances by (earmark, document) and add
        # rank/difference features over the jaccard scores.
        pairs = [("JACCARD_FG", "JACCARD_FG_max_inferred_name_jaccard"),
                 ("JACCARD_FG", "JACCARD_FG_max_cell_jaccard")]
        block_fgs = [RankingFeatureGenerator(pairs=pairs),
                     DifferenceFeatureGenerator(pairs=pairs)]
        grouper = InstancesGrouper(['earmark_id', 'document_id'])
        pipe = BlocksPipe(grouper, block_fgs, pipe.instances,
                          num_processes=1)
        pipe.push_all_parallel()

        logging.info("Start Serializing")
        serialize_instances(pipe.instances, args.data)
        logging.info("Done!")
    elif args.subparser_name == "add":
        instances = load_instances(args.data)
        logging.info("Creating pipe")
        pipe = Pipe([InfixFeatureGenerator()], instances,
                    num_processes=args.threads)
        logging.info("Pushing into pipe")
        pipe.push_all_parallel()
        logging.info("Start Serializing")
        serialize_instances(pipe.instances, args.data)
        logging.info("Done!")