def get_features(instances, num_processes):
    """Run two feature-generation passes over ``instances``.

    The first pass pushes ``instances`` through a ``Pipe`` holding a
    ``JaccardFeatureGenerator`` and an ``InfixFeatureGenerator``.  The second
    pass hands the first pass's ``instances`` attribute to a ``BlocksPipe``
    built with ``InstancesGrouper(["earmark_id"])`` and two
    ``RankingFeatureGenerator`` objects, both created with the ``"G1_"``
    prefix.

    Args:
        instances: Passed unchanged to the first ``Pipe``.
        num_processes: Forwarded as ``num_processes`` to both pipes.

    Returns:
        The ``instances`` attribute of the second (blocks) pipe.
    """
    logging.info("Creating pipe")

    pipeline = Pipe(
        [JaccardFeatureGenerator(), InfixFeatureGenerator()],
        instances,
        num_processes=num_processes,
    )
    pipeline.push_all_parallel()

    # group by earmark: one ranking generator per Jaccard feature, in order
    ranked_feature_names = (
        "JACCARD_FG_max_inferred_name_jaccard",
        "JACCARD_FG_max_cell_jaccard",
    )
    ranking_generators = [
        RankingFeatureGenerator(feature_group="JACCARD_FG", feature=feature_name, prefix="G1_")
        for feature_name in ranked_feature_names
    ]
    earmark_grouper = InstancesGrouper(["earmark_id"])
    pipeline = BlocksPipe(
        earmark_grouper,
        ranking_generators,
        pipeline.instances,
        num_processes=num_processes,
    )
    pipeline.push_all_parallel()

    return pipeline.instances
# Example #2
# 0
def serialize_student_group(students, data_folder):
    """Build pairwise instances for ``students``, add features, and serialize them.

    One instance is created with ``get_instance`` for every index pair
    ``(i, j)`` with ``i < j``.  The instances are pushed through a
    single-process ``Pipe`` holding the feature generators listed below, and
    the pipe's ``instances`` are handed to ``serialize_instances``.

    Args:
        students: Sized, index-addressable collection; its elements are
            passed to ``get_instance`` as-is.
        data_folder: Passed through to ``serialize_instances``.

    Returns:
        None.  When no pair can be formed (fewer than two students) a
        warning is logged and nothing is serialized.
    """
    student_count = len(students)
    instances = [
        get_instance(students[i], students[j])
        for i in range(student_count)
        for j in range(i + 1, student_count)
    ]
    logging.info("Created %d instances", len(instances))
    if not instances:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling and emits the same record.
        logging.warning("FAILED TO GENERATE INSTANCES!")
        return

    fgs = [
        IsSameFeatureGenerator(
            fields=[
                'ZipCode', 'Gender', 'Language', 'HomeLanguage',
                'BirthCountry', 'Race', 'Food', 'ESL', 'LEP', 'SpecialED',
                'CatchmentSchool', 'ThisGradeSchoolKey',
            ]
        ),
        AbsoluteDifferenceFeatureGenerator(
            fields=['GPA', 'EighthMathISAT', 'EighthReadingISAT', 'AttendanceRate']
        ),
        DistanceFeatureGenerator(),
        OtherFeaturesFeatureGenerator(),
    ]
    pipe = Pipe(fgs, instances, num_processes=1)
    pipe.push_all_parallel()
    serialize_instances(pipe.instances, data_folder)
def get_features(instances, num_processes):
    """Generate features for ``instances`` in two stages.

    Stage one runs a ``Pipe`` with ``JaccardFeatureGenerator`` and
    ``InfixFeatureGenerator`` over ``instances``.  Stage two runs a
    ``BlocksPipe`` over the stage-one ``instances`` attribute, using
    ``InstancesGrouper(['earmark_id'])`` and two ``RankingFeatureGenerator``
    objects.

    Args:
        instances: Handed to the stage-one ``Pipe`` as-is.
        num_processes: Passed as ``num_processes`` to both pipes.

    Returns:
        The ``instances`` attribute of the stage-two pipe.
    """

    def make_ranker(feature):
        # Both rankers share the same feature group and prefix.
        return RankingFeatureGenerator(feature_group="JACCARD_FG", feature=feature, prefix='G1_')

    logging.info("Creating pipe")

    base_generators = [JaccardFeatureGenerator(), InfixFeatureGenerator()]
    stage = Pipe(base_generators, instances, num_processes=num_processes)
    stage.push_all_parallel()

    # group by earmark
    rankers = [
        make_ranker("JACCARD_FG_max_inferred_name_jaccard"),
        make_ranker("JACCARD_FG_max_cell_jaccard"),
    ]
    by_earmark = InstancesGrouper(['earmark_id'])
    stage = BlocksPipe(by_earmark, rankers, stage.instances, num_processes=num_processes)
    stage.push_all_parallel()

    return stage.instances