Example #1
0
def run(geoentities):

    try:

        print "starting ai.predict"
        connection.close()

        start = datetime.now()        

        classifier = DNNLinearCombinedClassifier(
            model_dir=MODEL_DIR,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=[100,50]
        )
        print "classifier:", classifier

        print "creating the classifier took", (datetime.now() - start).total_seconds(), "seconds"

        df = get_fake_df()
        print "about to populate data frame for prediction"
        start_df = datetime.now()
        
        for index, geoentity in enumerate(geoentities):
            place_id = geoentity.place_id
            name = geoentity.target

	    feature_admin_levels = set([g.admin_level for g in geoentities if g.admin_level and g.target == name])
            if feature_admin_levels:
                lowest_admin_level = min(feature_admin_levels)
            else:
                lowest_admin_level = -99

            population = g.population
            is_highest_population = population and population == max([g.population for g in geoentities if g.target == name]) or False

            admin_level = geoentity.admin_level
            df['admin_level'].append(str(geoentity.admin_level or "None"))
            df['cluster_frequency'].append(geoentity.cluster_frequency or 0)
            df['country_code'].append(geoentity.country_code or "UNKNOWN")
            df['country_rank'].append(geoentity.country_rank or 999)
            df['edit_distance'].append(str(geoentity.edit_distance))
            df['has_mpoly'].append(str(geoentity.has_mpoly or False))
            df['has_pcode'].append(str(geoentity.has_pcode or False))
            df['is_country'].append(str(admin_level == 0))
            df['is_lowest_admin_level'].append(str(lowest_admin_level == g.admin_level))
            df['is_highest_population'].append(str(is_highest_population))
            df['median_distance'].append(geoentity.median_distance_from_all_other_points)
            df['matches_topic'].append(str(geoentity.matches_topic or "False"))
            df['population'].append(geoentity.population)
            df['popularity'].append(geoentity.popularity)

        print "populating df took", ((datetime.now() - start_df).total_seconds() / 60), "minutes"

        for index, row in enumerate(classifier.predict_proba(input_fn=lambda: input_fn(df))):
            geoentities[index].probability = row[1]

    except Exception as e:
        fail("EXCPETION in scripts.ai.predict.run: " + str(e))
Example #2
0
def train():
    try:

        start = datetime.now()

        print "starting appbkto.scripts.predict.train"
        connection.close()
        features = list(Feature.objects.filter(verified=True).values("id","featureplace__id","featureplace__place__admin_level","featureplace__correct","featureplace__place_id","featureplace__cluster_frequency","featureplace__place__country_code","featureplace__country_rank","featureplace__place__mpoly","featureplace__place__pcode","featureplace__popularity","featureplace__place__population","featureplace__median_distance","featureplace__place__topic_id","topic_id"))
        print "features:", type(features), len(features)

        rmtree(MODEL_DIR, ignore_errors=True)

        print "creating classifier"
        classifier = DNNLinearCombinedClassifier(
            model_dir=MODEL_DIR,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=[100,50]
        )
        print "classifier:", classifier

        number_of_features = len(features)

        print "training with real data"

        print "shuffle the features"
        shuffle(features)

        half = number_of_features / 2
        print "half is", half

        df_train = get_df_from_features(features[:half])
        df_test = get_df_from_features(features[half:])

        for filename in listdir(PATH_TO_DIRECTORY_OF_INPUT_DATA):
            print "filename for import :", filename
            if filename.endswith(".csv"):
                df = get_df_from_csv(PATH_TO_DIRECTORY_OF_INPUT_DATA + "/" + filename)
                #print "loaded", filename, "into", df
                half = len(df.values()[0]) / 2
                print "half:", half
                for column_name in df:
                    if type(df_train[column_name][0]) != type(df[column_name][0]):
                        print "mismatch type for ", column_name
                        print "type(df_train[column_name][0]):", type(df_train[column_name][0])
                        print "type(df[column_name][:half][0]):", type(df[column_name][0])
                    df_train[column_name] = df[column_name][:half]
                    df_test[column_name] = df[column_name][half:]
 
        print "fitting"
        try:
            classifier.fit(input_fn=lambda: input_fn(df_train), steps=200)
        except Exception as e:
            fail("EXCEPTION fitting model in scripts.ai.predict.train: " + str(e))
        print "\nfitted"
        results = classifier.evaluate(input_fn=lambda: input_fn(df_test), steps=10)
        for key in sorted(results):
            print("%s: %s" % (key, results[key]))

        print "took", ((datetime.now() - start).total_seconds() / 60), "minutes to train"

    except Exception as e:
        fail("EXCEPTION in ai.predict.train: " + str(e))