示例#1
0
def feature_selection(activity_threshold=3):
    """Train classifier on DonorsChoose set given a label to choose most important features.

    Census and NCES columns are joined for every DonorsChoose district, rows
    with missing values are dropped, and the numeric columns are handed to
    ``feature_importance.importance`` together with a boolean label.

    INPUT:
        activity_threshold: districts whose ``activity`` is strictly greater
            than this value are labelled True, all others False (default 3).
    OUTPUT: whatever ``feature_importance.importance`` returns. The original
        docstring describes this as the list of most important columns --
        TODO confirm against that helper.
    """

    dc_districts = get_donorschoose.districts()
    dc_index = dc_districts.index

    # Restrict the census table to the districts known to DonorsChoose.
    census = get_census.all_states()
    census = census.loc[dc_index].copy()

    columns = ["STNAME", "LATCOD", "LONCOD",
               "TOTALREV", "TFEDREV", "TSTREV", "TLOCREV", "TOTALEXP", "TCURSSVC", "TCAPOUT",
               "Z32", "Z34", "Z35", "HR1", "HE1", "HE2"]
    nces = get_nces.districts(columns=columns, nonneg=True)

    # Side-by-side join on the district index; any district with a missing
    # value in either source is dropped.
    data = pd.concat([census, nces.loc[census.index]], axis=1)
    data.dropna(inplace=True)

    # Boolean target, aligned to the rows that survived dropna().
    label = dc_districts.activity > activity_threshold
    label = label.loc[data.index]

    # Report the class balance. The parenthesised form prints the same thing
    # on Python 2 and is valid syntax on Python 3.
    print(label.value_counts())

    # Previously the result was discarded although the docstring advertises
    # an output; returning it is backward-compatible for callers that
    # ignored the (None) return value.
    return feature_importance.importance(data._get_numeric_data(), label)
示例#2
0
def feature_selection(activity_threshold=3):
    """Train classifier on DonorsChoose set given a label to choose most important features.

    Builds one feature table per DonorsChoose district from the census and
    NCES sources, labels each district by its activity, and passes the
    numeric features and the label to ``feature_importance.importance``.

    INPUT:
        activity_threshold: a district is labelled True when its ``activity``
            exceeds this value (strict comparison); default 3.
    OUTPUT: the return value of ``feature_importance.importance`` -- the
        original docstring calls it the list of most important columns;
        verify against that helper.
    """

    dc_districts = get_donorschoose.districts()

    # Only districts that appear in DonorsChoose are of interest.
    census = get_census.all_states()
    census = census.loc[dc_districts.index].copy()

    columns = [
        "STNAME", "LATCOD", "LONCOD", "TOTALREV", "TFEDREV", "TSTREV",
        "TLOCREV", "TOTALEXP", "TCURSSVC", "TCAPOUT", "Z32", "Z34", "Z35",
        "HR1", "HE1", "HE2"
    ]
    nces = get_nces.districts(columns=columns, nonneg=True)

    # Column-wise join on the district index, then discard incomplete rows.
    data = pd.concat([census, nces.loc[census.index]], axis=1)
    data.dropna(inplace=True)

    # Label is computed on all districts and then aligned to the kept rows.
    label = (dc_districts.activity > activity_threshold).loc[data.index]

    # Class balance; print(...) with one argument behaves identically on
    # Python 2 and also parses on Python 3.
    print(label.value_counts())

    # Hand the result back instead of dropping it, as the docstring promises
    # an output. Callers that ignored the old None return are unaffected.
    return feature_importance.importance(data._get_numeric_data(), label)
示例#3
0
def _closest_active_tooltip(sim, dc_districts, active_districts, leaid):
    """Return tooltip lines for the active district most similar to *leaid*.

    An empty list is returned when none of the districts reported by
    ``sim.most_similar(leaid)`` is active.
    """
    most_sim = sim.most_similar(leaid)

    # Keep active districts only and never the district itself. The original
    # code called most_sim.drop(leaid) but threw the result away, so the
    # exclusion is now part of the filter. A list (not filter()) is built so
    # the result is the same on Python 2 and 3.
    candidates = [other for other in most_sim.index
                  if other != leaid and other in active_districts]
    if not candidates:
        return []

    # Assumes most_similar() returns rows ordered most-similar first -- TODO
    # confirm against the similarity class.
    closest = candidates[0]

    projects = dc_districts.loc[closest, "projects"]
    donation_per_project = dc_districts.loc[closest, "total_donations"] / projects

    return [
        "Most similar to {}, {}".format(most_sim.loc[closest, "District Name"],
                                        most_sim.loc[closest, "State"]),
        "students: {}".format(sim.data.loc[closest, "Total Students"].astype(int)),
        "projects: {}".format(projects.astype(int)),
        "received donations/project: ${:.2f}".format(donation_per_project),
    ]


def potential_districts(sim, n_potential=15, activity_threshold=3):
    """Find potentially active districts outside of DonorsChoose network.

    Districts in ``sim.data`` that are not active in DonorsChoose are scored
    with ``sim.rms_score`` against the active ones. The best ``n_potential``
    per state are kept and each gets an HTML tooltip.

    INPUT:
        sim: similarity object providing ``data`` (DataFrame indexed by
            district id), ``rms_score`` and ``most_similar``.
        n_potential: maximum number of recommendations per state.
        activity_threshold: DonorsChoose districts with ``activity`` strictly
            above this value count as active.
    OUTPUT: pandas dataframe of recommended districts (columns "District
            Name", "STNAME", "State", "LATCOD", "LONCOD", "score", "info").
            The frame is also passed to ``to_geojson``; the original
            docstring lists districts.topo.json and district.json as outputs,
            presumably written by that helper.
    """

    dc_districts = get_donorschoose.districts()

    # Builtin int replaces the np.int alias, which newer NumPy removed.
    active_mask = dc_districts.activity > activity_threshold
    active_districts = set(dc_districts[active_mask].index.values.astype(int))
    all_districts = set(sim.data.index.values.astype(int))

    # Same set as all - (active & all), written directly.
    potential = all_districts - active_districts

    # NOTE(review): the scores are paired with `potential` by iteration
    # order below, so rms_score is assumed to return one score per element
    # in that same order -- confirm.
    rms = sim.rms_score(potential, active_districts, normalize=True)

    # potential districts most similar to active districts in descending order
    # (tuple-unpacking lambdas are a syntax error on Python 3, hence pair[1]).
    ranked = sorted(zip(potential, rms), key=lambda pair: pair[1], reverse=True)
    # Passing the column names up front also works when `ranked` is empty.
    potential_df = pd.DataFrame(ranked, columns=["leaid", "score"]).set_index("leaid")
    potential_df["State"] = sim.data["State"].loc[potential_df.index]

    # pick at most n_potential recommendations for each state
    recommend = []
    for state in sim.data.State.value_counts().index:
        in_state = potential_df[potential_df.State == state]
        recommend.extend(in_state.head(n_potential).index.values)

    # Explicit copy: new columns are added to this frame below.
    rec_df = sim.data[["District Name", "STNAME", "State", "LATCOD", "LONCOD"]].loc[recommend].copy()
    rec_df["score"] = potential_df.score.loc[recommend]

    n_before = len(rec_df)
    rec_df.dropna(inplace=True)
    print("NaNs: drop {} districts".format(n_before - len(rec_df)))

    # build tooltip text
    district_info = []
    for leaid in rec_df.index:
        tooltip = [
            "{}".format(rec_df.loc[leaid, "District Name"]),
            "students: {}".format(sim.data.loc[leaid, "Total Students"].astype(int)),
            "",
        ]
        # Comparison lines are omitted (instead of raising IndexError) when
        # no active district is among the most similar ones.
        tooltip.extend(_closest_active_tooltip(sim, dc_districts, active_districts, leaid))
        district_info.append("<br/>".join(tooltip))

    rec_df["info"] = pd.Series(district_info, index=rec_df.index)

    to_geojson(rec_df)

    return rec_df
示例#4
0
def potential_districts(sim, n_potential=15, activity_threshold=3):
    """Find potentially active districts outside of DonorsChoose network.

    Every district in ``sim.data`` that is not an active DonorsChoose
    district is scored via ``sim.rms_score`` against the active districts.
    Up to ``n_potential`` top-scoring districts per state are recommended,
    and an HTML tooltip is attached to each of them.

    INPUT:
        sim: similarity object exposing ``data`` (DataFrame indexed by
            district id), ``rms_score`` and ``most_similar``.
        n_potential: upper bound on recommendations per state.
        activity_threshold: a DonorsChoose district is active when its
            ``activity`` is strictly greater than this value.
    OUTPUT: pandas dataframe of recommended districts with the added columns
            "score" and "info". The frame is handed to ``to_geojson`` before
            being returned; the original docstring names districts.topo.json
            and district.json as outputs, presumably produced there.
    """

    dc_districts = get_donorschoose.districts()

    # np.int was only an alias of the builtin int and is gone from newer
    # NumPy releases; int gives the same dtype.
    active_districts = set(
        dc_districts[dc_districts.activity > activity_threshold]
        .index.values.astype(int))
    all_districts = set(sim.data.index.values.astype(int))

    # Equivalent to all - (active & all).
    potential = all_districts - active_districts

    # NOTE(review): scores are matched to districts by the iteration order of
    # `potential`; this presumes rms_score yields them in that order -- verify.
    rms = sim.rms_score(potential, active_districts, normalize=True)

    # potential districts most similar to active districts in descending order
    # (no tuple-parameter lambda: that form does not parse on Python 3).
    potential_df = pd.DataFrame(
        sorted(zip(potential, rms), key=lambda pair: pair[1], reverse=True),
        columns=["leaid", "score"])  # explicit names also cover the empty case
    potential_df = potential_df.set_index("leaid")
    potential_df["State"] = sim.data["State"].loc[potential_df.index]

    # pick at most n_potential recommendations for each state
    recommend = []
    for state in sim.data.State.value_counts().index:
        state_rows = potential_df[potential_df.State == state]
        recommend.extend(state_rows.head(n_potential).index.values)

    # Copy explicitly because columns are added to the selection below.
    rec_df = sim.data[["District Name", "STNAME", "State", "LATCOD",
                       "LONCOD"]].loc[recommend].copy()
    rec_df["score"] = potential_df.score.loc[recommend]

    N_rec = len(rec_df)
    rec_df.dropna(inplace=True)
    print("NaNs: drop {} districts".format(N_rec - len(rec_df)))

    # build tooltip text
    district_info = []
    for leaid in rec_df.index:
        tooltip = []
        tooltip.append("{}".format(rec_df.loc[leaid, "District Name"]))
        tooltip.append("students: {}".format(
            sim.data.loc[leaid, "Total Students"].astype(int)))
        tooltip.append("")

        most_sim = sim.most_similar(leaid)
        # Active districts only, excluding the district itself. The earlier
        # most_sim.drop(leaid) discarded its result and so did nothing; the
        # exclusion now lives in the filter. A list is used because filter()
        # is lazy on Python 3 and cannot be used as an indexer.
        active_matches = [other for other in most_sim.index
                          if other != leaid and other in active_districts]

        # Without any active match the comparison section is left out rather
        # than failing with IndexError on index[0].
        if active_matches:
            # Assumes most_similar() lists the most similar district first
            # -- TODO confirm.
            closest = active_matches[0]

            tooltip.append("Most similar to {}, {}".format(
                most_sim.loc[closest, "District Name"],
                most_sim.loc[closest, "State"]))
            tooltip.append("students: {}".format(
                sim.data.loc[closest, "Total Students"].astype(int)))
            tooltip.append("projects: {}".format(
                dc_districts.loc[closest, "projects"].astype(int)))
            donation_per_project = dc_districts.loc[
                closest, "total_donations"] / dc_districts.loc[closest, "projects"]
            tooltip.append(
                "received donations/project: ${:.2f}".format(donation_per_project))

        district_info.append("<br/>".join(tooltip))

    # One tooltip per remaining row, aligned on the district index.
    rec_df["info"] = pd.Series(district_info, index=rec_df.index)

    to_geojson(rec_df)

    return rec_df