示例#1
0
def tallal_usnews_details_merge():
    """Join the manually revised ("tallal") names with US News data and details.

    Reads ``../../data/edit/tallal_usnews.csv``, left-joins it to the frame
    returned by ``usnews_build()`` (``usnews_guess`` against ``name``) and
    inner-joins the result to the frame returned by ``clean_state_city()`` on
    ``College Name or Type``.  Rows with an empty college name are dropped,
    one row is kept per ``Unnamed: 0_y`` value, a ``groups`` label is derived,
    and the result is written to
    ``../../data/edit/df_usnews_details_tallal.csv``.

    Returns:
        None.
    """
    name_col = 'College Name or Type'

    details = clean_state_city()
    usnews_ranks = usnews_build()
    tallal = pd.read_csv('../../data/edit/tallal_usnews.csv').fillna('')

    with_ranks = tallal.merge(usnews_ranks,
                              left_on='usnews_guess',
                              right_on='name',
                              how='left')
    with_ranks = with_ranks.reset_index(drop=True)

    merged = with_ranks.merge(details, on=name_col, how='inner')
    merged = merged.reset_index(drop=True)

    has_name = merged[name_col] != ''
    merged = merged[has_name]
    # NOTE(review): 'Unnamed: 0_y' is presumably a saved row id from one of
    # the right-hand frames -- confirm which merge produces the suffix.
    merged = merged.groupby(['Unnamed: 0_y']).first().reset_index()

    # When 'name_rev' already holds one of the group labels, copy it over.
    known_labels = ['positive', 'neutral', 'bad', 'foreign', 'bad_school']
    merged['groups'] = ''
    is_label = merged['name_rev'].isin(known_labels)
    merged.loc[is_label, 'groups'] = merged.loc[is_label, 'name_rev']

    # Rows that got no label and have no US News guess are 'bad_school'.
    unlabelled = merged['groups'] == ''
    no_guess = merged['usnews_guess'] == ''
    merged.loc[unlabelled & no_guess, 'groups'] = 'bad_school'

    # One pre-formatted string prints the same text as the original
    # comma-separated print statement.
    print('tallal {} {}'.format(len(merged), merged[name_col].nunique()))

    merged = merged.rename(columns={'State_y': 'State', 'City_y': 'City'})
    merged.to_csv('../../data/edit/df_usnews_details_tallal.csv')
    return
def union_college_name_type():
    """Pair the "union" college names/types with state/city information.

    Reads ``../../data/entry/College Name or Type_unique_marked2.csv`` and
    keeps the rows where at least one union flag column equals 1.  Those rows
    are left-joined to the frame returned by ``clean_state_city()`` on
    ``College Name or Type``; the name, ``State``, ``City`` and flag columns
    are written to ``../../data/edit/df_union_school_place.csv``.

    Returns:
        The DataFrame that was written to disk.
    """
    # big10 also contains big12. However, as big12 are southern schools and
    # big10 are midwest schools, they don't overlap with each other, so they
    # can safely share one group.
    #
    # One list drives both the row filter and the output columns so the two
    # cannot drift apart.  'new_ivy' was tested by the filter but missing
    # from the output; it is appended last so the positions of the columns
    # that were already written do not move.
    union_flags = [
        'big10', 'sisters', 'pac', 'hbcu', 'public_ivy', 'west_ivy',
        'little_ivy', 'sec', 'flagship', 'new_ivy'
    ]

    df = pd.read_csv(
        '../../data/entry/College Name or Type_unique_marked2.csv')

    # A row belongs to the union set when any flag column equals 1.
    in_any_union = (df[union_flags] == 1.0).any(axis=1)
    df_union = df[in_any_union]
    print(df_union.head())
    print(len(df_union['College Name or Type']))

    df_details = clean_state_city()
    print(df_details.head())

    df_union_school_place = df_union.merge(df_details,
                                           on='College Name or Type',
                                           how='left')
    df_union_school_place = df_union_school_place[
        ['College Name or Type', 'State', 'City'] + union_flags]
    print(df_union_school_place.head())
    print(len(df_union_school_place['College Name or Type']))

    df_union_school_place.to_csv('../../data/edit/df_union_school_place.csv')
    return df_union_school_place
def topN_college_name_type():
    """Pair the "top N" college names/types with state/city information.

    Reads ``../../data/entry/College Name or Type_unique_marked2.csv`` and
    keeps rows whose ``name_rev1_flag`` is null and whose ``guess`` equals 1.
    Those rows are left-joined to the frame returned by
    ``clean_state_city()`` on ``College Name or Type``.  The full join is
    written to ``../../data/edit/df_topN_details.csv`` and the
    name/state/city projection to
    ``../../data/edit/df_topN_school_place.csv``.

    Returns:
        The name/state/city DataFrame.
    """
    name_col = 'College Name or Type'

    marked = pd.read_csv(
        '../../data/entry/College Name or Type_unique_marked2.csv')

    flag_missing = marked['name_rev1_flag'].isnull()
    is_guess = marked['guess'] == 1.0
    top_n = marked[flag_missing & is_guess]
    print(top_n.head())
    print(len(top_n[name_col]))

    details = clean_state_city()
    print(details.head())

    # Keep the full join for the details file; the place file is a
    # three-column projection of it.
    top_n_details = top_n.merge(details, on=name_col, how='left')
    top_n_place = top_n_details[['College Name or Type', 'State', 'City']]
    print(top_n_place.head())
    print(len(top_n_place[name_col]))

    top_n_details.to_csv('../../data/edit/df_topN_details.csv')
    top_n_place.to_csv('../../data/edit/df_topN_school_place.csv')
    return top_n_place
def tallal_college_name_type():
    """Pair the names flagged for manual revision with state/city information.

    Reads ``../../data/entry/College Name or Type_unique_marked2.csv`` and
    keeps rows whose ``name_rev1_flag`` equals 1.  Per the original notes,
    these are unmatched names that carry some school-name information; they
    are handed to Tallal to add a revised name by hand and are then
    re-matched against the candidate names.  The rows are left-joined to the
    frame returned by ``clean_state_city()`` and the name/state/city columns
    are written to ``../../data/edit/df_tallal_school_place.csv``.

    Returns:
        The name/state/city DataFrame.
    """
    name_col = 'College Name or Type'

    marked = pd.read_csv(
        '../../data/entry/College Name or Type_unique_marked2.csv')
    needs_revision = marked['name_rev1_flag'] == 1.0
    flagged = marked[needs_revision]
    print(flagged.head())
    print(len(flagged[name_col]))

    details = clean_state_city()
    print(details.head())

    joined = flagged.merge(details, on=name_col, how='left')
    school_place = joined[['College Name or Type', 'State', 'City']]
    print(school_place.head())
    print(len(school_place[name_col]))

    school_place.to_csv('../../data/edit/df_tallal_school_place.csv')
    return school_place
示例#5
0
def vague_details_merge():
    """Join the "vague" college names with the cleaned details frame.

    Reads ``../../data/edit/df_vague.csv``, drops rows with an empty
    ``College Name or Type`` and rows whose ``groups`` is ``'tallal'``, then
    inner-joins the remainder to the frame returned by
    ``clean_state_city()``.  Row and distinct-name counts are printed before
    and after the join, and the result is written to
    ``../../data/edit/df_usnews_details_vague.csv``.

    Returns:
        None.
    """
    name_col = 'College Name or Type'

    details = clean_state_city()
    vague = pd.read_csv('../../data/edit/df_vague.csv').fillna('')

    # Original note: added entry/Copy of college_name_05_07_tallal.csv
    has_name = vague[name_col] != ''
    not_tallal = vague['groups'] != 'tallal'
    vague = vague[has_name & not_tallal]
    print('vague_pre {} {}'.format(len(vague), vague[name_col].nunique()))

    vague_details = vague.merge(details, on=name_col, how='inner')
    vague_details = vague_details.reset_index()
    print('vague {} {}'.format(len(vague_details),
                               vague_details[name_col].nunique()))

    vague_details.to_csv('../../data/edit/df_usnews_details_vague.csv')
    return
示例#6
0
def exact_usnews_details_merge():
    """Join the matched candidate names with the cleaned details frame.

    Reads ``../../data/edit/df_match_cand_all.csv``, inner-joins it to the
    frame returned by ``clean_state_city()`` on ``College Name or Type`` and
    keeps one row per ``User Name``.  Diagnostic counts are printed,
    ``State_y``/``City_y`` are renamed to ``State``/``City`` and the result
    is written to ``../../data/edit/df_usnews_details_exact.csv``.

    Returns:
        None.
    """
    name_col = 'College Name or Type'
    user_col = 'User Name'

    details = clean_state_city()
    candidates = pd.read_csv('../../data/edit/df_match_cand_all.csv')
    candidates = candidates.fillna('')

    joined = candidates.merge(details, on=name_col, how='inner')
    joined = joined.reset_index()
    # groupby(...).first() keeps one row per user name.
    per_user = joined.groupby(['User Name']).first().reset_index()

    print('exact {} {}'.format(len(per_user), per_user[name_col].nunique()))
    with_user = per_user[per_user[user_col] != '']
    print('exact_user {} {}'.format(per_user[user_col].nunique(),
                                    len(with_user)))

    per_user = per_user.rename(columns={'State_y': 'State', 'City_y': 'City'})
    per_user.to_csv('../../data/edit/df_usnews_details_exact.csv')
    return
示例#7
0
def school_usnews_details_merge():
    """Join the topN/union US News matches with the cleaned details frame.

    For each of ``topN_univ``, ``topN_lac`` and ``union``:

    * read ``../../data/edit/df_<item>_merged_usnews.csv`` and drop rows with
      an empty ``College Name or Type``;
    * keep one row per (name, state, city) and write it to
      ``../../data/edit/df_<item>_squeeze.csv``;
    * inner-join that to the frame returned by ``clean_state_city()`` on the
      same three columns and write
      ``../../data/edit/df_usnews_details_<item>.csv``.

    Row and distinct-name counts are printed along the way.

    Returns:
        None.
    """
    name_col = 'College Name or Type'
    key_cols = ['College Name or Type', 'State', 'City']

    # Quick detour: these four files are only loaded to report their sizes.
    detour_paths = [
        '../../data/entry/df_merge_univ_marked.csv',
        '../../data/entry/df_merge_lac_marked.csv',
        '../../data/edit/df_topN_details.csv',
        '../../data/edit/df_topN_school_place.csv',
    ]
    detour_sizes = [len(pd.read_csv(path)) for path in detour_paths]
    print('brutal-force {} {} {} {}'.format(*detour_sizes))

    # Formal start.
    details = clean_state_city()
    print('df_details {} {}'.format(len(details),
                                    details[name_col].nunique()))

    for item in ['topN_univ', 'topN_lac', 'union']:
        merged = pd.read_csv(
            '../../data/edit/df_{}_merged_usnews.csv'.format(item))
        merged = merged.fillna('')
        merged = merged[merged[name_col] != '']

        # One row per (name, state, city).
        squeezed = merged.groupby(key_cols).first().reset_index()
        squeezed.to_csv('../../data/edit/df_{}_squeeze.csv'.format(item))
        print('{} {} {}'.format(item + '_squeeze', len(squeezed),
                                squeezed[name_col].nunique()))

        with_details = squeezed.merge(details, on=key_cols, how='inner')
        with_details = with_details.reset_index()
        print('{} {} {}'.format(item, len(with_details),
                                with_details[name_col].nunique()))
        with_details.to_csv(
            '../../data/edit/df_usnews_details_{}.csv'.format(item))
    return
示例#8
0
def college_name_conclude():
    """Stack the per-category detail files and attach them to every user.

    Steps:

    1. Read ``../../data/edit/df_usnews_details_<cat>.csv`` for each category
       (exact, topN_univ, topN_lac, union, tallal, vague), bring each to a
       common column set and stack them.  The US News columns are blanked
       for ``vague``; ``groups`` is blanked for every category except
       ``vague`` and ``tallal``.
    2. Keep one row per ``User Name`` and write
       ``../../data/edit/df_details_college_name_cleaned.csv``.
    3. Left-join that onto the ``User Name`` column of the frame returned by
       ``clean_state_city()``.  Rows with an empty user name are written to
       ``../../data/edit/df_noname.csv``; the rest are written to
       ``../../data/edit/df_details_race_college_cleaned.csv``.

    Diagnostic counts are printed throughout.

    Key college variables in the output: name, rank, group, groups
    (groups is one of 'positive', 'neutral', 'bad', 'bad_school', 'foreign').

    Returns:
        None.
    """
    name_col = 'College Name or Type'
    user_col = 'User Name'

    def report(label, frame):
        # Distinct users/names, then total rows and rows with a user name.
        print('{} / {} {}'.format(label, frame[user_col].nunique(),
                                  frame[name_col].nunique()))
        print('{} || {} {}'.format(label, len(frame),
                                   len(frame[frame[user_col] != ''])))

    df_details = clean_state_city()
    list_var = [
        'User Name', 'College Name or Type', 'Major', 'Degree GPA', 'LSAT 2',
        'LSAT 3', 'LSAT 1', 'LSDAS GPA', 'Class Rank', 'LSAT', 'Race',
        'Gender', 'State', 'Race2', 'City', 'Years out of Undergrad',
        'extra curricular', 'additional info'
    ]
    list_usnews = [
        'name', 'State_acronym', 'enrollment', 'group', 'location', 'public',
        'rank', 'tuition'
    ]
    list_cat = ['exact', 'topN_univ', 'topN_lac', 'union', 'tallal', 'vague']
    keep_cols = list_var + list_usnews + ['groups']

    # Collect the frames and concatenate once: DataFrame.append re-copied
    # the accumulated data on every pass and is gone from pandas >= 2.0.
    frames = []
    for item in list_cat:
        df = pd.read_csv(
            '../../data/edit/df_usnews_details_{}.csv'.format(item))
        df = df.fillna('')

        if item == 'vague':
            # The vague file gets blank US News columns.
            for col in list_usnews:
                df[col] = ''
        if item not in ('vague', 'tallal'):
            # Every file except vague and tallal gets a blank 'groups'.
            df['groups'] = ''

        df = df[keep_cols]
        df = df[df[name_col] != '']
        frames.append(df)
        print('{} / {} {}'.format(item, df[user_col].nunique(),
                                  df[name_col].nunique()))

    df_details_clean = pd.concat(frames, ignore_index=True)

    # The first row per user wins; list_cat order therefore sets priority.
    df_details_clean = df_details_clean.groupby(['User Name'
                                                 ]).first().reset_index()
    report('df_details_clean', df_details_clean)
    df_details_clean.to_csv(
        '../../data/edit/df_details_college_name_cleaned.csv')

    df_details = df_details[list_var]
    report('df_details', df_details)

    df_user_list = df_details[['User Name']]
    df_details_college = df_user_list.merge(df_details_clean,
                                            on=['User Name'],
                                            how='left').reset_index()
    report('df_details_college', df_details_college)

    print('{} {}'.format(df_details_college[user_col].nunique(),
                         len(df_details_college)))
    df_noname = df_details_college[df_details_college[user_col] == '']
    df_noname.to_csv('../../data/edit/df_noname.csv')

    df_details_college = df_details_college[
        df_details_college[user_col] != '']
    print('{} {}'.format(len(df_details_college),
                         df_details_college[user_col].nunique()))

    df_details_college.to_csv(
        '../../data/edit/df_details_race_college_cleaned.csv')
    return