示例#1
0
from ast import literal_eval
from anon_id_utils import get_anon_id

# Switch between the two sources of users:
#   falsy  -> read the user list from a local CSV file (the default)
#   truthy -> query every user from the zoohome_300315.users table
get_all = 0

# Single-argument parenthesised print behaves the same under Python 2 and 3.
print("Getting users...")
if not get_all:
    # Default path: load the pre-built list of user ids from disk.
    user_file = '/Volumes/Brooke_SD/VOLCROWE/survey/survey_answered_ids_CONFIDENTIAL.csv'
    the_users = pd.read_csv(user_file)
    # drop=False keeps zooniverse_id as an ordinary column as well as the index.
    the_users.set_index('zooniverse_id', drop=False, inplace=True)

else:
    # Full pull of ALL users in the Zooniverse; expect this to take a long time.
    # the_cnx is defined outside this snippet -- presumably an open database
    # connection (TODO confirm).
    getusers = "select login as user_name, id as zooniverse_id from zoohome_300315.users"
    cursor = the_cnx.cursor()
    cursor.execute(getusers)
    the_users = pd.DataFrame(cursor.fetchall())
    # Label the columns with the aliases used in the select statement.
    the_users.columns = cursor.column_names
    # Derive an anonymised id for every numeric Zooniverse id.
    the_users['anon_id'] = [get_anon_id(q) for q in the_users.zooniverse_id]
    cursor.close()

# The script loops over users and, for each project, checks whether a user
# stats file exists for that project.
# Paths and project names come from sessions_allproj_byuser.py.

# Every project name that may be checked; adjacent string literals are joined
# at compile time and then split on whitespace into a list of names.
all_poss_proj = (
    'galaxy_zoo galaxy_zoo_starburst solarstormwatch moon_zoo ancient_lives '
    'whales milky_way oldweather sea_floor cyclone_center bat_detective '
    'cancer_cells andromeda serengeti planet_four notes_from_nature '
    'spacewarp leaf worms plankton radio war_diary m83 wise sunspot condor '
    'asteroid kelp penguin higgs_hunter chicago planet_hunter '
    'illustratedlife chimp orchid wisconsin crater'
).split()

# just make sure you're starting clean here (since this appends later)
for thisproj in all_poss_proj:
# cursor = the_cnx.cursor()

# Switch between the two sources of users:
#   truthy -> query every user from the zoohome_300315.users table
#   falsy  -> read the user list from a local CSV file (the default)
get_all = 0


# Single-argument parenthesised print behaves the same under Python 2 and 3.
print("Getting users...")
if get_all:
    # This is for ALL users in the Zooniverse and will take a long time.
    # the_cnx is defined outside this snippet -- presumably an open database
    # connection (TODO confirm).

    # No comma is allowed between the last selected column and FROM; the
    # previous text ("zooniverse_id, from") was rejected as a syntax error.
    getusers = "select login as user_name, id as zooniverse_id from zoohome_300315.users"
    cursor = the_cnx.cursor()
    try:
        cursor.execute(getusers)
        # Passing the column names to the constructor also copes with an
        # empty result set, where assigning .columns afterwards would raise
        # a length-mismatch error.
        the_users = pd.DataFrame(cursor.fetchall(),
                                 columns=list(cursor.column_names))
    finally:
        # Release the cursor even if the query or the fetch fails.
        cursor.close()
    # Derive an anonymised id for every numeric Zooniverse id.
    the_users['anon_id'] = [get_anon_id(q) for q in the_users.zooniverse_id]
    # NOTE(review): unlike the CSV branch, this frame is not indexed by
    # zooniverse_id -- confirm whether downstream code relies on that index.

else:
    # Default path: load the pre-built list of user ids from disk.
    user_file = '/Volumes/Brooke_SD/VOLCROWE/survey/survey_answered_ids_CONFIDENTIAL.csv'
    the_users = pd.read_csv(user_file)
    # drop=False keeps zooniverse_id as an ordinary column as well as the index.
    the_users.set_index('zooniverse_id', drop=False, inplace=True)


# Base path string pointing at the user_stats_out directory.
filestr_base = "/Volumes/Brooke_SD/VOLCROWE/user_stats_out"

# Row window to work on:
#   starting_row -- set to something other than 0 when not starting from scratch
#   ending_row   -- set this if you only need to go up to a certain point
starting_row, ending_row = 0, 10000
示例#3
0
    elif '_body' in discussions_all.columns:
        body = '_body'
        
    # if this isn't a column then we have to extract it from the body column
    tags = 'none'
    if 'tags' in discussions_all.columns:
        tags = 'tags'
    elif '_tags' in discussions_all.columns:
        tags = '_tags'

    
    anon_id = 'anon_id'
    if not anon_id in discussions_all.columns:
        print project_name,"Warning: anon_id wasn't defined, defining from ",userid," i.e. numeric Zooniverse user ID"
        discussions_all['anon_id'] = (discussions_all[userid] < 1).astype(str)
        discussions_all.anon_id = [get_anon_id(q) for q in discussions_all[userid]]

        
    
        
    createdat = 'created_at'
            
    # not sure what happens if this isn't a list, as it isn't with the ouroboros talk projects
    if body != 'none' and (project_name == 'sea_floor' or project_name == 'planet_hunters'):
        discussions_all['body_arr'] = discussions_all[body].apply(literal_eval)
    elif body != 'none':
        discussions_all['body_arr'] = discussions_all[body].astype(str)
        discussions_all['body_arr'] = discussions_all['body_arr'].apply(lambda x:x.split(' '))
    else:
        discussions_all['body_arr'] = ['' for q in discussions_all.anon_id]