def parse_PageViews(metric,next_metric,key_index=8,startindex=8,endindex=-1,n_lines='all',addTotal=False):
    '''Parse the excel_in_PageViewsPerMonthPopularWikisNormalized_* file and
    extract the section that belongs to `metric`.

    The file is a CSV dump in which each metric section begins with a line
    containing the metric name; collection runs from that header line until a
    line containing `next_metric` is encountered.

    Args:
        metric: Name of the metric whose section should be extracted.
        next_metric: Name of the metric that follows `metric` in the file,
            used as the end-of-section marker.
        key_index: Column index whose value becomes the row key.
        startindex: First column of the value slice kept for each row.
        endindex: End of the value slice (Python slice semantics).
        n_lines: Effectively ignored -- forcibly reset to 'all' below
            (see the comment at the assignment).
        addTotal: Currently unused by this function.

    Returns:
        OrderedDict mapping row key -> list of column-value strings, with the
        'project' entry holding the reformatted time labels.
    '''
    collect = False
    data = OrderedDict()
    # the n_lines parameter causes problems on a monthly basis.
    # Dan Andreescu sees no reason it should not always be 'all'
    # considering that performance is not an issue with this project
    n_lines = 'all'
    n_collected = 0
    with open(old_to_new.page_views,'r') as f:
        for i,line in enumerate(f):
            if metric in line:
                print '%s found on line %s'%(metric,i+1)
                collect = True
            elif next_metric in line:
                collect = False
            # NOTE: plain `if` (not `elif`), so the header line that matched
            # `metric` is itself fed through the collection logic below.
            if collect:
                # linebreaks are '\r\n' -- drop the trailing '\n' only
                vals = line[:-1].split(',')
                # filter the empty lines
                if len(vals) > 2:
                    # print vals[startindex:endindex]
                    k = vals[key_index]
                    # first occurrence of a key wins; duplicates are skipped
                    if k not in data:
                        if k == 'project':
                            # header row: keep every column (the time labels)
                            data[k] = vals[startindex:]
                        elif k != '':
                            data[k] = vals[startindex:endindex]
                        n_collected +=1
                        # dead check: n_lines is forced to 'all' above, so an
                        # int never equals it and collection never stops early
                        if n_collected == n_lines:
                            collect = False
    # for d,dd in data.items():
    #     print d,dd
    # reformat the time labels
    # (assumes a 'project' row was found in the file -- KeyError otherwise)
    data['project'] = fix_dates(data['project'],format='MM/YYYY')
    return data
def get_data():
    """Load every dataset into memory and run the pre-processing steps.

    Returns:
        A dict mapping each table name to its cleaned dataset.
    """
    import datasets, utils

    # Pull every named dataset off the datasets module.
    cleaned = {}
    for name in datasets.names:
        cleaned[name] = getattr(datasets, name)

    # Normalize the date column of every table that declares one.
    for name, date_col in get_date_columns().items():
        cleaned[name] = utils.fix_dates(getattr(datasets, name), date_col)

    category_maps = utils.get_dicts()
    # TODO: use category_maps to transform the data from categories to codes.
    return cleaned
def parse_StatisticsMonthly(metric,next_metric,startindex=2,endindex=-2,n_lines='all'):
    '''Parse the excel_in_StatisticsMonthly_* file and extract the section
    that belongs to `metric`.

    The file is a CSV dump in which each metric section begins with a line
    containing the metric name; data lines are collected from the line AFTER
    that header until a line containing `next_metric` is encountered.

    Args:
        metric: Name of the metric whose section should be extracted.
        next_metric: Name of the metric that follows `metric` in the file,
            used as the end-of-section marker.
        startindex: First column of the value slice kept for each row.
        endindex: End of the value slice (Python slice semantics).
        n_lines: Effectively ignored -- forcibly reset to 'all' below
            (see the comment at the assignment).

    Returns:
        OrderedDict mapping the column-1 key of each row -> list of
        column-value strings, with the 'project' entry holding the
        reformatted time labels.
    '''
    collect = False
    data = OrderedDict()
    # the n_lines parameter causes problems on a monthly basis.
    # Dan Andreescu sees no reason it should not always be 'all'
    # considering that performance is not an issue with this project
    n_lines = 'all'
    n_collected = 0
    with open(old_to_new.monthly_stats,'r') as f:
        for i,line in enumerate(f):
            if metric in line:
                print '%s found on line %s'%(metric,i+1)
                collect = True
            elif next_metric in line:
                collect = False
            # NOTE: `elif` here (unlike parse_PageViews), so the matching
            # header line itself is NOT fed through the collection logic.
            elif collect:
                # linebreaks are '\r\n' -- strip both trailing characters
                vals = line[:-2].split(',')
                # filter the empty lines
                if len(vals) > 2:
                    k = vals[1]
                    if k != '':
                        # later rows with the same key overwrite earlier ones
                        data[k] = vals[startindex:endindex]
                        # print len(data[vals[1]])
                        n_collected +=1
                        # dead check: n_lines is forced to 'all' above, so an
                        # int never equals it and collection never stops early
                        if n_collected == n_lines:
                            collect = False
    # reformat the time labels
    # (assumes a 'project' row was found in the file -- KeyError otherwise)
    data['project'] = fix_dates(data['project'],format='MM/YYYY')
    return data
def update_user_input_places(user_input, dfs, config):
    """Resolve the place the user selected (state, health region or city).

    Picks the matching rows out of `dfs`, records the place type on
    `user_input`, chooses the Rt estimate for SimulaCovid at the matching
    aggregation level, fills in the display name and refreshes the place ids.
    `config` is accepted for interface compatibility but not used here.

    Returns:
        Tuple of (updated user_input, selected rows with dates fixed).
    """
    selected_city = user_input["city_name"]
    selected_region = user_input["health_region_name"]
    selected_state = user_input["state_name"]

    if selected_city == "Todos" and selected_region == "Todos":
        # State level
        states = dfs["state"]
        data = states[states["state_name"] == selected_state]
        user_input["place_type"] = "state_num_id"
        # Choose the Rt for SimulaCovid
        user_input = choose_rt(user_input, dfs, level=3)
    elif selected_city == "Todos":
        # Health-region level
        regions = dfs["health_region"]
        data = regions[regions["health_region_name"] == selected_region]
        user_input["place_type"] = "health_region_id"
        # Choose the Rt for SimulaCovid
        user_input = choose_rt(user_input, dfs, level=2)
    else:
        # City level
        cities = dfs["city"]
        data = cities[
            (cities["state_name"] == selected_state)
            & (cities["city_name"] == selected_city)
        ]
        user_input["place_type"] = "city_id"
        # Choose the Rt for SimulaCovid
        user_input = choose_rt(user_input, dfs, level=1)

    # Human-readable locality name for the page titles
    # (re-read from user_input in case choose_rt updated it)
    user_input["locality"] = utils.choose_place(
        city=user_input["city_name"],
        region=user_input["health_region_name"],
        state=user_input["state_name"],
    )

    # Refresh the place ids from the selected rows
    user_input = update_user_input_ids(data, user_input)
    return user_input, utils.fix_dates(data)