def parse_PageViews(metric,next_metric,key_index=8,startindex=8,endindex=-1,n_lines='all',addTotal=False):
	'''Extract one metric section from the page-views CSV and return it as a dict.

	The file at `old_to_new.page_views` is read line by line. Any line that
	contains `metric` switches collection on (and is itself parsed); a line
	that contains `next_metric` but not `metric` switches it off. While
	collecting, each line is split on commas and lines with more than two
	fields are stored under the field found at `key_index`. Only the first
	row seen for a given key is kept.

	Arguments:
		metric      -- substring marking the start of the wanted section
		next_metric -- substring marking the start of the following section
		key_index   -- column that holds the row key
		startindex  -- first column copied into the result
		endindex    -- slice end for ordinary rows; the 'project' row ignores
		               it and keeps every field from `startindex` onwards
		n_lines     -- ignored. It used to cap the number of collected rows
		               but caused problems on a monthly basis, so every
		               matching row is always collected (performance is not
		               a concern here). Kept for backwards compatibility.
		addTotal    -- not used by this function

	Returns an OrderedDict mapping row key -> list of string fields. The
	'project' entry is replaced by `fix_dates(..., format='MM/YYYY')`.

	Raises KeyError if no row keyed 'project' was collected.
	'''

	data = OrderedDict()
	collect = False

	with open(old_to_new.page_views,'r') as f:

		for lineno,line in enumerate(f, 1):

			if metric in line:
				# single parenthesised argument: same output on Python 2 and 3
				print('%s found on line %s'%(metric,lineno))
				collect = True
			elif next_metric in line:
				collect = False

			if not collect:
				continue

			# Exactly one trailing character is removed.
			# NOTE(review): the original comment says linebreaks are '\r\n';
			# if lines arrive with that terminator the '\r' stays attached to
			# the last field -- confirm this is what fix_dates expects.
			vals = line[:-1].split(',')

			# filter the empty lines
			if len(vals) <= 2:
				continue

			# NOTE(review): a row with more than two fields but fewer than
			# key_index + 1 raises IndexError here.
			k = vals[key_index]

			# first occurrence of a key wins
			if k in data:
				continue

			if k == 'project':
				# header row: keep everything, including the last column
				data[k] = vals[startindex:]
			elif k != '':
				data[k] = vals[startindex:endindex]

	# reformat the time labels
	data['project'] = fix_dates(data['project'],format='MM/YYYY')

	return data
예제 #2
0
def get_data():
    """
    Load every dataset table and run the date clean-up on the tables that need it.

    Each name listed in ``datasets.names`` is looked up as an attribute of the
    ``datasets`` module. Tables returned by ``get_date_columns()`` are then
    replaced by the result of ``utils.fix_dates(table, column)``.

    Returns:
        A dict with table name as key and the (possibly date-fixed) dataset
        as value.
    """
    import datasets
    import utils

    data = {}
    for table in datasets.names:
        data[table] = getattr(datasets, table)

    # Overwrite the entries that carry a date column with their fixed version.
    for table, col in get_date_columns().items():
        raw_table = getattr(datasets, table)
        data[table] = utils.fix_dates(raw_table, col)

    dicts = utils.get_dicts()
    # TODO: use dicts to transform the data from categories to codes.

    return data
def parse_StatisticsMonthly(metric,next_metric,startindex=2,endindex=-2,n_lines='all'):
	'''Extract one metric section from the monthly-statistics CSV and return it as a dict.

	The file at `old_to_new.monthly_stats` is read line by line. A line that
	contains `metric` switches collection on; a line that contains
	`next_metric` (and not `metric`) switches it off. Neither marker line is
	parsed. While collecting, each remaining line is split on commas and
	lines with more than two fields are stored under their second field,
	provided that field is not empty. A later row with the same key
	overwrites the earlier one.

	Arguments:
		metric      -- substring marking the start of the wanted section
		next_metric -- substring marking the start of the following section
		startindex  -- first column copied into the result
		endindex    -- slice end applied to every stored row
		n_lines     -- ignored. It used to cap the number of collected rows
		               but caused problems on a monthly basis, so every
		               matching row is always collected (performance is not
		               a concern here). Kept for backwards compatibility.

	Returns an OrderedDict mapping row key -> list of string fields. The
	'project' entry is replaced by `fix_dates(..., format='MM/YYYY')`.

	Raises KeyError if no row keyed 'project' was collected.
	'''

	data = OrderedDict()
	collect = False

	with open(old_to_new.monthly_stats,'r') as f:

		for lineno,line in enumerate(f, 1):

			if metric in line:
				# single parenthesised argument: same output on Python 2 and 3
				print('%s found on line %s'%(metric,lineno))
				collect = True
				continue

			if next_metric in line:
				collect = False
				continue

			if not collect:
				continue

			# linebreaks are '\r\n', so two trailing characters are dropped.
			# NOTE(review): this removes two characters unconditionally; a
			# line read with a different terminator (or a final line without
			# one) loses data here -- confirm the input is always CRLF.
			vals = line[:-2].split(',')

			# filter the empty lines
			if len(vals) <= 2:
				continue

			k = vals[1]
			if k != '':
				data[k] = vals[startindex:endindex]

	# reformat the time labels
	data['project'] = fix_dates(data['project'],format='MM/YYYY')

	return data
예제 #4
0
def update_user_input_places(user_input, dfs, config):
    """
    Select the data for the place chosen in ``user_input`` and record its level.

    The level is derived from the "Todos" sentinel value:
      * city and health region are both "Todos" -> state level
      * only the city is "Todos"                -> health-region level
      * otherwise                               -> city level

    Args:
        user_input: mapping read for "city_name", "health_region_name" and
            "state_name". "place_type" and "locality" are written to it, and
            it is passed through ``choose_rt`` and ``update_user_input_ids``,
            whose return values replace it.
        dfs: mapping with "state", "health_region" and "city" tables
            (presumably pandas DataFrames -- confirm against the caller).
        config: not used in this function.

    Returns:
        Tuple ``(user_input, utils.fix_dates(data))`` where ``data`` is the
        subset of the level's table selected by name.
    """
    city_is_all = user_input["city_name"] == "Todos"

    if not city_is_all:
        # City level
        cities = dfs["city"]
        in_state = cities["state_name"] == user_input["state_name"]
        same_city = cities["city_name"] == user_input["city_name"]
        data = cities[in_state & same_city]
        user_input["place_type"] = "city_id"
        # Pick the Rt used by SimulaCovid
        user_input = choose_rt(user_input, dfs, level=1)

    elif user_input["health_region_name"] == "Todos":
        # State level
        states = dfs["state"]
        same_state = states["state_name"] == user_input["state_name"]
        data = states[same_state]
        user_input["place_type"] = "state_num_id"
        # Pick the Rt used by SimulaCovid
        user_input = choose_rt(user_input, dfs, level=3)

    else:
        # Health-region level
        regions = dfs["health_region"]
        same_region = (
            regions["health_region_name"] == user_input["health_region_name"]
        )
        data = regions[same_region]
        user_input["place_type"] = "health_region_id"
        # Pick the Rt used by SimulaCovid
        user_input = choose_rt(user_input, dfs, level=2)

    # Choose the locality name used in titles
    locality = utils.choose_place(
        city=user_input["city_name"],
        region=user_input["health_region_name"],
        state=user_input["state_name"],
    )
    user_input["locality"] = locality

    # Update the ids
    user_input = update_user_input_ids(data, user_input)

    fixed_data = utils.fix_dates(data)
    return user_input, fixed_data