Example No. 1
def test_toplevel_load_dataset(dw_instances, profile):
    datadotworld.load_dataset('agentid/datasetid', profile=profile)
    assert_that(
        dw_instances[profile].load_dataset,
        called().times(1).with_args(equal_to('agentid/datasetid'),
                                    force_update=equal_to(False)))
    assert_that(
        dw_instances[profile].load_dataset,
        called().times(1).with_args(equal_to('agentid/datasetid'),
                                    auto_update=equal_to(False)))
Example No. 2
def get_data(dataset_name, dataframe_name):
    """Request data from datadotworld API, and returns pandas dataframe.

    Additional information on the datadotworld api can be found at the
    following site: https://apidocs.data.world/api

    Parameters
    ----------
    dataset_name:     str
                    name of the desired dataset stored with the
                    datadotworld service.

    dataframe_name:   str
                    key identifying the desired dataframe within the
                    dataset's dictionary of dataframes.

    Returns
    -------
    Pandas Dataframe

    Examples
    --------
    >>> get_data(dataset_name='census2020', dataframe_name='Kansas')

    >>> get_data('performance_indicators', 'public_safety')
    """
    dataworld_obj = dw.load_dataset(dataset_name)
    dataframe = dataworld_obj.dataframes[dataframe_name]

    return dataframe
Example No. 3
def import_covid_data(filename, FIPS_name):
    # Load COVID19 county data using datadotworld API
    # Data provided by Johns Hopkins, file provided by Associated Press
    dataset = dw.load_dataset(
        "associatedpress/johns-hopkins-coronavirus-case-tracker",
        auto_update=True)
    # the dataset includes multiple dataframes. We will only use #2
    covid_data = dataset.dataframes["2_cases_and_deaths_by_county_timeseries"]
    # Include only observations for political entities within states
    # (i.e., not territories); drop any NaN FIPS values with covid_data[FIPS_name] > 0
    covid_data = covid_data[covid_data[FIPS_name] < 57000]
    covid_data = covid_data[covid_data[FIPS_name] > 0]

    # Transform FIPS codes into integers (not floats)
    covid_data[FIPS_name] = covid_data[FIPS_name].astype(int)
    covid_data.set_index([FIPS_name, "date"], inplace=True)
    # Prepare a column for state abbreviations. We will draw these from a
    # dictionary created in the next step.
    covid_data["state_abr"] = ""
    for state, abr in state_dict.items():
        covid_data.loc[covid_data["state"] == state, "state_abr"] = abr
    # Create "Location" which concatenates county name and state abbreviation
    covid_data["Location"] = covid_data["location_name"] + ", " + \
        covid_data["state_abr"]

    return covid_data
Example No. 4
def import_covid_data(FIPS_name):
    # Load COVID19 county data using datadotworld API
    # Data provided by Johns Hopkins, file provided
    # by Associated Press
    dataset = dw.load_dataset(
        "associatedpress/johns-hopkins-coronavirus-case-tracker",
        auto_update=True)
    
    # the dataset includes multiple dataframes. We will only use #2
    covid_data = dataset.dataframes[
        "2_cases_and_deaths_by_county_timeseries"]
    covid_data = covid_data[covid_data[FIPS_name] < 57000]
    covid_data = covid_data[covid_data[FIPS_name] > 0]
    # Transform FIPS codes into integers
    covid_data[FIPS_name] = covid_data[FIPS_name].astype(int)
    covid_data.set_index([FIPS_name, "date"], inplace=True)
    
    # Prepare a column for state abbreviations. We will draw these
    # from state_dict
    covid_data["state_abr"] = ""
    for state, abr in state_dict.items():
        #.loc[row(s), col]
        covid_data.loc[
            covid_data["state"] == state, "state_abr"] = abr
    # Save the location name in the form "Cass, ND"
    covid_data["Location"] = covid_data["location_name"] + ", " +\
        covid_data["state_abr"]
    return covid_data
Example No. 5
def load_us_covid_dataset(
    county_level: bool = False,
    death: bool = False,
    cumulative: bool = True,
    start_date: str = '2020-01-23',
    end_date: Optional[str] = None,
    selected_counties: Optional[List[str]] = None,
):
    epi_df = dw.load_dataset(
        dataset_key='covid-19-data-resource-hub/covid-19-case-counts',
        force_update=False,
        auto_update=True,
    ).dataframes['covid_19_activity']
    ctry_col = 'country_short_name'
    state_col = 'province_state_name'
    county_col = 'county_name'
    date_col = 'report_date'
    suffix = '' if cumulative else '_new'
    case_col = (f'people_death{suffix}_count' if death
                else f'people_positive{suffix}_cases_count')
    cdfs, columns = [], []
    if county_level:
        if selected_counties is None:
            epi_df = epi_df.loc[(epi_df[ctry_col] == 'United States') &
                                (epi_df[state_col].isin(state2abbr.keys())) &
                                (epi_df[county_col] != 'Unknown'),
                                [date_col, state_col, county_col, case_col]]
        else:
            states, counties = zip(*[c.split('/') for c in selected_counties])
            selected_states = list(set(states))
            selected_counties = list(set(counties))
            epi_df = epi_df.loc[(epi_df[ctry_col] == 'United States') &
                                (epi_df[state_col].isin(selected_states)) &
                                (epi_df[county_col].isin(selected_counties)),
                                [date_col, state_col, county_col, case_col]]
        for (state, county), data in epi_df.groupby([state_col, county_col]):
            data = data.loc[:, [date_col, case_col]].groupby(date_col).sum()
            data.index = pd.to_datetime(data.index)
            cdfs.append(data)
            columns.append(f"{state}/{county}")
    else:
        epi_df = epi_df.loc[(epi_df[ctry_col] == 'United States') &
                            (epi_df[state_col].isin(state2abbr.keys())),
                            [date_col, state_col, county_col, case_col]]
        for state, data in epi_df.groupby(state_col):
            data = data.loc[:, [date_col, case_col]].groupby(date_col).sum()
            cdfs.append(data)
            columns.append(state)

    epi_df = pd.concat(cdfs, axis=1)
    epi_df.columns = columns
    epi_df.index.name = 'date'
    epi_df.index = pd.to_datetime(epi_df.index)
    epi_df.fillna(0.0, inplace=True)
    start_date = pd.to_datetime(start_date)
    if end_date is None:
        end_date = pd.to_datetime(datetime.today().date())
    else:
        end_date = pd.to_datetime(end_date)
    end_date = end_date - pd.Timedelta(1, unit='d')
    epi_df = epi_df.loc[start_date:end_date]
    return epi_df
Example No. 6
def last_update():
    cvd_data1 = dw.load_dataset(
        'https://data.world/associatedpress/johns-hopkins-coronavirus-case-tracker',
        auto_update=True)
    cvd_data1 = cvd_data1.dataframes['1_county_level_confirmed_cases']
    cvd_data1['date'] = pd.to_datetime(cvd_data1['last_update'])
    date = cvd_data1.date[0].strftime('%B %d, %Y')
    return date
Example No. 7
def dataworld_table_to_csv(data_path, dataworld_path=ZIPCODE_DATASET_PATH):
    dataset = dw.load_dataset(dataworld_path)
    properties = dataset.describe()
    dataset_name = properties["resources"][0]["name"]
    zipcode_df = dataset.dataframes[dataset_name]
    zipcode_df["zip_code"] = zipcode_df["zip_code"].apply(
        lambda x: '{0:0>5}'.format(x))
    zipcode_df.to_csv(data_path)
Example No. 8
def read_dtw_csv(project_key, filename, **kwargs):
    '''Reads a dataframe from a raw CSV file on data.world (circumventing DTW's preprocessing).'''
    datasets = dw.load_dataset(project_key, force_update=True)
    data_bytes = datasets.raw_data[filename]
    new_file, tmpfilename = tempfile.mkstemp()
    print('Writing CSV to temp file:', tmpfilename)
    os.write(new_file, data_bytes)
    os.close(new_file)
    return pd.read_csv(tmpfilename, **kwargs)
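Since the keyword arguments pass straight through to pd.read_csv, usage is a one-liner. A minimal sketch, assuming a hypothetical dataset key and raw filename:

# Hypothetical usage: 'myorg/mydataset' and 'original/data.csv' are
# placeholder names, not a real project/file pair.
df = read_dtw_csv('myorg/mydataset', 'original/data.csv', sep=',', nrows=100)
print(df.head())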
Example No. 9
def audio_features_tosql():
    past_music = dw.load_dataset('kcmillersean/billboard-hot-100-1958-2017')
    # note: 'audiio' is the table key as spelled in this dataset
    audio_features_df = past_music.dataframes['audiio']
    audio_features_df.to_sql('audio_features',
                             con=engine,
                             if_exists='append',
                             chunksize=1500)
Example No. 10
def grab_data():
    past_music = dw.load_dataset('kcmillersean/billboard-hot-100-1958-2017')
    features_df = past_music.dataframes['audiio']
    billboard_df = past_music.dataframes['hot_stuff_2']
    billboard_df["year"] = billboard_df["weekid"].str[0:4]
    billboard_df["year"] = billboard_df["year"].astype(int)
    feature_obj = pickle.dumps(features_df)
    billboard_obj = pickle.dumps(billboard_df)
    # return the pickled blobs so callers can actually use them
    return feature_obj, billboard_obj
Example No. 11
def dw_load_data(path):
    """
    Function to load the datasets from the data.world using their 
    Python connector REST API
    1. takes input from user for the dataset path username/dataset_name
    2. returns LocalDataset object which holds data
    """
    dataset = dw.load_dataset(path, auto_update=True)
    df_dict = dataset.dataframes
    print("**Dictionary of the dataframes present**")
    print(df_dict)
    return dataset
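A quick usage sketch; the dataset path below is data.world's public intro tutorial dataset (it also appears further down this page) and is used purely for illustration:

# Illustrative call; any valid 'username/dataset_name' path works the same way
ds = dw_load_data('jonloyens/an-intro-to-dataworld-dataset')
print(list(ds.dataframes))  # inspect the available table names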
Example No. 12
def import_covid_data(filename, fips_name):
    # Load COVID19 county data using datadotworld API
    # Data provided by Johns Hopkins, file provided by Associated Press
    dataset = dw.load_dataset("associatedpress/johns-hopkins-coronavirus-case-tracker")
    covid_data = dataset.dataframes["2_cases_and_deaths_by_county_timeseries"]
    covid_data = covid_data[covid_data[fips_name] < 57000]
    covid_data[fips_name] = covid_data[fips_name].astype(int)
    covid_data.set_index([fips_name, "date"], inplace=True)
    covid_data.loc[:, "state_abr"] = ""
    for state, abr in state_dict.items():
        covid_data.loc[covid_data["state"] == state, "state_abr"] = abr

    return covid_data
Example No. 13
def read_dtw_excel(project_key, filename, select_sheet=None):
    '''Reads a dataframe from a raw Excel file on data.world (circumventing DTW's preprocessing).'''
    datasets = dw.load_dataset(project_key, force_update=True)
    data_bytes = datasets.raw_data[filename]
    new_file, tmpfilename = tempfile.mkstemp()
    print('Writing excel file to temp file:', tmpfilename)
    os.write(new_file, data_bytes)
    os.close(new_file)
    xl = pd.ExcelFile(tmpfilename)
    if select_sheet:
        return xl.parse(select_sheet)
    sheet_names = xl.sheet_names
    if len(sheet_names) == 1:
        return xl.parse(sheet_names[0])
    return dict((name, xl.parse(name)) for name in sheet_names)
Example No. 14
def getCovidJSON():
    dataset = dw.load_dataset('markmarkoh/coronavirus-data',auto_update=True)

    dfs = dataset.dataframes

    json_ret = {}
    # Serialize each dataframe to JSON (ISO dates, split orientation)
    for name in ('full_data', 'new_cases', 'total_deaths',
                 'total_cases', 'new_deaths'):
        json_ret[name] = dfs[name].to_json(date_format='iso', orient='split')

    return json_ret
Example No. 15
    def process(self):
        #self.diagram()
        # print('* ProcessLoad Data.World')
        self.getSummary()[self.get_class_key()] = {}
        self.getSummary()[self.get_class_key()]['before'] = 0
        '''
        import_file_name is the full path and name of import file
        returns the original raw data as pandas dataframe
        '''
        # download to ~/.dw/cache/{}/latest/data/grb_drains.csv
        self.dataframe = dw.load_dataset(self.import_file_name,
                                         auto_update=True)

        fstr = '~/.dw/cache/{}/latest/data/lgrow_current.csv'.format(
            'citizenlabs/lgrow-storm-drains-current')

        #
        self.dataframe = pd.read_csv(fstr)

        self.addColumns()
        cols = 'columns: '
        for col in self.get_dataframe().columns.values:
            cols += col + ', '

        self.addPath('''
    Overview                           Details   
    --------                           -------
                                       ({})
                                          |
    [Retrieve Production Dataset]         |     source: {} 
       |                                  |
       |                               [Get Data from DataWorld] (response.data: {})       
       |                                  |
    [Load Production Dataset]             |     source: {}                        
       |                                  |               
       |                               [Cache DW data] <--- ({})
       |                                  |        (production count: {})'''.
                     format(self.import_file_name, self.getClassName(), cols,
                            self.getClassName(), fstr, len(self.dataframe)))

        # SUMMARIZE
        self.getSummary()[self.get_class_key()]['after'] = len(self.dataframe)
Example No. 16
def load_world_covid_dataset(
    death: bool = False,
    cumulative: bool = True,
    n_ctry: Optional[int] = None,
    start_date: str = '2020-01-23',
    end_date: Optional[str] = None,
):

    epi_df = dw.load_dataset(
        dataset_key='covid-19-data-resource-hub/covid-19-case-counts',
        force_update=False,
        auto_update=True,
    ).dataframes['covid_19_activity']
    ctry_col = 'country_short_name'
    date_col = 'report_date'
    suffix = '' if cumulative else '_new'
    case_col = (f'people_death{suffix}_count' if death
                else f'people_positive{suffix}_cases_count')
    if n_ctry is None:
        ref_ctry = ref_countries
    else:
        ref_ctry = epi_df.loc[:, [date_col, ctry_col, case_col]].groupby([ctry_col, date_col]).sum().reset_index()\
            .groupby(ctry_col).last().sort_values(case_col, ascending=False).head(n_ctry).index.values
    ref_data = epi_df.loc[epi_df[ctry_col].isin(ref_ctry),
                          [date_col, ctry_col, case_col]]
    columns, cdfs = [], []
    for ctry, data in ref_data.groupby(ctry_col):
        data = data[[date_col, case_col]].groupby(date_col).sum()
        data.index = pd.to_datetime(data.index)
        cdfs.append(data)
        columns.append(ctry)
    epi_df = pd.concat(cdfs, axis=1)
    epi_df.columns = columns
    epi_df.index.name = 'date'
    epi_df.index = pd.to_datetime(epi_df.index)
    epi_df.fillna(0.0, inplace=True)
    start_date = pd.to_datetime(start_date)
    if end_date is None:
        end_date = pd.to_datetime(datetime.today().date())
    else:
        end_date = pd.to_datetime(end_date)
    end_date = end_date - pd.Timedelta(1, unit='d')
    epi_df = epi_df.loc[start_date:end_date]
    return epi_df
Example No. 17
def fetch_dataframe() -> pd.DataFrame:
    """
    Fetch the raw endpoints dataset as Pandas dataframe and validate it
    for the required fields.
    """

    dataframe = dw.load_dataset(
        settings.DATADOTWORLD['dataset'],
        auto_update=True).dataframes[settings.DATADOTWORLD['dataframe']]

    supplied_fields = list(dataframe)

    missing_fields = set(REQUIRED_FIELDS) - set(supplied_fields)

    if missing_fields:
        raise ValueError(
            f'The provided endpoints dataset does not include required fields: '
            f'{", ".join(missing_fields)}. '
            f'Fields provided: {", ".join(supplied_fields)}')

    return dataframe.fillna('')
Example No. 18
def load_bed_and_population_data():
    beds = dw.load_dataset(
        dataset_key='liz-friedman/hospital-capacity-data-from-hghi',
        force_update=False,
        auto_update=True,
    ).dataframes['20_population']
    beds = beds.loc[:, [
        'hrr', 'total_hospital_beds', 'total_icu_beds', 'adult_population',
        'population_65'
    ]]
    beds[['county', 'state']] = beds.hrr.str.split(', ', expand=True)
    beds = beds.loc[:, [
        'state', 'total_hospital_beds', 'total_icu_beds', 'adult_population',
        'population_65'
    ]].groupby('state').sum()
    geo = pd.read_csv(
        "https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv",
        usecols=['state', 'population', 'pop_density'],
        index_col='state',
    )
    geo['area'] = geo['population'] / geo['pop_density']
    beds['density'] = beds['adult_population'] / geo.loc[beds.index, 'area']
    return beds
Example No. 19
def fetch_dataset(DATASET_URL=DATASET_URL):
    """
        Fetches the data.world dataset from the given URL path using dw.load_dataset()

        The load_dataset() function facilitates maintaining copies of datasets on the
        local filesystem. It will download a given dataset's datapackage and store it
        under ~/.dw/cache. When used subsequently, load_dataset() will use the copy
        stored on disk and will work offline, unless it's called with force_update=True
        or auto_update=True.

        force_update=True will overwrite your local copy unconditionally.
        auto_update=True will only overwrite your local copy if a newer version of the dataset is available on data.world.

        Returns
        -------
        `datadotworld.models.dataset.LocalDataset` object

    """
    sys.stdout.write("\n> Fetching bookmarks from: https://data.world/" +
                     DATASET_URL + " -> ")
    with Spinner():
        dataset = dw.load_dataset(DATASET_URL, auto_update=True)
        print("\n")

    if args.verbose:
        colorama.init(autoreset=True)
        print(
            colorama.Fore.BLACK + colorama.Back.YELLOW +
            "\n Local Dataset Info: " + "---" * 23, "\n")

        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(dataset.describe())
        print("\n", dataset.dataframes)

        print(colorama.Fore.BLACK + colorama.Back.YELLOW + "\n" + "---" * 30)

    return dataset
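The caching behavior described in the docstring is easy to exercise directly. A minimal sketch (the dataset key is illustrative):

import datadotworld as dw

KEY = 'jonloyens/an-intro-to-dataworld-dataset'  # illustrative key

ds = dw.load_dataset(KEY)                     # first call downloads and caches under ~/.dw/cache
ds = dw.load_dataset(KEY)                     # later calls read the local copy and work offline
ds = dw.load_dataset(KEY, auto_update=True)   # re-download only if a newer version exists
ds = dw.load_dataset(KEY, force_update=True)  # overwrite the local copy unconditionally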
Example No. 20
def get_data(key, data_name):
    """
    Return datadotworld dataset as pandas dataframe.

    Parameters
    ----------
    key:        str
        Dataset key for target data.world dataset.

    data_name:  str
        Name of the dataframe within the dataset (the key under which
        the table is stored).

    Returns
    -------
    pandas dataframe

    Examples
    --------
    >>> get_data(key='org/division', data_name='employee_history')
    """
    data_obj = dw.load_dataset(dataset_key=key, auto_update=True)
    data = data_obj.dataframes[data_name]

    return data
Example No. 21
def import_unemployment_data(filename, FIPS_name):
    # data provided by USDA
    # https://www.ers.usda.gov/data-products/county-level-data-sets/download-data/
    dataset = dw.load_dataset("unemployment.csv", auto_update=True)
    # the dataset includes multiple dataframes. We will only use one of them
    unemployment_data = dataset.dataframes["Unemployment_rate_2019"]
    # drop any NaN FIPS values with unemployment_data[FIPS_name] > 0
    unemployment_data = unemployment_data[unemployment_data[FIPS_name] < 57000]
    unemployment_data = unemployment_data[unemployment_data[FIPS_name] > 0]

    # Transform FIPS codes into integers (not floats)
    unemployment_data[FIPS_name] = unemployment_data[FIPS_name].astype(int)
    unemployment_data.set_index([FIPS_name, "date"], inplace=True)
    # Prepare a column for state abbreviations. We will draw these from a
    # dictionary created in the next step.
    unemployment_data["state_abr"] = ""
    for state, abr in state_dict.items():
        unemployment_data.loc[unemployment_data["state"] == state,
                              "state_abr"] = abr
    # Create "Location" which concatenates county name and state abbreviation
    unemployment_data["Location"] = unemployment_data["location_name"] + ", " + \
        unemployment_data["state_abr"]

    return unemployment_data
Example No. 22
import pandas as pd
import datadotworld as dw
d = dw.load_dataset('ian/3-centuries-of-uk-economy-data') # pull data into ~/.dw
s = pd.Series({k: v.shape for k, v in d.dataframes.items()})

df_orig = d.dataframes['m6_mthly_prices_and_wages']
cols = df_orig.iloc[1:5]
names = cols.iloc[:,0].values
cols = pd.MultiIndex.from_arrays(cols.iloc[:,2:].values)
cols.names = names

df = df_orig.iloc[5:].set_index(['column_a', 'column_b'])
df.index.names = ['year', 'month']
df.columns = cols
df = df.astype(float)
Example No. 23
# Import the datadotworld module as dw
import datadotworld as dw

# Import the city council votes dataset
dataset = dw.load_dataset('stephen-hoover/chicago-city-council-votes')
queryResults = dw.query('http://data.world/tutorial/sparqltutorial',
                        sparql_query,
                        query_type='sparql')

# Use the dataframe property of the resulting query to create a dataframe variable named `houseStark`
houseStark = queryResults.dataframe

# Use pp.pprint() to print the dataframe to the screen.
pp.pprint(houseStark)

Example No. 24
# Import the datadotworld module as dw and the sys module
import datadotworld as dw
import sys

# Import a dataset
refugee_dataset = dw.load_dataset('nrippner/refugee-host-nations')

# Get the size of the dataset:
sys.getsizeof(refugee_dataset)

# List all of the data files:
dataframes = refugee_dataset.dataframes
for df in dataframes:
    pp.pprint(df)

# print all of the files in a dataset:
resources = refugee_dataset.describe()['resources']
pp.pprint('name:')
for r in resources:
    pp.pprint(r['name'])
pp.pprint('\ntype of file:')
Example No. 25
# userid: berkj
# Email: [email protected]
# Assignment Number: assignment1
# Honor statement: I pledge on my honor that I have neither given nor
#  received unauthorized aid on this assignment.
# Exercise 4:

# The datadotworld module and dataset have already been loaded for you:
import datadotworld as dw

dataset = dw.load_dataset(
    'https://data.world/stephen-hoover/chicago-city-council-votes')

# Use the dataframes property to assign the alderman_votes table to the variable votes_dataframe.
votes_dataframe = dataset.dataframes['alderman_votes']

# Use the pandas shape property to get rows/columns size for the `votes_dataframe` dataframe.
pp.pprint(votes_dataframe.shape)

# Use the pandas head function to print the first 3 rows of the `votes_dataframe` dataframe.
pp.pprint(votes_dataframe.head(3))
Example No. 26
'''
Created on Nov 28, 2017

@author: drews
'''

import datadotworld as dw

dataset = dw.load_dataset('data-society/european-soccer-data')

matches = dataset.dataframes['match']

# drop unneeded columns
# keep players, xy coords of players, date, id, goals, home/away team ids
matches = matches.drop(matches.columns[77:115], axis=1)
matches = matches.drop(matches.columns[0:5], axis=1)
matches = matches.dropna(axis=0, how='any')

matches.to_csv('matches.csv', index=False)
print('wrote matches.csv')

teams = dataset.dataframes['team_attributes']
teams = teams.drop(teams.columns[[0, 1, 6]], 1)

# change string values to integer values
teams['buildupplayspeedclass'] = teams['buildupplayspeedclass'].replace(
    ['Fast', 'Balanced', 'Slow'], [3, 2, 1])
teams['buildupplaydribblingclass'] = teams[
    'buildupplaydribblingclass'].replace(['Lots', 'Normal', 'Little'],
                                         [3, 2, 1])
teams['buildupplaypassingclass'] = teams['buildupplaypassingclass'].replace(
Example No. 27
import datadotworld as d

td = intro_dataset = d.load_dataset('rfabbri/test1')

q = """SELECT * WHERE {?s ?p ?o}"""

q2 = """
PREFIX po: <http://purl.org/socialparticipation/po/>
SELECT ?s WHERE {?s a po:Participant}
"""

r = d.query('rfabbri/test1', q2, query_type='sparql')
Example No. 28
# userid: berkj
# Email: [email protected]
# Assignment Number: assignment1
# Honor statement: I pledge on my honor that I have neither given nor
#  received unauthorized aid on this assignment.
# Exercise 5:

# datadotworld module has been imported as dw
import datadotworld as dw

# We've loaded two datasets to use 'int_dataset' and 'fipsCodes_dataset'
int_dataset = dw.load_dataset('https://data.world/jonloyens/intermediate-data-world')
fipsCodes_dataset = dw.load_dataset('https://data.world/uscensusbureau/fips-state-codes')

## Create two dataframes: police_shootings from the 'fatal_police_shootings_data' table of int_dataset and state_abbrvs, from the 'statesfipscodes' table of fipsCodes_dataset
police_shootings = int_dataset.dataframes['fatal_police_shootings_data']
state_abbrvs = fipsCodes_dataset.dataframes['statesfipscodes']

## Merge the two datasets together on the state and stusab fields. Assign to a merged_dataframe variable.
merged_dataframe = police_shootings.merge(state_abbrvs, how='left',
                                          left_on='state', right_on='stusab')


## Add a 'citystate' column to the merged_dataframe dataframe, populating it with the concatenated values from the 'city' and 'state_name' columns, separated by ', '.
merged_dataframe["citystate"] = merged_dataframe["city"] + ", " + merged_dataframe["state_name"]

## Print first 5 rows of merged_dataframe
pp.pprint(merged_dataframe.head(5))
Example No. 29
# datadotworld module has been imported as dw
import datadotworld as dw

# We've loaded two datasets to use 'int_dataset' and 'fipsCodes_dataset'
int_dataset = dw.load_dataset(
    'https://data.world/jonloyens/intermediate-data-world')
fipsCodes_dataset = dw.load_dataset(
    'https://data.world/uscensusbureau/fips-state-codes')

## Create two dataframes: police_shootings from the 'fatal_police_shootings_data' table of int_dataset and state_abbrvs, from the 'statesfipscodes' table of fipsCodes_dataset
police_shootings = int_dataset.dataframes['fatal_police_shootings_data']
state_abbrvs = fipsCodes_dataset.dataframes['statesfipscodes']

## Merge the two datasets together on the state and stusab fields. Assign to a merged_dataframe variable.
merged_dataframe = police_shootings.merge(state_abbrvs,
                                          how='left',
                                          left_on='state',
                                          right_on='stusab')

## Add a 'citystate' column to the merged_dataframe dataframe, populating it with the concatenated values from the 'city' and 'state_name' columns, separated by ', '.
merged_dataframe['citystate'] = merged_dataframe[
    'city'] + ", " + merged_dataframe['state_name']

## Print first 5 rows of merged_dataframe
pp.pprint(merged_dataframe.head(5))
Example No. 30
"""

# %%  Imports.

import datadotworld as dw
import io
import pandas as pd

# %%  Reading, parsing, naming, upsampling and interpolating the data.

#  Loading the data (CSV) from: https://data.world/aryoryte/
DDWUsrDir = "aryoryte/"
DDWUaDir = "meteorological-uppsala-automatic-weather-station-1998-2017"
DDWUaDir = DDWUsrDir + DDWUaDir
dataPath = "original/Uppsala 1998 till 2017.csv"
UaCSV = dw.load_dataset(DDWUaDir).raw_data[dataPath]

#  Names given to the input variables/features/...
colnames = ['UTC', 'windDir', 'windSpeed', 'airTemp', 'dewPt', 'relHum']

#  Reading the data from the CSV file into a Pandas dataframe; headers
#  are at row 6 (but given the names colnames), trying to skip the last
#  row which is blank/empty but an initial space - fails..., choosing
#  parse_dates True to most quickly (C engine) parse the dates of the
#  UTC/index column.
df = pd.read_csv(io.StringIO(UaCSV.decode('utf-8')),
                 header=6,
                 sep=';',
                 names=colnames,
                 index_col='UTC',
                 skipinitialspace=True,
                 parse_dates=True)
Example No. 31
# userid: berkj
# Email: [email protected]
# Assignment Number: assignment1
# Honor statement: I pledge on my honor that I have neither given nor
#  received unauthorized aid on this assignment.
# Exercise 1:

# Import the datadotworld module as dw
import datadotworld as dw

# Import the city council votes dataset
dataset = dw.load_dataset('stephen-hoover/chicago-city-council-votes')
Example No. 32
    def load_children(self):
        if self.data_set is None:
            full_name = self.owner + "/" + self.data_set_name
            self.data_set = dw.load_dataset(full_name)
        for table_name in self.data_set.dataframes.keys():
            self.add_child(DataDotWorldTableNode(self, table_name))
"""
  Name     : c3_27_datadotworld_1.py
  Book     : Hands-on Data Science with Anaconda
  Publisher: Packt Publishing Ltd. 
  Author   : Yuxing Yan and James Yan
  Date     : 1/15/2018
  email    : [email protected]
             [email protected]
"""


import datadotworld as dw
dataset = 'jonloyens/an-intro-to-dataworld-dataset'
data = dw.load_dataset(dataset, force_update=True)
list(data.dataframes)
Example No. 34
    @classmethod
    def from_uri(cls, uri: str, **kwargs) -> "DataDotWorldTable":
        dataset_name = "/".join(uri.split("/")[2:-1])
        dataset = dw.load_dataset(dataset_name)
        df = dataset.dataframes[uri.split("/")[-1]]
        return cls(inner_data=df, uri=uri, **kwargs)
Example No. 35
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import datadotworld as dw

dataset = dw.load_dataset('tonatihu/prueba-sngular')
dataset.describe()
dataframedatos = dataset.dataframes['datos']
dataframeprovincia = dataset.dataframes['provincia']

left = pd.DataFrame(dataframeprovincia)
right = pd.DataFrame(dataframedatos)

mergequery = pd.merge(left, right, on='id_provincia', how='inner')
prov = mergequery.groupby('provincia')
readytoplot = prov['ventas_totales'].agg(np.sum)

print(readytoplot)

plt.title("Ventas por provincia")
plt.xlabel("Ventas totales")
plt.ylabel("Provincia")
plt.plot(readytoplot)
plt.show()
Example No. 36
# userid: berkj
# Email: [email protected]
# Assignment Number: assignment1
# Honor statement: I pledge on my honor that I have neither given nor
#  received unauthorized aid on this assignment.
# Exercise 4:

# The datadotworld module and dataset have already been loaded for you:
import datadotworld as dw
dataset = dw.load_dataset('https://data.world/stephen-hoover/chicago-city-council-votes')

# Use the dataframes property to assign the alderman_votes table to the variable votes_dataframe.
votes_dataframe = dataset.dataframes['alderman_votes']

# Use the pandas shape property to get rows/columns size for the `votes_dataframe` dataframe.
pp.pprint(votes_dataframe.shape)

# Use the pandas head function to print the first 3 rows of the `votes_dataframe` dataframe.
pp.pprint(votes_dataframe.head(3))