def normalize_etpinard_df(df='https://plot.ly/~etpinard/191.csv',
                          columns='x y size text'.split(),
                          category_col='category',
                          possible_categories=None):
    """Reformat a dataframe in etpinard's format for use in plot functions and sklearn models"""
    possible_categories = (['Africa', 'Americas', 'Asia', 'Europe', 'Oceania']
                           if possible_categories is None else possible_categories)
    # Load the CSV first; column labels can only be cleaned on a DataFrame
    df = pd.read_csv(df) if isinstance(df, str) else df
    df.columns = clean_columns(df.columns)
    columns = clean_columns(list(columns))
    df2 = pd.DataFrame(columns=columns)
    # One block of rows per category, stacked vertically
    df2[category_col] = np.concatenate([np.array([categ] * len(df))
                                        for categ in possible_categories])
    columns = zip(columns, [[clean_columns(categ + ', ' + column)
                             for categ in possible_categories]
                            for column in columns])
    for col, category_cols in columns:
        df2[col] = np.concatenate([df[label].values for label in category_cols])
    return df2
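# `clean_columns` is used throughout this module but is not defined in this
# section. A minimal sketch of a plausible implementation (lowercase,
# collapse runs of non-word characters to underscores), inferred from how it
# is called in this module; an assumption, not necessarily the real helper:
import re


def clean_columns_sketch(columns):
    """Normalize a column label (or an iterable of labels) to snake_case."""
    if isinstance(columns, str):
        return re.sub(r'\W+', '_', columns).strip('_').lower()
    return [clean_columns_sketch(col) for col in columns]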
def download(names=None, verbose=True):
    """Download CSV or HTML tables listed in `names` and save them to DATA_PATH/`names`.csv

    Uses the table in data_info.csv (internal DATA_INFO) to determine the URL
    or file path from the dataset name.

    TODO: if `name` is a valid URL, download it, generate a name, and store
    the name: url pair in data_info.csv before downloading.
    """
    names = [names] if isinstance(names, str) else names  # `basestring` is Python 2 only
    # names = names or list(BIG_URLS.keys())  # download them all, if none specified!
    file_paths = {}
    for name in names:
        name = name.lower().strip()
        if name in BIG_URLS:
            file_paths[name] = download_file(BIG_URLS[name][0],
                                             data_path=BIGDATA_PATH,
                                             size=BIG_URLS[name][1],
                                             verbose=verbose)
            if file_paths[name].endswith('.tar.gz'):
                logger.info('Extracting {}'.format(file_paths[name]))
                untar(file_paths[name])
                # FIXME: rename tar.gz file so that it mimics contents
                file_paths[name] = file_paths[name][:-len('.tar.gz')]
            if file_paths[name].endswith('.zip'):
                file_paths[name] = unzip(file_paths[name])
        else:
            df = pd.read_html(DATA_INFO['url'][name],
                              **DATA_INFO['downloader_kwargs'][name])[-1]
            df.columns = clean_columns(df.columns)
            file_paths[name] = os.path.join(DATA_PATH, name + '.csv')
            df.to_csv(file_paths[name])
    return file_paths
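# Usage sketch for `download`. 'w2v' is assumed to be a key in BIG_URLS, and
# the returned path is illustrative only:
#
#     >>> paths = download('w2v')  # doctest: +SKIP
#     >>> paths                    # doctest: +SKIP
#     {'w2v': '/path/to/BIGDATA_PATH/...bin'}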
def get_data(name='food_carbon', url=None):
    """Retrieve data from the local cache in data/ or download it from `url`

    >>> get_data('food_carbon').shape
    (16, 4)
    """
    name = make_name(name)  # replace spaces with '_', etc.
    if name == 'food_carbon':
        df = pd.read_html('http://www.greeneatz.com/foods-carbon-footprint.html',
                          header=0)[0]
        df.to_csv('food_carbon.csv')
    elif name == 'capitals':
        df = pd.read_html('https://en.wikipedia.org/wiki/List_of_capitals_in_the_United_States',
                          header=0)[0]
        df.columns = clean_columns(df.columns)
    return df  # NameError if `name` matches neither branch
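# The docstring above promises a local-cache lookup, but the body always
# re-downloads. A minimal sketch of the cache check it likely intends,
# assuming DATA_PATH is the cache directory used elsewhere in this module
# (the function name and exact behavior are assumptions):
import os

import pandas as pd


def get_data_cached_sketch(name='food_carbon'):
    """Return the cached CSV for `name` if present, else fetch via get_data()."""
    path = os.path.join(DATA_PATH, make_name(name) + '.csv')
    if os.path.isfile(path):
        return pd.read_csv(path, index_col=0)
    return get_data(name)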
def download_unzip(names=None, verbose=True):
    """Download CSV or HTML tables listed in `names`, unzip them, and save to DATA_PATH/`names`.csv, .txt, etc.

    Also normalizes file name extensions (.bin.gz -> .w2v.bin.gz).
    Uses the table in data_info.csv (internal DATA_INFO) to determine the URL
    or file path from the dataset name.

    TODO: if `name` is a valid URL, download it, generate a name, and store
    the name: url pair in data_info.csv before downloading.
    """
    names = [names] if isinstance(names, str) else names  # `basestring` is Python 2 only
    # names = names or list(BIG_URLS.keys())  # download them all, if none specified!
    file_paths = {}
    for name in names:
        name = name.lower().strip()
        if name in BIG_URLS:
            file_paths[name] = download_file(BIG_URLS[name][0],
                                             data_path=BIGDATA_PATH,
                                             size=BIG_URLS[name][1],
                                             verbose=verbose)
            if file_paths[name].lower().endswith('.tar.gz'):
                logger.info('Extracting {}'.format(file_paths[name]))
                file_paths[name] = untar(file_paths[name])
            if file_paths[name].lower().endswith('.zip'):
                file_paths[name] = unzip(file_paths[name])
            logger.debug('download_unzip.filepaths=' + str(file_paths))
        else:
            df = pd.read_html(DATA_INFO['url'][name],
                              **DATA_INFO['downloader_kwargs'][name])[-1]
            df.columns = clean_columns(df.columns)
            file_paths[name] = os.path.join(DATA_PATH, name + '.csv')
            df.to_csv(file_paths[name])
        logger.debug('download_unzip.filepaths=' + str(file_paths))
        new_file_paths = normalize_ext(file_paths[name])
        logger.debug('download_unzip.new_filepaths=' + str(new_file_paths))
        file_paths[name] = rename_file(file_paths[name], new_file_paths)
        logger.debug('download_unzip.filepaths=' + str(file_paths))
    return file_paths
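# `normalize_ext` is called above but not defined in this section. Based on
# the docstring note ('.bin.gz -> .w2v.bin.gz'), a plausible sketch of its
# contract; an assumption, not the actual implementation:
def normalize_ext_sketch(filepath):
    """Insert a '.w2v' format marker before a bare '.bin.gz' extension."""
    if filepath.endswith('.bin.gz') and not filepath.endswith('.w2v.bin.gz'):
        return filepath[:-len('.bin.gz')] + '.w2v.bin.gz'
    return filepath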
def offline_plotly_scatter_bubble(df, x='x', y='y',
                                  size_col='size', text_col='text',
                                  category_col='category', possible_categories=None,
                                  filename='offline_plotly_scatter_bubble.html',
                                  config={'displaylogo': False},
                                  xscale=None, yscale='log',
                                  layout={'hovermode': 'closest', 'showlegend': False, 'autosize': True},
                                  marker={'sizemode': 'area'},
                                  ):
    """Interactive scatterplot of a DataFrame with the size and color of circles linked to two columns

    config keys:
      fillFrame setBackground displaylogo sendData showLink linkText staticPlot
      scrollZoom plot3dPixelRatio displayModeBar showTips workspace doubleClick
      autosizable editable

    layout keys:
      angularaxis annotations autosize bargap bargroupgap barmode barnorm
      boxgap boxgroupgap boxmode calendar direction dragmode font geo height
      hiddenlabels hiddenlabelssrc hidesources hovermode images legend mapbox
      margin orientation paper_bgcolor plot_bgcolor radialaxis scene separators
      shapes showlegend sliders smith ternary title titlefont updatemenus width
      xaxis yaxis

    marker keys:
      autocolorscale blend border cauto cmax cmin color colorbar colors
      colorscale colorsrc colorssrc line maxdisplayed opacity opacitysrc
      outliercolor reversescale showscale size sizemax sizemin sizemode sizeref
      sizesrc symbol symbolsrc

    marker['sizeref'] gives the denominator of the circle scaling factor.
    Typically it should be about a tenth of the minimum 'size' column value.

    >>> from nlpia.data.loaders import get_data
    >>> from nlpia.plots import offline_plotly_scatter_bubble
    >>> df = get_data('cities_us_wordvectors_pca2_meta')
    >>> html = offline_plotly_scatter_bubble(
    ...     df.sort_values('population', ascending=False)[:350].copy().sort_values('population'),
    ...     filename='plotly_scatter_bubble.html',
    ...     x='x', y='y',
    ...     size_col='population', text_col='name', category_col='timezone',
    ...     xscale=None, yscale=None,  # 'log' or None
    ...     layout={}, marker={'sizeref': 3000})
    """
    config_default = dict(DEFAULT_PLOTLY_CONFIG)
    marker_default = {
        'size': size_col,
        'sizemode': 'area',
        'sizeref': int(df[size_col].min() * .8),
    }
    marker_default.update(marker)
    size_col = marker_default.pop('size')
    layout_default = {
        'xaxis': graph_objs.XAxis(title=x, type=xscale),
        'yaxis': graph_objs.YAxis(title=y, type=yscale),
    }
    layout_default.update(**layout)
    if config is not None:
        config_default.update(config)
    df.columns = clean_columns(df.columns)
    # Derive the category labels from the DataFrame if they weren't given
    if possible_categories is None and category_col is not None:
        if category_col in df.columns:
            category_labels = df[category_col]
        else:
            category_labels = np.array(category_col)
        possible_categories = list(set(category_labels))
    possible_categories = [None] if possible_categories is None else possible_categories
    # One boolean row-mask per category (all rows if no category column exists)
    if category_col in df:
        masks = [np.array(df[category_col] == label) for label in possible_categories]
    else:
        masks = [np.array([True] * len(df))] * len(possible_categories)
    logger.debug('marker_default={}'.format(marker_default))  # was a stray print()
    data = {
        'data': [
            graph_objs.Scatter(
                x=df[x][mask].values,
                y=df[y][mask].values,
                text=df[text_col][mask].values,
                marker=graph_objs.Marker(
                    size=df[size_col][mask] if size_col in df.columns else size_col,
                    **marker_default),
                mode='markers',
                name=str(category_name))
            for (category_name, mask) in zip(possible_categories, masks)
        ],
        'layout': graph_objs.Layout(**layout_default)
    }
    return offline_plotly_data(data, filename=filename, config=config_default)
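# `offline_plotly_data` is not shown in this section. A minimal sketch of the
# wrapper it likely is, assuming plotly's offline `plot` helper (the exact
# name and signature here are assumptions):
def offline_plotly_data_sketch(data, filename=None, config=None):
    """Render a plotly figure dict to a standalone HTML file and return its path."""
    from plotly.offline import plot
    return plot(data, filename=filename or 'offline_plotly_data.html',
                config=config, auto_open=False)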
def clean_capitals(capitals):
    """Tidy the raw Wikipedia 'List of capitals in the United States' table

    Reconstructed from an interactive exploration session (REPL echoes,
    %paste and shell commands removed): rename columns, cast dtypes, and
    drop a duplicated rank column.
    """
    capitals = capitals.copy()
    capitals.columns = clean_columns(capitals.columns)
    columns = list(capitals.columns)
    columns[2] = 'statehood_year'  # the third column holds the year of statehood
    capitals.columns = columns
    capitals['city'] = capitals.capital
    # the bare `population` column isn't cleanly numeric; use the 2010 census figure
    capitals['population'] = capitals.population_2010.astype(int)
    capitals['capital_since'] = capitals.capital_since.astype(int)
    # `notes` holds the metro-area population; the int cast fails on NaNs, so use float
    capitals['population_2010_metro'] = capitals.notes.astype(float)
    # `unnamed_8` duplicates the 2010 population rank, so keep a single named copy
    capitals['population_2010_rank'] = capitals.unnamed_8.astype(int)
    del capitals['unnamed_8']
    return capitals
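# Usage sketch: fetch the raw table with get_data('capitals') above, then
# tidy it. `capitals2` in the original session was presumably that raw frame,
# and the cache path below is an assumption:
#
#     >>> capitals = clean_capitals(get_data('capitals'))  # doctest: +SKIP
#     >>> capitals.to_csv(os.path.join(DATA_PATH, 'capitals.csv'))  # doctest: +SKIP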