示例#1
0
      {paths:{base:'/static/base',
              plotly:'https://cdn.plot.ly/plotly-1.5.1.min.js?noext'}});
  </script>"""))
# Put cufflinks into offline mode and run the notebook's plotly setup helper.
cf.go_offline()
configure_plotly()
init_notebook_mode(connected=False)

# Render every float with six decimals in DataFrame output.
pd.set_option('display.float_format', lambda x: '%.6f' % x)

# CSV of single-family-residence values, fetched over HTTP.
url = ('https://raw.githubusercontent.com/noahgift/'
       'real_estate_ml/master/data/'
       'Zip_Zhvi_SingleFamilyResidence.csv')

# NOTE(review): `re` shadows the name of the stdlib regex module; it is kept
# because code outside this section may already refer to the frame by it.
# Rows with any missing value are dropped, then the three id/rank columns
# are cast to int in a single astype call.
re = (pd.read_csv(url)
        .dropna()
        .astype({'RegionID': 'int', 'RegionName': 'int', 'SizeRank': 'int'}))

# Column-wise medians for two counties and for the whole frame.
# numeric_only=True is required because the frame still holds string columns
# (e.g. 'CountyName'); without it pandas >= 2.0 raises TypeError instead of
# silently skipping them.
# .iloc[3:] discards the first three rows of the result -- presumably the
# medians of RegionID / RegionName / SizeRank; confirm against the CSV layout.
re_median = pd.concat(
    [re[re['CountyName'] == 'Los Angeles'].median(numeric_only=True),
     re[re['CountyName'] == 'San Francisco'].median(numeric_only=True),
     re.median(numeric_only=True)],
    axis=1, sort=False).iloc[3:]
re_median.columns = ['Los Angeles', 'San Francisco', 'Median USA']

layout = cf.Layout(height=500, width=800)
re_median.iplot(title='Median Single Family Home Prices 1996-2017',
                xTitle='Year', yTitle='Sales Price',
                shape=(4, 1), fill=True, layout=layout)

# Transposed view limited to the first seven columns, one decimal shown.
# NOTE(review): Styler.set_precision is deprecated in recent pandas in favour
# of Styler.format(precision=...); left unchanged to match the pandas version
# this notebook targets -- confirm before upgrading.
re_median.T.iloc[:, :7].style.set_precision(1)

import pylab; from skimage import io,color,measure
# Names of the matplotlib colormaps made available to the plotting code below.
cmaps = [
    'ocean', 'cool', 'gnuplot2', 'terrain',
    'winter', 'spring', 'summer', 'autumn',
]
# Apply the 'ggplot' style sheet to subsequent figures.
pylab.style.use('ggplot')
def vector(file_num,cm,level=.85):
    file_path='https://olgabelitskaya.gitlab.io/data/decors/'
    file_name='00_00_%03d'%(file_num)+'.png'
    img=io.imread(file_path+file_name)
    gray_img=color.colorconv.rgb2grey(img) 
    contours=measure.find_contours(gray_img,level)
示例#2
0
        ax2.set_title('Loan status')
        plt.xticks(rotation=90)
    ax2.set_xlabel(col_name)

    plt.tight_layout()


# ### Feature correlations

# In[23]:

# Spearman rank correlation between the columns of df_selected.
corr = df_selected.corr(method='spearman')

# In[24]:

# Heatmap of the absolute correlations.
layout = cf.Layout(height=600, width=600)
corr.abs().iplot(kind='heatmap',
                 layout=layout.to_plotly_json(),
                 colorscale='RdBu')

# In[25]:

import scipy
import scipy.cluster.hierarchy as sch

# Cluster the columns by the similarity of their correlation profiles.
X = df_selected.corr(method='spearman').values
d = sch.distance.pdist(X)  # condensed vector of pairwise distances between rows of X
L = sch.linkage(d, method='complete')
# Cut the dendrogram at 25% of the largest pairwise distance.
ind = sch.fcluster(L, 0.25 * d.max(), 'distance')
# Order the column labels so that members of the same cluster sit together.
# NOTE(review): positions from `ind` are applied to df_selected.columns, which
# assumes every column of df_selected took part in the correlation -- confirm.
columns = [df_selected.columns[i] for i in np.argsort(ind)]
# DataFrame.reindex_axis was removed in pandas 1.0; reindex(columns=...) is
# the supported equivalent.
df_selected_new = df_selected.reindex(columns=columns)
示例#3
0
# Pie trace built from `grouped`: one slice per region, sized by session_id.
trace_1 = go.Pie(
    labels=grouped.region,
    values=grouped.session_id,
    name='session_id',
    showlegend=False,
)
layout2 = go.Layout(
    plot_bgcolor='rgb(26,26,26)',
    paper_bgcolor='rgb(26,26,26)',
    font_color='white',
    hovermode='closest',
)
fig2 = go.Figure(data=[trace_1], layout=layout2)

# Step 3. Create a plotly figure
# Dark layout with a horizontal legend and no top margin.
layout = cf.Layout(
    legend={'orientation': 'h', 'y': 1, 'xanchor': 'center', 'x': 0.4},
    margin=dict(t=0),
    plot_bgcolor='rgb(26,26,26)',
    paper_bgcolor='rgb(26,26,26)',
    font_color='white',
    font_size=11,
)

# Aggregate the data for plotting.
# Number of user_id entries per region; the name `regions` is then rebound
# to the figure that iplot returns (asFigure=True).
regions = user_info.groupby('region')['user_id'].count().reset_index()
regions = regions.iplot(
    kind='pie',
    labels='region',
    values='user_id',
    title='% of Users by Region',
    theme='space',
    legend=False,
    asFigure=True,
)

# user_id counts per (region, age_range), with age_range spread into columns;
# `user_age` likewise ends up holding the returned figure.
user_age = user_info.groupby(['region', 'age_range'])['user_id'].count().unstack()
user_age = user_age.iplot(
    kind='bar',
    title='Age Demographic by Region',
    xTitle='age range',
    yTitle='number of users',
    theme='space',
    asFigure=True,
)
示例#4
0
def plot_correlation(
        df,
        cluster=False,
        layered_cluster=False,
        iplot=False,
        triangle=False,
        like=None,
        sort_by_carrier=False,
        figsize=(35, 30),
        title="",
):
    """Draw a heatmap of the pairwise correlations between the columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are correlated with ``DataFrame.corr()``.
    cluster : bool
        If true, ``df`` is first passed through ``cluster_correlations``.
    layered_cluster : bool
        Forwarded to ``cluster_correlations`` as ``layered``.
    iplot : bool
        If true, render an interactive heatmap via ``DataFrame.iplot``;
        otherwise draw a static seaborn heatmap.
    triangle : bool
        If true, show only one triangle of the matrix. The interactive branch
        keeps the lower triangle including the diagonal; the seaborn branch
        masks the upper triangle including the diagonal.
    like : str or None
        If given, keep only correlation columns whose label contains it.
    sort_by_carrier : bool
        If true, reorder (and restrict) the columns to those whose label
        contains one of a fixed list of substrings, in that list's order.
    figsize : tuple
        Figure size for the seaborn branch.
    title : str
        Axes title for the seaborn branch (not used by the interactive branch).

    Returns
    -------
    None
    """
    if iplot and isinstance(df.columns, pd.MultiIndex):
        # Flatten the column tuples into single strings. This is done on a
        # shallow copy so the caller's frame keeps its original MultiIndex.
        df = df.copy(deep=False)
        df.columns = [" ".join(col).strip() for col in df.columns.values]

    if cluster:
        df = cluster_correlations(df, layered=layered_cluster)

    if sort_by_carrier:
        carrier_order = (
            "offwind-dc",
            "offwind-ac",
            "onwind",
            "solar",
            "CCGT",
            "OCGT",
            "H2",
            "battery",
            "LK",
            "LN",
        )
        columns = []
        for carrier in carrier_order:
            columns.extend(df.filter(like=carrier, axis=1).columns)
        df = df.reindex(columns, axis="columns")

    df = df.rename(columns=opts["nice_names"])
    corr = df.corr()

    if like is not None:
        corr = corr.filter(like=like)

    if iplot:

        if triangle:
            # The builtin `bool` replaces the `np.bool` alias, which was
            # removed in NumPy 1.24.
            corr = corr.where(np.tril(np.ones(corr.shape)).astype(bool))

        lt = cf.Layout(height=1000, width=1000)

        corr.iplot(kind="heatmap",
                   colorscale="PiYG",
                   zmin=-1,
                   zmax=1,
                   layout=lt)

    else:

        f, ax = plt.subplots(figsize=figsize)
        # Cells set to True in the mask are hidden by seaborn.
        mask = np.zeros_like(corr, dtype=bool)

        if triangle:
            mask[np.triu_indices_from(mask)] = True

        sns.heatmap(
            corr,
            mask=mask,
            vmin=-1.0,
            vmax=1.0,
            cmap=sns.diverging_palette(230, 10, as_cmap=True),
            square=True,
            ax=ax,
        )

        ax.set_ylabel("")
        ax.set_xlabel("")
        ax.set_title(title)
示例#5
0
def createEarningsTables(columnName, columnNameMethod):
  """Print per-song earnings tables for ``dfgSource`` and plot the shifted, normalised series.

  Reads the module-level ``dfgSource`` frame and ``nrOfSongs``. As a side
  effect, ``dfgSource['distribution_year']`` is converted to int in place.

  Parameters:
    columnName: name of the ``dfgSource`` column whose values become the
      columns of the earnings table (the summed ``payable_amount`` per
      track and per value of this column).
    columnNameMethod: callable that receives the positional index of a
      column in the shifted table and returns the label to use for it.

  Returns:
    None. Output is printed tables plus one interactive scatter plot.
  """
  # Summed payable amount per track (rows) and per columnName value (columns).
  dfg = dfgSource.groupby(['track_title_id', columnName])['payable_amount'].sum().unstack(columnName)
  dfg = dfg.rename(columns=lambda columnNameYearQuarter: toDate(columnNameYearQuarter))

  dfgMaxAmount = dfg.max(axis=1)  # largest amount found in each row
  dfgGeneralInfo = dfgMaxAmount.to_frame()
  dfgGeneralInfo.columns = ['Highest earnings in a year']

  # min() below needs integer years.
  dfgSource['distribution_year'] = dfgSource['distribution_year'].astype(int)

  # One groupby per track, reused for every per-song attribute below.
  perSong = dfgSource.groupby(['track_title_id'])
  dfgGeneralInfo['start year'] = perSong['distribution_year'].min()

  # first() takes the first value within each track's group.
  dfgGeneralInfo['song_release_year'] = perSong['song_release_year'].first()
  # Where song_release_year is the empty string, fall back to the start year.
  missingRelease = dfgGeneralInfo['song_release_year'] == ''
  dfgGeneralInfo.loc[missingRelease, 'song_release_year'] = dfgGeneralInfo['start year']
  dfgGeneralInfo['song_release_year'] = dfgGeneralInfo['song_release_year'].astype(int)

  dfgGeneralInfo['music_style'] = perSong['music_style'].first()
  dfgGeneralInfo['sheet'] = perSong['sheet'].first()

  print("The " + str(nrOfSongs) + " highest annual earning songs:\n", dfgGeneralInfo.to_string())
  #downloadcsv(dfgGeneralInfo, "dfgGeneralInfo_" + columnName + ".csv")

  print("\n\nPayable amount per " + columnName + ":\n", dfg.to_string())
  #dfg.transpose().iplot(kind="scatter", xTitle = columnName, yTitle= 'Payable amount', layout=cf.Layout(height=1000, width=1800))

  # Ratio table: every value in a row divided by that row's maximum.
  dfg = dfg.div(dfgMaxAmount, axis=0)
  dfgShifted = dfg.copy()
  # song_release_year is already int at this point, so no further cast is needed.
  dfgShifted['song_release_year'] = pd.to_datetime(dfgGeneralInfo['song_release_year'], format="%Y")
  print(dfgShifted.to_string())
  earliestSongReleaseYear = dfgShifted['song_release_year'].min()
  # Shift each row's values left/right via the helper; rows are passed as raw arrays.
  dfgShifted = dfgShifted.apply(
      lambda row: shiftRowBasedOnFirstIndexLargerThenZeroAndSongReleaseYear(row, earliestSongReleaseYear),
      axis=1, result_type='expand', raw=True)

  # Relabel each column from its positional index using columnNameMethod.
  dfgShifted = dfgShifted.rename(columns=lambda x: columnNameMethod(dfgShifted.columns.get_loc(x)))

  # Work on a copy so the summary rows are computed from the data rows only.
  dfgShiftedWithExtra = dfgShifted.copy()
  dfgShiftedWithExtra.loc['count'] = dfgShifted.count()
  dfgShiftedWithExtra.loc['mean'] = dfgShifted.mean()
  dfgShiftedWithExtra.loc['standar dev.'] = dfgShifted.std()
  dfgShiftedWithExtra.loc['min'] = dfgShifted.min()
  dfgShiftedWithExtra.loc['max'] = dfgShifted.max()

  setFloatFormat('{:.3f}')  # three decimals for the shifted table
  try:
    print(dfgShiftedWithExtra.to_string())
    #downloadcsv(dfgShiftedWithExtra, "dfgShiftedWithExtra_" + columnName + ".csv")
  finally:
    # Restore the normal float format even if printing raised.
    setNormalFloatFormat()

  # NOTE(review): xTitle repeats the y-axis label; after transpose() the x axis
  # carries the relabelled columns, so this looks like a copy/paste slip --
  # confirm the intended x-axis title before changing the string.
  dfgShifted.transpose().iplot(kind="scatter", xTitle='Payable amount', yTitle='Payable amount',
                               layout=cf.Layout(height=1000, width=1800))
示例#6
0
increase_frameheight()
enable_plotly_in_cell()

dfg = df
# The 150 tracks with the largest single-year summed payable_amount.
top_songs = (
    dfg.groupby(['track_title_id', 'distribution_year'])['payable_amount']
       .sum()
       .unstack('distribution_year')
       .max(axis=1)
       .sort_values(ascending=False)
       .head(150)
       .index
       .tolist()
)
companies = ['SPOTIFY', 'APPLE MUSIC']
print('Checking the nr of plays done with: ', companies)

# Keep only rows from the listed music users, then only the top tracks, and
# pivot to summed plays per track (rows) and distribution period (columns).
dfg = dfg[dfg['music_user'].isin(companies)]
dfg = (
    dfg[dfg['track_title_id'].isin(top_songs)]
       .groupby(['track_title_id', 'distribution_period'])['number_of_plays']
       .sum()
       .unstack('distribution_period')
)
dfg = dfg.rename(columns=toDate)

# Highest per-period play count of each track, largest first.
dfgMaxPlaysPerSong = dfg.max(axis=1).sort_values(ascending=False).to_frame()
dfgMaxPlaysPerSong.columns = ['Maximum plays in a quarter']
print(dfgMaxPlaysPerSong.to_string())
print("\n\nNr of plays per quarter:\n", dfg.to_string())
dfg.transpose().iplot(
    kind="scatter",
    xTitle='Distribution period',
    yTitle='Number of plays',
    layout=cf.Layout(height=1000, width=1800),
)
# Normalise every row by that track's maximum.
dfg = dfg.div(dfgMaxPlaysPerSong['Maximum plays in a quarter'], axis=0)

# Shift each row with the helper, then relabel the columns from their
# 1-based position via toYearMonth.
dfgShifted = dfg.copy()
dfgShifted = dfgShifted.apply(
    shiftRowBasedOnFirstIndexLargerThenZero,
    axis=1,
    result_type='expand',
    raw=True,
)
dfgShifted = dfgShifted.rename(
    columns=lambda x: toYearMonth(dfgShifted.columns.get_loc(x) + 1))
print('Number of plays historically\n', dfgShifted.to_string())
dfgShifted.transpose().iplot(
    kind="scatter",
    xTitle='Period since receiving revenue',
    yTitle='Number of plays',
    layout=cf.Layout(height=1000, width=1800),
)

increase_frameheight()
enable_plotly_in_cell()

# Songs whose streaming revenue share is at least 50% of (total revenue minus international sources)
# Songs whose streaming revenue share is at least 75% of (total revenue minus international sources)
# Songs whose streaming revenue share is at least 90% of (total revenue minus international sources)
# (looking for the sweet spot between still having enough songs and seeing a real correlation)
unitedStatesOnlyCriterion = df['region'].eq('domestic')