예제 #1
0
파일: pair_plot.py 프로젝트: adacher/DSLR
def display(data, color, names):
    """Show a scatter-plot matrix (SPLOM) built from ten named features.

    Args:
        data: Object indexed by feature name; ``data[name]`` supplies the
            values of one dimension.
        color: Value forwarded unchanged to the marker ``color`` property.
        names: Indexable collection of feature names. Entries 0 through 9
            are used both as axis labels and as keys into ``data``.

    Any exception raised while the figure is being built is printed as
    ``"Error: <message>"`` and the process exits with status 1. On success
    the figure is displayed with ``fig.show()``; nothing is returned.
    """
    layout_options = dict(
        title='Features scatter plot matrix',
        template='seaborn',
        title_font_size=20,
        height=1500,
    )
    try:
        # Exactly ten dimensions are plotted; a shorter ``names`` raises
        # IndexError here, which the handler below reports.
        dimensions = [
            dict(label=names[position], values=data[names[position]])
            for position in range(10)
        ]
        splom = go.Splom(
            dimensions=dimensions,
            marker=dict(color=color, size=5),
        )
        fig = go.Figure(data=splom)
        fig.update_layout(**layout_options)
    except Exception as err:
        print("Error: " + str(err))
        sys.exit(1)
    fig.show()
예제 #2
0
def scatter_matrix():
    """Build a scatter-plot matrix of per-state COVID metrics.

    Reads the ``covid-us-state``, ``state-population``, ``state-area`` and
    ``mask-use-by-county`` frames from the module-level ``df_dict``,
    inner-joins them on ``state`` and derives four columns:

    * ``CFR`` = deaths / cases
    * ``IR``  = cases / total
    * ``PD``  = total / area
    * ``WMP`` = per-state mean of the county-level ``wear_mask_prob`` score

    Returns:
        A plotly ``go.Figure`` holding a single ``go.Splom`` trace (lower
        half only) over the four metrics, with the state as hover text.

    Note:
        The ``fips`` / ``countyfp`` re-formatting and the added mask-use
        columns are assigned on the frames taken from ``df_dict``, so those
        shared frames are modified by every call.
    """
    df = df_dict['covid-us-state']
    # Zero-pad FIPS codes to two characters (assigned on the shared frame).
    df.fips = df.fips.apply(lambda x: str(x).zfill(2))
    # Keep only the rows of the most recent date, then drop the date column.
    df = df[df.date == max(df.date)]
    df = df.drop(columns='date', axis=1).reset_index(drop=True)
    state_pop = df_dict['state-population']
    state_area =  df_dict['state-area']

    mask_use = df_dict['mask-use-by-county']
    # County FIPS codes are zero-padded to five characters.
    mask_use.countyfp = mask_use.countyfp.apply(lambda x: str(x).zfill(5))
    # Weighted score: rarely=0.25, sometimes=0.5, frequently=0.75, always=1.0;
    # no other column contributes to the sum.
    mask_use['wear_mask_prob'] = 0.25 * mask_use['rarely'] + 0.5 * mask_use['sometimes'] + \
                    0.75 * mask_use['frequently'] + 1.0 * mask_use['always']
    # fip_to_state / fip_to_county are helpers defined outside this block;
    # presumably they map a county FIPS code to a state code / county name.
    mask_use['state_code'] = mask_use.apply(lambda x: fip_to_state(x.countyfp), axis=1)
    mask_use['county'] = mask_use.apply(lambda x: fip_to_county(x.countyfp), axis=1)
    # NOTE(review): agg(['mean']) is applied to every remaining column,
    # including the string columns added above; recent pandas versions may
    # reject non-numeric columns here -- confirm against the pinned version.
    df_agg = mask_use.groupby('state_code').agg(['mean'])
    # Flatten the (column, 'mean') MultiIndex into '<column>_mean' names.
    df_agg.columns = ["_".join(x) for x in np.ravel(df_agg.columns)]
    df_agg.reset_index(inplace=True)
    df_agg.rename(columns={'wear_mask_prob_mean' : 'wear_mask_prob'}, inplace=True)
    df_agg = df_agg[['state_code', 'wear_mask_prob']]
    # Rows with state code 'N/A' or 'DC' are excluded before the name lookup.
    df_agg.drop(df_agg[df_agg['state_code'] == 'N/A'].index, inplace = True)
    df_agg.drop(df_agg[df_agg['state_code'] == 'DC'].index, inplace = True)
    # state_map_dict is defined outside this block; it is indexed by state code.
    df_agg['state'] = df_agg['state_code'].apply(lambda x: state_map_dict[x])
    df_agg = df_agg[['state', 'wear_mask_prob']]
    # Inner-join all four frames on 'state', left to right.
    data_frames = [df, state_pop, state_area, df_agg]
    df_merged = reduce(lambda left, right: pd.merge(left,right,on=['state'],
                                                how='inner'), data_frames)

    # Derived metrics; 'total' and 'area' come from the merged frames
    # (presumably state population and state area -- verify column sources).
    df_merged['CFR'] = df_merged['deaths'] / df_merged['cases']
    df_merged['IR'] = df_merged['cases'] / df_merged['total']
    df_merged['PD'] = df_merged['total'] / df_merged['area']
    df_merged['WMP'] = df_merged['wear_mask_prob']
    df_ana = df_merged.loc[:, ['state', 'CFR', 'IR', 'PD', 'WMP']]
    # Round the four metrics to three decimals for display.
    df_ana[['CFR', 'IR', 'PD', 'WMP']] = np.round(df_ana[['CFR', 'IR', 'PD', 'WMP']], 3)

    fig = go.Figure(data=go.Splom(
                dimensions=[dict(label='CFR', # 'Fatality rate',
                                 values=df_ana['CFR']),
                            dict(label='IR', #'Infection rate',
                                 values=df_ana['IR']),
                            dict(label='PD', #'Population density',
                                 values=df_ana['PD']),
                            dict(label='WMP', #'Wear mask prob.',
                                 values=df_ana['WMP'])],
                text=df_ana['state'],
#                 hovertemplate="%{x}, %{y}",
                marker=dict(showscale=False, # colors encode categorical variables
                            line_color='white', line_width=0.5),
                showupperhalf=False,
                ))

    fig.update_layout(
    title='Scatter Matrix',
    dragmode='select',
    width=600,
    height=600,
    hovermode='closest',
    )
    return fig
def scatter_matrix(df):
    """Plot a scatter-plot matrix of the numeric churn columns.

    The frame is sorted by ``Churn``; each distinct ``Churn`` value gets an
    integer code (in sorted order) that drives the marker color, and the
    ``Churn`` value itself is used as hover text.

    Args:
        df: DataFrame with the columns ``Churn``, ``tenure``,
            ``MonthlyCharges`` and ``TotalCharges``.

    Returns:
        None. The figure is displayed with ``py.iplot``.
    """
    df = df.sort_values(by="Churn", ascending=True)

    # Take labels positionally from the sorted column. Looking rows up with
    # ``df.loc[k]`` would follow the pre-sort index labels, which pairs hover
    # text with the wrong points and fails on a non-range index.
    churn_labels = df["Churn"].tolist()

    # One integer code per distinct class, however many classes there are.
    classes = df["Churn"].unique().tolist()
    class_code = {label: code for code, label in enumerate(classes)}
    color_vals = [class_code[label] for label in churn_labels]

    trace = go.Splom(
        dimensions=[
            dict(label="tenure", values=df["tenure"]),
            dict(label="MonthlyCharges", values=df["MonthlyCharges"]),
            dict(label="TotalCharges", values=df["TotalCharges"]),
        ],
        text=churn_labels,
        marker=dict(
            color=color_vals,
            colorscale="Viridis",
            size=3,
            showscale=False,
            line=dict(width=0.1, color="rgb(230,230,230)"),
        ),
    )

    # Shared axis styling; each axis receives its own copy of the dict.
    axis = dict(showline=True, zeroline=False, gridcolor="#fff", ticklen=4)

    layout = go.Layout(
        dict(
            title="Scatter plot matrix for Numerical columns for customer attrition",
            autosize=False,
            height=800,
            width=800,
            dragmode="select",
            hovermode="closest",
            plot_bgcolor="rgba(240,240,240, 0.95)",
            xaxis1=dict(axis),
            yaxis1=dict(axis),
            xaxis2=dict(axis),
            yaxis2=dict(axis),
            xaxis3=dict(axis),
            yaxis3=dict(axis),
        )
    )
    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig)
예제 #4
0
def mp(selected_patient, selector, values):
    """Build a figure with one scatter-plot-matrix trace per result entry.

    ``multiplot_soz(selected_patient, selector, values)`` is expected to
    yield entries that can be iterated for their keys and indexed by them.
    In each entry the value stored under key ``0`` becomes the trace name;
    every other key becomes one Splom dimension.

    Returns:
        A ``go.Figure`` containing the traces, with the legend enabled.
    """

    def _entry_to_trace(entry):
        # Key 0 carries the trace name and is therefore not a dimension.
        dimensions = []
        for key in entry:
            if key != 0:
                dimensions.append(dict(label=key, values=entry[key]))
        return go.Splom(
            dimensions=dimensions,
            name=entry[0],
            marker=dict(size=4),
            diagonal=dict(visible=False),
        )

    entries = multiplot_soz(selected_patient, selector, values)
    traces = [_entry_to_trace(entry) for entry in entries]

    figure_layout = go.Layout(
        title="Multiplot prove",
        dragmode='select',
        hovermode='closest',
        showlegend=True,
    )
    return go.Figure(data=traces, layout=figure_layout)
예제 #5
0
def pairwise_plot(df: pd.DataFrame, cols: list = None):
    """Return a pairwise scatter-plot matrix for the selected features.

    Args:
        df: Source DataFrame.
        cols: Column names to plot. When ``None`` every column of ``df``
            is plotted.

    Returns:
        A ``go.Figure`` with one ``go.Splom`` trace and a centered
        'Pairwise Plot' title.

    Raises:
        KeyError: If a name in ``cols`` is not a column of ``df``.
    """
    if cols is None:
        # No selection given: fall back to all columns instead of failing
        # while iterating over ``None``.
        cols = list(df.columns)
    else:
        cols = list(cols)
        # Select with the flat list; wrapping it in another list would be
        # treated as a single (invalid) key.
        df = df[cols]

    dimensions = [{'label': name, 'values': df[name]} for name in cols]
    layout = go.Layout(
        title={
            'text': 'Pairwise Plot',
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        })
    return go.Figure(data=go.Splom(dimensions=dimensions), layout=layout)
예제 #6
0
def UnderSample(df, _class, method = 'cc', strategy = 'auto', n_jobs = 1, ratio = None, transform = None, offline = None):
    """
       Under-sample ``df`` with the chosen sampler and plot the result.

       Supported ``method`` values (case-insensitive):

       NearMiss ('nearmiss') - Select values which are closest to the minority class.
       TomekLinks ('tomelinks' or 'tomeklinks') - Uses connected sets between class
           borders which are closest. If there are no other points closer, it assumes
           they are noise or borderline and removes them.
       NCL ('ncl') - NeighbourhoodCleaningRule - Uses ENN to remove majority samples.
           Finds nearest neighbors and if all are correctly labelled it keeps them.
       CC ('cc') - Cluster Centroids - Finds clusters of majority samples with K-means,
           then keeps the centroids of the clusters as the new majority sample.

       Parameters:
           df        - DataFrame holding the features and the class column.
           _class    - Name of the class (target) column in ``df``.
           method    - Sampler selector, see above.
           strategy  - Forwarded to the sampler as ``sampling_strategy``.
           n_jobs    - Forwarded to the sampler as ``n_jobs``.
           ratio     - Legacy argument; forwarded as ``ratio`` only when not None.
           transform - When truthy the resampled DataFrame is returned.
           offline   - Currently unused.

       Returns the resampled DataFrame (feature columns followed by the class
       column) when ``transform`` is truthy, otherwise None. The scatter-plot
       matrix of the resampled data is shown in both cases.

       Raises Exception when ``method`` is not one of the supported values.
    """
    #https://towardsdatascience.com/sampling-techniques-for-extremely-imbalanced-data-part-i-under-sampling-a8dbc3d8d6d8

    Y = df[_class]
    X = df.drop(_class, axis = 1)

    # Pick the sampler class lazily so only the selected name must be importable.
    selected = method.lower()
    if selected == 'nearmiss':
        sampler_cls = NearMiss
    elif selected in ('tomelinks', 'tomeklinks'):
        sampler_cls = TomekLinks
    elif selected == 'ncl':
        sampler_cls = NeighbourhoodCleaningRule
    elif selected == 'cc':
        sampler_cls = ClusterCentroids
    else:
        raise Exception("{} is not a valid method for UnderSampling".format(method))

    # The samplers take ``sampling_strategy``; the former ``stratey`` keyword
    # was a typo that made every call fail with TypeError.
    # NOTE(review): which samplers accept ``n_jobs`` / ``ratio`` depends on
    # the installed imbalanced-learn release -- confirm against the pinned
    # version. ``ratio`` is only forwarded when the caller actually set it.
    sampler_kwargs = {'sampling_strategy': strategy, 'n_jobs': n_jobs}
    if ratio is not None:
        sampler_kwargs['ratio'] = ratio

    x, y = sampler_cls(**sampler_kwargs).fit_resample(X, Y)

    # Rebuild a frame with the feature columns followed by the class column.
    df = pd.DataFrame(x, columns = X.columns)
    df[_class] = list(y)

    fig = go.Figure()

    fig.add_trace(

        go.Splom(
            dimensions = [
                dict(label = column, values = df[column]) for column in df.columns
            ],
            marker = dict(
                color = df[_class]
            )
        )
    )

    fig.show()

    if transform:
        return df

    return
예제 #7
0
def scatter_go(df,dimensions=['SEXO','EDAD'],for_text='MUNICIPIO'):
    """Return a scatter-plot matrix of ``dimensions`` colored by ``for_text``.

    Args:
        df: DataFrame containing every column named in ``dimensions`` and
            the ``for_text`` column.
        dimensions: Column names, one Splom dimension each.
        for_text: Column used as hover text; its category codes drive the
            marker color.

    Returns:
        A 600x600 ``go.Figure`` holding a single ``go.Splom`` trace.
    """
    hover_column = df[for_text]
    # Integer code per distinct value, so colors encode the category.
    category_codes = hover_column.astype('category').cat.codes

    splom_dimensions = []
    for column_name in dimensions:
        splom_dimensions.append(dict(label=column_name,
                                     values=df[column_name]))

    marker_style = dict(color=category_codes,
                        showscale=False,
                        line_color='white',
                        line_width=0.5)
    splom = go.Splom(dimensions=splom_dimensions,
                     text=hover_column,
                     marker=marker_style)

    fig = go.Figure(splom)
    fig.update_layout(title='Iris Data set',
                      dragmode='select',
                      width=600,
                      height=600,
                      hovermode='closest')
    return fig
def scatter_matrix_plotly(data, columns):
    """
    Build a dark-themed upper-half scatter-plot matrix colored by ``epoch``.

    ``data`` is converted with ``pandas.DataFrame(data)`` and must provide
    an ``epoch`` field plus every field named in ``columns``. The ``epoch``
    values are used as hover text and their category codes as marker color.

    Examples
    --------

    >>> columns = ['a', 'b', 'c']
    >>> data = [
    ...     dict(a=1, b=2, c=3, epoch=1),
    ...        dict(a=2, b=1, c=1, epoch=2),
    ...        dict(a=3, b=3, c=2, epoch=3),
    ... ]
    >>> chart = scatter_matrix_plotly(data, columns)
    """

    # Imported locally so the module does not require plotly/pandas at import time.
    import plotly.graph_objects as go
    import pandas as pd

    frame = pd.DataFrame(data)
    epoch_column = frame['epoch']
    epoch_codes = epoch_column.astype('category').cat.codes

    splom_dimensions = []
    for name in columns:
        splom_dimensions.append(dict(label=name, values=frame[name]))

    marker_style = dict(color=epoch_codes,
                        showscale=False,
                        line_color='white',
                        line_width=0.5)

    splom = go.Splom(showlowerhalf=False,
                     diagonal_visible=False,
                     text=epoch_column,
                     dimensions=splom_dimensions,
                     marker=marker_style)
    fig = go.Figure(data=splom)

    fig.update_layout(template='plotly_dark')
    fig.update_layout(showlegend=True, width=600, height=600)

    return fig
예제 #9
0
def cluster(df: pd.DataFrame, k_min=2, k_max=10, multivariate=True) -> dict:
    """
    Run k-means for every cluster count in ``[k_min, k_max]``.

    Parameters:
        df: input data, one row per sample.
        k_min: minimum cluster number (inclusive).
        k_max: maximum cluster number (inclusive).
        multivariate: when True, cluster on all columns at once. The
            pairwise variant is not implemented; ``None`` is returned then.

    Returns:
        A dict keyed by cluster count ``k``. Each value is a dict with
        'centroids', 'labels', 'inertia', 'df' (the input plus a 'cluster'
        column) and 'figure' (a lower-half ``go.Splom`` figure colored by
        cluster). The dict is empty when ``k_min > k_max``.
    """
    if not multivariate:
        # Pairwise clustering is still to be developed.
        return None

    results = {}

    for k in range(k_min, k_max + 1):

        # NOTE(review): algorithm='full' is the historical scikit-learn name
        # of the classic algorithm; newer releases call it 'lloyd' -- confirm
        # against the pinned scikit-learn version.
        cl = KMeans(n_clusters=k,
                    random_state=44,
                    algorithm='full',
                    n_init=5,
                    init='k-means++').fit(df)

        # Attach the labels on the input's own index. A Series built on a
        # fresh 0..n-1 index is aligned by label in concat, which scatters
        # NaNs and extra rows when ``df`` does not use the default index.
        labels = pd.Series(cl.labels_, index=df.index, name='cluster')
        out_df = pd.concat([df, labels], axis=1)

        feature_columns = [col for col in out_df.columns if col != 'cluster']
        figure = go.Figure(go.Splom(
            dimensions=[{'label': col, 'values': out_df[col]}
                        for col in feature_columns],
            showupperhalf=False,
            marker={'color': out_df['cluster'],
                    'showscale': False,
                    'colorscale': 'inferno'},
            opacity=.8,
            diagonal_visible=False,
        ))

        results[k] = {'centroids': cl.cluster_centers_,
                      'labels': cl.labels_,
                      'inertia': cl.inertia_,
                      'df': out_df,
                      'figure': figure}

    return results
예제 #10
0
def chart_pairs(df, title="Time Series Pairs Plot", **kwargs):
    """
    Pairwise scatter matrix plot for timeseries

    Parameters
    ----------
    df : DataFrame
        pandas DataFrame with a datetime index and columns representing the futures contract, ordered by most recent expiry
    title : str, optional
        Chart title, by default "Time Series Pairs Plot"
    **kwargs
        keyword arguments to pass to plotly.graph_objects.Figure.update_layout function

    Returns
    -------
    plotly.graph_objects.Figure
        Lower-half scatter matrix over the index column and every data
        column, with markers colored by the integer value of the index.
    """
    # reset_index() returns a new frame with the former index as its first
    # column, so that column is found by position. This also works when the
    # index has no name.
    df = df.reset_index()
    dt_col = df.columns[0]

    # One dimension per column, including the former index column.
    dims = [dict(label=col, values=df[col]) for col in df.columns]

    fig = go.Figure()
    fig.add_trace(
        go.Splom(
            dimensions=dims,
            showupperhalf=False,
            # Explicit 64-bit width: plain ``int`` maps to a platform-dependent size.
            marker=dict(color=df[dt_col].astype("int64"), colorscale="Portland"),
            diagonal_visible=False,
        ))
    fig.update_layout(width=1000, height=1000, title=title)
    if kwargs:
        fig.update_layout(**kwargs)

    return fig
예제 #11
0
def display_selected_data(selectedAreaMap, selectedAreaDropdown, selectedAttr):
    """Build the scatter-plot matrix for the current map selection.

    Args:
        selectedAreaMap: Selection payload with a ``"points"`` list, or
            ``None``. The third ``<br>``-separated part of each point's
            ``"text"`` is matched against the ``geoid`` column of the
            module-level ``df`` to filter the rows.
        selectedAreaDropdown: Not used by this function.
        selectedAttr: Attribute (column) names; the first three become the
            matrix dimensions.

    Returns:
        A ``go.Figure`` with one ``go.Splom`` trace colored by ``boro_name``.
        When every attribute pair has at least two rows, each off-diagonal
        cell is annotated with the pair's Pearson coefficient and p-value.
    """
    font_ann = dict(size=10, color=colors['text'])

    df_selected = df
    if selectedAreaMap is not None:
        area_names = [
            str(point["text"].split("<br>")[2])
            for point in selectedAreaMap["points"]
        ]
        df_selected = df_selected[df_selected['geoid'].isin(area_names)]

    index_vals = df_selected['boro_name'].astype('category').cat.codes

    # Pearson coefficient and p-value for each pair of attributes; a single
    # pair with fewer than two rows disables all annotations.
    coef_list = []
    flag = True
    for first_attr, second_attr in combinations(selectedAttr, 2):
        if len(df_selected[first_attr]) < 2 or len(df_selected[second_attr]) < 2:
            flag = False
            continue
        coef_list.append(
            pearsonr(df_selected[first_attr], df_selected[second_attr]))

    # (xref, yref, index into coef_list): each pair is annotated in both of
    # its mirrored cells.
    annotation_slots = (
        ("x2", "y1", 0),
        ("x1", "y2", 0),
        ("x3", "y1", 1),
        ("x1", "y3", 1),
        ("x3", "y2", 2),
        ("x2", "y3", 2),
    )
    ann = []
    if flag:
        for x_axis_ref, y_axis_ref, pair_index in annotation_slots:
            result = coef_list[pair_index]
            label = ("PCC: " + str(round(result[0], 2)) + "<br>p: " +
                     ('{:0.1e}'.format(result[1])))
            ann.append(dict(
                x=1,
                y=1,
                xref=x_axis_ref,
                yref=y_axis_ref,
                font=font_ann,
                text=label,
                showarrow=False,
            ))

    axisd = dict(showline=True,
                 zeroline=False,
                 gridcolor='#cecece',
                 showticklabels=True)
    # xaxis1..xaxis4 and yaxis1..yaxis4, each with its own copy of the style.
    axis_settings = {}
    for axis_letter in ('x', 'y'):
        for axis_number in range(1, 5):
            axis_settings['{}axis{}'.format(axis_letter, axis_number)] = dict(axisd)

    # here we build a scatter matrix, and add annotations for each subgraph
    layout = go.Layout(dragmode='select',
                       margin=dict(l=0, r=0, b=0, t=0, pad=0),
                       autosize=False,
                       hovermode='closest',
                       font=dict(color=colors['text2'], size=12),
                       plot_bgcolor=colors['background'],
                       paper_bgcolor=colors['background'],
                       annotations=ann,
                       **axis_settings)

    splom_dimensions = [
        dict(label=selectedAttr[position],
             values=df_selected[selectedAttr[position]])
        for position in range(3)
    ]
    splom = go.Splom(
        dimensions=splom_dimensions,
        text=df_selected['boro_name'] + ', ' + df_selected['ntaname'],
        hoverinfo="x+y+text",
        marker=dict(
            color=index_vals,
            showscale=False,  # colors encode categorical variables
            line_color='black',
            line_width=0.4),
        diagonal=dict(visible=True))

    return go.Figure(data=splom, layout=layout)
def create_sol_multiview():
    """Cluster the loaded audio features with k-means and show them as a SPLOM.

    Audio features come from ``analyze.loadAudioFeatures()`` (indexed and
    iterated as a sequence of per-song mappings). Eight features are
    extracted, standardized with one global mean and standard deviation,
    clustered into seven groups, and the raw feature values are plotted as
    a scatter-plot matrix colored by cluster label.

    Returns:
        The ``go.Figure`` that was also displayed with ``fig.show()``.

    Raises:
        KeyError: If a song lacks one of the eight features.
    """
    dataOrig = analyze.loadAudioFeatures()
    # NOTE(review): the library is loaded but not used in this function; the
    # call is kept in case it has side effects -- confirm and drop if not.
    analyze.loadLibraryFromFiles()

    keys = ['danceability', 'energy', 'key', 'loudness', 'valence', 'speechiness', 'tempo', 'time_signature']

    # One row per feature, in the order of ``keys``. Building rows in the
    # key order of the first song instead would let the axis labels below
    # drift away from the data they describe.
    dataArray = np.array([[song[key] for song in dataOrig] for key in keys])

    # NOTE(review): a single global mean/std is applied to all features, so
    # features are not put on a common scale -- per-feature scaling may have
    # been intended; confirm before changing clustering results.
    allsongsstandardized = (dataArray - np.mean(dataArray)) / np.std(dataArray)

    # One row per song. ``.T[::-1]`` yields exactly the matrix the earlier
    # ``np.flip(np.rot90(a, 3))`` produced: the transpose with the song
    # order reversed.
    X_train_norm = allsongsstandardized.T[::-1]
    dataToDisplay = dataArray.T[::-1]

    kmeans = KMeans(n_clusters=7)
    kmeans.fit(X_train_norm)

    cs2 = kmeans.labels_.astype(float)

    fig = go.Figure(data=go.Splom(
        dimensions=[dict(label=key, values=dataToDisplay[:, column])
                    for column, key in enumerate(keys)],
        marker=dict(color=cs2,
                    showscale=False,  # colors encode categorical variables
                    line_color='white', line_width=0.5)
    ))
    fig.show()

    return fig
def create_figure_backup():
    """Cluster the loaded audio features, plot them, and group songs by cluster.

    Audio features come from ``analyze.loadAudioFeatures()`` and the track
    library from ``analyze.loadLibraryFromFiles()``. Eight features are
    extracted, standardized with one global mean and standard deviation and
    clustered into seven groups with k-means. The raw values are shown as a
    scatter-plot matrix colored by cluster label, then one line trace per
    cluster center is added and the figure is shown a second time.

    Returns:
        The ``go.Figure`` (Splom trace plus the added line traces).

    Raises:
        KeyError: If a song lacks one of the eight features.
    """
    dataOrig = analyze.loadAudioFeatures()
    fullLib = analyze.loadLibraryFromFiles()

    keys = ['danceability', 'energy', 'key', 'loudness', 'valence', 'speechiness', 'tempo', 'time_signature']

    # One row per feature, in the order of ``keys``. Building rows in the
    # key order of the first song instead would let the axis labels below
    # drift away from the data they describe.
    dataArray = np.array([[song[key] for song in dataOrig] for key in keys])

    # NOTE(review): a single global mean/std is applied to all features, so
    # features are not put on a common scale -- per-feature scaling may have
    # been intended; confirm before changing clustering results.
    allsongsstandardized = (dataArray - np.mean(dataArray)) / np.std(dataArray)

    # One row per song. ``.T[::-1]`` yields exactly the matrix the earlier
    # ``np.flip(np.rot90(a, 3))`` produced: the transpose with the song
    # order reversed.
    X_train_norm = allsongsstandardized.T[::-1]
    dataToDisplay = dataArray.T[::-1]

    kmeans = KMeans(n_clusters=7)
    kmeans.fit(X_train_norm)

    cs2 = kmeans.labels_.astype(float)

    fig = go.Figure(data=go.Splom(
        dimensions=[dict(label=key, values=dataToDisplay[:, column])
                    for column, key in enumerate(keys)],
        marker=dict(color=cs2,
                    showscale=False,  # colors encode categorical variables
                    line_color='white', line_width=0.5)
    ))
    fig.show()

    # NOTE(review): x (one value per song) and y (an arange between the
    # column's min and max) generally differ in length; this looks like a
    # debugging leftover -- confirm whether these traces are still wanted.
    n_features = X_train_norm.shape[1]
    for i in range(len(kmeans.cluster_centers_)):
        j = i % n_features
        column = X_train_norm[:, j]
        fig.add_trace(go.Scatter(x=column, y=np.arange(min(column), max(column)),
                                 mode='lines'))

    fig.show()

    # Index the library once by track id (first occurrence wins) instead of
    # scanning it for every song.
    tracks_by_id = {}
    for item in fullLib['tracks']:
        tracks_by_id.setdefault(item['track']['id'], item)

    # Rows of X_train_norm are in reversed song order, so label i belongs to
    # the i-th song counted from the end of dataOrig.
    # NOTE(review): clusteredSongs is built but neither returned nor stored.
    clusteredSongs = [[] for _ in range(kmeans.n_clusters)]
    for label, features in zip(cs2, reversed(dataOrig)):
        track = tracks_by_id.get(features['id'])
        if track is not None:
            clusteredSongs[int(label)].append({**track, **features})

    return fig
예제 #14
0
               mode="lines+markers+text"))
fig.update_layout(title="Линейная зависимость")
fig.write_html("E:\\7 семестр\\ПЭОЭД\\Лаб7\\line.html", auto_open=True)
#Гистограма
trace = go.Bar(x=signs_df["skill1"],
               y=signs_df["value"],
               text=signs_df["skill2"],
               textposition='auto')
fig = go.Figure(data=trace, layout=go.Layout(barmode='stack'))
fig.update_layout(title="Гистограма")
fig.write_html("E:\\7 семестр\\ПЭОЭД\\Лаб7\\gist.html", auto_open=True)
#Матрица рассеевания
fig = go.Figure(data=go.Splom(
    dimensions=[
        dict(label='skill1', values=signs_df['skill1']),
        dict(label='skill2', values=signs_df['skill2']),
        dict(label='value', values=signs_df['value'])
    ],
    text=signs_df['skill2'],
))
fig.update_layout(title="Матрица рассеяния")
fig.write_html("E:\\7 семестр\\ПЭОЭД\\Лаб7\\matrix.html", auto_open=True)
# Word cloud
# Each key_skills entry is a ';'-separated string; every skill is emitted
# followed by a single space. The text is assembled with one join instead of
# repeated `+=` on a growing string, which avoids quadratic copying.
text = "".join(
    skill + ' '
    for skills in df["key_skills"].values
    for skill in skills.split(';')
)
# `text` is already a str, so it is passed to generate() directly.
wordcloud = WordCloud(width=3000, height=2000,
                      background_color='black').generate(text)
wordcloud.to_file("E:\\7 семестр\\ПЭОЭД\\Лаб7\\cloud.png")
# Outlier spread: print how many items my.FindVibros returns for "min_salary".
outliers = my.FindVibros(df, "min_salary")
print(len(outliers))
예제 #15
0
        x, y = SVMSMOTE(stratey = strategy, k_neighbors = knn, n_jobs = n_jobs, ratio = ratio, out_step = stepsize).fit_resample(X, Y)
    elif method.lower() == 'kmeans':
        x, y = KmeansSMOTE(stratey = strategy,  k_neighbors = knn, n_jobs = n_jobs, ratio = ratio, ).fit_resample(X, Y)
    else:
        raise Exception("{} is not a valid method for OverSampling".format(method))

    df = pd.DataFrame([x, y], columns = list(df.columns) + [_class])

    fig = go.Figure()

    fig.add_trace(
    
        go.Splom(
            dimensions = [
                dict(label = column, values = df[column]) for column in df.columns
            ], 
            marker = dict(
                color = df[_class]
            )
        )
    )

    fig.show()
    
    if transform:
        return df

    return


def UnderSample(df, _class, method = 'cc', strategy = 'auto', n_jobs = 1, ratio = None, transform = None, offline = None):
    """
예제 #16
0
def display_selected_data(selectedArea, choiceNB):
    """Build a scatter-plot matrix of tree/land-price/property densities.

    Args:
        selectedArea: ``None`` to use every row, or a mapping with a
            ``"points"`` list. Each point's ``"text"`` is split on ``"<br"``
            and the leading part is used as the area name to keep.
        choiceNB: ``'boroughs'`` selects ``df_trees_properties_boro`` keyed on
            ``'borough'``; any other value selects ``df_trees_properties``
            keyed on ``'ntaname'``.

    Returns:
        A ``go.Figure`` holding one ``go.Splom`` trace. When every column pair
        has at least two rows, each off-diagonal panel is annotated with the
        pair's Pearson coefficient and p-value; otherwise no annotations are
        added.
    """
    if choiceNB == 'boroughs':
        frame = df_trees_properties_boro
        title_part = ' boroughs'  # assigned but not read in this function
        key = 'borough'
    else:
        title_part = ' neighborhoods'  # assigned but not read in this function
        frame = df_trees_properties
        key = 'ntaname'

    annotation_font = dict(size=10, color=colors['text'])

    # Restrict the frame to the areas named in the selection, if there is one.
    if selectedArea is not None:
        chosen_names = [
            str(point["text"].split("<br")[0])
            for point in selectedArea["points"]
        ]
        frame = frame[frame[key].isin(chosen_names)]

    # Integer code per borough, used below as the marker colour.
    borough_codes = frame['borough'].astype('category').cat.codes

    # Pearson coefficient and p-value for each pair of attributes.
    column_pairs = (
        ('trees/sq.mile', 'avg.landprice_thous$/acre'),
        ('trees/sq.mile', 'properties/sq.mile'),
        ('avg.landprice_thous$/acre', 'properties/sq.mile'),
    )
    correlations = []
    all_pairs_computed = True
    for first, second in column_pairs:
        if len(frame[first]) < 2 or len(frame[second]) < 2:
            all_pairs_computed = False
            continue
        correlations.append(pearsonr(frame[first], frame[second]))

    # Each pair is annotated twice, once in each of its two mirrored panels:
    # (index into correlations, x, y, xref, yref)
    placements = (
        (0, 5000, 6000, "x2", "y1"),
        (0, 6000, 5000, "x1", "y2"),
        (1, 14000, 6000, "x3", "y1"),
        (1, 6000, 14000, "x1", "y3"),
        (2, 14000, 6000, "x3", "y2"),
        (2, 6000, 14000, "x2", "y3"),
    )
    ann = []
    if all_pairs_computed:
        for pair_index, x_pos, y_pos, x_ref, y_ref in placements:
            result = correlations[pair_index]
            label = ("PCC: " + str(round(result[0], 2)) + "<br>p: " +
                     '{:0.1e}'.format(result[1]))
            ann.append(dict(
                x=x_pos,
                y=y_pos,
                xref=x_ref,
                yref=y_ref,
                font=annotation_font,
                text=label,
                showarrow=False,
            ))

    axis_style = dict(showline=True,
                      zeroline=False,
                      gridcolor='#104752',
                      showticklabels=True)
    axis_names = ('xaxis1', 'xaxis2', 'xaxis3', 'xaxis4',
                  'yaxis1', 'yaxis2', 'yaxis3', 'yaxis4')
    # Every axis gets its own copy of the shared style.
    axes = {axis_name: dict(axis_style) for axis_name in axis_names}

    # Scatter-matrix layout, carrying the per-panel annotations.
    layout = go.Layout(
        dragmode='select',
        margin=dict(l=0, r=0, b=0, t=0, pad=0),
        autosize=False,
        hovermode='closest',
        font=dict(color=colors['text'], size=12),
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        annotations=ann,
        **axes
    )

    dimensions = [
        dict(label='trees/sq.mile',
             values=frame['trees/sq.mile']),
        dict(label='avg.landprice($K/A)',
             values=frame['avg.landprice_thous$/acre']),
        dict(label='properties/sq.mile',
             values=frame['properties/sq.mile']),
    ]

    # Neighborhood rows are labelled "<ntaname>: <borough>"; borough rows use
    # the borough name alone.
    if key == 'ntaname':
        hover_text = frame[key] + ': ' + frame['borough']
    else:
        hover_text = frame[key]

    splom = go.Splom(
        dimensions=dimensions,
        text=hover_text,
        hoverinfo="x+y+text",
        marker=dict(color=borough_codes,
                    showscale=False,  # colors encode categorical variables
                    line_color='white', line_width=0.4),
        diagonal=dict(visible=True),
    )

    return go.Figure(data=splom, layout=layout)