Exemplo n.º 1
0
    def calculate_TSNE(self,
                       indecies=None,
                       sample=5000,
                       n_jobs=1,
                       multicore=False,
                       **kwargs):
        """
        Compute a TSNE embedding of ``self.fractions`` and keep it on ``self.tsne``.

        The rows of ``self.fractions`` are optionally restricted to
        ``indecies``, optionally downsampled with ``.sample``, embedded,
        and then joined to ``self.clusters`` on ``db_id``. A string copy of
        ``cluster_id`` is added as ``cluster_name``.

        The coordinates are labelled ``x`` and ``y``, so the embedding is
        expected to be two-dimensional.

        Args:
            indecies (list): row labels to keep (passed to ``.loc``); None keeps every row
            sample (int): upper bound on the number of rows embedded, capped at the rows available; None disables downsampling
            n_jobs (int): number of cpus handed to MulticoreTSNE (only used if multicore is True)
            multicore (bool): use the MulticoreTSNE package instead of the module-level ``TSNE``
            **kwargs: forwarded unchanged to the TSNE constructor

        Raises:
            ValueError: if ``self.mode`` is not one of 'w', 'r+' or 'a'
        """
        if self.mode not in ('w', 'r+', 'a'):
            raise ValueError('cant write for readonly')

        subset = self.fractions.copy()
        if indecies is not None:
            subset = subset.loc[indecies]
        if sample is not None:
            # Never ask for more rows than the (possibly filtered) table has.
            subset = subset.sample(n=min(sample, subset.shape[0]))

        if self.verbose:
            cell_count = str(subset.shape[0])
            sys.stderr.write("executing TSNE decomposition on " +
                             cell_count + " cells\n")

        # Pick the implementation first, then run a single fit_transform.
        if multicore:
            # Imported lazily so the optional package is only needed on demand.
            from MulticoreTSNE import MulticoreTSNE as mTSNE
            if self.verbose:
                sys.stderr.write("Using MulticoreTSNE\n")
            reducer = mTSNE(n_jobs=n_jobs, **kwargs)
        else:
            if self.verbose:
                sys.stderr.write("Using sklearn TSNE\n")
            reducer = TSNE(**kwargs)
        coordinates = reducer.fit_transform(subset)

        embedding = pd.DataFrame(coordinates, columns=['x', 'y'])
        # Re-attach the row labels of the embedded data so they can be joined on.
        embedding.index = subset.index

        # NOTE(review): the join presumes reset_index() exposes a 'db_id'
        # column on both frames -- confirm against how they are built.
        flat_embedding = embedding.reset_index()
        wanted_columns = ['cluster_id', 'k', 'db_id'] + self.groupby
        cluster_info = self.clusters.reset_index()[wanted_columns]
        annotated = flat_embedding.merge(cluster_info, on=['db_id'])
        annotated['cluster_name'] = annotated['cluster_id'].astype(str)
        self.tsne = annotated
         "Off-Target", "Blocked", "Corners", "Offsides", "Free Kicks", \
         "Saves", "Pass Accuracy %", "Passes", "Distance Covered (Kms)", \
         "Fouls Committed", "Yellow Card", "Yellow & Red", "Red"] # , "1st Goal"

# Columns of the statistics table that are fed into the embedding.
names = [
    "Goal Scored",
    "On-Target",
    "Off-Target",
    "Ball Possession %",
    "Fouls Committed",
]

data_file_name = "../data/FIFA_2018_Statistics.csv"
df = pd.read_csv(data_file_name)[names]

# Mean-centre every column and scale it by its own range (max - min).
df_norm = (df - df.mean()) / (df.max() - df.min())
print(df_norm)

# Plain-array copy of the normalised table; only the PCA variant noted
# below consumes it.
input_data_mat = np.array(df_norm)

# Project the normalised rows down to two dimensions.
# Alternative that was tried: PCA(n_components=2) fitted on and applied to
# input_data_mat.
df_embedded = TSNE(n_components=2).fit_transform(df_norm)
print(df_embedded)

# Wrap the result in a DataFrame, expose the row number as an "index"
# column and give the two coordinate columns readable names.
df_embedded = (
    pd.DataFrame(df_embedded)
    .reset_index()
    .rename(columns={0: "x", 1: "y"})
)
print(df_embedded)

# The row number already lives in the "index" column, so the DataFrame
# index itself is not written out.
df_embedded.to_csv("../data/tsne-results.csv", index=False)