Example 1
    def load_dataset(self, base_path):
        r"""Read data matrix and labels vector from files.

        Parameters
        ----------
        base_path : string
            The base path relative to which files are stored.

        Returns
        -------
        data : ndarray
            The :math:`n \times p` data matrix.

        labels : ndarray
            The :math:`n`-dimensional vector containing labels.

        feature_names : list
            The list containing the names of the features.

        """
        data_path = os.path.join(base_path, self.get_file('data'))
        labels_path = os.path.join(base_path, self.get_file('labels'))

        # DATA
        poslab = self._dataset_options.pop('positive_label', None)
        samples_on = self._dataset_options.pop('samples_on', 'col')
        pd_data = pd.read_excel(data_path, **self._dataset_options)

        if samples_on == 'col':
            pd_data = pd_data.transpose()

        # Retrieve feature names from the column names of the DataFrame
        feature_names = pd_data.columns
        if feature_names.shape[0] != np.unique(feature_names).shape[0]:
            import sys
            sys.stderr.write("Feature names specified are not unique. "
                             "Assigning a unique label.\n")
            # Append the column index to each name; building the array in one
            # step avoids the truncation that in-place concatenation on a
            # fixed-width numpy string array would cause.
            feature_names_u = np.array(['{}_{}'.format(name, it)
                                        for it, name in
                                        enumerate(feature_names)])
            np.savetxt("id_correspondence.csv",
                       np.stack((np.array(feature_names, dtype=str),
                                 feature_names_u), axis=-1),
                       delimiter=",",
                       fmt='%s')

        # if not self.get_option('data_preprocessing') is None:
        # ### TODO Check!!!
        #     # if rank == 0:
        #         # print("Preprocessing data...")
        #
        #     self.get_option('data_preprocessing').load_data(pd_data)
        #     pd_data = self.get_option('data_preprocessing').process()

        ##################
        # LABELS
        ##################
        # Before loading labels, remove parameters that were likely specified
        # for data only.
        self._dataset_options.pop('usecols', None)
        pd_labels = pd.read_excel(labels_path, **self._dataset_options)
        if poslab is None:
            uv = np.sort(np.unique(pd_labels.to_numpy()))
            if len(uv) != 2:
                raise ValueError("More than two unique values in the labels "
                                 "array.")
            poslab = uv[0]

        def _to_plus_minus(x):
            """Convert labels to -1 / +1."""
            return +1. if x == poslab else -1.

        # Convert labels to -1/+1
        pd_labels_mapped = pd_labels.applymap(_to_plus_minus)

        data = pd_data.to_numpy()
        labels = pd_labels_mapped.to_numpy().ravel()
        if data.shape[0] != labels.shape[0]:
            raise ValueError("The number of samples in data does not "
                             "correspond to the number of samples in labels.")
        return data, labels, feature_names
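
The class that owns this method is not shown in the example. A minimal driver sketch, assuming a hypothetical DatasetLoader whose _dataset_options dict is forwarded to pandas.read_excel (after 'positive_label' and 'samples_on' are popped) and whose get_file() maps the keys 'data' and 'labels' to file names:

# Hypothetical driver code; DatasetLoader and its constructor are assumptions,
# only load_dataset() itself comes from the example above.
loader = DatasetLoader(
    files={'data': 'data.xlsx', 'labels': 'labels.xlsx'},
    dataset_options={'index_col': 0,        # forwarded to pandas.read_excel
                     'positive_label': 1,   # popped before the data are read
                     'samples_on': 'col'})  # samples are stored as columns
data, labels, feature_names = loader.load_dataset('/path/to/dataset')
print(data.shape, labels.shape, len(feature_names))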
Example 2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# idx1 = np.repeat(time, N, axis=1)

plt.scatter(x=idxMat, y=dfPoissTrial.values)

#%%

dfPoissTrial = pd.DataFrame(data=poissTrial)
sns.regplot(x=dfPoissTrial.index, y=dfPoissTrial.values, data=dfPoissTrial)

#%%

from scipy.stats import nbinom
import matplotlib.pyplot as plt

result_NB_20k = pd.read_excel('results/NB_20kpool.xlsx')

result_NB_20k['r'], result_NB_20k['p'] = convert_params(
    result_NB_20k['alpha'], result_NB_20k['mu'])

#%%
from scipy.stats import nbinom
import matplotlib.pyplot as plt

x = np.arange(0, 17, 1)

out_r, out_p = convert_params(out_mu, out_alpha)

for i in np.arange(0, 24, 6):
    r, p = out_r[i], out_p[i]
    plt.plot(x, nbinom.pmf(x, r, p), label=i)

plt.legend()
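
convert_params is not defined anywhere in this snippet, and the two calls above pass its arguments in opposite orders (alpha, mu versus mu, alpha), so one of them must be adjusted to whatever signature is actually in use. A minimal sketch under the usual assumption of a (mu, alpha) mean/dispersion (NB2) parameterization with Var(X) = mu + alpha * mu**2, converted to the (n, p) convention of scipy.stats.nbinom:

def convert_params(mu, alpha):
    """Sketch of the undefined helper: map the (mean, dispersion) form of a
    negative binomial to the (n, p) parameters used by scipy.stats.nbinom."""
    r = 1.0 / alpha   # number of successes
    p = r / (r + mu)  # success probability; yields mean mu, var mu + alpha*mu**2
    return r, p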
Example 4
import os

import pandas as pd
import seaborn as sns

reviews = pd.read_excel("sanmigue.xls", index_col=0)

# filedirectory is assumed to be defined elsewhere in the script
filelist = os.listdir(filedirectory)

pathlist = []
appendeddf = []

print(filelist)

# creates a list of paths to the .xls files in the filelist
for files in filelist:
    if files.endswith(".xls"):
        pathlist.append(os.path.join(filedirectory, files))

print(type(pathlist))
print(pathlist)

# reads each .xls file on the pathlist into a pandas DataFrame and collects them
for paths in pathlist:
    xlscontents = pd.read_excel(paths)
    appendeddf.append(xlscontents)

# concatenates the dataframes into one single dataframe
outputdf = pd.concat(appendeddf)

outputdf.to_csv(os.path.join(filedirectory, 'output.csv'))
#for paths in pathlist:

#def csv_reader(file_obj)
#    with open(file_obj, 'r') as userfile:
#        userFileReader = csv.reader(userfile)
#        for row in userFileReader:
#            print row
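
The same sweep can be written more compactly with glob, which avoids the manual endswith filtering. A short alternative sketch, under the same assumption that filedirectory points at the folder of .xls files:

import glob
import os
import pandas as pd

# Collect every .xls file under filedirectory and concatenate them in one pass.
frames = [pd.read_excel(path)
          for path in sorted(glob.glob(os.path.join(filedirectory, "*.xls")))]
outputdf = pd.concat(frames, ignore_index=True)
outputdf.to_csv(os.path.join(filedirectory, "output.csv"), index=False)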
Example 6
#import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
#import numpy as np

test = pd.read_excel("~/Documents/Gujarati data sets/Excel/Table6_1.xls")
test.head()
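
seaborn is imported in this last example but never used. A small follow-up sketch of the kind of quick look one might take next; the numeric-column selection is an assumption, since the sheet's headers are not shown here:

print(test.columns)                         # inspect the actual header names first
sns.pairplot(test.select_dtypes("number"))  # pairwise plots of the numeric columns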