def load_dataset(self, base_path):
    r"""Read the data matrix and the labels vector from Excel files.

    The file names are obtained from ``self.get_file('data')`` and
    ``self.get_file('labels')``; the reader options come from
    ``self._dataset_options``. Three of those options are consumed here
    and are not forwarded to pandas as-is:

    * ``positive_label``: value mapped to ``+1`` (every other value is
      mapped to ``-1``). When absent, the labels must contain exactly
      two unique values and the smallest one is used.
    * ``samples_on``: ``'col'`` (default) means samples are stored on
      columns, so the data table is transposed after reading.
    * ``usecols``: forwarded when reading the data file only.

    If the feature names are not unique, a warning is written to stderr
    and a two-column file ``id_correspondence.csv`` (original name,
    name with positional suffix) is written to the current directory.

    Parameters
    ----------
    base_path : string
        The base path relative to which files are stored.

    Returns
    -------
    data : ndarray
        The :math:`n \times p` data matrix.

    labels : ndarray
        The :math:`n`-dimensional vector containing labels (-1 / +1).

    feature_names : pandas.Index
        The names of the features, taken from the columns of the data
        table (after the optional transposition).

    Raises
    ------
    Exception
        If no positive label is given and the labels do not contain
        exactly two unique values.
    ValueError
        If data and labels do not have the same number of samples.
    """
    data_path = os.path.join(base_path, self.get_file('data'))
    labels_path = os.path.join(base_path, self.get_file('labels'))

    # Work on a copy: popping from the dict stored on the instance would
    # make a second call silently lose 'positive_label', 'samples_on'
    # and 'usecols'.
    options = dict(self._dataset_options)

    ##################
    # DATA
    ##################
    poslab = options.pop('positive_label', None)
    samples_on = options.pop('samples_on', 'col')

    pd_data = pd.read_excel(data_path, **options)

    if samples_on == 'col':
        pd_data = pd_data.transpose()

    # Retrieve feature names from the column names of the DataFrame
    feature_names = pd_data.columns
    if feature_names.shape[0] != np.unique(feature_names).shape[0]:
        import sys
        sys.stderr.write("Feature names specified are not unique. "
                         "Assigning a unique label.\n")
        # Build the suffixed names as Python strings first: appending to
        # the items of a fixed-width NumPy string array truncates the
        # suffix for the longest names.
        feature_names_u = np.array(
            ['{}_{}'.format(name, it)
             for it, name in enumerate(feature_names)])
        np.savetxt("id_correspondence.csv",
                   np.stack((np.array(feature_names),
                             feature_names_u), axis=-1),
                   delimiter=",", fmt='%s')
        # NOTE(review): the suffixed names are only written to the
        # correspondence file; the original (non-unique) names are still
        # the ones returned. Confirm this is the intended contract.

    # if not self.get_option('data_preprocessing') is None:
    #     ### TODO Check!!!
    #     # if rank == 0:
    #     #     print("Preprocessing data...")
    #
    #     self.get_option('data_preprocessing').load_data(pd_data)
    #     pd_data = self.get_option('data_preprocessing').process()

    ##################
    # LABELS
    ##################
    # Before loading labels, remove parameters that were likely specified
    # for data only.
    options.pop('usecols', None)

    # pandas has no `read_xls`; `read_excel` is the Excel reader (and is
    # what the data file above is loaded with).
    pd_labels = pd.read_excel(labels_path, **options)

    if poslab is None:
        # np.unique already returns its result sorted.
        uv = np.unique(pd_labels.values)

        if len(uv) != 2:
            raise Exception("Expected exactly two unique values in the "
                            "labels array, found {}.".format(len(uv)))

        poslab = uv[0]

    # `.values` replaces `as_matrix()`, which was removed from pandas.
    data = pd_data.values

    # Convert labels to -1/+1
    labels = np.where((pd_labels == poslab).values, 1., -1.).ravel()

    if data.shape[0] != labels.shape[0]:
        raise ValueError("The number of samples in data do not correspond "
                         "to the number of samples in labels.")

    return data, labels, feature_names
#idx1 = np.repeat(time, N, axis=1)
plt.scatter(x=idxMat, y=dfPoissTrial.values)

#%%
# Wrap the trial data in a DataFrame and draw a regression plot of its
# values against its index.
dfPoissTrial = pd.DataFrame(data=poissTrial)
sns.regplot(x=dfPoissTrial.index, y=dfPoissTrial.values, data=dfPoissTrial)

#%%
from scipy.stats import nbinom
import matplotlib.pyplot as plt

# pandas has no `read_xls`; Excel workbooks are loaded with `read_excel`.
result_NB_20k = pd.read_excel('results/NB_20kpool.xlsx')
# NOTE(review): this call passes (alpha, mu) while the cell below passes
# (out_mu, out_alpha) -- one of the two argument orders is probably
# swapped; confirm against the definition of convert_params.
result_NB_20k['r'], result_NB_20k['p'] = convert_params(
    result_NB_20k['alpha'], result_NB_20k['mu'])

#%%
from scipy.stats import nbinom
import matplotlib.pyplot as plt

# Plot the negative-binomial pmf on 0..16 for every sixth parameter pair.
x = np.arange(0, 17, 1)
out_r, out_p = convert_params(out_mu, out_alpha)
for i in np.arange(0, 24, 6):
    r = out_r[i]
    p = out_p[i]
    plt.plot(x, nbinom.pmf(x, r, p), label=i)
import pandas as pd

# Load the workbook, using its first column as the row index.
# pandas has no `read_xls`; `read_excel` is the Excel reader.
reviews = pd.read_excel("sanmigue.xls", index_col=0)

import seaborn as sns
filelist = os.listdir(filedirectory)
pathlist = []
appendeddf = []
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(filelist)

# creates a list of paths to the .xls files in the filelist
for files in filelist:
    if files.endswith(".xls"):
        pathlist.append(os.path.join(filedirectory, files))
print(type(pathlist))
print(pathlist)

# reads each .xls file on the pathlist into a pandas dataframe and
# collects the dataframes
for paths in pathlist:
    # pandas has no `read_xls`; `read_excel` is the Excel reader.
    xlscontents = pd.read_excel(paths)
    # Append the frame that was just read (the original appended an
    # undefined name, `csvcontents`).
    appendeddf.append(xlscontents)

# concatenates the dataframes into one single dataframe
outputdf = pd.concat(appendeddf)
# os.path.join avoids the hand-written '\o' separator, which is an invalid
# escape sequence and only forms a valid path on Windows.
outputdf.to_csv(os.path.join(filedirectory, 'output.csv'))

#for paths in pathlist:
#def csv_reader(file_obj)
#    with open(file_obj, 'r') as userfile:
#        userFileReader = csv.reader(userfile)
#        for row in userFileReader:
#            print row
#import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
#import numpy as np

# Load the workbook and preview its first rows.
# pandas has no `read_xls`; `read_excel` is the Excel reader.
test = pd.read_excel("~/Documents/Gujarati data sets/Excel/Table6_1.xls")
test.head()