def __init__(self, datafile, transform=None, target_transform=None,
             loader=None):
    '''
    Initializes a KaggleMNIST instance.
    @param datafile input datafile.
    @param transform image transforms.
    @param target_transform labels transform.
    '''
    # Load data
    data = pd.read_csv(datafile, index_col=False)

    # Remove labels from data
    if 'label' in list(data.columns.values):
        targets = data['label'].values.tolist()
        data.drop('label', axis=1, inplace=True)
    else:
        targets = range(data.shape[0])

    # Convert the remaining data to float values
    data = data.values.astype(float)

    # Save arguments
    self.datafile = datafile
    self.transform = transform
    self.target_transform = target_transform
    self.loader = loader

    # Main data
    self.classes = targets
    self.imgs = data
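# A minimal usage sketch, assuming KaggleMNIST subclasses
# torch.utils.data.Dataset and implements __len__/__getitem__ over
# self.imgs/self.classes; the file name "train.csv" is hypothetical.
from torch.utils.data import DataLoader

dataset = KaggleMNIST("train.csv")
loader = DataLoader(dataset, batch_size=64, shuffle=True)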
def get_features_from_file(self, features_file: str) -> (np.ndarray, list):
    """
    Load features from a file and return the feature values as a NumPy array
    along with the names of the valid files.
    """
    label_data = pd.read_csv(filepath_or_buffer=self._label_file, sep=",")
    iter_csv = pd.read_csv(filepath_or_buffer=features_file, sep=",",
                           iterator=True, chunksize=10000)
    feature_data = pd.concat([chunk for chunk in iter_csv])

    ids = [f.split(".")[0] for f in self._all_files]
    label_data = label_data[label_data['track_id'].isin(ids)]
    data = pd.merge(label_data, feature_data, on='track_id')

    if self._split != "test":
        data = data.drop('genre_id', axis=1)

    valid_files = [f"{v:06}" + ".mp3" for v in data['track_id'].values]
    self._pd_data = data.drop('track_id', axis=1)
    self._pd_labels = label_data.drop('track_id', axis=1)
    data = data.drop('track_id', axis=1)
    print(f"Shape of dataset: {data.shape}")
    return data.values, valid_files
def read_data(self, file_path): data = pd.read_csv(file_path, header=None, sep=",", names=["labels", "title", "description"], index_col=False) data["text"] = data["title"] + ". " + data["description"] data["labels"] = data["labels"] - 1 data.drop(columns=["title", "description"], inplace=True) data.dropna(inplace=True) return data
def __init__(self, archiveRoot, datasetName, datasetType='TRAIN', noise=0,
             transform=None):
    self.samples = []
    self.labels = []
    dataset = [i for i in os.listdir(archiveRoot) if i == datasetName]
    if dataset:
        print('dataset is found')
        if noise == 0:
            data = pd.read_csv(archiveRoot + '/' + dataset[0] + '/'
                               + dataset[0] + '_' + datasetType + '.tsv',
                               sep='\t', header=None)
        else:
            data = pd.read_csv(archiveRoot + '/' + dataset[0] + '/'
                               + dataset[0] + '_' + str(noise) + '_'
                               + datasetType + '.tsv',
                               sep='\t', header=None)
        # Column 0 holds 1-based class labels; shift them to start at 0
        self.labels = torch.Tensor(data.values[:, 0] - 1).long()
        self.targets = self.labels
        self.samples = data.drop(columns=[0]).to_numpy()
        self.data = self.samples
        # Z-normalize each series, guarding against zero standard deviation
        std_ = self.samples.std(axis=1, keepdims=True)
        std_[std_ == 0] = 1.0
        self.samples = (self.samples
                        - self.samples.mean(axis=1, keepdims=True)) / std_
    else:
        raise FileNotFoundError
def read_data(self, file_path): data = pd.read_csv(file_path, header=None, sep=",", names=[ "labels", "question_title", "question_content", "best_answer" ], index_col=False) data.dropna(inplace=True) data["text"] = data["question_title"] + data[ "question_content"] + data["best_answer"] data["labels"] = data["labels"] - 1 data.drop( columns=["question_title", "question_content", "best_answer"], inplace=True) return data
def _get_data_loader(batch_size, data_dir, data_file):
    print("Get data loader for file {}.".format(data_file))
    data = pd.read_csv(os.path.join(data_dir, data_file), header=None,
                       names=None)
    # The first column holds the labels; the remaining columns are features
    y = torch.from_numpy(data[[0]].values).float().squeeze()
    x = torch.from_numpy(data.drop([0], axis=1).values).float()
    ds = torch.utils.data.TensorDataset(x, y)
    return torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=True)
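# A minimal usage sketch; the directory and file name are assumptions. The
# CSV is expected to be headerless with the label in column 0.
train_loader = _get_data_loader(batch_size=64, data_dir="data",
                                data_file="train.csv")
x_batch, y_batch = next(iter(train_loader))
print(x_batch.shape, y_batch.shape)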
def _preprocess_data(self, data, metadata):
    # DataFrame.apply returns a new frame, so assign the result back
    # (the original call discarded it and the imputation never took effect)
    if data.isna().sum().sum():
        data = data.apply(self.sample_column, axis=0, result_type='broadcast')

    # TODO: Find a more sophisticated way to deal with string/int/categorical columns
    cat_cols = []
    targets = None
    for col in metadata['dataResources'][0]['columns']:
        col_name = col['colName']
        if col_name == 'd3mIndex':
            data = data.drop(col_name, axis=1)
            continue
        n_unique_vals = len(data[col_name].unique())
        if n_unique_vals == 1:
            data = data.drop(col_name, axis=1)
        elif col['role'][0] == 'suggestedTarget':
            targets = data[col_name]
            data = data.drop(col_name, axis=1)
        elif col['colType'] in ['categorical', 'string']:
            if n_unique_vals > 25:
                data = data.drop(col_name, axis=1)
            else:
                cat_cols.append(col_name)

    # One-hot encode the categorical features and normalize the real features
    data = pd.get_dummies(data, columns=cat_cols)
    features = data.to_numpy()
    features = self._normalize_data(features)

    # Ordinal-encode and normalize the targets
    assert targets is not None
    targets, _ = pd.factorize(targets)
    targets = np.expand_dims(targets, axis=-1)
    targets = self._normalize_data(targets)

    # Recombine the targets and features
    data = np.concatenate((features, targets), axis=-1)

    # Convert the dataset to a tensor and permute the dimensions
    # such that the columns come first
    data = torch.from_numpy(data).to(torch.float)
    # TODO: add more info to the 3rd dim (i.e. cat vs num)
    data = data.unsqueeze(dim=-1).permute(1, 0, 2)  # [seq_len, batch_size, dim]
    return data
def __init__(self, data):
    self.len = len(data)
    self.data = data
    data['label'] = data['label'].astype('int')
    label = data.loc[:, 'label']
    feature = data.drop('label', axis=1)
    feature = feature.astype('float')
    self.label = torch.LongTensor(np.array(label))
    self.feature = torch.Tensor(np.array(feature))
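# A minimal sketch of the full Dataset this __init__ belongs to, assuming a
# torch.utils.data.Dataset subclass; the class name and method bodies below
# are assumptions, not the original author's code.
import numpy as np
import pandas as pd
import torch

class TabularDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.len = len(data)
        self.label = torch.LongTensor(data['label'].astype('int').to_numpy())
        self.feature = torch.Tensor(
            data.drop('label', axis=1).astype('float').to_numpy())

    def __getitem__(self, index):
        return self.feature[index], self.label[index]

    def __len__(self):
        return self.len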
def screen(self, data, label, method, screen_num=10):
    data.drop(['Type'], axis=1, inplace=True)
    if isinstance(method, str):
        if method == 'rf_importance':
            fea_name = fea_sel(data, label)[:screen_num]
            data = data.loc[:, fea_name]
            # logging.debug('Fea_name {} by rf_importance: {}'.format(screen_num, fea_name))
    elif isinstance(method, list):
        # method should be a list of feature names.
        fea_name = method[:screen_num]
        data = data.loc[:, fea_name]
        # logging.debug('Fea_name by list: {}'.format(fea_name))
    else:
        raise Exception('Invalid screener configuration: method must be a '
                        'string or a list of feature names.')
    data['Type'] = label
    return data
def __call__(self, data):
    # data is a pandas DataFrame
    data = data.drop(['Unnamed: 32', 'id'], axis=1)
    # Move the diagnosis column to the end
    cols = data.columns.tolist()
    cols = cols[1:] + [cols[0]]
    data = data[cols]
    # data['diagnosis'].loc[data['diagnosis'] == 'M'] = 0
    # data['diagnosis'].loc[data['diagnosis'] == 'B'] = 1
    # Map the diagnosis labels to integers: M (malignant) -> 0, B (benign) -> 1
    diag = {"M": 0, "B": 1}
    data["diagnosis"].replace(diag, inplace=True)
    return data
def oneHot(self, data):
    rows = data.shape[0]
    results = np.zeros((rows, 4))
    for idx, i in enumerate(data.season.values):
        results[idx, i] = 1
    data.insert(loc=2, column='winter', value=results[:, 0])
    data.insert(loc=3, column='spring', value=results[:, 1])
    data.insert(loc=4, column='summer', value=results[:, 2])
    data.insert(loc=5, column='fall', value=results[:, 3])
    data = data.drop(['season'], axis=1)
    return data
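# A minimal usage sketch; `FeatureTransform` stands in (as an assumption) for
# whatever class owns oneHot, and the tiny frame is made up. 'season' must
# hold integer codes 0-3 (winter..fall) for the indexing above to work.
import numpy as np
import pandas as pd

df = pd.DataFrame({'temp': [0.3, 0.5], 'hum': [0.7, 0.2], 'season': [0, 3]})
df = FeatureTransform().oneHot(df)
print(df.columns.tolist())  # ['temp', 'hum', 'winter', 'spring', 'summer', 'fall']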
def __init__(self): data = pd.read_csv("creditcard.csv") data['normAmount'] = StandardScaler().fit_transform( data['Amount'].values.reshape(-1, 1)) data = data.drop(['Time', 'Amount'], axis=1) self.len = data.shape[0] self.X = np.array(data.loc[:, data.columns != 'Class']) self.y = np.array(data.loc[:, data.columns == 'Class']) self.X = Variable(torch.FloatTensor(self.X)) self.y = Variable(torch.FloatTensor(self.y))
def inverse_tr_external(self, data):
    """ Recover the original data """
    excluded = pd.DataFrame()
    # if self.prep_included is not None and len(self.prep_included) > 0:
    #     excluded = self.df[self.prep_included]               # Included
    #     self.df = self.df.drop(self.prep_included, axis=1)   # Excluded
    #     excluded, self.df = self.df, excluded                # Inverting both
    # else:
    if self.prep_excluded is not None:
        excluded = data[self.prep_excluded]
        data.drop(self.prep_excluded, axis=1, inplace=True)
    data = pd.DataFrame(self.scaler.inverse_transform(data.values),
                        columns=data.columns)
    self.__from_dummies__()  # Scaler contains all columns
    self.__squash_in_range__()
    self.__round_integers__()
    data = pd.concat([data, excluded], axis=1)[self.cols_order]
    return data
def __call__(self, data):
    # data is a pandas DataFrame
    data = data.replace('Iris-setosa', 0)
    data = data.replace('Iris-virginica', 1)
    data = data.replace('Iris-versicolor', 2)
    # Keep only the two-class subset (drop versicolor)
    data = data[data.iloc[:, -1] != 2]
    data = data.drop(columns="Id")
    # as_matrix() was removed from pandas; to_numpy() is the replacement
    data = pd.DataFrame(data.to_numpy(), columns=[
        'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
        'Species'
    ])
    return data
def load_dataset(path, sep, header):
    data = pd.concat([pd.read_csv(f, sep=sep, header=header)
                      for f in glob.glob('{}/*.csv'.format(path))],
                     ignore_index=True)
    labels = data.iloc[:, 0]
    features = data.drop(data.columns[0], axis=1)
    if header is None:
        # Adjust the column names after dropping the 0th column above.
        # New column names are 0 (inclusive) to len(features.columns) (exclusive).
        new_column_names = list(range(0, len(features.columns)))
        features.columns = new_column_names
    return features, labels
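# A minimal usage sketch; the "data" directory is an assumption. Every CSV in
# the directory is expected to carry the label in its first column.
features, labels = load_dataset("data", sep=",", header=None)
print(features.shape, labels.shape)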
def _get_data_loader(batch_size, data_dir, filename):
    """Instantiate a PyTorch data loader."""
    data = pd.read_csv(os.path.join(data_dir, filename), header=None,
                       names=None)
    # Load labels from the first column
    labels = torch.from_numpy(data[[0]].values).float().squeeze()
    X = torch.from_numpy(data.drop([0], axis=1).values).float()
    tensor_ds = torch.utils.data.TensorDataset(X, labels)
    return torch.utils.data.DataLoader(tensor_ds, batch_size=batch_size)
def __init__(self, archiveRoot, datasetName, labelIndex, idIndex, iteration=0,
             datasetType="TRAIN", transform=None):
    self.samples = []
    self.labels = []
    self.ids = []
    print(os.listdir(archiveRoot))
    iterationName = datasetName
    if iteration != 0:
        iterationName = 'iter-%s' % (str(iteration))
    dataset = [i for i in os.listdir(archiveRoot) if i == iterationName]
    print(dataset)
    if dataset:
        print('dataset is found')
        if iteration == 0:
            data = pd.read_csv(archiveRoot + '/' + dataset[0], header=None,
                               index_col=None)
        else:
            data = pd.read_csv(archiveRoot + '/' + dataset[0] + '/'
                               + datasetName + '_' + datasetType + '.csv',
                               header=None, index_col=None)
        print(data.shape)
        print(data.head(5))
        self.labels = torch.Tensor(data.values[:, labelIndex]).long()
        self.ids = torch.Tensor(data.values[:, idIndex]).long()
        self.targets = self.labels
        self.samples = data.drop(columns=[labelIndex, idIndex]).to_numpy()
        print(self.samples.shape)
        self.data = self.samples
        # Z-normalize each series, guarding against zero standard deviation
        std_ = self.samples.std(axis=1, keepdims=True)
        std_[std_ == 0] = 1.0
        self.samples = (self.samples
                        - self.samples.mean(axis=1, keepdims=True)) / std_
    else:
        raise FileNotFoundError
def train_cross_validation_split(data_path):
    dir_path = os.path.dirname(os.path.abspath(data_path))
    fold_dirs = glob.glob(os.path.join(dir_path, 'folds_*'))
    if len(fold_dirs) == 5:
        for fold_dir in fold_dirs:
            train_path = os.path.join(fold_dir, 'train.csv')
            val_path = os.path.join(fold_dir, 'val.csv')
            yield pd.read_csv(train_path), pd.read_csv(val_path)
    else:
        kfold = KFold(n_splits=5, shuffle=True, random_state=42)
        data = read_data(data_path)
        for i, (train_ids, val_ids) in enumerate(
                kfold.split(X=data.drop('active', axis=1).values,
                            y=data['active'].values)):
            train_data = data.iloc[train_ids, :]
            val_data = data.iloc[val_ids, :]
            # os.makedirs(os.path.join(dir_path, 'folds_{}'.format(i)), exist_ok=True)
            # train_data.to_json(os.path.join(os.path.join(dir_path, 'folds_{}'.format(i)), 'train.json'))
            # val_data.to_json(os.path.join(os.path.join(dir_path, 'folds_{}'.format(i)), 'val.json'))
            yield train_data, val_data
def limit_categorical_field(data, field_name, min_occur, raw_data):
    value_counts = raw_data[field_name].value_counts()
    categories = [k for k, v in value_counts.items() if v > min_occur]
    categories_dict = {k: i + 1 for i, k in enumerate(categories)}
    # Convert missing values to a separate category - 0
    categories_dict[np.nan] = 0
    # Convert all the infrequent values to a separate category
    unknown_val = len(categories_dict)

    def gen_category_id(cat):
        if cat in categories_dict:
            return categories_dict[cat]
        else:
            return unknown_val

    data[field_name + '_id'] = data[field_name].apply(gen_category_id)
    return data.drop(field_name, axis=1), raw_data.drop(field_name, axis=1)
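# A minimal usage sketch on a made-up frame. Categories seen more than
# min_occur times keep distinct ids, missing values map to 0, and the
# remaining rare values share a single "unknown" id.
import numpy as np
import pandas as pd

raw = pd.DataFrame({'color': ['red', 'red', 'blue', np.nan, 'green']})
df, raw = limit_categorical_field(raw.copy(), 'color', min_occur=1,
                                  raw_data=raw)
print(df['color_id'].tolist())  # e.g. [1, 1, 2, 0, 2]: 'blue'/'green' are rare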
def preprocess_category_name(data, raw_data):
    for i in range(4):
        def get_part(x):
            # Handle missing values (np.nan)
            if type(x) != str:
                return np.nan
            parts = x.split('/')
            if i >= len(parts):
                return np.nan
            else:
                return parts[i]

        field_name = 'category_part_' + str(i)
        data[field_name] = data['category_name'].apply(get_part)
        raw_data[field_name] = raw_data['category_name'].apply(get_part)
        data, raw_data = limit_categorical_field(data, field_name,
                                                 MIN_CATEGORY_NAMES, raw_data)
    return data.drop('category_name', axis=1)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, help="input dataset")
    parser.add_argument("directory", type=str,
                        help="directory to store data files")
    parser.add_argument("-i", "--iterations", type=int,
                        help="iterations to do", default=1000)
    parser.add_argument("-l", "--learning_rate", type=float,
                        help="learning rate", default=0.01)
    parser.add_argument("-s", "--sample", type=int, default=None,
                        help="number of samples to use from dataset. "
                             "If not passed - whole dataset is used")
    parser.add_argument("-mb", "--mini_batch", type=int,
                        help="minibatch size, 1000 is default", default=1000)
    parser.add_argument("-tvs", "--train_validation_split", type=float,
                        help="train - validation split fraction", default=0.8)
    parser.add_argument("-pf", "--pickle_file", type=int, default=None,
                        help="pickle file index to dump neural network state "
                             "after learning")
    parser.add_argument("-uf", "--unpickle_file", type=int, default=None,
                        help="pickle file index to restore neural network "
                             "state from at the beginning")
    parser.add_argument("-ml", "--middle_layers", type=int,
                        help="number of middle layers", default=20)
    parser.add_argument("-mln", "--middle_layer_neurons", type=int,
                        help="middle layers neuron count", default=2)
    parser.add_argument("--case", type=int,
                        help="case of data popularity distribution", default=1)
    parser.add_argument("-ha", "--hidden_activation", type=str,
                        help="activation to use on hidden layers")
    parser.add_argument("-oa", "--out_activation", type=str,
                        help="activation to use on out layer")
    parser.add_argument("-ihl", "--input_has_labels", action="store_true",
                        help="pass this if input has a class label. Needed "
                             "for optimal predictor evaluation")
    parser.add_argument("--seed", help="seed for item sampling", type=int)
    parser.add_argument("-fc", "--force_cpu", action="store_true",
                        help="force cpu execution for PyTorch")
    # parser.add_argument("-aef", "--alternative_error_function",
    #                     help="use alternative error function - error for Poisson distribution",
    #                     action="store_true")
    args = parser.parse_args()

    # In the next section you should define a mapping of items distribution
    # Case 1
    if args.case == 1:
        generator = PoissonZipfGenerator(10_000, 20.0, 0.8, 0)
        dist_mapping = generator.get_distribution_map()
    # Case 2
    elif args.case == 2:
        generator = PoissonZipfGenerator(5_000, 40.0, 0.8, 0)
        dist_mapping = generator.get_distribution_map()
        generator2 = PoissonShuffleZipfGenerator(5_000, 40.0, 0.8, 5_000,
                                                 10_000_000)
        dist_mapping2 = generator2.get_distribution_map()
        for k, v in dist_mapping2.items():
            dist_mapping[k] = v
        for k, v in dist_mapping.items():
            dist_mapping[k] = v / 2.0
    else:
        raise AttributeError("Unknown case passed")
    # End of section

    if args.seed:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(args.seed)

    data = pd.read_csv(args.input, header=None)
    if args.sample:
        data = data.sample(n=args.sample)

    n = len(data)
    train_size = n * args.train_validation_split
    train_data = data.sample(n=int(train_size))
    valid_data = data.drop(train_data.index)

    if not os.path.exists(args.directory):
        os.makedirs(args.directory)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if args.force_cpu:
        device = "cpu"
    print("Running on: {0}".format(device))

    if args.unpickle_file is not None:
        filename = "nn_{0}.p".format(args.unpickle_file)
        filename = os.path.join(args.directory, filename)
        with open(filename, "rb") as unpickle_file:
            nn = pickle.load(unpickle_file)
    else:
        layers = ([data.shape[1] - 2]
                  + ([args.middle_layer_neurons] * args.middle_layers)
                  + [1])
        nn = TorchFeedforwardNN(layers,
                                hidden_activation=args.hidden_activation,
                                out_activation=args.out_activation)
    if torch.cuda.is_available():
        nn.to(device)

    sample_map = {}
    for k, v in tqdm(dist_mapping.items(), desc="Preprocessing dataset"):
        # DataFrame.ix was removed from pandas; iloc is the positional replacement
        sample_map[k] = data[data.iloc[:, 0] == k]

    learning_rate = args.learning_rate
    prev_dist = 10 ** 10

    inp_train = np.matrix(train_data.iloc[:, 1:train_data.shape[1] - 1])
    outp_train = np.matrix(
        train_data.iloc[:, train_data.shape[1] - 1:train_data.shape[1]])
    inp_valid = np.matrix(valid_data.iloc[:, 1:valid_data.shape[1] - 1])
    outp_valid = np.matrix(
        valid_data.iloc[:, valid_data.shape[1] - 1:valid_data.shape[1]])

    if args.case == 1:
        optim_err = calc_aver_error(inp_valid, outp_valid,
                                    args.input_has_labels)
        optim_err_train = calc_aver_error(inp_train, outp_train,
                                          args.input_has_labels)
    elif args.case == 2:
        optim_err = calc_case_2_optim_err(valid_data, args.input_has_labels)
        optim_err_train = calc_case_2_optim_err(train_data,
                                                args.input_has_labels)
    else:
        raise AttributeError("Unknown case passed")

    inp_train = torch.from_numpy(inp_train)
    outp_train = torch.from_numpy(outp_train)
    inp_valid = torch.from_numpy(inp_valid)
    outp_valid = torch.from_numpy(outp_valid)
    if torch.cuda.is_available():
        inp_train = inp_train.to(device)
        outp_train = outp_train.to(device)
        inp_valid = inp_valid.to(device)
        outp_valid = outp_valid.to(device)

    dist_file = os.path.join(args.directory, "distance.txt")
    error_file = os.path.join(args.directory, "error.txt")
    with open(error_file, "w") as err_f:
        with open(dist_file, "w") as f:
            # dist = 0.0
            # for k, v in tqdm(dist_mapping.items(), desc="Evaluating distance"):
            #     item = sample_map[k].sample(n=1)
            #     pop = nn.evaluate(np.matrix(item.iloc[:, 1:item.shape[1] - 1]),
            #                       np.matrix(item.iloc[:, item.shape[1] - 1:item.shape[1]]))[0]
            #     dist += abs(v - pop)
            # dist /= 2.0
            # f.write(f"{dist}\n")
            # f.flush()
            err_f.write("{} {}\n".format(optim_err_train, optim_err))
            for _ in tqdm(range(args.iterations), desc="Running iterations"):
                train_loader = torch.utils.data.DataLoader(
                    torch.utils.data.TensorDataset(inp_train, outp_train),
                    batch_size=args.mini_batch, shuffle=True)
                for inp, target in tqdm(train_loader,
                                        desc="Running minibatches"):
                    nn.backpropagation_learn(inp, target, args.learning_rate,
                                             show_progress=True,
                                             stochastic=False)
                dist = 0.0
                err = 0.0
                for k, v in tqdm(dist_mapping.items(),
                                 desc="Evaluating distance"):
                    item = sample_map[k].sample(n=1)
                    inp = torch.from_numpy(
                        np.matrix(item.iloc[:, 1:item.shape[1] - 1]))
                    outp = torch.from_numpy(
                        np.matrix(item.iloc[:, item.shape[1] - 1:item.shape[1]]))
                    err += nn.evaluate(inp, outp)
                    pop = float(nn(torch.Tensor(
                        np.matrix(item.iloc[:, 1:item.shape[1] - 1])).double()))
                    pop = np.exp(-pop) - 10 ** -15  # transform back from log space
                    dist += abs(v - pop)
                err /= len(dist_mapping)
                dist /= 2.0
                prev_dist = dist
                f.write(f"{dist} {err}\n")
                f.flush()
                train_err = nn.evaluate(inp_train, outp_train)
                valid_err = nn.evaluate(inp_valid, outp_valid)
                err_f.write("{} {}\n".format(train_err, valid_err))
                err_f.flush()

    if args.pickle_file is not None:
        filename = "nn_{0}.p".format(args.pickle_file)
        filename = os.path.join(args.directory, filename)
        with open(filename, "wb") as pickle_file:
            pickle.dump(nn, pickle_file)

    cache_file = os.path.join(args.directory, "cache_hit.txt")
    with open(cache_file, "w") as f:
        popularities = []
        for k, v in tqdm(dist_mapping.items(), desc="Evaluating distance"):
            item = sample_map[k].sample(n=1)
            pop = float(nn(torch.Tensor(
                np.matrix(item.iloc[:, 1:item.shape[1] - 1])).double()))
            pop = np.exp(-pop) - 10 ** -15
            # tmp = np.matrix(item.iloc[:, 1:item.shape[1] - 1])
            # tmp = np.exp(-tmp) - 10 ** -15  # transform from log
            # pop = float(np.mean(tmp, axis=1))
            # tmp = np.exp(-np.matrix(item.iloc[:, -1:])) - 10 ** -15  # transform from log
            # pop = float(tmp)
            popularities.append((k, pop))

        mean_val = np.mean([x[1] for x in popularities])
        median_val = np.median([x[1] for x in popularities])
        print("Popularity mean: {}".format(mean_val))
        print("Popularity median: {}".format(median_val))

        stat_file = os.path.join(args.directory, "stat.txt")
        with open(stat_file, "w") as f_stat:
            f_stat.write("Popularity mean: {}\n".format(mean_val))
            f_stat.write("Popularity median: {}\n".format(median_val))

        pops_sorted = list(sorted(popularities, key=lambda x: x[1],
                                  reverse=True))
        pop_order_predicted = [x[0] for x in pops_sorted]

        order_file = os.path.join(args.directory, "order.txt")
        with open(order_file, "w") as f1:
            for item in pops_sorted:
                f1.write("{0} {1} {2}\n".format(item[0], item[1],
                                                dist_mapping[item[0]]))

        pred_items_real_pops = [dist_mapping[i] for i in pop_order_predicted]
        distrib_pop_ordered = sorted(dist_mapping.values(), reverse=True)

        theory_hit = 0.0
        practice_hit = 0.0
        for distrib_pop, pred_item_pop in zip(distrib_pop_ordered,
                                              pred_items_real_pops):
            theory_hit += distrib_pop
            practice_hit += pred_item_pop
            f.write(f"{theory_hit} {practice_hit}\n")
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
    data = inputs.horizontal_concat(outputs)
    data = data.copy()

    # mark datetime column
    times = data.metadata.list_columns_with_semantic_types(
        (
            "https://metadata.datadrivendiscovery.org/types/Time",
            "http://schema.org/DateTime",
        )
    )
    if len(times) != 1:
        raise ValueError(
            f"There are {len(times)} indices marked as datetime values. "
            "Please only specify one"
        )
    self._time_column = list(data)[times[0]]

    # if datetime columns are integers, parse as # of days
    if (
        "http://schema.org/Integer"
        in inputs.metadata.query_column(times[0])["semantic_types"]
    ):
        self._integer_time = True
        data[self._time_column] = pd.to_datetime(
            data[self._time_column] - 1, unit="D"
        )
    else:
        data[self._time_column] = pd.to_datetime(
            data[self._time_column], unit="s"
        )

    # sort by time column
    data = data.sort_values(by=[self._time_column])

    # mark key and grp variables
    self.key = data.metadata.get_columns_with_semantic_type(
        "https://metadata.datadrivendiscovery.org/types/PrimaryKey"
    )

    # mark target variables
    self._targets = data.metadata.list_columns_with_semantic_types(
        (
            "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
            "https://metadata.datadrivendiscovery.org/types/TrueTarget",
            "https://metadata.datadrivendiscovery.org/types/Target",
        )
    )
    self._target_types = [
        "i"
        if "http://schema.org/Integer"
        in data.metadata.query_column(t)["semantic_types"]
        else "c"
        if "https://metadata.datadrivendiscovery.org/types/CategoricalData"
        in data.metadata.query_column(t)["semantic_types"]
        else "f"
        for t in self._targets
    ]
    self._targets = [list(data)[t] for t in self._targets]
    self.target_column = self._targets[0]

    # see if 'GroupingKey' has been marked,
    # otherwise fall through to use 'SuggestedGroupingKey'
    grouping_keys = data.metadata.get_columns_with_semantic_type(
        "https://metadata.datadrivendiscovery.org/types/GroupingKey"
    )
    suggested_grouping_keys = data.metadata.get_columns_with_semantic_type(
        "https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey"
    )
    if len(grouping_keys) == 0:
        grouping_keys = suggested_grouping_keys
        drop_list = []
    else:
        drop_list = suggested_grouping_keys

    grouping_keys_counts = [
        data.iloc[:, key_idx].nunique() for key_idx in grouping_keys
    ]
    grouping_keys = [
        group_key
        for count, group_key in sorted(zip(grouping_keys_counts, grouping_keys))
    ]
    self.filter_idxs = [list(data)[key] for key in grouping_keys]

    # drop index
    data.drop(
        columns=[list(data)[i] for i in drop_list + self.key], inplace=True
    )

    # check whether no grouping keys are labeled
    if len(grouping_keys) == 0:
        concat = pd.concat(
            [data[self._time_column], data[self.target_column]], axis=1
        )
        concat.columns = ['ds', 'y']
        concat['unique_id'] = 'series1'  # We have only one series
    else:
        # concatenate columns in `grouping_keys` to unique_id column
        concat = data.loc[:, self.filter_idxs].apply(
            lambda x: ' '.join([str(v) for v in x]), axis=1
        )
        concat = pd.concat(
            [concat, data[self._time_column], data[self.target_column]], axis=1
        )
        concat.columns = ['unique_id', 'ds', 'y']

    # Series must be complete in the frequency
    concat = DeepMarkovModelPrimitive._ffill_missing_dates_per_serie(concat, 'D')

    # remove duplicates
    concat = concat.drop_duplicates(['unique_id', 'ds'])

    self._training_inputs = concat
print("Using device: {}".format( torch.cuda.get_device_name(torch.cuda.current_device()))) # Load data data = pd.read_csv("data/processed.csv", index_col=0) # Drop null columns data.dropna(axis='columns', how='any', inplace=True) # Isolate season 1980 data # season_1980_data = data[data.index.str.contains('198009|198010|198011|198012|198101|198102')] # data = data[~data.index.str.contains('198009|198010|198011|198012|198101|198102')] # Training data y_data = data[['win', 'tie', 'loss']] x_data = data.drop(['win', 'tie', 'loss', 'team'], axis='columns').dropna(axis='columns', how='any') # Data parameters input_size = len(x_data.columns) output_size = len(y_data.columns) # Create dataset dataset = torch.utils.data.TensorDataset( torch.Tensor(x_data.values), torch.argmax(torch.Tensor(y_data.values), dim=1)) # Create model hidden_size = 10 model = torch.nn.Sequential(torch.nn.Linear(input_size, hidden_size), torch.nn.Tanh(), torch.nn.Linear(hidden_size, output_size),
def load_data(path, CONFIG=None):
    data = pd.read_csv(path)

    # delete useless columns (drop returns a new frame, so assign the result;
    # the original calls discarded it and the columns were never removed)
    data = data.drop('artist_name', axis=1)
    data = data.drop('track_name', axis=1)

    # delete samples without lyrics
    data = data[data['lyrics'] != '-']
    logging.info('Loaded {} samples'.format(data.shape[0]))

    # transform the dataframe into (sample, label) pairs;
    # for the moment samples is a numpy array
    samples = data['lyrics'].values
    valence = torch.tensor(data['valence'].values, dtype=torch.float32)
    arousal = torch.tensor(data['arousal'].values, dtype=torch.float32)
    labels = torch.zeros(valence.size(0))
    labelsprob = torch.zeros(4)
    for i in range(valence.size(0) - 1):
        labels[i] = int(ec.findEmotion(valence[i], arousal[i])) - 1
        labelsprob[labels[i].long()] = labelsprob[labels[i].long()] + 1
    labelsprob = labelsprob * 100 / (valence.size(0) - 1)

    # build the vocabulary
    vocab = Vocabulary(''.join(str(e) for e in samples))
    logging.info('Vocab size : {}'.format(vocab.size()))

    # transform the samples into tensors and pad them to the maximum length
    samples = pad_sequence([text_to_tensor(e, vocab) for e in samples],
                           batch_first=True)
    samples = samples[:, :120]
    print(samples.size())

    train_size = int(samples.shape[0] * CONFIG['train_val_cutoff'])
    train_samples, validation_samples = (samples[:train_size],
                                         samples[train_size:])
    train_labels, validation_labels = labels[:train_size], labels[train_size:]
    print(train_samples.size())

    # build the training data loader
    train_dataset = MyDataset(train_samples, train_labels)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=CONFIG['shuffle'],
        num_workers=CONFIG['num_workers'])
    logging.info(
        'Created training data loader having {} samples and {} batches of size {}'
        .format(train_size, len(train_data_loader), CONFIG['batch_size']))

    # build the validation data loader
    validation_dataset = MyDataset(validation_samples, validation_labels)
    validation_data_loader = torch.utils.data.DataLoader(
        validation_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=CONFIG['shuffle'],
        num_workers=CONFIG['num_workers'])
    logging.info(
        'Created validation data loader having {} samples and {} batches of size {}'
        .format(samples.shape[0] - train_size, len(validation_data_loader),
                CONFIG['batch_size']))

    return vocab, train_data_loader, validation_data_loader
# Boltzmann Machines

# Importing the libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import matplotlib.pyplot as plt

# research data import
data = pd.read_csv('recommender_data.csv')
data = data.drop(["user_sequence"], axis=1)
# course ids carry a two-character prefix that is stripped here
data["course"] = data["course_id"].apply(lambda x: x[2:])
data = data.drop(["course_id"], axis=1)
data["course"] = pd.to_numeric(data["course"])
data["completed"] = 1
data = data.assign(id=(data["course"]).astype('category').cat.codes)

# split data into train and test
# (sklearn.cross_validation was removed; train_test_split now lives in
# sklearn.model_selection)
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.25)

# count of users and courses
users = len(data["learner_id"].unique())
courses = len(data["course"].unique())

train_set = np.array(train_set)
test_set = np.array(test_set)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, help="input dataset")
    parser.add_argument("directory", type=str,
                        help="directory to store data files")
    parser.add_argument("-i", "--iterations", type=int,
                        help="iterations to do", default=1000)
    parser.add_argument("-l", "--learning_rate", type=float,
                        help="learning rate", default=0.01)
    parser.add_argument("-s", "--sample", type=int, default=None,
                        help="number of samples to use from dataset. "
                             "If not passed - whole dataset is used")
    parser.add_argument("-es", "--eval_sample", type=int, default=None,
                        help="number of samples to use from for evaluation")
    parser.add_argument("-mb", "--mini_batch", type=int,
                        help="minibatch size, 1000 is default", default=1000)
    parser.add_argument("-mbl", "--mini_batch_log", type=int, default=100,
                        help="after how many batches evaluate the error")
    parser.add_argument("-tvs", "--train_validation_split", type=float,
                        help="train - validation split fraction", default=0.8)
    parser.add_argument("-pf", "--pickle_file", type=int, default=None,
                        help="pickle file index to dump neural network state "
                             "after learning")
    parser.add_argument("-uf", "--unpickle_file", type=int, default=None,
                        help="pickle file index to restore neural network "
                             "state from at the beginning")
    parser.add_argument("--seed", help="seed for item sampling", type=int)
    parser.add_argument("-fc", "--force_cpu", action="store_true",
                        help="force cpu execution for PyTorch")
    args = parser.parse_args()

    if not os.path.exists(args.directory):
        os.makedirs(args.directory)

    if args.seed:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(args.seed)

    data = pd.read_csv(args.input, header=None, index_col=None, names=None)
    if args.sample:
        data = data.sample(n=args.sample)

    n = len(data)
    train_size = n * args.train_validation_split
    train_data = data.sample(n=int(train_size))
    valid_data = data.drop(train_data.index)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if args.force_cpu:
        device = "cpu"
    print("Running on: {0}".format(device))

    if args.unpickle_file is not None:
        filename = "dlstm_{0}.p".format(args.unpickle_file)
        filename = os.path.join(args.directory, filename)
        with open(filename, "rb") as unpickle_file:
            nn = pickle.load(unpickle_file)
    else:
        layers = [inputs_num, 16, 16, outputs_num]
        nn = LSTMSoftmax(layers)
    if torch.cuda.is_available():
        nn.to(device)

    inp_train = np.matrix(train_data.iloc[:, :inputs_num]).astype(float)
    outp_train = np.matrix(train_data.iloc[:, inputs_num:])
    inp_valid = np.matrix(valid_data.iloc[:, :inputs_num]).astype(float)
    outp_valid = np.matrix(valid_data.iloc[:, inputs_num:])

    inp_train = torch.from_numpy(inp_train).type(torch.FloatTensor)
    outp_train = torch.from_numpy(outp_train).type(torch.FloatTensor)
    inp_valid = torch.from_numpy(inp_valid).type(torch.FloatTensor)
    outp_valid = torch.from_numpy(outp_valid).type(torch.FloatTensor)
    if torch.cuda.is_available():
        inp_train = inp_train.to(device)
        outp_train = outp_train.to(device)
        inp_valid = inp_valid.to(device)
        outp_valid = outp_valid.to(device)

    log_counter = args.mini_batch_log
    error_file = os.path.join(args.directory, "error.txt")
    with open(error_file, "w") as f:
        for _ in tqdm(range(args.iterations), desc="Running iterations"):
            train_loader = torch.utils.data.DataLoader(
                torch.utils.data.TensorDataset(inp_train, outp_train),
                batch_size=args.mini_batch, shuffle=True)
            for inp, target in tqdm(train_loader, desc="Running minibatches"):
                nn.backpropagation_learn(inp, target, args.learning_rate,
                                         show_progress=True, stochastic=False)
                log_counter -= 1
                if log_counter == 0:
                    log_counter = args.mini_batch_log
                    if args.eval_sample is None:
                        train_err = nn.evaluate(inp_train, outp_train)
                        valid_err = nn.evaluate(inp_valid, outp_valid)
                    else:
                        train_tmp = train_data.sample(n=args.eval_sample)
                        valid_tmp = valid_data.sample(n=args.eval_sample)
                        inp_train_tmp = np.matrix(
                            train_tmp.iloc[:, :inputs_num]).astype(float)
                        outp_train_tmp = np.matrix(
                            train_tmp.iloc[:, inputs_num:])
                        inp_valid_tmp = np.matrix(
                            valid_tmp.iloc[:, :inputs_num]).astype(float)
                        outp_valid_tmp = np.matrix(
                            valid_tmp.iloc[:, inputs_num:])
                        inp_train_tmp = torch.from_numpy(
                            inp_train_tmp).type(torch.FloatTensor)
                        outp_train_tmp = torch.from_numpy(
                            outp_train_tmp).type(torch.FloatTensor)
                        inp_valid_tmp = torch.from_numpy(
                            inp_valid_tmp).type(torch.FloatTensor)
                        outp_valid_tmp = torch.from_numpy(
                            outp_valid_tmp).type(torch.FloatTensor)
                        train_err = nn.evaluate(inp_train_tmp, outp_train_tmp)
                        valid_err = nn.evaluate(inp_valid_tmp, outp_valid_tmp)
                    f.write("{} {}\n".format(train_err, valid_err))
                    f.flush()

    if args.pickle_file is not None:
        filename = "dlstm_{0}.p".format(args.pickle_file)
        filename = os.path.join(args.directory, filename)
        with open(filename, "wb") as pickle_file:
            pickle.dump(nn, pickle_file)
def PrepareDataset(data,
                   BATCH_SIZE=64,
                   seq_len=seq_len_,
                   pred_len=pred_len_,
                   train_proportion=0.7,
                   valid_proportion=0.15,
                   masking=True,
                   mask_ones_proportion=0.8):
    """ Prepare training and testing datasets and dataloaders.

    Convert a speed/volume/occupancy matrix to training and testing datasets.
    The vertical axis of the matrix is the time axis and the horizontal axis
    is the spatial axis.

    Args:
        data: a matrix containing spatial-temporal data for a network
        seq_len: length of the input sequence
        pred_len: length of the predicted sequence
    Returns:
        Training, validation and testing dataloaders, the data maximum,
        and the per-feature mean X_mean
    """
    time_len = data.shape[0]
    # speed_matrix = speed_matrix.clip(0, 100)  # limit the values to 0-100
    max_data = data.max().max()
    # speed_matrix = speed_matrix / max_speed

    # Build per-patient sliding windows of length seq_len, predicting the
    # SepsisLabel pred_len steps ahead
    data_sequences, data_labels, data_pats = [], [], []
    for p in data['patient_id'].unique():
        pat_len = len(data[data['patient_id'] == p])
        if pat_len > (seq_len + pred_len):
            # for i in range(time_len - seq_len - pred_len):
            for i in range(pat_len - seq_len - pred_len):
                data_sequences.append(
                    data.drop(['SepsisLabel'], axis=1)
                    [data['patient_id'] == p].iloc[i:i + seq_len].values)
                # data_labels.append(data['SepsisLabel'][data['pat_id'] == p].iloc[i + seq_len:i + seq_len + pred_len].values)
                data_labels.append(
                    data['SepsisLabel'][data['patient_id'] == p]
                    .iloc[i + seq_len + pred_len:i + seq_len + pred_len + 1]
                    .values)
                data_pats.append(p)
    data_sequences, data_labels, data_pats = (np.asarray(data_sequences),
                                              np.asarray(data_labels),
                                              np.asarray(data_pats))
    # print(data_sequences.shape)  # (951, 48, 42)

    if masking:
        print('Split finished. Start to generate Mask, Delta, '
              'Last_observed_X ...')
        np.random.seed(1024)
        # Mask = np.random.choice([0, 1], size=(data_sequences.shape),
        #                         p=[1 - mask_ones_proportion, mask_ones_proportion])
        # speed_sequences = np.multiply(speed_sequences, Mask)
        # Copy before thresholding: the original code aliased Mask to
        # data_sequences and overwrote the input sequences in place
        Mask = np.copy(data_sequences)
        if opt.mask:
            Mask[Mask != 0] = 1
        else:
            Mask[Mask != 0] = 0

        # temporal information
        interval = 1  # 5 minutes
        S = np.zeros_like(data_sequences)  # time stamps
        for i in range(S.shape[1]):
            S[:, i, :] = interval * i

        Delta = np.zeros_like(data_sequences)  # time intervals
        for i in range(1, S.shape[1]):
            Delta[:, i, :] = S[:, i, :] - S[:, i - 1, :]

        missing_index = np.where(Mask == 0)
        X_last_obsv = np.copy(data_sequences)
        for idx in range(missing_index[0].shape[0]):
            i = missing_index[0][idx]
            j = missing_index[1][idx]
            k = missing_index[2][idx]
            if j != 0 and j != (seq_len - 1):
                Delta[i, j + 1, k] = Delta[i, j + 1, k] + Delta[i, j, k]
            if j != 0:
                X_last_obsv[i, j, k] = X_last_obsv[i, j - 1, k]  # last observation

        # this should be column-wise
        Delta = Delta / Delta.max()  # normalize

    # shuffle and split the dataset into training and testing datasets
    print('Generate Mask, Delta, Last_observed_X finished. '
          'Start to shuffle and split dataset ...')
    sample_size = data_sequences.shape[0]
    index = np.arange(sample_size, dtype=int)
    np.random.seed(1024)
    np.random.shuffle(index)

    # Alternative: first split by patient (unused)
    # patients = data['pat_id'].unique()
    # pat_sample_size = len(patients)
    # train_pat_index = int(np.floor(pat_sample_size * train_proportion))
    # valid_pat_index = int(np.floor(pat_sample_size * (train_proportion + valid_proportion)))
    # train_index = [np.where(data_pats == p) for p in patients[:train_pat_index]]
    # valid_index = [np.where(data_pats == p) for p in patients[train_pat_index:valid_pat_index]]
    # test_index = [np.where(data_pats == p) for p in patients[valid_pat_index:]]

    data_sequences = data_sequences[index]
    data_labels = data_labels[index]

    if masking:
        X_last_obsv = X_last_obsv[index]
        Mask = Mask[index]
        Delta = Delta[index]
        data_sequences = np.expand_dims(data_sequences, axis=1)
        X_last_obsv = np.expand_dims(X_last_obsv, axis=1)
        Mask = np.expand_dims(Mask, axis=1)
        Delta = np.expand_dims(Delta, axis=1)
        dataset_agger = np.concatenate(
            (data_sequences, X_last_obsv, Mask, Delta), axis=1)

    train_index = int(np.floor(sample_size * train_proportion))
    valid_index = int(np.floor(sample_size
                               * (train_proportion + valid_proportion)))

    if masking:
        train_data, train_label = (dataset_agger[:train_index],
                                   data_labels[:train_index])
        valid_data, valid_label = (dataset_agger[train_index:valid_index],
                                   data_labels[train_index:valid_index])
        test_data, test_label = (dataset_agger[valid_index:],
                                 data_labels[valid_index:])
    else:
        train_data, train_label = (data_sequences[:train_index],
                                   data_labels[:train_index])
        valid_data, valid_label = (data_sequences[train_index:valid_index],
                                   data_labels[train_index:valid_index])
        test_data, test_label = (data_sequences[valid_index:],
                                 data_labels[valid_index:])

    train_data, train_label = torch.Tensor(train_data), torch.Tensor(train_label)
    valid_data, valid_label = torch.Tensor(valid_data), torch.Tensor(valid_label)
    test_data, test_label = torch.Tensor(test_data), torch.Tensor(test_label)

    train_dataset = utils.TensorDataset(train_data, train_label)
    valid_dataset = utils.TensorDataset(valid_data, valid_label)
    test_dataset = utils.TensorDataset(test_data, test_label)

    train_dataloader = utils.DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                        shuffle=True, drop_last=True)
    valid_dataloader = utils.DataLoader(valid_dataset, batch_size=BATCH_SIZE,
                                        shuffle=True, drop_last=True)
    test_dataloader = utils.DataLoader(test_dataset, batch_size=BATCH_SIZE,
                                       shuffle=True, drop_last=True)

    X_mean = np.mean(data_sequences, axis=0)
    print('Finished')
    return train_dataloader, valid_dataloader, test_dataloader, max_data, X_mean
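# A minimal usage sketch; `df` is an assumed frame with 'patient_id' and
# 'SepsisLabel' columns plus feature columns, and opt, seq_len_ and pred_len_
# must already exist in the enclosing scope.
train_dl, valid_dl, test_dl, max_data, X_mean = PrepareDataset(df,
                                                               BATCH_SIZE=64)
batch, labels = next(iter(train_dl))
print(batch.shape)  # [64, 4, seq_len, n_features] when masking=True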
        continue
    plt.figure(count)
    plt.hist(df[i], alpha=0.7, rwidth=0.85)
    plt.tight_layout()
    plt.xlabel(i)
    plt.ylabel('Count')
    count = count + 1
plt.show()

# In[ ]:

# preprocess data
data = pd.read_csv("./drive/My Drive/DataSet/kc_house_data.csv",
                   parse_dates=['date'], dtype=np.float32)
data = data.drop(['id', 'date'], axis=1)

# One-hot encoding for yr_renovated
data['yr_renovated'] = data['yr_renovated'].apply(lambda x: 1 if x > 0 else 0)

# remove outliers beyond 3 * IQR of the price
housing = data
IQR = housing['price'].quantile(.75) - housing['price'].quantile(.25)
upper_bound = housing['price'].quantile(.75) + 3 * IQR
upper_bound_mask = housing.price > upper_bound
lower_bound = housing['price'].quantile(.25) - 3 * IQR
# fixed: the original compared against upper_bound here
lower_bound_mask = housing.price < lower_bound
housing_no_outliers = housing[housing["price"] < upper_bound]
housing_no_outliers = housing_no_outliers[
    housing_no_outliers["price"] > lower_bound]
merchant = pd.concat([trans_train[['merchant']], trans_test[['merchant']]],
                     axis=0)
result = merchant.groupby(['merchant']).size().reset_index().rename(
    columns={0: 'merchant_times'})
new_result = result[result['merchant_times'] > 10]
new_result = new_result.merge(data_trans, on='merchant', how='left')
data = new_result[[
    'UID', 'merchant', 'trans_type1', 'trans_type2', 'trans_amt', 'code1',
    'acc_id1', 'device1', 'mac1', 'ip1', 'ip1_sub'
]]
for col in data.columns:
    if (data[col].dtype == 'object') and (col != 'UID'):
        data = encode_count(data, col)
train = data.drop(['merchant', 'UID'], axis=1).fillna(-1)
label = data['merchant'].values
if os.path.exists('./feature/merchant_np.npy'):
    merchant_weight = np.load('./feature/merchant_np.npy')
for item in ['merchant']:
    result = data.groupby(['UID'])[item].apply(max_list).reset_index().rename(
        columns={item: 'arr_%s' % item}).fillna(0)
    y = y.merge(result[['UID', 'arr_%s' % item]], on=['UID'],
                how='left').fillna(0)
    sub = sub.merge(result[['UID', 'arr_%s' % item]],
    mean, std = data[each].mean(), data[each].std()
    scaled_features[each] = [mean, std]
    data.loc[:, each] = (data[each] - mean) / std

data.head()
print(data.describe())

# Set test data set and training data.
# Here the test data is the last 100 values.
test_data = data[-100:]

# Now remove the test data from the data set
data = data[:-100]

# Separate the data into features and targets
target_fields = ['selected_object_4', 'selected_object_5',
                 'selected_object_6', 'selected_object_7']
features, targets = data.drop(target_fields, axis=1), data[target_fields]
test_features, test_targets = (test_data.drop(target_fields, axis=1),
                               test_data[target_fields])
print(features.shape)
print(test_features.shape)

train_targets = targets.to_numpy()
train_features = features.to_numpy()
val_features = test_features.to_numpy()
val_targets = test_targets.to_numpy()

nnmodel = model.ANN()
print(nnmodel)

x_train = torch.from_numpy(train_features)
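# A minimal training-loop sketch continuing from the tensors above; the loss,
# optimizer choice, and epoch count are assumptions, not the author's setup.
y_train = torch.from_numpy(train_targets)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(nnmodel.parameters(), lr=1e-3)
for epoch in range(100):
    optimizer.zero_grad()
    output = nnmodel(x_train.float())
    loss = criterion(output, y_train.float())
    loss.backward()
    optimizer.step()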