def preprocess_text(self, name_csv, input_text_path, output_text_path, input_image_path):
    print("\nStarting text preprocessing...")
    data = pd.read_csv(name_csv, delimiter=';')  # read the CSV
    # Preprocess the descriptions
    mapping_name_description = load_descriptions(data)
    clean_descriptions(mapping_name_description)
    # vocabulary = to_vocabulary(mapping_name_description)
    # print("Vocabulary size: ", len(vocabulary))
    save_descriptions(mapping_name_description, '../clean_dataset.csv')
    tokenizer = create_tokenizer(mapping_name_description)
    vocab_size = len(tokenizer.word_index) + 1
    max_len = max_length(mapping_name_description)
    create_sequences(tokenizer, max_len, mapping_name_description, vocab_size,
                     input_image_path, input_text_path, output_text_path)
    print("Finished text preprocessing...")
def read_data(self, window, resample):
    print('| Reading depression data ...')
    conditions = os.listdir(f'{self.data_path}/condition')
    controls = os.listdir(f'{self.data_path}/control')
    # random.shuffle(conditions)
    # random.shuffle(controls)
    files = conditions + controls
    random.shuffle(files)

    xs, ys = [], []
    for i, filename in enumerate(files):
        data_class = 'condition' if 'condition' in filename else 'control'
        raw_df = pd.read_csv(f'{self.data_path}/{data_class}/{filename}',
                             index_col='timestamp',
                             parse_dates=True)
        raw_df = raw_df.drop(columns=['date'])
        raw_df = raw_df.rename(columns={'activity': f'{data_class}_{i + 1}'})
        df = raw_df

        if resample:
            total = df.resample(resample).sum()  # only used by the commented-out 'total' feature below
            df = df.resample(resample).mean()

        # Extract features and normalise
        # df = (df - df.min()) / (df.max() - df.min())
        mean, deviation = df.values.mean(), df.values.std()
        mn, mx = df.min(), df.max()
        # df['rate'] = df / mean
        # df['maxrate'] = df.iloc[:, 0] / mx
        # df['max'] = mx
        # df['min'] = mn
        df['mean'] = mean
        df['deviation'] = deviation
        # df['total'] = (total - total.min()) / (total.max() - total.min())
        # print(df)
        # exit()

        seq = utils.create_sequences(df.values, window, 0)
        xs.extend(seq)

        data_class = 1 if data_class == 'condition' else 0
        ys.extend(np.full(seq.shape[0], data_class))

    xs, ys = np.array(xs), np.array(ys)
    return {'X': xs, 'y': ys}
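# None of the snippets in this listing include the `utils.create_sequences` helper they call.
# Below is a minimal, hypothetical sketch of a sliding-window version that is consistent with
# how it is used in the depression, EV and football readers (array input, a `window` length,
# and an extra `horizon` of target steps appended to each window). The third argument's name
# and the exact semantics are assumptions, not the original implementation.
import numpy as np


def create_sequences(values, window, horizon):
    """Cut a (timesteps, features) array into overlapping windows of
    length window + horizon, stepping one timestep at a time."""
    seq_len = window + horizon
    return np.array([values[i:i + seq_len]
                     for i in range(len(values) - seq_len + 1)])


# Example: 100 timesteps of 3 features -> 91 windows of 10 steps each
# windows = create_sequences(np.random.rand(100, 3), window=10, horizon=0)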
def train_test_split(features, labels, scaler, seq_len, n_features,
                     test_type='all_stocks', test_size=0.25):
    if test_type not in ['all_stocks', 'new_stocks']:
        raise ValueError('invalid test_type provided')

    X_train = np.empty(shape=(0, seq_len, n_features))
    X_test = np.empty(shape=(0, seq_len, n_features))
    y_train = np.empty(shape=(0, 1))
    y_test = np.empty(shape=(0, 1))

    if test_type == 'new_stocks':
        # Hold out the last `test_size` share of tickers entirely for testing
        n_train = len(features.keys()) - int(len(features.keys()) * test_size)
        test_ticks = ", ".join(list(features.keys())[n_train:])
        print('Using ' + test_ticks + ' data for testing')

    for i, tick in enumerate(features.keys()):
        scaled = scaler.transform(features[tick])
        x, y = utils.create_sequences(scaled, labels[tick], seq_len)
        if test_type == 'new_stocks':
            if i < n_train:
                X_train = np.concatenate((X_train, x), axis=0)
                y_train = np.concatenate((y_train, y), axis=0)
            else:
                X_test = np.concatenate((X_test, x), axis=0)
                y_test = np.concatenate((y_test, y), axis=0)
        elif test_type == 'all_stocks':
            # Split each ticker's sequences chronologically into train and test
            xt, xv, yt, yv = tts(x, y, test_size=test_size, shuffle=False)
            X_train = np.concatenate((X_train, xt), axis=0)
            X_test = np.concatenate((X_test, xv), axis=0)
            y_train = np.concatenate((y_train, yt), axis=0)
            y_test = np.concatenate((y_test, yv), axis=0)

    return X_train, X_test, y_train, y_test
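# The stock splitter above expects a different `utils.create_sequences(data, labels, seq_len)`
# contract: it should return aligned (X, y) pairs where each X is `seq_len` consecutive feature
# rows and each y is the label following that window. A hedged sketch of that contract (the
# exact alignment of label to window is an assumption):
import numpy as np


def create_sequences(data, labels, seq_len):
    xs, ys = [], []
    for i in range(len(data) - seq_len):
        xs.append(data[i:i + seq_len])     # seq_len rows of (scaled) features
        ys.append(labels[i + seq_len])     # label for the step right after the window
    return np.array(xs), np.array(ys).reshape(-1, 1)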
# load training set
train = load_set(filename)
print('Dataset: %d' % len(train))
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# note: this shadows the max_length() helper with the computed value
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)
# prepare sequences
X1train, X2train, ytrain = create_sequences(tokenizer, max_length,
                                            train_descriptions, train_features)

# dev dataset
# load test set
filename = 'dataset/Flickr_8k.devImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))
# prepare sequences
X1test, X2test, ytest = create_sequences(tokenizer, max_length,
                                         test_descriptions, test_features)
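# The captioning `create_sequences(tokenizer, max_length, descriptions, photos)` is not shown
# either. A common formulation (the Flickr8k tutorial pattern) expands every caption into
# (photo feature, partial caption, next word) training triples; the sketch below follows that
# pattern and assumes `vocab_size` is passed in rather than read from the enclosing scope, so
# the signature is not exactly the one used above, and the photos[key][0] indexing is assumed.
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
    X1, X2, y = [], [], []
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            seq = tokenizer.texts_to_sequences([desc])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]          # padded partial caption
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]  # one-hot next word
                X1.append(photos[key][0])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)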
                          for line in corpus]  # tokenize corpus into words

print('Pre-processing')
print('vocab size: ', len(set([token for line in corpus for token in line])))
print('num. of lines: ', len(corpus))
print(' '.join(corpus[0]))

corpus = reduce_corpus(
    corpus, min_len=MIN_SEQ_LEN
)  # reduce corpus size - remove lines with length less than MIN_SEQ_LEN
corpus = [[START_TOKEN] + line + [END_TOKEN]
          for line in corpus]  # add start and end tokens
corpus = reduce_vocab(
    corpus, UNK_TOKEN, min_count=MIN_COUNT
)  # reduce vocab size - remove tokens with count less than MIN_COUNT
vocab = list(set([token for line in corpus
                  for token in line]))  # extract vocabulary of corpus
corpus = create_sequences(
    corpus, max_len=MAX_SEQ_LEN)  # create sequences of max length MAX_SEQ_LEN

print('\nPost-processing')
print('vocab size: ', len(vocab))
print('num. of lines: ', len(corpus))
print(' '.join(corpus[0]))

tensor, lang = tokenize_subword(
    corpus, reserved_tokens=[MASK_TOKEN, START_TOKEN, END_TOKEN
                             ])  # tokenize corpus and prepare padded Tensor sequences
train, test = train_test_split(
    tensor, test_size=TEST_SIZE,
    random_state=RANDOM_STATE)  # split dataset into train and test

train = tf.data.Dataset.from_tensor_slices(train).shuffle(BUFFER_SIZE).batch(
    BATCH_SIZE, drop_remainder=True)
test = tf.data.Dataset.from_tensor_slices(test).shuffle(BUFFER_SIZE).batch(
    BATCH_SIZE, drop_remainder=True)
# load precomputed training image features
train_features = pickle.load(open("train_final_features.pkl", 'rb'))

# build the rnn_cnn_model (feature size depends on the CNN encoder)
if args.model == "inception":
    rc_model = rnn_cnn_model(2048, max_len, vocab_size, args.optimizer)
else:
    rc_model = rnn_cnn_model(4096, max_len, vocab_size, args.optimizer)

# prepare the training data
in_img, in_seq, out_word = create_sequences(tokenizer, max_len, train_captions_dect,
                                            train_features, vocab_size)

# fit the model
tf.config.experimental_run_functions_eagerly(True)
rc_model.fit([in_img, in_seq], out_word,
             batch_size=args.batchsize,
             epochs=args.epochs,
             verbose=1)

# save the weights
rc_model.save_weights('rc_model_weights.h5')
def read_data(self, window):
    print('| Reading EV data ...')
    df = pd.read_csv(f'{self.data_path}/dataset_raw.csv',
                     index_col='timestamp',
                     parse_dates=True)
    df = self.clean_data(df)

    seqs = []
    feature_df = pd.DataFrame()
    r_window = 3  # rolling window, only used by the commented-out rolling features below

    # Shuffle dataset, generate features for each car and aggregate
    cars = pd.unique(df['id'])
    random.shuffle(cars)
    for cid in cars:
        cdf = df[df['id'] == cid]
        cdf_features = pd.DataFrame()

        # Battery features, normalised to 0-1
        battery_features = ['bat_used', 'bat_avg', 'bat_std']
        # battery_features = ['bat_used', 'bat_charged', 'bat_avg', 'bat_std']
        # battery_features = ['bat_used']
        cdf_features[battery_features] = (cdf[battery_features] / 100)
        # cdf_features[battery_features] = (cdf[battery_features] / 100).rolling(window=r_window, min_periods=1).mean()

        # cdf_features['driven'] = cdf['driven']
        # cdf_features['driven'] = cdf['driven'].rolling(window=r_window, min_periods=1).median()
        cdf_features['driven'] = utils.normalise_series(cdf['driven'], (0, 1))
        # cdf_features['temperature'] = utils.normalise_series(cdf['tmp_out'], (0, 1))

        # Features: distances from most frequent location. Scaled relative to 1km range
        # distances = self._charge_probability(cdf)
        # cdf_features['distances'] = 1 / (distances + 1)  # (distances - distances.min()) / (distances.max() - distances.min())

        # Brand vector
        # cdf_features = pd.concat((cdf_features, cdf[self.car_brands]), axis=1)

        # feature_df = feature_df.append(cdf_features)
        dummy_df = cdf_features.copy()
        dummy_df['id'] = cid
        feature_df = pd.concat((feature_df, dummy_df))  # pandas >= 2.0: DataFrame.append was removed

        # Create sequences
        input_seq = utils.create_sequences(cdf_features.values, window, 1)
        seqs.extend(input_seq)

    # Save feature df, for the sake of later analysis/cross-checking
    feature_df = feature_df.round(4)
    feature_df.to_csv(f'{self.data_path}/dataset_features.csv')

    # Drop used and charged features, first two columns
    seqs = np.array(seqs)
    # With rolling and used, first column is "used" and is dropped as a feature
    # Otherwise, rolling is first, where Y = mean of rolling
    xs = seqs[:, :window, :]
    # Used feature is first column. Get used at t + n
    ys = np.abs(seqs[:, window:, 0])
    # exit()
    return {'X': xs, 'y': ys}
def read_data(self, window):
    print('| Reading football data ...')
    dataset, pids = self._create_dataset()
    print(f'| Number of players: {len(pids)} ...')

    # Clean dataset: drop players with too few observations
    players = dataset.groupby(by=['pid'])
    observations = players.count().mean(axis=1)
    drop_players = observations[observations <= 40].index
    players = dataset[~dataset['pid'].isin(drop_players)].groupby(by=['pid'])

    features = dataset.columns.drop('pid')
    features = ['Readiness', 'Stress', 'Mood', 'Soreness', 'Fatigue']
    features_out = ['Mood', 'Stress', 'Soreness', 'Fatigue']
    print(f'| Using features {str(features)} for {predictor}...')

    xs, ys = [], []
    i = 0
    for pid, player_df in players:
        # TODO: Write data generation
        # 1. Collect features
        feat_vec = player_df[features]
        # 2. Normalise features
        feat_vec_norm = utils.normalise_series(feat_vec, (0, 1))
        # 3. Create sequences
        y_dist = player_df[features_out].copy()
        input_seq = utils.create_sequences(y_dist.values, window + 1, 0)
        input_seq_norm = utils.create_sequences(feat_vec_norm.values, window + 1, 0)
        # NOTE: Test output
        # ys = input_seq[:, -1, 0]  # First column is Readiness
        # ys_s = input_seq[:, -1, 2]  # Third column is Stress
        if i == 0:
            print(y_dist)
        seq_x = input_seq_norm[:, :-1, :]  # normalised history of length `window`
        seq_y = input_seq[:, -1, :]        # unnormalised target step
        i += 1
        # print(pid)
        # sh = 0
        # print(feat_vec[:][sh:40+sh+1])
        # print(input_seq[sh, :-1])
        # print(seq_y[sh])
        # # print(input_seq)
        # # print(ys)
        # exit()
        # Multi-label classification
        # seq_y = input_seq[:, -1, list(range(1, 5))]
        # print(seq_x[sh])
        # print(seq_y[sh])
        xs.extend(seq_x)
        ys.extend(seq_y)
        # exit()

    # 4. Return Xs and ys to be formatted in the classification formatter
    xs, ys = np.array(xs), np.array(ys)
    return {
        'X': xs,
        'y': ys,
        'columns': features_out,
        'classes': len(features_out)
    }