Example #1
    def preprocess_text(self, name_csv, input_text_path, output_text_path,
                        input_image_path):

        print("\nInizio preprocessamento testo..")

        data = pd.read_csv(name_csv, delimiter=';')  # read the csv

        # Description preprocessing
        mapping_name_description = load_descriptions(data)
        clean_descriptions(mapping_name_description)
        #vocabulary = to_vocabulary(mapping_name_description)
        #print("Vocabulary length: ", len(vocabulary))
        save_descriptions(mapping_name_description, '../clean_dataset.csv')

        tokenizer = create_tokenizer(mapping_name_description)
        vocab_size = len(tokenizer.word_index) + 1
        max_len = max_length(mapping_name_description)
        create_sequences(tokenizer, max_len, mapping_name_description,
                         vocab_size, input_image_path, input_text_path,
                         output_text_path)

        print("Fine preprocessamento testo..")
Example #2
  def read_data(self, window, resample):
    print('| Reading depression data ...')
    conditions = os.listdir(f'{self.data_path}/condition')
    controls = os.listdir(f'{self.data_path}/control')
    # random.shuffle(conditions)
    # random.shuffle(controls)
    files = conditions + controls
    random.shuffle(files)
    xs, ys = [], []

    for i, filename in enumerate(files):
      data_class = 'condition' if 'condition' in filename else 'control'
      raw_df = pd.read_csv(f'{self.data_path}/{data_class}/{filename}', index_col='timestamp', parse_dates=True)
      raw_df = raw_df.drop(columns=['date'])
      raw_df = raw_df.rename(columns={'activity': f'{data_class}_{i + 1}'}) 

      df = raw_df
      if resample:
        total = df.resample(resample).sum()  # only used by the commented-out 'total' feature below
        df = df.resample(resample).mean()

      # Extract features and normalise
      # df = (df - df.min()) / (df.max() - df.min())
      mean, deviation = df.values.mean(), df.values.std()
      mn, mx = df.min(), df.max()
      # df['rate'] = df / mean
      # df['maxrate'] = df.iloc[:, 0] / mx
      # df['max'] = mx
      # df['min'] = mn
      df['mean'] = mean
      df['deviation'] = deviation
      # df['total'] = (total - total.min()) / (total.max() - total.min())
      # print(df)
      # exit()

      seq = utils.create_sequences(df.values, window, 0)
      xs.extend(seq)
      data_class = 1 if data_class == 'condition' else 0
      ys.extend(np.full(seq.shape[0], data_class))

    xs, ys = np.array(xs), np.array(ys)
    
    return {
      'X': xs,
      'y': ys
    }
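
The utils.create_sequences(values, window, horizon) helper used above is not shown in these examples. Judging from how its output is sliced here and in Examples #7 and #8, a minimal sketch (the name and exact behaviour are assumptions) is:

import numpy as np

def create_sequences(values, window, horizon):
    # Hedged sketch: slide a window of (window + horizon) consecutive rows over
    # `values` and stack them into shape (n_windows, window + horizon, n_features).
    length = window + horizon
    return np.array([values[i:i + length] for i in range(len(values) - length + 1)])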
Example #3
def train_test_split(features, labels, scaler, seq_len, n_features,
                     test_type='all_stocks', test_size=0.25):
    
    if test_type not in ['all_stocks', 'new_stocks']:
        raise ValueError('invalid test_type provided')
    
    X_train = np.empty(shape=(0, seq_len, n_features))
    X_test = np.empty(shape=(0, seq_len, n_features))
    y_train = np.empty(shape=(0, 1))
    y_test = np.empty(shape=(0, 1))
    
    if test_type == 'new_stocks':
        n_train = len(features.keys()) - \
            int(len(features.keys()) * test_size)
        
        train_ticks = ", ".join(list(features.keys())[n_train:])
        print('Using ' + train_ticks + ' data for testing')

    for i, tick in enumerate(features.keys()):
        scaled = scaler.transform(features[tick])
        x, y = utils.create_sequences(scaled, labels[tick], seq_len)
        
        if test_type == 'new_stocks':
            if i < n_train:
                X_train = np.concatenate((X_train, x), axis=0)
                y_train = np.concatenate((y_train, y), axis=0)
            else:
                X_test = np.concatenate((X_test, x), axis=0)
                y_test = np.concatenate((y_test, y), axis=0)
                
        elif test_type == 'all_stocks':
            # Split the data into training and testing sets
            xt, xv, yt, yv = tts(x, y, test_size=test_size, shuffle=False)

            X_train = np.concatenate((X_train, xt), axis=0)
            X_test = np.concatenate((X_test, xv), axis=0)
            y_train = np.concatenate((y_train, yt), axis=0)
            y_test = np.concatenate((y_test, yv), axis=0)
            
    return X_train, X_test, y_train, y_test
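
In this example create_sequences pairs per-ticker feature windows with labels, returning (X, y) with shapes (n, seq_len, n_features) and (n, 1). The helper itself is not included; a sketch consistent with those shapes (the exact alignment of labels to windows is an assumption) is:

import numpy as np

def create_sequences(features, labels, seq_len):
    # Hedged sketch: each sample is seq_len consecutive feature rows, labelled
    # with the value that follows the window.
    xs, ys = [], []
    for i in range(len(features) - seq_len):
        xs.append(features[i:i + seq_len])
        ys.append(labels[i + seq_len])
    return np.array(xs), np.array(ys).reshape(-1, 1)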
Example #4
train = load_set(filename)
print('Dataset: %d' % len(train))
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)
# prepare sequences
X1train, X2train, ytrain = create_sequences(tokenizer, max_length,
                                            train_descriptions, train_features)

# dev dataset

# load test set
filename = 'dataset/Flickr_8k.devImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))
# prepare sequences
X1test, X2test, ytest = create_sequences(tokenizer, max_length,
                                         test_descriptions, test_features)
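
The caption-style create_sequences used here (and, with an explicit vocab_size argument, in Example #6 below) is not defined in the snippet; Example #4 presumably picks vocab_size up from the enclosing scope. A common form, following the Flickr8k captioning tutorials this code resembles, expands every description into one sample per word position. Treat the sketch below as an assumption about the helper, not its actual source:

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
    # Hedged sketch: for each description, emit (photo feature, padded word
    # prefix, one-hot next word) triples. Assumes `photos[key]` holds a 1xN
    # feature array per image id and `descriptions[key]` a list of strings.
    X1, X2, y = [], [], []
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            seq = tokenizer.texts_to_sequences([desc])[0]
            for i in range(1, len(seq)):
                in_seq = pad_sequences([seq[:i]], maxlen=max_length)[0]
                out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]
                X1.append(photos[key][0])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)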
Example #5
corpus = [line.split() for line in corpus]  # tokenize corpus into words (whitespace split assumed; the original line was truncated)
print('Pre-processing')
print('vocab size: ', len(set([token for line in corpus for token in line])))
print('num. of lines: ', len(corpus))
print(' '.join(corpus[0]))
corpus = reduce_corpus(
    corpus, min_len=MIN_SEQ_LEN
)  # reduce corpus size - remove lines with length less than MIN_SEQ_LEN
corpus = [[START_TOKEN] + line + [END_TOKEN]
          for line in corpus]  # add start and end tokens
corpus = reduce_vocab(
    corpus, UNK_TOKEN, min_count=MIN_COUNT
)  # reduce vocab size - remove token with count less than MIN_COUNT
vocab = list(set([token for line in corpus
                  for token in line]))  # extract vocabulary of corpus
corpus = create_sequences(
    corpus, max_len=MAX_SEQ_LEN)  # create sequences of max length MAX_SEQ_LEN
print('\nPost-processing')
print('vocab size: ', len(vocab))
print('num. of lines: ', len(corpus))
print(' '.join(corpus[0]))

tensor, lang = tokenize_subword(
    corpus,
    reserved_tokens=[MASK_TOKEN, START_TOKEN, END_TOKEN
                     ])  # tokenize corpus and prepare padded Tensor sequences
train, test = train_test_split(
    tensor, test_size=TEST_SIZE,
    random_state=RANDOM_STATE)  # split dataset into train and test
train = tf.data.Dataset.from_tensor_slices(train).shuffle(BUFFER_SIZE).batch(
    BATCH_SIZE, drop_remainder=True)
test = tf.data.Dataset.from_tensor_slices(test).shuffle(BUFFER_SIZE).batch(
    BATCH_SIZE, drop_remainder=True)
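
Here create_sequences(corpus, max_len=MAX_SEQ_LEN) is described only by its comment. One plausible reading, and nothing more than that, is a simple clip of each tokenized line to at most max_len tokens:

def create_sequences(corpus, max_len):
    # Hedged sketch of one possible behaviour; the real helper may instead
    # split long lines into several shorter sequences.
    return [line[:max_len] for line in corpus]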
Example #6
# load precomputed training image features

train_features = pickle.load(open("train_final_features.pkl", 'rb'))

# getting rnn_cnn_model

if args.model == "inception":
    rc_model = rnn_cnn_model(2048, max_len, vocab_size, args.optimizer)
else:
    rc_model = rnn_cnn_model(4096, max_len, vocab_size, args.optimizer)

# getting data to train

in_img, in_seq, out_word = create_sequences(tokenizer, max_len,
                                            train_captions_dect,
                                            train_features, vocab_size)

# fit the model

tf.config.experimental_run_functions_eagerly(True)
rc_model.fit([in_img, in_seq],
             out_word,
             batch_size=args.batchsize,
             epochs=args.epochs,
             verbose=1)

# save the trained weights

rc_model.save_weights('rc_model_weights.h5')
Example #7
  def read_data(self, window):
    print('| Reading EV data ...')
    df = pd.read_csv(f'{self.data_path}/dataset_raw.csv', index_col='timestamp', parse_dates=True)
    df = self.clean_data(df)

    seqs = []
    feature_df = pd.DataFrame()
    r_window = 3
    
    # Shuffle dataset, generate features for each car and aggregate
    cars  = pd.unique(df['id'])
    random.shuffle(cars)
    for cid in cars:
      cdf = df[df['id'] == cid]
      cdf_features = pd.DataFrame()

      # Battery features, normalised to 0-1
      battery_features = ['bat_used', 'bat_avg', 'bat_std'] #['bat_used', 'bat_charged', 'bat_avg', 'bat_std']
      # battery_features = ['bat_used']
      cdf_features[battery_features] = (cdf[battery_features] / 100)
      # cdf_features[battery_features] = (cdf[battery_features] / 100).rolling(window=r_window, min_periods=1).mean()

      # cdf_features['driven'] = cdf['driven']
      # cdf_features['driven'] = cdf['driven'].rolling(window=r_window, min_periods=1).median() 
      cdf_features['driven'] = utils.normalise_series(cdf['driven'], (0, 1))
      
      # cdf_features['temperature'] = utils.normalise_series(cdf['tmp_out'], (0, 1))
      
      # Features: distances from most frequent location. Scaled relative to 1km range
      # distances = self._charge_probability(cdf)
      # cdf_features['distances'] = 1 / (distances + 1) #(distances - distances.min()) / (distances.max() - distances.min())
      
      # Brand vector 
      # cdf_features = pd.concat((cdf_features, cdf[self.car_brands]), axis=1) 
      # feature_df = feature_df.append(cdf_features)

      dummy_df = cdf_features.copy()
      dummy_df['id'] = cid
      feature_df = pd.concat([feature_df, dummy_df])  # DataFrame.append was removed in pandas 2.0

      # Create sequences
      input_seq = utils.create_sequences(cdf_features.values, window, 1)
      seqs.extend(input_seq)

    # Save feature df, for the sake of later analysis/cross-checking
    feature_df = feature_df.round(4)
    feature_df.to_csv(f'{self.data_path}/dataset_features.csv')

    # Drop used and charged features, first two columns
    seqs = np.array(seqs)
    
    # With rolling and used first column is "used", and is dropped as feature
    # Otherwise, rolling is first, where Y=mean of rolling
    xs = seqs[:, :window, :]

    # Used feature is first column. Get used t + n
    ys = np.abs(seqs[:, window:, 0]) 
    # exit()
    return {
      'X': xs,
      'y': ys
    }
Example #8
    def read_data(self, window):
        print('| Reading football data ...')
        dataset, pids = self._create_dataset()
        print(f'| Number of players: {len(pids)} ...')
        # Clean dataset
        players = dataset.groupby(by=['pid'])
        observations = players.count().mean(axis=1)
        drop_players = observations[observations <= 40].index
        players = dataset[~dataset['pid'].isin(drop_players)].groupby(
            by=['pid'])

        # features = dataset.columns.drop('pid')  # unused: overridden by the explicit list below
        features = ['Readiness', 'Stress', 'Mood', 'Soreness', 'Fatigue']
        features_out = ['Mood', 'Stress', 'Soreness', 'Fatigue']

        print(f'| Using features {features} ...')
        xs, ys = [], []

        i = 0
        for pid, player_df in players:
            # TODO: Write data generation
            # 1. Collect features
            feat_vec = player_df[features]

            # 2. Normalise features
            feat_vec_norm = utils.normalise_series(feat_vec, (0, 1))

            # 3. Create sequences
            y_dist = player_df[features_out].copy()

            input_seq = utils.create_sequences(y_dist.values, window + 1, 0)
            input_seq_norm = utils.create_sequences(feat_vec_norm.values,
                                                    window + 1, 0)

            # NOTE: Test output
            # ys = input_seq[:, -1, 0]    # First column is Readiness
            # ys_s = input_seq[:, -1, 2]  # Third columns is Stress

            if i == 0:
                print(y_dist)

            seq_x = input_seq_norm[:, :-1, :]
            seq_y = input_seq[:, -1, :]
            i += 1
            # print(pid)
            # sh = 0
            # print(feat_vec[:][sh:40+sh+1])
            # print(input_seq[sh, :-1])
            # print(seq_y[sh])
            # # print(input_seq)
            # # print(ys)
            # exit()

            # Multi-label classification
            # seq_y = input_seq[:, -1, list(range(1, 5))]

            # print(seq_x[sh])
            # print(seq_y[sh])
            xs.extend(seq_x)
            ys.extend(seq_y)
            # exit()

        # 4. Return Xs and ys to be formatted in the classification formatter
        xs, ys = np.array(xs), np.array(ys)

        return {
            'X': xs,
            'y': ys,
            'columns': features_out,
            'classes': len(features_out)
        }