示例#1
0
def input_fn(token_generator: t.Callable[[], t.Generator[str, None, None]],
             hyper_params: dict) -> tf.data.Dataset:
    """Build a (features, labels) dataset for next-token prediction.

    Each element produced by ``token_generator`` is a 1-D string tensor
    (declared with ``output_shapes=(None,)``). Consecutive elements are
    paired with a sliding window of size 2 so that every position has a
    current token (input) and the token that follows it (label), then
    ``hyper_params['seq_len']`` such pairs are stacked into one sequence.

    Args:
        token_generator: Zero-argument callable returning a generator of
            token rows.
        hyper_params: Must contain ``'seq_len'``, the number of consecutive
            steps grouped into one sequence.

    Returns:
        A dataset of ``(features, labels)`` where ``features`` holds the
        flattened input tokens under ``"token"`` plus the scalar
        ``"batch_size"`` and ``"seq_len"`` needed to restore the 2-D layout
        in the model_fn, and ``labels`` is the flattened next-token tensor.
    """

    def _split_and_transpose(stacked):
        # `stacked` is indexed as [step, window_slot, row]; slot 0 holds the
        # current tokens and slot 1 the following tokens. Transposing puts
        # the row axis first, e.g.
        #   ({'token': [['F', 'i', 'r', 's', 't'],
        #               ['d', ' ', 'u', 'p', '.'],
        #               ['s', 'e', 'n', 't', '\n']]},
        #    [['i', 'r', 's', 't', ' '],
        #     [' ', 'u', 'p', '.', '\n'],
        #     ['e', 'n', 't', '\n', 'H']])
        current_tokens = tf.transpose(stacked[:, 0, :])
        next_tokens = tf.transpose(stacked[:, 1, :])
        return {"token": current_tokens}, next_tokens

    def _flatten_with_shape(features, targets):
        # Workaround: tensors are flattened here and the original 2-D shape
        # is shipped alongside so the model_fn can unpack the inputs.
        token_grid = features["token"]
        grid_shape = tf.shape(token_grid)
        packed_features = {
            "token": tf.reshape(token_grid, [-1]),
            "batch_size": grid_shape[0],
            "seq_len": grid_shape[1],
        }
        return packed_features, tf.reshape(targets, [-1])

    token_rows = tf.data.Dataset.from_generator(token_generator,
                                                output_types=tf.string,
                                                output_shapes=(None, ))

    # Pair every row with its successor, e.g.
    #   [[b'F', b'd', b's'],
    #    [b'i', b' ', b'e']]
    adjacent_pairs = token_rows.apply(sliding_window_batch(2))

    sequences = adjacent_pairs.batch(hyper_params['seq_len'])
    split_sequences = sequences.map(_split_and_transpose)
    flattened = split_sequences.map(_flatten_with_shape)

    return flattened.prefetch(buffer_size=1)
示例#2
0
    def input_fn(params):
        """The actual input function.

        Builds a ``tf.data`` pipeline over the TFRecord files named in
        ``expanded_files`` (taken from the enclosing scope). A single file is
        read directly; several files are mixed by weighted random sampling
        using ``FLAGS.dataset_one_weight`` / ``FLAGS.dataset_two_weight``.
        Records are then grouped with a sliding window and decoded/batched.

        Args:
            params: Dict providing ``"batch_size"``.

        Returns:
            A ``tf.data.Dataset`` of decoded, batched examples.
        """
        batch_size = params["batch_size"]

        vocab_table = contrib_lookup.index_table_from_file(FLAGS.vocab_file)

        if len(expanded_files) == 1:
            d = tf.data.TFRecordDataset(expanded_files[0])
            if is_training:
                d = d.repeat()
                d = d.shuffle(buffer_size=256)
        else:
            dataset_list = [
                tf.data.TFRecordDataset(path) for path in expanded_files
            ]
            if is_training:
                dataset_list = [ds.repeat() for ds in dataset_list]

            # Sampling probabilities, one per dataset. With two datasets the
            # second gets the remainder; with three, the third gets whatever
            # is left after the first two so the weights sum to 1.
            # NOTE(review): the number of weights is driven by the flag, not
            # by len(expanded_files) -- confirm callers keep them in sync.
            dset_weights = [
                FLAGS.dataset_one_weight, 1 - FLAGS.dataset_one_weight
            ]
            if FLAGS.dataset_two_weight != 0:
                dset_weights = [
                    FLAGS.dataset_one_weight,
                    FLAGS.dataset_two_weight,
                    1 - FLAGS.dataset_one_weight - FLAGS.dataset_two_weight,
                ]
            d = tf.data.experimental.sample_from_datasets(
                dataset_list, dset_weights)

            # Note that sample_from_datasets() inserts randomness into the
            # training. An alternative would be to use choose_from_datasets()
            # but then the order must be stated explicitly, which is less
            # intuitive for unbalanced datasets. Example below:
            #
            # choice_dataset = tf.data.Dataset.range(len(dataset_list)).repeat()
            # d = tf.data.experimental.choose_from_datasets(dataset_list,
            #                                               choice_dataset)

            if is_training:
                d = d.shuffle(buffer_size=256)

        # The window size will be for selecting negative samples.
        # It equals the number of documents to sample from, minus 1.
        d = d.apply(
            contrib_data.sliding_window_batch(
                window_size=FLAGS.data_window_size,
                window_shift=FLAGS.data_window_shift))
        d = d.apply(
            tf.data.experimental.map_and_batch(
                lambda record: _decode_record(record, name_to_features,
                                              vocab_table),
                batch_size=batch_size,
                drop_remainder=drop_remainder))

        return d
def _collect_sliding_windows(samples, window_size, n_windows, n_features):
    """Materialise the first ``n_windows`` sliding windows of ``samples``.

    Args:
        samples: Array-like whose first axis is sliced into consecutive items.
        window_size: Number of consecutive items in each window.
        n_windows: How many windows to pull from the dataset.
        n_features: Size of the last axis of each window.

    Returns:
        A float ``numpy`` array of shape ``(n_windows, window_size, n_features)``.
    """
    dataset = tf.data.Dataset.from_tensor_slices(np.array(samples))
    dataset = dataset.apply(sliding_window_batch(window_size=window_size))
    next_window = dataset.make_one_shot_iterator().get_next()

    windows = np.zeros((n_windows, window_size, n_features))
    # The session is only needed to drain the iterator; close it afterwards
    # so its resources are released before training/evaluation starts.
    with tf.Session() as sess:
        for i in range(n_windows):
            windows[i, :, :] = sess.run(next_window)
    return windows


def _two_column_labels(labels, n_rows):
    """Return an ``(n_rows, 2)`` array whose columns are ``label`` and ``1 - label``."""
    column = np.array(labels).reshape(n_rows, 1)
    return np.concatenate((column, 1 - column), axis=1)


def main():
    """Train the CNN on sliding windows of u-vectors, evaluate it and save it.

    Reads every pickled record from ``./5avg_vec.pickle``, splits the samples
    into a training part and an evaluation part, turns each part into sliding
    windows of length ``ul``, fits the model, prints the evaluation loss and
    accuracy, and saves the model to ``parser['cnn_mod_name']``.

    Relies on names defined elsewhere in the module: ``ul`` (window length),
    ``bs`` (batch size), ``inp`` (last-axis size of a sample), ``oup``,
    ``parser``, ``ConvolutionalNN`` and ``binary_classification_loss``.
    """
    u_vec_avg = []
    label_avg = []

    # NOTE(security): pickle.load can execute arbitrary code; this file must
    # come from a trusted source.
    with open('./5avg_vec.pickle', 'rb') as f:
        # The file holds several pickled records back to back; read until EOF.
        while True:
            try:
                pic = pickle.load(f)
            except EOFError:
                break
            u_vec_avg.append(pic['u-vector'])
            label_avg.append(pic['label'])
            print(pic['day'])
    u_vec_avg = np.array(u_vec_avg, dtype=np.int32)
    label_avg = np.array(label_avg, dtype=np.int32)

    total = len(u_vec_avg)

    # Training split: 4/5 of the whole `ul`-sized groups. The window count is
    # rounded down to a multiple of the batch size because the model is built
    # with a fixed batch shape.
    train_raw = (((total // ul) // 5) * 4) * ul - ul + 1
    n_train = train_raw - train_raw % bs  # number of training windows
    lnt = n_train + ul - 1  # number of samples needed for those windows

    # Evaluation split: taken from the samples starting at index n_train,
    # again rounded down to a multiple of the batch size.
    eval_raw = ((total - n_train) // ul) * ul - ul + 1
    n_eval = eval_raw - eval_raw % bs  # number of evaluation windows
    df = n_eval + ul - 1  # number of samples needed for those windows

    input1 = Input(shape=(
        ul,
        inp,
    ),
                   batch_shape=(bs, ul, inp),
                   dtype='float32')

    cnn = ConvolutionalNN(output_dim=oup, input_dim=inp)(input1)
    model = Model(inputs=[input1], outputs=[cnn])

    adam = optimizers.Adam(lr=0.001,
                           beta_1=0.9,
                           beta_2=0.999,
                           epsilon=None,
                           decay=0.0,
                           amsgrad=False)
    model.compile(loss=binary_classification_loss,
                  optimizer=adam,
                  metrics=[binary_classification_loss, 'accuracy'])

    # ---- training data -------------------------------------------------
    numpy_data = _collect_sliding_windows(u_vec_avg[:lnt], ul, n_train, inp)
    numpy_label = _two_column_labels(label_avg[:n_train], n_train)

    MODEL_SAVE_FOLDER_PATH = './modelCNN/'
    os.makedirs(MODEL_SAVE_FOLDER_PATH, exist_ok=True)

    model_path = MODEL_SAVE_FOLDER_PATH + '{epoch:02d}-{binary_classification_loss:.4f}.hdf5'
    cb_checkpoint = ModelCheckpoint(filepath=model_path,
                                    monitor='binary_classification_loss',
                                    verbose=1,
                                    save_best_only=True)
    cb_early_stopping = EarlyStopping(monitor='binary_classification_loss',
                                      patience=100)

    model.fit(numpy_data,
              numpy_label,
              epochs=parser['cnn_epoch'],
              batch_size=bs,
              callbacks=[cb_early_stopping, cb_checkpoint])

    # ---- evaluation data -----------------------------------------------
    numpy_data = _collect_sliding_windows(u_vec_avg[n_train:n_train + df],
                                          ul, n_eval, inp)
    numpy_label = _two_column_labels(label_avg[n_train:n_train + n_eval],
                                     n_eval)

    # score = [loss, binary_classification_loss metric, accuracy]
    score = model.evaluate(numpy_data, numpy_label, batch_size=bs)
    print(score[1], ":loss ", score[2], ":accuracy")

    model.save(parser['cnn_mod_name'])
    print("Save model: %s" % (parser['cnn_mod_name']))
示例#4
0
    def get_mel_to_instr_batched_dataset(self, sequence_length=(96), batch_size=64,
            buffer_size=10000, reduced_mel=False, reduced_instr=False, ratio=[.8,.1], stride=5):
        """
        Build shuffled, batched (melody, instrumental) sequence pairs and split them into
        datasets.

        Both the melody and the instrumental index arrays are cut into sliding windows of
        `sequence_length + 1` frames. The model input is the melody window without its last
        frame; the expected output is the instrumental window without its first frame, so both
        sequences contain `sequence_length` frames and the output is shifted one frame ahead
        of the input.

        @param sequence_length: an integer indicating the window length of each sequence we feed
                into the model when learning. Default 96.
        @param batch_size: an integer that determines the number of sequences we want to have in
                each batch that we'll feed to the model. Default 64.
        @param buffer_size: an integer used when shuffling the data. Default 10000.
        @param reduced_mel: a boolean determining if we want the melody dataset to contain indices
                into the vocabulary of the original melody frames or indices into a vocabulary of
                the reduced melody frames. Reduced frames are frames that only contain information
                about the activated notes, regardless of what octaves they belong to. Reduced frames
                from the melody vocabulary are of length 12, i.e. they only contain information
                about the activation status of the 12 musical notes (C, C#, D, D#, E, F, F#, G, G#,
                A, A# and B). Default False.
        @param reduced_instr: a boolean determining if we want the instrumental dataset to contain
                indices into the vocabulary of the original instrumental frames or indices into a
                vocabulary of the reduced instrumental frames. Reduced frames from the instrumental
                vocabulary are of length 24, i.e. one octave with exactly one activated note (the
                root note) and the other one with the activation status of any remaining note in
                the original frame. Default False.
        @param ratio: a list of 2 float numbers both between 0 and 1. The first number indicates
                the percentage of the final number of batches that we want to allocate as training
                data. The second is the percentage of total batch number to allocate as
                dev/validation data. The remaining will be allocated to the test data.
        @param stride: an integer. This represents the stride to be used when generating the
                sequences: each sequence is made of consecutive frames and the window is moved
                forward by `stride` frames from one sequence to the next. The stride has to be
                greater than 0. Default 5.

        @return the result of `self.__split_dataset__(dataset, ratio)`. Per the original
                documentation this is a 3-tuple holding respectively the training dataset, the
                dev/validation dataset and the test dataset, each yielding batches of
                (melody input sequence, instrumental target sequence).

        Example Usage:
        # Assuming you declared a MidiData object like so
        data = MidiData('path_to_tracks_info.csv', open_files=['file1.mid', 'file2.mid'])
        mel_to_instr_dataset =
            data.get_mel_to_instr_batched_dataset(sequence_length=500, batch_size=256)
        """
        # NOTE(review): `ratio` has a mutable list default; it is only read here -- confirm
        # that __split_dataset__ does not mutate it.
        mel_frames = self.__reduced_mel_as_int__ if reduced_mel else self.__mel_as_int__
        instr_frames = self.__reduced_instr_as_int__ if reduced_instr else self.__instr_as_int__

        # One extra frame per window so that input and target can be offset by one frame.
        window_length = sequence_length + 1

        mel_seq = tf.data.Dataset.from_tensor_slices(mel_frames)\
            .apply(sliding_window_batch(window_size=window_length, stride=stride))
        instr_seq = tf.data.Dataset.from_tensor_slices(instr_frames)\
            .apply(sliding_window_batch(window_size=window_length, stride=stride))

        mel_dataset = mel_seq.map(lambda x: x[:-1])
        instr_dataset = instr_seq.map(lambda x: x[1:])

        # Zip before shuffling so every melody window stays paired with its instrumental window.
        dataset = tf.data.Dataset.zip((mel_dataset, instr_dataset)).shuffle(buffer_size)\
            .batch(batch_size, drop_remainder=True)

        return self.__split_dataset__(dataset, ratio)