Exemplo n.º 1
0
class DataLoader:
    """ Reads in pickled activations that have been extracted.

    On construction the labels are created from the corpus (or corpora)
    and an ``ActivationReader`` is opened for each activation directory.
    The activations themselves are only read when `create_data_split`
    is called.

    Parameters
    ----------
    activations_dir : str
        Directory containing the extracted activations.
    corpus : Corpus
        Corpus containing the labels for each sentence.
    test_activations_dir : str, optional
        Directory containing the extracted test activations. If not
        provided the train activation set will be split and partially
        used as test set.
    test_corpus : Corpus, optional
        Corpus containing the test labels for each sentence. Must be
        provided if `test_activations_dir` is provided.
    selection_func : SelectFunc, optional
        Selection function that determines whether a corpus item should
        be taken into account. If such a function has been used during
        extraction, make sure to pass it along here as well.

    Raises
    ------
    AssertionError
        If `corpus` is None, or if `test_activations_dir` is given
        without a `test_corpus`.
    """
    def __init__(
        self,
        activations_dir: str,
        corpus: Corpus,
        test_activations_dir: Optional[str] = None,
        test_corpus: Optional[Corpus] = None,
        selection_func: SelectFunc = lambda sen_id, pos, example: True,
    ) -> None:
        assert corpus is not None, "`corpus` should be provided!"
        # Check the argument combination up front, so a missing test
        # corpus is reported before any labels or readers are created.
        if test_activations_dir is not None:
            assert test_corpus is not None, "`test_corpus` should be provided!"

        self.train_labels = create_labels_from_corpus(
            corpus, selection_func=selection_func)

        if test_activations_dir is not None:
            self.test_activation_reader = ActivationReader(
                test_activations_dir)
            self.test_labels = create_labels_from_corpus(
                test_corpus, selection_func=selection_func)
        else:
            # No separate test set: `create_data_split` will carve the
            # test portion out of the train activations instead.
            self.test_activation_reader = None
            self.test_labels = None

        self.activation_reader = ActivationReader(activations_dir)
        self.data_len = len(self.activation_reader)

    def create_data_split(
        self,
        activation_name: ActivationName,
        data_subset_size: int = -1,
        train_test_split: float = 0.9,
    ) -> DataDict:
        """ Creates train/test data split of activations

        The train activations and labels are shuffled with the same
        random permutation (drawn from numpy's global random state), so
        they stay aligned.

        Parameters
        ----------
        activation_name : ActivationName
            (layer, name) tuple indicating the activations to be read in
        data_subset_size : int, optional
            Subset size of data to train on. Defaults to -1, indicating
            the entire data set. When set, the subset consists of the
            first `data_subset_size` items, in shuffled order.
        train_test_split : float
            Percentage of the train/test split. If separate test
            activations are provided this split won't be used.
            Defaults to 0.9/0.1.

        Returns
        -------
        data_dict : DataDict
            Dictionary with the keys ``train_x``, ``train_y``,
            ``test_x`` and ``test_y``.

        Raises
        ------
        AssertionError
            If `data_subset_size` is not -1 and does not lie in the
            range ``(0, data_len]``.
        """

        if data_subset_size != -1:
            assert 0 < data_subset_size <= self.data_len, (
                "Size of subset must be positive and not bigger than "
                "the full data set.")

        train_activations = self.activation_reader.read_activations(
            activation_name)

        # Shuffle activations and labels with one shared permutation.
        # The permutation only covers the first `data_size` indices, so
        # a subset never contains items beyond that position.
        data_size = self.data_len if data_subset_size == -1 else data_subset_size
        indices = np.random.permutation(data_size)
        train_activations = train_activations[indices]
        train_labels = self.train_labels[indices]

        if self.test_activation_reader is not None:
            # A dedicated test set was supplied: use it as is and keep
            # all (shuffled) train items for training.
            test_activations = self.test_activation_reader.read_activations(
                activation_name)
            test_labels = self.test_labels
        else:
            # The first `split` shuffled items form the train set, the
            # remainder forms the test set.
            split = int(data_size * train_test_split)

            test_activations = train_activations[split:]
            test_labels = train_labels[split:]
            train_activations = train_activations[:split]
            train_labels = train_labels[:split]

        return {
            "train_x": train_activations,
            "train_y": train_labels,
            "test_x": test_activations,
            "test_y": test_labels,
        }
Exemplo n.º 2
0
def main(config):
    """Show t-SNE plots of test activations for 'Responsory verse' items.

    The experiment settings are derived from substrings of
    ``config.name``: the representation ('raw', 'neume' or 'syllable'),
    the note encoding ('pitch' or 'interval') and the sequence length
    (20 if the name contains '_20_', otherwise 30). When several
    alternatives occur in the name, the one listed last wins.

    The (1, 'hx') test activations are read, the rows at positions 0 and
    13 of every sequence are selected, restricted to the sequences whose
    genre is 'Responsory verse', embedded in 2D with t-SNE and passed to
    ``show_tsne_plot`` together with the mode labels shifted by one.

    Parameters
    ----------
    config
        Object with a ``name`` attribute identifying the experiment.

    Raises
    ------
    ValueError
        If ``config.name`` contains no known representation or note
        encoding, or if 'Responsory verse' is missing from the genre
        vocabulary file.
    """
    name = config.name

    # Resolve the settings first, so an unsupported name is reported
    # before any file is opened.
    representations = [r for r in ('raw', 'neume', 'syllable') if r in name]
    if not representations:
        raise ValueError(
            "config.name must contain a representation "
            "('raw', 'neume' or 'syllable'): %r" % (name,))
    representation = representations[-1]

    note_encodings = [n for n in ('pitch', 'interval') if n in name]
    if not note_encodings:
        raise ValueError(
            "config.name must contain a notes encoding "
            "('pitch' or 'interval'): %r" % (name,))
    notes = note_encodings[-1]

    seq_length = 20 if '_20_' in name else 30

    # Common prefix of all input files belonging to this experiment.
    input_prefix = ('data/inputs/' + notes + '_' + str(seq_length) + '_' +
                    representation)

    test_reader = ActivationReader('output/' + name + '/activations/test')
    test_corpus = import_corpus_from_path(
        input_prefix + '_mode_corpus_test.txt', ['sen', 'labels'])
    test_genre_corpus = import_corpus_from_path(
        input_prefix + '_string_genre_corpus_test.txt', ['sen', 'labels'])

    hx_1_test = test_reader.read_activations((1, 'hx'))
    # Assumes the activations hold `seq_length` consecutive rows per
    # sequence -- TODO confirm against the extraction step.
    n_sequences = hx_1_test.shape[0] // seq_length

    # One mode label and one genre label per sequence, both taken from
    # label position 19.
    # NOTE(review): index 19 is also used when seq_length is 30; this
    # presumably relies on the labels being constant within a sequence.
    test_labels = np.array(
        [test_corpus[i].labels[19] for i in range(n_sequences)],
        dtype=float).astype(int)
    test_genres = np.array(
        [test_genre_corpus[i].labels[19] for i in range(n_sequences)],
        dtype=float).astype(int)

    with open(input_prefix + '_genre_vocab.txt', 'rb') as vf:
        genre_vocab = [raw.decode('utf-8').strip() for raw in vf]

    # Report the three most frequent genres in the test set.
    for genre in Counter(test_genres).most_common(3):
        print(genre_vocab[genre[0]], genre)

    # Select the responsory verses directly, so the plots do not depend
    # on that genre being among the most frequent ones.
    resp_verse = genre_vocab.index('Responsory verse')
    resp_verse_idx = np.where(test_genres == resp_verse)

    for position in [0, 13]:
        # Every `seq_length`-th row, starting at `position`, gives the
        # activation at that position for each sequence.
        activation_test = hx_1_test[position::seq_length][resp_verse_idx]
        x_emb = TSNE(n_components=2, verbose=2).fit_transform(activation_test)
        show_tsne_plot(x_emb, test_labels[resp_verse_idx] + 1)
Exemplo n.º 3
0
def main(config):
    """Evaluate the stored classifier per sequence position and per genre.

    The experiment settings are derived from substrings of
    ``config.name``: the representation ('raw', 'neume' or 'syllable'),
    the note encoding ('pitch' or 'interval'), the sequence length (20
    if the name contains '_20_', otherwise 30) and whether the
    '_string' variant of the mode corpus is used ('embedding' in the
    name). When several alternatives occur in the name, the one listed
    last wins.

    For every position in the sequence the classifier predicts the mode
    from the (1, 'hx') test activations at that position, and the
    accuracy is computed separately for each of the 25 most frequent
    genres. The 'Antiphon' and 'Responsory verse' accuracies are written
    to ``output/<name>/scores.json``; the accuracies of the most
    frequent genres are plotted, and the figure is saved to
    ``output/<name>/eval_class.jpg`` if the user answers 'y'.

    Parameters
    ----------
    config
        Object with a ``name`` attribute identifying the experiment.

    Raises
    ------
    ValueError
        If ``config.name`` contains no known representation or note
        encoding, or if 'Antiphon' / 'Responsory verse' is missing from
        the genre vocabulary file.
    """
    name = config.name

    # Resolve the settings first, so an unsupported name is reported
    # before any file is opened.
    representations = [r for r in ('raw', 'neume', 'syllable') if r in name]
    if not representations:
        raise ValueError(
            "config.name must contain a representation "
            "('raw', 'neume' or 'syllable'): %r" % (name,))
    representation = representations[-1]

    note_encodings = [n for n in ('pitch', 'interval') if n in name]
    if not note_encodings:
        raise ValueError(
            "config.name must contain a notes encoding "
            "('pitch' or 'interval'): %r" % (name,))
    notes = note_encodings[-1]

    seq_length = 20 if '_20_' in name else 30

    # Common prefix of all input files belonging to this experiment.
    input_prefix = ('data/inputs/' + notes + '_' + str(seq_length) + '_' +
                    representation)
    # Embedding experiments read the '_string' variant of the mode corpus.
    mode_infix = '_string' if 'embedding' in name else ''

    classifier = joblib.load('output/' + name + '/classifiers/hx_l1.joblib')
    test_reader = ActivationReader('output/' + name + '/activations/test')

    test_corpus = import_corpus_from_path(
        input_prefix + mode_infix + '_mode_corpus_test.txt',
        ['sen', 'labels'])
    # The genre corpus is needed for every configuration, not only for
    # embedding experiments.
    test_genre_corpus = import_corpus_from_path(
        input_prefix + '_string_genre_corpus_test.txt', ['sen', 'labels'])

    hx_1_test = test_reader.read_activations((1, 'hx'))
    # Assumes the activations hold `seq_length` consecutive rows per
    # sequence -- TODO confirm against the extraction step.
    n_sequences = hx_1_test.shape[0] // seq_length

    # One mode label and one genre label per sequence, both taken from
    # label position 19.
    # NOTE(review): index 19 is also used when seq_length is 30; this
    # presumably relies on the labels being constant within a sequence.
    test_labels = np.array(
        [test_corpus[i].labels[19] for i in range(n_sequences)],
        dtype=float).astype(int)
    test_genres = np.array(
        [test_genre_corpus[i].labels[19] for i in range(n_sequences)],
        dtype=float).astype(int)

    with open(input_prefix + '_genre_vocab.txt', 'rb') as vf:
        genre_vocab = [raw.decode('utf-8').strip() for raw in vf]

    resp_verse = genre_vocab.index('Responsory verse')
    antiphon = genre_vocab.index('Antiphon')

    # Sequence indices of each of the 25 most frequent genres; these are
    # the genres that end up in the plot.
    genre_ind_dict = {}
    for genre in Counter(test_genres).most_common(25):
        print(genre_vocab[genre[0]], genre)
        genre_ind_dict[genre[0]] = np.where(test_genres == genre[0])

    # The two genres written to scores.json are always scored, even when
    # they are not among the most frequent ones.
    scored_genres = dict(genre_ind_dict)
    for genre in (antiphon, resp_verse):
        if genre not in scored_genres:
            scored_genres[genre] = np.where(test_genres == genre)

    test_scores_genre = {
        genre: np.zeros(seq_length) for genre in scored_genres
    }

    for position in range(seq_length):
        # Every `seq_length`-th row, starting at `position`, gives the
        # activation at that position for each sequence.
        test_pred = classifier.predict(hx_1_test[position::seq_length])
        for genre, genre_idx in scored_genres.items():
            correct = test_pred[genre_idx] == test_labels[genre_idx]
            test_scores_genre[genre][position] = correct.mean()

    d = {
        'antiphon': test_scores_genre[antiphon].tolist(),
        'resp_verse': test_scores_genre[resp_verse].tolist()
    }

    with open('output/' + name + '/scores.json', 'w') as f:
        json.dump(d, f)

    positions = list(range(1, seq_length + 1))
    for genre in genre_ind_dict:
        plt.plot(positions,
                 test_scores_genre[genre],
                 label=genre_vocab[genre])
    plt.legend()
    plt.xticks(positions)
    plt.ylim(0, 1)
    # Grab the figure before showing it, so it can still be saved after
    # the window has been closed.
    fig = plt.gcf()
    plt.show()
    ans = input('Do you want to save this figure? y/n\n')
    if ans == 'y':
        fig.savefig('output/' + name + '/eval_class.jpg')

    print('done')