class DataLoader:
    """
    Loads extracted activations together with their corpus labels.

    Parameters
    ----------
    activations_dir : str
        Directory holding the extracted (train) activations.
    corpus : Corpus
        Corpus from which the label of every item is derived.
    test_activations_dir : str, optional
        Directory holding separately extracted test activations. When
        omitted, part of the train activations is held out as test set.
    test_corpus : Corpus, optional
        Corpus providing the test labels. Required whenever
        `test_activations_dir` is given.
    selection_func : SelectFunc, optional
        Predicate deciding whether a corpus item is taken into account.
        If a selection function was used during extraction, the same
        one should be passed here. Accepts every item by default.
    """

    def __init__(
        self,
        activations_dir: str,
        corpus: Corpus,
        test_activations_dir: Optional[str] = None,
        test_corpus: Optional[Corpus] = None,
        selection_func: SelectFunc = lambda sen_id, pos, example: True,
    ) -> None:
        assert corpus is not None, "`corpus`should be provided!"
        self.train_labels = create_labels_from_corpus(
            corpus, selection_func=selection_func
        )

        # A separate test set is only set up when its directory is given;
        # otherwise both test attributes stay None.
        has_test_set = test_activations_dir is not None
        self.test_activation_reader = (
            ActivationReader(test_activations_dir) if has_test_set else None
        )
        self.test_labels = None
        if has_test_set:
            assert test_corpus is not None, "`test_corpus` should be provided!"
            self.test_labels = create_labels_from_corpus(
                test_corpus, selection_func=selection_func
            )

        self.activation_reader = ActivationReader(activations_dir)
        self.data_len = len(self.activation_reader)

    def create_data_split(
        self,
        activation_name: ActivationName,
        data_subset_size: int = -1,
        train_test_split: float = 0.9,
    ) -> DataDict:
        """
        Builds a shuffled train/test split of the requested activations.

        Parameters
        ----------
        activation_name : ActivationName
            (layer, name) tuple selecting the activations to read.
        data_subset_size : int, optional
            Number of leading items to use. The default of -1 selects
            the complete data set.
        train_test_split : float
            Fraction of the shuffled items assigned to the train part.
            Ignored when separate test activations are available.
            Defaults to 0.9.

        Returns
        -------
        DataDict
            Dictionary with the keys ``train_x``, ``train_y``,
            ``test_x`` and ``test_y``.
        """
        use_full_set = data_subset_size == -1
        if not use_full_set:
            assert (
                0 < data_subset_size <= self.data_len
            ), "Size of subset can't be bigger than the full data set."

        activations = self.activation_reader.read_activations(activation_name)

        # Random order over the first `n_items` positions.
        n_items = self.data_len if use_full_set else data_subset_size
        order = np.random.choice(range(n_items), n_items, replace=False)
        shuffled_x = activations[order]
        shuffled_y = self.train_labels[order]

        if self.test_activation_reader is None:
            # No dedicated test set: hold out the tail of the shuffled data.
            cut = int(n_items * train_test_split)
            return {
                "train_x": shuffled_x[:cut],
                "train_y": shuffled_y[:cut],
                "test_x": shuffled_x[cut:],
                "test_y": shuffled_y[cut:],
            }

        return {
            "train_x": shuffled_x,
            "train_y": shuffled_y,
            "test_x": self.test_activation_reader.read_activations(
                activation_name
            ),
            "test_y": self.test_labels,
        }
def main(config):
    """
    Show t-SNE plots of layer-1 ``hx`` test activations for one genre.

    The data set variant is derived from substrings of ``config.name``:
    the representation ('raw', 'neume' or 'syllable'), the note encoding
    ('pitch' or 'interval') and the sequence length (20 if '_20_' occurs
    in the name, otherwise 30). For sequence positions 0 and 13, the
    activations of all test sequences whose genre is 'Responsory verse'
    are embedded in 2D with t-SNE and passed to ``show_tsne_plot``
    together with their labels shifted by one.

    Parameters
    ----------
    config : object
        Only ``config.name`` is read. It selects the input files below
        ``data/inputs/`` and the activations below ``output/<name>/``.

    Raises
    ------
    ValueError
        If ``config.name`` contains no known representation or note
        encoding, if 'Responsory verse' is missing from the genre
        vocabulary, or if no test sequence has that genre.
    """
    name = config.name

    # When several keywords occur in the name, the later candidate wins.
    representation = None
    for candidate in ('raw', 'neume', 'syllable'):
        if candidate in name:
            representation = candidate
    notes = None
    for candidate in ('pitch', 'interval'):
        if candidate in name:
            notes = candidate
    if representation is None or notes is None:
        raise ValueError(
            "config.name must contain one of 'raw'/'neume'/'syllable' and "
            "one of 'pitch'/'interval', got %r" % (name,))
    seq_length = 20 if '_20_' in name else 30

    test_reader = ActivationReader('output/' + name + '/activations/test')

    input_prefix = ('data/inputs/' + notes + '_' + str(seq_length) + '_'
                    + representation)
    test_corpus = import_corpus_from_path(
        input_prefix + '_mode_corpus_test.txt', ['sen', 'labels'])
    test_genre_corpus = import_corpus_from_path(
        input_prefix + '_string_genre_corpus_test.txt', ['sen', 'labels'])

    hx_1_test = test_reader.read_activations((1, 'hx'))

    # One label / genre per sequence, read at position 19 of each item.
    # NOTE(review): presumably the label is constant within a sequence, so
    # a fixed position is fine for both sequence lengths - confirm.
    n_sequences = hx_1_test.shape[0] // seq_length
    test_labels = np.zeros(n_sequences)
    test_genres = np.zeros(n_sequences)
    for i in range(n_sequences):
        test_labels[i] = test_corpus[i].labels[19]
        test_genres[i] = test_genre_corpus[i].labels[19]
    test_genres = test_genres.astype(int)
    test_labels = test_labels.astype(int)

    with open(input_prefix + '_genre_vocab.txt', encoding='utf-8') as vf:
        genre_vocab = [line.strip() for line in vf]

    # Report the three most frequent genres as (genre id, count) pairs.
    for genre in Counter(test_genres).most_common(3):
        print(genre_vocab[genre[0]], genre)

    # Select the plotted genre directly, so the plot does not depend on
    # it being among the most frequent genres.
    resp_verse = genre_vocab.index('Responsory verse')
    resp_verse_idx = np.where(test_genres == resp_verse)
    if resp_verse_idx[0].size == 0:
        raise ValueError(
            "no test sequence has the genre 'Responsory verse'")

    for position in (0, 13):
        # Activations are stored position by position per sequence, so
        # striding by the sequence length picks one position of each.
        activation_test = hx_1_test[position::seq_length]
        activation_test = activation_test[resp_verse_idx]
        x_emb = TSNE(n_components=2, verbose=2).fit_transform(activation_test)
        show_tsne_plot(x_emb, test_labels[resp_verse_idx] + 1)
def main(config):
    """
    Evaluate the layer-1 ``hx`` classifier per sequence position and genre.

    The data set variant is derived from substrings of ``config.name``:
    the representation ('raw', 'neume' or 'syllable'), the note encoding
    ('pitch' or 'interval'), the sequence length (20 if '_20_' occurs in
    the name, otherwise 30) and whether the '_string_mode' corpus is used
    ('embedding' in the name). For every sequence position the stored
    classifier predicts labels from the test activations, and the accuracy
    is computed separately per genre. The per-position accuracies of
    'Antiphon' and 'Responsory verse' are written to
    ``output/<name>/scores.json``. The 25 most frequent genres are
    plotted, and the figure is saved to ``output/<name>/eval_class.jpg``
    if the user answers ``y`` at the prompt.

    Parameters
    ----------
    config : object
        Only ``config.name`` is read. It selects the input files below
        ``data/inputs/`` and the artefacts below ``output/<name>/``.

    Raises
    ------
    ValueError
        If ``config.name`` contains no known representation or note
        encoding, if 'Antiphon' or 'Responsory verse' is missing from the
        genre vocabulary, or if no test sequence has one of those genres.
    """
    name = config.name

    # When several keywords occur in the name, the later candidate wins.
    representation = None
    for candidate in ('raw', 'neume', 'syllable'):
        if candidate in name:
            representation = candidate
    notes = None
    for candidate in ('pitch', 'interval'):
        if candidate in name:
            notes = candidate
    if representation is None or notes is None:
        raise ValueError(
            "config.name must contain one of 'raw'/'neume'/'syllable' and "
            "one of 'pitch'/'interval', got %r" % (name,))
    seq_length = 20 if '_20_' in name else 30

    classifier = joblib.load('output/' + name + '/classifiers/hx_l1.joblib')
    test_reader = ActivationReader('output/' + name + '/activations/test')

    input_prefix = ('data/inputs/' + notes + '_' + str(seq_length) + '_'
                    + representation)
    mode_infix = '_string_mode' if 'embedding' in name else '_mode'
    test_corpus = import_corpus_from_path(
        input_prefix + mode_infix + '_corpus_test.txt', ['sen', 'labels'])
    # The genre corpus is needed for every configuration.
    # NOTE(review): the non-embedding setup is assumed to use the same
    # '_string_genre_corpus_test.txt' file - confirm.
    test_genre_corpus = import_corpus_from_path(
        input_prefix + '_string_genre_corpus_test.txt', ['sen', 'labels'])

    hx_1_test = test_reader.read_activations((1, 'hx'))

    # One label / genre per sequence, read at position 19 of each item.
    # NOTE(review): presumably the label is constant within a sequence, so
    # a fixed position is fine for both sequence lengths - confirm.
    n_sequences = hx_1_test.shape[0] // seq_length
    test_labels = np.zeros(n_sequences)
    test_genres = np.zeros(n_sequences)
    for i in range(n_sequences):
        test_labels[i] = test_corpus[i].labels[19]
        test_genres[i] = test_genre_corpus[i].labels[19]
    test_genres = test_genres.astype(int)
    test_labels = test_labels.astype(int)

    with open(input_prefix + '_genre_vocab.txt', encoding='utf-8') as vf:
        genre_vocab = [line.strip() for line in vf]

    resp_verse = genre_vocab.index('Responsory verse')
    antiphon = genre_vocab.index('Antiphon')

    # Report the most frequent genres as (genre id, count) pairs; these
    # are the genres that end up in the plot.
    most_common_genres = Counter(test_genres).most_common(25)
    plotted_genres = []
    for genre in most_common_genres:
        print(genre_vocab[genre[0]], genre)
        plotted_genres.append(genre[0])

    # The exported genres are always scored, even when they are not among
    # the most frequent ones.
    scored_genres = list(plotted_genres)
    for genre_id in (antiphon, resp_verse):
        if genre_id not in scored_genres:
            scored_genres.append(genre_id)
    genre_ind_dict = {
        genre_id: np.where(test_genres == genre_id)
        for genre_id in scored_genres
    }
    for genre_name, genre_id in (('Antiphon', antiphon),
                                 ('Responsory verse', resp_verse)):
        if genre_ind_dict[genre_id][0].size == 0:
            raise ValueError(
                'no test sequence has the genre %r' % (genre_name,))

    test_scores_genre = {
        genre_id: np.zeros(seq_length) for genre_id in genre_ind_dict
    }
    for position in range(seq_length):
        # Striding by the sequence length picks this position of every
        # sequence.
        test_pred = classifier.predict(hx_1_test[position::seq_length])
        for genre_id, idx in genre_ind_dict.items():
            hits = test_pred[idx] == test_labels[idx]
            test_scores_genre[genre_id][position] = hits.mean()

    d = {
        'antiphon': test_scores_genre[antiphon].tolist(),
        'resp_verse': test_scores_genre[resp_verse].tolist()
    }
    with open('output/' + name + '/scores.json', 'w') as f:
        json.dump(d, f)

    positions = range(1, seq_length + 1)
    for genre_id in plotted_genres:
        plt.plot(positions, test_scores_genre[genre_id],
                 label=genre_vocab[genre_id])
    plt.legend()
    plt.xticks(list(positions))
    plt.ylim(0, 1)
    # Keep a handle on the figure before showing it, so it can still be
    # saved after the window has been closed.
    fig = plt.gcf()
    plt.show()

    ans = input('Do you want to save this figure? y/n\n')
    if ans == 'y':
        fig.savefig('output/' + name + '/eval_class.jpg')
    print('done')