def __init__(self, use_config=True, corpora='UD_Russian-SynTagRus', task_type='POS', verbose=1, class_index=3):
    """Set up evaluation of the level-1 CNN model.

    Resolves task/corpora (from the cnn config or from the arguments),
    locates the data and model directories relative to this file, and
    loads the pickled test set.
    """
    # Task and corpus either come from the cnn config or are passed in.
    if use_config:
        self.config = Config(model_type='cnn')
        self.task_type = self.config.get('Task_type')
        self.corpora = self.config.get('Corpora')
    else:
        self.task_type = task_type
        self.corpora = corpora
    logging.info('\nModel test for 1 level model.')
    logging.info("Task: {}".format(self.task_type))
    logging.info("Corpora: {}".format(self.corpora))
    logging.info("Label index: {}".format(str(class_index)))
    # All paths are anchored at the directory containing this source file.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    self.data_path = os.path.abspath(base_dir + '/../data/' + self.corpora + '/cnn/model_level_1/')
    self.model_path = os.path.abspath(base_dir + '/../tagger_models/')
    self.x_test = load_bin_data(self.data_path + '/x_test_cnn1level.pkl')
    if verbose == 1:
        print('x_test shape:', self.x_test.shape)
    self.y_test = load_grammatical_cat_model1(self.data_path, self.task_type, verbose=1)
    # Accuracy accumulator filled in by the evaluation routine.
    self.estimation = 0
def __init__(self):
    """Run the statistics pipeline over the test and train splits of the configured corpus."""
    self.config = Config(model_type='bilstm')
    base_dir = os.path.dirname(os.path.abspath(__file__))
    self.data_path = os.path.abspath(base_dir + '/../data/')
    corpus_root = self.data_path + '/' + self.config.get('Corpora')
    # Same processing order as before: test split first, then train.
    for split_path in (corpus_root + '/test', corpus_root + '/train'):
        self.stat_pipline(split_path)
def __init__(self):
    """Load the bilstm char-embedding feature data and prepare the train and test splits."""
    self.config = Config(model_type='bilstm')
    self.sent_max_len = self.config.get('Sent_max_length')
    self.corpora_limit = self.config.get('Corpora_sent_limit')
    base_dir = os.path.dirname(os.path.abspath(__file__))
    self.data_path = os.path.abspath(base_dir + '/../data/')
    feature_file = '%s/%s/bilstm/char_emb_rnn_feature_data.pkl' % (
        self.data_path, self.config.get('Corpora'))
    # Only the word index is kept from the pickled feature bundle.
    self.word2ind = load_bin_data(feature_file)['word2index']
    # Train first, then test — same order as the original pipeline.
    for split in ('train', 'test'):
        self.preparator(data_name=split)
def __init__(self):
    """Prepare evaluation for the bilstm model: print the run banner, resolve paths, and load the test data."""
    self.config = Config(model_type='bilstm')
    self.task_type = self.config.get('Task_type')
    corpus = self.config.get('Corpora')
    banner = '#' * 100
    print(banner)
    print('Task:', self.task_type)
    print('Corpora:', corpus)
    print(banner)
    base_dir = os.path.dirname(os.path.abspath(__file__))
    self.data_path = os.path.abspath('%s/../data/%s/' % (base_dir, corpus))
    self.model_path = os.path.abspath(base_dir + '/../tagger_models/')
    self.x_test = self.load_binary_data(self.data_path + '/x_test.pkl')
    print('X test shape:', self.x_test.shape)
    self.y_test = self.load_grammatical_cat()
    # The trained model is attached later by the evaluation routine.
    self.model = None
def __init__(self):
    """Build the w2v embedding matrix for every unique token of the corpus and persist it."""
    self.config = Config(model_type='bilstm')
    base_dir = os.path.dirname(os.path.abspath(__file__))
    self.data_path = os.path.abspath(base_dir + '/../data/')
    corpus_root = self.data_path + '/' + self.config.get('Corpora')
    # Tokens are collected from both splits: test first, then train.
    sentences = load_data(corpus_root + '/test') + load_data(corpus_root + '/train')
    # NOTE(review): `Word2Vec.load_word2vec_format` is the legacy gensim API;
    # newer gensim exposes it on KeyedVectors — verify the pinned gensim version.
    self.model = gensim.models.Word2Vec.load_word2vec_format(
        self.data_path + '/' + "w2v_models/mix_corpora_5_10_300_skip_neg.bin",
        binary=True)
    vocab = self.unique_tokens(sentences)
    embeddings = self.form_emb_vocab(vocab)
    print('vocabulary:', len(vocab))
    print('char embeddings:', embeddings.shape)
    self.save_emb(('w2v_matrix', embeddings))
def __init__(self):  # FIX: original read `def __init__(self'):` — stray quote was a syntax error
    """Prepare training of the bilstm model.

    Loads the char and w2v embedding feature bundles, the encoded training
    inputs, and the network hyper-parameters from the bilstm config.
    """
    self.config = Config(model_type='bilstm')
    self.task_type = self.config.get('Task_type')
    print('#' * 100)
    print('Task:', self.task_type)
    print('Corpora:', self.config.get('Corpora'))
    print('#' * 100)
    file_path = os.path.split(os.path.abspath(__file__))[0]
    self.data_path = os.path.abspath(file_path + '/../data/%s/' % (self.config.get('Corpora'),))
    self.model_path = os.path.abspath(file_path + '/../tagger_models/')
    char_emb_feature = self.load_binary_data(self.data_path + '/char_emb_feature_data.pkl')
    w2v_emb_feature = self.load_binary_data(self.data_path + '/w2v_emb_feature_data.pkl')
    # NOTE(review): casting an embedding matrix to int64 looks suspicious
    # (embeddings are normally float) — kept as-is, but confirm intent.
    w2v_emb_feature['w2v_matrix'] = w2v_emb_feature['w2v_matrix'].astype('int64')
    self.word2ind = char_emb_feature['word2index']
    self.sent_max_len = char_emb_feature['max_sent_length']
    self.x_train = self.load_binary_data(self.data_path + '/x_train.pkl')
    # +1 reserves index 0 for padding.
    self.max_features = len(self.word2ind) + 1
    # Network hyper-parameters; the options dict is fetched once.
    net_options = self.config.get('Network_options')
    self.random_embedding_size = net_options.get('random_embedding_size')
    self.lstm_hidden_size = net_options.get('lstm_hidden_size')
    self.dense_hidden_size = net_options.get('dense_hidden_size')
    self.batch_size = net_options.get('batch_size')
    self.epoch = net_options.get('training_epoch')
    self.data_for_emb_layers = {
        'char': char_emb_feature['char_matrix'],
        'w2v': w2v_emb_feature['w2v_matrix']
    }
    self.y_train, self.out_size = self.load_grammatical_cat()
    print('data embedding char shape:', self.data_for_emb_layers['char'].shape,
          self.data_for_emb_layers['char'].dtype)
    print('data embedding w2v shape:', self.data_for_emb_layers['w2v'].shape,
          self.data_for_emb_layers['w2v'].dtype)
    # The network is built later by the training routine.
    self.model = None
def __init__(self, model_type='bilstm'):
    """Build and persist char-level embedding features for the configured corpus.

    Corpus stats for reference — gicrya: sent_max_len = 55 (optimal 55, max 110),
    max_token_length = 35; network_type: cnn or lstm.
    """
    self.config = Config(model_type=model_type)
    self.sent_max_len = self.config.get('Sent_max_length')
    base_dir = os.path.dirname(os.path.abspath(__file__))
    self.data_path = os.path.abspath(base_dir + '/../data/')
    corpus_root = self.data_path + '/' + self.config.get('Corpora')
    # Vocabulary is drawn from both splits: test first, then train.
    sentences = load_data(corpus_root + '/test') + load_data(corpus_root + '/train')
    token_seqs = seq_form(sentences, data_type='x')
    vocab = unique_elements(token_seqs)
    self.unique_symbols = unique_chars(token_seqs)
    self.max_token_length = max(len(token) for token in vocab)
    self.word2ind, self.ind2word = self.token_encode(vocab)
    char_embeddings = self.char_matrix(vocab)
    print('vocabulary:', len(vocab))
    print('unique_symbols:', len(self.unique_symbols))
    print('Maximum sequence length:', self.sent_max_len)
    print('Maximum token length:', self.max_token_length)
    print('char embeddings:', char_embeddings.shape)
    # Persist everything needed to reproduce the encoding at train/test time.
    self.save_emb(('unique_symbols', self.unique_symbols),
                  ('unique_tokens', vocab),
                  ('word2index', self.word2ind),
                  ('max_sent_length', self.sent_max_len),
                  ('char_matrix', char_embeddings))
def __init__(self, use_config=True, corpora='UD_Russian-SynTagRus',
             task_type='POS', class_index=3, dev=True, verbose=1,
             prob_cnn_emb_layer_name="dense_3"):
    """Extract activation-based ("probability") embeddings for every unique
    token from the trained level-1 CNN, to be used by the level-2 model.

    Loads the encoded token sets, maps each unique token to the activations
    of the configured dense layer of the level-1 model, and persists the
    resulting token -> vector mapping.
    """
    # Task/corpus/layer either come from the cnn config or from the arguments.
    if use_config:
        self.config = Config(model_type='cnn')
        self.task_type = self.config.get('Task_type')
        self.class_index = self.config.get(
            'Classification_tasks')['UD2']['POS'][0]
        self.corpora = self.config.get('Corpora')
        self.prob_cnn_emb_layer_name = self.config.get(
            'Network_options').get('prob_cnn_emb_layer_name')
    else:
        self.task_type = task_type
        self.corpora = corpora
        self.class_index = class_index
        self.prob_cnn_emb_layer_name = prob_cnn_emb_layer_name
    print('CNNProbEmbeddings for 2 level model.')
    print('Task:', self.task_type)
    print('Corpora:', self.corpora)
    print('Label index:', class_index)
    # Paths are anchored at this source file's directory.
    file_path = os.path.split(os.path.abspath(__file__))[0]
    self.data_path = os.path.abspath(
        file_path + '/../data/%s/cnn/model_level_1/' % (self.corpora, ))
    self.model_path = os.path.abspath(file_path + '/../tagger_models/')
    char_emb_feature = self.load_binary_data(
        self.data_path + '/char_emb_cnn1_feature_data_%s.pkl' % (self.task_type, ))
    self.ind2symbol = char_emb_feature['ind2symbol']
    self.max_token_length = char_emb_feature['max_token_length']
    self.x_train = self.load_binary_data(self.data_path + '/x_train_cnn1level.pkl')
    self.x_test = self.load_binary_data(self.data_path + '/x_test_cnn1level.pkl')
    if dev:
        self.x_dev = self.load_binary_data(self.data_path + '/x_dev_cnn1level.pkl')
    if verbose == 1:
        print("Loading char_emb_cnn1_feature_data_%s ..." % (self.task_type, ))
        print('x_train shape:', self.x_train.shape)
        print('x_test shape:', self.x_test.shape)
        # NOTE(review): when dev=False and verbose==1 this reads self.x_dev,
        # which was never loaded — likely AttributeError; confirm callers
        # always pass dev=True (the default) when verbose.
        print('x_dev shape:', self.x_dev.shape)
    ################################################################################################################
    # Map each decoded token string back to its encoded char-index vector;
    # index 0 is treated as padding and skipped when decoding.
    str2vector = {}
    for el_ in self.x_train:
        str2vector[''.join([self.ind2symbol[s] for s in el_ if s != 0])] = el_
    for _el in self.x_test:
        str2vector[''.join([self.ind2symbol[s] for s in _el if s != 0])] = _el
    if dev:
        for el in self.x_dev:
            str2vector[''.join([self.ind2symbol[s] for s in el if s != 0])] = el
    str2vector = OrderedDict(str2vector)
    if verbose == 1:
        print("Unique_tokens:", len(str2vector))
    ################################################################################################################
    # Prepend an all-zero "_null_" entry so index 0 has a vector too.
    null_word = [0 for i in range(self.max_token_length)]
    null_vector = np.array(null_word)
    str2vector.update({'_null_': null_vector})
    str2vector.move_to_end('_null_', last=False)
    str2vector = [(el, str2vector[el]) for el in str2vector]
    if verbose == 1:
        print("Checking null word:", str2vector[0])
    ################################################################################################################
    non_tuned_embeddings = np.array([el[1] for el in str2vector])
    if verbose == 1:
        print("Non tune embeddings:", non_tuned_embeddings.shape)
    ################################################################################################################
    if verbose == 1:
        print('Loading cnn_1level_model_%s_%s.pkl' % (
            self.corpora,
            self.task_type,
        ))
    self.model = load_model(self.model_path + '/cnn_1level_model_%s_%s.pkl' % (
        self.corpora,
        self.task_type,
    ))
    # Activations of the configured dense layer become the token embeddings.
    activation_values = self.get_prob_from_layer(
        layer_name=self.prob_cnn_emb_layer_name, data=non_tuned_embeddings)
    if verbose == 1:
        print('activity_values_train shape:', activation_values.shape)
    ################################################################################################################
    # For the "All" task, also append the POS model's class probabilities.
    if self.task_type == "All":
        model_pos = load_model(self.model_path + '/cnn_1level_model_%s_%s.pkl' % (
            self.corpora,
            "POS",
        ))
        pr = model_pos.predict(non_tuned_embeddings, verbose=1)
        activation_values = np.concatenate((activation_values, pr), axis=1)
        if verbose == 1:
            print("Predictons shape:", pr.shape)
            print("Predictons shape + activations:", activation_values.shape)
        # TODO: check the prediction produced for the _null_ token
    ################################################################################################################
    # Final mapping token -> activation vector, preserving insertion order.
    result_train = OrderedDict(
        list(zip([el[0] for el in str2vector], activation_values)))
    if verbose == 1:
        print("Checking null word:", len(result_train['_null_']))
    self.save_binary(
        result_train,
        '_%s_%s' % (self.prob_cnn_emb_layer_name, self.task_type))
    # Release the large intermediates explicitly to keep peak memory down.
    if dev:
        del (result_train, activation_values, self.model, str2vector,
             null_vector, null_word, char_emb_feature, self.ind2symbol,
             self.max_token_length, self.x_train, self.x_test, self.x_dev,
             non_tuned_embeddings)
    else:
        del (result_train, activation_values, self.model, str2vector,
             null_vector, null_word, char_emb_feature, self.ind2symbol,
             self.max_token_length, self.x_train, self.x_test,
             non_tuned_embeddings)
# потом идет 1 - emb_vocab.append(tokens_tune_vectors['_null_']) s_enc.append(1) count_new_tokens += 1 x_enc.append(s_enc) x = pad_sequences(x_enc, maxlen=max_sent_length, value=1) print('x_shape: %s;' % (name, ), x.shape) print('count_new_tokens:', count_new_tokens) print('count_tokens:', count_tokens) return x label_index = {'UPOS': 3, 'XPOS': 4} # ---------------------------------------------------------------------------------------------------------------------- config_models = Config(model_type='models') config_language_id = Config(model_type='tracks') model_dir = '../tagger_models/' test_files_udipipe_dir = '../data/conll2017_x/ud-test-v2.0-conll2017/input/conll17-ud-test-2017-05-09/' data_path = os.path.abspath( os.path.split(os.path.abspath(__file__))[0] + '/../data/') for corpora_name in config_models.get("models"): if corpora_name == 'UD_Russian': for tag_types in config_models.get("models")[corpora_name]: best_restart = config_models.get( "models")[corpora_name][tag_types].get('Best restart #')
def __init__(self, use_config=True, corpora='UD_Russian-SynTagRus',
             task_type='POS', class_index=3, dev=False, verbose=1,
             prob_cnn_emb_layer_name="dense_3"):
    """Prepare the encoded x/y data for the level-2 CNN model.

    Loads the tuned token vectors produced by the level-1 model, encodes the
    labels and the token sequences of the test/train (and optionally dev)
    splits, and persists everything under
    ``<data>/<corpora>/cnn/model_level_2/``.
    """
    # Task/corpus/layer either come from the cnn config or from the arguments.
    if use_config:
        self.config = Config(model_type='cnn')
        self.task_type = self.config.get('Task_type')
        self.class_index = self.config.get(
            'Classification_tasks')['UD2']['POS'][0]
        self.corpora = self.config.get('Corpora')
        self.prob_cnn_emb_layer_name = self.config.get(
            'Network_options').get('prob_cnn_emb_layer_name')
    else:
        self.task_type = task_type
        self.corpora = corpora
        self.class_index = class_index
        self.prob_cnn_emb_layer_name = prob_cnn_emb_layer_name
    print('\nData preparation for 2 level model.')
    print('Task:', self.task_type)
    print('Corpora:', self.corpora)
    print('Label index:', class_index)
    file_path = os.path.split(os.path.abspath(__file__))[0]
    self.data_path = os.path.abspath(file_path + '/../data/')
    self.tuned_vectors_path = os.path.abspath(
        file_path + '/../data/%s/cnn/model_level_1/' % (self.corpora, ))
    # Token -> activation-vector mapping produced by the level-1 model.
    tokens_tune_vectors = load_bin_data(
        self.tuned_vectors_path + '/cnn_prob_emb%s.pkl' %
        ('_%s_%s' % (self.prob_cnn_emb_layer_name, self.task_type)))
    sent_test = load_data(self.data_path + '/' + self.corpora + '/test')
    sent_train = load_data(self.data_path + '/' + self.corpora + '/train')
    if dev:
        sent_valid = load_data(self.data_path + '/' + self.corpora + '/dev')
    else:
        sent_valid = []
    # Per-sentence token sequences.
    test_tokens_data_seq = seq_form(sent_test)
    train_tokens_data_seq = seq_form(sent_train)
    if dev:
        dev_tokens_data_seq = seq_form(sent_valid)
    else:
        dev_tokens_data_seq = []
    # Per-sentence label sequences for the selected task/column.
    test_labels_data_seq = seq_form(sent_test, data_type='y',
                                    task_type=self.task_type,
                                    task_index=class_index)
    train_labels_data_seq = seq_form(sent_train, data_type='y',
                                     task_type=self.task_type,
                                     task_index=class_index)
    if dev:
        dev_labels_data_seq = seq_form(sent_valid, data_type='y',
                                       task_type=self.task_type,
                                       task_index=class_index)
    else:
        dev_labels_data_seq = []
    # Longest sentence across all splits — padding length for the sequences.
    self.MAX_SENT_LENGTH = max([len(s) for s in test_tokens_data_seq] +
                               [len(s) for s in train_tokens_data_seq] +
                               [len(s) for s in dev_tokens_data_seq])
    if verbose == 1:
        print('Max sent length:', self.MAX_SENT_LENGTH)
    # Flatten label sequences to collect the label inventory.
    test_labels_data = [
        labels for sent in test_labels_data_seq for labels in sent
    ]
    train_labels_data = [
        labels for sent in train_labels_data_seq for labels in sent
    ]
    if dev:
        dev_labels_data = [
            labels for sent in dev_labels_data_seq for labels in sent
        ]
    else:
        dev_labels_data = []
    # After we can encode y test and train data.
    self.ADDING_INDEX = 1
    self.PADDING_VALUE = 0
    UNIQUE_LABELS = sorted(
        set(test_labels_data + train_labels_data + dev_labels_data))
    self.label2ind_with_adding, self.ind2label_with_adding = self.labels_encode_cnn2(
        UNIQUE_LABELS)
    self.max_label_number = max(self.label2ind_with_adding.values())
    if verbose == 1:
        print('max_label_number:', self.max_label_number)
    y_train = self.label_data_prepare(train_labels_data_seq, verbose=verbose)
    y_test = self.label_data_prepare(test_labels_data_seq, verbose=verbose)
    if dev:
        y_dev = self.label_data_prepare(dev_labels_data_seq, verbose=verbose)
    else:
        y_dev = []
    save_binary(
        y_test, self.data_path + '/%s/' % (self.corpora, ) +
        'cnn/model_level_2/y_test_cnn2level_%s' % (self.task_type, ))
    save_binary(
        y_train, self.data_path + '/%s/' % (self.corpora, ) +
        'cnn/model_level_2/y_train_cnn2level_%s' % (self.task_type, ))
    if dev:
        save_binary(
            y_dev, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/y_dev_cnn2level_%s' % (self.task_type, ))
    save_binary(
        self.label2ind_with_adding,
        self.data_path + '/%s/' % (self.corpora, ) +
        'cnn/model_level_2/y_label2ind_cnn2level_%s' % (self.task_type, ))
    # Free the y-side intermediates before encoding the x side.
    del (y_train, y_test, y_dev, self.label2ind_with_adding,
         self.ind2label_with_adding, UNIQUE_LABELS)
    # After we can encode x test and train data.
    unique_tokens = sorted(set([k for k in tokens_tune_vectors]))
    # Indices start at 2: 0 and 1 are reserved (padding / unknown-token slot).
    self.word2ind_with_adding = {
        token: (index + 2)
        for index, token in enumerate(unique_tokens)
    }
    if verbose == 1:
        print("\nUnique tokens:", len(unique_tokens))
    x_test = self.data_prepare(test_tokens_data_seq, name="test", verbose=verbose)
    x_train = self.data_prepare(train_tokens_data_seq, name="train", verbose=verbose)
    if dev:
        x_dev = self.data_prepare(dev_tokens_data_seq, name="dev", verbose=verbose)
    else:
        x_dev = []
    tune_char_emb_matrix = self.matrix_creating(unique_tokens,
                                                tokens_tune_vectors)
    if verbose == 1:
        print("Tune embedding matrix:", tune_char_emb_matrix.shape)
    save_binary(
        x_test, self.data_path + '/%s/' % (self.corpora, ) +
        'cnn/model_level_2/x_test_cnn2level_%s.pkl' % (self.task_type, ))
    save_binary(
        x_train, self.data_path + '/%s/' % (self.corpora, ) +
        'cnn/model_level_2/x_train_cnn2level_%s.pkl' % (self.task_type, ))
    if dev:
        save_binary(
            x_dev, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/x_dev_cnn2level_%s.pkl' % (self.task_type, ))
    self.save_emb(('max_label_numbers', self.max_label_number),
                  ('max_sent_length', self.MAX_SENT_LENGTH),
                  ('tune_char_emb_matrix', tune_char_emb_matrix),
                  ('word2ind', self.word2ind_with_adding))
    # Release everything large; this object only exists for its side effects.
    del (self.max_label_number, self.MAX_SENT_LENGTH, tune_char_emb_matrix,
         self.word2ind_with_adding, x_test, x_train, x_dev, sent_test,
         sent_valid, sent_train, test_tokens_data_seq, train_tokens_data_seq,
         dev_tokens_data_seq, test_labels_data_seq, train_labels_data_seq,
         dev_labels_data_seq, test_labels_data, train_labels_data,
         dev_labels_data, unique_tokens)
def __init__(self, use_config=True, corpora='UD_Russian-SynTagRus',
             task_type='POS', class_index=3, verbose=1, batch_size=512,
             epoch=300, dev=False):
    """Prepare training of the level-2 CNN model.

    Loads the tuned token-embedding bundle and the encoded level-2 training
    (and optionally dev) data, and derives the batch counts per epoch.
    """
    # Hyper-parameters either come from the cnn config or from the arguments.
    if use_config:
        self.config = Config(model_type='cnn')
        self.task_type = self.config.get('Task_type')
        self.class_index = self.config.get(
            'Classification_tasks')['UD2']['POS'][0]
        self.corpora = self.config.get('Corpora')
        self.batch_size_level_2 = self.config.get('Network_options').get(
            'batch_size_level_2')
        self.epoch = self.config.get('Network_options').get(
            'training_epoch')
    else:
        self.task_type = task_type
        self.corpora = corpora
        self.class_index = class_index
        self.batch_size_level_2 = batch_size
        self.epoch = epoch
    print('\nModel train for 2 level model.')
    print('Task:', self.task_type)
    print('Corpora:', self.corpora)
    print('Label index:', class_index)
    file_path = os.path.split(os.path.abspath(__file__))[0]
    self.data_path = os.path.abspath(
        file_path + '/../data/%s/cnn/model_level_2/' % (self.corpora, ))
    self.model_path = os.path.abspath(file_path + '/../tagger_models/')
    # Feature bundle: dict with 'max_sent_length', 'word2ind',
    # 'tune_char_emb_matrix' (see the save_emb call in data preparation).
    self.tune_char_emb_matrix = self.load_binary_data(
        self.data_path + '/char_emb_cnn2_feature_data_%s.pkl' % (self.task_type, ))
    self.sent_max_len = self.tune_char_emb_matrix['max_sent_length']
    self.x_train = self.load_binary_data(self.data_path +
                                         '/x_train_cnn2level_%s.pkl' %
                                         (self.task_type, ))
    self.y_train, self.out_size = self.load_grammatical_cat(
        verbose=verbose)
    if verbose == 1:
        print('x_train shape:', self.x_train.shape)
        print('y_train shape:', self.y_train.shape)
    if dev:
        self.x_dev = self.load_binary_data(self.data_path +
                                           '/x_dev_cnn2level_%s.pkl' %
                                           (self.task_type, ))
        self.y_dev, _ = self.load_grammatical_cat(y_data_name='dev',
                                                  verbose=verbose)
        if verbose == 1:
            print('x_dev shape:', self.x_dev.shape)
            print('y_dev shape:', self.y_dev.shape)
    # +2 matches the word indices that start at 2 (0/1 reserved).
    self.max_features = len(self.tune_char_emb_matrix['word2ind']) + 2
    self.data_for_emb_layers = {
        'tune_char_emb_matrix':
        self.tune_char_emb_matrix['tune_char_emb_matrix']
    }
    self.num_batches_per_epoch_train = math.ceil(self.x_train.shape[0] /
                                                 self.batch_size_level_2)
    if dev:
        self.num_batches_per_epoch_valid = math.ceil(
            self.x_dev.shape[0] / self.batch_size_level_2)
        # NOTE(review): nesting reconstructed from collapsed source — the
        # valid-batch print requires dev, so the verbose block is kept
        # inside the dev branch; confirm against the original layout.
        if verbose == 1:
            print("num_batches_per_epoch_train:",
                  self.num_batches_per_epoch_train)
            print("num_batches_per_epoch_valid:",
                  self.num_batches_per_epoch_valid)
    # The network is built later by the training routine.
    self.model = None
def __init__(
        self,
        use_config=True,
        corpora='UD_Russian-SynTagRus',
        task_type='POS',
        class_index=3,
        verbose=1,
        batch_size=512,
        epoch=300,
        dev=False,
):
    """Prepare training of the level-1 CNN model.

    Loads the char-embedding feature bundle and the encoded level-1
    training (and optionally dev) data.
    """
    # Hyper-parameters either come from the cnn config or from the arguments.
    if use_config:
        self.config = Config(model_type='cnn')
        self.task_type = self.config.get('Task_type')
        self.class_index = self.config.get(
            'Classification_tasks')['UD2']['POS'][0]
        self.corpora = self.config.get('Corpora')
        self.batch_size_level_1 = self.config.get('Network_options').get(
            'batch_size_level_1')
        self.epoch = self.config.get('Network_options').get(
            'training_epoch')
    else:
        self.task_type = task_type
        self.corpora = corpora
        self.class_index = class_index
        self.batch_size_level_1 = batch_size
        self.epoch = epoch
    print('\nModel train for 1 level model.')
    print('Task:', self.task_type)
    print('Corpora:', self.corpora)
    print('Label index:', class_index)
    file_path = os.path.split(os.path.abspath(__file__))[0]
    self.data_path = os.path.abspath(
        file_path + '/../data/%s/cnn/model_level_1/' % (self.corpora, ))
    self.model_path = os.path.abspath(file_path + '/../tagger_models/')
    # Feature bundle: dict with 'symbol2ind', 'max_token_length',
    # 'char_matrix' (see the save_emb call in data preparation).
    self.char_emb_feature = self.load_binary_data(
        self.data_path + '/char_emb_cnn1_feature_data_%s.pkl' % (self.task_type, ))
    self.symbol2ind = self.char_emb_feature['symbol2ind']
    self.max_token_length = self.char_emb_feature['max_token_length']
    self.x_train = self.load_binary_data(self.data_path +
                                         '/x_train_cnn1level.pkl')
    self.y_train, self.out_size = self.load_grammatical_cat(
        verbose=verbose)
    if verbose == 1:
        print('x_train shape:', self.x_train.shape)
        print('y_train shape:', self.y_train.shape)
    if dev:
        self.x_dev = self.load_binary_data(self.data_path +
                                           '/x_dev_cnn1level.pkl')
        self.y_dev, _ = self.load_grammatical_cat(y_data_name='dev',
                                                  verbose=verbose)
        if verbose == 1:
            print('x_dev shape:', self.x_dev.shape)
            print('y_dev shape:', self.y_dev.shape)
    # +1 reserves index 0 for padding.
    self.max_features = max(self.symbol2ind.values()) + 1
    self.data_for_emb_layers = {
        'char': self.char_emb_feature['char_matrix']
    }
    if verbose == 1:
        print('data embedding char shape:',
              self.data_for_emb_layers['char'].shape,
              self.data_for_emb_layers['char'].dtype)
    # The network is built later by the training routine.
    self.model = None
def __init__(self):
    """Encode labels (y data) for every classification task of the configured
    corpus and persist the encoded train/test arrays plus the label mapping.

    For each task the unique labels of test+train are indexed, both splits
    are encoded, and the results are written under
    ``<data>/<corpora>/bilstm/``.
    """
    self.config = Config(model_type='bilstm')
    print('#' * 100)
    print('Task:', self.config.get('Task_type'))
    print('Corpora:', self.config.get('Corpora'))
    print('Label encoding')
    print('#' * 100)
    file_path = os.path.split(os.path.abspath(__file__))[0]
    self.data_path = os.path.abspath(file_path + '/../data/')
    self.sent_max_len = self.config.get('Sent_max_length')
    self.corpora_limit = self.config.get('Corpora_sent_limit')
    sent_test = load_data(self.data_path + '/' + self.config.get('Corpora') + '/test')
    # The sentence limit comes from config; the string 'False' disables it.
    if self.corpora_limit != 'False':
        sent_train = load_data(self.data_path + '/' + self.config.get('Corpora') + '/train')[
            :self.config.get('Corpora_sent_limit')]
    else:
        sent_train = load_data(self.data_path + '/' + self.config.get('Corpora') + '/train')
    for classification_task in self.config.get('Classification_tasks')[self.config.get('Corpora')]:
        print('\nClassification tasks:', classification_task)
        # We must find all unique labels in test and train to replace them by index.
        y_set = self.y_set_form(
            sent_test + sent_train,
            (classification_task,
             self.config.get('Classification_tasks')[self.config.get('Corpora')][classification_task])
        )
        unique_labels = unique_elements(y_set)
        self.label2ind, self.ind2label = elements_encode(unique_labels)
        # +1 accounts for the reserved padding label 0.
        self.max_label_numbers = max(self.label2ind.values()) + 1
        print('labels: %s; with label for 0: %s' % (len(unique_labels), self.max_label_numbers))
        del (y_set, unique_labels)
        # After we can encode test and train data.
        y_train = self.data_prepare(
            self.y_set_form(
                sent_train,
                (classification_task,
                 self.config.get('Classification_tasks')[self.config.get('Corpora')][classification_task])
            )
        )
        # FIX: y_train was written to a bare relative path ('y_train_%s.pkl'),
        # unlike y_test and label2ind which go to the corpus bilstm directory;
        # save it to the same location so downstream loaders find it.
        save_binary(y_train, self.data_path + '/%s/' % (
            self.config.get('Corpora'),) + '/bilstm/' + 'y_train_%s.pkl' % (
            classification_task,))
        del y_train
        y_test = self.data_prepare(
            self.y_set_form(
                sent_test,
                (classification_task,
                 self.config.get('Classification_tasks')[self.config.get('Corpora')][classification_task])
            )
        )
        save_binary(y_test, self.data_path + '/%s/' % (
            self.config.get('Corpora'),) + '/bilstm/' + 'y_test_%s.pkl' % (
            classification_task,))
        del y_test
        # Save label2ind so predictions can be decoded back to labels.
        save_binary(self.label2ind, self.data_path + '/%s/' % (
            self.config.get('Corpora'),) + '/bilstm/' + 'y_label2ind_%s.pkl' % (classification_task,))
def __init__(self, use_config=True, corpora='UD_Russian-SynTagRus',
             task_type='POS', class_index=3, dev=True, verbose=1):
    """Prepare the encoded x/y data for the level-1 CNN model.

    Reads the test/train (and optionally dev) splits, encodes labels and
    character-level token representations, builds the char-embedding
    matrix, and persists everything under
    ``<data>/<corpora>/cnn/model_level_1/``.
    """
    # Task/corpus either come from the cnn config or from the arguments.
    if use_config:
        self.config = Config(model_type='cnn')
        self.task_type = self.config.get('Task_type')
        self.class_index = self.config.get(
            'Classification_tasks')['UD2']['POS'][0]
        self.corpora = self.config.get('Corpora')
    else:
        self.task_type = task_type
        self.corpora = corpora
        self.class_index = class_index
    print('Data preparation for 1 level model.')
    print('Task:', self.task_type)
    print('Corpora:', self.corpora)
    print('Label index:', class_index)
    file_path = os.path.split(os.path.abspath(__file__))[0]
    self.data_path = os.path.abspath(file_path + '/../data/')
    sent_test = load_data(self.data_path + '/' + self.corpora + '/test')
    sent_train = load_data(self.data_path + '/' + self.corpora + '/train')
    if dev:
        sent_valid = load_data(self.data_path + '/' + self.corpora + '/dev')
    else:
        sent_valid = []
    # Per-sentence token sequences.
    test_tokens_data_seq = seq_form(sent_test, task_type=self.task_type)
    train_tokens_data_seq = seq_form(sent_train, task_type=self.task_type)
    if dev:
        dev_tokens_data_seq = seq_form(sent_valid, task_type=self.task_type)
    else:
        dev_tokens_data_seq = []
    # Per-sentence label sequences for the selected task/column.
    test_labels_data_seq = seq_form(sent_test, data_type='y',
                                    task_type=self.task_type,
                                    task_index=self.class_index)
    train_labels_data_seq = seq_form(sent_train, data_type='y',
                                     task_type=self.task_type,
                                     task_index=self.class_index)
    if dev:
        dev_labels_data_seq = seq_form(sent_valid, data_type='y',
                                       task_type=self.task_type,
                                       task_index=self.class_index)
    else:
        dev_labels_data_seq = []
    # Flatten the per-sentence sequences: level 1 works per token.
    test_tokens_data = [
        tokens for sent in test_tokens_data_seq for tokens in sent
    ]
    train_tokens_data = [
        tokens for sent in train_tokens_data_seq for tokens in sent
    ]
    if dev:
        dev_tokens_data = [
            tokens for sent in dev_tokens_data_seq for tokens in sent
        ]
    else:
        dev_tokens_data = []
    test_labels_data = [
        labels for sent in test_labels_data_seq for labels in sent
    ]
    train_labels_data = [
        labels for sent in train_labels_data_seq for labels in sent
    ]
    if dev:
        dev_labels_data = [
            labels for sent in dev_labels_data_seq for labels in sent
        ]
    else:
        dev_labels_data = []
    # After we can encode y test and train data.
    self.ADDING_INDEX = 1
    self.PADDING_VALUE = 0
    UNIQUE_LABELS = sorted(
        set(test_labels_data + train_labels_data + dev_labels_data))
    self.label2ind_with_adding, self.ind2label_with_adding = labels_encode(
        UNIQUE_LABELS, 0)
    self.max_label_numbers = max(self.label2ind_with_adding.values())
    if verbose == 1:
        print('Unique labels:', self.max_label_numbers)
        print("\nLabels:", self.label2ind_with_adding.keys())
    y_train = self.label_data_prepare(train_labels_data, verbose=verbose)
    y_test = self.label_data_prepare(test_labels_data, verbose=verbose)
    if dev:
        y_dev = self.label_data_prepare(dev_labels_data, verbose=verbose)
    else:
        y_dev = []
    save_binary(
        y_test, self.data_path + '/%s/' % (self.corpora, ) +
        'cnn/model_level_1/y_test_cnn1level_%s' % (self.task_type, ))
    save_binary(
        y_train, self.data_path + '/%s/' % (self.corpora, ) +
        'cnn/model_level_1/y_train_cnn1level_%s' % (self.task_type, ))
    # NOTE(review): y_dev/x_dev are saved even when dev=False (as empty
    # data) — presumably intentional placeholders; confirm.
    save_binary(
        y_dev, self.data_path + '/%s/' % (self.corpora, ) +
        'cnn/model_level_1/y_dev_cnn1level_%s' % (self.task_type, ))
    save_binary(
        self.label2ind_with_adding,
        self.data_path + '/%s/' % (self.corpora, ) +
        'cnn/model_level_1/y_label2ind_cnn1level_%s' % (self.task_type, ))
    # Free the y-side intermediates before encoding the x side.
    del (y_train, y_test, y_dev, self.label2ind_with_adding,
         self.ind2label_with_adding, self.max_label_numbers, UNIQUE_LABELS)
    # After we can encode x test, dev, train data.
    unique_tokens = sorted(
        set(test_tokens_data + train_tokens_data + dev_tokens_data))
    if verbose == 1:
        print("\nUnique tokens:", len(unique_tokens))
    self.unique_symbols = unique_chars(unique_tokens)
    self.max_token_length = max([len(token) for token in unique_tokens])
    self.symbol2ind_with_adding, self.ind2symbol_with_adding = symbols_encode(
        self.unique_symbols, self.ADDING_INDEX)
    if verbose == 1:
        print("\nUnique symbols:", self.symbol2ind_with_adding.keys())
    x_test = self.data_prepare(test_tokens_data, verbose=verbose)
    x_train = self.data_prepare(train_tokens_data, verbose=verbose)
    x_dev = self.data_prepare(dev_tokens_data, verbose=verbose)
    save_binary(
        x_test, self.data_path + '/%s/' % (self.corpora, ) +
        'cnn/model_level_1/x_test_cnn1level.pkl')
    save_binary(
        x_train, self.data_path + '/%s/' % (self.corpora, ) +
        'cnn/model_level_1/x_train_cnn1level.pkl')
    save_binary(
        x_dev, self.data_path + '/%s/' % (self.corpora, ) +
        'cnn/model_level_1/x_dev_cnn1level.pkl')
    char_embeddings = self.char_matrix_cnn()
    if verbose == 1:
        print('\nChar_embeddings shape:', char_embeddings.shape)
    # Persist everything needed to reproduce the encoding at train/test time.
    self.save_emb(('symbol2ind', self.symbol2ind_with_adding),
                  ('ind2symbol', self.ind2symbol_with_adding),
                  ('max_token_length', self.max_token_length),
                  ('char_matrix', char_embeddings))
    # Release everything large; this object only exists for its side effects.
    del (self.symbol2ind_with_adding, self.ind2symbol_with_adding,
         self.max_token_length, char_embeddings, self.unique_symbols,
         x_test, x_train, x_dev, sent_test, sent_valid, sent_train,
         test_tokens_data_seq, train_tokens_data_seq, dev_tokens_data_seq,
         test_labels_data_seq, train_labels_data_seq, dev_labels_data_seq,
         test_tokens_data, train_tokens_data, dev_tokens_data,
         test_labels_data, train_labels_data, dev_labels_data,
         unique_tokens)