示例#1
0
    def __init__(self,
                 use_config=True,
                 corpora='UD_Russian-SynTagRus',
                 task_type='POS',
                 verbose=1,
                 class_index=3):
        """Load test data for evaluating the 1-level CNN model.

        Args:
            use_config: if True, read task_type and corpora from
                Config(model_type='cnn') and ignore the keyword arguments.
            corpora: corpus name (used only when use_config is False).
            task_type: classification task (used only when use_config is False).
            verbose: 1 to print loaded data shapes.
            class_index: label column index (logged only).
        """
        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            self.corpora = self.config.get('Corpora')
        else:
            self.task_type = task_type
            self.corpora = corpora

        logging.info('\nModel test for 1 level model.')
        logging.info("Task: {}".format(self.task_type))
        logging.info("Corpora: {}".format(self.corpora))
        logging.info("Label index: {}".format(str(class_index)))

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path +
                                         '/../data/%s/cnn/model_level_1/' %
                                         (self.corpora, ))
        self.model_path = os.path.abspath(file_path + '/../tagger_models/')

        self.x_test = load_bin_data(self.data_path + '/x_test_cnn1level.pkl')
        if verbose == 1:
            print('x_test shape:', self.x_test.shape)
        # Fix: pass the caller's verbose flag instead of the hardcoded 1,
        # matching how `verbose` gates the print above.
        self.y_test = load_grammatical_cat_model1(self.data_path,
                                                  self.task_type,
                                                  verbose=verbose)

        self.estimation = 0
示例#2
0
    def __init__(self):
        """Run the statistics pipeline over the corpus test and train splits."""
        self.config = Config(model_type='bilstm')
        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/')

        data = [
            self.data_path + '/' + self.config.get('Corpora') + '/test',
            self.data_path + '/' + self.config.get('Corpora') + '/train'
        ]

        # Fix: the loop variable was named `set`, shadowing the builtin type.
        for data_set in data:
            self.stat_pipline(data_set)
示例#3
0
    def __init__(self):
        """Load the char-embedding word index and prepare both data splits."""
        self.config = Config(model_type='bilstm')
        self.sent_max_len = self.config.get('Sent_max_length')
        self.corpora_limit = self.config.get('Corpora_sent_limit')

        base_dir = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(base_dir + '/../data/')

        # word2index was saved together with the char-embedding features.
        feature_file = (self.data_path + '/' + self.config.get('Corpora') +
                        '/bilstm/char_emb_rnn_feature_data.pkl')
        self.word2ind = load_bin_data(feature_file)['word2index']

        for split in ('train', 'test'):
            self.preparator(data_name=split)
    def __init__(self):
        """Load test data and labels for the configured bilstm task."""
        self.config = Config(model_type='bilstm')
        self.task_type = self.config.get('Task_type')

        banner = '#' * 100
        print(banner)
        print('Task:', self.task_type)
        print('Corpora:', self.config.get('Corpora'))
        print(banner)

        base_dir = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(
            base_dir + '/../data/%s/' % (self.config.get('Corpora'), ))
        self.model_path = os.path.abspath(base_dir + '/../tagger_models/')

        self.x_test = self.load_binary_data(self.data_path + '/x_test.pkl')
        print('X test shape:', self.x_test.shape)

        self.y_test = self.load_grammatical_cat()
        # The model is attached later by the caller.
        self.model = None
    def __init__(self):
        """Build and persist a w2v embedding matrix for all corpus tokens."""
        self.config = Config(model_type='bilstm')

        base_dir = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(base_dir + '/../data/')

        # Vocabulary is gathered from both splits of the corpus.
        sentences = (
            load_data(self.data_path + '/' + self.config.get('Corpora') + '/test') +
            load_data(self.data_path + '/' + self.config.get('Corpora') + '/train'))

        # NOTE(review): Word2Vec.load_word2vec_format is the pre-1.0 gensim
        # API (moved to KeyedVectors later) — presumably pinned; verify.
        self.model = gensim.models.Word2Vec.load_word2vec_format(
            self.data_path + '/' +
            "w2v_models/mix_corpora_5_10_300_skip_neg.bin",
            binary=True)

        vocabulary = self.unique_tokens(sentences)
        embeddings = self.form_emb_vocab(vocabulary)

        print('vocabulary:', len(vocabulary))
        print('char embeddings:', embeddings.shape)

        self.save_emb(('w2v_matrix', embeddings))
    def __init__(self):
        """Load features, training data and hyper-parameters for the bilstm model.

        Fix: the signature previously read ``def __init__(self'):`` — the
        stray quote made this a SyntaxError.
        """
        self.config = Config(model_type='bilstm')
        self.task_type = self.config.get('Task_type')

        print('#' * 100)
        print('Task:', self.task_type)
        print('Corpora:', self.config.get('Corpora'))
        print('#' * 100)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/%s/' % (self.config.get('Corpora'),))
        self.model_path = os.path.abspath(file_path + '/../tagger_models/')

        char_emb_feature = self.load_binary_data(self.data_path + '/char_emb_feature_data.pkl')
        w2v_emb_feature = self.load_binary_data(self.data_path + '/w2v_emb_feature_data.pkl')

        # NOTE(review): casting a word2vec matrix to int64 discards any float
        # components — looks intentional for index-style embeddings, confirm.
        w2v_emb_feature['w2v_matrix'] = w2v_emb_feature['w2v_matrix'].astype('int64')
        self.word2ind = char_emb_feature['word2index']
        self.sent_max_len = char_emb_feature['max_sent_length']

        self.x_train = self.load_binary_data(self.data_path + '/x_train.pkl')

        # Network hyper-parameters come straight from the config.
        self.max_features = len(self.word2ind) + 1
        self.random_embedding_size = self.config.get('Network_options').get('random_embedding_size')
        self.lstm_hidden_size = self.config.get('Network_options').get('lstm_hidden_size')
        self.dense_hidden_size = self.config.get('Network_options').get('dense_hidden_size')
        self.batch_size = self.config.get('Network_options').get('batch_size')
        self.epoch = self.config.get('Network_options').get('training_epoch')

        self.data_for_emb_layers = {
            'char': char_emb_feature['char_matrix'],
            'w2v':  w2v_emb_feature['w2v_matrix']
        }

        self.y_train, self.out_size = self.load_grammatical_cat()

        print('data embedding char shape:', self.data_for_emb_layers['char'].shape, self.data_for_emb_layers['char'].dtype)
        print('data embedding w2v shape:', self.data_for_emb_layers['w2v'].shape, self.data_for_emb_layers['w2v'].dtype)

        self.model = None
示例#7
0
    def __init__(self, model_type='bilstm'):
        """Build and persist char-level embedding features for the corpus.

        Per-corpus stats (from the original notes):
            gicrya: sent_max_len = 55 (optimal: 55; max: 110);
            gicrya: max_token_length = 35.

        Args:
            model_type: network type passed to Config ('cnn' or 'bilstm'
                per the original note).
        """
        self.config = Config(model_type=model_type)
        self.sent_max_len = self.config.get('Sent_max_length')

        base_dir = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(base_dir + '/../data/')
        sentences = (
            load_data(self.data_path + '/' + self.config.get('Corpora') + '/test') +
            load_data(self.data_path + '/' + self.config.get('Corpora') + '/train'))

        token_seqs = seq_form(sentences, data_type='x')
        vocabulary = unique_elements(token_seqs)
        self.unique_symbols = unique_chars(token_seqs)
        self.max_token_length = max(len(token) for token in vocabulary)
        self.word2ind, self.ind2word = self.token_encode(vocabulary)

        embeddings = self.char_matrix(vocabulary)

        print('vocabulary:', len(vocabulary))
        print('unique_symbols:', len(self.unique_symbols))
        print('Maximum sequence length:', self.sent_max_len)
        print('Maximum token length:', self.max_token_length)
        print('char embeddings:', embeddings.shape)

        self.save_emb(('unique_symbols', self.unique_symbols),
                      ('unique_tokens', vocabulary),
                      ('word2index', self.word2ind),
                      ('max_sent_length', self.sent_max_len),
                      ('char_matrix', embeddings))
示例#8
0
    def __init__(self,
                 use_config=True,
                 corpora='UD_Russian-SynTagRus',
                 task_type='POS',
                 class_index=3,
                 dev=True,
                 verbose=1,
                 prob_cnn_emb_layer_name="dense_3"):
        """Build probability embeddings for the 2-level model.

        Collects every unique token from train/test (and optionally dev)
        x-data, runs the token matrices through a chosen layer of the
        trained 1-level CNN, and saves the resulting token -> activation
        mapping (with a leading '_null_' padding entry).

        Fix: the verbose report previously printed ``self.x_dev.shape``
        unconditionally, raising AttributeError when ``dev=False`` since
        ``x_dev`` is only loaded in the dev branch.
        """
        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            self.class_index = self.config.get(
                'Classification_tasks')['UD2']['POS'][0]
            self.corpora = self.config.get('Corpora')
            self.prob_cnn_emb_layer_name = self.config.get(
                'Network_options').get('prob_cnn_emb_layer_name')
        else:
            self.task_type = task_type
            self.corpora = corpora
            self.class_index = class_index
            self.prob_cnn_emb_layer_name = prob_cnn_emb_layer_name

        print('CNNProbEmbeddings for 2 level model.')
        print('Task:', self.task_type)
        print('Corpora:', self.corpora)
        print('Label index:', class_index)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path +
                                         '/../data/%s/cnn/model_level_1/' %
                                         (self.corpora, ))
        self.model_path = os.path.abspath(file_path + '/../tagger_models/')

        char_emb_feature = self.load_binary_data(
            self.data_path + '/char_emb_cnn1_feature_data_%s.pkl' %
            (self.task_type, ))

        self.ind2symbol = char_emb_feature['ind2symbol']
        self.max_token_length = char_emb_feature['max_token_length']
        self.x_train = self.load_binary_data(self.data_path +
                                             '/x_train_cnn1level.pkl')
        self.x_test = self.load_binary_data(self.data_path +
                                            '/x_test_cnn1level.pkl')
        if dev:
            self.x_dev = self.load_binary_data(self.data_path +
                                               '/x_dev_cnn1level.pkl')

        if verbose == 1:
            print("Loading char_emb_cnn1_feature_data_%s ..." %
                  (self.task_type, ))
            print('x_train shape:', self.x_train.shape)
            print('x_test shape:', self.x_test.shape)
            if dev:
                # x_dev exists only when the dev split was loaded above.
                print('x_dev shape:', self.x_dev.shape)

        ################################################################################################################
        # Map each token string (decoded from symbol indices; 0 is padding)
        # back to its index-vector representation.

        str2vector = {}
        for el_ in self.x_train:
            str2vector[''.join([self.ind2symbol[s] for s in el_
                                if s != 0])] = el_
        for _el in self.x_test:
            str2vector[''.join([self.ind2symbol[s] for s in _el
                                if s != 0])] = _el
        if dev:
            for el in self.x_dev:
                str2vector[''.join([self.ind2symbol[s] for s in el
                                    if s != 0])] = el
        str2vector = OrderedDict(str2vector)
        if verbose == 1:
            print("Unique_tokens:", len(str2vector))

        ################################################################################################################
        # Prepend an all-zero '_null_' entry used for padding positions.

        null_word = [0 for i in range(self.max_token_length)]
        null_vector = np.array(null_word)

        str2vector.update({'_null_': null_vector})
        str2vector.move_to_end('_null_', last=False)
        str2vector = [(el, str2vector[el]) for el in str2vector]
        if verbose == 1:
            print("Checking null word:", str2vector[0])

        ################################################################################################################

        non_tuned_embeddings = np.array([el[1] for el in str2vector])
        if verbose == 1:
            print("Non tune embeddings:", non_tuned_embeddings.shape)

        ################################################################################################################
        # Feed every token through the trained 1-level model and read the
        # activations of the configured layer.

        if verbose == 1:
            print('Loading cnn_1level_model_%s_%s.pkl' % (
                self.corpora,
                self.task_type,
            ))
        self.model = load_model(self.model_path +
                                '/cnn_1level_model_%s_%s.pkl' % (
                                    self.corpora,
                                    self.task_type,
                                ))

        activation_values = self.get_prob_from_layer(
            layer_name=self.prob_cnn_emb_layer_name, data=non_tuned_embeddings)
        if verbose == 1:
            print('activity_values_train shape:', activation_values.shape)

        ################################################################################################################
        # For the combined task, append the POS model's class probabilities.

        if self.task_type == "All":
            model_pos = load_model(self.model_path +
                                   '/cnn_1level_model_%s_%s.pkl' % (
                                       self.corpora,
                                       "POS",
                                   ))
            pr = model_pos.predict(non_tuned_embeddings, verbose=1)
            activation_values = np.concatenate((activation_values, pr), axis=1)
            if verbose == 1:
                print("Predictons shape:", pr.shape)
                print("Predictons shape + activations:",
                      activation_values.shape)
                # TODO check the prediction for '_null_'

        ################################################################################################################

        result_train = OrderedDict(
            list(zip([el[0] for el in str2vector], activation_values)))
        if verbose == 1:
            print("Checking null word:", len(result_train['_null_']))
        self.save_binary(
            result_train,
            '_%s_%s' % (self.prob_cnn_emb_layer_name, self.task_type))

        # Explicitly release the large intermediates.
        if dev:
            del (result_train, activation_values, self.model, str2vector,
                 null_vector, null_word, char_emb_feature, self.ind2symbol,
                 self.max_token_length, self.x_train, self.x_test, self.x_dev,
                 non_tuned_embeddings)
        else:
            del (result_train, activation_values, self.model, str2vector,
                 null_vector, null_word, char_emb_feature, self.ind2symbol,
                 self.max_token_length, self.x_train, self.x_test,
                 non_tuned_embeddings)
示例#9
0
                # потом идет 1 - emb_vocab.append(tokens_tune_vectors['_null_'])
                s_enc.append(1)
                count_new_tokens += 1
        x_enc.append(s_enc)
    x = pad_sequences(x_enc, maxlen=max_sent_length, value=1)
    print('x_shape: %s;' % (name, ), x.shape)
    print('count_new_tokens:', count_new_tokens)
    print('count_tokens:', count_tokens)
    return x


# CoNLL-U column indices for the two part-of-speech tag fields.
label_index = {'UPOS': 3, 'XPOS': 4}

# ----------------------------------------------------------------------------------------------------------------------

# Model registry and shared-task track configuration.
config_models = Config(model_type='models')
config_language_id = Config(model_type='tracks')

model_dir = '../tagger_models/'
# CoNLL-2017 shared-task UDPipe test inputs.
test_files_udipipe_dir = '../data/conll2017_x/ud-test-v2.0-conll2017/input/conll17-ud-test-2017-05-09/'
data_path = os.path.abspath(
    os.path.split(os.path.abspath(__file__))[0] + '/../data/')

for corpora_name in config_models.get("models"):

    if corpora_name == 'UD_Russian':

        for tag_types in config_models.get("models")[corpora_name]:

            best_restart = config_models.get(
                "models")[corpora_name][tag_types].get('Best restart #')
示例#10
0
    def __init__(self,
                 use_config=True,
                 corpora='UD_Russian-SynTagRus',
                 task_type='POS',
                 class_index=3,
                 dev=False,
                 verbose=1,
                 prob_cnn_emb_layer_name="dense_3"):
        """Prepare x/y data for the 2-level CNN model.

        Encodes labels and tokens for the train/test (and optionally dev)
        splits, builds the tuned char-embedding matrix from the 1-level
        model's probability embeddings, and saves everything under
        data/<corpora>/cnn/model_level_2/.
        """
        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            self.class_index = self.config.get(
                'Classification_tasks')['UD2']['POS'][0]
            self.corpora = self.config.get('Corpora')
            self.prob_cnn_emb_layer_name = self.config.get(
                'Network_options').get('prob_cnn_emb_layer_name')
        else:
            self.task_type = task_type
            self.corpora = corpora
            self.class_index = class_index
            self.prob_cnn_emb_layer_name = prob_cnn_emb_layer_name

        print('\nData preparation for 2 level model.')
        print('Task:', self.task_type)
        print('Corpora:', self.corpora)
        print('Label index:', class_index)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/')
        self.tuned_vectors_path = os.path.abspath(
            file_path + '/../data/%s/cnn/model_level_1/' % (self.corpora, ))

        # Token -> activation-vector mapping produced by the 1-level model.
        tokens_tune_vectors = load_bin_data(
            self.tuned_vectors_path + '/cnn_prob_emb%s.pkl' %
            ('_%s_%s' % (self.prob_cnn_emb_layer_name, self.task_type)))

        sent_test = load_data(self.data_path + '/' + self.corpora + '/test')
        sent_train = load_data(self.data_path + '/' + self.corpora + '/train')
        if dev:
            sent_valid = load_data(self.data_path + '/' + self.corpora +
                                   '/dev')
        else:
            sent_valid = []

        test_tokens_data_seq = seq_form(sent_test)
        train_tokens_data_seq = seq_form(sent_train)
        if dev:
            dev_tokens_data_seq = seq_form(sent_valid)
        else:
            dev_tokens_data_seq = []

        # NOTE(review): these calls use the raw `class_index` argument even
        # when use_config=True has set self.class_index from the config —
        # confirm that is intended.
        test_labels_data_seq = seq_form(sent_test,
                                        data_type='y',
                                        task_type=self.task_type,
                                        task_index=class_index)
        train_labels_data_seq = seq_form(sent_train,
                                         data_type='y',
                                         task_type=self.task_type,
                                         task_index=class_index)
        if dev:
            dev_labels_data_seq = seq_form(sent_valid,
                                           data_type='y',
                                           task_type=self.task_type,
                                           task_index=class_index)
        else:
            dev_labels_data_seq = []

        # Padding length is the longest sentence across all loaded splits.
        self.MAX_SENT_LENGTH = max([len(s) for s in test_tokens_data_seq] +
                                   [len(s) for s in train_tokens_data_seq] +
                                   [len(s) for s in dev_tokens_data_seq])
        if verbose == 1:
            print('Max sent length:', self.MAX_SENT_LENGTH)

        # Flatten per-sentence label sequences to collect unique labels.
        test_labels_data = [
            labels for sent in test_labels_data_seq for labels in sent
        ]
        train_labels_data = [
            labels for sent in train_labels_data_seq for labels in sent
        ]
        if dev:
            dev_labels_data = [
                labels for sent in dev_labels_data_seq for labels in sent
            ]
        else:
            dev_labels_data = []

        # After we can encode y test and train data.
        self.ADDING_INDEX = 1
        self.PADDING_VALUE = 0

        UNIQUE_LABELS = sorted(
            set(test_labels_data + train_labels_data + dev_labels_data))
        self.label2ind_with_adding, self.ind2label_with_adding = self.labels_encode_cnn2(
            UNIQUE_LABELS)
        self.max_label_number = max(self.label2ind_with_adding.values())
        if verbose == 1:
            print('max_label_number:', self.max_label_number)

        y_train = self.label_data_prepare(train_labels_data_seq,
                                          verbose=verbose)
        y_test = self.label_data_prepare(test_labels_data_seq, verbose=verbose)
        if dev:
            y_dev = self.label_data_prepare(dev_labels_data_seq,
                                            verbose=verbose)
        else:
            y_dev = []

        save_binary(
            y_test, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/y_test_cnn2level_%s' % (self.task_type, ))
        save_binary(
            y_train, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/y_train_cnn2level_%s' % (self.task_type, ))
        if dev:
            save_binary(
                y_dev, self.data_path + '/%s/' % (self.corpora, ) +
                'cnn/model_level_2/y_dev_cnn2level_%s' % (self.task_type, ))

        save_binary(
            self.label2ind_with_adding,
            self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/y_label2ind_cnn2level_%s' % (self.task_type, ))
        # Release label data before the (large) token-encoding phase.
        del (y_train, y_test, y_dev, self.label2ind_with_adding,
             self.ind2label_with_adding, UNIQUE_LABELS)

        # After we can encode x test and train data.
        # Indices start at 2: 0 is padding, 1 is the '_null_'/unknown slot.
        unique_tokens = sorted(set([k for k in tokens_tune_vectors]))
        self.word2ind_with_adding = {
            token: (index + 2)
            for index, token in enumerate(unique_tokens)
        }
        if verbose == 1:
            print("\nUnique tokens:", len(unique_tokens))

        x_test = self.data_prepare(test_tokens_data_seq,
                                   name="test",
                                   verbose=verbose)
        x_train = self.data_prepare(train_tokens_data_seq,
                                    name="train",
                                    verbose=verbose)
        if dev:
            x_dev = self.data_prepare(dev_tokens_data_seq,
                                      name="dev",
                                      verbose=verbose)
        else:
            x_dev = []

        tune_char_emb_matrix = self.matrix_creating(unique_tokens,
                                                    tokens_tune_vectors)
        if verbose == 1:
            print("Tune embedding matrix:", tune_char_emb_matrix.shape)

        save_binary(
            x_test, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/x_test_cnn2level_%s.pkl' % (self.task_type, ))
        save_binary(
            x_train, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/x_train_cnn2level_%s.pkl' % (self.task_type, ))
        if dev:
            save_binary(
                x_dev, self.data_path + '/%s/' % (self.corpora, ) +
                'cnn/model_level_2/x_dev_cnn2level_%s.pkl' %
                (self.task_type, ))

        self.save_emb(('max_label_numbers', self.max_label_number),
                      ('max_sent_length', self.MAX_SENT_LENGTH),
                      ('tune_char_emb_matrix', tune_char_emb_matrix),
                      ('word2ind', self.word2ind_with_adding))

        # Explicitly release everything large once persisted.
        del (self.max_label_number, self.MAX_SENT_LENGTH, tune_char_emb_matrix,
             self.word2ind_with_adding, x_test, x_train, x_dev, sent_test,
             sent_valid, sent_train, test_tokens_data_seq,
             train_tokens_data_seq, dev_tokens_data_seq, test_labels_data_seq,
             train_labels_data_seq, dev_labels_data_seq, test_labels_data,
             train_labels_data, dev_labels_data, unique_tokens)
示例#11
0
    def __init__(self,
                 use_config=True,
                 corpora='UD_Russian-SynTagRus',
                 task_type='POS',
                 class_index=3,
                 verbose=1,
                 batch_size=512,
                 epoch=300,
                 dev=False):
        """Load training (and optional dev) data for the 2-level CNN model.

        Fix: with the defaults (dev=False, verbose=1) the verbose report
        printed ``self.num_batches_per_epoch_valid``, which is only computed
        when ``dev=True`` — an AttributeError on the default path.
        """
        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            self.class_index = self.config.get(
                'Classification_tasks')['UD2']['POS'][0]
            self.corpora = self.config.get('Corpora')
            self.batch_size_level_2 = self.config.get('Network_options').get(
                'batch_size_level_2')
            self.epoch = self.config.get('Network_options').get(
                'training_epoch')
        else:
            self.task_type = task_type
            self.corpora = corpora
            self.class_index = class_index
            self.batch_size_level_2 = batch_size
            self.epoch = epoch

        print('\nModel train for 2 level model.')
        print('Task:', self.task_type)
        print('Corpora:', self.corpora)
        print('Label index:', class_index)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path +
                                         '/../data/%s/cnn/model_level_2/' %
                                         (self.corpora, ))
        self.model_path = os.path.abspath(file_path + '/../tagger_models/')

        self.tune_char_emb_matrix = self.load_binary_data(
            self.data_path + '/char_emb_cnn2_feature_data_%s.pkl' %
            (self.task_type, ))
        self.sent_max_len = self.tune_char_emb_matrix['max_sent_length']

        self.x_train = self.load_binary_data(self.data_path +
                                             '/x_train_cnn2level_%s.pkl' %
                                             (self.task_type, ))
        self.y_train, self.out_size = self.load_grammatical_cat(
            verbose=verbose)
        if verbose == 1:
            print('x_train shape:', self.x_train.shape)
            print('y_train shape:', self.y_train.shape)

        if dev:
            self.x_dev = self.load_binary_data(self.data_path +
                                               '/x_dev_cnn2level_%s.pkl' %
                                               (self.task_type, ))
            self.y_dev, _ = self.load_grammatical_cat(y_data_name='dev',
                                                      verbose=verbose)
            if verbose == 1:
                print('x_dev shape:', self.x_dev.shape)
                print('y_dev shape:', self.y_dev.shape)

        # +2 matches the index shift used when word2ind was built
        # (0 = padding, 1 = adding index) — see the 2-level data preparation.
        self.max_features = len(self.tune_char_emb_matrix['word2ind']) + 2
        self.data_for_emb_layers = {
            'tune_char_emb_matrix':
            self.tune_char_emb_matrix['tune_char_emb_matrix']
        }

        self.num_batches_per_epoch_train = math.ceil(self.x_train.shape[0] /
                                                     self.batch_size_level_2)
        if dev:
            self.num_batches_per_epoch_valid = math.ceil(
                self.x_dev.shape[0] / self.batch_size_level_2)

        if verbose == 1:
            print("num_batches_per_epoch_train:",
                  self.num_batches_per_epoch_train)
            if dev:
                # Only available when the dev split was loaded above.
                print("num_batches_per_epoch_valid:",
                      self.num_batches_per_epoch_valid)

        self.model = None
示例#12
0
    def __init__(
        self,
        use_config=True,
        corpora='UD_Russian-SynTagRus',
        task_type='POS',
        class_index=3,
        verbose=1,
        batch_size=512,
        epoch=300,
        dev=False,
    ):
        """Load training (and optional dev) data for the 1-level CNN model.

        Args:
            use_config: read task/corpora/batch settings from Config('cnn')
                instead of the keyword arguments.
            corpora, task_type, class_index: used only when use_config=False.
            verbose: 1 to print loaded data shapes.
            batch_size, epoch: training settings when use_config=False.
            dev: also load the dev split.
        """

        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            self.class_index = self.config.get(
                'Classification_tasks')['UD2']['POS'][0]
            self.corpora = self.config.get('Corpora')
            self.batch_size_level_1 = self.config.get('Network_options').get(
                'batch_size_level_1')
            self.epoch = self.config.get('Network_options').get(
                'training_epoch')
        else:
            self.task_type = task_type
            self.corpora = corpora
            self.class_index = class_index
            self.batch_size_level_1 = batch_size
            self.epoch = epoch

        print('\nModel train for 1 level model.')
        print('Task:', self.task_type)
        print('Corpora:', self.corpora)
        print('Label index:', class_index)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path +
                                         '/../data/%s/cnn/model_level_1/' %
                                         (self.corpora, ))
        self.model_path = os.path.abspath(file_path + '/../tagger_models/')

        self.char_emb_feature = self.load_binary_data(
            self.data_path + '/char_emb_cnn1_feature_data_%s.pkl' %
            (self.task_type, ))

        self.symbol2ind = self.char_emb_feature['symbol2ind']
        self.max_token_length = self.char_emb_feature['max_token_length']

        self.x_train = self.load_binary_data(self.data_path +
                                             '/x_train_cnn1level.pkl')
        self.y_train, self.out_size = self.load_grammatical_cat(
            verbose=verbose)
        if verbose == 1:
            print('x_train shape:', self.x_train.shape)
            print('y_train shape:', self.y_train.shape)

        if dev:
            self.x_dev = self.load_binary_data(self.data_path +
                                               '/x_dev_cnn1level.pkl')
            self.y_dev, _ = self.load_grammatical_cat(y_data_name='dev',
                                                      verbose=verbose)
            if verbose == 1:
                print('x_dev shape:', self.x_dev.shape)
                print('y_dev shape:', self.y_dev.shape)

        # +1: presumably reserves index 0 for padding — TODO confirm.
        self.max_features = max(self.symbol2ind.values()) + 1
        self.data_for_emb_layers = {
            'char': self.char_emb_feature['char_matrix']
        }

        if verbose == 1:
            print('data embedding char shape:',
                  self.data_for_emb_layers['char'].shape,
                  self.data_for_emb_layers['char'].dtype)

        self.model = None
示例#13
0
    def __init__(self):
        self.config = Config(model_type='bilstm')

        print('#' * 100)
        print('Task:', self.config.get('Task_type'))
        print('Corpora:', self.config.get('Corpora'))
        print('Label encoding')
        print('#' * 100)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/')
        self.sent_max_len = self.config.get('Sent_max_length')
        self.corpora_limit = self.config.get('Corpora_sent_limit')

        sent_test = load_data(self.data_path + '/' + self.config.get('Corpora') + '/test')
        if self.corpora_limit != 'False':
            sent_train = load_data(self.data_path + '/' + self.config.get('Corpora') + '/train')[
                         :self.config.get('Corpora_sent_limit')]
        else:
            sent_train = load_data(self.data_path + '/' + self.config.get('Corpora') + '/train')

        for classification_task in self.config.get('Classification_tasks')[self.config.get('Corpora')]:
            print('\nClassification tasks:', classification_task)
            # We must find all unique labels in test and train for replace thenm by index.
            y_set = self.y_set_form(
                sent_test + sent_train,
                (classification_task,
                self.config.get('Classification_tasks')[self.config.get('Corpora')][classification_task])
                )
            unique_labels = unique_elements(y_set)
            self.label2ind, self.ind2label = elements_encode(unique_labels)
            self.max_label_numbers = max(self.label2ind.values()) + 1
            print('labels: %s; with label for 0: %s' % (len(unique_labels), self.max_label_numbers))
            del (y_set, unique_labels)

            # After we can encode test and train data.
            y_train = self.data_prepare(
                self.y_set_form(
                sent_train,
                (classification_task,
                self.config.get('Classification_tasks')[self.config.get('Corpora')][classification_task])
                )
            )
            save_binary(y_train, 'y_train_%s.pkl' % (classification_task, ))
            del y_train

            y_test = self.data_prepare(
                self.y_set_form(
                sent_test,
                (classification_task,
                self.config.get('Classification_tasks')[self.config.get('Corpora')][classification_task])
                )
            )
            save_binary(y_test,
                        self.data_path + '/%s/' % (self.config.get('Corpora'),) + '/bilstm/' + 'y_test_%s.pkl' % (
                        classification_task,))

            del y_test

            # Save label2ind
            save_binary(self.label2ind,
                        self.data_path + '/%s/' % (
                            self.config.get('Corpora'),) + '/bilstm/' + 'y_label2ind_%s.pkl' % (classification_task,))
# ----- Example #14 -----
    def __init__(self,
                 use_config=True,
                 corpora='UD_Russian-SynTagRus',
                 task_type='POS',
                 class_index=3,
                 dev=True,
                 verbose=1):
        """Prepare and persist train/test/dev data for the level-1 CNN model.

        Loads the corpus splits, encodes labels and character-level token
        representations, and saves every artifact (x/y arrays, label and
        symbol index maps, char embedding matrix) as binary files under
        ``../data/<corpora>/cnn/model_level_1/``.

        Parameters
        ----------
        use_config : bool
            When True, ``task_type``, ``corpora`` and ``class_index`` are
            read from the cnn Config and the corresponding keyword
            arguments are ignored.
        corpora : str
            Corpus directory name under ``../data`` (used only when
            ``use_config`` is False).
        task_type : str
            Classification task name, e.g. 'POS' (used only when
            ``use_config`` is False).
        class_index : int
            Index of the label column in the corpus rows (used only when
            ``use_config`` is False).
        dev : bool
            When True the ``dev`` split is loaded and encoded as well;
            otherwise empty dev artifacts are written.
        verbose : int
            1 prints progress and shape information, 0 is silent.
        """
        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            # NOTE(review): the label index is always taken from the
            # 'UD2'/'POS' config entry even when Task_type differs —
            # confirm this hard-coded lookup is intended.
            self.class_index = self.config.get(
                'Classification_tasks')['UD2']['POS'][0]
            self.corpora = self.config.get('Corpora')
        else:
            self.task_type = task_type
            self.corpora = corpora
            self.class_index = class_index

        print('Data preparation for 1 level model.')
        print('Task:', self.task_type)
        print('Corpora:', self.corpora)
        # Bug fix: report the index actually used. With use_config=True the
        # keyword argument `class_index` (default 3) may differ from
        # self.class_index, which comes from the config.
        print('Label index:', self.class_index)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/')

        # Load the corpus splits; the dev split is optional.
        sent_test = load_data(self.data_path + '/' + self.corpora + '/test')
        sent_train = load_data(self.data_path + '/' + self.corpora + '/train')
        if dev:
            sent_valid = load_data(self.data_path + '/' + self.corpora +
                                   '/dev')
        else:
            sent_valid = []

        # Per-sentence token sequences (x side).
        test_tokens_data_seq = seq_form(sent_test, task_type=self.task_type)
        train_tokens_data_seq = seq_form(sent_train, task_type=self.task_type)
        if dev:
            dev_tokens_data_seq = seq_form(sent_valid,
                                           task_type=self.task_type)
        else:
            dev_tokens_data_seq = []

        # Per-sentence label sequences (y side), picked out by class_index.
        test_labels_data_seq = seq_form(sent_test,
                                        data_type='y',
                                        task_type=self.task_type,
                                        task_index=self.class_index)
        train_labels_data_seq = seq_form(sent_train,
                                         data_type='y',
                                         task_type=self.task_type,
                                         task_index=self.class_index)
        if dev:
            dev_labels_data_seq = seq_form(sent_valid,
                                           data_type='y',
                                           task_type=self.task_type,
                                           task_index=self.class_index)
        else:
            dev_labels_data_seq = []

        # Flatten sentence sequences into flat token/label lists — the
        # level-1 model operates on individual tokens, not sentences.
        test_tokens_data = [
            tokens for sent in test_tokens_data_seq for tokens in sent
        ]
        train_tokens_data = [
            tokens for sent in train_tokens_data_seq for tokens in sent
        ]
        if dev:
            dev_tokens_data = [
                tokens for sent in dev_tokens_data_seq for tokens in sent
            ]
        else:
            dev_tokens_data = []

        test_labels_data = [
            labels for sent in test_labels_data_seq for labels in sent
        ]
        train_labels_data = [
            labels for sent in train_labels_data_seq for labels in sent
        ]
        if dev:
            dev_labels_data = [
                labels for sent in dev_labels_data_seq for labels in sent
            ]
        else:
            dev_labels_data = []

        # After we can encode y test and train data.
        self.ADDING_INDEX = 1      # offset reserving index 0 for padding
        self.PADDING_VALUE = 0

        # Build a deterministic (sorted) label inventory over ALL splits so
        # the same label always maps to the same index.
        UNIQUE_LABELS = sorted(
            set(test_labels_data + train_labels_data + dev_labels_data))
        self.label2ind_with_adding, self.ind2label_with_adding = labels_encode(
            UNIQUE_LABELS, 0)
        self.max_label_numbers = max(self.label2ind_with_adding.values())
        if verbose == 1:
            print('Unique labels:', self.max_label_numbers)
            print("\nLabels:", self.label2ind_with_adding.keys())

        y_train = self.label_data_prepare(train_labels_data, verbose=verbose)
        y_test = self.label_data_prepare(test_labels_data, verbose=verbose)
        if dev:
            y_dev = self.label_data_prepare(dev_labels_data, verbose=verbose)
        else:
            y_dev = []

        # NOTE(review): the y-file names below have no '.pkl' extension,
        # unlike the x files — confirm downstream loaders expect that.
        save_binary(
            y_test, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/y_test_cnn1level_%s' % (self.task_type, ))
        save_binary(
            y_train, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/y_train_cnn1level_%s' % (self.task_type, ))
        save_binary(
            y_dev, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/y_dev_cnn1level_%s' % (self.task_type, ))
        save_binary(
            self.label2ind_with_adding,
            self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/y_label2ind_cnn1level_%s' % (self.task_type, ))

        # Free label-side intermediates before the (larger) x-side encoding.
        del (y_train, y_test, y_dev, self.label2ind_with_adding,
             self.ind2label_with_adding, self.max_label_numbers, UNIQUE_LABELS)

        # After we can encode x test, dev, train data.
        unique_tokens = sorted(
            set(test_tokens_data + train_tokens_data + dev_tokens_data))
        if verbose == 1:
            print("\nUnique tokens:", len(unique_tokens))

        # Character-level vocabulary: every symbol occurring in any token.
        self.unique_symbols = unique_chars(unique_tokens)
        self.max_token_length = max([len(token) for token in unique_tokens])
        self.symbol2ind_with_adding, self.ind2symbol_with_adding = symbols_encode(
            self.unique_symbols, self.ADDING_INDEX)
        if verbose == 1:
            print("\nUnique symbols:", self.symbol2ind_with_adding.keys())

        x_test = self.data_prepare(test_tokens_data, verbose=verbose)
        x_train = self.data_prepare(train_tokens_data, verbose=verbose)
        x_dev = self.data_prepare(dev_tokens_data, verbose=verbose)

        save_binary(
            x_test, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/x_test_cnn1level.pkl')
        save_binary(
            x_train, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/x_train_cnn1level.pkl')
        save_binary(
            x_dev, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/x_dev_cnn1level.pkl')

        char_embeddings = self.char_matrix_cnn()
        if verbose == 1:
            print('\nChar_embeddings shape:', char_embeddings.shape)

        # Persist everything needed to embed characters at inference time.
        self.save_emb(('symbol2ind', self.symbol2ind_with_adding),
                      ('ind2symbol', self.ind2symbol_with_adding),
                      ('max_token_length', self.max_token_length),
                      ('char_matrix', char_embeddings))

        # Release all large intermediates; this object is only a pipeline
        # driver and keeps no per-instance data beyond config/paths.
        del (self.symbol2ind_with_adding, self.ind2symbol_with_adding,
             self.max_token_length, char_embeddings, self.unique_symbols,
             x_test, x_train, x_dev, sent_test, sent_valid, sent_train,
             test_tokens_data_seq, train_tokens_data_seq, dev_tokens_data_seq,
             test_labels_data_seq, train_labels_data_seq, dev_labels_data_seq,
             test_tokens_data, train_tokens_data, dev_tokens_data,
             test_labels_data, train_labels_data, dev_labels_data,
             unique_tokens)