Example #1
 def make_note_sound(self, freq, time, amplitude=1):
     """
     Generates the numpy array for a single note
     with length [time] and loudness [amplitude] at
     the given frequency.
     """
     note = sin(2 * pi * freq * amplitude *
                linspace(0, time, time * self.RATE))
     freq = utils.reverse_dictionary(self.FREQUENCY_MAP)[freq]
     print(freq, time, amplitude, len(note))
     return note / np.max(np.abs(note), axis=0)     # Normalize the note
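Every example on this page relies on a reverse_dictionary helper whose implementation is not reproduced here. A minimal sketch of what such a helper might look like, assuming the input mapping is one-to-one (FREQUENCY_MAP below is only illustrative):

def reverse_dictionary(dictionary):
    """Invert a one-to-one mapping {key: value} into {value: key}."""
    # If several keys share a value, only the last one seen survives.
    return {value: key for key, value in dictionary.items()}

# Illustrative usage, mirroring the lookup in Example #1:
FREQUENCY_MAP = {'A4': 440.0, 'C4': 261.63}
note_name = reverse_dictionary(FREQUENCY_MAP)[440.0]   # -> 'A4'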
Example #2
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = load_vocab(config.types_filename)
    id2type = reverse_dictionary(vocab_type)
    print(vocab_type)
    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    model = NERModel(config,
                     embeddings,
                     ntags=len(vocab_tags),
                     nchars=len(vocab_chars),
                     niob=len(vocab_iob),
                     ntype=len(vocab_type),
                     id2type=id2type)

    model.build()

    model.train(train, dev, vocab_tags)

    model.evaluate(test, vocab_tags)
Example #3
def study_filtering():
    """ Temp to figure out what is going on
    """
    config = ConfigParser.RawConfigParser()
    config.read('../example.cfg')
    gpath = config.get('output', 'output_dir') 
    nstudies = config.get('params', 'nstudies')
    df = pd.read_pickle(gpath + 'full_size.pkl')
    dfd = pd.read_pickle(gpath + 'drop_duplicats_size.pkl')
    df = generate_unique_mapping(dfd, df, nstudies=2)
    print('begin filtering by study')
    study_dict = pickle.load(
            open(gpath + 'dict_test.txt', 'rb'))
    sdict = reverse_dictionary(study_dict)
    print('**** study dict loaded ******')
    gs, sl = remove_singleton_exp_variants(df, sdict,
            nstudies)
    filtered_data = dfd.ix[gs.values,:]
    filtered_data.to_csv(gpath + 'filtered_all.txt')
    embed()
Example #4
    def load_dataset(self,
                     dataset_filepaths,
                     parameters,
                     token_to_vector=None):
        '''
        dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'
        '''
        start_time = time.time()
        print('Load dataset... ', end='', flush=True)
        if parameters['token_pretrained_embedding_filepath'] != '':
            if token_to_vector == None:
                token_to_vector = utils_nlp.load_pretrained_token_embeddings(
                    parameters)
        else:
            token_to_vector = {}
        if self.verbose:
            print("len(token_to_vector): {0}".format(len(token_to_vector)))

        # Load the pretraining dataset to ensure that the index-to-label mapping is compatible with the pretrained model,
        #   and that token embeddings learned by the pretrained model are loaded properly.
        all_tokens_in_pretraining_dataset = []
        all_characters_in_pretraining_dataset = []
        if parameters['use_pretrained_model']:
            pretraining_dataset = pickle.load(
                open(
                    os.path.join(parameters['pretrained_model_folder'],
                                 'dataset.pickle'), 'rb'))
            all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values(
            )
            all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values(
            )

        remap_to_unk_count_threshold = 1
        self.UNK_TOKEN_INDEX = 0
        self.PADDING_CHARACTER_INDEX = 0
        self.tokens_mapped_to_unk = []
        self.UNK = 'UNK'
        self.unique_labels = []
        labels = {}
        tokens = {}
        label_count = {}
        token_count = {}
        character_count = {}
        for dataset_type in ['train', 'valid', 'test', 'deploy']:
            labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \
                = self._parse_dataset(dataset_filepaths.get(dataset_type, None))

            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        token_count['all'] = {}
        for token in list(token_count['train'].keys()) + list(
                token_count['valid'].keys()) + list(
                    token_count['test'].keys()) + list(
                        token_count['deploy'].keys()):
            token_count['all'][token] = token_count['train'][
                token] + token_count['valid'][token] + token_count['test'][
                    token] + token_count['deploy'][token]

        if parameters['load_all_pretrained_token_embeddings']:
            for token in token_to_vector:
                if token not in token_count['all']:
                    token_count['all'][token] = -1
                    token_count['train'][token] = -1
            for token in all_tokens_in_pretraining_dataset:
                if token not in token_count['all']:
                    token_count['all'][token] = -1
                    token_count['train'][token] = -1

        character_count['all'] = {}
        for character in list(character_count['train'].keys()) + list(
                character_count['valid'].keys()) + list(
                    character_count['test'].keys()) + list(
                        character_count['deploy'].keys()):
            character_count['all'][character] = character_count['train'][
                character] + character_count['valid'][
                    character] + character_count['test'][
                        character] + character_count['deploy'][character]

        for character in all_characters_in_pretraining_dataset:
            if character not in character_count['all']:
                character_count['all'][character] = -1
                character_count['train'][character] = -1

        for dataset_type in dataset_filepaths.keys():
            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        label_count['all'] = {}
        for character in list(label_count['train'].keys()) + list(
                label_count['valid'].keys()) + list(
                    label_count['test'].keys()) + list(
                        label_count['deploy'].keys()):
            label_count['all'][character] = label_count['train'][
                character] + label_count['valid'][character] + label_count[
                    'test'][character] + label_count['deploy'][character]

        token_count['all'] = utils.order_dictionary(token_count['all'],
                                                    'value_key',
                                                    reverse=True)
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)
        character_count['all'] = utils.order_dictionary(character_count['all'],
                                                        'value',
                                                        reverse=True)
        if self.verbose:
            print('character_count[\'all\']: {0}'.format(
                character_count['all']))

        token_to_index = {}
        token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
        iteration_number = 0
        number_of_unknown_tokens = 0
        if self.verbose:
            print("parameters['remap_unknown_tokens_to_unk']: {0}".format(
                parameters['remap_unknown_tokens_to_unk']))
        if self.verbose:
            print("len(token_count['train'].keys()): {0}".format(
                len(token_count['train'].keys())))
        for token, count in token_count['all'].items():
            if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1

            if parameters['remap_unknown_tokens_to_unk'] == 1 and \
                (token_count['train'][token] == 0 or \
                parameters['load_only_pretrained_token_embeddings']) and \
                not utils_nlp.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \
                token not in all_tokens_in_pretraining_dataset:
                if self.verbose: print("token: {0}".format(token))
                if self.verbose:
                    print("token.lower(): {0}".format(token.lower()))
                if self.verbose:
                    print("re.sub('\d', '0', token.lower()): {0}".format(
                        re.sub('\d', '0', token.lower())))
                token_to_index[token] = self.UNK_TOKEN_INDEX
                number_of_unknown_tokens += 1
                self.tokens_mapped_to_unk.append(token)
            else:
                token_to_index[token] = iteration_number
                iteration_number += 1
        if self.verbose:
            print("number_of_unknown_tokens: {0}".format(
                number_of_unknown_tokens))

        infrequent_token_indices = []
        for token, count in token_count['train'].items():
            if 0 < count <= remap_to_unk_count_threshold:
                infrequent_token_indices.append(token_to_index[token])
        if self.verbose:
            print("len(token_count['train']): {0}".format(
                len(token_count['train'])))
        if self.verbose:
            print("len(infrequent_token_indices): {0}".format(
                len(infrequent_token_indices)))

        # Ensure that both B- and I- versions exist for each label
        labels_without_bio = set()
        for label in label_count['all'].keys():
            new_label = utils_nlp.remove_bio_from_label_name(label)
            labels_without_bio.add(new_label)
        for label in labels_without_bio:
            if label == 'O':
                continue
            if parameters['tagging_format'] == 'bioes':
                prefixes = ['B-', 'I-', 'E-', 'S-']
            else:
                prefixes = ['B-', 'I-']
            for prefix in prefixes:
                l = prefix + label
                if l not in label_count['all']:
                    label_count['all'][l] = 0
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)

        if parameters['use_pretrained_model']:
            self.unique_labels = sorted(
                list(pretraining_dataset.label_to_index.keys()))
            # Make sure labels are compatible with the pretraining dataset.
            for label in label_count['all']:
                if label not in pretraining_dataset.label_to_index:
                    raise AssertionError(
                        "The label {0} does not exist in the pretraining dataset. "
                        .format(label) +
                        "Please ensure that only the following labels exist in the dataset: {0}"
                        .format(', '.join(self.unique_labels)))
            label_to_index = pretraining_dataset.label_to_index.copy()
        else:
            label_to_index = {}
            iteration_number = 0
            for label, count in label_count['all'].items():
                label_to_index[label] = iteration_number
                iteration_number += 1
                self.unique_labels.append(label)

        if self.verbose:
            print('self.unique_labels: {0}'.format(self.unique_labels))

        character_to_index = {}
        iteration_number = 0
        for character, count in character_count['all'].items():
            if iteration_number == self.PADDING_CHARACTER_INDEX:
                iteration_number += 1
            character_to_index[character] = iteration_number
            iteration_number += 1

        if self.verbose:
            print('token_count[\'train\'][0:10]: {0}'.format(
                list(token_count['train'].items())[0:10]))
        token_to_index = utils.order_dictionary(token_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose: print('token_to_index: {0}'.format(token_to_index))
        index_to_token = utils.reverse_dictionary(token_to_index)
        if parameters['remap_unknown_tokens_to_unk'] == 1:
            index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
        if self.verbose: print('index_to_token: {0}'.format(index_to_token))

        if self.verbose:
            print('label_count[\'train\']: {0}'.format(label_count['train']))
        label_to_index = utils.order_dictionary(label_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose: print('label_to_index: {0}'.format(label_to_index))
        index_to_label = utils.reverse_dictionary(label_to_index)
        if self.verbose: print('index_to_label: {0}'.format(index_to_label))

        character_to_index = utils.order_dictionary(character_to_index,
                                                    'value',
                                                    reverse=False)
        index_to_character = utils.reverse_dictionary(character_to_index)
        if self.verbose:
            print('character_to_index: {0}'.format(character_to_index))
        if self.verbose:
            print('index_to_character: {0}'.format(index_to_character))

        if self.verbose:
            print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
        if self.verbose:
            print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))

        if self.verbose:
            # Print sequences of length 1 in train set
            for token_sequence, label_sequence in zip(tokens['train'],
                                                      labels['train']):
                if len(label_sequence) == 1 and label_sequence[0] != 'O':
                    print("{0}\t{1}".format(token_sequence[0],
                                            label_sequence[0]))

        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.index_to_character = index_to_character
        self.character_to_index = character_to_index
        self.index_to_label = index_to_label
        self.label_to_index = label_to_index
        if self.verbose:
            print("len(self.token_to_index): {0}".format(
                len(self.token_to_index)))
        if self.verbose:
            print("len(self.index_to_token): {0}".format(
                len(self.index_to_token)))
        self.tokens = tokens
        self.labels = labels

        token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(
            dataset_filepaths.keys())

        self.token_indices = token_indices
        self.label_indices = label_indices
        self.character_indices_padded = character_indices_padded
        self.character_indices = character_indices
        self.token_lengths = token_lengths
        self.characters = characters
        self.label_vector_indices = label_vector_indices

        self.number_of_classes = max(self.index_to_label.keys()) + 1
        self.vocabulary_size = max(self.index_to_token.keys()) + 1
        self.alphabet_size = max(self.index_to_character.keys()) + 1
        if self.verbose:
            print("self.number_of_classes: {0}".format(self.number_of_classes))
        if self.verbose:
            print("self.alphabet_size: {0}".format(self.alphabet_size))
        if self.verbose:
            print("self.vocabulary_size: {0}".format(self.vocabulary_size))

        # unique_labels_of_interest is used to compute F1-scores.
        self.unique_labels_of_interest = list(self.unique_labels)
        self.unique_labels_of_interest.remove('O')

        self.unique_label_indices_of_interest = []
        for lab in self.unique_labels_of_interest:
            self.unique_label_indices_of_interest.append(label_to_index[lab])

        self.infrequent_token_indices = infrequent_token_indices

        if self.verbose:
            print('self.unique_labels_of_interest: {0}'.format(
                self.unique_labels_of_interest))
        if self.verbose:
            print('self.unique_label_indices_of_interest: {0}'.format(
                self.unique_label_indices_of_interest))

        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))

        return token_to_vector
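Examples #4 through #8 also lean on utils.order_dictionary with modes 'key', 'value', and 'value_key'. That helper is likewise not shown on this page; a plausible sketch, assuming it returns an OrderedDict sorted by key, by value, or by value then key:

import collections

def order_dictionary(dictionary, mode, reverse=False):
    """Return an OrderedDict sorted by 'key', 'value', or 'value_key'."""
    if mode == 'key':
        key_function = lambda item: item[0]
    elif mode == 'value':
        key_function = lambda item: item[1]
    elif mode == 'value_key':
        key_function = lambda item: (item[1], item[0])
    else:
        raise ValueError("unknown mode: {0}".format(mode))
    return collections.OrderedDict(
        sorted(dictionary.items(), key=key_function, reverse=reverse))

# Illustrative usage: most frequent tokens first, ties broken by token.
counts = {'the': 120, 'cat': 3, 'sat': 3}
print(order_dictionary(counts, 'value_key', reverse=True))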
Example #5
    def load_dataset(self, dataset_filepaths, parameters):
        '''
        args:
        dataset_filepaths : dictionary with keys 'train', 'valid', 'test'
        http://stackoverflow.com/questions/27416164/what-is-conll-data-format
        '''
        all_pretrained_tokens = None
        if parameters['token_pretrained_embedding_filepath'] != '':
            all_pretrained_tokens = utils_nlp.load_tokens_from_pretrained_token_embeddings(
                parameters)
        if self.verbose:
            print("len(all_pretrained_tokens): {0}".format(
                len(all_pretrained_tokens)))

        remap_to_unk_count_threshold = 1
        #if ['train'] not in dataset_filepaths.keys(): raise ValueError('')
        UNK_TOKEN_INDEX = 0
        PADDING_CHARACTER_INDEX = 0
        self.UNK = 'UNK'
        self.unique_labels = []
        labels = {}
        tokens = {}
        characters = {}
        token_lengths = {}
        label_count = {}
        token_count = {}
        character_count = {}
        for dataset_type in ['train', 'valid', 'test']:
            labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], \
                character_count[dataset_type] = self._parse_dataset(dataset_filepaths[dataset_type],dataset_type)#,all_pretrained_tokens,token_count)
            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        token_count['all'] = {}  # utils.merge_dictionaries()
        for token in list(token_count['train'].keys()) + list(
                token_count['valid'].keys()) + list(
                    token_count['test'].keys()):
            token_count['all'][
                token] = token_count['train'][token] + token_count['valid'][
                    token] + token_count['test'][token]

        for dataset_type in ['train', 'valid', 'test']:
            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        character_count['all'] = {}  # utils.merge_dictionaries()
        for character in list(character_count['train'].keys()) + list(
                character_count['valid'].keys()) + list(
                    character_count['test'].keys()):
            character_count['all'][character] = character_count['train'][
                character] + character_count['valid'][
                    character] + character_count['test'][character]

        label_count['all'] = {}  # utils.merge_dictionaries()
        for character in list(label_count['train'].keys()) + list(
                label_count['valid'].keys()) + list(
                    label_count['test'].keys()):
            label_count['all'][
                character] = label_count['train'][character] + label_count[
                    'valid'][character] + label_count['test'][character]

        token_count['all'] = utils.order_dictionary(token_count['all'],
                                                    'value',
                                                    reverse=True)
        #label_count['train'] = utils.order_dictionary(label_count['train'], 'key', reverse = False)
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)
        label_count['train'] = utils.order_dictionary(label_count['train'],
                                                      'key',
                                                      reverse=False)
        character_count['all'] = utils.order_dictionary(character_count['all'],
                                                        'value',
                                                        reverse=True)
        if self.verbose:
            print('character_count[\'all\']: {0}'.format(
                character_count['all']))

        token_to_index = {}
        token_to_index[self.UNK] = UNK_TOKEN_INDEX
        iteration_number = 0
        number_of_unknown_tokens = 0
        #         if self.verbose: print("parameters['remove_unknown_tokens']: {0}".format(parameters['remove_unknown_tokens']))
        #         if self.verbose: print("len(token_count['train'].keys()): {0}".format(len(token_count['train'].keys())))
        for token, count in token_count['all'].items():
            if iteration_number == UNK_TOKEN_INDEX: iteration_number += 1

            if parameters['remove_unknown_tokens'] == 1 and \
                token_count['train'][token] == 0 and \
                (all_pretrained_tokens == None or \
                token not in all_pretrained_tokens and \
                token.lower() not in all_pretrained_tokens and \
                re.sub('\d', '0', token.lower()) not in all_pretrained_tokens):#all( [x not in all_pretrained_tokens for x in [ token, token.lower(), re.sub('\d', '0', token.lower()) ]]):

                #                         if self.verbose: print("token: {0}".format(token))
                #                         if self.verbose: print("token.lower(): {0}".format(token.lower()))
                #                         if self.verbose: print("re.sub('\d', '0', token.lower()): {0}".format(re.sub('\d', '0', token.lower())))
                #                         assert(token not in )
                #                         assert(token.lower() not in all_pretrained_tokens)
                #                         assert(re.sub('\d', '0', token.lower()) not in all_pretrained_tokens)
                token_to_index[token] = UNK_TOKEN_INDEX
                number_of_unknown_tokens += 1
            else:
                token_to_index[token] = iteration_number
                iteration_number += 1
        if self.verbose:
            print("number_of_unknown_tokens: {0}".format(
                number_of_unknown_tokens))
        #         0/0

        infrequent_token_indices = []
        for token, count in token_count['train'].items():
            if 0 < count <= remap_to_unk_count_threshold:
                infrequent_token_indices.append(token_to_index[token])
        if self.verbose:
            print("len(token_count['train']): {0}".format(
                len(token_count['train'])))
        if self.verbose:
            print("len(infrequent_token_indices): {0}".format(
                len(infrequent_token_indices)))

        label_to_index = {}
        iteration_number = 0
        #for label, count in label_count['train'].items():
        for label, count in label_count['all'].items():
            label_to_index[label] = iteration_number
            iteration_number += 1
            self.unique_labels.append(label)

        #for label, count in label_count['train'].items():
        #    self.unique_labels.append(label)

        if self.verbose:
            print('self.unique_labels: {0}'.format(self.unique_labels))

        character_to_index = {}
        iteration_number = 0
        for character, count in character_count['all'].items():
            if iteration_number == PADDING_CHARACTER_INDEX:
                iteration_number += 1
            character_to_index[character] = iteration_number
            iteration_number += 1

        if self.verbose:
            print('token_count[\'train\'][0:10]: {0}'.format(
                list(token_count['train'].items())[0:10]))
        token_to_index = utils.order_dictionary(token_to_index,
                                                'value',
                                                reverse=False)
        #if self.verbose: print('token_to_index[0:10]: {0}'.format(token_to_index[0:10]))
        index_to_token = utils.reverse_dictionary(token_to_index)
        if parameters['remove_unknown_tokens'] == 1:
            index_to_token[UNK_TOKEN_INDEX] = self.UNK
        #if self.verbose: print('index_to_token[0:10]: {0}'.format(index_to_token[0:10]))

        #if self.verbose: print('label_count[\'train\']: {0}'.format(label_count['train']))
        label_to_index = utils.order_dictionary(label_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose: print('label_to_index: {0}'.format(label_to_index))
        index_to_label = utils.reverse_dictionary(label_to_index)
        if self.verbose: print('index_to_label: {0}'.format(index_to_label))

        index_to_character = utils.reverse_dictionary(character_to_index)
        if self.verbose:
            print('character_to_index: {0}'.format(character_to_index))
        if self.verbose:
            print('index_to_character: {0}'.format(index_to_character))

        if self.verbose:
            print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
        if self.verbose:
            print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))

        # Map tokens and labels to their indices
        token_indices = {}
        label_indices = {}
        character_indices = {}
        character_indices_padded = {}
        for dataset_type in ['train', 'valid', 'test']:
            token_indices[dataset_type] = []
            characters[dataset_type] = []
            character_indices[dataset_type] = []
            token_lengths[dataset_type] = []
            character_indices_padded[dataset_type] = []
            for token_sequence in tokens[dataset_type]:
                token_indices[dataset_type].append(
                    [token_to_index[token] for token in token_sequence])
                characters[dataset_type].append(
                    [list(token) for token in token_sequence])
                character_indices[dataset_type].append(
                    [[character_to_index[character] for character in token]
                     for token in token_sequence])
                token_lengths[dataset_type].append(
                    [len(token) for token in token_sequence])

                longest_token_length_in_sequence = max(
                    token_lengths[dataset_type][-1])
                character_indices_padded[dataset_type].append([
                    utils.pad_list(temp_token_indices,
                                   longest_token_length_in_sequence,
                                   PADDING_CHARACTER_INDEX) for
                    temp_token_indices in character_indices[dataset_type][-1]
                ])

            label_indices[dataset_type] = []
            for label_sequence in labels[dataset_type]:
                label_indices[dataset_type].append(
                    [label_to_index[label] for label in label_sequence])

        if self.verbose:
            print('token_lengths[\'train\'][0][0:10]: {0}'.format(
                token_lengths['train'][0][0:10]))
        if self.verbose:
            print('characters[\'train\'][0][0:10]: {0}'.format(
                characters['train'][0][0:10]))
        if self.verbose:
            print('token_indices[\'train\'][0:10]: {0}'.format(
                token_indices['train'][0:10]))
        if self.verbose:
            print('label_indices[\'train\'][0:10]: {0}'.format(
                label_indices['train'][0:10]))
        if self.verbose:
            print('character_indices[\'train\'][0][0:10]: {0}'.format(
                character_indices['train'][0][0:10]))
        if self.verbose:
            print('character_indices_padded[\'train\'][0][0:10]: {0}'.format(
                character_indices_padded['train'][0][0:10]))

        #  Vectorize the labels
        # [Numpy 1-hot array](http://stackoverflow.com/a/42263603/395857)
        label_binarizer = sklearn.preprocessing.LabelBinarizer()
        label_binarizer.fit(range(max(index_to_label.keys()) + 1))
        label_vector_indices = {}
        for dataset_type in ['train', 'valid', 'test']:
            label_vector_indices[dataset_type] = []
            for label_indices_sequence in label_indices[dataset_type]:
                label_vector_indices[dataset_type].append(
                    label_binarizer.transform(label_indices_sequence))

        if self.verbose:
            print('label_vector_indices[\'train\'][0:2]: {0}'.format(
                label_vector_indices['train'][0:2]))

        if self.verbose:
            print('len(label_vector_indices[\'train\']): {0}'.format(
                len(label_vector_indices['train'])))
        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.token_indices = token_indices
        self.label_indices = label_indices
        self.character_indices_padded = character_indices_padded
        self.index_to_character = index_to_character
        self.character_to_index = character_to_index
        self.character_indices = character_indices
        self.token_lengths = token_lengths
        self.characters = characters
        self.tokens = tokens
        self.labels = labels
        self.label_vector_indices = label_vector_indices
        self.index_to_label = index_to_label
        self.label_to_index = label_to_index
        if self.verbose:
            print("len(self.token_to_index): {0}".format(
                len(self.token_to_index)))
        if self.verbose:
            print("len(self.index_to_token): {0}".format(
                len(self.index_to_token)))

        self.number_of_classes = max(self.index_to_label.keys()) + 1
        self.vocabulary_size = max(self.index_to_token.keys()) + 1
        self.alphabet_size = max(self.index_to_character.keys()) + 1
        if self.verbose:
            print("self.number_of_classes: {0}".format(self.number_of_classes))
        if self.verbose:
            print("self.alphabet_size: {0}".format(self.alphabet_size))
        if self.verbose:
            print("self.vocabulary_size: {0}".format(self.vocabulary_size))

        # unique_labels_of_interest is used to compute F1-scores.
        self.unique_labels_of_interest = list(self.unique_labels)
        self.unique_labels_of_interest.remove('O')

        self.unique_label_indices_of_interest = []
        for lab in self.unique_labels_of_interest:
            self.unique_label_indices_of_interest.append(label_to_index[lab])

        self.infrequent_token_indices = infrequent_token_indices

        if self.verbose:
            print('self.unique_labels_of_interest: {0}'.format(
                self.unique_labels_of_interest))
        if self.verbose:
            print('self.unique_label_indices_of_interest: {0}'.format(
                self.unique_label_indices_of_interest))
        print('Dataset formatting completed')
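Example #5 pads each token's character indices to the longest token in its sequence via utils.pad_list, which is not reproduced here. A minimal sketch, assuming it right-pads a list to a target length with a given padding element:

def pad_list(old_list, padding_size, padding_value):
    """Right-pad old_list with padding_value until it has padding_size elements."""
    return old_list + [padding_value] * (padding_size - len(old_list))

# Illustrative usage: pad the character indices of a 3-character token to length 5.
print(pad_list([3, 1, 20], 5, 0))   # -> [3, 1, 20, 0, 0]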
Example #6
    def load_dataset(self,
                     word_index=None,
                     tag_index=None,
                     char_index=None,
                     ner_index=None,
                     prefix_index=None,
                     suffix_index=None,
                     fgen=True):
        '''
        dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'
        '''
        start_time = time.time()
        if fgen:
            self.tokens_mapped_to_unk = []
            prefix_count, suffix_count, word_count, tag_count, char_count, self.sentence_list = self._parse_dataset(
                fgen)
            word_count = utils.order_dictionary(word_count,
                                                'value_key',
                                                reverse=True)
            tag_count = utils.order_dictionary(tag_count, 'key', reverse=False)
            char_count = utils.order_dictionary(char_count,
                                                'value',
                                                reverse=True)
            prefix_count = utils.order_dictionary(prefix_count,
                                                  'value_key',
                                                  reverse=True)
            suffix_count = utils.order_dictionary(suffix_count,
                                                  'value_key',
                                                  reverse=True)

            pid = 0
            self.prefix_index["-padding-"] = pid
            pid += 1
            for pre, count in prefix_count.items():
                self.prefix_index[pre] = pid
                pid += 1
            self.prefix_index['-UNK-'] = pid
            pickle.dump(self.prefix_index, open("prefix_index", 'wb'))

            sid = 0
            self.suffix_index["-padding-"] = sid
            sid += 1
            for suf, count in suffix_count.items():
                self.suffix_index[suf] = sid
                sid += 1
            self.suffix_index['-UNK-'] = sid
            pickle.dump(self.suffix_index, open("suffix_index", 'wb'))

            wid = 0
            self.word_index["-padding-"] = wid
            wid += 1
            for word, count in word_count.items():
                self.word_index[word] = wid
                wid += 1
            self.word_index['-UNK-'] = wid
            pickle.dump(self.word_index, open("word_index", 'wb'))

            tid = 0
            #self.tag_index["-padding-"] = tid
            #tid += 1
            for tag, count in tag_count.items():
                self.tag_index[tag] = tid
                tid += 1
            pickle.dump(self.tag_index, open("tag_index", 'wb'))

            cid = 0
            self.char_index["-padding-"] = cid
            cid += 1
            for char, count in char_count.items():
                self.char_index[char] = cid
                cid += 1
            self.char_index['-UNK-'] = cid
            pickle.dump(self.char_index, open("char_index", 'wb'))
        else:
            self.word_index = word_index
            self.tag_index = tag_index
            self.char_index = char_index
            self.prefix_index = prefix_index
            self.suffix_index = suffix_index
            _, _, _, _, _, self.sentence_list = self._parse_dataset(fgen)

        for name, sent_list in self.sentence_list.items():
            for sent in sent_list:
                sent.gen_id_list(self.word_index, self.tag_index,
                                 self.char_index)

                if not self.use_char:
                    sent.gen_sent_features(self.word_index, self.tag_index,
                                           self.prefix_index,
                                           self.suffix_index)

        self.number_of_classes = len(self.tag_index)
        self.vocabulary_size = len(self.word_index)
        if self.char_index is not None:
            self.alphabet_size = len(self.char_index)

        if not self.use_char:
            self.prefix_size = len(self.prefix_index)
            self.suffix_size = len(self.suffix_index)
        if self.char_index is not None:
            self.char_map = utils.reverse_dictionary(self.char_index)
        self.word_map = utils.reverse_dictionary(self.word_index)
        self.tag_map = utils.reverse_dictionary(self.tag_index)

        elapsed_time = time.time() - start_time
        print('loading dataset done ({0:.2f} seconds)'.format(elapsed_time))
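Examples #6 and #7 repeat the same pattern for every vocabulary: reserve index 0 for '-padding-', assign consecutive ids to the counted items, then append '-UNK-' at the end. A generic sketch of that pattern (the helper name build_index is hypothetical, not from the original code):

def build_index(count_dict, padding_token='-padding-', unk_token='-UNK-'):
    """Map items to consecutive ids, with padding at 0 and UNK last."""
    index = {padding_token: 0}
    next_id = 1
    for item in count_dict:          # assumed to be ordered already, e.g. by frequency
        index[item] = next_id
        next_id += 1
    index[unk_token] = next_id
    return index

# Illustrative usage:
word_index = build_index({'the': 120, 'cat': 3})
# -> {'-padding-': 0, 'the': 1, 'cat': 2, '-UNK-': 3}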
Example #7
  def load_dataset(self, word_index=None, tag_index=None, char_index=None, ner_index=None, prefix_index=None, suffix_index=None, fgen=True):
    '''
    dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'
    '''
    start_time = time.time()
    if fgen:
      self.tokens_mapped_to_unk = []
      prefix_count, suffix_count, word_count, tag_count, char_count, ner_count, self.sentence_list = self._parse_dataset(fgen)
      word_count = utils.order_dictionary(word_count, 'value_key', reverse = True)
      tag_count = utils.order_dictionary(tag_count, 'key', reverse = False)
      char_count = utils.order_dictionary(char_count, 'value', reverse = True)
      prefix_count = utils.order_dictionary(prefix_count, 'value_key', reverse = True)
      suffix_count = utils.order_dictionary(suffix_count, 'value_key', reverse = True)
      
      pid = 0
      self.prefix_index["-padding-"] = pid
      pid += 1
      for pre, count in prefix_count.items():
        self.prefix_index[pre] = pid
        pid += 1
      self.prefix_index['-UNK-'] = pid
      pickle.dump(self.prefix_index, open(os.path.join("lstm_ner_models", "ner_prefix_index"), 'wb'))

      sid = 0
      self.suffix_index["-padding-"] = sid
      sid += 1
      for suf, count in suffix_count.items():
        self.suffix_index[suf] = sid
        sid += 1
      self.suffix_index['-UNK-'] = sid
      pickle.dump(self.suffix_index, open(os.path.join("lstm_ner_models", "ner_suffix_index"), 'wb'))
      
      wid = 0
      self.word_index["-padding-"] = wid
      wid += 1
      for word, count in word_count.items():
        self.word_index[word] = wid
        wid += 1
        if count <=1:
          self.rare_words.append(word)
      self.word_index['-UNK-'] = wid
      pickle.dump(self.word_index, open(os.path.join("lstm_ner_models", "ner_word_index"), 'wb'))

      # Ensure that both B- and I- versions exist for each label
      labels_without_bio = set()
      for label, count in ner_count.items():
        new_label = utils.remove_bio_from_label_name(label)
        labels_without_bio.add(new_label)

      prefixes = ['B-', 'I-']
      nid = 0
      self.ner_index['O'] = nid
      nid += 1
      for label in labels_without_bio:
        if label == 'O':
          continue
        for prefix in prefixes:
          l = prefix + label
          self.ner_index[l] = nid
          nid += 1
      pickle.dump(self.ner_index, open(os.path.join("lstm_ner_models", "ner_index"), 'wb'))
      '''
      tid = 0
      self.tag_index["-padding-"] = tid
      tid += 1
      for tag, count in tag_count.items():
        self.tag_index[tag] = tid
        tid += 1
      pickle.dump(self.tag_index, open("ner_tag_index", 'wb'))
      '''
      cid = 0
      self.char_index["-padding-"] = cid
      cid += 1
      for char, count in char_count.items():
        self.char_index[char] = cid
        cid += 1
      self.char_index['-UNK-'] = cid
      pickle.dump(self.char_index, open(os.path.join("lstm_ner_models", "ner_char_index"), 'wb'))
    else:
      self.word_index = word_index
      self.tag_index = tag_index
      self.char_index = char_index
      self.ner_index = ner_index
      self.prefix_index = prefix_index
      self.suffix_index = suffix_index
      _, _, _, _, _, _, self.sentence_list = self._parse_dataset(fgen)

    for name, sent_list in self.sentence_list.items():
      for sent in sent_list:
        sent.gen_id_list(self.word_index, self.char_index, self.ner_index, None)
        if not self.use_char:
          sent.gen_sent_features(self.word_index, prefix_map=self.prefix_index, suffix_map=self.suffix_index)

    self.number_of_classes = len(self.ner_index)
    self.vocabulary_size = len(self.word_index)
    if self.char_index != None:
      self.alphabet_size = len(self.char_index)
    else:
      self.alphabet_size = 0
    #self.pos_classes = len(self.tag_index)
    self.number_of_boi = 3
    self.number_of_type = 4
    self.prefix_size = len(self.prefix_index)
    self.suffix_size = len(self.suffix_index)

    if self.char_index != None:
      self.char_map = utils.reverse_dictionary(self.char_index)
    self.word_map = utils.reverse_dictionary(self.word_index)
    #self.tag_map = utils.reverse_dictionary(self.tag_index)
    self.ner_map = utils.reverse_dictionary(self.ner_index)

    elapsed_time = time.time() - start_time
    print('loading dataset done ({0:.2f} seconds)'.format(elapsed_time))
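For reference, the ner_index built in Example #7 assigns 'O' to index 0 and then one B-/I- pair per entity type. A small worked example of the resulting mapping for two hypothetical entity types:

labels_without_bio = ['PER', 'LOC']     # hypothetical entity types
ner_index = {'O': 0}
nid = 1
for label in labels_without_bio:
    for prefix in ['B-', 'I-']:
        ner_index[prefix + label] = nid
        nid += 1
# ner_index -> {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-LOC': 3, 'I-LOC': 4}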
Example #8
    def load_dataset(self, dataset_filepaths, parameters):
        '''
        dataset_filepaths : dictionary with keys 'train', 'valid', 'test'
        '''
        start_time = time.time()
        pprint('Load dataset... ')
        # Load the pretraining dataset to ensure that the index-to-label mapping is compatible with the pretrained model,
        #   and that token embeddings learned by the pretrained model are loaded properly.
        all_tokens_in_pretraining_dataset = []
        if parameters['use_pretrained_model']:
            pretrained_model_folder = os.path.dirname(
                parameters['pretrained_model_checkpoint_filepath'])
            pretraining_dataset = pickle.load(
                open(os.path.join(pretrained_model_folder, 'dataset.pickle'),
                     'rb'))
            all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values(
            )
            self.vocab_embeddings = all_tokens_in_pretraining_dataset

        remap_to_unk_count_threshold = 1
        self.PADDING_CHARACTER_INDEX = 1
        self.PADDING_TOKEN_INDEX = 1
        self.UNK_TOKEN_INDEX = 0
        self.UNK_CHARACTER_INDEX = 0
        self.tokens_mapped_to_unk = []
        self.UNK = '<UNK>'
        self.PAD = '<PAD>'
        self.unique_labels = []
        labels = {}
        tokens = {}
        characters = {}
        token_lengths = {}
        sequence_lengths = {}
        longest_token_length_in_sequence = {}
        label_count = {}
        token_count = {}
        character_count = {}

        for dataset_type in ['train', 'valid', 'test']:
            labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \
                = self._parse_dataset(dataset_filepaths.get(dataset_type, None), parameters['language'], parameters['data_to_use'] if 'data_to_use' in parameters else None)

            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        token_count['all'] = {}
        for token in list(token_count['train'].keys()) + list(
                token_count['valid'].keys()) + list(
                    token_count['test'].keys()):
            token_count['all'][
                token] = token_count['train'][token] + token_count['valid'][
                    token] + token_count['test'][token]

        for dataset_type in dataset_filepaths.keys():
            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        character_count['all'] = {}
        for character in list(character_count['train'].keys()) + list(
                character_count['valid'].keys()) + list(
                    character_count['test'].keys()):
            character_count['all'][character] = character_count['train'][
                character] + character_count['valid'][
                    character] + character_count['test'][character]

        label_count['all'] = {}
        for character in list(label_count['train'].keys()) + list(
                label_count['valid'].keys()) + list(
                    label_count['test'].keys()):
            label_count['all'][
                character] = label_count['train'][character] + label_count[
                    'valid'][character] + label_count['test'][character]

        token_count['all'] = utils.order_dictionary(token_count['all'],
                                                    'value_key',
                                                    reverse=True)
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)
        character_count['all'] = utils.order_dictionary(character_count['all'],
                                                        'value',
                                                        reverse=True)
        if self.verbose:
            print('character_count[\'all\']: {0}'.format(
                character_count['all']))

        token_to_index = {}
        token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
        token_to_index[self.PAD] = self.PADDING_TOKEN_INDEX
        iteration_number = 0
        number_of_unknown_tokens = 0
        if self.verbose:
            print("parameters['remap_unknown_tokens_to_unk']: {0}".format(
                parameters['remap_unknown_tokens_to_unk']))
        if self.verbose:
            print("len(token_count['train'].keys()): {0}".format(
                len(token_count['train'].keys())))
        for token, count in token_count['all'].items():
            if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1
            if iteration_number == self.PADDING_TOKEN_INDEX:
                iteration_number += 1
            if parameters['remap_unknown_tokens_to_unk'] == 1 and \
                (token_count['train'][token] == 0 or \
                parameters['load_only_pretrained_token_embeddings']) and \
                not utils_nlp.is_token_in_pretrained_embeddings(token, self.vocab_embeddings, parameters) and \
                token not in all_tokens_in_pretraining_dataset:
                if self.verbose: print("token: {0}".format(token))
                if self.verbose:
                    print("token.lower(): {0}".format(token.lower()))
                if self.verbose:
                    print("re.sub('\d', '0', token.lower()): {0}".format(
                        re.sub('\d', '0', token.lower())))
                token_to_index[token] = iteration_number
                iteration_number += 1
                #if parameters['embedding_type'] == 'fasttext':
                #    token_to_index[token] = iteration_number
                #    iteration_number += 1
                #else:
                #    token_to_index[token] =  self.UNK_TOKEN_INDEX
                #    number_of_unknown_tokens += 1
                #    self.tokens_mapped_to_unk.append(token)
            else:
                token_to_index[token] = iteration_number
                iteration_number += 1
        if self.verbose:
            print("number_of_unknown_tokens: {0}".format(
                number_of_unknown_tokens))

        infrequent_token_indices = []
        for token, count in token_count['train'].items():
            if 0 < count <= remap_to_unk_count_threshold:
                infrequent_token_indices.append(token_to_index[token])
        if self.verbose:
            print("len(token_count['train']): {0}".format(
                len(token_count['train'])))
        if self.verbose:
            print("len(infrequent_token_indices): {0}".format(
                len(infrequent_token_indices)))

        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)

        if parameters['use_pretrained_model']:
            self.unique_labels = sorted(
                list(pretraining_dataset.label_to_index.keys()))
            # Make sure labels are compatible with the pretraining dataset.
            for label in label_count['all']:
                if label not in pretraining_dataset.label_to_index:
                    raise AssertionError(
                        "The label {0} does not exist in the pretraining dataset. "
                        .format(label) +
                        "Please ensure that only the following labels exist in the dataset: {0}"
                        .format(', '.join(self.unique_labels)))
            label_to_index = pretraining_dataset.label_to_index.copy()
        else:
            label_to_index = {}
            iteration_number = 0
            for label, count in label_count['all'].items():
                label_to_index[label] = iteration_number
                iteration_number += 1
                self.unique_labels.append(label)
        self.PADDING_LABEL_INDEX = label_to_index['O']

        if self.verbose:
            print('self.unique_labels: {0}'.format(self.unique_labels))

        character_to_index = {}
        character_to_index[self.UNK] = self.UNK_CHARACTER_INDEX

        if parameters['use_pretrained_model']:
            # TODO: initialize character_to_index from saved pickle
            character_to_index = pretraining_dataset.character_to_index.copy()
        else:
            character_to_index[self.PAD] = self.PADDING_CHARACTER_INDEX
            iteration_number = 0
            for character, count in character_count['all'].items():
                if iteration_number == self.UNK_CHARACTER_INDEX:
                    iteration_number += 1
                if iteration_number == self.PADDING_CHARACTER_INDEX:
                    iteration_number += 1
                character_to_index[character] = iteration_number
                iteration_number += 1

        if self.verbose:
            print('token_count[\'train\'][0:10]: {0}'.format(
                list(token_count['train'].items())[0:10]))
        token_to_index = utils.order_dictionary(token_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose: print('token_to_index: {0}'.format(token_to_index))
        index_to_token = utils.reverse_dictionary(token_to_index)
        if parameters['remap_unknown_tokens_to_unk'] == 1:
            index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
        if self.verbose: print('index_to_token: {0}'.format(index_to_token))

        if self.verbose:
            print('label_count[\'train\']: {0}'.format(label_count['train']))
        label_to_index = utils.order_dictionary(label_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose: print('label_to_index: {0}'.format(label_to_index))
        index_to_label = utils.reverse_dictionary(label_to_index)
        if self.verbose: print('index_to_label: {0}'.format(index_to_label))

        character_to_index = utils.order_dictionary(character_to_index,
                                                    'value',
                                                    reverse=False)
        index_to_character = utils.reverse_dictionary(character_to_index)
        if self.verbose:
            print('character_to_index: {0}'.format(character_to_index))
        if self.verbose:
            print('index_to_character: {0}'.format(index_to_character))

        if self.verbose:
            print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
        if self.verbose:
            print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))

        if self.verbose:
            # Print sequences of length 1 in train set
            for token_sequence, label_sequence in zip(tokens['train'],
                                                      labels['train']):
                if len(label_sequence) == 1 and label_sequence[0] != 'O':
                    print("{0}\t{1}".format(token_sequence[0],
                                            label_sequence[0]))

        # Map tokens and labels to their indices
        token_indices = {}
        label_indices = {}
        character_indices = {}
        #character_indices_padded = {}
        for dataset_type in dataset_filepaths.keys():
            token_indices[dataset_type] = []
            characters[dataset_type] = []
            character_indices[dataset_type] = []
            token_lengths[dataset_type] = []
            sequence_lengths[dataset_type] = []
            longest_token_length_in_sequence[dataset_type] = []
            #character_indices_padded[dataset_type] = []
            for token_sequence in tokens[dataset_type]:
                token_indices[dataset_type].append(
                    [token_to_index[token] for token in token_sequence])
                characters[dataset_type].append(
                    [list(token) for token in token_sequence])
                character_indices[dataset_type].append(
                    [[character_to_index[character] for character in token]
                     for token in token_sequence])
                token_lengths[dataset_type].append(
                    [len(token) for token in token_sequence])
                sequence_lengths[dataset_type].append(len(token_sequence))
                longest_token_length_in_sequence[dataset_type].append(
                    max(token_lengths[dataset_type][-1]))

                #character_indices_padded[dataset_type].append([ utils.pad_list(temp_token_indices, longest_token_length_in_sequence, self.PADDING_CHARACTER_INDEX)
                #                                                for temp_token_indices in character_indices[dataset_type][-1]])

            label_indices[dataset_type] = []
            for label_sequence in labels[dataset_type]:
                label_indices[dataset_type].append(
                    [label_to_index[label] for label in label_sequence])

        if self.verbose:
            print('token_lengths[\'train\'][0][0:10]: {0}'.format(
                token_lengths['train'][0][0:10]))
        if self.verbose:
            print('characters[\'train\'][0][0:10]: {0}'.format(
                characters['train'][0][0:10]))
        if self.verbose:
            print('token_indices[\'train\'][0:10]: {0}'.format(
                token_indices['train'][0:10]))
        if self.verbose:
            print('label_indices[\'train\'][0:10]: {0}'.format(
                label_indices['train'][0:10]))
        if self.verbose:
            print('character_indices[\'train\'][0][0:10]: {0}'.format(
                character_indices['train'][0][0:10]))
        #if self.verbose: print('character_indices_padded[\'train\'][0][0:10]: {0}'.format(character_indices_padded['train'][0][0:10]))

        label_vector_indices = {}
        tmp_vector = [0] * len(self.unique_labels)
        tmp_vector[label_to_index["O"]] = 1
        self.PADDING_LABEL_VECTOR = tmp_vector
        for dataset_type in dataset_filepaths.keys():
            label_vector_indices[dataset_type] = []
            for label_indices_sequence in label_indices[dataset_type]:
                vector_sequence = []
                for indice in label_indices_sequence:
                    vector = [0] * len(self.unique_labels)
                    vector[indice] = 1
                    vector_sequence.append(vector)
                label_vector_indices[dataset_type].append(vector_sequence)

        if self.verbose:
            print('label_vector_indices[\'train\'][0:2]: {0}'.format(
                label_vector_indices['train'][0:2]))

        if self.verbose:
            print('len(label_vector_indices[\'train\']): {0}'.format(
                len(label_vector_indices['train'])))
        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.token_indices = token_indices
        self.label_indices = label_indices
        #self.character_indices_padded = character_indices_padded
        self.index_to_character = index_to_character
        self.character_to_index = character_to_index
        self.character_indices = character_indices
        self.token_lengths = token_lengths
        self.sequence_lengths = sequence_lengths
        self.longest_token_length_in_sequence = longest_token_length_in_sequence
        self.characters = characters
        self.tokens = tokens
        self.labels = labels
        self.label_vector_indices = label_vector_indices
        self.index_to_label = index_to_label
        self.label_to_index = label_to_index
        if self.verbose:
            print("len(self.token_to_index): {0}".format(
                len(self.token_to_index)))
        if self.verbose:
            print("len(self.index_to_token): {0}".format(
                len(self.index_to_token)))

        self.number_of_classes = len(self.unique_labels)
        self.vocabulary_size = len(self.index_to_token) if len(
            self.index_to_token) > 100000 else 100000
        self.alphabet_size = len(self.character_to_index)
        if self.verbose:
            print("self.number_of_classes: {0}".format(self.number_of_classes))
        if self.verbose:
            print("self.alphabet_size: {0}".format(self.alphabet_size))
        if self.verbose:
            print("self.vocabulary_size: {0}".format(self.vocabulary_size))

        # unique_labels_of_interest is used to compute F1-scores.
        self.unique_labels_of_interest = list(self.unique_labels)
        self.unique_labels_of_interest.remove('O')

        self.unique_label_indices_of_interest = []
        for lab in self.unique_labels_of_interest:
            self.unique_label_indices_of_interest.append(label_to_index[lab])

        self.infrequent_token_indices = infrequent_token_indices

        if self.verbose:
            print('self.unique_labels_of_interest: {0}'.format(
                self.unique_labels_of_interest))
        if self.verbose:
            print('self.unique_label_indices_of_interest: {0}'.format(
                self.unique_label_indices_of_interest))

        print(self.label_to_index)
        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))
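A quick worked illustration of the one-hot label vectors built above, using made-up labels (the real label set comes from the dataset, not from this sketch):

# Hedged sketch: three illustrative labels and the same one-hot construction as above.
unique_labels = ['B-PER', 'I-PER', 'O']
label_to_index = {label: index for index, label in enumerate(unique_labels)}
label_indices_sequence = [label_to_index[label] for label in ['B-PER', 'I-PER', 'O']]
vector_sequence = []
for indice in label_indices_sequence:
    vector = [0] * len(unique_labels)
    vector[indice] = 1
    vector_sequence.append(vector)
# vector_sequence == [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
# PADDING_LABEL_VECTOR would be the one-hot vector of 'O', i.e. [0, 0, 1].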
Exemplo n.º 9
0
    def load_dataset(self, dataset_filepaths, parameters):
        '''
        dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'
        '''
        start_time = time.time()
        print('Load dataset... ', end='', flush=True)
        all_pretrained_tokens = []
        if parameters['token_pretrained_embedding_filepath'] != '':
            all_pretrained_tokens = utils_nlp.load_tokens_from_pretrained_token_embeddings(
                parameters)
        if self.verbose:
            print("len(all_pretrained_tokens): {0}".format(
                len(all_pretrained_tokens)))

        all_tokens_in_pretraining_dataset = []
        if parameters['use_pretrained_model']:
            pretraining_dataset = pickle.load(
                open(
                    os.path.join(parameters['pretrained_model_folder'],
                                 'dataset.pickle'), 'rb'))
            all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values(
            )

        remap_to_unk_count_threshold = 1
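        # Training-set tokens seen at most this many times are collected below in infrequent_token_indices.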
        self.UNK_TOKEN_INDEX = 0
        self.PADDING_CHARACTER_INDEX = 0
        self.tokens_mapped_to_unk = []
        self.UNK = 'UNK'
        self.unique_labels = []
        labels = {}
        tokens = {}
        if parameters['use_corrector']:
            labels_corrector = {}
            label_corrector_count = {}
            self.unique_labels_corrector = []
            if parameters['include_pos']:
                labels_pos = {}
                label_pos_count = {}
                self.unique_labels_pos = []
        characters = {}
        token_lengths = {}
        label_count = {}
        token_count = {}
        character_count = {}

        for dataset_type in ['train', 'valid', 'test', 'deploy']:
            # print("what am i getting?? {:s}".format(str(dataset_filepaths.get(dataset_type, None))))
            if parameters['use_corrector']:
                if parameters['include_pos']:
                    labels_pos[dataset_type], labels_corrector[dataset_type], labels[dataset_type], \
                    tokens[dataset_type], token_count[dataset_type], label_pos_count[dataset_type], \
                    label_corrector_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \
                        = self._parse_dataset(dataset_filepaths.get(dataset_type, None),
                                              use_corrector=True, include_pos=True, tagging_format=parameters['tagging_format'])
                else:
                    labels_corrector[dataset_type], labels[dataset_type], tokens[dataset_type], \
                    token_count[dataset_type], label_corrector_count[dataset_type], label_count[dataset_type], \
                    character_count[dataset_type] \
                        = self._parse_dataset(dataset_filepaths.get(dataset_type, None), use_corrector=True,
                                              tagging_format=parameters['tagging_format'])

            else:
                labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], \
                character_count[dataset_type] \
                    = self._parse_dataset(dataset_filepaths.get(dataset_type, None), tagging_format=parameters['tagging_format'])

            if self.verbose:
                print("len(token_count[{1}]): {0}".format(
                    len(token_count[dataset_type]), dataset_type))
        # sys.exit(0)
        token_count['all'] = {}
        for token in list(token_count['train'].keys()) + list(
                token_count['valid'].keys()) + list(
                    token_count['test'].keys()) + list(
                        token_count['deploy'].keys()):
            token_count['all'][token] = token_count['train'][
                token] + token_count['valid'][token] + token_count['test'][
                    token] + token_count['deploy'][token]

        if self.verbose:
            print("len(token_count[all]): {0}".format(len(token_count['all'])))

        for dataset_type in dataset_filepaths.keys():
            if self.verbose:
                print("len(token_count[{1}]): {0}".format(
                    len(token_count[dataset_type]), dataset_type))

        character_count['all'] = {}
        for character in list(character_count['train'].keys()) + list(
                character_count['valid'].keys()) + list(
                    character_count['test'].keys()) + list(
                        character_count['deploy'].keys()):
            character_count['all'][character] = character_count['train'][
                character] + character_count['valid'][
                    character] + character_count['test'][
                        character] + character_count['deploy'][character]

        label_count['all'] = {}
        for character in list(label_count['train'].keys()) + list(
                label_count['valid'].keys()) + list(
                    label_count['test'].keys()) + list(
                        label_count['deploy'].keys()):
            label_count['all'][character] = label_count['train'][character] + label_count['valid'][character] + \
                                            label_count['test'][character] + label_count['deploy'][character]
        if parameters['use_corrector']:
            label_corrector_count['all'] = {}
            for label in list(label_corrector_count['train'].keys()) + list(
                    label_corrector_count['valid'].keys()) + list(
                        label_corrector_count['test'].keys()) + list(
                            label_corrector_count['deploy'].keys()):
                label_corrector_count['all'][label] = label_corrector_count['train'][label] + label_corrector_count['valid'][label] + \
                                                      label_corrector_count['test'][label] + label_corrector_count['deploy'][label]
            label_corrector_count['all'] = utils.order_dictionary(
                label_corrector_count['all'], 'key', reverse=False)

            if parameters['include_pos']:
                label_pos_count['all'] = {}
                for label in list(label_pos_count['train'].keys()) + list(
                        label_pos_count['valid'].keys()) + list(
                            label_pos_count['test'].keys()) + list(
                                label_pos_count['deploy'].keys()):
                    label_pos_count['all'][label] = label_pos_count['train'][label] + label_pos_count['valid'][label] + \
                                                    label_pos_count['test'][label] + label_pos_count['deploy'][label]
                label_pos_count['all'] = utils.order_dictionary(
                    label_pos_count['all'], 'key', reverse=False)

        token_count['all'] = utils.order_dictionary(token_count['all'],
                                                    'value_key',
                                                    reverse=True)
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)
        character_count['all'] = utils.order_dictionary(character_count['all'],
                                                        'value',
                                                        reverse=True)
        if self.verbose:
            print('character_count[\'all\']: {0}'.format(
                character_count['all']))

        token_to_index = {}
        token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
        iteration_number = 0
        number_of_unknown_tokens = 0
        if self.verbose:
            print("parameters['remap_unknown_tokens_to_unk']: {0}".format(
                parameters['remap_unknown_tokens_to_unk']))
        if self.verbose:
            print("len(token_count['train'].keys()): {0}".format(
                len(token_count['train'].keys())))
        for token, count in token_count['all'].items():
            if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1

            if parameters['remap_unknown_tokens_to_unk'] == 1 and \
                    (token_count['train'][token] == 0 or \
                             parameters['load_only_pretrained_token_embeddings']) and \
                    not utils_nlp.is_token_in_pretrained_embeddings(token, all_pretrained_tokens, parameters) and \
                            token not in all_tokens_in_pretraining_dataset:
                token_to_index[token] = self.UNK_TOKEN_INDEX
                number_of_unknown_tokens += 1
                self.tokens_mapped_to_unk.append(token)
            else:
                token_to_index[token] = iteration_number
                iteration_number += 1
        if self.verbose:
            print("number_of_unknown_tokens: {0}".format(
                number_of_unknown_tokens))

        infrequent_token_indices = []
        for token, count in token_count['train'].items():
            if 0 < count <= remap_to_unk_count_threshold:
                infrequent_token_indices.append(token_to_index[token])
        if self.verbose:
            print("len(token_count['train']): {0}".format(
                len(token_count['train'])))
        if self.verbose:
            print("len(infrequent_token_indices): {0}".format(
                len(infrequent_token_indices)))

        # Ensure that both B- and I- versions exist for each label
        labels_without_bio = set()
        for label in label_count['all'].keys():
            new_label = utils_nlp.remove_bio_from_label_name(label)
            labels_without_bio.add(new_label)
        for label in labels_without_bio:
            if label == 'O':
                continue
            if parameters['tagging_format'] == 'bioes':
                prefixes = ['B-', 'I-', 'E-', 'S-']
            else:
                prefixes = ['B-', 'I-']
            for prefix in prefixes:
                l = prefix + label
                if l not in label_count['all']:
                    label_count['all'][l] = 0
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)

        if parameters['use_pretrained_model'] and not parameters['add_class']:
            self.unique_labels = sorted(
                list(pretraining_dataset.label_to_index.keys()))
            # Make sure labels are compatible with the pretraining dataset.
            for label in label_count['all']:
                if label not in pretraining_dataset.label_to_index:
                    raise AssertionError(
                        "The label {0} does not exist in the pretraining dataset. "
                        .format(label) +
                        "Please ensure that only the following labels exist in the dataset: {0}"
                        .format(', '.join(self.unique_labels)))
            label_to_index = pretraining_dataset.label_to_index.copy()

        elif parameters['use_pretrained_model'] and parameters['add_class']:
            # make sure that the added labels are mapped to the end of the dictionary
            print('Adding new label-index pair to label_to_index dictionary')
            old_label_to_index = pretraining_dataset.label_to_index.copy()
            for label, count in label_count['all'].items():
                if label not in old_label_to_index.keys():
                    old_label_to_index[label] = len(old_label_to_index.keys())
            label_to_index = old_label_to_index.copy()

            self.unique_labels = list(label_to_index.keys())
        else:
            label_to_index = {}
            iteration_number = 0
            for label, count in label_count['all'].items():
                label_to_index[label] = iteration_number
                iteration_number += 1
                self.unique_labels.append(label)
        if parameters['use_corrector']:
            label_corrector_to_index = {}
            self.unique_labels_corrector = list(
                label_corrector_count['all'].keys())
            for n, label in enumerate(self.unique_labels_corrector):
                label_corrector_to_index[label] = n
            if parameters['include_pos']:
                label_pos_to_index = {}
                self.unique_labels_pos = list(label_pos_count['all'].keys())
                for n, pos in enumerate(self.unique_labels_pos):
                    label_pos_to_index[pos] = n

        if self.verbose:
            print('self.unique_labels: {0}'.format(self.unique_labels))

        character_to_index = {}
        iteration_number = 0
        for character, count in character_count['all'].items():
            if iteration_number == self.PADDING_CHARACTER_INDEX:
                iteration_number += 1
            character_to_index[character] = iteration_number
            iteration_number += 1

        if self.verbose:
            print('token_count[\'train\'][0:10]: {0}'.format(
                list(token_count['train'].items())[0:10]))
        token_to_index = utils.order_dictionary(token_to_index,
                                                'value',
                                                reverse=False)
        index_to_token = utils.reverse_dictionary(token_to_index)
        if parameters['remap_unknown_tokens_to_unk'] == 1:
            index_to_token[self.UNK_TOKEN_INDEX] = self.UNK

        if self.verbose:
            print('label_count[\'train\']: {0}'.format(label_count['train']))
        label_to_index = utils.order_dictionary(label_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose: print('label_to_index: {0}'.format(label_to_index))
        index_to_label = utils.reverse_dictionary(label_to_index)
        if self.verbose: print('index_to_label: {0}'.format(index_to_label))

        if parameters['use_corrector']:
            label_corrector_to_index = utils.order_dictionary(
                label_corrector_to_index, 'value', reverse=False)
            index_to_label_corrector = utils.reverse_dictionary(
                label_corrector_to_index)
            if parameters['include_pos']:
                label_pos_to_index = utils.order_dictionary(label_pos_to_index,
                                                            'value',
                                                            reverse=False)
                index_to_label_pos = utils.reverse_dictionary(
                    label_pos_to_index)

        character_to_index = utils.order_dictionary(character_to_index,
                                                    'value',
                                                    reverse=False)
        index_to_character = utils.reverse_dictionary(character_to_index)
        if self.verbose:
            print('character_to_index: {0}'.format(character_to_index))
        if self.verbose:
            print('index_to_character: {0}'.format(index_to_character))

        if self.verbose:
            print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
        if self.verbose:
            print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))

        if self.verbose:
            # Print sequences of length 1 in train set
            for token_sequence, label_sequence in zip(tokens['train'],
                                                      labels['train']):
                if len(label_sequence) == 1 and label_sequence[0] != 'O':
                    print("{0}\t{1}".format(token_sequence[0],
                                            label_sequence[0]))

        token_indices = {}
        label_indices = {}
        if parameters['use_corrector']:
            label_indices_corrector = {}
            if parameters['include_pos']:
                label_indices_pos = {}

        character_indices = {}
        character_indices_padded = {}
        for dataset_type in dataset_filepaths.keys():
            # print("dataset_type: {:s}".format(dataset_type))
            token_indices[dataset_type] = []
            characters[dataset_type] = []
            character_indices[dataset_type] = []
            token_lengths[dataset_type] = []
            character_indices_padded[dataset_type] = []

            for token_sequence in tokens[dataset_type]:
                token_indices[dataset_type].append(
                    [token_to_index[token] for token in token_sequence])
                characters[dataset_type].append(
                    [list(token) for token in token_sequence])
                character_indices[dataset_type].append(
                    [[character_to_index[character] for character in token]
                     for token in token_sequence])
                token_lengths[dataset_type].append(
                    [len(token) for token in token_sequence])

                longest_token_length_in_sequence = max(
                    token_lengths[dataset_type][-1])
                character_indices_padded[dataset_type].append([
                    utils.pad_list(temp_token_indices,
                                   longest_token_length_in_sequence,
                                   self.PADDING_CHARACTER_INDEX) for
                    temp_token_indices in character_indices[dataset_type][-1]
                ])
            label_indices[dataset_type] = []
            for label_sequence in labels[dataset_type]:
                label_indices[dataset_type].append(
                    [label_to_index[label] for label in label_sequence])
            if parameters['use_corrector']:
                label_indices_corrector[dataset_type] = []
                for label_sequence_corrector in labels_corrector[dataset_type]:
                    label_indices_corrector[dataset_type].append([
                        label_corrector_to_index[label]
                        for label in label_sequence_corrector
                    ])
                if parameters['include_pos']:
                    label_indices_pos[dataset_type] = []
                    for label_sequence_pos in labels_pos[dataset_type]:
                        label_indices_pos[dataset_type].append([
                            label_pos_to_index[label]
                            for label in label_sequence_pos
                        ])

        if self.verbose:
            print('token_lengths[\'train\'][0][0:10]: {0}'.format(
                token_lengths['train'][0][0:10]))
        if self.verbose:
            print('characters[\'train\'][0][0:10]: {0}'.format(
                characters['train'][0][0:10]))
        if self.verbose:
            print('token_indices[\'train\'][0:10]: {0}'.format(
                token_indices['train'][0:10]))
        if self.verbose:
            print('label_indices[\'train\'][0:10]: {0}'.format(
                label_indices['train'][0:10]))
        if self.verbose:
            print('character_indices[\'train\'][0][0:10]: {0}'.format(
                character_indices['train'][0][0:10]))
        if self.verbose:
            print('character_indices_padded[\'train\'][0][0:10]: {0}'.format(
                character_indices_padded['train'][0][0:10]))

        label_binarizer = sklearn.preprocessing.LabelBinarizer()
        label_binarizer.fit(range(max(index_to_label.keys()) + 1))
        label_vector_indices = {}

        for dataset_type in dataset_filepaths.keys():
            label_vector_indices[dataset_type] = []
            for label_indices_sequence in label_indices[dataset_type]:
                label_vector_indices[dataset_type].append(
                    label_binarizer.transform(label_indices_sequence))

        if parameters['use_corrector']:
            label_binarizer_corrector = sklearn.preprocessing.LabelBinarizer()
            label_binarizer_corrector.fit(
                range(max(index_to_label_corrector.keys()) + 1))
            label_corrector_vector_indices = {}
            for dataset_type in dataset_filepaths.keys():
                label_corrector_vector_indices[dataset_type] = []
                for label_indices_sequence in label_indices_corrector[
                        dataset_type]:
                    label_corrector_vector_indices[dataset_type].append(
                        label_binarizer_corrector.transform(
                            label_indices_sequence))
            if parameters['include_pos']:
                label_binarizer_pos = sklearn.preprocessing.LabelBinarizer()
                label_binarizer_pos.fit(
                    range(max(index_to_label_pos.keys()) + 1))
                label_pos_vector_indices = {}
                for dataset_type in dataset_filepaths.keys():
                    label_pos_vector_indices[dataset_type] = []
                    for label_indices_sequence in label_indices_pos[
                            dataset_type]:
                        label_pos_vector_indices[dataset_type].append(
                            label_binarizer_pos.transform(
                                label_indices_sequence))
        if self.verbose:
            print('label_vector_indices[\'train\'][0:2]: {0}'.format(
                label_vector_indices['train'][0:2]))

        if self.verbose:
            print('len(label_vector_indices[\'train\']): {0}'.format(
                len(label_vector_indices['train'])))
        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.token_indices = token_indices
        self.label_indices = label_indices
        self.character_indices_padded = character_indices_padded
        self.index_to_character = index_to_character
        self.character_to_index = character_to_index
        self.character_indices = character_indices
        self.token_lengths = token_lengths
        self.characters = characters
        self.tokens = tokens
        self.labels = labels
        self.label_vector_indices = label_vector_indices
        self.index_to_label = index_to_label
        self.label_to_index = label_to_index
        if parameters['use_corrector']:
            self.index_to_label_corrector = index_to_label_corrector
            self.label_corrector_to_index = label_corrector_to_index
            self.label_indices_corrector = label_indices_corrector
            self.label_corrector_vector_indices = label_corrector_vector_indices
            if parameters['include_pos']:
                self.index_to_label_pos = index_to_label_pos
                self.label_pos_to_index = label_pos_to_index
                self.label_indices_pos = label_indices_pos
                self.label_pos_vector_indices = label_pos_vector_indices
        if self.verbose:
            print("len(self.token_to_index): {0}".format(
                len(self.token_to_index)))
        if self.verbose:
            print("len(self.index_to_token): {0}".format(
                len(self.index_to_token)))

        if parameters['add_class'] and parameters[
                'tagging_format'] == 'bioes' and len(
                    self.index_to_label) > 100:
            self.number_of_classes = max(self.index_to_label.keys()) + 1 - 8
        elif parameters['add_class'] and parameters[
                'tagging_format'] == 'bioes':
            print('here')
            self.number_of_classes = max(self.index_to_label.keys()) + 1 - 4
        elif parameters['add_class'] and parameters['tagging_format'] == 'bio':
            print('here2')
            self.number_of_classes = max(self.index_to_label.keys()) + 1 - 2
        else:
            self.number_of_classes = max(
                self.index_to_label.keys()) + 1  # 1 is for O label
        print('max(self.index_to_label.keys()) : {:d}'.format(
            max(self.index_to_label.keys())))
        print(self.index_to_label.keys())
        print(self.number_of_classes)

        self.vocabulary_size = max(self.index_to_token.keys()) + 1
        self.alphabet_size = max(self.index_to_character.keys()) + 1
        if self.verbose:
            print("self.number_of_classes: {0}".format(self.number_of_classes))
        if self.verbose:
            print("self.alphabet_size: {0}".format(self.alphabet_size))
        if self.verbose:
            print("self.vocabulary_size: {0}".format(self.vocabulary_size))

        self.unique_labels_of_interest = list(self.unique_labels)
        self.unique_labels_of_interest.remove('O')

        self.unique_label_indices_of_interest = []
        for lab in self.unique_labels_of_interest:
            self.unique_label_indices_of_interest.append(label_to_index[lab])

        self.infrequent_token_indices = infrequent_token_indices

        if self.verbose:
            print('self.unique_labels_of_interest: {0}'.format(
                self.unique_labels_of_interest))
        if self.verbose:
            print('self.unique_label_indices_of_interest: {0}'.format(
                self.unique_label_indices_of_interest))

        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))
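utils.order_dictionary and utils.reverse_dictionary are called throughout these examples but are not defined in this file; a minimal sketch of the assumed behaviour (sort a dict by 'key', 'value', or value-then-key, and invert a one-to-one mapping) follows.

import collections

def order_dictionary(dictionary, mode, reverse=False):
    # Assumed behaviour: return an OrderedDict sorted by 'key', 'value',
    # or 'value_key' (value first, then key), optionally in descending order.
    if mode == 'key':
        key_function = lambda item: item[0]
    elif mode == 'value':
        key_function = lambda item: item[1]
    else:  # 'value_key'
        key_function = lambda item: (item[1], item[0])
    return collections.OrderedDict(
        sorted(dictionary.items(), key=key_function, reverse=reverse))

def reverse_dictionary(dictionary):
    # Assumed behaviour: invert a one-to-one mapping such as token_to_index.
    return {value: key for key, value in dictionary.items()}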
Exemplo n.º 10
0
    def load_dataset(self,
                     word2id=None,
                     tag2id=None,
                     prefix2id=None,
                     suffix2id=None,
                     fgen=True):
        '''
        word2id, tag2id, prefix2id, suffix2id : optional precomputed mappings, used when fgen is False.
        fgen : if True, build the mappings from the dataset and pickle them to self.data_output.
        '''
        start_time = time.time()
        if fgen:
            self.tokens_mapped_to_unk = []
            prefix_count, suffix_count, word_count, tag_count, self.sentence_list = self._parse_dataset(
                fgen)
            word_count = utils.order_dictionary(word_count,
                                                'value_key',
                                                reverse=True)
            tag_count = utils.order_dictionary(tag_count, 'key', reverse=False)
            prefix_count = utils.order_dictionary(prefix_count,
                                                  'value_key',
                                                  reverse=True)
            suffix_count = utils.order_dictionary(suffix_count,
                                                  'value_key',
                                                  reverse=True)

            pid = 0
            self.prefix2id["-padding-"] = pid
            pid += 1
            for pre, count in prefix_count.items():
                self.prefix2id[pre] = pid
                pid += 1
            self.prefix2id['-UNK-'] = pid
            pickle.dump(
                self.prefix2id,
                open(os.path.join(self.data_output, "ner_prefix2id"), 'wb'))

            sid = 0
            self.suffix2id["-padding-"] = sid
            sid += 1
            for suf, count in suffix_count.items():
                self.suffix2id[suf] = sid
                sid += 1
            self.suffix2id['-UNK-'] = sid
            pickle.dump(
                self.suffix2id,
                open(os.path.join(self.data_output, "ner_suffix2id"), 'wb'))

            wid = 0
            self.word2id["-padding-"] = wid
            wid += 1
            for word, count in word_count.items():
                self.word2id[word] = wid
                wid += 1
            self.word2id['-UNK-'] = wid
            pickle.dump(
                self.word2id,
                open(os.path.join(self.data_output, "ner_word2id"), 'wb'))

            # Ensure that both B- and I- versions exist for each label
            labels_without_bio = set()
            for label, count in tag_count.items():
                new_label = utils.remove_bio_from_label_name(label)
                labels_without_bio.add(new_label)

            prefixes = ['B-', 'I-']
            nid = 0
            self.tag2id['O'] = nid
            nid += 1
            for label in labels_without_bio:
                if label == 'O':
                    continue
                for prefix in prefixes:
                    l = prefix + label
                    self.tag2id[l] = nid
                    nid += 1
            pickle.dump(
                self.tag2id,
                open(os.path.join(self.data_output, "ner_tag2id"), 'wb'))

        else:
            self.word2id = word2id
            self.tag2id = tag2id
            self.prefix2id = prefix2id
            self.suffix2id = suffix2id
            _, _, _, _, self.sentence_list = self._parse_dataset(fgen)

        for name, sent_list in self.sentence_list.items():
            for sent in sent_list:
                sent.gen_id_list(self.word2id, self.tag2id)
                sent.gen_sent_features(self.word2id, self.prefix2id,
                                       self.suffix2id)

        self.number_of_classes = len(self.tag2id)
        self.vocabulary_size = len(self.word2id)
        self.prefix_size = len(self.prefix2id)
        self.suffix_size = len(self.suffix2id)

        self.id2word = utils.reverse_dictionary(self.word2id)
        self.id2tag = utils.reverse_dictionary(self.tag2id)

        elapsed_time = time.time() - start_time
        print('loading dataset done ({0:.2f} seconds)'.format(elapsed_time))
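sent.gen_id_list and sent.gen_sent_features are not shown in this snippet; purely as an illustration of how the word2id mapping built above could be applied, with '-UNK-' as the fallback index for unseen words (toy values, not the project's actual code):

word2id = {'-padding-': 0, 'eu': 1, 'rejects': 2, 'german': 3, '-UNK-': 4}
sentence_tokens = ['eu', 'rejects', 'the', 'call']
word_id_list = [word2id.get(token, word2id['-UNK-']) for token in sentence_tokens]
# word_id_list == [1, 2, 4, 4]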
Exemplo n.º 11
0
    def load_dataset(self,
                     dataset_filepaths,
                     parameters,
                     token_to_vector=None):
        '''
        dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'
        Load word vectors from the prepared embeddings file.
        '''
        start_time = time.time()
        print('Load dataset... ', end='', flush=True)
        if parameters['token_pretrained_embedding_filepath'] != '':
            if token_to_vector == None:
                token_to_vector = utils_nlp.load_pretrained_token_embeddings(
                    parameters)
        else:
            token_to_vector = {}
        if self.verbose:
            print("len(token_to_vector): {0}".format(len(token_to_vector)))

        # Load pretraining dataset to ensure that index to label is compatible with the pretrained model,
        #   and that token embeddings that are learned in the pretrained model are loaded properly.
        all_tokens_in_pretraining_dataset = []
        all_characters_in_pretraining_dataset = []
        if parameters['use_pretrained_model']:
            pretraining_dataset = pickle.load(
                open(
                    os.path.join(parameters['pretrained_model_folder'],
                                 'dataset.pickle'), 'rb'))
            all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values(
            )  # Tokens stored from the previous training run
            all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values(
            )  # Characters stored from the previous training run

        remap_to_unk_count_threshold = 1
        self.UNK_TOKEN_INDEX = 0  # Index reserved for unknown tokens
        self.PADDING_CHARACTER_INDEX = 0
        self.tokens_mapped_to_unk = []  # Tokens that were remapped to UNK
        self.UNK = 'UNK'
        self.unique_labels = []  # Labels present in the dataset
        labels = {}  # Labels: {all: ..., train: ..., test: ...}
        tokens = {}  # Tokens: {all: ..., train: ..., test: ...}
        label_count = {}  # Label counts: {all: ..., train: ..., test: ...}
        token_count = {}  # Token counts: {all: ..., train: ..., test: ...}
        character_count = {}  # Character counts: {all: ..., train: ..., test: ...}
        for dataset_type in ['train', 'valid', 'test', 'deploy']:
            labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \
                = self._parse_dataset(dataset_filepaths.get(dataset_type, None))

            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        # Aggregate the token counts over all dataset splits
        token_count['all'] = {}
        for token in list(token_count['train'].keys()) + list(
                token_count['valid'].keys()) + list(
                    token_count['test'].keys()) + list(
                        token_count['deploy'].keys()):
            token_count['all'][token] = token_count['train'][
                token] + token_count['valid'][token] + token_count['test'][
                    token] + token_count['deploy'][token]

        # Add tokens that only appear in the pretrained embeddings, with a count of -1
        if parameters['load_all_pretrained_token_embeddings']:
            for token in token_to_vector:
                if token not in token_count['all']:
                    token_count['all'][token] = -1
                    token_count['train'][token] = -1
            for token in all_tokens_in_pretraining_dataset:
                if token not in token_count['all']:
                    token_count['all'][token] = -1
                    token_count['train'][token] = -1

        # Aggregate the character counts over all dataset splits
        character_count['all'] = {}
        for character in list(character_count['train'].keys()) + list(
                character_count['valid'].keys()) + list(
                    character_count['test'].keys()) + list(
                        character_count['deploy'].keys()):
            character_count['all'][character] = character_count['train'][
                character] + character_count['valid'][
                    character] + character_count['test'][
                        character] + character_count['deploy'][character]

        # Add characters that only appear in the pretraining dataset, with a count of -1
        for character in all_characters_in_pretraining_dataset:
            if character not in character_count['all']:
                character_count['all'][character] = -1
                character_count['train'][character] = -1

        for dataset_type in dataset_filepaths.keys():
            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        # Aggregate the label counts over all dataset splits
        label_count['all'] = {}
        for character in list(label_count['train'].keys()) + list(
                label_count['valid'].keys()) + list(
                    label_count['test'].keys()) + list(
                        label_count['deploy'].keys()):
            label_count['all'][character] = label_count['train'][
                character] + label_count['valid'][character] + label_count[
                    'test'][character] + label_count['deploy'][character]

        token_count['all'] = utils.order_dictionary(
            token_count['all'], 'value_key', reverse=True
        )  # Sort token counts from highest to lowest frequency, then by token
        label_count['all'] = utils.order_dictionary(
            label_count['all'], 'key',
            reverse=False)  # Sort label counts by label in ascending order
        character_count['all'] = utils.order_dictionary(
            character_count['all'], 'value', reverse=True
        )  # Sort character counts from highest to lowest frequency
        if self.verbose:
            print('character_count[\'all\']: {0}'.format(
                character_count['all']))

        token_to_index = {}
        token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
        iteration_number = 0
        number_of_unknown_tokens = 0
        if self.verbose:
            print("parameters['remap_unknown_tokens_to_unk']: {0}".format(
                parameters['remap_unknown_tokens_to_unk']))
        if self.verbose:
            print("len(token_count['train'].keys()): {0}".format(
                len(token_count['train'].keys())))
        for token, count in token_count['all'].items():
            if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1
            '''
            UNK_TOKEN: a token that appears neither in the pretraining dataset nor in the word vectors
            '''
            if parameters['remap_unknown_tokens_to_unk'] == 1 and \
                (token_count['train'][token] == 0 or \
                parameters['load_only_pretrained_token_embeddings']) and \
                not utils_nlp.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \
                token not in all_tokens_in_pretraining_dataset:
                if self.verbose: print("token: {0}".format(token))
                if self.verbose:
                    print("token.lower(): {0}".format(token.lower()))
                if self.verbose:
                    print(r"re.sub('\d', '0', token.lower()): {0}".format(
                        re.sub(r'\d', '0', token.lower())))
                token_to_index[token] = self.UNK_TOKEN_INDEX
                number_of_unknown_tokens += 1
                self.tokens_mapped_to_unk.append(token)
            else:
                token_to_index[token] = iteration_number
                iteration_number += 1
        if self.verbose:
            print("number_of_unknown_tokens: {0}".format(
                number_of_unknown_tokens))

        infrequent_token_indices = []  # Tokens with low frequency in the training dataset
        for token, count in token_count['train'].items():
            if 0 < count <= remap_to_unk_count_threshold:
                infrequent_token_indices.append(token_to_index[token])
        if self.verbose:
            print("len(token_count['train']): {0}".format(
                len(token_count['train'])))
        if self.verbose:
            print("len(infrequent_token_indices): {0}".format(
                len(infrequent_token_indices)))

        # Ensure that both B- and I- versions exist for each label
        # Strip the B-, I-, ... prefixes to get the bare label names
        labels_without_bio = set()
        for label in label_count['all'].keys():
            new_label = utils_nlp.remove_bio_from_label_name(label)
            labels_without_bio.add(new_label)

        # Combine each bare label with the B-, I-, ... prefixes and add any missing variants to the label count
        for label in labels_without_bio:
            if label == 'O':
                continue
            if parameters['tagging_format'] == 'bioes':
                prefixes = ['B-', 'I-', 'E-', 'S-']
            else:
                prefixes = ['B-', 'I-']
            for prefix in prefixes:
                l = prefix + label
                if l not in label_count['all']:
                    label_count['all'][l] = 0
        # Sort label_count by label in ascending order
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)

        if parameters['use_pretrained_model']:
            self.unique_labels = sorted(
                list(pretraining_dataset.label_to_index.keys()))
            # Make sure labels are compatible with the pretraining dataset.
            for label in label_count['all']:
                if label not in pretraining_dataset.label_to_index:
                    raise AssertionError(
                        "The label {0} does not exist in the pretraining dataset. "
                        .format(label) +
                        "Please ensure that only the following labels exist in the dataset: {0}"
                        .format(', '.join(self.unique_labels)))
            label_to_index = pretraining_dataset.label_to_index.copy()
        else:
            label_to_index = {}
            iteration_number = 0
            for label, count in label_count['all'].items():
                label_to_index[label] = iteration_number
                iteration_number += 1
                self.unique_labels.append(label)

        if self.verbose:
            print('self.unique_labels: {0}'.format(self.unique_labels))

        character_to_index = {}
        iteration_number = 0
        for character, count in character_count['all'].items():
            if iteration_number == self.PADDING_CHARACTER_INDEX:
                iteration_number += 1
            character_to_index[character] = iteration_number
            iteration_number += 1

        if self.verbose:
            print('token_count[\'train\'][0:10]: {0}'.format(
                list(token_count['train'].items())[0:10]))
        token_to_index = utils.order_dictionary(token_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose: print('token_to_index: {0}'.format(token_to_index))
        index_to_token = utils.reverse_dictionary(token_to_index)
        if parameters['remap_unknown_tokens_to_unk'] == 1:
            index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
        if self.verbose: print('index_to_token: {0}'.format(index_to_token))

        if self.verbose:
            print('label_count[\'train\']: {0}'.format(label_count['train']))
        label_to_index = utils.order_dictionary(label_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose: print('label_to_index: {0}'.format(label_to_index))
        index_to_label = utils.reverse_dictionary(label_to_index)
        if self.verbose: print('index_to_label: {0}'.format(index_to_label))

        character_to_index = utils.order_dictionary(character_to_index,
                                                    'value',
                                                    reverse=False)
        index_to_character = utils.reverse_dictionary(character_to_index)
        if self.verbose:
            print('character_to_index: {0}'.format(character_to_index))
        if self.verbose:
            print('index_to_character: {0}'.format(index_to_character))

        if self.verbose:
            print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
        if self.verbose:
            print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))

        if self.verbose:
            # Print sequences of length 1 in train set
            for token_sequence, label_sequence in zip(tokens['train'],
                                                      labels['train']):
                if len(label_sequence) == 1 and label_sequence[0] != 'O':
                    print("{0}\t{1}".format(token_sequence[0],
                                            label_sequence[0]))

        self.token_to_index = token_to_index  # {token: index}, ordered by frequency from high to low; unknown tokens map to index 0
        self.index_to_token = index_to_token  # Inverse of token_to_index

        self.index_to_character = index_to_character  # Inverse of character_to_index
        self.character_to_index = character_to_index  # {character: index}, ordered by frequency from high to low

        self.index_to_label = index_to_label  # Inverse of label_to_index
        self.label_to_index = label_to_index  # {label: index}, in ascending label order

        if self.verbose:
            print("len(self.token_to_index): {0}".format(
                len(self.token_to_index)))
        if self.verbose:
            print("len(self.index_to_token): {0}".format(
                len(self.index_to_token)))
        self.tokens = tokens
        self.labels = labels

        token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(
            dataset_filepaths.keys())

        self.token_indices = token_indices
        self.label_indices = label_indices
        self.character_indices_padded = character_indices_padded
        self.character_indices = character_indices
        self.token_lengths = token_lengths
        self.characters = characters
        self.label_vector_indices = label_vector_indices

        self.number_of_classes = max(self.index_to_label.keys()) + 1
        self.vocabulary_size = max(self.index_to_token.keys()) + 1
        self.alphabet_size = max(self.index_to_character.keys()) + 1
        if self.verbose:
            print("self.number_of_classes: {0}".format(self.number_of_classes))
        if self.verbose:
            print("self.alphabet_size: {0}".format(self.alphabet_size))
        if self.verbose:
            print("self.vocabulary_size: {0}".format(self.vocabulary_size))

        # unique_labels_of_interest is used to compute F1-scores.
        self.unique_labels_of_interest = list(self.unique_labels)
        self.unique_labels_of_interest.remove('O')

        self.unique_label_indices_of_interest = []
        for lab in self.unique_labels_of_interest:
            self.unique_label_indices_of_interest.append(label_to_index[lab])

        self.infrequent_token_indices = infrequent_token_indices

        if self.verbose:
            print('self.unique_labels_of_interest: {0}'.format(
                self.unique_labels_of_interest))
        if self.verbose:
            print('self.unique_label_indices_of_interest: {0}'.format(
                self.unique_label_indices_of_interest))

        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))

        return token_to_vector
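_convert_to_indices is not included in this example; the label one-hot encoding it is expected to produce matches what the earlier load_dataset example does explicitly with sklearn's LabelBinarizer. A minimal, self-contained illustration:

import sklearn.preprocessing

label_binarizer = sklearn.preprocessing.LabelBinarizer()
label_binarizer.fit(range(5))                    # assume 5 label classes, indices 0..4
one_hot_rows = label_binarizer.transform([0, 3, 4])
# one_hot_rows is a 3x5 numpy array with exactly one 1 per row.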
Exemplo n.º 12
0
    def load_dataset(self, dataset_filepaths, parameters, annotator):
        '''
            dataset_filepaths : dictionary with keys 'train', 'valid', 'test'
        '''
        start_time = time.time()
        print('Load dataset... ', end='', flush=True)

        if parameters['do_split']:
            dataset_filepaths = self._do_split(parameters)

        all_pretrained_tokens = []
        if parameters['token_pretrained_embedding_filepath'] != '':
            all_pretrained_tokens = utils_nlp.load_tokens_from_pretrained_token_embeddings(
                parameters)
        if self.verbose:
            print("len(all_pretrained_tokens): {0}".format(
                len(all_pretrained_tokens)))

        # Load pretraining dataset to ensure that index to label is compatible with the pretrained model,
        #   and that token embeddings that are learned in the pretrained model are loaded properly.
        all_tokens_in_pretraining_dataset = []

        self.UNK_TOKEN_INDEX = 0
        self.PADDING_TOKEN_INDEX = 1
        self.tokens_mapped_to_unk = []
        self.UNK = '_UNK_'
        self.PAD = '_PAD_'
        self.unique_labels = []
        labels = {}
        tokens = {}
        token_count = {}
        label_count = {}

        self.max_tokens = -1
        # Look for max length
        for dataset_type in ['train', 'valid', 'test']:
            max_tokens = self._find_max_length(
                dataset_filepaths.get(dataset_type, None),
                annotator,
                force_preprocessing=parameters['do_split'])
            if parameters['max_length_sentence'] == -1:
                self.max_tokens = max(self.max_tokens, max_tokens)
            else:
                if self.max_tokens == -1:
                    self.max_tokens = max_tokens
                self.max_tokens = min(parameters['max_length_sentence'],
                                      self.max_tokens)

        for dataset_type in ['train', 'valid', 'test']:
            labels[dataset_type], tokens[dataset_type], token_count[
                dataset_type], label_count[dataset_type] = self._parse_dataset(
                    dataset_filepaths.get(dataset_type, None),
                    annotator,
                    force_preprocessing=parameters['do_split'],
                    limit=self.max_tokens)

            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        token_count['all'] = {}
        for token in list(token_count['train'].keys()) + list(
                token_count['valid'].keys()) + list(
                    token_count['test'].keys()):
            token_count['all'][token] = token_count['train'].get(
                token, 0) + token_count['valid'].get(
                    token, 0) + token_count['test'].get(token, 0)

        for dataset_type in dataset_filepaths.keys():
            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        label_count['all'] = {}
        for character in list(label_count['train'].keys()) + list(
                label_count['valid'].keys()) + list(
                    label_count['test'].keys()):
            label_count['all'][character] = label_count['train'].get(
                character, 0) + label_count['valid'].get(
                    character, 0) + label_count['test'].get(character, 0)

        token_count['all'] = utils.order_dictionary(token_count['all'],
                                                    'value_key',
                                                    reverse=True)
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)

        token_to_index = {}
        token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
        token_to_index[self.PAD] = self.PADDING_TOKEN_INDEX
        iteration_number = 0
        number_of_unknown_tokens = 0

        if self.verbose:
            print("parameters['remap_unknown_tokens_to_unk']: {0}".format(
                parameters['remap_unknown_tokens_to_unk']))
        if self.verbose:
            print("len(token_count['train'].keys()): {0}".format(
                len(token_count['train'].keys())))

        for token, count in token_count['all'].items():
            if iteration_number == self.UNK_TOKEN_INDEX:
                iteration_number += 1
            if iteration_number == self.PADDING_TOKEN_INDEX:
                iteration_number += 1

            if parameters['remap_unknown_tokens_to_unk'] and (
                    token_count['train'].get(token, 0) == 0
                    or parameters['load_only_pretrained_token_embeddings']
            ) and not utils_nlp.is_token_in_pretrained_embeddings(
                    token, all_pretrained_tokens, parameters
            ) and token not in all_tokens_in_pretraining_dataset:
                if self.verbose: print("token: {0}".format(token))
                if self.verbose:
                    print("token.lower(): {0}".format(token.lower()))
                if self.verbose:
                    print(r"re.sub('\d', '0', token.lower()): {0}".format(
                        re.sub(r'\d', '0', token.lower())))
                token_to_index[token] = self.UNK_TOKEN_INDEX
                number_of_unknown_tokens += 1
                self.tokens_mapped_to_unk.append(token)
            else:
                token_to_index[token] = iteration_number
                iteration_number += 1

        if self.verbose:
            print("number_of_unknown_tokens: {0}".format(
                number_of_unknown_tokens))

        infrequent_token_indices = []
        for token, count in token_count['train'].items():
            if 0 < count <= parameters['remap_to_unk_count_threshold']:
                infrequent_token_indices.append(token_to_index[token])

        if self.verbose:
            print("len(token_count['train']): {0}".format(
                len(token_count['train'])))
        if self.verbose:
            print("len(infrequent_token_indices): {0}".format(
                len(infrequent_token_indices)))

        label_to_index = {}
        iteration_number = 0
        for label, count in label_count['all'].items():
            label_to_index[label] = iteration_number
            iteration_number += 1
            self.unique_labels.append(label)

        if self.verbose:
            print('self.unique_labels: {0}'.format(self.unique_labels))
        if self.verbose:
            print('token_count[\'train\'][0:10]: {0}'.format(
                list(token_count['train'].items())[0:10]))

        token_to_index = utils.order_dictionary(token_to_index,
                                                'value',
                                                reverse=False)

        if self.verbose: print('token_to_index: {0}'.format(token_to_index))

        index_to_token = utils.reverse_dictionary(token_to_index)

        if parameters['remap_unknown_tokens_to_unk'] == 1:
            index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
        index_to_token[self.PADDING_TOKEN_INDEX] = self.PAD

        if self.verbose: print('index_to_token: {0}'.format(index_to_token))
        if self.verbose:
            print('label_count[\'train\']: {0}'.format(label_count['train']))

        label_to_index = utils.order_dictionary(label_to_index,
                                                'value',
                                                reverse=False)

        if self.verbose: print('label_to_index: {0}'.format(label_to_index))

        index_to_label = utils.reverse_dictionary(label_to_index)

        if self.verbose: print('index_to_label: {0}'.format(index_to_label))
        if self.verbose:
            print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
        if self.verbose:
            print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))

        # Map tokens and labels to their indices
        token_indices = {}
        label_indices = {}
        token_lengths = {}
        token_indices_padded = {}
        for dataset_type in dataset_filepaths.keys():
            token_indices[dataset_type] = []
            token_lengths[dataset_type] = []
            token_indices_padded[dataset_type] = []

            # Tokens
            for token_sequence in tokens[dataset_type]:
                token_indices[dataset_type].append(
                    [token_to_index[token] for token in token_sequence])
                token_lengths[dataset_type].append(len(token_sequence))

            # Labels
            label_indices[dataset_type] = []
            for label in labels[dataset_type]:
                label_indices[dataset_type].append(label_to_index[label])

        # Pad tokens
        for dataset_type in dataset_filepaths.keys():
            token_indices_padded[dataset_type] = []
            token_indices_padded[dataset_type] = [
                utils.pad_list(temp_token_indices, self.max_tokens,
                               self.PADDING_TOKEN_INDEX)
                for temp_token_indices in token_indices[dataset_type]
            ]

        if self.verbose:
            print('token_lengths[\'train\'][0:10]: {0}'.format(
                token_lengths['train'][0:10]))
        if self.verbose:
            print('token_indices[\'train\'][0][0:10]: {0}'.format(
                token_indices['train'][0][0:10]))
        if self.verbose:
            print('token_indices_padded[\'train\'][0][0:10]: {0}'.format(
                token_indices_padded['train'][0][0:10]))
        if self.verbose:
            print('label_indices[\'train\'][0:10]: {0}'.format(
                label_indices['train'][0:10]))

        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.token_indices = token_indices
        self.label_indices = label_indices
        self.token_indices_padded = token_indices_padded
        self.token_lengths = token_lengths
        self.tokens = tokens
        self.labels = labels
        self.index_to_label = index_to_label
        self.label_to_index = label_to_index

        if self.verbose:
            print("len(self.token_to_index): {0}".format(
                len(self.token_to_index)))
        if self.verbose:
            print("len(self.index_to_token): {0}".format(
                len(self.index_to_token)))

        self.number_of_classes = max(self.index_to_label.keys()) + 1
        self.vocabulary_size = max(self.index_to_token.keys()) + 1

        if self.verbose:
            print("self.number_of_classes: {0}".format(self.number_of_classes))
        if self.verbose:
            print("self.vocabulary_size: {0}".format(self.vocabulary_size))

        self.infrequent_token_indices = infrequent_token_indices

        # Binarize label
        label_vector_indices = {}
        for dataset_type, labels in label_indices.items():
            label_vector_indices[dataset_type] = []
            for label in labels:
                label_vector_indices[dataset_type].append(
                    utils.convert_one_hot(label, self.number_of_classes))
        self.label_vector_indices = label_vector_indices

        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))
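utils.pad_list and utils.convert_one_hot are used above but not defined in this file; a plausible minimal sketch (an assumption, not the project's actual implementation):

def pad_list(old_list, padded_list_length, padding_value):
    # Right-pad to the requested length; lists already long enough are returned unchanged.
    return old_list + [padding_value] * max(0, padded_list_length - len(old_list))

def convert_one_hot(label_index, number_of_classes):
    # Plain-Python one-hot vector for a single label index.
    vector = [0] * number_of_classes
    vector[label_index] = 1
    return vector

# Example: pad a token-index sequence to max_tokens == 6 with PADDING_TOKEN_INDEX == 1,
# and one-hot a label index for a 4-class problem.
padded = pad_list([12, 7, 3], 6, 1)   # [12, 7, 3, 1, 1, 1]
one_hot = convert_one_hot(2, 4)       # [0, 0, 1, 0]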