# Example #1
# 0
def load_data():
    """Load the CoNLL 2003 corpus and dictorize each split.

    :return: (train_dict, test_dict, dev_dict), each a list of
        sentence dictionaries produced by CoNLLDictorizer.
    """
    raw_train, raw_dev, raw_test, cols = load_conll2003_en()

    # A single dictorizer is reused for all three splits.
    dictorizer = CoNLLDictorizer(cols, col_sep=' +')
    return (dictorizer.transform(raw_train),
            dictorizer.transform(raw_test),
            dictorizer.transform(raw_dev))
# Example #2
# 0
    test_sentences = open(test_file).read().strip()
    test2_sentences = open(test2_file).read().strip()
    return train_sentences, dev_sentences, test_sentences, column_names


# Toggle between the server ('vilde') corpus layout and the local one.
vilde = False

# Corpus root directory, chosen by the vilde flag above.
BASE_DIR = (
    '/home/pierre/Cours/EDAN20/corpus/CoNLL2003/'
    if vilde
    else '/Users/pierre/Projets/Corpora/CoNLL2003/'
)


def load_conll2003_en():
    """Load the raw CoNLL 2003 English NER corpus.

    Reads the train/dev/test splits from BASE_DIR and returns each split
    as one raw string, plus the column names of the annotation format.

    :return: (train_sentences, dev_sentences, test_sentences, column_names)
    """
    train_file = BASE_DIR + 'NER-data/eng.train'
    dev_file = BASE_DIR + 'NER-data/eng.valid'
    test_file = BASE_DIR + 'NER-data/eng.test'
    column_names = ['form', 'ppos', 'pchunk', 'ner']
    # Use context managers so the file handles are closed deterministically;
    # the original relied on the garbage collector to close them.
    with open(train_file, encoding='utf8') as f:
        train_sentences = f.read().strip()
    with open(dev_file, encoding='utf8') as f:
        dev_sentences = f.read().strip()
    with open(test_file, encoding='utf8') as f:
        test_sentences = f.read().strip()
    return train_sentences, dev_sentences, test_sentences, column_names


# Demo entry point: load the corpus and print the first two train sentences.
if __name__ == '__main__':
    train_sentences, dev_sentences, test_sentences, column_names = load_conll2003_en()

    conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
    train_dict = conll_dict.transform(train_sentences)
    print(train_dict[0])
    print(train_dict[1])
    # NOTE(review): the original pasted a fragment of a loader function here
    # (file-path assignments, open() calls and a bare `return`), which is a
    # SyntaxError at module level (`return` outside a function) and merely
    # duplicated load_conll2003_en(); the fragment has been removed.


# Converting the corpus into a dictionary
if __name__ == '__main__':
    (train_sentences, dev_sentences,
     test_sentences, column_names) = load_conll2003_en()

    # One dictorizer instance handles all three splits.
    conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
    train_dict = conll_dict.transform(train_sentences)
    # added for dev and test as well
    dev_dict = conll_dict.transform(dev_sentences)
    test_dict = conll_dict.transform(test_sentences)
    print('First sentence, train:', train_dict[0])


# Function to build the two-way sequences
# Two vectors: x and Y
# Instead of extracting_dictorizer
def build_sequences(corpus_dict, key_x='form', key_y='ner', tolower=True):
    """
    Creates sequences from a list of dictionaries
    :param corpus_dict:
    :param key_x:
    column_names = ['form', 'ppos', 'pchunk', 'ner']

    train_sentences = open(train_file).read().strip()
    valid_sentences = open(valid_file).read().strip()
    test_sentences = open(test_file).read().strip()

    return train_sentences, valid_sentences, test_sentences, column_names


# Converting the corpus into a dictionary
if __name__ == '__main__':
    (train_sentences, valid_sentences,
     test_sentences, column_names) = load_conll2003_en()

    # Why use '+' as sep?  (the default separator is used here)
    conll_dict = CoNLLDictorizer(column_names)
    train_dict, valid_dict, test_dict = (
        conll_dict.transform(split)
        for split in (train_sentences, valid_sentences, test_sentences)
    )
    print('First sentence, train:', train_dict[0])


# Function to build the two-way sequence
# Vectors: x and Y
def build_sequences(corpus_dict, key_x='form', key_y='ner', tolower=True):
    """
    Creates sequences from a list of dictionaries
    :param corpus_dict:
    :param key_x:
    :param key_y:
    :return: