def load_data():
    train_sentences, dev_sentences, test_sentences, column_names = load_conll2003_en()
    conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
    train_dict = conll_dict.transform(train_sentences)
    test_dict = conll_dict.transform(test_sentences)
    dev_dict = conll_dict.transform(dev_sentences)
    return train_dict, test_dict, dev_dict
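# load_data() relies on a CoNLLDictorizer class that is not defined in this
# file. The sketch below is a minimal version consistent with how the class
# is called above; the sent_sep parameter and the method internals are
# assumptions, not the original implementation.
import re


class CoNLLDictorizer:
    """Converts a CoNLL corpus given as one string into a list of
    sentences, each sentence being a list of {column: value} dicts."""

    def __init__(self, column_names, sent_sep='\n\n', col_sep=' +'):
        self.column_names = column_names
        self.sent_sep = sent_sep  # sentences are separated by blank lines
        self.col_sep = col_sep    # columns are separated by runs of spaces

    def transform(self, corpus):
        # Split the raw text into sentence blocks...
        sentences = re.split(self.sent_sep, corpus.strip())
        return list(map(self._split_in_words, sentences))

    def _split_in_words(self, sentence):
        # ...then each line into columns, zipped with the column names.
        return [dict(zip(self.column_names, re.split(self.col_sep, row)))
                for row in sentence.splitlines()]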
# Corpus location: the vilde flag selects between two machines.
vilde = False
if vilde:
    BASE_DIR = '/home/pierre/Cours/EDAN20/corpus/CoNLL2003/'
else:
    BASE_DIR = '/Users/pierre/Projets/Corpora/CoNLL2003/'


def load_conll2003_en():
    train_file = BASE_DIR + 'NER-data/eng.train'
    dev_file = BASE_DIR + 'NER-data/eng.valid'
    test_file = BASE_DIR + 'NER-data/eng.test'
    column_names = ['form', 'ppos', 'pchunk', 'ner']
    train_sentences = open(train_file, encoding='utf8').read().strip()
    dev_sentences = open(dev_file, encoding='utf8').read().strip()
    test_sentences = open(test_file, encoding='utf8').read().strip()
    return train_sentences, dev_sentences, test_sentences, column_names


if __name__ == '__main__':
    train_sentences, dev_sentences, test_sentences, column_names = load_conll2003_en()
    conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
    train_dict = conll_dict.transform(train_sentences)
    print(train_dict[0])
    print(train_dict[1])
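# As a quick sanity check, the dictorizer can be applied to a literal
# two-token "sentence" written in the CoNLL 2003 column layout
# (form, POS tag, chunk tag, NER tag). The sample below is an
# illustration, not read from the corpus files:
if __name__ == '__main__':
    sample = 'EU NNP B-NP B-ORG\nrejects VBZ B-VP O'
    print(CoNLLDictorizer(column_names, col_sep=' +').transform(sample))
    # expected: [[{'form': 'EU', 'ppos': 'NNP', 'pchunk': 'B-NP',
    #              'ner': 'B-ORG'},
    #             {'form': 'rejects', 'ppos': 'VBZ', 'pchunk': 'B-VP',
    #              'ner': 'O'}]]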
# Placeholder: set BASE_DIR to the local copy of the CoNLL 2003 corpus.
BASE_DIR = '/path/to/CoNLL2003'


def load_conll2003_en():
    train_file = BASE_DIR + '/eng.train'
    dev_file = BASE_DIR + '/eng.valid'
    test_file = BASE_DIR + '/eng.test'
    column_names = ['form', 'ppos', 'pchunk', 'ner']
    train_sentences = open(train_file, encoding='utf8').read().strip()
    dev_sentences = open(dev_file, encoding='utf8').read().strip()
    test_sentences = open(test_file, encoding='utf8').read().strip()
    return train_sentences, dev_sentences, test_sentences, column_names


# Converting the corpus into a dictionary
if __name__ == '__main__':
    train_sentences, dev_sentences, test_sentences, column_names = load_conll2003_en()
    conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
    train_dict = conll_dict.transform(train_sentences)
    dev_dict = conll_dict.transform(dev_sentences)  # added for dev and test as well
    test_dict = conll_dict.transform(test_sentences)
    print('First sentence, train:', train_dict[0])


# Function to build the two-way sequences
# Two vectors: x and Y
# Instead of extracting_dictorizer
def build_sequences(corpus_dict, key_x='form', key_y='ner', tolower=True):
    """
    Creates sequences from a list of dictionaries
    :param corpus_dict: corpus as a list of sentences, each a list of
        {column: value} dictionaries
    :param key_x: column used as the input sequence, e.g. 'form'
    :param key_y: column used as the output sequence, e.g. 'ner'
    :return: the pair (X, Y) of input and output sequences
    """
    # The body below is an assumed implementation matching the docstring:
    # one x sequence and one y sequence per sentence.
    X, Y = [], []
    for sentence in corpus_dict:
        x = [word[key_x] for word in sentence]
        y = [word[key_y] for word in sentence]
        if tolower:
            x = list(map(str.lower, x))
        X += [x]
        Y += [y]
    return X, Y
# Placeholder: set BASE_DIR to the local copy of the CoNLL 2003 corpus.
BASE_DIR = '/path/to/CoNLL2003/'


def load_conll2003_en():
    train_file = BASE_DIR + 'eng.train'
    valid_file = BASE_DIR + 'eng.valid'
    test_file = BASE_DIR + 'eng.test'
    column_names = ['form', 'ppos', 'pchunk', 'ner']
    train_sentences = open(train_file, encoding='utf8').read().strip()
    valid_sentences = open(valid_file, encoding='utf8').read().strip()
    test_sentences = open(test_file, encoding='utf8').read().strip()
    return train_sentences, valid_sentences, test_sentences, column_names


# Converting the corpus into a dictionary
if __name__ == '__main__':
    train_sentences, valid_sentences, test_sentences, column_names = load_conll2003_en()
    # The columns of the CoNLL 2003 files are separated by single spaces;
    # the ' +' regex passed as col_sep elsewhere is simply a more tolerant
    # separator that also accepts runs of several spaces. The default
    # separator is assumed to be sufficient here.
    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    valid_dict = conll_dict.transform(valid_sentences)
    test_dict = conll_dict.transform(test_sentences)
    print('First sentence, train:', train_dict[0])


# Function to build the two-way sequence
# Vectors: x and Y
def build_sequences(corpus_dict, key_x='form', key_y='ner', tolower=True):
    """
    Creates sequences from a list of dictionaries
    :param corpus_dict: corpus as a list of sentences, each a list of
        {column: value} dictionaries
    :param key_x: column used as the input sequence, e.g. 'form'
    :param key_y: column used as the output sequence, e.g. 'ner'
    :return: the pair (X, Y) of input and output sequences
    """
    # The body below is an assumed implementation matching the docstring:
    # one x sequence and one y sequence per sentence.
    X, Y = [], []
    for sentence in corpus_dict:
        x = [word[key_x] for word in sentence]
        y = [word[key_y] for word in sentence]
        if tolower:
            x = list(map(str.lower, x))
        X += [x]
        Y += [y]
    return X, Y
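# A possible usage of build_sequences: extract the lowercased word
# sequences and their NER tags from the training set. The names X_train
# and Y_train are illustrative, not from the original file.
if __name__ == '__main__':
    X_train, Y_train = build_sequences(train_dict)
    print('First input sequence: ', X_train[0])
    print('First output sequence:', Y_train[0])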