def read_pos(pos_tag_file):
    # Group POS tags into sentences, closing a sentence on final punctuation.
    sentences = []
    sentence = [None]
    previous_line = ''
    for line in read_lines(pos_tag_file)[:-1]:
        if not ((line == '!\tPUNCT') or (line == '.\tPUNCT') or
                (line == '?\tPUNCT')):
            sentence.append(line.split('\t')[1])
        else:
            if (len(sentence) != 1):
                sentence.append(line.split('\t')[1])
                sentences.append(sentence)
                sentence = [None]
        previous_line = line
    return sentences
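# Illustration only, not part of the original files: read_pos expects a
# tab-separated "token\tTAG" file, one token per line, e.g.
#
#   Это\tPRON
#   пример\tNOUN
#   .\tPUNCT
#
# Each sentence list starts with a None placeholder and is closed when a
# sentence-final punctuation line ('!', '.', '?') is reached; the [:-1]
# slice assumes the last line of the file should be skipped.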
def write_sentences(sentences, filename, preamble):
    write_lines(filename, preamble + flatten_sentences(sentences))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', type=float, default=0.7)
    parser.add_argument('--test', type=float, default=0.2)
    parser.add_argument("--cv", action="store_true")
    args = parser.parse_args()

    # Source file
    tokens_with_entities = add_bio_prefixes(
        [line.split(' ') for line in read_lines(STANFORD_FILE)])

    # Stanford output
    tokens_with_pos_and_phrase_types = [
        line.split(' ') for line in read_lines(POS_FILE)[:]
    ]

    # Merge two files
    preamble = list(
        map(lambda item: ' '.join(item), tokens_with_pos_and_phrase_types[:2]))
    sentences = []
    sentence = []
    i = 2
    mismatching_tokens_counter = 0
    max_number_of_mismatching_tokens = 100
    skip_iterations = 0
    for j in range(len(tokens_with_entities)):
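# The merge loop above is truncated here. A hedged sketch of how the --train
# and --test ratios could be applied afterwards; split_sentences and the
# output file names below are assumptions, not the original implementation.
def split_sentences(sentences, train_ratio, test_ratio):
    # Partition sentences into train/test/validation slices by ratio.
    train_end = int(len(sentences) * train_ratio)
    test_end = train_end + int(len(sentences) * test_ratio)
    return (sentences[:train_end], sentences[train_end:test_end],
            sentences[test_end:])


# Example use (hypothetical):
#   train, test, valid = split_sentences(sentences, args.train, args.test)
#   write_sentences(train, 'train.txt', preamble)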
from convert import read_lines, write_lines

STANFORD_FILE = 'stanford-ner-corpus.txt'

tokens = [line.split(' ')[0] for line in read_lines(STANFORD_FILE)]
write_lines('text.txt', [
    ' '.join(tokens),
])
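# The convert module itself is not shown in this section. A minimal sketch of
# the read_lines/write_lines helpers it presumably provides, inferred from how
# they are called above (an assumption, not the original code):
def read_lines(filename):
    # Return the file's contents as a list of lines without trailing newlines.
    with open(filename, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]


def write_lines(filename, lines):
    # Write each element of `lines` on its own line.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')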
STANFORD_DEPS_FILE = 'text.txt.out'
POS_FILE = 'pos.tag'
PHRASE_TYPES_FILE = 'phrase_types.txt'

if __name__ == '__main__':
    sentences = []
    sentence = []
    last_line = ''
    for line in read_lines(STANFORD_DEPS_FILE):
        last_line = line
        if not line.startswith('Sentence'):
            sentence.append(line)
            #sentence.append(decode_dependency(line))
        else:
            sentence = sentence[3:-1]
            #print(sentence)
            #print(len(sentence))
            if len(sentence) <= 4:
                sentence = []
                continue
            #print(sentence[sentence.index('') + 2:])
            #print(sentence[3:sentence.index('')])
            #print(sentence[:sentence.index('')])
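# decode_dependency is referenced only in the commented-out line above and is
# not defined in this section. A hypothetical sketch, assuming Stanford-style
# dependency lines of the form "nsubj(said-2, John-1)":
import re


def decode_dependency(line):
    # Parse "relation(head-idx, dependent-idx)" into
    # (relation, (head_word, head_index), (dependent_word, dependent_index)).
    match = re.match(r'([\w:]+)\((.+)-(\d+), (.+)-(\d+)\)$', line)
    if match is None:
        return None
    relation, head, head_idx, dep, dep_idx = match.groups()
    return relation, (head, int(head_idx)), (dep, int(dep_idx))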
from convert import read_lines, write_lines

structure_stanford_deps = __import__('structure-stanford-deps')

STANFORD_POS_FILE = 'ru-conll2003.txt'
STANFORD_FILE = 'stanford-ner-corpus.txt'
POS_FILE = 'phrase_types.txt'

tokens_with_entities = [line.split(' ') for line in read_lines(STANFORD_FILE)]
tokens_with_pos_and_phrase_types = [
    line.split(' ') for line in read_lines(POS_FILE)[:]
]

# Add BIO prefixes to entity labels
no_entity_mark = 'O'
previous_entity = ''
new_tokens_with_entities = []
for token, entity in tokens_with_entities:
    new_tokens_with_entities.append((
        token,
        f'{structure_stanford_deps.get_bio_prefix(previous_entity, entity) if entity != no_entity_mark else ""}{entity}'
    ))
    previous_entity = entity
tokens_with_entities = new_tokens_with_entities

# Merge
lines = list(
    map(lambda item: ' '.join(item), tokens_with_pos_and_phrase_types[:2]))
i = 2
for token, entity in tokens_with_entities:
    if i >= len(tokens_with_pos_and_phrase_types):
        break
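# get_bio_prefix is defined in structure-stanford-deps.py, which is not shown
# here. A sketch of the standard BIO convention it presumably implements (an
# assumption based on how it is called above): "B-" opens a new entity span,
# "I-" continues the span started by the previous token.
def get_bio_prefix(previous_entity, entity):
    return 'I-' if previous_entity == entity else 'B-'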
from convert import read_lines, write_lines

POS_FILE = 'stanford-ner-corpus.txt'
PHRASE_TYPES_FILE = 'phrase_types.txt'

pos_tokens = list(map(lambda line: line.split(' ')[0], read_lines(POS_FILE)))
phrase_types = list(
    map(lambda line: line.split(' ')[0], read_lines(PHRASE_TYPES_FILE)))

counter = 1000
for i in range(len(pos_tokens)):
    if pos_tokens[i] != phrase_types[i]:
        counter -= 1
        print(i, pos_tokens[i], phrase_types[i])
        if (counter < 0):
            break