import os
import json
import xml.etree.ElementTree as ET
from collections import defaultdict

# Model/IO helpers from the MorphoDiTa/NameTag Python bindings and the
# surrounding project (import layout assumed; not shown in the original source):
from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges
from ufal.nametag import Ner, NamedEntities
from synonyms.in_out.readers import open_gz

# Assumed: the model files live next to this module.
dir_cur = os.path.dirname(os.path.abspath(__file__))


def lemmatize(file, output_file):
    morphodita_model = os.path.join(
        dir_cur, 'czech-morfflex-pdt-131112-raw_lemmas.tagger-best_accuracy')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tokenizer = tagger.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                # Write each token as "lemma___tag", one sentence per line.
                out.write(' '.join(
                    str(l.lemma).strip() + '___' + str(l.tag).strip()
                    for l in lemmas))
                out.write('\n')
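# Usage sketch (hypothetical paths): lemmatize a gzipped plain-text corpus,
# producing one "lemma___tag" token per input token.
# lemmatize('corpus.txt.gz', 'corpus.lemmatized.txt.gz')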
def syn2_to_plain(filename, filename_out, keep_punctuation=True, keep_tags=False, raw=False):
    with open_gz(filename_out, 'w') as file, open_gz(filename, 'r', encoding='utf-8') as f:
        root = ET.iterparse(f)
        for event, element in root:
            if element.tag == 'block':
                file.write('\n')
            if element.tag == 's':
                # Each <s> element holds one token per line: "word<TAB>lemma<TAB>tags".
                for token in element.text.split('\n'):
                    if token:
                        word, lemma, tags = token.split('\t')
                        if raw:
                            file.write(word + ' ')
                        elif keep_tags:
                            file.write(word + ' ' + lemma + ' ' + tags + '\n')
                        elif keep_punctuation or not tags.startswith('Z'):
                            # 'Z*' tags mark punctuation in the Prague tagset.
                            file.write(lemma + ' ')
                file.write('\n')
            element.clear()
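# Usage sketch (hypothetical file names): convert a SYN2-style vertical XML
# corpus to plain text in one of three modes, depending on the flags above.
# syn2_to_plain('syn2010.xml.gz', 'lemmas.txt.gz')                   # lemmas only
# syn2_to_plain('syn2010.xml.gz', 'raw.txt.gz', raw=True)            # surface forms
# syn2_to_plain('syn2010.xml.gz', 'tagged.txt.gz', keep_tags=True)   # word lemma tag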
def lemmatize_and_replace_entities(file, output_file):
    nametag_model = os.path.join(dir_cur, 'czech-cnec2.0-140304.ner')
    morphodita_model = os.path.join(dir_cur, 'czech-morfflex-131112.tagger-fast')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    ner = Ner.load(nametag_model)
    assert ner
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    entities = NamedEntities()
    tokenizer = ner.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                ner.recognize(forms, entities)
                sorted_entities = sort_entities(entities)
                open_entities = []       # token index where each open entity ends
                open_entities_type = []  # type of each open entity
                e = 0
                for i in range(len(tokens)):
                    lemma = lemmas[i]
                    # Open every entity that starts at token i.
                    while e < len(sorted_entities) and sorted_entities[e].start == i:
                        open_entities.append(
                            sorted_entities[e].start + sorted_entities[e].length - 1)
                        open_entities_type.append(sorted_entities[e].type)
                        e += 1
                    if not open_entities:
                        out.write(str(lemma.lemma) + ' ')
                    else:
                        # Token lies inside at least one entity: emit a placeholder
                        # carrying the types of all currently open entities.
                        out.write("@!ENT!%s " % ('!'.join(open_entities_type)))
                    # Close entities that end at token i.
                    while open_entities and open_entities[-1] == i:
                        open_entities.pop()
                        open_entities_type.pop()
            out.write('\n')
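# sort_entities is used above but not defined in this section. A minimal
# sketch consistent with the open/close logic above (an assumption, not the
# project's actual implementation): entities ordered by start token, longest
# first, so nested entities are pushed after, and popped before, the
# entities that contain them.
def sort_entities(entities):
    return sorted(entities, key=lambda entity: (entity.start, -entity.length))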
def create(file):
    # Count how often each word occurs with each single-character POS tag;
    # input lines are expected in the form "word_T".
    tags = defaultdict(lambda: defaultdict(int))
    with open_gz(file) as f:
        for line in f:
            if line != '\n' and len(line) >= 3:
                word = line.strip()
                if word[-2] == '_':
                    p = word[-1]
                    w = word[:-2]
                    tags[w][p] += 1
    pos = POS()
    # Keep the most frequent tag for every word.
    for word, t in tags.items():
        max_num = 0
        for tag, num in t.items():
            if num > max_num:
                pos.word_pos[word] = tag
                max_num = num
    return pos
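# Example of the expected input format for create(): one "word_T" token per
# line, where T is a single-character POS tag. Given the lines
#   pes_N
#   pes_N
#   pes_A
# the counts are {'pes': {'N': 2, 'A': 1}} and the majority tag wins,
# so pos.word_pos['pes'] == 'N'.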
#!/usr/bin/env python3
import argparse
from synonyms.in_out.readers import open_gz
from synonyms.in_out.utils import check_input_file_exists

__author__ = 'veselt12'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Splits a corpus into numbered files of roughly word_count words each")
    parser.add_argument('corpus', type=str, help='Corpus')
    parser.add_argument('word_count', type=int, help='Maximum number of words per output file')
    parser.add_argument('postfix_length', type=int, help='Number of digits in the file-number suffix')
    parser.add_argument('output_file', type=str, help='Name (prefix) of the output files')
    args = parser.parse_args()
    check_input_file_exists(args.corpus)
    max_count = args.word_count
    with open_gz(args.corpus) as input:
        word_count = 0
        file_count = 0
        output = None
        for line in input:
            line = line.strip()
            if not output:
                output = open_gz(args.output_file +
                                 ('.%0' + str(args.postfix_length) + 'd') % file_count +
                                 '.gz', 'w')
            if not line:
                output.write('\n')
                continue
            word_count += len(line.split())
            output.write(line + '\n')
            # Rotate to a new output file once max_count words have been written
            # (reconstructed: the tail of the original script is truncated).
            if word_count >= max_count:
                output.close()
                output = None
                word_count = 0
                file_count += 1
        if output:
            output.close()
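# Usage sketch (hypothetical names): split corpus.txt.gz into ~1M-word chunks
# named chunks.000.gz, chunks.001.gz, ...
#   python split_corpus.py corpus.txt.gz 1000000 3 chunks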
#!/usr/bin/env python3
import argparse
from synonyms.dictionary import Dictionary
from synonyms.in_out.readers import open_gz
from synonyms.in_out.utils import check_input_file_exists

__author__ = 'veselt12'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Filters a plain-text corpus, keeping only words present in the dictionary")
    parser.add_argument('input_file', type=str, help='Input file with corpus in plain text')
    parser.add_argument('dictionary', type=str, help='Input file with dictionary')
    parser.add_argument('output_file', type=str,
                        help='Output file where the filtered version of the corpus will be stored')
    args = parser.parse_args()
    check_input_file_exists(args.input_file)
    check_input_file_exists(args.dictionary)
    dictionary = Dictionary(filename=args.dictionary)
    with open_gz(args.output_file, 'w+', encoding='utf-8') as w, \
            open_gz(args.input_file, encoding='utf-8') as r:
        for line in r:
            # Lowercase the line and keep only in-dictionary words.
            w.write(' '.join(word for word in line.lower().split() if word in dictionary) + '\n')
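# Usage sketch (hypothetical names):
#   python filter_corpus.py corpus.txt.gz dictionary.txt.gz corpus.filtered.txt.gz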
def load(filename):
    pos = POS()
    with open_gz(filename) as f:
        pos.word_pos = json.load(f)
    return pos
def save(self, filename):
    with open_gz(filename, 'w') as f:
        json.dump(self.word_pos, f)
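# POS is referenced above but not defined in this section; a minimal sketch
# consistent with create(), load() and save() (an assumption, not the
# project's actual class):
class POS:
    def __init__(self):
        # Maps each word to its most frequent POS tag.
        self.word_pos = {}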