from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.simple import LineTokenizer
from senpy.models import Entry


def analyse_entry(self, entry, params):
    chunker_type = params["delimiter"]
    original_text = entry['nif:isString']
    if chunker_type == "sentence":
        tokenizer = PunktSentenceTokenizer()
    elif chunker_type == "paragraph":
        tokenizer = LineTokenizer()
    else:
        raise ValueError('Unsupported delimiter: {}'.format(chunker_type))
    # Character offsets of every chunk, used to build the ids of the new entries
    chars = list(tokenizer.span_tokenize(original_text))
    for i, chunk in enumerate(tokenizer.tokenize(original_text)):
        e = Entry()
        e['nif:isString'] = chunk
        if entry.id:
            e.id = entry.id + "#char={},{}".format(chars[i][0], chars[i][1])
        yield e
# NOTE: word_prenormalizer() and add_punctuation_analysis() are module-local
# helpers assumed to be defined elsewhere in this module; the tagger imports
# below assume EstNLTK v1.6.
from nltk.tokenize.simple import LineTokenizer
from estnltk.taggers import (VabamorfAnalyzer, SentenceTokenizer, TokensTagger,
                             PretokenizedTextCompoundTokensTagger)


def apply_pipeline(text, conf):
    # Initialize the configuration
    # If alphabet corrections should be performed
    if 'prenormalize' not in conf:
        conf['prenormalize'] = True
    if 'add_punctuation_analyses' not in conf:
        conf['add_punctuation_analyses'] = True
    # If user dictionaries should be used
    if 'user_dictionaries' not in conf:
        conf['user_dictionaries'] = None
    if 'vm_analyzer' not in conf:
        conf['vm_analyzer'] = VabamorfAnalyzer(guess=False, propername=False)
    if 'newline_sentence_tokenizer' not in conf:
        conf['newline_sentence_tokenizer'] = SentenceTokenizer(
            base_sentence_tokenizer=LineTokenizer())
    if 'tokens_tagger' not in conf:
        conf['tokens_tagger'] = TokensTagger()
    if conf['prenormalize'] and 'prenormalizer' not in conf:
        conf['prenormalizer'] = word_prenormalizer()
    # Collect multi-word expressions from the pretokenized text: tokens are
    # separated by double spaces, so a single space inside a raw token marks
    # a multi-word unit (e.g. 'Rio de Janeiro')
    txt = text.text
    multiword_expressions = []
    raw_words = txt.split('  ')
    for raw_word in raw_words:
        if ' ' in raw_word:
            multiword_expressions.append(raw_word)
    conf['tokens_tagger'].tag(text)
    multiword_expressions = [mw.split() for mw in multiword_expressions]
    compound_tokens_tagger = PretokenizedTextCompoundTokensTagger(
        multiword_units=multiword_expressions)
    compound_tokens_tagger.tag(text)
    text.tag_layer(['words'])
    conf['newline_sentence_tokenizer'].tag(text)
    if conf['prenormalize']:
        conf['prenormalizer'].retag(text)
    conf['vm_analyzer'].tag(text)
    # Perform the fixes from the user dictionaries
    if conf['user_dictionaries']:
        if 'global' in conf['user_dictionaries']:
            conf['user_dictionaries']['global'].retag(text)
        if text.meta['location'] in conf['user_dictionaries']:
            conf['user_dictionaries'][text.meta['location']].retag(text)
    if conf['add_punctuation_analyses']:
        add_punctuation_analysis(text)
    return text
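# A minimal usage sketch for apply_pipeline(), assuming EstNLTK v1.6 Text
# objects and the double-space convention used above (tokens are separated
# by two spaces, so single spaces can mark multi-word units).
from estnltk import Text

sample = Text('Mul  on  palju  igasugust  informatsiooni  .')
sample = apply_pipeline(sample, conf={})   # an empty conf is filled in with defaults
print(sample.morph_analysis)               # layer tagged by the VabamorfAnalyzer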
def analyse_entry(self, entry, params):
    yield entry
    chunker_type = params["delimiter"]
    original_text = entry['nif:isString']
    if chunker_type == "sentence":
        tokenizer = PunktSentenceTokenizer()
    elif chunker_type == "paragraph":
        tokenizer = LineTokenizer()
    chars = list(tokenizer.span_tokenize(original_text))
    if len(chars) == 1:
        # This sentence was already split
        return
    for i, chunk in enumerate(chars):
        start, end = chunk
        e = Entry()
        e['nif:isString'] = original_text[start:end]
        if entry.id:
            e.id = entry.id + "#char={},{}".format(start, end)
        yield e
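# A minimal usage sketch for the split plugin above, assuming the senpy Entry
# model; `plugin` stands for an instance of the plugin class this method
# belongs to (hypothetical name). The original entry is yielded first,
# followed by one entry per chunk whenever the text splits into several.
from senpy.models import Entry

entry = Entry()
entry['nif:isString'] = 'First sentence. And a second one.'
entry.id = 'http://example.com/entry1'     # hypothetical identifier
for result in plugin.analyse_entry(entry, {'delimiter': 'sentence'}):
    print(result['nif:isString'])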
def __init__(self):
    NltkTokenizer.__init__(self)
    _LineTokenizer.__init__(self)
def read_text_from_conll_file( file_name, layer_name=LAYER_CONLL, **kwargs ):
    ''' Reads the CONLL format syntactic analysis from a given file, and
        returns it as a Text object.

        The Text object has been tokenized for paragraphs, sentences, words,
        and it contains syntactic analyses aligned with word spans, in the
        layer *layer_name* (by default: LAYER_CONLL);

        Attached syntactic analyses are in the same format as the output
        of utils.normalise_alignments();

        Parameters
        -----------
        file_name : str
            Name of the input file; Should contain syntactically analysed
            text, following the CONLL format;

        layer_name : str
            Name of the Text's layer in which syntactic analyses are stored;
            Defaults to 'conll_syntax';

        For other parameters, see optional parameters of the methods:

         utils.normalise_alignments():          "rep_miss_w_dummy", "fix_selfrefs",
                                                "keep_old", "mark_root";
         maltparser_support.align_CONLL_with_Text():  "check_tokens", "add_word_ids";
    '''
    # 1) Load conll analysed text from file
    conll_lines = []
    in_f = codecs.open(file_name, mode='r', encoding='utf-8')
    for line in in_f:
        # Skip comment lines
        if line.startswith('#'):
            continue
        conll_lines.append( line.rstrip() )
    in_f.close()
    # 2) Extract sentences and word tokens
    sentences = []
    sentence  = []
    for i, line in enumerate( conll_lines ):
        if len(line) > 0 and '\t' in line:
            features = line.split('\t')
            if len(features) != 10:
                raise Exception(' In file '+file_name+', line '+str(i)+\
                                ' with unexpected format: "'+line+'" ')
            word_id = features[0]
            token   = features[1]
            sentence.append( token )
        elif len(line) == 0 or re.match(r'^\s+$', line):
            # End of a sentence
            if sentence:
                # (!) Use double space instead of single space in order to distinguish
                #     word-tokenizing space from the single space in the multiwords
                #     (e.g. 'Rio de Janeiro' as a single word);
                sentences.append( '  '.join(sentence) )
            sentence = []
    if sentence:
        sentences.append( '  '.join(sentence) )
    # 3) Construct the estnltk's Text
    kwargs4text = {
        # Use custom tokenization utils in order to preserve exactly the same
        # tokenization as was in the input;
        "word_tokenizer":     RegexpTokenizer("  ", gaps=True),
        "sentence_tokenizer": LineTokenizer()
    }
    from estnltk.text import Text
    text = Text( '\n'.join(sentences), **kwargs4text )
    # Tokenize up to the words layer
    text.tokenize_words()
    # 4) Align syntactic analyses with the Text
    alignments = align_CONLL_with_Text( conll_lines, text, None, **kwargs )
    normalise_alignments( alignments, data_type=CONLL_DATA, **kwargs )
    # Attach alignments to the text
    text[ layer_name ] = alignments
    return text
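# A minimal usage sketch for read_text_from_conll_file(); the file name is
# hypothetical, and the optional flag is forwarded to normalise_alignments().
text = read_text_from_conll_file('analysed.conll', fix_selfrefs=True)
print(len(text[LAYER_CONLL]), 'word-aligned syntactic analyses')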
def read_text_from_cg3_file( file_name, layer_name=LAYER_VISLCG3, **kwargs ):
    ''' Reads the output of VISLCG3 syntactic analysis from a given file, and
        returns it as a Text object.

        The Text object has been tokenized for paragraphs, sentences, words,
        and it contains syntactic analyses aligned with word spans, in the
        layer *layer_name* (by default: LAYER_VISLCG3);

        Attached syntactic analyses are in the same format as the output
        of utils.normalise_alignments();

        Note: when loading data from the https://github.com/EstSyntax/EDT
        corpus, it is advisable to add the flags: clean_up=True,
        fix_sent_tags=True, fix_out_of_sent=True, in order to ensure that
        well-formed data will be read from the corpus;

        Parameters
        -----------
        file_name : str
            Name of the input file; Should contain syntactically analysed
            text, following the format of the output of the VISLCG3 syntactic
            analyser;

        clean_up : bool
            Optional argument specifying whether vislcg3_syntax.cleanup_lines()
            should be applied to the lines of syntactic analyses read from the
            file;
            Default: False

        layer_name : str
            Name of the Text's layer in which syntactic analyses are stored;
            Defaults to 'vislcg3_syntax';

        For other parameters, see optional parameters of the methods:

         utils.normalise_alignments():          "rep_miss_w_dummy", "fix_selfrefs",
                                                "keep_old", "mark_root";
         vislcg3_syntax.align_cg3_with_Text():  "check_tokens", "add_word_ids";
         vislcg3_syntax.cleanup_lines():        "remove_caps", "remove_clo",
                                                "double_quotes", "fix_sent_tags"
    '''
    clean_up = False
    for argName, argVal in kwargs.items():
        if argName in ['clean_up', 'cleanup'] and argVal in [True, False]:
            # Whether lines should be cleaned up before parsing
            clean_up = argVal
    # 1) Load vislcg3 analysed text from file
    cg3_lines = []
    in_f = codecs.open(file_name, mode='r', encoding='utf-8')
    for line in in_f:
        # Skip comment lines
        if line.startswith('#'):
            continue
        cg3_lines.append( line.rstrip() )
    in_f.close()
    # Clean up lines of syntactic analyses (if requested)
    if clean_up:
        cg3_lines = cleanup_lines( cg3_lines, **kwargs )
    # 2) Extract sentences and word tokens
    sentences = []
    sentence  = []
    for i, line in enumerate( cg3_lines ):
        if line == '"<s>"':
            if sentence:
                print('(!) Sentence begins before previous ends at line: '+str(i), \
                      file=sys.stderr)
            sentence = []
        elif pat_double_quoted.match( line ) and line != '"<s>"' and line != '"</s>"':
            token_match = pat_cg3_word_token.match( line )
            if token_match:
                line = token_match.group(1)
            else:
                raise Exception('(!) Unexpected token format: ', line)
            sentence.append( line )
        elif line == '"</s>"':
            if not sentence:
                print('(!) Empty sentence at line: '+str(i), \
                      file=sys.stderr)
            # (!) Use double space instead of single space in order to distinguish
            #     word-tokenizing space from the single space in the multiwords
            #     (e.g. 'Rio de Janeiro' as a single word);
            sentences.append( '  '.join(sentence) )
            sentence = []
    # 3) Construct the estnltk's Text
    kwargs4text = {
        # Use custom tokenization utils in order to preserve exactly the same
        # tokenization as was in the input;
        "word_tokenizer":     RegexpTokenizer("  ", gaps=True),
        "sentence_tokenizer": LineTokenizer()
    }
    from estnltk.text import Text
    text = Text( '\n'.join(sentences), **kwargs4text )
    # Tokenize up to the words layer
    text.tokenize_words()
    # 4) Align syntactic analyses with the Text
    alignments = align_cg3_with_Text( cg3_lines, text, **kwargs )
    normalise_alignments( alignments, data_type=VISLCG3_DATA, **kwargs )
    # Attach alignments to the text
    text[ layer_name ] = alignments
    return text
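# A minimal usage sketch for read_text_from_cg3_file(); the file name is
# hypothetical. As the docstring notes, the cleanup flags are advisable
# when loading data from the EDT corpus.
text = read_text_from_cg3_file('analysed.cg3', clean_up=True,
                               fix_sent_tags=True, fix_out_of_sent=True)
print(len(text[LAYER_VISLCG3]), 'word-aligned syntactic analyses')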
def read_text_from_idx_file(file_name, layer_name=WORDS, keep_init_lines=False):
    ''' Reads IDX format morphological annotations from a given file, and
        returns them as a Text object.

        The Text object will be tokenized for paragraphs, sentences, words,
        and it will contain morphological annotations in the layer
        *layer_name* (by default: WORDS);

        Parameters
        -----------
        file_name : str
            Name of the input file; Should contain IDX format text
            segmentation and morphological annotation;

        keep_init_lines : bool
            Optional argument specifying whether the lines from the file
            should also be preserved on a special layer named 'init_lines';
            Default: False

        layer_name : str
            Name of the Text's layer in which morphological annotations
            from the text are stored;
            Defaults to WORDS;

        Example: expected format of the input (fields are tab-separated):
            129  1  1  "               "              "                 Z
            129  2  1  Mul             mina           mina+l            P  sg ad
            129  3  1  on              olema          ole+0             V  b
            129  3  1  on              olema          ole+0             V  vad
            129  4  1  palju           palju          palju+0           D
            129  5  1  igasugust       igasugune      iga_sugune+t      P  sg p
            129  6  1  informatsiooni  informatsioon  informatsioon+0   S  sg p
            129  7  1  .               .              .                 Z
    '''
    from nltk.tokenize.simple import LineTokenizer
    from nltk.tokenize.regexp import RegexpTokenizer
    from estnltk import Text
    # 1) Collect the text along with morphological analyses from the input IDX file
    init_lines = []
    words = []
    sentence = []
    sentences = []
    prev_sent_id = -1
    prev_word_id = -1
    in_f = codecs.open(file_name, mode='r', encoding='utf-8')
    for line in in_f:
        fields = line.split('\t')
        assert len(fields) == 8, \
            '(!) Unexpected number of fields in the line: ' + str(len(fields))
        sent_id   = fields[0]
        word_id   = fields[1]
        clause_id = fields[2]
        token     = fields[3]
        if prev_sent_id != sent_id:
            # Record the old sentence, start a new one
            if sentence:
                sentences.append(' '.join(sentence))
            sentence = []
            # Reset the word counter, so that the first word of the new
            # sentence is always recorded, even if its id coincides with
            # the id of the last word of the previous sentence
            prev_word_id = -1
        if prev_word_id != word_id:
            # Record a new token
            sentence.append(token)
            word = {TEXT: token, ANALYSIS: []}
            words.append(word)
        # Augment the last word in the list with the new analysis
        lemma = fields[4]
        root  = fields[5]
        pos   = fields[6]
        form  = fields[7].rstrip()
        analysis = get_analysis_dict(root, pos, form)
        analysis[LEMMA] = lemma
        words[-1][ANALYSIS].append(analysis)
        prev_sent_id = sent_id
        prev_word_id = word_id
        if keep_init_lines:
            init_lines.append([sent_id + ' ' + word_id, line])
    in_f.close()
    if sentence:
        # Record the last sentence
        sentences.append(' '.join(sentence))
    # 2) Construct the estnltk's Text
    kwargs4text = {
        # Use custom tokenization utils in order to preserve exactly the same
        # tokenization as was in the input;
        "word_tokenizer":     RegexpTokenizer(" ", gaps=True),
        "sentence_tokenizer": LineTokenizer()
    }
    text = Text('\n'.join(sentences), **kwargs4text)
    # Tokenize up to the words layer
    text.tokenize_words()
    # 3) Create a new layer with morphological analyses, or
    #    populate the old layer with morphological analyses;
    assert len(text[WORDS]) == len(words), \
        '(!) Number of words from input does not match with the number of words in EstNLTK Text: ' + \
        str(len(text[WORDS])) + ' != ' + str(len(words))
    if layer_name != WORDS:
        # If necessary, create a new layer duplicating the WORDS layer
        text[layer_name] = []
        for word in text[WORDS]:
            text[layer_name].append({START: word[START], END: word[END], TEXT: word[TEXT]})
    # Copy morphological analyses to the new layer / populate the old layer
    for wid, word in enumerate(text[WORDS]):
        text[layer_name][wid][ANALYSIS] = words[wid][ANALYSIS]
    if layer_name == WORDS:
        assert text.is_tagged(ANALYSIS), '(!) The layer of analysis should exist by now!'
    if keep_init_lines:
        # Preserve the initial lines from the file in a separate layer
        text['init_lines'] = []
        i = 0
        for wid, word in enumerate(text[layer_name]):
            words_lines = []
            # Collect the lines associated with the word
            while i < len(init_lines):
                lid, line = init_lines[i]
                if not words_lines or words_lines[-1][0] == lid:
                    words_lines.append([lid, line])
                else:
                    break
                i += 1
            # Record the lines
            text['init_lines'].append( \
                {START: word[START], END: word[END],
                 'lines': [l[1] for l in words_lines]} )
        assert len(text['init_lines']) == len(text[layer_name]), \
            '(!) The number of initial lines should match the number of words in text!'
    return text
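# A minimal usage sketch for read_text_from_idx_file(); the file name is
# hypothetical. With keep_init_lines=True the original file lines are kept
# on the 'init_lines' layer, one group of lines per word.
text = read_text_from_idx_file('annotated.idx', keep_init_lines=True)
print(len(text[WORDS]), 'words;', len(text['init_lines']), 'line groups')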