Example #1
    def analyse_entry(self, entry, params):
        # Choose the tokenizer according to the requested chunking granularity
        chunker_type = params["delimiter"]
        original_text = entry['nif:isString']
        if chunker_type == "sentence":
            tokenizer = PunktSentenceTokenizer()
        if chunker_type == "paragraph":
            tokenizer = LineTokenizer()
        # Character spans of the chunks, used to build the fragment identifiers
        chars = list(tokenizer.span_tokenize(original_text))
        for i, chunk in enumerate(tokenizer.tokenize(original_text)):
            print(chunk)
            e = Entry()
            e['nif:isString'] = chunk
            if entry.id:
                e.id = entry.id + "#char={},{}".format(chars[i][0], chars[i][1])
            yield e
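The "#char=start,end" fragment identifiers above come from span_tokenize(), which returns character offsets for each chunk. A minimal sketch (hypothetical input text, not part of the plugin) of how those offsets map to fragment identifiers:

from nltk.tokenize.punkt import PunktSentenceTokenizer

sample = "First sentence. Second sentence."
tokenizer = PunktSentenceTokenizer()
for start, end in tokenizer.span_tokenize(sample):
    # Each span gives the chunk's slice of the original text and its fragment id
    print("#char={},{} -> {!r}".format(start, end, sample[start:end]))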
Example #2
def apply_pipeline(text, conf):
    #Initialize the configuration
    #If alphabet corrections should be performed
    if 'prenormalize' not in conf:
        conf['prenormalize'] = True
    if 'add_punctuation_analyses' not in conf:
        conf['add_punctuation_analyses'] = True
    #If user dictionary should be used
    if 'user_dictionaries' not in conf:
        conf['user_dictionaries'] = None
    if 'vm_analyzer' not in conf:
        conf['vm_analyzer'] = VabamorfAnalyzer(guess=False, propername=False)
    if 'newline_sentence_tokenizer' not in conf:
        conf['newline_sentence_tokenizer'] = SentenceTokenizer(
            base_sentence_tokenizer=LineTokenizer())
    if 'tokens_tagger' not in conf:
        conf['tokens_tagger'] = TokensTagger()
    if conf['prenormalize'] and 'prenormalizer' not in conf:
        conf['prenormalizer'] = word_prenormalizer()
    #For testing the pretokenized functions
    txt = text.text
    multiword_expressions = []
    # Tokens are assumed to be separated by double spaces; a single space inside
    # a token then marks a multiword expression (e.g. 'Rio de Janeiro')
    raw_words = txt.split('  ')
    for raw_word in raw_words:
        if ' ' in raw_word:
            multiword_expressions.append(raw_word)
    conf['tokens_tagger'].tag(text)
    multiword_expressions = [mw.split() for mw in multiword_expressions]
    compound_tokens_tagger = PretokenizedTextCompoundTokensTagger(
        multiword_units=multiword_expressions)
    compound_tokens_tagger.tag(text)
    #CompoundTokenTagger(tag_initials = False).tag(text)
    #text.tag_layer(['sentences'])
    text.tag_layer(['words'])
    conf['newline_sentence_tokenizer'].tag(text)
    if conf['prenormalize']:
        conf['prenormalizer'].retag(text)
    conf['vm_analyzer'].tag(text)
    # Perform the fixes
    if conf['user_dictionaries']:
        if 'global' in conf['user_dictionaries']:
            conf['user_dictionaries']['global'].retag(text)
        if text.meta['location'] in conf['user_dictionaries']:
            conf['user_dictionaries'][text.meta['location']].retag(text)
    if conf['add_punctuation_analyses']:
        add_punctuation_analysis(text)
    return text
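A minimal usage sketch for apply_pipeline(), assuming the project-specific helpers imported by this snippet (word_prenormalizer, add_punctuation_analysis, the user dictionary retaggers) are available; the input string and the 'morph_analysis' layer name are illustrative only:

from estnltk import Text

text = Text("Tere !\nKuidas  läheb ?")    # hypothetical pre-tokenized input
text = apply_pipeline(text, conf={})      # missing conf entries are filled with defaults
print(text.morph_analysis)                # Vabamorf analyses added by conf['vm_analyzer']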
Example #3
    def analyse_entry(self, entry, params):
        # Pass the original entry through first, then yield its chunks
        yield entry
        chunker_type = params["delimiter"]
        original_text = entry['nif:isString']
        if chunker_type == "sentence":
            tokenizer = PunktSentenceTokenizer()
        if chunker_type == "paragraph":
            tokenizer = LineTokenizer()
        chars = list(tokenizer.span_tokenize(original_text))
        if len(chars) == 1:
            # This sentence was already split
            return
        for i, chunk in enumerate(chars):
            start, end = chunk
            e = Entry()
            e['nif:isString'] = original_text[start:end]
            if entry.id:
                e.id = entry.id + "#char={},{}".format(start, end)
            yield e
Example #4
    def __init__(self):

        NltkTokenizer.__init__(self)
        _LineTokenizer.__init__(self)
Example #5
def read_text_from_conll_file( file_name, layer_name=LAYER_CONLL, **kwargs ):
    ''' Reads CONLL-format syntactic analyses from the given file and returns them as 
        a Text object.
        
        The Text object has been tokenized for paragraphs, sentences and words, and it 
        contains syntactic analyses aligned with word spans, in the layer *layer_name* 
        (by default: LAYER_CONLL);
        
        Attached syntactic analyses are in the same format as the output of 
          utils.normalise_alignments();
        
        Parameters
        -----------
        file_name : str
            Name of the input file; Should contain syntactically analysed text,
            following the CONLL format;
        
        layer_name : str
            Name of the Text's layer in which syntactic analyses are stored; 
            Defaults to 'conll_syntax';
        
            For other parameters, see optional parameters of the methods:
            
             utils.normalise_alignments():          "rep_miss_w_dummy", "fix_selfrefs",
                                                    "keep_old", "mark_root";
             maltparser_support.align_CONLL_with_Text():  "check_tokens", "add_word_ids";

    '''
    # 1) Load conll analysed text from file
    conll_lines = []
    in_f = codecs.open(file_name, mode='r', encoding='utf-8')
    for line in in_f:
        # Skip comment lines
        if line.startswith('#'):
            continue
        conll_lines.append( line.rstrip() )
    in_f.close()
    
    # 2) Extract sentences and word tokens
    sentences = []
    sentence  = []
    for i, line in enumerate( conll_lines ):
        if len(line) > 0 and '\t' in line:
            features = line.split('\t')
            if len(features) != 10:
                raise Exception(' In file '+file_name+', line '+str(i)+\
                                ' with unexpected format: "'+line+'" ')
            word_id = features[0]
            token   = features[1]
            sentence.append( token )
        elif len(line) == 0 or re.match(r'^\s+$', line):
            # End of a sentence 
            if sentence:
               # (!) Use double space instead of single space in order to distinguish
               #     word-tokenizing space from the single space in the multiwords
               #     (e.g. 'Rio de Janeiro' as a single word);
               sentences.append( '  '.join(sentence) )
            sentence = []
    if sentence:
        sentences.append( '  '.join(sentence) )
    
    # 3) Construct the estnltk's Text
    kwargs4text = {
      # Use custom tokenization utils in order to preserve exactly the same 
      # tokenization as was in the input;
      "word_tokenizer": RegexpTokenizer("  ", gaps=True),
      "sentence_tokenizer": LineTokenizer()
    }
    from estnltk.text import Text
    text = Text( '\n'.join(sentences), **kwargs4text )
    # Tokenize up to the words layer
    text.tokenize_words()
    
    # 4) Align syntactic analyses with the Text
    alignments = align_CONLL_with_Text( conll_lines, text, None, **kwargs )
    normalise_alignments( alignments, data_type=CONLL_DATA, **kwargs )
    # Attach alignments to the text
    text[ layer_name ] = alignments
    return text
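A hypothetical call (the file name is a placeholder; LAYER_CONLL is assumed to be in scope, as in the snippet above):

text = read_text_from_conll_file('treebank_sentences.conll')
# Each attached analysis is aligned with the corresponding word span
for word, analysis in zip(text['words'], text[LAYER_CONLL]):
    print(word['text'], analysis)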
Example #6
def read_text_from_cg3_file( file_name, layer_name=LAYER_VISLCG3, **kwargs ):
    ''' Reads the output of VISLCG3 syntactic analysis from the given file and 
        returns it as a Text object.
        
        The Text object has been tokenized for paragraphs, sentences and words, and it 
        contains syntactic analyses aligned with word spans, in the layer *layer_name* 
        (by default: LAYER_VISLCG3);
        
        Attached syntactic analyses are in the same format as the output of 
          utils.normalise_alignments();
        
        Note: when loading data from the  https://github.com/EstSyntax/EDT  corpus,
        it is advisable to add the flags clean_up=True, fix_sent_tags=True and
        fix_out_of_sent=True in order to ensure that well-formed data will be
        read from the corpus;
        
        Parameters
        -----------
        file_name : str
            Name of the input file; Should contain syntactically analysed text,
            following the format of the output of VISLCG3 syntactic analyser;
        
        clean_up : bool
            Optional argument specifying whether the vislcg3_syntax.cleanup_lines()
            should be applied in the lines of syntactic analyses read from the 
            file;
            Default: False
        
        layer_name : str
            Name of the Text's layer in which syntactic analyses are stored; 
            Defaults to 'vislcg3_syntax';
        
            For other parameters, see optional parameters of the methods:
            
             utils.normalise_alignments():          "rep_miss_w_dummy", "fix_selfrefs",
                                                    "keep_old", "mark_root";
             vislcg3_syntax.align_cg3_with_Text():  "check_tokens", "add_word_ids";
             vislcg3_syntax.cleanup_lines():        "remove_caps", "remove_clo",
                                                    "double_quotes", "fix_sent_tags"
        
        
    '''
    clean_up = False
    for argName, argVal in kwargs.items():
        if argName in ['clean_up', 'cleanup'] and argVal in [True, False]:
           #  Clean up lines
           clean_up = argVal
    # 1) Load vislcg3 analysed text from file
    cg3_lines = []
    in_f = codecs.open(file_name, mode='r', encoding='utf-8')
    for line in in_f:
        # Skip comment lines
        if line.startswith('#'):
            continue
        cg3_lines.append( line.rstrip() )
    in_f.close()
    # Clean up lines of syntactic analyses (if requested)
    if clean_up:
        cg3_lines = cleanup_lines( cg3_lines, **kwargs )

    # 2) Extract sentences and word tokens
    sentences = []
    sentence  = []
    for i, line in enumerate( cg3_lines ):
        if line == '"<s>"':
            if sentence:
                print('(!) Sentence begins before previous ends at line: '+str(i), \
                      file=sys.stderr)
            sentence  = []
        elif pat_double_quoted.match( line ) and line != '"<s>"' and line != '"</s>"':
            token_match = pat_cg3_word_token.match( line )
            if token_match:
                line = token_match.group(1)
            else:
                raise Exception('(!) Unexpected token format: ', line)
            sentence.append( line )
        elif line == '"</s>"':
            if not sentence:
                print('(!) Empty sentence at line: '+str(i), \
                      file=sys.stderr)
            # (!) Use double space instead of single space in order to distinguish
            #     word-tokenizing space from the single space in the multiwords
            #     (e.g. 'Rio de Janeiro' as a single word);
            sentences.append( '  '.join(sentence) )
            sentence = []

    # 3) Construct the estnltk's Text
    kwargs4text = {
      # Use custom tokenization utils in order to preserve exactly the same 
      # tokenization as was in the input;
      "word_tokenizer": RegexpTokenizer("  ", gaps=True),
      "sentence_tokenizer": LineTokenizer()
    }
    from estnltk.text import Text
    text = Text( '\n'.join(sentences), **kwargs4text )
    # Tokenize up to the words layer
    text.tokenize_words()
    
    # 4) Align syntactic analyses with the Text
    alignments = align_cg3_with_Text( cg3_lines, text, **kwargs )
    normalise_alignments( alignments, data_type=VISLCG3_DATA, **kwargs )
    # Attach alignments to the text
    text[ layer_name ] = alignments
    return text
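A hypothetical call following the docstring's advice for the EDT corpus (the file name is a placeholder; LAYER_VISLCG3 is assumed to be in scope):

text = read_text_from_cg3_file('edt_sentences.cg3', clean_up=True,
                               fix_sent_tags=True, fix_out_of_sent=True)
print(len(text[LAYER_VISLCG3]), 'aligned analyses')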
Example #7
def read_text_from_idx_file(file_name,
                            layer_name=WORDS,
                            keep_init_lines=False):
    ''' Reads IDX-format morphological annotations from the given file and returns them 
        as a Text object.
        
        The Text object will be tokenized for paragraphs, sentences and words, and it will
        contain morphological annotations in the layer *layer_name* (by default: WORDS);
        
        Parameters
        -----------
        file_name : str
            Name of the input file; Should contain IDX format text segmentation and 
            morphological annotation;
        
        keep_init_lines : bool
            Optional argument specifying whether the lines from the file should also be
            preserved on a special layer named 'init_lines';
            Default: False
        
        layer_name : str
            Name of the Text's layer in which morphological annotations from text are 
            stored; 
            Defaults to WORDS;
    
        Example: expected format of the input:
          129	1	1	"	"	"	Z	
          129	2	1	Mul	mina	mina+l	P	sg ad
          129	3	1	on	olema	ole+0	V	b
          129	3	1	on	olema	ole+0	V	vad
          129	4	1	palju	palju	palju+0	D	
          129	5	1	igasugust	igasugune	iga_sugune+t	P	sg p
          129	6	1	informatsiooni	informatsioon	informatsioon+0	S	sg p
          129	7	1	.	.	.	Z	
        
    '''
    from nltk.tokenize.simple import LineTokenizer
    from nltk.tokenize.regexp import RegexpTokenizer
    from estnltk import Text
    # 1) Collect the text along with morphological analyses from the input IDX file
    init_lines = []
    words = []
    sentence = []
    sentences = []
    prev_sent_id = -1
    prev_word_id = -1
    in_f = codecs.open(file_name, mode='r', encoding='utf-8')
    for line in in_f:
        fields = line.split('\t')
        assert len(fields) == 8, \
            '(!) Unexpected number of fields in the line: ' + str(len(fields))
        sent_id = fields[0]
        word_id = fields[1]
        clause_id = fields[2]
        token = fields[3]
        if prev_sent_id != sent_id:
            # Record the old sentence, start a new
            if sentence:
                sentences.append('  '.join(sentence))
            sentence = []
        if prev_word_id != word_id:
            # Record a new token
            sentence.append(token)
            word = {TEXT: token, ANALYSIS: []}
            words.append(word)
        # Augment the last word in the list with new analysis
        lemma = fields[4]
        root = fields[5]
        pos = fields[6]
        form = fields[7].rstrip()
        ending = ''
        clitic = ''
        analysis = get_analysis_dict(root, pos, form)
        analysis[LEMMA] = lemma
        words[-1][ANALYSIS].append(analysis)
        prev_sent_id = sent_id
        prev_word_id = word_id
        if keep_init_lines:
            init_lines.append([sent_id + ' ' + word_id, line])
    in_f.close()
    if sentence:
        # Record the last sentence
        sentences.append('  '.join(sentence))

    # 2) Construct the estnltk's Text
    kwargs4text = {
        # Use custom tokenization utils in order to preserve exactly the same
        # tokenization as was in the input;
        "word_tokenizer": RegexpTokenizer("  ", gaps=True),
        "sentence_tokenizer": LineTokenizer()
    }
    from estnltk.text import Text
    text = Text('\n'.join(sentences), **kwargs4text)
    # Tokenize up to the words layer
    text.tokenize_words()

    # 3) Create a new layer with morphological analyses, or
    #    populate the old layer with morphological analyses;
    assert len(text[WORDS]) == len(words), \
        '(!) Number of words from input does not match with the number of words in EstNLTK Text: '+\
             str(len(text[WORDS]) )+' != '+str(len(words))
    if layer_name != WORDS:
        # If necessary, create a new layer duplicating the WORDS layer
        text[layer_name] = []
        for word in text[WORDS]:
            text[layer_name].append({
                START: word[START],
                END: word[END],
                TEXT: word[TEXT]
            })
    # Copy morphological analyses to the new layer / populate the old layer
    for wid, word in enumerate(text[WORDS]):
        text[layer_name][wid][ANALYSIS] = words[wid][ANALYSIS]
    if layer_name == WORDS:
        assert text.is_tagged(ANALYSIS), \
            '(!) The layer of analysis should exist by now!'

    if keep_init_lines:
        # Preserve the initial lines from file in a separate layer
        text['init_lines'] = []
        i = 0
        for wid, word in enumerate(text[layer_name]):
            words_lines = []
            # collect lines associated with the word
            while i < len(init_lines):
                [lid, line] = init_lines[i]
                if not words_lines or words_lines[-1][0] == lid:
                    words_lines.append([lid, line])
                else:
                    break
                i += 1
            # record lines
            text['init_lines'].append( \
                {START:word[START], END:word[END], 'lines':[l[1] for l in words_lines]} )
        assert len(text['init_lines']) == len(text[layer_name]), \
            '(!) The number of initial lines should match the number of words in text!'
    return text
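A hypothetical call (the file name is a placeholder; WORDS, TEXT and ANALYSIS are the constants used in the snippet), keeping the raw IDX lines on the 'init_lines' layer:

text = read_text_from_idx_file('morph_annotations.idx', keep_init_lines=True)
first_word = text[WORDS][0]
print(first_word[TEXT], first_word[ANALYSIS])
print(text['init_lines'][0]['lines'])   # raw IDX lines behind the first word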