Пример #1
0
def count_ngram2(N, fin, fout):
    """Count N-grams in a corpus, keeping only the top final-word candidate.

    Reads `fin` line by line, counts every N-gram (with '<s>' left padding),
    and for each (N-1)-gram prefix writes a single line
    "prefix<TAB>most_frequent_final_word" to `fout`, sorted by prefix.

    Parameters:
        N (int): the n-gram order.
        fin (str): path to the input corpus, one sentence per line.
        fout (str): path of the tab-separated output file.
    """
    from collections import Counter
    print('count', N, '-gram2 on', fin, 'out to', fout,
          'only top candidate reserved')
    import pynlpl.textprocessors as proc

    with open(fin, 'r') as corpus_in:
        # prefix (tab-joined first N-1 tokens) -> Counter of final tokens
        ngrams = {}
        count_total = 0
        for line in corpus_in:
            words = proc.tokenize(line)
            for ngram in proc.Windower(words, N, '<s>', None):
                term = '\t'.join(ngram[:-1])
                cand = ngram[-1]
                # Counter + setdefault replaces the nested membership checks.
                ngrams.setdefault(term, Counter())[cand] += 1

            count_total += 1
            if count_total % 1000000 == 0:
                # Progress indicator for very large corpora.
                print(count_total)

        # Reduce each candidate Counter to its single most frequent
        # candidate; max() is O(k) vs. the original's O(k log k) sort.
        for term, cands in ngrams.items():
            ngrams[term] = max(cands.items(), key=lambda kv: kv[1])[0]

        with open(fout, 'w') as ngrams_out:
            # Keys are unique, so sorting the items sorts by prefix.
            for key, value in sorted(ngrams.items()):
                ngrams_out.write(key + '\t' + value + '\n')
Пример #2
0
    def add_folia_correction(self, doc, sentence):
        """
        Adds a folia.Correction to an existing folia.Sentence. Output the created folia.Words.

        Tokenizes self.original and self.edited.  When both are a single
        token the correction is attached directly to one folia.Word;
        otherwise folia.Word elements are created per token inside the
        New/Original parts.  Each entry of self.annotations yields one
        folia.Correction; entries after the first are nested inside the
        Original part of the first correction.  Returns the list of
        created folia.Words (edited-side words only).
        """
        words = []

        # Tokenize both the original and the edited form.
        original_tokens = tokenize(self.original)
        edited_tokens = tokenize(self.edited)

        # If we're dealing with single words (e.g. spelling errors), create the correction directly on the word.
        if len(original_tokens) == 1 and len(edited_tokens) == 1:
            # NOTE(review): the Word is created without text; presumably the
            # Correction supplies its text content — confirm against folia docs.
            w = sentence.add(folia.Word)
            words.append(w)
            n = folia.New(doc, self.edited)
            o = folia.Original(doc, self.original)
            for i, a in enumerate(self.annotations):
                if i == 0:
                    # First annotation: the correction lives on the word itself.
                    correction = w.add(folia.Correction, n, o, cls=a['unit'], generate_id_in=sentence)
                else:
                    # Subsequent annotations: nest a fresh correction inside
                    # the Original part of the first correction.
                    n_new = folia.New(doc, self.edited)
                    o_new = folia.Original(doc, self.original)
                    correction = o.add(folia.Correction, n_new, o_new, cls=a['unit'], generate_id_in=sentence)
                self.add_features(correction, a)
        # We are dealing with more than one word, or an insertion/deletion. Create word elements for each token.
        else:
            n = folia.New(doc)
            o = folia.Original(doc)
            # Only the edited (New) words are reported back to the caller.
            for w in edited_tokens:
                word = n.add(folia.Word, w, generate_id_in=sentence)
                words.append(word)
            for w in original_tokens:
                o.add(folia.Word, w, generate_id_in=sentence)
            for i, a in enumerate(self.annotations):
                if i == 0:
                    # First annotation: the correction is attached to the sentence.
                    correction = sentence.add(folia.Correction, n, o, cls=a['unit'], generate_id_in=sentence)
                else:
                    # Subsequent annotations: rebuild the word lists and nest
                    # the correction inside the Original part of the first one.
                    n_new = folia.New(doc)
                    o_new = folia.Original(doc)
                    for w in edited_tokens:
                        n_new.add(folia.Word, w, generate_id_in=sentence)
                    for w in original_tokens:
                        o_new.add(folia.Word, w, generate_id_in=sentence)
                    correction = o.add(folia.Correction, n_new, o_new, cls=a['unit'], generate_id_in=sentence)
                self.add_features(correction, a)

        return words
Пример #3
0
 def to_folia_sentence_child(self, doc, sentence):
     """Convert this leaf node into folia elements on *sentence*.

     Returns a tuple (words, role): the folia.Words that were created,
     and the last folia.SemanticRole built from self.annotations (None
     when the node is a correction or carries no annotations).
     """
     role = None
     if self.is_correction:
         # A correction produces its own words; no semantic role here.
         words = list(self.add_folia_correction(doc, sentence))
     else:
         words = [sentence.add(folia.Word, token)
                  for token in tokenize(self.original)]
         for annotation in self.annotations:
             # Each annotation spans all words; only the last role survives.
             role = folia.SemanticRole(doc, *words, cls=annotation['unit'])
             self.add_features(role, annotation)
     return words, role
Пример #4
0
 def to_folia_sentence_child(self, doc, sentence):
     """Convert this leaf node into folia elements on *sentence*.

     Returns (words, role): the created folia.Words and the last
     folia.SemanticRole built from self.annotations (None when the node
     is a correction or has no annotations).
     """
     words = []
     role = None
     if self.is_correction:
         # A correction produces its own words; no semantic role here.
         words.extend(self.add_folia_correction(doc, sentence))
     else:
         for token in tokenize(self.original):
             w = sentence.add(folia.Word, token)
             words.append(w)
         for a in self.annotations:
             # Each annotation builds a role spanning all words;
             # only the last one is returned.
             role = folia.SemanticRole(doc, *words, cls=a['unit'])
             self.add_features(role, a)
     return words, role
Пример #5
0
    def to_folia_sentence(self, doc, sentence):
        """Recursively emit folia.Words and folia.SemanticRoles for this node.

        Walks the child nodes in order, interleaving plain tokens from
        self.original with the children's output.  Returns the tuple
        (all_words, all_roles) covering this node and its descendants.
        """
        all_words = []
        all_roles = []

        # Cursor into self.original; advances past each child span.
        cursor = 0
        for start, end, child in self.get_child_nodes():
            # Plain tokens between the previous child and this one.
            for token in tokenize(self.original[cursor:start]):
                all_words.append(sentence.add(folia.Word, token))
            if child.children:
                # Inner node: recurse.
                child_words, child_roles = child.to_folia_sentence(doc, sentence)
                all_words.extend(child_words)
                all_roles.extend(child_roles)
            else:
                # Leaf node: emit it directly.
                child_words, child_role = child.to_folia_sentence_child(doc, sentence)
                all_words.extend(child_words)
                if child_role:
                    all_roles.append(child_role)
            cursor = end

        # Trailing tokens after the last child, up to the end of this node.
        for token in tokenize(self.original[cursor:self.end]):
            all_words.append(sentence.add(folia.Word, token))

        # Roles attached to this node itself span all of its words.
        for annotation in self.annotations:
            role = folia.SemanticRole(doc, *all_words, cls=annotation['unit'])
            self.add_features(role, annotation)
            all_roles.append(role)

        return all_words, all_roles
Пример #6
0
    def to_folia_sentence(self, doc, sentence):
        """Recursively emit folia.Words and folia.SemanticRoles for this node.

        Walks the child nodes in order, interleaving plain tokens taken
        from self.original with the output of the children.  Returns
        (all_words, all_roles) covering this node and its descendants.
        """
        all_words = []
        all_roles = []

        # Loop over the child nodes
        current_position = 0
        for start, end, node in self.get_child_nodes():
            # Add tokens until the start of the next child node to the sentence
            tokens = tokenize(self.original[current_position:start])
            for token in tokens:
                word = sentence.add(folia.Word, token)
                all_words.append(word)
            # If the child node has children, recurse
            if node.children:
                words, roles = node.to_folia_sentence(doc, sentence)
                all_words.extend(words)
                all_roles.extend(roles)
            # Else, add the child node
            else:
                words, role = node.to_folia_sentence_child(doc, sentence)
                all_words.extend(words)
                if role:
                    all_roles.append(role)
            # Advance past this child's span in self.original.
            current_position = end

        # Add the tokens from the last child node to the end of this node
        tokens = tokenize(self.original[current_position:self.end])
        for token in tokens:
            word = sentence.add(folia.Word, token)
            all_words.append(word)

        # If this node has annotations, add roles.
        # Each role spans every word gathered for this node.
        for a in self.annotations:
            role = folia.SemanticRole(doc, *all_words, cls=a['unit'])
            self.add_features(role, a)
            all_roles.append(role)

        return all_words, all_roles
Пример #7
0
def count_ngram(N, fin, fout):
    """Count N-grams in a corpus and write those occurring more than once.

    Reads `fin` line by line, counts every N-gram (with '<s>' left padding),
    and writes "ngram<TAB>count" lines to `fout`, sorted by the tab-joined
    n-gram text, skipping hapaxes (count == 1).

    Parameters:
        N (int): the n-gram order.
        fin (str): path to the input corpus, one sentence per line.
        fout (str): path of the tab-separated output file.
    """
    from collections import Counter
    print('count', N, '-gram on', fin, 'out to', fout)
    import pynlpl.textprocessors as proc

    with open(fin, 'r') as corpus_in:
        # Counter replaces the manual "if key in dict" bookkeeping.
        ngrams = Counter()
        for line in corpus_in:
            words = proc.tokenize(line)
            for ngram in proc.Windower(words, N, '<s>', None):
                ngrams['\t'.join(ngram)] += 1

        with open(fout, 'w') as ngrams_out:
            # Keys are unique, so sorting by key alone reproduces the
            # original (term, -freq) ordering.
            for key, value in sorted(ngrams.items()):
                if value > 1:
                    ngrams_out.write(key + '\t' + str(value) + '\n')
Пример #8
0
def tokenize(text):
    """Tokenize *text* with the module-level nlp tokenizer and stem each token."""
    # nltk.word_tokenize was a previous tokenizer choice here.
    return stem_tokens(nlp.tokenize(text), stemmer)
Пример #9
0
def tokenize(text):
    """Tokenize *text* and return the stemmed tokens.

    Uses the module-level ``nlp`` tokenizer and stems each token with the
    module-level ``stemmer`` via ``stem_tokens``.
    """
    #tokens = nltk.word_tokenize(text)
    tokens = nlp.tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems
Пример #10
0
    def add_folia_correction(self, doc, sentence):
        """
        Adds a folia.Correction to an existing folia.Sentence. Output the created folia.Words.

        Tokenizes self.original and self.edited.  When both are a single
        token the correction is attached directly to one folia.Word;
        otherwise folia.Word elements are created per token inside the
        New/Original parts.  Each entry of self.annotations yields one
        folia.Correction; entries after the first are nested inside the
        Original part of the first correction.  Returns the list of
        created folia.Words (edited-side words only).
        """
        words = []

        # Tokenize both the original and the edited form.
        original_tokens = tokenize(self.original)
        edited_tokens = tokenize(self.edited)

        # If we're dealing with single words (e.g. spelling errors), create the correction directly on the word.
        if len(original_tokens) == 1 and len(edited_tokens) == 1:
            # NOTE(review): the Word is created without text; presumably the
            # Correction supplies its text content — confirm against folia docs.
            w = sentence.add(folia.Word)
            words.append(w)
            n = folia.New(doc, self.edited)
            o = folia.Original(doc, self.original)
            for i, a in enumerate(self.annotations):
                if i == 0:
                    # First annotation: the correction lives on the word itself.
                    correction = w.add(folia.Correction,
                                       n,
                                       o,
                                       cls=a['unit'],
                                       generate_id_in=sentence)
                else:
                    # Subsequent annotations: nest a fresh correction inside
                    # the Original part of the first correction.
                    n_new = folia.New(doc, self.edited)
                    o_new = folia.Original(doc, self.original)
                    correction = o.add(folia.Correction,
                                       n_new,
                                       o_new,
                                       cls=a['unit'],
                                       generate_id_in=sentence)
                self.add_features(correction, a)
        # We are dealing with more than one word, or an insertion/deletion. Create word elements for each token.
        else:
            n = folia.New(doc)
            o = folia.Original(doc)
            # Only the edited (New) words are reported back to the caller.
            for w in edited_tokens:
                word = n.add(folia.Word, w, generate_id_in=sentence)
                words.append(word)
            for w in original_tokens:
                o.add(folia.Word, w, generate_id_in=sentence)
            for i, a in enumerate(self.annotations):
                if i == 0:
                    # First annotation: the correction is attached to the sentence.
                    correction = sentence.add(folia.Correction,
                                              n,
                                              o,
                                              cls=a['unit'],
                                              generate_id_in=sentence)
                else:
                    # Subsequent annotations: rebuild the word lists and nest
                    # the correction inside the Original part of the first one.
                    n_new = folia.New(doc)
                    o_new = folia.Original(doc)
                    for w in edited_tokens:
                        n_new.add(folia.Word, w, generate_id_in=sentence)
                    for w in original_tokens:
                        o_new.add(folia.Word, w, generate_id_in=sentence)
                    correction = o.add(folia.Correction,
                                       n_new,
                                       o_new,
                                       cls=a['unit'],
                                       generate_id_in=sentence)
                self.add_features(correction, a)

        return words