def count_ngram2(N, fin, fout):
    """Write the most frequent next word for every (N-1)-word context.

    Each line of ``fin`` is tokenized with ``pynlpl.textprocessors.tokenize``
    and windowed into N-grams with ``Windower(words, N, '<s>', None)``.
    The first N-1 tokens of an N-gram (joined by tabs) form the context,
    the last token is the candidate.  For each context only the candidate
    with the highest count is kept.

    Output (``fout``): one line per context, sorted by context, formatted as
    ``<context>\\t<candidate>\\n``.

    Ties between equally frequent candidates are resolved in favour of the
    lexicographically smallest candidate, so the output is reproducible
    regardless of dict ordering.

    :param N: n-gram size handed to ``Windower``.
    :param fin: path of the corpus file to read.
    :param fout: path of the file to write.
    """
    import collections

    import pynlpl.textprocessors as proc

    # Single-argument print works as a statement (Python 2) and as a
    # function call (Python 3), producing the same space-separated message.
    print('count %s -gram2 on %s out to %s only top candidate reserved'
          % (N, fin, fout))

    # context -> Counter of the tokens observed right after that context
    followers = collections.defaultdict(collections.Counter)
    count_total = 0
    with open(fin, 'r') as corpus_in:
        for line in corpus_in:
            words = proc.tokenize(line)
            for ngram in proc.Windower(words, N, '<s>', None):
                term = '\t'.join(ngram[:-1])
                followers[term][ngram[-1]] += 1
                count_total += 1
                # Progress indicator for large corpora.
                if count_total % 1000000 == 0:
                    print(count_total)

    with open(fout, 'w') as ngrams_out:
        for term in sorted(followers):
            counts = followers[term]
            # Highest count wins; the candidate text breaks ties.
            best = min(counts, key=lambda cand: (-counts[cand], cand))
            ngrams_out.write(term + '\t' + best + '\n')
def add_folia_correction(self, doc, sentence):
    """
    Attach one folia.Correction per annotation to ``sentence`` and return the
    folia.Word elements created for the edited text.

    The first annotation's correction is added to the sentence (or, for a
    one-word-to-one-word edit, to a single new Word inside the sentence).
    Every further annotation gets its own New/Original pair and is added to
    the Original element of the first pair.

    :param doc: the FoLiA document passed to the element constructors.
    :param sentence: the folia.Sentence to extend; also used for ID generation.
    :return: list of the created folia.Word elements.
    """
    source_tokens = tokenize(self.original)
    target_tokens = tokenize(self.edited)
    created = []

    if len(source_tokens) == 1 and len(target_tokens) == 1:
        # One word on each side (e.g. a spelling error): the correction is
        # placed directly on a single Word, and New/Original are constructed
        # with the edited/original text.
        anchor = sentence.add(folia.Word)
        created.append(anchor)

        def build_pair():
            new_part = folia.New(doc, self.edited)
            old_part = folia.Original(doc, self.original)
            return new_part, old_part, []
    else:
        # Several words, or an insertion/deletion: New and Original each get
        # one Word element per token.
        anchor = sentence

        def build_pair():
            new_part = folia.New(doc)
            old_part = folia.Original(doc)
            new_words = [
                new_part.add(folia.Word, token, generate_id_in=sentence)
                for token in target_tokens
            ]
            for token in source_tokens:
                old_part.add(folia.Word, token, generate_id_in=sentence)
            return new_part, old_part, new_words

    # The first pair is built up front, even when there are no annotations;
    # only its Word elements are reported back to the caller.
    first_new, first_old, first_words = build_pair()
    created.extend(first_words)

    for position, annotation in enumerate(self.annotations):
        if position == 0:
            parent, new_part, old_part = anchor, first_new, first_old
        else:
            # Additional annotations hang off the first Original element.
            parent = first_old
            new_part, old_part, _ = build_pair()
        correction = parent.add(
            folia.Correction,
            new_part,
            old_part,
            cls=annotation['unit'],
            generate_id_in=sentence,
        )
        self.add_features(correction, annotation)

    return created
def to_folia_sentence_child(self, doc, sentence):
    """
    Add this childless node to ``sentence``.

    A correction node is delegated to ``add_folia_correction`` and yields no
    role.  Any other node has its original text tokenized into folia.Word
    elements, and a folia.SemanticRole spanning those words is created for
    each annotation.

    :param doc: the FoLiA document passed to the element constructors.
    :param sentence: the folia.Sentence the words are added to.
    :return: tuple ``(words, role)``; ``role`` is None when no role was
        created, and the last created role when there are several annotations.
    """
    if self.is_correction:
        return list(self.add_folia_correction(doc, sentence)), None

    leaf_words = [
        sentence.add(folia.Word, token) for token in tokenize(self.original)
    ]

    last_role = None
    for annotation in self.annotations:
        last_role = folia.SemanticRole(doc, *leaf_words, cls=annotation['unit'])
        self.add_features(last_role, annotation)

    return leaf_words, last_role
def to_folia_sentence(self, doc, sentence):
    """
    Add this node and, recursively, its child nodes to ``sentence``.

    Text between child nodes is tokenized into plain folia.Word elements.
    Children that have children of their own are handled by recursion; the
    others by ``to_folia_sentence_child``.  Finally one folia.SemanticRole
    spanning all collected words is created per annotation of this node.

    :param doc: the FoLiA document passed to the element constructors.
    :param sentence: the folia.Sentence the words are added to.
    :return: tuple ``(all_words, all_roles)`` of the collected elements.
    """
    collected_words = []
    collected_roles = []

    def add_plain_text(fragment):
        # Unannotated text: every token becomes a Word of the sentence.
        for token in tokenize(fragment):
            collected_words.append(sentence.add(folia.Word, token))

    cursor = 0
    for child_start, child_end, child in self.get_child_nodes():
        # Text preceding this child.
        add_plain_text(self.original[cursor:child_start])

        if child.children:
            child_words, child_roles = child.to_folia_sentence(doc, sentence)
            collected_roles.extend(child_roles)
        else:
            child_words, child_role = child.to_folia_sentence_child(doc, sentence)
            if child_role:
                collected_roles.append(child_role)
        collected_words.extend(child_words)

        cursor = child_end

    # Text following the last child, up to the end of this node.
    add_plain_text(self.original[cursor:self.end])

    # Roles for this node's own annotations cover every word gathered above.
    for annotation in self.annotations:
        own_role = folia.SemanticRole(
            doc, *collected_words, cls=annotation['unit']
        )
        self.add_features(own_role, annotation)
        collected_roles.append(own_role)

    return collected_words, collected_roles
def count_ngram(N, fin, fout):
    """Count the N-grams of a corpus and write those seen more than once.

    Each line of ``fin`` is tokenized with ``pynlpl.textprocessors.tokenize``
    and windowed into N-grams with ``Windower(words, N, '<s>', None)``.
    The tokens of an N-gram are joined by tabs to form its key.

    Output (``fout``): one line per N-gram with a count greater than 1,
    sorted by key, formatted as ``<key>\\t<count>\\n``.

    :param N: n-gram size handed to ``Windower``.
    :param fin: path of the corpus file to read.
    :param fout: path of the file to write.
    """
    import collections

    import pynlpl.textprocessors as proc

    # Single-argument print works as a statement (Python 2) and as a
    # function call (Python 3), producing the same space-separated message.
    print('count %s -gram on %s out to %s' % (N, fin, fout))

    ngrams = collections.Counter()
    with open(fin, 'r') as corpus_in:
        for line in corpus_in:
            words = proc.tokenize(line)
            ngrams.update(
                '\t'.join(ngram)
                for ngram in proc.Windower(words, N, '<s>', None)
            )

    with open(fout, 'w') as ngrams_out:
        # Keys are unique, so ordering by key alone matches the original
        # (term, -freq) ordering.
        for key in sorted(ngrams):
            value = ngrams[key]
            # N-grams that occur only once are not written.
            if value > 1:
                ngrams_out.write(key + '\t' + str(value) + '\n')
def tokenize(text):
    """Return the stems of ``text``'s tokens.

    The text is split with ``nlp.tokenize`` and the resulting tokens are
    passed to ``stem_tokens`` together with the module-level ``stemmer``.
    """
    # An nltk.word_tokenize call was left commented out in the original.
    return stem_tokens(nlp.tokenize(text), stemmer)