def remove_contradicting_pairs(all_pairs, conjugated_words, disable_tqdm=False, verbose=True):
    """Keep, for each conjugated word, at most one pair per lemma of the partner word.

    Pairs are grouped by their first word; within each group, only the first
    pair seen for a given lemma of the second word is kept.  Otherwise
    syntactically different forms of the same stem would appear with the same
    other word and hence be mapped closely together in attract-repel.

    :param all_pairs: iterable of (word1, word2) tuples
    :param conjugated_words: words to group pairs under (keys of the grouping)
    :param disable_tqdm: suppress the progress bar when True
    :param verbose: print each removed pair when True
    :return: OrderedSet of the surviving pairs
    """
    grouped = {word: OrderedSet() for word in conjugated_words}

    # Bucket every pair under its first word.
    for p in all_pairs:
        grouped[p[0]].add(p)

    # Within each bucket, keep only the first form seen per partner lemma.
    for word, word_pairs in tqdm(grouped.items(), disable=disable_tqdm):
        kept = OrderedSet()
        seen_lemmas = OrderedSet()
        for p in word_pairs:
            partner_lemma = lemma(p[1])
            if partner_lemma in seen_lemmas:
                # A form of this stem is already paired with `word`; a second
                # form would be pulled toward the same neighborhood.
                if verbose:
                    print("Removed:", p, "From:", word_pairs)
            else:
                seen_lemmas.add(partner_lemma)
                kept.add(p)
        grouped[word] = kept

    # Flatten the cleaned buckets back into a single ordered set.
    cleaned = OrderedSet()
    for word_pairs in grouped.values():
        cleaned.update(word_pairs)
    return cleaned
def evaluate(self, node):
    """Return the average number of recognized verbs per sentence in `node`.

    Tags the node's words with the TIGER-corpus tagger, counts tokens whose
    POS tag starts with "V" and whose lemma is in ``self.VERBS``, and divides
    by the sentence count.

    :param node: object exposing ``words()`` and ``sents(tokenizer=...)``
    :return: verbs-per-sentence ratio, or 0.0 when there is nothing to score
    """
    words = node.words()
    A = Analyzer.instance()
    corp = A.get(corpus="TIGER")
    sents_count = len(node.sents(tokenizer=corp.sent_tokenizer()))
    tagger = corp.tagger(True)
    tagged_words = tagger.tag(words)
    count = 0
    # FIX: also require sents_count > 0 — the original divided by
    # sents_count unguarded and raised ZeroDivisionError for a node with
    # tagged words but no detected sentences.
    if len(tagged_words) > 0 and sents_count > 0:
        for w in tagged_words:
            # TIGER/STTS verb tags all start with "V".
            if w[1] and w[1].startswith("V"):
                lemm = lemma(w[0])
                if lemm in self.VERBS:
                    count += 1
        return float(count) / sents_count
    return 0.0
def evaluate(self, node):
    """Return the type/token ratio of `node`, ignoring NO_WORDS and merging verb forms.

    Verbs (POS tag starting with "V") are reduced to their lemma before being
    added to the unique-token set; all other tokens are counted verbatim.  The
    result is the number of unique tokens divided by the number of tokens that
    are not in ``NO_WORDS``.

    :param node: object exposing ``words()``
    :return: diversity ratio, or 0.0 when there is nothing to score
    """
    words = node.words()
    content_words = [w for w in words if w not in NO_WORDS]
    analyzer = Analyzer.instance()
    corpus = analyzer.get(corpus="TIGER")
    tagged = corpus.tagger(True).tag(words)

    # Nothing tagged or nothing left after filtering: score is defined as 0.
    if not tagged or not content_words:
        return 0.0

    unique_tokens = set()
    for tagged_word in tagged:
        token = tagged_word[0]
        if token in NO_WORDS:
            continue
        pos = tagged_word[1]
        if pos and pos.startswith("V"):
            # Collapse inflected verb forms onto one lemma.
            unique_tokens.add(lemma(token))
        else:
            unique_tokens.add(token)
    return float(len(unique_tokens)) / len(content_words)
def _getLemma(word, language):
    """Lemmatize `word` using the pattern module matching `language`.

    Supported language codes: "en", "es", "fr", "de", "it".  Any other code
    falls back to English, exactly as the original if/elif chain did.

    :param word: the word to lemmatize
    :param language: two-letter language code
    :return: the lemma string produced by the selected pattern module
    """
    import pattern.en as pattern_en  # @UnresolvedImport
    import pattern.es as pattern_es  # @UnresolvedImport
    import pattern.fr as pattern_fr  # @UnresolvedImport
    import pattern.de as pattern_de  # @UnresolvedImport
    import pattern.it as pattern_it  # @UnresolvedImport

    # Dispatch table replaces the if/elif chain; .get gives the English fallback.
    lemmatizers = {
        "en": pattern_en,
        "es": pattern_es,
        "fr": pattern_fr,
        "de": pattern_de,
        "it": pattern_it,
    }
    return lemmatizers.get(language, pattern_en).lemma(word)
# Author: Devon Fritz # Date: 30.5.15 # Stems the lexicon of german words import sys from pattern.de import lemma reload(sys) sys.setdefaultencoding('utf-8') input_file = sys.argv[1] with open(input_file) as f: for line in f: print lemma(line.split()[0].strip()) + (' ' + line.split()[1] if len(line.split()) > 1 else '')
# Author: Devon Fritz # Date: 30.5.15 # Stems the lexicon of german words import sys from pattern.de import lemma, lexeme reload(sys) sys.setdefaultencoding('utf-8') input_file = sys.argv[1] with open(input_file) as f: for line in f: print lemma(line.strip()) for l in lexeme(line.strip()): print l
import sys import re import codecs # not used, but possibly interesting http://www.nltk.org/ # http://www.clips.ua.ac.be/pages/pattern-de from pattern.de import lemma, tag, predicative, singularize # possible parts of speech: # PRP$, FW, VBN, WDT, JJ, WP, DT, RP, NN, TO, PRP, # RB, NNS, NNP, VB, WRB, CC, LS, CD, IN, MD, UH part_of_speech_command = { 'PRP$': lambda word: predicative(word), # pronomina 'VBN': lambda word: lemma(word), # verba 'DT': lambda word: predicative(word), # pronomina 'VB': lambda word: lemma(word), # verba 'NN': lambda word: singularize(word), # nomina 'JJ': lambda word: predicative(word) # preposice } pattern_word = re.compile('[a-zA-Z]') pattern_punctuation = re.compile(ur'[—\-|«»…–<>]') def transform(tagword): word = tagword[0] part = tagword[1] # if part == 'VBN': # print tagword