Exemplo n.º 1
0
def remove_contradicting_pairs(all_pairs, conjugated_words, disable_tqdm=False, verbose=True):
    """Keep at most one pair per (first word, lemma of second word).

    Pairs are bucketed by their first element. Within each bucket only the
    first pair seen for a given ``lemma(pair[1])`` survives; later pairs whose
    second word shares that lemma are discarded.

    Args:
        all_pairs: iterable of indexable pairs; ``pair[0]`` must be one of
            ``conjugated_words`` (otherwise the bucket lookup raises KeyError).
        conjugated_words: iterable of words used as bucket keys.
        disable_tqdm: forwarded to ``tqdm(..., disable=...)``.
        verbose: when true, print every discarded pair with its bucket.

    Returns:
        An ``OrderedSet`` with the surviving pairs, bucket by bucket.
    """
    # One bucket per conjugated word, filled with the pairs that start with it.
    buckets = {}
    for head in conjugated_words:
        buckets[head] = OrderedSet()
    for candidate in all_pairs:
        buckets[candidate[0]].add(candidate)

    survivors = OrderedSet()
    for _head, bucket in tqdm(buckets.items(), disable=disable_tqdm):
        lemmas_seen = OrderedSet()
        for candidate in bucket:
            partner_lemma = lemma(candidate[1])
            if partner_lemma in lemmas_seen:
                # A second form of an already-paired stem would put syntactically
                # different forms of that stem next to the same word, pulling them
                # close together in attract-repel, so it is dropped.
                if verbose:
                    print("Removed:", candidate, "From:", bucket)
                continue
            lemmas_seen.add(partner_lemma)
            survivors.add(candidate)
    return survivors
Exemplo n.º 2
0
 def evaluate(self, node):
     """Return how many words of ``node`` are verbs from ``self.VERBS``, per sentence.

     The words of ``node`` are tagged with the tagger of the "TIGER" corpus
     obtained from ``Analyzer``. A word counts when its tag starts with "V"
     and its lemma is contained in ``self.VERBS``. The count is divided by
     the number of sentences of ``node``. Returns 0.0 when tagging yields
     nothing.
     """
     words = node.words()
     corpus = Analyzer.instance().get(corpus="TIGER")
     sentence_total = len(node.sents(tokenizer=corpus.sent_tokenizer()))
     tagged = corpus.tagger(True).tag(words)
     if len(tagged) == 0:
         return 0.0
     # entry[0] is the token, entry[1] its tag (which may be empty/None).
     hits = sum(
         1
         for entry in tagged
         if entry[1] and entry[1].startswith("V") and lemma(entry[0]) in self.VERBS
     )
     return float(hits) / sentence_total
Exemplo n.º 3
0
 def evaluate(self, node):
     """Return the ratio of distinct words to all words of ``node``, ignoring NO_WORDS.

     Words are tagged with the tagger of the "TIGER" corpus obtained from
     ``Analyzer``. Tokens listed in ``NO_WORDS`` are skipped. Tokens whose tag
     starts with "V" are counted by their lemma, all other tokens by their
     surface form. The number of distinct entries is divided by the number of
     words of ``node`` that are not in ``NO_WORDS``. Returns 0.0 when either
     the tagged list or the filtered word list is empty.
     """
     words = node.words()
     counted_words = [item for item in words if item not in NO_WORDS]
     corpus = Analyzer.instance().get(corpus="TIGER")
     tagged = corpus.tagger(True).tag(words)
     if len(tagged) == 0 or len(counted_words) == 0:
         return 0.0

     distinct = set()
     for entry in tagged:
         token = entry[0]
         if token in NO_WORDS:
             continue
         tag = entry[1]
         if tag and tag.startswith("V"):
             # Verb forms are collapsed onto their lemma.
             distinct.add(lemma(token))
         else:
             distinct.add(token)
     return float(len(distinct)) / len(counted_words)
Exemplo n.º 4
0
def _getLemma(word, language):
    """Return the lemma of ``word`` using the ``pattern`` module for ``language``.

    Args:
        word: the word to lemmatize; passed unchanged to ``lemma``.
        language: language code. "es", "en", "it", "fr" and "de" select the
            matching ``pattern.<code>`` module; any other value falls back to
            ``pattern.en``.

    Returns:
        Whatever ``pattern.<code>.lemma(word)`` returns.

    Only the module that is actually needed is imported, so a call for one
    language neither pays for nor depends on the other four language modules.
    """
    import importlib

    module_code = "en"  # fallback for unrecognised language codes
    for known_code in ("es", "en", "it", "fr", "de"):
        if language == known_code:
            # Use our own literal (not the caller's object) to build the module name.
            module_code = known_code
            break
    return importlib.import_module("pattern." + module_code).lemma(word)
Exemplo n.º 5
0
# Author: Devon Fritz
# Date: 30.5.15
# Lemmatizes the first word on each line of a lexicon of German words

import sys
from pattern.de import lemma

reload(sys)
sys.setdefaultencoding('utf-8')
input_file = sys.argv[1]

with open(input_file) as f:
    for line in f:
        print lemma(line.split()[0].strip()) + (' ' + line.split()[1] if
                                                len(line.split()) > 1 else '')
# Author: Devon Fritz
# Date: 30.5.15
# Lemmatizes the first word on each line of a lexicon of German words

import sys
from pattern.de import lemma

reload(sys)
sys.setdefaultencoding('utf-8')
input_file = sys.argv[1]

with open(input_file) as f:
    for line in f:
        print lemma(line.split()[0].strip()) + (' ' + line.split()[1] if len(line.split()) > 1 else '')

# Author: Devon Fritz
# Date: 30.5.15
# Prints the lemma and the lexeme entries for each line of a lexicon of German words

import sys
from pattern.de import lemma, lexeme

reload(sys)
sys.setdefaultencoding('utf-8')
input_file = sys.argv[1]

with open(input_file) as f:
    for line in f:
        print lemma(line.strip())
        for l in lexeme(line.strip()):
            print l

Exemplo n.º 8
0
import sys
import re
import codecs

# not used, but possibly interesting http://www.nltk.org/

# http://www.clips.ua.ac.be/pages/pattern-de
from pattern.de import lemma, tag, predicative, singularize

# Possible part-of-speech tags:
# PRP$, FW, VBN, WDT, JJ, WP, DT, RP, NN, TO, PRP,
# RB, NNS, NNP, VB, WRB, CC, LS, CD, IN, MD, UH
#
# Maps a part-of-speech tag to a one-argument callable that normalizes a word
# carrying that tag with the matching pattern.de function. Only the tags
# listed below have an entry.
part_of_speech_command = {
    'PRP$': lambda word: predicative(word),  # pronouns
    'VBN': lambda word: lemma(word),  # verbs
    'DT': lambda word: predicative(word),  # pronouns
    'VB': lambda word: lemma(word),  # verbs
    'NN': lambda word: singularize(word),  # nouns
    'JJ': lambda word: predicative(word)  # original note said "prepositions" -- NOTE(review): JJ is usually the adjective tag; confirm
}

# Matches a single ASCII letter (upper or lower case).
pattern_word = re.compile('[a-zA-Z]')
# Matches a single character out of: em dash, hyphen, pipe, guillemets,
# ellipsis, en dash, and angle brackets. The ur'' prefix is Python 2 syntax.
pattern_punctuation = re.compile(ur'[—\-|«»…–<>]')


def transform(tagword):
    word = tagword[0]
    part = tagword[1]
    # if part == 'VBN':
    #     print tagword