Example No. 1
    def __init__(self, BASEDIR, session_only=False, cycle_time=1):
        super().__init__(BASEDIR, session_only, cycle_time)
        self.name = 'contentrank'

        mapper = Mapping()
        self.rec_mapping = mapper.get_header_rec()
        self.event_mapping = mapper.get_header_event()
        self.update_mapping = mapper.get_header_update()
        self.item_id_idx = self.rec_mapping.index('ITEM_SOURCE')
        self.publisher_id_idx = self.rec_mapping.index('PUBLISHER')
        self.recs_idx = self.event_mapping.index('recs')
        self.limit_idx = self.rec_mapping.index('limit')
        self.title_idx = self.update_mapping.index('title')
        self.text_idx = self.update_mapping.index('text')
        self.update_id_idx = self.update_mapping.index('id')
        self.update_domainid_idx = self.update_mapping.index('domainid')

        self.germanStemmer = GermanStemmer(ignore_stopwords=True)
        self.stopwords = stopwords.words('german')
        self.stems = {}  # (item, [stem, stem, stem])

        self.correct = 0
        self.total_events = 0
        self.nrrows = 0

        self.counts = {}
Example No. 2
def cosine_preprocess(texts, pickle_name, pickle_folder='pickle'):
    pickle_path = os.path.join(pickle_folder, pickle_name)

    # Return from disk if possible for efficiency reasons
    if os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as f:
            return pickle.load(f)

    processed = []
    # Build the stemmer and stopword list once rather than on every iteration
    stemmer = GermanStemmer()
    words = stopwords.words('german')

    for text in tqdm(texts):
        tokens = [
            stemmer.stem(token) for token in word_tokenize(text)
            if token not in words
        ]

        processed.append(' '.join(tokens))

    # Pickle the output
    if not os.path.exists(pickle_folder):
        os.makedirs(pickle_folder)

    with open(pickle_path, 'wb') as f:
        pickle.dump(processed, f)

    return processed
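
A minimal usage sketch (invented texts and pickle name; it assumes the imports the snippet relies on, i.e. os, pickle, tqdm, nltk's word_tokenize and stopwords, and GermanStemmer, are in scope and that the NLTK 'punkt' and 'stopwords' data are downloaded):

texts = [
    "Die Katze sitzt auf dem Dach.",
    "Ein Hund läuft durch den Garten.",
]
# First call tokenizes, drops German stopwords, stems, and caches the result on disk.
processed = cosine_preprocess(texts, pickle_name="demo_texts.pkl")
print(processed)  # one space-joined string of stems per input text
# A second call with the same pickle_name returns the cached list from disk.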
Example No. 3
def evaluate_dnn(path:str):
    with open(os.path.join(path, "tag_to_int.json"), "rt") as f:
        tag_to_int = json.load(f)
    with open(os.path.join(path, "int_to_tag.json"), "rt") as f:
        int_to_tag = json.load(f)  

    cv = pickle.load(open(os.path.join(path, "cv.p"), "rb"))
    stemmer = GermanStemmer()
    model_name = "dnn_intent_classification.h5"
    model = load_model(os.path.join(path, model_name))

    with open(os.path.join("Data", "commands", "Test", "testingdata.json"), "rt") as f:
        val_data = json.load(f)

    X = []
    y = []

    for tag, commands in val_data.items():
        for command in commands:
            command = " ".join(stemmer.stem(c) for c in sorted(word_tokenize(command)))
            X.append(transform_command_BoW(command, cv))
            y.append(tag_to_int[tag])

    X = np.array(X)
    y = np.array(y)

    predictions = model.predict(X)
    predicted_indices = np.argmax(predictions, 1)

    print("acc: ", accuracy_score(y, predicted_indices))
    cm = confusion_matrix(y, predicted_indices)
    cm = pd.DataFrame(cm, index=int_to_tag.values(), columns=int_to_tag.values())
    print(cm)

    return (accuracy_score(y, predicted_indices), cm)
Example No. 4
def build_stems(pattern: str, category: Category,
                elements: List[Tuple[Category, Set[str]]],
                total_stems: Set[str]) -> Set[str]:
    """
    Builds a set of stems for all words used in the pattern.

    Args:
        pattern: The pattern to tokenize and stem.
        category: The category of the pattern.
        elements:
            A mutable list of (category, stem) pairs that the new stems will
            be appended to.
        total_stems:
            The set of total stems before this function was invoked.
            Will not be mutated.

    Returns:
        The union of total_stems and stems found in the pattern.
    """

    # Tokenize pattern into words
    words = nltk.word_tokenize(pattern)
    # Get stems for the pattern's words, as a set to avoid duplicates
    stemmer = GermanStemmer()
    stems: Set[str] = {stemmer.stem(w.lower()) for w in words}
    # Add the (category, stems) pair to the pattern element list.
    elements.append((category, stems))
    # Add stems to total set of stems, needed for conversion to numeric
    # TensorFlow training array
    # Return the union without mutating the caller's set, as the docstring promises
    return total_stems | stems
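
A small invented call to build_stems, using a plain string in place of the project's Category type (the function only stores the value, so any stand-in works for illustration); it assumes nltk and its 'punkt' tokenizer data are available:

elements = []
total = build_stems("Wie ist das Wetter heute?", "weather", elements, set())
print(elements)  # [('weather', {...stems of the tokenized pattern...})]
print(total)     # the same stems, merged into the (initially empty) total set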
Example No. 5
    def __init__(self, config):
        self.config = config
        if config.stem:
            if config.lang == 'en':
                self.stemmer = PorterStemmer()
            elif config.lang == 'de':
                self.stemmer = GermanStemmer()
            else:
                self.stemmer = IdStemmer()
Example No. 6
def _check_NE_yeah(gram):
    tag = entities.get(" ".join(gram), "O")

    if tag == "O":
        if len(gram) == 2:
            first, last = gram
            if first in vornamen and last in nachnamen:
                tag = "PER"

    if tag == "O":
        try:
            tag = entities.get(
                " ".join([GermanStemmer().stem(g) for g in gram]), "O")
        except:
            tag = entities.get(
                " ".join([
                    GermanStemmer().stem(g.decode(encoding="UTF-8"))
                    for g in gram
                ]), "O")

    return tag
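
The function relies on module-level lookups (entities, vornamen, nachnamen) that are not shown in the snippet. A hypothetical setup, just to illustrate the call and the two lookup paths:

# Hypothetical stand-ins for the module-level data the snippet assumes.
entities = {"Angela Merkel": "PER", "Berlin": "LOC"}
vornamen = {"Angela"}
nachnamen = {"Schmidt"}

print(_check_NE_yeah(["Berlin"]))             # "LOC" via the direct gazetteer lookup
print(_check_NE_yeah(["Angela", "Schmidt"]))  # "PER" via the first-name/last-name lists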
Example No. 7
def ner_features(sentence, i, history):
    # TODO: try using TreeTagger's POS tag
    wordO = sentence[i]
    word = wordO.string
    pos = wordO.pos
    stemmed = GermanStemmer().stem(word)

    if i == 0:
        prevword, prevpos = "<START>", "<START>"
        last = "<START>"
        prevstemmed = "<START>"
    else:
        last = history[-1]
        prevword = sentence[i - 1].string
        prevpos = sentence[i - 1].pos
        prevstemmed = GermanStemmer().stem(sentence[i - 1].string)

    chunk = []
    if not wordO.chunk:
        chunk.append("START")
        knowledge_sources = "O"
    else:
        knowledge_sources = check_NE(convert(wordO.string), wordO.chunk)
        chunk = [w.string for w in wordO.chunk]

    stem_is_word = stemmed == word.lower()

    knowledge_sources_stemmed = _check_NE_yeah([stemmed])

    return {
        "knowledge": knowledge_sources,
        "knowledge_lemma": knowledge_sources_stemmed,
        "history": "+".join(history)[-2:],
        "pos": pos,
        "word": word,
        "stemmed": stemmed
    }
Example No. 8
def remove_stop_words(msg):
    # remove stop words and stem words
    stemmer = GermanStemmer()

    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(msg)

    stop_words = set(stopwords.words('german'))

    words_filtered = []

    for w in words:
        if w not in stop_words:
            words_filtered.append(stemmer.stem(w))

    return words_filtered
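
A quick invented call, assuming RegexpTokenizer, stopwords and GermanStemmer are imported as in the snippet and the NLTK 'stopwords' corpus is installed:

print(remove_stop_words("die Kinder spielen im Garten"))
# prints a list of stems with German stopwords such as "die" and "im" removed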
Example No. 9
    def __init__(self):
        self.tweets = 0
        self.related_tweets = 0
        self.stopwords = {}
        self.stemmers = {}
        self.stemmers["es"] = SpanishStemmer()
        self.stemmers["en"] = PorterStemmer()
        self.stemmers["fr"] = FrenchStemmer()
        self.stemmers["de"] = GermanStemmer()
        self.stopwords["es"] = self.load_stopwords_file(
            "spanish_stopwords.txt")
        self.stopwords["en"] = self.load_stopwords_file(
            "english_stopwords.txt")
        self.stopwords["fr"] = self.load_stopwords_file("french_stopwords.txt")
        # Note: the stemmers dict uses the key "de" for German,
        # while the stopwords dict uses "ge" for the same language.
        self.stopwords["ge"] = self.load_stopwords_file("german_stopwords.txt")
        self.output_file = open(sys.argv[2], 'a')
Example No. 10
    def __init__(self, essay: str, name: str, gazetteer_version: int = 1):
        """
        Initalizes the Stringmatcher. Takes a path to an essay and the gazetteer version, that should be used. See the above dict 
        :param file_path: path to the essay that is to be processed
        :param gazetteer_version: the gazetteer version that should be used. See the above defined dict "version_subfolder" for what values are possible
        """
        # Initialize data structures
        self.essay = essay
        self.essay_name = name
        self.gazetteer_version = gazetteer_version
        self.tokens_without_stopwords = []
        self.found_entities = dict()
        self.stemmer = GermanStemmer()
        self.fastText_model = None
        self.spacy_model = None
        self.file_path = RESULTS_PATH + name

        if not os.path.exists(self.file_path):
            os.makedirs(self.file_path)

        # retrieve the gazetteers that should be used for annotation
        self.gazetteers = sorted([
            f for f in os.listdir(PATH_GAZETTEERS +
                                  version_subfolder[gazetteer_version])
            if os.path.isfile(PATH_GAZETTEERS +
                              version_subfolder[gazetteer_version] + f)
        ])
        print("Used gazetteers: %s" % (gazetteer_version))

        # retrieve gazetteers with already preprocessed entries if available (for efficiency reasons) or create new one
        if os.path.isfile(PATH_GAZETTEERS + "tokenized_gazetteers"):
            self.tokenized_gazetteers = pickle.load(
                open(PATH_GAZETTEERS + "tokenized_gazetteers", "rb"))
        else:
            self.tokenized_gazetteers = dict()
        changed = False
        for gazetteer_filename in self.gazetteers:
            # if there is not already a tokenized version of this gazetteer, tokenize it
            if not gazetteer_filename in self.tokenized_gazetteers.keys():
                self.tokenized_gazetteers[
                    gazetteer_filename] = self.tokenize_gazetteer(
                        gazetteer_filename)
                changed = True
        if changed:
            pickle.dump(self.tokenized_gazetteers,
                        open(PATH_GAZETTEERS + "tokenized_gazetteers", "wb"))
Example No. 11
    def __init__(self,
                 lang,
                 strip_accents=None,
                 ngram_range=(1, 1),
                 max_df=1.0,
                 min_df=1,
                 stop_words=None):

        if lang == 'de':
            self.stemmer = GermanStemmer()
        else:
            self.stemmer = EnglishStemmer()

        super(self.__class__, self).__init__(stop_words=stop_words,
                                             strip_accents=strip_accents,
                                             ngram_range=ngram_range,
                                             max_df=max_df,
                                             min_df=min_df)
Example No. 12
    def stemWord(self, word, lng):
        '''Separates the word's changeable part with a '|' for wordfast'''
        if lng == 'ru':
            stemmer = RussianStemmer()
        elif lng == 'en':
            stemmer = PorterStemmer()
        elif lng == 'de':
            stemmer = GermanStemmer()
        else:
            print('Language error. Exiting...')
            sys.exit(1)

        word = word.lower()  #otherwise the stemmer fails
        if len(word) <= 3:
            return word
        elif len(word) == len(stemmer.stem(word)):
            return "{0}|{1}".format(word[:-1], word[-1])
        else:
            stem_length = len(stemmer.stem(word))
            return "{0}|{1}".format(word[:stem_length], word[stem_length:])
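
Since stemWord is a method, the sketch below assumes the class that defines it is available under the invented name Segmenter and can be instantiated without arguments:

seg = Segmenter()                    # "Segmenter" is a stand-in for the defining class
print(seg.stemWord("Häuser", "de"))  # unchanging stem and changeable ending separated by '|'
print(seg.stemWord("Haus", "de"))    # stem length equals word length, so the split falls before the last letter
print(seg.stemWord("am", "de"))      # words of three letters or fewer are returned unchanged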
Example No. 13
    def _preprocess(text, mode=None):
        '''helper function to preprocess text. returns List of Sentences'''
        sentences = split_single(text)
        if mode:
            nlp = spacy.load('de_core_news_sm')
            if mode == 'lemmatize':
                sentences = [
                    Sentence((' ').join([token.lemma_ for token in nlp(s)]))
                    for s in sentences
                ]
            elif mode == 'stem':
                stemmer = GermanStemmer()
                sentences = [
                    Sentence((' ').join(
                        [stemmer.stem(token.text) for token in nlp(s)]))
                    for s in sentences
                ]
        else:
            sentences = [Sentence(s, use_tokenizer=True) for s in sentences]

        return sentences
Example No. 14
def clean_text(text):
    """
    :param text:
    :return:
    """
    # stopwords = set(nltk.corpus.stopwords.words('german'))
    file_path = r'etc/models/german.txt'
    with open(file_path) as file:
        file_data = file.read()
    stopwords = file_data.split('\n')
    gs = GermanStemmer()
    text_cleaned = ""
    text_cleaned = re.sub('[^a-zA-Z]', ' ',
                          text)  # Keep only alphabet and space characters
    text_cleaned = text_cleaned.lower()  # All character to lowercase
    text_cleaned = text_cleaned.split(
    )  # Split to list of word (split by space specify character)
    text_cleaned = [
        gs.stem(word) for word in text_cleaned if not word in stopwords
    ]
    text_cleaned = ' '.join(text_cleaned)
    return text_cleaned
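
An invented call, assuming re and GermanStemmer are imported and the stopword file etc/models/german.txt exists relative to the working directory:

print(clean_text("Die Kinder spielen gerne im grünen Garten!"))
# prints the lowercased, stopword-filtered stems; note that the ASCII-only regex
# replaces umlauts with spaces, so words like "grünen" are split apart before stemming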
Example No. 15
def text_cleaner(text):
    use_GermanStemmer = False
    tokens = False

    # Remove username handles
    # -? do we need the user names
    text = remove_handles(text)

    # Remove punctuation marks
    text_blob = TextBlob(text)
    text = ' '.join(text_blob.words)

    # replace the umlauts
    # =============================================================================
    #         text = re.sub('ä', 'ae', text)
    #         text = re.sub('ö', 'oe', text)
    #         text = re.sub('ü', 'ue', text)
    #         text = re.sub('Ä', 'Ae', text)
    #         text = re.sub('Ö', 'Oe', text)
    #         text = re.sub('Ü', 'Ue', text)
    #         text = re.sub('ß', 'ss', text)
    # =============================================================================

    # remove the numbers
    text = re.sub(r'[0-9]+', '', text)

    # Remove emojis
    german_char = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÄÖÜ"
    text = ''.join(c for c in text if c in german_char)

    tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True)
    if tokens:
        return tokenizer.tokenize(text)
    elif use_GermanStemmer:
        stemmer = GermanStemmer()
        return [stemmer.stem(token) for token in tokenizer.tokenize(text)]
    else:
        return text
Example No. 16
def set_stemmer(stemmer_language):
    if (stemmer_language == "GER"):
        stemmers = GermanStemmer()
    else:
        stemmers = EnglishStemmer()
    return stemmers
Example No. 17
sorted_d = np.sort([int(x["Veröffentlichungsdatum"].split("-")[0]) for x in d])
year_indices = {}
for ind, ind_year in enumerate(
        sorted([np.where(sorted_d == x)[0][0] for x in set(sorted_d)])):
    year_indices.update({list(range(2010, 2019 + 1))[ind]: ind_year})
year_indices[2020] = None

# IMPORTANT!
# 70k x 588k is too large, so the data is sliced down to a single year
year = 2010
d = d[year_indices[year]:year_indices[year + 1]]  # d[:500]

nltk.download("stopwords")
stop_words_en = stopwords.words('english')

stemmer = GermanStemmer()  # Cistem()

with open("stop_full.pkl", "rb") as f:
    stop_words = pickle.load(f)
    stop_words = [x.strip() for x in stop_words] + stop_words_en


def preprocess(text):
    text = text.lower().split()
    # text = [w.split(".")[0].split(",")[0].split(":")[0].split(";")[0] for w in text]
    text = " ".join(text)
    remove_punctuation_regex = re.compile(
        r"[^A-ZÄÖÜäöüßa-z ]"
    )  # regex for all characters that are NOT A-Z, a-z and space " "
    text = re.sub(remove_punctuation_regex, "", text)
    text = text.split()
Example No. 18
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
from boilerpipe.extract import Extractor
from nltk.stem.snowball import GermanStemmer
from nltk import word_tokenize
import nltk.data
import os
import re
import logging

logger = logging.getLogger(__name__)
logging.getLogger('pdfminer').setLevel(logging.CRITICAL)

satztokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
stemmer = GermanStemmer()
stoppwörter = []
'''Loads the stopword list'''
with open('traindata/german', 'r') as f:
    for line in f:
        wort = line.split('\n')[0]
        stoppwörter.append(wort.lower())


def preprocess(text):
    '''Filtering rules to normalize the text.'''
    try:
        text = re.sub(
            r"/innen|\*innen|/-innen", "innen",
            text)  # Normalizes the different gender-inflection spellings
        text = re.sub(r"-\s*\n", "", text)  # Removes hyphenation at line breaks
Example No. 19
    def __init__(self):
        self.stemmer = GermanStemmer()
Example No. 20
test_df.reset_index(inplace=True)
print test_df.isnull().sum()



print 'Unique restaurants: {}'.format(len(data['restaurant_id'].unique()))
print 'Unique menu_category: {}'.format(len(data['menu_category'].unique()))
print 'Unique product_name: {}'.format(len(data['product_name'].unique()))
print 'Unique ingredients: {}'.format(len(data['ingredients'].unique()))
print test_df.shape

encode_menu = test_df['menu_category'].str.encode('ascii', errors='ignore')
print len(encode_menu.unique())
encode_menu.replace({r'[^a-zA-Z0-9\s,]':''}, regex=True, inplace=True)
print len(encode_menu.unique())
encode_menu = encode_menu.apply(lambda x:GermanStemmer().stem(x))
print len(encode_menu.unique())
encode_name = test_df['product_name'].str.encode('ascii', errors='ignore')
print len(encode_name.unique())
encode_name.replace({r'[^a-zA-Z0-9\s,]':''}, regex=True, inplace=True)
print len(encode_name.unique())
encode_name = encode_name.apply(lambda x:GermanStemmer().stem(x))
print len(encode_name.unique())


# X = pd.concat([encode_menu, encode_name, test_df['restaurant_id'].astype('str')], axis=1)

# le = preprocessing.LabelEncoder()
# X_2 = X.apply(le.fit_transform)
# print X_2.head()
# print X_2.shape
Example No. 21
class StringHandler:
    _STEMMER = GermanStemmer()
    _P_SIMILARITY_THRESHOLD: float = 0.9

    def __init__(self, string_series: pd.Series):
        self._ds = string_series.str.lower()
        self.ds_origin = string_series

    def optimize(self):
        self.remove_noise()
        self.split_text()
        self.build_sentence()
        self.stem_words()
        # self.correct_spelling()

    def reset(self):
        self.ds = self.ds_origin.copy()

    # string manipulation
    ##################################

    def stem_words(self):
        self.ds = self.ds.apply(StringHandler.stem_sentence)

    def split_text(self):
        self.ds = self.ds.str.split(' ')

    def remove_noise(self):
        self.ds = self.ds.str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)  # regex=True: pandas 2.0+ treats the pattern literally by default
        # remove leftover isolated substrings that are not words/digits

    def build_sentence(self):
        self.ds = self.ds.apply(lambda x: ' '.join(word.strip() for word in x if word))

    # nlp manipulation
    ##################################

    def correct_spelling(self):
        uniques = self.get_unique_series
        uniques.apply(lambda x: [i for i in uniques
                                 if i != x and SequenceMatcher(None, x, i).ratio() > self._P_SIMILARITY_THRESHOLD])

    @classmethod
    def stem_sentence(cls, sentence: str, split_char: str = ' '):
        return ' '.join(cls._STEMMER.stem(word) for word in sentence.split(split_char))

    # properties
    ##################################

    @property
    def get_unique_series(self):
        return pd.Series(self.ds.unique()).sort_values().reset_index(drop=True)

    @property
    def ds(self):
        return self._ds

    @ds.setter
    def ds(self, ds: pd.Series):
        if isinstance(ds, pd.Series) and not ds.empty:
            self._ds = ds
        else:
            raise TypeError('Wrong variable type or empty series')
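
A brief invented usage of StringHandler, assuming pandas as pd, difflib.SequenceMatcher and GermanStemmer are imported as the class requires:

series = pd.Series(["Die Häuser sind groß!", "Kleine Häuser, große Gärten"])
handler = StringHandler(series)
handler.optimize()                # strips noise, tokenizes, re-joins and stems the lowercased input
print(handler.ds)                 # the processed pandas Series
print(handler.get_unique_series)  # sorted unique processed values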
Example No. 22
            ]
            res.append("\n".join(lines))

        return res


# In[3]:


def subwords(word):
    return [word[:2], word[2:]]


# In[27]:

stem = GermanStemmer().stem

cnt_vect_splits = [
    ("short", lambda doc: [line for line in doc if len(line) <= 1], {}),
    ("long", lambda doc: [line for line in doc if len(line) > 1], {}),
    ("subwords", lambda doc: [
        list(map(stem, concat(subwords(word) for word in line)))
        for line in doc
    ], {
        "ngram_range": (1, 1)
    }),
]

doc_funcs = [
    ("num_char", lambda doc: len(re.findall("[A-Za-zäöüÄÖÜß]", doc))),
]
Example No. 23
 
import nltk
import sys
from string import punctuation
import re
from nltk.stem.snowball import GermanStemmer

reload(sys)
sys.setdefaultencoding('utf-8')

#pre-processing tools
sents_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#sents_tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle')
stemmerEn = nltk.PorterStemmer() # uses nltk Porter stemmer
wnl = nltk.WordNetLemmatizer()
stemmerDe = GermanStemmer() # uses nltk Snowballs stemmer for German

def split_into_sentences(text):
	import re
	caps = "([A-Z])"
	prefixes = "(Mr|St|Mrs|Ms|Dr|dr|etc|vs|doc|art|no|inc|mr)[.]"
	suffixes = "(Inc|Ltd|Jr|Sr|Co|gdp|hon)"
	starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
	acronyms = "([A-Za-z][.][A-Za-z][.](?:[A-Za-z][.])?)"
	websites = "[.](com|net|org|io|gov|de|fr|il|mk)"
	dates = "(\d\d?)\.(\s+(januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember|jahrestag))"
	#dates = "(\d\d?)\."
	www = "(www)\."
	times = "(\d\d?)\.(\s?\d\d?)"
	full_date ="(\d\d?)\.(\s?\d\d?)\.(\s?\d\d\d?\d?)"
	
Example No. 24

import re
import preprocess_files
from nltk.stem.snowball import GermanStemmer


gs = GermanStemmer()
punctuations = '''!()-[]{};:'"\,<>/?@#$%^&*_~'''



def match_synms(tokens):
    syn_dict = preprocess_files.read_synms_list()
    for t in tokens:
        for (idx, val) in enumerate(t):
            if val in syn_dict:
                t[idx] = syn_dict[val]

    return tokens

def _remove_punctuation(tokens):
    tokens_filt = []
    for gT in tokens:
        if gT not in punctuations: tokens_filt.append(gT)
    return tokens_filt


def _remove_stopwords(tokens):
    '''Remove stop words from an array of tokens'''

    stopWords = ['the', 'to', '-', 'pr', 'der', 'is', 'of', 'die', 'in', 'and', 'und', '–', '•', '✔', '●', 'a']
Example No. 25
def get_stem_relations(sentences, gn):
    """Gets verb-noun relations
    between two sentences.

    Returns
        Array of word-pairs between two sentences
    """

    # Init word pairs
    word_pairs = []

    # Init stemmer
    stemmer = GermanStemmer(ignore_stopwords=True)

    # Loop over every sentence
    for val, sentence in enumerate(sentences):
        # Is current sentence not the last
        # sentence? If so carry on
        if val != (len(sentences) - 1):
            # Get stems of all words in the next sentence (built as a list so
            # that the membership test and .index() below work on Python 3)
            stems_next_sentence = [stemmer.stem(token['lemma'])
                                   for token in sentences[val + 1]]

            # Nouns in next sentence
            nouns_next_sentence = [
                word['lemma'] for word in sentences[val + 1] if word['noun']
            ]

            # Nouns of current sentence
            words_current_sentence = [
                word for word in sentence if word['noun']
            ]

            # Loop over every word in current sentence
            for word in sentences[val]:
                # Stem of current word
                stem_current_word = stemmer.stem(word['lemma'])

                # Is the stemmed word in the next sentence, great.
                # If word is a lame 'sein', ignore it
                if (stem_current_word
                        in stems_next_sentence) and word['lemma'] != 'sein':
                    # Get index of stem that is related to current word
                    index_word_next_sentence = stems_next_sentence.index(
                        stem_current_word)

                    # Corresponding word in next sentence
                    corresponding_word = sentences[val +
                                                   1][index_word_next_sentence]

                    # Only add word pairs if verb or noun
                    if word['noun'] or word['verb']:
                        # Get dictionary of word in next sentence
                        dict_next = sentences[val +
                                              1][index_word_next_sentence]

                        # We do not want to combine words
                        # that have the same grammatical function
                        # A noun should not be combined with a noun
                        # We are only interested in verb-noun relations
                        if word['verb'] and dict_next['noun']:
                            # Get all combinations of the corresponding noun
                            # in the next sentence and all nouns in the current sentence
                            for wordCurrent in words_current_sentence:
                                # Append to list
                                word_pairs.append({
                                    'source': {
                                        'word': corresponding_word['orth'],
                                        'lemma': corresponding_word['lemma'],
                                        'sentence': val
                                    },
                                    'target': {
                                        'word': wordCurrent['orth'],
                                        'lemma': wordCurrent['lemma'],
                                        'sentence': val + 1
                                    },
                                    'device':
                                    'verb noun relation'
                                })

                        # Current word is noun and corresponding word is
                        # verb
                        elif word['noun'] and dict_next['verb']:
                            # Get all combinations of the noun in this sentence
                            # with the nouns in the next sentence
                            for wordNext in sentences[val + 1]:
                                # Only pair with nouns in the next sentence
                                if wordNext['noun']:
                                    # Append to list
                                    word_pairs.append({
                                        'source': {
                                            'word': word['orth'],
                                            'lemma': word['lemma'],
                                            'sentence': val
                                        },
                                        'target': {
                                            'word': wordNext['orth'],
                                            'lemma': wordNext['lemma'],
                                            'sentence': val + 1
                                        },
                                        'device':
                                        'noun verb relation'
                                    })

    return word_pairs
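
The sentences argument is implicitly a list of sentences, each a list of token dicts carrying at least 'orth', 'lemma', 'noun' and 'verb'; the gn parameter (presumably a GermaNet handle) is not used in the body shown. A small invented input:

sentences = [
    [   # "Der Hund läuft."
        {"orth": "Der",   "lemma": "der",    "noun": False, "verb": False},
        {"orth": "Hund",  "lemma": "Hund",   "noun": True,  "verb": False},
        {"orth": "läuft", "lemma": "laufen", "noun": False, "verb": True},
    ],
    [   # "Das Laufen gefällt dem Hund."
        {"orth": "Laufen", "lemma": "Laufen", "noun": True, "verb": False},
        {"orth": "Hund",   "lemma": "Hund",   "noun": True, "verb": False},
    ],
]
print(get_stem_relations(sentences, gn=None))
# yields one 'verb noun relation' pair, because the stem of the verb "laufen"
# reappears in the next sentence as the noun "Laufen"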
Example No. 26
    def stem_words(self, words):
        stemmer = GermanStemmer()
        stemmed_words = []
        for word in words:
            stemmed_words.append(stemmer.stem(word))
        return stemmed_words
Example No. 27
    def load_stemmer(self):
        self._stemmer = None
        if self._stemming_lang == Language.GERMAN:
            self._stemmer = GermanStemmer()
        else:
            self._stemmer = EnglishStemmer()
Example No. 28
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import GermanStemmer
import os
import json
import nltk
import pandas as pd
import numpy as np

stemmer = GermanStemmer(ignore_stopwords=True)

CONFLICT_OUTPUT_PATH = os.path.join("Output")
CREATE_VOCABULARY = os.path.join("Output")


def combine_data_panning(dirpath: str, output_name: str = None):
    ACTION = "actions"
    TAG = "tag"
    COMMANDS = "commands"

    # with open(os.path.join("Data", "stopwords.txt"), "rt") as f:
    #     stopwords = set(f.read().splitlines())

    document_pathes = [os.path.join(dirpath, x) for x in os.listdir(dirpath)]
    new_data = {}
    for i, document in enumerate(document_pathes):

        with open(document, "rt") as f:
            commands = json.load(f)

        repeat = set()
        for action in commands[ACTION]:
Example No. 29

from nlingua.stemmers import GermanSnowballStemmer
from nltk.stem.snowball import GermanStemmer
import codecs

if __name__ == '__main__':
    l = []
    with codecs.open("german_words.txt", encoding="utf-8", mode="r") as f:
        words = f.readlines()

    words = [x[:-1] for x in words]

    correct = 0
    stemmer = GermanSnowballStemmer()
    stemmer2 = GermanStemmer()
    for word in words:
        a = stemmer.stem(word)
        b = stemmer2.stem(word)
        if a == b:
            correct += 1
        else:
            print(word, a, b)

    print(f"{correct}/{len(words)} correct")