Example #1
 def load_model(self):
     """
     Loads the German spaCy language model.

     :return: the loaded spaCy Language object
     """
     nlp = de_core_news_sm.load()
     return nlp
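A minimal standalone sketch (not taken from the example above): the package-style import and spacy.load() are interchangeable once the model has been installed, e.g. with `python -m spacy download de_core_news_sm`.

import spacy

try:
    import de_core_news_sm
    nlp = de_core_news_sm.load()
except ImportError:
    # fall back to loading by package name if the direct import is unavailable
    nlp = spacy.load("de_core_news_sm")

doc = nlp("Berlin ist die Hauptstadt von Deutschland.")
print([(token.text, token.pos_) for token in doc])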
Example #2
 def __init__(self):
     import spacy
     # nlp = spacy.load('de', disable=['ner', 'parser'])
     import de_core_news_sm
     # !python -m spacy download de_core_news_sm
     nlp = de_core_news_sm.load(disable=['parser', 'ner'])
     self.processor = nlp
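For context, a small sketch (assuming de_core_news_sm is installed) of what disable=['parser', 'ner'] does: the listed components are simply not added to the pipeline, which speeds up loading and processing.

import de_core_news_sm

nlp_full = de_core_news_sm.load()
nlp_light = de_core_news_sm.load(disable=['parser', 'ner'])

print(nlp_full.pipe_names)   # includes 'parser' and 'ner'
print(nlp_light.pipe_names)  # the disabled components no longer appear here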
Example #3
    def __init__(self, lang):
        if lang == LANG.EN:
            self.nlp = en_core_web_md.load()
        else:
            self.nlp = de_core_news_sm.load()

        self.stanford_ner = StanfordNERTagger(model,
                                              '../models/stanford-ner.jar',
                                              encoding='utf-8')
Example #4
    def de_lang(cls):
        me_list = ['ich', 'mein', 'meine']
        embeddings_model = FlairEmbeddingModels().de_lang()
        nlp = de_core_news_sm.load()
        relationship_list = [
            'vater', 'mutter', 'sohn', 'tochter', 'bruder', 'schwester',
            'enkel', 'enkelin', 'großvater', 'großmutter', 'ehemann',
            'ehefrau', 'onkel', 'tante', 'freund'
        ]

        return cls(me_list, embeddings_model, nlp, relationship_list)
Example #5
def translate_sentence(model, sentence, german, english, device, max_length=50):
    print(sentence)

    # sys.exit()

    # Load the German spaCy tokenizer
    spacy_ger = de_core_news_sm.load()

    # Tokenize with spaCy and lowercase everything (the vocab is lowercased)
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # print(tokens)

    # sys.exit()
    # Add <sos> and <eos> tokens at the beginning and end, respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Convert each German token to its vocabulary index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [english.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]
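The stoi/itos lookups above presumably come from torchtext Field objects (legacy API, torchtext <= 0.8). A self-contained sketch of that round trip, with purely illustrative toy tokens:

from torchtext.data import Field  # legacy API, torchtext <= 0.8

german = Field(init_token="<sos>", eos_token="<eos>", lower=True)
german.build_vocab([["ein", "kleines", "haus"]])  # toy vocabulary

tokens = ["<sos>", "ein", "kleines", "haus", "<eos>"]
indices = [german.vocab.stoi[tok] for tok in tokens]    # token -> index
restored = [german.vocab.itos[idx] for idx in indices]  # index -> token
print(indices, restored)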
Example #6
def translate_sentence(sentence,
                       src_field,
                       trg_field,
                       model,
                       device,
                       max_len=50):
    model.eval()

    # tokenize input
    if isinstance(sentence, str):
        nlp = de_core_news_sm.load()
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]
    # add <sos> and <eos>
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    # convert the tokens to vocabulary indices
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    # add a batch dim and convert into tensor
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    with torch.no_grad():
        encoder_outputs = model.encoder(src_tensor)

    hidden = encoder_outputs

    # start decoding from the <sos> token's index
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for i in range(max_len):

        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden, encoder_outputs)

        pred_token = output.argmax(1).item()

        trg_indexes.append(pred_token)

        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break

    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

    return trg_tokens[1:]
Example #7
    def de_lang(cls):
        nlp = de_core_news_sm.load()
        embeddings_model = FlairEmbeddingModels().de_lang()
        # PP: e.g. 'Ich habe einen Sohn', 'Ich habe einen kleinen Bruder'
        # NP: e.g. 'Meine kleine Schwester'
        grammar = r"""
                PP: {<PRON><AUX><DET><ADJ>?<NOUN>}
                NP: {<DET><ADJ>?<NOUN>}            
                REL: {<PP>|<NP>}"""
        relationship_list = [
            'vater', 'mutter', 'sohn', 'tochter', 'bruder', 'schwester',
            'enkel', 'enkelin', 'großvater', 'großmutter', 'ehemann',
            'ehefrau', 'onkel', 'tante', 'freund'
        ]
        me_list = ['ich', 'mein', 'meine']

        return cls(nlp, grammar, relationship_list, me_list, embeddings_model)
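The grammar string above looks like an NLTK chunk grammar written over spaCy's coarse POS tags; a hedged sketch of how such a grammar could be applied (this pairing is an assumption, not shown in the snippet):

import nltk
import de_core_news_sm

nlp = de_core_news_sm.load()
chunker = nltk.RegexpParser(r"""
        PP: {<PRON><AUX><DET><ADJ>?<NOUN>}
        NP: {<DET><ADJ>?<NOUN>}
        REL: {<PP>|<NP>}""")

doc = nlp("Ich habe einen kleinen Bruder")
tagged = [(tok.text, tok.pos_) for tok in doc]  # (word, coarse POS) pairs
print(chunker.parse(tagged))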
Example #8
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''Return the spaCy nlp object corresponding to the language of a document.'''
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        raise ValueError(f"Not a supported language: {default_lingo}")
    return nlp
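An illustrative call of the helper above, assuming the small German model is installed; the supported-languages list is made up for the sketch.

supported = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]
nlp = get_spacy_tokenizer("German", supported, bigmodel_required=False)
doc = nlp("Das ist ein kurzer Testsatz.")
print([token.text for token in doc])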
Example #9
def _nlp(spacy_module: str) -> Optional[NLP]:
    print("Loading spacy language model for '", spacy_module, "'")
    if spacy_module == 'en':
        nlp = en_core_web_sm.load()
    elif spacy_module == 'es':
        nlp = es_core_news_sm.load()
    elif spacy_module == 'de':
        nlp = de_core_news_sm.load()
    elif spacy_module == 'fr':
        nlp = fr_core_news_sm.load()
    elif spacy_module == 'it':
        nlp = it_core_news_sm.load()
    elif spacy_module == 'pt':
        nlp = pt_core_news_sm.load()
    else:
        raise ValueError(f'Unsupported language {spacy_module}')
    return nlp
Example #10
def get_sentiment_scores(data, emoji_dict):
    nlp = de_core_news_sm.load()
    sentiws = spaCySentiWS(sentiws_path="data/sentiws")
    nlp.add_pipe(sentiws)
    scores = np.zeros((len(data), 1))
    for i in range(len(data)):
        doc = nlp(data[i])
        for j, token in enumerate(doc):
            if token._.sentiws:
                scores[i][0] += token._.sentiws
            elif str(token).startswith('U0') and len(str(token)) == 10:
                emoji = str(token)
                emoji = emoji.replace("U000", "0x")
                emoji = emoji.lower()
                if emoji in emoji_dict.keys():
                    scores[i][0] += emoji_dict[emoji]
    return scores
Example #11
 def find_location_in_query(self, query):
     self.found_cities = []
     nlp = de_core_news_sm.load()
     doc = nlp(query)
     found_locations = set()
     for ent in doc.ents:
         if ent.label_ == "LOC":
             found_locations.add(ent.text)
             for token in doc:
                 # If the city name is a single word, check whether it carries the
                 # fine-grained tag "NE", since city names are always tagged "NE".
                 if token.text == ent.text and token.tag_ != "NE":
                     # Words that are not tagged "NE" are not treated as cities.
                     found_locations.remove(token.text)
     self.found_cities = list(found_locations)
     if len(self.found_cities) == 0:
         return None
     return self.found_cities[0]
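A standalone sketch of the two checks used above, assuming de_core_news_sm is installed: doc.ents carries coarse entity labels such as LOC, while token.tag_ exposes the fine-grained tag set, where NE marks proper nouns.

import de_core_news_sm

nlp = de_core_news_sm.load()
doc = nlp("Wie wird das Wetter morgen in Hamburg?")

for ent in doc.ents:
    print(ent.text, ent.label_)    # e.g. "Hamburg LOC"
for token in doc:
    print(token.text, token.tag_)  # fine-grained tags; "NE" marks proper nouns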
Example #12
def translate_sentence(model, sentence, german, english, device, max_length=50):
    # Load the German spaCy tokenizer
    import de_core_news_sm
    spacy_ger = de_core_news_sm.load()

    print('here')

    #spacy_ger = spacy.load("de")

    # Tokenize with spaCy and lowercase everything (the vocab is lowercased)
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <sos> and <eos> tokens at the beginning and end, respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Convert each German token to its vocabulary index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    outputs = [english.vocab.stoi["<sos>"]]
    for i in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
    # remove start token
    return translated_sentence[1:]
Example #13
def doPreprocessing(DM):
    # read the text
    text = DM.readText()
    # load the German spaCy model
    nlp = de_core_news_sm.load()
    sentences = []
    # mainIndex is needed for the comparison with xml-data
    mainIndex = 1
    # read the text sentence by sentence
    for sent in text:
        sentence = []
        # get the tags from spaCy
        doc = nlp(sent)
        for tok in doc:
            # ignore spaces
            if tok.pos_ != "SPACE":
                # prune the token: not all spaCy info is needed
                sentence.append(pruneToken(tok, mainIndex))
                mainIndex += 1
        # expand the token with morphological information
        expandToken(sentence)
        sentences.append(sentence)
    printAllToFile(sentences, DM)
Example #14
def translate_annotated_encoder_decoder_de_en(
    model: annotated_encoder_decoder_de_en.EncoderDecoder,
    meta: Dict[str, Any],
    source_text: str,
) -> str:

    spacy_de: German = de_core_news_sm.load()

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    src_tok: List[str] = tokenize_de(source_text)

    src_idx: List[int] = [meta["SRC.vocab.stoi"][x] for x in src_tok
                          ] + [meta["SRC.vocab.stoi"][meta["EOS_TOKEN"]]]
    src: Tensor = torch.LongTensor(src_idx)
    src_mask: Tensor = (
        src != meta["SRC.vocab.stoi"][meta["PAD_TOKEN"]]).unsqueeze(-2)
    src_length: Tensor = torch.tensor(len(src))

    # convert to batch size 1
    src = src.unsqueeze(0)
    src_mask = src_mask.unsqueeze(0)
    src_length = src_length.unsqueeze(0)

    output = annotated_encoder_decoder_de_en.greedy_decode(
        model,
        src,
        src_mask,
        src_length,
        max_len=100,
        sos_index=meta["TRG.vocab.stoi"][meta["SOS_TOKEN"]],
        eos_index=meta["TRG.vocab.stoi"][meta["EOS_TOKEN"]],
    )

    return " ".join([meta["TRG.vocab.itos"][x] for x in output])
Example #15
            else:
                return location

        return location


# Main method
if __name__ == '__main__':

    try:
        #house_number = Housenumber()

        # Read tweets from tweets.txt and write detected locations to locations.txt
        fileTweets = open("tweets.txt", "r")
        fileLocation = open("locations.txt", "w")
        nlp = de_core_news_sm.load()
        for line in fileTweets:
            tweet = json.loads(line)
            tweetText = tweet["text"]

            # Entity Detection
            nlpTweet = nlp(tweetText)
            entities = [(i, i.label_, i.label) for i in nlpTweet.ents]

            # write result in json format to file
            for obj in entities:
                #try:
                #    location = house_number.add_housenumber(str(obj[0]), tweet["text"])
                #except (Exception) as error :
                location = str(obj[0])
Example #16
import random
import spacy


SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

import en_core_web_sm
import de_core_news_sm
spacy_eng = en_core_web_sm.load()
spacy_ger = de_core_news_sm.load()


def tokenize_ger(text):
    """Tokenize a German sentence."""
    return [tok.text for tok in spacy_ger.tokenizer(text)]


def tokenize_eng(text):
    """Tokenize an English sentence."""
    return [tok.text for tok in spacy_eng.tokenizer(text)]


german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>",
               eos_token="<eos>")

english = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>",
                eos_token="<eos>")
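A hedged follow-up to the field definitions above: with the legacy torchtext API such fields are commonly paired with a parallel corpus before the vocabularies are built (Multi30k and the hyperparameters here are purely illustrative).

from torchtext.datasets import Multi30k
from torchtext.data import BucketIterator

train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"),
                                                    fields=(german, english))

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

train_iterator = BucketIterator(train_data, batch_size=64,
                                sort_within_batch=True,
                                sort_key=lambda x: len(x.src))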
Example #17
import string
import unidecode
from nltk.stem import WordNetLemmatizer
from html.parser import HTMLParser
import unicodedata
from tqdm.auto import tqdm
from nltk.corpus import wordnet as wn

import fr_core_news_sm
nlp_fr = fr_core_news_sm.load()

import en_core_web_sm
nlp_en = en_core_web_sm.load()

import de_core_news_sm
nlp_de = de_core_news_sm.load()

import es_core_news_sm
nlp_es = es_core_news_sm.load()

import it_core_news_sm
nlp_it = it_core_news_sm.load()

import pt_core_news_sm
nlp_pt = pt_core_news_sm.load()

import nl_core_news_sm
nlp_nl = nl_core_news_sm.load()

# global variables
wnl = WordNetLemmatizer()
Example #18
def check_spacy_models(main, lang, pipeline):
    if lang == 'other':
        lang = 'eng'

    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['tokenization', 'sentence_tokenization']:
        nlp_pipelines = ['sbd']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa']:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)

        if 'sbd' in nlp_pipelines:
            nlp = main.__dict__[f'spacy_nlp_{lang}']

            if 'sbd' not in nlp.pipe_names:
                nlp.add_pipe(nlp.create_pipe('sentencizer'))
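An illustrative use of the helper above; the assumption (based on how the function accesses it) is that main only needs a __dict__ in which the loaded pipelines are cached.

class Main:
    pass

main = Main()
check_spacy_models(main, lang='deu', pipeline='pos_tagging')
nlp_de = main.__dict__['spacy_nlp_deu']  # cached German pipeline
print(nlp_de.pipe_names)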
Example #19
def evaulate_parsers(article_de, article_en, article_fr):

    # Get all articles from the database
    # print(f"German Article to parse: {article_de}")
    # print(f"English Article to parse: {article_en}")
    # print(f"French Article to parse: {article_fr}")

    # TODO Create AllenNLP Parsing Function, and call the proper values....
    allen_scores = []
    allen_scores.append(50)
    allen_scores.append(60)
    allen_scores.append(70)

    # Stanford Parser Baseline CODE
    """
    The Stanford Parser is the Baseline for this Application, we compare every other parser to the output of the Stanford Parser.
    Define Models, Assign Dataframe to List
    """

    config = "tokenize,mwt,pos,lemma,depparse"

    nlp_en = stanza.Pipeline(lang='en', processors=config)
    nlp_de = stanza.Pipeline(lang='de', processors=config)
    nlp_fr = stanza.Pipeline(lang='fr', processors=config)

    df_stanford_en = stanford.parse_stan(article_en, nlp_en)
    df_stanford_de = stanford.parse_stan(article_de, nlp_de)
    df_stanford_fr = stanford.parse_stan(article_fr, nlp_fr)
    """
    Spacy Parser:
    Define Spacy Models, Assign Dataframe to List
    """

    df_spacy_de = spacyparser.parse_spacy(article_de, de_core_news_sm.load())
    df_spacy_en = spacyparser.parse_spacy(article_en, en_core_web_sm.load())
    df_spacy_fr = spacyparser.parse_spacy(article_fr, fr_core_news_sm.load())

    # Evaluate Parsers against each other....
    df_complete_de = pd.concat([df_stanford_de, df_spacy_de],
                               axis=1,
                               sort=False)
    df_complete_en = pd.concat([df_stanford_en, df_spacy_en],
                               axis=1,
                               sort=False)
    df_complete_fr = pd.concat([df_stanford_fr, df_spacy_fr],
                               axis=1,
                               sort=False)

    # print("German Dataframe Combined")
    df_complete_de['spacy_eval_upos'] = df_complete_de['upos'].str.lower(
    ) == df_complete_de['sp_upos'].str.lower()
    df_complete_de['spacy_eval_deprel'] = df_complete_de['deprel'].str.lower(
    ) == df_complete_de['sp_deprel'].str.lower()
    df_complete_de['spacy_eval'] = df_complete_de[
        'spacy_eval_upos'] == df_complete_de['spacy_eval_deprel']

    # print("English Dataframe Combined")
    df_complete_en['spacy_eval_upos'] = df_complete_en['upos'].str.lower(
    ) == df_complete_en['sp_upos'].str.lower()
    df_complete_en['spacy_eval_deprel'] = df_complete_en['deprel'].str.lower(
    ) == df_complete_en['sp_deprel'].str.lower()
    df_complete_en['spacy_eval'] = df_complete_en[
        'spacy_eval_upos'] == df_complete_en['spacy_eval_deprel']

    # print("French Dataframe Combined")
    df_complete_fr['spacy_eval_upos'] = df_complete_fr['upos'].str.lower(
    ) == df_complete_fr['sp_upos'].str.lower()
    df_complete_fr['spacy_eval_deprel'] = df_complete_fr['deprel'].str.lower(
    ) == df_complete_fr['sp_deprel'].str.lower()
    df_complete_fr['spacy_eval'] = df_complete_fr[
        'spacy_eval_upos'] == df_complete_fr['spacy_eval_deprel']

    # Evaluate the Parsers Against the Stanford Parse
    # print(df_stanford_de.equals(df_spacy_de))

    allen_scores = [0, 0, 0]
    spacy_scores = []
    stanford_scores = [1, 1, 1]

    spacy_de_scores = df_complete_de.spacy_eval.value_counts().tolist()
    spacy_en_scores = df_complete_en.spacy_eval.value_counts().tolist()
    spacy_fr_scores = df_complete_fr.spacy_eval.value_counts().tolist()

    spacy_de_score = calculate_score(spacy_de_scores[0],
                                     spacy_de_scores[0] + spacy_de_scores[1])
    spacy_en_score = calculate_score(spacy_en_scores[0],
                                     spacy_en_scores[0] + spacy_en_scores[1])
    spacy_fr_score = calculate_score(spacy_fr_scores[0],
                                     spacy_fr_scores[0] + spacy_fr_scores[1])

    spacy_scores.append(spacy_de_score)
    spacy_scores.append(spacy_en_score)
    spacy_scores.append(spacy_fr_score)

    # The report data sets the Stanford Parser output to 100 by default, as it is the
    # parser we want to compare against. The other parsers are scored by their counts
    # of true and false matches against the Stanford parse.
    report_data = {
        'de_stan': stanford_scores[0],
        'en_stan': stanford_scores[1],
        'fr_stan': stanford_scores[2],
        'de_spacy': spacy_scores[0],
        'en_spacy': spacy_scores[1],
        'fr_spacy': spacy_scores[2],
        'de_allen': allen_scores[0],
        'en_allen': allen_scores[1],
        'fr_allen': allen_scores[2]
    }
    # report_data = {'de_stan': 100, 'en_stan': 100, 'fr_stan': 100, 'de_spacy': 93.2, 'en_spacy': 92.6, 'fr_spacy': 90.7, 'de_allen': 87.9, 'en_allen': 88.6, 'fr_allen': 90.2}
    return (report_data)
Example #20
    def __init__(self):
        self.nlp = de_core_news_sm.load()

        # grammar for spaCy POS Tags
        # extracts noun phrases (NP) and relationships (REL)
        self.grammar = r"""NP: {<DET>?<ADJ>*<NOUN>?<PROPN|PRON>*}
Example #21
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '3'
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from torchtext.data import Field, BucketIterator
import numpy as np

import spacy
import en_core_web_sm
import de_core_news_sm

spacy_en = en_core_web_sm.load()
spacy_de = de_core_news_sm.load()

from IPython import embed

from model import NAT

input = torch.randint(0, 100, (16, 20)).long()  # input: [N, S]

# torch.cuda.empty_cache()

# vocab_src, vocab_tgt, S, d_embed=512, L=50, nhead=8, num_encoder_layers=6,
#                  num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, activation="relu"

model = NAT(vocab_src=100, vocab_tgt=50, S=20, num_encoder_layers=4, num_decoder_layers=4,\
    dim_feedforward=512)
num_parameters_train = sum(p.numel() for p in model.parameters()
                           if p.requires_grad)
Example #22
def check_spacy_models(main, lang, pipeline):
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in [
            'nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other'
    ]:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)
            # Other Languages
            elif lang == 'other':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('rs')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('rs')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(
                wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
Example #23
def text_analysis(numberOfTopics, numberOfTopWords, textData, perplexity):
    """
    preprocesses text + calculates the LDA with specific number of topics.

    args-
    -numberOfTopics: the number of topics over all documents
    -numberOfTopWords: the number of top words to be displayed
    -textData: DataFrame of text documents

    returns [pyLDAvis.thml, pd.DataFrame all results, DataFrame for wordPerTopic]
    """

    # text preprocessing
    # initialize nlp
    nlp = de_core_news_sm.load()

    # feed the document into the object
    document_list = [nlp(answer) for answer in textData]

    # delete stop words
    without_stop_words = []
    for doc in document_list:
        without_stop_words.append(
            [token for token in doc if token.is_stop != True])

    # lemmatize
    lemma_list = []
    for doc in without_stop_words:
        lemma_list.append([token.lemma_ for token in doc])

    # clean expressions
    expressions = [
        ":", "-", "(", ")", "\n", "\n\n", "?", ":", "\'", '\"', ".", ",", "'s",
        "...", "&", "+", "1", "2", "3", "4", "5", "6", "7", "8", "9", ";-)",
        " ", ";", "/", "z.", "b."
    ]

    # actual cleaning
    cleaned_lemma = []
    for doc in lemma_list:
        cleaned_lemma.append(
            [token for token in doc if token not in expressions])

    # convert text to lowercase
    low = []
    for doc in cleaned_lemma:
        low.append([token.lower() for token in doc])

    # drop empty or one-word documents
    final = []
    text_list = []
    for doc, t in zip(low, textData):
        if len(doc) > 1:
            final.append(doc)
            text_list.append(t)

    # clean non-informative words
    final2 = []
    for doc in final:
        final2.append([
            token for token in doc if token not in ["risiko", "chance", "ki"]
        ])

    # text mining!
    # call vectorizer
    cV = CountVectorizer(tokenizer=dummy, preprocessor=dummy)

    # fit the vectorizer
    cV.fit(final2)

    # create bow corpus
    bow_corpus_sk = cV.transform(final2)

    # LDA
    alpha = 0.5  # the higher, the more topics per document
    beta = 0.1   # the higher, the more words of the corpus per topic

    # call the LDA object
    lda_sk = LatentDirichletAllocation(n_components=numberOfTopics,
                                       doc_topic_prior=alpha,
                                       topic_word_prior=beta,
                                       random_state=1)
    #fitting
    lda_sk.fit(bow_corpus_sk)
    """
    # currently killed
    # pyLDAvis
    vis_sk = pyLDAvis.sklearn.prepare(lda_sk, bow_corpus_sk, cV)
    vis_html = pyLDAvis.prepared_data_to_html(vis_sk, template_type="simple")
    """

    #prepare the pd.DataFrame!
    # probability of each word in a topic
    wordPerTopic_sk = pd.DataFrame(
        lda_sk.components_,
        index=["topic" + str(num) for num in range(lda_sk.n_components)],
        columns=cV.get_feature_names())

    # top words for each topic
    top = numberOfTopWords
    topWordPerTopic_sk = pd.DataFrame(
        [[name, rows.sort_values(ascending=False).index.tolist()[:top]]
         for name, rows in wordPerTopic_sk.iterrows()])

    # probability of each topic per document
    topicPerDoc_sk = pd.DataFrame(
        lda_sk.transform(bow_corpus_sk),
        index=["commentary" + str(i) for i in range(len(final2))],
        columns=["topic" + str(i) for i in range(lda_sk.n_components)])

    topTopicPerDoc_sk = topicPerDoc_sk.T.apply(lambda x: x.idxmax())

    # merge different parts
    merged = pd.DataFrame(topTopicPerDoc_sk).merge(topWordPerTopic_sk,
                                                   how="left")
    merged = pd.concat([merged, pd.Series(text_list)], axis=1)

    # rename
    merged.columns = ["topic", "words", "text"]

    # split words into separate columns
    merged[["word" + str(n)
            for n in range(top)]] = pd.DataFrame(merged["words"].tolist())
    newSorting = ["topic"] + ["word" + str(i) for i in range(top)] + ["text"]
    merged = merged.loc[:, newSorting]

    topWords = topWordPerTopic_sk.iloc[:, 0]
    topWords = pd.concat([
        topWords,
        pd.DataFrame(topWordPerTopic_sk.iloc[:, 1].tolist(),
                     columns=["word " + str(n + 1) for n in range(top)])
    ],
                         axis=1)
    topWords.rename(columns={0: "Topic"}, inplace=True)

    # t-SNE embedding of the document-topic distributions for visualization
    bow_embedded = pd.DataFrame(
        TSNE(n_components=2, random_state=5,
             perplexity=perplexity).fit_transform(
                 lda_sk.transform(bow_corpus_sk)))
    bow_embedded.columns = ["value1", "value2"]
    bow_embedded["topic"] = merged["topic"]
    bow_embedded["text"] = merged["text"]

    return [bow_embedded, merged, topWords]
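A hedged usage sketch for the function above. It assumes that the dummy helper referenced by the CountVectorizer is an identity pass-through defined in the same module, and the example answers are made up.

import pandas as pd

def dummy(doc):
    # identity pass-through: the documents are already tokenized lists
    return doc

answers = pd.Series([
    "KI kann im Alltag viel Zeit sparen.",
    "Das Risiko liegt im Verlust von Arbeitsplätzen.",
    "Chancen sehe ich vor allem in der Medizin.",
])
bow_embedded, merged, topWords = text_analysis(numberOfTopics=2,
                                               numberOfTopWords=3,
                                               textData=answers,
                                               perplexity=2)
print(topWords)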
Example #24
import re
from trainers.misc import embedding_dict
from torchtext import data
from torchtext.datasets.sequence_tagging import CoNLL2000Chunking
from macros import DATA_PATH
from loaders.dictionary import Dictionary, PretrainedDictionary
import en_core_web_sm
import de_core_news_sm
import torch

spacy_de = de_core_news_sm.load()
spacy_en = en_core_web_sm.load()

url = re.compile('(<url>.*</url>)')


def chunking102(pretrain=False, emb_type='word'):
    corpus = CoNLLCorpus(emb_type='word', pretrain=pretrain)
    corp = {}
    corp['word2ind'] = corpus.dictionary.word2idx
    print("Vocabulary Size: {}".format(len(corp['word2ind'])))
    corp['ind2word'] = corpus.dictionary.idx2word
    corp['word2vec'] = corpus.dictionary.wv
    corp['id2vec'] = None
    return corp


def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

Example #25
 def load():
     # spacy.load() won't work with models installed via pip; use de_core_news_sm.load() instead.
     # see https://spacy.io/usage/models#models-loading
     import de_core_news_sm
     return de_core_news_sm.load()
Example #26
    def de_lang(cls):
        nlp = de_core_news_sm.load()
        me_list = ['ich', 'mein', 'meine']
        spacy_per_symbol = 'PER'

        return cls(nlp, me_list, spacy_per_symbol)
Example #27
#!/usr/bin/python3

import spacy
from spacy import displacy
import pt_core_news_sm
import de_core_news_sm
import en_core_web_sm
from fuzzywuzzy import fuzz, process

NAMED_ENTITY_MINIMUM_LENGTH = 3
SIMILARITY_RATIO_THRESHOLD = 70

NLP_PT = pt_core_news_sm.load()
NLP_DE = de_core_news_sm.load()
NLP_EN = en_core_web_sm.load()

MODELS = {
    'de': NLP_DE,
    'en': NLP_EN,
    'pt': NLP_PT,
}


def _get_nlp_model(language):
    return MODELS[language]


def _get_named_entities(text, language):
    nlp_model = _get_nlp_model(language)
    named_entities = nlp_model(text).ents
    return named_entities
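An illustrative call of the helper above (assumes the three small models are installed); the sentence is made up.

for entity in _get_named_entities("Angela Merkel besuchte München im Mai.", "de"):
    print(entity.text, entity.label_)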