예제 #1
0
def process_for_named_entity(text,
                             language,
                             to_ascii=True,
                             stem=False,
                             shorten=False):
    """Normalise a name, or a list of names, into space-joined tokens.

    Each input string goes through the same pipeline:

    1. for ``language == "ro"`` only, ``replace_diactitics`` is applied;
    2. if ``to_ascii`` is true, the string is NFKD-normalised and every
       character that cannot be encoded as ASCII is dropped;
    3. the string is lower-cased and tokenised with ``nltk.word_tokenize``;
    4. every character outside ``[a-z0-9]`` is removed from each token and
       tokens that end up empty are discarded (so non-ASCII characters are
       dropped here even when ``to_ascii`` is false);
    5. if ``stem`` is true and a Snowball stemmer exists for ``language``
       ("ro", "it" or "en"), each token is stemmed.

    Args:
        text: a single string, or a list of strings processed independently.
        language: language code; only "ro", "it" and "en" have a stemmer.
        to_ascii: fold the input to plain ASCII before tokenising.
        stem: stem the tokens. For a language without a stemmer the tokens
            are left unstemmed.
        shorten: accepted for backward compatibility; it currently has no
            effect because the company-suffix stripping it controlled is
            disabled.

    Returns:
        For a string input, the tokens joined by single spaces. For a list
        input, a list with one such entry per element, in order.

        NOTE: when an input yields no tokens, or its first token is
        ``'null'``, the raw token list is returned for that input instead of
        a joined string. This quirk is preserved because callers may rely on
        it.
    """
    is_batch = isinstance(text, list)
    items = text if is_batch else [text]

    if language == "ro":
        items = [replace_diactitics(item) for item in items]
    if to_ascii:
        items = [
            unicodedata.normalize('NFKD', item)
            .encode('ascii', 'ignore')
            .decode("ascii")
            for item in items
        ]
    items = [item.lower() for item in items]

    # Only build a stemmer when it will be used. An unsupported language
    # leaves it as None so that stem=True degrades to "no stemming" instead
    # of failing on an unbound name.
    stemmer = None
    if stem:
        stemmer_classes = {
            "ro": snowball.RomanianStemmer,
            "it": snowball.ItalianStemmer,
            "en": snowball.EnglishStemmer,
        }
        stemmer_cls = stemmer_classes.get(language)
        if stemmer_cls is not None:
            stemmer = stemmer_cls()

    non_alnum = re.compile("[^a-z0-9]")

    results = []
    for item in items:
        tokens = []
        for token in nltk.word_tokenize(item):
            token = non_alnum.sub("", token)
            if not token:
                continue
            if stemmer is not None:
                token = stemmer.stem(token)
            tokens.append(token)

        # Empty results and 'null'-led results stay as token lists (see the
        # NOTE in the docstring); everything else becomes one string.
        if tokens and tokens[0] != 'null':
            results.append(' '.join(tokens))
        else:
            results.append(tokens)

    return results if is_batch else results[0]
예제 #2
0
def process_words(text,
                  language=None,
                  stem=True,
                  to_ascii=True,
                  character_level=False):
    """Tokenise text into cleaned words or characters, dropping stop words.

    Each input string goes through the same pipeline:

    1. for ``language == "ro"`` only, ``replace_diactitics`` is applied;
    2. if ``to_ascii`` is true, the string is NFKD-normalised and every
       character that cannot be encoded as ASCII is dropped;
    3. the string is lower-cased and tokenised with ``nltk.word_tokenize``;
    4. each token is passed through ``keep_only_letters``;
    5. tokens found in ``stopwords[language]`` are skipped (no filtering if
       ``language`` is not a key of ``stopwords``);
    6. with ``character_level`` the token's characters are emitted one by
       one; otherwise the token is emitted, stemmed when ``stem`` is true
       and a Snowball stemmer exists for ``language``.

    Args:
        text: a single string, or a list of strings processed independently.
        language: language code. When ``None`` and ``text`` is a non-empty
            list, it is taken from ``Translator().detect(text)[0].lang``,
            i.e. from the first detection result only.
            NOTE(review): no detection is attempted for a plain string, so
            ``language`` stays ``None`` there -- confirm whether that is
            intended.
        stem: stem emitted words. Only "ro", "it" and "en" have a stemmer;
            other languages are left unstemmed. Ignored when
            ``character_level`` is true.
        to_ascii: fold the input to plain ASCII before tokenising.
        character_level: emit individual characters instead of words.

    Returns:
        For a string input, a flat list of words (or characters). For a list
        input, a list holding one such list per element, in order.
    """
    is_batch = isinstance(text, list)

    # The emptiness check keeps detect() from being called, and its result
    # indexed, when there is nothing to detect a language from.
    if language is None and is_batch and text:
        language = Translator().detect(text)[0].lang

    items = text if is_batch else [text]

    if language == "ro":
        items = [replace_diactitics(item) for item in items]
    if to_ascii:
        items = [
            unicodedata.normalize('NFKD', item)
            .encode('ascii', 'ignore')
            .decode("ascii")
            for item in items
        ]
    items = [item.lower() for item in items]

    # An unsupported (or undetected) language leaves the stemmer as None so
    # that stem=True degrades to "no stemming" instead of failing on an
    # unbound name.
    stemmer = None
    if stem:
        stemmer_classes = {
            "ro": snowball.RomanianStemmer,
            "it": snowball.ItalianStemmer,
            "en": snowball.EnglishStemmer,
        }
        stemmer_cls = stemmer_classes.get(language)
        if stemmer_cls is not None:
            stemmer = stemmer_cls()

    stopw = stopwords[language] if language in stopwords else []

    results = []
    for item in items:
        units = []
        for token in nltk.word_tokenize(item):
            token = keep_only_letters(token)
            if token in stopw:
                continue
            if character_level:
                units.extend(token)
            elif stemmer is not None:
                # Applied to string and list inputs alike, so `stem` means
                # the same thing regardless of the input type.
                units.append(stemmer.stem(token))
            else:
                units.append(token)
        results.append(units)

    return results if is_batch else results[0]
예제 #3
0
 def __init__(self):
     """Create the Italian stemmer and the Italian TreeTagger for this object.

     The stemmer is stored on ``self.stemming`` and the tagger on
     ``self.tagger``.
     """
     italian_stemmer = snowball.ItalianStemmer()
     self.stemming = italian_stemmer

     italian_tagger = treetaggerwrapper.TreeTagger(TAGLANG='it')
     self.tagger = italian_tagger