import re
import unicodedata

import nltk
from nltk.stem import snowball
import treetaggerwrapper
from googletrans import Translator

# Helpers assumed to be defined elsewhere in this module: replace_diactitics,
# check_if_text_in_language, is_number, keep_only_letters, and a module-level
# `stopwords` dict mapping language codes to stopword lists.


def process_for_named_entity(text, language, to_ascii=True, stem=False, shorten=False):
    # Romanian text needs its diacritics normalised before ASCII folding.
    if language == "ro":
        if isinstance(text, list):
            text = [replace_diactitics(subtext) for subtext in text]
        else:
            text = replace_diactitics(text)

    # Fold to plain ASCII (dropping accents) and lowercase.
    if isinstance(text, list):
        if to_ascii:
            text = [
                unicodedata.normalize('NFKD', subtext)
                .encode('ascii', 'ignore').decode('ascii')
                for subtext in text
            ]
        text = [subtext.lower() for subtext in text]
    else:
        if to_ascii:
            text = unicodedata.normalize('NFKD', text) \
                .encode('ascii', 'ignore').decode('ascii')
        text = text.lower()

    # Pick a Snowball stemmer for the supported languages. For any other
    # language there is no stemmer (the original left `stemmer` unbound here,
    # which raised NameError when stem=True).
    if language == "ro":
        stemmer = snowball.RomanianStemmer()
    elif language == "it":
        stemmer = snowball.ItalianStemmer()
    elif language == "en":
        stemmer = snowball.EnglishStemmer()
    else:
        stemmer = None

    if isinstance(text, list):
        words = [nltk.word_tokenize(subtext) for subtext in text]
    else:
        words = nltk.word_tokenize(text)
    procced_text = []

    # Strip every token down to [a-z0-9], drop empty tokens, optionally stem.
    if isinstance(text, list):
        for sent in words:
            sentence = []
            for word in sent:
                word = re.sub("[^a-z0-9]", "", word)
                if word != '':
                    sentence.append(stemmer.stem(word) if stem and stemmer else word)
            procced_text.append(sentence)
    else:
        for word in words:
            word = re.sub("[^a-z0-9]", "", word)
            if word != '':
                if stem and stemmer:
                    word = stemmer.stem(word)
                procced_text.append(word)

    # Post-processing of company names. The legal-suffix stripping and the
    # shortening loop below are deliberately disabled via `if False and ...`
    # in the original source; they are kept (cleaned up) for reference.
    if isinstance(text, list):
        for i in range(len(procced_text)):
            company_name = procced_text[i]
            if len(company_name) > 0 and company_name[0] != 'null':
                if False and company_name[-1] in ['srl', 'ltd', 'spa', 'ltda', 'sl', 'snc']:
                    contracted = ' '.join(company_name[:-1])
                    if (not check_if_text_in_language(company_name[:-1])
                            and len(contracted) > 6
                            and not is_number(contracted)
                            and contracted not in [
                                'data', 'aprile', 'group', 'azienda', 'profilo',
                                'alumino', 'stato', 'roma', 'service', 'area',
                                'estate', 'date 4', 'work', 'altre', 'italia',
                                'stage', 'ottobre 2008', 'strada', '16 luglio',
                                'espresso', 'export', 'prime', 'sala', 'panelli']):
                        del company_name[-1]
                    if shorten:
                        # Drop leading words until the joined name fits in 23
                        # characters, stopping before it would shrink below ~15.
                        while len(contracted) > 23:
                            if len(contracted) - len(company_name[0]) < 15:
                                break
                            del company_name[0]
                            # Fixed: the original joined `procced_text` (a list
                            # of lists) here, which would raise TypeError.
                            contracted = ' '.join(company_name)
                procced_text[i] = ' '.join(company_name)
    else:
        if len(procced_text) > 0 and procced_text[0] != 'null':
            if False and procced_text[-1] in ['srl', 'ltd', 'spa', 'ltda', 'sl', 'snc']:
                contracted = ' '.join(procced_text[:-1])
                # Note: two commas were missing in the original literal below
                # ('metalmeccanica' 'date 4' and 'castel' 'altre' were silently
                # concatenated); they are restored here.
                if (not check_if_text_in_language(procced_text[:-1])
                        and len(contracted) > 6
                        and not is_number(contracted)
                        and contracted not in [
                            'data', 'aprile', 'group', 'azienda', 'profilo',
                            'allumino', 'stato', 'roma', 'service', 'area',
                            'estate', 'metalmeccanica', 'date 4', 'work',
                            'castel', 'altre', 'italia', 'controlo qualita',
                            'stage', 'ottobre 2008', 'atena', 'strada',
                            '16 luglio', 'industriale', 'espresso', 'export',
                            'prime', 'sala', 'panelli']):
                    del procced_text[-1]
                if shorten:
                    while len(contracted) > 23:
                        if len(contracted) - len(procced_text[0]) < 15:
                            break
                        del procced_text[0]
                        contracted = ' '.join(procced_text)
            procced_text = ' '.join(procced_text)
    return procced_text
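# A minimal usage sketch (an illustration, not part of the original module):
# shows the expected shapes for string and list input. The sample names and
# the `_demo_` helper are hypothetical; exact tokenization depends on the
# NLTK punkt models installed.
def _demo_process_for_named_entity():
    # Single string: accents folded, punctuation stripped, lowercased;
    # expected to yield something like 'societa panelli srl'.
    print(process_for_named_entity('Società Panelli S.R.L.', 'it'))
    # List input: each element is cleaned independently and re-joined.
    print(process_for_named_entity(['SC Exemplu SRL', 'Another Firm Ltd'], 'ro'))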
def process_words(text, language=None, stem=True, to_ascii=True, character_level=False):
    # Auto-detect the language when none is given. googletrans returns a list
    # of Detected objects for list input and a single object for a plain
    # string (the original only handled the list case, leaving `language` as
    # None for strings).
    if language is None:
        translator = Translator()
        if isinstance(text, list):
            language = translator.detect(text)[0].lang
        else:
            language = translator.detect(text).lang

    if language == "ro":
        if isinstance(text, list):
            text = [replace_diactitics(subtext) for subtext in text]
        else:
            text = replace_diactitics(text)

    # Fold to plain ASCII and lowercase, as in process_for_named_entity.
    if isinstance(text, list):
        if to_ascii:
            text = [
                unicodedata.normalize('NFKD', subtext)
                .encode('ascii', 'ignore').decode('ascii')
                for subtext in text
            ]
        text = [subtext.lower() for subtext in text]
    else:
        if to_ascii:
            text = unicodedata.normalize('NFKD', text) \
                .encode('ascii', 'ignore').decode('ascii')
        text = text.lower()

    # Stemmer selection mirrors process_for_named_entity; unsupported
    # languages get no stemmer instead of an unbound name.
    if language == "ro":
        stemmer = snowball.RomanianStemmer()
    elif language == "it":
        stemmer = snowball.ItalianStemmer()
    elif language == "en":
        stemmer = snowball.EnglishStemmer()
    else:
        stemmer = None

    if isinstance(text, list):
        words = [nltk.word_tokenize(subtext) for subtext in text]
    else:
        words = nltk.word_tokenize(text)
    procced_text = []

    stopw = []
    if language in stopwords:
        stopw = stopwords[language]

    if isinstance(text, list):
        for sent in words:
            sentence = []
            for word in sent:
                word = keep_only_letters(word)
                if word not in stopw:
                    if character_level:
                        sentence += list(word)
                    elif stem and stemmer:
                        sentence.append(stemmer.stem(word))
                    else:
                        sentence.append(word)
            procced_text.append(sentence)
    else:
        # As in the original, stemming is only applied to list input; plain
        # strings are tokenized, filtered and optionally split into characters.
        for word in words:
            word = keep_only_letters(word)
            if word not in stopw:
                if character_level:
                    procced_text += list(word)
                else:
                    procced_text.append(word)
    return procced_text
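# A minimal usage sketch for process_words (illustrative, not original code;
# the `_demo_` helper is hypothetical). Assumes the module-level `stopwords`
# dict covers 'it'/'en'; googletrans is only contacted when language=None
# triggers auto-detection.
def _demo_process_words():
    # Stemmed Italian tokens with stopwords removed.
    print(process_words('Le aziende producono pannelli', language='it'))
    # Character-level output: each surviving word is exploded into letters.
    print(process_words('hello world', language='en', stem=False,
                        character_level=True))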
def __init__(self):
    # Italian Snowball stemmer plus a TreeTagger instance configured for
    # Italian (treetaggerwrapper requires a local TreeTagger installation).
    self.stemming = snowball.ItalianStemmer()
    self.tagger = treetaggerwrapper.TreeTagger(TAGLANG='it')
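# Sketch of how these two members are typically used together (the enclosing
# class is not shown in this excerpt; `proc` stands for an instance of it and
# the `_demo_` helper is hypothetical). TreeTagger emits "word\tPOS\tlemma"
# strings that treetaggerwrapper's make_tags() turns into named tuples.
def _demo_tagger(proc):
    tagged = proc.tagger.tag_text('Il gatto dorme sul divano')
    for tag in treetaggerwrapper.make_tags(tagged):
        print(tag.word, tag.pos, proc.stemming.stem(tag.word))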