def test_text_none():
    """Text is None."""
    with pytest.warns(SentenceSplitterWarning):
        splitter = SentenceSplitter(language='en')
        # noinspection PyTypeChecker
        sentences = splitter.split(text=None)
    assert sentences == []
def test_en_sentence_within_brackets():
    splitter = SentenceSplitter(language='en')
    input_text = 'Foo bar. (Baz foo.) Bar baz.'
    expected_sentences = ['Foo bar.', '(Baz foo.)', 'Bar baz.']
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def clean_expo_data():
    # Read the raw expo corpus, one line per entry
    with open(file_folder + "Process_data/Dictionary_approach/" + "expo.txt", 'r') as file:
        document = file.readlines()
    lst_all_sentence = []
    splitter = SentenceSplitter(language='fr')
    percent = 0
    french = 0
    for sentence in document:
        percent += 1
        print(str(percent) + " completed")
        sentence = sentence.replace('\n', '')
        # A line may contain several sentences; split it
        lst_sentence = splitter.split(text=sentence)
        if len(lst_sentence) > 1:
            print(lst_sentence)
        # Keep only the sentences detected as French
        for s in lst_sentence:
            if detect(s) == "fr":
                lst_all_sentence.append(s)
                french += 1
    with open(file_folder + "Process_data/Dictionary_approach/" + "expo_fr.txt", "a") as text_file:
        for s in lst_all_sentence:
            text_file.write(s + "\n")
    print(str(french) + " french sentences")
    print("End function")
def test_en_uppercase_acronym():
    splitter = SentenceSplitter(language='en')
    input_text = 'Hello. .NATO. Good bye.'
    expected_sentences = ['Hello. .NATO. Good bye.']
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def gramarize(self, sent):
    # Grammar correction is applied only when it is enabled in the activation config
    with open('Value-json/logic_activation.json') as f:
        activation = json.load(f)
    if activation['grammar_logic'] == "active":
        test_str = sent
        splitter = SentenceSplitter(language='en')
        sente = splitter.split(text=test_str)
        gram_sent = []
        for sent in sente:
            # Run each sentence through the GingerIt grammar checker
            parser = GingerIt()
            output = parser.parse(sent)
            output_1 = output.get("result")
            gram_sent.append(output_1)
        f_output = ' '.join(gram_sent)
        # Collapse a trailing ".." before re-appending a single "."
        if f_output[-1] == '.' and f_output[-2] == '.':
            f_output = f_output[:-2]
        f_output = f_output + '.'
        f_output = self.remove_trailing_dots(f_output)
        f_output = f_output.replace('..', '.')
        return f_output
    else:
        return sent
def prendi(request):
    start_time = time.time()
    testo = request.POST.get('testT', None)
    lingua = detect(testo)  # detect the language of the input text
    splitter = SentenceSplitter(language=lingua)  # split the text according to the detected language
    testoEm = splitter.split(text=testo)  # split the text into sentences
    message_embeddings = embed(testoEm)  # embed the split text
    fileSvm = open(
        "/home/angela/PycharmProjects/ServerDjangoGit/ServerDjangoProva/SVMAll.pickle",
        'rb')  # open the SVM classifier file
    svm = pickle.load(fileSvm)  # load the SVM classifier
    preSvm = svm.predict(
        message_embeddings)  # use the SVM to detect which category each split of the text belongs to
    fileRf = open(
        "/home/angela/PycharmProjects/ServerDjangoGit/ServerDjangoProva/RFFireness.pickle",
        'rb')  # open the RF classifier file
    rf = pickle.load(fileRf)  # load the RF classifier
    preRf = rf.predict_proba(
        message_embeddings)  # use the RF to detect the fireness level each embedding belongs to
    ca = preSvm.tolist()  # list with the category of each split sentence
    mat = preRf.tolist()  # list with the fireness membership probabilities of each split sentence
    max = []
    perc = []
    temp_p = 0
    for i in range(len(mat)):  # find the highest fireness membership probability for each sentence
        massimo = mat[i][0]
        temp = 0
        for j in range(len(mat[i])):
            if mat[i][j] > massimo:
                massimo = mat[i][j]
                temp_p = massimo
                temp = j
        tras = temp_p * 100
        arr = math.trunc(tras)
        perc.append(arr)
        max.append(temp)  # store the class with the highest probability in this list
    fra = {  # JSON object with the data to return to the client
        'frase': testoEm,
        'cate': ca,
        'fair': max,
        'perc': perc
    }
    p_server = psutil.Process()  # current process (PID)
    ram = round(p_server.memory_percent(), 3)
    cpu = str(p_server.cpu_percent(interval=1.0))
    write_test(start_time, ram, cpu)
    fileSvm.close()  # close the SVM classifier file
    fileRf.close()  # close the RF classifier file
    return JsonResponse(fra)  # return the JSON response
def test_fr():
    splitter = SentenceSplitter(language='fr')
    input_text = 'Brookfield Office Properties Inc. (« BOPI »), dont les actifs liés aux immeubles directement...'
    expected_sentences = [
        input_text,
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def test_en_numeric_only():
    splitter = SentenceSplitter(language='en')
    input_text = 'Hello. No. 1. No. 2. Prefix. 1. Prefix. 2. Good bye.'
    expected_sentences = [
        'Hello.', 'No. 1.', 'No. 2.', 'Prefix.', '1.', 'Prefix.', '2.', 'Good bye.'
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def test_de():
    splitter = SentenceSplitter(language='de')
    input_text = 'Nie hätte das passieren sollen. Dr. Soltan sagte: "Der Fluxcompensator war doch kalibriert!".'
    expected_sentences = [
        'Nie hätte das passieren sollen.',
        'Dr. Soltan sagte: "Der Fluxcompensator war doch kalibriert!".',
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def test_pt():
    splitter = SentenceSplitter(language='pt')
    input_text = 'Isto é um parágrafo. Contém várias frases. «Mas porquê,» perguntas tu?'
    expected_sentences = [
        "Isto é um parágrafo.",
        "Contém várias frases.",
        "«Mas porquê,» perguntas tu?",
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def split_sents(text, lang):
    if lang in LANG.SPLITTER:
        if lang == 'zh':
            sents = _split_zh(text)
        else:
            splitter = SentenceSplitter(language=lang)
            sents = splitter.split(text=text)
            sents = [sent.strip() for sent in sents]
        return sents
    else:
        raise Exception('The language {} is not supported yet.'.format(LANG.ISO[lang]))
def tokenize(self):
    sentence_splitter = SentenceSplitter(language='en')
    for i, review in enumerate(self.reviews):
        text = review.text
        sentences = sentence_splitter.split(text)
        for sentence in sentences:
            tokenized_sentence = []
            words_borders = list(WordPunctTokenizer().span_tokenize(sentence))
            for word_begin, word_end in words_borders:
                word_text = sentence[word_begin:word_end]
                word = Word(word_text, word_begin, word_end)
                tokenized_sentence.append(word)
            self.reviews[i].sentences.append(tokenized_sentence)
def test_es():
    splitter = SentenceSplitter(language='es')
    input_text = (
        'La UE ofrece una gran variedad de empleos en un entorno multinacional y multilingüe. La Oficina Europea de '
        'Selección de Personal (EPSO) se ocupa de la contratación, sobre todo mediante oposiciones generales.'
    )
    expected_sentences = [
        'La UE ofrece una gran variedad de empleos en un entorno multinacional y multilingüe.',
        ('La Oficina Europea de Selección de Personal (EPSO) se ocupa de la contratación, sobre todo mediante '
         'oposiciones generales.'),
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def test_el():
    splitter = SentenceSplitter(language='el')
    input_text = (
        'Όλα τα συστήματα ανώτατης εκπαίδευσης σχεδιάζονται σε εθνικό επίπεδο. Η ΕΕ αναλαμβάνει κυρίως να συμβάλει '
        'στη βελτίωση της συγκρισιμότητας μεταξύ των διάφορων συστημάτων και να βοηθά φοιτητές και καθηγητές να '
        'μετακινούνται με ευκολία μεταξύ των συστημάτων των κρατών μελών.')
    expected_sentences = [
        'Όλα τα συστήματα ανώτατης εκπαίδευσης σχεδιάζονται σε εθνικό επίπεδο.',
        ('Η ΕΕ αναλαμβάνει κυρίως να συμβάλει στη βελτίωση της συγκρισιμότητας μεταξύ των διάφορων συστημάτων '
         'και να βοηθά φοιτητές και καθηγητές να μετακινούνται με ευκολία μεταξύ των συστημάτων των κρατών '
         'μελών.'),
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def parse_paragraphs(record):
    """parse paragraphs into sentences, returns list"""
    from sentence_splitter import SentenceSplitter

    splitter = SentenceSplitter(language='en')
    sentences = splitter.split(record['value'])
    article_id = remove_prefix(record['key'], 'paragraphs:')
    pre = 'sentence:' + article_id
    l = [{
        'key': f'{pre}',
        'idx': f'{idx}',
        'value': sentence
    } for idx, sentence in enumerate(sentences)]
    return l
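# Hedged usage sketch for parse_paragraphs above (illustrative record, not taken from the source;
# remove_prefix is assumed to strip the given prefix from the key):
#
#   parse_paragraphs({'key': 'paragraphs:42', 'value': 'One. Two.'})
#   # -> [{'key': 'sentence:42', 'idx': '0', 'value': 'One.'},
#   #     {'key': 'sentence:42', 'idx': '1', 'value': 'Two.'}]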
def __init__(self, term_dictionary, language_code, language_name):
    self._term_dictionary = term_dictionary
    self._language_code = language_code
    self._language_name = language_name

    # Prefer an NLTK Punkt tokenizer, then sentence_splitter, then a naive regex fallback
    if self._language_code in punkt_tokenizers:
        splitter = nltk.data.load("tokenizers/punkt/%s" % punkt_tokenizers[self._language_code])
        self.sent_split = splitter.tokenize
    elif self._language_code in splitter_sent_tok:
        splitter = SentenceSplitter(language=self._language_code)
        self.sent_split = splitter.split
    else:
        # If nothing works, use naive sentence splitter
        self.sent_split = partial(
            re.split, r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')

    self._lemmas = set(term_dictionary.values())

    if self._language_code in nltk_stopwords:
        self._stopwords = stopwords.words(nltk_stopwords[self._language_code])
    else:
        print("No stopwords:", self._language_code)

    # Replace punctuation with whitespace
    self.remove_punctuation = partial(regex.sub, r'[\p{P}]+', ' ')

    self.pyphen_dic = pyphen.Pyphen(lang=pyphen_dicts[self._language_code])
def split_to_sentences(text, target_lang='es'):
    '''
    DESCRIPTION: Split text into sentences.

    Parameters
    ----------
    text : str
        String with the entire document.
    target_lang : str, optional
        Language code passed to SentenceSplitter (default 'es').

    Returns
    -------
    sentences : list of str
        List with the sentences of the document.
    '''
    splitter = SentenceSplitter(language=target_lang)
    return splitter.split(text)
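# Hedged usage sketch for split_to_sentences above (illustrative values, not taken from the source);
# the function simply delegates to SentenceSplitter for the given language code:
#
#   sentences = split_to_sentences('Primera frase. Segunda frase.', target_lang='es')
#   # -> ['Primera frase.', 'Segunda frase.']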
def test_en():
    splitter = SentenceSplitter(language='en')

    input_text = 'This is a paragraph. It contains several sentences. "But why," you ask?'
    expected_sentences = [
        'This is a paragraph.', 'It contains several sentences.', '"But why," you ask?'
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences

    input_text = 'Hey! Now.'
    expected_sentences = ['Hey!', 'Now.']
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences

    input_text = 'Hey... Now.'
    expected_sentences = ['Hey...', 'Now.']
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences

    input_text = 'Hey. Now.'
    expected_sentences = ['Hey.', 'Now.']
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def gramarize(self, sent):
    # Words/phrases that must never be sent to the grammar checker
    anti_grammar_words = []
    with open('Value-json/anti_grammar.txt') as file:
        for lines in file.read().splitlines():
            lines = lines.strip()
            anti_grammar_words.append(lines)
    with open('Value-json/logic_activation.json') as f:
        activation = json.load(f)
    if activation['grammar_logic'] == "active":
        test_str = sent
        splitter = SentenceSplitter(language='en')
        sente = splitter.split(text=test_str)
        gram_sent = []
        for sent in sente:
            parser = GingerIt()
            # Skip grammar correction for sentences that contain an anti-grammar word
            ani = False
            for ani_words in anti_grammar_words:
                if ani_words in sent:
                    ani = True
                    break
            if not ani:
                output = parser.parse(sent)
                output_1 = output.get("result")
            else:
                output_1 = sent
            gram_sent.append(output_1)
        f_output = ' '.join(gram_sent)
        f_output = f_output + '.'
        f_output = self.remove_trailing_dots(f_output)
        f_output = f_output.replace('..', '.')
        return f_output
    else:
        return sent
class ExtractSentences(jsonql.Transformer):
    def __init__(
        self,
        sp_model: Path,
        lm_model: Path,
        field: str = "raw_content",
        threshold: float = float("+inf"),
    ):
        super().__init__()
        self.sp_model = sp_model
        self.lm_model = lm_model
        self.field = field
        self.threshold = threshold
        self.sp: SentencePieceProcessor = None
        self.lm: KenlmModel = None
        self.splitter: SentenceSplitter = None
        self.hashes: Set[int] = set()

    def _prepare(self):
        self.sp = SentencePieceProcessor()
        self.sp.load(str(self.sp_model))
        self.splitter = SentenceSplitter("en")
        self.lm = KenlmModel(str(self.lm_model))

    def do(self, document: dict) -> Optional[str]:
        content: Optional[str] = document.get(self.field)
        if not content:
            return None
        # Split every non-empty line of the document into sentences
        all_sentences = [
            s for l in content.split("\n") if l for s in self.splitter.split(text=l)
        ]
        # Deduplicate sentences across documents by hash
        unique_sentences = []
        for s in all_sentences:
            if not s:
                continue
            h = dedup.str_hash(s)
            if h in self.hashes:
                continue
            self.hashes.add(h)
            unique_sentences.append(s)

        # Score each sentence with the language model (perplexity over SentencePiece tokens)
        scores = []
        for sentence in unique_sentences:
            normalized = text_normalizer.normalize(sentence)
            pieces = self.sp.encode_as_pieces(normalized)
            log_score = self.lm.score(" ".join(pieces))
            pp = -1
            if len(pieces):
                pp = perplexity.pp(log_score, len(pieces))
            scores.append(pp)

        # Keep sentences whose perplexity is positive and below the threshold
        res = filter(
            lambda pp_s: self.threshold > pp_s[0] > 0,
            zip(scores, unique_sentences),
        )
        return "\n".join(f"{pp}\t{s}" for (pp, s) in res) or None
def test_custom_non_breaking_prefixes():
    with tempfile.NamedTemporaryFile(mode='w+') as f:
        f.write(("# \n"
                 "# Temporary prefix file\n"
                 "# \n"
                 "\n"
                 "Prefix1\n"
                 "Prefix2\n"))
        f.flush()

        splitter = SentenceSplitter(language='xx', non_breaking_prefix_file=f.name)
        input_text = "Hello. Prefix1. Prefix2. Hello again. Good bye."
        expected_sentences = [
            'Hello.',
            'Prefix1. Prefix2. Hello again.',
            'Good bye.',
        ]
        actual_sentences = splitter.split(text=input_text)
        assert expected_sentences == actual_sentences
def tokenize(self):
    sentence_splitter = SentenceSplitter(language='ru')
    for i, review in enumerate(self.reviews):
        text = review.text
        sentences = sentence_splitter.split(text)
        words_borders = list(WordPunctTokenizer().span_tokenize(text))
        for sentence in sentences:
            tokenized_sentence = []
            sentence_begin = text.find(sentence)
            sentence_end = sentence_begin + len(sentence)
            for word_begin, word_end in words_borders:
                if word_begin >= sentence_begin and word_end <= sentence_end:
                    word_text = text[word_begin:word_end]
                    word = Word(word_text, word_begin, word_end)
                    # Link the word to every aspect opinion whose span covers it
                    for opinion in review.aspects:
                        if word.begin >= opinion.begin and word.end <= opinion.end:
                            word.add_opinion(opinion)
                            opinion.words.append(word)
                    tokenized_sentence.append(word)
            self.reviews[i].sentences.append(tokenized_sentence)
class PipelineSyntaxNet(object):
    def __init__(self, host, port):
        self.word_tokeniser_ = create_tokenizer_ru()
        self.sent_splitter_ = SentenceSplitter()
        self.syntaxnet_parser_ = ProcessorSyntaxNet(host, port)

    def process(self, text, raw_output=False):
        tokens = list(self.word_tokeniser_.span_tokenize(text))
        sents = self.sent_splitter_.process(text, tokens)
        trees = self.syntaxnet_parser_.parse(text, sents, raw_output=raw_output)
        return trees
class Embedder:
    def __init__(self):
        self.model = SentenceTransformer("LaBSE")
        self.en_sent_splitter = SentenceSplitter(language="en")

    def encode(self, text, lang):
        sentences = None
        if lang == "en":
            sentences = self.en_sent_splitter.split(text)
        elif lang == "ne":
            sentences = sentence_tokenize.sentence_split(text, "ne")
        filtered_sentences = [
            sentence for sentence in sentences
            if len(sentence.split()) > 3 and detect(sentence) == lang
        ]
        return filtered_sentences, self.model.encode(filtered_sentences)
def __process_line(line: str, output_file: TextIO, sentence_splitter: SentenceSplitter,
                   morph_predictor: RNNMorphPredictor):
    sentences = sentence_splitter.split(line)
    for sentence in sentences:
        words = [
            token.text for token in Tokenizer.tokenize(sentence)
            if token.text != '' and token.token_type != Token.TokenType.SPACE
        ]
        if not words:
            continue
        forms = morph_predictor.predict_sentence_tags(words)
        for form in forms:
            if form.pos == "PUNCT":
                continue
            output_file.write(
                "%s\t%s\t%s\t%s\n" % (form.word, form.normal_form, form.pos, form.tag))
        output_file.write("\n")
def get_morph_markup(input_filenames: List[str], output_filename: str):
    """
    Markup by grammatical values (morphological markup).

    :param input_filenames: input text files
    :param output_filename: path to the file where the markup will be saved
    """
    if os.path.exists(output_filename):
        os.remove(output_filename)
    sentence_splitter = SentenceSplitter(language='ru')
    morph_predictor = RNNMorphPredictor()
    for filename in input_filenames:
        with open(filename, "r", encoding="utf-8") as r, \
                open(output_filename, "w+", encoding="utf-8") as w:
            for line in r:
                Morph.__process_line(line, w, sentence_splitter, morph_predictor)
def __init__(self, tokenizer: PreTrainedTokenizer, args, dir_path: str, block_size=1024):
    self.examples = []
    tokenizer_class = tokenizer.__class__.__name__
    cached_features_file = os.path.join(
        dir_path,
        args.model_type + "_cached2_maskedsents3_" + str(block_size) + "_" + tokenizer_class)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, "rb") as handle:
            self.examples = pickle.load(handle)
    else:
        logger.info("Creating features from dataset file at %s", dir_path)
        good_docs = bad_docs = 0
        for filename in os.listdir(dir_path):
            try:
                if not filename.endswith(".json"):
                    continue
                path = os.path.join(dir_path, filename)
                with open(path) as json_file:
                    data = json.load(json_file)
                facts_doc = FactsDoc.Schema().load(data)

                # Pick two anchor sentences and treat the span between them as the masked region
                splitter = SentenceSplitter(language='en')
                full_text_sentence_split = splitter.split(text=facts_doc.text)
                sent_one = full_text_sentence_split[START_SENT]
                sent_two = full_text_sentence_split[END_SENT]
                inbetween_text = " ".join(full_text_sentence_split[START_SENT + 1:END_SENT])

                tokenized_sent_one = tokenizer.encode(
                    sent_one, add_special_tokens=False, return_tensors="pt").squeeze(0)
                tokenized_sent_two = tokenizer.encode(
                    sent_two, add_special_tokens=False, return_tensors="pt").squeeze(0)
                tokenized_inbetween_text = tokenizer.encode(
                    inbetween_text, add_special_tokens=False, return_tensors="pt").squeeze(0)

                full_text_tensor = torch.cat(
                    [tokenized_sent_one, tokenized_inbetween_text, tokenized_sent_two], dim=0)
                # Mask is 1 for the two anchor sentences and 0 for the text in between
                mask = torch.cat([
                    torch.ones(tokenized_sent_one.size()),
                    torch.zeros(tokenized_inbetween_text.size()),
                    torch.ones(tokenized_sent_two.size())
                ])
                self.examples.append((full_text_tensor, mask))
                good_docs += 1
            except Exception:
                bad_docs += 1
        logger.info("finished creating examples for " + dir_path)
        logger.info(f"docs with exceptions = {bad_docs} from a total of {bad_docs + good_docs}")
        logger.info("Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
import re

from pyvi import ViTokenizer
from joblib import load
from sentence_splitter import SentenceSplitter

splitter = SentenceSplitter(language='en')

from cfg.config import SENTIMENT_MODEL_PATH

"""
date format: d/m/y
date_range format: (d/m/y, d/m/y)
check if date is in the date_range
"""


def date_in_range(date, date_range):
    try:
        if date and date_range:
            date_start = date_range[0]
            date_end = date_range[1]
            date_tuple = [int(elem) for elem in reversed(date.split("/"))]
            date_start_tuple = [
                int(elem) for elem in reversed(date_start.split("/"))
            ]
            date_end_tuple = [
                int(elem) for elem in reversed(date_end.split("/"))
            ]
            return date_start_tuple < date_tuple < date_end_tuple
        else:
            return True
    except Exception as e:
        print("Error checking date in range " + str(e))
        return True
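# Hedged usage sketch for date_in_range above (illustrative values, not taken from the source).
# Dates are compared as [year, month, day] lists, so the check is a strict lexicographic range test:
#
#   date_in_range("15/6/2021", ("1/1/2021", "31/12/2021"))  # -> True
#   date_in_range("15/6/2022", ("1/1/2021", "31/12/2021"))  # -> False
#   date_in_range(None, ("1/1/2021", "31/12/2021"))         # -> True (missing date falls back to True)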
def test_invalid_language_code():
    """Invalid language code."""
    with pytest.raises(SentenceSplitterException):
        SentenceSplitter(language='/etc/passwd')
def split_text_to_sentences(self, text: str) -> List[str]:
    """Splits text into sentences with the "sentence_splitter" module.

    Language code will be read from the language_code() method."""
    text = decode_object_from_bytes_if_needed(text)

    language_code = self.language_code()

    if self.__sentence_splitter is None:
        try:
            self.__sentence_splitter = SentenceSplitter(language=language_code)
        except Exception as ex:
            raise McLanguageException(
                "Unable to initialize sentence splitter for language '%s': %s" % (language_code, str(ex),)
            )

    if text is None:
        log.warning("Text is None.")
        return []

    # Sentence tokenizer can hang for a very long time on very long text, and anything greater than 1 MB is more
    # likely to be an artifact than actual text
    if len(text) > self.__MAX_TEXT_LENGTH:
        text = text[:self.__MAX_TEXT_LENGTH]

    # Only "\n\n" (not a single "\n") denotes the end of a sentence, so remove single line breaks
    text = re.sub('([^\n])\n([^\n])', r"\1 \2", text, flags=re.DOTALL)

    # Remove asterisks from lists
    text = re.sub(r" {2}\*", " ", text, flags=re.DOTALL)
    text = re.sub(r"\n\s\*\n", "\n\n", text, flags=re.DOTALL)
    text = re.sub(r"\n\n\n\*", "\n\n", text, flags=re.DOTALL)
    text = re.sub(r"\n\n", "\n", text, flags=re.DOTALL)

    # Replace tabs with spaces
    text = re.sub(r"\t", " ", text, flags=re.DOTALL)

    # Replace non-breaking spaces with normal spaces
    text = re.sub(r"\xa0", " ", text, flags=re.DOTALL)

    # Replace multiple spaces with a single space
    text = re.sub(" +", " ", text, flags=re.DOTALL)

    # The above regexp and HTML stripping often leave a space before the period at the end of a sentence
    text = re.sub(r" +\.", ".", text, flags=re.DOTALL)

    # We see lots of cases of missing spaces after sentence-ending periods (has a hardcoded lower limit of
    # characters because otherwise it breaks Portuguese "a.C.." abbreviations and such)
    text = re.sub(r"([a-z]{2,})\.([A-Z][a-z]+)", r"\1. \2", text, flags=re.DOTALL)

    # Replace Unicode's "…" with "..."
    text = text.replace("…", "...")

    # Trim whitespace from start / end of the whole string
    text = text.strip()

    # FIXME: fix "bla bla... yada yada"? is it two sentences?
    # FIXME: fix "text . . some more text."?

    if len(text) == 0:
        log.debug("Text is empty after processing it.")
        return []

    # Split to sentences
    sentences = self.__sentence_splitter.split(text=text)

    # Trim whitespace from start / end of each of the sentences
    non_empty_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if len(sentence) > 0:
            non_empty_sentences.append(sentence)

    return non_empty_sentences
def test_text_empty():
    """Text is empty."""
    splitter = SentenceSplitter(language='en')
    assert splitter.split(text='') == []