def split_to_sent_array(self, text, lang):
    spm_limit = self.spm_limit
    spm_processor = self.spm_processor
    _ = "▁"  # words in sentencepieces start with this special unicode underscore

    def decode(x):
        """Convert a sequence of sentencepieces back to the original string."""
        return "".join(x).replace(_, " ")

    def limit_sp(n, s):
        """n: take at most the first n sentencepieces.
        Rather than splitting inside a word, take fewer sentencepieces.
        s: sequence of sentencepieces
        """
        n -= 1
        while 0 < n < len(s) - 1 and not s[n + 1].startswith(_):
            n -= 1
        return s[:n + 1]

    sent_array = []
    for sent in split_text_into_sentences(text=text, language=lang):
        sp_sent = spm_processor.EncodeAsPieces(sent)
        # split into chunks of at most spm_limit (default 100) subwords
        while len(sp_sent) > spm_limit:
            part = limit_sp(spm_limit, sp_sent)
            sent_array.append(decode(part))
            sp_sent = sp_sent[len(part):]
        sent_array.append(decode(sp_sent))
    # print(len(sent_array), [len(x) for x in sent_array],
    #       [len(spm_processor.EncodeAsPieces(x)) for x in sent_array])
    return sent_array
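# A minimal sketch of how the method above might be exercised outside its
# original class. The model path "spm.model" and the SimpleNamespace stand-in
# for `self` are illustrative assumptions; any trained SentencePiece model and
# any object exposing .spm_limit and .spm_processor would do.
import sentencepiece as spm
from types import SimpleNamespace

processor = spm.SentencePieceProcessor()
processor.Load("spm.model")  # placeholder path to a trained SentencePiece model

self_like = SimpleNamespace(spm_limit=100, spm_processor=processor)
for chunk in split_to_sent_array(self_like, "Some long paragraph of text ...", lang="en"):
    print(chunk)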
def on_request():
    if request.method == 'POST':
        data = request.get_json(force=True)
        source_text = preprocess(data.get("text"))
        source_lang = data.get("lang")
    else:
        source_text = request.args.get('text')
        source_lang = request.args.get('lang')

    if not source_text or not source_lang:
        return "Please provide the following parameters: text, lang", 400

    source_sentences = split_text_into_sentences(source_text, language=source_lang)

    # translate each sentence individually
    target_sentences = []
    for source_sent in source_sentences:
        target_sent = translate(source_sent, source_lang)
        target_sentences.append(target_sent)

    # merge the translated sentences per target language
    paraphrases = {}
    for language in LANGUAGES:
        paraphrase_in_lang = [para[language] for para in target_sentences]
        paraphrases[language] = ' '.join(paraphrase_in_lang)
    return jsonify(paraphrases)
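# A hedged client-side sketch for the handler above, assuming the Flask app
# registers on_request() at a /paraphrase route on localhost:5000; both the
# route and the port are assumptions for illustration only.
import requests

resp = requests.post(
    "http://localhost:5000/paraphrase",
    json={"text": "This is a paragraph. It contains several sentences.", "lang": "en"},
)
print(resp.json())  # one merged paraphrase per language code in LANGUAGES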
def seg_text(
    text: str,
    lang: Optional[str] = None,
    qmode: bool = False,
    maxlines: int = 1000,
) -> List[str]:
    """Split text into sentences.

    Use sentence_splitter if the language is supported,
    else use polyglot.text.Text.sentences.

    qmode: skip split_text_into_sentences if True, default False.
        Vectors for all books are based on qmode=False;
        qmode=True is for quick tests only.
    maxlines: threshold (default 1000) for turning on the tqdm progress bar;
        set it to < 1 or to a large number to turn the bar off.
    """
    if lang is None:
        try:
            lang = Detector(text).language.code
        except Exception as exc:
            logger.warning("polyglot.text.Detector exc: %s, setting to 'en'", exc)
            lang = "en"

    if not qmode and lang in LANG_S:
        _ = []
        lines = text.splitlines()
        # if maxlines > 1 and len(lines) > maxlines:
        if len(lines) > maxlines > 1:
            for para in tqdm(lines):
                if para.strip():
                    _.extend(split_text_into_sentences(para, lang))
        else:
            for para in lines:
                if para.strip():
                    _.extend(split_text_into_sentences(para, lang))
        return _
        # return split_text_into_sentences(text, lang)

    return [elm.string for elm in Text(text, lang).sentences]
def test_split_text_into_sentences():
    input_text = 'This is a paragraph. It contains several sentences. "But why," you ask?'
    expected_sentences = [
        'This is a paragraph.',
        'It contains several sentences.',
        '"But why," you ask?',
    ]
    actual_sentences = split_text_into_sentences(text=input_text, language='en')
    assert expected_sentences == actual_sentences
def seg_text(text: str, lang: str) -> List[str]:
    """Split text into sentences.

    Use sentence_splitter if the language is supported,
    else use polyglot.text.Text.
    """
    if lang in LANG_S:
        return split_text_into_sentences(text, lang)
    return [elm.string for elm in Text(text, lang).sentences]
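# The seg_text variants above gate on LANG_S, the set of language codes that
# sentence_splitter supports. One plausible way to define it; the concrete set
# below is an assumption, so verify it against the languages your installed
# sentence_splitter version actually ships non-breaking prefixes for.
LANG_S = {
    "ca", "cs", "da", "de", "el", "en", "es", "fi", "fr", "hu", "is", "it",
    "lt", "lv", "nl", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sv", "tr",
}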
def default_sentence_splitter(text, do_lower_case=True):
    split_sentences = split_text_into_sentences(
        text=text,
        language='en',
        non_breaking_prefix_file=pkg_resources.resource_filename(
            __name__, 'resource/custom_english_non_breaking_prefixes.txt'))
    if do_lower_case:
        return [i.lower() for i in split_sentences if i.strip() != '']
    return [i for i in split_sentences if i.strip() != '']
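# The non_breaking_prefix_file argument points at a plain-text list of
# abbreviations that must not end a sentence (the Moses non-breaking-prefix
# convention: one prefix per line, lines starting with # are comments). A small
# sketch with a throwaway file; the prefixes below are only examples.
import tempfile

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
    tmp.write("# custom abbreviations\nDr\nProf\nApprox\n")

print(split_text_into_sentences(
    text="Dr. Smith arrived. Prof. Jones left.",
    language="en",
    non_breaking_prefix_file=tmp.name,
))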
def split_into_sentences(content, input_language):
    """Task that takes an input text and splits it into sentences,
    using the provided input_language.
    """
    content = " ".join(content)
    sentences = split_text_into_sentences(text=content, language=input_language)
    for sentence in sentences:
        if sentence:
            print(sentence)
def seg_text(text: str, lang: Optional[str] = None) -> List[str]:
    """Split text into sentences.

    Use sentence_splitter if the language is supported,
    else use polyglot.text.Text.
    """
    if lang is None:
        # detect the language of the input text itself
        lang = Detector(text).language.code
    if lang in LANG_S:
        return split_text_into_sentences(text, lang)
    return [elm.string for elm in Text(text, lang).sentences]
def tokenize_text(text_lines):
    sentences_tokens = []
    if not isinstance(text_lines, list):
        text_lines = [text_lines]
    for line in text_lines:
        sentences = split_text_into_sentences(line, language="fr")
        for sentence in sentences:
            tokens = MOSES_TOKENIZER.tokenize(
                sentence, aggressive_dash_splits=True, escape=False)
            sentences_tokens.append(tokens)
    return sentences_tokens
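# MOSES_TOKENIZER is not defined in the snippet above. One plausible
# definition, assuming the sacremoses package; the French code matches the
# hard-coded language="fr" used for sentence splitting.
from sacremoses import MosesTokenizer

MOSES_TOKENIZER = MosesTokenizer(lang="fr")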
def _count_it_up(self):
    sentences = split_text_into_sentences(text=self.text, language=self.language)
    sentence_count = len(sentences)

    words = []
    for sentence in sentences:
        # strip punctuation before counting words
        sentence_stripped = sentence.translate(str.maketrans('', '', string.punctuation))
        words += sentence_stripped.split()

    long_words_count = 0
    for word in words:
        if len(word) >= 7:
            long_words_count += 1

    self.word_count = len(words)
    self._sentence_count = sentence_count
    self._long_words_count = long_words_count
def split_to_sent_array(self, text, lang):
    charlimit = self.sent_chars_limit
    sent_array = []
    for sent in split_text_into_sentences(text=text, language=lang):
        while len(sent) > charlimit:
            try:
                # When sent starts with a space, sent[0:0] used to be an empty
                # string, which caused an infinite loop; skipping the leading
                # spaces before searching fixes that.
                beg = 0
                while sent[beg] == ' ':
                    beg += 1
                last_space_idx = sent.rindex(" ", beg, charlimit)
                sent_array.append(sent[0:last_space_idx])
                sent = sent[last_space_idx:]
            except ValueError:
                # raised if rindex finds no space inside the limit
                sent_array.append(sent[0:charlimit])
                sent = sent[charlimit:]
        sent_array.append(sent)
    # print(len(sent_array), [len(x) for x in sent_array])
    return sent_array
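# A quick sanity check of the character-limit variant above, again using a
# SimpleNamespace in place of the real object; the 40-character limit is an
# arbitrary value chosen to force a split on a short example.
from types import SimpleNamespace

cfg = SimpleNamespace(sent_chars_limit=40)
long_text = "This single sentence is deliberately written to run well past forty characters."
for piece in split_to_sent_array(cfg, long_text, lang="en"):
    print(repr(piece))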
def parse_article(url):
    text = get_article(url)
    summary = summarizer.summarize(text)
    lang = detect(text)
    print(f'LANG: {lang}')

    sentences = split_text_into_sentences(text=text, language=lang)
    sentences = [s for s in sentences if s.strip()]

    tagger = pos_taggers[lang]
    words, markups = tagger(sentences)

    _keywords = keywords.keywords(text).split("\n")
    _keywords = [k for k in _keywords if not is_stop_word(k)]
    markups["KEYWORD"] = _keywords
    return words, markups, summary
def breaksetences(text):
    content['sentences'] = split_text_into_sentences(
        text=text,
        language="pt",
    )
    return content['sentences']
from sentence_splitter import SentenceSplitter, split_text_into_sentences

#
# Object interface
#
splitter = SentenceSplitter(language='tr')

with open('test.txt', 'r', encoding="utf8") as file:
    text = file.read().replace('\n', ' ').replace('\r', '')

# print(text)
# print(splitter.split(text=text))
# ['This is a paragraph.', 'It contains several sentences.', '"But why," you ask?']

#
# Functional interface
#
'''
print(split_text_into_sentences(
    text=text,
    language='tr'
))
'''

sentences = split_text_into_sentences(text=text, language='tr')

with open('output.txt', 'w', encoding="utf8") as f:
    for item in sentences:
        f.write("%s\n" % item)
import requests
            return
        yield start
        start += len(sub)


sentences = find_sentences(text)

for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

from sentence_splitter import SentenceSplitter, split_text_into_sentences

splitter = SentenceSplitter(
    language='ro',
    non_breaking_prefix_file='D:/BusuiocI/Downloads/ro.txt')
sentences = splitter.split(text=textwithoutdiacritics)

sentences2 = split_text_into_sentences(
    text=text,
    language='ro',
    non_breaking_prefix_file='D:/BusuiocI/Downloads/ro.txt')


def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + str(ent.start_char) + ' - ' + str(ent.end_char)
                  + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')


show_ents(doc)
import wikipedia
import re

from sentence_splitter import split_text_into_sentences

term = 'Steve Jobs'
summary = wikipedia.summary(term, sentences=7)
summary = re.sub(r"\([^)]*\)", "", summary)

result = split_text_into_sentences(
    text=summary,
    language="en",
)

subtitles_template = """
1
00:00:00,000 --> 00:00:10,000
{0}

2
00:00:10,000 --> 00:00:20,000
{1}

3
00:00:20,000 --> 00:00:30,000
{2}

4
00:00:30,000 --> 00:00:40,000