def test_split_sentences():
    """Check sentence splitting for string input, generator input and the normalize flag."""
    # A single string containing embedded newlines
    text = (
        "3.janúar sl. keypti ég 64kWst rafbíl. Hann kostaði € 30.000. \n"
        "200.000 manns mótmæltu.\n"
        "Hér byrjar ný setning"
    )
    assert list(t.split_into_sentences(text)) == [
        "3. janúar sl. keypti ég 64kWst rafbíl .",
        "Hann kostaði €30.000 .",
        "200.000 manns mótmæltu .",
        "Hér byrjar ný setning",
    ]

    # The same material fed line by line through a generator; the empty
    # line is what separates the last two sentences here
    lines = (
        "3.janúar sl. keypti ég 64kWst rafbíl. Hann kostaði € 30.000. \n",
        "200.000 manns mótmæltu\n",
        "\n",
        "Hér byrjar ný setning\n",
    )
    assert list(t.split_into_sentences(line for line in lines)) == [
        "3. janúar sl. keypti ég 64kWst rafbíl .",
        "Hann kostaði €30.000 .",
        "200.000 manns mótmæltu",
        "Hér byrjar ný setning",
    ]

    # normalize=True rewrites the quotation marks, normalize=False keeps them
    quoted = (
        "Hún sagði: \"Þú ert leiðinlegur\"! Hann svaraði engu -- "
        "en hætti við ferðina. \n"
    )
    assert list(t.split_into_sentences(quoted, normalize=True)) == [
        "Hún sagði : „ Þú ert leiðinlegur “ !",
        "Hann svaraði engu - - en hætti við ferðina .",
    ]
    assert list(t.split_into_sentences(quoted, normalize=False)) == [
        "Hún sagði : \" Þú ert leiðinlegur \" !",
        "Hann svaraði engu - - en hætti við ferðina .",
    ]
def clean_pg(self, pg):
    """Return the paragraph *pg* with already-recorded sentence runs removed.

    The paragraph is split into sentences and only those accepted by
    ``self.check_sentence`` are kept. The kept sentences are scanned from
    the start; at each position windows of ``self.max_lines`` down to
    ``self.min_lines`` sentences are joined with a space and hashed with
    ``self.hash``. A window whose hash is in ``self.line_hashes[size]`` is
    skipped. The sentence at the resulting position is then emitted.

    If anything is emitted, all kept sentences are passed to
    ``self.add_pg_to_line_hashes``. The result is the emitted sentences
    joined by newlines.
    """
    kept = [sent for sent in split_into_sentences(pg) if self.check_sentence(sent)]
    total = len(kept)
    fresh = []
    pos = 0
    while pos < total - self.min_lines + 1:
        # Try the widest window first so the longest known run is skipped
        for width in reversed(range(self.min_lines, self.max_lines + 1)):
            end = pos + width
            if end <= total:
                digest = self.hash(" ".join(kept[pos:end]))
                if digest in self.line_hashes[width]:
                    pos = end
                    break
        if pos < total:
            fresh.append(kept[pos])
            pos += 1
    # Fewer than min_lines sentences remain: they cannot form a window
    fresh.extend(kept[pos:])
    if fresh:
        self.add_pg_to_line_hashes(kept)
    return "\n".join(fresh)
def predict(self, text, summary_length, sentence_limit=None):
    """Return a summary of *text* as selected sentences joined by spaces.

    The stripped text is split into sentences, each sentence into
    whitespace tokens, and ``detokenize`` produces the sentence strings
    used as model input. The model is run over a loader with batch size 1
    under ``torch.no_grad()``; the first element of its last prediction is
    taken as the set of chosen sentences.

    ``summary_length`` is forwarded as ``max_length`` to the model and
    ``sentence_limit`` is forwarded to ``SumDataset``.
    """
    text = text.strip()
    sentences = list(tokenizer.split_into_sentences(text))
    tokenized = [sentence.split() for sentence in sentences]
    detokenized = detokenize(text.split(), tokenized)

    # One input record per sentence, numbered from 1
    doc_inputs = []
    for num, (sentence, tokens) in enumerate(zip(detokenized, tokenized), 1):
        doc_inputs.append({
            "text": sentence,
            "sentence_id": num,
            "pos": [],
            "word_count": len(tokens),
            "tokens": tokens
        })

    doc = {"id": "doc", "inputs": doc_inputs}
    data = SumDataset(self.vocab, doc, sentence_limit=sentence_limit)
    loader = SumDataLoader(data, batch_size=1, num_workers=0)

    with torch.no_grad():
        for batch in loader:
            batch = batch.to(self.gpu)
            texts = self.model.predict(batch, max_length=summary_length)

    # Emit the chosen sentences in document order, not prediction order
    chosen = set(texts[0])
    ordered = [sentence for sentence in detokenized if sentence in chosen]
    return " ".join(ordered)
def is_tok(line: str, tokenizer: str, model="") -> List[str]:
    """Tokenize *line* with the tokenizer selected by name.

    ``None`` or ``""`` uses ``mideind_tok.split_into_sentences`` and splits
    each sentence on single spaces; ``'moses'`` and ``'bpe'`` use the
    lazily loaded tokenizers for language ``'is'`` (*model* is passed to
    the BPE loader only).

    Raises:
        ValueError: if *tokenizer* is any other value.
    """
    if tokenizer in (None, ""):
        pieces = []
        for sent in mideind_tok.split_into_sentences(line):
            pieces.extend(sent.split(' '))
        return pieces
    if tokenizer == 'moses':
        moses = _lazy_load_moses_tokenizer('is')
        return moses.tokenize(line, escape=False)
    if tokenizer == 'bpe':
        bpe = _lazy_load_bpe_tokenizer('is', model)
        return bpe.EncodeAsPieces(line)
    raise ValueError(f'Unknown tokenizer={tokenizer}')
def createArticle(text):
    """Split *text* into sentences and pick a random sentence index.

    Returns:
        A tuple ``(article, intnum)``. ``article`` is the list of sentences
        produced by ``tokenizer.split_into_sentences``. ``intnum`` is drawn
        with ``rd.randrange`` from the range ending before
        ``len(article) // 2``; the range starts at ``len(article) // 3``
        (or 0 for articles of at most three sentences), moved two
        positions earlier when that stays above 0.

        For an article with fewer than two sentences ``intnum`` is 0.
        Previously ``intnum`` was never assigned in that case and the
        return statement raised ``UnboundLocalError``.
    """
    article = list(tokenizer.split_into_sentences(text))
    count = len(article)

    if count <= 1:
        # Nothing to choose from; do not touch the random generator
        return article, 0

    index = count // 3 if count > 3 else 0
    # Widen the range downwards only when that keeps the start above 0
    start = index - 2 if index - 2 > 0 else index
    intnum = rd.randrange(start, count // 2)
    return article, intnum
def tokenization(tweet):
    """Yield one token list per sentence of *tweet*, without English stop words.

    Each sentence from ``split_into_sentences`` is split on whitespace and
    tokens found in ``stopwords.words('english')`` are dropped. Matching
    is case-sensitive, as before.

    The stop-word list is now fetched once, when iteration starts, and
    held in a set. It used to be fetched again for every token.
    """
    english_stopwords = set(stopwords.words('english'))
    for sentence in split_into_sentences(tweet):
        yield [w for w in sentence.split() if w not in english_stopwords]
def tag_simple(input, output, tagger):
    """Tag every non-blank line of a file and write the result next to it.

    Args:
        input: path of the text file to read.
        output: suffix appended to *input* to form the output path.
        tagger: object providing ``tag_sent(tokens)`` and ``vw.w2i``.

    Each non-blank line is turned into a token list: the items yielded by
    ``split_into_sentences`` when the module-level ``args.tokenize`` is
    set, otherwise the whitespace-split line. The pairs returned by
    ``tagger.tag_sent`` are written one per line as ``first<TAB>second``,
    followed by a blank line.
    """
    # The context manager closes the input even if reading fails
    with open(input, 'rt') as input_file:
        input_text = input_file.readlines()

    output_file = input + output
    with open(output_file, "w") as f:
        for raw_line in input_text:
            line = raw_line.strip()
            if line == '':
                continue

            if args.tokenize:
                simple_tokens = list(split_into_sentences(line))
            else:
                simple_tokens = line.split()

            # Guard the simple_tokens[0] access below against an empty list
            if not simple_tokens:
                continue

            if simple_tokens[0][0].isupper() and simple_tokens[0] not in tagger.vw.w2i:
                # NOTE(review): this reassembles the first token unchanged.
                # Presumably the first character was meant to be lowercased;
                # confirm before changing, behavior is kept as-is here.
                simple_tokens[0] = simple_tokens[0][0] + simple_tokens[0][1:]

            tagged = tagger.tag_sent(simple_tokens)
            f.write("\n".join([x[0] + "\t" + x[1] for x in tagged]) + '\n')
            f.write("\n")
def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    """Split *text* into sentences with the tokenizer configured for *lang*.

    Args:
        main: application object; this function reads
            ``main.settings_global``, ``main.settings_custom``,
            ``main.__dict__['spacy_nlp_<lang>']`` and
            ``main.botok_word_tokenizer`` from it.
        text: the text to split.
        lang: language code; replaced by ``'other'`` when it is not a key
            of ``main.settings_global['sentence_tokenizers']``.
        sentence_tokenizer: tokenizer identifier, or ``'default'`` to look
            it up in ``main.settings_custom`` for *lang*.

    The text is processed in sections produced by
    ``wl_nlp_utils.split_into_chunks_text`` and the sentences of all
    sections are collected in one list, which is finally stripped of
    surrounding whitespace and of empty entries.
    """
    sentences = []

    # Unknown languages fall back to the generic entry
    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization'][
            'sentence_tokenizers'][lang]

    wl_nlp_utils.init_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    # Input of SudachiPy cannot be more than 49149 BYTES
    if sentence_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4:
        # Around 300 tokens per line 4 characters per token and 4 bytes per character (≈ 49149 / 4 / 4 / 300)
        sections = wl_nlp_utils.split_into_chunks_text(text, section_size=10)
    else:
        sections = wl_nlp_utils.split_into_chunks_text(
            text,
            section_size=main.settings_custom['files']['misc']
            ['read_files_in_chunks'])

    for section in sections:
        # NLTK
        if sentence_tokenizer == 'nltk_punkt':
            # Maps language codes to the language names passed to NLTK
            lang_texts = {
                'ces': 'czech',
                'dan': 'danish',
                'nld': 'dutch',
                # English
                'eng_gb': 'english',
                'eng_us': 'english',
                'est': 'estonian',
                'fin': 'finnish',
                'fra': 'french',
                # German
                'deu_at': 'german',
                'deu_de': 'german',
                'deu_ch': 'german',
                'ell': 'greek',
                'ita': 'italian',
                # Norwegian
                'nob': 'norwegian',
                'nno': 'norwegian',
                'pol': 'polish',
                # Portuguese
                'por_br': 'portuguese',
                'por_pt': 'portuguese',
                'rus': 'russian',
                'slv': 'slovene',
                'spa': 'spanish',
                'swe': 'swedish',
                'tur': 'turkish',
                # Other languages
                'other': 'english'
            }

            sentences.extend(
                nltk.sent_tokenize(section, language=lang_texts[lang]))
        # spaCy
        elif sentence_tokenizer.startswith('spacy_'):
            # Chinese, English, German, Portuguese
            # Codes starting with 'srp_' keep their suffix; note that `lang`
            # is rebound here and stays rebound for the remaining sections
            if not lang.startswith('srp_'):
                lang = wl_conversion.remove_lang_code_suffixes(main, lang)

            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = nlp(section)

            sentences.extend([sentence.text for sentence in doc.sents])
        # Chinese & Japanese
        elif sentence_tokenizer in ['wordless_zho', 'wordless_jpn']:
            for line in section.splitlines():
                sentence_start = 0

                for i, char in enumerate(line):
                    # A terminator at or after the current sentence start
                    if i >= sentence_start and char in [
                            '。', '!', '?', '!', '?'
                    ]:
                        # The sentence ends just before the first later
                        # character that is neither a terminator nor one of
                        # the listed closing quotes/brackets
                        for j, char_next in enumerate(line):
                            if j > i and char_next not in [
                                    '。', '!', '?', '!', '?', '’', '”', ')', ')'
                            ]:
                                sentences.append(line[sentence_start:j])

                                sentence_start = j

                                break

                # Remainder of the line (may be empty; removed by the final strip)
                if sentence_start <= len(line):
                    sentences.append(line[sentence_start:])
        # Icelandic
        elif sentence_tokenizer == 'tokenizer_isl':
            # Each sentence is split on whitespace and passed back through
            # the project's word detokenizer
            for sentence in tokenizer.split_into_sentences(section):
                sentences.append(
                    wl_word_detokenization.wl_word_detokenize(
                        main, tokens=sentence.split(), lang='isl'))
        # Thai
        elif sentence_tokenizer == 'pythainlp_crfcut':
            sentences.extend(pythainlp.sent_tokenize(section))
        # Tibetan
        elif sentence_tokenizer == 'botok_bod':
            wl_nlp_utils.init_word_tokenizers(main, lang='bod')

            tokens = main.botok_word_tokenizer.tokenize(section)

            for sentence_tokens in botok.sentence_tokenizer(tokens):
                sentences.append(''.join([
                    sentence_token.text
                    for sentence_token in sentence_tokens['tokens']
                ]))
        # Vietnamese
        elif sentence_tokenizer == 'underthesea_vie':
            sentences.extend(underthesea.sent_tokenize(section))

    # Strip spaces
    sentences = [
        sentence_non_empty for sentence in sentences
        if (sentence_non_empty := sentence.strip())
    ]
    # NOTE(review): no return statement is visible after this point, so as
    # shown the function returns None; confirm `sentences` is returned.
# as invoking the tokenizer directly from the command line:
# $ tokenize input.txt output.txt
#
if __name__ == '__main__':
    # reading input parameters
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--output', '-o',
                        help='Select suffix for output files.',
                        default=".tokenized")
    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument('--input', '-i', nargs='+', required=True,
                               default=argparse.SUPPRESS,
                               help="File(s) to tokenize before tagging.")

    # No arguments at all: show usage instead of an argparse error
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(0)

    try:
        args = parser.parse_args()
    except SystemExit:
        # argparse reports bad arguments by raising SystemExit; the exit
        # status is deliberately kept at 0 as before, but unrelated
        # exceptions are no longer swallowed by a bare except
        sys.exit(0)

    # Kept as a module-level name in case other code in the file reads it
    filename = args.input

    for current_file in args.input:
        # Output is opened first, as before; both files are now closed
        # when the block exits (the input handle used to be left open)
        with open(current_file + args.output, "w") as f, \
                open(current_file) as source:
            for line in source:
                stripped = line.strip()
                if len(stripped) > 0:
                    # One sentence per output line
                    for sentence in split_into_sentences(stripped):
                        f.write(sentence + '\n')
import sys
import tokenizer

if __name__ == "__main__":
    # Summarize a fixed sample article, bounding the summary length by the
    # number of sentences the article contains.
    # article = sys.argv[1]
    article = """ Technology gets the creative bug The hi-tech and the arts worlds have for some time danced around each other and offered creative and technical help when required. Often this help has come in the form of corporate art sponsorship or infrastructure provision. But that dance is growing more intimate as hi-tech firms look to the creative industries for inspiration. And vice versa. UK telco BT is serious about the idea and has launched its Connected World initiative. The idea, says BT, is to shape a "21st Century model" which will help cement the art, technology, and business worlds together. "We are hoping to understand the creative industry that has a natural thirst for broadband technology," said Frank Stone, head of the BT's business sector programmes. He looks after several "centres of excellence" which the telco has set up with other institutions and organisations, one of which is focused on creative industries. To mark the initiative's launch, a major international art installation is to open on 15 April in Brussels, with a further exhibit in Madrid later in the summer. They have both been created using the telco's technology that it has been incubating at its research and development arm, including a sophisticated graphics rendering program. Using a 3D graphics engine, the type commonly used in gaming, Bafta-winning artists Langlands & Bell have created a virtual, story-based, 3D model of Brussels' Coudenberg Cellars. They have recently been excavated and are thought to be the remnants of Coudenberg Palace, an historical seat of European power. The 3D world can be navigated using a joystick and offers an immersive experience of a landscape that historically had a river running through it until it was bricked up in the 19th Century. 
"The river was integral to the city's survival for hundreds of years and it was equally essential to the city that it disappeared," said the artists. "We hope that by uncovering the river, we can greater understand the connections between the past and the present, and appreciate the flow of modernity, once concealing, but now revealing the River Senne." In their previous works they used the Quake game graphics engine. The game engine is the core component of a video game because it handles graphics rendering, game AI, and how objects behave and relate to each other in a game. They are so time-consuming and expensive to create, the engines can be licensed out to handle other graphics-intensive games. BT's own engine, Tara (Total Abstract Rendering Architecture) has been in development since 2001 and has been used to recreate virtual interactive models of buildings for planners. It was also used in 2003 in Encounter, an urban-based, pervasive game that combined both virtual play in conjunction with physical, on-the-street action. Because the artists wanted video and interactive elements in their worlds, new features were added to Tara in order to handle the complex data sets. But collaboration between art and digital technology is by no means new, and many keen coders, designers, games makers and animators argue that what they create is art itself. As more tools for self-expression are given to the person on the street, enabling people to take photos with a phone and upload them to the web for instance, creativity will become an integral part of technology. The Orange Expressionist exhibition last year, for example, displayed thousands of picture messages from people all over the UK to create an interactive installation. Technology as a way of unleashing creativity has massive potential, not least because it gives people something to do with their technology. Big businesses know it is good for them to get in on the creative vein too. 
The art world is "fantastically rich", said Mr Stone, with creative people and ideas which means traditional companies like BT want to get in with them. Between 1997 and 2002, the creative industry brought £21 billion to London alone. It is an industry that is growing by 6% a year too. The partnership between artists and technologists is part of trying to understand the creative potential of technologies like broadband net, according to Mr Stone. "This is not just about putting art galleries and museums online," he said. "It is about how can everyone have the best seat in house and asking if technology has a role in solving that problem." With broadband penetration reaching 100% in the UK, businesses with a stake in the technology want to give people reasons to want and use it. The creative drive is not purely altruistic obviously. It is about both industries borrowing strategies and creative ideas together which can result in better business practices for creative industries, or more patent ideas for tech companies. "What we are trying to do is have outside-in thinking. "We are creating a future cultural drive for the economy," said Mr Stone. """

    # Count the sentences by draining the generator
    size = sum(1 for _ in tokenizer.split_into_sentences(article))

    # Summary bounds: between one third and two thirds of the sentence count
    longest = (2 * size) // 3
    shortest = size // 3

    # NOTE(review): `pipeline` is not imported in the visible part of this
    # file; confirm where it comes from.
    summarizer = pipeline('summarization')
    print(summarizer(article, max_length=longest, min_length=shortest))
    sys.stdout.flush()
completed = 0 #Go to each topic page for link in url: response = requests.get(link) soup = BeautifulSoup(response.content, 'html.parser') #Find the article body body = soup.find('div', {'itemprop': 'articleBody'}) text = "" #Concatenate all the paragraphs that make up the article content for message in body.find_all('p'): text += message.text + " " #Shallow Tokenizer: Split the current article body into sentences sentences = list(tokenizer.split_into_sentences(text)) #Further formatting... for sentence in sentences: #Make whole sentence lowercase lowers = sentence.lower() #Remove punctuation s = re.sub(r'[^\w\s]', '', lowers) #Add necessary start/stop tokens for BERT sformatted = "[CLS] " + s + " [SEP]" #Use the Bert Tokenizer bertTokens.append(bertTokenize.tokenize(sformatted)) #Add a label based on article category sentencetags.append(numtags[completed])
def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    """Split *text* into sentences with the tokenizer configured for *lang*.

    Args:
        main: application object providing ``settings_global``,
            ``settings_custom``, ``tr`` and the loaded tokenizer objects.
        text: the text to split.
        lang: language code; replaced by ``'other'`` when it is not a key
            of ``main.settings_global['sentence_tokenizers']``.
        sentence_tokenizer: display name of the tokenizer, or ``'default'``
            to look it up in ``main.settings_custom`` for *lang*.

    Returns:
        The value of ``wl_text_utils.record_boundary_sentences`` applied to
        the whitespace-stripped sentences and the original text.
    """
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        tokenization_settings = main.settings_custom['sentence_tokenization']
        sentence_tokenizer = tokenization_settings['sentence_tokenizers'][lang]

    wl_text_utils.check_sentence_tokenizers(
        main,
        lang=lang,
        sentence_tokenizer=sentence_tokenizer
    )

    # NLTK
    if sentence_tokenizer == main.tr('NLTK - Punkt Sentence Tokenizer'):
        # Language code -> language name expected by NLTK
        nltk_languages = {
            'ces': 'czech',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'est': 'estonian',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            # Greek (Modern)
            'ell': 'greek',
            'ita': 'italian',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'pol': 'polish',
            'por': 'portuguese',
            'rus': 'russian',
            'slv': 'slovene',
            'spa': 'spanish',
            'swe': 'swedish',
            'tur': 'turkish',
            # Other languages
            'other': 'english'
        }

        sentences = nltk.sent_tokenize(text, language=nltk_languages[lang])
    # spaCy
    elif sentence_tokenizer == main.tr('spaCy - Sentencizer'):
        doc = main.__dict__[f'spacy_nlp_{lang}'](text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        sentences = [span.text for span in doc.sents]
    # syntok
    elif sentence_tokenizer == main.tr('syntok - Sentence Segmenter'):
        for para in syntok.segmenter.analyze(text):
            sentences.extend(
                ''.join([token.spacing + token.value for token in sentence])
                for sentence in para
            )
    # Chinese & Japanese
    elif sentence_tokenizer in [
            main.tr('Wordless - Chinese Sentence Tokenizer'),
            main.tr('Wordless - Japanese Sentence Tokenizer')
    ]:
        terminators = ('。', '!', '?', '!', '?')
        # Characters that still belong to the sentence being closed
        trailing = ('。', '!', '?', '!', '?', '’', '”', ')', ')')

        for line in text.splitlines():
            sentence_start = 0

            for i, char in enumerate(line):
                if i < sentence_start or char not in terminators:
                    continue

                # Cut before the first later character that is not trailing
                for j in range(i + 1, len(line)):
                    if line[j] not in trailing:
                        sentences.append(line[sentence_start:j])
                        sentence_start = j

                        break

            # Remainder of the line
            if sentence_start <= len(line):
                sentences.append(line[sentence_start:])
    # Icelandic
    elif sentence_tokenizer == main.tr(
            'Tokenizer - Icelandic Sentence Tokenizer'):
        sentences.extend(
            wl_word_detokenization.wl_word_detokenize(
                main, tokens=sentence.split(), lang='isl')
            for sentence in tokenizer.split_into_sentences(text)
        )
    # Russian
    elif sentence_tokenizer == main.tr('razdel - Russian Sentenizer'):
        sentences = [item.text for item in razdel.sentenize(text)]
    # Thai
    elif sentence_tokenizer == main.tr('PyThaiNLP - CRFCut'):
        sentences = pythainlp.sent_tokenize(text)
    # Tibetan
    elif sentence_tokenizer == main.tr('botok - Tibetan Sentence Tokenizer'):
        wl_text_utils.check_word_tokenizers(main, lang='bod')

        word_tokens = main.botok_word_tokenizer.tokenize(text)

        for sentence_tokens in botok.sentence_tokenizer(word_tokens):
            # Index 1 holds the tokens of the sentence
            parts = [sentence_token.text for sentence_token in sentence_tokens[1]]
            sentences.append(''.join(parts))
    # Vietnamese
    elif sentence_tokenizer == main.tr(
            'Underthesea - Vietnamese Sentence Tokenizer'):
        sentences = underthesea.sent_tokenize(text)

    # Strip spaces
    stripped = [item.strip() for item in sentences]

    return wl_text_utils.record_boundary_sentences(stripped, text)
def test_split_sentences():
    """Check sentence splitting for strings, generators, normalization and chunked input."""

    def split(source, **options):
        # Materialize the sentence generator for direct list comparison
        return list(t.split_into_sentences(source, **options))

    # A single string containing embedded newlines
    text = ("3.janúar sl. keypti ég 64kWst rafbíl. Hann kostaði € 30.000. \n"
            "200.000 manns mótmæltu.\n"
            "Hér byrjar ný setning")
    assert split(text) == [
        "3. janúar sl. keypti ég 64kWst rafbíl .",
        "Hann kostaði €30.000 .",
        "200.000 manns mótmæltu .",
        "Hér byrjar ný setning",
    ]

    # The same material fed line by line through a generator
    lines = (
        "3.janúar sl. keypti ég 64kWst rafbíl. Hann kostaði € 30.000. \n",
        "200.000 manns mótmæltu\n",
        "\n",
        "Hér byrjar ný setning\n",
    )
    assert split(line for line in lines) == [
        "3. janúar sl. keypti ég 64kWst rafbíl .",
        "Hann kostaði €30.000 .",
        "200.000 manns mótmæltu",
        "Hér byrjar ný setning",
    ]

    # normalize=True rewrites the quotation marks, normalize=False keeps them
    quoted = ("Hún sagði: \"Þú ert leiðinlegur\"! Hann svaraði engu -- "
              "en hætti við ferðina. \n")
    assert split(quoted, normalize=True) == [
        "Hún sagði : „ Þú ert leiðinlegur “ !",
        "Hann svaraði engu - - en hætti við ferðina .",
    ]
    assert split(quoted, normalize=False) == [
        "Hún sagði : \" Þú ert leiðinlegur \" !",
        "Hann svaraði engu - - en hætti við ferðina .",
    ]

    # Sentences that end with a period but have no space before the next one
    ships = [
        'Aðalsteinn Jónsson SU á leið til hafnar í Reykjavík .',
        'Flutningaskipið Selfoss kom til Reykjavíkur .',
        'Rósin sigldi með ferðamenn í hvalaskoðun .',
    ]
    assert split(
        "Aðalsteinn Jónsson SU á leið til hafnar í "
        "Reykjavík.Flutningaskipið Selfoss kom til Reykjavíkur.Rósin sigldi með "
        "ferðamenn í hvalaskoðun.") == ships

    # The same text delivered in chunks that cut across sentences
    assert split(chunk for chunk in [
        "Aðalsteinn Jónsson SU á leið til hafnar í ",
        "Reykjavík.Flutningaskipið Selfoss kom til Reykjavíkur.Rósin sigldi með ",
        "ferðamenn í hvalaskoðun.",
    ]) == ships

    # Chunks with single newlines inside sentences and trailing blank lines
    assert split(chunk for chunk in [
        "Aðalsteinn Jónsson SU á leið \n til hafnar í ",
        "Reykjavík.\nFlutningaskipið Selfoss \nkom til Reykjavíkur.Rósin sigldi með ",
        "ferðamenn í\nhvalaskoðun.\n\n\n",
    ]) == ships

    # No final periods: whitespace-only lines, empty chunks and blank lines
    # are what separate the sentences
    assert split(chunk for chunk in [
        "Aðalsteinn Jónsson SU á leið \n til hafnar í ",
        "Reykjavík\n \t \nFlutningaskipið Selfoss \nkom til Reykjavíkur",
        "",
        "Rósin sigldi með ",
        "ferðamenn í\nhvalaskoðun\n\n\nVigur kom með fullfermi að landi",
    ]) == [
        'Aðalsteinn Jónsson SU á leið til hafnar í Reykjavík',
        'Flutningaskipið Selfoss kom til Reykjavíkur',
        'Rósin sigldi með ferðamenn í hvalaskoðun',
        "Vigur kom með fullfermi að landi",
    ]