Python split_into_sentences示例，tokenizer.split_into_sentences Python示例

示例#1

0

显示文件

def test_split_sentences():
    """Exercise split_into_sentences() on string input, generator input and the normalize flag."""
    # Case 1: one multi-line string; the last sentence has no closing period
    text = (
        "3.janúar sl. keypti   ég 64kWst rafbíl. Hann kostaði € 30.000.  \n"
        "200.000 manns mótmæltu.\n"
        "Hér byrjar ný setning"
    )
    assert list(t.split_into_sentences(text)) == [
        "3. janúar sl. keypti ég 64kWst rafbíl .",
        "Hann kostaði €30.000 .",
        "200.000 manns mótmæltu .",
        "Hér byrjar ný setning",
    ]

    # Case 2: the input is a generator of lines; an empty line separates
    # the last two sentences
    lines = (
        "3.janúar sl. keypti   ég 64kWst rafbíl. Hann kostaði € 30.000.  \n",
        "200.000 manns mótmæltu\n",
        "\n",
        "Hér byrjar ný setning\n",
    )
    assert list(t.split_into_sentences(line for line in lines)) == [
        "3. janúar sl. keypti ég 64kWst rafbíl .",
        "Hann kostaði €30.000 .",
        "200.000 manns mótmæltu",
        "Hér byrjar ný setning",
    ]

    # Case 3: the normalize option controls whether quotes are converted
    quoted = (
        "Hún sagði: \"Þú ert leiðinlegur\"! Hann svaraði engu -- "
        "en hætti við ferðina.  \n"
    )
    assert list(t.split_into_sentences(quoted, normalize=True)) == [
        "Hún sagði : „ Þú ert leiðinlegur “ !",
        "Hann svaraði engu - - en hætti við ferðina .",
    ]
    assert list(t.split_into_sentences(quoted, normalize=False)) == [
        "Hún sagði : \" Þú ert leiðinlegur \" !",
        "Hann svaraði engu - - en hætti við ferðina .",
    ]

示例#2

0

显示文件

文件： deduplicate.py 项目： mideind/GreynirSeq

    def clean_pg(self, pg):
        """Return *pg* with previously seen sentence runs removed.

        The paragraph is split into sentences and filtered through
        ``self.check_sentence``. Windows of ``self.max_lines`` down to
        ``self.min_lines`` consecutive sentences are joined with spaces and
        hashed with ``self.hash``; a window whose hash is already present in
        ``self.line_hashes[<window size>]`` is skipped. The kept sentences are
        returned joined by newlines. If anything was kept, the full sentence
        list is passed to ``self.add_pg_to_line_hashes``.
        """
        sentences = list(filter(self.check_sentence, split_into_sentences(pg)))
        total = len(sentences)

        kept = []
        pos = 0

        # Keep scanning while a window of at least min_lines sentences still fits
        while pos <= total - self.min_lines:
            # Try the widest window first, narrowing down to min_lines
            for width in reversed(range(self.min_lines, self.max_lines + 1)):
                end = pos + width
                if end <= total:
                    window = " ".join(sentences[pos:end])
                    if self.hash(window) in self.line_hashes[width]:
                        # Known window: jump past it
                        pos = end
                        break

            # NOTE(review): this also runs right after a window was skipped, so
            # the sentence following a duplicate window is kept without being
            # checked itself - confirm that this is intended.
            if pos < total:
                kept.append(sentences[pos])
                pos += 1

        # Whatever is left is too short to form a min_lines window
        kept.extend(sentences[pos:])

        if kept:
            self.add_pg_to_line_hashes(sentences)

        return "\n".join(kept)

示例#3

0

显示文件

    def predict(self, text, summary_length, sentence_limit=None):
        """Build a summary of *text* from sentences selected by ``self.model``.

        The stripped text is split into sentences, wrapped in a single-document
        ``SumDataset`` and fed through a ``SumDataLoader`` with batch size 1.
        ``summary_length`` is forwarded to the model as ``max_length`` and
        ``sentence_limit`` to the dataset.

        Returns the selected sentences, in document order, joined by single
        spaces. Returns None implicitly if the loader yields no batch.
        """
        text = text.strip()
        sentences = list(tokenizer.split_into_sentences(text))
        tokenized = [sentence.split() for sentence in sentences]
        detokenized = detokenize(text.split(), tokenized)

        # One input record per sentence, numbered from 1
        doc_inputs = []
        for sentence_id, (sentence, tokens) in enumerate(
                zip(detokenized, tokenized), 1):
            doc_inputs.append({
                "text": sentence,
                "sentence_id": sentence_id,
                "pos": [],
                "word_count": len(tokens),
                "tokens": tokens,
            })

        data = SumDataset(
            self.vocab,
            {"id": "doc", "inputs": doc_inputs},
            sentence_limit=sentence_limit,
        )
        loader = SumDataLoader(data, batch_size=1, num_workers=0)

        with torch.no_grad():
            for batch in loader:
                texts = self.model.predict(
                    batch.to(self.gpu), max_length=summary_length)

                # Iterate the document sentences (not the model output) so the
                # summary follows the original sentence order
                chosen = set(texts[0])
                return " ".join(s for s in detokenized if s in chosen)

示例#4

0

显示文件

def is_tok(line: str, tokenizer: str, model="") -> List[str]:
    """Tokenize *line* with the tokenizer selected by *tokenizer*.

    * ``None`` or ``""``: split into sentences with ``mideind_tok`` and split
      each sentence on single spaces.
    * ``'moses'``: the lazily loaded Moses tokenizer for ``'is'``, without
      escaping.
    * ``'bpe'``: the lazily loaded BPE tokenizer for ``'is'`` and *model*.

    Raises:
        ValueError: for any other *tokenizer* value.
    """
    if tokenizer is None or tokenizer == "":
        tokens = []
        for sent in mideind_tok.split_into_sentences(line):
            tokens.extend(sent.split(' '))
        return tokens

    if tokenizer == 'moses':
        moses = _lazy_load_moses_tokenizer('is')
        return moses.tokenize(line, escape=False)

    if tokenizer == 'bpe':
        bpe = _lazy_load_bpe_tokenizer('is', model)
        return bpe.EncodeAsPieces(line)

    raise ValueError(f'Unknown tokenizer={tokenizer}')

示例#5

0

显示文件

文件： newtextsummarizer.py 项目： ejaazsayyedexp/Text-Summarizer

def createArticle(text):
    """Split *text* into sentences and pick a random sentence index.

    Returns:
        A tuple ``(article, intnum)`` where ``article`` is the list of
        sentences and ``intnum`` is a random index drawn from the first half
        of the article. With more than three sentences the lower bound is
        ``len(article) // 3``, lowered by 2 when that stays above zero;
        otherwise it is 0. For articles with fewer than two sentences
        ``intnum`` is 0.
    """
    article = list(tokenizer.split_into_sentences(text))

    upper = len(article) // 2
    if upper == 0:
        # Fewer than two sentences: randrange(0, 0) would raise ValueError
        # because the range is empty, so fall back to index 0.
        return article, 0

    index = len(article) // 3 if len(article) > 3 else 0
    lower = index - 2 if index - 2 > 0 else index
    return article, rd.randrange(lower, upper)

示例#6

0

显示文件

文件： fyp1.py 项目： fatimaikrams/University_projects

def tokenization(tweet):
    """Yield the stop-word-filtered tokens of each sentence in *tweet*.

    The text is split into sentences with ``split_into_sentences``; each
    sentence is split on whitespace and tokens found in the NLTK English
    stop-word list are dropped.

    Yields:
        One list of remaining token strings per sentence.
    """
    # Load the stop-word list once. The original re-read it and scanned it
    # linearly for every single token.
    english_stopwords = frozenset(stopwords.words('english'))

    for sentence in split_into_sentences(tweet):
        # Obtain the individual token strings and keep the non-stop-words
        yield [w for w in sentence.split() if w not in english_stopwords]

示例#7

0

显示文件

文件： tag.py 项目： steinst/ABLTagger

def tag_simple(input, output, tagger):
    """Tag every non-blank line of the file *input* and write the result.

    The output path is ``input + output`` (*output* acts as a suffix). For
    each non-blank line, one ``token<TAB>tag`` row is written per pair
    returned by ``tagger.tag_sent``, followed by an empty line.

    When the module-level ``args.tokenize`` is set, the line is passed through
    ``split_into_sentences`` and the resulting items are handed to the tagger;
    otherwise the line is split on whitespace.
    """
    # Read everything first so the input handle is closed (even on error)
    # before the output file is opened.
    with open(input, 'rt') as input_file:
        input_text = input_file.readlines()
    output_file = input + output

    with open(output_file, "w") as f:
        for i in input_text:
            stripped = i.strip()
            if stripped == '':
                continue

            if args.tokenize:
                simple_tokens = list(split_into_sentences(stripped))
            else:
                simple_tokens = stripped.split()

            # NOTE(review): this assignment rebuilds the first token unchanged,
            # so it has no effect. It looks like the first character was meant
            # to be lowercased for capitalised words missing from the tagger
            # vocabulary - confirm before changing.
            if simple_tokens[0][0].isupper(
            ) and simple_tokens[0] not in tagger.vw.w2i:
                simple_tokens[0] = simple_tokens[0][0] + simple_tokens[0][1:]

            tagged = tagger.tag_sent(simple_tokens)
            f.write("\n".join(x[0] + "\t" + x[1] for x in tagged) + '\n')
            f.write("\n")

示例#8

0

显示文件

文件： wl_sentence_tokenization.py 项目： BLKSerene/Wordless

def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    """Split *text* into sentences with the sentence tokenizer chosen for *lang*.

    Args:
        main: Object whose ``settings_global`` / ``settings_custom`` mappings
            and cached tokenizer attributes (``spacy_nlp_<lang>``,
            ``botok_word_tokenizer``) are read here.
        text: The text to split.
        lang: Language code; replaced by ``'other'`` when it has no entry in
            ``main.settings_global['sentence_tokenizers']``.
        sentence_tokenizer: Tokenizer identifier, or ``'default'`` to look up
            the one configured for *lang* in ``main.settings_custom``.
    """
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization'][
            'sentence_tokenizers'][lang]

    wl_nlp_utils.init_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    # Input of SudachiPy cannot be more than 49149 BYTES
    if sentence_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4:
        # Around 300 tokens per line 4 characters per token and 4 bytes per character (≈ 49149 / 4 / 4 / 300)
        sections = wl_nlp_utils.split_into_chunks_text(text, section_size=10)
    else:
        sections = wl_nlp_utils.split_into_chunks_text(
            text,
            section_size=main.settings_custom['files']['misc']
            ['read_files_in_chunks'])

    # Each section is tokenized on its own; results accumulate in `sentences`
    for section in sections:
        # NLTK
        if sentence_tokenizer == 'nltk_punkt':
            # Maps language codes to the language names passed to NLTK
            lang_texts = {
                'ces': 'czech',
                'dan': 'danish',
                'nld': 'dutch',
                # English
                'eng_gb': 'english',
                'eng_us': 'english',
                'est': 'estonian',
                'fin': 'finnish',
                'fra': 'french',
                # German
                'deu_at': 'german',
                'deu_de': 'german',
                'deu_ch': 'german',
                'ell': 'greek',
                'ita': 'italian',
                # Norwegian
                'nob': 'norwegian',
                'nno': 'norwegian',
                'pol': 'polish',
                # Portuguese
                'por_br': 'portuguese',
                'por_pt': 'portuguese',
                'rus': 'russian',
                'slv': 'slovene',
                'spa': 'spanish',
                'swe': 'swedish',
                'tur': 'turkish',
                # Other languages
                'other': 'english'
            }

            sentences.extend(
                nltk.sent_tokenize(section, language=lang_texts[lang]))
        # spaCy
        elif sentence_tokenizer.startswith('spacy_'):
            # Chinese, English, German, Portuguese
            if not lang.startswith('srp_'):
                lang = wl_conversion.remove_lang_code_suffixes(main, lang)

            # The pipeline is looked up as an attribute of `main` named after the language
            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = nlp(section)

            sentences.extend([sentence.text for sentence in doc.sents])
        # Chinese & Japanese
        elif sentence_tokenizer in ['wordless_zho', 'wordless_jpn']:
            for line in section.splitlines():
                sentence_start = 0

                for i, char in enumerate(line):
                    # A sentence terminator at or after the current sentence start
                    if i >= sentence_start and char in [
                            '。', '！', '？', '!', '?'
                    ]:
                        # Find the first later character that is neither a
                        # terminator nor a closing quote/bracket; the sentence
                        # is cut just before it
                        for j, char_next in enumerate(line):
                            if j > i and char_next not in [
                                    '。', '！', '？', '!', '?', '’', '”', '）', ')'
                            ]:
                                sentences.append(line[sentence_start:j])

                                sentence_start = j

                                break

                # Append the remainder of the line (may be an empty string)
                if sentence_start <= len(line):
                    sentences.append(line[sentence_start:])
        # Icelandic
        elif sentence_tokenizer == 'tokenizer_isl':
            # Each yielded sentence is whitespace-split and detokenized again
            for sentence in tokenizer.split_into_sentences(section):
                sentences.append(
                    wl_word_detokenization.wl_word_detokenize(
                        main, tokens=sentence.split(), lang='isl'))
        # Thai
        elif sentence_tokenizer == 'pythainlp_crfcut':
            sentences.extend(pythainlp.sent_tokenize(section))
        # Tibetan
        elif sentence_tokenizer == 'botok_bod':
            wl_nlp_utils.init_word_tokenizers(main, lang='bod')

            tokens = main.botok_word_tokenizer.tokenize(section)

            # Rebuild each sentence by concatenating the text of its tokens
            for sentence_tokens in botok.sentence_tokenizer(tokens):
                sentences.append(''.join([
                    sentence_token.text
                    for sentence_token in sentence_tokens['tokens']
                ]))
        # Vietnamese
        elif sentence_tokenizer == 'underthesea_vie':
            sentences.extend(underthesea.sent_tokenize(section))

    # Strip spaces
    # Sentences that are empty after stripping are dropped
    sentences = [
        sentence_non_empty for sentence in sentences
        if (sentence_non_empty := sentence.strip())
    ]
    # NOTE(review): this excerpt ends here; no return statement is visible

示例#9

0

显示文件

文件： tokenize_input.py 项目： steinst/ABLTagger

# as invoking the tokenizer directly from the command line:
#       $ tokenize input.txt output.txt
#

if __name__ == '__main__':
    # Command-line entry point: for every file given with --input, write its
    # sentences (one per line) to "<input file><output suffix>".

    # reading input parameters
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--output', '-o', help='Select suffix for output files.', default=".tokenized")
    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument('--input', '-i', nargs='+', required=True, default=argparse.SUPPRESS,
                               help="File(s) to tokenize before tagging.")

    # No arguments at all: show usage instead of an argparse error
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(0)
    try:
        args = parser.parse_args()
    except SystemExit:
        # argparse signals invalid arguments (and --help) by raising
        # SystemExit; keep the original exit status of 0. Catching only
        # SystemExit avoids hiding unrelated errors as a bare except did.
        sys.exit(0)

    filename = args.input

    for current_file in args.input:
        # Both handles are context-managed so the input file is closed too
        with open(current_file + args.output, "w") as f, open(current_file) as source:
            for line in source:
                stripped = line.strip()
                # Skip blank lines
                if stripped:
                    for sentence in split_into_sentences(stripped):
                        f.write(sentence + '\n')

示例#10

0

显示文件

文件： abstextsummarizer.py 项目： ejaazsayyedexp/Text-Summarizer

import sys
import tokenizer

if __name__ == "__main__":
    # Script entry point: summarize a hard-coded sample article. The summary
    # length bounds are derived from the article's sentence count.
    # article = sys.argv[1]
    article = """
Technology gets the creative bug

The hi-tech and the arts worlds have for some time danced around each other and offered creative and technical help when required.

Often this help has come in the form of corporate art sponsorship or infrastructure provision. But that dance is growing more intimate as hi-tech firms look to the creative industries for inspiration. And vice versa. UK telco BT is serious about the idea and has launched its Connected World initiative. The idea, says BT, is to shape a "21st Century model" which will help cement the art, technology, and business worlds together. "We are hoping to understand the creative industry that has a natural thirst for broadband technology," said Frank Stone, head of the BT's business sector programmes. He looks after several "centres of excellence" which the telco has set up with other institutions and organisations, one of which is focused on creative industries.

To mark the initiative's launch, a major international art installation is to open on 15 April in Brussels, with a further exhibit in Madrid later in the summer. They have both been created using the telco's technology that it has been incubating at its research and development arm, including a sophisticated graphics rendering program. Using a 3D graphics engine, the type commonly used in gaming, Bafta-winning artists Langlands & Bell have created a virtual, story-based, 3D model of Brussels' Coudenberg Cellars.

They have recently been excavated and are thought to be the remnants of Coudenberg Palace, an historical seat of European power. The 3D world can be navigated using a joystick and offers an immersive experience of a landscape that historically had a river running through it until it was bricked up in the 19th Century. "The river was integral to the city's survival for hundreds of years and it was equally essential to the city that it disappeared," said the artists. "We hope that by uncovering the river, we can greater understand the connections between the past and the present, and appreciate the flow of modernity, once concealing, but now revealing the River Senne." In their previous works they used the Quake game graphics engine. The game engine is the core component of a video game because it handles graphics rendering, game AI, and how objects behave and relate to each other in a game. They are so time-consuming and expensive to create, the engines can be licensed out to handle other graphics-intensive games. BT's own engine, Tara (Total Abstract Rendering Architecture) has been in development since 2001 and has been used to recreate virtual interactive models of buildings for planners. It was also used in 2003 in Encounter, an urban-based, pervasive game that combined both virtual play in conjunction with physical, on-the-street action. Because the artists wanted video and interactive elements in their worlds, new features were added to Tara in order to handle the complex data sets. But collaboration between art and digital technology is by no means new, and many keen coders, designers, games makers and animators argue that what they create is art itself.

As more tools for self-expression are given to the person on the street, enabling people to take photos with a phone and upload them to the web for instance, creativity will become an integral part of technology. The Orange Expressionist exhibition last year, for example, displayed thousands of picture messages from people all over the UK to create an interactive installation.

Technology as a way of unleashing creativity has massive potential, not least because it gives people something to do with their technology. Big businesses know it is good for them to get in on the creative vein too. The art world is "fantastically rich", said Mr Stone, with creative people and ideas which means traditional companies like BT want to get in with them. Between 1997 and 2002, the creative industry brought £21 billion to London alone. It is an industry that is growing by 6% a year too. The partnership between artists and technologists is part of trying to understand the creative potential of technologies like broadband net, according to Mr Stone. "This is not just about putting art galleries and museums online," he said. "It is about how can everyone have the best seat in house and asking if technology has a role in solving that problem." With broadband penetration reaching 100% in the UK, businesses with a stake in the technology want to give people reasons to want and use it. The creative drive is not purely altruistic obviously. It is about both industries borrowing strategies and creative ideas together which can result in better business practices for creative industries, or more patent ideas for tech companies. "What we are trying to do is have outside-in thinking. "We are creating a future cultural drive for the economy," said Mr Stone.

"""
    # Count the sentences in the article
    size = sum(1 for _ in tokenizer.split_into_sentences(article))

    # NOTE(review): `pipeline` is not imported in the visible part of this
    # file (presumably transformers.pipeline) - confirm it is in scope.
    summarizer = pipeline('summarization')

    # Bounds passed to the summarizer: between one third and two thirds of
    # the sentence count
    print(
        summarizer(article,
                   max_length=int((2 * size) // 3),
                   min_length=int(size // 3)))
    sys.stdout.flush()

示例#11

0

显示文件

文件： BERT-v1.py 项目： mentorchains/mlteam2

# Index into numtags used to label the sentences of the current article.
# NOTE(review): it is not incremented anywhere in this excerpt - confirm that
# it is advanced elsewhere.
completed = 0
# Visit every topic page
for link in url:
    response = requests.get(link)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the article body
    body = soup.find('div', {'itemprop': 'articleBody'})
    # Concatenate all paragraphs of the article, each followed by one space
    text = "".join(message.text + " " for message in body.find_all('p'))

    # Shallow tokenizer: split the current article body into sentences
    sentences = list(tokenizer.split_into_sentences(text))

    # Further formatting...
    for sentence in sentences:
        # Lowercase the sentence, then drop every character that is neither
        # a word character nor whitespace (i.e. punctuation)
        s = re.sub(r'[^\w\s]', '', sentence.lower())
        # Wrap in the start/stop tokens BERT expects
        sformatted = f"[CLS] {s} [SEP]"

        # Run the BERT tokenizer
        bertTokens.append(bertTokenize.tokenize(sformatted))

        # Label the sentence with the article's category
        sentencetags.append(numtags[completed])

示例#12

0

显示文件

def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    """Split *text* into sentences with the sentence tokenizer chosen for *lang*.

    Args:
        main: Object whose ``settings_global`` / ``settings_custom`` mappings,
            ``tr()`` method and cached tokenizer attributes
            (``spacy_nlp_<lang>``, ``botok_word_tokenizer``) are used here.
        text: The text to split.
        lang: Language code; replaced by ``'other'`` when it has no entry in
            ``main.settings_global['sentence_tokenizers']``.
        sentence_tokenizer: Tokenizer display name, or ``'default'`` to look
            up the one configured for *lang* in ``main.settings_custom``.

    Returns:
        The result of ``wl_text_utils.record_boundary_sentences`` applied to
        the stripped sentences and the original text.
    """
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization'][
            'sentence_tokenizers'][lang]

    wl_text_utils.check_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    # Tokenizers are matched by comparing against main.tr() of their display names

    # NLTK
    if sentence_tokenizer == main.tr('NLTK - Punkt Sentence Tokenizer'):
        # Maps language codes to the language names passed to NLTK
        lang_texts = {
            'ces': 'czech',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'est': 'estonian',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            # Greek (Modern)
            'ell': 'greek',
            'ita': 'italian',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'pol': 'polish',
            'por': 'portuguese',
            'rus': 'russian',
            'slv': 'slovene',
            'spa': 'spanish',
            'swe': 'swedish',
            'tur': 'turkish',
            # Other languages
            'other': 'english'
        }

        sentences = nltk.sent_tokenize(text, language=lang_texts[lang])
    # spaCy
    elif sentence_tokenizer == main.tr('spaCy - Sentencizer'):
        # The pipeline is looked up as an attribute of `main` named after the language
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        sentences = [sentence.text for sentence in doc.sents]
    # syntok
    elif sentence_tokenizer == main.tr('syntok - Sentence Segmenter'):
        for para in syntok.segmenter.analyze(text):
            for sentence in para:

                # Rebuild the sentence text from each token's spacing and value
                sentences.append(''.join(
                    [token.spacing + token.value for token in sentence]))
    # Chinese & Japanese
    elif sentence_tokenizer in [
            main.tr('Wordless - Chinese Sentence Tokenizer'),
            main.tr('Wordless - Japanese Sentence Tokenizer')
    ]:
        for line in text.splitlines():
            sentence_start = 0

            for i, char in enumerate(line):
                # A sentence terminator at or after the current sentence start
                if i >= sentence_start and char in ['。', '！', '？', '!', '?']:
                    # Find the first later character that is neither a
                    # terminator nor a closing quote/bracket; the sentence is
                    # cut just before it. The inner loop reuses the name
                    # `char`; the outer loop rebinds it on its next iteration.
                    for j, char in enumerate(line):
                        if j > i and char not in [
                                '。', '！', '？', '!', '?', '’', '”', '）', ')'
                        ]:
                            sentences.append(line[sentence_start:j])

                            sentence_start = j

                            break

            # Append the remainder of the line (may be an empty string)
            if sentence_start <= len(line):
                sentences.append(line[sentence_start:])
    # Icelandic
    elif sentence_tokenizer == main.tr(
            'Tokenizer - Icelandic Sentence Tokenizer'):
        # Each yielded sentence is whitespace-split and detokenized again
        for sentence in tokenizer.split_into_sentences(text):
            sentences.append(
                wl_word_detokenization.wl_word_detokenize(
                    main, tokens=sentence.split(), lang='isl'))

    # Russian
    elif sentence_tokenizer == main.tr('razdel - Russian Sentenizer'):
        sentences = [sentence.text for sentence in razdel.sentenize(text)]
    # Thai
    elif sentence_tokenizer == main.tr('PyThaiNLP - CRFCut'):
        sentences = pythainlp.sent_tokenize(text)
    # Tibetan
    elif sentence_tokenizer == main.tr('botok - Tibetan Sentence Tokenizer'):
        wl_text_utils.check_word_tokenizers(main, lang='bod')
        tokens = main.botok_word_tokenizer.tokenize(text)

        # Rebuild each sentence by concatenating the text of the tokens held
        # at index 1 of each item
        for sentence_tokens in botok.sentence_tokenizer(tokens):
            sentences.append(''.join([
                sentence_token.text for sentence_token in sentence_tokens[1]
            ]))
    # Vietnamese
    elif sentence_tokenizer == main.tr(
            'Underthesea - Vietnamese Sentence Tokenizer'):
        sentences = underthesea.sent_tokenize(text)

    # Strip spaces
    sentences = [sentence.strip() for sentence in sentences]

    sentences = wl_text_utils.record_boundary_sentences(sentences, text)

    return sentences

示例#13

0

显示文件

def test_split_sentences():
    """Exercise split_into_sentences() on strings, generators, the normalize flag and odd line breaks."""
    # Case 1: one multi-line string; the last sentence has no closing period
    text = ("3.janúar sl. keypti   ég 64kWst rafbíl. Hann kostaði € 30.000.  \n"
            "200.000 manns mótmæltu.\n"
            "Hér byrjar ný setning")
    assert list(t.split_into_sentences(text)) == [
        "3. janúar sl. keypti ég 64kWst rafbíl .",
        "Hann kostaði €30.000 .",
        "200.000 manns mótmæltu .",
        "Hér byrjar ný setning",
    ]

    # Case 2: the input is a generator of lines; an empty line separates
    # the last two sentences
    lines = (
        "3.janúar sl. keypti   ég 64kWst rafbíl. Hann kostaði € 30.000.  \n",
        "200.000 manns mótmæltu\n",
        "\n",
        "Hér byrjar ný setning\n",
    )
    assert list(t.split_into_sentences(line for line in lines)) == [
        "3. janúar sl. keypti ég 64kWst rafbíl .",
        "Hann kostaði €30.000 .",
        "200.000 manns mótmæltu",
        "Hér byrjar ný setning",
    ]

    # Case 3: the normalize option controls whether quotes are converted
    quoted = ("Hún sagði: \"Þú ert leiðinlegur\"! Hann svaraði engu -- "
              "en hætti við ferðina.  \n")
    assert list(t.split_into_sentences(quoted, normalize=True)) == [
        "Hún sagði : „ Þú ert leiðinlegur “ !",
        "Hann svaraði engu - - en hætti við ferðina .",
    ]
    assert list(t.split_into_sentences(quoted, normalize=False)) == [
        "Hún sagði : \" Þú ert leiðinlegur \" !",
        "Hann svaraði engu - - en hætti við ferðina .",
    ]

    # The next three cases all expect the same three sentences
    three_sentences = [
        'Aðalsteinn Jónsson SU á leið til hafnar í Reykjavík .',
        'Flutningaskipið Selfoss kom til Reykjavíkur .',
        'Rósin sigldi með ferðamenn í hvalaskoðun .',
    ]

    # Case 4: periods directly followed by the next sentence, single string
    glued = (
        "Aðalsteinn Jónsson SU á leið til hafnar í "
        "Reykjavík.Flutningaskipið Selfoss kom til Reykjavíkur.Rósin sigldi með "
        "ferðamenn í hvalaskoðun.")
    assert list(t.split_into_sentences(glued)) == three_sentences

    # Case 5: the same text delivered in chunks by a generator
    chunks = [
        "Aðalsteinn Jónsson SU á leið til hafnar í ",
        "Reykjavík.Flutningaskipið Selfoss kom til Reykjavíkur.Rósin sigldi með ",
        "ferðamenn í hvalaskoðun.",
    ]
    assert list(
        t.split_into_sentences(chunk for chunk in chunks)) == three_sentences

    # Case 6: chunks with newlines inside sentences and trailing blank lines
    chunks = [
        "Aðalsteinn Jónsson SU á leið \n til hafnar í ",
        "Reykjavík.\nFlutningaskipið Selfoss \nkom til Reykjavíkur.Rósin sigldi með ",
        "ferðamenn í\nhvalaskoðun.\n\n\n",
    ]
    assert list(
        t.split_into_sentences(chunk for chunk in chunks)) == three_sentences

    # Case 7: no periods at all; sentences are separated only by blank
    # lines or an empty chunk
    chunks = [
        "Aðalsteinn Jónsson SU á leið \n til hafnar í ",
        "Reykjavík\n \t  \nFlutningaskipið Selfoss \nkom til Reykjavíkur",
        "",
        "Rósin sigldi með ",
        "ferðamenn í\nhvalaskoðun\n\n\nVigur kom með fullfermi að landi",
    ]
    assert list(t.split_into_sentences(chunk for chunk in chunks)) == [
        'Aðalsteinn Jónsson SU á leið til hafnar í Reykjavík',
        'Flutningaskipið Selfoss kom til Reykjavíkur',
        'Rósin sigldi með ferðamenn í hvalaskoðun',
        "Vigur kom með fullfermi að landi",
    ]