def test_text_none():
    """Text is None."""
    with pytest.warns(SentenceSplitterWarning):
        splitter = SentenceSplitter(language='en')
        # noinspection PyTypeChecker
        sentences = splitter.split(text=None)
    assert sentences == []
def test_en_sentence_within_brackets():
    splitter = SentenceSplitter(language='en')
    input_text = 'Foo bar. (Baz foo.) Bar baz.'
    expected_sentences = ['Foo bar.', '(Baz foo.)', 'Bar baz.']
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def clean_expo_data():
    # Read the raw expo corpus, one line per entry
    with open(file_folder + "Process_data/Dictionary_approach/" + "expo.txt", 'r') as file:
        document = file.readlines()
    lst_all_sentence = []
    splitter = SentenceSplitter(language='fr')
    percent = 0
    french = 0
    for sentence in document:
        percent += 1
        print(str(percent) + " completed")
        sentence = sentence.replace('\n', '')
        # A line may contain several sentences; split it
        lst_sentence = splitter.split(text=sentence)
        if len(lst_sentence) > 1:
            print(lst_sentence)
        # Keep only the sentences detected as French
        for s in lst_sentence:
            if detect(s) == "fr":
                lst_all_sentence.append(s)
                french += 1
    with open(file_folder + "Process_data/Dictionary_approach/" + "expo_fr.txt", "a") as text_file:
        for s in lst_all_sentence:
            text_file.write(s + "\n")
    print(str(french) + " french sentences")
    print("End function")
def test_en_uppercase_acronym():
    splitter = SentenceSplitter(language='en')
    input_text = 'Hello. .NATO. Good bye.'
    expected_sentences = ['Hello. .NATO. Good bye.']
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def gramarize(self, sent):
    # Grammar correction is applied only when it is enabled in the activation config
    with open('Value-json/logic_activation.json') as f:
        activation = json.load(f)
    if activation['grammar_logic'] == "active":
        test_str = sent
        splitter = SentenceSplitter(language='en')
        sente = splitter.split(text=test_str)
        gram_sent = []
        for sent in sente:
            # Run each sentence through the GingerIt grammar checker
            parser = GingerIt()
            output = parser.parse(sent)
            output_1 = output.get("result")
            gram_sent.append(output_1)
        f_output = ' '.join(gram_sent)
        # Collapse a trailing ".." before re-appending a single "."
        if f_output[-1] == '.' and f_output[-2] == '.':
            f_output = f_output[:-2]
        f_output = f_output + '.'
        f_output = self.remove_trailing_dots(f_output)
        f_output = f_output.replace('..', '.')
        return f_output
    else:
        return sent
def prendi(request):
    start_time = time.time()
    testo = request.POST.get('testT', None)
    lingua = detect(testo)  # detect the language of the input text
    splitter = SentenceSplitter(language=lingua)  # split the text according to the detected language
    testoEm = splitter.split(text=testo)  # split the text into sentences
    message_embeddings = embed(testoEm)  # embed the split text
    fileSvm = open(
        "/home/angela/PycharmProjects/ServerDjangoGit/ServerDjangoProva/SVMAll.pickle",
        'rb')  # open the SVM classifier file
    svm = pickle.load(fileSvm)  # load the SVM classifier
    preSvm = svm.predict(
        message_embeddings)  # use the SVM to detect which category each split of the text belongs to
    fileRf = open(
        "/home/angela/PycharmProjects/ServerDjangoGit/ServerDjangoProva/RFFireness.pickle",
        'rb')  # open the RF classifier file
    rf = pickle.load(fileRf)  # load the RF classifier
    preRf = rf.predict_proba(
        message_embeddings)  # use the RF to detect the fireness level each embedding belongs to
    ca = preSvm.tolist()  # list with the category of each split sentence
    mat = preRf.tolist()  # list with the fireness membership probabilities of each split sentence
    max = []
    perc = []
    temp_p = 0
    for i in range(len(mat)):  # find the highest fireness membership probability for each sentence
        massimo = mat[i][0]
        temp = 0
        for j in range(len(mat[i])):
            if mat[i][j] > massimo:
                massimo = mat[i][j]
                temp_p = massimo
                temp = j
        tras = temp_p * 100
        arr = math.trunc(tras)
        perc.append(arr)
        max.append(temp)  # store the class with the highest probability in this list
    fra = {  # JSON object with the data to return to the client
        'frase': testoEm,
        'cate': ca,
        'fair': max,
        'perc': perc
    }
    p_server = psutil.Process()  # current process (PID)
    ram = round(p_server.memory_percent(), 3)
    cpu = str(p_server.cpu_percent(interval=1.0))
    write_test(start_time, ram, cpu)
    fileSvm.close()  # close the SVM classifier file
    fileRf.close()  # close the RF classifier file
    return JsonResponse(fra)  # return the JSON response
def test_fr():
    splitter = SentenceSplitter(language='fr')
    input_text = 'Brookfield Office Properties Inc. (« BOPI »), dont les actifs liés aux immeubles directement...'
    expected_sentences = [
        input_text,
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def test_en_numeric_only():
    splitter = SentenceSplitter(language='en')
    input_text = 'Hello. No. 1. No. 2. Prefix. 1. Prefix. 2. Good bye.'
    expected_sentences = [
        'Hello.', 'No. 1.', 'No. 2.', 'Prefix.', '1.', 'Prefix.', '2.', 'Good bye.'
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def test_de():
    splitter = SentenceSplitter(language='de')
    input_text = 'Nie hätte das passieren sollen. Dr. Soltan sagte: "Der Fluxcompensator war doch kalibriert!".'
    expected_sentences = [
        'Nie hätte das passieren sollen.',
        'Dr. Soltan sagte: "Der Fluxcompensator war doch kalibriert!".',
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def test_pt():
    splitter = SentenceSplitter(language='pt')
    input_text = 'Isto é um parágrafo. Contém várias frases. «Mas porquê,» perguntas tu?'
    expected_sentences = [
        "Isto é um parágrafo.",
        "Contém várias frases.",
        "«Mas porquê,» perguntas tu?",
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def split_sents(text, lang):
    if lang in LANG.SPLITTER:
        if lang == 'zh':
            sents = _split_zh(text)
        else:
            splitter = SentenceSplitter(language=lang)
            sents = splitter.split(text=text)
            sents = [sent.strip() for sent in sents]
        return sents
    else:
        raise Exception('The language {} is not supported yet.'.format(LANG.ISO[lang]))
def tokenize(self):
    sentence_splitter = SentenceSplitter(language='en')
    for i, review in enumerate(self.reviews):
        text = review.text
        sentences = sentence_splitter.split(text)
        for sentence in sentences:
            tokenized_sentence = []
            words_borders = list(WordPunctTokenizer().span_tokenize(sentence))
            for word_begin, word_end in words_borders:
                word_text = sentence[word_begin:word_end]
                word = Word(word_text, word_begin, word_end)
                tokenized_sentence.append(word)
            self.reviews[i].sentences.append(tokenized_sentence)
def test_es():
    splitter = SentenceSplitter(language='es')
    input_text = (
        'La UE ofrece una gran variedad de empleos en un entorno multinacional y multilingüe. La Oficina Europea de '
        'Selección de Personal (EPSO) se ocupa de la contratación, sobre todo mediante oposiciones generales.'
    )
    expected_sentences = [
        'La UE ofrece una gran variedad de empleos en un entorno multinacional y multilingüe.',
        ('La Oficina Europea de Selección de Personal (EPSO) se ocupa de la contratación, sobre todo mediante '
         'oposiciones generales.'),
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def test_el():
    splitter = SentenceSplitter(language='el')
    input_text = (
        'Όλα τα συστήματα ανώτατης εκπαίδευσης σχεδιάζονται σε εθνικό επίπεδο. Η ΕΕ αναλαμβάνει κυρίως να συμβάλει '
        'στη βελτίωση της συγκρισιμότητας μεταξύ των διάφορων συστημάτων και να βοηθά φοιτητές και καθηγητές να '
        'μετακινούνται με ευκολία μεταξύ των συστημάτων των κρατών μελών.')
    expected_sentences = [
        'Όλα τα συστήματα ανώτατης εκπαίδευσης σχεδιάζονται σε εθνικό επίπεδο.',
        ('Η ΕΕ αναλαμβάνει κυρίως να συμβάλει στη βελτίωση της συγκρισιμότητας μεταξύ των διάφορων συστημάτων '
         'και να βοηθά φοιτητές και καθηγητές να μετακινούνται με ευκολία μεταξύ των συστημάτων των κρατών '
         'μελών.'),
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def parse_paragraphs(record):
    """parse paragraphs into sentences, returns list"""
    from sentence_splitter import SentenceSplitter

    splitter = SentenceSplitter(language='en')
    sentences = splitter.split(record['value'])
    article_id = remove_prefix(record['key'], 'paragraphs:')
    pre = 'sentence:' + article_id
    l = [{
        'key': f'{pre}',
        'idx': f'{idx}',
        'value': sentence
    } for idx, sentence in enumerate(sentences)]
    return l
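# Hedged usage sketch for parse_paragraphs above (illustrative record, not taken from the source;
# remove_prefix is assumed to strip the given prefix from the key):
#
#   parse_paragraphs({'key': 'paragraphs:42', 'value': 'One. Two.'})
#   # -> [{'key': 'sentence:42', 'idx': '0', 'value': 'One.'},
#   #     {'key': 'sentence:42', 'idx': '1', 'value': 'Two.'}]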
def __init__(self, term_dictionary, language_code, language_name):
    self._term_dictionary = term_dictionary
    self._language_code = language_code
    self._language_name = language_name

    # Prefer an NLTK Punkt tokenizer, then sentence_splitter, then a naive regex fallback
    if self._language_code in punkt_tokenizers:
        splitter = nltk.data.load("tokenizers/punkt/%s" % punkt_tokenizers[self._language_code])
        self.sent_split = splitter.tokenize
    elif self._language_code in splitter_sent_tok:
        splitter = SentenceSplitter(language=self._language_code)
        self.sent_split = splitter.split
    else:
        # If nothing works, use naive sentence splitter
        self.sent_split = partial(
            re.split, r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')

    self._lemmas = set(term_dictionary.values())

    if self._language_code in nltk_stopwords:
        self._stopwords = stopwords.words(nltk_stopwords[self._language_code])
    else:
        print("No stopwords:", self._language_code)

    # Replace punctuation with whitespace
    self.remove_punctuation = partial(regex.sub, r'[\p{P}]+', ' ')

    self.pyphen_dic = pyphen.Pyphen(lang=pyphen_dicts[self._language_code])
def split_to_sentences(text, target_lang='es'):
    '''
    DESCRIPTION: Split text into sentences.

    Parameters
    ----------
    text : str
        String with the entire document.
    target_lang : str, optional
        Language code passed to SentenceSplitter (default 'es').

    Returns
    -------
    sentences : list of str
        List with the sentences of the document.
    '''
    splitter = SentenceSplitter(language=target_lang)
    return splitter.split(text)
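# Hedged usage sketch for split_to_sentences above (illustrative values, not taken from the source);
# the function simply delegates to SentenceSplitter for the given language code:
#
#   sentences = split_to_sentences('Primera frase. Segunda frase.', target_lang='es')
#   # -> ['Primera frase.', 'Segunda frase.']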
def test_en():
    splitter = SentenceSplitter(language='en')

    input_text = 'This is a paragraph. It contains several sentences. "But why," you ask?'
    expected_sentences = [
        'This is a paragraph.', 'It contains several sentences.', '"But why," you ask?'
    ]
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences

    input_text = 'Hey! Now.'
    expected_sentences = ['Hey!', 'Now.']
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences

    input_text = 'Hey... Now.'
    expected_sentences = ['Hey...', 'Now.']
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences

    input_text = 'Hey. Now.'
    expected_sentences = ['Hey.', 'Now.']
    actual_sentences = splitter.split(text=input_text)
    assert expected_sentences == actual_sentences
def gramarize(self, sent):
    # Words/phrases that must never be sent to the grammar checker
    anti_grammar_words = []
    with open('Value-json/anti_grammar.txt') as file:
        for lines in file.read().splitlines():
            lines = lines.strip()
            anti_grammar_words.append(lines)
    with open('Value-json/logic_activation.json') as f:
        activation = json.load(f)
    if activation['grammar_logic'] == "active":
        test_str = sent
        splitter = SentenceSplitter(language='en')
        sente = splitter.split(text=test_str)
        gram_sent = []
        for sent in sente:
            parser = GingerIt()
            # Skip grammar correction for sentences that contain an anti-grammar word
            ani = False
            for ani_words in anti_grammar_words:
                if ani_words in sent:
                    ani = True
                    break
            if not ani:
                output = parser.parse(sent)
                output_1 = output.get("result")
            else:
                output_1 = sent
            gram_sent.append(output_1)
        f_output = ' '.join(gram_sent)
        f_output = f_output + '.'
        f_output = self.remove_trailing_dots(f_output)
        f_output = f_output.replace('..', '.')
        return f_output
    else:
        return sent
class ExtractSentences(jsonql.Transformer):
    def __init__(
        self,
        sp_model: Path,
        lm_model: Path,
        field: str = "raw_content",
        threshold: float = float("+inf"),
    ):
        super().__init__()
        self.sp_model = sp_model
        self.lm_model = lm_model
        self.field = field
        self.threshold = threshold
        self.sp: SentencePieceProcessor = None
        self.lm: KenlmModel = None
        self.splitter: SentenceSplitter = None
        self.hashes: Set[int] = set()

    def _prepare(self):
        self.sp = SentencePieceProcessor()
        self.sp.load(str(self.sp_model))
        self.splitter = SentenceSplitter("en")
        self.lm = KenlmModel(str(self.lm_model))

    def do(self, document: dict) -> Optional[str]:
        content: Optional[str] = document.get(self.field)
        if not content:
            return None
        # Split every non-empty line of the document into sentences
        all_sentences = [
            s for l in content.split("\n") if l for s in self.splitter.split(text=l)
        ]
        # Deduplicate sentences across documents by hash
        unique_sentences = []
        for s in all_sentences:
            if not s:
                continue
            h = dedup.str_hash(s)
            if h in self.hashes:
                continue
            self.hashes.add(h)
            unique_sentences.append(s)

        # Score each sentence with the language model (perplexity over SentencePiece tokens)
        scores = []
        for sentence in unique_sentences:
            normalized = text_normalizer.normalize(sentence)
            pieces = self.sp.encode_as_pieces(normalized)
            log_score = self.lm.score(" ".join(pieces))
            pp = -1
            if len(pieces):
                pp = perplexity.pp(log_score, len(pieces))
            scores.append(pp)

        # Keep sentences whose perplexity is positive and below the threshold
        res = filter(
            lambda pp_s: self.threshold > pp_s[0] > 0,
            zip(scores, unique_sentences),
        )
        return "\n".join(f"{pp}\t{s}" for (pp, s) in res) or None
def test_custom_non_breaking_prefixes():
    with tempfile.NamedTemporaryFile(mode='w+') as f:
        f.write(("# \n"
                 "# Temporary prefix file\n"
                 "# \n"
                 "\n"
                 "Prefix1\n"
                 "Prefix2\n"))
        f.flush()

        splitter = SentenceSplitter(language='xx', non_breaking_prefix_file=f.name)
        input_text = "Hello. Prefix1. Prefix2. Hello again. Good bye."
        expected_sentences = [
            'Hello.',
            'Prefix1. Prefix2. Hello again.',
            'Good bye.',
        ]
        actual_sentences = splitter.split(text=input_text)
        assert expected_sentences == actual_sentences
def tokenize(self):
    sentence_splitter = SentenceSplitter(language='ru')
    for i, review in enumerate(self.reviews):
        text = review.text
        sentences = sentence_splitter.split(text)
        words_borders = list(WordPunctTokenizer().span_tokenize(text))
        for sentence in sentences:
            tokenized_sentence = []
            sentence_begin = text.find(sentence)
            sentence_end = sentence_begin + len(sentence)
            for word_begin, word_end in words_borders:
                if word_begin >= sentence_begin and word_end <= sentence_end:
                    word_text = text[word_begin:word_end]
                    word = Word(word_text, word_begin, word_end)
                    # Link the word to every aspect opinion whose span covers it
                    for opinion in review.aspects:
                        if word.begin >= opinion.begin and word.end <= opinion.end:
                            word.add_opinion(opinion)
                            opinion.words.append(word)
                    tokenized_sentence.append(word)
            self.reviews[i].sentences.append(tokenized_sentence)
class PipelineSyntaxNet(object):
    def __init__(self, host, port):
        self.word_tokeniser_ = create_tokenizer_ru()
        self.sent_splitter_ = SentenceSplitter()
        self.syntaxnet_parser_ = ProcessorSyntaxNet(host, port)

    def process(self, text, raw_output=False):
        tokens = list(self.word_tokeniser_.span_tokenize(text))
        sents = self.sent_splitter_.process(text, tokens)
        trees = self.syntaxnet_parser_.parse(text, sents, raw_output=raw_output)
        return trees
class Embedder:
    def __init__(self):
        self.model = SentenceTransformer("LaBSE")
        self.en_sent_splitter = SentenceSplitter(language="en")

    def encode(self, text, lang):
        sentences = None
        if lang == "en":
            sentences = self.en_sent_splitter.split(text)
        elif lang == "ne":
            sentences = sentence_tokenize.sentence_split(text, "ne")
        filtered_sentences = [
            sentence for sentence in sentences
            if len(sentence.split()) > 3 and detect(sentence) == lang
        ]
        return filtered_sentences, self.model.encode(filtered_sentences)
def __process_line(line: str, output_file: TextIO, sentence_splitter: SentenceSplitter,
                   morph_predictor: RNNMorphPredictor):
    sentences = sentence_splitter.split(line)
    for sentence in sentences:
        words = [
            token.text for token in Tokenizer.tokenize(sentence)
            if token.text != '' and token.token_type != Token.TokenType.SPACE
        ]
        if not words:
            continue
        forms = morph_predictor.predict_sentence_tags(words)
        for form in forms:
            if form.pos == "PUNCT":
                continue
            output_file.write(
                "%s\t%s\t%s\t%s\n" % (form.word, form.normal_form, form.pos, form.tag))
        output_file.write("\n")
def get_morph_markup(input_filenames: List[str], output_filename: str):
    """
    Markup by grammatical values (morphological markup).

    :param input_filenames: input text files
    :param output_filename: path to the file where the markup will be saved
    """
    if os.path.exists(output_filename):
        os.remove(output_filename)
    sentence_splitter = SentenceSplitter(language='ru')
    morph_predictor = RNNMorphPredictor()
    for filename in input_filenames:
        with open(filename, "r", encoding="utf-8") as r, \
                open(output_filename, "w+", encoding="utf-8") as w:
            for line in r:
                Morph.__process_line(line, w, sentence_splitter, morph_predictor)
def __init__(self, tokenizer: PreTrainedTokenizer, args, dir_path: str, block_size=1024):
    self.examples = []
    tokenizer_class = tokenizer.__class__.__name__
    cached_features_file = os.path.join(
        dir_path,
        args.model_type + "_cached2_maskedsents3_" + str(block_size) + "_" + tokenizer_class)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, "rb") as handle:
            self.examples = pickle.load(handle)
    else:
        logger.info("Creating features from dataset file at %s", dir_path)
        good_docs = bad_docs = 0
        for filename in os.listdir(dir_path):
            try:
                if not filename.endswith(".json"):
                    continue
                path = os.path.join(dir_path, filename)
                with open(path) as json_file:
                    data = json.load(json_file)
                facts_doc = FactsDoc.Schema().load(data)

                # Pick two anchor sentences and treat the span between them as the masked region
                splitter = SentenceSplitter(language='en')
                full_text_sentence_split = splitter.split(text=facts_doc.text)
                sent_one = full_text_sentence_split[START_SENT]
                sent_two = full_text_sentence_split[END_SENT]
                inbetween_text = " ".join(full_text_sentence_split[START_SENT + 1:END_SENT])

                tokenized_sent_one = tokenizer.encode(
                    sent_one, add_special_tokens=False, return_tensors="pt").squeeze(0)
                tokenized_sent_two = tokenizer.encode(
                    sent_two, add_special_tokens=False, return_tensors="pt").squeeze(0)
                tokenized_inbetween_text = tokenizer.encode(
                    inbetween_text, add_special_tokens=False, return_tensors="pt").squeeze(0)

                full_text_tensor = torch.cat(
                    [tokenized_sent_one, tokenized_inbetween_text, tokenized_sent_two], dim=0)
                # Mask is 1 for the two anchor sentences and 0 for the text in between
                mask = torch.cat([
                    torch.ones(tokenized_sent_one.size()),
                    torch.zeros(tokenized_inbetween_text.size()),
                    torch.ones(tokenized_sent_two.size())
                ])
                self.examples.append((full_text_tensor, mask))
                good_docs += 1
            except Exception:
                bad_docs += 1
        logger.info("finished creating examples for " + dir_path)
        logger.info(f"docs with exceptions = {bad_docs} from a total of {bad_docs + good_docs}")
        logger.info("Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
import re

from pyvi import ViTokenizer
from joblib import load
from sentence_splitter import SentenceSplitter

splitter = SentenceSplitter(language='en')

from cfg.config import SENTIMENT_MODEL_PATH

"""
date format: d/m/y
date_range format: (d/m/y, d/m/y)
check if date is in the date_range
"""


def date_in_range(date, date_range):
    try:
        if date and date_range:
            date_start = date_range[0]
            date_end = date_range[1]
            date_tuple = [int(elem) for elem in reversed(date.split("/"))]
            date_start_tuple = [
                int(elem) for elem in reversed(date_start.split("/"))
            ]
            date_end_tuple = [
                int(elem) for elem in reversed(date_end.split("/"))
            ]
            return date_start_tuple < date_tuple < date_end_tuple
        else:
            return True
    except Exception as e:
        print("Error checking date in range " + str(e))
        return True
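# Hedged usage sketch for date_in_range above (illustrative values, not taken from the source).
# Dates are compared as [year, month, day] lists, so the check is a strict lexicographic range test:
#
#   date_in_range("15/6/2021", ("1/1/2021", "31/12/2021"))  # -> True
#   date_in_range("15/6/2022", ("1/1/2021", "31/12/2021"))  # -> False
#   date_in_range(None, ("1/1/2021", "31/12/2021"))         # -> True (missing date falls back to True)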
def test_invalid_language_code():
    """Invalid language code."""
    with pytest.raises(SentenceSplitterException):
        SentenceSplitter(language='/etc/passwd')
def split_text_to_sentences(self, text: str) -> List[str]:
    """Splits text into sentences with the "sentence_splitter" module.

    Language code will be read from the language_code() method."""
    text = decode_object_from_bytes_if_needed(text)

    language_code = self.language_code()

    if self.__sentence_splitter is None:
        try:
            self.__sentence_splitter = SentenceSplitter(language=language_code)
        except Exception as ex:
            raise McLanguageException(
                "Unable to initialize sentence splitter for language '%s': %s" % (language_code, str(ex),)
            )

    if text is None:
        log.warning("Text is None.")
        return []

    # Sentence tokenizer can hang for a very long time on very long text, and anything greater than 1 MB is more
    # likely to be an artifact than actual text
    if len(text) > self.__MAX_TEXT_LENGTH:
        text = text[:self.__MAX_TEXT_LENGTH]

    # Only "\n\n" (not a single "\n") denotes the end of a sentence, so remove single line breaks
    text = re.sub('([^\n])\n([^\n])', r"\1 \2", text, flags=re.DOTALL)

    # Remove asterisks from lists
    text = re.sub(r" {2}\*", " ", text, flags=re.DOTALL)
    text = re.sub(r"\n\s\*\n", "\n\n", text, flags=re.DOTALL)
    text = re.sub(r"\n\n\n\*", "\n\n", text, flags=re.DOTALL)
    text = re.sub(r"\n\n", "\n", text, flags=re.DOTALL)

    # Replace tabs with spaces
    text = re.sub(r"\t", " ", text, flags=re.DOTALL)

    # Replace non-breaking spaces with normal spaces
    text = re.sub(r"\xa0", " ", text, flags=re.DOTALL)

    # Replace multiple spaces with a single space
    text = re.sub(" +", " ", text, flags=re.DOTALL)

    # The above regexp and HTML stripping often leave a space before the period at the end of a sentence
    text = re.sub(r" +\.", ".", text, flags=re.DOTALL)

    # We see lots of cases of missing spaces after sentence-ending periods (has a hardcoded lower limit of
    # characters because otherwise it breaks Portuguese "a.C.." abbreviations and such)
    text = re.sub(r"([a-z]{2,})\.([A-Z][a-z]+)", r"\1. \2", text, flags=re.DOTALL)

    # Replace Unicode's "…" with "..."
    text = text.replace("…", "...")

    # Trim whitespace from start / end of the whole string
    text = text.strip()

    # FIXME: fix "bla bla... yada yada"? is it two sentences?
    # FIXME: fix "text . . some more text."?

    if len(text) == 0:
        log.debug("Text is empty after processing it.")
        return []

    # Split to sentences
    sentences = self.__sentence_splitter.split(text=text)

    # Trim whitespace from start / end of each of the sentences
    non_empty_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if len(sentence) > 0:
            non_empty_sentences.append(sentence)

    return non_empty_sentences
def test_text_empty():
    """Text is empty."""
    splitter = SentenceSplitter(language='en')
    assert splitter.split(text='') == []