Example #1
# Assumes the legacy SoMaJo (pre-2.0) API, which exposes Tokenizer and
# SentenceSplitter at the top level.
from somajo import SentenceSplitter, Tokenizer


class NERTokenizer:
    def __init__(self):
        # Word tokenizer; also splits camelCase words and returns plain strings.
        self._word_tokenizer = Tokenizer(split_camel_case=True,
                                         token_classes=False,
                                         extra_info=False)
        # Groups the token stream into sentences.
        self._sentence_splitter = SentenceSplitter()

    def parse_text(self, text):
        tokens = self._word_tokenizer.tokenize_paragraph(text)

        sentences_tokenized = self._sentence_splitter.split(tokens)

        sentences = []
        for sen in sentences_tokenized:
            # Strip stray spaces inside tokens and skip empty sentences.
            sen = [tok.replace(" ", "") for tok in sen]
            if not sen:
                continue
            # Pair each sentence with an (initially empty) label list.
            sentences.append((sen, []))

        return sentences
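
A minimal usage sketch (the sample text is invented; each returned item pairs a token list with an empty label slot):

tokenizer = NERTokenizer()
for tokens, labels in tokenizer.parse_text("Angela Merkel besuchte Paris. Sie traf den Präsidenten."):
    print(tokens, labels)  # labels is [] until the NER model fills it in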
Example #2
# Assumes the legacy SoMaJo (pre-2.0) API and the clean-text package.
from cleantext import clean
from somajo import SentenceSplitter, Tokenizer


def get_sents(texts):
    tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
    sentence_splitter = SentenceSplitter(is_tuple=False)

    results = []
    for text in texts:
        # Optional pre-cleaning step:
        # text = clean(text, lang='de', lower=False)
        tokens = tokenizer.tokenize_paragraph(text)
        sentences = sentence_splitter.split(tokens)
        # Re-join each sentence and normalize it: strip URLs, digits,
        # punctuation, and line breaks.
        cleaned = [clean(' '.join(s), no_urls=True, no_digits=True, no_punct=True,
                         no_line_breaks=True, lang='de')
                   for s in sentences]
        results.append(cleaned)
    return results
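
A quick call sketch (the input texts are invented):

docs = ["Das ist ein Test. NLP macht Spaß!", "Noch ein Dokument."]
for doc_sents in get_sents(docs):
    print(doc_sents)  # one list of cleaned sentence strings per input text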
Example #3
def predict(input_text, model=learner):
    # Detect the input language; doc._.language is populated by a
    # language-detection component (e.g. spacy-langdetect) on the nlp pipeline.
    doc = nlp(input_text)
    if 'en' in doc._.language['language']:
        tokenizer = Tokenizer(language="en")
        # Remove sentence punctuation, then emit one 'BOS' placeholder label
        # per remaining token.
        input_txt = ' '.join(token for token in tokenizer.tokenize_paragraph(input_text)
                             if token not in [',', '.', '?', '!'])
        labels = 'BOS ' * len(tokenizer.tokenize_paragraph(input_txt))
    elif 'de' in doc._.language['language']:
        tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
        input_txt = ' '.join(token for token in tokenizer.tokenize_paragraph(input_text)
                             if token not in [',', '.', '?', '!'])
        labels = 'BOS ' * len(tokenizer.tokenize_paragraph(input_txt))
    else:
        # French and all remaining languages share the same fallback: strip
        # punctuation with a regex instead of the tokenizer.
        tokenizer = Tokenizer(language="en")
        input_txt = re.sub(r'[,.?!]', '', input_text).strip()
        labels = 'BOS ' * len(tokenizer.tokenize_paragraph(input_txt))

    if not input_txt:
        return input_txt

    # Placeholder value: the downstream data loader expects a language column,
    # but the detected language is not used here.
    language = 'English'

    # Write the single example to CSV and run it through the BERT model.
    X = pd.DataFrame([(input_txt, labels, language)], columns=['Sentences', 'labels', 'language'])
    X.to_csv('/data/vchordia/sen_boundary/X.csv', index=False)
    dl = get_data_loader_for_predict(data, df_path="/data/vchordia/sen_boundary/X.csv")
    preds = model.predict(dl)  # use the model argument rather than the global learner
    pred_tokens, pred_labels = bert_labels2tokens(dl, preds[0])
    res_str = final_str(pred_tokens, pred_labels)
    return res_str
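
predict() leans on several globals the snippet does not define: nlp, learner, data, pd, re, and the helper functions. A minimal setup sketch, assuming spaCy 2.x with the spacy-langdetect component (which populates doc._.language); the BERT-specific names are left to the surrounding NER toolkit:

import re

import pandas as pd
import spacy
from spacy_langdetect import LanguageDetector

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

# learner, data, get_data_loader_for_predict, bert_labels2tokens, and
# final_str are provided by the NER-BERT training code this example belongs to.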