Example No. 1
    @classmethod
    def setUpClass(cls):
        # setUpClass runs once per TestCase, so it must be a classmethod and
        # store the shared fixtures as class attributes.
        cls._text = 'Texto para teste. Este texto contém 3 frases. 3a frase.'
        cls._st1, cls._st2, cls._st3 = utils.get_sentence_tokens(cls._text)

        cls._st1_words = utils.get_word_tokens(cls._st1._text)
        cls._st2_words = utils.get_word_tokens(cls._st2._text)
        cls._st3_words = utils.get_word_tokens(cls._st3._text)

        cls._created_file_name = "./resources/test.conll"
        cls._correct_file_name = "./resources/correct_test.conll"
Example No. 2
def get_preds_df(preds, input_texts):
    # Flatten the predictions (five candidates per input) into one column.
    preds_df = pd.DataFrame({"pred": preds.values.reshape(-1, 1).ravel()})
    # Repeat each input text five times so it lines up with its candidates.
    preds_df["input"] = np.array([[i] * 5 for i in input_texts]).ravel()
    # Length, in word tokens, of every prediction and input.
    preds_df["pred_len"] = preds_df["pred"].apply(
        lambda x: len(get_word_tokens(x)))
    preds_df["input_len"] = preds_df["input"].apply(
        lambda x: len(get_word_tokens(x)))
    # Cosine similarity and ROUGE-L between each prediction and its input.
    preds_df["cosine_sim"] = preds_df.apply(
        lambda x: get_similarities(model, tokenizer, x["pred"], x["input"]),
        axis=1)
    preds_df["cosine_sim"] = preds_df["cosine_sim"].apply(lambda x: x[0][0])
    preds_df["rouge_l"] = preds_df.apply(
        lambda x: get_rougel(x["pred"], x["input"]), axis=1)
    return preds_df
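
The only non-obvious step above is the flatten-and-repeat alignment: `preds` is expected to hold five candidate predictions per input, one row per input. A minimal, self-contained sketch of that alignment, with toy strings in place of real model output:

import numpy as np
import pandas as pd

# Two inputs, five candidate predictions each (toy data).
preds = pd.DataFrame([[f"cand {i}-{j}" for j in range(5)] for i in range(2)])
input_texts = ["input 0", "input 1"]

# Flatten row-wise so the five candidates of the same input stay adjacent...
long_df = pd.DataFrame({"pred": preds.values.reshape(-1, 1).ravel()})
# ...and repeat every input five times to line up with its candidates.
long_df["input"] = np.array([[i] * 5 for i in input_texts]).ravel()
print(long_df)  # 10 rows: ("cand 0-0", "input 0") ... ("cand 1-4", "input 1")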
Example No. 3
    def test_fit_known_tokens_create_token_to_the_left(self):
        # Create a token positioned inside the first word
        text = 'to'  # Tex-to
        init = self._text.find(text)
        end = init + len(text) - 1
        known_token = Token(text, init, end, "teste")

        # Only the first sentence will be used
        text = self._st1._text
        word_tokens_before = utils.get_word_tokens(text)

        pipeline = NerCorpusPipeline(text, [known_token])
        pipeline.apply_processing_rules()

        word_tokens_after = pipeline.word_tokens

        # Ensure the words have a valid structure
        for token in word_tokens_after:
            self.assertTrue(self._text[token._init_index:token._end_index +
                                       1] == token._text)

        # Ensure the known tokens have a valid structure
        for token in pipeline.known_tokens:
            self.assertTrue(self._text[token._init_index:token._end_index +
                                       1] == token._text)

        self.assertTrue(len(word_tokens_after) == 4)
        self.assertTrue(
            set(['Tex', 'to', 'para', 'teste', '.']) == set([
                t._text for t in utils.sort_tokens(pipeline.known_tokens +
                                                   word_tokens_after)
            ]))
Example No. 4
def get_rf_from_dev(dev_df, preds_dev, max_depth=None, random_state=19):
    preds_df = preds_dev.copy()
    # Collect every reference output written for the same input sentence.
    dev_df_grouped = dev_df.groupby("input").agg({
        "output": list,
        "cosine_sim": list,
        "rouge_l": list,
        "input_len": max,
        "output_len": list
    }).reset_index()
    # Repeat the reference list five times so it lines up with the five
    # candidates per input, then wrap each reference for corpus_sari.
    preds_df["ref"] = [
        l for sublist in dev_df_grouped["output"].apply(
            lambda x: [x] * 5).tolist() for l in sublist
    ]
    preds_df["ref"] = preds_df["ref"].apply(lambda x: [[i] for i in x])

    preds_df["pred_len"] = preds_df["pred"].apply(
        lambda x: len(get_word_tokens(x)))
    preds_df["input_len"] = preds_df["input"].apply(
        lambda x: len(get_word_tokens(x)))

    # SARI of each candidate against its references (the regression target).
    preds_df["sari"] = preds_df.apply(lambda x: corpus_sari(
        orig_sents=[x["input"]],
        sys_sents=[x["pred"]],
        refs_sents=x["ref"],
    ),
                                      axis=1)

    # Regressor that predicts SARI from the four cheap candidate features.
    rf = RandomForestRegressor(n_estimators=1000,
                               max_depth=max_depth,
                               n_jobs=-1,
                               random_state=random_state)

    X_train = preds_df[["cosine_sim", "rouge_l", "input_len", "pred_len"]]
    y_train = preds_df["sari"]

    rf.fit(X_train, y_train)

    return rf, preds_df
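
The returned regressor maps the four cheap features to a predicted SARI score. How it is used afterwards is not shown here; a plausible (assumed) follow-up is to re-rank the five candidates of each input by predicted SARI, sketched below with toy data:

import pandas as pd
from sklearn.ensemble import RandomForestRegressor

features = ["cosine_sim", "rouge_l", "input_len", "pred_len"]

# Toy frame standing in for get_preds_df() output: 2 inputs x 5 candidates.
toy = pd.DataFrame({
    "input": ["a"] * 5 + ["b"] * 5,
    "pred": [f"cand {i}" for i in range(10)],
    "cosine_sim": [0.9, 0.7, 0.8, 0.6, 0.5, 0.95, 0.4, 0.3, 0.2, 0.1],
    "rouge_l": [0.5, 0.4, 0.6, 0.3, 0.2, 0.7, 0.1, 0.2, 0.3, 0.4],
    "input_len": [7] * 5 + [9] * 5,
    "pred_len": [5, 6, 4, 7, 8, 5, 9, 3, 2, 1],
})
toy["sari"] = 40 * toy["cosine_sim"]  # fake target, only to have something to fit

rf = RandomForestRegressor(n_estimators=10, random_state=19)
rf.fit(toy[features], toy["sari"])

# Re-rank: predict SARI for every candidate and keep the best one per input.
toy["pred_sari"] = rf.predict(toy[features])
best = toy.loc[toy.groupby("input")["pred_sari"].idxmax(), ["input", "pred"]]
print(best)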
Example No. 5
    tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_nlu_ru")
    model = AutoModel.from_pretrained("sberbank-ai/sbert_large_nlu_ru")
    model.to("cuda")

    # validation dataset
    dev_df = pd.read_csv(DEV_PATH, index_col=0)
    dev_df.columns = ["input", "output"]

    dev_df["cosine_sim"] = dev_df.apply(
        lambda x: get_similarities(model, tokenizer, x["input"], x["output"]),
        axis=1)
    dev_df["cosine_sim"] = dev_df["cosine_sim"].apply(lambda x: x[0][0])
    dev_df["rouge_l"] = dev_df.apply(
        lambda x: get_rougel(x["input"], x["output"]), axis=1)
    dev_df["input_len"] = dev_df["input"].apply(
        lambda x: len(get_word_tokens(x)))
    dev_df["output_len"] = dev_df["output"].apply(
        lambda x: len(get_word_tokens(x)))
    dev_df.to_csv(OUTPUT_DIR / "dev_df_metrics.csv", index=False)

    # train dataset
    dfs = [
        pd.read_csv(path, usecols=["target_x", "target_y"])
        for path in WIKI_DIR.glob("*")
    ]
    wiki_df = pd.concat(dfs).reset_index(drop=True)
    wiki_df.columns = ["input", "output"]

    wiki_df["cosine_sim"] = wiki_df.apply(
        lambda x: get_similarities(model, tokenizer, x["input"], x["output"]),
        axis=1)
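
`get_similarities` itself is not shown in these snippets (the `x[0][0]` indexing suggests it returns a similarity matrix). A common way to obtain sentence embeddings from sberbank-ai/sbert_large_nlu_ru is mean pooling over the last hidden states; the sketch below is an assumption about that helper, not the author's implementation, and it returns a plain scalar:

import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

def sbert_cosine(model, tokenizer, text_a, text_b):
    # Encode both sentences in one padded batch.
    batch = tokenizer([text_a, text_b], padding=True, truncation=True,
                      return_tensors="pt")
    with torch.no_grad():
        hidden = model(**batch).last_hidden_state         # (2, seq_len, dim)
    mask = batch["attention_mask"].unsqueeze(-1).float()  # ignore padding
    emb = (hidden * mask).sum(dim=1) / mask.sum(dim=1)    # mean pooling
    return F.cosine_similarity(emb[0:1], emb[1:2]).item()

tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_nlu_ru")
model = AutoModel.from_pretrained("sberbank-ai/sbert_large_nlu_ru")
print(sbert_cosine(model, tokenizer, "Это пример текста.", "Это упрощённый пример."))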
Example No. 6
    def __init__(self, text, known_tokens):
        self._text = text
        # TODO: Guarantee that all known tokens do not intersect
        self.known_tokens = known_tokens
        self.sentences_tokens = get_sentence_tokens(text)
        self.word_tokens = get_word_tokens(text, 'O')
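
Putting this constructor together with the test in Example No. 3, construction and processing would look roughly as follows; the import path, the sample sentence, and the "PER" label are hypothetical:

from ner_corpus import NerCorpusPipeline, Token, utils  # hypothetical import path

text = 'Maria mora em Lisboa.'
init = text.find('Maria')
known = Token('Maria', init, init + len('Maria') - 1, 'PER')  # inclusive end index

pipeline = NerCorpusPipeline(text, [known])
pipeline.apply_processing_rules()

# Known tokens and the remaining word tokens, in document order.
for tok in utils.sort_tokens(pipeline.known_tokens + pipeline.word_tokens):
    print(tok._text, tok._init_index, tok._end_index)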
Example No. 7
    def test_word_tokenizer(self):
        word_tokens = utils.get_word_tokens(self._text)
        for token in word_tokens:
            self.assertTrue(self._text[token._init_index:token._end_index +
                                       1] == token._text)