Example #1
selected_text_predicted = []
for doc, labels in zip(train_docs, train_bool_labels):
    predicted_text = patch_whitespace(doc, labels)
    selected_text_predicted.append(predicted_text)

train_preds_to_df = train_sentiments_featurized.filter(items=[stg.ID_COL])\
                                               .assign(sentence_pred=selected_text_predicted)

train_all_sentiments = pd.merge(left=train,
                                right=train_preds_to_df,
                                on=stg.ID_COL, how='left')\
    .assign(sentence_pred=lambda df: np.where(df['sentence_pred'].isna(), df['text'], df['sentence_pred']))\
    .assign(sentence_pred=lambda df: np.where(df['sentence_pred'] == '', df['text'], df['sentence_pred']))

train_score = jaccard_score(y_true=train_all_sentiments[stg.SELECTED_TEXT_COL],
                            y_pred=train_all_sentiments['sentence_pred'])

print('--------------------------')
print(f'train score: {train_score}')
logging.info(f'train score: {train_score}')
print('--------------------------')
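
# For reference, a minimal sketch of the per-sample word-level Jaccard that a
# corpus-level jaccard_score like the one above presumably averages over rows.
# It follows the standard Tweet Sentiment Extraction formulation and is an
# assumption, not this project's actual implementation.
def word_jaccard(str1, str2):
    a = set(str(str1).lower().split())
    b = set(str(str2).lower().split())
    if not a and not b:
        return 1.0
    c = a & b
    return len(c) / (len(a) + len(b) - len(c))
# e.g. word_jaccard('so much love', 'much love') == 2 / 3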

validation_pred = BLSTM.predict(X_test_word=x_validation_indexed,
                                X_test_features=validation_extra_features)

validation_bool_labels = [[1 if score[0] > 0.5 else 0 for score in scores]
                          for scores in validation_pred]
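
# Hypothetical illustration of the 0.5 thresholding above: for one tweet with
# per-token scores [[0.2], [0.7], [0.9]], the boolean labels become [0, 1, 1].
assert [1 if s[0] > 0.5 else 0 for s in [[0.2], [0.7], [0.9]]] == [0, 1, 1]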

selected_text_predicted = []
for doc, labels in zip(validation_docs, validation_bool_labels):
    predicted_text = patch_whitespace(doc, labels)
    selected_text_predicted.append(predicted_text)

Example #2
with torch.no_grad():
    tag_scores_list = []

    for sentence, _ in train_preprocessed_rnn:
        inputs = prepare_sequence(sentence, dictionary.token2id)
        tag_scores = model(inputs)
        tag_scores_list.append(tag_scores)

# tag_scores_1d = [[x[1] for x in list_pred] for list_pred in tag_scores_list]
train_bool_pred = [[True if score > 0.5 else False for score in list_pred]
                   for list_pred in tag_scores_list]

# TODO: rebuild full train dataset (with neutral sentiment) before performing jaccard_score
train_model_pred = [
    ' '.join(np.array(sentence)[pred])
    for sentence, pred in zip(train_data['tokens_text'], train_bool_pred)
]
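
# Worked illustration of the boolean-mask token selection used to build
# train_model_pred above, with hypothetical tokens and per-token predictions.
import numpy as np

example_tokens = ['so', 'much', 'love']
example_pred = [False, True, True]
assert ' '.join(np.array(example_tokens)[example_pred]) == 'much love'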

train_score = jaccard_score(y_true=train[stg.SELECTED_TEXT_COL],
                            y_pred=train_model_pred)
print('--------------------------')
print(f'train score: {train_score}')
print('--------------------------')

with torch.no_grad():
    tag_scores_list = []

    for sentence, _ in validation_preprocessed_rnn:
        inputs = prepare_sequence(sentence, dictionary.token2id)
        tag_scores = model(inputs)
        tag_scores_list.append(tag_scores)

# tag_scores_1d = [[x[1] for x in list_pred] for list_pred in tag_scores_list]
validation_bool_pred = [[True if score > 0.5 else False for score in list_pred]
                        for list_pred in tag_scores_list]

Example #3
from os.path import join

import pandas as pd

from tweet_sentiment_extraction.domain.sentence_constructor import SentenceConstructor as sc
from tweet_sentiment_extraction.utils.metrics import jaccard_score

import tweet_sentiment_extraction.settings as stg

train = pd.read_csv(join(stg.PROCESSED_DATA_DIR, 'train.csv'))
validation = pd.read_csv(join(stg.PROCESSED_DATA_DIR, 'validation.csv'))
test = pd.read_csv(join(stg.PROCESSED_DATA_DIR, 'test.csv'))

train_tokens = SentenceCleaner.add_tokenized_column(df=train, column_name_to_tokenize=stg.TEXT_COL)\
                              .dropna()\
                              .rename(columns={stg.SELECTED_TEXT_COL: stg.SENTENCE_TARGET_COL,
                                               stg.TOKENS_TEXT_COL: stg.TOKENS_PRED_COL})
train_predictions = sc.add_sentence_pred_from_tokens_col(df=train_tokens)

train_score = jaccard_score(y_true=train_predictions[stg.SENTENCE_TARGET_COL],
                            y_pred=train_predictions[stg.SENTENCE_PRED_COL])
print(f'Train score: {train_score}')

validation_tokens = SentenceCleaner.add_tokenized_column(df=validation, column_name_to_tokenize=stg.TEXT_COL)\
                                   .rename(columns={stg.SELECTED_TEXT_COL: stg.SENTENCE_TARGET_COL,
                                                    stg.TOKENS_TEXT_COL: stg.TOKENS_PRED_COL})
validation_predictions = sc.add_sentence_pred_from_tokens_col(
    df=validation_tokens)

validation_score = jaccard_score(
    y_true=validation_predictions[stg.SENTENCE_TARGET_COL],
    y_pred=validation_predictions[stg.SENTENCE_PRED_COL])
print(f'Validation score: {validation_score}')

test_tokens = SentenceCleaner.add_tokenized_column(df=test, column_name_to_tokenize=stg.TEXT_COL)\
                             .rename(columns={stg.TOKENS_TEXT_COL: stg.TOKENS_PRED_COL})
test_predictions = sc.add_sentence_pred_from_tokens_col(df=test_tokens)
Example #4
def toto(x):
    # NOTE: the body of this helper was truncated in the source; this is an
    # assumed reconstruction that joins the predicted entity texts into one
    # string, falling back to a single space when that is not possible.
    try:
        return ' '.join(x)
    except TypeError:
        return ' '


sys.exit()

train_pred = [[
    ent.text if ent.label_ == sentiment else 'no' for ent in doc.ents
    if ent.label_ == sentiment
] for doc, sentiment in zip(nlp.pipe(train['text']), train['sentiment'])]

train = train.assign(
    spacy_pred=train_pred,
    sentence_pred=lambda df: np.where(df['sentiment'] == 'neutral', df['text'],
                                      df['spacy_pred'].apply(toto)))

train_score = jaccard_score(y_true=train['selected_text'],
                            y_pred=train['sentence_pred'])
print('--------------------------')
print(f'Train score: {train_score}')
print('--------------------------')
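
# Minimal, self-contained illustration (toy data, hypothetical values) of the
# neutral-passthrough rule used in the assign above: neutral tweets keep their
# full text, other tweets use the joined spaCy entity predictions.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'sentiment': ['neutral', 'positive'],
                    'text': ['had a quiet day', 'so much love today'],
                    'spacy_pred': [[], ['love']]})
toy['sentence_pred'] = np.where(toy['sentiment'] == 'neutral',
                                toy['text'],
                                toy['spacy_pred'].apply(' '.join))
# toy['sentence_pred'].tolist() -> ['had a quiet day', 'love']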

validation_pred = [[
    ent.text if ent.label_ == sentiment else 'no' for ent in doc.ents
    if ent.label_ == sentiment
] for doc, sentiment in zip(nlp.pipe(validation['text']),
                            validation['sentiment'])]

validation = validation.assign(
    spacy_pred=validation_pred,
    sentence_pred=lambda df: np.where(df['sentiment'] == 'neutral', df['text'],
                                      df['spacy_pred'].apply(toto)))