Example #1
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model


def get_predictions(path,
                    rescaled,
                    original,
                    LSTM_ind=False,
                    threshold_fixed=0.5):
    """Flag each sample as anomalous (1) or normal (0) by thresholding its
    reconstruction error."""
    auto_encoder = load_model(path)
    # threshold_fixed can instead be tuned on a validation set with
    # chose_weights_test_results (Example #2).
    valid_x_predictions = auto_encoder.predict(rescaled)
    if LSTM_ind:
        # `flatten` is a project helper (see the sketch after this example)
        # that collapses the 3-D LSTM input (samples, timesteps, features)
        # to 2-D before the error is computed.
        mse = np.mean(np.power(
            flatten(rescaled) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': list(original)
        })
    else:
        mse = np.mean(np.power(rescaled - valid_x_predictions, 2), axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': original['y']
        })

    pred_y = [
        1 if e > threshold_fixed else 0
        for e in error_df.Reconstruction_error.values
    ]
    return pred_y
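
Every LSTM branch in Examples #1 through #5 calls a `flatten` helper that these snippets do not define. A minimal sketch is below, assuming the common LSTM-autoencoder convention of keeping only the last timestep of each (samples, timesteps, features) array; the project's actual helper may differ.

import numpy as np

def flatten(X):
    # Collapse a 3-D array (samples, timesteps, features) into a 2-D array
    # (samples, features) by keeping each sequence's last timestep.
    # NOTE: assumed implementation; the original helper is not shown above.
    flattened_X = np.empty((X.shape[0], X.shape[2]))
    for i in range(X.shape[0]):
        flattened_X[i] = X[i, X.shape[1] - 1, :]
    return flattened_X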
Example #2
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model


def chose_weights_test_results(negative_weight,
                               positive_weight,
                               path,
                               rescaled,
                               original,
                               LSTM_ind=False):
    """Pick the reconstruction-error threshold that minimizes the weighted
    cost of false positives (negative_weight) and false negatives
    (positive_weight) on the validation set."""
    auto_encoder = load_model(path)
    # Predictions on validation set
    valid_x_predictions = auto_encoder.predict(rescaled)
    if LSTM_ind:
        mse = np.mean(np.power(
            flatten(rescaled) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': list(original)
        })
    else:
        mse = np.mean(np.power(rescaled - valid_x_predictions, 2), axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': original['y']
        })
    prob = []
    cost_list = []
    fp_values = []
    fn_values = []
    # Choose the threshold on the validation set: scan candidates from 0.00
    # to 0.39 in steps of 0.01 and keep the one with the lowest cost.
    for i in [x / 100.0 for x in range(40)]:
        pred_y = [
            1 if e > i else 0 for e in error_df.Reconstruction_error.values
        ]
        true_y = list(map(int, error_df.True_class.values))
        # False positives: predicted anomalous (1) but actually normal (0).
        c1 = [
            x and y
            for x, y in zip([x == 1 for x in pred_y], [x == 0 for x in true_y])
        ]
        # False negatives: predicted normal (0) but actually anomalous (1).
        c2 = [
            x and y
            for x, y in zip([x == 0 for x in pred_y], [x == 1 for x in true_y])
        ]
        fp_values.append(sum(c1))
        fn_values.append(sum(c2))
        # Weighted cost: each false positive costs negative_weight and each
        # false negative costs positive_weight.
        cost = np.sum(
            np.array(c1) * negative_weight + np.array(c2) * positive_weight)
        prob.append(i)
        cost_list.append(cost)
    return prob[cost_list.index(min(cost_list))]
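
A hedged sketch of how Examples #1 and #2 might be chained; the model path, arrays, and weights below are placeholder names, not part of the original code.

# Hypothetical usage: tune the threshold on a validation set, then predict.
# 'model.h5', X_valid_scaled, and valid_df are placeholders.
threshold = chose_weights_test_results(negative_weight=1,
                                       positive_weight=5,
                                       path='model.h5',
                                       rescaled=X_valid_scaled,
                                       original=valid_df)
pred_y = get_predictions('model.h5', X_valid_scaled, valid_df,
                         threshold_fixed=threshold)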
Example #3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import load_model


def test_metrics_print(path,
                       rescaled,
                       original,
                       LSTM_ind=False,
                       threshold_fixed=0.5):
    """Plot the confusion matrix and print sensitivity, specificity,
    precision, and accuracy (as percentages)."""
    auto_encoder = load_model(path)
    # threshold_fixed can instead be tuned with chose_weights_test_results
    # (Example #2).
    valid_x_predictions = auto_encoder.predict(rescaled)
    if LSTM_ind:
        mse = np.mean(np.power(
            flatten(rescaled) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': list(original)
        })
    else:
        mse = np.mean(np.power(rescaled - valid_x_predictions, 2), axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': original['y']
        })
    pred_y = [
        1 if e > threshold_fixed else 0
        for e in error_df.Reconstruction_error.values
    ]
    predictions = pd.DataFrame({
        'true': error_df.True_class,
        'predicted': pred_y
    })
    conf_matrix = confusion_matrix(error_df.True_class, pred_y)
    fig = plt.figure(figsize=(8, 8))
    LABELS = ["Normal", "Break"]
    sns.heatmap(conf_matrix,
                xticklabels=LABELS,
                yticklabels=LABELS,
                annot=True,
                fmt="d")
    plt.title("Confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    fig.savefig(r'..\results\FC_results\DL_results\conf.png')
    plt.show()
    tn, fp, fn, tp = conf_matrix.ravel()
    print('sensitivity', tp / (tp + fn) * 100)
    print('specificity', tn / (tn + fp) * 100)
    print('precision', tp / (tp + fp) * 100)
    print('accuracy', (tp + tn) / (tp + tn + fp + fn) * 100)
Example #4
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve
from tensorflow.keras.models import load_model


def roc_curve_plot(rescaled, original, path, LSTM_ind=False):
    """Plot the ROC curve of the reconstruction error and report its AUC."""
    auto_encoder = load_model(path)
    valid_x_predictions = auto_encoder.predict(rescaled)
    if LSTM_ind:
        mse = np.mean(np.power(
            flatten(rescaled) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': list(original)
        })
    else:
        mse = np.mean(np.power(rescaled - valid_x_predictions, 2), axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': original['y']
        })
    false_pos_rate, true_pos_rate, thresholds = roc_curve(
        error_df.True_class, error_df.Reconstruction_error)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    fig = plt.figure(figsize=(8, 8))
    plt.plot(false_pos_rate,
             true_pos_rate,
             linewidth=5,
             label='AUC = %0.3f' % roc_auc)
    plt.plot([0, 1], [0, 1], linewidth=5)
    plt.xlim([-0.01, 1])
    plt.ylim([0, 1.01])
    plt.legend(loc='lower right')
    plt.title(
        'Receiver operating characteristic curve (ROC) on Validation set')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    fig.savefig(r'..\results\FC_results\DL_results\roc.png')
    plt.show()
Example #5
import numpy as np
import pandas as pd
from sklearn.metrics import auc, roc_curve
from tensorflow.keras.models import load_model


def calculate_auc(rescaled, original, path, LSTM_ind=False):
    """Return the AUC of the reconstruction error used as an anomaly score."""
    model = load_model(path)
    valid_x_predictions = model.predict(rescaled)
    if LSTM_ind:
        mse = np.mean(np.power(
            flatten(rescaled) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': list(original)
        })
    else:
        mse = np.mean(np.power(rescaled - valid_x_predictions, 2), axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': original['y']
        })
    false_pos_rate, true_pos_rate, thresholds = roc_curve(
        error_df.True_class, error_df.Reconstruction_error)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    return roc_auc
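
A hedged usage sketch for threshold-free model comparison; the model paths and validation arrays below are placeholders, not names from the original code.

# Hypothetical usage: compare a fully connected and an LSTM autoencoder by AUC.
# The FC model takes a 2-D array and a DataFrame with a 'y' column; the LSTM
# model takes a 3-D array and a plain sequence of labels.
auc_fc = calculate_auc(X_valid_scaled, valid_df, 'fc_autoencoder.h5')
auc_lstm = calculate_auc(X_valid_seq, y_valid, 'lstm_autoencoder.h5',
                         LSTM_ind=True)
print('FC AUC: %.3f, LSTM AUC: %.3f' % (auc_fc, auc_lstm))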
Example #6
import itertools
import re
from collections import defaultdict
from typing import Any, Dict, List

import requests
from bs4 import BeautifulSoup

# `season_dict` (episode URL -> season), `value_sign_dict` (maps the
# 'right'/'wrong'/'neither' answer classes to a score sign), and `flatten`
# (flattens a list of lists) are module-level helpers defined elsewhere in
# the project.


def get_question_info(url: str) -> List[Dict[str, Any]]:
    """Take in an episode URL and return a list of each question and its
    relevant information"""

    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    ep_id = int(re.findall('#(.*),', soup.title.text)[0])

    rounds = soup.find_all(id=re.compile(
        'jeopardy_round|double_jeopardy_round|final_jeopardy_round'))

    if len(rounds) == 0:
        return []

    category_list = {
        _round['id']: [
            category.text
            for category in _round.find_all('td', class_='category_name')
        ]
        for _round in rounds
    }

    round_clue_orders = [[
        int(clue.find('td', class_='clue_order_number').text) -
        1 if clue.find('td', class_='clue_order_number') else i
        for i, clue in enumerate(clue_sect.find_all('div'))
    ] for clue_sect in rounds]

    clue_order = [[
        index[0] for index in sorted(enumerate(_round), key=lambda x: x[1])
    ] for _round in round_clue_orders]

    order_iter = itertools.chain.from_iterable(clue_order)

    clue_chunks = [
        list(
            zip(_round.find_all('div'), [
                clue.text for clue in _round.find_all('td', class_='clue_text')
            ])) for _round in rounds
    ]

    clue_chunks = [
        clue_set[next(order_iter)] for clue_set in clue_chunks
        for _, _ in enumerate(clue_set)
    ]

    # Had to re-soupify the div tags to get unicode out of quotes. If I learn a better way to do this I'll change it
    div_tag_soup = [
        BeautifulSoup(clue[0]['onmouseover'], 'html.parser')
        for clue in clue_chunks
    ]

    clue_rounds = [clue[0].find_parent('div')['id'] for clue in clue_chunks]

    is_fj = [_round == 'final_jeopardy_round' for _round in clue_rounds]

    contestants = itertools.chain.from_iterable([
        div.find_all('td', class_=re.compile('wrong|right'))
        for div in div_tag_soup
    ])

    contestants = list(
        set([
            answerer.text for answerer in contestants
            if ('Triple Stumper' not in answerer.text)
            & ('Quadruple Stumper' not in answerer.text)
        ]))

    # A dictionary describing who answered each question and whether they were right or wrong
    answerer_dicts = [
        defaultdict(lambda: 'neither',
                    [[contestant.text, contestant['class'][0]] for contestant
                     in div.find_all('td', class_=re.compile('wrong|right'))
                     if 'Triple Stumper' not in contestant.text])
        for div in div_tag_soup
    ]

    value_tags = [
        clue[0].find('td', class_=lambda text: 'clue_value' in text).text
        for i, clue in enumerate(clue_chunks) if not is_fj[i]
    ]

    # Strip the currency prefix (including 'DD: $...' daily-double tags) and
    # the thousands separator, keeping only the dollar amount.
    clue_values = [
        int(re.compile(r'[^\d,](?=\d)').split(value)[-1].replace(',', ''))
        for value in value_tags
    ]

    fj_index = [i for i, _ in enumerate(clue_chunks) if is_fj[i]]

    fj_values = []

    fj_contestants = []

    if len(fj_index) > 0:
        fj_values = [
            [int(value.replace(',', '').replace('$', '').split('.')[0])] +
            [0] * (len(fj_index) - 1)
            for value in div_tag_soup[fj_index[0]].find_all(
                string=re.compile(r'^[\$\d][\d,]+$'))
        ]

        fj_contestants = [
            contestant.text for contestant in div_tag_soup[
                fj_index[0]].find_all('td', class_=re.compile('wrong|right'))
        ]

    fj_dict = dict(zip(fj_contestants, fj_values))

    contestant_value_dict = {
        contestant:
        (clue_values +
         fj_dict[contestant] if fj_dict.get(contestant) is not None else
         (clue_values + [0] if len(fj_dict) > 0 else clue_values))
        for contestant in contestants
    }

    contestant_score_dict = {
        contestant: [
            value * value_sign_dict[answerer_dicts[i][contestant]]
            for i, value in enumerate(contestant_value_dict[contestant])
        ]
        for contestant in contestants
    }

    contestant_scores = [
        flatten([[contestant, contestant_score_dict[contestant][i]]
                 for contestant in contestant_score_dict])
        for i, _ in enumerate(contestant_score_dict[contestants[0]])
    ]

    if len(contestant_scores[0]) < 8:
        contestant_scores = [
            clue + [''] * (8 - len(clue)) for clue in contestant_scores
        ]

    # Column headers: four contestant/score-update pairs per clue row.
    score_keys = []
    for i in range(4):
        score_keys.append('contestant_{}'.format(i + 1))
        score_keys.append('c{}_score_update'.format(i + 1))

    difficulty = [
        int(clue[0].find('td', class_='clue_unstuck')['id'].split('_')[-2])
        if clue[0].find('td', class_='clue_unstuck') is not None else 0
        for clue in clue_chunks
    ]

    clue_columns = [
        int(clue[0].find('td', class_='clue_unstuck')['id'].split('_')[-3]) -
        1 if clue[0].find('td', class_='clue_unstuck') is not None else 0
        for clue in clue_chunks
    ]

    clue_cats = [
        category_list[_round][clue_columns[i]]
        for i, _round in enumerate(clue_rounds)
    ]

    season = season_dict[url]

    questions = [clue[1] for clue in clue_chunks]

    answers = [
        clue.find('em', class_=lambda text: 'correct_response' in text).text
        for clue in div_tag_soup
    ]

    daily_double = [
        'DD' in value if not is_fj[i] else False
        for i, value in enumerate(value_tags)
    ] + [False] * len(fj_index)

    clues = [
        dict(
            **{
                'season': season,
                'ep_id': ep_id,
                'question_id': i + 1,
                'round': clue_rounds[i],
                'category': clue_cats[i],
                'difficulty': difficulty[i],
                'question': questions[i],
                'answer': answers[i],
                'DD': daily_double[i]
            }, **dict(zip(score_keys, contestant_scores[i])))
        for i, _ in enumerate(clue_chunks)
    ]

    return clues
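
A hedged usage sketch. The selectors above match J! Archive game pages, so the URL below follows that site's showgame.php pattern, but the game id is a placeholder; `season_dict` must already contain the URL as a key.

# Hypothetical usage; season_dict must map this URL to a season label.
episode_url = 'https://www.j-archive.com/showgame.php?game_id=1'  # placeholder
for clue in get_question_info(episode_url)[:3]:
    print(clue['round'], clue['category'], clue['question'], clue['answer'])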