def testing_on_data(type_of_data, X_test, Y_test, model, batch_size, summary_only=False):
    # Predicting
    print "====================== %s ======================" %(type_of_data)
    print "Predicting... (SEPARATELY FOR EACH DOCUMENT)"
    data = {
        'wd_r': [],
        'wd_e': [],
        'pk': []
    }
    avg_segment_lengths_across_test_data = [] # Average segment length across the documents
    for Xi_test, Yi_test in zip(X_test, Y_test):
        pred_per_doc = []
        Xi_test, Yi_test = Xi_test.reshape((1,) + Xi_test.shape), Yi_test.reshape((1,) + Yi_test.shape)   # Convert to format of 1 document
        for batch_X_left, batch_X_mid, batch_X_right, batch_Y_mid in batch_gen_consecutive_context_segments_from_big_seq(Xi_test, Yi_test, batch_size, ONE_SIDE_CONTEXT_SIZE):
            batch_y_pred = model.predict_on_batch([batch_X_left, batch_X_mid, batch_X_right])
            pred_per_doc.append(batch_y_pred)

        if not len(pred_per_doc): # batch generator might drop a few documents
            continue

        #rounded = np.round(pred_per_doc)
        pred_per_doc = np.concatenate(pred_per_doc, axis=0)
        actual_avg_seg_length, result = helper.windiff_and_pk_metric_ONE_SEQUENCE(Yi_test[0], pred_per_doc, window_size=-1, rounded=False, print_individual_stats=not summary_only)
        avg_segment_lengths_across_test_data.append(actual_avg_seg_length)
        data['pk'].append(result['pk'])
        data['wd_r'].append(result['wd_r'])
        data['wd_e'].append(result['wd_e'])

    print ">> Summary (%s):" %(type_of_data)
    print "AVG segment length in test data: %f" % (np.mean(avg_segment_lengths_across_test_data))
    print "WinDiff metric (Windiff_r):: avg: %f | std: %f | min: %f | max: %f" %(np.mean(data['wd_r']), np.std(data['wd_r']), np.min(data['wd_r']), np.max(data['wd_r']))
    print "WinDiff metric (Windiff_e):: avg: %f | std: %f | min: %f | max: %f" %(np.mean(data['wd_e']), np.std(data['wd_e']), np.min(data['wd_e']), np.max(data['wd_e']))
    print "Pk metric:: avg: %f | std: %f | min: %f | max: %f" %(np.mean(data['pk']), np.std(data['pk']), np.min(data['pk']), np.max(data['pk']))
    print('___________________________________')
# Example #2
def save_predictions(type_of_data, X_test, Y_test, model, batch_size, summary_only=False):
    # Predicting
    print "====================== %s ======================" %(type_of_data)
    print "GET PREDICTIONS... (SEPARATELY FOR EACH DOCUMENT)"
    data = {
        'wd_r': [],
        'wd_e': [],
        'pk': []
    }
    doc_idx = 18
    avg_segment_lengths_across_test_data = [] # Average segment length across the documents
    predictions_return = []
    zipped = zip(X_test, Y_test)
    for i, (Xi_test, Yi_test) in enumerate(zipped):

        if i != doc_idx:
            continue

        print Xi_test.shape
        pred_per_doc = []
        Xi_test, Yi_test = Xi_test.reshape((1,) + Xi_test.shape), Yi_test.reshape((1,) + Yi_test.shape)   # Convert to format of 1 document
        for batch_X_left, batch_X_mid, batch_X_right, batch_Y_mid in batch_gen_consecutive_context_segments_from_big_seq("test", Xi_test, Yi_test, batch_size, ONE_SIDE_CONTEXT_SIZE):
            batch_y_pred = model.predict_on_batch([batch_X_left, batch_X_mid, batch_X_right])
            pred_per_doc.append(batch_y_pred)

        if not len(pred_per_doc): # batch generator might drop a few documents
            continue

        #rounded = np.round(pred_per_doc)
        pred_per_doc = np.concatenate(pred_per_doc, axis=0)
        #return pred_per_doc
        predictions_return.append(pred_per_doc)
        actual_avg_seg_length, result = helper.windiff_and_pk_metric_ONE_SEQUENCE(Yi_test[0], pred_per_doc, window_size=-1, rounded=False, print_individual_stats=not summary_only)
        avg_segment_lengths_across_test_data.append(actual_avg_seg_length)
        data['pk'].append(result['pk'])
        data['wd_r'].append(result['wd_r'])
        data['wd_e'].append(result['wd_e'])

        print "WD: %f, PK: %f" %(result['wd_r'], result['pk'])
        # Save for visualization
        #rounded_per_doc = np.round(pred_per_doc)
        rounded_per_doc = pred_per_doc
        output = ["ref,hyp"]
        for (ref, hyp) in zip(Y_test[doc_idx], rounded_per_doc):
            output.append(str(int(ref[0])) + "," + str(hyp[0]))
        file_name = "prediction_output_save.csv"
        with open(file_name, "a") as f:
            for line in output:
                f.write(line + "\r\n")
        print "Written document index: `%d` to file: `%s`" %(doc_idx, file_name)
        return
# Example #3
def RANDOM_testing_on_data(type_of_data,
                           X_test,
                           Y_test,
                           model,
                           batch_size,
                           summary_only=False,
                           visualize=False):
    # Predicting
    print "====================== %s ======================" % (type_of_data)
    print "Predicting... (SEPARATELY FOR EACH DOCUMENT)"
    data = {'wd_r': [], 'wd_e': [], 'pk': []}
    avg_segment_lengths_across_test_data = [
    ]  # Average segment length across the documents
    for Xi_test, Yi_test in zip(X_test, Y_test):
        pred_per_doc = []
        Xi_test, Yi_test = Xi_test.reshape(
            (1, ) + Xi_test.shape), Yi_test.reshape(
                (1, ) + Yi_test.shape)  # Convert to format of 1 document

        original_yi = np.copy(Yi_test[0])
        pred_per_doc = np.copy(Yi_test[0])
        np.random.shuffle(pred_per_doc)

        actual_avg_seg_length, result = helper.windiff_and_pk_metric_ONE_SEQUENCE(
            original_yi,
            pred_per_doc,
            window_size=-1,
            rounded=False,
            print_individual_stats=not summary_only)
        avg_segment_lengths_across_test_data.append(actual_avg_seg_length)
        data['pk'].append(result['pk'])
        data['wd_r'].append(result['wd_r'])
        data['wd_e'].append(result['wd_e'])

    print ">> Summary (%s):" % (type_of_data)
    print "AVG segment length in test data: %f, std: %f" % (
        np.mean(avg_segment_lengths_across_test_data),
        np.std(avg_segment_lengths_across_test_data))
    print "WinDiff metric (Windiff_r):: avg: %f | std: %f | min: %f | max: %f" % (
        np.mean(data['wd_r']), np.std(data['wd_r']), np.min(
            data['wd_r']), np.max(data['wd_r']))
    print "WinDiff metric (Windiff_e):: avg: %f | std: %f | min: %f | max: %f" % (
        np.mean(data['wd_e']), np.std(data['wd_e']), np.min(
            data['wd_e']), np.max(data['wd_e']))
    print "Pk metric:: avg: %f | std: %f | min: %f | max: %f" % (np.mean(
        data['pk']), np.std(data['pk']), np.min(data['pk']), np.max(
            data['pk']))
    print('___________________________________')
# Example #4
def testing_on_data(type_of_data, X_test, Y_test, model, batch_size, summary_only=False, visualize=False):
    # Predicting
    print "====================== %s ======================" %(type_of_data)
    print "Predicting... (SEPARATELY FOR EACH DOCUMENT)"
    data = {
        'wd_r': [],
        'wd_e': [],
        'pk': []
    }
    avg_segment_lengths_across_test_data = [] # Average segment length across the documents
    for Xi_test, Yi_test in zip(X_test, Y_test):
        pred_per_doc = []
        for batch_count, (batch_X, batch_Y) in enumerate(batch_gen_sentences_without_context(Xi_test, Yi_test, batch_size, fixed_size=False)):
            start_time = time.time()
            batch_y_pred = model.predict_on_batch([batch_X])
            #print time.time() - start_time, "sec/batch of size", batch_size
            pred_per_doc.append(batch_y_pred[:,1])

        if not len(pred_per_doc): # batch generator might drop a few documents
            continue

        if len(pred_per_doc) > Xi_test.shape[0]:
            pred_per_doc = pred_per_doc[:Xi_test.shape[0]]

        #rounded = np.round(pred_per_doc)
        pred_per_doc = np.concatenate(pred_per_doc, axis=0)
        if visualize:
            print ">>>>> VISUALIZE <<<<<<"
            pdb.set_trace()
        #pdb.set_trace()

        Yi_test_for_windiff = to_categorical(Yi_test)[:,1]

        actual_avg_seg_length, result = helper.windiff_and_pk_metric_ONE_SEQUENCE(Yi_test_for_windiff, pred_per_doc, window_size=-1, rounded=False, print_individual_stats=not summary_only)
        avg_segment_lengths_across_test_data.append(actual_avg_seg_length)
        data['pk'].append(result['pk'])
        data['wd_r'].append(result['wd_r'])
        data['wd_e'].append(result['wd_e'])

    print ">> Summary (%s):" %(type_of_data)
    print "AVG segment length in test data: %f, std: %f" % (np.mean(avg_segment_lengths_across_test_data), np.std(avg_segment_lengths_across_test_data))
    print "WinDiff metric (Windiff_r):: avg: %f | std: %f | min: %f | max: %f" %(np.mean(data['wd_r']), np.std(data['wd_r']), np.min(data['wd_r']), np.max(data['wd_r']))
    print "WinDiff metric (Windiff_e):: avg: %f | std: %f | min: %f | max: %f" %(np.mean(data['wd_e']), np.std(data['wd_e']), np.min(data['wd_e']), np.max(data['wd_e']))
    print "Pk metric:: avg: %f | std: %f | min: %f | max: %f" %(np.mean(data['pk']), np.std(data['pk']), np.min(data['pk']), np.max(data['pk']))
    print('___________________________________')