Example #1
def write_prediction_to_file(prediction_seq, label_seq, overlapped_predictions, output, options_dict):

    crf_sequence_length = options_dict['crf_sequence_length']
    sliding_window_length = options_dict['sliding_window_length']

    '''
    Earlier approach, disabled in favor of the voting mechanism below:
    if len(overlapped_predictions) == 0:
        if crf_sequence_length != len(prediction_seq):
            # last window: write from 0 to len(prediction_seq)
            for i in range(0, len(prediction_seq)):
                output.write(prediction_seq[i] + "," + label_seq[i] + "\n")
            overlapped_predictions = []
            return overlapped_predictions
        else:
            # write from 0 to (len(prediction_seq) - sliding_window_length)
            new_start = crf_sequence_length - sliding_window_length
            overlapped_predictions = prediction_seq[new_start:len(prediction_seq)]
            return overlapped_predictions
    '''
    # Voting mechanism: we can handle the overlap between subsequent sequences as follows:
    ## 1. for the overlap received from the previous sequence, voting is applied and the winner is written
    ## 2. for the non-overlapping region, the predictions are written as-is to the output file
    ## 3. if the sequence is not cut off, it will share some overlap with the next sequence, so that
    ##    overlap is passed on to the next call of this method for voting

    # I assume that the prediction sequence is at least as long as the overlapped predictions. This will
    # fail if the sequence is cut off exactly at the provided sequence length (crf_sequence_length).
    # We need an exact way of finding overlaps: sequences could carry an additional feature in the
    # input file telling whether each one shares an overlap with the next.
    for i in range(0, len(overlapped_predictions)):

        if prediction_seq[i] != overlapped_predictions[i]:

            print("applying voting. i:", i, ", prediction_seq[i]:", prediction_seq[i],
                  ", overlapped_predictions[i]:", overlapped_predictions[i],
                  ", label_seq[i]:", label_seq[i], "\n")
            # vote by how typical each candidate label is within its own sequence
            similarity_cur = util.get_similarity(prediction_seq[i], prediction_seq)
            similarity_prev = util.get_similarity(overlapped_predictions[i], overlapped_predictions)
            print("similarity_cur:", similarity_cur, ", similarity_prev:", similarity_prev, "\n")

            if similarity_prev < similarity_cur:
                output.write(prediction_seq[i] + "," + label_seq[i] + "\n")
            else:
                output.write(overlapped_predictions[i] + "," + label_seq[i] + "\n")

        else:
            output.write(prediction_seq[i] + "," + label_seq[i] + "\n")

    # everything before this index cannot overlap with the next window
    no_overlap_region = min(crf_sequence_length - sliding_window_length, len(prediction_seq))

    for i in range(len(overlapped_predictions), no_overlap_region):
        output.write(prediction_seq[i] + "," + label_seq[i] + "\n")

    if no_overlap_region == len(prediction_seq):
        # the sequence ended inside this window: nothing carries over
        overlapped_predictions = []
        return overlapped_predictions

    # carry the tail of this window forward so the next call can vote on it
    overlapped_predictions = prediction_seq[no_overlap_region:crf_sequence_length]
    return overlapped_predictions
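
A minimal usage sketch, assuming the function above is in scope. The util class below is a hypothetical stand-in for the project's util module, and its get_similarity behavior (relative frequency of a label within a sequence) is only a guess:

import io

class util:  # hypothetical stand-in for the project's util module
    @staticmethod
    def get_similarity(label, seq):
        # assumed behavior: fraction of the sequence carrying this label
        return seq.count(label) / float(len(seq))

options = {'crf_sequence_length': 6, 'sliding_window_length': 2}
out = io.StringIO()

# window 1: the last sliding_window_length predictions carry over
carry = write_prediction_to_file("AABBBC", "OOOOOO", [], out, options)
# window 2: the carried-over overlap is resolved by voting before writing
carry = write_prediction_to_file("BBCDDE", "OOOOOO", carry, out, options)
print(out.getvalue())  # one "prediction,label" line per written position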
Example #2
def process_feedback_movie_vector(feedback, input_movie_ids, movies_list,
                                  movies_df):
    # process feedback to get the relevant movies and the movies to be excluded
    relevant_movies, movie_to_exclude = util.process_feedback(
        feedback, input_movie_ids)

    relevant_movie_count = len(relevant_movies)
    # if all recommended movies are relevant, there is nothing to modify
    if relevant_movie_count == 5:
        print("\nAll the movies were relevant, hence no modification to the suggestion")
        return

    # fetch data frames for the relevant and the feedback movies
    relevant_movies_df = movies_df.loc[relevant_movies]
    feedback_movies_df = movies_df.loc[list(feedback.keys())]

    modified_query = util.probabilistic_feedback_query(feedback_movies_df,
                                                       relevant_movies_df,
                                                       movies_list,
                                                       relevant_movie_count)

    similarity_matrix = util.get_similarity(movies_df, modified_query)

    return similarity_matrix, movie_to_exclude
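
A hedged, self-contained sketch of how this function might be driven. All three util helpers below are hypothetical stand-ins (the real project's implementations are not shown here), and the feedback encoding of 1 = relevant is an assumption:

import numpy as np
import pandas as pd

class util:  # hypothetical stand-ins for the project's helpers
    @staticmethod
    def process_feedback(feedback, input_movie_ids):
        # assumed: split the rated ids into relevant ones and ones to drop
        relevant = [m for m, liked in feedback.items() if liked]
        exclude = [m for m, liked in feedback.items() if not liked]
        return relevant, exclude

    @staticmethod
    def probabilistic_feedback_query(fb_df, rel_df, movies_list, rel_count):
        # assumed: re-weight the query vector toward the relevant movies
        return rel_df.mean(axis=0).values

    @staticmethod
    def get_similarity(movies_df, query):
        # assumed: cosine similarity of every movie against the query
        m = movies_df.values
        return m.dot(query) / (np.linalg.norm(m, axis=1) * np.linalg.norm(query))

input_movie_ids = [101, 102, 103, 104, 105]
feedback = {101: 1, 102: 0, 103: 1, 104: 1, 105: 0}  # 1 = relevant (assumed)
movies_df = pd.DataFrame(np.random.rand(5, 3), index=input_movie_ids)
movies_list = list(movies_df.index)

result = process_feedback_movie_vector(feedback, input_movie_ids, movies_list, movies_df)
if result is not None:  # None means all five suggestions were already relevant
    similarity_matrix, movie_to_exclude = result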
Example #3
def update_similarity_and_mmr(hyp, importances, batch, enc_tokens, vocab):
    summ_sents, summ_tokens = get_summ_sents_and_tokens(
        hyp.tokens, batch, vocab)
    hyp.similarity = get_similarity(enc_tokens, summ_tokens, vocab)
    hyp.mmr = calc_mmr_from_sim_and_imp(hyp.similarity, importances)
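
calc_mmr_from_sim_and_imp itself is not shown in this listing. Below is a minimal sketch of the standard Maximal Marginal Relevance trade-off it plausibly implements; the lambda weight and the non-negative clipping are assumptions:

import numpy as np

def calc_mmr_from_sim_and_imp_sketch(similarity, importances, lambda_=0.6):
    # MMR (Carbonell & Goldstein, 1998): reward sentences the importance
    # model favors, penalize sentences similar to what the summary so far
    # already covers
    mmr = lambda_ * np.asarray(importances) - (1.0 - lambda_) * np.asarray(similarity)
    return np.maximum(mmr, 0.0)  # assumed clipping so scores stay non-negative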
Example #4
import os

import numpy as np
import PIL.Image


def save_distribution_plots(importances,
                            enc_sentences,
                            enc_tokens,
                            hyp,
                            batch,
                            vocab,
                            ex_index,
                            sort=True):
    enc_sentences_str = [' '.join(sent) for sent in enc_sentences]
    summ_sents, summ_tokens = get_summ_sents_and_tokens(
        hyp.tokens, batch, vocab)
    prev_mmr = importances

    if sort:
        sort_order = np.argsort(importances, 0)[::-1]

    for sent_idx in range(0, len(summ_sents)):
        cur_summ_sents = summ_sents[:sent_idx]
        cur_summ_tokens = summ_tokens[:sent_idx]
        summ_str = ' '.join([' '.join(sent) for sent in cur_summ_sents])
        similarity_amount = get_similarity(enc_tokens, cur_summ_tokens, vocab)

        if FLAGS.pg_mmr:
            mmr_for_sentences = calc_mmr_from_sim_and_imp(
                similarity_amount, importances)
        else:
            mmr_for_sentences = None  # Don't use mmr if no sentence-level option is used

        distr_dir = os.path.join(FLAGS.log_root, 'mmr_distributions')
        if not os.path.exists(distr_dir):
            os.makedirs(distr_dir)
        save_name = os.path.join("%06d_decoded_%s_%d_sent" %
                                 (ex_index, '', sent_idx))
        plot_importances(enc_sentences_str,
                         distribution,
                         summ_str,
                         save_location=distr_dir,
                         save_name=save_name)
        file_path = os.path.join(distr_dir, save_name)
        np.savez(file_path,
                 mmr=mmr_for_sentences,
                 importances=importances,
                 enc_sentences=enc_sentences,
                 summ_str=summ_str)
        distributions = [('similarity', similarity_amount),
                         ('importance', importances),
                         ('mmr', mmr_for_sentences)]
        for distr_str, distribution in distributions:
            if distribution is None:  # mmr is absent when FLAGS.pg_mmr is off
                continue
            if sort:
                distribution = distribution[sort_order]
            save_name = os.path.join("%06d_decoded_%s_%d_sent" %
                                     (ex_index, distr_str, sent_idx))

            img_file_names = sorted([
                file_name for file_name in os.listdir(distr_dir)
                if save_name in file_name and 'jpg' in file_name
                and 'combined' not in file_name
            ])
            imgs = []
            for file_name in img_file_names:
                img = PIL.Image.open(os.path.join(distr_dir, file_name))
                imgs.append(img)
            max_shape = sorted([(np.sum(i.size), i.size) for i in imgs])[-1][1]
            combined_img = np.vstack(
                [np.asarray(i.resize(max_shape)) for i in imgs])
            combined_img = PIL.Image.fromarray(combined_img)
            combined_img.save(
                os.path.join(distr_dir, save_name + '_combined.jpg'))
            for file_name in img_file_names:
                os.remove(os.path.join(distr_dir, file_name))
        prev_mmr = mmr_for_sentences
    return mmr_for_sentences
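
The combine step in the loop above is a reusable idiom. A small self-contained sketch of the same stack-and-save approach (the helper name is hypothetical):

import numpy as np
import PIL.Image

def stack_images_vertically(paths, out_path):
    # open all frames, resize each to the largest frame, stack the pixel
    # arrays top to bottom, and write a single combined image
    imgs = [PIL.Image.open(p) for p in paths]
    max_shape = sorted([(np.sum(i.size), i.size) for i in imgs])[-1][1]
    stacked = np.vstack([np.asarray(i.resize(max_shape)) for i in imgs])
    PIL.Image.fromarray(stacked).save(out_path)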