Example #1
def do_score(args):
    # Read the input and reference knowledge bases from tab-separated files.
    input_kb = KnowledgeBase(
        *parse_input(csv.reader(args.input, delimiter="\t")))
    ref_kb = KnowledgeBase(*parse_input(csv.reader(args.refs, delimiter="\t")))

    if args.mode == "instance":
        print(score_instance(ref_kb, input_kb))
    elif args.mode == "relation":
        print(score_relation(ref_kb, input_kb))
    elif args.mode == "entity":
        print(score_entity(ref_kb, input_kb))
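A minimal sketch of how do_score might be wired up as an argparse entry point; the flag names and FileType defaults below are assumptions for illustration, not taken from the source:

import argparse
import sys

def build_score_parser():
    # Hypothetical CLI wiring for do_score; adjust names to the real tool.
    parser = argparse.ArgumentParser(
        description="Score an input KB against a reference KB.")
    parser.add_argument("--input", type=argparse.FileType("r"),
                        default=sys.stdin, help="TSV knowledge base to score")
    parser.add_argument("--refs", type=argparse.FileType("r"), required=True,
                        help="TSV reference knowledge base")
    parser.add_argument("--mode", choices=["instance", "relation", "entity"],
                        default="instance")
    parser.set_defaults(func=do_score)
    return parser

if __name__ == "__main__":
    args = build_score_parser().parse_args()
    args.func(args)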
Example #2
def do_sample(args):
    """
    entity link -> [mentions]
    - sample entities
        - sample fills
            - sample mentions
    """
    random.seed(args.seed)

    # Read input
    reader = csv.reader(args.input, delimiter="\t")
    mentions, canonical_mention, links, relations = parse_input(reader)
    entries = mentions + canonical_mention + links + relations
    old_entries = [
        parse_input(csv.reader(f, delimiter="\t"))[-1]
        for f in args.old_entries
    ]
    types = {r[0]: r[1] for r in mentions}

    if args.by_mention:
        relations = sample_by_mention(entries, args.num_entries)
    elif args.by_relation:
        relations = sample_by_relation(types, entries, args.num_entries,
                                       old_entries)
    else:
        relations = sample_by_entity(types, entries, args.num_entries,
                                     args.per_entity, old_entries)
    logger.info("Sampled %d relations", len(relations))

    relations = list(map_relations(mentions, relations))
    mentions = set(m for row in relations for m in [row[0], row[2]])
    mentions.update(row[2] for row in entries
                    if row[1] == "canonical_mention" and row[0] in mentions)
    docs = set(parse_prov(row[3])[0] for row in relations)
    logger.info("Touches %d mentions + canonical-mentions from %d documents",
                len(mentions), len(docs))

    # Nones are for sampling weights.
    mentions = [
        row + [None, None, None] for row in entries
        if row[0] in mentions and not is_reln(row[1])
    ]

    # Reconstruct output: collect all mentions, links, canonical mentions and relations.
    writer = csv.writer(args.output, delimiter="\t")
    for entry in mentions + relations:
        writer.writerow(entry)
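The canonical-mention expansion in do_sample is its subtlest step; a self-contained toy run of just that step, with made-up mention ids:

# Toy data: m1 and m2 were selected by sampling; m9 is m1's canonical mention.
mentions = {"m1", "m2"}
entries = [
    ["m1", "canonical_mention", "m9", "D1:0-2"],
    ["m3", "canonical_mention", "m8", "D1:4-6"],
]
# Pull in the canonical mention of every already-selected mention.
mentions.update(row[2] for row in entries
                if row[1] == "canonical_mention" and row[0] in mentions)
assert mentions == {"m1", "m2", "m9"}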
Example #3
def main():
    err, input_movie_ids = util.parse_input(sys.argv)
    if err:
        return

    #process movie list to get matrix
    matrix = util.get_movie_matrix_from_hd5()

    #perform LDA
    lda = LDA(n_components=no_of_components)
    lda.fit(matrix)
    lda_df = pd.DataFrame(lda.transform(matrix), index=matrix.index)

    input_movie_df = lda_df.loc[input_movie_ids]

    output_movies = []
    # Rank-weight the similarities: the first input movie gets full weight,
    # and each later one counts order_factor less.
    for index, movie in lda_df.iterrows():
        cosine_sum = 0
        order = 1
        for j, input_movie in input_movie_df.iterrows():
            cosine_sum += (1 - cosine(movie, input_movie)) * order
            order -= order_factor
        output_movies.append((index, cosine_sum))
    other_movies = list(
        filter(lambda tup: tup[0] not in input_movie_ids, output_movies))
    other_movies.sort(key=lambda tup: tup[1], reverse=True)
    output_movie_ids = [t[0] for t in other_movies][:5]

    #print output and log them
    feedback = util.process_output(input_movie_ids, output_movie_ids,
                                   output_file)

    #process feedback to get relevant movies and movies to be excluded
    relevant_movies, movie_to_exclude = util.process_feedback(
        feedback, input_movie_ids)

    relevant_movie_count = len(relevant_movies)
    #if all recommended movies are relevant then return
    if relevant_movie_count == 5:
        print "\nAll the movies were relevant hence no modification to the suggestion"
        return

    #fetch data frames for relevant and feedback movies
    relevant_movies_df = lda_df.loc[relevant_movies]
    feedback_movies_df = lda_df.loc[list(feedback.keys())]

    modified_query = util.probabilistic_feedback_query(feedback_movies_df,
                                                       relevant_movies_df,
                                                       lda_df.index,
                                                       relevant_movie_count)

    revised_movie_ids = util.get_revised_movies(lda_df, modified_query,
                                                movie_to_exclude)

    util.print_revised(revised_movie_ids, output_file)
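The order-weighted cosine aggregation above (repeated in the tensor example later) can be factored into a helper; a sketch assuming dense row vectors and a small constant order_factor such as 0.1 (the function name is mine, not from the source):

from scipy.spatial.distance import cosine

def weighted_cosine_scores(candidates_df, query_df, order_factor=0.1):
    # Score each candidate by cosine similarity to every query vector,
    # weighting earlier query vectors more heavily.
    scores = []
    for index, candidate in candidates_df.iterrows():
        total, weight = 0.0, 1.0
        for _, query in query_df.iterrows():
            total += (1 - cosine(candidate, query)) * weight
            weight -= order_factor
        scores.append((index, total))
    return sorted(scores, key=lambda tup: tup[1], reverse=True)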
Example #4
def main():
    err, input_movie_ids = util.parse_input(sys.argv)
    if err:
        return

    matrix = util.get_movie_matrix_from_hd5()

    svd_dict, svd_df = do_svd(matrix, input_movie_ids)
    print('SVD done')
    lda_dict, lda_df = do_lda(matrix, input_movie_ids)
    print('LDA done')
    tensor_dict, decomposed_movies_df, movies_list = do_tensor(input_movie_ids)
    print('Tensor done')
    page_rank_dict = do_page_rank(input_movie_ids)
    print('PageRank done')

    other_movies = []
    for k in svd_dict:
        if k in input_movie_ids:
            continue
        total_weight = svd_dict[k] + lda_dict.get(k, 0) + tensor_dict.get(
            k, 0) + page_rank_dict.get(k, 0)
        other_movies.append((k, total_weight))

    other_movies.sort(key=lambda tup: tup[1], reverse=True)
    output_movie_ids = [t[0] for t in other_movies][:5]

    feedback = util.process_output(input_movie_ids, output_movie_ids,
                                   output_file)

    # process_feedback_movie_vector appears to derive movie_to_exclude from the
    # feedback alone, so the later calls overwrite it with the same value.
    similarity_svd, movie_to_exclude = process_feedback_movie_vector(
        feedback, input_movie_ids, svd_df.index, svd_df)

    similarity_lda, movie_to_exclude = process_feedback_movie_vector(
        feedback, input_movie_ids, lda_df.index, lda_df)

    similarity_tensor, movie_to_exclude = process_feedback_movie_vector(
        feedback, input_movie_ids, movies_list, decomposed_movies_df)

    page_rank_dict = do_page_rank_relevance(feedback, input_movie_ids)

    other_movies = []
    for k in similarity_svd:
        if k in movie_to_exclude:
            continue
        total_weight = similarity_svd[k] + similarity_lda.get(
            k, 0) + similarity_tensor.get(k, 0) + page_rank_dict.get(k, 0)
        other_movies.append((k, total_weight))

    other_movies.sort(key=lambda tup: tup[1], reverse=True)
    revised_movie_ids = [t[0] for t in other_movies][:5]

    util.print_revised(revised_movie_ids, output_file)
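The two fusion loops above follow the same pattern; a compact helper that sums per-model score dicts, treating a missing candidate as 0 (fuse_scores is a name introduced here, not from the source):

def fuse_scores(base_scores, *other_scores, exclude=()):
    # Sum each candidate's score across all models; candidates absent
    # from a model contribute 0 for that model.
    fused = [(k, v + sum(d.get(k, 0) for d in other_scores))
             for k, v in base_scores.items() if k not in exclude]
    return sorted(fused, key=lambda tup: tup[1], reverse=True)

# e.g. top-5 recommendations, excluding the input movies themselves:
# top5 = [k for k, _ in fuse_scores(svd_dict, lda_dict, tensor_dict,
#                                   page_rank_dict, exclude=input_movie_ids)][:5]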
Example #5
def do_reweight(args):
    """
    entity link -> [mentions]
    - sample entities
        - sample fills
            - sample mentions
    """
    # Read input
    reader = csv.reader(args.reference, delimiter="\t")
    mentions, canonical_mention, links, relations = parse_input(reader)
    entries = mentions + canonical_mention + links + relations
    types = {r[0]: r[1] for r in mentions}

    reader = csv.reader(args.input, delimiter="\t")
    new_entries = sum(parse_input(reader), [])

    if args.by_relation:
        scheme = "relation"
    else:
        scheme = "entity"

    relations = reweight(types, entries, new_entries, scheme)
    # Unlike do_sample, no map_relations pass is needed here: new_entries are
    # already in the final form.
    mentions = set(m for row in relations for m in [row[0], row[2]])
    mentions.update(row[2] for row in entries
                    if row[1] == "canonical_mention" and row[0] in mentions)
    docs = set(parse_prov(row[3])[0] for row in relations)
    logger.info("Touches %d mentions + canonical-mentions from %d documents",
                len(mentions), len(docs))
    # Nones are for sampling weights.
    mentions = [
        row + [None, None, None] for row in entries
        if row[0] in mentions and not is_reln(row[1])
    ]

    # Reconstruct output: collect all mentions, links, canonical mentions and relations.
    writer = csv.writer(args.output, delimiter="\t")
    for entry in mentions + relations:
        writer.writerow(entry)
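do_reweight reads provenance with parse_prov(row[3])[0], which implies that prov strings carry a leading document id; a toy stand-in consistent with that usage (the real parse_prov in this codebase may differ):

def parse_prov(prov):
    # Toy stand-in: split "DOCID:start-end" into (doc_id, start, end).
    doc_id, _, span = prov.partition(":")
    start, _, end = span.partition("-")
    return doc_id, int(start), int(end)

assert parse_prov("NYT_0001:12-19")[0] == "NYT_0001"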
Example #6
def main():
    err, input_movie_ids = util.parse_input(sys.argv)
    if err:
        return

    # *** Write your code here ***
    #process movie list to get matrix
    #matrix = util.get_matrix(movie_list)

    #perform PageRank
    #output = get_page_rank()

    # Remove this line and pass the real output here
    output_movie_ids = input_movie_ids

    #print output and log them
    util.process_output(input_movie_ids, output_movie_ids, output_file)
Example #7
def paste_votes():
    data = request.get_json(force=True)

    if "csv" not in data:
        return jsonify({'error': 'must provide csv'}), 400

    rd = list(csv.reader(StringIO(data["csv"]), skipinitialspace=True))

    return jsonify(
        util.parse_input(
            input=rd,
            parties_included=data["has_parties"],
            const_included=data["has_constituencies"],
            const_seats_included=data["has_constituency_seats"],
            adj_seats_included=data["has_constituency_adjustment_seats"]))
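Flask's test client is a quick way to exercise paste_votes; the app object and route path below are assumptions, since the snippet shows only the view function:

from flask import Flask

app = Flask(__name__)
# Hypothetical registration; the real route is not shown in the snippet.
app.add_url_rule("/paste_votes", view_func=paste_votes, methods=["POST"])

with app.test_client() as client:
    resp = client.post("/paste_votes", json={
        "csv": "A,10\nB,20",
        "has_parties": True,
        "has_constituencies": False,
        "has_constituency_seats": False,
        "has_constituency_adjustment_seats": False,
    })
    print(resp.get_json())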
Example #8
def main():
    err, input_movie_ids = util.parse_input(sys.argv)
    if err:
        return
    # read the pickle file that contains the tensor
    with open("actor_movie_genre_tensor.pkl", "rb") as f:
        actor_movie_year_3d_matrix = pickle.load(f)
    actor_movie_year_array = np.array(actor_movie_year_3d_matrix)
    # perform cp decomposition
    decomposed = parafac(actor_movie_year_array,
                         no_of_components,
                         init='random')

    mlmovies = util.read_mlmovies()
    mlmovies = mlmovies.loc[mlmovies['year'] >= util.movie_year_for_tensor]
    movies_list = mlmovies.movieid.unique()

    # data frame for movie factor matrix from cp decomposition
    decomposed_movies_df = pd.DataFrame(decomposed[1], index=movies_list)
    # dataframe containing only input movies
    input_movie_df = decomposed_movies_df.loc[input_movie_ids]

    output_movies = []
    # find the cosine similarity of each movie vector to the input movie
    # vectors and keep the top 5 scores
    for index, movie in decomposed_movies_df.iterrows():
        cosine_sum = 0
        order = 1
        for j, input_movie in input_movie_df.iterrows():
            cosine_sum += (1 - cosine(movie, input_movie)) * order
            order -= order_factor
        output_movies.append((index, cosine_sum))
    other_movies = list(
        filter(lambda tup: tup[0] not in input_movie_ids, output_movies))
    other_movies.sort(key=lambda tup: tup[1], reverse=True)
    output_movie_ids = [t[0] for t in other_movies][:5]

    #print output and log them
    feedback = util.process_output(input_movie_ids, output_movie_ids,
                                   output_file)

    #process feedback to get relevant movies and movies to be excluded
    relevant_movies, movie_to_exclude = util.process_feedback(
        feedback, input_movie_ids)

    relevant_movie_count = len(relevant_movies)
    #if all recommended movies are relevant then return
    if relevant_movie_count == 5:
        print "\nAll the movies were relevant hence no modification to the suggestion"
        return

    #fetch data frames for relevant and feedback movies
    relevant_movies_df = decomposed_movies_df.loc[relevant_movies]
    feedback_movies_df = decomposed_movies_df.loc[list(feedback.keys())]

    modified_query = util.probabilistic_feedback_query(feedback_movies_df,
                                                       relevant_movies_df,
                                                       movies_list,
                                                       relevant_movie_count)

    revised_movie_ids = util.get_revised_movies(decomposed_movies_df,
                                                modified_query,
                                                movie_to_exclude)

    util.print_revised(revised_movie_ids, output_file)
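For reference, a minimal parafac example showing the factor shapes the code above relies on; note that recent tensorly versions return (weights, factors), whereas indexing decomposed[1] as above matches older versions that returned the factor list directly:

import numpy as np
import tensorly as tl
from tensorly.decomposition import parafac

# Toy 3-way tensor: 4 actors x 5 movies x 3 genres.
tensor = tl.tensor(np.random.rand(4, 5, 3))
weights, factors = parafac(tensor, rank=2, init='random')
print(factors[1].shape)  # (5, 2): one 2-component row per movie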