def do_score(args):
    # Read input
    input_kb = KnowledgeBase(
        *parse_input(csv.reader(args.input, delimiter="\t")))
    ref_kb = KnowledgeBase(*parse_input(csv.reader(args.refs, delimiter="\t")))

    if args.mode == "instance":
        print(score_instance(ref_kb, input_kb))
    elif args.mode == "relation":
        print(score_relation(ref_kb, input_kb))
    elif args.mode == "entity":
        print(score_entity(ref_kb, input_kb))
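# Hedged sketch: one way do_score might be wired to argparse. This is an
# assumption, not the project's actual CLI; the --mode choices mirror the
# branches above, and FileType matches args.input/args.refs being handed
# straight to csv.reader.
def _example_score_cli():  # hypothetical helper, for illustration only
    import argparse
    parser = argparse.ArgumentParser(
        description="Score a submitted KB against a reference KB")
    parser.add_argument("input", type=argparse.FileType("r"))
    parser.add_argument("refs", type=argparse.FileType("r"))
    parser.add_argument("--mode", choices=["instance", "relation", "entity"],
                        default="instance")
    do_score(parser.parse_args())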
def do_sample(args):
    """
    entity link -> [mentions]
    - sample entities
    - sample fills
    - sample mentions
    """
    random.seed(args.seed)

    # Read input
    reader = csv.reader(args.input, delimiter="\t")
    mentions, canonical_mention, links, relations = parse_input(reader)
    entries = mentions + canonical_mention + links + relations
    old_entries = [
        parse_input(csv.reader(f, delimiter="\t"))[-1] for f in args.old_entries
    ]
    types = {r[0]: r[1] for r in mentions}

    if args.by_mention:
        relations = sample_by_mention(entries, args.num_entries)
    elif args.by_relation:
        relations = sample_by_relation(types, entries, args.num_entries,
                                       old_entries)
    else:
        relations = sample_by_entity(types, entries, args.num_entries,
                                     args.per_entity, old_entries)
    logger.info("Sampled %d relations", len(relations))

    relations = list(map_relations(mentions, relations))
    mentions = set(m for row in relations for m in [row[0], row[2]])
    mentions.update(row[2] for row in entries
                    if row[1] == "canonical_mention" and row[0] in mentions)
    docs = set(parse_prov(row[3])[0] for row in relations)
    logger.info("Touches %d mentions + canonical-mentions from %d documents",
                len(mentions), len(docs))

    # Nones are for sampling weights.
    mentions = [
        row + [None, None, None] for row in entries
        if row[0] in mentions and not is_reln(row[1])
    ]

    # Reconstruct output: collect all mentions, links, canonical mentions and
    # relations.
    writer = csv.writer(args.output, delimiter="\t")
    for entry in mentions + relations:
        writer.writerow(entry)
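# Hedged sketch: a minimal driver for do_sample taking the default
# entity-sampling path. Each attribute mirrors one the function actually
# reads; the file names and counts are hypothetical.
def _example_sample_driver():  # for illustration only
    from types import SimpleNamespace
    with open("submission.tsv") as inp, open("sample.tsv", "w") as out:
        do_sample(SimpleNamespace(
            input=inp,
            old_entries=[],     # no previously sampled batches
            seed=42,
            by_mention=False,
            by_relation=False,
            num_entries=100,
            per_entity=5,
            output=out,
        ))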
def main():
    err, input_movie_ids = util.parse_input(sys.argv)
    if err:
        return

    # Process the movie list to get the movie matrix.
    matrix = util.get_movie_matrix_from_hd5()

    # Perform LDA and project every movie into the topic space.
    lda = LDA(n_components=no_of_components)
    lda.fit(matrix)
    lda_df = pd.DataFrame(lda.transform(matrix), index=matrix.index)
    input_movie_df = lda_df.loc[input_movie_ids]

    # Score each movie by its cosine similarity to the input movies, weighting
    # earlier input movies more heavily.
    output_movies = []
    for index, movie in lda_df.iterrows():
        cosine_sum = 0
        order = 1
        for j, input_movie in input_movie_df.iterrows():
            cosine_sum += (1 - cosine(movie, input_movie)) * order
            order -= order_factor
        output_movies.append((index, cosine_sum))

    other_movies = list(
        filter(lambda tup: tup[0] not in input_movie_ids, output_movies))
    other_movies.sort(key=lambda tup: tup[1], reverse=True)
    output_movie_ids = [t[0] for t in other_movies][:5]

    # Print the output and log it.
    feedback = util.process_output(input_movie_ids, output_movie_ids,
                                   output_file)

    # Process feedback to get relevant movies and movies to be excluded.
    relevant_movies, movie_to_exclude = util.process_feedback(
        feedback, input_movie_ids)
    relevant_movie_count = len(relevant_movies)

    # If all recommended movies are relevant, there is nothing to revise.
    if relevant_movie_count == 5:
        print("\nAll the movies were relevant hence no modification to the "
              "suggestion")
        return

    # Fetch data frames for the relevant and feedback movies.
    relevant_movies_df = lda_df.loc[relevant_movies]
    feedback_movies_df = lda_df.loc[list(feedback.keys())]

    modified_query = util.probabilistic_feedback_query(feedback_movies_df,
                                                       relevant_movies_df,
                                                       lda_df.index,
                                                       relevant_movie_count)
    revised_movie_ids = util.get_revised_movies(lda_df, modified_query,
                                                movie_to_exclude)
    util.print_revised(revised_movie_ids, output_file)
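# The scoring loop above, in isolation: a candidate's score is the sum of its
# cosine similarities to the input movies, with earlier input movies weighted
# more heavily (the weight drops by order_factor per input movie). The toy
# vectors and the order_factor value here are hypothetical.
def _example_weighted_similarity():  # for illustration only
    import numpy as np
    from scipy.spatial.distance import cosine
    inputs = [np.array([1.0, 0.0]), np.array([0.0, 1.0])]
    candidate = np.array([1.0, 0.0])
    score, order = 0.0, 1.0
    for vec in inputs:
        score += (1 - cosine(candidate, vec)) * order
        order -= 0.1            # hypothetical order_factor
    return score                # 1.0 * 1 + 0.0 * 0.9 == 1.0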
def main():
    err, input_movie_ids = util.parse_input(sys.argv)
    if err:
        return

    matrix = util.get_movie_matrix_from_hd5()

    # Run each model once and collect per-movie scores.
    svd_dict, svd_df = do_svd(matrix, input_movie_ids)
    print('SVD done')
    lda_dict, lda_df = do_lda(matrix, input_movie_ids)
    print('LDA done')
    tensor_dict, decomposed_movies_df, movies_list = do_tensor(input_movie_ids)
    print('Tensor done')
    page_rank_dict = do_page_rank(input_movie_ids)
    print('PageRank done')

    # Fuse the four score dictionaries; a model that did not score a movie
    # contributes 0.
    other_movies = []
    for k in svd_dict:
        if k in input_movie_ids:
            continue
        total_weight = (svd_dict[k] + lda_dict.get(k, 0) +
                        tensor_dict.get(k, 0) + page_rank_dict.get(k, 0))
        other_movies.append((k, total_weight))
    other_movies.sort(key=lambda tup: tup[1], reverse=True)
    output_movie_ids = [t[0] for t in other_movies][:5]

    feedback = util.process_output(input_movie_ids, output_movie_ids,
                                   output_file)

    # Re-score each model using the relevance feedback.
    similarity_svd, movie_to_exclude = process_feedback_movie_vector(
        feedback, input_movie_ids, svd_df.index, svd_df)
    similarity_lda, movie_to_exclude = process_feedback_movie_vector(
        feedback, input_movie_ids, lda_df.index, lda_df)
    similarity_tensor, movie_to_exclude = process_feedback_movie_vector(
        feedback, input_movie_ids, movies_list, decomposed_movies_df)
    page_rank_dict = do_page_rank_relevance(feedback, input_movie_ids)

    # Fuse the revised scores, skipping excluded movies.
    other_movies = []
    for k in similarity_svd:
        if k in movie_to_exclude:
            continue
        total_weight = (similarity_svd[k] + similarity_lda.get(k, 0) +
                        similarity_tensor.get(k, 0) + page_rank_dict.get(k, 0))
        other_movies.append((k, total_weight))
    other_movies.sort(key=lambda tup: tup[1], reverse=True)
    revised_movie_ids = [t[0] for t in other_movies][:5]
    util.print_revised(revised_movie_ids, output_file)
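# The late-fusion pattern above, in isolation: each model contributes a
# per-movie score, and a model that never scored a movie contributes 0 via
# dict.get(k, 0). Toy data, for illustration only.
def _example_fusion():
    svd_scores = {"m1": 0.9, "m2": 0.4}
    lda_scores = {"m1": 0.2}    # m2 unseen by LDA
    fused = {k: svd_scores[k] + lda_scores.get(k, 0) for k in svd_scores}
    return fused                # {"m1": 1.1, "m2": 0.4}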
def do_reweight(args):
    """
    entity link -> [mentions]
    - sample entities
    - sample fills
    - sample mentions
    """
    # Read input
    reader = csv.reader(args.reference, delimiter="\t")
    mentions, canonical_mention, links, relations = parse_input(reader)
    entries = mentions + canonical_mention + links + relations
    types = {r[0]: r[1] for r in mentions}

    reader = csv.reader(args.input, delimiter="\t")
    new_entries = sum(parse_input(reader), [])

    if args.by_relation:
        scheme = "relation"
    else:
        scheme = "entity"

    relations = reweight(types, entries, new_entries, scheme)

    # Don't need to do map because new_entries is ok
    mentions = set(m for row in relations for m in [row[0], row[2]])
    mentions.update(row[2] for row in entries
                    if row[1] == "canonical_mention" and row[0] in mentions)
    docs = set(parse_prov(row[3])[0] for row in relations)
    logger.info("Touches %d mentions + canonical-mentions from %d documents",
                len(mentions), len(docs))

    # Nones are for sampling weights.
    mentions = [
        row + [None, None, None] for row in entries
        if row[0] in mentions and not is_reln(row[1])
    ]

    # Reconstruct output: collect all mentions, links, canonical mentions and
    # relations.
    writer = csv.writer(args.output, delimiter="\t")
    for entry in mentions + relations:
        writer.writerow(entry)
def main():
    err, input_movie_ids = util.parse_input(sys.argv)
    if err:
        return

    # Process the movie list to get the matrix.
    # matrix = util.get_matrix(movie_list)

    # Perform PageRank.
    # output = get_page_rank()

    # Placeholder: replace this line with the PageRank output instead of
    # echoing the input.
    output_movie_ids = input_movie_ids

    # Print the output and log it.
    util.process_output(input_movie_ids, output_movie_ids, output_file)
def paste_votes():
    data = request.get_json(force=True)
    if "csv" not in data:
        return jsonify({'error': 'must provide csv'})

    rd = []
    for row in csv.reader(StringIO(data["csv"]), skipinitialspace=True):
        rd.append(row)

    return jsonify(
        util.parse_input(
            input=rd,
            parties_included=data["has_parties"],
            const_included=data["has_constituencies"],
            const_seats_included=data["has_constituency_seats"],
            adj_seats_included=data["has_constituency_adjustment_seats"]))
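# A sample request body for this endpoint. The keys are exactly the ones the
# handler reads; the route name and the values are hypothetical.
#
#     POST /paste_votes
#     {
#       "csv": "A,B\n10,20\n30,40",
#       "has_parties": true,
#       "has_constituencies": true,
#       "has_constituency_seats": true,
#       "has_constituency_adjustment_seats": true
#     }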
def main():
    err, input_movie_ids = util.parse_input(sys.argv)
    if err:
        return

    # Read the pickle file that contains the actor-movie-genre tensor.
    with open("actor_movie_genre_tensor.pkl", "rb") as f:
        actor_movie_year_3d_matrix = cPickle.load(f)
    actor_movie_year_array = np.array(actor_movie_year_3d_matrix)

    # Perform CP decomposition.
    decomposed = parafac(actor_movie_year_array, no_of_components,
                         init='random')

    mlmovies = util.read_mlmovies()
    mlmovies = mlmovies.loc[mlmovies['year'] >= util.movie_year_for_tensor]
    movies_list = mlmovies.movieid.unique()

    # Data frame for the movie factor matrix from the CP decomposition.
    decomposed_movies_df = pd.DataFrame(decomposed[1], index=movies_list)
    # Data frame containing only the input movies.
    input_movie_df = decomposed_movies_df.loc[input_movie_ids]

    # Find the cosine similarity of each movie vector with the input movie
    # vectors, weighting earlier input movies more heavily, and keep the top 5.
    output_movies = []
    for index, movie in decomposed_movies_df.iterrows():
        cosine_sum = 0
        order = 1
        for j, input_movie in input_movie_df.iterrows():
            cosine_sum += (1 - cosine(movie, input_movie)) * order
            order -= order_factor
        output_movies.append((index, cosine_sum))

    other_movies = list(
        filter(lambda tup: tup[0] not in input_movie_ids, output_movies))
    other_movies.sort(key=lambda tup: tup[1], reverse=True)
    output_movie_ids = [t[0] for t in other_movies][:5]

    # Print the output and log it.
    feedback = util.process_output(input_movie_ids, output_movie_ids,
                                   output_file)

    # Process feedback to get relevant movies and movies to be excluded.
    relevant_movies, movie_to_exclude = util.process_feedback(
        feedback, input_movie_ids)
    relevant_movie_count = len(relevant_movies)

    # If all recommended movies are relevant, there is nothing to revise.
    if relevant_movie_count == 5:
        print("\nAll the movies were relevant hence no modification to the "
              "suggestion")
        return

    # Fetch data frames for the relevant and feedback movies.
    relevant_movies_df = decomposed_movies_df.loc[relevant_movies]
    feedback_movies_df = decomposed_movies_df.loc[list(feedback.keys())]

    modified_query = util.probabilistic_feedback_query(feedback_movies_df,
                                                       relevant_movies_df,
                                                       movies_list,
                                                       relevant_movie_count)
    revised_movie_ids = util.get_revised_movies(decomposed_movies_df,
                                                modified_query,
                                                movie_to_exclude)
    util.print_revised(revised_movie_ids, output_file)
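# A self-contained sketch of the same CP-decomposition step on a random
# tensor. Note: recent tensorly versions return a (weights, factors) pair
# from parafac, while the code above indexes decomposed[1] directly, which
# matches the older list-of-factors API; the tensor shape here is toy data.
def _example_parafac():  # for illustration only
    import numpy as np
    import tensorly as tl
    from tensorly.decomposition import parafac
    tensor = tl.tensor(np.random.rand(4, 6, 3))      # actor x movie x genre
    weights, factors = parafac(tensor, rank=2, init='random')
    return factors[1]                                # movie factor matrix, 6 x 2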