# Gradient-based attribution / perturbation methods. Assumes the helpers
# logit2prob, normalize_score, and perturb_embedding are defined elsewhere
# (a sketch is given further below).
import numpy as np
import torch
from torch.autograd import grad


def integrated_gradient(model, x, pred_label, step_size=0.02, n_iters=4):
    # Integrated gradients with a zero baseline: average the gradients taken
    # at n_iters points along the straight line from 0 to x.
    avg_grad = None
    for n in range(1, n_iters + 1):
        x_ = float(n) / n_iters * x
        x_ = x_.detach()
        gradient, _, _, _ = vanilla_gradient(model, x_, pred_label, step_size)
        if n == 1:
            avg_grad = gradient
        else:
            avg_grad += gradient
    avg_grad /= n_iters

    # Attribution = average gradient * input (elementwise).
    inte_grad = np.multiply(avg_grad, x.detach().cpu().data.numpy())
    scale = np.sum(inte_grad, axis=-1, keepdims=True)
    intp = np.multiply(avg_grad, scale)
    grad_l2 = np.sum(intp[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    # Prediction before the perturbation.
    model.hidden = model.init_hidden()
    pred, _ = model(x.cpu())
    p_prior = logit2prob(pred[0].data.numpy())

    intp /= np.sqrt(np.sum(intp[:, 0, :]**2))  # normalize to unit length
    x_after = np.copy(x.cpu().data.numpy())
    x_after = perturb_embedding(x_after, intp * step_size)
    x_after = torch.from_numpy(x_after)

    # Prediction after perturbing the embeddings along the attribution.
    model.hidden = model.init_hidden()
    pred, _ = model(x_after.cpu())
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return inte_grad, importance_score, x_after, changes_pred, avg_grad
def smooth_gradient(model, x0, pred_label, DEVICE, step_size,
                    noise_range=0.02, n_iters=20):
    smooth_grad = None
    for n in range(n_iters):
        x0_ = x0 + torch.randn(x0.shape).to(DEVICE) * noise_range
        gradient, _, _, _ = vanilla_gradient(model, x0_, pred_label)
        if n == 0:
            smooth_grad = gradient
        else:
            smooth_grad += gradient
    smooth_grad /= n_iters

    grad_l2 = np.sum(smooth_grad[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    model.hidden = model.init_hidden()
    pred, _ = model(x0.cpu())
    p_prior = logit2prob(pred[0].data.numpy())

    smooth_grad /= np.sqrt(np.sum(smooth_grad[:, 0, :]**2))  # normalize to unit length
    x_after = np.copy(x0.cpu().data.numpy())
    x_after = perturb_embedding(x_after, smooth_grad * step_size)
    x_after = torch.from_numpy(x_after)

    model.hidden = model.init_hidden()
    pred, _ = model(x_after)
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return smooth_grad, importance_score, x_after, changes_pred
def vanilla_gradient(model, x, pred_label, step_size=0.02):
    # Gradient of the predicted-class logit w.r.t. the input embeddings.
    model.batch_size = 1
    model.hidden = model.init_hidden()
    x = x.cpu()
    x.requires_grad = True
    pred, _ = model(x)
    x_prior = x.data.numpy()
    p_prior = logit2prob(pred[0].data.numpy())

    # Select the predicted-class logit via a one-hot mask and differentiate.
    one_hot = np.zeros((1, 2), dtype=np.float32)
    one_hot[0][pred_label[0]] = 1
    one_hot = torch.from_numpy(one_hot)
    one_hot = torch.sum(one_hot * pred[0])
    gradient = grad(one_hot, x)[0].numpy()

    grad_l2 = np.sum(gradient[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    gradient /= np.sqrt(np.sum(gradient[:, 0, :]**2))  # normalize to unit length
    x_after = np.copy(x_prior)
    x_after = perturb_embedding(x_after, gradient * step_size)
    x_after = torch.from_numpy(x_after)

    model.hidden = model.init_hidden()
    pred, _ = model(x_after)
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return gradient, importance_score, x_after, changes_pred
def iterative_gradient(model, x0, pred_label, step_size, epsilon, max_iters=80):
    # Repeatedly take vanilla-gradient steps until the perturbation leaves an
    # epsilon-ball around x0 or max_iters is reached.
    x0_np = x0.cpu().numpy()
    x_after_np = np.copy(x0_np)
    x_after = x0.detach()
    cnt = 0
    while np.linalg.norm(x_after_np - x0_np) <= epsilon and cnt <= max_iters:
        _, _, x_after, _ = vanilla_gradient(model, x_after, pred_label, step_size)
        x_after = x_after.detach()
        x_after_np = x_after.cpu().numpy()
        cnt += 1

    # Importance is derived from the total displacement of each token embedding.
    x_delta = x_after - x0.cpu()
    grad_l2 = np.sum(x_delta.numpy()[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2)

    model.hidden = model.init_hidden()
    pred, _ = model(x0.cpu())
    p_prior = logit2prob(pred[0].data.numpy())

    model.hidden = model.init_hidden()
    pred, _ = model(x_after.cpu())
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return x_delta.numpy(), importance_score, x_after, changes_pred
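# The helpers used above are not shown in this listing. Below is a minimal
# sketch of plausible implementations, assuming binary logits, min-max score
# normalization, and a purely additive embedding perturbation; the real
# helpers may differ (e.g. they might subtract the delta or mask padding).
def logit2prob(logits):
    # Numerically stable softmax over the class logits.
    e = np.exp(logits - np.max(logits))
    return e / np.sum(e)


def normalize_score(score):
    # Rescale per-token scores to [0, 1] (assumption: min-max normalization).
    score = score - np.min(score)
    denom = np.max(score)
    return score / denom if denom > 0 else score


def perturb_embedding(x, delta):
    # Move the embeddings along the (unit-length) attribution direction.
    return x + delta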
# The variants below take `row` = (inputs_embeds, token_type_ids,
# attention_mask) and call the model with keyword arguments, i.e. they target
# a transformer-style (BERT-like) classifier, while the functions above drive
# a recurrent model through init_hidden()/hidden. The repeated function names
# suggest the two groups come from separate modules.
def gradient_times_input(model, row, pred_label, DEVICE, step_size=0.02):
    gradient, _, _, _ = vanilla_gradient(
        model, row, pred_label, DEVICE, step_size=step_size)
    x0, segments_ids, input_masks = row

    grad_times_input = np.multiply(gradient, x0.detach().cpu().data.numpy())
    scale = np.sum(grad_times_input, axis=-1, keepdims=True)
    intp = np.multiply(gradient, scale)
    grad_l2 = np.sum(intp[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    pred = model(inputs_embeds=x0, token_type_ids=segments_ids,
                 attention_mask=input_masks, labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())

    intp /= np.sqrt(np.sum(intp[0, :, :]**2))  # normalize to unit length
    x_after = np.copy(x0.cpu().data.numpy())
    x_after = perturb_embedding(x_after, intp * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)

    pred = model(inputs_embeds=x_after, token_type_ids=segments_ids,
                 attention_mask=input_masks, labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return grad_times_input, importance_score, x_after, changes_pred
def vanilla_gradient(model, row, pred_label, DEVICE, step_size=0.02):
    x, segments_ids, input_masks = row
    x.requires_grad = True
    pred = model(inputs_embeds=x, token_type_ids=segments_ids,
                 attention_mask=input_masks, labels=None)[0]
    x_prior = x.cpu().data.numpy()
    p_prior = logit2prob(pred[0].cpu().data.numpy())

    # Select the predicted-class logit via a one-hot mask and differentiate.
    one_hot = np.zeros((1, 2), dtype=np.float32)
    one_hot[0][pred_label[0]] = 1
    one_hot = torch.from_numpy(one_hot).to(DEVICE)
    one_hot = torch.sum(one_hot * pred[0])
    gradient = grad(one_hot, x)[0].cpu().numpy()

    grad_l2 = np.sum(gradient[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    gradient_unit = gradient / np.sqrt(np.sum(gradient[0, :, :]**2))  # normalize to unit length
    x_after = np.copy(x_prior)
    x_after = perturb_embedding(x_after, gradient_unit * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)

    pred = model(inputs_embeds=x_after, token_type_ids=segments_ids,
                 attention_mask=input_masks, labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return gradient, importance_score, x_after, changes_pred
def integrated_gradient(model, row, pred_label, DEVICE, step_size=0.02, n_iters=7):
    x, segments_ids, input_masks = row
    avg_grad = None
    for n in range(1, n_iters + 1):
        x_ = float(n) / n_iters * x
        x_ = x_.detach()
        gradient, _, _, _ = vanilla_gradient(
            model, [x_, segments_ids, input_masks], pred_label, DEVICE)
        if n == 1:
            avg_grad = gradient
        else:
            avg_grad += gradient
    avg_grad /= n_iters

    inte_grad = np.multiply(avg_grad, x.detach().cpu().data.numpy())
    scale = np.sum(inte_grad, axis=-1, keepdims=True)
    intp = np.multiply(avg_grad, scale)
    grad_l2 = np.sum(intp[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    pred = model(inputs_embeds=x, token_type_ids=segments_ids,
                 attention_mask=input_masks, labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())

    intp /= np.sqrt(np.sum(intp[0, :, :]**2))  # normalize to unit length
    x_after = np.copy(x.cpu().data.numpy())
    x_after = perturb_embedding(x_after, intp * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)

    pred = model(inputs_embeds=x_after, token_type_ids=segments_ids,
                 attention_mask=input_masks, labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return inte_grad, importance_score, x_after, changes_pred
# Gradient * input for the recurrent model (same init_hidden()/hidden API as
# the functions near the top of this listing).
def gradient_times_input(model, x, pred_label, step_size=0.02):
    gradient, _, _, _ = vanilla_gradient(
        model, x.detach(), pred_label, step_size=step_size)

    grad_times_input = np.multiply(gradient, x.detach().cpu().data.numpy())
    scale = np.sum(grad_times_input, axis=-1, keepdims=True)
    intp = np.multiply(gradient, scale)
    grad_l2 = np.sum(intp[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    model.hidden = model.init_hidden()
    pred, _ = model(x.cpu())
    p_prior = logit2prob(pred[0].data.numpy())

    intp /= np.sqrt(np.sum(intp[:, 0, :]**2))  # normalize to unit length
    x_after = np.copy(x.cpu().data.numpy())
    x_after = perturb_embedding(x_after, intp * step_size)
    x_after = torch.from_numpy(x_after)

    model.hidden = model.init_hidden()
    pred, _ = model(x_after.cpu())
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return intp, importance_score, x_after, changes_pred
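# A minimal usage sketch for the recurrent-model functions. The model API
# (init_hidden()/hidden/batch_size, returning a (logits, ...) pair) and the
# embedding shape (seq_len, 1, emb_dim) are inferred from the code above;
# `explain_example`, `model`, and `x` are hypothetical names, not part of the
# original modules.
def explain_example(model, x):
    model.batch_size = 1
    model.hidden = model.init_hidden()
    pred, _ = model(x.cpu())
    # Explain the model's own prediction.
    pred_label = [int(np.argmax(pred[0].data.numpy()))]
    return {
        'vanilla': vanilla_gradient(model, x, pred_label),
        'grad_x_input': gradient_times_input(model, x, pred_label),
        'integrated': integrated_gradient(model, x, pred_label),
    }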
def smooth_gradient(model, row, pred_label, DEVICE, step_size, n_iters=20):
    x0, segments_ids, input_masks = row
    noise_range = 0.4 * step_size
    smooth_grad = None
    for n in range(n_iters):
        noise = torch.randn(x0.shape)
        # Rescale the noise so its L2 norm equals noise_range.
        noise = noise / torch.sqrt(torch.sum(noise[0, :, :]**2)) * noise_range
        x0_ = x0 + noise.to(DEVICE)
        gradient, _, _, _ = vanilla_gradient(
            model, [x0_, segments_ids, input_masks], pred_label, DEVICE)
        if n == 0:
            smooth_grad = gradient
        else:
            smooth_grad += gradient
    smooth_grad /= n_iters

    grad_l2 = np.sum(smooth_grad[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    pred = model(inputs_embeds=x0, token_type_ids=segments_ids,
                 attention_mask=input_masks, labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())

    smooth_grad /= np.sqrt(np.sum(smooth_grad[0, :, :]**2))  # normalize to unit length
    x_after = np.copy(x0.cpu().data.numpy())
    x_after = perturb_embedding(x_after, smooth_grad * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)

    pred = model(inputs_embeds=x_after, token_type_ids=segments_ids,
                 attention_mask=input_masks, labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return smooth_grad, importance_score, x_after, changes_pred
def iterative_gradient(model, row, pred_label, DEVICE, step_size, epsilon,
                       max_iters=40):
    x0, segments_ids, input_masks = row
    x0_np = x0.cpu().numpy()
    x_after_np = np.copy(x0_np)
    # iterative perturbation
    x_after = x0.detach()
    cnt = 0
    while np.linalg.norm(x_after_np - x0_np) <= epsilon and cnt <= max_iters:
        _, _, x_after, _ = vanilla_gradient(
            model, [x_after, segments_ids, input_masks], pred_label, DEVICE,
            step_size)
        x_after = x_after.clone().detach()
        x_after_np = x_after.cpu().numpy()
        cnt += 1

    x_delta = x_after - x0
    grad_l2 = np.sum(x_delta.cpu().numpy()[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2)

    pred = model(inputs_embeds=x0, token_type_ids=segments_ids,
                 attention_mask=input_masks, labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())

    pred = model(inputs_embeds=x_after, token_type_ids=segments_ids,
                 attention_mask=input_masks, labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return x_delta.cpu().numpy(), importance_score, x_after, changes_pred
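# Corresponding hypothetical driver for the transformer-style variants, where
# each example is a (inputs_embeds, token_type_ids, attention_mask) triple on
# DEVICE; `explain_example_bert` is an assumed name for illustration only.
def explain_example_bert(model, row, DEVICE, step_size=0.02):
    x, segments_ids, input_masks = row
    pred = model(inputs_embeds=x, token_type_ids=segments_ids,
                 attention_mask=input_masks, labels=None)[0]
    # Explain the model's own prediction.
    pred_label = [int(np.argmax(pred[0].cpu().data.numpy()))]
    return {
        'vanilla': vanilla_gradient(model, row, pred_label, DEVICE, step_size),
        'smooth': smooth_gradient(model, row, pred_label, DEVICE, step_size),
        'integrated': integrated_gradient(model, row, pred_label, DEVICE,
                                          step_size),
    }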
# Flask view for the movie search page; relies on module-level data
# (movie_dict, reverse_dict, year_list, max_tmdb_count, max_imdb_count,
# max_meta_count) and helpers (parse_lst_str, xstr, user_duration,
# user_release, boosting, utils) defined elsewhere in the app.
import math
import operator

import numpy as np
from flask import render_template, request


def search():
    data = []
    # User inputs.
    similar = request.args.get('similar')
    genres = request.args.get('genres')
    castCrew = request.args.get('castCrew')
    keywords = request.args.get('keywords')
    duration = request.args.get('duration')
    release_start = request.args.get('release_start')
    release_end = request.args.get('release_end')
    ratings = request.args.get('ratings')
    languages = request.args.get('languages')
    acclaim = request.args.get('acclaim')
    popularity = request.args.get('popularity')

    old_inputs = ''
    filtered_movie_dict = dict(movie_dict)
    query_dict = dict()

    ########### QUERY DICT GENERATION ###########
    if similar:
        selected_movies = parse_lst_str(similar)
        old_similar = similar.replace('"', '').replace("'", "")
        old_inputs += '<strong>Similar Movies: </strong>' + old_similar + "<br>"
    if genres:
        selected_genres = parse_lst_str(genres)
        query_dict['genres'] = selected_genres
        old_inputs += '<strong>Genres: </strong>' + genres + "<br>"
    if castCrew:
        selected_crew = parse_lst_str(castCrew)
        old_castCrew = castCrew.replace('"', '').replace("'", "")
        query_dict['castCrew'] = selected_crew
        old_inputs += '<strong>Cast/Crew: </strong>' + old_castCrew + "<br>"
    if keywords:
        selected_keywords = parse_lst_str(keywords)
        old_keywords = keywords.replace('"', '').replace("'", "")
        # Store the parsed list; downstream scoring uses selected_keywords.
        query_dict['keywords'] = selected_keywords
        old_inputs += '<strong>Keywords: </strong>' + old_keywords + "<br>"
    if duration:
        duration_val = user_duration.parse(duration)
        duration_val = (duration_val[0] if len(duration_val) == 1 else
                        (duration_val[0] + duration_val[1]) / 2)
        query_dict['runtime'] = duration_val
        old_inputs += '<strong>Duration: </strong>' + duration + " min<br>"
    if release_start or release_end:
        years = user_release.parse([release_start, release_end])
        if len(years) > 1:
            old_inputs += ('<strong>Release Years: </strong>' + str(years[0]) +
                           "-" + str(years[1]) + "<br>")
    if ratings:
        selected_ratings = parse_lst_str(ratings)
        old_inputs += '<strong>Ratings: </strong>' + ratings + "<br>"
    if languages:
        selected_languages = parse_lst_str(languages)
        old_inputs += '<strong>Languages: </strong>' + languages + "<br>"
    if acclaim == 'yes':
        old_inputs += '<strong>Acclaim: </strong>Yes<br>'
    if popularity == 'yes':
        old_inputs += '<strong>Popularity: </strong>Yes<br>'

    ########### FILTERING OF DICTIONARIES ###########
    # Apply the hard filters first.
    if duration:
        filtered_movie_dict = user_duration.main(filtered_movie_dict, duration, 1)
        duration_score_dict = boosting.gaussian_score_duration(
            filtered_movie_dict, query_dict['runtime'], 1, 0)
    if release_start or release_end:
        filtered_movie_dict = user_release.main(
            filtered_movie_dict, [release_start, release_end])
    if ratings:
        filtered_movie_dict = utils.filter_ratings(filtered_movie_dict,
                                                   selected_ratings)
    if languages:
        filtered_movie_dict = utils.filter_languages(filtered_movie_dict,
                                                     selected_languages)

    # Bail out early if the hard filters leave no results.
    if not filtered_movie_dict:
        return render_template('search.html',
                               old_similar=xstr(similar),
                               old_genres=xstr(genres),
                               old_castCrew=xstr(castCrew),
                               old_keywords=xstr(keywords),
                               old_duration=xstr(duration),
                               old_release_start=xstr(release_start),
                               old_release_end=xstr(release_end),
                               old_ratings=xstr(ratings),
                               old_languages=xstr(languages),
                               old_acclaim=xstr(acclaim),
                               old_popularity=xstr(popularity),
                               data=[],
                               year_list=year_list)

    if acclaim == 'yes':
        acclaim_score_dict = utils.half_gaussian_acclaim(filtered_movie_dict, 1, 0)
    ########### BOOST THE "QUERY MOVIE" WITH THE SIMILAR MOVIES ###########
    if similar:
        similar_tup_lst = []
        for similar_mov in selected_movies:
            similar_id = reverse_dict[similar_mov]
            similar_genres = movie_dict[similar_id]['genres']
            sim_rating = movie_dict[similar_id]['rating']
            sim_lang = movie_dict[similar_id]['original_language']
            sim_cast = [member['name'] for member in movie_dict[similar_id]['cast']]
            sim_crew = [member['name'] for member in movie_dict[similar_id]['crew']]
            similar_castCrew = sim_cast + sim_crew
            sim_release_year = user_release.parse_single(
                movie_dict[similar_id]['release_date'])
            similar_tup_lst.append((similar_id, similar_genres, similar_castCrew,
                                    sim_release_year, sim_rating, sim_lang))
        filtered_movie_dict = utils.filter_similar(filtered_movie_dict,
                                                   selected_movies)
        ranked_sim_lst = [utils.get_similar_ranking(tup, filtered_movie_dict)
                          for tup in similar_tup_lst]

    ########### VECTORIZE MOVIES GIVEN QUERY ###########
    movie_feature_lst, movie_id_lookup = [], {}
    for index, movie in enumerate(filtered_movie_dict):
        features_lst = []
        filtered_movie_dict[movie]['scores'] = dict()

        if similar:
            # Average similarity of this movie to each query movie
            # (genres weighted double, plus cast/crew and keyword overlap).
            cumulative_score = 0.0
            for sim_movie in selected_movies:
                sim_id = reverse_dict[sim_movie]
                genres_score = utils.get_set_overlap(
                    movie_dict[sim_id]['genres'],
                    filtered_movie_dict[movie]['genres'])
                sim_cast = [member['name'] for member in movie_dict[sim_id]['cast']]
                sim_crew = [member['name'] for member in movie_dict[sim_id]['crew']]
                cast = [member['name']
                        for member in filtered_movie_dict[movie]['cast']]
                crew = [member['name']
                        for member in filtered_movie_dict[movie]['crew']]
                cast_score = utils.get_set_overlap(sim_cast + sim_crew, cast + crew)
                keywords_score = utils.get_set_overlap(
                    movie_dict[sim_id]['keywords'],
                    filtered_movie_dict[movie]['keywords'])
                cumulative_score += (2.0 * genres_score + cast_score +
                                     keywords_score) / 4.0
            average_score = cumulative_score / len(selected_movies)
            filtered_movie_dict[movie]['scores']['similar movies'] = math.floor(
                round(average_score, 2) * 100)

        # Genres of movie m -> Jaccard similarity with the query.
        if genres:
            genres_score = utils.get_set_overlap(
                query_dict['genres'], filtered_movie_dict[movie]['genres'])
            filtered_movie_dict[movie]['scores']['genres'] = math.floor(
                round(genres_score, 2) * 100)
            features_lst.append(genres_score)

        # Cast and crew of movie m -> Jaccard similarity with the query.
        if castCrew:
            cast = [member['name']
                    for member in filtered_movie_dict[movie]['cast']]
            crew = [member['name']
                    for member in filtered_movie_dict[movie]['crew']]
            castCrew_score = utils.get_set_overlap(query_dict['castCrew'],
                                                   cast + crew)
            filtered_movie_dict[movie]['scores']['cast'] = math.floor(
                round(castCrew_score, 2) * 100)
            features_lst.append(castCrew_score)

        # Query keywords -> Jaccard similarity with movie m's keywords.
        if keywords:
            keywords_score = utils.get_set_overlap(
                selected_keywords, filtered_movie_dict[movie]['keywords'])
            filtered_movie_dict[movie]['scores']['keywords'] = math.floor(
                round(keywords_score, 2) * 100)
            features_lst.append(keywords_score)

        # Duration & release date -> probabilistic Gaussian fit around the mean.
        if duration and len(user_duration.parse(duration)) == 1:
            duration_score = duration_score_dict[movie]
            filtered_movie_dict[movie]['scores']['duration'] = math.floor(
                round(duration_score, 2) * 100)
            features_lst.append(duration_score)
        if duration and len(user_duration.parse(duration)) == 2:
            filtered_movie_dict[movie]['scores']['duration'] = 100
        if release_start or release_end:
            filtered_movie_dict[movie]['scores']['release'] = 100

        # Acclaim -> value between 0 and 1.
        if acclaim == "yes":
            acclaim_score = acclaim_score_dict[movie] / 0.14
            filtered_movie_dict[movie]['scores']['acclaim'] = math.floor(
                round(acclaim_score, 2) * 100)
            features_lst.append(acclaim_score)

        # Popularity -> value between 0 and 1.
        if popularity == "yes":
            popularity_score = utils.calc_popularity(
                filtered_movie_dict, movie, max_tmdb_count, max_imdb_count,
                max_meta_count)
            filtered_movie_dict[movie]['scores']['popularity'] = math.floor(
                round(popularity_score, 2) * 100)
            features_lst.append(popularity_score)

        if ratings:
            filtered_movie_dict[movie]['scores']['ratings'] = 100
        if languages:
            filtered_movie_dict[movie]['scores']['languages'] = 100

        movie_feature_lst.append(features_lst)
        movie_id_lookup[index] = movie

    movie_matrix = np.zeros((len(movie_feature_lst), len(movie_feature_lst[0])))
    for i in range(len(movie_feature_lst)):
        for k in range(len(movie_feature_lst[i])):
            movie_matrix[i][k] = movie_feature_lst[i][k]

    ########### RUN KNN ON VECTORS, RETURN TOP MATCHES ###########
    # Rank movies by Euclidean distance to the ideal query vector of all ones.
    n, d = movie_matrix.shape
    query = np.ones(d)
    dists = np.linalg.norm(movie_matrix - query, axis=1, ord=2)
    ranked_lst = np.argsort(dists)
    sorted_movie_list = [movie_id_lookup[movie_id] for movie_id in ranked_lst]
    sorted_movie_dict = {m: i for i, m in enumerate(sorted_movie_list, 1)}

    ########### CONSOLIDATE WITH THE SIMILAR MOVIE LIST ###########
    # If similar movies is the only user input, ignore the KNN ranking and use
    # only the similar-movie rankings; otherwise merge the two by summing ranks.
    if similar:
        if not (genres or castCrew or keywords):
            sorted_movie_dict = {}
        for lst in ranked_sim_lst:
            for index, movie in enumerate(lst, 1):
                if movie not in sorted_movie_dict:
                    sorted_movie_dict[movie] = 0
                sorted_movie_dict[movie] += index

    # Compute the overall similarity score.
    overall_score = {}
    denom = (len(sorted_movie_dict) * len(ranked_sim_lst) if similar
             else len(sorted_movie_dict))
    for movie in sorted_movie_dict:
        overall_score[movie] = abs(math.log(float(sorted_movie_dict[movie]) / denom))
    overall_score = utils.normalize_score(overall_score, denom)

    sorted_movie_tup_lst = sorted(sorted_movie_dict.items(),
                                  key=operator.itemgetter(1))
    sorted_movie_list = [k for k, v in sorted_movie_tup_lst]

    ########### TRANSFORM THE SORTED LIST INTO FRONT-END FORM ###########
    for movie_id in sorted_movie_list[:24]:
        filtered_movie_dict[movie_id]['scores']['overall_score'] = math.floor(
            round(overall_score[movie_id], 2) * 100)
        # Strip non-ASCII characters from the echoed inputs.
        filtered_movie_dict[movie_id]['scores']['old_inputs'] = old_inputs.encode(
            'ascii', 'ignore').decode('ascii')
        data.append(filtered_movie_dict[movie_id])
    # Chunk the top results into rows of four for the template grid.
    data = [data[i:i + 4] for i in range(0, len(data), 4)]

    return render_template('search.html',
                           old_similar=xstr(similar),
                           old_genres=xstr(genres),
                           old_castCrew=xstr(castCrew),
                           old_keywords=xstr(keywords),
                           old_duration=xstr(duration),
                           old_release_start=xstr(release_start),
                           old_release_end=xstr(release_end),
                           old_ratings=xstr(ratings),
                           old_languages=xstr(languages),
                           old_acclaim=xstr(acclaim),
                           old_popularity=xstr(popularity),
                           advanced=(castCrew or keywords or duration
                                     or release_start or release_end
                                     or ratings or languages),
                           data=data[:6],
                           year_list=year_list)
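# `xstr` and `parse_lst_str` are referenced above but not defined in this
# listing. A minimal sketch under the assumption that xstr maps None to an
# empty string (for re-populating the form) and parse_lst_str splits a quoted,
# comma-separated query parameter into a list; the real helpers may differ.
def xstr(s):
    return '' if s is None else str(s)


def parse_lst_str(s):
    # e.g. '"The Matrix","Inception"' -> ['The Matrix', 'Inception']
    return [tok.strip().strip('"').strip("'")
            for tok in s.split(',') if tok.strip()]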