def emit_ndcg_model_predictions(use_wandb=False, n_neighbors=100):
    """Emit the top-``n_neighbors`` code predictions for every evaluation query.

    For each language in ``shared.LANGUAGES`` this embeds all evaluation
    queries with that language's cached model, finds the nearest cached code
    embeddings by cosine distance, and appends one prediction row per
    (query, neighbor) pair. All rows are written to a single CSV.

    :param use_wandb: if True, save the CSV into the active wandb run
        directory; otherwise save it to ``../model_predictions.csv``.
    :param n_neighbors: number of nearest code snippets to emit per query.
        Defaults to 100, matching the original hard-coded cutoff.
    """
    build_code_embeddings()
    queries = utils.get_evaluation_queries()
    predictions = []
    for language in shared.LANGUAGES:
        print(f'Evaluating {language}')
        # Only url/identifier are needed downstream; drop the rest of each doc.
        evaluation_docs = [{
            'url': doc['url'],
            'identifier': doc['identifier']
        } for doc in utils.load_cached_docs(language, 'evaluation')]
        code_embeddings = utils.load_cached_code_embeddings(language)

        model = utils.load_cached_model_weights(language,
                                                train_model.get_model())
        query_embedding_predictor = train_model.get_query_embedding_predictor(
            model)
        query_seqs = prepare_data.pad_encode_seqs(
            prepare_data.preprocess_query_tokens,
            (line.split(' ') for line in queries),
            shared.QUERY_MAX_SEQ_LENGTH, language, 'query')
        query_embeddings = query_embedding_predictor.predict(query_seqs)

        # TODO: Query annoy index
        nn = NearestNeighbors(n_neighbors=n_neighbors,
                              metric='cosine',
                              n_jobs=-1)
        nn.fit(code_embeddings)
        _, nearest_neighbor_indices = nn.kneighbors(query_embeddings)

        for query_idx, query in enumerate(queries):
            for query_nearest_code_idx in nearest_neighbor_indices[
                    query_idx, :]:
                predictions.append({
                    'query': query,
                    'language': language,
                    'identifier':
                        evaluation_docs[query_nearest_code_idx]['identifier'],
                    'url': evaluation_docs[query_nearest_code_idx]['url'],
                })

        # Free the per-language docs eagerly to keep peak memory down
        # before the next language's data is loaded.
        del evaluation_docs
        gc.collect()

    df_predictions = pd.DataFrame(
        predictions, columns=['query', 'language', 'identifier', 'url'])
    save_path = os.path.join(
        wandb.run.dir,
        'model_predictions.csv') if use_wandb else '../model_predictions.csv'
    df_predictions.to_csv(save_path, index=False)
def build_code_embeddings():
    """Predict and cache evaluation-corpus code embeddings for every language."""
    for language in shared.LANGUAGES:
        print(f'Building {language} code embeddings')
        # Restore the trained weights for this language into a fresh model.
        weights_model = utils.load_cached_model_weights(
            language, train_model.get_model())
        predictor = train_model.get_code_embedding_predictor(weights_model)
        seqs = utils.load_cached_seqs(language, 'evaluation', 'code')
        embeddings = predictor.predict(seqs)
        utils.cache_code_embeddings(embeddings, language)
def get_nearest_query_neighbors_per_language(query):
    """Return, per language, the ANN neighbors (with distances) of *query*.

    The query is encoded and embedded with each language's cached model, then
    looked up in that language's cached annoy index.
    """
    results = {}
    for language in shared.LANGUAGES:
        # Embed the query with this language's trained model.
        lang_model = utils.load_cached_model_weights(
            language, train_model.get_model())
        predictor = train_model.get_query_embedding_predictor(lang_model)
        encoded = prepare_data.pad_encode_query(query, language)
        embedding = predictor.predict(encoded.reshape(1, -1))[0, :]

        # Look up the closest code vectors in the cached annoy index.
        ann = utils.load_cached_ann(language)
        results[language] = ann.get_nns_by_vector(
            embedding, RESULTS_PER_LANGUAGE, include_distances=True)
    return results
def evaluate_language_mean_mrr(language):
    """Compute and print validation and test mean MRR for one language.

    Returns a ``(valid_mean_mrr, test_mean_mrr)`` tuple.
    """
    model = utils.load_cached_model_weights(language, train_model.get_model())

    def _split_mean_mrr(split):
        # Mean MRR over the cached code/query pairs of one dataset split.
        code_seqs = utils.load_cached_seqs(language, split, 'code')
        query_seqs = utils.load_cached_seqs(language, split, 'query')
        return evaluate_model_mean_mrr(model, code_seqs, query_seqs)

    valid_mean_mrr = _split_mean_mrr('valid')
    test_mean_mrr = _split_mean_mrr('test')
    print(
        f'Evaluating {language} - Valid Mean MRR: {valid_mean_mrr}, Test Mean MRR: {test_mean_mrr}'
    )
    return valid_mean_mrr, test_mean_mrr
# Ad-hoc search: take a query string from the CLI and, for every language,
# print the URLs of the 3 closest code snippets by cosine distance.
query = sys.argv[1]
for language in shared.LANGUAGES:
    print(f'Evaluating {language}')
    docs = [{
        'url': doc['url'],
        'identifier': doc['identifier']
    } for doc in utils.load_cached_docs(language, 'evaluation')]
    print('Read the docs')
    embeddings = utils.load_cached_code_embeddings(language)

    # Encode the single query the same way the batch pipeline encodes many.
    encoded_query = prepare_data.pad_encode_seqs(
        prepare_data.preprocess_query_tokens,
        (line.split(' ') for line in [query]),
        shared.QUERY_MAX_SEQ_LENGTH, language, 'query')

    lang_model = utils.load_cached_model_weights(language,
                                                 train_model.get_model())
    predictor = train_model.get_query_embedding_predictor(lang_model)
    query_vectors = predictor.predict(encoded_query)

    # TODO: Replace with annoy index
    searcher = NearestNeighbors(n_neighbors=3, metric='cosine', n_jobs=-1)
    searcher.fit(embeddings)
    _, neighbor_indices = searcher.kneighbors(query_vectors)

    for code_idx in neighbor_indices[0, :]:
        print(docs[code_idx]['url'])