def search(self, query, language='python', topk=5):
    predictions = []
    # Embed the natural-language query for the requested language.
    query_embedding = self.model.get_query_representations([{
        'docstring_tokens': tokenize_docstring_from_string(query),
        'language': language
    }])[0]
    # Retrieve the top-k nearest code vectors from the per-language Annoy index.
    idxs, distances = self.indices[language].get_nns_by_vector(
        query_embedding, topk, search_k=10000, include_distances=True)
    for i, idx in enumerate(idxs):
        # print(self.definitions[idx].keys())
        predictions.append({
            "id": self.definitions[language][idx]['sha'],
            "name": self.definitions[language][idx]['identifier'],
            "func": self.definitions[language][idx]['function'],
            "languages": [language],
            "scores": [{
                "name": "similarity",
                "score": distances[i]
            }]
        })
    return predictions
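# Usage sketch (not from the original source; the engine class name and
# constructor below are placeholders): assuming an object that carries the
# `model`, `indices` and `definitions` attributes used in `search` above,
#
#   engine = CodeSearchEngine(model, indices, definitions)   # hypothetical
#   results = engine.search('sort a list of dicts by key', language='python', topk=5)
#   for hit in results:
#       print(hit['name'], hit['scores'][0]['score'])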
def query_model(query, model, indices, language, topk=100):
    query_embedding = model.get_query_representations([{
        'docstring_tokens': tokenize_docstring_from_string(query),
        'language': language
    }])[0]
    idxs, distances = indices.get_nns_by_vector(query_embedding, topk,
                                                include_distances=True)
    return idxs, distances
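# --- Illustrative sketch, not part of the original script --------------------
# Shows how an Annoy index like the one passed to query_model (or stored in
# self.indices above) could be built from precomputed code embeddings. The
# embedding dimension, metric and tree count are assumptions, not values taken
# from this repository.
import numpy as np
from annoy import AnnoyIndex

def build_code_index(code_vectors, n_trees=100):
    # code_vectors: array of shape (num_functions, embedding_dim)
    index = AnnoyIndex(code_vectors.shape[1], 'angular')  # angular ~ cosine distance
    for item_id, vector in enumerate(code_vectors):
        index.add_item(item_id, vector)
    index.build(n_trees)
    return index

# Toy demonstration: index 1000 random 128-d "code embeddings" and query them.
toy_vectors = np.random.rand(1000, 128).astype('float32')
toy_index = build_code_index(toy_vectors)
nearest_ids, nearest_distances = toy_index.get_nns_by_vector(
    toy_vectors[0], 5, search_k=10000, include_distances=True)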
    language), 'rb'))
# Each definition record has these fields:
# dict_keys(['nwo', 'sha', 'path', 'language', 'identifier', 'parameters',
#            'argument_list', 'return_statement', 'docstring', 'docstring_summary',
#            'docstring_tokens', 'function', 'function_tokens', 'url', 'score'])
indexes = [{
    'code': d['function'],
    'code_tokens': d['function_tokens'],
    'language': d['language']
} for d in tqdm(definitions)]
code_representations = model.get_code_representations(indexes)

# Use KNN: embed every query, then find its nearest code representations
# with scikit-learn's cosine-distance NearestNeighbors.
query_embeddings = []
for query in queries:
    query_embedding = model.get_query_representations([{
        'docstring_tokens': tokenize_docstring_from_string(query),
        'language': language
    }])[0]
    query_embeddings.append(query_embedding)

nn = NearestNeighbors(n_neighbors=100, metric='cosine', n_jobs=-1)
nn.fit(code_representations)
_, nearest_neighbor_indices = nn.kneighbors(query_embeddings)

for query_idx, query in enumerate(queries):
    for query_nearest_code_idx in nearest_neighbor_indices[query_idx, :]:
        predictions.append({
            'query': query,