import glob
import os
from pathlib import Path

import numpy as np

# Assumed import path: create_nmslib_search_index is a project helper
# (adjust the module name to your layout).
from general_utils import create_nmslib_search_index


def create_searchindex(postgres):
    base_dir = ''
    code2emb_path = Path(base_dir + './data/code2emb/')
    seq2seq_path = Path(base_dir + './data/seq2seq/')
    data_path = Path(base_dir + './data/processed_data/')
    output_path = Path(base_dir + './data/search')
    input_path = Path(base_dir + './data/processed_data/')
    npy_path = Path(base_dir + './data/npy/')

    with open(code2emb_path / 'nodoc_vecs.npy', 'wb') as f_handle:
        first = False
        dataArray = None
        for npfile in glob.glob("data/npy/*"):
            filepath = npfile
            print("filepath = ", filepath)
            # File names look like '<fileid>####...': keep the part before the
            # marker and strip any leading directory components.
            temp = npfile.split('####')
            if "\\" in temp[0]:
                temp = temp[0].split("\\")
            elif "/" in temp[0]:
                temp = temp[0].split("/")
            fileid = temp[-1]
            print("fileid = ", fileid)
            # Only include vectors whose file id is registered in Postgres.
            matching_rows = postgres.check_fileid_exists(fileid)
            if matching_rows > 0:
                if not first:
                    dataArray = np.load(filepath)
                    first = True
                else:
                    dataArray = np.concatenate((dataArray, np.load(filepath)), axis=0)
        np.save(f_handle, dataArray)

    nodoc_vecs = np.load(code2emb_path / 'nodoc_vecs.npy')
    print("nodoc_vecs.shape[0] = ", nodoc_vecs.shape[0])
    search_index = create_nmslib_search_index(nodoc_vecs)
    search_index.saveIndex('search_index.nmslib')
    print("SearchIndex is created")
def create_searchindex(self, postgres):
    npyfilespath = self.npy_path
    with open(self.code2emb_path / 'nodoc_vecs.npy', 'wb') as f_handle:
        first = False
        dataArray = None
        for npfile in glob.glob("data/npy/*"):
            filepath = npfile
            print("filepath = ", filepath)
            # Recover the file id from the '<fileid>####...' naming scheme.
            temp = npfile.split('####')
            if "\\" in temp[0]:
                temp = temp[0].split("\\")
            elif "/" in temp[0]:
                temp = temp[0].split("/")
            fileid = temp[-1]
            print("fileid = ", fileid)
            matching_rows = postgres.check_fileid_exists(fileid)
            if matching_rows > 0:
                if not first:
                    dataArray = np.load(filepath)
                    first = True
                else:
                    dataArray = np.concatenate((dataArray, np.load(filepath)), axis=0)
        np.save(f_handle, dataArray)

    nodoc_vecs = np.load(self.code2emb_path / 'nodoc_vecs.npy')
    np.savetxt(self.code2emb_path / 'nodoc_vecs_to_text.txt', nodoc_vecs, delimiter=' ')
    print("self.ref_df.shape[0] = ", self.ref_df.shape[0])
    print("nodoc_vecs.shape[0] = ", nodoc_vecs.shape[0])
    assert nodoc_vecs.shape[0] == self.ref_df.shape[0]
    search_index = create_nmslib_search_index(nodoc_vecs)
    search_index.saveIndex('search_index.nmslib')
    print("SearchIndex is created")
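# A minimal usage sketch (not part of the original pipeline): once
# create_searchindex has written 'search_index.nmslib', the index can be
# reloaded and queried directly with nmslib. The hnsw/cosinesimil settings
# match how the index is loaded elsewhere in this codebase; the random query
# vector is a stand-in for a real embedding, and the dimension 500 is an
# assumption taken from the dim500 embeddings used below.
import nmslib


def _query_saved_index_example():
    index = nmslib.init(method='hnsw', space='cosinesimil')
    index.loadIndex('search_index.nmslib')
    query_vec = np.random.rand(500).astype(np.float32)
    ids, distances = index.knnQuery(query_vec, k=5)
    for idx, dist in zip(ids, distances):
        print(f"row {idx} at cosine distance {dist:.4f}")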
def create_searchindex_paras(self, paras):
    # Normalise the paragraphs to plain strings; the transformer handles raw
    # text, so no punctuation stripping is applied.
    paras = [str(item) for item in paras]
    no_docstring_funcs = paras
    encinp = self.enc_pp_vector.transform(no_docstring_funcs)
    print("size of paragraphs = ", len(no_docstring_funcs))
    search_index = create_nmslib_search_index(encinp)
    search_index.saveIndex('search_index.nmslib')
    print("SearchIndex is created for paras")
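# Illustrative usage sketch: `indexer` stands for an instance of the class
# that owns create_searchindex_paras and holds a fitted enc_pp_vector
# transformer (an assumption for the example; the paragraph strings are
# made up).
def _create_paras_index_example(indexer):
    paras = [
        "read a csv file into a dataframe",
        "open a tcp socket and send a message",
        "sort a list of tuples by the second element",
    ]
    indexer.create_searchindex_paras(paras)
    # Writes 'search_index.nmslib', reloadable as in the sketch above.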
from pathlib import Path

import numpy as np
import torch

from lang_model_utils import Query2Emb, load_lm_vocab
# Assumed import path: create_nmslib_search_index is a project helper
# (adjust the module name to your layout).
from general_utils import create_nmslib_search_index

# In[79]:

# Load matrix of vectors
loadpath = Path('./data/lang_model_emb/')
avg_emb_dim500 = np.load(loadpath / 'avg_emb_dim500_test_v2.npy')

# In[67]:

# Build search index (takes about an hour on a p3.8xlarge)
dim500_avg_searchindex = create_nmslib_search_index(avg_emb_dim500)

# In[68]:

# Save search index
dim500_avg_searchindex.saveIndex(
    './data/lang_model_emb/dim500_avg_searchindex.nmslib')

# Note that if you did not train your own language model and are downloading the
# pre-trained model artifacts instead, you can similarly download the
# pre-computed search index here:
#
# https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/dim500_avg_searchindex.nmslib

# After you have built this search index with nmslib, you can do fast
# nearest-neighbor lookups. We use the `Query2Emb` object to help convert
# strings to the embeddings:

# In[80]:
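# The body of the In[80] cell is truncated in this export; what follows is a
# hedged sketch of the Query2Emb usage it introduces. The constructor call
# mirrors the one used later in this codebase; the emb_mean method name is an
# assumption about the Query2Emb API, not confirmed by this file.
lang_model = torch.load('./data/lang_model/lang_model_cpu_v2.torch',
                        map_location=lambda storage, loc: storage)
vocab = load_lm_vocab('./data/lang_model/vocab_v2.cls')
q2emb = Query2Emb(lang_model=lang_model.cpu(), vocab=vocab)

# Convert a free-text query to an embedding and look up its nearest neighbors.
query_vec = q2emb.emb_mean('read data into a pandas dataframe')  # assumed API
ids, distances = dim500_avg_searchindex.knnQuery(np.ravel(query_vec), k=3)
print(ids, distances)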
import nmslib
import numpy as np
import pandas as pd
import torch

from lang_model_utils import Query2Emb, load_lm_vocab
# Assumed import path: create_nmslib_search_index is a project helper
# (adjust the module name to your layout).
from general_utils import create_nmslib_search_index

# input_path, code2emb_path and url_df are assumed to be defined earlier in
# the pipeline (path setup and url dataframe loading).
code_df = pd.read_json(input_path / 'without_docstrings_original_function.json.gz')
code_df.columns = ['code']

# Make sure these files have the same number of rows.
assert code_df.shape[0] == url_df.shape[0]

# Collect the two together into a single reference dataframe.
ref_df = pd.concat([url_df, code_df], axis=1).reset_index(drop=True)
print(ref_df.head())

print("Creating the search index for vectorized code")
nodoc_vecs = np.load(code2emb_path / 'nodoc_vecs.npy')
assert nodoc_vecs.shape[0] == ref_df.shape[0]
search_index = create_nmslib_search_index(nodoc_vecs)

print("Saving the search index")
search_index.saveIndex('./data/search/search_index.nmslib')

print("Building the minimal search index")
lang_model = torch.load('./data/lang_model/lang_model_cpu_v2.torch',
                        map_location=lambda storage, loc: storage)
vocab = load_lm_vocab('./data/lang_model/vocab_v2.cls')
q2emb = Query2Emb(lang_model=lang_model.cpu(), vocab=vocab)

search_index = nmslib.init(method='hnsw', space='cosinesimil')
search_index.loadIndex('./data/search/search_index.nmslib')
print("Search index loaded for the vectorized code")
print("Activating the searcher")
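# A minimal sketch of the search step that "Activating the searcher" leads
# into: embed the query string, find its nearest neighbors in the loaded
# index, and map result ids back to rows of ref_df. The emb_mean method and
# the 'url' column name are assumptions for illustration, not confirmed here.
def search(query_str, k=5):
    query_vec = np.ravel(q2emb.emb_mean(query_str))  # assumed Query2Emb API
    ids, distances = search_index.knnQuery(query_vec, k=k)
    for idx, dist in zip(ids, distances):
        row = ref_df.iloc[idx]
        print(f"cosine distance {dist:.4f}")
        print(f"url:  {row['url']}")  # column name assumed from url_df
        print(row['code'])
        print()


search('parse a json string into a dict')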