Example #1
import glob
import os
from pathlib import Path

import numpy as np

# create_nmslib_search_index is assumed to be imported from the project's
# utility module; it is not part of the standard library.


def create_searchindex(postgres):
    # Concatenate the per-file embedding arrays whose file ids exist in
    # Postgres, save the combined matrix, and build an nmslib search index.
    base_dir = ''
    code2emb_path = Path(base_dir + './data/code2emb/')
    seq2seq_path = Path(base_dir + './data/seq2seq/')
    data_path = Path(base_dir + './data/processed_data/')
    output_path = Path(base_dir + './data/search')
    input_path = Path(base_dir + './data/processed_data/')
    npy_path = Path(base_dir + './data/npy/')

    with open(code2emb_path / 'nodoc_vecs.npy', 'wb') as f_handle:
        # os.chdir(npyfilespath)
        first = False
        dataArray = None
        # os.chdir(npyfilespath)
        for npfile in glob.glob("data/npy/*"):

            # Find the path of the file
            filepath = os.path.join("", npfile)
            print("filepath = ", filepath)
            # The file id precedes the '####' delimiter; strip any directory prefix.
            temp = npfile.split('####')
            if "\\" in temp[0]:
                temp = temp[0].split("\\")
            elif "/" in temp[0]:
                temp = temp[0].split("/")
            print("temp=", temp)
            fileid = temp[-1]
            print("fileid = ", fileid)
            # Only include embeddings whose file id is present in Postgres.
            matching_rows = postgres.check_fileid_exists(fileid)
            if matching_rows > 0:
                if not first:
                    # First matching file: start the combined array.
                    dataArray = np.load(filepath)
                    first = True
                    print("this is first file")
                else:
                    # Append subsequent embeddings row-wise.
                    dataArray = np.concatenate((dataArray, np.load(filepath)),
                                               axis=0)
                    print("this is not first file")
        np.save(f_handle, dataArray)
    nodoc_vecs = np.load(code2emb_path / 'nodoc_vecs.npy')
    print("nodoc_vecs.shape[0] = ", nodoc_vecs.shape[0])
    # assert nodoc_vecs.shape[0] == self.ref_df.shape[0]
    search_index = create_nmslib_search_index(nodoc_vecs)
    search_index.saveIndex('search_index.nmslib')
    print("SearchIndex is created")
    def create_searchindex(self, postgres):
        # Method variant of the function above: concatenate the per-file
        # embedding arrays whose file ids exist in Postgres, save the
        # combined matrix, and build an nmslib search index from it.
        npyfilespath = self.npy_path
        with open(self.code2emb_path / 'nodoc_vecs.npy', 'wb') as f_handle:
            # os.chdir(npyfilespath)
            first = False
            dataArray = None
            for npfile in glob.glob("data/npy/*"):
                # Find the path of the file
                filepath = os.path.join("", npfile)
                print("filepath = ", filepath)
                # The file id precedes the '####' delimiter; strip any directory prefix.
                temp = npfile.split('####')
                print("temp[0] = ", temp[0])
                if "\\" in temp[0]:
                    temp = temp[0].split("\\")
                elif "/" in temp[0]:
                    temp = temp[0].split("/")
                print("temp=", temp)
                fileid = temp[-1]
                print("fileid = ", fileid)
                # Only include embeddings whose file id is present in Postgres.
                matching_rows = postgres.check_fileid_exists(fileid)
                if matching_rows > 0:
                    if not first:
                        # First matching file: start the combined array.
                        dataArray = np.load(filepath)
                        first = True
                        print("this is first file")
                    else:
                        # Append subsequent embeddings row-wise.
                        dataArray = np.concatenate(
                            (dataArray, np.load(filepath)), axis=0)
                        print("this is not first file")
            np.save(f_handle, dataArray)
        nodoc_vecs = np.load(self.code2emb_path / 'nodoc_vecs.npy')
        np.savetxt(self.code2emb_path / 'nodoc_vecs_to_text.txt',
                   nodoc_vecs,
                   delimiter=' ')
        print("self.ref_df.shape[0] = ", self.ref_df.shape[0])
        print("nodoc_vecs.shape[0] = ", nodoc_vecs.shape[0])
        assert nodoc_vecs.shape[0] == self.ref_df.shape[0]
        search_index = create_nmslib_search_index(nodoc_vecs)
        search_index.saveIndex('search_index.nmslib')
        print("SearchIndex is created")
    def create_searchindex_paras(self, paras):
        # Encode free-text paragraphs with the preprocessing vectorizer and
        # build an nmslib search index over the resulting vectors.
        paras = [str(item) for item in paras]
        # paras = [item.strip('\r') for item in paras]
        # chars_to_remove = ['\r', '.', '!', '?', '[', ']', '{', '}', '!', '@', '#', '$', '+', '%', '*', ':', '-', ',', '=',
        #                    '/', '\'', '\”', '\"']
        # rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
        # paras = [re.sub(rx, '', item) for item in paras]
        # paras = [re.sub("[!@#$+%*:()'-]", '', item) for item in paras]
        # paras = [item.strip() for item in paras]
        # paras = [str(item) for item in paras]
        # paras = np.array(paras).astype(np.float32)
        no_docstring_funcs = paras
        encinp = self.enc_pp_vector.transform(no_docstring_funcs)
        # pd.DataFrame(paras).to_csv("nmslib_paragraphs.csv")
        # npy_filename = "nodoc_vecs.npy"
        # np.save(npy_filename, paras)
        # no_docstring_funcs = np.load(npy_filename)
        print("no_docstring_funcs = ", no_docstring_funcs)
        print("size of paragraphs = ", len(no_docstring_funcs))
        search_index = create_nmslib_search_index(encinp)
        search_index.saveIndex('search_index.nmslib')
        print("SearchIndex is created for paras")
Example #4
from pathlib import Path

import numpy as np
import torch

from lang_model_utils import Query2Emb, load_lm_vocab
# create_nmslib_search_index is assumed to come from the project's
# general_utils helper module.
from general_utils import create_nmslib_search_index

# Load matrix of vectors
loadpath = Path('./data/lang_model_emb/')
avg_emb_dim500 = np.load(loadpath / 'avg_emb_dim500_test_v2.npy')

# Build search index (takes about an hour on a p3.8xlarge)
dim500_avg_searchindex = create_nmslib_search_index(avg_emb_dim500)

# save search index
dim500_avg_searchindex.saveIndex(
    './data/lang_model_emb/dim500_avg_searchindex.nmslib')

# Note that if you did not train your own language model and are downloading the pre-trained model artifacts instead, you can similarly download the pre-computed search index here:
#
# https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/dim500_avg_searchindex.nmslib
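
# If you take the download route, a minimal sketch with the standard library
# (urllib.request.urlretrieve is only one way to fetch it; the destination
# below simply mirrors the directory layout used in this script):
import urllib.request
urllib.request.urlretrieve(
    'https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model_emb/dim500_avg_searchindex.nmslib',
    str(loadpath / 'dim500_avg_searchindex.nmslib'))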

# After you have built this search index with nmslib, you can do fast nearest-neighbor lookups. We use the `Query2Emb` object to convert query strings into embeddings:

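# A rough sketch of such a lookup, shown only to make the flow concrete. The
# lang_model/vocab paths are taken from the later example below, and the
# emb_mean method name is an assumption about Query2Emb's interface, not an
# API shown in this snippet; knnQuery is nmslib's standard k-NN call.
lang_model = torch.load('./data/lang_model/lang_model_cpu_v2.torch',
                        map_location=lambda storage, loc: storage)
vocab = load_lm_vocab('./data/lang_model/vocab_v2.cls')
q2emb = Query2Emb(lang_model=lang_model.cpu(), vocab=vocab)

query_vec = q2emb.emb_mean('read a csv file into a dataframe')  # assumed method
ids, dists = dim500_avg_searchindex.knnQuery(query_vec, k=5)
print(list(zip(ids, dists)))
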
Example #5
    code_df = pd.read_json(input_path /
                           'without_docstrings_original_function.json.gz')
    code_df.columns = ['code']
    # print(code_df.shape)
    # make sure these files have same number of rows
    assert code_df.shape[0] == url_df.shape[0]

    # collect these two together into a dataframe
    ref_df = pd.concat([url_df, code_df], axis=1).reset_index(drop=True)
    print(ref_df.head())

    print("Creating the Search Index For Vectorized Code")

    nodoc_vecs = np.load(code2emb_path / 'nodoc_vecs.npy')
    assert nodoc_vecs.shape[0] == ref_df.shape[0]
    search_index = create_nmslib_search_index(nodoc_vecs)
    print("Saving the search Index")
    search_index.saveIndex('./data/search/search_index.nmslib')

print("Building the minimal search Index")
lang_model = torch.load('./data/lang_model/lang_model_cpu_v2.torch',
                        map_location=lambda storage, loc: storage)

vocab = load_lm_vocab('./data/lang_model/vocab_v2.cls')
q2emb = Query2Emb(lang_model=lang_model.cpu(), vocab=vocab)

search_index = nmslib.init(method='hnsw', space='cosinesimil')
search_index.loadIndex('./data/search/search_index.nmslib')
print("Search Index loaded for the vectorised code")

print("Activating the searcher")