def parse_dedupe_definitions(data_manager: DataManager, language: str):
    """Load a language's dedupe definitions and save them as its ALL corpus.

    The definitions are read with ``serialize.load('pickle', ...)`` from
    ``<shared.CODESEARCHNET_DATA_DIR>/<language>_dedupe_definitions_v2``.
    Each loaded document is passed through
    ``rename_dedupe_definitions_keys`` lazily, and the resulting iterable is
    handed to ``data_manager.save_language_corpus`` for
    ``shared.DataSet.ALL``.

    Args:
        data_manager: Object providing ``save_language_corpus``.
        language: Language name used to build the input file name and passed
            through to ``save_language_corpus``.
    """
    pickle_name = f'{language}_dedupe_definitions_v2'
    pickle_path = os.path.join(shared.CODESEARCHNET_DATA_DIR, pickle_name)
    raw_docs = serialize.load('pickle', pickle_path)

    # Lazy mapping: documents are renamed one at a time as the consumer
    # iterates over them.
    renamed_docs = map(rename_dedupe_definitions_keys, raw_docs)
    data_manager.save_language_corpus(
        renamed_docs, language, shared.DataSet.ALL)
def get_codesearchnet_language_set_corpus(language: str, set_: shared.DataSet):
    """Yield every document of one language/data-set combination.

    For ``shared.DataSet.TRAIN`` the corpus is read from
    ``shared.LANGUAGES_NUM_FILES[language]`` files (indices ``0..n-1``); for
    any other set only the file with index ``0`` is read. Each file is
    loaded with ``serialize.load('jsonl-gzip', ...)`` and its items are
    yielded in file order.

    Args:
        language: Language name, forwarded to ``get_base_language_doc_path``.
        set_: Which data set to read.

    Yields:
        The items produced by ``serialize.load`` for each file, in order.
    """
    is_train = set_ == shared.DataSet.TRAIN
    # Only the train set consults the per-language file count.
    file_indices = (
        range(shared.LANGUAGES_NUM_FILES[language]) if is_train else (0,))

    # Resolve all paths up front, before any file is loaded.
    doc_paths = [
        get_base_language_doc_path(language, set_, index)
        for index in file_indices
    ]

    for doc_path in doc_paths:
        yield from serialize.load('jsonl-gzip', doc_path)
示例#3
0
 def get_query_vocabulary(self):
     """Return the object loaded in 'pickle' format from the query vocabulary path."""
     vocabulary_path = self._get_query_vocabulary_path()
     return serialize.load('pickle', vocabulary_path)
示例#4
0
 def get_language_vocabulary(self, language: str):
     """Return the object loaded in 'pickle' format from ``language``'s vocabulary path.

     Args:
         language: Language name, forwarded to
             ``self._get_language_vocabulary_path``.
     """
     vocabulary_path = self._get_language_vocabulary_path(language)
     return serialize.load('pickle', vocabulary_path)
示例#5
0
 def get_preprocessed_language_corpus(self, language: str,
                                      set_: shared.DataSet):
     """Load the preprocessed corpus for a language and data set.

     The path comes from ``self._get_preprocessed_language_corpus_path`` and
     is read with ``serialize.load`` in 'jsonl-gzip' format.

     Args:
         language: Language name used to resolve the corpus path.
         set_: Data set used to resolve the corpus path.

     Returns:
         Whatever ``serialize.load`` returns for that path.
     """
     corpus_path = self._get_preprocessed_language_corpus_path(
         language, set_)
     return serialize.load('jsonl-gzip', corpus_path)
示例#6
0
 def get_query_embedding_weights(self):
     """Return the object loaded in 'numpy' format from the query embedding weights path."""
     weights_path = self._get_query_embedding_weights_path()
     return serialize.load('numpy', weights_path)
示例#7
0
 def get_language_embedding_weights(self, language: str):
     """Return the object loaded in 'numpy' format from ``language``'s embedding weights path.

     Args:
         language: Language name, forwarded to
             ``self._get_language_embedding_weights_path``.
     """
     weights_path = self._get_language_embedding_weights_path(language)
     return serialize.load('numpy', weights_path)
示例#8
0
 def get_language_annoy_index(self, annoy_index, language: str):
     """Load the Annoy index for ``language`` through ``serialize.load``.

     Args:
         annoy_index: Passed to ``serialize.load`` as the ``annoy_index``
             keyword argument.
         language: Language name used to resolve the index path via
             ``self.get_language_annoy_index_path``.

     Returns:
         Whatever ``serialize.load('annoy', ...)`` returns.
     """
     index_path = self.get_language_annoy_index_path(language)
     return serialize.load('annoy', index_path, annoy_index=annoy_index)
示例#9
0
 def get_torch_model(self, model):
     """Load the stored state into ``model`` and return it.

     The state is read with ``serialize.load('torch', ...)`` from
     ``self._get_torch_model_path()`` and applied through
     ``model.load_state_dict``.

     Args:
         model: Object exposing ``load_state_dict``; it is modified in place.

     Returns:
         The same ``model`` object that was passed in.
     """
     state_dict = serialize.load('torch', self._get_torch_model_path())
     model.load_state_dict(state_dict)
     return model
示例#10
0
 def get_language_seqs(self, language: str, type_: shared.DataType,
                       set_: shared.DataSet):
     """Load the stored sequences for a language, data type and data set.

     The path comes from ``self._get_language_seqs_path`` and is read with
     ``serialize.load`` in 'numpy' format.

     Args:
         language: Language name used to resolve the path.
         type_: Data type used to resolve the path.
         set_: Data set used to resolve the path.

     Returns:
         Whatever ``serialize.load`` returns for that path.
     """
     seqs_path = self._get_language_seqs_path(language, type_, set_)
     return serialize.load('numpy', seqs_path)