def parse_dedupe_definitions(data_manager: DataManager, language: str):
    # Load the raw CodeSearchNet dedupe-definitions pickle for the language,
    # rename each document's keys lazily, and persist the result as the
    # language's ALL corpus.
    dedupe_definitions_pkl_path = os.path.join(
        shared.CODESEARCHNET_DATA_DIR, f'{language}_dedupe_definitions_v2')
    dedupe_definitions = serialize.load('pickle', dedupe_definitions_pkl_path)
    corpus = (rename_dedupe_definitions_keys(doc) for doc in dedupe_definitions)
    data_manager.save_language_corpus(corpus, language, shared.DataSet.ALL)

def get_codesearchnet_language_set_corpus(language: str, set_: shared.DataSet):
    # The train set is sharded across multiple jsonl.gz files; the valid and
    # test sets each live in a single file (index 0).
    if set_ == shared.DataSet.TRAIN:
        file_paths = [
            get_base_language_doc_path(language, set_, i)
            for i in range(shared.LANGUAGES_NUM_FILES[language])
        ]
    else:
        file_paths = [get_base_language_doc_path(language, set_, 0)]

    for file_path in file_paths:
        yield from serialize.load('jsonl-gzip', file_path)

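# Usage sketch for the two helpers above (hedged: the DataManager
# construction arguments and the 'python' language are illustrative, not
# taken from this module):
#
#     data_manager = DataManager(...)  # construction details elided
#     parse_dedupe_definitions(data_manager, 'python')
#     for doc in get_codesearchnet_language_set_corpus(
#             'python', shared.DataSet.TRAIN):
#         ...  # each doc is one decoded jsonl record
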
def get_query_vocabulary(self):
    return serialize.load('pickle', self._get_query_vocabulary_path())

def get_language_vocabulary(self, language: str):
    return serialize.load('pickle', self._get_language_vocabulary_path(language))

def get_preprocessed_language_corpus(self, language: str, set_: shared.DataSet):
    return serialize.load(
        'jsonl-gzip', self._get_preprocessed_language_corpus_path(language, set_))

def get_query_embedding_weights(self):
    return serialize.load('numpy', self._get_query_embedding_weights_path())

def get_language_embedding_weights(self, language: str):
    return serialize.load(
        'numpy', self._get_language_embedding_weights_path(language))

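# Sketch of how the vocabulary and embedding getters line up: row i of the
# weights matrix is assumed to hold the embedding for vocabulary id i, and
# the token-to-id accessor below is hypothetical (the real vocabulary API
# may differ):
#
#     vocabulary = data_manager.get_language_vocabulary('python')
#     weights = data_manager.get_language_embedding_weights('python')
#     token_id = vocabulary.lookup_id('return')  # hypothetical accessor
#     token_embedding = weights[token_id]        # one row per vocabulary id
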
def get_language_annoy_index(self, annoy_index, language: str):
    # The caller supplies a pre-constructed AnnoyIndex; serialize.load fills
    # it from disk, so its dimensionality and metric must match the values
    # used when the index was built.
    return serialize.load(
        'annoy', self.get_language_annoy_index_path(language),
        annoy_index=annoy_index)

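# Minimal sketch of querying a loaded index, assuming Annoy's standard API;
# the dimensionality and metric (128, 'angular') are placeholders, not
# values taken from this project:
#
#     from annoy import AnnoyIndex
#
#     annoy_index = AnnoyIndex(128, 'angular')
#     annoy_index = data_manager.get_language_annoy_index(annoy_index, 'python')
#     neighbor_ids = annoy_index.get_nns_by_vector(query_embedding, 10)
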
def get_torch_model(self, model):
    # load_state_dict mutates the passed-in model in place; the saved file
    # holds only the weights, not the architecture.
    model.load_state_dict(
        serialize.load('torch', self._get_torch_model_path()))
    return model

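# Minimal sketch of restoring a trained model, assuming standard PyTorch
# semantics; the model class and its arguments are hypothetical:
#
#     model = CodeSearchModel(...)  # must match the saved architecture
#     model = data_manager.get_torch_model(model)
#     model.eval()  # switch to inference mode before computing embeddings
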
def get_language_seqs(self, language: str, type_: shared.DataType, set_: shared.DataSet):
    return serialize.load(
        'numpy', self._get_language_seqs_path(language, type_, set_))