def _get_language_data_path( self, file_service: FileService, run_type: RunType): output_data_path = file_service.get_data_path() language_data_path = os.path.join( output_data_path, f'{run_type.to_str()}_language_data.pickle') if not os.path.exists(language_data_path): challenge_path = file_service.get_challenge_path() full_data_path = os.path.join(challenge_path, 'full') if not os.path.exists(full_data_path) or len(os.listdir(full_data_path)) == 0: newseye_path = os.path.join('data', 'newseye') trove_path = os.path.join('data', 'trove') # ocr_download.combine_data(challenge_path, newseye_path, trove_path) # TODO Fix download pickles_path = file_service.get_pickles_path() train_data_path = file_service.get_pickles_path() preprocess_data( self._tokenize_service, self._metrics_service, self._vocabulary_service, pickles_path, full_data_path, output_data_path) return language_data_path
def _get_language_data_path(self, file_service: FileService, run_type: RunType): output_data_path = file_service.get_data_path() language_data_path = os.path.join( output_data_path, f'{run_type.to_str()}_language_data.pickle') if not os.path.exists(language_data_path): train_data_path = file_service.get_pickles_path() test_data_path = None preprocess_data(train_data_path, test_data_path, output_data_path, self._tokenize_service.tokenizer, self._vocabulary_service) return language_data_path