def _get_language_data_path(
        self,
        file_service: FileService,
        run_type: RunType) -> str:
    """Return the path to the pickled language data for `run_type`.

    If the pickle does not exist yet, preprocess the raw challenge data
    (under `<challenge_path>/full`) to produce it before returning.

    :param file_service: service resolving the project's data/pickle paths
    :param run_type: run kind whose `to_str()` names the pickle file
    :return: path to `<data_path>/<run_type>_language_data.pickle`
    """
    output_data_path = file_service.get_data_path()
    language_data_path = os.path.join(
        output_data_path, f'{run_type.to_str()}_language_data.pickle')

    if not os.path.exists(language_data_path):
        challenge_path = file_service.get_challenge_path()
        full_data_path = os.path.join(challenge_path, 'full')

        # Raw corpus missing or empty: it should be assembled from the
        # NewsEye and Trove dumps, but the download step is broken.
        if not os.path.exists(full_data_path) or len(os.listdir(full_data_path)) == 0:
            newseye_path = os.path.join('data', 'newseye')
            trove_path = os.path.join('data', 'trove')
            # ocr_download.combine_data(challenge_path, newseye_path, trove_path)
            # TODO Fix download

        pickles_path = file_service.get_pickles_path()
        preprocess_data(
            self._tokenize_service,
            self._metrics_service,
            self._vocabulary_service,
            pickles_path,
            full_data_path,
            output_data_path)

    return language_data_path
def __init__(self, language: Language, arguments_service: PretrainedArgumentsService, tokenize_service: BaseTokenizeService, file_service: FileService, vocabulary_service: VocabularyService, **kwargs): super(SemEvalTestDataset, self).__init__() self._arguments_service = arguments_service challenge_path = file_service.get_challenge_path() targets_path = os.path.join(challenge_path, 'eval', str(language), 'targets.txt') with open(targets_path, 'r', encoding='utf-8') as targets_file: self._target_words = targets_file.read().splitlines() self._target_words.sort(key=lambda v: v.upper()) # English words end with POS tags (e.g. 'test_nn') if language == Language.English: target_words = [x[:-3] for x in self._target_words] else: target_words = self._target_words if arguments_service.include_pretrained_model: encodings = tokenize_service.encode_sequences(target_words) self._target_word_ids = [x[0] for x in encodings] else: self._target_word_ids = [ vocabulary_service.string_to_id(target_word) for target_word in target_words ]