def process_batch_files(self, batch):
    """Process a batch of .ASS subtitle files and append the results to the .CSV file.

    Args:
        batch: iterable of paths to .ass files to process.

    Side effects:
        Appends one row per subtitle event to ``self.csv`` (no header,
        cp1251 encoding). Files that fail to parse are logged and skipped.
    """
    columns = ['Audio File', 'Start', 'End', 'Name', 'Text']

    # One logger per worker process when logging to a file (pid-suffixed
    # name avoids handler clashes between pool workers).
    if self.log:
        logger = create_logger('logger_' + str(os.getpid()), 'file',
                               logging.DEBUG, self.log)
    else:
        logger = create_logger('logger', 'stream', logging.DEBUG)

    # Collect plain dicts and build the DataFrame once at the end:
    # DataFrame.append inside the loop was O(n^2) and has been removed
    # entirely in pandas >= 2.0.
    rows = []
    for file in batch:
        try:
            transcription = pysubs2.load(file)
            if not transcription.events:
                logger.debug(
                    "В файле '{}' отсутствуют события".format(file))
            else:
                for event in transcription.events:
                    attributes = self.get_event_attributes(event)
                    attributes['Audio File'] = transcription.aegisub_project[
                        'Audio File']
                    rows.append(attributes)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # still propagate; parse failures remain best-effort (log & skip).
            logger.error("Не удалось обработать файл '{}'".format(file))

    # `columns=` both selects and orders the keys, mirroring the original
    # `[transcriptions.columns]` projection.
    transcriptions = pd.DataFrame(rows, columns=columns)
    # Pass the path (not an open text handle) so mode='a' and the cp1251
    # encoding are actually honoured by pandas.
    transcriptions.to_csv(self.csv, mode='a', header=False, index=False,
                          encoding='cp1251')
# ---- Script setup: parse CLI arguments, persist them, configure logging ----
options.add_train_args(parser)
args = parser.parse_args()

# Create the checkpoint directory if it does not already exist.
if not os.path.exists(args.save_dir):
    os.mkdir(args.save_dir)

tokenizer = RobertaTokenizer.from_pretrained(args.roberta_model)
# CUDA is enabled whenever at least one GPU is requested.
args.cuda = args.gpu_num > 0

# Persist the resolved arguments next to the checkpoints for reproducibility.
args_path = os.path.join(args.save_dir, "args.json")
with open(args_path, "w") as f:
    json.dump(vars(args), f)

# Effective per-step (micro) batch size after gradient accumulation.
args.batch_size = args.batch_size // args.gradient_accumulation_steps

logger = create_logger("Bert Drop Pretraining",
                       log_file=os.path.join(args.save_dir, args.log_file))
pprint(args)
set_environment(args.seed, args.cuda)


def main():
    """Train the DROP model (continues beyond this view)."""
    # Best dev metric seen so far — presumably updated later in this
    # function; the remainder of main() is outside this chunk.
    best_result = float("-inf")
    logger.info("Loading data...")
    train_itr = DropBatchGen(args, data_mode="train", tokenizer=tokenizer)
    dev_itr = DropBatchGen(args, data_mode="dev", tokenizer=tokenizer)
    # Total optimizer updates over all epochs, accounting for accumulation.
    num_train_steps = int(args.max_epoch * len(train_itr) /
                          args.gradient_accumulation_steps)
    logger.info("Num update steps {}!".format(num_train_steps))
    logger.info("Build bert model.")
IS_PICKLE = args.pickle

# Create the output CSV with the header row up front; worker batches
# append their rows to it afterwards (header=False there).
transcriptions_csv = CSV if CSV else str(OUTPUT_DIR / 'transcriptions.csv')
try:
    with open(transcriptions_csv, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['Audio File', 'Start', 'End', 'Name', 'Text'])
except Exception as err:
    # Chain the original error instead of swallowing it with a bare except.
    raise Exception("Не удалось создать результирующий .CSV-файл") from err

if LOG_DIR:
    # Timestamped log file, e.g. 20240101-120000.log
    log_name = str(LOG_DIR / (time.strftime('%Y%m%d-%H%M%S') + '.log'))
    try:
        logger = create_logger('logger', 'file', logging.DEBUG, log_name)
    except Exception as err:
        raise Exception("Не удалось создать лог-файл") from err
else:
    # BUG FIX: log_name was left unbound on this path but is passed to
    # TranscriptionsParser below, which raised NameError at runtime.
    log_name = None
    logger = create_logger('logger', 'stream', logging.INFO)

pool = Pool(PROCESSES)
logger.info("Запуск парсинга файлов")
logger.debug("Количество процессов: {}".format(PROCESSES))
logger.debug("Размер пакета: {}".format(BATCH_SIZE))

transcr_parser = TranscriptionsParser(ASS_DIR, OUTPUT_DIR, log_name,
                                      PROCESSES, BATCH_SIZE, CSV, IS_PICKLE)
tq = tqdm.tqdm
files = glob.glob(str(ASS_DIR / '*.ass'))
# ---- Script setup: derive run-specific dirs, persist args, configure logging ----
# Suffix save/data directories with the operation and ablation modes so
# different experiment configurations never overwrite each other.
args.save_dir = args.save_dir + "_{}_{}".format(args.op_mode, args.ablation_mode)
args.data_dir = args.data_dir + "_{}_{}".format(args.op_mode, args.ablation_mode)

if not os.path.exists(args.save_dir):
    os.mkdir(args.save_dir)

# CUDA is enabled whenever at least one GPU is requested.
args.cuda = args.gpu_num > 0

# Persist the resolved arguments next to the checkpoints for reproducibility.
args_path = os.path.join(args.save_dir, "args.json")
with open(args_path, "w") as f:
    json.dump((vars(args)), f)

# Effective per-step (micro) batch size after gradient accumulation.
args.batch_size = args.batch_size // args.gradient_accumulation_steps

logger = create_logger("Roberta Training",
                       log_file=os.path.join(args.save_dir, args.log_file))
pprint(args)
set_environment(args.seed, args.cuda)


def main():
    """Train the TaTQA model (continues beyond this view)."""
    # best_result = float("-inf")
    logger.info("Loading data...")
    train_itr = TaTQABatchGen(args, data_mode="train", encoder=args.encoder)
    dev_itr = TaTQABatchGen(args, data_mode="dev", encoder=args.encoder)
    # Total optimizer updates over all epochs, accounting for accumulation.
    num_train_steps = int(args.max_epoch * len(train_itr) /
                          args.gradient_accumulation_steps)
    logger.info("Num update steps {}!".format(num_train_steps))