def process_batch_files(self, batch):
    """
    Process a batch of .ASS files and append the results to the .CSV file.

    Args:
        batch: a batch of file paths
    """
    transcriptions = pd.DataFrame(
        columns=['Audio File', 'Start', 'End', 'Name', 'Text'])
    if self.log:
        logger = create_logger('logger_' + str(os.getpid()), 'file',
                               logging.DEBUG, self.log)
    else:
        logger = create_logger('logger', 'stream', logging.DEBUG)
    for file in batch:
        try:
            transcription = pysubs2.load(file)
            if not transcription.events:
                logger.debug("File '{}' contains no events".format(file))
            else:
                for event in transcription.events:
                    attributes = self.get_event_attributes(event)
                    attributes['Audio File'] = (
                        transcription.aegisub_project['Audio File'])
                    # DataFrame.append was removed in pandas 2.0;
                    # pd.concat is the supported replacement.
                    transcriptions = pd.concat(
                        [transcriptions,
                         pd.DataFrame(attributes,
                                      index=[0])[transcriptions.columns]],
                        ignore_index=True)
        except Exception:
            logger.error("Failed to process file '{}'".format(file))
    # Set the encoding on the file handle: pandas ignores the `encoding`
    # argument of to_csv when it is given an open file object.
    with open(self.csv, 'a', encoding='cp1251', newline='') as f:
        transcriptions.to_csv(f, header=False, index=False)
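
get_event_attributes is used above but not shown. A minimal sketch of what it might look like, assuming the standard pysubs2 SSAEvent fields; this is a hypothetical reconstruction, not the project's actual code:

def get_event_attributes(self, event):
    # Hypothetical helper: maps a pysubs2 SSAEvent onto the CSV columns
    # used by process_batch_files. Event timestamps are integers in
    # milliseconds; ms_to_str renders them as timestamp strings.
    return {
        'Start': pysubs2.time.ms_to_str(event.start),
        'End': pysubs2.time.ms_to_str(event.end),
        'Name': event.name,
        'Text': event.plaintext,
    }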
Example #2
options.add_train_args(parser)

args = parser.parse_args()

if not os.path.exists(args.save_dir):
    os.mkdir(args.save_dir)

tokenizer = RobertaTokenizer.from_pretrained(args.roberta_model)

args.cuda = args.gpu_num > 0
args_path = os.path.join(args.save_dir, "args.json")
with open(args_path, "w") as f:
    json.dump(vars(args), f)

args.batch_size = args.batch_size // args.gradient_accumulation_steps
logger = create_logger("Bert Drop Pretraining",
                       log_file=os.path.join(args.save_dir, args.log_file))

pprint(args)
set_environment(args.seed, args.cuda)
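
set_environment comes from the project's utilities and is not shown here. A minimal sketch of what such a helper typically does, assuming it only seeds the RNGs (an assumption, not the repository's actual code):

def set_environment(seed, set_cuda=False):
    # Hypothetical sketch: seed every RNG the training run touches so
    # results are reproducible across restarts.
    import random
    import numpy as np
    import torch
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if set_cuda and torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)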


def main():
    best_result = float("-inf")
    logger.info("Loading data...")
    train_itr = DropBatchGen(args, data_mode="train", tokenizer=tokenizer)
    dev_itr = DropBatchGen(args, data_mode="dev", tokenizer=tokenizer)
    num_train_steps = int(args.max_epoch * len(train_itr) /
                          args.gradient_accumulation_steps)
    logger.info("Num update steps {}!".format(num_train_steps))

    logger.info("Build bert model.")
Example #3
    IS_PICKLE = args.pickle

    try:
        transcriptions_csv = CSV if CSV else str(OUTPUT_DIR /
                                                 'transcriptions.csv')
        # newline='' is required for csv.writer; cp1251 keeps the header
        # consistent with the data rows appended by process_batch_files.
        with open(transcriptions_csv, 'w', encoding='cp1251',
                  newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['Audio File', 'Start', 'End', 'Name', 'Text'])
    except Exception as e:
        raise Exception("Failed to create the output .CSV file") from e

    if LOG_DIR:
        try:
            log_name = str(LOG_DIR /
                           (time.strftime('%Y%m%d-%H%M%S') + '.log'))
            logger = create_logger('logger', 'file', logging.DEBUG, log_name)
        except Exception as e:
            raise Exception("Failed to create the log file") from e
    else:
        # log_name must still be defined: it is passed to
        # TranscriptionsParser below.
        log_name = None
        logger = create_logger('logger', 'stream', logging.INFO)

    pool = Pool(PROCESSES)
    logger.info("Starting file parsing")
    logger.debug("Number of processes: {}".format(PROCESSES))
    logger.debug("Batch size: {}".format(BATCH_SIZE))

    transcr_parser = TranscriptionsParser(ASS_DIR, OUTPUT_DIR, log_name,
                                          PROCESSES, BATCH_SIZE, CSV,
                                          IS_PICKLE)
    tq = tqdm.tqdm
    files = glob.glob(str(ASS_DIR / '*.ass'))
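
The example cuts off before the batches are dispatched. A plausible continuation, sketched for illustration only, assuming the file list is sliced into BATCH_SIZE chunks and fed to the process_batch_files method shown earlier:

    # Hypothetical continuation: slice the globbed files into batches and
    # hand them to the worker pool, tracking progress with tqdm.
    batches = [files[i:i + BATCH_SIZE]
               for i in range(0, len(files), BATCH_SIZE)]
    for _ in tq(pool.imap_unordered(transcr_parser.process_batch_files,
                                    batches), total=len(batches)):
        pass
    pool.close()
    pool.join()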
Example #4
    args.save_dir = args.save_dir + "_{}_{}".format(args.op_mode,
                                                    args.ablation_mode)
    args.data_dir = args.data_dir + "_{}_{}".format(args.op_mode,
                                                    args.ablation_mode)

if not os.path.exists(args.save_dir):
    os.mkdir(args.save_dir)

args.cuda = args.gpu_num > 0
args_path = os.path.join(args.save_dir, "args.json")
with open(args_path, "w") as f:
    json.dump(vars(args), f)

args.batch_size = args.batch_size // args.gradient_accumulation_steps

logger = create_logger("Roberta Training",
                       log_file=os.path.join(args.save_dir, args.log_file))

pprint(args)
set_environment(args.seed, args.cuda)


def main():
    # best_result = float("-inf")
    logger.info("Loading data...")

    train_itr = TaTQABatchGen(args, data_mode="train", encoder=args.encoder)
    dev_itr = TaTQABatchGen(args, data_mode="dev", encoder=args.encoder)

    num_train_steps = int(args.max_epoch * len(train_itr) /
                          args.gradient_accumulation_steps)
    logger.info("Num update steps {}!".format(num_train_steps))