def _feed_queue(self, first_call=False):
    """ Read the next enqueued e-mails.

        :param bool first_call: Whether this is the first time this method is called (default False)
    """
    if not first_call:
        self._delete_messages()

    self._queue = []
    self._uids = []
    self._current = 0

    while not len(self._queue):
        _, data = self._imap.search(None, 'ALL')
        uids = data[0].split()
        msg_pack = uids[:10] if len(uids) > 10 else uids

        for num in msg_pack:
            # Skip mails that previously failed
            if num in self._failed_uids:
                continue

            _, raw_msg = self._imap.fetch(num, '(RFC822)')
            self._queue.append(raw_msg[0][1])
            self._uids.append(num)

        if not len(self._queue):
            LOGGER.debug('No email retrieved. Waiting before retrying.')
            time.sleep(10)

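# Illustrative note (not part of the original module): the indexing above follows the
# shapes returned by the standard imaplib client, which this class appears to wrap.
# A minimal sketch, assuming a connected imaplib.IMAP4_SSL instance named `imap`:
#
#   typ, data = imap.search(None, 'ALL')        # data == [b'1 2 3 ...'] -> .split() gives message numbers
#   typ, msg_data = imap.fetch(b'1', '(RFC822)')
#   raw_email = msg_data[0][1]                  # raw RFC822 bytes, as appended to self._queue above
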
def _create_collection(self, name):
    """ Create a single collection.

        :param str name: Name of the collection to create
    """
    LOGGER.info('Creating collection [%s]...', name)
    self._db.create_collection(name)

def send_buffer(data_buffer, token):
    data = {'system_token': token,
            'send_timestamp': time.time(),
            'data': data_buffer}
    req = requests.put(CONFIG.SERVER_ADDR + MEASURES_RES, json=data)
    if req.ok:
        LOGGER.info('Data pack sent')

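# Illustrative usage sketch (assumption, not from the original source): how a caller
# might combine obtain_system_token() and send_buffer(). The measurement dict layout
# shown here is hypothetical; only 'system_token', 'send_timestamp' and 'data' are
# dictated by send_buffer() itself.
def _example_send_measurements():
    token = obtain_system_token()
    buffered_measurements = [{'sensor': 'temperature', 'value': 21.5}]
    send_buffer(buffered_measurements, token)
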
def _send_mail(self, subject, body):
    """ Send a simple text e-mail. Settings are used to get the recipient.

        :param str subject: Subject of the email
        :param str body: Body content of the email
    """
    try:
        msg = MIMEText(body)
        msg['Subject'] = subject
        msg['From'] = settings.SCORING_EMAIL['reporting']['from']
        msg['To'] = settings.SCORING_EMAIL['reporting']['to']

        smtp = smtplib.SMTP_SSL(settings.SCORING_EMAIL['host'])
        smtp.sendmail(msg['From'], msg['To'], msg.as_string())
        smtp.quit()
    except Exception as ex:
        LOGGER.error('Something went wrong when sending the email: %s', ex)

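# Hedged sketch (assumption, not from the source): the shape of settings.SCORING_EMAIL
# implied by _send_mail() above and get_reader_for_mail() below. The header name and
# addresses are placeholders; only the keys 'host', 'partner_header', 'reporting',
# 'from' and 'to' are actually referenced by these functions.
SCORING_EMAIL_EXAMPLE = {
    'host': 'smtp.example.com',
    'partner_header': 'X-Partner-Source',        # hypothetical header name
    'reporting': {
        'from': 'scoring@example.com',
        'to': 'abuse-team@example.com',
    },
}
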
def main(): """ Spamhaus blacklisted ip extracting tool entry point. """ LOGGER.info("Started...") # Read from stdin data buf = [] for line in sys.stdin: buf.append(line) content = '\n'.join(buf) LOGGER.info("Parsing html (%d bytes)", len(content)) documents = parse_html(content) LOGGER.info("%d spamhaus entries found.", len(documents)) LOGGER.info("Updating database.") update_db(documents) LOGGER.info("Done.")
def run(self):
    """ Run the parser. """
    with mongo.Mongo() as database:
        current = self.next()
        while current:
            try:
                addr = self.get_ip(current)
                if not addr:
                    LOGGER.info('Entry skipped because no IP was specified.')
                    current = self.next()
                    continue

                if not utils.is_managed_ip(addr):
                    LOGGER.debug('Not a managed IP [%s].', addr)
                    current = self.next()
                    continue

                doc_ts = int(time.mktime(self.get_date(current).timetuple()))
                if doc_ts < YESTERDAY:
                    LOGGER.debug('This entry is too old [%s].', self.get_date(current))
                    current = self.next()
                    continue

                document = {
                    'ip': addr,
                    'timestamp': doc_ts,
                    'weight': self.compute_weight(current),
                    'source': self.get_source(current),
                    'raw': self.get_raw(current)
                }
                database.push_ip_document(document)
            except Exception as exc:
                LOGGER.error('Unexpected error: %s [%s]', type(exc), exc.message)
                LOGGER.error(traceback.format_exc())

            current = self.next()

    self.close()

def push_ip_document(self, input_dict):
    """ Push a new document regarding an IP, or update the existing document to
        append new data.

        :param dict input_dict: Expects a dictionary having at least these fields:
            [ip, filename, weight, source, timestamp, raw]
    """
    file_doc = self._build_file_document(input_dict)
    input_dict['filename'] = file_doc['filename']

    if self.does_ip_exist(input_dict['ip']):
        LOGGER.debug('IP [%s] already exists. Update...', input_dict['ip'])
        self._ip_collection.update(
            {'ip': input_dict['ip']},
            {'$push': {'events': self._build_event_document(input_dict)}}
        )
    else:
        LOGGER.debug('Brand new IP [%s]. Insert...', input_dict['ip'])
        doc = self._build_full_document(input_dict)
        self._ip_collection.save(doc)
        self._ip_cache.append(input_dict['ip'])

    self._raw_collection.save(file_doc)

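# Hedged sketch (assumption, not from the source): the kind of dictionary run() above
# builds and push_ip_document() consumes. The values are placeholders; 'filename' is
# filled in by _build_file_document(), so callers only supply the remaining keys.
EXAMPLE_IP_DOCUMENT = {
    'ip': '192.0.2.10',            # TEST-NET address, illustration only
    'timestamp': 1500000000,
    'weight': 1,
    'source': 'spamhaus',
    'raw': '<raw blacklist entry>',
}
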
def get_reader_for_mail(raw):
    """ Automatically detect the appropriate reader that will be able to read
        the passed e-mail. This method is static.

        :param str raw: The raw e-mail content
        :rtype: Object
        :return: An instance of :py:class:`parsing.mails.mailreader.AbstractMailReader`
    """
    match = re.search(r'{}:\s(.*)'.format(settings.SCORING_EMAIL['partner_header']), raw)
    if not match:
        raise Exception('Malformed input mail :: missing header [{}]'.format(
            settings.SCORING_EMAIL['partner_header']
        ))

    source = match.group(1).strip()
    LOGGER.debug('Mail from %s', source)

    if source in ("AOL", "SignalSpam"):
        return arf.ArfReader(raw, source)
    elif source == "SpamCop":
        return spamcop.SpamcopReader(raw)

    raise Exception(
        'Malformed input mail :: unknown value [{}] for header [{}]'.format(
            source, settings.SCORING_EMAIL['partner_header']
        )
    )

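# Illustrative usage sketch (not from the source): dispatching on the partner header.
# The header name 'X-Partner-Source' is hypothetical; the real name comes from
# settings.SCORING_EMAIL['partner_header'].
def _example_reader_dispatch(raw_mail):
    # e.g. raw_mail contains a line such as "X-Partner-Source: SpamCop"
    reader = get_reader_for_mail(raw_mail)
    return reader  # ArfReader for AOL/SignalSpam, SpamcopReader for SpamCop
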
def validate_videoQA(model, val_loader, split, task="tvqa", save_logits=False):
    LOGGER.info(f"start running validation on {task} {split} split...")
    model.eval()
    val_loss = 0
    n_ex = 0
    tot_score = 0
    results = {}
    logits = {}
    val_log = {}
    st = time()
    has_gt_target = True
    for i, batch in enumerate(val_loader):
        targets = batch['targets']
        if has_gt_target and targets.min() < 0:
            has_gt_target = False
            LOGGER.info(
                "No GT annotations provided, only generate predictions")

        if 'qids' in batch:
            qids = batch['qids']
            del batch['qids']

        scores = model(batch, task, compute_loss=False)
        answers = [
            i for i in scores.max(dim=-1, keepdim=False)[1].cpu().tolist()
        ]
        for qid, answer in zip(qids, answers):
            results[str(qid)] = answer
        if save_logits:
            scores = scores.cpu().tolist()
            for qid, logit in zip(qids, scores):
                logits[str(qid)] = logit

        if has_gt_target:
            loss = F.cross_entropy(scores, targets.squeeze(-1), reduction='sum')
            val_loss += loss.item()
            tot_score += compute_accuracies(scores, targets)
            n_ex += len(qids)

    if has_gt_target:
        val_loss = sum(all_gather_list(val_loss))
        tot_score = sum(all_gather_list(tot_score))
        n_ex = sum(all_gather_list(n_ex))
        tot_time = time() - st
        val_loss /= n_ex
        val_acc = tot_score / n_ex
        val_log = {
            'valid/loss': val_loss,
            'valid/acc': val_acc,
            'valid/ex_per_s': n_ex / tot_time
        }
        LOGGER.info(f"validation finished in {int(tot_time)} seconds, "
                    f"loss:{val_loss:.2f}, score: {val_acc*100:.2f}")
    model.train()
    return val_log, results, logits

def validate_mlm(model, val_loader):
    LOGGER.info("start running MLM validation...")
    val_loss = 0
    n_correct = 0
    n_word = 0
    st = time()
    for i, batch in enumerate(val_loader):
        scores = model(batch, task='mlm', compute_loss=False)
        labels = batch['txt_labels']
        labels = labels[labels != -1]
        loss = F.cross_entropy(scores, labels, reduction='sum')
        val_loss += loss.item()
        n_correct += (scores.max(dim=-1)[1] == labels).sum().item()
        n_word += labels.numel()
    val_loss = sum(all_gather_list(val_loss))
    n_correct = sum(all_gather_list(n_correct))
    n_word = sum(all_gather_list(n_word))
    tot_time = time() - st
    val_loss /= n_word
    acc = n_correct / n_word
    val_log = {'loss': val_loss,
               'acc': acc,
               'tok_per_s': n_word / tot_time}
    LOGGER.info(f"validation finished in {int(tot_time)} seconds, "
                f"acc: {acc*100:.2f}")
    return val_log

def export_val_predictions(self, test=False, test_idx=0, threshold=0.5):
    batch_loader = self.config['val_loader'] if not test else \
        self.config['test_loader'][test_idx]
    test_name = batch_loader.dataset.name
    LOGGER.info("Exporting %s predictions..." % test_name)
    self.model.eval()

    # Run evaluation to populate probs_list / labels_list / id_list
    _, _ = self.eval_model(test=test, test_idx=test_idx)
    val_probs = torch.tensor(self.probs_list)
    val_labels = torch.tensor(self.labels_list)
    if len(self.id_list) != 0:
        val_ids = torch.tensor(self.id_list)
    else:
        val_ids = torch.zeros_like(val_labels) - 1
    val_preds = (val_probs > threshold).long()
    self._export_preds(val_ids, val_probs, val_preds, labels=val_labels,
                       file_postfix="_%s_preds.csv" % test_name)

    LOGGER.info("Finished export of %s predictions" % test_name)

def create_dataloaders(datasets, is_train, opts, all_img_dbs=None): # opts.conf_th : 0.2 # opts.min_bb : 10 # opts.num_bb 36 if all_img_dbs is None: all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb, opts.num_bb, opts.compressed_db) dataloaders = {} for dset in datasets: if is_train: assert len(dset['db']) == len(dset['img']) assert len(dset['tasks']) == len(dset['mix_ratio']) img_db = [all_img_dbs[path] for path in dset['img']] else: assert len(dset['db']) == len(dset['img']) == 1 img_db = all_img_dbs[dset['img'][0]] for i, t in enumerate(dset['tasks']): task = f'{t}_{dset["name"]}' if is_train: LOGGER.info(f"Loading {task} train dataset " f"{dset['db']}, {[img.img_dir for img in img_db]}") txt_db = [ TxtTokLmdb(path, opts.max_txt_len) for path in dset['db'] ] else: LOGGER.info(f"Loading {task} validation dataset, " f"{dset['db']}, {img_db.img_dir}") txt_db = TxtTokLmdb(dset['db'][0], -1) if task.startswith('mlm'): dataset = build_mlm_dataset(txt_db, img_db, is_train, opts) elif task.startswith('mrfr'): dataset = build_mrfr_dataset(txt_db, img_db, is_train, opts) elif task.startswith('mrc'): dataset = build_mrc_dataset(txt_db, img_db, is_train, opts) elif task.startswith('itm'): dataset = build_itm_dataset(txt_db, img_db, is_train, opts) else: raise ValueError(f'Undefined task {task}') LOGGER.info(f"{len(dataset[0])*hvd.size()} samples loaded") if task.startswith('itm'): # itm handles distributed training in dset not sampler loader = build_dataloader_itm(*dataset, is_train, opts) else: loader = build_dataloader(*dataset, is_train, opts) if is_train: ratio = dset['mix_ratio'][i] dataloaders[task] = (loader, ratio) else: dataloaders[task] = PrefetchLoader(loader) return dataloaders, all_img_dbs
def obtain_system_token():
    LOGGER.info('Trying to read token from file: %s', CONFIG.TOKEN_FILE)
    with open(CONFIG.TOKEN_FILE, 'r') as token_file:
        file_content = token_file.read().splitlines()

    # A valid token file has three lines: TOKEN_START, the token itself, TOKEN_END
    if len(file_content) >= 3 and file_content[0] == TOKEN_START and file_content[2] == TOKEN_END:
        LOGGER.info('Reading token from file succeeded')
        return file_content[1]
    else:
        LOGGER.warning('Reading token from file failed')
        return aquire_token()

def create_dataloaders(datasets, is_train, opts, all_img_dbs=None): all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb, opts.num_bb, opts.compressed_db) dataloaders = {} for dset in datasets: if is_train: txt_path = opts.train_txt_dbs img_path = opts.train_img_dbs else: txt_path = opts.val_txt_dbs img_path = opts.val_img_dbs for i, t in enumerate(dset['tasks']): task = f'{t}_{dset["name"]}' if is_train: LOGGER.info(f"Loading {task} train dataset " f"{dset['db']}, {dset['img']}") else: LOGGER.info(f"Loading {task} validation dataset, " f"{dset['db']}, {dset['img']}") if task.startswith('mlm'): dataset = build_mlm_dataset(txt_path, img_path, all_img_dbs, is_train, opts) elif task.startswith('mrfr'): dataset = build_mrfr_dataset(txt_path, img_path, all_img_dbs, is_train, opts) elif task.startswith('mrckl'): dataset = build_mrc_dataset(txt_path, img_path, all_img_dbs, is_train, opts) elif task.startswith('itm'): dataset = build_itm_dataset(txt_path, img_path, all_img_dbs, is_train, opts) elif task.startswith('itkm'): dataset = build_itkm_dataset(txt_path, img_path, all_img_dbs, is_train, opts) elif task.startswith('mkm'): dataset = build_mkm_dataset(txt_path, img_path, all_img_dbs, is_train, opts) else: raise ValueError(f'Undefined task {task}') LOGGER.info(f"{len(dataset[0])*hvd.size()} samples loaded") if task.startswith('itm'): # itm handles distributed training in dset not sampler loader = build_dataloader_itm(*dataset, is_train, opts) else: loader = build_dataloader(*dataset, is_train, opts) if is_train: ratio = dset['mix_ratio'][i] dataloaders[task] = (loader, ratio) else: dataloaders[task] = PrefetchLoader(loader) return dataloaders, all_img_dbs
def aquire_token():
    LOGGER.info('Trying to acquire token from server')
    req = requests.put(CONFIG.SERVER_ADDR + TOKENS_RES,
                       json={'system_name': CONFIG.SYSTEM_NAME})
    if req.ok:
        LOGGER.info('Acquiring token from server succeeded')
        generated_token = req.json()['generated_token']
        # Persist the token only on success, so a failed request does not truncate
        # an existing token file; the context manager ensures it is flushed/closed.
        with open(CONFIG.TOKEN_FILE, 'w') as token_file:
            token_file.write('\n'.join([TOKEN_START, generated_token, TOKEN_END]))
        return generated_token
    else:
        LOGGER.error('Failed to acquire token from server')
        raise TokenNotFoundException()

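# Illustrative sketch (assumption, not from the source): the on-disk token file written
# by aquire_token() has exactly three lines, which obtain_system_token() validates:
#
#   TOKEN_START          (marker constant defined elsewhere in this module)
#   <generated token>
#   TOKEN_END            (marker constant)
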
def main():
    args = get_args()
    helper.print_script_args_and_info(args)
    os.makedirs(args.embeddings_result_folder, exist_ok=True)

    LOGGER.info('Loading pre-trained embedding')
    LOGGER.info('Starting to process datasets')
    Parallel(n_jobs=args.n_jobs)(
        delayed(process_dataset)(dataset_name, args)
        for dataset_name in dataset_helper.get_dataset_names_with_concept_map(
            limit_datasets=args.limit_dataset))
    LOGGER.info('Finished')

def purge_old_documents(self):
    """ Archive IP sub-documents older than one month. These documents are
        moved into a dedicated archiving collection.
    """
    total_count = 0
    request = {'events.timestamp': {'$lt': A_MONTH_AGO}}

    LOGGER.debug("Archiving events older than %d...", A_MONTH_AGO)
    for doc in self._ip_collection.find(request):
        archives_bulk = []
        for event in doc['events']:
            # All documents having at least 1 timestamp < A_MONTH_AGO are retrieved.
            # This condition removes subdocuments that do not match.
            if event['timestamp'] < A_MONTH_AGO:
                archives_bulk.append({
                    'ip': doc['ip'],
                    'filename': event['filename'],
                    'source': event['source'],
                    'weight': event['weight'],
                    'timestamp': event['timestamp']
                })

        result = self._archive_collection.insert(archives_bulk)
        total_count += len(result)

    self._ip_collection.update(
        request,
        {'$pull': {'events': {'timestamp': {'$lt': A_MONTH_AGO}}}},
        multi=True)
    LOGGER.info('%d documents archived.', total_count)

    # Remove entries that no longer have any event left
    result = self._ip_collection.remove(
        {'events.timestamp': {'$exists': False}},
        multi=True)
    LOGGER.info('%d single entries have been removed.', result['n'])

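# Hedged sketch (assumption, not from the source): A_MONTH_AGO is compared against raw
# UNIX timestamps above, so it is presumably computed as "now minus roughly 30 days":
#
#   import time
#   A_MONTH_AGO = int(time.time()) - 30 * 24 * 3600
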
def get_response(req_type="POST", url=None, data=None, headers=DEFAULT_HEADER, cookies=COOKIES):
    """ Send an HTTP request. """
    LOGGER.info(url + " " + str(data) + " " + str(COOKIES))
    try:
        if req_type.upper() == "POST":
            r = requests.post(url=url, data=data, headers=headers,
                              allow_redirects=True, cookies=cookies)
        elif req_type.upper() == "GET":
            param_list = []
            for key, value in data.items():
                param_list.append(key + "=" + value)
            r = requests.get(url=url + "?" + "&".join(param_list), data={},
                             headers=headers, allow_redirects=True, cookies=cookies)
        else:
            raise TypeError("http method error")
    except (requests.exceptions.ConnectionError, TypeError) as e:
        LOGGER.error("send request fail " + str(e))
        return None

    if r.status_code == requests.codes.ok:
        # LOGGER.info(r.text)
        # Update cookies
        if len(r.cookies) != 0:
            COOKIES.update(r.cookies)
        for res in r.history:
            if len(res.cookies) != 0:
                COOKIES.update(res.cookies)
        return r.text
    else:
        LOGGER.error("status code " + str(r.status_code))
        return None

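# Illustrative usage sketch (not from the source): for GET requests, get_response()
# appends the parameters to the URL itself, so callers pass a plain dict of strings
# either way. The endpoint below is hypothetical.
def _example_get_response():
    body = get_response(req_type="GET",
                        url="https://example.com/api/items",
                        data={"page": "1", "size": "20"})
    return body  # response text, or None on failure
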
def evaluate(model, eval_loader):
    LOGGER.info("start running evaluation...")
    model.eval()
    tot_score = 0
    n_ex = 0
    st = time()
    predictions = []
    for i, batch in enumerate(eval_loader):
        (tgt_box_list, obj_boxes_list, sent_ids) = (
            batch['tgt_box'], batch['obj_boxes'], batch['sent_ids'])
        # scores (n, max_num_bb)
        scores = model(batch, compute_loss=False)
        ixs = torch.argmax(scores, 1).cpu().detach().numpy()  # (n, )

        # pred_boxes
        for ix, obj_boxes, tgt_box, sent_id in \
                zip(ixs, obj_boxes_list, tgt_box_list, sent_ids):
            pred_box = obj_boxes[ix]
            predictions.append({'sent_id': int(sent_id),
                                'pred_box': pred_box.tolist(),
                                'tgt_box': tgt_box.tolist()})
            if eval_loader.loader.dataset.computeIoU(pred_box, tgt_box) > .5:
                tot_score += 1
            n_ex += 1
        if i % 100 == 0 and hvd.rank() == 0:
            n_results = len(predictions)
            n_results *= hvd.size()   # an approximation to avoid hangs
            LOGGER.info(f'{n_results}/{len(eval_loader.dataset)} '
                        'answers predicted')
    n_ex = sum(all_gather_list(n_ex))
    tot_time = time() - st
    tot_score = sum(all_gather_list(tot_score))
    val_acc = tot_score / n_ex
    val_log = {'valid/acc': val_acc, 'valid/ex_per_s': n_ex / tot_time}
    model.train()
    LOGGER.info(f"validation ({n_ex} sents) finished in"
                f" {int(tot_time)} seconds"
                f", accuracy: {val_acc*100:.2f}%")
    # summarize results
    results = {'acc': val_acc, 'predictions': predictions}
    return val_log, results

def get_hito_data(request):
    if request.method == 'POST':
        # Fetch "hitokoto" one-line quotes
        hitoko_url = "https://api.imjad.cn/hitokoto"
        hitoko_data = {
            "c": "c",
            "encode": "json",
            "charset": "utf-8",
            "length": 50,
        }
        data = []
        for i in range(0, 10):
            hitoko_res = requests.get(hitoko_url, hitoko_data, verify=False)
            hitoko_res_text = str(hitoko_res.text)
            json_obj = json.loads(hitoko_res_text)
            if json_obj:
                hito_text = json_obj['hitokoto']
                LOGGER.info(hito_text)
                data.append(hito_text)

        res = {}
        if data:
            res['msg'] = "Fetching hitokoto quotes succeeded"
            res['status'] = True
            LOGGER.info("Fetching hitokoto quotes succeeded")
        else:
            res['msg'] = "Fetching hitokoto quotes failed"
            res['status'] = False
            LOGGER.error("Fetching hitokoto quotes failed")

        response_data = {
            "msg": res['msg'],
            "status": res['status'],
            "data": data
        }
        return JsonResponse(response_data)

def validate(loader, generator, tokenizer, evaluator):
    st = time()
    generator.model.eval()
    results = []
    for batch in loader:
        vids = batch['vid_names']
        cids = batch['clip_ids']
        all_ts = batch['all_ts']
        outputs = generator.greedy_decode(batch)
        for vid, cid, ts, out_ids in zip(vids, cids, all_ts, outputs):
            output = tokenizer.convert_tokens_to_string(
                tokenizer.convert_ids_to_tokens(out_ids))
            results.append({'vid_name': vid, 'clip_id': cid, 'ts': ts,
                            'descs': [{'desc': output}]})
    results = [r for rs in all_gather_list(results) for r in rs]
    LOGGER.info(f'decoding finished in {int(time() - st)} seconds')
    if hvd.rank() == 0:
        val_log = evaluator(results)
        LOGGER.info(f'Validation finished in {int(time() - st)} seconds')
        LOGGER.info(f'CIDEr: {val_log["CIDEr"]}')
    else:
        val_log = {}
    generator.model.train()
    return val_log, results

parser.add_argument('--race_gender_hidden_size', type=int, default=0,
                    help='Hidden size for race and gender')

args, unparsed = parser.parse_known_args()
config = args.__dict__
wandb.config.update(config)

config['device'] = get_device()
config['n_classes'] = 2 if config['loss_func'] == 'ce' else 1

# Check all provided paths:
if not os.path.exists(config['data_path']):
    raise ValueError("[!] ERROR: Dataset path does not exist")
else:
    LOGGER.info("Data path checked..")
if not os.path.exists(config['model_path']):
    LOGGER.warning(
        "Creating checkpoint path for saved models at: {}\n".format(
            config['model_path']))
    os.makedirs(config['model_path'])
else:
    LOGGER.info("Model save path checked..")
if 'config' in config:
    if not os.path.isfile(config['config']):
        raise ValueError("[!] ERROR: config JSON path does not exist")
    else:
        LOGGER.info("config JSON path checked..")
if not os.path.exists(config['vis_path']):
    LOGGER.warning(
        "Creating checkpoint path for Tensorboard visualizations at: {}\n"
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) hps_file = f"{opts.output_dir}/log/hps.json" model_opts = Struct(json.load(open(hps_file))) # train_examples = None ans2label_file = f"{opts.output_dir}/ckpt/ans2label.json" ans2label = json.load(open(ans2label_file)) label2ans = {label: ans for ans, label in ans2label.items()} # load DBs and image dirs eval_img_db = DetectFeatLmdb( opts.img_db, model_opts.conf_th, model_opts.max_bb, model_opts.min_bb, model_opts.num_bb, opts.compressed_db, ) eval_txt_db = TxtTokLmdb(opts.txt_db, -1) eval_dataset = VqaEvalDataset(len(ans2label), eval_txt_db, eval_img_db) # Prepare model if exists(opts.checkpoint): ckpt_file = opts.checkpoint else: ckpt_file = f"{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt" checkpoint = torch.load(ckpt_file) model = UniterForVisualQuestionAnswering.from_pretrained( f"{opts.output_dir}/log/model.json", checkpoint, img_dim=IMG_DIM, num_answer=len(ans2label), ) model.to(device) if opts.fp16: model = amp.initialize(model, enabled=True, opt_level="O2") sampler = TokenBucketSampler( eval_dataset.lens, bucket_size=BUCKET_SIZE, batch_size=opts.batch_size, droplast=False, ) eval_dataloader = DataLoader( eval_dataset, batch_sampler=sampler, num_workers=opts.n_workers, pin_memory=opts.pin_mem, collate_fn=vqa_eval_collate, ) eval_dataloader = PrefetchLoader(eval_dataloader) val_log, results, logits = evaluate(model, eval_dataloader, label2ans, opts.save_logits) result_dir = f"{opts.output_dir}/results_test" if not exists(result_dir) and rank == 0: os.makedirs(result_dir) all_results = list(concat(all_gather_list(results))) if opts.save_logits: all_logits = {} for id2logit in all_gather_list(logits): all_logits.update(id2logit) if hvd.rank() == 0: with open(f"{result_dir}/" f"results_{opts.checkpoint}_all.json", "w") as f: json.dump(all_results, f) if opts.save_logits: np.savez(f"{result_dir}/logits_{opts.checkpoint}_all.npz", **all_logits)
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) opts.n_gpu = n_gpu LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if hvd.rank() != 0: LOGGER.disabled = True set_random_seed(opts.seed) # train_examples = None LOGGER.info(f"Loading the whole video dataset {opts.sub_txt_db}, " f"{opts.vfeat_db}") if opts.task != "didemo_video_only": video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db, opts.vfeat_interval, opts) else: txt_meta = load_json(join(opts.train_query_txt_db, "meta.json")) video_db = load_video_only_dataset(opts.vfeat_db, txt_meta, opts.vfeat_interval, opts) # data loaders # train video_ids = get_video_ids(opts.train_query_txt_db) train_q_txt_db = QueryTokLmdb(opts.train_query_txt_db, opts.max_txt_len) train_dataloaders = build_downstream_dataloaders([opts.task], video_db, video_ids, True, opts, shuffle=True, q_txt_db=train_q_txt_db) meta_loader = MetaLoader(train_dataloaders, accum_steps=opts.gradient_accumulation_steps, distributed=n_gpu > 1) meta_loader = PrefetchLoader(meta_loader) # val video_ids = get_video_ids(opts.val_query_txt_db) val_q_txt_db = QueryTokLmdb(opts.val_query_txt_db, -1) val_dataloaders = build_downstream_dataloaders([opts.task], video_db, video_ids, False, opts, q_txt_db=val_q_txt_db) if opts.task != "didemo_video_only": inf_dataset = VcmrFullEvalDataset else: inf_dataset = VcmrVideoOnlyFullEvalDataset LOGGER.info(f"Loading Inference Dataset {opts.val_query_txt_db} (val)") val_dset = inf_dataset(video_ids, video_db, val_q_txt_db, distributed=opts.distributed_eval) inf_loader_val = DataLoader(val_dset, batch_size=opts.vcmr_eval_q_batch_size, num_workers=opts.n_workers, pin_memory=opts.pin_mem, collate_fn=vcmr_full_eval_collate) inf_loader_val = PrefetchLoader(inf_loader_val) if opts.test_query_txt_db: LOGGER.info( f"Loading Inference Dataset {opts.test_query_txt_db} (test)") video_ids = get_video_ids(opts.test_query_txt_db) test_q_txt_db = QueryTokLmdb(opts.test_query_txt_db, -1) test_dset = inf_dataset(video_ids, video_db, test_q_txt_db, distributed=opts.distributed_eval) inf_loader_test = DataLoader(test_dset, batch_size=opts.vcmr_eval_q_batch_size, num_workers=opts.n_workers, pin_memory=opts.pin_mem, collate_fn=vcmr_full_eval_collate) inf_loader_test = PrefetchLoader(inf_loader_test) # Prepare model if opts.checkpoint: checkpoint = torch.load(opts.checkpoint) else: checkpoint = {} img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\ ".position_embeddings.weight" if img_pos_embed_weight_key in checkpoint: max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key]) else: max_frm_seq_len = MAX_FRM_SEQ_LEN model = HeroForVcmr.from_pretrained( opts.model_config, state_dict=checkpoint, vfeat_dim=VFEAT_DIM, max_frm_seq_len=max_frm_seq_len, lw_neg_ctx=opts.lw_neg_ctx, lw_neg_q=opts.lw_neg_q, lw_st_ed=0, ranking_loss_type=opts.ranking_loss_type, use_hard_negative=False, hard_pool_size=opts.hard_pool_size, margin=opts.margin, use_all_neg=opts.use_all_neg, drop_svmr_prob=opts.drop_svmr_prob) model.to(device) # make sure every process has same model parameters in the beginning broadcast_tensors([p.data for p in model.parameters()], 0) set_dropout(model, opts.dropout) # Prepare optimizer optimizer = build_optimizer(model, opts) task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())} model, optimizer = amp.initialize(model, optimizer, num_losses=len(task2scaler), enabled=opts.fp16, 
opt_level='O2') restorer = TrainingRestorer(opts, model, optimizer) global_step = restorer.global_step TB_LOGGER.global_step = global_step if hvd.rank() == 0: save_training_meta(opts) TB_LOGGER.create(join(opts.output_dir, 'log')) pbar = tqdm(total=opts.num_train_steps) model_saver = ModelSaver(join(opts.output_dir, 'ckpt')) if not exists(join(opts.output_dir, 'results')): # store tvr predictions os.makedirs(join(opts.output_dir, 'results')) if opts.nms_thd != -1: # store tvr-nms predictions if not exists(join(opts.output_dir, 'results_nms')): os.makedirs(join(opts.output_dir, 'results_nms')) add_log_to_file(join(opts.output_dir, 'log', 'log.txt')) else: pbar = NoOp() model_saver = NoOp() restorer = NoOp() if global_step > 0: pbar.update(global_step) LOGGER.info(f"***** Running training with {n_gpu} GPUs *****") LOGGER.info(" Batch size = %d", opts.train_batch_size) LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) LOGGER.info(" Num steps = %d", opts.num_train_steps) task2loss = { task: RunningMeter(f'loss/{task}') for task in train_dataloaders.keys() } for obj in (f'{opts.task}_st_ed', f'{opts.task}_neg_ctx', f'{opts.task}_neg_q'): task2loss[obj] = RunningMeter(f'loss/{obj}') model.train() n_examples = defaultdict(int) start = time() # quick hack for amp delay_unscale bug optimizer.zero_grad() if global_step == 0: optimizer.step() for step, (task, batch) in enumerate(meta_loader): if len(opts.hard_negtiave_start_step) > 0: for i, hn_step in enumerate(opts.hard_negtiave_start_step): if global_step >= hn_step and hn_step != -1: model.set_hard_negative(True, opts.hard_pool_size[i], opts.hard_neg_weights[i]) if opts.train_span_start_step != -1 and\ global_step >= opts.train_span_start_step: model.set_train_st_ed(opts.lw_st_ed) n_examples[task] += opts.train_batch_size loss = model(batch, task=task, compute_loss=True) loss_st_ed, loss_neg_ctx, loss_neg_q = loss loss = loss_st_ed + loss_neg_ctx + loss_neg_q for n, ls, w in (('st_ed', loss_st_ed, opts.lw_st_ed), ('neg_ctx', loss_neg_ctx, opts.lw_neg_ctx), ('neg_q', loss_neg_q, opts.lw_neg_q)): ls = ls.item() if w: ls /= w task2loss[f'{task}_{n}'](ls) loss = loss.mean() task2loss[task](loss.item()) delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale, loss_id=task2scaler[task]) as scaled_loss: scaled_loss.backward() if not delay_unscale: # gather gradients from every processes # do this before unscaling to make sure every process uses # the same gradient scale grads = [ p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None ] all_reduce_and_rescale_tensors(grads, float(1)) if (step + 1) % opts.gradient_accumulation_steps == 0: global_step += 1 # learning rate scheduling lr_this_step = get_lr_sched(global_step, opts) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step TB_LOGGER.add_scalar('lr', lr_this_step, global_step) # log loss TB_LOGGER.log_scaler_dict({ temp_loss.name: temp_loss.val for temp_loss in task2loss.values() if temp_loss.val is not None }) TB_LOGGER.step() # update model params if opts.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), opts.grad_norm) TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step) optimizer.step() optimizer.zero_grad() pbar.update(1) if global_step % 100 == 0: # monitor training throughput LOGGER.info('-------------------------------------------') LOGGER.info(f'Step {global_step}:') for t in train_dataloaders.keys(): tot_ex = 
sum(all_gather_list(n_examples[t])) ex_per_sec = int(tot_ex / (time() - start)) LOGGER.info(f'{t}: {tot_ex} examples trained at ' f'{ex_per_sec} ex/s') TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec, global_step) if global_step % opts.valid_steps == 0: LOGGER.info('===========================================') LOGGER.info(f"Step {global_step}: start running validation") validate(model, val_dataloaders, opts) if hvd.rank() == 0 or opts.distributed_eval: log, results = validate_full_vcmr(model, inf_loader_val, 'val', opts, model_opts=opts) save_json( results, f'{opts.output_dir}/results/' f'val_results_{global_step}_rank{hvd.rank()}.json') TB_LOGGER.log_scaler_dict(log) if opts.test_query_txt_db: log, results = validate_full_vcmr(model, inf_loader_test, 'test', opts, model_opts=opts) save_json( results, f'{opts.output_dir}/results/' f'test_results_{global_step}_rank{hvd.rank()}.json' ) TB_LOGGER.log_scaler_dict(log) LOGGER.info('===========================================') model_saver.save(model, global_step) # step restorer in the end to prevent missing validation checkpoint restorer.step() if global_step >= opts.num_train_steps: break LOGGER.info('===========================================') if global_step % opts.valid_steps != 0: if hvd.rank() == 0 or opts.distributed_eval: log, results = validate_full_vcmr(model, inf_loader_val, 'val', opts, model_opts=opts) save_json( results, f'{opts.output_dir}/results/' f'val_results_{global_step}' f'_rank{hvd.rank()}_final.json') TB_LOGGER.log_scaler_dict(log) if opts.test_query_txt_db: log, results = validate_full_vcmr(model, inf_loader_test, 'test', opts, model_opts=opts) save_json( results, f'{opts.output_dir}/results/' f'test_results_{global_step}_rank{hvd.rank()}.json') TB_LOGGER.log_scaler_dict(log) model_saver.save(model, f'{global_step}_final')
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() opts.rank = rank LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format( device, n_gpu, hvd.rank(), opts.fp16)) device = torch.device("cuda:1") if opts.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( opts.gradient_accumulation_steps)) set_random_seed(opts.seed) if hvd.rank() == 0: TB_LOGGER.create(join(opts.output_dir, 'log')) os.makedirs(join(opts.output_dir, 'ckpt')) save_training_meta(opts) # TB_LOGGER.create(join(opts.output_dir, 'log')) model_saver = ModelSaver(join(opts.output_dir, 'ckpt')) add_log_to_file(join(opts.output_dir, 'log', 'log.txt')) # store ITM predictions os.makedirs(join(opts.output_dir, 'results_val')) os.makedirs(join(opts.output_dir, 'results_test')) os.makedirs(join(opts.output_dir, 'results_train')) else: LOGGER.disabled = True model_saver = NoOp() # load DBs and image dirs all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb, opts.num_bb, opts.compressed_db) # train train_dataset = MemeAIDataset(json_path = '/home/data/meme_json/train.json', npz_folder = '/home/data/faster_cnn_feature/', mode = 'train') train_loader = DataLoader(train_dataset, batch_size = opts.train_batch_size, shuffle = True, num_workers = opts.n_workers, collate_fn=collate_fn) train_loader = PrefetchLoader(train_loader) # val val_dataset = MemeAIDataset(json_path = '/home/data/meme_json/dev.json', npz_folder = '/home/data/faster_cnn_feature/', mode = 'val') val_loader = DataLoader(val_dataset, batch_size = opts.inf_minibatch_size, shuffle = False, num_workers = opts.n_workers, collate_fn=collate_fn) val_loader = PrefetchLoader(val_loader) # Prepare model if opts.checkpoint: checkpoint = torch.load(opts.checkpoint) else: checkpoint = {} model = Meme.from_pretrained( opts.model_config, state_dict=checkpoint, img_dim=IMG_DIM) model.init_output() # pretrain ITM head is different from ranking head model.to(device) # make sure every process has same model parameters in the beginning # broadcast_tensors([p.data for p in model.parameters()], 0) # set_dropout(model, opts.dropout) # Prepare optimizer optimizer = build_optimizer(model, opts) model, optimizer = amp.initialize(model, optimizer, enabled=opts.fp16, opt_level='O2') global_step = 0 # LOGGER.info(f"***** Running training on {n_gpu} GPUs *****") # LOGGER.info(" Num examples = %d", len(train_dataset) * hvd.size()) LOGGER.info(" Batch size = %d", opts.train_batch_size) LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) LOGGER.info(" Num steps = %d", opts.num_train_steps) running_loss = RunningMeter('loss') model.train() n_examples = 0 n_epoch = 0 start = time() # quick hack for amp delay_unscale bug optimizer.zero_grad() optimizer.step() # while True: for epoch in range(opts.epoch): print('epoch {}/ {}'.format(epoch, opts.epoch)) pbar = tqdm(total=len(train_loader)) model.train() preds = None gt = None for step, batch in enumerate(train_loader): x = batch[0] y = batch[1] n_examples += x['input_ids'].size(0) pred = model(x) if preds is None: preds = torch.sigmoid(pred) gt = y else: preds = torch.cat((preds, torch.sigmoid(pred)), dim = 0) gt = torch.cat((gt, y), dim = 0) loss = F.binary_cross_entropy(torch.sigmoid(pred), y) delay_unscale = (step+1) % opts.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale ) as 
scaled_loss: scaled_loss.backward() if not delay_unscale: # gather gradients from every processes # do this before unscaling to make sure every process uses # the same gradient scale grads = [p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None] all_reduce_and_rescale_tensors(grads, float(1)) running_loss(loss.item()) if (step + 1) % opts.gradient_accumulation_steps == 0: global_step += 1 # learning rate scheduling lr_this_step = get_lr_sched(global_step, opts) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step TB_LOGGER.add_scalar('lr', lr_this_step, global_step) # log loss # NOTE: not gathered across GPUs for efficiency TB_LOGGER.add_scalar('loss', running_loss.val, global_step) TB_LOGGER.step() # update model params if opts.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), opts.grad_norm) TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step) optimizer.step() optimizer.zero_grad() global_step += 1 # learning rate scheduling lr_this_step = get_lr_sched(global_step, opts) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step TB_LOGGER.add_scalar('lr', lr_this_step, global_step) # log loss # NOTE: not gathered across GPUs for efficiency TB_LOGGER.add_scalar('loss', running_loss.val, global_step) TB_LOGGER.step() # update model params if opts.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), opts.grad_norm) TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step) optimizer.step() optimizer.zero_grad() with torch.no_grad(): preds = preds.cpu().numpy().reshape(len(preds), ) gt = gt.cpu().numpy() roc = roc_auc_score(gt, preds) acc = accuracy_score(gt, np.around(preds)) train_log = {'train/roc': roc, 'train/acc': acc} TB_LOGGER.log_scaler_dict({f"train/{k}": v for k, v in train_log.items()}) # monitor training throughput val_log = validate(model, val_loader) TB_LOGGER.log_scaler_dict({f"valid/{k}": v for k, v in val_log.items()}) LOGGER.info(train_log) LOGGER.info(val_log) model_saver.save(model, global_step) pbar.close()
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() opts.rank = rank LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format( device, n_gpu, hvd.rank(), opts.fp16)) if opts.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( opts.gradient_accumulation_steps)) set_random_seed(opts.seed) if rank == 0: save_training_meta(opts) TB_LOGGER.create(join(opts.output_dir, 'log')) pbar = tqdm(total=opts.num_train_steps) model_saver = ModelSaver(join(args.output_dir, 'ckpt')) add_log_to_file(join(opts.output_dir, 'log', 'log.txt')) else: LOGGER.disabled = True pbar = NoOp() model_saver = NoOp() all_dbs = [db for datasets in [opts.train_datasets, opts.val_datasets] for dset in datasets for db in dset['db']] tokenizer = json.load(open(f'{all_dbs[0]}/meta.json'))['bert'] assert all(tokenizer == json.load(open(f'{db}/meta.json'))['bert'] for db in all_dbs) # build data loaders train_dataloaders, all_img_dbs = create_dataloaders( opts.train_datasets, True, opts) val_dataloaders, _ = create_dataloaders( opts.val_datasets, False, opts, all_img_dbs) meta_loader = MetaLoader(train_dataloaders, accum_steps=opts.gradient_accumulation_steps, distributed=n_gpu > 1) meta_loader = PrefetchLoader(meta_loader) # Prepare model if opts.checkpoint: checkpoint = torch.load(opts.checkpoint) else: checkpoint = {} model = UniterForPretraining.from_pretrained( opts.model_config, checkpoint, img_dim=IMG_DIM, img_label_dim=IMG_LABEL_DIM) model.to(device) model.train() # make sure every process has same model parameters in the beginning broadcast_tensors([p.data for p in model.parameters()], 0) set_dropout(model, opts.dropout) # Prepare optimizer optimizer = build_optimizer(model, opts) task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())} model, optimizer = amp.initialize(model, optimizer, num_losses=len(task2scaler), enabled=opts.fp16, opt_level='O2') global_step = 0 LOGGER.info(f"***** Running training with {n_gpu} GPUs *****") LOGGER.info(" Batch size = %d", opts.train_batch_size) LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) LOGGER.info(" Num steps = %d", opts.num_train_steps) # to compute training statistics task2loss = {task: RunningMeter(f'loss/{task}') for task in train_dataloaders.keys()} # ITM w/ OT if opts.itm_ot_lambda > 0: for task in train_dataloaders.keys(): if task.startswith('itm'): task2loss[f'{task}_xe'] = RunningMeter(f'loss/{task}_xe') task2loss[f'{task}_ot'] = RunningMeter(f'loss/{task}_ot') task2loss[f'{task}_ot_pos'] = RunningMeter( f'loss/{task}_ot_pos') task2loss[f'{task}_ot_neg'] = RunningMeter( f'loss/{task}_ot_neg') n_examples = defaultdict(int) n_in_units = defaultdict(int) n_loss_units = defaultdict(int) grad_norm = 0 start = time() # quick hack for amp delay_unscale bug optimizer.zero_grad() optimizer.step() for step, (name, batch) in enumerate(meta_loader): # forward pass n_examples[name] += batch['input_ids'].size(0) n_in_units[name] += (batch['attn_masks'] == 1).sum().item() task = name.split('_')[0] loss = model(batch, task=task, compute_loss=True) if task.startswith('itm'): # OT itm_loss, ot_loss = loss n_loss_units[name] += itm_loss.size(0) itm_loss = itm_loss.mean() if ot_loss is not None: ot_pos, ot_neg = ot_loss ot_loss = (ot_pos.sum() - ot_neg.sum() ) / (ot_pos.size(0) + ot_neg.size(0)) # NOTE: be ware of empty tensor ot_pos = ot_pos.mean().item() if 
not math.isnan(ot_pos): task2loss[f'{name}_ot_pos'](ot_pos) ot_neg = ot_neg.mean().item() if not math.isnan(ot_neg): task2loss[f'{name}_ot_neg'](ot_neg) loss = itm_loss + opts.itm_ot_lambda * ot_loss task2loss[f'{name}_xe'](itm_loss.item()) task2loss[f'{name}_ot'](ot_loss.item()) else: loss = itm_loss else: n_loss_units[name] += loss.size(0) loss = loss.mean() # loss is not normalized in model # backward pass delay_unscale = (step+1) % opts.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale, loss_id=task2scaler[name]) as scaled_loss: scaled_loss.backward() if not delay_unscale: # gather gradients from every processes # do this before unscaling to make sure every process uses # the same gradient scale grads = [p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None] all_reduce_and_rescale_tensors(grads, float(1)) task2loss[name](loss.item()) # optimizer update and logging if (step + 1) % opts.gradient_accumulation_steps == 0: global_step += 1 # learning rate scheduling lr_this_step = get_lr_sched(global_step, opts) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step TB_LOGGER.add_scalar('lr', lr_this_step, global_step) # log loss # NOTE: not gathered across GPUs for efficiency TB_LOGGER.log_scaler_dict({ll.name: ll.val for ll in task2loss.values() if ll.val is not None}) TB_LOGGER.step() # update model params if opts.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), opts.grad_norm) TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step) optimizer.step() optimizer.zero_grad() pbar.update(1) if global_step % 100 == 0: # monitor training throughput LOGGER.info(f'==============Step {global_step}===============') for t in train_dataloaders.keys(): assert all(tt == t for tt in all_gather_list(t)) tot_ex = sum(all_gather_list(n_examples[t])) ex_per_sec = int(tot_ex / (time()-start)) tot_in = sum(all_gather_list(n_in_units[t])) in_per_sec = int(tot_in / (time()-start)) tot_l = sum(all_gather_list(n_loss_units[t])) l_per_sec = int(tot_l / (time()-start)) LOGGER.info(f'{t}: {tot_ex} examples trained at ' f'{ex_per_sec} ex/s') TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec, global_step) TB_LOGGER.add_scalar(f'perf/{t}_in_per_s', in_per_sec, global_step) TB_LOGGER.add_scalar(f'perf/{t}_loss_per_s', l_per_sec, global_step) LOGGER.info('===============================================') if global_step % opts.valid_steps == 0: LOGGER.info(f'Step {global_step}: start validation') validate(model, val_dataloaders) model_saver.save(model, global_step) if global_step >= opts.num_train_steps: break if global_step % opts.valid_steps != 0: LOGGER.info(f'Step {global_step}: start validation') validate(model, val_dataloaders) model_saver.save(model, global_step)
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() opts.rank = rank LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if opts.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( opts.gradient_accumulation_steps)) set_random_seed(opts.seed) if hvd.rank() == 0: save_training_meta(opts) TB_LOGGER.create(join(opts.output_dir, 'log')) pbar = tqdm(total=opts.num_train_steps) model_saver = ModelSaver(join(opts.output_dir, 'ckpt')) os.makedirs(join(opts.output_dir, 'ckpt'), exist_ok=True) add_log_to_file(join(opts.output_dir, 'log', 'log.txt')) # store ITM predictions os.makedirs(join(opts.output_dir, 'results_val'), exist_ok=True) os.makedirs(join(opts.output_dir, 'results_test'), exist_ok=True) os.makedirs(join(opts.output_dir, 'results_train'), exist_ok=True) else: LOGGER.disabled = True pbar = NoOp() model_saver = NoOp() # train_examples = None LOGGER.info(f"Loading Train Dataset {opts.train_txt_dbs}, " f"{opts.train_img_dbs}") # check multiple DBs assert len(opts.train_txt_dbs) == len(opts.train_img_dbs), \ "train txt_db and img_db have different length" # load DBs and image dirs all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb, opts.num_bb, opts.compressed_db) # train LOGGER.info(f"Loading Train Dataset " f"{opts.train_txt_dbs}, {opts.train_img_dbs}") train_datasets = [] for txt_path, img_path in zip(opts.train_txt_dbs, opts.train_img_dbs): if "itm_coco_zh" not in txt_path: img_db = all_img_dbs[img_path] txt_db = TxtTokLmdb(txt_path, opts.max_txt_len) if opts.hard_neg_size > 0: train_datasets.append( ItmRankDatasetHardNeg(txt_db, img_db, opts.negative_size, opts.hard_neg_size)) else: train_datasets.append( ItmRankDataset(txt_db, img_db, opts.negative_size)) else: img_train_db = all_img_dbs[img_path[0]] img_val_db = all_img_dbs[img_path[1]] txt_db = TxtTokLmdb(txt_path, opts.max_txt_len) if opts.hard_neg_size > 0: train_datasets.append( ItmRankDatasetHardNeg(txt_db, img_db, opts.negative_size, opts.hard_neg_size)) else: train_datasets.append( ItmRankDataset_COCO_CN(txt_db, img_train_db, img_val_db, opts.negative_size)) train_dataset = ConcatDataset(train_datasets) # hard negative # hn_datasets = [] # for txt_path, img_path in zip(opts.train_txt_dbs, opts.train_img_dbs): # img_db = all_img_dbs[img_path] # txt_db = TxtTokLmdb(txt_path, opts.max_txt_len) # hn_datasets.append(ItmHardNegDataset(txt_db, img_db, # opts.inf_minibatch_size)) # hn_dataset = ConcatDataset(hn_datasets) # hn_dataloader = build_dataloader(hn_dataset, itm_hn_collate, False, opts) # hard_neg_dir = f'{opts.output_dir}/results_train/' # val LOGGER.info(f"Loading Val Dataset {opts.val_txt_db}, {opts.val_img_db}") val_img_db = all_img_dbs[opts.val_img_db[0]] val_txt_db = TxtTokLmdb(opts.val_txt_db[0], -1) val_dataset = ItmValDataset(val_txt_db, val_img_db, opts.inf_minibatch_size) val_dataloader = build_dataloader(val_dataset, itm_val_collate, False, opts) # eval LOGGER.info(f"Loading val, test Dataset for full evaluation: " f"{opts.val_txt_db}, {opts.val_img_db}" f"{opts.test_txt_db}, {opts.test_img_db}") eval_dataset_val = ItmEvalDataset(val_txt_db, val_img_db, opts.inf_minibatch_size) eval_loader_val = build_dataloader(eval_dataset_val, itm_eval_collate, False, opts) eval_loader_list = [] assert len(opts.test_img_db) == len(opts.test_txt_db) for test_img_db_path, 
test_txt_db_path in zip(opts.test_img_db, opts.test_txt_db): if "itm_coco_zh" not in test_txt_db_path: test_img_db = all_img_dbs[test_img_db_path] test_txt_db = TxtTokLmdb(test_txt_db_path, -1) eval_dataset_test = ItmEvalDataset(test_txt_db, test_img_db, opts.inf_minibatch_size) else: test_img_train_db = all_img_dbs[test_img_db_path[0]] test_img_val_db = all_img_dbs[test_img_db_path[1]] test_txt_db = TxtTokLmdb(test_txt_db_path, -1) eval_dataset_test = ItmEvalDataset_COCO_CN(test_txt_db, test_img_train_db, test_img_val_db, opts.inf_minibatch_size) eval_loader_test = build_dataloader(eval_dataset_test, itm_eval_collate, False, opts) eval_loader_list.append(eval_loader_test) # Prepare model if opts.checkpoint: checkpoint = torch.load(opts.checkpoint) else: checkpoint = {} #Rename the key if specified if opts.rename_checkpoints: rename_checkpoint(checkpoint) model = VLXLMRForImageTextRetrieval.from_pretrained( opts.model_config, state_dict=checkpoint, load_embedding_only=opts.load_embedding_only, load_layer=opts.load_layer, img_dim=IMG_DIM, margin=opts.margin) model.init_output() # pretrain ITM head is different from ranking head model.to(device) # make sure every process has same model parameters in the beginning broadcast_tensors([p.data for p in model.parameters()], 0) set_dropout(model, opts.dropout) # Prepare optimizer if opts.separate_lr: optimizer = build_xlmr_optimizer(model, opts) else: optimizer = build_optimizer(model, opts) model, optimizer = amp.initialize(model, optimizer, enabled=opts.fp16, opt_level='O2') #global_step = 0 LOGGER.info(f"***** Running training on {n_gpu} GPUs *****") LOGGER.info(" Num examples = %d", len(train_dataset) * hvd.size()) LOGGER.info(" Batch size = %d", opts.train_batch_size) LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) LOGGER.info(" Num steps = %d", opts.num_train_steps) running_loss = RunningMeter('loss') model.train() if opts.steps_per_hard_neg != -1: compute_hard_neg(model, hn_dataloader, train_dataset, opts.hard_neg_pool_size, hard_neg_dir) #Initialize the TrainingRestorer restorer = TrainingRestorer(opts, model, optimizer) global_step = restorer.global_step TB_LOGGER._global_step = global_step if hvd.rank() != 0: restorer = NoOp() #Added for Restoring the Checkpoints if global_step > 0: pbar.update(global_step) n_examples = 0 start = time() # quick hack for amp delay_unscale bug optimizer.zero_grad() optimizer.step() while True: train_dataloader = build_dataloader(train_dataset, xlmr_itm_rank_collate, True, opts) for step, batch in enumerate(train_dataloader): #print(batch['input_ids']) n_examples += batch['input_ids'].size(0) loss = model(batch, compute_loss=True) loss = loss.mean() delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale) as scaled_loss: scaled_loss.backward() if not delay_unscale: # gather gradients from every processes # do this before unscaling to make sure every process uses # the same gradient scale grads = [ p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None ] all_reduce_and_rescale_tensors(grads, float(1)) running_loss(loss.item()) # print("run the loss") if (step + 1) % opts.gradient_accumulation_steps == 0: global_step += 1 # learning rate scheduling lr_this_step = get_lr_sched(global_step, opts) if opts.separate_lr: #added by Mingyang xlmr_lr_this_step = get_xlmr_lr_sched(global_step, opts) for i, param_group in enumerate(optimizer.param_groups): if i < 2: param_group['lr'] = 
xlmr_lr_this_step else: param_group['lr'] = lr_this_step TB_LOGGER.add_scalar('xlmr_lr', xlmr_lr_this_step, global_step) else: for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step TB_LOGGER.add_scalar('lr', lr_this_step, global_step) # log loss losses = all_gather_list(running_loss) running_loss = RunningMeter( 'loss', sum(l.val for l in losses) / len(losses)) TB_LOGGER.add_scalar('loss', running_loss.val, global_step) TB_LOGGER.step() # update model params if opts.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), opts.grad_norm) TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step) optimizer.step() optimizer.zero_grad() pbar.update(1) if global_step % 100 == 0: # monitor training throughput LOGGER.info(f'============Step {global_step}=============') tot_ex = sum(all_gather_list(n_examples)) ex_per_sec = int(tot_ex / (time() - start)) LOGGER.info(f'{tot_ex} examples trained at ' f'{ex_per_sec} ex/s') TB_LOGGER.add_scalar('perf/ex_per_s', ex_per_sec, global_step) LOGGER.info(f'===========================================') if global_step % opts.valid_steps == 0 and global_step > 0: # if global_step > 7000: if opts.full_val: val_log = evaluate(model, eval_loader_val) TB_LOGGER.log_scaler_dict( {f"valid/{k}": v for k, v in val_log.items()}) #Log the information # LOGGER.info( # f"========================= {split} ===========================\n" # f"image retrieval R1: {eval_log['img_r1']*100:.2f},\n" # f"image retrieval R5: {eval_log['img_r5']*100:.2f},\n" # f"image retrieval R10: {eval_log['img_r10']*100:.2f}\n" # f"text retrieval R1: {eval_log['txt_r1']*100:.2f},\n" # f"text retrieval R5: {eval_log['txt_r5']*100:.2f},\n" # f"text retrieval R10: {eval_log['txt_r10']*100:.2f}") # LOGGER.info("=========================================================") else: val_log = validate(model, val_dataloader) TB_LOGGER.log_scaler_dict(val_log) model_saver.save(model, global_step) restorer.step() if (opts.steps_per_hard_neg != -1 and global_step % opts.steps_per_hard_neg == 0): # sample hard negatives for training compute_hard_neg(model, hn_dataloader, train_dataset, opts.hard_neg_pool_size, hard_neg_dir) # break to reconstruct loader # for potential multi-worker issue (not sure) break if global_step >= opts.num_train_steps: break if global_step >= opts.num_train_steps: break # NOTE can no longer count epochs pbar.close() # final validation # val_log = validate(model, val_dataloader) # TB_LOGGER.log_scaler_dict(val_log) model_saver.save(model, f'{global_step}_final') for i, loader in enumerate(eval_loader_list): split = "test_{}".format(i) eval_log = evaluate(model, loader) TB_LOGGER.log_scaler_dict( {f"eval/{split}_{k}": v for k, v in eval_log.items()}) if hvd.rank() != 0: continue LOGGER.info( f"========================= {split} ===========================\n" f"image retrieval R1: {eval_log['img_r1']*100:.2f},\n" f"image retrieval R5: {eval_log['img_r5']*100:.2f},\n" f"image retrieval R10: {eval_log['img_r10']*100:.2f}\n" f"text retrieval R1: {eval_log['txt_r1']*100:.2f},\n" f"text retrieval R5: {eval_log['txt_r5']*100:.2f},\n" f"text retrieval R10: {eval_log['txt_r10']*100:.2f}") LOGGER.info("=========================================================")
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if hvd.rank() != 0: LOGGER.disabled = True hps_file = f'{opts.output_dir}/log/hps.json' model_opts = Struct(json.load(open(hps_file))) model_config = f'{opts.output_dir}/log/model_config.json' # load DBs and image dirs video_ids = get_video_ids(opts.query_txt_db) video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db, model_opts.vfeat_interval, model_opts) assert opts.split in opts.query_txt_db q_txt_db = QaQueryTokLmdb(opts.query_txt_db, -1) eval_dataset = ViolinEvalDataset(video_ids, video_db, q_txt_db, sampled_by_q=model_opts.sampled_by_q) collate_fn = violin_eval_collate # Prepare model if exists(opts.checkpoint): ckpt_file = opts.checkpoint else: ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt' checkpoint = torch.load(ckpt_file) img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\ ".position_embeddings.weight" assert img_pos_embed_weight_key in checkpoint max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key]) model = HeroForViolin.from_pretrained(model_config, state_dict=checkpoint, vfeat_dim=VFEAT_DIM, max_frm_seq_len=max_frm_seq_len) model.to(device) if opts.fp16: model = amp.initialize(model, enabled=opts.fp16, opt_level='O2') eval_dataloader = DataLoader(eval_dataset, batch_size=opts.batch_size, num_workers=opts.n_workers, pin_memory=opts.pin_mem, collate_fn=collate_fn) eval_dataloader = PrefetchLoader(eval_dataloader) _, results, logits = validate_violin(model, eval_dataloader, opts.split, opts.save_logits) result_dir = f'{opts.output_dir}/results_{opts.split}' if opts.save_logits: result_dir += '_w_logit' if not exists(result_dir) and hvd.rank() == 0: os.makedirs(result_dir) all_results = {} for id2res in all_gather_list(results): all_results.update(id2res) if opts.save_logits: all_logits = {} for id2logit in all_gather_list(logits): all_logits.update(id2logit) if hvd.rank() == 0: save_json(all_results, f'{result_dir}/results_{opts.checkpoint}_all.json') LOGGER.info('All results written......') if opts.save_logits: save_pickle(all_logits, f'{result_dir}/logits_{opts.checkpoint}_all.pkl') LOGGER.info('All logits written......')
def __register_impl(self, base, class_obj):
    self._registered_implementations[base.__name__] = class_obj
    LOGGER.debug("Custom implementation [%s] registered.", class_obj)

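# Illustrative note (assumption, not from the source): __register_impl() is name-mangled
# (double leading underscore), so it is presumably invoked from a public register()
# method on the same class. Effect, with hypothetical classes:
#
#   self.__register_impl(AbstractMailReader, MyCustomMailReader)
#   # -> self._registered_implementations == {'AbstractMailReader': MyCustomMailReader}
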
def get_hard_negs(model, loader, hard_negative_num=20): LOGGER.info("start running hard negative extraction") st = time() if hvd.rank() == 0: pbar = tqdm(total=len(loader)) else: pbar = NoOp() model.eval() txt2hardimgs = {} img_to_score_txts = defaultdict(list) for batch in loader: scores = model(batch, compute_loss=False).squeeze(-1) txt = batch['gt_txt_id'] imgs = batch['neg_img_ids'] # record hard images hard_indices = scores.topk(hard_negative_num, sorted=False)[1].tolist() txt2hardimgs[txt] = [imgs[i] for i in hard_indices] # record img2txts for i, img in enumerate(imgs): img_to_score_txts[img].append((scores[i].item(), txt)) pbar.update(1) pbar.close() LOGGER.info("start computing hard texts from images...") n_less_neg = 0 tot_text = 0 img2hardtxts = {} # need to gather hard texts from all GPUs all_img_ids = [ i for dset in loader.dataset.datasets for i in dset.all_img_ids ] all_img_ids = any_broadcast(all_img_ids, 0) for img in all_img_ids: score_txts = img_to_score_txts[img] scores, txts = map( list, unzip(pair for pairs in all_gather_list(score_txts) for pair in pairs)) if hvd.rank() != 0: # only rank 0 needs to compute continue tot_text += len(txts) if len(txts) < hard_negative_num: # not enough negatives hard_indices = range(len(txts)) n_less_neg += 1 else: hard_indices = torch.tensor(scores).topk(hard_negative_num, sorted=False)[1].tolist() img2hardtxts[img] = [txts[i] for i in hard_indices] n_less_neg = sum(all_gather_list(n_less_neg)) if n_less_neg: LOGGER.info(f"Warning: {n_less_neg} images did not " f"sample enough negatives") LOGGER.info(f"hard negative extraction finished " f"in {int(time() - st)} seconds " f"({tot_text//len(img_to_score_txts)} texts per images)") model.train() return txt2hardimgs, img2hardtxts
def end_training(self):
    # Termination message
    print("\n" + "-" * 100)
    if self.terminate_training:
        LOGGER.info(
            "Training terminated early because the validation {} did not improve for {} epochs"
            .format(self.config['optimize_for'], self.config['patience']))
    else:
        LOGGER.info(
            "Maximum of {} epochs reached. Finished training!".format(
                self.config['max_epoch']))
    print_test_stats(self.best_val_metrics, test=False)

    print("-" * 50 + "\n\t\tEvaluating on test set\n" + "-" * 50)
    if not self.config["no_model_checkpoints"]:
        if os.path.isfile(self.model_file):
            self.load_model()
            self.model.to(self.device)
        else:
            raise ValueError(
                "No saved model state_dict found for model {}. "
                "Aborting evaluation on the test set.".format(
                    self.config['model_name']))
        self.export_val_predictions()  # runs evaluation; no need to run it again here
        val_probs = torch.tensor(self.probs_list)
        val_labels = torch.tensor(self.labels_list)
        threshold = 0.5  # the default threshold for binary classification
        # Uncomment the line below if you have implemented this optional feature
        # threshold = find_optimal_threshold(val_probs, val_labels, metric="accuracy")
        best_val_metrics = standard_metrics(val_probs,
                                            val_labels,
                                            threshold=threshold,
                                            add_aucroc=False)
        LOGGER.info(
            "Optimal threshold on validation dataset: %.4f (accuracy=%4.2f%%)"
            % (threshold, 100.0 * best_val_metrics["accuracy"]))

        # Testing in the standard form is not possible, as we do not have any labels
        # (standard_metrics would raise an error). Instead, we write out the
        # predictions in the leaderboard format.
        self.test_metrics = dict()
        for test_idx in range(len(self.config['test_loader'])):
            test_name = self.config['test_loader'][test_idx].dataset.name
            LOGGER.info("Export and testing on %s..." % test_name)
            if hasattr(self.config['test_loader'][test_idx].dataset, "data") and \
               hasattr(self.config['test_loader'][test_idx].dataset.data, "labels") and \
               self.config['test_loader'][test_idx].dataset.data.labels[0] == -1:
                # No ground-truth labels: only export the predictions
                self.export_test_predictions(test_idx=test_idx,
                                             threshold=threshold)
                self.test_metrics[test_name] = dict()
            else:
                test_idx_metrics, _ = self.eval_model(test=True,
                                                      test_idx=test_idx)
                self.test_metrics[test_name] = test_idx_metrics
                print_test_stats(test_idx_metrics, test=True)
                self.export_val_predictions(test=True,
                                            test_idx=test_idx,
                                            threshold=threshold)
    else:
        LOGGER.info(
            "No model checkpoints were saved. Hence, testing will be skipped.")
        self.test_metrics = dict()

    self.export_metrics()
    self.config['writer'].close()
    if self.config['remove_checkpoints']:
        LOGGER.info("Removing checkpoint %s..." % self.model_file)
        os.remove(self.model_file)
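# The optional `find_optimal_threshold` helper referenced (commented out) above is
# not defined in this snippet. Below is a minimal sketch of one possible
# implementation, assuming `probs` and `labels` are 1-D torch tensors and only the
# "accuracy" metric is needed; the actual project may implement it differently.
import torch

def find_optimal_threshold(probs, labels, metric="accuracy", steps=100):
    """Sweep candidate thresholds and return the one maximizing the metric."""
    if metric != "accuracy":
        raise NotImplementedError("Only accuracy is sketched here.")
    best_threshold, best_score = 0.5, -1.0
    for threshold in torch.linspace(0.0, 1.0, steps):
        preds = (probs >= threshold).long()
        score = (preds == labels.long()).float().mean().item()
        if score > best_score:
            best_threshold, best_score = threshold.item(), score
    return best_threshold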
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() opts.rank = rank LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if hvd.rank() != 0: LOGGER.disabled = True if opts.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( opts.gradient_accumulation_steps)) set_random_seed(opts.seed) opts.task = 'tvc' # train_examples = None LOGGER.info(f"Loading the whole video dataset {opts.sub_txt_db}, " f"{opts.vfeat_db}") video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db, opts.vfeat_interval, opts) # data loaders # train LOGGER.info(f"Loading train dataset {opts.train_db}") train_cap = CaptionTokLmdb(opts.train_db, opts.max_txt_len) train_dset = TvcTrainDataset(video_db, train_cap, opts.max_cap_per_vid) LOGGER.info(f"{sum(all_gather_list(len(train_dset)))} samples loaded") train_loader = build_dataloader(train_dset, opts.train_batch_size, TvcTrainDataset.collate, True, opts) # val LOGGER.info(f"Loading val dataset {opts.val_db}") val_cap = CaptionTokLmdb(opts.val_db, -1) val_dset = TvcValDataset(video_db, val_cap, -1) val_loader = build_dataloader(val_dset, opts.val_batch_size, TvcValDataset.collate, False, opts) if hvd.rank() == 0: evaluator = TVCEval(opts.val_ref) else: evaluator = NoOp() # Prepare model if opts.checkpoint: checkpoint = torch.load(opts.checkpoint) else: checkpoint = {} img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\ ".position_embeddings.weight" if img_pos_embed_weight_key in checkpoint: max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key]) else: max_frm_seq_len = MAX_FRM_SEQ_LEN model = HeroForTvc.from_pretrained(opts.model_config, state_dict=checkpoint, vfeat_dim=VFEAT_DIM, max_frm_seq_len=max_frm_seq_len, lsr=opts.lsr) model.to(device) # make sure every process has same model parameters in the beginning broadcast_tensors([p.data for p in model.parameters()], 0) set_dropout(model, opts.dropout) # Prepare optimizer optimizer = build_optimizer(model, opts) model, optimizer = amp.initialize(model, optimizer, enabled=opts.fp16, opt_level='O2') # assumes roberta tokenizer only if hvd.local_rank() == 0: # quick hack to prevent multi-process download collision toker = RobertaTokenizer.from_pretrained('roberta-base') all_gather_list(None) else: all_gather_list(None) toker = RobertaTokenizer.from_pretrained('roberta-base') bos = toker.convert_tokens_to_ids(['<s>'])[0] eos = toker.convert_tokens_to_ids(['</s>'])[0] generator = TvcGenerator(model, opts.max_gen_step, bos, eos, opts.fp16) global_step = 0 if rank == 0: save_training_meta(opts) TB_LOGGER.create(join(opts.output_dir, 'log')) pbar = tqdm(total=opts.num_train_steps) model_saver = ModelSaver(join(opts.output_dir, 'ckpt')) os.makedirs(join(opts.output_dir, 'results')) # store val predictions add_log_to_file(join(opts.output_dir, 'log', 'log.txt')) else: LOGGER.disabled = True pbar = NoOp() model_saver = NoOp() LOGGER.info(f"***** Running training with {n_gpu} GPUs *****") LOGGER.info(" Batch size = %d", opts.train_batch_size) LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) LOGGER.info(" Num steps = %d", opts.num_train_steps) train_loss = RunningMeter('loss') n_vid = 0 n_cap = 0 n_epoch = 0 start = time() # quick hack for amp delay_unscale bug optimizer.zero_grad() optimizer.step() model.train() while True: for step, batch in 
enumerate(train_loader): n_vid += opts.train_batch_size n_cap += batch['cap_input_ids'].size(0) loss = model(batch, compute_loss=True) loss = loss.mean() train_loss(loss.item()) delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale) as scaled_loss: scaled_loss.backward() if not delay_unscale: # gather gradients from every processes # do this before unscaling to make sure every process uses # the same gradient scale grads = [ p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None ] all_reduce_and_rescale_tensors(grads, float(1)) if (step + 1) % opts.gradient_accumulation_steps == 0: global_step += 1 # learning rate scheduling lr_this_step = get_lr_sched(global_step, opts) for i, param_group in enumerate(optimizer.param_groups): if i == 0 or i == 1: param_group['lr'] = lr_this_step * opts.lr_mul elif i == 2 or i == 3: param_group['lr'] = lr_this_step else: raise ValueError() TB_LOGGER.add_scalar('lr', lr_this_step, global_step) # log loss TB_LOGGER.add_scalar(train_loss.name, train_loss.val, global_step) TB_LOGGER.step() # update model params if opts.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), opts.grad_norm) TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step) optimizer.step() optimizer.zero_grad() pbar.update(1) if global_step % 100 == 0: # monitor training throughput LOGGER.info('-------------------------------------------') LOGGER.info(f'Step {global_step}:') tot_vid = sum(all_gather_list(n_vid)) vid_per_sec = int(tot_vid / (time() - start)) LOGGER.info(f'{tot_vid} videos trained at ' f'{vid_per_sec} vid/s') tot_cap = sum(all_gather_list(n_cap)) cap_per_sec = int(tot_cap / (time() - start)) TB_LOGGER.add_scalar(f'perf/vid_per_s', vid_per_sec, global_step) TB_LOGGER.add_scalar(f'perf/cap_per_s', cap_per_sec, global_step) if global_step % opts.valid_steps == 0: LOGGER.info('===========================================') LOGGER.info(f"Step {global_step}: start validation") val_log, results = validate(val_loader, generator, toker, evaluator) if hvd.rank() == 0: save_jsonl( results, f"{opts.output_dir}/results/" f"/results_{global_step}.jsonl") TB_LOGGER.log_scaler_dict(val_log) LOGGER.info('===========================================') model_saver.save(model, global_step) if global_step >= opts.num_train_steps: break n_epoch += 1 LOGGER.info(f"finished {n_epoch} epochs") if global_step >= opts.num_train_steps: break LOGGER.info('===========================================') if global_step % opts.valid_steps != 0: val_log, results = validate(val_loader, generator, toker, evaluator) if hvd.rank() == 0: save_jsonl( results, f"{opts.output_dir}/results/" f"/results_{global_step}.jsonl") TB_LOGGER.log_scaler_dict(val_log) model_saver.save(model, global_step)
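# The training loop above (and the later training loops in this file) all follow
# the same apex-amp gradient-accumulation pattern: keep the loss scale fixed while
# accumulating (delay_unscale=True), all-reduce the raw gradients across workers,
# and only then let the optimizer step. Below is a minimal single-process sketch of
# that schedule with a toy model; `accum_steps` and `toy_loader` are made-up names,
# it requires NVIDIA apex and a GPU, and the distributed all-reduce is left as a
# comment because it is project-specific (all_reduce_and_rescale_tensors above).
import torch
from apex import amp

model = torch.nn.Linear(8, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

accum_steps = 4
toy_loader = [(torch.randn(2, 8).cuda(), torch.randn(2, 1).cuda())] * 8

for step, (x, y) in enumerate(toy_loader):
    loss = torch.nn.functional.mse_loss(model(x).float(), y)
    delay_unscale = (step + 1) % accum_steps != 0
    with amp.scale_loss(loss, optimizer,
                        delay_unscale=delay_unscale) as scaled_loss:
        scaled_loss.backward()
    if (step + 1) % accum_steps == 0:
        # distributed training would all-reduce the raw gradients here,
        # before unscaling, as in the loop above
        optimizer.step()
        optimizer.zero_grad()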
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if opts.train_config is not None: train_opts = Struct(json.load(open(opts.train_config))) opts.conf_th = train_opts.conf_th opts.max_bb = train_opts.max_bb opts.min_bb = train_opts.min_bb opts.num_bb = train_opts.num_bb # load DBs and image dirs eval_img_db = DetectFeatLmdb(opts.img_db, opts.conf_th, opts.max_bb, opts.min_bb, opts.num_bb, opts.compressed_db) eval_txt_db = TxtTokLmdb(opts.txt_db, -1) eval_dataset = ItmEvalDataset(eval_txt_db, eval_img_db, opts.batch_size) # Prepare model checkpoint = torch.load(opts.checkpoint) model = UniterForImageTextRetrieval.from_pretrained(opts.model_config, checkpoint, img_dim=IMG_DIM) if 'rank_output' not in checkpoint: model.init_output() # zero shot setting model.to(device) model = amp.initialize(model, enabled=opts.fp16, opt_level='O2') eval_dataloader = DataLoader(eval_dataset, batch_size=1, num_workers=opts.n_workers, pin_memory=opts.pin_mem, collate_fn=itm_eval_collate) eval_dataloader = PrefetchLoader(eval_dataloader) eval_log, results = evaluate(model, eval_dataloader) if hvd.rank() == 0: if not exists(opts.output_dir) and rank == 0: os.makedirs(opts.output_dir) with open(f'{opts.output_dir}/config.json', 'w') as f: json.dump(vars(opts), f) with open(f'{opts.output_dir}/results.bin', 'wb') as f: pickle.dump(results, f) with open(f'{opts.output_dir}/scores.json', 'w') as f: json.dump(eval_log, f) LOGGER.info(f'evaluation finished') LOGGER.info( f"======================== Results =========================\n" f"image retrieval R1: {eval_log['img_r1']*100:.2f},\n" f"image retrieval R5: {eval_log['img_r5']*100:.2f},\n" f"image retrieval R10: {eval_log['img_r10']*100:.2f}\n" f"text retrieval R1: {eval_log['txt_r1']*100:.2f},\n" f"text retrieval R5: {eval_log['txt_r5']*100:.2f},\n" f"text retrieval R10: {eval_log['txt_r10']*100:.2f}") LOGGER.info("========================================================")
def process_dataset(dataset_name, args): LOGGER.info('{:15} - Start'.format(dataset_name)) LOGGER.info('{:15} - Retrieving trained embedding'.format(dataset_name)) pre_trained_embedding = embeddings.get_embedding_model( args.pre_trained_embedding, binary=False, first_line_header=True, with_gensim=True) try: trained_embedding = dataset_helper.get_w2v_embedding_for_dataset( dataset_name) except FileNotFoundError as e: LOGGER.exception(e) return cmap_cache_files = dataset_helper.get_all_cached_graph_datasets( dataset_name=dataset_name, graph_type=constants.TYPE_CONCEPT_MAP) coo_cache_files = [ x for x in dataset_helper.get_all_cached_graph_datasets( dataset_name=dataset_name, graph_type=constants.TYPE_COOCCURRENCE) if 'all' in x ] if not len(cmap_cache_files) or not len(coo_cache_files): return used_graphs = [cmap_cache_files[0], coo_cache_files[0]] LOGGER.info('{:15} - Retrieving dataset'.format(dataset_name)) all_labels = set() for graph_cache_file in used_graphs: X, _ = dataset_helper.get_dataset_cached(graph_cache_file) X = graph_helper.get_graphs_only(X) all_labels |= graph_helper.get_all_node_labels_uniq( X, as_sorted_list=False) LOGGER.info('{:15} - Resolving embeddings'.format(dataset_name)) embeddings_pre_trained, not_found_pre_trained_coreferenced, not_found_trained, not_found_pre_trained, lookup, similar_els = embeddings.get_embeddings_for_labels_with_lookup( all_labels, trained_embedding, pre_trained_embedding) LOGGER.info('{:15} - Missing'.format(dataset_name)) for label, s in [('trained', not_found_trained), ('pre_trained', not_found_pre_trained), ('after_coreference', not_found_pre_trained_coreferenced) ]: LOGGER.info('\t{:20} {:>6}'.format(label, len(s))) embedding_file = '{}/{}.w2v.txt'.format(args.embeddings_result_folder, dataset_name) embeddings.save_embedding_dict(embeddings_pre_trained, embedding_file) embeddings_pre_trained = embeddings.load_word2vec_format( fname=embedding_file, binary=False) LOGGER.info('{:15} - Co-reference resolution'.format(dataset_name)) max_topn = max(args.topn) similar_labels = coreference.get_most_similar_labels( all_labels, embeddings_pre_trained, max_topn) for topn in args.topn: for threshold in args.merge_threshold: LOGGER.info( '{:15} - Co-reference resolution: topn: {}, threshold: {}'. format(dataset_name, topn, threshold)) clique_lookup = coreference.create_label_cliques_by_similarity( similar_labels, threshold=threshold, topn=topn) new_lookup = embeddings.merge_lookups(clique_lookup, lookup) with open( '{}/{}.threshold-{}.topn-{}.label-lookup.npy'.format( args.embeddings_result_folder, dataset_name, threshold, topn), 'wb') as f: pickle.dump(new_lookup, f) LOGGER.info('{:15} - Finished'.format(dataset_name))
def initialize(): LOGGER.info('Initializing sensor module') SESSION_INFO.TOKEN = obtain_system_token() LOGGER.info('Sensor module initialized successfully')
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() opts.rank = rank LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if opts.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( opts.gradient_accumulation_steps)) set_random_seed(opts.seed) # train_examples = None LOGGER.info(f"Loading Train Dataset {opts.train_txt_db}, " f"{opts.train_img_dir}") if 'paired' in opts.model: DatasetCls = Nlvr2PairedDataset EvalDatasetCls = Nlvr2PairedEvalDataset collate_fn = nlvr2_paired_collate eval_collate_fn = nlvr2_paired_eval_collate if opts.model == 'paired': ModelCls = UniterForNlvr2Paired elif opts.model == 'paired-attn': ModelCls = UniterForNlvr2PairedAttn else: raise ValueError('unrecognized model type') elif opts.model == 'triplet': DatasetCls = Nlvr2TripletDataset EvalDatasetCls = Nlvr2TripletEvalDataset ModelCls = UniterForNlvr2Triplet collate_fn = nlvr2_triplet_collate eval_collate_fn = nlvr2_triplet_eval_collate else: raise ValueError('unrecognized model type') # data loaders train_dataloader = create_dataloader(opts.train_img_db, opts.train_txt_db, opts.train_batch_size, True, DatasetCls, collate_fn, opts) val_dataloader = create_dataloader(opts.val_img_db, opts.val_txt_db, opts.val_batch_size, False, EvalDatasetCls, eval_collate_fn, opts) test_dataloader = create_dataloader(opts.test_img_db, opts.test_txt_db, opts.val_batch_size, False, EvalDatasetCls, eval_collate_fn, opts) # Prepare model if opts.checkpoint: checkpoint = torch.load(opts.checkpoint) else: checkpoint = {} model = ModelCls.from_pretrained(opts.model_config, state_dict=checkpoint, img_dim=IMG_DIM) model.init_type_embedding() model.to(device) # make sure every process has same model parameters in the beginning broadcast_tensors([p.data for p in model.parameters()], 0) set_dropout(model, opts.dropout) # Prepare optimizer optimizer = build_optimizer(model, opts) model, optimizer = amp.initialize(model, optimizer, enabled=opts.fp16, opt_level='O2') global_step = 0 if rank == 0: save_training_meta(opts) TB_LOGGER.create(join(opts.output_dir, 'log')) pbar = tqdm(total=opts.num_train_steps) model_saver = ModelSaver(join(opts.output_dir, 'ckpt')) os.makedirs(join(opts.output_dir, 'results')) # store val predictions add_log_to_file(join(opts.output_dir, 'log', 'log.txt')) else: LOGGER.disabled = True pbar = NoOp() model_saver = NoOp() LOGGER.info(f"***** Running training with {n_gpu} GPUs *****") LOGGER.info(" Num examples = %d", len(train_dataloader.dataset)) LOGGER.info(" Batch size = %d", opts.train_batch_size) LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) LOGGER.info(" Num steps = %d", opts.num_train_steps) running_loss = RunningMeter('loss') model.train() n_examples = 0 n_epoch = 0 start = time() # quick hack for amp delay_unscale bug optimizer.zero_grad() optimizer.step() while True: for step, batch in enumerate(train_dataloader): targets = batch['targets'] n_examples += targets.size(0) loss = model(**batch, compute_loss=True) loss = loss.mean() delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale) as scaled_loss: scaled_loss.backward() if not delay_unscale: # gather gradients from every processes # do this before unscaling to make sure every process uses # the same gradient scale grads = 
[ p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None ] all_reduce_and_rescale_tensors(grads, float(1)) running_loss(loss.item()) if (step + 1) % opts.gradient_accumulation_steps == 0: global_step += 1 # learning rate scheduling lr_this_step = get_lr_sched(global_step, opts) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step TB_LOGGER.add_scalar('lr', lr_this_step, global_step) # log loss losses = all_gather_list(running_loss) running_loss = RunningMeter( 'loss', sum(l.val for l in losses) / len(losses)) TB_LOGGER.add_scalar('loss', running_loss.val, global_step) TB_LOGGER.step() # update model params if opts.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), opts.grad_norm) TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step) optimizer.step() optimizer.zero_grad() pbar.update(1) if global_step % 100 == 0: # monitor training throughput tot_ex = sum(all_gather_list(n_examples)) ex_per_sec = int(tot_ex / (time() - start)) LOGGER.info(f'Step {global_step}: ' f'{tot_ex} examples trained at ' f'{ex_per_sec} ex/s') TB_LOGGER.add_scalar('perf/ex_per_s', ex_per_sec, global_step) if global_step % opts.valid_steps == 0: for split, loader in [('val', val_dataloader), ('test', test_dataloader)]: LOGGER.info(f"Step {global_step}: start running " f"validation on {split} split...") log, results = validate(model, loader, split) with open( f'{opts.output_dir}/results/' f'{split}_results_{global_step}_' f'rank{rank}.csv', 'w') as f: for id_, ans in results: f.write(f'{id_},{ans}\n') TB_LOGGER.log_scaler_dict(log) model_saver.save(model, global_step) if global_step >= opts.num_train_steps: break if global_step >= opts.num_train_steps: break n_epoch += 1 LOGGER.info(f"Step {global_step}: finished {n_epoch} epochs") for split, loader in [('val', val_dataloader), ('test', test_dataloader)]: LOGGER.info(f"Step {global_step}: start running " f"validation on {split} split...") log, results = validate(model, loader, split) with open( f'{opts.output_dir}/results/' f'{split}_results_{global_step}_' f'rank{rank}_final.csv', 'w') as f: for id_, ans in results: f.write(f'{id_},{ans}\n') TB_LOGGER.log_scaler_dict(log) model_saver.save(model, f'{global_step}_final')
def next(self):
    if self._current >= len(self._queue):
        self._feed_queue()

    res = self._queue[self._current]
    LOGGER.debug('Parsing mail...')
    try:
        self._parser = MailReaderFactory.get_reader_for_mail(res)
        self._current += 1
    except Exception as ex:
        LOGGER.error('Error while parsing mail #%s', self._uids[self._current])
        LOGGER.error('Unable to determine source of this mail (raw content follows): %s', ex)
        LOGGER.error('Retrieved email:\n%s', res)
        LOGGER.debug('-- Recovery mode --')
        # Add this uid to the failed list so we never retry parsing this mail
        self._failed_uids.append(self._uids[self._current])
        # Remove the uid from the list so this email won't be deleted
        self._uids.remove(self._uids[self._current])
        # Remove the mail from the queue
        self._queue.remove(res)
        LOGGER.debug('Ok. Now, try to fetch another mail...')
        # Try to fetch the next mail instead
        return self.next()
    return res
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

with timer("data preparation"):
    cancer = load_breast_cancer()
    data = cancer.data
    target = cancer.target
    train_x, valid_x, train_y, valid_y = train_test_split(data,
                                                          target,
                                                          stratify=target,
                                                          random_state=2)
    train_x = pd.DataFrame(train_x, columns=cancer.feature_names)
    valid_x = pd.DataFrame(valid_x, columns=cancer.feature_names)

with timer("model inference"):
    preds = 0  # TODO: replace this part with the stacking model
    for i, model in enumerate(models):
        oof_pred, pred = model.predict(train_x, valid_x, train_y)
        preds += pred
        # f1_score expects (y_true, y_pred)
        LOGGER.info(
            f1_score(valid_y, np.argmax(pred, axis=1), average="binary"))
    LOGGER.info(
        f1_score(valid_y, np.argmax(preds / len(models), axis=1),
                 average="binary"))  # seed average
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() opts.rank = rank LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if opts.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( opts.gradient_accumulation_steps)) set_random_seed(opts.seed) # train_examples = None LOGGER.info(f"Loading Train Dataset {opts.train_txt_db}, " f"{opts.train_img_db}") train_dataloader = create_dataloader( opts.train_img_db, opts.train_txt_db, opts.train_batch_size, True, VeDataset, ve_collate, opts, ) val_dataloader = create_dataloader( opts.val_img_db, opts.val_txt_db, opts.val_batch_size, False, VeEvalDataset, ve_eval_collate, opts, ) test_dataloader = create_dataloader( opts.test_img_db, opts.test_txt_db, opts.val_batch_size, False, VeEvalDataset, ve_eval_collate, opts, ) # Prepare model if opts.checkpoint: checkpoint = torch.load(opts.checkpoint) else: checkpoint = {} bert_model = json.load(open(f"{opts.train_txt_db}/meta.json"))["bert"] if "bert" not in bert_model: bert_model = "bert-large-cased" # quick hack for glove exp model = UniterForVisualEntailment.from_pretrained(opts.model_config, state_dict=checkpoint, img_dim=IMG_DIM) model.to(device) # make sure every process has same model parameters in the beginning broadcast_tensors([p.data for p in model.parameters()], 0) set_dropout(model, opts.dropout) # Prepare optimizer optimizer = build_optimizer(model, opts) model, optimizer = amp.initialize(model, optimizer, enabled=opts.fp16, opt_level="O2") global_step = 0 if rank == 0: save_training_meta(opts) TB_LOGGER.create(join(opts.output_dir, "log")) pbar = tqdm(total=opts.num_train_steps) model_saver = ModelSaver(join(opts.output_dir, "ckpt")) pickle.dump(ans2label, open(join(opts.output_dir, "ckpt", "ans2label.pkl"), "wb")) os.makedirs(join(opts.output_dir, "results")) # store VQA predictions add_log_to_file(join(opts.output_dir, "log", "log.txt")) else: LOGGER.disabled = True pbar = NoOp() model_saver = NoOp() LOGGER.info(f"***** Running training with {n_gpu} GPUs *****") LOGGER.info(" Num examples = %d", len(train_dataloader.dataset)) LOGGER.info(" Batch size = %d", opts.train_batch_size) LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) LOGGER.info(" Num steps = %d", opts.num_train_steps) running_loss = RunningMeter("loss") model.train() n_examples = 0 n_epoch = 0 start = time() # quick hack for amp delay_unscale bug optimizer.zero_grad() optimizer.step() while True: for step, batch in enumerate(train_dataloader): n_examples += batch["input_ids"].size(0) loss = model(batch, compute_loss=True) loss = loss.mean() * batch["targets"].size(1) # instance-leval bce delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale) as scaled_loss: scaled_loss.backward() if not delay_unscale: # gather gradients from every processes # do this before unscaling to make sure every process uses # the same gradient scale grads = [ p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None ] all_reduce_and_rescale_tensors(grads, float(1)) running_loss(loss.item()) if (step + 1) % opts.gradient_accumulation_steps == 0: global_step += 1 # learning rate scheduling lr_this_step = get_lr_sched(global_step, opts) for param_group in optimizer.param_groups: param_group["lr"] = 
lr_this_step TB_LOGGER.add_scalar("lr", lr_this_step, global_step) # log loss # NOTE: not gathered across GPUs for efficiency TB_LOGGER.add_scalar("loss", running_loss.val, global_step) TB_LOGGER.step() # update model params if opts.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), opts.grad_norm) TB_LOGGER.add_scalar("grad_norm", grad_norm, global_step) optimizer.step() optimizer.zero_grad() pbar.update(1) if global_step % 100 == 0: # monitor training throughput LOGGER.info(f"============Step {global_step}=============") tot_ex = sum(all_gather_list(n_examples)) ex_per_sec = int(tot_ex / (time() - start)) LOGGER.info(f"{tot_ex} examples trained at " f"{ex_per_sec} ex/s") TB_LOGGER.add_scalar("perf/ex_per_s", ex_per_sec, global_step) LOGGER.info(f"===========================================") if global_step % opts.valid_steps == 0: for split, loader in [ ("val", val_dataloader), ("test", test_dataloader), ]: LOGGER.info(f"Step {global_step}: start running " f"validation on {split} split...") val_log, results = validate(model, loader, label2ans, split) with open( f"{opts.output_dir}/results/" f"{split}_results_{global_step}_" f"rank{rank}.json", "w", ) as f: json.dump(results, f) TB_LOGGER.log_scaler_dict(val_log) model_saver.save(model, global_step) if global_step >= opts.num_train_steps: break if global_step >= opts.num_train_steps: break n_epoch += 1 LOGGER.info(f"Step {global_step}: finished {n_epoch} epochs") if opts.num_train_steps % opts.valid_steps != 0: for split, loader in [("val", val_dataloader), ("test", test_dataloader)]: LOGGER.info(f"Step {global_step}: start running " f"validation on {split} split...") val_log, results = validate(model, loader, label2ans, split) with open( f"{opts.output_dir}/results/" f"{split}_results_{global_step}_" f"rank{rank}_final.json", "w", ) as f: json.dump(results, f) TB_LOGGER.log_scaler_dict(val_log) model_saver.save(model, global_step)
def inf_mlm(model, eval_loader, eval_len, label2ans, save_logits=False,
            task='mlm', predict_p=0, ensemble=1, text_only=False):
    LOGGER.info("start running evaluation {}...".format(task))
    model.eval()
    n_ex = 0
    st = time()
    results = []
    logits = {}
    pbar = tqdm(total=eval_len)
    for i, batch in enumerate(eval_loader):
        qids = batch['qids']
        scores = model(batch, compute_loss=False, task=task,
                       text_only=text_only)
        if scores.nelement() == 0:
            masked_toks = iter([])
        else:
            if predict_p > 0:
                assert predict_p <= 1, \
                    "Invalid prediction probability threshold {}".format(predict_p)
                softmax_scores = torch.nn.Softmax(dim=1)(scores)
                max_scores = softmax_scores.max(dim=-1, keepdim=False)
                scores = max_scores[0].cpu().tolist()
                indices = max_scores[1].cpu().tolist()
                masked_toks = []
                for max_scores_i in range(0, len(scores)):
                    if scores[max_scores_i] >= predict_p:
                        masked_toks.append(indices[max_scores_i])
                    else:
                        masked_toks.append(-1)
            else:
                masked_toks = scores.max(dim=-1, keepdim=False)[1].cpu().tolist()
            if ensemble > 1:
                masked_toks = torch.topk(scores, ensemble,
                                         dim=-1)[1].cpu().tolist()
            masked_toks = iter(masked_toks)
        for qid, q_toks in zip(qids, batch['input_ids']):
            predicted_toks = []
            for tok in q_toks:
                tok = tok.item()
                if tok == 103:  # 103 is the [MASK] token id in the BERT vocabulary
                    predicted_toks.append(next(masked_toks))
            results.append({
                'predicted_toks': predicted_toks,
                'question_id': qid
            })
        n_ex += len(qids)
        pbar.update(len(qids))
        # TODO: don't commit, for testing only
        # if i > 4:
        #     break
    n_ex = sum(all_gather_list(n_ex))
    tot_time = time() - st
    val_log = {'valid/ex_per_s': n_ex / tot_time}
    LOGGER.info(f"evaluation finished in {int(tot_time)} seconds "
                f"at {int(n_ex/tot_time)} examples per second")
    return val_log, results, logits
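# The results above store raw vocabulary ids for each [MASK] position (token id
# 103, i.e. [MASK] in the bert-base-uncased vocabulary). A minimal sketch of
# turning one result entry back into readable tokens under the default settings
# (ensemble=1, predict_p=0); the tokenizer name is an assumption and the actual
# pipeline may post-process the ids differently.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def decode_result(result):
    """Map the predicted vocabulary ids of one result entry to token strings."""
    return tokenizer.convert_ids_to_tokens(result['predicted_toks'])

# e.g. decode_result(results[0]) returns one token string per [MASK] position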
'''
Author: Ribbon Huang
Wrapper around the MongoDB client
'''
from utils.logger import LOGGER
import pymongo
from conf.settings import MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_SHEET, LOGGER_MONGO_NAME
from pymongo.errors import WriteError, WTimeoutError, ConnectionFailure
import numpy as np
import pandas as pd

# Logger for day-to-day logging
logger = LOGGER.createLogger(LOGGER_MONGO_NAME)


class MongoUse:

    def __init__(self):
        try:
            self.client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
            db = self.client[MONGO_DB]
            self.sheet = db[MONGO_SHEET]
        except ConnectionFailure:
            logger.warning('MongoDB ConnectionFailure')
        except TypeError:
            logger.warning('MongoDB connection settings are invalid')

    def insertDb(self, info):
        try:
            self.sheet.insert(info)
        except WriteError:
            # handler assumed: log the failed write, matching the other handlers
            logger.warning('MongoDB WriteError')
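# A minimal usage sketch for the wrapper above; the document fields are made up
# for illustration, and the connection settings come from conf.settings as
# imported above.
mongo = MongoUse()
mongo.insertDb({
    'ip': '192.0.2.1',       # hypothetical example document
    'timestamp': 1500000000,
    'source': 'example',
})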
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) opts.n_gpu = n_gpu LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if hvd.rank() != 0: LOGGER.disabled = True set_random_seed(opts.seed) # train_examples = None LOGGER.info(f"Loading the whole video dataset {opts.sub_txt_db}, " f"{opts.vfeat_db}") video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db, opts.vfeat_interval, opts) # data loaders # train LOGGER.info(f"Loading the train QA dataset {opts.train_query_txt_db}") video_ids = get_video_ids(opts.train_query_txt_db) train_q_txt_db = QaQueryTokLmdb(opts.train_query_txt_db, opts.max_txt_len) train_dataloaders = build_downstream_dataloaders([opts.task], video_db, video_ids, True, opts, q_txt_db=train_q_txt_db, shuffle=True) meta_loader = MetaLoader(train_dataloaders, accum_steps=opts.gradient_accumulation_steps, distributed=n_gpu > 1) meta_loader = PrefetchLoader(meta_loader) # val LOGGER.info(f"Loading the val QA dataset {opts.val_query_txt_db}") video_ids = get_video_ids(opts.val_query_txt_db) val_q_txt_db = QaQueryTokLmdb(opts.val_query_txt_db, -1) val_dataloaders = build_downstream_dataloaders([opts.task], video_db, video_ids, False, opts, q_txt_db=val_q_txt_db) if opts.test_query_txt_db: LOGGER.info(f"Loading the test QA dataset {opts.test_query_txt_db}") video_ids = get_video_ids(opts.test_query_txt_db) test_q_txt_db = QaQueryTokLmdb(opts.test_query_txt_db, -1) test_dataloaders = build_downstream_dataloaders([opts.task], video_db, video_ids, False, opts, q_txt_db=test_q_txt_db) # Prepare model if opts.checkpoint: checkpoint = torch.load(opts.checkpoint) else: checkpoint = {} img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\ ".position_embeddings.weight" if img_pos_embed_weight_key in checkpoint: max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key]) else: max_frm_seq_len = MAX_FRM_SEQ_LEN model = HeroForVideoQA.from_pretrained(opts.model_config, state_dict=checkpoint, vfeat_dim=VFEAT_DIM, max_frm_seq_len=max_frm_seq_len) model.to(device) # make sure every process has same model parameters in the beginning broadcast_tensors([p.data for p in model.parameters()], 0) set_dropout(model, opts.dropout) # Prepare optimizer optimizer = build_optimizer(model, opts) task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())} model, optimizer = amp.initialize(model, optimizer, num_losses=len(task2scaler), enabled=opts.fp16, opt_level='O2') restorer = TrainingRestorer(opts, model, optimizer) global_step = restorer.global_step TB_LOGGER.global_step = global_step if hvd.rank() == 0: save_training_meta(opts) TB_LOGGER.create(join(opts.output_dir, 'log')) pbar = tqdm(total=opts.num_train_steps) model_saver = ModelSaver(join(opts.output_dir, 'ckpt')) if not exists(join(opts.output_dir, 'results')): # store tvqa predictions os.makedirs(join(opts.output_dir, 'results')) add_log_to_file(join(opts.output_dir, 'log', 'log.txt')) else: LOGGER.disabled = True pbar = NoOp() model_saver = NoOp() restorer = NoOp() if global_step > 0: pbar.update(global_step) LOGGER.info(f"***** Running training with {n_gpu} GPUs *****") LOGGER.info(" Batch size = %d", opts.train_batch_size) LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) LOGGER.info(" Num steps = %d", opts.num_train_steps) task2loss = { task: RunningMeter(f'loss/{task}') for task in train_dataloaders.keys() } for obj in (f'{opts.task}_qa', 
f'{opts.task}_st_ed'): task2loss[obj] = RunningMeter(f'loss/{obj}') model.train() n_examples = defaultdict(int) start = time() # quick hack for amp delay_unscale bug optimizer.zero_grad() if global_step == 0: optimizer.step() for step, (task, batch) in enumerate(meta_loader): n_examples[task] += opts.train_batch_size loss = model(batch, task=task, compute_loss=True) loss_qa, loss_st_ed = loss loss = loss_qa + opts.lw_st_ed * loss_st_ed for n, ls in (('st_ed', loss_st_ed), ('qa', loss_qa)): ls = ls.item() task2loss[f'{task}_{n}'](ls) loss = loss.mean() task2loss[task](loss.item()) delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale, loss_id=task2scaler[task]) as scaled_loss: scaled_loss.backward() if not delay_unscale: # gather gradients from every processes # do this before unscaling to make sure every process uses # the same gradient scale grads = [ p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None ] all_reduce_and_rescale_tensors(grads, float(1)) if (step + 1) % opts.gradient_accumulation_steps == 0: global_step += 1 # learning rate scheduling lr_this_step = get_lr_sched(global_step, opts) for i, param_group in enumerate(optimizer.param_groups): if i == 0 or i == 1: param_group['lr'] = lr_this_step * opts.lr_mul elif i == 2 or i == 3: param_group['lr'] = lr_this_step else: raise ValueError() TB_LOGGER.add_scalar('lr', lr_this_step, global_step) TB_LOGGER.log_scaler_dict({ temp_loss.name: temp_loss.val for temp_loss in task2loss.values() if temp_loss.val is not None }) TB_LOGGER.step() # update model params if opts.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), opts.grad_norm) TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step) optimizer.step() optimizer.zero_grad() restorer.step() pbar.update(1) if global_step % 100 == 0: # monitor training throughput LOGGER.info('-------------------------------------------') LOGGER.info(f'Step {global_step}:') for t in train_dataloaders.keys(): tot_ex = sum(all_gather_list(n_examples[t])) ex_per_sec = int(tot_ex / (time() - start)) LOGGER.info(f'{t}: {tot_ex} examples trained at ' f'{ex_per_sec} ex/s') TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec, global_step) if global_step % opts.valid_steps == 0: LOGGER.info('===========================================') LOGGER.info(f"Step {global_step}: start running validation") validate(model, val_dataloaders, "val", opts, global_step=global_step) if opts.test_query_txt_db: validate(model, test_dataloaders, "test", opts, global_step=global_step) LOGGER.info('===========================================') model_saver.save(model, global_step) if global_step >= opts.num_train_steps: break LOGGER.info('===========================================') if global_step % opts.valid_steps != 0: LOGGER.info('===========================================') LOGGER.info(f"Step {global_step}: start running validation") validate(model, val_dataloaders, "val", opts, global_step=global_step) if opts.test_query_txt_db: validate(model, test_dataloaders, "test", opts, global_step=global_step) LOGGER.info('===========================================') model_saver.save(model, f'{global_step}_final')