예제 #1
0
    def __init__(self, opt, padding_idx4item=0, padding_idx4prefer=0):
        """Build the embeddings, the three encoders and the loss functions.

        Args:
            opt: Mapping of hyper-parameters. Keys read directly here:
                ``batch_size``, ``max_length``, ``dropout``, ``vocab_size``,
                ``user_size``, ``dim`` and ``embedding_size``.
                NOTE: the mapping is mutated -- ``opt['num_layers']`` is
                overwritten with 2 before the encoders are constructed.
            padding_idx4item: Stored as ``pad_idx4item`` and passed as the
                third argument of both ``_create_embeddings`` calls.
            padding_idx4prefer: Stored as ``pad_idx4prefer``; it is not used
                anywhere else inside this constructor.
        """
        super().__init__()  # self.pad_idx, self.start_idx, self.end_idx)
        self.batch_size = opt['batch_size']
        self.max_length = opt['max_length']
        # NOTE(review): this raw value is replaced further down, where
        # ``self.dropout`` is rebound to an ``nn.Dropout`` module.
        self.dropout = opt['dropout']
        self.num_layers = 2  # hard-coded instead of opt['num_layers']
        self.vocab_size = opt['vocab_size']
        self.user_size = opt['user_size']
        self.dim = opt['dim']
        self.embedding_size = opt['embedding_size']

        self.pad_idx4item = padding_idx4item
        self.pad_idx4prefer = padding_idx4prefer

        # Item embedding table sized by the vocabulary.
        self.embeddings = _create_embeddings(self.vocab_size,
                                             self.embedding_size,
                                             self.pad_idx4item)
        # NOTE(review): the user table also receives the *item* padding index,
        # not ``pad_idx4prefer`` -- confirm this is intended.
        self.user_embeddings = _create_embeddings(self.user_size,
                                                  self.embedding_size,
                                                  self.pad_idx4item)
        # One learned position vector per position up to ``max_length``.
        self.position_embeddings = nn.Embedding(opt['max_length'], opt['dim'])
        self.LayerNorm = LayerNorm(opt['dim'], eps=1e-12)
        # Rebinds ``self.dropout`` from the raw rate to a dropout module.
        self.dropout = nn.Dropout(opt['dropout'])

        # Side effect on the caller's mapping: every Encoder built below (and
        # any later reader of ``opt``) sees ``num_layers == 2``.
        opt['num_layers'] = 2

        # Three independent Encoder instances built from the same options.
        self.SAS_encoder = Encoder(opt)
        self.prefer_SAS_encoder = Encoder(opt)
        self.neg_SAS_encoder = Encoder(opt)

        self.item_norm = nn.Linear(opt['dim'], opt['dim'])

        # Binary cross-entropy and multi-class cross-entropy criteria.
        self.criterion = nn.BCELoss()
        self.cs_loss = nn.CrossEntropyLoss()
    def __init__(self, sentences, context=1, hidden=5, concat=False):
        """Initialise the CBOW weight matrices and train them on ``sentences``.

        Training happens inside the constructor: every (word, context) pair
        produced by ``sentence2contexts`` is passed to ``self._train``, and a
        throughput message is logged after every 100 pairs.

        Args:
            sentences: Iterable of sentences. It is handed to ``Encoder`` and
                then iterated again for training, so it should be re-iterable
                (a one-shot generator may already be exhausted by the time
                the training loop runs -- TODO confirm against ``Encoder``).
            context: Context size forwarded to ``sentence2contexts``; also the
                input multiplier when ``concat`` is true.
            hidden: Number of hidden units.
            concat: When true the input layer is ``context`` encodings wide,
                otherwise a single encoding wide.
        """
        logging.info(msg="starting CBOW training..")
        self.context = context
        self.encoder = Encoder(sentences=sentences)
        self.huffman_encoder = HuffmanEncoder(self.encoder.counter)
        self.encoding_length = self.encoder.encoding_length
        self.hidden_units = hidden
        self.output_units = 1
        self.input_units = context if concat else 1
        # Small random weights in [0, 0.1).
        self.input2hidden = np.random.rand(
            self.hidden_units, self.input_units * self.encoding_length) * 0.1
        # The output layer has ``encoding_length - 1`` rows per output unit.
        self.hidden2output = np.random.rand(
            self.output_units * self.encoding_length - 1,
            self.hidden_units) * 0.1

        # train model
        word_count = 0
        last_time = time.time()
        for sentence in sentences:
            context_pairs = sentence2contexts(sentence, self.context)
            for w, c in context_pairs:
                self._train(w, c)
                word_count += 1
                if word_count % 100 == 0:
                    now = time.time()
                    elapsed = now - last_time
                    # time.time() has finite resolution and is not monotonic,
                    # so ``elapsed`` can be zero (or negative) for a fast
                    # batch; report an infinite rate instead of crashing with
                    # ZeroDivisionError in the middle of training.
                    if elapsed > 0:
                        words_per_sec = 1.0 / elapsed * 100
                    else:
                        words_per_sec = float("inf")
                    logging.info(msg="trained on %s words. %s words/sec" %
                                 (word_count, words_per_sec))
                    last_time = time.time()
예제 #3
0
def create_encoders(data: List[Tuple[Name, Lang]]) \
        -> Tuple[Encoder[Char], Encoder[Lang]]:
    """Build the character encoder and the language encoder for ``data``.

    The first encoder is fed every character of every name in ``data``; the
    second is fed the language of every (name, language) pair.

    Returns:
        The pair ``(char_enc, lang_enc)``.
    """
    # Lazily stream all characters, name by name, into the first encoder.
    all_chars = (ch for person_name, _ in data for ch in person_name)
    char_enc = Encoder(all_chars)

    # Then stream the language labels into the second encoder.
    all_langs = (language for _, language in data)
    lang_enc = Encoder(all_langs)

    return char_enc, lang_enc
예제 #4
0
def make_model(cnn3d,
               tgt_vocab,
               N=6,
               d_model=512,
               d_ff=2048,
               h=8,
               dropout=0.1):
    """Assemble an ``EncoderDecoder`` model from hyperparameters.

    Args:
        cnn3d: Passed through unchanged as the last ``EncoderDecoder``
            argument; parameters whose name starts with ``"cnn3d"`` are left
            out of the Xavier re-initialisation below.
        tgt_vocab: Target vocabulary size given to ``Embeddings`` and
            ``Generator``.
        N: Layer count given to both ``Encoder`` and ``Decoder``.
        d_model: Model width.
        d_ff: Width given to ``PositionwiseFeedForward``.
        h: Head count given to ``MultiHeadedAttention``.
        dropout: Dropout rate shared by the sub-modules.

    Returns:
        The constructed model.
    """
    clone = copy.deepcopy
    attention = MultiHeadedAttention(h, d_model)
    feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    pos_encoding = PositionalEncoding(d_model, dropout)

    # Each component gets its own deep copy of the prototype modules, built
    # in the same order as the arguments of the EncoderDecoder call.
    encoder_stack = Encoder(
        EncoderLayer(d_model, clone(attention), clone(feed_forward), dropout),
        N)
    decoder_stack = Decoder(
        DecoderLayer(d_model, clone(attention), clone(attention),
                     clone(feed_forward), dropout), N)
    source_positions = nn.Sequential(clone(pos_encoding))
    target_embedding = nn.Sequential(Embeddings(d_model, tgt_vocab),
                                     clone(pos_encoding))
    generator = Generator(d_model, tgt_vocab)

    model = EncoderDecoder(encoder_stack, decoder_stack, source_positions,
                           target_embedding, generator, cnn3d)

    # Glorot / fan_avg initialisation for every trainable tensor with more
    # than one dimension, except those belonging to the cnn3d sub-module.
    for param_name, param in model.named_parameters():
        if param_name.startswith("cnn3d"):
            continue
        if param.requires_grad and param.dim() > 1:
            nn.init.xavier_uniform_(param)

    return model
예제 #5
0
파일: model.py 프로젝트: ForeverPs/SSD
    def __init__(self, backbone=None, num_classes=21):
        """Wire a backbone to the SSD300 location and confidence heads.

        Args:
            backbone: Feature extractor; it must expose ``out_channels``
                (the default ``None`` therefore fails on first use).
            num_classes: Number of classes predicted per default box.
        """
        super(SSD300, self).__init__()
        self.feature_extractor = backbone
        self.num_classes = num_classes
        # Default boxes per location, one entry per feature map.
        self.num_defaults = [4, 6, 6, 6, 4, 4]
        # Per the original note, out_channels is
        # [1024, 512, 512, 256, 256, 256] for resnet50.
        self._build_additional_features(self.feature_extractor.out_channels)

        # One 3x3 conv per feature map for box regression (4 values per
        # default box) and one for classification (num_classes per box).
        # The two heads are created pairwise so that module construction
        # order is unchanged.
        loc_heads = []
        conf_heads = []
        head_specs = zip(self.num_defaults,
                         self.feature_extractor.out_channels)
        for boxes_per_cell, in_channels in head_specs:
            loc_heads.append(
                nn.Conv2d(in_channels,
                          boxes_per_cell * 4,
                          kernel_size=3,
                          padding=1))
            conf_heads.append(
                nn.Conv2d(in_channels,
                          boxes_per_cell * self.num_classes,
                          kernel_size=3,
                          padding=1))

        self.loc = nn.ModuleList(loc_heads)
        self.conf = nn.ModuleList(conf_heads)
        self._init_weights()

        # The same default-box object (shape [8732, 4] per the original
        # note) is shared by the loss, the encoder and the post-processing.
        anchors = dboxes300()
        self.compute_loss = Loss(anchors)
        self.encoder = Encoder(anchors)
        self.postprocess = PostProcess(anchors)
예제 #6
0
class InputTransformer:
    """Turn raw samples and labels into padded code sequences and
    binarized labels."""

    def __init__(self):
        self.encoder = Encoder()

    def _char_code(self, char):
        """Return the encoder's code for ``char`` shifted up by one.

        Presumably the shift keeps 0 free for padding -- TODO confirm.
        """
        return self.encoder.transform(char) + 1

    def transform(self, X_train, y_train, augment):
        """Encode, pad and binarize a training set.

        Args:
            X_train: Iterable of raw samples (copied into a list first).
            y_train: Iterable of labels (copied into a list first).
            augment: Optional callable ``(X, y) -> (X, y)`` applied before
                encoding; skipped when ``None``.

        Returns:
            ``(X, y)`` where ``X`` is the output of ``pad_sequences`` with
            ``maxlen=600`` and ``y`` is the ``LabelBinarizer`` output.
        """
        samples = list(X_train)
        labels = list(y_train)
        print('before augmenting', len(samples))
        if augment is not None:
            samples, labels = augment(samples, labels)

        print('after augmetning', len(samples), len(labels))

        encoded = [
            preprocess_chars(ingredients, self._char_code)
            for ingredients in samples
        ]

        # Sequence-length statistics, printed for inspection only.
        lengths = numpy.array([len(codes) for codes in encoded])
        print(lengths.min(), lengths.mean(), lengths.max(), lengths.std())

        padded = sequence.pad_sequences(encoded, maxlen=600)

        print("ingredients")
        print(padded[:3])

        # A fresh binarizer is fitted on every call.
        binarizer = LabelBinarizer()
        binarized = binarizer.fit_transform(labels)

        return padded, binarized
예제 #7
0
def eval_ssd300_mlperf_coco(args):
    """Load an SSD300 checkpoint and run ``coco_eval`` on COCO val2017.

    Reads ``no_cuda``, ``data``, ``checkpoint``, ``device`` and ``threshold``
    from ``args``. Returns ``None``; the result of ``coco_eval`` is discarded.
    """
    from coco import COCO

    # Only use the GPU when it was not disabled and one is actually present.
    cuda_enabled = not args.no_cuda and torch.cuda.is_available()

    default_boxes = dboxes300_coco()
    box_encoder = Encoder(default_boxes)
    val_transform = SSDTransformer(default_boxes, (300, 300), val=True)

    annotation_path = os.path.join(args.data,
                                   "annotations/instances_val2017.json")
    image_root = os.path.join(args.data, "val2017")

    coco_gt = COCO(annotation_file=annotation_path)
    val_dataset = COCODetection(image_root, annotation_path, val_transform)

    # Invert the dataset's label_map (value -> key).
    inv_map = {}
    for key, value in val_dataset.label_map.items():
        inv_map[value] = key

    model = SSD300(val_dataset.labelnum)

    print("loading model checkpoint", args.checkpoint)

    def _same_storage(storage, loc):
        # Hand back the storage exactly as torch.load deserialised it.
        return storage

    checkpoint = torch.load(args.checkpoint, map_location=_same_storage)
    model.load_state_dict(checkpoint["model"])

    if cuda_enabled:
        model.cuda(args.device)
    # The loss module is built (and moved to the GPU) but not used below.
    loss_func = Loss(default_boxes)
    if cuda_enabled:
        loss_func.cuda(args.device)

    coco_eval(model, val_dataset, coco_gt, box_encoder, inv_map,
              args.threshold, args.device)
class SceneDataset(InMemoryDataset):
    """In-memory graph dataset built from pickled scene graphs.

    Node names and edge types are turned into feature vectors with an
    ``Encoder`` created from ``config``.
    """

    def __init__(self, root, config, transform=None, pre_transform=None):
        self.config = config
        # Assigned before the base initialiser runs because ``process``
        # reads it -- presumably the base class may invoke ``process`` while
        # initialising (confirm against torch_geometric).
        self.attr_encoder = Encoder(config)

        super().__init__(root, transform, pre_transform)
        loaded = torch.load(self.processed_paths[0])
        self.data, self.slices = loaded

    @property
    def raw_file_names(self):
        """Single raw file name, taken from ``cmd_args.graph_file_name``."""
        # e.g. ["graphs.pkl"]
        return [cmd_args.graph_file_name]

    @property
    def processed_file_names(self):
        """Single processed file name, taken from ``cmd_args.dataset_name``."""
        # e.g. ['train_1000_dataset.pt']
        return [cmd_args.dataset_name]

    def download(self):
        """Nothing to download."""
        pass

    def process(self):
        """Convert every pickled graph into a ``Data`` object and save the
        collated result to the first processed path."""
        samples = []

        for path in self.raw_paths:
            # NOTE: pickle.load executes arbitrary code from the file; only
            # use raw files from a trusted source.
            with open(path, 'rb') as handle:
                graphs = pickle.load(handle)

            # graph_id restarts at 0 for each raw file.
            for idx, graph in enumerate(graphs):
                node_names = [node.name for node in graph.nodes]
                node_features = self.attr_encoder.get_embedding(node_names)

                edge_index, edge_types = graph.get_edge_info()
                edge_labels = [f"edge_{tp}" for tp in edge_types]
                edge_attrs = torch.tensor(
                    self.attr_encoder.get_embedding(edge_labels))

                sample = Data(torch.tensor(node_features),
                              torch.tensor(edge_index), edge_attrs,
                              graph.target_id)
                sample.obj_num = len(graph.scene["objects"])
                sample.graph_id = idx
                samples.append(sample)

        data, slices = self.collate(samples)
        torch.save((data, slices), self.processed_paths[0])
예제 #9
0
 def __init__(self):
     """Create empty chunk lists and the chunker, labeler and encoder."""
     # Lists that collect chunks / extracted features (positive and negative).
     self.ChunkExp, self.ChunkNegExp = [], []
     # Per the original notes: an SVM judgement model that chunks a given
     # sentence ...
     self.chunker = Chunker.Chunker()
     # ... and an NB classification model working on a chunk-level sentence.
     self.SRLabler = SRLabeler.SRLabeler()
     self.encoder = En.Encoder()
예제 #10
0
def eval_ssd_r34_mlperf_coco(args):
    """Build SSD-R34, optionally load a checkpoint, and call ``coco_eval``.

    With ``args.dummy`` set, no dataset is loaded: the ground truth, encoder,
    label map and data loader are all ``None`` and 81 labels are assumed.
    Returns ``None``.
    """
    from coco import COCO

    # Only use the GPU when it was not disabled and one is actually present.
    cuda_enabled = not args.no_cuda and torch.cuda.is_available()

    default_boxes = dboxes_R34_coco(args.image_size, args.strides)
    encoder = Encoder(default_boxes)
    val_transform = SSDTransformer(default_boxes,
                                   (args.image_size[0], args.image_size[1]),
                                   val=True)

    if args.dummy:
        cocoGt = None
        encoder = None
        inv_map = None
        val_dataloader = None
        labelnum = 81
    else:
        annotation_path = os.path.join(args.data,
                                       "annotations/instances_val2017.json")
        image_root = os.path.join(args.data, "val2017")

        cocoGt = COCO(annotation_file=annotation_path)
        val_dataset = COCODetection(image_root, annotation_path,
                                    val_transform)
        inv_map = {value: key for key, value in val_dataset.label_map.items()}

        # Accuracy runs keep the final partial batch; all other runs drop it.
        val_dataloader = DataLoader(val_dataset,
                                    batch_size=args.batch_size,
                                    shuffle=False,
                                    sampler=None,
                                    num_workers=args.workers,
                                    drop_last=not args.accuracy_mode)
        labelnum = val_dataset.labelnum

    model = SSD_R34(labelnum, strides=args.strides)

    if args.checkpoint:
        print("loading model checkpoint", args.checkpoint)

        def _same_storage(storage, loc):
            # Hand back the storage exactly as torch.load deserialised it.
            return storage

        checkpoint = torch.load(args.checkpoint, map_location=_same_storage)
        model.load_state_dict(checkpoint["model"])

    if cuda_enabled:
        model.cuda(args.device)
    coco_eval(model, val_dataloader, cocoGt, encoder, inv_map, args)
예제 #11
0
def create_encoders(
        data: List[Tuple[Inp, Out]]) -> Tuple[Encoder[Char], Encoder[POS]]:
    """Build the character encoder and the POS-tag encoder from ``data``.

    Parameters
    ----------
    data : List[Tuple[Inp, Out]]
        Input/output pairs the encoders are derived from. Pass the
        training pairs only, never development or evaluation pairs.

    Returns
    -------
    (char_enc, pos_enc) : Tuple[Encoder[Char], Encoder[POS]]
        Encoder over the input characters and encoder over the output
        POS tags.
    """
    # Flatten sentence -> word -> character and encode what comes out.
    all_chars = (ch for sentence, _ in data for token in sentence
                 for ch in token)
    char_enc = Encoder(all_chars)

    # Flatten the tag sequences of every pair and encode the tags.
    all_tags = (tag for _, tags in data for tag in tags)
    pos_enc = Encoder(all_tags)

    return (char_enc, pos_enc)
예제 #12
0
def main():
    """Load an SSD300 model and print detections for the requested images.

    Reads ``annotations``, ``model``, ``images`` and ``threshold`` from the
    parsed command-line arguments.
    """
    args = parse_args()

    # Map each COCO category id to its name, straight from the annotations.
    with open(args.annotations, 'r') as anno:
        categories = json.load(anno)['categories']
    name_map = {category['id']: category['name'] for category in categories}

    # Map labels 1..80 onto ids that skip every entry of ``deleted``: the
    # running offset grows each time the candidate id is a deleted one.
    deleted = [12, 26, 29, 30, 45, 66, 68, 69, 71, 83]
    inv_map = {}
    offset = 0
    for ssd_label in range(1, 81):
        while ssd_label + offset in deleted:
            offset += 1
        inv_map[ssd_label] = ssd_label + offset

    # One random colour (three components in [0, 1]) per id in 1..90.
    category_id_to_color = {
        cat_id: [random.uniform(0, 1) for _ in range(3)]
        for cat_id in range(1, 91)
    }

    # Matplotlib figure size.
    plt.rcParams["figure.figsize"] = (12, 8)

    # Build the network, load its weights and switch to inference mode.
    model = SSD300(81, backbone="resnet34", model_path=None, dilation=None)
    load_checkpoint(model, args.model)
    model.eval()

    # Encoder handed to print_image together with the model.
    box_encoder = Encoder(dboxes300_coco())

    for image in args.images:
        print_image(image, model, box_encoder, inv_map, name_map,
                    category_id_to_color, args.threshold)
예제 #13
0
class AssignGTtoDefaultBox(object):
    """Transform that replaces a target's boxes/labels with their encoding
    against the SSD300 default boxes."""

    def __init__(self):
        anchors = dboxes300()
        self.default_box = anchors
        self.encoder = Encoder(anchors)

    def __call__(self, image, target):
        """Encode ``target['boxes']`` and ``target['labels']`` in place.

        Per the original notes the inputs are shaped [batch, n_objects, 4]
        and [batch, n_objects], and the encoded outputs [batch, 8732, 4] and
        [batch, 8732] -- not verifiable from this block.

        Returns:
            ``(image, target)``: the untouched image and the same, mutated
            target mapping.
        """
        # The right-hand side is fully evaluated (and unpacked) before either
        # key is overwritten.
        target['boxes'], target['labels'] = self.encoder.encode(
            target['boxes'], target['labels'])
        return image, target
예제 #14
0
def eval_ssd_r34_mlperf_coco(args):
    """Load an SSD-R34 checkpoint and evaluate it on COCO val2017.

    ``args.onnx`` selects the evaluation routine: ``'export'`` uses
    ``coco_eval_export``, ``'eval'`` uses ``coco_eval_onnx``, and anything
    else uses ``coco_eval``.

    Returns:
        Whatever the selected evaluation routine returns.
    """
    from coco import COCO

    # Only use the GPU when it was not disabled and one is actually present.
    cuda_enabled = not args.no_cuda and torch.cuda.is_available()

    default_boxes = dboxes_R34_coco(args.image_size, args.strides)
    box_encoder = Encoder(default_boxes)
    val_transform = SSDTransformer(default_boxes,
                                   (args.image_size[0], args.image_size[1]),
                                   val=True)

    annotation_path = os.path.join(args.data,
                                   "annotations/instances_val2017.json")
    image_root = os.path.join(args.data, "val2017")

    coco_gt = COCO(annotation_file=annotation_path)
    val_dataset = COCODetection(image_root, annotation_path, val_transform)
    inv_map = {value: key for key, value in val_dataset.label_map.items()}

    print('ssd r34')
    model = SSD_R34(val_dataset.labelnum, strides=args.strides)

    print("loading model checkpoint", args.checkpoint)

    def _same_storage(storage, loc):
        # Hand back the storage exactly as torch.load deserialised it.
        return storage

    checkpoint = torch.load(args.checkpoint, map_location=_same_storage)
    model.load_state_dict(checkpoint["model"])

    if cuda_enabled:
        model.cuda(args.device)
    # The loss module is built (and moved to the GPU) but not used below.
    loss_func = Loss(default_boxes)
    if cuda_enabled:
        loss_func.cuda(args.device)

    # All three routines take the same argument list.
    if args.onnx == 'export':
        evaluate = coco_eval_export
    elif args.onnx == 'eval':
        evaluate = coco_eval_onnx
    else:
        evaluate = coco_eval
    return evaluate(model, val_dataset, coco_gt, box_encoder, inv_map,
                    args.threshold, args.device, cuda_enabled)
예제 #15
0
 def __init__(self):
     """Create the instance's ``Encoder``, constructed with no arguments."""
     self.encoder = Encoder()
예제 #16
0
파일: train.py 프로젝트: deepakn94/training
def train300_mlperf_coco(args):
    """Train SSD300 on COCO train2017 with periodic evaluation on val2017.

    Reads from ``args``: ``world_size``, ``no_cuda``, ``data``,
    ``batch_size``, ``checkpoint``, ``dist_backend``, ``dist_url``,
    ``iteration``, ``epochs``, ``evaluation``, ``no_save`` and
    ``threshold``. Sets ``args.distributed`` as a side effect.

    The learning rate starts at 1e-3 and is lowered to 1e-4 / 1e-5 when the
    iteration counter equals exactly 160000 / 200000. At every iteration
    listed in ``args.evaluation`` a checkpoint is written to
    ``./models/iter_<n>.pt`` (unless ``args.no_save``) and ``coco_eval`` is
    run; training stops as soon as ``coco_eval`` returns a truthy value.

    Returns:
        None.
    """
    args.distributed = args.world_size > 1

    from coco import COCO
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    train_trans = SSDTransformer(dboxes, (300, 300), val=False)
    val_trans = SSDTransformer(dboxes, (300, 300), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    if args.distributed:
        # DistributedSampler asks the process group for the world size and
        # rank, so the group has to be initialised before it is constructed.
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None
    # ``shuffle`` and ``sampler`` are mutually exclusive in DataLoader; the
    # distributed sampler does its own shuffling, so only shuffle here when
    # no sampler is supplied.
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  num_workers=4,
                                  sampler=train_sampler)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()
    if args.distributed:
        ssd300 = DistributedDataParallel(ssd300)
    else:
        ssd300 = torch.nn.DataParallel(ssd300)

    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=1e-3,
                            momentum=0.9,
                            weight_decay=5e-4)
    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    # Invert the dataset's label_map (value -> key) for coco_eval.
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    for epoch in range(args.epochs):
        # Without set_epoch the distributed sampler yields the same
        # ordering in every epoch.
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)

        for nbatch, (img, img_size, bbox,
                     label) in enumerate(train_dataloader):

            start = time.time()
            # Step-wise learning-rate decay, triggered only on the exact
            # iteration numbers.
            if iter_num == 160000:
                print("")
                print("lr decay step #1")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-4

            if iter_num == 200000:
                print("")
                print("lr decay step #2")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-5

            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)
            # Swap dims 1 and 2 of the ground-truth boxes before the loss.
            trans_bbox = bbox.transpose(1, 2).contiguous()
            if use_cuda:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            # Exponential moving average of the loss; infinite losses are
            # left out so they cannot poison the average.
            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()

            optim.zero_grad()
            loss.backward()
            optim.step()
            end = time.time()

            if nbatch % 10 == 0:
                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, Average time: {:.3f} secs"\
                            .format(iter_num, loss.item(), avg_loss, end - start))

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    # torch.save does not create missing directories.
                    os.makedirs("./models", exist_ok=True)
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info
                        }, "./models/iter_{}.pt".format(iter_num))

                # A truthy result from coco_eval ends training.
                if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                             args.threshold):
                    return

            iter_num += 1
예제 #17
0
def val300(path):
    """Validate a 21-class SSD300 model via ``valmodel``.

    ``path`` is forwarded unchanged as the second ``valmodel`` argument.
    """
    model = SSD300(21)
    default_boxes = dboxes300()
    box_encoder = Encoder(default_boxes)
    val_transform = SSDTransformer(default_boxes, (300, 300), val=True)
    valmodel(model, path, default_boxes, val_transform, box_encoder)
예제 #18
0
def val300_coco(model_path):
    """Evaluate an SSD300 checkpoint on COCO val2017 with pycocotools.

    The annotation file and image root are hard-coded relative paths, and
    the model is always moved to the GPU. Every image is run through the
    model, the predictions are decoded, converted to
    ``[image_id, x, y, w, h, score, category]`` rows scaled by the image
    size, and finally scored with ``COCOeval`` (bbox IoU).

    Args:
        model_path: Checkpoint file; its ``"model"`` entry is loaded as the
            state dict.
    """
    print("loading model at {}".format(model_path))
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    trans = SSDTransformer(dboxes, (300, 300), val=True)

    # Alternative datasets used previously:
    #annotate = "../../coco_ssd/instances_minival2014.json"
    #coco_root = "../../coco_data/val2014"
    #annotate = "../../coco_ssd/image_info_test-dev2015.json"
    #coco_root = "../../coco_data/test2015"

    annotate = "../../coco_ssd/instances_val2017.json"
    coco_root = "../../coco_data/val2017"

    cocoGt = COCO(annotation_file=annotate)
    coco = COCODetection(coco_root, annotate, trans)

    model = SSD300(coco.labelnum)

    od = torch.load(model_path)
    model.load_state_dict(od["model"])

    model.eval()
    model.cuda()

    ret = []

    # Invert the dataset's label_map (value -> key).
    inv_map = {v: k for k, v in coco.label_map.items()}
    start = time.time()
    for idx, image_id in enumerate(coco.img_keys):
        img, (htot, wtot), _, _ = coco[idx]

        with torch.no_grad():
            print("Parsing image: {}/{}".format(idx + 1, len(coco)), end="\r")
            ploc, plabel = model(img.unsqueeze(0).cuda())

            try:
                result = encoder.decode_batch(ploc, plabel, 0.50, 200)[0]
            except Exception:
                # Best effort: a decoding failure is treated as "no
                # detections" and the image is skipped. Only ``Exception``
                # is caught so Ctrl-C / SystemExit still stop the run.
                print("")
                print("No object detected in idx: {}".format(idx), end="\r")
                continue

            loc, label, prob = [r.cpu().numpy() for r in result]
            for loc_, label_, prob_ in zip(loc, label, prob):
                # Corner-format box (loc_[0..3]) scaled by the image size and
                # converted to [x, y, width, height].
                ret.append([image_id, loc_[0]*wtot, \
                                      loc_[1]*htot,
                                      (loc_[2] - loc_[0])*wtot,
                                      (loc_[3] - loc_[1])*htot,
                                      prob_,
                                      inv_map[label_]])
    print("")
    print("Predicting Ended, totoal time: {:.2f} s".format(time.time() -
                                                           start))

    cocoDt = cocoGt.loadRes(np.array(ret))

    E = COCOeval(cocoGt, cocoDt, iouType='bbox')
    #E.params.useSegm = 0
    #E.params.recThrs = [0.5]
    #E.params.maxDets = [10, 100, 200]
    E.evaluate()
    E.accumulate()
    E.summarize()
예제 #19
0
def val512(path):
    """Validate a 21-class SSD512 model via ``valmodel``.

    ``path`` is forwarded unchanged as the second ``valmodel`` argument.
    """
    model = SSD512(21)
    default_boxes = dboxes512()
    box_encoder = Encoder(default_boxes)
    val_transform = SSDTransformer(default_boxes, (512, 512), val=True)
    valmodel(model, path, default_boxes, val_transform, box_encoder)
예제 #20
0
def train300_mlperf_coco(args):
    global torch
    from coco import COCO
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if use_cuda:
        try:
            from apex.parallel import DistributedDataParallel as DDP
            if 'WORLD_SIZE' in os.environ:
                args.distributed = int(os.environ['WORLD_SIZE']) > 1
        except:
            raise ImportError(
                "Please install APEX from https://github.com/nvidia/apex")

    local_seed = args.seed
    os.environ['USE_CUDA'] = str(use_cuda)
    if args.world_size > 1:
        args.distributed = True

    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist
        print('Distributed training with DDP')
        if args.no_cuda:
            device = torch.device('cpu')
            os.environ['RANK'] = str(os.environ.get('PMI_RANK', args.rank))
            os.environ['WORLD_SIZE'] = str(
                os.environ.get('PMI_SIZE', args.world_size))
            os.environ['MASTER_ADDR'] = args.master_addr
            os.environ['MASTER_PORT'] = args.port

            # Initialize the process group with ccl backend
            if args.backend == 'ccl':
                import torch_ccl
            dist.init_process_group(backend=args.backend)
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl', init_method='env://')
            # set seeds properly
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32
    mllogger.event(key=mllog_const.SEED, value=local_seed)
    # Refer to https://pytorch.org/docs/stable/notes/randomness.html#dataloader
    torch.manual_seed(local_seed)  # Set PyTorch seed
    np.random.seed(seed=local_seed)  # Set Numpy seed
    random.seed(local_seed)  # Set the Python seed

    args.rank = dist.get_rank() if args.distributed else args.local_rank
    print("args.rank = {}".format(args.rank))
    print("local rank = {}".format(args.local_rank))
    print("distributed={}".format(args.distributed))

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(
        dboxes, (input_size, input_size),
        val=False,
        num_cropping_iterations=args.num_cropping_iterations)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco))
    mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=0)
    # set shuffle=True in DataLoader
    # Leslie: here is the workaround: dist.broadcast will fail on other rank. we will run evalution on all the ranks
    val_dataloader = DataLoader(val_coco,
                                batch_size=args.val_batch_size
                                or args.batch_size,
                                shuffle=False,
                                sampler=None,
                                num_workers=0)

    ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone)

    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    global_batch_size = N_gpu * args.batch_size
    mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size)
    # Reference doesn't support group batch norm, so bn_span==local_batch_size
    mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size)
    current_lr = args.lr * (global_batch_size / 32)

    assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits"
    fragment_size = args.batch_size // args.batch_splits
    if args.batch_splits != 1:
        print("using gradient accumulation with fragments of size {}".format(
            fragment_size))

    # Model to NHWC
    ssd300 = ssd300.to(memory_format=torch.channels_last)

    current_momentum = 0.9
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=args.weight_decay)
    ssd_print(key=mllog_const.OPT_BASE_LR, value=current_lr)
    ssd_print(key=mllog_const.OPT_WEIGHT_DECAY, value=args.weight_decay)

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()

    if args.warmup:
        nonempty_imgs = len(train_coco)
        wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size))
        ssd_print(key=mllog_const.OPT_LR_WARMUP_STEPS, value=wb)
        warmup_step = lambda iter_num, current_lr: lr_warmup(
            optim, wb, iter_num, current_lr, args)
    else:
        warmup_step = lambda iter_num, current_lr: None

    ssd_print(key=mllog_const.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor)
    ssd_print(key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS,
              value=args.lr_decay_schedule)
    mllogger.start(key=mllog_const.BLOCK_START,
                   metadata={
                       mllog_const.FIRST_EPOCH_NUM: 1,
                       mllog_const.EPOCH_COUNT: args.epochs
                   })

    if args.performance_only:
        train_time = AverageMeter('TrainTime', ':6.3f')
        progress = ProgressMeter(args.train_iteration, [train_time],
                                 prefix='Train: ')

    # Restore the model and optim from checkpoint
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
        optim.load_state_dict(od['optim'])

    # Model Prepack
    if use_ipex:
        if args.autocast:
            ssd300, optim = ipex.optimize(ssd300,
                                          dtype=torch.bfloat16,
                                          optimizer=optim)
        else:
            ssd300, optim = ipex.optimize(ssd300,
                                          dtype=torch.float32,
                                          optimizer=optim)

    # parallelize
    if args.distributed:
        device_ids = None
        ssd300 = torch.nn.parallel.DistributedDataParallel(
            ssd300, device_ids=device_ids)

    optim.zero_grad(set_to_none=True)
    for epoch in range(args.epochs):
        mllogger.start(key=mllog_const.EPOCH_START,
                       metadata={mllog_const.EPOCH_NUM: epoch})
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(
                num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr
        for nbatch, (img, img_id, img_size, bbox,
                     label) in enumerate(train_dataloader):
            naive_train_case = True  # img.shape[0] == fragment_size
            if naive_train_case:
                # Naive train case
                fimg, gloc, glabel, mask, pos_num, neg_num, num_mask = data_preprocess(
                    img, bbox, label, loss_func, args.autocast)

                if args.performance_only and iter_num >= args.warmup_iterations:
                    start_time = time.time()
                if args.profile and args.performance_only and iter_num == 30:
                    # Profile Mode
                    with torch.profiler.profile(
                            on_trace_ready=trace_handler) as prof:
                        with torch.cpu.amp.autocast(enabled=args.autocast):
                            ploc, plabel = ssd300(fimg)
                            loss = loss_func(ploc, plabel, gloc, glabel, mask,
                                             pos_num, neg_num, num_mask,
                                             args.autocast)
                        loss.backward()

                        warmup_step(iter_num, current_lr)
                        optim.step()
                        optim.zero_grad(set_to_none=True)
                else:
                    # Non Profile Mode
                    with torch.cpu.amp.autocast(enabled=args.autocast):
                        ploc, plabel = ssd300(fimg)
                        loss = loss_func(ploc, plabel, gloc, glabel, mask,
                                         pos_num, neg_num, num_mask,
                                         args.autocast)
                    loss.backward()

                    warmup_step(iter_num, current_lr)
                    optim.step()
                    optim.zero_grad(set_to_none=True)
            else:
                # Train case: when split input to several fragment size
                print("Not support input with several fragment size yet.")
                exit(-1)
                # current_batch_size = img.shape[0]
                # # Split batch for gradient accumulation
                # img = torch.split(img, fragment_size)
                # bbox = torch.split(bbox, fragment_size)
                # label = torch.split(label, fragment_size)

                # if args.performance_only and iter_num >= args.warmup_iterations:
                #     start_time=time.time()
                # for (fimg, fbbox, flabel) in zip(img, bbox, label):
                #     current_fragment_size = fimg.shape[0]
                #     trans_bbox = fbbox.transpose(1,2).contiguous()
                #     if use_cuda:
                #         fimg = fimg.cuda()
                #         trans_bbox = trans_bbox.cuda()
                #         flabel = flabel.cuda()
                #     fimg = Variable(fimg, requires_grad=True)
                #     gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                #                 Variable(flabel, requires_grad=False)
                #     gloc = loss_func._loc_vec(gloc)
                #     mask = glabel > 0
                #     pos_num = mask.sum(dim=1)
                #     neg_num = torch.clamp(3*pos_num, max=mask.size(1)).unsqueeze(-1)
                #     num_mask = (pos_num > 0).float()
                #     # image to NHWC
                #     fimg = fimg.contiguous(memory_format=torch.channels_last)
                #     if use_ipex:
                #         with ipex.amp.autocast(enabled=args.autocast, configure=ipex.conf.AmpConf(torch.bfloat16)):
                #             ploc, plabel = ssd300(fimg)
                #             loss = loss_func(ploc, plabel, gloc, glabel, mask, pos_num, neg_num, num_mask)
                #     else:
                #         ploc, plabel = ssd300(fimg)
                #         loss = loss_func(ploc, plabel, gloc, glabel, mask, pos_num, neg_num, num_mask)
                #     loss = loss * (current_fragment_size / current_batch_size) # weighted mean
                #     loss.backward()

                # warmup_step(iter_num, current_lr)
                # optim.step()
                # optim.zero_grad(set_to_none=True)
            if args.performance_only and iter_num >= args.warmup_iterations:
                train_time.update(time.time() - start_time)
            if args.performance_only and iter_num % args.print_freq == 0:
                progress.display(iter_num)
            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            if args.log_interval and not iter_num % args.log_interval:
                print("Iteration: {:6d}, Loss function: {:5.8f}, Average Loss: {:.8f}"\
                    .format(iter_num, loss.item(), avg_loss))
            iter_num += 1
            if args.performance_only and iter_num >= args.train_iteration:
                break
        if args.performance_only and iter_num >= args.train_iteration:
            break

        if (args.val_epochs and (epoch+1) in args.val_epochs) or \
           (args.val_interval and not (epoch+1) % args.val_interval):
            if args.distributed:
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(
                        recurse=True):
                    if ('running_mean' in bn_name) or ('running_var'
                                                       in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size
                        ssd_print(key=mllog_const.MODEL_BN_SPAN,
                                  value=bn_buf.cpu().detach().numpy())
            if args.rank == 0 or True:  # Leslie: here is the workaround: dist.broadcast will fail on other rank. we will run evalution on all the ranks
                if not args.no_save:
                    print("")
                    print("saving model...")
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info,
                            "optim": optim.state_dict()
                        }, "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300,
                             val_dataloader,
                             cocoGt,
                             encoder,
                             inv_map,
                             args.threshold,
                             epoch + 1,
                             iter_num,
                             log_interval=args.log_interval,
                             nms_valid_thresh=args.nms_valid_thresh,
                             use_autocast=args.autocast):
                    success = torch.ones(1)
                    if use_cuda:
                        success = success.cuda()
            # Leslie: same Workaround: since we run evalution on all ranks, we don't need to broadcast the evalutation result
            # if args.distributed:
            #     dist.broadcast(success, 0)
            if success[0]:
                return True
            mllogger.end(key=mllog_const.EPOCH_STOP,
                         metadata={mllog_const.EPOCH_NUM: epoch})
    mllogger.end(key=mllog_const.BLOCK_STOP,
                 metadata={
                     mllog_const.FIRST_EPOCH_NUM: 1,
                     mllog_const.EPOCH_COUNT: args.epochs
                 })

    if args.performance_only:
        batch_size = args.batch_size
        latency = train_time.avg / batch_size * 1000
        perf = batch_size / train_time.avg
        print('train latency %.2f ms' % latency)
        print('train performance %.2f fps' % perf)
        print("Throughput: {:.3f} fps".format(perf))

    return False
    def __init__(self, root, config, transform=None, pre_transform=None):
        """Initialise the dataset and load its processed contents.

        Args:
            root: Forwarded as the first argument of the base-class
                constructor.
            config: Configuration object; kept on the instance and used to
                build the attribute ``Encoder``.
            transform: Forwarded to the base-class constructor.
            pre_transform: Forwarded to the base-class constructor.
        """
        # These attributes are assigned before the base constructor runs.
        # NOTE(review): presumably the base class may trigger processing that
        # reads them -- confirm against the base class before reordering.
        self.config = config
        self.attr_encoder = Encoder(config)

        super().__init__(root, transform, pre_transform)

        # The first processed file is expected to unpack into two values.
        loaded = torch.load(self.processed_paths[0])
        self.data, self.slices = loaded
예제 #22
0
def train_mlperf_coco(args):
    """Train an SSD-ResNet34 detector on COCO 2017 with periodic evaluation.

    The model is optimised with SGD (lr 1e-3, momentum 0.9, weight decay
    5e-4). The learning rate is set to 1e-4 at iteration 160000 and to 1e-5
    at iteration 200000. Whenever the running iteration counter is listed in
    ``args.evaluation`` a checkpoint is written (unless ``args.no_save``) and
    ``coco_eval`` is run; training stops as soon as it returns a truthy value.

    Args:
        args: Parsed options. Attributes read here: ``no_cuda``, ``strides``,
            ``image_size``, ``data``, ``batch_size``, ``checkpoint``,
            ``device_ids``, ``iteration``, ``epochs``, ``evaluation``,
            ``no_save``, ``save_path`` and ``threshold``.

    Returns:
        None, both on early stop after a successful evaluation and after all
        epochs have been consumed.
    """
    from coco import COCO

    # Honour the CUDA switch everywhere: the model and the inputs go to the
    # same device, so the CPU fallback actually works.
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = 'cuda' if use_cuda else 'cpu'

    dboxes = dboxes_coco(args.image_size, args.strides)
    encoder = Encoder(dboxes)
    train_trans = SSDTransformer(dboxes, tuple(args.image_size), val=False)
    val_trans = SSDTransformer(dboxes, tuple(args.image_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=4)

    # The model is built once, sized from the training set's label count.
    ssd_r34 = SSD_R34(train_coco.labelnum, strides=args.strides)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        # The model still lives on the CPU here, so load the tensors there
        # regardless of the device they were saved from.
        od = torch.load(args.checkpoint, map_location='cpu')
        ssd_r34.load_state_dict(od["model"])
    ssd_r34.train()
    ssd_r34.to(device)
    if use_cuda and args.device_ids and len(args.device_ids) > 1:
        ssd_r34 = nn.DataParallel(ssd_r34, args.device_ids)

    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.to('cuda')
        loss_func = nn.DataParallel(loss_func, args.device_ids)

    optim = torch.optim.SGD(ssd_r34.parameters(),
                            lr=1e-3,
                            momentum=0.9,
                            weight_decay=5e-4)
    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    last_loss = [0.0] * 10  # most recent loss first
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    for epoch in range(args.epochs):

        for nbatch, (img, img_size, bbox,
                     label) in enumerate(train_dataloader):

            if iter_num == 160000:
                print("")
                print("lr decay step #1")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-4

            if iter_num == 200000:
                print("")
                print("lr decay step #2")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-5

            img = Variable(img, requires_grad=True)
            ploc, plabel, _ = ssd_r34(img.to(device))
            trans_bbox = bbox.transpose(1, 2).contiguous()

            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)

            loss = loss_func(ploc, plabel, gloc, glabel).mean()

            # Read the scalar once; it feeds every statistic below.
            loss_value = loss.item()
            if not np.isinf(loss_value):
                avg_loss = 0.999 * avg_loss + 0.001 * loss_value
            # Sliding window over the ten most recent losses.
            last_loss = [loss_value] + last_loss[:-1]
            avg_last_loss = sum(last_loss) / len(last_loss)
            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, Average Last 10 Loss: {:.3f}"\
                        .format(iter_num, loss_value, avg_loss,avg_last_loss), end="\r")
            optim.zero_grad()
            loss.backward()
            optim.step()

            # Drop the reference to the loss before the next iteration.
            loss = None

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    # Unwrap by inspecting the model itself: device_ids may be
                    # None, and the model is only wrapped when CUDA is in use.
                    if isinstance(ssd_r34, nn.DataParallel):
                        module = ssd_r34.module
                    else:
                        module = ssd_r34
                    torch.save(
                        {
                            "model": module.state_dict(),
                            "label_map": train_coco.label_info
                        }, args.save_path + "/iter_{}.pt".format(iter_num))
                if coco_eval(ssd_r34, val_coco, cocoGt, encoder, inv_map,
                             args.threshold, args.device_ids):
                    return

            iter_num += 1
예제 #23
0
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017, emitting MLPerf log records along the way.

    SGD starts at lr 1e-3 (momentum 0.9, weight decay 5e-4); the learning
    rate becomes 1e-4 at iteration 160000 and 1e-5 at iteration 200000. When
    the iteration counter appears in ``args.evaluation`` a checkpoint is
    written to ``./models/iter_<n>.pt`` (unless ``args.no_save``) and
    ``coco_eval`` is invoked.

    Args:
        args: Parsed options. Attributes read here: ``no_cuda``, ``data``,
            ``batch_size``, ``checkpoint``, ``iteration``, ``epochs``,
            ``evaluation``, ``no_save`` and ``threshold``.

    Returns:
        True as soon as ``coco_eval`` returns a truthy value, otherwise False
        once every epoch has been processed.
    """
    from coco import COCO

    # CUDA is used only when it is both requested and present.
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    image_shape = (input_size, input_size)
    train_trans = SSDTransformer(dboxes, image_shape, val=False)
    val_trans = SSDTransformer(dboxes, image_shape, val=True)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    # Dataset locations below the data root.
    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=4)
    # The loader shuffles, which is what the input-order record refers to.
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_ORDER)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        checkpoint = torch.load(args.checkpoint)
        ssd300.load_state_dict(checkpoint["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()

    current_lr = 1e-3
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    mlperf_log.ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    mlperf_log.ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    mlperf_log.ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY,
                         value=current_weight_decay)

    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    # Iteration -> (new learning rate, banner printed when it takes effect).
    lr_milestones = {
        160000: (1e-4, "lr decay step #1"),
        200000: (1e-5, "lr decay step #2"),
    }

    mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        for nbatch, batch in enumerate(train_dataloader):
            img, img_size, bbox, label = batch

            milestone = lr_milestones.get(iter_num)
            if milestone is not None:
                current_lr, banner = milestone
                print("")
                print(banner)
                for group in optim.param_groups:
                    group['lr'] = current_lr
                mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            # Forward pass on the (optionally GPU-resident) image batch.
            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)

            # Ground truth: boxes with their last two axes swapped, plus labels.
            trans_bbox = bbox.transpose(1, 2).contiguous()
            if use_cuda:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc = Variable(trans_bbox, requires_grad=False)
            glabel = Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            # Infinite losses are kept out of the running average.
            loss_value = loss.item()
            if not np.isinf(loss_value):
                avg_loss = 0.999 * avg_loss + 0.001 * loss_value

            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                        .format(iter_num, loss_value, avg_loss), end="\r")
            optim.zero_grad()
            loss.backward()
            optim.step()

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    snapshot = {
                        "model": ssd300.state_dict(),
                        "label_map": train_coco.label_info
                    }
                    torch.save(snapshot,
                               "./models/iter_{}.pt".format(iter_num))

                reached_target = coco_eval(ssd300, val_coco, cocoGt, encoder,
                                           inv_map, args.threshold, epoch,
                                           iter_num)
                if reached_target:
                    return True

            iter_num += 1
    return False
예제 #24
0
def create_encoders(data: List[Tuple[Inp, Out]]) \
        -> Tuple[Encoder[Word], Encoder[POS]]:
    """Build the word encoder and the POS-tag encoder for a dataset.

    Every word of every input sequence is streamed into one ``Encoder`` and
    every tag of every output sequence into a second one.

    Args:
        data: Pairs of (input sequence, output sequence).

    Returns:
        The pair ``(word encoder, POS encoder)``.
    """
    # Lazily flatten the inputs, then the outputs; nothing is materialised.
    all_words = (token for sentence, _ in data for token in sentence)
    word_enc = Encoder(all_words)
    all_tags = (tag for _, tags in data for tag in tags)
    pos_enc = Encoder(all_tags)
    return word_enc, pos_enc
예제 #25
0
    
    data_dir = os.path.abspath(__file__ + "../../../../data")
    raw_path = os.path.abspath(os.path.join(data_dir, "./processed_dataset/raw"))
    scenes_path = os.path.abspath(os.path.join(raw_path, cmd_args.scene_file_name))
    graphs_path = os.path.join(raw_path, cmd_args.graph_file_name)

    # In the pytorch geometry package, only int and tensor seems to be allowed to save
    # we process all the graphs and save them to a file.
    
    with open(scenes_path, 'r') as scenes_file:
        scenes = json.load(scenes_file)

    config = get_config()

    graphs = []
    attr_encoder = Encoder(config)

    for scene in scenes:
        for target_id in range(len(scene["objects"])):
            graph = Graph(config, scene, target_id)
            graphs.append(graph)
    
    with open(graphs_path, 'wb') as graphs_file:
        pickle.dump(graphs, graphs_file) 

    root = os.path.join(data_dir, "./processed_dataset")
    scene_dataset = SceneDataset(root, config)

    if os.path.exists(cmd_args.model_path) and os.path.getsize(cmd_args.model_path) > 0:
        refrl = torch.load(cmd_args.model_path)
        logging.info("Loaded refrl model")
예제 #26
0
            return True

        # not possible
        if not self.possible:
            return True

        return False


if __name__ == "__main__":
    # load the data
    data_dir = os.path.abspath(__file__ + "../../../data")
    root = os.path.abspath(os.path.join(data_dir, "./processed_dataset"))

    config = get_config()
    attr_encoder = Encoder(config)

    scenes_path = os.path.abspath(
        os.path.join(data_dir,
                     f"./processed_dataset/raw/{cmd_args.scene_file_name}"))
    with open(scenes_path, 'r') as scenes_file:
        scenes = json.load(scenes_file)

    # construct a mini example
    target_id = 0
    graph = Graph(config, scenes[0], target_id)

    x = attr_encoder.get_embedding([node.name for node in graph.nodes])
    edge_index, edge_types = graph.get_edge_info()
    edge_attrs = torch.tensor(attr_encoder.get_embedding(edge_types))
    data_point = Data(x=x,
예제 #27
0
def train300_mlperf_coco(exp, args):
    """Run a timed SSD300 training benchmark on COCO 2017.

    For each of ``args.repeat`` rounds the training loader is iterated inside
    a ``chrono.time('train')`` section until the batch index exceeds
    ``args.number``. Per-batch losses are passed to ``exp.log_batch_loss``.
    This variant emits no MLPerf log records and never evaluates or saves.

    Args:
        exp: Experiment helper; this function calls its ``get_device``,
            ``chrono``, ``log_batch_loss``, ``show_eta`` and ``report``.
        args: Parsed options. Attributes read here: ``data``, ``batch_size``,
            ``checkpoint``, ``iteration``, ``repeat`` and ``number``.

    Returns:
        Always False.
    """
    from coco import COCO

    device = exp.get_device()
    chrono = exp.chrono()

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    image_shape = (input_size, input_size)
    train_trans = SSDTransformer(dboxes, image_shape, val=False)
    val_trans = SSDTransformer(dboxes, image_shape, val=True)

    # Dataset locations below the data root.
    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=4)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        checkpoint = torch.load(args.checkpoint)
        ssd300.load_state_dict(checkpoint["model"])

    ssd300.train()
    ssd300 = ssd300.to(device)
    loss_func = Loss(dboxes).to(device)

    current_lr = 1e-3
    current_momentum = 0.9
    current_weight_decay = 5e-4

    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)

    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    for epoch in range(args.repeat):

        with chrono.time('train') as t:
            for nbatch, batch in enumerate(train_dataloader):
                # Each round stops once the batch index passes args.number.
                if nbatch > args.number:
                    break
                img, img_size, bbox, label = batch

                img = Variable(img.to(device), requires_grad=True)
                ploc, plabel = ssd300(img)

                # Ground truth: boxes with their last two axes swapped,
                # moved to the training device together with the labels.
                trans_bbox = bbox.transpose(1, 2).contiguous()
                trans_bbox = trans_bbox.to(device)
                label = label.to(device)

                gloc = Variable(trans_bbox, requires_grad=False)
                glabel = Variable(label, requires_grad=False)

                loss = loss_func(ploc, plabel, gloc, glabel)

                # Infinite losses are kept out of the running average.
                loss_value = loss.item()
                if not np.isinf(loss_value):
                    avg_loss = 0.999 * avg_loss + 0.001 * loss_value

                exp.log_batch_loss(loss)

                optim.zero_grad()
                loss.backward()
                optim.step()

                iter_num += 1

        exp.show_eta(epoch, t)

    exp.report()
    return False
예제 #28
0
def train300_mlperf_coco(args):
    from pycocotools.coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
    local_seed = set_seeds(args)
    # start timing here
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    validate_group_bn(args.bn_group)
    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    input_size = 300
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    # Build the model
    model_options = {
        'backbone': args.backbone,
        'use_nhwc': args.nhwc,
        'pad_input': args.pad_input,
        'bn_group': args.bn_group,
    }

    ssd300 = SSD300(args.num_classes, **model_options)
    if args.checkpoint is not None:
        load_checkpoint(ssd300, args.checkpoint)

    ssd300.train()
    ssd300.cuda()
    if args.opt_loss:
        loss_func = OptLoss(dboxes)
    else:
        loss_func = Loss(dboxes)
    loss_func.cuda()

    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    if args.use_fp16:
        ssd300 = network_to_half(ssd300)

    # Parallelize.  Need to do this after network_to_half.
    if args.distributed:
        if args.delay_allreduce:
            print_message(args.local_rank,
                          "Delaying allreduces to the end of backward()")
        ssd300 = DDP(ssd300,
                     gradient_predivide_factor=N_gpu / 8.0,
                     delay_allreduce=args.delay_allreduce,
                     retain_allreduce_buffers=args.use_fp16)

    # Create optimizer.  This must also be done after network_to_half.
    global_batch_size = (N_gpu * args.batch_size)
    mlperf_print(key=mlperf_compliance.constants.MODEL_BN_SPAN,
                 value=args.bn_group * args.batch_size)
    mlperf_print(key=mlperf_compliance.constants.GLOBAL_BATCH_SIZE,
                 value=global_batch_size)

    # mlperf only allows base_lr scaled by an integer
    base_lr = 2.5e-3
    requested_lr_multiplier = args.lr / base_lr
    adjusted_multiplier = max(
        1, round(requested_lr_multiplier * global_batch_size / 32))

    current_lr = base_lr * adjusted_multiplier
    current_momentum = 0.9
    current_weight_decay = args.wd
    static_loss_scale = 128.
    if args.use_fp16:
        if args.distributed and not args.delay_allreduce:
            # We can't create the flat master params yet, because we need to
            # imitate the flattened bucket structure that DDP produces.
            optimizer_created = False
        else:
            model_buckets = [
                [
                    p for p in ssd300.parameters()
                    if p.requires_grad and p.type() == "torch.cuda.HalfTensor"
                ],
                [
                    p for p in ssd300.parameters()
                    if p.requires_grad and p.type() == "torch.cuda.FloatTensor"
                ]
            ]
            flat_master_buckets = create_flat_master(model_buckets)
            optim = torch.optim.SGD(flat_master_buckets,
                                    lr=current_lr,
                                    momentum=current_momentum,
                                    weight_decay=current_weight_decay)
            optimizer_created = True
    else:
        optim = torch.optim.SGD(ssd300.parameters(),
                                lr=current_lr,
                                momentum=current_momentum,
                                weight_decay=current_weight_decay)
        optimizer_created = True

    mlperf_print(key=mlperf_compliance.constants.OPT_BASE_LR, value=current_lr)
    mlperf_print(key=mlperf_compliance.constants.OPT_WEIGHT_DECAY,
                 value=current_weight_decay)
    if args.warmup is not None:
        mlperf_print(key=mlperf_compliance.constants.OPT_LR_WARMUP_STEPS,
                     value=args.warmup)
        mlperf_print(key=mlperf_compliance.constants.OPT_LR_WARMUP_FACTOR,
                     value=args.warmup_factor)

    # Model is completely finished -- need to create separate copies, preserve parameters across
    # them, and jit
    ssd300_eval = SSD300(args.num_classes,
                         backbone=args.backbone,
                         use_nhwc=args.nhwc,
                         pad_input=args.pad_input).cuda()
    if args.use_fp16:
        ssd300_eval = network_to_half(ssd300_eval)

    # Get the existant state from the train model
    # * if we use distributed, then we want .module
    train_model = ssd300.module if args.distributed else ssd300

    ssd300_eval.load_state_dict(train_model.state_dict())

    ssd300_eval.eval()

    print_message(args.local_rank, "epoch", "nbatch", "loss")
    eval_points = np.array(args.evaluation) * 32 / global_batch_size
    eval_points = list(map(int, list(eval_points)))

    iter_num = args.iteration
    avg_loss = 0.0

    start_elapsed_time = time.time()
    last_printed_iter = args.iteration
    num_elapsed_samples = 0

    # Generate normalization tensors
    mean, std = generate_mean_std(args)

    dummy_overflow_buf = torch.cuda.IntTensor([0])

    def step_maybe_fp16_maybe_distributed(optim):
        if args.use_fp16:
            if args.distributed:
                for flat_master, allreduce_buffer in zip(
                        flat_master_buckets, ssd300.allreduce_buffers):
                    if allreduce_buffer is None:
                        raise RuntimeError("allreduce_buffer is None")
                    flat_master.grad = allreduce_buffer.float()
                    flat_master.grad.data.mul_(1. / static_loss_scale)
            else:
                for flat_master, model_bucket in zip(flat_master_buckets,
                                                     model_buckets):
                    flat_grad = apex_C.flatten(
                        [m.grad.data for m in model_bucket])
                    flat_master.grad = flat_grad.float()
                    flat_master.grad.data.mul_(1. / static_loss_scale)
        optim.step()
        if args.use_fp16:
            # Use multi-tensor scale instead of loop & individual parameter copies
            for model_bucket, flat_master in zip(model_buckets,
                                                 flat_master_buckets):
                multi_tensor_applier(
                    amp_C.multi_tensor_scale, dummy_overflow_buf, [
                        apex_C.unflatten(flat_master.data, model_bucket),
                        model_bucket
                    ], 1.0)

    input_c = 4 if args.pad_input else 3
    example_shape = [args.batch_size, 300, 300, input_c
                     ] if args.nhwc else [args.batch_size, input_c, 300, 300]
    example_input = torch.randn(*example_shape).cuda()

    if args.use_fp16:
        example_input = example_input.half()
    if args.jit:
        # DDP has some Python-side control flow.  If we JIT the entire DDP-wrapped module,
        # the resulting ScriptModule will elide this control flow, resulting in allreduce
        # hooks not being called.  If we're running distributed, we need to extract and JIT
        # the wrapped .module.
        # Replacing a DDP-ed ssd300 with a script_module might also cause the AccumulateGrad hooks
        # to go out of scope, and therefore silently disappear.
        module_to_jit = ssd300.module if args.distributed else ssd300
        if args.distributed:
            ssd300.module = torch.jit.trace(module_to_jit, example_input)
        else:
            ssd300 = torch.jit.trace(module_to_jit, example_input)
        # JIT the eval model too
        ssd300_eval = torch.jit.trace(ssd300_eval, example_input)

    # do a dummy fprop & bprop to make sure cudnnFind etc. are timed here
    ploc, plabel = ssd300(example_input)

    # produce a single dummy "loss" to make things easier
    loss = ploc[0, 0, 0] + plabel[0, 0, 0]
    dloss = torch.randn_like(loss)
    # Cause cudnnFind for dgrad, wgrad to run
    loss.backward(dloss)

    mlperf_print(key=mlperf_compliance.constants.INIT_STOP, sync=True)
    ##### END INIT

    # This is the first place we touch anything related to data
    ##### START DATA TOUCHING
    mlperf_print(key=mlperf_compliance.constants.RUN_START, sync=True)
    barrier()
    cocoGt = COCO(annotation_file=val_annotate, use_ext=True)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)

    if args.distributed:
        val_sampler = GeneralDistributedSampler(val_coco, pad=False)
    else:
        val_sampler = None

    if args.no_dali:
        train_trans = SSDTransformer(dboxes, (input_size, input_size),
                                     val=False)
        train_coco = COCODetection(train_coco_root, train_annotate,
                                   train_trans)

        if args.distributed:
            train_sampler = GeneralDistributedSampler(train_coco, pad=False)
        else:
            train_sampler = None

        train_loader = DataLoader(train_coco,
                                  batch_size=args.batch_size *
                                  args.input_batch_multiplier,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=args.num_workers,
                                  collate_fn=partial(my_collate,
                                                     is_training=True))
    else:
        train_pipe = COCOPipeline(args.batch_size *
                                  args.input_batch_multiplier,
                                  args.local_rank,
                                  train_coco_root,
                                  train_annotate,
                                  N_gpu,
                                  num_threads=args.num_workers,
                                  output_fp16=args.use_fp16,
                                  output_nhwc=args.nhwc,
                                  pad_output=args.pad_input,
                                  seed=local_seed - 2**31,
                                  use_nvjpeg=args.use_nvjpeg,
                                  use_roi=args.use_roi_decode,
                                  dali_cache=args.dali_cache,
                                  dali_async=(not args.dali_sync))
        print_message(args.local_rank,
                      "time_check a: {secs:.9f}".format(secs=time.time()))
        train_pipe.build()
        print_message(args.local_rank,
                      "time_check b: {secs:.9f}".format(secs=time.time()))
        test_run = train_pipe.run()
        train_loader = SingleDaliIterator(
            train_pipe, [
                'images',
                DALIOutput('bboxes', False, True),
                DALIOutput('labels', True, True)
            ],
            train_pipe.epoch_size()['train_reader'],
            ngpu=N_gpu)

    train_loader = EncodingInputIterator(train_loader,
                                         dboxes=encoder.dboxes.cuda(),
                                         nhwc=args.nhwc,
                                         fake_input=args.fake_input,
                                         no_dali=args.no_dali)
    if args.input_batch_multiplier > 1:
        train_loader = RateMatcher(input_it=train_loader,
                                   output_size=args.batch_size)

    val_dataloader = DataLoader(
        val_coco,
        batch_size=args.eval_batch_size,
        shuffle=False,  # Note: distributed sampler is shuffled :(
        sampler=val_sampler,
        num_workers=args.num_workers)

    inv_map = {v: k for k, v in val_coco.label_map.items()}

    ##### END DATA TOUCHING
    i_eval = 0
    first_epoch = 1
    mlperf_print(key=mlperf_compliance.constants.BLOCK_START,
                 metadata={
                     'first_epoch_num':
                     first_epoch,
                     'epoch_count':
                     args.evaluation[i_eval] * 32 /
                     train_pipe.epoch_size()['train_reader']
                 },
                 sync=True)
    for epoch in range(args.epochs):
        mlperf_print(key=mlperf_compliance.constants.EPOCH_START,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)
        for p in ssd300.parameters():
            p.grad = None

        for i, (img, bbox, label) in enumerate(train_loader):

            if args.profile_start is not None and iter_num == args.profile_start:
                torch.cuda.profiler.start()
                torch.cuda.synchronize()
                if args.profile_nvtx:
                    torch.autograd._enable_profiler(
                        torch.autograd.ProfilerState.NVTX)

            if args.profile is not None and iter_num == args.profile:
                if args.profile_start is not None and iter_num >= args.profile_start:
                    # we turned cuda and nvtx profiling on, better turn it off too
                    if args.profile_nvtx:
                        torch.autograd._disable_profiler()
                    torch.cuda.profiler.stop()
                return

            if args.warmup is not None and optimizer_created:
                lr_warmup(optim, args.warmup, iter_num, epoch, current_lr,
                          args)
            if iter_num == ((args.decay1 * 1000 * 32) // global_batch_size):
                print_message(args.local_rank, "lr decay step #1")
                current_lr *= 0.1
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr

            if iter_num == ((args.decay2 * 1000 * 32) // global_batch_size):
                print_message(args.local_rank, "lr decay step #2")
                current_lr *= 0.1
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr

            if (img is None) or (bbox is None) or (label is None):
                print("No labels in batch")
                continue

            ploc, plabel = ssd300(img)
            ploc, plabel = ploc.float(), plabel.float()

            N = img.shape[0]
            gloc, glabel = Variable(bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            if np.isfinite(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            else:
                print("model exploded (corrupted by Inf or Nan)")
                sys.exit()

            num_elapsed_samples += N
            if args.local_rank == 0 and iter_num % args.print_interval == 0:
                end_elapsed_time = time.time()
                elapsed_time = end_elapsed_time - start_elapsed_time

                avg_samples_per_sec = num_elapsed_samples * N_gpu / elapsed_time

                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, avg. samples / sec: {:.2f}"\
                            .format(iter_num, loss.item(), avg_loss, avg_samples_per_sec), end="\n")

                last_printed_iter = iter_num
                start_elapsed_time = time.time()
                num_elapsed_samples = 0

            # loss scaling
            if args.use_fp16:
                loss = loss * static_loss_scale
            loss.backward()

            if not optimizer_created:
                # Imitate the model bucket structure created by DDP.
                # These will already be split by type (float or half).
                model_buckets = []
                for bucket in ssd300.active_i_buckets:
                    model_buckets.append([])
                    for active_i in bucket:
                        model_buckets[-1].append(
                            ssd300.active_params[active_i])
                flat_master_buckets = create_flat_master(model_buckets)
                optim = torch.optim.SGD(flat_master_buckets,
                                        lr=current_lr,
                                        momentum=current_momentum,
                                        weight_decay=current_weight_decay)
                optimizer_created = True
                # Skip this first iteration because flattened allreduce buffers are not yet created.
                # step_maybe_fp16_maybe_distributed(optim)
            else:
                step_maybe_fp16_maybe_distributed(optim)

            # Likely a decent skew here, let's take this opportunity to set the gradients to None.
            # After DALI integration, playing with the placement of this is worth trying.
            for p in ssd300.parameters():
                p.grad = None

            if iter_num in eval_points:
                # Get the existing state from the train model
                # * if we use distributed, then we want .module
                train_model = ssd300.module if args.distributed else ssd300

                if args.distributed and args.allreduce_running_stats:
                    if get_rank() == 0:
                        print("averaging bn running means and vars")
                    # make sure every node has the same running bn stats before
                    # using them to evaluate, or saving the model for inference
                    world_size = float(torch.distributed.get_world_size())
                    for bn_name, bn_buf in train_model.named_buffers(
                            recurse=True):
                        if ('running_mean' in bn_name) or ('running_var'
                                                           in bn_name):
                            torch.distributed.all_reduce(bn_buf,
                                                         op=dist.ReduceOp.SUM)
                            bn_buf /= world_size

                if get_rank() == 0:
                    if not args.no_save:
                        print("saving model...")
                        torch.save(
                            {
                                "model": ssd300.state_dict(),
                                "label_map": val_coco.label_info
                            }, "./models/iter_{}.pt".format(iter_num))

                ssd300_eval.load_state_dict(train_model.state_dict())
                succ = coco_eval(
                    ssd300_eval,
                    val_dataloader,
                    cocoGt,
                    encoder,
                    inv_map,
                    args.threshold,
                    epoch,
                    iter_num,
                    args.eval_batch_size,
                    use_fp16=args.use_fp16,
                    local_rank=args.local_rank if args.distributed else -1,
                    N_gpu=N_gpu,
                    use_nhwc=args.nhwc,
                    pad_input=args.pad_input)
                mlperf_print(key=mlperf_compliance.constants.BLOCK_STOP,
                             metadata={'first_epoch_num': first_epoch},
                             sync=True)
                if succ:
                    return True
                if iter_num != max(eval_points):
                    i_eval += 1
                    first_epoch = epoch + 1
                    mlperf_print(key=mlperf_compliance.constants.BLOCK_START,
                                 metadata={
                                     'first_epoch_num':
                                     first_epoch,
                                     'epoch_count':
                                     (args.evaluation[i_eval] -
                                      args.evaluation[i_eval - 1]) * 32 /
                                     train_pipe.epoch_size()['train_reader']
                                 },
                                 sync=True)
            iter_num += 1
            if args.max_iter > 0:
                if iter_num > args.max_iter:
                    break

        train_loader.reset()
        mlperf_print(key=mlperf_compliance.constants.EPOCH_STOP,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)
    return False
예제 #29
0
 def test_encoder(self):
     """Check the values ``Encoder.transform`` returns for "a", "b", "a".

     The first token is expected to map to 0, the second distinct token to
     1, and a repeated token to the value it received the first time.
     """
     enc = Encoder()
     expectations = (("a", 0), ("b", 1), ("a", 0))
     for token, expected_id in expectations:
         self.assertEqual(enc.transform(token), expected_id)
예제 #30
0
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017 and evaluate at the configured epochs.

    Builds the default boxes, datasets, model, loss and SGD optimizer from
    ``args`` and trains for ``args.epochs`` epochs. After each epoch whose
    1-based number is in ``args.evaluation``, BatchNorm running statistics
    are averaged across ranks (distributed only), and rank 0 optionally
    saves a checkpoint and runs ``coco_eval``.

    Args:
        args: parsed options. Attributes read here: ``no_cuda``,
            ``local_rank``, ``seed``, ``data``, ``batch_size``,
            ``checkpoint``, ``lr``, ``evaluation``, ``iteration``,
            ``warmup``, ``epochs``, ``lr_decay_schedule``, ``no_save`` and
            ``threshold``. ``args.distributed`` is overwritten, and in
            distributed runs ``args.seed`` is replaced by the broadcast seed.

    Returns:
        True as soon as an evaluation reports success, False if all epochs
        finish without one.

    Raises:
        ImportError: CUDA is usable but APEX cannot be imported.
    """
    # The function-scope `import torch.utils.data.distributed` below binds the
    # name `torch`; without this declaration `torch` would be a local and the
    # earlier uses would fail.
    global torch
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if use_cuda:
        # Only the import is guarded, so an unrelated failure (e.g. a
        # malformed WORLD_SIZE) is not misreported as a missing APEX install.
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install APEX from https://github.com/nvidia/apex"
            ) from err
        if 'WORLD_SIZE' in os.environ:
            args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist

        # args.distributed can only be True when use_cuda is True, so there
        # is no CPU variant of this setup.
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda')
        dist.init_process_group(backend='nccl', init_method='env://')
        # set seeds properly: shared base seed, offset by rank
        args.seed = broadcast_seeds(args.seed, device)
        local_seed = (args.seed + dist.get_rank()) % 2**32
        print(dist.get_rank(), "Using seed = {}".format(local_seed))
        torch.manual_seed(local_seed)
        np.random.seed(seed=local_seed)

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None
    # DataLoader-level shuffling is only enabled when no sampler is supplied.
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=4)
    ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    ssd_print(key=mlperf_log.INPUT_ORDER)
    ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    # parallelize
    if args.distributed:
        ssd300 = DDP(ssd300)

    global_batch_size = N_gpu * args.batch_size
    # Learning rate is scaled linearly relative to a batch size of 32.
    current_lr = args.lr * (global_batch_size / 32)
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY, value=current_weight_decay)
    eval_points = args.evaluation
    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()

    if args.warmup:
        nonempty_imgs = len(train_coco)
        # args.warmup multiplied by the number of iterations per epoch.
        wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size))

        def warmup_step(iter_num, current_lr):
            return lr_warmup(optim, wb, iter_num, current_lr, args)
    else:

        def warmup_step(iter_num, current_lr):
            return None

    for epoch in range(args.epochs):
        ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(
                num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr
            ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

        for nbatch, (img, img_size, bbox,
                     label) in enumerate(train_dataloader):

            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)
            trans_bbox = bbox.transpose(1, 2).contiguous()
            if use_cuda:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            # Read the scalar once; skip both Inf and NaN so a single bad
            # batch cannot poison the running average permanently.
            loss_value = loss.item()
            if np.isfinite(loss_value):
                avg_loss = 0.999 * avg_loss + 0.001 * loss_value

            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                        .format(iter_num, loss_value, avg_loss), end="\r")
            optim.zero_grad()
            loss.backward()
            warmup_step(iter_num, current_lr)
            optim.step()

            iter_num += 1

        if epoch + 1 in eval_points:
            rank = dist.get_rank() if args.distributed else args.local_rank
            if args.distributed:
                # Give every rank the same BatchNorm running statistics
                # (sum across ranks, then divide by the world size).
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(
                        recurse=True):
                    if ('running_mean' in bn_name) or ('running_var'
                                                       in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size
            if rank == 0:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    # torch.save does not create missing parent directories.
                    os.makedirs("./models", exist_ok=True)
                    # NOTE(review): in distributed runs this is the state dict
                    # of the DDP wrapper, while checkpoints are loaded above
                    # before wrapping - confirm the key names are compatible.
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info
                        }, "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                             args.threshold, epoch + 1, iter_num):
                    success = torch.ones(1)
                    if use_cuda:
                        success = success.cuda()
            if args.distributed:
                # Propagate rank 0's verdict so all ranks stop together.
                dist.broadcast(success, 0)
            if success[0]:
                return True

    return False