Example #1
def main_global(args):
    data_dir = args.data_dir
    params = {'batch_size': args.batch, 'shuffle': False}
    if args.bert_fts:
        type_dir = "all_bertemb/"
    else:
        type_dir = "all/"
    data_dir_back = ""
    if (args.trainon == 'bothway') or (args.trainon == 'bothWselect'):
        if args.bert_fts:
            data_dir_back = args.data_dir + "all_backward_bertemb/"
        else:
            data_dir_back = args.data_dir + "all_backward/"
    train_data = EventDataset(args.data_dir + type_dir, "train",
                              args.glove2vocab, data_dir_back, args.bert_fts)
    print('train_data: %s in total' % len(train_data))
    train_generator = get_data_loader(train_data, **params)
    dev_data = EventDataset(args.data_dir + type_dir, "dev", args.glove2vocab,
                            data_dir_back, args.bert_fts)
    print('dev_data: %s in total' % len(dev_data))
    dev_generator = get_data_loader(dev_data, **params)

    if args.bert_fts:
        data_dir_back = args.data_dir + "all_backward_bertemb/"
    else:
        data_dir_back = args.data_dir + "all_backward/"
    test_data = EventDataset(args.data_dir + type_dir, "test",
                             args.glove2vocab, data_dir_back, args.bert_fts)
    test_generator = get_data_loader(test_data, **params)

    s_time = time.time()
    models = [NNClassifier()]
    score = 0
    for model in models:
        dev_f1 = model.train_epoch(train_generator, dev_generator, args)
        print('total time elapsed', time.time() - s_time)
        evaluator = Evaluator(model)
        #print(evaluator.evaluate(test_generator, args))
        score = evaluator.get_score(test_generator, args)
        #evaluator.collect_result(test_generator, args)
        print('final test f1: %.4f' % (score))
    return float(dev_f1), float(score)
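
For reference, here is a minimal sketch of the command-line setup that main_global appears to expect. The flag names are inferred from the attributes the function reads; the types, defaults, and choices are assumptions, not the project's actual parser.

import argparse

def build_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default='data/')       # root of the dataset folders
    parser.add_argument('--batch', type=int, default=32)     # batch size for the data loaders
    parser.add_argument('--bert_fts', action='store_true')   # switch to the *_bertemb/ variants
    parser.add_argument('--trainon', default='forward',
                        choices=['forward', 'bothway', 'bothWselect'])
    return parser

# args.glove2vocab (a token-to-index mapping) is built elsewhere and attached
# to args before main_global(args) is called.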
Example #2

    def parallel_cv(self, split, emb=np.array([]), pos_emb=[], args=None):
        params = {'batch_size': args.batch, 'shuffle': False}
        if args.bert_fts:
            type_dir = "cv_bertemb"
        else:
            type_dir = "cv_shuffle" if args.cv_shuffle else 'cv'

        backward_dir = ""
        if (args.trainon == 'bothway') or (args.trainon == 'bothWselect'):
            if args.bert_fts:
                backward_dir = "%scv_backward_bertemb/fold%s/" % (
                    args.data_dir, split)
            else:
                backward_dir = "%scv_backward/fold%s/" % (args.data_dir, split)

        train_data = EventDataset(
            args.data_dir + '%s/fold%s/' % (type_dir, split), "train",
            args.glove2vocab, backward_dir, args.bert_fts)
        train_generator = get_data_loader(train_data, **params)
        dev_data = EventDataset(
            args.data_dir + '%s/fold%s/' % (type_dir, split), "dev",
            args.glove2vocab, backward_dir, args.bert_fts)
        dev_generator = get_data_loader(dev_data, **params)

        seeds = [0, 10, 20]
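        # train once per seed and average F1/epoch to smooth run-to-run variance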
        accumu_f1 = 0.
        accumu_epoch = 0.
        for seed in seeds:
            args.seed = seed
            f1, epoch = self._train(train_generator,
                                    dev_generator,
                                    emb,
                                    pos_emb,
                                    args,
                                    in_cv=True)
            accumu_f1 += f1
            accumu_epoch += epoch
        avg_f1 = accumu_f1 / float(len(seeds))
        avg_epoch = accumu_epoch / float(len(seeds))

        return avg_f1, avg_epoch
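
The method name suggests each cross-validation fold can be dispatched to its own worker. A plausible driver, assuming five folds and a picklable classifier, might look like the sketch below; this is an illustration, not the project's actual cross_validation code.

from functools import partial
from multiprocessing import Pool

def run_cv(classifier, args, n_folds=5):
    # one worker per fold; each parallel_cv call returns (avg_f1, avg_epoch)
    with Pool(n_folds) as pool:
        results = pool.map(partial(classifier.parallel_cv, args=args),
                           range(n_folds))
    avg_f1 = sum(f1 for f1, _ in results) / n_folds
    avg_epoch = sum(ep for _, ep in results) / n_folds
    return avg_f1, avg_epoch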
Example #3

def main_local(args):
    data_dir = args.data_dir
    params = {'batch_size': args.batch, 'shuffle': False}
    if args.bert_fts:
        type_dir = "all_bertemb/"
    else:
        type_dir = "all/"
    data_dir_back = ""
    if (args.trainon == 'bothway') or (args.trainon == 'bothWselect'):
        if args.bert_fts:
            data_dir_back = args.data_dir + "all_backward_bertemb/"
        else:
            data_dir_back = args.data_dir + "all_backward/"
    train_data = EventDataset(args.data_dir + type_dir, "train",
                              args.glove2vocab, data_dir_back, args.bert_fts)
    print('total train_data %s samples' % len(train_data))
    train_generator = get_data_loader(train_data, **params)
    dev_data = EventDataset(args.data_dir + type_dir, "dev", args.glove2vocab,
                            data_dir_back, args.bert_fts)
    print('total dev_data %s samples' % len(dev_data))
    dev_generator = get_data_loader(dev_data, **params)

    if args.bert_fts:
        data_dir_back = args.data_dir + "all_backward_bertemb/"
    else:
        data_dir_back = args.data_dir + "all_backward/"
    test_data = EventDataset(args.data_dir + type_dir, "test",
                             args.glove2vocab, data_dir_back, args.bert_fts)
    test_generator = get_data_loader(test_data, **params)

    models = [NNClassifier()]
    for model in models:
        dev_f1 = model.train_epoch(train_generator, dev_generator, args)
        evaluator = Evaluator(model)
        #evaluator.for_analysis(test_generator, args)
        score = evaluator.get_score(test_generator, args)
    return float(dev_f1), float(score)
Example #4
    def train_epoch(self, train_data, dev_data, args, test_data=None):
        if args.data_type == "matres":
            label_map = matres_label_map
        elif args.data_type == "tbd":
            label_map = tbd_label_map
        else:
            label_map = new_label_map
        all_labels = list(OrderedDict.fromkeys(label_map.values()))
        self._label_to_id = OrderedDict([(all_labels[l], l)
                                         for l in range(len(all_labels))])
        self._id_to_label = OrderedDict([(l, all_labels[l])
                                         for l in range(len(all_labels))])
        args.label_to_id = self._label_to_id
        if args.joint:
            label_map_c = causal_label_map
            all_labels_c = list(OrderedDict.fromkeys(label_map_c.values()))
            self._label_to_id_c = OrderedDict([
                (all_labels_c[l], l) for l in range(len(all_labels_c))
            ])
            self._id_to_label_c = OrderedDict([
                (l, all_labels_c[l]) for l in range(len(all_labels_c))
            ])

        emb = args.emb_array
        np.random.seed(0)
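        # prepend two random rows (presumably padding and OOV tokens) so that
        # embedding row indices line up with the glove2vocab mapping below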
        emb = np.vstack((np.random.uniform(0, 1, (2, emb.shape[1])), emb))
        assert emb.shape[0] == len(args.glove2vocab)
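        # one-hot POS-tag embeddings: an identity matrix with two extra slots,
        # presumably mirroring the padding/OOV convention used for word vectors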
        pos_emb = np.zeros((len(args.pos2idx) + 2, len(args.pos2idx) + 2))
        for i in range(pos_emb.shape[0]):
            pos_emb[i, i] = 1.0

        self.args = args
        selected_epoch = 20
        if args.cv:
            best_params, avg_epoch = self.cross_validation(
                emb, pos_emb, copy.deepcopy(args))
            for k, v in best_params.items():
                setattr(args, k, v)
            if args.write:
                with open(
                        'best_param/global_cv_bestparam_' +
                        str(args.data_type) + '_TrainOn' + str(args.trainon) +
                        '_Teston' + str(args.teston) + '_uf' +
                        str(args.usefeature) + '_trainpos' +
                        str(args.train_pos_emb) + '_joint' + str(args.joint) +
                        '_devbytrain' + str(args.devbytrain), 'w') as file:
                    for k, v in vars(args).items():
                        if (k != 'emb_array') and (k != 'glove2vocab'):
                            file.write(str(k) + '    ' + str(v) + '\n')
            selected_epoch = avg_epoch
        elif args.selectparam:
            best_params, best_epoch = self.selectparam(emb, pos_emb, args)
            for k, v in best_params.items():
                setattr(args, k, v)
            if args.write:
                with open(
                        'best_param/global_selectDev_bestparam_' +
                        str(args.data_type) + '_TrainOn' + str(args.trainon) +
                        '_Teston' + str(args.teston) + '_uf' +
                        str(args.usefeature) + '_trainpos' +
                        str(args.train_pos_emb) + '_joint' + str(args.joint) +
                        '_devbytrain' + str(args.devbytrain), 'w') as file:
                    for k, v in vars(args).items():
                        if (k != 'emb_array') and (k != 'glove2vocab'):
                            file.write(str(k) + '    ' + str(v) + '\n')
            selected_epoch = best_epoch

        if args.refit_all:
            args.epochs = int(selected_epoch)
            print('refit all.....')
            params = {'batch_size': args.batch, 'shuffle': False}
            if args.bert_fts:
                type_dir = "all_bertemb/"
            else:
                type_dir = 'all/'
            data_dir_back = ""
            if (args.trainon == 'bothway') or (args.trainon == 'bothWselect'):
                if args.bert_fts:
                    data_dir_back = args.data_dir + "all_backward_bertemb/"
                else:
                    data_dir_back = args.data_dir + "all_backward/"
            t_data = EventDataset(args.data_dir + type_dir, 'train',
                                  args.glove2vocab, data_dir_back,
                                  args.bert_fts)
            d_data = EventDataset(args.data_dir + type_dir, 'dev',
                                  args.glove2vocab, data_dir_back,
                                  args.bert_fts)
            t_data.merge_dataset(d_data)
            train_data = get_data_loader(t_data, **params)
            dev_data = []
        best_f1, best_epoch = self._train(train_data, dev_data, emb, pos_emb,
                                          args)
        print("Final Epoch Used: %s" % best_epoch)
        print("Final Dev F1: %.4f" % best_f1)
        return best_f1
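
As an aside, OrderedDict.fromkeys in the snippet above deduplicates label_map's values while keeping their first-seen order. A self-contained illustration with a made-up label map:

from collections import OrderedDict

# hypothetical label map: several raw relation names collapse onto one label
label_map = OrderedDict([('b', 'BEFORE'), ('a', 'AFTER'),
                         ('ib', 'BEFORE'), ('v', 'VAGUE')])
all_labels = list(OrderedDict.fromkeys(label_map.values()))
# all_labels == ['BEFORE', 'AFTER', 'VAGUE']
label_to_id = OrderedDict((all_labels[i], i) for i in range(len(all_labels)))
# label_to_id == OrderedDict([('BEFORE', 0), ('AFTER', 1), ('VAGUE', 2)])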
Example #5
File: test.py  Project: TIBHannover/VisE
def main():
    args = parse_args()
    level = logging.INFO
    if args.debug:
        level = logging.DEBUG

    logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s",
                        datefmt="%d-%m-%Y %H:%M:%S",
                        level=level)

    # load cfg
    if os.path.exists(args.cfg):
        with open(args.cfg) as f:
            cfg = yaml.load(f, Loader=yaml.FullLoader)
            logging.debug(cfg)
    else:
        logging.error(f"Cannot find cfg file: {args.cfg}")
        return 0

    # load ontology
    OntReader = OntologyReader(graph_file=os.path.join(
        os.path.dirname(args.cfg), cfg["graph"]),
                               weighting_scheme=cfg["weighting_scheme"],
                               leaf_node_weight=cfg["leaf_node_weight"])

    # init torch
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        batch_size = torch.cuda.device_count() * args.batch_size
    else:
        batch_size = args.batch_size

    # build model and load checkpoint
    if cfg["model_type"] == "ontology":
        weights = OntReader.get_node_weights(cfg["redundancy_removal"])
        num_classes = len(weights)
    else:  # cfg["model_type"] == "classification"
        num_classes = OntReader.num_leafs

    if torch.cuda.device_count() == 0:
        logging.info(f"Test on CPU with batch_size {batch_size}")
    else:
        logging.info(
            f"Test on {torch.cuda.device_count()} GPU(s) with batch_size {batch_size}"
        )

    model = ResNet50(num_classes=num_classes,
                     model_type=cfg["model_type"],
                     redundancy_removal=cfg["redundancy_removal"])
    model.to(device)

    if torch.cuda.device_count() > 1:
        logging.info(f"Found {torch.cuda.device_count()} GPUs")
        model = nn.DataParallel(model)

    model.eval()
    model.load(device=device,
               path=os.path.join(os.path.dirname(args.cfg),
                                 cfg["model_checkpoint"]))

    # Init testing dataset
    infer_dataset = EventDataset(image_dir=args.image_dir,
                                 testset_path=args.testset)
    infer_dataloader = DataLoader(infer_dataset,
                                  batch_size=batch_size,
                                  num_workers=8)

    # predict event classes for images
    sample_predictions = get_sample_predictions(
        infer_dataloader=infer_dataloader,
        OntReader=OntReader,
        model=model,
        device=device,
        s2l_strategy=args.s2l_strategy)

    # calculate result for all nodes in the ontology
    logging.info("Calculate results ...")
    node_results = get_test_results(sample_predictions=sample_predictions,
                                    OntReader=OntReader)

    # print final results (global results are stored in the root node occurrence (Q1190554))
    if "Q1190554" not in node_results:
        logging.warning("No results written ...")
        return 0

    print_results(node_results["Q1190554"]["metrics"],
                  node_results["Q1190554"]["num_test_images"])

    # write results for each node
    if args.output:
        if not os.path.exists(os.path.dirname(args.output)):
            os.makedirs(os.path.dirname(args.output))

        result_list = []
        for val in node_results.values():
            # calculate mean result
            for metric, result in val["metrics"].items():
                val["metrics"][metric] = result / val["num_test_images"]
            result_list.append(val)

        result_list = sorted(result_list,
                             key=lambda x: x["num_test_images"],
                             reverse=True)
        with open(args.output, "w") as jsonfile:
            for result in result_list:
                jsonfile.write(json.dumps(result) + "\n")

        logging.info(f"Results written to: {args.output}")

    return 0
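
For completeness, a plausible reconstruction of the parse_args this script relies on; the flag names follow the attributes read above, while the types and defaults are assumptions.

import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg', required=True)            # path to the YAML config
    parser.add_argument('--image_dir', required=True)      # directory containing the test images
    parser.add_argument('--testset', required=True)        # path to the test-set definition
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--s2l_strategy', default=None)    # strategy passed to get_sample_predictions
    parser.add_argument('--output', default=None)          # optional JSONL file for per-node results
    parser.add_argument('--debug', action='store_true')    # enable DEBUG-level logging
    return parser.parse_args()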