def save_classifier(classifier, file_path="./model/"):
    """Saves classifier in the given location

    :param classifier: Model to save
    :param str file_path: Path to the folder where the model files are saved
    """
    # Create the target directory (strip a trailing slash from the path first)
    make_directory(file_path.rsplit('/', 1)[0])
    # Save classifier params
    with open(os.path.join(file_path, 'params.json'), 'w') as fp:
        params = {"class_name": type(classifier).__name__, "defences": classifier.defences}
        json.dump(params, fp)

    # Serialize model to JSON
    with open(os.path.join(file_path, "model.json"), "w") as json_file:
        model_json = classifier.model.to_json()
        json_file.write(model_json)

    # Serialize weights to HDF5
    classifier.model.save_weights(os.path.join(file_path, "weights.h5"))

    # Save compilation params to json
    if classifier.comp_param:
        with open(os.path.join(file_path, 'comp_par.json'), 'w') as fp:
            try:
                json.dump(classifier.comp_param, fp)
            except (TypeError, ValueError):
                # comp_param is not JSON-serializable; fall back to default compilation params
                fp.seek(0)
                json.dump({"loss": 'categorical_crossentropy', "optimizer": "sgd",
                           "metrics": ['accuracy']}, fp)
                fp.truncate()
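Every snippet on this page calls a make_directory helper whose definition is not shown. As a point of reference, here is a minimal sketch of what such a helper typically looks like, assuming it simply wraps os.makedirs and returns the resulting path (the fine-tuning examples below assign its return value and use it as a directory); this is a plausible reconstruction, not the original implementation.

import os

def make_directory(path):
    """Create path (and any missing parents) and return its resolved form."""
    real_path = os.path.realpath(path)
    os.makedirs(real_path, exist_ok=True)
    return real_path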
Example #2
def process(args):
    utils.make_directory(args.path['model'])
    tokenizer = args.tokenizer(args.path['vocab'])
    train_x = utils.read_lines(args.path['train_x'])
    train_y = utils.read_lines(args.path['train_y'])
    dataset = train_x + train_y
    keywords = None

    if args.problem == 'lda':
        model = LDAModel(args)
    else:
        trainset = [tokenizer.encode_line_into_words(i) for i in dataset]
        train_keywords(trainset, args.path['model'])
        keywords = load_keywords(args.path['model'])
        model = TFIDFModel(args)

    list_toks = []
    for n, line in enumerate(train_x):
        if not n % 10000 and n:
            utils.verbose('Tokenizing {} lines for {}'.format(n, args.problem))
        if keywords is None:
            list_toks.append([str(s) for s in tokenizer.encode_line_into_words(line)])
        else:
            list_toks.append([str(s) for s in tokenizer.encode_line_into_words(line)
                              if s in keywords[: args.num_keywords]])
    model.fit(list_toks)
Example #3
def run_ner():
    """run ner task"""
    args_opt = parse_args()
    epoch_num = args_opt.epoch_num
    assessment_method = args_opt.assessment_method.lower()
    load_pretrain_checkpoint_path = args_opt.load_pretrain_checkpoint_path
    save_finetune_checkpoint_path = args_opt.save_finetune_checkpoint_path
    load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path
    target = args_opt.device_target
    if target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
    elif target == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
        if bert_net_cfg.compute_type != mstype.float32:
            logger.warning('GPU only support fp32 temporarily, run with fp32.')
            bert_net_cfg.compute_type = mstype.float32
    else:
        raise Exception("Target error, GPU or Ascend is supported.")
    label_list = []
    with open(args_opt.label_file_path) as f:
        for label in f:
            label_list.append(label.strip())
    tag_to_index = convert_labels_to_index(label_list)
    if args_opt.use_crf.lower() == "true":
        max_val = max(tag_to_index.values())
        tag_to_index["<START>"] = max_val + 1
        tag_to_index["<STOP>"] = max_val + 2
        number_labels = len(tag_to_index)
    else:
        number_labels = args_opt.num_class
    if args_opt.do_train.lower() == "true":
        netwithloss = BertNER(bert_net_cfg, args_opt.train_batch_size, True, num_labels=number_labels,
                              use_crf=(args_opt.use_crf.lower() == "true"),
                              tag_to_index=tag_to_index, dropout_prob=0.1)
        ds = create_ner_dataset(batch_size=args_opt.train_batch_size, repeat_count=1,
                                assessment_method=assessment_method, data_file_path=args_opt.train_data_file_path,
                                schema_file_path=args_opt.schema_file_path,
                                do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
        do_train(ds, netwithloss, load_pretrain_checkpoint_path, save_finetune_checkpoint_path, epoch_num)

        if args_opt.do_eval.lower() == "true":
            if save_finetune_checkpoint_path == "":
                load_finetune_checkpoint_dir = _cur_dir
            else:
                load_finetune_checkpoint_dir = make_directory(save_finetune_checkpoint_path)
            load_finetune_checkpoint_path = LoadNewestCkpt(load_finetune_checkpoint_dir,
                                                           ds.get_dataset_size(), epoch_num, "ner")

    if args_opt.do_eval.lower() == "true":
        ds = create_ner_dataset(batch_size=args_opt.eval_batch_size, repeat_count=1,
                                assessment_method=assessment_method, data_file_path=args_opt.eval_data_file_path,
                                schema_file_path=args_opt.schema_file_path,
                                do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
        do_eval(ds, BertNER, args_opt.use_crf, number_labels, assessment_method,
                args_opt.eval_data_file_path, load_finetune_checkpoint_path, args_opt.vocab_file_path,
                args_opt.label_file_path, tag_to_index, args_opt.eval_batch_size)
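Example #3 builds its tag_to_index mapping with a convert_labels_to_index helper that is not shown here. The sketch below only illustrates the shape of the mapping the rest of the function expects (label string to integer index, so that max(tag_to_index.values()) and the extra <START>/<STOP> tags make sense); the project's real helper may do more, and this is an assumption rather than its actual code.

def convert_labels_to_index(label_list):
    """Map each label string to a distinct integer index (assumed behaviour)."""
    return {label: index for index, label in enumerate(label_list)}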
Example #4
def run_squad():
    """run squad task"""
    parser = argparse.ArgumentParser(description="run classifier")
    parser.add_argument("--device_target", type=str, default="Ascend", help="Device type, default is Ascend")
    parser.add_argument("--do_train", type=str, default="false", help="Eable train, default is false")
    parser.add_argument("--do_eval", type=str, default="false", help="Eable eval, default is false")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--epoch_num", type=int, default="1", help="Epoch number, default is 1.")
    parser.add_argument("--num_class", type=int, default="2", help="The number of class, default is 2.")
    parser.add_argument("--train_data_shuffle", type=str, default="true",
                        help="Enable train data shuffle, default is true")
    parser.add_argument("--eval_data_shuffle", type=str, default="false",
                        help="Enable eval data shuffle, default is false")
    parser.add_argument("--vocab_file_path", type=str, default="", help="Vocab file path")
    parser.add_argument("--eval_json_path", type=str, default="", help="Evaluation json file path, can be eval.json")
    parser.add_argument("--save_finetune_checkpoint_path", type=str, default="", help="Save checkpoint path")
    parser.add_argument("--load_pretrain_checkpoint_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--load_finetune_checkpoint_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--train_data_file_path", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--eval_data_file_path", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_file_path", type=str, default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()
    epoch_num = args_opt.epoch_num
    load_pretrain_checkpoint_path = args_opt.load_pretrain_checkpoint_path
    save_finetune_checkpoint_path = args_opt.save_finetune_checkpoint_path
    load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path

    if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower() == "false":
        raise ValueError("At least one of 'do_train' or 'do_eval' must be true")
    if args_opt.do_train.lower() == "true" and args_opt.train_data_file_path == "":
        raise ValueError("'train_data_file_path' must be set when do finetune task")
    if args_opt.do_eval.lower() == "true":
        if args_opt.eval_data_file_path == "":
            raise ValueError("'eval_data_file_path' must be set when do evaluation task")
        if args_opt.vocab_file_path == "":
            raise ValueError("'vocab_file_path' must be set when do evaluation task")
        if args_opt.eval_json_path == "":
            raise ValueError("'tokenization_file_path' must be set when do evaluation task")


    target = args_opt.device_target
    if target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
    elif target == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
        if bert_net_cfg.compute_type != mstype.float32:
            logger.warning('GPU only support fp32 temporarily, run with fp32.')
            bert_net_cfg.compute_type = mstype.float32
    else:
        raise Exception("Target error, GPU or Ascend is supported.")

    netwithloss = BertSquad(bert_net_cfg, True, 2, dropout_prob=0.1)

    if args_opt.do_train.lower() == "true":
        ds = create_squad_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                                  data_file_path=args_opt.train_data_file_path,
                                  schema_file_path=args_opt.schema_file_path,
                                  do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
        do_train(ds, netwithloss, load_pretrain_checkpoint_path, save_finetune_checkpoint_path, epoch_num)
        if args_opt.do_eval.lower() == "true":
            if save_finetune_checkpoint_path == "":
                load_finetune_checkpoint_dir = _cur_dir
            else:
                load_finetune_checkpoint_dir = make_directory(save_finetune_checkpoint_path)
            load_finetune_checkpoint_path = LoadNewestCkpt(load_finetune_checkpoint_dir,
                                                           ds.get_dataset_size(), epoch_num, "squad")

    if args_opt.do_eval.lower() == "true":
        ds = create_squad_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                                  data_file_path=args_opt.eval_data_file_path,
                                  schema_file_path=args_opt.schema_file_path, is_training=False,
                                  do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
        do_eval(ds, args_opt.vocab_file_path, args_opt.eval_json_path,
                load_finetune_checkpoint_path, bert_net_cfg.seq_length)
Example #5
    logits, z, _, _ = model(inputs)

    labels = torch.argmax(logits.data, 1)

    return labels.cpu().detach().numpy(), z[-1].cpu().detach().numpy()


if __name__ == "__main__":

    data_dir = "../DATA/nc/"
    model_dir = "results/iresnet/best/"
    save_dir = "results/iresnet/best/"

    save_dir_labels = os.path.join(save_dir, "predicted-label-masks")
    make_directory(save_dir_labels)

    save_dir_z = os.path.join(save_dir, "predicted-z", "z")
    save_dir_loc = os.path.join(save_dir, "predicted-z", "locations")

    make_directory(save_dir_z)
    make_directory(save_dir_loc)

    m = np.load(os.path.join(model_dir, "../mean.npy"))
    s = np.load(os.path.join(model_dir, "../std.npy"))

    # dataset loader
    tile_extr = TileExtractor()
    normalizer = Normalizer(m, s)
    dataset = CumuloDataset(root_dir="../DATA/nc/",
                            ext="nc",
Example #6
def run_classifier():
    """run classifier task"""
    parser = argparse.ArgumentParser(description="run classifier")
    parser.add_argument("--device_target",
                        type=str,
                        default="Ascend",
                        choices=["Ascend", "GPU"],
                        help="Device type, default is Ascend")
    parser.add_argument(
        "--assessment_method",
        type=str,
        default="Accuracy",
        choices=["Mcc", "Spearman_correlation", "Accuracy", "F1"],
        help="assessment_method including [Mcc, Spearman_correlation, Accuracy, F1], "
             "default is Accuracy")
    parser.add_argument("--do_train",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Enable train, default is false")
    parser.add_argument("--do_eval",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Enable eval, default is false")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--epoch_num",
                        type=int,
                        default="1",
                        help="Epoch number, default is 1.")
    parser.add_argument("--num_class",
                        type=int,
                        default="2",
                        help="The number of class, default is 2.")
    parser.add_argument("--train_data_shuffle",
                        type=str,
                        default="true",
                        choices=["true", "false"],
                        help="Enable train data shuffle, default is true")
    parser.add_argument("--eval_data_shuffle",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Enable eval data shuffle, default is false")
    parser.add_argument("--save_finetune_checkpoint_path",
                        type=str,
                        default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_pretrain_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--load_finetune_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--train_data_file_path",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--eval_data_file_path",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_file_path",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()
    epoch_num = args_opt.epoch_num
    assessment_method = args_opt.assessment_method.lower()
    load_pretrain_checkpoint_path = args_opt.load_pretrain_checkpoint_path
    save_finetune_checkpoint_path = args_opt.save_finetune_checkpoint_path
    load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path

    if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower(
    ) == "false":
        raise ValueError(
            "At least one of 'do_train' or 'do_eval' must be true")
    if args_opt.do_train.lower(
    ) == "true" and args_opt.train_data_file_path == "":
        raise ValueError(
            "'train_data_file_path' must be set when do finetune task")
    if args_opt.do_eval.lower(
    ) == "true" and args_opt.eval_data_file_path == "":
        raise ValueError(
            "'eval_data_file_path' must be set when do evaluation task")

    target = args_opt.device_target
    if target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="Ascend",
                            device_id=args_opt.device_id)
    elif target == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
        if bert_net_cfg.compute_type != mstype.float32:
            logger.warning('GPU only support fp32 temporarily, run with fp32.')
            bert_net_cfg.compute_type = mstype.float32
    else:
        raise Exception("Target error, GPU or Ascend is supported.")

    netwithloss = BertCLS(bert_net_cfg,
                          True,
                          num_labels=args_opt.num_class,
                          dropout_prob=0.1,
                          assessment_method=assessment_method)

    if args_opt.do_train.lower() == "true":
        ds = create_classification_dataset(
            batch_size=bert_net_cfg.batch_size,
            repeat_count=1,
            assessment_method=assessment_method,
            data_file_path=args_opt.train_data_file_path,
            schema_file_path=args_opt.schema_file_path,
            do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
        do_train(ds, netwithloss, load_pretrain_checkpoint_path,
                 save_finetune_checkpoint_path, epoch_num)

        if args_opt.do_eval.lower() == "true":
            if save_finetune_checkpoint_path == "":
                load_finetune_checkpoint_dir = _cur_dir
            else:
                load_finetune_checkpoint_dir = make_directory(
                    save_finetune_checkpoint_path)
            load_finetune_checkpoint_path = LoadNewestCkpt(
                load_finetune_checkpoint_dir, ds.get_dataset_size(), epoch_num,
                "classifier")

    if args_opt.do_eval.lower() == "true":
        ds = create_classification_dataset(
            batch_size=bert_net_cfg.batch_size,
            repeat_count=1,
            assessment_method=assessment_method,
            data_file_path=args_opt.eval_data_file_path,
            schema_file_path=args_opt.schema_file_path,
            do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
        do_eval(ds, BertCLS, args_opt.num_class, assessment_method,
                load_finetune_checkpoint_path)
Example #7
classifier.compile(comp_params)

if args.save is not False:
    if args.save:
        MODEL_PATH = os.path.abspath(args.save)
    else:
        if args.defences:
            defences = "-".join(args.defences)
        else:
            defences = ""
        MODEL_PATH = os.path.join(os.path.abspath(DATA_PATH), "classifiers",
                                  args.dataset, args.classifier, args.act,
                                  defences)

    v_print("Classifier saved in", MODEL_PATH)
    make_directory(MODEL_PATH)

    # Save best classifier weights
    # checkpoint = ModelCheckpoint(os.path.join(FILEPATH,"best-weights.{epoch:02d}-{val_acc:.2f}.h5"),
    #                              monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    checkpoint = ModelCheckpoint(os.path.join(MODEL_PATH, "best-weights.h5"),
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')

    # Remote monitor
    monitor = TensorBoard(log_dir=os.path.join(MODEL_PATH, 'logs'),
                          write_graph=False)
    callbacks_list = [checkpoint, monitor]
else:
Example #8
def run_ner():
    """run ner task"""
    parser = argparse.ArgumentParser(description="run classifier")
    parser.add_argument("--device_target",
                        type=str,
                        default="Ascend",
                        choices=["Ascend", "GPU"],
                        help="Device type, default is Ascend")
    parser.add_argument(
        "--assessment_method",
        type=str,
        default="F1",
        choices=["F1", "clue_benchmark"],
        help="assessment_method include: [F1, clue_benchmark], default is F1")
    parser.add_argument("--do_train",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Eable train, default is false")
    parser.add_argument("--do_eval",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Eable eval, default is false")
    parser.add_argument("--use_crf",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Use crf, default is false")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--epoch_num",
                        type=int,
                        default="1",
                        help="Epoch number, default is 1.")
    parser.add_argument("--num_class",
                        type=int,
                        default="2",
                        help="The number of class, default is 2.")
    parser.add_argument("--train_data_shuffle",
                        type=str,
                        default="true",
                        choices=["true", "false"],
                        help="Enable train data shuffle, default is true")
    parser.add_argument("--eval_data_shuffle",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Enable eval data shuffle, default is false")
    parser.add_argument("--vocab_file_path",
                        type=str,
                        default="",
                        help="Vocab file path, used in clue benchmark")
    parser.add_argument("--label2id_file_path",
                        type=str,
                        default="",
                        help="label2id file path, used in clue benchmark")
    parser.add_argument("--save_finetune_checkpoint_path",
                        type=str,
                        default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_pretrain_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--load_finetune_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--train_data_file_path",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--eval_data_file_path",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_file_path",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()
    epoch_num = args_opt.epoch_num
    assessment_method = args_opt.assessment_method.lower()
    load_pretrain_checkpoint_path = args_opt.load_pretrain_checkpoint_path
    save_finetune_checkpoint_path = args_opt.save_finetune_checkpoint_path
    load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path

    if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower(
    ) == "false":
        raise ValueError(
            "At least one of 'do_train' or 'do_eval' must be true")
    if args_opt.do_train.lower(
    ) == "true" and args_opt.train_data_file_path == "":
        raise ValueError(
            "'train_data_file_path' must be set when do finetune task")
    if args_opt.do_eval.lower(
    ) == "true" and args_opt.eval_data_file_path == "":
        raise ValueError(
            "'eval_data_file_path' must be set when do evaluation task")
    if args_opt.assessment_method.lower(
    ) == "clue_benchmark" and args_opt.vocab_file_path == "":
        raise ValueError("'vocab_file_path' must be set to do clue benchmark")
    if args_opt.use_crf.lower(
    ) == "true" and args_opt.label2id_file_path == "":
        raise ValueError("'label2id_file_path' must be set to use crf")
    if args_opt.assessment_method.lower(
    ) == "clue_benchmark" and args_opt.label2id_file_path == "":
        raise ValueError(
            "'label2id_file_path' must be set to do clue benchmark")

    target = args_opt.device_target
    if target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="Ascend",
                            device_id=args_opt.device_id)
    elif target == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
        if bert_net_cfg.compute_type != mstype.float32:
            logger.warning('GPU only support fp32 temporarily, run with fp32.')
            bert_net_cfg.compute_type = mstype.float32
    else:
        raise Exception("Target error, GPU or Ascend is supported.")

    tag_to_index = None
    if args_opt.use_crf.lower() == "true":
        with open(args_opt.label2id_file_path) as json_file:
            tag_to_index = json.load(json_file)
        max_val = max(tag_to_index.values())
        tag_to_index["<START>"] = max_val + 1
        tag_to_index["<STOP>"] = max_val + 2
        number_labels = len(tag_to_index)
    else:
        number_labels = args_opt.num_class
    netwithloss = BertNER(bert_net_cfg,
                          True,
                          num_labels=number_labels,
                          use_crf=(args_opt.use_crf.lower() == "true"),
                          tag_to_index=tag_to_index,
                          dropout_prob=0.1)
    if args_opt.do_train.lower() == "true":
        ds = create_ner_dataset(
            batch_size=bert_net_cfg.batch_size,
            repeat_count=1,
            assessment_method=assessment_method,
            data_file_path=args_opt.train_data_file_path,
            schema_file_path=args_opt.schema_file_path,
            do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
        do_train(ds, netwithloss, load_pretrain_checkpoint_path,
                 save_finetune_checkpoint_path, epoch_num)

        if args_opt.do_eval.lower() == "true":
            if save_finetune_checkpoint_path == "":
                load_finetune_checkpoint_dir = _cur_dir
            else:
                load_finetune_checkpoint_dir = make_directory(
                    save_finetune_checkpoint_path)
            load_finetune_checkpoint_path = LoadNewestCkpt(
                load_finetune_checkpoint_dir, ds.get_dataset_size(), epoch_num,
                "ner")

    if args_opt.do_eval.lower() == "true":
        ds = create_ner_dataset(
            batch_size=bert_net_cfg.batch_size,
            repeat_count=1,
            assessment_method=assessment_method,
            data_file_path=args_opt.eval_data_file_path,
            schema_file_path=args_opt.schema_file_path,
            do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
        do_eval(ds, BertNER, args_opt.use_crf, number_labels,
                assessment_method, args_opt.eval_data_file_path,
                load_finetune_checkpoint_path, args_opt.vocab_file_path,
                args_opt.label2id_file_path, tag_to_index)
Example #9
def run_ner():
    """run ner task"""
    args_opt = parse_args()
    epoch_num = args_opt.epoch_num
    assessment_method = "f1"
    load_pretrain_checkpoint_path = args_opt.load_pretrain_checkpoint_path
    save_finetune_checkpoint_path = args_opt.save_finetune_checkpoint_path
    load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path
    target = args_opt.device_target
    if target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="Ascend",
                            device_id=args_opt.device_id)
    elif target == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
        if ernie_net_cfg.compute_type != mstype.float32:
            logger.warning('GPU only support fp32 temporarily, run with fp32.')
            ernie_net_cfg.compute_type = mstype.float32
        if optimizer_cfg.optimizer == 'AdamWeightDecay' and args_opt.use_crf.lower(
        ) == "false":
            context.set_context(enable_graph_kernel=True)
    else:
        raise Exception("Target error, GPU or Ascend is supported.")
    with open(args_opt.label_map_config) as f:
        tag_to_index = json.load(f)
    number_labels = args_opt.number_labels
    if args_opt.do_train.lower() == "true":
        netwithloss = ErnieNER(ernie_net_cfg,
                               args_opt.train_batch_size,
                               True,
                               num_labels=number_labels,
                               use_crf=(args_opt.use_crf.lower() == "true"),
                               tag_to_index=tag_to_index,
                               dropout_prob=0.1)
        ds = create_finetune_dataset(
            batch_size=args_opt.train_batch_size,
            repeat_count=1,
            data_file_path=args_opt.train_data_file_path,
            schema_file_path=args_opt.schema_file_path,
            do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
        print("==============================================================")
        print("processor_name: {}".format(args_opt.device_target))
        print("test_name: ERNIE Finetune Training")
        print("model_name: {}".format("ERNIE+MLP+CRF" if args_opt.use_crf.
                                      lower() == "true" else "ERNIE + MLP"))
        print("batch_size: {}".format(args_opt.train_batch_size))

        do_train(ds, netwithloss, load_pretrain_checkpoint_path,
                 save_finetune_checkpoint_path, epoch_num)

        if args_opt.do_eval.lower() == "true":
            if save_finetune_checkpoint_path == "":
                load_finetune_checkpoint_dir = _cur_dir
            else:
                load_finetune_checkpoint_dir = make_directory(
                    save_finetune_checkpoint_path)
            load_finetune_checkpoint_path = LoadNewestCkpt(
                load_finetune_checkpoint_dir, ds.get_dataset_size(), epoch_num,
                "ner")

    if args_opt.do_eval.lower() == "true":
        ds = create_finetune_dataset(
            batch_size=args_opt.eval_batch_size,
            repeat_count=1,
            data_file_path=args_opt.eval_data_file_path,
            schema_file_path=args_opt.schema_file_path,
            do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
        do_eval(ds, ErnieNER, args_opt.use_crf, number_labels,
                assessment_method, args_opt.eval_data_file_path,
                load_finetune_checkpoint_path, args_opt.vocab_file_path,
                args_opt.label_file_path, tag_to_index,
                args_opt.eval_batch_size)
Example #10

def setUp(self):
    make_directory("./tests/")
Example #11
def run_squad():
    """run squad task"""
    parser = argparse.ArgumentParser(description="run squad")
    parser.add_argument("--device_target",
                        type=str,
                        default="Ascend",
                        choices=["Ascend", "GPU"],
                        help="Device type, default is Ascend")
    parser.add_argument("--do_train",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Eable train, default is false")
    parser.add_argument("--do_eval",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Eable eval, default is false")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--epoch_num",
                        type=int,
                        default=3,
                        help="Epoch number, default is 1.")
    parser.add_argument("--num_class",
                        type=int,
                        default=2,
                        help="The number of class, default is 2.")
    parser.add_argument("--train_data_shuffle",
                        type=str,
                        default="true",
                        choices=["true", "false"],
                        help="Enable train data shuffle, default is true")
    parser.add_argument("--eval_data_shuffle",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Enable eval data shuffle, default is false")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=32,
                        help="Train batch size, default is 32")
    parser.add_argument("--eval_batch_size",
                        type=int,
                        default=1,
                        help="Eval batch size, default is 1")
    parser.add_argument("--vocab_file_path",
                        type=str,
                        default="",
                        help="Vocab file path")
    parser.add_argument("--eval_json_path",
                        type=str,
                        default="",
                        help="Evaluation json file path, can be eval.json")
    parser.add_argument("--save_finetune_checkpoint_path",
                        type=str,
                        default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_pretrain_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--load_finetune_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--train_data_file_path",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_file_path",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()
    epoch_num = args_opt.epoch_num
    load_pretrain_checkpoint_path = args_opt.load_pretrain_checkpoint_path
    save_finetune_checkpoint_path = args_opt.save_finetune_checkpoint_path
    load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path

    if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower(
    ) == "false":
        raise ValueError(
            "At least one of 'do_train' or 'do_eval' must be true")
    if args_opt.do_train.lower(
    ) == "true" and args_opt.train_data_file_path == "":
        raise ValueError(
            "'train_data_file_path' must be set when do finetune task")
    if args_opt.do_eval.lower() == "true":
        if args_opt.vocab_file_path == "":
            raise ValueError(
                "'vocab_file_path' must be set when do evaluation task")
        if args_opt.eval_json_path == "":
            raise ValueError(
                "'tokenization_file_path' must be set when do evaluation task")

    target = args_opt.device_target
    if target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="Ascend",
                            device_id=args_opt.device_id)
    elif target == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
        if bert_net_cfg.compute_type != mstype.float32:
            logger.warning('GPU only support fp32 temporarily, run with fp32.')
            bert_net_cfg.compute_type = mstype.float32
    else:
        raise Exception("Target error, GPU or Ascend is supported.")

    netwithloss = BertSquad(bert_net_cfg, True, 2, dropout_prob=0.1)

    if args_opt.do_train.lower() == "true":
        ds = create_squad_dataset(
            batch_size=args_opt.train_batch_size,
            repeat_count=1,
            data_file_path=args_opt.train_data_file_path,
            schema_file_path=args_opt.schema_file_path,
            do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
        do_train(ds, netwithloss, load_pretrain_checkpoint_path,
                 save_finetune_checkpoint_path, epoch_num)
        if args_opt.do_eval.lower() == "true":
            if save_finetune_checkpoint_path == "":
                load_finetune_checkpoint_dir = _cur_dir
            else:
                load_finetune_checkpoint_dir = make_directory(
                    save_finetune_checkpoint_path)
            load_finetune_checkpoint_path = LoadNewestCkpt(
                load_finetune_checkpoint_dir, ds.get_dataset_size(), epoch_num,
                "squad")

    if args_opt.do_eval.lower() == "true":
        from src import tokenization
        from src.create_squad_data import read_squad_examples, convert_examples_to_features
        from src.squad_get_predictions import write_predictions
        from src.squad_postprocess import SQuad_postprocess
        tokenizer = tokenization.FullTokenizer(
            vocab_file=args_opt.vocab_file_path, do_lower_case=True)
        eval_examples = read_squad_examples(args_opt.eval_json_path, False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=bert_net_cfg.seq_length,
            doc_stride=128,
            max_query_length=64,
            is_training=False,
            output_fn=None,
            vocab_file=args_opt.vocab_file_path)
        ds = create_squad_dataset(
            batch_size=args_opt.eval_batch_size,
            repeat_count=1,
            data_file_path=eval_features,
            schema_file_path=args_opt.schema_file_path,
            is_training=False,
            do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
        outputs = do_eval(ds, load_finetune_checkpoint_path,
                          args_opt.eval_batch_size)
        all_predictions = write_predictions(eval_examples, eval_features,
                                            outputs, 20, 30, True)
        SQuad_postprocess(args_opt.eval_json_path,
                          all_predictions,
                          output_metrics="output.json")
Example #12
import numpy as np
import os
from shutil import copyfile

from src.datasets import CumuloDataset
from src.utils import make_directory

load_path = "data/npz/"
save_path = "datasets/cumulo-dc/"

THR = 1000 # select tiles that contain at least <threshold> pixels of class Deep Convection
LABEL = 7 # index corresponding to coarse class Deep Convection

dataset = CumuloDataset(load_path, ext="npz")

make_directory(save_path)

for instance in dataset:

    name, *_, mask, labels = instance

    dc_pixels = np.sum(np.logical_and(labels == LABEL, mask))

    if dc_pixels > THR:

        print(name, dc_pixels)
        copyfile(name, os.path.join(save_path, os.path.basename(name)))
Example #13
def predict_and_save(save_dir, model_path, swath):

    filename, tiles, locations, _, rois, _ = swath
    
    print("processing", filename)

    model = load_lgbm(model_path)

    labels = predict_tiles(model, tiles)

    save_path = os.path.join(save_dir, os.path.basename(filename)).replace(".nc", ".npy")

    save_labels(labels, locations, rois.squeeze(), save_path)

    print(save_path, "processed")

if __name__ == "__main__":

    import os

    model_path = "results/lgbm/lightgbm-model.txt"
    
    save_dir = os.path.join("results/lgbm/predicted-label-masks/")
    make_directory(save_dir)

    tile_extr = TileExtractor()
    dataset = CumuloDataset(root_dir="../DATA/nc/", ext="nc", label_preproc=None, tiler=tile_extr)
    
    for swath in dataset:
        predict_and_save(save_dir, model_path, swath)
Example #14
nb_iter_norm = 5

# shape of the input (channels, height, width)
in_shape = (13, t_size, t_size)

use_cuda = torch.cuda.is_available()
print("using GPUs?", use_cuda)

classification_weight = in_shape[0] * in_shape[1] * in_shape[2]

save_dir = "results/iresnet"

save_dir_best = os.path.join(save_dir, "best")
save_dir_last = os.path.join(save_dir, "last")

make_directory(save_dir_best)
make_directory(save_dir_last)

train_log = open(os.path.join(save_dir, "train_log.txt"), 'w')
val_log = open(os.path.join(save_dir, "val_log.txt"), 'w')
test_log = open(os.path.join(save_dir, "test_log.txt"), 'w')

# compute class weights and normalizer
try:
    class_weights = np.load(os.path.join(save_dir, "class-weights.npy"))
    m = np.load(os.path.join(save_dir, "mean.npy"))
    s = np.load(os.path.join(save_dir, "std.npy"))

except FileNotFoundError:
    # load dataset characteristics
    print("Computing dataset mean, standard deviation and class ratios")
Example #15

v_print = get_verbose_print(args.verbose)
alpha = 0.05  # constant for random perturbation

# get dataset
(X_train, Y_train), (X_test, Y_test), min_, max_ = load_dataset(args.dataset)

session = tf.Session()
k.set_session(session)

# Load classification model
MODEL_PATH = os.path.join(os.path.abspath(args.load), "")
classifier = load_classifier(MODEL_PATH, "best-weights.h5")

if args.save:
    SAVE_ADV = os.path.join(os.path.abspath(args.save), args.adv_method)
    make_directory(SAVE_ADV)

    with open(os.path.join(SAVE_ADV, "readme.txt"), "w") as wfile:
        wfile.write("Model used for crafting the adversarial examples is in " +
                    MODEL_PATH)

    v_print("Adversarials crafted with", args.adv_method, "on", MODEL_PATH,
            "will be saved in", SAVE_ADV)

if args.adv_method in ['fgsm', "vat", "rnd_fgsm"]:

    eps_ranges = {
        'fgsm': [e / 10 for e in range(1, 11)],
        'rnd_fgsm': [e / 10 for e in range(1, 11)],
        'vat': [1.5, 2.1, 5, 7, 10]
    }
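Example #15 restores a model with load_classifier(MODEL_PATH, "best-weights.h5"), whose definition is also not shown on this page. The sketch below mirrors the directory layout written by save_classifier in Example #1 (model.json, a weights file, comp_par.json); the body, and the use of Keras' model_from_json, are assumptions rather than the project's actual loader.

import json
import os

from keras.models import model_from_json

def load_classifier(file_path, weights_name="weights.h5"):
    """Rebuild a Keras model from the files written by save_classifier (sketch)."""
    # Restore the architecture from its JSON description
    with open(os.path.join(file_path, "model.json")) as json_file:
        model = model_from_json(json_file.read())
    # Restore the trained weights
    model.load_weights(os.path.join(file_path, weights_name))
    # Re-compile with the stored compilation parameters, if any were saved
    comp_path = os.path.join(file_path, "comp_par.json")
    if os.path.isfile(comp_path):
        with open(comp_path) as fp:
            model.compile(**json.load(fp))
    return model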
Example #16
def process(args):
    utils.make_directory(args.path['model'])
    tokenizer = args.tokenizer(args.path['vocab'])
    train_batch = args.batch(tokenizer, args.max_lens)
    train_batch.set_data(utils.read_lines(args.path['train_x']),
                         utils.read_lines(args.path['train_y']))
    dev_batch = args.batch(tokenizer, args.max_lens)
    dev_batch.set_data(utils.read_lines(args.path['dev_x']),
                       utils.read_lines(args.path['dev_y']))
    model = args.model(args)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_device
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(pad_step_number=True)
        recorder = Recorder()
        starter = time.time()

        for i in range(args.max_steps):
            input_x, input_y, idx, update_epoch = train_batch.next_batch(
                args.batch_size, recorder.train_idx)
            train_features = {
                'input_x_ph': input_x,
                'input_y_ph': input_y,
                'keep_prob_ph': args.keep_prob
            }
            recorder.train_idx = idx
            train_fetches, train_feed = model.train_step(train_features)
            _, train_loss, train_acc = sess.run(train_fetches, train_feed)
            recorder.train_losses.append(train_loss)
            recorder.train_accs.append(train_acc)

            if not i % args.show_steps and i:
                input_x, input_y, idx, update_epoch = dev_batch.next_batch(
                    args.batch_size, recorder.dev_idx)
                dev_features = {
                    'input_x_ph': input_x,
                    'input_y_ph': input_y,
                    'keep_prob_ph': 1.0
                }
                recorder.dev_idx = idx
                dev_fetches, dev_feed = model.dev_step(dev_features)
                dev_loss, dev_acc = sess.run(dev_fetches, dev_feed)
                recorder.dev_losses.append(dev_loss)
                recorder.dev_accs.append(dev_acc)
                speed = args.show_steps / (time.time() - starter)
                utils.verbose(
                    r'        step {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}] | speed {:.5f} it/s'.format(
                        i, train_loss, train_acc, dev_loss, dev_acc, speed))
                starter = time.time()

            if not i % args.save_steps and i:
                features = recorder.stats()
                if features['save']:
                    saver.save(sess, args.path['model'])
                utils.verbose(
                    r'step {:05d} - {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}]'.format(i - args.save_steps, i,
                                                  features['train_loss'],
                                                  features['train_acc'],
                                                  features['dev_loss'],
                                                  features['dev_acc']))
                print('-+' * 55)
                utils.write_result(args, recorder.lowest_loss)

        utils.verbose('Start building vector space from dual encoder model')
        vectors = []
        infer_batch = args.batch(tokenizer, args.max_lens)
        infer_batch.set_data(utils.read_lines(args.path['train_x']),
                             utils.read_lines(args.path['train_y']))
        starter = time.time()
        idx = 0
        update_epoch = False
        i = 0
        while not update_epoch:
            input_x, input_y, idx, update_epoch = infer_batch.next_batch(
                args.batch_size, idx)
            infer_features = {'input_x_ph': input_x, 'keep_prob_ph': 1.0}
            infer_fetches, infer_feed = model.infer_step(infer_features)
            enc_questions = sess.run(infer_fetches, infer_feed)
            vectors += enc_questions
            if not i % args.show_steps and i:
                speed = args.show_steps / (time.time() - starter)
                utils.verbose('step : {:05d} | speed: {:.5f} it/s'.format(
                    i, speed))
                starter = time.time()
            i += 1
    vectors = np.reshape(np.array(vectors),
                         [-1, args.hidden])[:infer_batch.data_size]
    vec_dim = vectors.shape[-1]
    ann = AnnoyIndex(vec_dim)
    for n, ii in enumerate(vectors):
        ann.add_item(n, ii)
    ann.build(args.num_trees)
    ann.save(args.path['ann'])
    utils.verbose('Annoy has been dump in {}'.format(args.path['ann']))
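Example #16 ends by building and saving an Annoy index over the encoded questions. Querying the saved index later typically looks like the sketch below; the angular metric is an assumption (it must match whatever metric the index was built with, and AnnoyIndex used angular by default when no metric was given):

from annoy import AnnoyIndex

def nearest_questions(index_path, query_vector, vec_dim, top_k=5):
    """Return ids and distances of the top_k stored vectors closest to query_vector."""
    ann = AnnoyIndex(vec_dim, 'angular')  # metric must match the build-time metric
    ann.load(index_path)
    return ann.get_nns_by_vector(query_vector, top_k, include_distances=True)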