Пример #1
0
def create_input_pipeline(el_mode, model_folder, filenames):
    """Build the tf input pipeline for the given tfrecord files and restore a trained Model.

    Args:
        el_mode: True selects the entity-linking data (".../test/" folder) and the
            "el" checkpoint; False selects entity disambiguation (".../train/", "ed").
        model_folder: path of the model being loaded (used for logging only).
        filenames: tfrecord file names, relative to the experiment's test/train folder.

    Returns:
        (model, handles): the restored Model and one dataset string handle per
        input file; feed a handle into model.input_handle_ph to select the dataset.
    """
    tf.reset_default_graph()
    folder = config.base_folder + "data/tfrecords/" + args.experiment_name + \
        ("/test/" if el_mode else "/train/")
    datasets = [reader.test_input_pipeline([folder + name], args)
                for name in filenames]

    # Feedable-iterator pattern: one graph node reads from whichever dataset's
    # handle is fed through input_handle_ph at session.run time.
    input_handle_ph = tf.placeholder(tf.string, shape=[], name="input_handle_ph")
    iterator = tf.contrib.data.Iterator.from_string_handle(
        input_handle_ph, datasets[0].output_types, datasets[0].output_shapes)
    next_element = iterator.get_next()

    train_args = load_train_args(args.output_folder, "ensemble_eval")

    print("loading Model:", model_folder)
    train_args.entity_extension = args.entity_extension
    model = Model(train_args, next_element)
    model.build()
    model.input_handle_ph = input_handle_ph
    model.restore_session("el" if el_mode else "ed")

    # One-shot iterators fit better here than initializable ones: each test
    # dataset is consumed exactly once.
    handles = []
    for dataset in datasets:
        one_shot_iterator = dataset.make_one_shot_iterator()
        handles.append(model.sess.run(one_shot_iterator.string_handle()))
    return model, handles
Пример #2
0
def _parse_args():
    """Parse command line arguments for the gerbil / entity-universe run.

    Returns:
        (args, train_args): the parsed options plus the arguments restored
        from the training run. train_args is None when
        --build_entity_universe is set (only the entity set is built,
        no model is restored).
    """
    def _str2bool(value):
        # argparse's plain type=bool treats ANY non-empty string (including
        # "False") as True; parse the text explicitly instead.
        if isinstance(value, bool):
            return value
        if value.lower() in ("true", "t", "yes", "1"):
            return True
        if value.lower() in ("false", "f", "no", "0"):
            return False
        raise argparse.ArgumentTypeError("expected a boolean, got %r" % value)

    parser = argparse.ArgumentParser()
    parser.add_argument("--experiment_name",
                        default="per_document_no_wikidump",
                        help="under folder data/tfrecords/")
    parser.add_argument("--training_name",
                        default="doc_fixed_nowiki_evecsl2dropout")
    parser.add_argument("--all_spans_training", type=_str2bool, default=False)
    parser.add_argument("--el_mode", dest='el_mode', action='store_true')
    parser.add_argument("--ed_mode", dest='el_mode', action='store_false')
    parser.set_defaults(el_mode=True)

    parser.add_argument("--running_mode",
                        default=None,
                        help="el_mode or ed_mode, so"
                        "we can restore an ed_mode model and run it for el")

    parser.add_argument("--lowercase_spans_pem", type=_str2bool, default=False)

    parser.add_argument(
        "--entity_extension",
        default=None,
        help="extension_entities or extension_entities_all etc")

    # those are for building the entity set
    parser.add_argument("--build_entity_universe", type=_str2bool, default=False)
    parser.add_argument("--hardcoded_thr",
                        type=float,
                        default=None,
                        help="0, 0.2")
    parser.add_argument("--el_with_stanfordner_and_our_ed",
                        type=_str2bool,
                        default=False)

    parser.add_argument("--persons_coreference", type=_str2bool, default=False)
    parser.add_argument("--persons_coreference_merge",
                        type=_str2bool,
                        default=False)

    args = parser.parse_args()
    # Merging persons-coreference clusters implies the base persons-coreference.
    if args.persons_coreference_merge:
        args.persons_coreference = True
    print(args)
    if args.build_entity_universe:
        return args, None

    temp = "all_spans_" if args.all_spans_training else ""
    args.experiment_folder = config.base_folder + "data/tfrecords/" + args.experiment_name + "/"

    args.output_folder = config.base_folder+"data/tfrecords/" + \
                         args.experiment_name+"/{}training_folder/".format(temp) + \
                         args.training_name+"/"

    train_args = load_train_args(args.output_folder, "gerbil")
    train_args.entity_extension = args.entity_extension

    print(train_args)
    return args, train_args
def _parse_args():
    """Parse command line arguments for the ED/EL evaluation script.

    Returns:
        (args, train_args): the evaluation options (dataset strings split
        into lists, prediction folders resolved and created) and the
        arguments restored from the original training run.
    """
    def _str2bool(value):
        # argparse's plain type=bool treats ANY non-empty string (including
        # "False") as True; parse the text explicitly instead.
        if isinstance(value, bool):
            return value
        if value.lower() in ("true", "t", "yes", "1"):
            return True
        if value.lower() in ("false", "f", "no", "0"):
            return False
        raise argparse.ArgumentTypeError("expected a boolean, got %r" % value)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--experiment_name",
        default="alldatasets_perparagr",  #"standard",
        help="under folder data/tfrecords/")
    parser.add_argument("--training_name", help="under folder data/tfrecords/")
    parser.add_argument(
        "--checkpoint_model_num",
        default=None,
        help="e.g. give '7' or '4' if you want checkpoints/model-7 or model-4 "
        "to be restored")

    parser.add_argument(
        "--print_predictions",
        dest='print_predictions',
        action='store_true',
        help=
        "prints for each dataset the predictions to a file and compares with ground"
        "truth and simple baselines.")
    parser.add_argument("--no_print_predictions",
                        dest='print_predictions',
                        action='store_false')
    parser.set_defaults(print_predictions=True)
    parser.add_argument("--print_global_voters", type=_str2bool, default=False)
    parser.add_argument("--print_global_pairwise_scores",
                        type=_str2bool,
                        default=False)

    parser.add_argument(
        "--ed_datasets",
        default="aida_train.txt_z_aida_dev.txt_z_aida_test.txt_z_"
        "ace2004.txt_z_aquaint.txt_z_clueweb.txt_z_msnbc.txt_z_wikipedia.txt")
    parser.add_argument(
        "--el_datasets",
        default="aida_train.txt_z_aida_dev.txt_z_aida_test.txt_z_"
        "ace2004.txt_z_aquaint.txt_z_clueweb.txt_z_msnbc.txt_z_wikipedia.txt")

    parser.add_argument("--ed_val_datasets", default="1")
    parser.add_argument("--el_val_datasets", default="1")

    parser.add_argument(
        "--p_e_m_algorithm",
        type=_str2bool,
        default=False,
        help="Baseline. Doesn't use the NN but only the p_e_m dictionary for"
        "its predictions.")
    parser.add_argument("--all_spans_training", type=_str2bool, default=False)
    parser.add_argument(
        "--entity_extension",
        default=None,
        help="extension_entities or extension_entities_all etc")

    parser.add_argument("--debug", type=_str2bool, default=False)
    parser.add_argument(
        "--gm_bucketing_pempos",
        default=None,
        help="0_1_2_7  will create bins 0, 1, 2, [3,7], [8,inf)")
    parser.add_argument("--hardcoded_thr", type=float, default=None)
    args = parser.parse_args()

    temp = "all_spans_" if args.all_spans_training else ""
    args.output_folder = config.base_folder+"data/tfrecords/" + \
                         args.experiment_name+"/{}training_folder/".format(temp)+ \
                         args.training_name+"/"
    args.checkpoints_folder = args.output_folder + "checkpoints/"
    args.predictions_folder = args.output_folder + "predictions/"

    if args.p_e_m_algorithm:
        args.predictions_folder = args.output_folder + "p_e_m_predictions/"

    # Create the predictions folder and its ed/ and el/ subfolders if needed.
    if args.print_predictions and not os.path.exists(args.predictions_folder):
        os.makedirs(args.predictions_folder)
    if args.print_predictions and not os.path.exists(args.predictions_folder +
                                                     "ed/"):
        os.makedirs(args.predictions_folder + "ed/")
    if args.print_predictions and not os.path.exists(args.predictions_folder +
                                                     "el/"):
        os.makedirs(args.predictions_folder + "el/")

    train_args = load_train_args(args.output_folder, "evaluate")

    # "_z_" separates dataset file names inside one command line value.
    args.ed_datasets = args.ed_datasets.split(
        '_z_') if args.ed_datasets != "" else None
    args.el_datasets = args.el_datasets.split(
        '_z_') if args.el_datasets != "" else None
    args.ed_val_datasets = [int(x) for x in args.ed_val_datasets.split('_')]
    args.el_val_datasets = [int(x) for x in args.el_val_datasets.split('_')]
    args.gm_bucketing_pempos = [
        int(x) for x in args.gm_bucketing_pempos.split('_')
    ] if args.gm_bucketing_pempos else []

    print(args)
    return args, train_args
Пример #4
0
def _parse_args():
    """Parse command line arguments for the training script.

    Resolves output/checkpoints/summaries folders, splits the "_z_"-joined
    dataset strings into lists, and converts the "int_int" ffnn descriptors
    into integer lists.

    Returns:
        args namespace with all derived fields filled in. NOTE: when
        --continue_training is set, the arguments restored from the latest
        checkpoint are returned instead (only experiment_name and
        training_name from the command line are used in that case).
    """
    def _str2bool(value):
        # argparse's plain type=bool treats ANY non-empty string (including
        # "False") as True; parse the text explicitly instead.
        if isinstance(value, bool):
            return value
        if value.lower() in ("true", "t", "yes", "1"):
            return True
        if value.lower() in ("false", "f", "no", "0"):
            return False
        raise argparse.ArgumentTypeError("expected a boolean, got %r" % value)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--experiment_name",
        default="alldatasets_perparagr",  #"standard",
        help="under folder data/tfrecords/")
    parser.add_argument("--training_name",
                        default=None,
                        help="under folder data/tfrecords/")
    parser.add_argument("--shuffle_capacity", type=int, default=500)
    parser.add_argument("--debug", type=_str2bool, default=False)

    parser.add_argument("--nepoch_no_imprv", type=int, default=5)
    parser.add_argument(
        "--improvement_threshold",
        type=float,
        default=0.3,
        help="if improvement less than this then"
        "it is considered not significant and we have early stopping.")
    parser.add_argument("--clip",
                        type=int,
                        default=-1,
                        help="if negative then no clipping")
    parser.add_argument("--lr_decay",
                        type=float,
                        default=-1.0,
                        help="if negative then no decay")
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--lr_method", default="adam")
    parser.add_argument("--batch_size", type=int, default=10)
    parser.add_argument("--dropout", type=float, default=0.5)
    parser.add_argument("--train_ent_vecs",
                        dest='train_ent_vecs',
                        action='store_true')
    parser.add_argument("--no_train_ent_vecs",
                        dest='train_ent_vecs',
                        action='store_false')
    parser.set_defaults(train_ent_vecs=False)

    parser.add_argument("--steps_before_evaluation", type=int, default=10000)
    parser.add_argument("--evaluation_minutes",
                        type=int,
                        default=15,
                        help="every this number of minutes pause"
                        " training and run an evaluation epoch")
    parser.add_argument("--dim_char", type=int, default=100)
    parser.add_argument("--hidden_size_char",
                        type=int,
                        default=100,
                        help="lstm on chars")
    parser.add_argument("--hidden_size_lstm",
                        type=int,
                        default=300,
                        help="lstm on word embeddings")

    parser.add_argument("--use_chars",
                        dest="use_chars",
                        action='store_true',
                        help="use character embeddings or not")
    parser.add_argument("--no_use_chars",
                        dest="use_chars",
                        action='store_false')
    parser.set_defaults(use_chars=True)
    parser.add_argument(
        "--model_heads_from_bilstm",
        type=_str2bool,
        default=False,
        help=
        "use the bilstm vectors for the head instead of the word embeddings")
    parser.add_argument(
        "--span_boundaries_from_wordemb",
        type=_str2bool,
        default=False,
        help="instead of using the "
        "output of contextual bilstm for start and end of span we use word+char emb"
    )
    parser.add_argument("--span_emb",
                        default="boundaries_head",
                        help="boundaries for start and end, and head")

    parser.add_argument("--max_mention_width", type=int, default=10)
    parser.add_argument("--use_features",
                        type=_str2bool,
                        default=False,
                        help="like mention width")
    parser.add_argument(
        "--feature_size", type=int,
        default=20)  # each width is represented by a vector of that size

    parser.add_argument("--ent_vecs_regularization",
                        default="l2dropout",
                        help="'no', "
                        "'dropout', 'l2', 'l2dropout'")

    parser.add_argument(
        "--span_emb_ffnn",
        default="0_0",
        help="int_int  the first int"
        "indicates the number of hidden layers and the second the hidden size"
        "so 2_100 means 2 hidden layers of width 100 and then projection to output size"
        ". 0_0 means just projecting without hidden layers")
    parser.add_argument("--final_score_ffnn",
                        default="1_100",
                        help="int_int  look span_emb_ffnn")

    parser.add_argument("--gamma_thr", type=float, default=0.2)

    parser.add_argument("--nocheckpoints", type=_str2bool, default=False)
    parser.add_argument("--checkpoints_num",
                        type=int,
                        default=1,
                        help="maximum number of checkpoints to keep")

    parser.add_argument("--ed_datasets", default="")
    parser.add_argument("--ed_val_datasets",
                        default="1",
                        help="based on these datasets pick the optimal"
                        "gamma thr and also consider early stopping")
    #--ed_val_datasets=1_4  # aida_dev, aquaint
    parser.add_argument("--el_datasets", default="")
    parser.add_argument(
        "--el_val_datasets",
        default="1")  #--el_val_datasets=1_4   # aida_dev, aquaint

    parser.add_argument("--train_datasets",
                        default="HIPE-data-v1.0-train-de.txt")
    #--train_datasets=aida_train.txt_z_wikidumpRLTD.txt

    parser.add_argument(
        "--continue_training",
        type=_str2bool,
        default=False,
        help="if true then just restore the previous command line"
        "arguments and continue the training in exactly the"
        "same way. so only the experiment_name and "
        "training_name are used from here. Retrieve values from"
        "latest checkpoint.")
    parser.add_argument("--onleohnard", type=_str2bool, default=False)

    parser.add_argument(
        "--comment",
        default="",
        help="put any comment here that describes your experiment"
        ", for logging purposes only.")

    parser.add_argument("--all_spans_training", type=_str2bool, default=False)
    parser.add_argument(
        "--fast_evaluation",
        type=_str2bool,
        default=False,
        help="if all_spans training then evaluate only"
        "on el tests, corresponding if gm training evaluate only on ed tests.")

    parser.add_argument(
        "--entity_extension",
        default=None,
        help="extension_entities or extension_entities_all etc")

    parser.add_argument(
        "--nn_components",
        default="pem_lstm",
        help="each option is one scalar, then these are fed to"
        "the final ffnn and we have the final score. choose any combination you want: e.g"
        "pem_lstm_attention_global, pem_attention, lstm_attention, pem_lstm_global, etc"
    )
    parser.add_argument("--attention_K",
                        type=int,
                        default=100,
                        help="K from left and K from right, in total 2K")
    parser.add_argument("--attention_R",
                        type=int,
                        default=30,
                        help="hard attention")
    parser.add_argument("--attention_use_AB", type=_str2bool, default=False)
    parser.add_argument(
        "--attention_on_lstm",
        type=_str2bool,
        default=False,
        help="instead of using attention on"
        "original pretrained word embedding. use it on vectors or lstm, "
        "needs also projection now the context vector x_c to 300 dimensions")
    parser.add_argument("--attention_ent_vecs_no_regularization",
                        type=_str2bool,
                        default=False)
    parser.add_argument(
        "--attention_retricted_num_of_entities",
        type=int,
        default=None,
        help=
        "instead of using 30 entities for creating the context vector we use only"
        "the top x number of entities for reducing noise.")
    parser.add_argument("--global_thr", type=float,
                        default=0.1)  # 0.0, 0.05, -0.05, 0.2
    parser.add_argument("--global_mask_scale_each_mention_voters_to_one",
                        type=_str2bool,
                        default=False)
    parser.add_argument("--global_topk", type=int, default=None)
    parser.add_argument("--global_gmask_based_on_localscore",
                        type=_str2bool,
                        default=False)  # new
    parser.add_argument("--global_topkthr", type=float,
                        default=None)  # 0.0, 0.05, -0.05, 0.2
    parser.add_argument("--global_score_ffnn",
                        default="1_100",
                        help="int_int  look span_emb_ffnn")
    parser.add_argument("--global_one_loss", type=_str2bool, default=False)
    parser.add_argument("--global_norm_or_mean", default="norm")
    parser.add_argument("--global_topkfromallspans", type=int, default=None)
    parser.add_argument("--global_topkfromallspans_onlypositive",
                        type=_str2bool,
                        default=False)
    parser.add_argument("--global_gmask_unambigious", type=_str2bool, default=False)

    parser.add_argument(
        "--hardcoded_thr",
        type=float,
        default=None,
        help="if this is specified then we don't calculate"
        "optimal threshold based on the dev dataset but use this one.")
    parser.add_argument("--ffnn_dropout",
                        dest="ffnn_dropout",
                        action='store_true')
    parser.add_argument("--no_ffnn_dropout",
                        dest="ffnn_dropout",
                        action='store_false')
    parser.set_defaults(ffnn_dropout=True)
    parser.add_argument(
        "--ffnn_l2maxnorm",
        type=float,
        default=None,
        help="if positive"
        " then bound the Frobenius norm <= value for the weight tensor of the "
        "hidden layers and the output layer of the FFNNs")
    parser.add_argument("--ffnn_l2maxnorm_onlyhiddenlayers",
                        type=_str2bool,
                        default=False)

    parser.add_argument(
        "--cand_ent_num_restriction",
        type=int,
        default=None,
        help="for reducing memory usage and"
        "avoiding OOM errors in big NN I can reduce the number of candidate ent for each span"
    )
    # --ed_datasets=  --el_datasets="aida_train.txt_z_aida_dev.txt"     which means i can leave something empty
    # and i can also put "" in the cla

    parser.add_argument("--no_p_e_m_usage",
                        type=_str2bool,
                        default=False,
                        help="use similarity score instead of "
                        "final score for prediction")
    parser.add_argument("--pem_without_log", type=_str2bool, default=False)
    parser.add_argument(
        "--pem_buckets_boundaries",
        default=None,
        help="example: 0.03_0.1_0.2_0.3_0.4_0.5_0.6_0.7_0.8_0.9_0.99")
    # the following two command line arguments
    parser.add_argument("--gpem_without_log", type=_str2bool, default=False)
    parser.add_argument(
        "--gpem_buckets_boundaries",
        default=None,
        help="example: 0.03_0.1_0.2_0.3_0.4_0.5_0.6_0.7_0.8_0.9_0.99")
    parser.add_argument(
        "--stage2_nn_components",
        default="local_global",
        help="each option is one scalar, then these are fed to"
        "the final ffnn and we have the final score. choose any combination you want: e.g"
        "pem_local_global, pem_global, local_global, global, etc")
    parser.add_argument("--ablations", type=_str2bool, default=False)
    args = parser.parse_args()

    # Default training name: a timestamp, so each run gets a fresh folder.
    if args.training_name is None:
        from datetime import datetime
        args.training_name = "{:%d_%m_%Y____%H_%M}".format(datetime.now())

    temp = "all_spans_" if args.all_spans_training else ""
    args.output_folder = config.base_folder+"data/tfrecords/" + \
                         args.experiment_name+"/{}training_folder/".format(temp)+\
                         args.training_name+"/"

    if args.continue_training:
        print("continue training...")
        train_args = load_train_args(args.output_folder, "train_continue")
        return train_args
    args.running_mode = "train"  # "evaluate"  "ensemble_eval"  "gerbil"

    # Starting a fresh run on an existing folder deletes it (after a grace
    # period so the user can abort with CTRL+C).
    if os.path.exists(args.output_folder) and not args.continue_training:
        print(
            "!!!!!!!!!!!!!!\n"
            "experiment: ", args.output_folder,
            "already exists and args.continue_training=False."
            "folder will be deleted in 20 seconds. Press CTRL+C to prevent it."
        )
        time.sleep(20)
        import shutil
        shutil.rmtree(args.output_folder)

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    args.checkpoints_folder = args.output_folder + "checkpoints/"
    if args.onleohnard:
        args.checkpoints_folder = "/cluster/home/nkolitsa/checkpoints_folder/"+\
            args.experiment_name + "/" + args.training_name + "/"

    args.summaries_folder = args.output_folder + "summaries/"
    if not os.path.exists(args.summaries_folder):
        os.makedirs(args.summaries_folder)

    # "_z_" separates dataset file names inside one command line value.
    args.ed_datasets = args.ed_datasets.split(
        '_z_') if args.ed_datasets != "" else None
    args.el_datasets = args.el_datasets.split(
        '_z_') if args.el_datasets != "" else None
    args.train_datasets = args.train_datasets.split(
        '_z_') if args.train_datasets != "" else None

    args.ed_val_datasets = [int(x) for x in args.ed_val_datasets.split('_')]
    args.el_val_datasets = [int(x) for x in args.el_val_datasets.split('_')]

    # Convert "layers_width" descriptors to [layers, width] integer lists.
    args.span_emb_ffnn = [int(x) for x in args.span_emb_ffnn.split('_')]
    args.final_score_ffnn = [int(x) for x in args.final_score_ffnn.split('_')]
    args.global_score_ffnn = [
        int(x) for x in args.global_score_ffnn.split('_')
    ]

    args.eval_cnt = 0
    args.zero = 1e-6

    if args.pem_buckets_boundaries:
        args.pem_buckets_boundaries = [
            float(x) for x in args.pem_buckets_boundaries.split('_')
        ]
    if args.gpem_buckets_boundaries:
        args.gpem_buckets_boundaries = [
            float(x) for x in args.gpem_buckets_boundaries.split('_')
        ]

    if args.fast_evaluation:
        if args.all_spans_training:  # destined for el so omit the evaluation on ed
            args.ed_datasets = None
        else:
            args.el_datasets = None
    return args