示例#1
0
    def train_parser(cls, options, data_train=None, data_dev=None, data_test=None):
        """Full training loop: load data, train for ``options.epochs`` epochs,
        checkpoint the model after each epoch and evaluate on every dev file.

        Any of ``data_train`` / ``data_dev`` may be supplied pre-loaded to
        skip reading from the paths configured in ``options``.
        """
        set_proc_name(options.title)
        ensure_dir(options.output)
        log_path = os.path.join(
            options.output,
            "{}_{}_train.log".format(options.title, int(time.time())))
        log_to_file(log_path)
        logger.name = options.title
        cls.options_hook(options)
        DataFormatClass = cls.get_data_formats()[options.data_format]

        if data_train is None:
            data_train = DataFormatClass.from_file(options.conll_train)
        if data_dev is None:
            data_dev = {
                dev_path: DataFormatClass.from_file(dev_path, False)
                for dev_path in options.conll_dev
            }

        # ensure_dir above normally already created this; kept as a
        # best-effort retry for parity with the original behavior.
        try:
            os.makedirs(options.output)
        except OSError:
            pass

        parser = cls(options, data_train)
        shuffler = random.Random(1)  # fixed seed -> reproducible epoch order
        model_base = os.path.join(options.output, os.path.basename(options.model))

        def predict(sentences, gold_file, output_file):
            # Switch to inference mode, dump predictions, then score them
            # with the format's external evaluation program.
            options.is_train = False
            with open(output_file, "w") as f_output:
                if hasattr(DataFormatClass, "file_header"):
                    f_output.write(DataFormatClass.file_header + "\n")
                for parsed in parser.predict(sentences):
                    f_output.write(parsed.to_string())
            DataFormatClass.evaluate_with_external_program(gold_file, output_file)

        for epoch in range(options.epochs):
            logger.info('Starting epoch %d', epoch)
            shuffler.shuffle(data_train)
            options.is_train = True
            parser.train(data_train)

            # Drop checkpoints that fell out of the max_save window,
            # then save the checkpoint for this epoch.
            for stale in range(0, epoch - options.max_save):
                stale_path = model_base + str(stale + 1)
                if os.path.exists(stale_path):
                    os.remove(stale_path)
            parser.save(model_base + str(epoch + 1))

            for file_name, file_content in data_dev.items():
                base_name = os.path.basename(file_name)
                try:
                    prefix, suffix = base_name.rsplit(".", 1)
                except ValueError:
                    prefix, suffix = base_name, ""
                dev_output = os.path.join(
                    options.output,
                    '{}_epoch_{}.{}'.format(prefix, epoch + 1, suffix))
                predict(file_content, file_name, dev_output)
示例#2
0
    def train_parser(cls, options, data_train=None, data_dev=None, data_test=None):
        """Prepare logging, the output directory and the data sets, then
        delegate the actual training loop to ``cls.repeat_train_and_validate``.

        Any of ``data_train`` / ``data_dev`` / ``data_test`` may be supplied
        pre-loaded to skip reading from the paths configured in ``options``.

        Bug fix: a caller-supplied ``data_test`` used to be discarded — the
        old ``else: data_test = None`` branch overwrote it unconditionally.
        It is now kept as-is.
        """
        if sys.platform.startswith("linux"):
            # set_proc_name is only meaningful/available on Linux
            set_proc_name(options.title)
        ensure_dir(options.output)
        path = os.path.join(options.output, "{}_{}_train.log".format(options.title,
                                                                     int(time.time())))
        log_to_file(path)
        logger.name = options.title
        cls.options_hook(options)
        DataFormatClass = cls.get_data_formats()[options.data_format]

        if data_train is None:
            data_train = DataFormatClass.from_file(options.conll_train)

        if data_dev is None:
            data_dev = {i: DataFormatClass.from_file(i, False) for i in options.conll_dev}

        # Load the test set only when none was supplied and a path is
        # configured; a caller-provided data_test is respected.
        if data_test is None and options.conll_test is not None:
            data_test = DataFormatClass.from_file(options.conll_test, False)

        # Output dir may already exist (ensure_dir above); ignore that case.
        try:
            os.makedirs(options.output)
        except OSError:
            pass

        return cls.repeat_train_and_validate(data_train, data_dev, data_test, options)
示例#3
0
    def train_parser(cls, options, data_train=None, data_dev=None, data_test=None):
        """Train for ``options.epochs`` epochs, checkpointing after each epoch
        and predicting on every dev file; with ``options.epochs == 0`` it only
        predicts once, directly.

        Bug fix: when a dev file name has no extension, the output prefix is
        now its basename — previously the full path leaked into the output
        file name, producing a wrong/nested path under ``options.output``.
        """
        set_proc_name(options.title)
        ensure_dir(options.output)
        path = os.path.join(options.output, "{}_{}_train.log".format(options.title,
                                                                     int(time.time())))
        log_to_file(path)
        logger.name = options.title

        logger.info('Options:\n%s', pformat(options.__dict__))
        if data_train is None:
            data_train = cls.DataType.from_file(options.conll_train)

        if data_dev is None:
            data_dev = {i: cls.DataType.from_file(i, False) for i in options.conll_dev}

        # Output dir normally exists already (ensure_dir above).
        try:
            os.makedirs(options.output)
        except OSError:
            pass

        parser = cls(options, data_train)
        random_obj = random.Random(1)  # fixed seed -> reproducible shuffling

        def do_predict(epoch):
            # Predict every dev file into "<prefix>_epoch_<epoch>.<suffix>".
            for file_name, dev_sentences in data_dev.items():
                try:
                    prefix, suffix = os.path.basename(file_name).rsplit(".", 1)
                except ValueError:
                    # No extension: fall back to the basename
                    # (was the full path, which corrupted the output path).
                    prefix = os.path.basename(file_name)
                    suffix = ""

                dev_output = os.path.join(options.output, '{}_epoch_{}.{}'.format(prefix, epoch, suffix))
                cls.predict_and_output(parser, options, dev_sentences, dev_output)

        if options.epochs == 0:
            print("Predict directly.")
            do_predict(0)

        for epoch in range(options.epochs):
            logger.info('Starting epoch %d', epoch)
            random_obj.shuffle(data_train)
            parser.train(data_train)

            # save this epoch's model; delete models beyond the max_save window
            for i in range(0, epoch - options.max_save):
                path = os.path.join(options.output, os.path.basename(options.model)) + str(i + 1)
                if os.path.exists(path):
                    os.remove(path)
            path = os.path.join(options.output, os.path.basename(options.model)) + str(epoch + 1)
            parser.save(path)
            # NOTE(review): dev outputs are named with `epoch` while the
            # checkpoint uses `epoch + 1`; confirm the off-by-one is intended.
            do_predict(epoch)
示例#4
0
def get_lib():
    """Build the xEisner native library in a per-host build dir and load it.

    Runs cmake + make in ``libs/xEisner/build-<hostname>`` on every call
    (so source changes are always picked up), then loads the resulting
    shared object via ctypes.

    Returns:
        The loaded ``libxEisner.so`` as a ctypes CDLL handle.

    Raises:
        RuntimeError: if the build exits non-zero or produces no shared
            object. (Previously plain ``assert`` statements, which are
            stripped under ``python -O`` and would silently mask failures.)
    """
    hostname = platform.node()
    source_dir = os.path.join(os.path.dirname(__file__), "../libs", "xEisner")

    # separate build dir per host so a shared checkout works across machines
    build_dir = os.path.join(source_dir, "build-{}".format(hostname))
    ensure_dir(build_dir)
    lib_path = os.path.join(build_dir, "libxEisner.so")
    print("Building xEisner...")
    p = subprocess.Popen("MAX_SENTENCE_SIZE=128 cmake ../ -DCMAKE_BUILD_TYPE=Release && make -j4",
                         shell=True, cwd=build_dir)
    p.communicate()
    if p.returncode != 0:
        raise RuntimeError(
            "xEisner build failed with exit code {}".format(p.returncode))
    if not os.path.exists(lib_path):
        raise RuntimeError(
            "xEisner build succeeded but {} is missing".format(lib_path))
    return ctypes.cdll.LoadLibrary(lib_path)
    def __init__(
            self,
            model,
            hrg_statistics,  # type: HRGStatistics
            options):
        """Set up the scorer: label vocabulary, feature index, MLP and scales.

        :param model: parameter container handed to the base scorer
        :param hrg_statistics: corpus statistics providing nonterminals,
            structural edges and categories
        :param options: training options (activation, dims, output dirs, ...)
        """
        super(StructuredPeceptronHRGScorer, self).__init__(model)
        self.options = options
        self.activation = nn.activations[options.activation]

        # Label vocabulary: the 300 most common nonterminals, followed by
        # all structural edges and all categories.
        labels = [word for word, _ in hrg_statistics.nonterminals.most_common(300)]
        labels.extend(hrg_statistics.structural_edges)
        labels.extend(hrg_statistics.categories)
        self.edge_labels = labels

        self.possible_features = [("Edge", label) for label in self.edge_labels]
        # Logged before the two head-direction features are appended,
        # matching the original count.
        logger.info("Consider {} features as graph embedding".format(
            len(self.possible_features)))
        self.possible_features.extend(("head_left", "head_right"))
        self.feature_index = {
            feature: position
            for position, feature in enumerate(self.possible_features)
        }

        # First layer width: 2 * lstm_dims per span-LSTM layer, one slot
        # per possible feature, plus one extra input.
        input_dim = (options.lstm_dims * 2 * options.span_lstm_layers
                     + len(self.possible_features) + 1)
        dense_dims = [input_dim] + options.hrg_mlp_dims + [1]
        # No bias on the final (scoring) transform.
        use_bias = [True] * (len(dense_dims) - 2) + [False]

        self.dense_layer = nn.DenseLayers(self, dense_dims, self.activation,
                                          use_bias)
        self.count_scale = self.add_parameters((1, ))
        self.count_scale_2 = self.add_parameters((1, ))

        if self.options.conflict_output_dir:
            ensure_dir(self.options.conflict_output_dir)
示例#6
0
def k_fold_validation(train_file,
                      dev_file,
                      op,
                      FormatClass,
                      project_name,
                      outdir_prefix,
                      scheduler,
                      k=5,
                      prevent_redundant_preparation=True,
                      header=None):
    """Split train_file into k folds and schedule one training task per fold.

    Under ``<outdir_prefix>/<project_name>`` this writes:

    * ``<prefix>.<i>.<ext>``        -- the sentences of fold i
    * ``<prefix>.except-<i>.<ext>`` -- all sentences except fold i

    plus a hidden ``.<basename>.done`` marker so the (deterministic) split
    is not redone on later runs when ``prevent_redundant_preparation`` is
    True. Finally it registers k tasks on ``scheduler``, each training on
    the "except-i" file and validating on ``dev_file``.

    Bug fix: the 2k output files were opened without any try/finally, so an
    exception during writing leaked every handle; they are now always closed.

    :param op: base option dict; copied and patched per fold
    :param header: optional first line written to every generated file
    """
    train_file_basename = os.path.basename(train_file)
    train_file_prefix, _, ext = train_file_basename.rpartition(".")
    train_sents = FormatClass.from_file(train_file)
    project_dir = os.path.join(outdir_prefix, project_name)
    ensure_dir(project_dir)
    train_file_i = os.path.join(project_dir, train_file_prefix + ".{}." + ext)
    train_file_except_i = os.path.join(project_dir,
                                       train_file_prefix + ".except-{}." + ext)
    data_preparation_done_file = os.path.join(
        project_dir, "." + train_file_basename + ".done")

    # do data preparation
    if not prevent_redundant_preparation or not os.path.exists(
            data_preparation_done_file):
        train_sents_splitted = []
        for i in range(k):
            start = int(i * len(train_sents) / k)
            end = int((i + 1) * len(train_sents) / k)
            train_sents_splitted.append(train_sents[start:end])

        f_train_list = [open(train_file_i.format(i), "w") for i in range(k)]
        f_train_except_list = [
            open(train_file_except_i.format(i), "w") for i in range(k)
        ]
        try:  # guarantee every handle is closed even if a write fails
            if header is not None:
                for f_i in f_train_list:
                    f_i.write(header + "\n")
                for f_i in f_train_except_list:
                    f_i.write(header + "\n")
            for i, train_sents_i in enumerate(train_sents_splitted):
                for sent in train_sents_i:
                    # serialize once instead of once per output file
                    text = sent.to_string()
                    # fold i gets its own file; every other fold's
                    # "except" file also receives the sentence
                    f_train_list[i].write(text)
                    for j in range(k):
                        if j != i:
                            f_train_except_list[j].write(text)
        finally:
            for f_i in f_train_list + f_train_except_list:
                f_i.close()
        with open(data_preparation_done_file, "w") as f:
            f.write("Done!")
        logger.info("{}-fold data preparation done!".format(k))
    else:
        logger.info("No need to prepare {}-fold data.".format(k))

    # create training tasks
    for i in range(k):
        op_i = dict(op)
        op_i["train"] = train_file_except_i.format(i)
        op_i["dev"] = dev_file
        scheduler.add_options("except-{}".format(i), op_i, project_dir)