Example #1
    def train(self,
              docs: Iterable[Document],
              unlabeled_docs: Iterable[Document] = None,
              early_stopping_callback: Callable[[NERClassifier, int],
                                                bool] = lambda c, e: False):

        feature_computer = SyntacticFeatureComputer(
            self.props.get('morph_feats_list', DEFAULT_FEATS_LIST))
        # FuncIterable keeps the mapping lazy and re-iterable, so the docs can
        # be traversed once per epoch without storing the features eagerly
        precomputed_docs = FuncIterable(
            lambda: map(feature_computer.create_features_for_doc, docs))

        # build the feature extractor and its metadata from the precomputed docs
        char_padding_size = get_char_padding_size(self.props)
        feature_extractor, meta = generate_feature_extractor(
            precomputed_docs, self.props, char_padding_size)
        # only one task is configured, so exactly one graph is returned
        graph, = build_graphs_with_shared_encoder(self.props, meta, [
            build_task_graph_meta(self.props,
                                  feature_extractor.get_labels_size())
        ])

        init = tf.global_variables_initializer()
        self._session.run(init)

        # turn the precomputed docs into model-ready training samples
        samples = feature_extractor.extract_features_from_docs(
            precomputed_docs)
        saver = tf.train.Saver(save_relative_paths=True)

        if self.props.get("unify_similar_entities_types", False):
            processor = unify_types_of_similar_entities
        else:
            processor = None

        classifier = _Classifier(graph, feature_extractor, feature_computer,
                                 self._session, saver, processor)

        # batcher groups the samples into padded batches according to the props
        batcher_factory = get_batcher_from_props(
            samples, self.props["batcher"],
            feature_extractor.get_padding_value_and_rank, True, True)

        # per-epoch controllers: a decayed learning rate and a constant dropout rate
        train_meta = TaskTrainMeta(
            "NER", graph, batcher_factory, {
                "learning_rate":
                get_decayed_lr(self.props["learning_rate"],
                               self.props.get("lr_decay", 0)),
                "dropout_rate":
                get_const_controller(self.props.get("dropout", 1.0))
            }, classifier, early_stopping_callback)

        train_for_samples(self._session, self.props["epoch"], [train_meta])
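
The `early_stopping_callback` above receives the classifier and the epoch index; judging by the default `lambda c, e: False`, returning True presumably stops training early. A minimal sketch of a patience-based callback under that assumption (the `score_fn` parameter is a hypothetical helper, not part of the code above):

def make_early_stopping_callback(score_fn, patience=3):
    """Build a (classifier, epoch) -> bool callback that requests a stop
    after `patience` epochs without improvement. `score_fn(classifier)`
    must return a quality metric where higher is better."""
    state = {"best": float("-inf"), "bad_epochs": 0}

    def callback(classifier, epoch):
        score = score_fn(classifier)
        if score > state["best"]:
            state["best"] = score
            state["bad_epochs"] = 0
        else:
            state["bad_epochs"] += 1
        # True is assumed to mean "stop training now"
        return state["bad_epochs"] >= patience

    return callback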
Example #2
    def _init_metas(self, rel_ext_docs, unlabeled_docs):
        char_padding_size = get_char_padding_size(self.props)
        # labeled and unlabeled docs share one feature space, so they are
        # concatenated (lazily and re-iterably) for vocabulary building
        concatted_docs = FuncIterable(lambda: chain(
            rel_ext_docs, [] if unlabeled_docs is None else unlabeled_docs))

        shared_fe, shared_meta = generate_spans_common_feature_extractor(
            concatted_docs, self.props['shared'], char_padding_size)

        self._log_meta(shared_meta, "shared features")

        rel_ext_task_meta = self._init_rel_ext_task_meta(rel_ext_docs, shared_fe)
        parser_task_meta = self._init_parser_task_meta(unlabeled_docs, shared_fe)
        sdp_task_meta = self._init_sdp_task_meta(unlabeled_docs, shared_fe)

        # parser and SDP are optional auxiliary tasks; without their configs
        # the unlabeled docs cannot be used
        auxiliary_metas = list(
            filter(lambda x: x is not None, [parser_task_meta, sdp_task_meta]))
        if not auxiliary_metas and unlabeled_docs is not None:
            warn("Unlabeled docs provided without auxiliary configs")

        return shared_meta, rel_ext_task_meta, auxiliary_metas
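
`FuncIterable` appears throughout these examples. Plain generators and `map()`/`chain()` objects are exhausted after a single pass, while training needs to traverse the corpus once per epoch. Its implementation is not shown here; a minimal sketch of what such a wrapper presumably does:

class FuncIterable:
    """Re-iterable wrapper around a zero-argument factory of iterables.

    Every __iter__ call re-invokes the factory, so lazy pipelines built
    from map()/chain() can be traversed repeatedly, e.g. once per epoch.
    """

    def __init__(self, func):
        self._func = func

    def __iter__(self):
        return iter(self._func())

This matches how it is used above, e.g. `FuncIterable(lambda: chain(rel_ext_docs, ...))` in `_init_metas`.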
Example #3
    def train(self,
              docs: Iterable[Document],
              unlabeled_docs: Iterable[Document] = None,
              early_stopping_callback: Callable[[NETClassifier, int],
                                                bool] = lambda c, e: False):

        feature_computer = SyntacticFeatureComputer(
            self.props.get('morph_feats_list', DEFAULT_FEATS_LIST))

        if self.props.get("unify_similar_entities_types", False):
            grouper = chain_similar_entities
            get_bucket_for_sample = lambda s: int(s["chain_len"] == 1)
        else:
            grouper = chain_individual_entities
            get_bucket_for_sample = lambda s: s["seq_len"][0] // self.props[
                "batcher"]["bucket_length"]

        # group coreferent entities into chains and collapse the configured
        # entity types before feature computation
        grouper_collapser = _GrouperCollapser(
            CoreferenceChainGrouper(grouper),
            EntitiesCollapser(self.props.get("types_to_collapse", set()),
                              collapse_with_ne=True))

        # each doc is prepared once, then split into the collapsed doc itself
        # and its entity groups; all pipelines stay lazy and re-iterable
        docs_groups = FuncIterable(lambda: map(
            itemgetter(0, 1),
            map(grouper_collapser.prepare_doc_with_collapsing, docs)))
        collapsed_docs = FuncIterable(lambda: map(itemgetter(0), docs_groups))
        precomputed_docs = FuncIterable(lambda: map(
            feature_computer.create_features_for_doc, collapsed_docs))
        groups = FuncIterable(lambda: map(itemgetter(1), docs_groups))

        char_padding_size = get_char_padding_size(self.props)
        feature_extractor, metas, token_meta = generate_feature_extractor(
            precomputed_docs, self.props, char_padding_size)
        feature_extractor = GroupingFeatureExtractor(
            feature_extractor, group_level_features=["labels_mask"])

        # reuse the rel-ext graph machinery: NET is effectively a unary
        # relation extraction task
        task_graph_meta = NETTaskGraphMeta("NET", self.props, metas,
                                           feature_extractor.get_labels_size(),
                                           True)
        # only one task is configured, so exactly one graph is returned
        graph, = build_graphs_with_shared_encoder(
            self.props,
            token_meta, [build_task_graph_meta(task_graph_meta)],
            rank=3)

        init = tf.global_variables_initializer()
        self._session.run(init)

        # extract model-ready samples from the collapsed docs and their groups
        samples = list(
            feature_extractor.extract_features_from_docs(
                precomputed_docs, groups))
        saver = tf.train.Saver(save_relative_paths=True)

        classifier = _Classifier(graph, feature_extractor, feature_computer,
                                 self._session, saver, grouper_collapser)

        batcher_factory = get_batcher_from_props(
            samples, self.props["batcher"],
            feature_extractor.get_padding_value_and_rank, True, True,
            get_bucket_for_sample)

        train_meta = TaskTrainMeta(
            "NET", graph, batcher_factory, {
                "learning_rate":
                get_decayed_lr(self.props["learning_rate"],
                               self.props.get("lr_decay", 0)),
                "dropout_rate":
                get_const_controller(self.props.get("dropout", 1.0))
            }, classifier, early_stopping_callback)

        train_for_samples(self._session, self.props["epoch"], [train_meta])
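
Both `train()` methods pass per-epoch controllers for `learning_rate` and `dropout_rate` into `TaskTrainMeta`. The controller implementations are not shown; a plausible sketch, assuming a controller maps the epoch index to a value and that `lr_decay=0` yields a constant rate (the inverse-time schedule is an assumption, not the confirmed formula):

def get_const_controller(value):
    # same value at every epoch (used for the dropout rate above)
    return lambda epoch: value


def get_decayed_lr(initial_lr, decay):
    # hypothetical inverse-time decay; decay=0 reduces to a constant rate
    return lambda epoch: initial_lr / (1.0 + decay * epoch)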
Example #4
    def _create_common_fe(self, docs):

        char_padding_size = get_char_padding_size(self.props)

        return generate_token_feature_extractor(docs, self.props, char_padding_size)
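
All four examples derive `char_padding_size` from the props before building a feature extractor. The helper itself is not shown; a hypothetical sketch, assuming the padding exists so the widest character-level convolution kernel always sees a full window (the `char_kernel_sizes` key is an assumption, not confirmed by the examples above):

def get_char_padding_size(props):
    # hypothetical: pad each token's character sequence by half of the
    # largest char-CNN kernel size configured in props
    kernel_sizes = props.get("char_kernel_sizes", [3])
    return max(kernel_sizes) // 2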