Example #1
    def train(self,
              docs: List[Document],
              unlabeled_docs: Iterable[Document] = None,
              early_stopping_callback: Callable[[RelExtClassifier, int],
                                                bool] = lambda c, e: False):

        collapsed_docs = FuncIterable(
            lambda: map(self._collapser.transform, docs))
        prec_docs = self._get_precomputed_docs(collapsed_docs,
                                               self._feature_computer)
        if unlabeled_docs is not None:
            unlabeled_docs = self._get_precomputed_docs(
                unlabeled_docs, self._syntactic_fc)

        shared_meta, rel_ext_meta, auxiliary_metas = self._init_metas(
            prec_docs, unlabeled_docs)

        print("Extracting features")
        rel_ext_samples = rel_ext_meta.feature_extractor.extract_features_from_docs(
            prec_docs)
        auxiliary_samples = \
            [list(task_meta.feature_extractor.extract_features_from_docs(unlabeled_docs))
             for task_meta in auxiliary_metas]

        self._build_and_train(shared_meta, rel_ext_meta, rel_ext_samples,
                              auxiliary_metas, auxiliary_samples,
                              early_stopping_callback)
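
FuncIterable appears in nearly every snippet in this listing but is never defined here. The sketch below is a minimal reconstruction of its assumed behaviour, not the actual implementation: it wraps a zero-argument factory so that the wrapped sequence can be iterated more than once.

class FuncIterable:
    """Assumed behaviour: wrap a zero-argument callable that builds a fresh
    iterator on every call, so the result stays re-iterable even when the
    underlying source (a generator or a map object) is single-pass."""

    def __init__(self, func):
        self._func = func

    def __iter__(self):
        return iter(self._func())


# re-iteration works because every iter() call rebuilds the generator
squares = FuncIterable(lambda: (i * i for i in range(4)))
assert list(squares) == list(squares) == [0, 1, 4, 9]
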
Example #2
def generate_feature_extractor(docs: Iterable[Document], props: dict, char_padding_size: int = 0):
    types_to_unquote = props.get("types_to_unquote", [])
    unquote_prob = props.get("prob_to_unquote", 0.0)

    if types_to_unquote and unquote_prob:
        # concatenate the augmented docs with the original ones so that all possible features are seen by the feature extractor factories
        augmentor = EntitiesUnquoteAugmentor(1.0, types_to_unquote)
        prev_docs = docs
        docs = FuncIterable(lambda: chain(prev_docs, map(augmentor.transform, prev_docs)))

    token_feature_extractor, token_features_meta = generate_token_feature_extractor(docs, props, char_padding_size)

    ne_feature_extractor, ne_meta = generate_ne_feature_extractor(docs, props)
    token_features_meta.basic_meta += ne_meta

    ent_types = collect_entities_types(docs)

    labelling_strategy = get_labelling_strategy(props.get("labelling_strategy", "BIO"))
    labels_converter = create_categorical_converter(
        labelling_strategy.get_possible_categories(ent_types),
        zero_padding=False
    )
    prob_augmentor = EntitiesUnquoteAugmentor(unquote_prob, types_to_unquote)
    feature_extractor = NERFeatureExtractor(
        token_feature_extractor, ne_feature_extractor, labelling_strategy, labels_converter, prob_augmentor)

    return feature_extractor, token_features_meta
Example #3
    def __enter__(self):
        stop_iter = False
        with open(self.path, 'wb') as f:
            while not stop_iter:
                try:
                    msgpack.dump(next(self.iterator), f)
                except StopIteration:
                    stop_iter = True

        return FuncIterable(lambda: _mp_iterate(self.path))
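
The _mp_iterate helper returned from __enter__ is not shown in this listing. A plausible sketch, assuming the msgpack-python package and that the helper simply streams back the objects that __enter__ serialized:

import msgpack


def _mp_iterate(path):
    # hypothetical reconstruction: stream back the objects written with
    # msgpack.dump, one at a time, without loading the whole file into memory
    with open(path, 'rb') as f:
        for obj in msgpack.Unpacker(f, raw=False):
            yield obj
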
Example #4
    def extract_features_from_docs(self, docs) -> Iterable:
        def extract_samples(doc):
            ent_samples = self.extract_features_from_doc(doc,
                                                         include_labels=True)
            return [
                sample for _, sample in ent_samples
                if isinstance(sample, dict)
            ]

        return FuncIterable(
            lambda: chain.from_iterable(map(extract_samples, docs)))
Example #5
    def extract_features_from_docs(self, docs, docs_groups) -> Iterable:
        def extract_samples(doc, groups):
            group_samples = self.extract_features_from_doc(doc,
                                                           groups,
                                                           include_labels=True)
            return [
                sample for _, sample in group_samples
                if isinstance(sample, dict)
            ]

        return FuncIterable(lambda: chain.from_iterable(
            starmap(extract_samples, zip(docs, docs_groups))))
Example #6
    def train(self,
              docs: Iterable[Document],
              unlabeled_docs: Iterable[Document] = None,
              early_stopping_callback: Callable[[NERClassifier, int],
                                                bool] = lambda c, e: False):

        feature_computer = SyntacticFeatureComputer(
            self.props.get('morph_feats_list', DEFAULT_FEATS_LIST))
        precomputed_docs = FuncIterable(
            lambda: map(feature_computer.create_features_for_doc, docs))

        char_padding_size = get_char_padding_size(self.props)
        feature_extractor, meta = generate_feature_extractor(
            precomputed_docs, self.props, char_padding_size)
        # we have only one graph
        graph, = build_graphs_with_shared_encoder(self.props, meta, [
            build_task_graph_meta(self.props,
                                  feature_extractor.get_labels_size())
        ])

        init = tf.global_variables_initializer()
        self._session.run(init)

        samples = feature_extractor.extract_features_from_docs(
            precomputed_docs)
        saver = tf.train.Saver(save_relative_paths=True)

        if self.props.get("unify_similar_entities_types", False):
            processor = unify_types_of_similar_entities
        else:
            processor = None

        classifier = _Classifier(graph, feature_extractor, feature_computer,
                                 self._session, saver, processor)

        batcher_factory = get_batcher_from_props(
            samples, self.props["batcher"],
            feature_extractor.get_padding_value_and_rank, True, True)

        train_meta = TaskTrainMeta(
            "NER", graph, batcher_factory, {
                "learning_rate":
                get_decayed_lr(self.props["learning_rate"],
                               self.props.get("lr_decay", 0)),
                "dropout_rate":
                get_const_controller(self.props.get("dropout", 1.0))
            }, classifier, early_stopping_callback)

        train_for_samples(self._session, self.props["epoch"], [train_meta])
Example #7
    def extract_features_from_docs_iterator(self,
                                            docs,
                                            use_filter=False,
                                            drop_negative=0):
        def apply():
            for doc in docs:
                doc_samples, _ = self.extract_features_from_doc(
                    doc, use_filter, include_labels=True)
                for sample in doc_samples:
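                    # samples with a real relation label are always kept;
                    # unlabeled (negative) samples are dropped with probability drop_negative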
                    if (self.rel_reversed_converter[sample['labels']] is not None
                            or random.random() >= drop_negative):
                        yield sample

        return FuncIterable(apply)
Example #8
def _get_bucketed_batches(
        samples: Iterable, batch_size: int,
        get_padding_value: Callable[[str], Any], get_bucket_for_sample: Callable, *,
        print_progress: bool = False, need_shuffling: bool = False, buffer_size: int = _DEFAULT_BATCHER_BUFFER_SIZE):

    buffers = FuncIterable(lambda: BlockIterator(iter(samples), buffer_size))

    for buffer in buffers:
        if print_progress:
            logger.info("{} samples added to buffer".format(len(buffer)))

        if need_shuffling:
            np.random.shuffle(buffer)

        buffer_buckets = defaultdict(list)
        for s in buffer:
            buffer_buckets[get_bucket_for_sample(s)].append(s)

        if print_progress:
            logger.info("{} buckets in buffer".format(len(buffer_buckets)))

        # sorting is applied to ensure reproducibility of results
        bucketed_samples = list(buffer_buckets[key] for key in sorted(buffer_buckets.keys()))
        buffer_batches = []

        for bucket in bucketed_samples:
            cur_batch_size = 0
            batch = defaultdict(list)

            for sample in bucket:
                _add_sample_to_batch(sample, batch)
                cur_batch_size += 1

                if cur_batch_size == batch_size:
                    buffer_batches.append(_pad_batch(batch, get_padding_value))
                    batch = defaultdict(list)
                    cur_batch_size = 0

            if cur_batch_size > 0:
                buffer_batches.append(_pad_batch(batch, get_padding_value))

        if need_shuffling:
            np.random.shuffle(buffer_batches)

        for batch in buffer_batches:
            yield batch
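
BlockIterator (also used in Example #11) is another helper that is not defined in this listing; the buffering logic above only relies on it yielding consecutive fixed-size chunks of samples. A minimal sketch under that assumption:

from itertools import islice


class BlockIterator:
    # assumed behaviour: yield consecutive lists of at most `block_size`
    # items from the wrapped iterator until it is exhausted
    def __init__(self, iterator, block_size):
        self._iterator = iterator
        self._block_size = block_size

    def __iter__(self):
        return self

    def __next__(self):
        block = list(islice(self._iterator, self._block_size))
        if not block:
            raise StopIteration
        return block
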
Example #9
    def _init_metas(self, rel_ext_docs, unlabeled_docs):
        char_padding_size = get_char_padding_size(self.props)
        concatted_docs = FuncIterable(lambda: chain(rel_ext_docs, [] if unlabeled_docs is None else unlabeled_docs))

        shared_fe, shared_meta = generate_spans_common_feature_extractor(
            concatted_docs, self.props['shared'], char_padding_size)

        self._log_meta(shared_meta, "shared features")

        rel_ext_task_meta = self._init_rel_ext_task_meta(rel_ext_docs, shared_fe)
        parser_task_meta = self._init_parser_task_meta(unlabeled_docs, shared_fe)
        sdp_task_meta = self._init_sdp_task_meta(unlabeled_docs, shared_fe)

        auxiliary_metas = list(filter(lambda x: x is not None, [parser_task_meta, sdp_task_meta]))
        if not auxiliary_metas and unlabeled_docs is not None:
            warn("Unlabeled docs provided without auxiliary configs")

        return shared_meta, rel_ext_task_meta, auxiliary_metas
Example #10
def load_docs(args):
    class HoldoutDataset:
        def __init__(self, train_docs, dev_docs):
            self.__train_docs = train_docs
            self.__dev_docs = dev_docs

        def get_splits(self):
            yield (self.__train_docs, self.__dev_docs)

        @property
        def splits_number(self):
            return 1

        def transformed_by(self, transformer):
            return HoldoutDataset(
                [transformer.transform(doc) for doc in self.__train_docs],
                [transformer.transform(doc) for doc in self.__dev_docs])

    class CVDataset:
        def __init__(self, docs):
            self.__docs = docs

        def get_splits(self):
            for i in range(args.folds):
                yield get_fold(self.__docs, args.folds, i)

        @property
        def splits_number(self):
            return args.folds

        def transformed_by(self, transformer):
            return CVDataset(
                [transformer.transform(doc) for doc in self.__docs])

    if args.strategy == 'holdout':
        dataset = HoldoutDataset(load(args.train_dir), load(args.dev_dir))
    elif args.strategy == 'cross_validation':
        dataset = CVDataset(load(args.traindev_dir))
    else:
        raise Exception(
            'Only holdout and cross_validation strategies are supported')

    unlabeled = (FuncIterable(lambda: read_conllu_file(args.unlabeled))
                 if args.unlabeled is not None else None)
    return dataset, unlabeled
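
Both dataset wrappers expose the same interface (get_splits, splits_number, transformed_by), so callers can stay agnostic of the evaluation strategy. The consumer below is purely illustrative: run_experiment and _IdentityTransformer are hypothetical names, not part of the code above.

class _IdentityTransformer:
    # stand-in for a real document transformer
    def transform(self, doc):
        return doc


def run_experiment(dataset, train_and_evaluate):
    # works for HoldoutDataset (one split) and CVDataset (args.folds splits) alike
    dataset = dataset.transformed_by(_IdentityTransformer())
    for i, (train_docs, dev_docs) in enumerate(dataset.get_splits(), start=1):
        print("split {}/{}".format(i, dataset.splits_number))
        train_and_evaluate(train_docs, dev_docs)
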
Example #11
def _get_segment_batches(
        samples: Iterable, batch_key: str, size: int, get_padding_value: Callable[[str], Any], *,
        print_progress: bool = False, need_shuffling: bool = False, buffer_size: int = _DEFAULT_BATCHER_BUFFER_SIZE):
    try:
        first_sample = next(iter(samples))
    except StopIteration:
        return []

    common_features = set(key for key in first_sample if key != batch_key)
    buffers = FuncIterable(lambda: BlockIterator(iter(samples), buffer_size))

    for buffer in buffers:
        if print_progress:
            logger.info("{} samples added to buffer".format(len(buffer)))

        if need_shuffling:
            np.random.shuffle(buffer)

        batches = []

        for sample in buffer:
            if len(sample[batch_key]) == 0:
                continue

            # build a single-element batch for the token features shared by all examples in the segment
            common_batch = next(_get_batches(
                [{key: value for key, value in sample.items() if key in common_features}], 1, get_padding_value))

            for batch in _get_batches(
                    sample[batch_key], size, get_padding_value, print_progress=False, need_shuffling=need_shuffling):

                batch.update(common_batch)
                batches.append(batch)

        if need_shuffling:
            np.random.shuffle(batches)

        for batch in batches:
            yield batch
Example #12
    def extract_features_from_docs(self, docs) -> Iterable:
        def is_positive(sample):
            return self.get_type(sample["labels"]) is not None

        samples = super().extract_features_from_docs(docs)
        negatives, positives = [], []

        for sample in samples:
            if is_positive(sample):
                positives.append(sample)
            else:
                negatives.append(sample)

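        # keep at most negative_ratio negatives per positive, capped by the
        # number of negatives actually available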
        samples_to_pick = len(positives) * self._negative_ratio
        if samples_to_pick > len(negatives):
            samples_to_pick = len(negatives)
        else:
            samples_to_pick = int(samples_to_pick)

        logger.info(
            f"Extracted {len(positives)} positive samples, {samples_to_pick} negative samples"
        )
        return FuncIterable(lambda: chain(
            positives, random.sample(negatives, samples_to_pick)))
Example #13
    def _get_precomputed_docs(docs, feature_computer):
        return FuncIterable(lambda: map(feature_computer.create_features_for_doc, docs))
Example #14
    def extract_features_from_docs(self, docs) -> Iterable:
        return FuncIterable(
            lambda: chain.from_iterable(
                map(lambda d: self.extract_features_from_doc(self.augmentor.transform(d), True), docs)))
Example #15
    def extract_features_from_docs(self, docs) -> Iterable:
        return FuncIterable(lambda: chain.from_iterable(
            map(
                lambda doc: self.extract_features_from_doc(
                    doc, include_labels=True)[0], docs)))
Example #16
    def extract_features_from_docs(self, docs) -> Iterable:
        return FuncIterable(lambda: chain.from_iterable(
            map(self.extract_features_from_doc, docs)))
Example #17
    def train(self,
              docs: Iterable[Document],
              unlabeled_docs: Iterable[Document] = None,
              early_stopping_callback: Callable[[NETClassifier, int],
                                                bool] = lambda c, e: False):

        feature_computer = SyntacticFeatureComputer(
            self.props.get('morph_feats_list', DEFAULT_FEATS_LIST))

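        # pick the bucketing key for batching: either singleton vs. multi-entity
        # coreference chains, or sequence length in units of bucket_length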
        if self.props.get("unify_similar_entities_types", False):
            grouper = chain_similar_entities
            get_bucket_for_sample = lambda s: int(s["chain_len"] == 1)
        else:
            grouper = chain_individual_entities
            get_bucket_for_sample = lambda s: s["seq_len"][0] // self.props[
                "batcher"]["bucket_length"]

        grouper_collapser = _GrouperCollapser(
            CoreferenceChainGrouper(grouper),
            EntitiesCollapser(self.props.get("types_to_collapse", set()),
                              collapse_with_ne=True))

        docs_groups = FuncIterable(lambda: map(
            itemgetter(0, 1),
            map(grouper_collapser.prepare_doc_with_collapsing, docs)))
        collapsed_docs = FuncIterable(lambda: map(itemgetter(0), docs_groups))
        precomputed_docs = FuncIterable(lambda: map(
            feature_computer.create_features_for_doc, collapsed_docs))
        groups = FuncIterable(lambda: map(itemgetter(1), docs_groups))

        char_padding_size = get_char_padding_size(self.props)
        feature_extractor, metas, token_meta = generate_feature_extractor(
            precomputed_docs, self.props, char_padding_size)
        feature_extractor = GroupingFeatureExtractor(
            feature_extractor, group_level_features=["labels_mask"])

        # reuse because this task is essentially unary relation extraction
        task_graph_meta = NETTaskGraphMeta("NET", self.props, metas,
                                           feature_extractor.get_labels_size(),
                                           True)
        # we have only one graph
        graph, = build_graphs_with_shared_encoder(
            self.props,
            token_meta, [build_task_graph_meta(task_graph_meta)],
            rank=3)

        init = tf.global_variables_initializer()
        self._session.run(init)

        samples = list(
            feature_extractor.extract_features_from_docs(
                precomputed_docs, groups))
        saver = tf.train.Saver(save_relative_paths=True)

        classifier = _Classifier(graph, feature_extractor, feature_computer,
                                 self._session, saver, grouper_collapser)

        batcher_factory = get_batcher_from_props(
            samples, self.props["batcher"],
            feature_extractor.get_padding_value_and_rank, True, True,
            get_bucket_for_sample)

        train_meta = TaskTrainMeta(
            "NET", graph, batcher_factory, {
                "learning_rate":
                get_decayed_lr(self.props["learning_rate"],
                               self.props.get("lr_decay", 0)),
                "dropout_rate":
                get_const_controller(self.props.get("dropout", 1.0))
            }, classifier, early_stopping_callback)

        train_for_samples(self._session, self.props["epoch"], [train_meta])