def train(self, docs: List[Document], unlabeled_docs: Iterable[Document] = None,
          early_stopping_callback: Callable[[RelExtClassifier, int], bool] = lambda c, e: False):
    collapsed_docs = FuncIterable(lambda: map(self._collapser.transform, docs))
    prec_docs = self._get_precomputed_docs(collapsed_docs, self._feature_computer)

    if unlabeled_docs is not None:
        unlabeled_docs = self._get_precomputed_docs(unlabeled_docs, self._syntactic_fc)

    shared_meta, rel_ext_meta, auxiliary_metas = self._init_metas(prec_docs, unlabeled_docs)

    print("Extracting features")
    rel_ext_samples = rel_ext_meta.feature_extractor.extract_features_from_docs(prec_docs)
    auxiliary_samples = [
        list(task_meta.feature_extractor.extract_features_from_docs(unlabeled_docs))
        for task_meta in auxiliary_metas
    ]

    self._build_and_train(shared_meta, rel_ext_meta, rel_ext_samples,
                          auxiliary_metas, auxiliary_samples, early_stopping_callback)
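# FuncIterable is used throughout these pipelines to keep document
# transformations lazy yet re-iterable: every pass over the data re-invokes a
# factory, so the same mapped pipeline can be consumed once per epoch without
# materialising it in memory. A minimal sketch of such a wrapper follows; it
# only approximates the apparent contract of the project's FuncIterable and is
# not its actual implementation.
class LazyReiterable:
    def __init__(self, factory):
        self._factory = factory  # zero-argument callable returning a fresh iterable

    def __iter__(self):
        # Rebuild the underlying iterator on every iteration.
        return iter(self._factory())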
def generate_feature_extractor(docs: Iterable[Document], props: dict, char_padding_size: int = 0):
    types_to_unquote = props.get("types_to_unquote", [])
    unquote_prob = props.get("prob_to_unquote", 0.0)

    if types_to_unquote and unquote_prob:
        # concatenate augmented docs with the original ones so that all possible features are seen by the FE factories
        augmentor = EntitiesUnquoteAugmentor(1.0, types_to_unquote)
        prev_docs = docs
        docs = FuncIterable(lambda: chain(prev_docs, map(augmentor.transform, prev_docs)))

    token_feature_extractor, token_features_meta = generate_token_feature_extractor(docs, props, char_padding_size)
    ne_feature_extractor, ne_meta = generate_ne_feature_extractor(docs, props)
    token_features_meta.basic_meta += ne_meta

    ent_types = collect_entities_types(docs)
    labelling_strategy = get_labelling_strategy(props.get("labelling_strategy", "BIO"))
    labels_converter = create_categorical_converter(
        labelling_strategy.get_possible_categories(ent_types), zero_padding=False)

    prob_augmentor = EntitiesUnquoteAugmentor(unquote_prob, types_to_unquote)

    feature_extractor = NERFeatureExtractor(
        token_feature_extractor, ne_feature_extractor, labelling_strategy, labels_converter, prob_augmentor)

    return feature_extractor, token_features_meta
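# A hypothetical props configuration for the extractor above. The keys are the
# ones this function actually reads; the values are purely illustrative.
props_example = {
    "types_to_unquote": ["WORK_OF_ART"],   # entity types to strip quotes from during augmentation
    "prob_to_unquote": 0.3,                # probability of applying the augmentation at train time
    "labelling_strategy": "BIO",           # token labelling scheme
}
# feature_extractor, token_features_meta = generate_feature_extractor(docs, props_example, char_padding_size=1)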
def __enter__(self):
    with open(self.path, 'wb') as f:
        for item in self.iterator:
            msgpack.dump(item, f)
    return FuncIterable(lambda: _mp_iterate(self.path))
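# The reader side is not shown here. A plausible counterpart to the writer
# above would stream the msgpack objects back lazily; this is only an
# assumption about what _mp_iterate does, not its actual implementation.
import msgpack

def _mp_iterate_sketch(path):
    with open(path, 'rb') as f:
        # Unpacker consumes the file incrementally, yielding one object per
        # msgpack.dump call made by __enter__.
        for obj in msgpack.Unpacker(f, raw=False):
            yield obj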
def extract_features_from_docs(self, docs) -> Iterable:
    def extract_samples(doc):
        ent_samples = self.extract_features_from_doc(doc, include_labels=True)
        return [sample for _, sample in ent_samples if isinstance(sample, dict)]

    return FuncIterable(lambda: chain.from_iterable(map(extract_samples, docs)))
def extract_features_from_docs(self, docs, docs_groups) -> Iterable:
    def extract_samples(doc, groups):
        group_samples = self.extract_features_from_doc(doc, groups, include_labels=True)
        return [sample for _, sample in group_samples if isinstance(sample, dict)]

    return FuncIterable(lambda: chain.from_iterable(starmap(extract_samples, zip(docs, docs_groups))))
def train(self, docs: Iterable[Document], unlabeled_docs: Iterable[Document] = None,
          early_stopping_callback: Callable[[NERClassifier, int], bool] = lambda c, e: False):
    feature_computer = SyntacticFeatureComputer(self.props.get('morph_feats_list', DEFAULT_FEATS_LIST))
    precomputed_docs = FuncIterable(lambda: map(feature_computer.create_features_for_doc, docs))

    char_padding_size = get_char_padding_size(self.props)
    feature_extractor, meta = generate_feature_extractor(precomputed_docs, self.props, char_padding_size)

    # we have only one graph
    graph, = build_graphs_with_shared_encoder(
        self.props, meta,
        [build_task_graph_meta(self.props, feature_extractor.get_labels_size())])

    init = tf.global_variables_initializer()
    self._session.run(init)

    samples = feature_extractor.extract_features_from_docs(precomputed_docs)
    saver = tf.train.Saver(save_relative_paths=True)

    if self.props.get("unify_similar_entities_types", False):
        processor = unify_types_of_similar_entities
    else:
        processor = None

    classifier = _Classifier(graph, feature_extractor, feature_computer, self._session, saver, processor)

    batcher_factory = get_batcher_from_props(
        samples, self.props["batcher"], feature_extractor.get_padding_value_and_rank, True, True)

    train_meta = TaskTrainMeta(
        "NER", graph, batcher_factory,
        {
            "learning_rate": get_decayed_lr(self.props["learning_rate"], self.props.get("lr_decay", 0)),
            "dropout_rate": get_const_controller(self.props.get("dropout", 1.0))
        },
        classifier, early_stopping_callback)

    train_for_samples(self._session, self.props["epoch"], [train_meta])
def extract_features_from_docs_iterator(self, docs, use_filter=False, drop_negative=0):
    def apply():
        for doc in docs:
            doc_samples, _ = self.extract_features_from_doc(doc, use_filter, include_labels=True)
            for sample in doc_samples:
                if self.rel_reversed_converter[sample['labels']] is not None \
                        or random.random() >= drop_negative:
                    yield sample

    return FuncIterable(apply)
def _get_bucketed_batches(
        samples: Iterable, batch_size: int, get_padding_value: Callable[[str], Any],
        get_bucket_for_sample: Callable, *,
        print_progress: bool = False, need_shuffling: bool = False,
        buffer_size: int = _DEFAULT_BATCHER_BUFFER_SIZE):

    buffers = FuncIterable(lambda: BlockIterator(iter(samples), buffer_size))

    for buffer in buffers:
        if print_progress:
            logger.info("{} samples added to buffer".format(len(buffer)))
        if need_shuffling:
            np.random.shuffle(buffer)

        buffer_buckets = defaultdict(list)
        for s in buffer:
            buffer_buckets[get_bucket_for_sample(s)].append(s)

        if print_progress:
            logger.info("{} buckets in buffer".format(len(buffer_buckets)))

        # sorting is applied to ensure reproducibility of results
        bucketed_samples = list(buffer_buckets[key] for key in sorted(buffer_buckets.keys()))

        buffer_batches = []
        for bucket in bucketed_samples:
            cur_batch_size = 0
            batch = defaultdict(list)

            for sample in bucket:
                _add_sample_to_batch(sample, batch)
                cur_batch_size += 1

                if cur_batch_size == batch_size:
                    buffer_batches.append(_pad_batch(batch, get_padding_value))
                    batch = defaultdict(list)
                    cur_batch_size = 0

            if cur_batch_size > 0:
                buffer_batches.append(_pad_batch(batch, get_padding_value))

        if need_shuffling:
            np.random.shuffle(buffer_batches)

        for batch in buffer_batches:
            yield batch
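# Hypothetical usage of the bucketed batcher above: samples whose "tokens"
# lengths fall into the same length bucket are batched together, and every
# feature is padded with zeros. The sample schema, bucketing rule and padding
# value are assumptions made for this example only.
example_samples = [{"tokens": list(range(n)), "labels": [0] * n} for n in (3, 5, 12, 14, 25)]

batches = _get_bucketed_batches(
    example_samples,
    batch_size=32,
    get_padding_value=lambda feature_name: 0,
    get_bucket_for_sample=lambda sample: len(sample["tokens"]) // 10,
    need_shuffling=True)

for batch in batches:
    pass  # feed the padded batch to the training loop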
def _init_metas(self, rel_ext_docs, unlabeled_docs):
    char_padding_size = get_char_padding_size(self.props)
    concatted_docs = FuncIterable(
        lambda: chain(rel_ext_docs, [] if unlabeled_docs is None else unlabeled_docs))

    shared_fe, shared_meta = generate_spans_common_feature_extractor(
        concatted_docs, self.props['shared'], char_padding_size)
    self._log_meta(shared_meta, "shared features")

    rel_ext_task_meta = self._init_rel_ext_task_meta(rel_ext_docs, shared_fe)
    parser_task_meta = self._init_parser_task_meta(unlabeled_docs, shared_fe)
    sdp_task_meta = self._init_sdp_task_meta(unlabeled_docs, shared_fe)

    auxiliary_metas = list(filter(lambda x: x is not None, [parser_task_meta, sdp_task_meta]))
    if not auxiliary_metas and unlabeled_docs is not None:
        warn("Unlabeled docs provided without auxiliary configs")

    return shared_meta, rel_ext_task_meta, auxiliary_metas
def load_docs(args):
    class HoldoutDataset:
        def __init__(self, train_docs, dev_docs):
            self.__train_docs = train_docs
            self.__dev_docs = dev_docs

        def get_splits(self):
            yield (self.__train_docs, self.__dev_docs)

        @property
        def splits_number(self):
            return 1

        def transformed_by(self, transformer):
            return HoldoutDataset(
                [transformer.transform(doc) for doc in self.__train_docs],
                [transformer.transform(doc) for doc in self.__dev_docs])

    class CVDataset:
        def __init__(self, docs):
            self.__docs = docs

        def get_splits(self):
            for i in range(args.folds):
                yield get_fold(self.__docs, args.folds, i)

        @property
        def splits_number(self):
            return args.folds

        def transformed_by(self, transformer):
            return CVDataset([transformer.transform(doc) for doc in self.__docs])

    if args.strategy == 'holdout':
        dataset = HoldoutDataset(load(args.train_dir), load(args.dev_dir))
    elif args.strategy == 'cross_validation':
        dataset = CVDataset(load(args.traindev_dir))
    else:
        raise Exception('Only holdout and cross_validation strategies are supported')

    unlabeled = FuncIterable(lambda: read_conllu_file(args.unlabeled)) if args.unlabeled is not None else None
    return dataset, unlabeled
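# Hypothetical invocation of load_docs, assuming an argparse-style namespace
# with the fields referenced above and that get_fold yields (train, dev) pairs
# like the holdout split does. The paths are placeholders.
from types import SimpleNamespace

example_args = SimpleNamespace(
    strategy='cross_validation', folds=5, traindev_dir='data/traindev',
    train_dir=None, dev_dir=None, unlabeled=None)

dataset, unlabeled_docs = load_docs(example_args)
for fold_train_docs, fold_dev_docs in dataset.get_splits():
    pass  # train on fold_train_docs, evaluate on fold_dev_docs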
def _get_segment_batches(
        samples: Iterable, batch_key: str, size: int, get_padding_value: Callable[[str], Any], *,
        print_progress: bool = False, need_shuffling: bool = False,
        buffer_size: int = _DEFAULT_BATCHER_BUFFER_SIZE):

    try:
        first_sample = next(iter(samples))
    except StopIteration:
        return []

    common_features = set(key for key in first_sample if key != batch_key)
    buffers = FuncIterable(lambda: BlockIterator(iter(samples), buffer_size))

    for buffer in buffers:
        if print_progress:
            logger.info("{} samples added to buffer".format(len(buffer)))
        if need_shuffling:
            np.random.shuffle(buffer)

        batches = []
        for sample in buffer:
            if len(sample[batch_key]) == 0:
                continue

            # build a one-element batch with the token features shared by all examples in the segment
            common_batch = next(_get_batches(
                [{key: value for key, value in sample.items() if key in common_features}],
                1, get_padding_value))

            for batch in _get_batches(sample[batch_key], size, get_padding_value,
                                      print_progress=False, need_shuffling=need_shuffling):
                batch.update(common_batch)
                batches.append(batch)

        if need_shuffling:
            np.random.shuffle(batches)

        for batch in batches:
            yield batch
def extract_features_from_docs(self, docs) -> Iterable:
    def is_positive(sample):
        return self.get_type(sample["labels"]) is not None

    samples = super().extract_features_from_docs(docs)

    negatives, positives = [], []
    for sample in samples:
        if is_positive(sample):
            positives.append(sample)
        else:
            negatives.append(sample)

    samples_to_pick = len(positives) * self._negative_ratio
    if samples_to_pick > len(negatives):
        samples_to_pick = len(negatives)
    else:
        samples_to_pick = int(samples_to_pick)

    logger.info(
        f"Extracted {len(positives)} positive samples; keeping {samples_to_pick} of {len(negatives)} negative samples")

    return FuncIterable(lambda: chain(positives, random.sample(negatives, samples_to_pick)))
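# Worked example of the negative downsampling above, mirroring the clamping
# logic with illustrative counts (not taken from any real run):
positives_count, negatives_count, ratio = 200, 1000, 3
samples_to_pick = positives_count * ratio          # 600 candidates
if samples_to_pick > negatives_count:
    samples_to_pick = negatives_count
else:
    samples_to_pick = int(samples_to_pick)
assert samples_to_pick == 600                      # with only 450 negatives available, it would be 450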
def _get_precomputed_docs(docs, feature_computer):
    return FuncIterable(lambda: map(feature_computer.create_features_for_doc, docs))
def extract_features_from_docs(self, docs) -> Iterable:
    return FuncIterable(lambda: chain.from_iterable(
        map(lambda d: self.extract_features_from_doc(self.augmentor.transform(d), True), docs)))
def extract_features_from_docs(self, docs) -> Iterable:
    return FuncIterable(lambda: chain.from_iterable(
        map(lambda doc: self.extract_features_from_doc(doc, include_labels=True)[0], docs)))
def extract_features_from_docs(self, docs) -> Iterable:
    return FuncIterable(lambda: chain.from_iterable(map(self.extract_features_from_doc, docs)))
def train(self, docs: Iterable[Document], unlabeled_docs: Iterable[Document] = None,
          early_stopping_callback: Callable[[NETClassifier, int], bool] = lambda c, e: False):
    feature_computer = SyntacticFeatureComputer(self.props.get('morph_feats_list', DEFAULT_FEATS_LIST))

    if self.props.get("unify_similar_entities_types", False):
        grouper = chain_similar_entities
        get_bucket_for_sample = lambda s: int(s["chain_len"] == 1)
    else:
        grouper = chain_individual_entities
        get_bucket_for_sample = lambda s: s["seq_len"][0] // self.props["batcher"]["bucket_length"]

    grouper_collapser = _GrouperCollapser(
        CoreferenceChainGrouper(grouper),
        EntitiesCollapser(self.props.get("types_to_collapse", set()), collapse_with_ne=True))

    docs_groups = FuncIterable(lambda: map(
        itemgetter(0, 1), map(grouper_collapser.prepare_doc_with_collapsing, docs)))
    collapsed_docs = FuncIterable(lambda: map(itemgetter(0), docs_groups))
    precomputed_docs = FuncIterable(lambda: map(feature_computer.create_features_for_doc, collapsed_docs))
    groups = FuncIterable(lambda: map(itemgetter(1), docs_groups))

    char_padding_size = get_char_padding_size(self.props)
    feature_extractor, metas, token_meta = generate_feature_extractor(precomputed_docs, self.props, char_padding_size)
    feature_extractor = GroupingFeatureExtractor(feature_extractor, group_level_features=["labels_mask"])

    # reuse the rel-ext graph meta: this task is essentially a unary rel-ext
    task_graph_meta = NETTaskGraphMeta("NET", self.props, metas, feature_extractor.get_labels_size(), True)
    # we have only one graph
    graph, = build_graphs_with_shared_encoder(
        self.props, token_meta, [build_task_graph_meta(task_graph_meta)], rank=3)

    init = tf.global_variables_initializer()
    self._session.run(init)

    samples = list(feature_extractor.extract_features_from_docs(precomputed_docs, groups))
    saver = tf.train.Saver(save_relative_paths=True)

    classifier = _Classifier(graph, feature_extractor, feature_computer, self._session, saver, grouper_collapser)

    batcher_factory = get_batcher_from_props(
        samples, self.props["batcher"], feature_extractor.get_padding_value_and_rank,
        True, True, get_bucket_for_sample)

    train_meta = TaskTrainMeta(
        "NET", graph, batcher_factory,
        {
            "learning_rate": get_decayed_lr(self.props["learning_rate"], self.props.get("lr_decay", 0)),
            "dropout_rate": get_const_controller(self.props.get("dropout", 1.0))
        },
        classifier, early_stopping_callback)

    train_for_samples(self._session, self.props["epoch"], [train_meta])