def predict_segment(model: BiLSTM, input_mapper, text):
    """Runs the BiLSTM over one segment of text, yielding (start, end)
    character spans for the sentences it detects."""
    if len(text) == 0 or text.isspace():
        return []
    with Processor.started_stopwatch('input_mapping'):
        tokens, char_ids, word_ids = input_mapper.transform_text(text)
    if len(char_ids) == 0:
        return []
    # Chunk the input into windows of at most _max_sequence_length tokens so
    # long segments never exceed the model's maximum input size.
    all_ids = []
    i = 0
    while i < len(char_ids[0]):
        lim = min(len(char_ids[0]), i + _max_sequence_length)
        if lim - i > 0:
            all_ids.append((char_ids[0:1, i:lim], word_ids[0:1, i:lim]))
        i += _max_sequence_length
    predictions = []
    for char_ids, word_ids in all_ids:
        with Processor.started_stopwatch('model_predict'):
            local_predictions = model.predict(char_ids, word_ids)
        predictions.extend(local_predictions[0])
    # A prediction of 1 marks a token that begins a new sentence: emit the
    # span of the previous sentence, extended over any trailing punctuation.
    start_index = None
    prev_end = None
    for (start, end), prediction in zip(tokens, predictions):
        if prediction == 1:
            if start_index is not None:
                end_punct = _punct.match(text, prev_end)
                if end_punct is not None:
                    prev_end = end_punct.end()
                yield start_index, prev_end
            start_index = start
        prev_end = end
    if start_index is not None and prev_end is not None:
        yield start_index, prev_end
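# The windowing loop above is plain fixed-size chunking along the token axis;
# a standalone sketch of the same pattern, with 4 standing in for
# _max_sequence_length:
ids = list(range(10))
window = 4
chunks = [ids[i:i + window] for i in range(0, len(ids), window)]
assert chunks == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]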
def file_to_event(self, f: Union[Path, str, io.IOBase], *,
                  client: Optional[EventsClient] = None) -> Event:
    import pickle
    with Processor.started_stopwatch('io'):
        try:
            d = pickle.load(f)
        except TypeError:
            # f was a path rather than an open file object.
            with Path(f).open('rb') as f:
                d = pickle.load(f)
    with Processor.started_stopwatch('transform'):
        return dict_to_event(d, client=client)
def event_to_file(self, event: Event, f: Union[Path, str, io.IOBase], *,
                  include_label_text: bool = False):
    import pickle
    with Processor.started_stopwatch('transform'):
        d = event_to_dict(event, include_label_text=include_label_text)
    with Processor.started_stopwatch('io'):
        try:
            pickle.dump(d, f)
        except TypeError:
            with Path(f).open('wb') as f:
                pickle.dump(d, f)
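# Both pickle methods above use the same duck-typing fallback: try the
# argument as an open file object, and fall back to treating it as a path
# when pickle raises TypeError. A self-contained round trip of that pattern,
# using a plain dict in place of the serialized event:
import pickle
from pathlib import Path

d = {'event_id': 'example', 'documents': {}}
path = Path('event.pickle')
try:
    pickle.dump(d, path)  # TypeError: file must have a 'write' attribute
except TypeError:
    with path.open('wb') as f:
        pickle.dump(d, f)
assert pickle.loads(path.read_bytes()) == d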
def file_to_event(self, f: Union[Path, str, io.IOBase],
                  client: Optional[EventsClient] = None) -> Event:
    import json
    with Processor.started_stopwatch('io'):
        try:
            d = json.load(f)
        except AttributeError:
            if isinstance(f, str):
                f = Path(f)
            with f.open('r') as f:
                d = json.load(f)
    with Processor.started_stopwatch('transform'):
        return dict_to_event(d, client=client)
def event_to_file(self, event: Event, f: Union[Path, str, io.IOBase], *,
                  include_label_text: bool = False):
    import json
    with Processor.started_stopwatch('transform'):
        d = event_to_dict(event, include_label_text=include_label_text)
    with Processor.started_stopwatch('io'):
        try:
            json.dump(d, f)
        except AttributeError:
            # f was a path; make sure the parent directory exists, then open.
            f = Path(f)
            f.parent.mkdir(parents=True, exist_ok=True)
            with f.open('w') as f:
                json.dump(d, f)
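# Unlike the pickle serializer, the JSON methods catch AttributeError:
# json.load and json.dump duck-type directly against f.read/f.write, so a
# path argument fails attribute lookup rather than an explicit type check.
# For example:
import json

try:
    json.load('event.json')
except AttributeError as e:
    print(e)  # 'str' object has no attribute 'read'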
def file_to_event(self, f: Union[Path, str, io.IOBase], *,
                  client: Optional[EventsClient] = None) -> Event:
    import yaml
    try:
        from yaml import CLoader as Loader
    except ImportError:
        from yaml import Loader
    with Processor.started_stopwatch('io'):
        if isinstance(f, io.IOBase):
            d = yaml.load(f, Loader=Loader)
        else:
            with Path(f).open() as f:
                d = yaml.load(f, Loader=Loader)
    with Processor.started_stopwatch('transform'):
        return dict_to_event(d, client=client)
def event_to_file(self, event: Event, f: Union[Path, str, io.IOBase], *,
                  include_label_text: bool = False):
    import yaml
    try:
        from yaml import CDumper as Dumper
    except ImportError:
        from yaml import Dumper
    with Processor.started_stopwatch('transform'):
        d = event_to_dict(event, include_label_text=include_label_text)
    with Processor.started_stopwatch('io'):
        if isinstance(f, io.IOBase):
            yaml.dump(d, f, Dumper=Dumper)
        else:
            f = Path(f)
            with f.open('w') as f:
                yaml.dump(d, f, Dumper=Dumper)
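# The optional import in both YAML methods is the standard PyYAML idiom:
# prefer the libyaml-backed CLoader/CDumper when PyYAML was built with the C
# extension, and fall back to the pure-Python classes otherwise. In isolation:
import yaml
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

text = yaml.dump({'a': 1, 'b': [2, 3]}, Dumper=Dumper)
assert yaml.load(text, Loader=Loader) == {'a': 1, 'b': [2, 3]}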
def predict_text(model: BiLSTM, input_mapper, text):
    """Splits text on the _split pattern and runs sentence prediction on
    each segment, yielding (start, end) spans into the original text."""
    prev = 0
    with Processor.started_stopwatch('segment_splitting') as split_timer:
        for match in _split.finditer(text):
            # Pause the splitting timer while the model runs so the two
            # stopwatches measure disjoint work.
            split_timer.stop()
            start = match.start()
            local_text = text[prev:start]
            for ss, se in predict_segment(model, input_mapper, local_text):
                # Segment spans are local; shift them back to text offsets.
                yield prev + ss, prev + se
            prev = match.end()
            split_timer.start()
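# predict_text yields spans relative to the full text while predict_segment
# yields spans relative to each segment, hence the `prev +` offsets above.
# A minimal sketch of that bookkeeping, with a simple blank-line pattern
# standing in for the real _split regex (which is defined elsewhere):
import re

_split_demo = re.compile(r'\n\n+')
text = 'First paragraph.\n\nSecond paragraph.'
prev = 0
for match in _split_demo.finditer(text):
    segment = text[prev:match.start()]
    print(prev, repr(segment))  # local spans would be shifted by `prev`
    prev = match.end()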
def call_process(self, event_id, params):
    self.processed += 1
    # Merge per-call params over the processor's default params.
    p = dict(self.params or {})
    if params is not None:
        p.update(params)
    with Processor.enter_context() as c, \
            Event(event_id=event_id, client=self.client) as event:
        try:
            with Processor.started_stopwatch('process_method'):
                result = self.processor.process(event, p)
            return result, c.times, event.created_indices
        except Exception as e:
            self.failure_count += 1
            logger.error('Processor "%s" failed while processing event with id: %s',
                         self.component_id, event_id)
            logger.error(e)
            raise e
def call_process(self, event_id, params):
    self.processed += 1
    p = dict(self.params or {})
    if params is not None:
        p.update(params)
    with EventProcessor.enter_context() as context:
        try:
            request = processing_pb2.ProcessRequest(
                processor_id=self._processor_id,
                event_id=event_id)
            _structs.copy_dict_to_struct(p, request.params, [p])
            with Processor.started_stopwatch('remote_call'):
                response = self._stub.Process(request)
            r = {}
            _structs.copy_struct_to_dict(response.result, r)
            # Fold the remote processor's reported timings into the local
            # timing context.
            timing_info = response.timing_info
            for k, v in timing_info.items():
                context.add_time(k, v.ToTimedelta())
            # Group created index names by the document they belong to.
            created_indices = {}
            for created_index in response.created_indices:
                try:
                    doc_created_indices = created_indices[created_index.document_name]
                except KeyError:
                    doc_created_indices = []
                    created_indices[created_index.document_name] = doc_created_indices
                doc_created_indices.append(created_index.index_name)
            return r, context.times, created_indices
        except Exception as e:
            self.failure_count += 1
            logger.error('Processor "%s" failed while processing event with id: %s',
                         self.component_id, event_id)
            logger.error(e)
            raise e
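# The KeyError-driven grouping of created indices above is equivalent to
# dict.setdefault, shown here on sample data:
created_indices = {}
for doc_name, index_name in [('plaintext', 'sentences'),
                             ('plaintext', 'pos_tags')]:
    created_indices.setdefault(doc_name, []).append(index_name)
assert created_indices == {'plaintext': ['sentences', 'pos_tags']}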
def test_stopwatch_no_fail_outside_context():
    # started_stopwatch should be usable even when no processor context
    # is active.
    ran = False
    with Processor.started_stopwatch('foo'):
        ran = True
    assert ran