def train(self): nlp = self.nlp # gather unique entities entities = [] for _, train in self.storage.train.items.items(): for _, gold in train.items.items(): entities.append(gold.entity) # add custom entities based on https://spacy.io/usage/training#example-new-entity-type ner = nlp.get_pipe('ner') for entity in set(entities): ner.add_label(entity) # prepare data train_idx = list(self.storage.train.items.keys()) trains = odict() for idx, train in self.storage.train.items.items(): entities = odict() for gold_idx, gold in train.items.items(): # ensure span is valid positions if not gold.span: offset = train.text.find(gold.subtext) if offset != -1: gold.span = '%s,%s' % (offset, offset + len(gold.subtext)) span = gold.span.replace(' ', '').strip() if not entities.get(span): entities[span] = gold.entity trains[idx] = {'entities': []} for span, entity in entities.items(): spans = span.split(',') trains[idx]['entities'].append([int(spans[0]), int(spans[1]), entity]) # train now nlp.vocab.vectors.name = 'spacy_pretrained_vectors' optimizer = nlp.begin_training() for itn in range(self.storage.config.train_iteration): random.shuffle(train_idx) for idx in train_idx: text = self.storage.train.items[idx].text train = trains[idx] nlp.update([text], [train], drop=self.storage.config.train_drop, sgd=optimizer) # auto save if required and nlp_path is defined if self.storage.config.train_autosave and self.storage.nlp_path: if EXCELCY_MATCHER in nlp.pipe_names: nlp.remove_pipe(EXCELCY_MATCHER) nlp.to_disk(self.storage.nlp_path) return self
def train(self): nlp = self.nlp # gather unique entities entities = [] for _, train in self.storage.train.items.items(): for _, gold in train.items.items(): entities.append(gold.entity) # add custom entities based on https://spacy.io/usage/training#example-new-entity-type ner = nlp.get_pipe('ner') for entity in set(entities): ner.add_label(entity) # prepare data train_idx = list(self.storage.train.items.keys()) trains = odict() for idx, train in self.storage.train.items.items(): entities = odict() for gold_idx, gold in train.items.items(): # ensure offset is valid positions if not gold.offset: offset = train.text.find(gold.subtext) if offset != -1: gold.offset = '%s,%s' % (offset, offset + len(gold.subtext)) offset = gold.offset.replace(' ', '').strip() if not entities.get(offset): entities[offset] = gold.entity trains[idx] = {'entities': []} for offset, entity in entities.items(): offsets = offset.split(',') trains[idx]['entities'].append( [int(offsets[0]), int(offsets[1]), entity]) # train now nlp.vocab.vectors.name = 'spacy_pretrained_vectors' optimizer = nlp.begin_training() for itn in range(self.storage.config.train_iteration): random.shuffle(train_idx) for idx in train_idx: text = self.storage.train.items[idx].text train = trains[idx] nlp.update([text], [train], drop=self.storage.config.train_drop, sgd=optimizer) return self
def _save_xlsx(self, file_path: str):
    """
    Serialise the source, prepare, train and config registries into an XLSX
    workbook, one sheet per registry; the first row of each sheet is the
    header.

    :param file_path: destination path of the XLSX workbook
    """
    def convert(header: list, registry: Registry):
        # Project one registry object onto the sheet columns; a missing
        # attribute becomes an empty cell (None).
        return [getattr(registry, key, None) for key in header]

    # (removed an unused `data = self.items()` local — nothing consumed it)
    sheets = odict()
    # build source sheet
    headers = ['idx', 'kind', 'value']
    sheets['source'] = [headers]
    for _, source in self.source.items.items():
        sheets['source'].append(convert(sheets['source'][0], source))
    # build prepare sheet
    headers = ['idx', 'kind', 'value', 'entity']
    sheets['prepare'] = [headers]
    for _, prepare in self.prepare.items.items():
        sheets['prepare'].append(convert(sheets['prepare'][0], prepare))
    # build train sheet: each train row is followed by its gold rows
    headers = ['idx', 'text', 'subtext', 'entity']
    sheets['train'] = [headers]
    for _, train in self.train.items.items():
        sheets['train'].append(convert(sheets['train'][0], train))
        for _, gold in train.items.items():
            sheets['train'].append(convert(sheets['train'][0], gold))
    # build config sheet as flat name/value pairs
    headers = ['name', 'value']
    sheets['config'] = [headers]
    for config_name, config_value in self.config.items().items():
        sheets['config'].append([config_name, config_value])
    # save
    utils.excel_save(sheets=sheets, file_path=file_path)
def _save_xlsx(self, file_path: str, kind: list):
    """
    Serialise the selected registry sections into an XLSX workbook, one
    sheet per section; the first row of each sheet is the header.

    :param file_path: destination path of the XLSX workbook
    :param kind: section names to export ('phase', 'source', 'prepare',
                 'train', 'config')
    """
    def row_for(header: list, registry: Registry) -> list:
        # Project one registry object onto the sheet columns; a missing
        # attribute becomes an empty cell (None).
        return [getattr(registry, key, None) for key in header]

    sheets = odict()

    # phase sheet: the args dict is flattened into a "k=v, k=v" string cell
    if 'phase' in kind:
        header = ['idx', 'enabled', 'fn', 'args', 'notes']
        args_col = header.index('args')
        rows = [header]
        for phase in self.phase.items.values():
            row = row_for(header, phase)
            row[args_col] = ', '.join(
                '%s=%s' % (k, v) for k, v in row[args_col].items())
            rows.append(row)
        sheets['phase'] = rows

    # source and prepare sheets share the same simple one-row-per-item shape
    for name, header, registry in (
            ('source', ['idx', 'enabled', 'kind', 'value', 'notes'], self.source),
            ('prepare', ['idx', 'enabled', 'kind', 'value', 'entity', 'notes'], self.prepare)):
        if name in kind:
            rows = [header]
            for item in registry.items.values():
                rows.append(row_for(header, item))
            sheets[name] = rows

    # train sheet: each train row is followed by its gold rows
    if 'train' in kind:
        header = ['idx', 'enabled', 'text', 'subtext', 'entity', 'notes']
        rows = [header]
        for train in self.train.items.values():
            rows.append(row_for(header, train))
            for gold in train.items.values():
                rows.append(row_for(header, gold))
        sheets['train'] = rows

    # config sheet as flat name/value pairs
    if 'config' in kind:
        rows = [['name', 'value']]
        for config_name, config_value in self.config.as_dict().items():
            rows.append([config_name, config_value])
        sheets['config'] = rows

    # save
    utils.excel_save(sheets=sheets, file_path=file_path)
def parse(self, data: odict):
    """
    Overwrite the current state of storage with the given data.

    :param data: data in ordereddict form (as produced by the loaders)
    """
    # Work on a private copy so the caller's structure is never mutated.
    data = copy.deepcopy(data)

    # phase section: resolve placeholder arg values before building items
    self.phase = Phases()
    for item in data.get('phase', {}).get('items', {}).values():
        args = item.get('args', odict())
        for key in args:
            args[key] = self.resolve_value(value=args[key])
        self.phase.add_item(item=Phase.make(items=item))

    # source section
    self.source = Sources()
    for item in data.get('source', {}).get('items', {}).values():
        self.source.add_item(item=Source.make(items=item))

    # prepare section
    self.prepare = Prepares()
    for item in data.get('prepare', {}).get('items', {}).values():
        self.prepare.add_item(item=Prepare.make(items=item))

    # train section: each train entry carries its own nested gold entries
    self.train = Trains()
    for train_item in data.get('train', {}).get('items', {}).values():
        train = Train.make(items=train_item)
        self.train.add_item(item=train)
        for gold_item in train_item.get('items', {}).values():
            train.add_item(item=Gold.make(items=gold_item))

    # config section
    self.config = Config.make(items=data.get('config', {}))
def add(self, fn: str, args: dict = None, idx: str = None):
    """
    Create a Phase for the given function name and register it.

    Fixes a defect in the original: ``str(idx)`` turned the default
    ``idx=None`` into the literal string ``'None'``; None is now passed
    through unchanged so the registry can assign its own index.

    :param fn: phase function identifier
    :param args: optional phase arguments (copied into an odict-backed value)
    :param idx: optional explicit index; coerced to str when provided
    :return: the newly created Phase item
    """
    item = Phase(fn=fn, args=args or odict(), idx=str(idx) if idx is not None else None)
    self.add_item(item=item)
    return item
def _load_xlsx(self, file_path: str):
    """
    Data loader for XLSX, this needs to be converted back to YML structure format

    :param file_path: XLSX file path
    """
    wb = utils.excel_load(file_path=file_path)
    data = odict()
    # TODO: add validator, if wrong data input
    # TODO: refactor to less hardcoded?
    # parse phase
    data['phase'] = odict()
    data['phase']['items'] = odict()
    for phase in wb.get('phase', []):
        # fall back to the positional index when the sheet has no idx cell
        idx = phase.get('idx', len(data['phase']['items']))
        # the args cell is a flat "key=value, key=value" string; parse it
        # into an ordered dict, silently skipping malformed fragments
        args = odict()
        raws = phase.get('args', '').split(',')
        for raw in raws:
            kv = raw.split('=')
            if len(kv) == 2:
                key, value = kv
                args[key.strip()] = value.strip()
        phase['args'] = args
        data['phase']['items'][str(idx)] = phase
    # parse source
    data['source'] = odict()
    data['source']['items'] = odict()
    for source in wb.get('source', []):
        idx = source.get('idx', len(data['source']['items']))
        data['source']['items'][str(idx)] = source
    # parse prepare
    data['prepare'] = odict()
    data['prepare']['items'] = odict()
    for prepare in wb.get('prepare', []):
        idx = prepare.get('idx', len(data['prepare']['items']))
        data['prepare']['items'][str(idx)] = prepare
    # parse train
    data['train'] = odict()
    data['train']['items'] = odict()
    # lets ensure there is idx
    # First pass: rows with a 'text' cell start a new train sentence; rows
    # without one are gold annotations of the current sentence. Missing idx
    # values are synthesised as "<train>" or "<train>.<gold>".
    train_idx, gold_idx = 0, 0
    for train in wb.get('train', []):
        if train.get('text') is not None:
            # a new sentence begins; advance the counter once the previous
            # sentence has collected at least one gold row
            if gold_idx > 0:
                train_idx = train_idx + 1
                gold_idx = 0
            if train.get('idx') is None:
                train['idx'] = str(train_idx)
        else:
            if train.get('idx') is None:
                train['idx'] = '%s.%s' % (train_idx, gold_idx)
            gold_idx = gold_idx + 1
    # Second pass: build the nested train -> gold structure keyed by idx.
    for train in wb.get('train', []):
        idx = str(train.get('idx'))
        train_idx, gold_idx = idx, None
        if '.' in idx:
            # a dotted idx means "train.gold"; the train part is the map key
            train_idx, gold_idx = idx.split('.')
        # add train list
        if train.get('text') is not None:
            t = odict()
            t['items'] = odict()
            for k in ['idx', 'text']:
                t[k] = train.get(k)
            data['train']['items'][train_idx] = t
        else:
            # NOTE(review): assumes a train row always precedes its gold rows;
            # a gold row referencing an unknown train idx raises KeyError here
            # — confirm this is acceptable input validation.
            t = data['train']['items'][train_idx]
            g = odict()
            for k in ['idx', 'subtext', 'offset', 'entity']:
                g[k] = train.get(k)
            t['items'][idx] = g
    # parse config
    data['config'] = odict()
    for config in wb.get('config', odict()):
        name, value = config.get('name'), config.get('value')
        data['config'][name] = value
    self.parse(data=data)
def retest(self):
    """
    Re-annotate every training sentence from scratch by wiping its golds
    and re-running the prepare parsing over it.
    """
    for sentence in self.storage.train.items.values():
        # Drop the previously collected gold annotations before re-testing.
        sentence.items = odict()
        # Re-testing reuses the same parsing pipeline as the prepare step.
        self._prepare_parse(train=sentence)
def __attrs_post_init__(self):
    """attrs post-init hook: initialise the nested item registry of this Train."""
    super(Train, self).__attrs_post_init__()
    # Ordered idx -> child item mapping (gold annotations of this sentence,
    # per how train.items is consumed elsewhere in the file).
    # NOTE(review): the explicit two-arg super() may be deliberate — zero-arg
    # super() can misbehave in attrs slotted classes; confirm before modernising.
    self.items = odict()
def __attrs_post_init__(self):
    """attrs post-init hook: initialise the item registry of this Prepares collection."""
    super(Prepares, self).__attrs_post_init__()
    # Ordered idx -> Prepare item mapping, filled via add_item().
    # NOTE(review): the explicit two-arg super() may be deliberate — zero-arg
    # super() can misbehave in attrs slotted classes; confirm before modernising.
    self.items = odict()
def __attrs_post_init__(self):
    """attrs post-init hook: initialise the item registry of this Sources collection."""
    super(Sources, self).__attrs_post_init__()
    # Ordered idx -> Source item mapping, filled via add_item().
    # NOTE(review): the explicit two-arg super() may be deliberate — zero-arg
    # super() can misbehave in attrs slotted classes; confirm before modernising.
    self.items = odict()