示例#1
0
    def train(self):
        """
        Train the 'ner' pipe of ``self.nlp`` with the gold annotations stored
        in ``self.storage.train``.

        Every gold item contributes one ``[start, end, entity]`` triple for its
        train text. The position is read from ``gold.span`` (a ``'start,end'``
        string); when it is empty it is derived from the first occurrence of
        ``gold.subtext`` in the train text and written back to ``gold.span``.
        Gold items whose position cannot be determined (no span and the
        subtext is empty or not found in the text) are skipped instead of
        aborting the whole run. For a given span the first entity wins.

        When ``train_autosave`` is enabled and ``storage.nlp_path`` is set, the
        pipeline is written to that path after training.

        :return: self
        """
        nlp = self.nlp
        train_items = self.storage.train.items

        # gather unique entities
        labels = {
            gold.entity
            for train in train_items.values()
            for gold in train.items.values()
        }

        # add custom entities based on https://spacy.io/usage/training#example-new-entity-type
        ner = nlp.get_pipe('ner')
        for label in labels:
            ner.add_label(label)

        # prepare data
        train_idx = list(train_items.keys())
        trains = odict()
        for idx, train in train_items.items():
            span_entities = odict()
            for gold in train.items.values():
                # ensure span is valid positions
                if not gold.span:
                    offset = train.text.find(gold.subtext) if gold.subtext else -1
                    if offset == -1:
                        # nothing to anchor this annotation to; skipping it is
                        # safer than crashing on a missing span further down
                        continue
                    gold.span = '%s,%s' % (offset, offset + len(gold.subtext))
                span = gold.span.replace(' ', '').strip()
                # keep the first entity registered for a span
                if not span_entities.get(span):
                    span_entities[span] = gold.entity
            annotations = []
            for span, entity in span_entities.items():
                positions = span.split(',')
                annotations.append([int(positions[0]), int(positions[1]), entity])
            trains[idx] = {'entities': annotations}

        # train now
        nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
        # NOTE(review): begin_training() may re-initialise existing model
        # weights in spaCy 2.x - confirm that this is intended here
        optimizer = nlp.begin_training()
        for _ in range(self.storage.config.train_iteration):
            random.shuffle(train_idx)
            for idx in train_idx:
                text = train_items[idx].text
                nlp.update([text], [trains[idx]], drop=self.storage.config.train_drop, sgd=optimizer)

        # auto save if required and nlp_path is defined
        if self.storage.config.train_autosave and self.storage.nlp_path:
            # the matcher pipe is removed before the pipeline is written out
            if EXCELCY_MATCHER in nlp.pipe_names:
                nlp.remove_pipe(EXCELCY_MATCHER)
            nlp.to_disk(self.storage.nlp_path)

        return self
示例#2
0
    def train(self):
        """
        Train the 'ner' pipe of ``self.nlp`` with the gold annotations stored
        in ``self.storage.train``.

        Every gold item contributes one ``[start, end, entity]`` triple for its
        train text. The position is read from ``gold.offset`` (a
        ``'start,end'`` string); when it is empty it is derived from the first
        occurrence of ``gold.subtext`` in the train text and written back to
        ``gold.offset``. Gold items whose position cannot be determined (no
        offset and the subtext is empty or not found in the text) are skipped
        instead of aborting the whole run. For a given offset the first entity
        wins.

        :return: self
        """
        nlp = self.nlp
        train_items = self.storage.train.items

        # gather unique entities
        labels = {
            gold.entity
            for train in train_items.values()
            for gold in train.items.values()
        }

        # add custom entities based on https://spacy.io/usage/training#example-new-entity-type
        ner = nlp.get_pipe('ner')
        for label in labels:
            ner.add_label(label)

        # prepare data
        train_idx = list(train_items.keys())
        trains = odict()
        for idx, train in train_items.items():
            offset_entities = odict()
            for gold in train.items.values():
                # ensure offset is valid positions
                if not gold.offset:
                    start = train.text.find(gold.subtext) if gold.subtext else -1
                    if start == -1:
                        # nothing to anchor this annotation to; skipping it is
                        # safer than crashing on a missing offset further down
                        continue
                    gold.offset = '%s,%s' % (start, start + len(gold.subtext))
                offset = gold.offset.replace(' ', '').strip()
                # keep the first entity registered for an offset
                if not offset_entities.get(offset):
                    offset_entities[offset] = gold.entity
            annotations = []
            for offset, entity in offset_entities.items():
                positions = offset.split(',')
                annotations.append(
                    [int(positions[0]), int(positions[1]), entity])
            trains[idx] = {'entities': annotations}

        # train now
        nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
        # NOTE(review): begin_training() may re-initialise existing model
        # weights in spaCy 2.x - confirm that this is intended here
        optimizer = nlp.begin_training()
        for _ in range(self.storage.config.train_iteration):
            random.shuffle(train_idx)
            for idx in train_idx:
                text = train_items[idx].text
                nlp.update([text], [trains[idx]],
                           drop=self.storage.config.train_drop,
                           sgd=optimizer)

        return self
示例#3
0
    def _save_xlsx(self, file_path: str):
        """
        Write the source, prepare, train and config data to an XLSX workbook.

        Every sheet is a list of rows whose first row is the header; each
        following row holds the attributes named by that header (``None`` for
        an attribute the object does not have). On the train sheet each train
        row is followed by the rows of its gold items.

        :param file_path: XLSX file path to write
        """
        def to_row(columns: list, registry: Registry):
            # pick the attributes named by the header, None when absent
            row = []
            for column in columns:
                row.append(getattr(registry, column, None))
            return row

        # the result is not used below; the call is kept as it was
        self.items()
        workbook = odict()

        # source sheet
        source_columns = ['idx', 'kind', 'value']
        source_rows = [source_columns]
        source_rows.extend(
            to_row(source_columns, source)
            for source in self.source.items.values()
        )
        workbook['source'] = source_rows

        # prepare sheet
        prepare_columns = ['idx', 'kind', 'value', 'entity']
        prepare_rows = [prepare_columns]
        prepare_rows.extend(
            to_row(prepare_columns, prepare)
            for prepare in self.prepare.items.values()
        )
        workbook['prepare'] = prepare_rows

        # train sheet: a train row, then one row per gold item of that train
        train_columns = ['idx', 'text', 'subtext', 'entity']
        train_rows = [train_columns]
        for train in self.train.items.values():
            train_rows.append(to_row(train_columns, train))
            train_rows.extend(
                to_row(train_columns, gold) for gold in train.items.values()
            )
        workbook['train'] = train_rows

        # config sheet
        config_rows = [['name', 'value']]
        config_rows.extend(
            [name, value] for name, value in self.config.items().items()
        )
        workbook['config'] = config_rows

        # save
        utils.excel_save(sheets=workbook, file_path=file_path)
示例#4
0
文件: storage.py 项目: todun/excelcy
    def _save_xlsx(self, file_path: str, kind: list):
        """
        Write the selected parts of the storage to an XLSX workbook.

        A sheet is produced for each of 'phase', 'source', 'prepare', 'train'
        and 'config' that is contained in ``kind``. Every sheet is a list of
        rows whose first row is the header; each following row holds the
        attributes named by that header (``None`` for an attribute the object
        does not have).

        :param file_path: XLSX file path to write
        :param kind: Names of the sheets to include
        """
        def to_row(columns: list, registry: Registry) -> list:
            # pick the attributes named by the header, None when absent
            row = []
            for column in columns:
                row.append(getattr(registry, column, None))
            return row

        workbook = odict()

        # phase sheet: the args mapping is flattened to 'key=value, key=value'
        if 'phase' in kind:
            columns = ['idx', 'enabled', 'fn', 'args', 'notes']
            rows = [columns]
            workbook['phase'] = rows
            args_at = columns.index('args')
            for phase in self.phase.items.values():
                row = to_row(columns, phase)
                pairs = ['%s=%s' % (key, value)
                         for key, value in row[args_at].items()]
                row[args_at] = ', '.join(pairs)
                rows.append(row)

        # source sheet
        if 'source' in kind:
            columns = ['idx', 'enabled', 'kind', 'value', 'notes']
            rows = [columns]
            workbook['source'] = rows
            for source in self.source.items.values():
                rows.append(to_row(columns, source))

        # prepare sheet
        if 'prepare' in kind:
            columns = ['idx', 'enabled', 'kind', 'value', 'entity', 'notes']
            rows = [columns]
            workbook['prepare'] = rows
            for prepare in self.prepare.items.values():
                rows.append(to_row(columns, prepare))

        # train sheet: a train row, then one row per gold item of that train
        if 'train' in kind:
            columns = ['idx', 'enabled', 'text', 'subtext', 'entity', 'notes']
            rows = [columns]
            workbook['train'] = rows
            for train in self.train.items.values():
                rows.append(to_row(columns, train))
                for gold in train.items.values():
                    rows.append(to_row(columns, gold))

        # config sheet
        if 'config' in kind:
            rows = [['name', 'value']]
            workbook['config'] = rows
            for name, value in self.config.as_dict().items():
                rows.append([name, value])

        # save
        utils.excel_save(sheets=workbook, file_path=file_path)
示例#5
0
文件: storage.py 项目: todun/excelcy
    def parse(self, data: odict):
        """
        Replace the current state of the storage with the given data.

        The ``phase``, ``source``, ``prepare`` and ``train`` sections are each
        read from their ``items`` mapping; a missing section simply yields an
        empty registry. Train entries may carry their own ``items`` mapping of
        gold entries. ``config`` is taken from the ``config`` section.

        :param data: Data in ordereddict
        """
        # work on a private copy: phase args are rewritten in place below
        snapshot = copy.deepcopy(data)

        def section_items(name):
            # items mapping of a section, empty when the section is absent
            return snapshot.get(name, {}).get('items', {})

        # phases: every argument value goes through resolve_value first
        self.phase = Phases()
        for raw_phase in section_items('phase').values():
            phase_args = raw_phase.get('args', odict())
            for arg_name, arg_value in phase_args.items():
                phase_args[arg_name] = self.resolve_value(value=arg_value)
            self.phase.add_item(item=Phase.make(items=raw_phase))

        # sources
        self.source = Sources()
        for raw_source in section_items('source').values():
            self.source.add_item(item=Source.make(items=raw_source))

        # prepares
        self.prepare = Prepares()
        for raw_prepare in section_items('prepare').values():
            self.prepare.add_item(item=Prepare.make(items=raw_prepare))

        # trains, each followed by its gold items
        self.train = Trains()
        for raw_train in section_items('train').values():
            train = Train.make(items=raw_train)
            self.train.add_item(item=train)
            for raw_gold in raw_train.get('items', {}).values():
                train.add_item(item=Gold.make(items=raw_gold))

        # config
        self.config = Config.make(items=snapshot.get('config', {}))
示例#6
0
文件: storage.py 项目: todun/excelcy
 def add(self, fn: str, args: dict = None, idx: str = None):
     """
     Create a Phase and register it through ``add_item``.

     :param fn: Value passed to the Phase as ``fn``
     :param args: Arguments of the phase; an empty ordered dict when omitted
     :param idx: Index of the phase; when omitted the current number of
         items is used
     :return: The created Phase
     """
     # str(None) would store the literal text 'None' as the index, so an
     # omitted idx falls back to the item count instead.
     # NOTE(review): assumes self.items is the mapping of registered phases - confirm
     if idx is None:
         idx = len(self.items)
     item = Phase(fn=fn, args=args or odict(), idx=str(idx))
     self.add_item(item=item)
     return item
示例#7
0
文件: storage.py 项目: todun/excelcy
    def _load_xlsx(self, file_path: str):
        """
        Data loader for XLSX, this needs to be converted back to YML structure format.

        The workbook is read with ``utils.excel_load`` and used as a mapping of
        sheet name to row mappings. The 'phase', 'source', 'prepare', 'train'
        and 'config' sheets are rebuilt into the nested structure that
        ``parse`` consumes, and ``parse`` is called with the result.

        Rows without an idx get one generated: the position in the sheet for
        phase/source/prepare rows, a running number for train rows (rows with
        a text) and ``'<train idx>.<n>'`` for the gold rows following them.

        :param file_path: XLSX file path
        :raises KeyError: If a gold row refers to a train row that has not
            been seen before it
        """
        wb = utils.excel_load(file_path=file_path)
        data = odict()

        # TODO: add validator, if wrong data input
        # TODO: refactor to less hardcoded?

        def resolve_idx(row, items):
            # an idx that is absent or None falls back to the row position,
            # instead of ending up as the literal key 'None'
            idx = row.get('idx')
            return str(len(items) if idx is None else idx)

        # parse phase
        data['phase'] = odict()
        data['phase']['items'] = odict()
        for phase in wb.get('phase', []):
            idx = resolve_idx(phase, data['phase']['items'])
            args = odict()
            # args is stored as 'key=value, key=value'; the cell may be None
            for raw in (phase.get('args') or '').split(','):
                # split on the first '=' only, so a value may contain '='
                key, separator, value = raw.partition('=')
                if separator:
                    args[key.strip()] = value.strip()
            phase['args'] = args
            data['phase']['items'][idx] = phase

        # parse source
        data['source'] = odict()
        data['source']['items'] = odict()
        for source in wb.get('source', []):
            idx = resolve_idx(source, data['source']['items'])
            data['source']['items'][idx] = source

        # parse prepare
        data['prepare'] = odict()
        data['prepare']['items'] = odict()
        for prepare in wb.get('prepare', []):
            idx = resolve_idx(prepare, data['prepare']['items'])
            data['prepare']['items'][idx] = prepare

        # parse train
        data['train'] = odict()
        data['train']['items'] = odict()
        train_rows = wb.get('train', [])
        # lets ensure there is idx
        train_idx, gold_idx = 0, 0
        seen_train = False
        for train in train_rows:
            if train.get('text') is not None:
                # every train row after the first one starts a new number,
                # also when the previous train row had no gold rows
                if seen_train:
                    train_idx = train_idx + 1
                seen_train = True
                gold_idx = 0
                if train.get('idx') is None:
                    train['idx'] = str(train_idx)
            else:
                if train.get('idx') is None:
                    train['idx'] = '%s.%s' % (train_idx, gold_idx)
                gold_idx = gold_idx + 1
        for train in train_rows:
            idx = str(train.get('idx'))
            # a gold idx has the form '<train idx>.<gold idx>'
            train_idx = idx
            if '.' in idx:
                train_idx, _ = idx.split('.')
            # add train list
            if train.get('text') is not None:
                t = odict()
                t['items'] = odict()
                for k in ['idx', 'text']:
                    t[k] = train.get(k)
                data['train']['items'][train_idx] = t
            else:
                t = data['train']['items'][train_idx]
                g = odict()
                for k in ['idx', 'subtext', 'offset', 'entity']:
                    g[k] = train.get(k)
                t['items'][idx] = g

        # parse config
        data['config'] = odict()
        for config in wb.get('config', odict()):
            name, value = config.get('name'), config.get('value')
            data['config'][name] = value

        self.parse(data=data)
示例#8
0
 def retest(self):
     """
     Rebuild the gold items of every stored train entry.

     Each train entry gets a fresh, empty ``items`` mapping and is then handed
     to ``_prepare_parse``, the same routine used by the prepare step.
     """
     for train in self.storage.train.items.values():
         # start from a clean slate so earlier entities do not linger
         train.items = odict()
         self._prepare_parse(train=train)
示例#9
0
 def __attrs_post_init__(self):
     """Run the parent post-init hook, then reset ``items`` to an empty ordered mapping."""
     parent = super(Train, self)
     parent.__attrs_post_init__()
     self.items = odict()
示例#10
0
 def __attrs_post_init__(self):
     """Run the parent post-init hook, then reset ``items`` to an empty ordered mapping."""
     parent = super(Prepares, self)
     parent.__attrs_post_init__()
     self.items = odict()
示例#11
0
 def __attrs_post_init__(self):
     """Run the parent post-init hook, then reset ``items`` to an empty ordered mapping."""
     parent = super(Sources, self)
     parent.__attrs_post_init__()
     self.items = odict()