예제 #1
0
파일: conllu.py 프로젝트: zmjm4/ltp
    def _generate_examples(self, files):
        """Yield ``(line_num, example)`` pairs for every block in ``files``.

        Each block produced by ``iter_blocks`` is transposed into exactly ten
        parallel columns (id, form, lemma, upos, xpos, feats, head, deprel,
        deps, misc). When ``self.config.deps`` is truthy, the ``deps`` column
        is parsed into a flat list of ``{'id', 'head', 'rel'}`` dicts, and the
        block is skipped if any parsed head is not smaller than the number of
        words in the block.
        """
        for path in files:
            logging.info("⏳ Generating examples from = %s", path)
            for line_num, block in iter_blocks(filename=path):
                # Transpose rows into columns; unpacking enforces ten columns.
                columns = [list(column) for column in zip(*block)]
                (ids, words, lemma, upos, xpos, feats, head, deprel, deps,
                 misc) = columns

                if self.config.deps:
                    arcs = []
                    # Every cell holds '|'-separated "head:rel" labels; the
                    # arc 'id' is the 0-based position of the cell's row.
                    for token_idx, cell in enumerate(deps):
                        for label in cell.split('|'):
                            parts = label.split(':', maxsplit=1)
                            arcs.append({
                                'id': token_idx,
                                'head': int(parts[0]),
                                'rel': parts[-1],
                            })
                    deps = arcs
                    # Drop the whole block when a head is out of range.
                    if any(arc['head'] >= len(words) for arc in deps):
                        continue

                example = {
                    "id": ids,
                    "form": words,
                    "lemma": lemma,
                    "upos": upos,
                    "xpos": xpos,
                    "feats": feats,
                    "head": head,
                    "deprel": deprel,
                    "deps": deps,
                    "misc": misc,
                }
                yield line_num, example
예제 #2
0
    def _generate_examples(self, filepath):
        """Yield ``(line_num, example)`` pairs read from ``filepath``.

        Each block produced by ``iter_blocks`` is transposed into exactly two
        columns, exposed as ``"words"`` and ``"bio"`` in the example dict.
        """
        logging.info("⏳ Generating examples from = %s", filepath)
        for line_num, block in iter_blocks(filename=filepath):
            # Transpose rows into columns; unpacking enforces two columns.
            columns = [list(column) for column in zip(*block)]
            words, bio = columns

            example = {"words": words, "bio": bio}
            yield line_num, example
예제 #3
0
파일: corpus.py 프로젝트: zhangwj618/ltp
    def iter(self,
             filename: str,
             fields,
             multi_field: str = None,
             split=None,
             strip=None,
             proxy_property: dict = None):
        """Yield ``Example`` objects built from the blocks of ``filename``.

        ``self.build_slice`` supplies the field list and one slice per field.
        When ``proxy_property`` is given, each proxy field that is present
        reuses the slice of its source field. Every block is transposed into
        columns, sliced per field and passed through ``self.post_fn``, which
        returns ``(processed, more)``; when ``more`` is truthy ``processed``
        is iterated and each item becomes its own example. Failures while
        building an example are printed with the line number and skipped.
        """
        fields, fields_slices = self.build_slice(fields, multi_field,
                                                 proxy_property)
        if proxy_property is not None:
            position_of = {}
            for position, field in enumerate(fields):
                position_of[field[0]] = position
            for proxy, source in proxy_property.items():
                if proxy not in position_of:
                    continue
                source_slice = fields_slices[position_of[source]]
                fields_slices[position_of[proxy]] = source_slice

        # All blocks are read up front before being handed to tqdm.
        blocks = list(iter_blocks(filename, split, strip))
        for line_num, block in tqdm(blocks):
            columns = [list(column) for column in zip(*block)]
            selected = [columns[field_slice] for field_slice in fields_slices]
            processed, more = self.post_fn(selected)

            # Normalise the single-example case to a one-item batch.
            batch = processed if more else [processed]
            for item in batch:
                try:
                    yield Example.fromlist(item, fields)
                except Exception as e:
                    print(line_num, e)
예제 #4
0
파일: conllu.py 프로젝트: ztzdxqj/ltp
def build_vocabs(data_dir,
                 train_file,
                 dev_file=None,
                 test_file=None,
                 min_freq=5):
    """Write one vocabulary file per feature into ``<data_dir>/vocabs``.

    Does nothing if any of the vocabulary files already exists. Otherwise
    every given data file is read block by block with ``iter_blocks``, the
    values of each feature's column are counted, and the sorted keys of each
    counter are written to ``<feature>.txt``, one per line.

    Args:
        data_dir: Directory holding the data files and the ``vocabs`` folder.
        train_file: Data file name, relative to ``data_dir``.
        dev_file: Optional data file name; skipped when ``None``.
        test_file: Optional data file name; skipped when ``None``.
        min_freq: Entries of counters whose feature name contains ``'word'``
            are kept only if their count is strictly greater than this.
    """
    vocab_dir = os.path.join(data_dir, 'vocabs')
    # feature name -> (column index in a transposed block, counter)
    counters = {
        'word': (1, Counter()),
        'lemma': (2, Counter()),
        'upos': (3, Counter()),
        'xpos': (4, Counter()),
        'feats': (5, Counter()),
        'deprel': (7, Counter()),
        # FOR CHAR FEATS
        'word_char': (1, Counter()),
        # DEPS
        'deps': (8, Counter()),
    }

    if any(
            os.path.exists(os.path.join(vocab_dir, f'{key}.txt'))
            for key in counters):
        return

    os.makedirs(vocab_dir, exist_ok=True)

    for file_name in (train_file, dev_file, test_file):
        # The optional files default to None; joining None onto a path
        # raises TypeError, so absent files are skipped.
        if file_name is None:
            continue
        for _, block in iter_blocks(
                filename=os.path.join(data_dir, file_name)):
            values = [list(value) for value in zip(*block)]

            for name, (row, counter) in counters.items():
                if 'char' in name:
                    # Count the individual characters of every value.
                    counter.update(itertools.chain.from_iterable(values[row]))
                elif name == 'deps':
                    try:
                        # Each cell holds '|'-separated "head:rel" labels;
                        # only the part after the first ':' is counted.
                        rels = [
                            label.split(':', maxsplit=1)[1]
                            for dep in values[row]
                            for label in dep.split('|')
                        ]
                    except IndexError:
                        # A label without ':' (or a missing column): count a
                        # single '_' for the whole block instead.
                        counter.update(['_'])
                    else:
                        counter.update(rels)
                else:
                    counter.update(values[row])

    for feat, (_, counter) in counters.items():
        if 'word' in feat:
            counter = Counter({
                word: count
                for word, count in counter.items() if count > min_freq
            })

        with open(os.path.join(vocab_dir, f'{feat}.txt'), mode='w') as f:
            f.write('\n'.join(sorted(counter.keys())))
예제 #5
0
    def _generate_examples(self, files):
        """Yield ``(line_num, example)`` pairs for every block in ``files``.

        Each block produced by ``iter_blocks`` is transposed into columns:
        the first becomes ``"form"``, the second ``"predicate"`` and all
        remaining columns are collected as a list under ``"arguments"``.
        """
        for path in files:
            logging.info("⏳ Generating examples from = %s", path)
            for line_num, block in iter_blocks(filename=path):
                columns = [list(column) for column in zip(*block)]
                # At least two columns are required; the rest may be empty.
                words, predicate, *roles = columns

                example = {
                    "form": words,
                    "predicate": predicate,
                    "arguments": roles,
                }
                yield line_num, example
예제 #6
0
def build_vocabs(data_dir, *files):
    """Write ``predicate.txt`` and ``arguments.txt`` into ``<data_dir>/vocabs``.

    Does nothing if either vocabulary file already exists. Otherwise every
    file in ``files`` is read block by block with ``iter_blocks``; column 1
    is counted as predicate tags and columns 2 onwards as argument tags.

    The predicate vocabulary is ``'_'`` followed by the other observed tags
    in sorted order. The argument vocabulary is ``'O'`` (plus ``'B-V'`` when
    observed), followed by ``B-<role>`` for every role and then ``I-<role>``
    for every role, where a role is an observed tag with its first two
    characters removed.

    Args:
        data_dir: Directory in which the ``vocabs`` folder is created.
        *files: Paths of the data files to read.
    """
    vocab_dir = os.path.join(data_dir, 'vocabs')
    # feature name -> (column selector in a transposed block, counter)
    counters = {
        'predicate': (1, Counter()),
        'arguments': (slice(2, None), Counter()),
    }

    if any(
            os.path.exists(os.path.join(vocab_dir, f'{key}.txt'))
            for key in counters):
        return

    os.makedirs(vocab_dir, exist_ok=True)

    for filename in files:
        for _, block in iter_blocks(filename=filename):
            values = [list(value) for value in zip(*block)]

            for row, counter in counters.values():
                current = values[row]
                if not current:
                    continue
                if isinstance(current[0], list):
                    # A slice selects several columns: count each of them.
                    for column in current:
                        counter.update(column)
                else:
                    counter.update(current)

    for feat, (_, counter) in counters.items():
        # Tags are computed before the file is opened so that a failure can
        # never leave an empty vocab file behind. The reserved tags are
        # always emitted first, even if they were never observed
        # (list.remove would raise ValueError in that case).
        if feat == 'predicate':
            tags = ['_'] + sorted(tag for tag in counter if tag != '_')
        elif feat == 'arguments':
            observed = set(counter)
            reserved = ['O', 'B-V'] if 'B-V' in observed else ['O']
            roles = sorted(
                {tag[2:] for tag in observed if tag not in ('O', 'B-V')})
            tags = (reserved + [f'B-{role}' for role in roles] +
                    [f'I-{role}' for role in roles])
        else:
            # Not reached with the two counters defined above.
            tags = ['_']

        with open(os.path.join(vocab_dir, f'{feat}.txt'), mode='w') as f:
            f.write('\n'.join(tags))
예제 #7
0
def build_vocabs(data_dir, *files):
    """Write the BIO tag vocabulary to ``<data_dir>/vocabs/bio.txt``.

    Does nothing if the vocabulary file already exists. Otherwise every file
    in ``files`` is read block by block with ``iter_blocks`` and the values
    of column 1 are counted. The file lists ``'O'`` first, followed by the
    other observed tags in sorted order, one per line.

    Args:
        data_dir: Directory in which the ``vocabs`` folder is created.
        *files: Paths of the data files to read.
    """
    vocab_dir = os.path.join(data_dir, 'vocabs')
    vocab_path = os.path.join(vocab_dir, 'bio.txt')

    if os.path.exists(vocab_path):
        return

    os.makedirs(vocab_dir, exist_ok=True)

    counter = Counter()
    for filename in files:
        for _, block in iter_blocks(filename=filename):
            values = [list(value) for value in zip(*block)]
            counter.update(values[1])

    # 'O' always comes first, even if it was never observed (list.remove
    # would raise ValueError in that case). Tags are computed before the
    # file is opened so a failure cannot leave an empty vocab file behind.
    tags = ['O'] + sorted(tag for tag in counter if tag != 'O')

    with open(vocab_path, mode='w') as f:
        f.write('\n'.join(tags))