Example #1
import re

def sgm2plain(sgm_path, plain_path=None, nop_if_exists=True):
    """
    Convert an .sgm file to a plain text file whose contents are the
    sentences found in the <seg> tags.

    :param sgm_path: sgm file path
    :type sgm_path: str

    :param plain_path: target plain text file; if None, it is derived from
    sgm_path by replacing '.sgm' with '.plain' (optional)
    :type plain_path: str or None

    :param nop_if_exists: if True and the target plain file already exists,
    do nothing; otherwise overwrite it
    :type nop_if_exists: bool
    """
    sgm_suffix = '.sgm'
    if not sgm_path.endswith(sgm_suffix):
        raise ValueError('sgm_path must end with {}'.format(sgm_suffix))

    if not file_exists(sgm_path):
        raise ValueError('sgm_path {} does not exist'.format(sgm_path))

    if plain_path is None:
        pos = sgm_path.rfind(sgm_suffix)
        plain_path = sgm_path[:pos] + '.plain'

    if file_exists(plain_path) and nop_if_exists:
        return

    lines = [x.strip(' \r\n') for x in load_file_contents(sgm_path)]
    sentences = [
        re.sub(r'</?.*?>', '', x)
        for x in filter(lambda x: x.startswith('<seg'), lines)
    ]
    write_iterable_contents(plain_path, sentences)
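A hypothetical usage sketch; the file name is illustrative, and the helper
functions (file_exists, load_file_contents, write_iterable_contents) are
assumed to be importable from the project's utility module:

sgm2plain('newstest2014.en.sgm')                       # writes newstest2014.en.plain
sgm2plain('newstest2014.en.sgm', nop_if_exists=False)  # force regeneration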
Example #2
from collections import OrderedDict

from tqdm import tqdm

def main(args):
    source, target, output = args.source, args.target, args.output_path
    assert all([file_exists(x) for x in [source, target]])

    src_contents = load_file_contents(source)
    tgt_contents = load_file_contents(target)
    src_cnt, tgt_cnt = len(src_contents), len(tgt_contents)
    assert src_cnt <= tgt_cnt
    it_print('source {} lines, target {} lines'.format(src_cnt, tgt_cnt))

    if output is None:
        output = 'mapping.json'

    src_mapping = OrderedDict((k + 1, src_contents[k]) for k in range(src_cnt))
    tgt_mapping = OrderedDict((k + 1, tgt_contents[k]) for k in range(tgt_cnt))

    mapping = OrderedDict()
    with tqdm(total=src_cnt, disable=not args.verbose) as pb:
        src_keys = sorted(src_mapping)
        for key in src_keys:
            value, matched = src_mapping[key], None
            for sub in tgt_mapping:
                if value == tgt_mapping[sub]:
                    mapping[key] = sub
                    matched = sub
                    break
            # pop only on a real match; after an unsuccessful scan `sub`
            # would still hold the last target key, so testing it is a bug
            if matched is not None:
                it_print('{} -> {}'.format(key, matched))
                src_mapping.pop(key)
                tgt_mapping.pop(matched)
            pb.update(1)

    write_file_contents(output, to_json(mapping, indent=2))
    write_file_contents('source.left.json', to_json(src_mapping, indent=2))
    write_file_contents('target.left.json', to_json(tgt_mapping, indent=2))
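The inner scan above makes the matching O(len(source) * len(target)). A
minimal sketch of the same idea with a reverse index (illustrative names,
not part of the original script) needs only one pass over each file:

from collections import OrderedDict

def build_mapping(src_lines, tgt_lines):
    # index each target sentence by its 1-based line numbers
    reverse = {}
    for i, line in enumerate(tgt_lines, start=1):
        reverse.setdefault(line, []).append(i)
    # consume the earliest unused target line for every source line
    mapping = OrderedDict()
    for i, line in enumerate(src_lines, start=1):
        if reverse.get(line):
            mapping[i] = reverse[line].pop(0)
    return mapping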
Example #3
def check_file_paths(file_paths):
    # verify every file exists and that all of them have the same line count
    line_cnt = None
    for file_path in file_paths:
        if not file_exists(file_path):
            it_print('file path [{}] does not exist.'.format(file_path))
            exit(1)
        cnt = file_lines(file_path)
        if line_cnt is None:
            line_cnt = cnt
        elif line_cnt != cnt:
            it_print('file lines mismatch: {} => {}.'.format(line_cnt, cnt))
            exit(1)
    return line_cnt
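A hypothetical call on a parallel corpus (the file names are made up;
file_exists, file_lines and it_print come from the project's utilities):

line_cnt = check_file_paths(['train.en.tok', 'train.de.tok'])
it_print('parallel corpus holds {} aligned lines'.format(line_cnt))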
Example #4
from collections import OrderedDict

def main(args):
    ext = '.dup'
    if args.corpus:
        source, target = args.source, args.target
        src_dup_path, tgt_dup_path = source + ext, target + ext
    else:
        src_dup_path, tgt_dup_path = args.source, args.target
        assert src_dup_path.endswith(ext) and tgt_dup_path.endswith(ext)
        # slice off only the trailing extension; str.replace would also
        # touch any '.dup' occurring earlier in the path
        source = src_dup_path[:-len(ext)]
        target = tgt_dup_path[:-len(ext)]
    if not all([file_exists(x) for x in [src_dup_path, tgt_dup_path]]):
        raise ValueError('.dup files do not exist; try running tok_stat.py '
                         'with the -o option to generate them')

    co_duplicated = OrderedDict()
    src_duplicated = parse_duplicated(src_dup_path, 'source')
    tgt_duplicated = parse_duplicated(tgt_dup_path, 'target')
    analyze(co_duplicated, src_duplicated, tgt_duplicated)

    if args.remove:
        execute(source, target, co_duplicated, verbose=args.verbose)

    contents = to_json(co_duplicated, indent=2)
    write_file_contents(args.output, contents)
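A tiny illustrative check of why the trailing extension is sliced off
rather than removed with str.replace (the path is made up):

path = 'corpus.dup/train.en.dup'
print(path.replace('.dup', ''))  # 'corpus/train.en' -- directory name mangled
print(path[:-len('.dup')])       # 'corpus.dup/train.en'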
Example #5
from collections import OrderedDict

from six import string_types  # assumed origin of string_types used below

data_path = './data'
create_if_not_exists(data_path)

cache_path = './cache'
create_if_not_exists(cache_path)

index_path = concat_path(data_path, 'index.json')
paper_cache_path = concat_path(cache_path, 'papers.json')
query_cache_path = concat_path(cache_path, 'queries.json')
cache_size = 80

dblp_data_path = concat_path(data_path, 'dblp')
create_if_not_exists(dblp_data_path)

# initialize and load index
if not file_exists(index_path):
    data = {'version': 0.1}
    write_json_contents(index_path, data)
index = parse_json(load_file_contents(index_path, pieces=False))
index = OrderedDict(index)
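Assuming the JSON helpers wrap the standard library (the actual
implementations may differ), the index bootstrap above is roughly
equivalent to this stdlib round trip:

import json
import os
from collections import OrderedDict

if not os.path.exists(index_path):
    with open(index_path, 'w') as fh:
        json.dump({'version': 0.1}, fh)
with open(index_path) as fh:
    index = OrderedDict(json.load(fh))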

# initialize and load paper_cache
if not file_exists(paper_cache_path):
    it_print('building papers cache ...')
    data = {'version': 0.1, 'build_time': current_datetime(), 'values': {}}

    # build cache
    for key, value in index.items():
        if not isinstance(value, string_types):
            continue
        kwargs = {'source': None}