def sgm2plain(sgm_path, plain_path=None, nop_if_exists=True):
    """
    Convert a .sgm file to a plain text file whose contents are the sentences
    inside the <seg> tags

    :param sgm_path: sgm file path
    :type sgm_path: str
    :param plain_path: target plain text file; if None, defaults to sgm_path
        with its '.sgm' suffix replaced by '.plain' (optional)
    :type plain_path: str or None
    :param nop_if_exists: if the target plain file already exists, do nothing,
        otherwise overwrite it
    :type nop_if_exists: bool
    """
    sgm_suffix = '.sgm'
    if not sgm_path.endswith(sgm_suffix):
        raise ValueError('sgm_path must end with {}'.format(sgm_suffix))
    if not file_exists(sgm_path):
        raise ValueError('sgm_path {} does not exist'.format(sgm_path))
    if plain_path is None:
        pos = sgm_path.rfind(sgm_suffix)
        plain_path = sgm_path[:pos] + '.plain'
    if file_exists(plain_path) and nop_if_exists:
        return

    lines = [x.strip(' \r\n') for x in load_file_contents(sgm_path)]
    sentences = [
        re.sub(r'</?.*?>', '', x)
        for x in filter(lambda x: x.startswith('<seg'), lines)
    ]
    write_iterable_contents(plain_path, sentences)
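
# A minimal, self-contained sketch of the <seg> extraction step used by
# sgm2plain above; it touches no files, only needs the standard library, and
# the sample lines are made up for illustration.
def _example_seg_strip():
    import re
    sample = [
        '<doc docid="example">',
        '<seg id="1">hello world .</seg>',
        '<seg id="2">a second sentence .</seg>',
        '</doc>',
    ]
    # keep only the <seg> lines and strip every tag with the same non-greedy pattern
    plain = [re.sub(r'</?.*?>', '', x) for x in sample if x.startswith('<seg')]
    assert plain == ['hello world .', 'a second sentence .']
    return plain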
def main(args):
    source, target, output = args.source, args.target, args.output_path
    assert all([file_exists(x) for x in [source, target]])

    src_contents = load_file_contents(source)
    tgt_contents = load_file_contents(target)
    src_cnt, tgt_cnt = len(src_contents), len(tgt_contents)
    assert src_cnt <= tgt_cnt
    it_print('source {} lines, target {} lines'.format(src_cnt, tgt_cnt))

    if output is None:
        output = 'mapping.json'

    src_mapping = OrderedDict({k + 1: src_contents[k] for k in range(src_cnt)})
    tgt_mapping = {k + 1: tgt_contents[k] for k in range(tgt_cnt)}

    mapping = OrderedDict()
    with tqdm(total=src_cnt, disable=not args.verbose) as pb:
        src_keys = list(sorted(src_mapping.keys()))
        for key in src_keys:
            value = src_mapping[key]
            # find the first target line identical to the current source line
            matched = None
            for sub in tgt_mapping:
                if value == tgt_mapping[sub]:
                    matched = sub
                    break
            if matched is not None:
                mapping[key] = matched
                it_print('{} -> {}'.format(key, matched))
                # drop matched entries so they cannot be reused by later lines
                src_mapping.pop(key)
                tgt_mapping.pop(matched)
            pb.update(1)

    write_file_contents(output, to_json(mapping, indent=2))
    write_file_contents('source.left.json', to_json(src_mapping, indent=2))
    write_file_contents('target.left.json', to_json(tgt_mapping, indent=2))
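
# A self-contained sketch of the exact-match alignment performed by main above,
# run on toy in-memory data (no files; the sentences are made up). Matched
# pairs end up in `mapping` while unmatched lines are left behind.
def _example_alignment():
    from collections import OrderedDict
    src = OrderedDict([(1, 'a'), (2, 'b'), (3, 'c')])
    tgt = {1: 'c', 2: 'a', 3: 'b', 4: 'd'}
    mapping = OrderedDict()
    for key in list(src):
        matched = next((k for k, v in tgt.items() if v == src[key]), None)
        if matched is not None:
            mapping[key] = matched
            src.pop(key)
            tgt.pop(matched)
    # mapping == {1: 2, 2: 3, 3: 1}; tgt keeps the unmatched line {4: 'd'}
    assert mapping == OrderedDict([(1, 2), (2, 3), (3, 1)])
    return mapping, src, tgt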
def check_file_paths(file_paths):
    line_cnt = None
    for file_path in file_paths:
        if not file_exists(file_path):
            it_print('file path [{}] does not exist.'.format(file_path))
            exit(0)
        cnt = file_lines(file_path)
        if line_cnt is None:
            line_cnt = cnt
        elif line_cnt != cnt:
            it_print('file lines mismatch: {} => {}.'.format(line_cnt, cnt))
            exit(0)
    return line_cnt
def main(args):
    ext = '.dup'
    if args.corpus:
        source, target = args.source, args.target
        src_dup_path, tgt_dup_path = source + ext, target + ext
    else:
        src_dup_path, tgt_dup_path = args.source, args.target
        assert src_dup_path.endswith(ext) and tgt_dup_path.endswith(ext)
        # strip only the trailing '.dup' suffix to recover the corpus paths
        source = src_dup_path[:-len(ext)]
        target = tgt_dup_path[:-len(ext)]

    if not all([file_exists(x) for x in [src_dup_path, tgt_dup_path]]):
        raise ValueError('.dup files do not exist, try using tok_stat.py with '
                         'the -o option to generate them')

    co_duplicated = OrderedDict()
    src_duplicated = parse_duplicated(src_dup_path, 'source')
    tgt_duplicated = parse_duplicated(tgt_dup_path, 'target')
    analyze(co_duplicated, src_duplicated, tgt_duplicated)

    if args.remove:
        execute(source, target, co_duplicated, verbose=args.verbose)

    contents = to_json(co_duplicated, indent=2)
    write_file_contents(args.output, contents)
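
# parse_duplicated/analyze/execute are project helpers not shown here. As a
# rough, standard-library illustration only (not the actual implementation),
# duplicate detection over a list of corpus lines could look like this:
def _example_find_duplicates(lines):
    from collections import OrderedDict
    seen, duplicated = {}, OrderedDict()
    for no, line in enumerate(lines, start=1):
        if line in seen:
            # map the first occurrence to the later line numbers repeating it
            duplicated.setdefault(seen[line], []).append(no)
        else:
            seen[line] = no
    return duplicated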
data_path = './data'
create_if_not_exists(data_path)
cache_path = './cache'
create_if_not_exists(cache_path)

index_path = concat_path(data_path, 'index.json')
paper_cache_path = concat_path(cache_path, 'papers.json')
query_cache_path = concat_path(cache_path, 'queries.json')
cache_size = 80

dblp_data_path = concat_path(data_path, 'dblp')
create_if_not_exists(dblp_data_path)

# initialize and load index
if not file_exists(index_path):
    data = {'version': 0.1}
    write_json_contents(index_path, data)
index = parse_json(load_file_contents(index_path, pieces=False))
index = OrderedDict(index)

# initialize and load paper_cache
if not file_exists(paper_cache_path):
    it_print('building papers cache ...')
    data = {'version': 0.1, 'build_time': current_datetime(), 'values': {}}
    # build cache
    for key, value in index.items():
        if not isinstance(value, string_types):
            continue
        kwargs = {'source': None}