def execute(source, target, co_duplicated, ext='.dedup', verbose=False):
    duplicated = co_duplicated.pop('linenos')
    src_contents = load_file_contents(source, strip=False)
    tgt_contents = load_file_contents(target, strip=False)
    total = len(src_contents)
    assert total == len(tgt_contents)
    src_lines, tgt_lines = [], []
    iterator = zip(src_contents, tgt_contents)
    for lineno, (src, tgt) in enumerate(iterator, start=1):
        if verbose and lineno % 10000 == 0:
            it_print('processed {}'.format(lineno))
        if lineno in duplicated:
            duplicated.remove(lineno)
            continue
        # keep this non-duplicated pair; files are written once at the end
        src_lines.append(src)
        tgt_lines.append(tgt)
    count = len(src_lines)
    assert count == len(tgt_lines)
    it_print('total {} lines, {} left after filtering'.format(total, count))
    write_file_contents(source + ext, ''.join(src_lines))
    write_file_contents(target + ext, ''.join(tgt_lines))
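
# A minimal usage sketch, assuming `co_duplicated` is a single entry of the
# dict returned by parse_duplicated below ({'subject': ..., 'linenos': [...]});
# the driver code and file names here are illustrative, not part of this module.
#
#   entry = {'subject': 'some duplicated sentence', 'linenos': [12, 87]}
#   execute('train.src', 'train.tgt', dict(entry), verbose=True)
#   # -> writes train.src.dedup / train.tgt.dedup with lines 12 and 87 dropped
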
def main(args):
    source, target, output = args.source, args.target, args.output_path
    assert all([file_exists(x) for x in [source, target]])
    src_contents = load_file_contents(source)
    tgt_contents = load_file_contents(target)
    src_cnt, tgt_cnt = len(src_contents), len(tgt_contents)
    assert src_cnt <= tgt_cnt
    it_print('source {} lines, target {} lines'.format(src_cnt, tgt_cnt))
    if output is None:
        output = 'mapping.json'
    # line numbers are 1-based to match the files on disk
    src_mapping = OrderedDict({k + 1: src_contents[k] for k in range(src_cnt)})
    tgt_mapping = {k + 1: tgt_contents[k] for k in range(tgt_cnt)}
    mapping = OrderedDict()
    with tqdm(total=src_cnt, disable=not args.verbose) as pb:
        src_keys = list(sorted(src_mapping.keys()))
        for key in src_keys:
            value, matched = src_mapping[key], None
            for sub in tgt_mapping:
                if value == tgt_mapping[sub]:
                    mapping[key] = matched = sub
                    break
            # only pop when a match was actually found; `sub` alone is not a
            # reliable signal, since it keeps the last iterated key on a miss
            if matched is not None:
                it_print('{} -> {}'.format(key, matched))
                src_mapping.pop(key)
                tgt_mapping.pop(matched)
            pb.update(1)
    write_file_contents(output, to_json(mapping, indent=2))
    write_file_contents('source.left.json', to_json(src_mapping, indent=2))
    write_file_contents('target.left.json', to_json(tgt_mapping, indent=2))
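
# Hedged example of how `args` might be constructed; the actual CLI flags are
# an assumption and not defined in this module:
#
#   from argparse import Namespace
#   main(Namespace(source='a.txt', target='b.txt', output_path=None, verbose=True))
#   # -> mapping.json maps each source line number to its matching target line;
#   #    source.left.json / target.left.json hold the lines left unmatched
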
def sgm2plain(sgm_path, plain_path=None, nop_if_exists=True):
    """
    Convert an .sgm file to a plain text file whose contents are the
    sentences inside the <seg> tags.

    :param sgm_path: sgm file path
    :type sgm_path: str
    :param plain_path: target plain text file; if None, defaults to
        sgm_path with its '.sgm' suffix replaced by '.plain' (optional)
    :type plain_path: str or None
    :param nop_if_exists: if the target plain file already exists, do
        nothing; otherwise the function overwrites it
    :type nop_if_exists: bool
    """
    sgm_suffix = '.sgm'
    if not sgm_path.endswith(sgm_suffix):
        raise ValueError('sgm_path must end with {}'.format(sgm_suffix))
    if not file_exists(sgm_path):
        raise ValueError('sgm_path {} does not exist'.format(sgm_path))
    if plain_path is None:
        pos = sgm_path.rfind(sgm_suffix)
        plain_path = sgm_path[:pos] + '.plain'
    if file_exists(plain_path) and nop_if_exists:
        return
    lines = [x.strip(' \r\n') for x in load_file_contents(sgm_path)]
    sentences = [
        re.sub(r'</?.*?>', '', x)
        for x in filter(lambda x: x.startswith('<seg'), lines)
    ]
    write_iterable_contents(plain_path, sentences)
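
# Example (the file name is illustrative): converts a WMT-style SGML test set
# into one sentence per line next to the original file.
#
#   sgm2plain('newstest2014-deen-src.de.sgm')
#   # -> writes newstest2014-deen-src.de.plain unless it already exists
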
def retrieve_paper_titles(data_path, **kwargs):
    source = kwargs.pop('source', None)
    if source is None or source != 'dblp':
        return
    contents = load_file_contents(data_path, pieces=False)
    try:
        contents = contents.decode('utf-8')
    except (UnicodeDecodeError, AttributeError):
        pass
    return dblp_paper_regex.findall(contents)
def parse_duplicated(dup_path, prefix=''):
    duplicated = OrderedDict()
    dup_lines = load_file_contents(dup_path)
    for line in dup_lines:
        if python2:
            lineno, dno, subject = line.split(' ', 2)
        else:
            lineno, dno, subject = line.split(maxsplit=2)
        lineno = int(lineno)
        if dno in duplicated:
            duplicated[dno]['linenos'].append(lineno)
        else:
            duplicated[dno] = {
                'subject': subject,
                'linenos': [lineno],
            }
    it_print(message.format(prefix, len(duplicated)))
    return duplicated
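
# Expected report format, inferred from the split above: each line is
# "<lineno> <duplicate-group-id> <subject text>", e.g.
#
#   12 3 the quick brown fox
#   87 3 the quick brown fox
#
# which yields {'3': {'subject': 'the quick brown fox', 'linenos': [12, 87]}}.
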
def count_file_tokens(file_path):
    contents = load_file_contents(file_path, strip=True)
    count, vocab = 0, set()
    total, seqs = len(contents), set()
    seen, repeated = dict(), list()
    for lineno, line in enumerate(contents, start=1):
        if line == '':
            continue
        if line in seen:
            repeated.append((lineno, seen[line], line))
        else:
            seen[line] = lineno
        seqs.add(line)
        tokens = line.split()
        count += len(tokens)
        for token in tokens:
            vocab.add(token)
    return (count, vocab), (total, seqs), repeated
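
# Usage sketch (the file name is assumed):
#
#   (tokens, vocab), (total, uniq), repeated = count_file_tokens('train.src')
#   it_print('{} tokens, {} unique tokens; {} / {} unique lines, {} repeats'
#            .format(tokens, len(vocab), len(uniq), total, len(repeated)))
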
cache_path = './cache'
create_if_not_exists(cache_path)
index_path = concat_path(data_path, 'index.json')
paper_cache_path = concat_path(cache_path, 'papers.json')
query_cache_path = concat_path(cache_path, 'queries.json')
cache_size = 80
dblp_data_path = concat_path(data_path, 'dblp')
create_if_not_exists(dblp_data_path)

# initialize and load index
if not file_exists(index_path):
    data = {'version': 0.1}
    write_json_contents(index_path, data)
index = parse_json(load_file_contents(index_path, pieces=False))
index = OrderedDict(index)

# initialize and load paper_cache
if not file_exists(paper_cache_path):
    it_print('building papers cache ...')
    data = {'version': 0.1, 'build_time': current_datetime(), 'values': {}}
    # build cache
    for key, value in index.items():
        if not isinstance(value, string_types):
            continue
        kwargs = {'source': None}
        if value.startswith(dblp_data_path):
            kwargs['source'] = 'dblp'
        data['values'][key] = retrieve_paper_titles(value, **kwargs)