Example #1
def execute(source, target, co_duplicated, ext='.dedup', verbose=False):
    """Filter the duplicated lines out of a parallel source/target file pair."""
    # linenos is consumed here; a set gives O(1) membership tests below
    duplicated = set(co_duplicated.pop('linenos'))
    src_contents = load_file_contents(source, strip=False)
    tgt_contents = load_file_contents(target, strip=False)
    total = len(src_contents)
    assert total == len(tgt_contents)

    src_lines, tgt_lines = [], []
    iterator = zip(src_contents, tgt_contents)
    for lineno, (src, tgt) in enumerate(iterator, start=1):
        if verbose and lineno % 10000 == 0:
            it_print('processed {}'.format(lineno))
        if lineno in duplicated:
            duplicated.remove(lineno)
            continue
        # keep this non-duplicated line pair
        src_lines.append(src)
        tgt_lines.append(tgt)

    count = len(src_lines)
    assert count == len(tgt_lines)

    it_print('total {} lines, after filtering {} left'.format(total, count))
    write_file_contents(source + ext, ''.join(src_lines))
    write_file_contents(target + ext, ''.join(tgt_lines))
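A minimal usage sketch; the file names and the duplicate group below are hypothetical, and the group has the shape produced by parse_duplicated in Example #5:

# hypothetical: drop lines 3 and 7 from a parallel file pair; this writes
# train.src.dedup and train.tgt.dedup next to the originals
co_dup = {'subject': 'some repeated sentence', 'linenos': [3, 7]}
execute('train.src', 'train.tgt', co_dup, verbose=True)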
Example #2
from collections import OrderedDict

from tqdm import tqdm


def main(args):
    source, target, output = args.source, args.target, args.output_path
    assert all([file_exists(x) for x in [source, target]])

    src_contents = load_file_contents(source)
    tgt_contents = load_file_contents(target)
    src_cnt, tgt_cnt = len(src_contents), len(tgt_contents)
    assert src_cnt <= tgt_cnt
    it_print('source {} lines, target {} lines'.format(src_cnt, tgt_cnt))

    if output is None:
        output = 'mapping.json'

    # 1-based line number -> line content; pass pairs so the ordering is kept
    # even on Python versions where plain dicts are unordered
    src_mapping = OrderedDict((k + 1, src_contents[k]) for k in range(src_cnt))
    tgt_mapping = {k + 1: tgt_contents[k] for k in range(tgt_cnt)}

    mapping = OrderedDict()
    with tqdm(total=src_cnt, disable=not args.verbose) as pb:
        src_keys = list(sorted(src_mapping.keys()))
        for key in src_keys:
            match, value = None, src_mapping[key]
            for sub in tgt_mapping:
                if value == tgt_mapping[sub]:
                    # record the first target line with identical content
                    mapping[key] = match = sub
                    break
            if match is not None:
                it_print('{} -> {}'.format(key, match))
                src_mapping.pop(key)
                tgt_mapping.pop(match)
            pb.update(1)

    write_file_contents(output, to_json(mapping, indent=2))
    write_file_contents('source.left.json', to_json(src_mapping, indent=2))
    write_file_contents('target.left.json', to_json(tgt_mapping, indent=2))
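A hedged sketch of the command-line wiring main expects; the flag names are inferred from the attributes it reads (source, target, output_path, verbose) and may differ from the real script:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--source', required=True)
parser.add_argument('--target', required=True)
parser.add_argument('--output-path', default=None)  # read as args.output_path
parser.add_argument('--verbose', action='store_true')
main(parser.parse_args())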
Example #3
import re


def sgm2plain(sgm_path, plain_path=None, nop_if_exists=True):
    """
    Convert an .sgm file to a plain text file whose contents are the sentences
    inside the <seg> tags

    :param sgm_path: sgm file path
    :type sgm_path: str

    :param plain_path: target plain text file; if None, defaults to sgm_path
    with '.sgm' replaced by '.plain' (optional)
    :type plain_path: str or None

    :param nop_if_exists: if True and the target plain file already exists, the
    function does nothing; otherwise it overwrites the existing file
    :type nop_if_exists: bool
    """
    sgm_suffix = '.sgm'
    if not sgm_path.endswith(sgm_suffix):
        raise ValueError('sgm_path must end with {}'.format(sgm_suffix))

    if not file_exists(sgm_path):
        raise ValueError('sgm_path {} does not exist'.format(sgm_path))

    if plain_path is None:
        pos = sgm_path.rfind(sgm_suffix)
        plain_path = sgm_path[:pos] + '.plain'

    if file_exists(plain_path) and nop_if_exists:
        return

    lines = [x.strip(' \r\n') for x in load_file_contents(sgm_path)]
    sentences = [
        re.sub(r'</?.*?>', '', x)
        for x in filter(lambda x: x.startswith('<seg'), lines)
    ]
    write_iterable_contents(plain_path, sentences)
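A usage sketch with hypothetical file names:

# writes newstest.plain next to the input; no-op if it already exists
sgm2plain('newstest.sgm')
# explicit target path, overwriting any existing file
sgm2plain('newstest.sgm', plain_path='newstest.txt', nop_if_exists=False)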
Example #4
def retrieve_paper_titles(data_path, **kwargs):
    source = kwargs.pop('source', None)
    # only the dblp source is supported; source != 'dblp' already covers None
    if source != 'dblp':
        return

    contents = load_file_contents(data_path, pieces=False)
    try:
        # raw bytes are decoded; a plain str raises AttributeError and is kept
        contents = contents.decode('utf-8')
    except (UnicodeDecodeError, AttributeError):
        pass

    # dblp_paper_regex is assumed to be a module-level compiled pattern
    return dblp_paper_regex.findall(contents)
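A usage sketch; the path is hypothetical, and any source other than 'dblp' makes the function return None:

titles = retrieve_paper_titles('data/dblp/2019.html', source='dblp')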
Example #5
from collections import OrderedDict


def parse_duplicated(dup_path, prefix=''):
    duplicated = OrderedDict()
    dup_lines = load_file_contents(dup_path)
    for line in dup_lines:
        # `python2` is assumed to be a module-level flag: on Python 2,
        # str.split does not accept maxsplit as a keyword argument
        if python2:
            lineno, dno, subject = line.split(' ', 2)
        else:
            lineno, dno, subject = line.split(maxsplit=2)
        lineno = int(lineno)
        if dno in duplicated:
            duplicated[dno]['linenos'].append(lineno)
        else:
            duplicated[dno] = {
                'subject': subject,
                'linenos': [lineno],
            }

    # `message` is assumed to be a module-level format string taking the
    # prefix and the group count
    it_print(message.format(prefix, len(duplicated)))
    return duplicated
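The expected layout of dup_path, inferred from the split above, is one record per line: a line number, a duplicate-group id, then the subject text. A hypothetical input:

12 g0 the quick brown fox
97 g0 the quick brown fox
45 g1 hello world

Groups sharing an id collect all of their line numbers under one entry, which is the co_duplicated argument consumed by execute in Example #1.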
Example #6
def count_file_tokens(file_path):
    """Count tokens and unique lines, and record repeated lines."""
    contents = load_file_contents(file_path, strip=True)
    count, vocab = 0, set()             # token total and unique tokens
    total, seqs = len(contents), set()  # line total and unique lines
    seen, repeated = dict(), list()     # first-seen linenos and repeats
    for lineno, line in enumerate(contents, start=1):
        if line == '':
            continue

        if line in seen:
            repeated.append((lineno, seen[line], line))
        else:
            seen[line] = lineno
        seqs.add(line)
        tokens = line.split()
        count += len(tokens)
        vocab.update(tokens)
    return (count, vocab), (total, seqs), repeated
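A usage sketch unpacking the nested return value; the file name is hypothetical:

(count, vocab), (total, seqs), repeated = count_file_tokens('corpus.txt')
it_print('{} tokens, {} unique'.format(count, len(vocab)))
it_print('{} lines, {} unique'.format(total, len(seqs)))
for lineno, first, line in repeated:
    it_print('line {} repeats line {}'.format(lineno, first))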
Example #7
from collections import OrderedDict

# string_types is assumed to come from six; the project may define its own
from six import string_types

# `data_path` is assumed to be defined earlier in the script
cache_path = './cache'
create_if_not_exists(cache_path)

index_path = concat_path(data_path, 'index.json')
paper_cache_path = concat_path(cache_path, 'papers.json')
query_cache_path = concat_path(cache_path, 'queries.json')
cache_size = 80

dblp_data_path = concat_path(data_path, 'dblp')
create_if_not_exists(dblp_data_path)

# initialize and load index
if not file_exists(index_path):
    data = {'version': 0.1}
    write_json_contents(index_path, data)
index = parse_json(load_file_contents(index_path, pieces=False))
index = OrderedDict(index)

# initialize and load paper_cache
if not file_exists(paper_cache_path):
    it_print('building papers cache ...')
    data = {'version': 0.1, 'build_time': current_datetime(), 'values': {}}

    # build cache
    for key, value in index.items():
        if not isinstance(value, string_types):
            continue
        kwargs = {'source': None}
        if value.startswith(dblp_data_path):
            kwargs['source'] = 'dblp'
        data['values'][key] = retrieve_paper_titles(value, **kwargs)