Example #1
def other_main(args):
    """The "real" main function of the "other" mode."""
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    batch_prefixes = find_all_batches(args.input_dir)
    logging.info('Found a total of {} batches in {}.'.format(
        len(batch_prefixes), args.input_dir))

    batches_to_subtract = find_all_batches(args.cross_dir)
    logging.info(
        'Found a total of {} batches in {} to deduplicate against.'.format(
            len(batches_to_subtract), args.cross_dir))

    with ProcessPoolExecutor(max_workers=args.processes) as executor:
        f = partial(deduplicate_other,
                    batches_to_subtract=batches_to_subtract,
                    output_dir=args.output_dir,
                    threshold=args.threshold,
                    permutations=args.permutations)
        original_doc_num, final_doc_num = 0, 0
        for new_num, old_num in executor.map(f, batch_prefixes):
            original_doc_num += old_num
            final_doc_num += new_num

    logging.info('Cross deduplication done; in all, kept '
                 '{} documents out of {}.'.format(final_doc_num,
                                                  original_doc_num))
Example #2
def self_main(args):
    """The "real" main function of the "self" mode."""
    working_dir = op.join(args.output_dir, 'self')
    if not os.path.isdir(working_dir):
        os.makedirs(working_dir)

    batch_prefixes = find_all_batches(args.input_dir)
    logging.info('Found a total of {} batches in {}.'.format(
        len(batch_prefixes), args.input_dir))

    # First, deduplicate documents _within_ the same batch
    original_doc_num, self_doc_num, final_doc_num = 0, 0, 0
    with Pool(args.processes) as pool:
        f = partial(deduplicate_self,
                    output_dir=working_dir,
                    threshold=args.threshold,
                    permutations=args.permutations)
        for new_num, old_num in pool.map(f, batch_prefixes):
            original_doc_num += old_num
            self_doc_num += new_num
        pool.close()
        pool.join()

    logging.info('Self deduplication done; in all, kept '
                 '{} documents out of {}.'.format(self_doc_num,
                                                  original_doc_num))

    # Now, we need to do the deduplication between batches. The idea here is
    # to load one batch into memory, and delete all documents from it that are
    # also present in any of the other batches (more precisely, we only need to
    # do the upper triangle matrix: batch b_i is deduplicated with batches b_j,
    # where j > i).
    # At this point, we do all work in output_dir.
    # Note that the last batch has nothing left to subtract, so it only goes
    # through this round so that its documents are counted in final_doc_num.
    batch_prefixes = find_all_batches(working_dir)
    batches_to_subtract = [
        find_all_batches(working_dir, int(op.basename(file_prefix)))
        for file_prefix in batch_prefixes
    ]

    with ProcessPoolExecutor(max_workers=args.processes) as executor:
        f = partial(deduplicate_other,
                    output_dir=args.output_dir,
                    threshold=args.threshold,
                    permutations=args.permutations)
        final_doc_num = sum(
            num
            for num, _ in executor.map(f, batch_prefixes, batches_to_subtract))

    logging.info('Full deduplication done; in all, kept '
                 '{} documents out of {}.'.format(final_doc_num,
                                                  original_doc_num))

    # Let's delete the intermediate directory.
    shutil.rmtree(working_dir)
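The comment in the middle of self_main describes the upper-triangle scheme: after the in-batch pass, batch b_i only has to be deduplicated against batches b_j with j > i, and the last batch has nothing to subtract at all. The snippet below illustrates how the batches_to_subtract lists pair up with the batch prefixes; it assumes, as the call above suggests, that find_all_batches(dir, n) returns only the batches numbered higher than n.

# Illustration only: numbered batch prefixes and their "upper triangle".
batch_prefixes = ['work/00001', 'work/00002', 'work/00003', 'work/00004']

def batch_number(prefix):
    return int(prefix.rsplit('/', 1)[-1])

batches_to_subtract = [
    [other for other in batch_prefixes
     if batch_number(other) > batch_number(prefix)]
    for prefix in batch_prefixes
]

for prefix, others in zip(batch_prefixes, batches_to_subtract):
    print(prefix, '->', others)
# 00001 is matched against 00002, 00003 and 00004; 00004 against nothing,
# so it only passes through to have its documents counted.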
Example #3
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')
    os.nice(20)

    if not op.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    input_batches = [
        batch_prefix for input_dir in args.input_dirs
        for batch_prefix in find_all_batches(input_dir)
    ]

    logging.info('Found a total of {} input batches.'.format(
        len(input_batches)))
    logging.info('Writing files to {}...'.format(args.output_dir))

    batch_size = args.batch_size if not args.keep_sizes else sys.maxsize
    with closing(BatchWriter(batch_size, args.output_dir, args.zeroes)) as bw:
        for input_batch in input_batches:
            if not args.keep_sizes:
                logging.info('Reading batch {}...'.format(input_batch))
                for input_file, results in read_batch(input_batch):
                    bw.write_results(input_file, results)
            else:
                logging.info('Copying batch {}...'.format(input_batch))
                bw.copy_file(input_batch)

    logging.info('Done; renumbered {} documents.'.format(bw.total_written))
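BatchWriter and read_batch come from the surrounding code base and are not shown here. Judging from the calls above, the writer takes a maximum batch size, an output directory and the number of digits used for zero-padded batch names, exposes write_results, copy_file and a total_written counter, and needs an explicit close(), which is why it is wrapped in contextlib.closing. The class below is a deliberately tiny, hypothetical stand-in for orientation only, not the real implementation.

from contextlib import closing


class TinyWriter:
    """Hypothetical, heavily simplified stand-in for BatchWriter."""

    def __init__(self, batch_size, output_dir, zeroes=4):
        self.batch_size = batch_size
        self.output_dir = output_dir
        self.zeroes = zeroes
        self.total_written = 0

    def write_results(self, input_file, results):
        # The real writer serializes the results into numbered batch files;
        # here we only keep a running document count.
        self.total_written += len(results.get('id', []))

    def close(self):
        # Flushing and closing the last open batch would happen here.
        print('Closed after {} documents.'.format(self.total_written))


with closing(TinyWriter(1000, 'output')) as bw:
    bw.write_results('input_file.gz', {'id': [('doc', '1'), ('doc', '2')]})
print(bw.total_written)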
Example #4
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')

    os.nice(20)
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    batch_prefixes = find_all_batches(args.minhash_dir)
    logging.info('Found a total of {} batches.'.format(len(batch_prefixes)))

    with Pool(args.processes) as pool:
        f = partial(deduplicate_batch_documents,
                    output_dir=args.output_dir,
                    input_dir=args.input_dir,
                    ignore_missing_files=args.ignore_missing_files)
        kept, total = 0, 0
        for batch_kept, batch_total in pool.imap(f, batch_prefixes):
            kept += batch_kept
            total += batch_total
        pool.close()
        pool.join()
    logging.info('Done.')

    logging.info('Kept {} documents out of {} in total'.format(kept, total))
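parse_arguments() is not shown in any of these examples; from the attribute accesses above it has to provide at least input_dir, minhash_dir, output_dir, processes, ignore_missing_files and log_level. The argparse sketch below is only an assumption about what it could look like; the option spellings are guesses, and only the destination names come from the code.

import argparse


def parse_arguments():
    parser = argparse.ArgumentParser(
        description='Deduplicates documents based on their minhashes.')
    # Option names below are assumptions; only the resulting attribute
    # names (args.input_dir, args.minhash_dir, ...) appear in the code.
    parser.add_argument('--input-dir', required=True)
    parser.add_argument('--minhash-dir', required=True)
    parser.add_argument('--output-dir', required=True)
    parser.add_argument('--processes', type=int, default=1)
    parser.add_argument('--ignore-missing-files', action='store_true')
    parser.add_argument('--log-level', default='info',
                        choices=['debug', 'info', 'warning',
                                 'error', 'critical'])
    return parser.parse_args()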
Example #5
def deduplicate_other_old(file_prefix, input_dir, output_dir, threshold,
                          permutations):
    """
    Removes all documents from a set of minhashed documents (3 files with the
    same minhash prefix) that occur in other batches in input_dir. Only
    batches whose number is higher than the batch in question are considered
    (i.e. upper triangular matrix).

    Warning: only works for full documents at this point!
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    file_base = op.basename(file_prefix)
    logging.info('Processing batch {}...'.format(file_base))

    # First, load the (already deduplicated) batch...
    for input_file, results in read_batch(file_prefix):
        for doc_id, minhash in zip(results['id'], results['minhash']):
            lsh.insert('\t'.join(doc_id), minhash)

    initial_len = len(lsh.keys)
    to_match_with = find_all_batches(input_dir, int(file_base))

    # Now, remove all documents in it that are contained in other batches
    # to the "right" of it (with greater batch numbers)
    for batch in to_match_with:
        initial_batch_len = len(lsh.keys)
        for _, results in read_batch(batch):
            for minhash in results['minhash']:
                for duplicate in lsh.query(minhash):
                    lsh.remove(duplicate)
        logging.info(
            'Cross-deduplicated batch {} with batch {}: {} -> {} documents.'.
            format(file_base, op.basename(batch), initial_batch_len,
                   len(lsh.keys)))

    # Finally, we print the documents left. Unfortunately, in order to
    # keep the format, we have to read the original batch again.
    with closing(
            BatchWriter(sys.maxsize, output_dir, len(file_base),
                        int(file_base))) as bw:
        # OK, we need to re-read the batch unfortunately
        for input_file, results in read_batch(file_prefix):
            doc_ids, minhashes = [], []
            for doc_id, minhash in zip(results['id'], results['minhash']):
                if '\t'.join(doc_id) in lsh:
                    doc_ids.append(doc_id)
                    minhashes.append(minhash)
            bw.write_results(input_file, {'id': doc_ids, 'minhash': minhashes})
    logging.info('Processed batch {}; kept {} out of {} documents.'.format(
        file_base, len(lsh.keys), initial_len))
    return len(lsh.keys), initial_len
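deduplicate_other_old builds on the MinHashLSH index from the datasketch library: documents are inserted under a string key, near-duplicates above the Jaccard threshold are retrieved with query(), dropped with remove(), membership is tested with the in operator, and the surviving keys live in lsh.keys. A short, self-contained usage example with made-up documents (not from the original corpus) is:

from datasketch import MinHash, MinHashLSH


def minhash_of(text, permutations=128):
    """Builds a MinHash over the whitespace tokens of a string."""
    m = MinHash(num_perm=permutations)
    for token in text.split():
        m.update(token.encode('utf-8'))
    return m


lsh = MinHashLSH(threshold=0.85, num_perm=128)
lsh.insert('doc-1', minhash_of('the quick brown fox jumps over the lazy dog'))
lsh.insert('doc-2', minhash_of('pack my box with five dozen liquor jugs'))

# Query with a near-duplicate of doc-1 and drop every hit from the index,
# the same way the function above prunes documents found in later batches.
query = minhash_of('the quick brown fox jumps over a lazy dog')
for duplicate in lsh.query(query):
    lsh.remove(duplicate)

print('doc-1' in lsh, 'doc-2' in lsh)
print('{} documents left in the index.'.format(len(lsh.keys)))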