Example #1
import logging
import os
from collections import Counter

# notempty, openall, headtail, parse_file, fix_invalid_lines and add_wsafter
# are assumed to be importable from the surrounding project's helper modules.


def process_file(filename, input_dir, output_dir):
    """
    Fixes invalid lines in a corpus file, adds the wsafter field if it is
    missing from the header, and returns the per-file statistics.
    """
    input_file = os.path.join(input_dir, filename)
    output_file = os.path.join(output_dir, filename)
    logging.info('Processing file {}...'.format(filename))

    stats = Counter()

    with notempty(openall(output_file, 'wt')) as outf:
        header, it = headtail(parse_file(input_file, True))
        num_fields = len(header)
        do_wsafter = 'wsafter' not in header
        if do_wsafter:
            header.insert(1, 'wsafter')
            logging.debug('Adding the wsafter field...')
        print('\t'.join(header), file=outf)
        for document in it:
            stats['documents'] += 1
            try:
                stats['token_errors'] += fix_invalid_lines(document, num_fields)
                if do_wsafter:
                    add_wsafter(document)
            except ValueError:
                logging.exception(f'Error in file {input_file}')
                raise
            print(document, file=outf)
    return stats
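
A minimal driver sketch for the function above; the multiprocessing fan-out and
the way the returned Counter objects are merged are illustrative assumptions,
not part of the original code.

import os
from collections import Counter
from functools import partial
from multiprocessing import Pool

def process_directory(input_dir, output_dir, processes=4):
    # Hypothetical driver: run process_file on every file in input_dir in a
    # process pool and merge the per-file statistics it returns.
    os.makedirs(output_dir, exist_ok=True)
    files = sorted(os.listdir(input_dir))
    totals = Counter()
    with Pool(processes) as pool:
        worker = partial(process_file, input_dir=input_dir,
                         output_dir=output_dir)
        for stats in pool.imap_unordered(worker, files):
            totals.update(stats)
    return totals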
Example #2
import logging
import os
import random
import sys
from multiprocessing import Lock, Queue, Value
from queue import Empty
from typing import List

# notempty and openall are assumed to be importable from the surrounding
# project's helper modules; the Queue, Value and Lock annotations refer to
# the multiprocessing primitives created by the parent process.


def consumer(output_files: List[str], queue: Queue, header: str,
             documents: int, num_readers: Value, lock: Lock) -> int:
    """
    Reads :class:`Document`s from the shared queue and writes them to one of
    the output files at random.

    :param output_files: list of output file names.
    :param queue: the queue shared with all processes.
    :param header: the header of the tsv files. Written to all output files.
    :param documents: the number of documents to write to an output file.
    :param num_readers: a shared variable holding the number of readers that
                        are still active. This function exits if two conditions
                        are met: the queue is empty and *num_readers* is 0.
    :param lock: a lock that regulates access to *num_readers*.
    :returns: the number of documents written.
    """
    logging.info(f'Consumer started with {len(output_files)} files.')
    output_names = [os.path.basename(f) for f in output_files]
    outfs = [notempty(openall(f, 'wt')) for f in output_files]
    written = [0 for _ in outfs]
    docs_written = 0

    # Write the header
    for outf in outfs:
        print(header, file=outf)

    while outfs:
        i = random.randint(0, len(outfs) - 1)
        try:
            print(queue.get(timeout=5), file=outfs[i])
            written[i] += 1
            docs_written += 1
            if docs_written % 1000 == 0:
                logging.debug(
                    f'Consumer has written {docs_written} documents.')
            if written[i] == documents:
                logging.info(f'Written {documents} documents to '
                             f'{output_names[i]}; closing...')
                outfs[i].close()
                del outfs[i]
                del written[i]
                del output_names[i]
        except Empty:
            with lock:
                if num_readers.value == 0:
                    logging.info('Timeout waiting for queue; cleaning up...')
                    break
        except Exception:
            logging.exception(f'Exception writing {output_names[i]}!')
            sys.exit(3)

    # Close any dangling output files
    for i in range(len(outfs)):
        logging.info(f'Written {written[i]} documents to '
                     f'{output_names[i]}; closing...')
        outfs[i].close()

    logging.info(f'Consumer finished; written {docs_written} documents.')
    return docs_written
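
A sketch of how the consumer might be wired up with its producers; the
read_into_queue producer and the queue size are assumptions made for the
example, not part of the original code.

from multiprocessing import Lock, Process, Queue, Value

def run_pipeline(input_files, output_files, header, docs_per_file):
    # Hypothetical driver: one reader process per input file feeds the shared
    # queue; read_into_queue is assumed to decrement num_readers (under the
    # lock) when it runs out of documents, which is what lets consumer() exit.
    queue = Queue(maxsize=1000)
    num_readers = Value('i', len(input_files))
    lock = Lock()
    readers = [Process(target=read_into_queue,
                       args=(infile, queue, num_readers, lock))
               for infile in input_files]
    for reader in readers:
        reader.start()
    written = consumer(output_files, queue, header, docs_per_file,
                       num_readers, lock)
    for reader in readers:
        reader.join()
    return written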
Example #3
import logging
import os
from collections import Counter

# parse_file, each_doc, the filter_* and retain_urls functions, notempty and
# openall, as well as the module-level urls_to_drop / urls_to_keep lists, are
# assumed to be defined elsewhere in the surrounding project.


def process_file(filename,
                 input_dir,
                 output_dir,
                 languages,
                 language_unit,
                 min_len_str,
                 keep_urls=None,
                 drop_urls=None):
    """
    Filters a corpus file by language, document length and URL lists, and
    returns the per-file statistics.
    """
    input_file = os.path.join(input_dir, filename)
    output_file = os.path.join(output_dir, filename)
    logging.info('Processing file {}...'.format(filename))

    stats = Counter()
    it = parse_file(input_file, True, True, True)
    it = each_doc(it, stats)
    if languages:
        if language_unit == 'doc':
            it = filter_languages_doc(it, languages, stats)
        else:
            it = filter_languages_p(it, languages, stats)
    if min_len_str:
        it = filter_length(it, min_len_str, stats)
    if drop_urls:
        # Get the right list: from the Manager or the local one
        url_list = drop_urls if drop_urls.__class__.__name__ == 'DictProxy' \
                             else urls_to_drop  # noqa
        it = filter_urls(it, url_list, stats)
    if keep_urls:
        # Get the right list: from the Manager or the local one
        url_list = keep_urls if keep_urls.__class__.__name__ == 'DictProxy' \
                             else urls_to_keep  # noqa
        it = retain_urls(it, url_list, stats)
    try:
        with notempty(openall(output_file, 'wt')) as outf:
            for doc in it:
                print(doc, file=outf)
    except Exception:
        logging.exception('Error processing file {}.'.format(filename))
    logging.info('Finished processing file {}...'.format(filename))
    return stats
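
Each filter in the chain above takes the document iterator plus the shared
stats Counter and returns a new iterable of the surviving documents. The stage
below only illustrates that shape (a hypothetical drop_short_documents, not
one of the project's real filters).

def drop_short_documents(it, min_words, stats):
    # Illustrative pipeline stage: keep documents with at least min_words
    # whitespace-separated tokens and count the ones that are dropped.
    for doc in it:
        if len(str(doc).split()) >= min_words:
            yield doc
        else:
            stats['too_short'] += 1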
Example #4
import logging
import os
import os.path as op

# read_batch, parse_file, notempty and openall are assumed to be importable
# from the surrounding project's helper modules.


def deduplicate_batch_documents(batch_prefix,
                                output_dir,
                                input_dir=None,
                                ignore_missing_files=False):
    """
    Filters documents not present in the batch and writes the filtered corpus
    files to output_dir. As above, input_dir can be specified if the location
    information in the batch files is outdated.

    Empty files will not be written.
    """
    batch_base = op.basename(batch_prefix)
    logging.info('Filtering batch {}...'.format(batch_base))

    kept, total = 0, 0
    num_files = 0
    for input_file, results in read_batch(batch_prefix):
        file_base = op.basename(input_file)
        url_set = set('_'.join(doc_id) for doc_id in results['id'])
        input_file = op.join(input_dir, file_base) if input_dir else input_file
        if os.path.isfile(input_file):
            with notempty(openall(op.join(output_dir, file_base),
                                  'wt')) as outf:
                doc_no = 0  # so `total += doc_no` below is safe for empty files
                for doc_no, doc in enumerate(parse_file(input_file), start=1):
                    if doc.attrs['url'] in url_set:
                        print(doc, file=outf)
                        kept += 1
                total += doc_no
            num_files += 1
        elif ignore_missing_files:
            logging.debug(
                'Input file {} was not found; ignoring...'.format(input_file))
        else:
            raise FileNotFoundError(
                'Input file {} not found.'.format(input_file))

    logging.info('Filtered batch {} of {} files; '
                 'kept {} documents out of {}.'.format(batch_base, num_files,
                                                       kept, total))
    return kept, total
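
A driver sketch for running the batch filter over a whole directory of batches;
the glob-based discovery of batch prefixes (assuming each batch is written as
<prefix>.* files) is an assumption made for the example, not something the
original code prescribes.

import glob
import os.path as op

def deduplicate_all_batches(batch_dir, output_dir, input_dir=None):
    # Hypothetical driver: treat every distinct file stem in batch_dir as a
    # batch prefix and filter the corpus files listed in each batch.
    prefixes = sorted({op.splitext(f)[0]
                       for f in glob.glob(op.join(batch_dir, '*'))})
    kept_sum = total_sum = 0
    for prefix in prefixes:
        kept, total = deduplicate_batch_documents(
            prefix, output_dir, input_dir=input_dir, ignore_missing_files=True)
        kept_sum += kept
        total_sum += total
    return kept_sum, total_sum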

# IndexRecord, FilterStats, UrlFn, openall and notempty are assumed to be
# defined elsewhere in the surrounding project.


def filter_file(input_file, output_file, uniqs, url_fn: UrlFn) -> FilterStats:
    """
    Filters an index file; i.e. drops all duplicate URLs.
    :param input_file: the input index file
    :param output_file: the output index file
    :param uniqs: the shared dictionary of unique URLs
    :param url_fn: the URL transformation function to apply to each URL. In the
                   scope of this program, this is either hashing or nothing.
    """
    logging.info('Filtering file {}...'.format(input_file))
    stats = FilterStats(old_files=1)
    with openall(input_file, 'rt') as inf, notempty(openall(output_file,
                                                            'wt')) as outf:
        line_no = lines_printed = 0
        for line_no, line in enumerate(map(str.strip, inf), start=1):
            try:
                # The slicing tolerates both six- and seven-field index lines
                # and extracts the URL, WARC file, offset and length columns.
                url, warc, offset, length = line.split()[:7][-6:-2]
                record = IndexRecord(warc, offset, length)
                if record == uniqs.get(url_fn(url)):
                    print(line, file=outf)
                    lines_printed += 1
            except Exception:
                logging.exception('Exception in file {}:{}'.format(
                    input_file, line_no))

    if line_no:
        logging.info('Kept {} URLs out of {} in {}.'.format(
            lines_printed, line_no, input_file))
        stats.old_urls = line_no
    else:
        logging.info('File {} was empty.'.format(input_file))

    if lines_printed:
        stats.new_files = 1
        stats.new_urls = lines_printed
    return stats
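
A usage sketch for filter_file; the hashing url_fn and the assumption that
uniqs has already been filled by the preceding deduplication step (one
IndexRecord per unique URL) are illustrative, not taken from the original.

import hashlib

def hash_url(url):
    # Example url_fn: hash the URL so the shared uniqs dictionary holds short,
    # fixed-size keys instead of full URLs.
    return hashlib.sha1(url.encode('utf-8')).hexdigest()

def filter_index_files(file_pairs, uniqs):
    # Hypothetical driver: file_pairs is a list of (input_file, output_file)
    # tuples; collect the FilterStats returned for each file.
    all_stats = []
    for input_file, output_file in file_pairs:
        all_stats.append(filter_file(input_file, output_file, uniqs, hash_url))
    return all_stats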