예제 #1
0
def main(args):
    """Parse every corpus listed on stdin in parallel and print the timings.

    One path is read per line of ``sys.stdin``; lines starting with ``#``
    are skipped.  Each path is turned into a corpus name with
    ``get_ldc_name`` and ``<workspace>/raw/<name>.gz`` must exist for all of
    them before any work is dispatched.  ``parse_and_save`` is then mapped
    over the names on a pool of ``args.jobs`` workers, and one row per corpus
    (name plus the value returned by ``parse_and_save``, reported under
    ``Time (s)``) is printed.

    Args:
        args: object exposing ``workspace`` and ``jobs``; it is also handed
            to every ``parse_and_save`` call as the ``args`` keyword.

    Raises:
        Exception: if an expected ``raw/<name>.gz`` file is missing.
    """
    files = [path.strip() for path in sys.stdin if not path.startswith('#')]
    ldc_names = [get_ldc_name(path) for path in files]

    # sanity checks: fail before any worker is started
    for ldc_name in ldc_names:
        raw_path = '{0}/raw/{1}.gz'.format(args.workspace, ldc_name)
        if not os.path.exists(raw_path):
            # Exception() does not apply %-formatting to its arguments, so
            # the path has to be interpolated here.
            raise Exception('File not found: %s' % raw_path)

    pool = Pool(args.jobs)
    try:
        logging.info('Distributing %d jobs to %d workers', len(ldc_names),
                     args.jobs)
        t0 = time.time()
        result = pool.map(partial(parse_and_save, args=args), ldc_names)
        dt = time.time() - t0
    finally:
        # map() has either returned or raised by now; release the workers
        # instead of leaving them to interpreter shutdown.
        pool.terminate()
        pool.join()
    logging.info('Total time: %f seconds', dt)

    # materialised so the rows can be walked again by the fallback below
    data = list(zip(ldc_names, result))

    try:
        # prints a Markdown table if possible
        from tabulate import tabulate
        table = tabulate(data, headers=['Corpus', 'Time (s)'],
                         tablefmt='pipe')
    except Exception:
        # plain table otherwise (best effort: tabulate missing or failing)
        table = '\n'.join('{0} {1}'.format(corpus, seconds)
                          for corpus, seconds in data)
    # single-argument print(...) behaves the same on Python 2 and 3
    print(table)
예제 #2
0
def main(args):
    """Run ``badsgml2text`` over every corpus listed on stdin and summarise.

    One path is read per line of ``sys.stdin``; lines starting with ``#``
    are skipped.  Each path is turned into a corpus name with
    ``get_ldc_name`` and ``<workspace>/bsgml_trees/<name>`` must exist for
    all of them before any work is dispatched.  ``badsgml2text`` is then
    mapped over the names on a pool of ``args.jobs`` workers.  If
    ``tabulate`` is installed, a Markdown table with the length, sum and mean
    of each per-corpus result is printed.

    Args:
        args: object exposing ``workspace`` and ``jobs``; it is also handed
            to every ``badsgml2text`` call as the ``args`` keyword.

    Raises:
        Exception: if an expected ``bsgml_trees/<name>`` entry is missing.
    """
    files = [path.strip() for path in sys.stdin if not path.startswith('#')]
    ldc_names = [get_ldc_name(path) for path in files]

    # sanity checks: fail before any worker is started
    for ldc_name in ldc_names:
        tree_path = '{0}/bsgml_trees/{1}'.format(args.workspace, ldc_name)
        if not os.path.exists(tree_path):
            # Exception() does not apply %-formatting to its arguments, so
            # the path has to be interpolated here.
            raise Exception('File not found: %s' % tree_path)

    # distribute jobs
    pool = Pool(args.jobs)
    try:
        logging.info('Distributing %d jobs to %d workers', len(ldc_names),
                     args.jobs)
        t0 = time()
        results = pool.map(partial(badsgml2text, args=args), ldc_names)
        dt = time() - t0
    finally:
        # map() has either returned or raised by now; release the workers
        # instead of leaving them to interpreter shutdown.
        pool.terminate()
        pool.join()
    logging.info('Total time: %f seconds', dt)

    try:
        # prints a Markdown table if possible
        from tabulate import tabulate
    except ImportError:
        logging.info('Consider installing tabulate to get nice summaries.')
    else:
        # NOTE(review): judging by the headers, each result is presumably a
        # sequence of per-document sentence counts -- confirm against
        # badsgml2text.
        rows = [(ldc_name, len(result), sum(result), np.mean(result))
                for ldc_name, result in zip(ldc_names, results)]
        # single-argument print(...) behaves the same on Python 2 and 3
        print(tabulate(rows,
                       headers=['Corpus', 'Documents', 'Total Sentences',
                                'Average Document Length'],
                       tablefmt='pipe'))
예제 #3
0
def extract_and_save_txt(sgml_gz, args):
    """Extracts documents from a gzipped sgml file -> file ids

    The input is read in full and parsed with ``TextFromSGML``.  Every
    document whose text is non-empty is written with ``writedoctext``, split
    on newlines, to ``<args.workspace>/raw/<stem>.gz``, where ``stem`` is
    ``get_ldc_name(sgml_gz)``.

    Args:
        sgml_gz: path to the gzipped SGML input file.
        args: object exposing ``workspace``.

    Returns:
        The list of ids of the documents that were written.

    Raises:
        Exception: on any failure; the message is the formatted traceback of
            the original error.
    """
    try:
        logging.info('Processing %s', sgml_gz)
        stem = get_ldc_name(sgml_gz)
        out_path = '{0}/raw/{1}.gz'.format(args.workspace, stem)
        ids = []
        with gzip.open(sgml_gz, 'rb') as fi:
            with gzip.open(out_path, 'wb') as fo:
                parser = TextFromSGML(fi.read(), text_under='text',
                                      root='sgml')
                for doc in parser.iterdocs():
                    # documents without text are skipped entirely
                    if doc['text']:
                        ids.append(doc['id'])
                        writedoctext(fo, doc['text'].split('\n'),
                                     id=doc['id'])
                logging.info('%s contains %d documents', stem, len(ids))
        return ids
    except:  # noqa: E722 -- deliberate catch-all, see below
        # The failure is re-raised as a plain Exception whose message is the
        # full traceback text.
        # NOTE(review): presumably so the traceback survives the trip from a
        # pool worker back to the parent process -- confirm before narrowing.
        raise Exception(traceback.format_exc())
예제 #4
0
def extract_and_save_sgml(sgml_gz, args):
    """Extracts documents from a gzipped sgml file -> file ids

    The input is read in full and parsed with ``TextFromSGML``.  Every
    document whose text is non-empty is added to a ``MakeSGMLDocs``
    collection, which is finally written with ``writegz`` to
    ``<args.workspace>/raw/<stem>``, where ``stem`` is
    ``get_ldc_name(sgml_gz)``.

    Args:
        sgml_gz: path to the gzipped SGML input file.
        args: object exposing ``workspace``.

    Returns:
        The list of ids of the documents that were added.

    Raises:
        Exception: on any failure; the message is the formatted traceback of
            the original error.
    """
    try:
        logging.info('Processing %s', sgml_gz)
        stem = get_ldc_name(sgml_gz)
        ids = []
        with gzip.open(sgml_gz, 'rb') as fi:
            sgmler = MakeSGMLDocs(file=stem)
            parser = TextFromSGML(fi.read(), text_under='text', root='sgml')
            for doc in parser.iterdocs():
                # documents without text are skipped entirely
                if doc['text']:
                    ids.append(doc['id'])
                    sgmler.add(doc['text'], id=doc['id'])
            sgmler.writegz('{0}/raw/{1}'.format(args.workspace, stem))
            logging.info('%s contains %d documents', stem, len(ids))
        return ids
    except:  # noqa: E722 -- deliberate catch-all, see below
        # The failure is re-raised as a plain Exception whose message is the
        # full traceback text.
        # NOTE(review): presumably so the traceback survives the trip from a
        # pool worker back to the parent process -- confirm before narrowing.
        raise Exception(traceback.format_exc())
예제 #5
0
def main(args):
    """Extract the documents of every file listed on stdin, in parallel.

    One path is read per line of ``sys.stdin``; lines starting with ``#``
    are skipped.  Each file is processed on a pool of ``args.jobs`` workers
    by ``extract_and_save_sgml`` when ``args.sgml`` is truthy and by
    ``extract_and_save_txt`` otherwise.  A per-corpus document count is
    printed afterwards.

    Args:
        args: object exposing ``jobs`` and ``sgml``; it is also handed to
            every extraction call as the ``args`` keyword.
    """
    files = [path.strip() for path in sys.stdin if not path.startswith('#')]
    ldc_names = [get_ldc_name(path) for path in files]

    extract = extract_and_save_sgml if args.sgml else extract_and_save_txt
    pool = Pool(args.jobs)
    try:
        results = pool.map(partial(extract, args=args), files)
    finally:
        # map() has either returned or raised by now; release the workers
        # instead of leaving them to interpreter shutdown.
        pool.terminate()
        pool.join()

    # each result is the list of document ids extracted from one file
    counts = [len(ids) for ids in results]
    # len(results) would be the number of input files, not of documents
    logging.info('Documents: %d', sum(counts))

    # materialised so the rows can be walked again by the fallback below
    data = list(zip(ldc_names, counts))

    try:
        # prints a Markdown table if possible
        from tabulate import tabulate
        table = tabulate(data, headers=['Corpus', 'Documents'],
                         tablefmt='pipe')
    except Exception:
        # plain table otherwise (best effort: tabulate missing or failing)
        table = '\n'.join('{0} {1}'.format(c, n) for c, n in data)
    # single-argument print(...) behaves the same on Python 2 and 3
    print(table)
예제 #6
0
def main(args):
    """Extract the documents of every file listed on stdin, in parallel.

    One path is read per line of ``sys.stdin``; lines starting with ``#``
    are skipped.  Each file is processed on a pool of ``args.jobs`` workers
    by ``extract_and_save_sgml`` when ``args.sgml`` is truthy and by
    ``extract_and_save_txt`` otherwise.  A per-corpus document count is
    printed afterwards.

    Args:
        args: object exposing ``jobs`` and ``sgml``; it is also handed to
            every extraction call as the ``args`` keyword.
    """
    files = [path.strip() for path in sys.stdin if not path.startswith('#')]
    ldc_names = [get_ldc_name(path) for path in files]

    if args.sgml:
        worker = partial(extract_and_save_sgml, args=args)
    else:
        worker = partial(extract_and_save_txt, args=args)

    pool = Pool(args.jobs)
    try:
        results = pool.map(worker, files)
    finally:
        # map() has either returned or raised by now; release the workers
        # instead of leaving them to interpreter shutdown.
        pool.terminate()
        pool.join()

    # each result is the list of document ids extracted from one file
    doc_counts = [len(ids) for ids in results]
    # len(results) would be the number of input files, not of documents
    logging.info('Documents: %d', sum(doc_counts))

    # materialised so the rows can be walked again by the fallback below
    data = list(zip(ldc_names, doc_counts))

    try:
        # prints a Markdown table if possible
        from tabulate import tabulate
        table = tabulate(data,
                         headers=['Corpus', 'Documents'],
                         tablefmt='pipe')
    except Exception:
        # plain table otherwise (best effort: tabulate missing or failing)
        table = '\n'.join('{0} {1}'.format(c, n) for c, n in data)
    # single-argument print(...) behaves the same on Python 2 and 3
    print(table)
예제 #7
0
def main(args):
    """Run ``badsgml2text`` over every corpus listed on stdin and summarise.

    One path is read per line of ``sys.stdin``; lines starting with ``#``
    are skipped.  Each path is turned into a corpus name with
    ``get_ldc_name`` and ``<workspace>/bsgml_trees/<name>`` must exist for
    all of them before any work is dispatched.  ``badsgml2text`` is then
    mapped over the names on a pool of ``args.jobs`` workers.  If
    ``tabulate`` is installed, a Markdown table with the length, sum and mean
    of each per-corpus result is printed.

    Args:
        args: object exposing ``workspace`` and ``jobs``; it is also handed
            to every ``badsgml2text`` call as the ``args`` keyword.

    Raises:
        Exception: if an expected ``bsgml_trees/<name>`` entry is missing.
    """
    files = [path.strip() for path in sys.stdin if not path.startswith('#')]
    ldc_names = [get_ldc_name(path) for path in files]

    # sanity checks: fail before any worker is started
    for ldc_name in ldc_names:
        expected = '{0}/bsgml_trees/{1}'.format(args.workspace, ldc_name)
        if not os.path.exists(expected):
            # Exception() does not apply %-formatting to its arguments, so
            # the path has to be interpolated here.
            raise Exception('File not found: %s' % expected)

    # distribute jobs
    pool = Pool(args.jobs)
    try:
        logging.info('Distributing %d jobs to %d workers', len(ldc_names),
                     args.jobs)
        t0 = time()
        results = pool.map(partial(badsgml2text, args=args), ldc_names)
        dt = time() - t0
    finally:
        # map() has either returned or raised by now; release the workers
        # instead of leaving them to interpreter shutdown.
        pool.terminate()
        pool.join()
    logging.info('Total time: %f seconds', dt)

    try:
        # prints a Markdown table if possible
        from tabulate import tabulate
    except ImportError:
        logging.info('Consider installing tabulate to get nice summaries.')
    else:
        # NOTE(review): judging by the headers, each result is presumably a
        # sequence of per-document sentence counts -- confirm against
        # badsgml2text.
        summary = [(corpus, len(result), sum(result), np.mean(result))
                   for corpus, result in zip(ldc_names, results)]
        # single-argument print(...) behaves the same on Python 2 and 3
        print(tabulate(summary,
                       headers=[
                           'Corpus', 'Documents', 'Total Sentences',
                           'Average Document Length'
                       ],
                       tablefmt='pipe'))