Пример #1
0
def test_extract_url():
    """Extracting from ``CSV_URL`` into a new package yields a source.

    The returned source must not be ``None`` and its ``path`` must contain
    the file name ``barnet-2009.csv``.
    """
    collection = open_collection(
        'test', 's3', bucket_name='test.mapthemoney.org')
    pkg = collection.create()

    source = extract.from_url(pkg, CSV_URL)

    assert source is not None, source
    assert 'barnet-2009.csv' in source.path, source
Пример #2
0
def test_parse_with_dates():
    """Converting ``GPC_FIXTURE`` to a table yields typed date values.

    Checks the artifact name, the number of records (23), and that the
    ``transaction_date`` field of the first record is a ``date`` instance.
    """
    collection = open_collection(
        'test', 's3', bucket_name='test.mapthemoney.org')
    pkg = collection.create()
    source = extract.from_file(pkg, GPC_FIXTURE)

    table = transform.to_table(source, 'table')
    assert table.name == 'table'

    rows = list(table.records())
    assert len(rows) == 23, len(rows)

    first_row = rows[0]
    assert isinstance(first_row['transaction_date'], date)
def store_contract_text(barndir, txtdir):
    """Write the extracted text of every known contract to ``txtdir``.

    Opens the file-backed ``knowncontracts`` collection rooted at
    ``barndir`` and, for each package in it, runs the extractor registered
    for the source's file extension over the source data. The result is
    written to ``<txtdir>/<package id>.txt`` as UTF-8.

    Returns the opened collection.
    """
    collection = barn.open_collection('knowncontracts', 'file', path=barndir)
    for package in collection:
        # The extension is whatever follows the last dot, lower-cased.
        name_parts = package.source.name.rsplit('.')
        suffix = name_parts[-1].lower()
        # NOTE(review): an unknown extension does not raise here; it falls
        # back to the `donothing` extractor. Confirm that is intended.
        extract_text = extractors.get(suffix, donothing)
        contents = extract_text(package.source.data())
        target = '%s/%s.txt' % (txtdir, package.id)
        # errors='ignore' drops characters that cannot be encoded.
        with codecs.open(target, 'w', 'utf-8', errors='ignore') as handle:
            handle.write(contents)
    return collection
def store_contract_text(barndir, txtdir):
    """Write the extracted text of every known contract to ``txtdir``.

    Opens the file-backed ``knowncontracts`` collection rooted at
    ``barndir`` and, for each package in it, runs the extractor registered
    for the source's file extension over the source data. The result is
    written to ``<txtdir>/<package id>.txt`` as UTF-8.

    Returns the opened collection.
    """
    collection = barn.open_collection('knowncontracts', 'file', path=barndir)
    for package in collection:
        # The extension is whatever follows the last dot, lower-cased.
        name_parts = package.source.name.rsplit('.')
        suffix = name_parts[-1].lower()
        # NOTE(review): an unknown extension does not raise here; it falls
        # back to the `donothing` extractor. Confirm that is intended.
        extract_text = extractors.get(suffix, donothing)
        contents = extract_text(package.source.data())
        target = '%s/%s.txt' % (txtdir, package.id)
        # errors='ignore' drops characters that cannot be encoded.
        with codecs.open(target, 'w', 'utf-8', errors='ignore') as handle:
            handle.write(contents)
    return collection
Пример #5
0
def test_basic_api():
    """Exercise create / iterate / get on a collection.

    The collection is expected to start empty, contain exactly one package
    after ``create``, keep the manifest passed to ``create``, and return an
    equal package from ``get`` when looked up by id.
    """
    collection = open_collection(
        'test', 's3', bucket_name='test.mapthemoney.org')
    assert len(list(collection)) == 0, len(list(collection))

    pkg = collection.create(manifest={'test': 'value'})
    assert len(list(collection)) == 1, len(list(collection))
    assert pkg.id is not None, pkg.id

    assert pkg.manifest['test'] == 'value'

    fetched = collection.get(pkg.id)
    assert fetched == pkg, fetched
Пример #6
0
def test_extract_file():
    """Extracting ``CSV_FIXTURE`` adds one source and no artifacts.

    Also checks that the returned source is not ``None`` and that its
    ``path`` contains the file name ``barnet-2009.csv``.
    """
    collection = open_collection(
        'test', 's3', bucket_name='test.mapthemoney.org')
    pkg = collection.create()

    source = extract.from_file(pkg, CSV_FIXTURE)
    assert source is not None, source

    found_sources = list(pkg.all(Source))
    assert len(found_sources) == 1, found_sources

    found_artifacts = list(pkg.all(Artifact))
    assert len(found_artifacts) == 0, found_artifacts

    assert 'barnet-2009.csv' in source.path, source
def store_known_contracts(args):
    """Ingest known contracts into the ``knowncontracts`` collection.

    Two sources are ingested into the file-backed collection rooted at
    ``args.barndir``:

    1. every entry matched by ``<pattern>/*`` for each pattern in
       ``args.include_text_directory``;
    2. every URL yielded by ``known_contracts(args.sheetid)``, after it
       has been passed through ``preprocess_url``. URLs for which
       ``preprocess_url`` returns ``None`` are skipped.

    Progress is reported on stdout.

    Args:
        args: object exposing ``barndir``, ``include_text_directory`` (an
            iterable of directory patterns) and ``sheetid``.
    """
    coll = barn.open_collection('knowncontracts', 'file', path=args.barndir)

    # XXX warning: special-handling
    for pattern in args.include_text_directory:
        for path in glob.glob(pattern + '/*'):
            print('ingesting %s' % path)
            coll.ingest(path)

    for raw_url in known_contracts(args.sheetid):
        url = preprocess_url(raw_url)
        if url is None:
            # Report the original URL: the preprocessed value is None
            # here, so printing it would always say "skipping None".
            print('skipping %s' % raw_url)
            continue
        print('ingesting %s' % url)
        coll.ingest(url)
    print('done')
def store_known_contracts(args):
    """Ingest known contracts into the ``knowncontracts`` collection.

    Two sources are ingested into the file-backed collection rooted at
    ``args.barndir``:

    1. every entry matched by ``<pattern>/*`` for each pattern in
       ``args.include_text_directory``;
    2. every URL yielded by ``known_contracts(args.sheetid)``, after it
       has been passed through ``preprocess_url``. URLs for which
       ``preprocess_url`` returns ``None`` are skipped.

    Progress is reported on stdout.

    Args:
        args: object exposing ``barndir``, ``include_text_directory`` (an
            iterable of directory patterns) and ``sheetid``.
    """
    coll = barn.open_collection('knowncontracts', 'file', path=args.barndir)

    # XXX warning: special-handling
    for pattern in args.include_text_directory:
        for path in glob.glob(pattern + '/*'):
            print('ingesting %s' % path)
            coll.ingest(path)

    for raw_url in known_contracts(args.sheetid):
        url = preprocess_url(raw_url)
        if url is None:
            # Report the original URL: the preprocessed value is None
            # here, so printing it would always say "skipping None".
            print('skipping %s' % raw_url)
            continue
        print('ingesting %s' % url)
        coll.ingest(url)
    print('done')
Пример #9
0
def test_open_collection():
    """Opening an ``'s3'`` collection wires up an ``S3Store``.

    The store's bucket must carry the name passed as ``bucket_name``.
    """
    from barn import open_collection

    collection = open_collection('test', 's3', bucket_name='foo')
    backend = collection.store

    assert isinstance(backend, S3Store), backend
    assert backend.bucket.name == 'foo', backend.bucket