def main_stats(num_files, file_size, download_iterations, download_threads,
               output_dir, only_upload, only_download, **kwargs):
    download_report_filename, upload_report_filename = None, None
    for filename in glob(os.path.join(output_dir, '*.csv')):
        if '/download-report-' in filename:
            assert not download_report_filename
            download_report_filename = filename
        elif '/upload-report-' in filename:
            assert not upload_report_filename
            upload_report_filename = filename
    assert download_report_filename and upload_report_filename
    print('upload_report_filename', upload_report_filename)
    print('download_report_filename', download_report_filename)
    print("Generating upload stats...")
    upload_stats = defaultdict(int)
    df.Flow(
        df.load(upload_report_filename),
        stats_process_upload_rows(upload_stats)
    ).process()
    print("Generating download stats...")
    download_stats = defaultdict(int)
    df.Flow(
        df.load(download_report_filename),
        stats_process_download_rows(download_stats, file_size)
    ).process()
    print("Upload Stats")
    pprint(dict(upload_stats))
    print("Download Stats")
    pprint(dict(download_stats))
def flow(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('data/unique_records_full/datapackage.json',
             resources=['unique_records']),
        load('data/app_records_full/datapackage.json',
             resources=['search_app_records']),
        add_field('__revision', 'integer', REVISION),
        *(add_field(f['name'], f['type']) for f in STATUS_FIELDS),
        manage_revisions,
        *(dump_to_sql({DB_TABLE: {'resource-name': resource_name,
                                  'mode': 'update',
                                  'update_keys': KEY_FIELDS}},
                      DATAFLOWS_DB_ENGINE)
          for resource_name in ['unique_records', 'search_app_records']),
        *(add_field(f'rev_{name}', 'date')
          for name in ['last_updated_at', 'last_modified_at', 'created_at']),
        set_revisions,
        filter_rows(equals=[{'__next_update_days': FILTER_NEXT_UPDATE_DAYS}])
        if FILTER_NEXT_UPDATE_DAYS else None,
        add_date_range(),
        dump_to_path('data/publications_for_es'),
        printer(tablefmt='plain' if is_dpp else 'html', num_rows=1,
                fields=['doc_id']),
        update_resource(None, **{'dpp:streaming': True}))
def flow(*_):
    DF.Flow(
        DF.load(filename, name='welfare'),
        DF.add_field('activity_name', 'string',
                     lambda r: r['שם השירות (ציבורי)']),
        DF.filter_rows(lambda r: r['activity_name']),
        DF.add_field(
            'activity_description', 'array',
            lambda r: [r['תיאור השירות (תיאור קצר)'] + '\n' +
                       r['השירות (מטרת השירות)']]),
        DF.add_field(
            'history', 'array',
            lambda r: [dict(
                year=2019,
                unit=r['יחידה ארגונית נותנת השירות'].split('/')[0].strip(),
                subunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
                subsubunit=r['יחידה ארגונית נותנת השירות'].split('/')[1].strip(),
            )]),
        DF.add_field('target_audience', 'array', splitter('אוכלוסייה')),
        DF.add_field('subject', 'array', splitter('תחום ההתערבות')),
        DF.add_field('intervention', 'array', splitter('אופן התערבות')),
        DF.select_fields(FIELDS),
        DF.add_field('publisher_name', 'string', 'משרד הרווחה'),
        DF.add_field('min_year', 'integer', 2019),
        DF.add_field('max_year', 'integer', 2019),
        DF.add_field('kind', 'string', 'gov_social_service'),
        DF.add_field('kind_he', 'string', 'שירות חברתי'),
        DF.printer(),
        DF.validate(),
        DF.dump_to_path('tmp/activities-welfare')).process()
    return DF.Flow(
        DF.load('tmp/activities-welfare/datapackage.json'),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def join_unique_records(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('data/search_import_from_gdrive/datapackage.json',
             resources=['search_import']),
        load('data/search_results/unique_records.csv',
             resources=['unique_records']),
        set_type('migdar_id', type='string',
                 resources=['unique_records', 'search_import']),
        join(source_name='search_import', source_key=['migdar_id'],
             target_name='unique_records', target_key=['migdar_id'],
             fields={f'gd_{field}': {'name': field}
                     for field in SEARCH_IMPORT_FIELD_NAMES},
             full=False),
        printer(tablefmt='plain' if is_dpp else 'html', num_rows=1,
                fields=['migdar_id']),
        dump_to_path('data/unique_records_full'),
        update_resource(None, **{'dpp:streaming': True}))
def test_example_8():
    from dataflows import Flow, load, dump_to_path

    def find_double_winners(package):
        # Remove the emmies resource - we're going to consume it now
        package.pkg.remove_resource('emmies')
        # Must yield the modified datapackage
        yield package.pkg
        # Now iterate on all resources
        resources = iter(package)
        # Emmies is the first - read all its data and create a set of winner names
        emmy = next(resources)
        emmy_winners = set(
            map(lambda x: x['nominee'], filter(lambda x: x['winner'], emmy)))
        # Oscars are next - filter rows based on the emmy winner set
        academy = next(resources)
        yield filter(lambda row: row['Winner'] and row['Name'] in emmy_winners,
                     academy)

    f = Flow(
        # Emmy award nominees and winners
        load('data/emmy.csv', name='emmies'),
        # Academy award nominees and winners
        load('data/academy.csv', encoding='utf8', name='oscars'),
        find_double_winners,
        dump_to_path('out/double_winners'))
    _ = f.process()
def flow():
    load_steps = (
        load('data/committees/kns_committee/datapackage.json',
             resources=['kns_committee']),
        load('data/members/mk_individual/datapackage.json',
             resources=['mk_individual_positions']),
        load('data/people/committees/meeting-attendees/datapackage.json',
             resources=['kns_committeesession']))
    load_steps = cache(load_steps, cache_path='.cache/web_ui/meetings_load_steps')
    return Flow(*load_steps + (update_meetings,))
def decp_processing():
    flow = Flow(
        # Load the CSV produced by the JSON conversion
        load("decp.csv"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),

        # Sort by rootId and seq to prepare the creation of donneesActuelles
        sort_rows('{rootId}:{seq}', resources=0, reverse=True),
        donnees_actuelles,

        # rootId and seq can now be removed
        delete_fields(["rootId", "seq"], resources=0, regex=False),
        sort_rows('{datePublicationDonnees}', resources=0, reverse=True),

        # New table dedicated to the contracts, without data about the suppliers
        print("Création de la table dédiée aux marchés..."),
        duplicate(source="decp", target_name="decp-sans-titulaires",
                  target_path="decp-sans-titulaires.csv", duplicate_to_end=True),
        delete_fields([
            "titulaire.id",
            "titulaire.denominationSociale",
            "titulaire.typeIdentifiant"
        ], resources="decp-sans-titulaires", regex=False),
        set_primary_key(["uid"], resources="decp-sans-titulaires"),
        deduplicate(),

        # Load the previous DECP in CSV format, to extract the new data
        # print("Téléchargement des données tabulaires précédentes..."),
        # load("https://decp.info/db/decp.csv?_size=max&_dl=1", name="previous-decp"),
        # set_type("acheteur.id", type="string"),
        # set_type("titulaire.id", type="string"),
        # set_type("codeCPV", type="string"),
        # set_type("lieuExecution.code", type="string"),
        # delete_fields(["rowid"], resources="previous-decp", regex=False),
        # # print("Fusion des données tabulaires précédentes et des données d'aujourd'hui..."),
        # concatenate({}, target={"name": "decp-titulaires", "path": "decp-titulaires.csv"}, resources=["decp", "previous-decp"]),

        # Load the previous supplier-specific data
        print("Chargement des données titulaires..."),
        load("decp-titulaires.csv", name="decp-titulaires"),
        set_type("acheteur.id", type="string"),
        set_type("titulaire.id", type="string"),
        set_type("codeCPV", type="string"),
        set_type("lieuExecution.code", type="string"),
        set_type("departement", type="string"),
        set_type("codeAPE", type="string"),
        print("Enregistrement des données sur le disque..."),
        dump_to_path("decp"))
    flow.process()
def conference_csv():
    flow = Flow(
        # Load inputs
        load(
            od19_base + od19_feedback,
            name='feedback',
            format='csv',
        ),
        load(
            od19_base + od19_analysis,
            name='analysis',
            format='csv',
        ),
        # Process them
        set_type("Anzahl.*", type='integer', resources='analysis'),
        delete_fields([
            "Anzahl Auflistung",
            ".*\\(Formel\\)",
            ".*Duplikate",
        ], resources='analysis'),
        not_empty_groupcol,
        # Save the results
        add_metadata(
            name='opendatach19',
            title='''Opendata.ch/2019 Forum''',
            licenses=[{
                "name": "ODC-PDDL-1.0",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0"
            }],
            maintainers=[{
                "name": "Oleg Lavrovsky",
                "web": "https://datalets.ch/"
            }],
            views=[{
                "name": "Groups",
                "resources": ["analysis"],
                "spec": {
                    "group": "Alle Bedürfnisse",
                    "series": ["Anzahl Auflistung (Zahl)"],
                    "type": "bar"
                },
                "specType": "simple",
                "title": "Topic counts"
            }]),
        printer(),
        validate(),
        dump_to_path('data/opendatach19'),
    )
    flow.process()
def test_expected_contact_with_patient():
    print("test_expected_contact_with_patient")
    back_from_abroad_db = [169603, 169632, 169813]
    contact_with_patient_db = [10722, 10715, 10697]
    Flow(
        load_from_db.flow({
            "where": "id in (%s)" % ", ".join(
                map(str, back_from_abroad_db + contact_with_patient_db))
        }),
        add_gps_coordinates.flow({
            "source_fields": get_parameters_from_pipeline_spec(
                "pipeline-spec.yaml", "corona_data_collector",
                "corona_data_collector.add_gps_coordinates")["source_fields"],
            "get-coords-callback": lambda street, city: (
                random.uniform(29, 34), random.uniform(34, 36),
                int(street != city))
        }),
        export_corona_bot_answers.flow({
            "destination_output": "data/corona_data_collector/destination_output"
        }),
    ).process()
    contact_with_patient_key = values_to_convert['insulation_status']['contact-with-patient']
    back_from_abroad_key = values_to_convert['insulation_status']['back-from-abroad']
    contact_with_patient_array = []
    back_from_abroad_array = []
    counts = {"contact_with_patient": 0, "back_from_abroad": 0}

    def _test(row):
        if int(row["isolation"]) == contact_with_patient_key:
            counts["contact_with_patient"] += 1
            contact_with_patient_array.append(int(row["id"]))
        if int(row["isolation"]) == back_from_abroad_key:
            assert int(row["id"]) in back_from_abroad_db
            counts["back_from_abroad"] += 1
            back_from_abroad_array.append(int(row["id"]))

    Flow(
        load('data/corona_data_collector/destination_output/corona_bot_answers_25_3_2020_with_coords.csv'),
        load('data/corona_data_collector/destination_output/corona_bot_answers_22_3_2020_with_coords.csv'),
        _test,
    ).process()
    assert 3 == counts["contact_with_patient"], str(counts)
    assert 3 == counts["back_from_abroad"], str(counts)
    assert set(back_from_abroad_array) == set(back_from_abroad_db)
    assert set(contact_with_patient_array) == set(contact_with_patient_db)
    print("OK")
def update_dataset():
    flow = Flow(
        # Load inputs
        load(f'{BASE_URL}{CONFIRMED}'),
        load(f'{BASE_URL}{RECOVERED}'),
        load(f'{BASE_URL}{DEATH}'),
        checkpoint('load_data'),
        # Process them (if necessary)
        # Save the results
        add_metadata(name='csse_covid_19_time_series',
                     title='''csse_covid_19_time_series'''),
        printer(),
        dump_to_path(),
    )
    flow.process()
def test_exception_information_multiple_processors_last_errored():
    from dataflows import Flow, load, exceptions
    flow = Flow(
        load('data/academy.csv'),
        load('data/bad-path2.csv'),
    )
    with pytest.raises(exceptions.ProcessorError) as excinfo:
        flow.results()
    assert str(excinfo.value.cause).startswith(
        "Failed to load source 'data/bad-path2.csv' and options")
    assert str(excinfo.value.cause).endswith(
        ": [Errno 2] No such file or directory: 'data/bad-path2.csv'")
    assert excinfo.value.processor_name == 'load'
    assert excinfo.value.processor_object.load_source == 'data/bad-path2.csv'
    assert excinfo.value.processor_position == 2
def test_load_name_path():
    from dataflows import load
    dp, *_ = Flow(load('data/beatles_age.json', name='foo'),
                  load('data/beatles_age.csv')).process()
    print(dp.descriptor['resources'])
    res0 = dp.resources[0]
    res1 = dp.resources[1]
    assert res0.name == 'foo'
    assert res0.descriptor['path'] == 'foo.json'
    assert res1.name == 'beatles_age'
    assert res1.descriptor['path'] == 'beatles_age.csv'
def test_change_acl_on_s3_no_path_provided(s3_client, bucket):
    # Prepare paths (the comma between the items is required; without it the
    # two literals would be concatenated into a single path)
    paths = [
        'my/private/datasets/file_1.csv',
        'my/private/datasets/file_2.csv',
    ]
    # Fill the S3 bucket
    for path in paths:
        s3_client.put_object(Body='body', Bucket=bucket, Key=path,
                             ACL='public-read')
    # Set private ACL using the processor
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()
    # Assert everything is private now
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == 403
def get_updated_sources():
    import requests
    from pyquery import PyQuery as pq
    URL = 'https://mr.gov.il/ilgstorefront/he/news/details/230920201036'
    sources = []
    page = pq(requests.get(URL).text)
    anchors = page.find('a')
    for anchor in anchors:
        anchor = pq(anchor)
        href = anchor.attr('href')
        # skip anchors without an href attribute
        if href and '.zip' in href:
            sources.append(href + '#.xlsx')
    sources = [
        DF.load(source, format='excel-xml', encoding='utf8', bytes_sample_size=0)
        for source in sources
    ]
    if len(sources) != 2:
        return DF.Flow(
            data_gov_il_resource.flow(tenders),
            data_gov_il_resource.flow(exemptions),
        )
    else:
        return DF.Flow(*sources)
def test_change_acl_on_s3_handles_more_than_1000_files(s3_client, bucket):
    # Prepare paths
    paths = []
    for index in range(1, 1101):
        path = 'my/private/datasets/file_%s.csv' % index
        paths.append(path)
    # Fill the S3 bucket
    for path in paths:
        s3_client.put_object(Body='body', Bucket=bucket, Key=path,
                             ACL='public-read')
    # Set private ACL using the processor
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            path='my/private/datasets',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()
    # Assert everything is private now
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == 403
def flow(parameters):
    _from = parameters.pop('from')

    num_resources = 0

    def count_resources():
        def func(package):
            # nonlocal (not global) so mark_streaming sees the updated count
            nonlocal num_resources
            num_resources = len(package.pkg.resources)
            yield package.pkg
            yield from package
        return func

    def mark_streaming(_from):
        def func(package):
            for i in range(num_resources, len(package.pkg.resources)):
                package.pkg.descriptor['resources'][i].setdefault(PROP_STREAMING, True)
                package.pkg.descriptor['resources'][i].setdefault(PROP_STREAMED_FROM, _from)
            yield package.pkg
            yield from package
        return func

    return Flow(
        count_resources(),
        load(_from, **parameters),
        mark_streaming(_from),
    )
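
# Usage sketch for flow(parameters) above (not part of the original module):
# 'from' is popped and used as the load source, and every remaining key is
# forwarded to load(). The file path and the 'name' option are hypothetical
# example values, and printer is assumed to be imported from dataflows.
def example_flow_usage():
    Flow(
        flow({'from': 'data/mydata.csv', 'name': 'mydata'}),
        printer(),
    ).process()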
def test_example_7():
    from dataflows import Flow, load, dump_to_path

    def add_is_guitarist_column(package):
        # Add a new field to the first resource
        package.pkg.descriptor['resources'][0]['schema']['fields'].append(
            dict(name='is_guitarist', type='boolean'))
        # Must yield the modified datapackage
        yield package.pkg

        # Now iterate on all resources
        resources = iter(package)
        beatles = next(resources)

        def f(row):
            row['is_guitarist'] = row['instrument'] == 'guitar'
            return row

        yield map(f, beatles)

    f = Flow(
        # Same one as above
        load('data/beatles.csv'),
        add_is_guitarist_column,
        dump_to_path('out/beatles_guitarists'))
    _ = f.process()
def prepare_locations():
    prepare_addresses()
    return DF.Flow(
        DF.load('_cache_addresses/datapackage.json'),
        DF.add_field(
            'address', 'string',
            lambda r: '{} {}{}'.format(
                r['street_name'], r['house_number'], r['letter'] or '')),
        DF.add_field(
            'item', 'object',
            lambda r: dict(
                value=dict(lat=float(r['lat']),
                           lon=float(r['lon']),
                           arnona_zones=r['arnona_zones'],
                           שם=r['address']),
                display=r['address'])),
        DF.sort_rows('{house_number}'),
        DF.delete_fields([
            'house_number', 'letter', 'lat', 'lon', 'arnona_zones', 'address'
        ]),
        DF.join_with_self(
            'concat', ['street_name'],
            dict(display=dict(name='street_name'),
                 items=dict(name='item', aggregate='array'))),
        DF.add_field('sort_street_address', 'string',
                     lambda r: sort_street_address(r['display'])),
        DF.sort_rows('{sort_street_address}'),
        DF.delete_fields(['sort_street_address']),
        DF.printer(),
        DF.dump_to_path('_cache_locations'),
        DF.checkpoint('_cache_locations')).results()[0][0]
def broken_links_flow():
    return DF.Flow(
        *[
            DF.Flow(
                DF.load(URL_TEMPLATE.format(**c), name=c['name']),
                DF.add_field('__name', 'string', c['name'], resources=c['name']),
                DF.add_field('__title', 'string', get_title(c['title']),
                             resources=c['name']),
            )
            for c in configuration
        ],
        DF.add_field('urls', 'array', lambda r: RE.findall(str(r))),
        DF.add_field('link', 'string',
                     lambda r: 'https://yodaat.org/item/{doc_id}'.format(**r)),
        DF.concatenate(dict(
            name=['__name'],
            title=['__title'],
            link=[],
            urls=[],
        )),
        DF.add_field('url', 'string'),
        DF.add_field('error', 'string'),
        unwind(),
        DF.delete_fields(['urls']),
        DF.parallelize(check_broken(), 4),
        DF.filter_rows(lambda r: r['error'] is not None),
    )
def datarecords(kind):
    return map(
        lambda r: r['value'],
        DF.Flow(
            DF.load(f'https://data-input.obudget.org/api/datarecords/{kind}',
                    format='json', property='result')
        ).results()[0][0])
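
# Usage sketch for datarecords() above (not part of the original module): it
# assumes the data-input.obudget.org API responds with
# {"result": [{"value": {...}}, ...]}, which is what format='json',
# property='result' and the r['value'] mapping imply. 'org' is a hypothetical
# kind value used only for illustration.
def example_datarecords_usage():
    for record in datarecords('org'):
        print(record)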
def flow(*args):
    is_dpp = len(args) > 3
    return Flow(
        load('https://migdar-internal-search.odata.org.il/__data/search_import/index.csv',
             encoding='utf-8', http_session=get_migdar_session()),
        update_resource('index', name='search_import_index',
                        path='search_import_index.csv'),
        load_from_gdrive_files,
        update_resource('search_import_index', name='search_import',
                        path='search_import.csv',
                        schema={'fields': [{'name': n, 'type': 'string'}
                                           for n in SEARCH_IMPORT_FIELD_NAMES]},
                        **{'dpp:streaming': True}),
        printer(num_rows=20, tablefmt='plain' if is_dpp else 'html',
                fields=['migdar_id', 'pubyear', 'title']),
        dump_to_path('data/search_import_from_gdrive'))
def operator(name, params):
    connection_string = params['db_url']
    source_table = params['db_table']
    target_instance_name = params['target_instance_name']
    target_package_id = params['target_package_id']
    target_organization_id = params['target_organization_id']
    print('starting db_fetcher operator')
    print('source_table={} target_instance_name={} target_package_id={} target_organization_id={}'
          .format(source_table, target_instance_name, target_package_id,
                  target_organization_id))
    with tempfile.TemporaryDirectory() as tempdir:
        csv_filename = target_package_id + '.csv'
        DF.Flow(
            DF.load(connection_string, table=source_table,
                    name=target_package_id,
                    infer_strategy=DF.load.INFER_PYTHON_TYPES),
            DF.update_resource(-1, path=csv_filename),
            DF.delete_fields(['_source']),
            DF.dump_to_path(tempdir)
        ).process()
        csv_filename = os.path.join(tempdir, csv_filename)
        print('{}, {:,} bytes'.format(csv_filename,
                                      os.stat(csv_filename).st_size))
        update_package(target_instance_name, target_organization_id,
                       target_package_id, target_package_id,
                       [('CSV', csv_filename)])
def test_change_acl_on_s3(s3_client, bucket):
    # Prepare paths
    paths = [
        'my/private/datasets/README.md',
        'my/private/datasets/datapackage.json',
        'my/private/datasets/data/mydata.csv',
        'my/public/datasets/data/mydata.csv',
    ]
    # Fill the S3 bucket
    for path in paths:
        s3_client.put_object(Body='body', Bucket=bucket, Key=path,
                             ACL='public-read')
    # Assert all contents are public by default
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == 200
    # Set private ACL using the processor
    flow = Flow(
        load('data/data.csv'),
        change_acl_on_s3(
            bucket=bucket,
            acl='private',
            path='my/private/datasets',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()
    # Assert only public contents are public
    for path in paths:
        url = '{}/{}/{}'.format(os.environ['S3_ENDPOINT_URL'], bucket, path)
        assert requests.get(url).status_code == (200 if 'public' in path else 403)
def _get_last_runs():
    runs_history_last_rows = {}
    for id, path in parameters["check_covid19_israel_id_paths"].items():

        def _process_runs_history(rows):
            for row in rows:
                yield row
                # keep the last row seen for this id
                runs_history_last_rows[id] = row

        Flow(load("%s/runs_history/datapackage.json" % path),
             _process_runs_history).process()
    for id, row in runs_history_last_rows.items():
        start_time = row["start_time"]
        end_time = datetime.datetime.strptime(row["end_time"],
                                              '%Y-%m-%dT%H:%M:%S')
        yield {
            "id": id,
            "github_sha1": row["github_sha1"],
            "error": row["error"],
            "start_time": start_time,
            "end_time": end_time,
            "duration_minutes": (end_time - start_time).total_seconds() / 60,
            "log_file": "https://avidcovider-pipelines-data.odata.org.il/data/%s/log_files/%s.log" % (
                id, start_time.strftime("%Y%m%dT%H%M%S")),
        }
def test_dump_to_s3_non_existent_bucket(s3_client, bucket):
    # Delete bucket
    s3_client.delete_bucket(Bucket=bucket)
    # Dump to S3 using the processor
    flow = Flow(
        load('data/data.csv'),
        dump_to_s3(
            bucket=bucket,
            acl='private',
            path='my/datapackage',
            endpoint_url=os.environ['S3_ENDPOINT_URL'],
        ),
    )
    flow.process()
    # Check datapackage.json content
    response = s3_client.get_object(Bucket=bucket,
                                    Key='my/datapackage/datapackage.json')
    descriptor = json.loads(response['Body'].read().decode('utf-8'))
    assert descriptor['resources'][0]['schema']['fields'][0]['name'] == 'id'
    assert descriptor['resources'][0]['schema']['fields'][1]['name'] == 'name'
    # Check data.csv content
    response = s3_client.get_object(Bucket=bucket, Key='my/datapackage/data.csv')
    contents = response['Body'].read().decode('utf-8')
    assert contents == 'id,name\r\n1,english\r\n2,中国人\r\n'
def test_load_strategy_infer_strings_from_native_types():
    from dataflows import load
    flow = Flow(
        load('data/beatles_age.json', infer_strategy='strings'),
    )
    data, package, stats = flow.results()
    assert data == [[
        {'age': '18', 'name': 'john'},
        {'age': '16', 'name': 'paul'},
        {'age': '17', 'name': 'george'},
        {'age': '22', 'name': 'ringo'},
    ]]
def test_load_from_package_resources():
    from dataflows import load
    datapackage = {
        'resources': [{
            'name': 'my-resource-{}'.format(i),
            'path': 'my-resource-{}.csv'.format(i),
            'schema': {'fields': [{'name': 'foo', 'type': 'string'}]}
        } for i in range(2)]
    }
    resources = ((row for row in [{'foo': 'bar{}'.format(i)},
                                  {'foo': 'baz{}'.format(i)}])
                 for i in range(2))
    data, dp, *_ = Flow(
        load((datapackage, resources), resources=['my-resource-1']),
    ).results()
    assert len(dp.resources) == 1
    assert dp.get_resource('my-resource-1').descriptor['path'] == 'my-resource-1.csv'
    assert data[0][1] == {'foo': 'baz1'}
def test_sort_rows_decimal():
    from decimal import Decimal
    from dataflows import sort_rows, load
    f = Flow(
        load('data/numbers.csv', cast_strategy=load.CAST_WITH_SCHEMA),
        sort_rows(key='{a}'),
    )
    results, dp, _ = f.results()
    assert list(results[0]) == [
        {'a': Decimal('-1000')},
        {'a': Decimal('-0.5')},
        {'a': Decimal('-0.4')},
        {'a': Decimal('0')},
        {'a': Decimal('1.1')},
        {'a': Decimal('2')},
        {'a': Decimal('10')},
        {'a': Decimal('1000')},
    ]
def test_load_duplicate_headers_with_deduplicate_headers_flag():
    from dataflows import load
    flow = Flow(
        load('data/duplicate_headers.csv', deduplicate_headers=True),
    )
    data, package, stats = flow.results()
    assert package.descriptor['resources'][0]['schema']['fields'] == [
        {'name': 'header1', 'type': 'string', 'format': 'default'},
        {'name': 'header2 (1)', 'type': 'string', 'format': 'default'},
        {'name': 'header2 (2)', 'type': 'string', 'format': 'default'},
    ]
    assert data == [[
        {'header1': 'value1', 'header2 (1)': 'value2', 'header2 (2)': 'value3'},
    ]]
def generate_package():
    package_flow = Flow(
        add_metadata(
            name="unicode-emojis",
            title="UTS #51 Unicode Emoji",
            descriptor=(
                "List of emojis available from the Unicode Consortium. "
                "More information can be found in the Unicode® Technical Standard #51."
            ),
            sources=[
                {
                    "name": "unicode-emoji",
                    "path": "https://unicode.org/Public/emoji/latest/emoji-test.txt",
                    "title": "UTS #51 Unicode Emoji",
                },
            ],
            licenses=[
                {
                    "name": "ODC-PDDL-1.0",
                    "path": "http://opendatacommons.org/licenses/pddl/",
                    "title": "Open Data Commons Public Domain Dedication and License v1.0",
                }
            ],
            keywords=["unicode", "emojis", "emoji", "51", "standard", "uts"],
        ),
        load(load_source="data/emojis.csv", format="csv"),
        validate(),
        dump_to_path(),
    )
    package_flow.process()
            }
        ],
        views=[
            {
                "name": "graph",
                "title": "Average yield from British Government Securities, 10 year Nominal Par Yield",
                "specType": "simple",
                "spec": {"type": "line", "group": "Date", "series": ["Rate"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUQAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
        skip_rows=[1],
        headers=['Date', 'Rate'],
        format='csv',
        name='quarterly'
    ),
    load(
        load_source='http://www.bankofengland.co.uk/boeapps/iadb/fromshowcolumns.asp?csv.x=yes&SeriesCodes=IUAAMNPY&UsingCodes=Y&CSVF=TN&Datefrom=01/Jan/1963',
        skip_rows=[1],
        headers=['Year', 'Rate'],
        format='csv',
        name='annual'
    ),
    set_type('Date', resources='quarterly', type='date', format='any'),
    set_type('Rate', resources='quarterly', type='number',
             description='Quarterly average yield from British Government Securities, 10 year Nominal Par Yield'),
    set_type('Year', resources='annual', type='date', format='any'),
    set_type('Rate', resources='annual', type='number',
             description='Annual average yield from British Government Securities, 10 year Nominal Par Yield'),
    update_resource('quarterly', **{'path': 'data/quarterly.csv', 'dpp:streaming': True}),
            }
        ],
        version="0.2.0",
        views=[
            {
                "name": "graph",
                "title": "VIX - CBOE Volatility Index",
                "specType": "simple",
                "spec": {"type": "line", "group": "Date", "series": ["VIX Close"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.cboe.com/publish/ScheduledTask/MktData/datahouse/vixcurrent.csv',
        headers=2,
        name='vix-daily'
    ),
    set_type('Date', type='date', format='any'),
    update_resource('vix-daily', **{'title': 'VIX Daily', 'path': 'data/vix-daily.csv', 'dpp:streaming': True}),
    validate()
)


def flow(parameters, datapackage, resources, stats):
    return finance_vix


if __name__ == '__main__':
    finance_vix.process()
"publisher": "core", "formats": ["CSV", "JSON"] }, { "title": "Natural gas", "path": "/core/natural-gas", "publisher": "core", "formats": ["CSV", "JSON"] } ], version="0.2.0" ), load( load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its', skip_rows=[1, 2, 3, 4, 5, -1], headers=['Date', 'Price', 'Empty column'], format='csv', name='annual' ), extract_december_rows, load( load_source='http://www.bundesbank.de/cae/servlet/StatisticDownload?tsId=BBEX3.M.XAU.USD.EA.AC.C06&its_csvFormat=en&its_fileFormat=csv&mode=its', skip_rows=[1, 2, 3, 4, 5, -1], headers=['Date', 'Price', 'Empty column'], format='csv', name='monthly' ), update_resource('monthly', **{'path':'data/monthly.csv', 'dpp:streaming': True}), update_resource('annual', **{'path':'data/annual.csv', 'dpp:streaming': True}), set_type('Date', resources='annual', type='yearmonth'), set_type('Price', resources='annual', type='number'),
            }
        ],
        views=[
            {
                "name": "graph",
                "title": "10 year US Government Bond Yields (Monthly granularity)",
                "specType": "simple",
                "spec": {"type": "line", "group": "Date", "series": ["Rate"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='http://www.federalreserve.gov/datadownload/Output.aspx?rel=H15&series=0809abf197c17f1ff0b2180fe7015cc3&lastObs=&from=&to=&filetype=csv&label=include&layout=seriescolumn',
        skip_rows=[i + 1 for i in range(6)],
        headers=['Date', 'Rate'],
        format='csv',
        name='monthly'
    ),
    set_type('Date', type='date', format='any', description='Date in ISO 8601'),
    set_type('Rate', type='number', description='Percent per year'),
    update_resource('monthly', **{'path': 'data/monthly.csv', 'dpp:streaming': True}),
    validate(),
    dump_to_path()
)


def flow(parameters, datapackage, resources, stats):
    return bond_us