def _construct_graph(self, single_graph=False, services=None):
    """Create the bonobo graph(s) and attach every sub-pipeline to them.

    When ``single_graph`` is true a single graph receives all sub-pipelines
    and is stored on ``self.graph_0``. Otherwise the geography sub-pipeline
    gets its own graph while the remaining sub-pipelines share a second
    one, and the pair is stored on ``self.graphs``.

    ``services`` is accepted but not used by this method.

    Returns:
        A list containing the graph(s) that were built.
    """
    if single_graph:
        shared = bonobo.Graph()
        geog_graph = main_graph = shared
    else:
        geog_graph = bonobo.Graph()
        main_graph = bonobo.Graph()

    self._add_geog_graph(geog_graph)
    self._add_abstracts_graph(main_graph)
    self._add_journals_graph(main_graph)
    self._add_series_graph(main_graph)
    self._add_people_graph(main_graph)
    self._add_corp_graph(main_graph)
    # Currently disabled sub-pipelines:
    # self._add_tal_graph(main_graph)
    # self._add_subject_graph(main_graph)

    if single_graph:
        self.graph_0 = shared
        return [shared]

    self.graphs = [geog_graph, main_graph]
    return self.graphs
def test_postgres(postgres):
    """Round-trip rows through PostgreSQL with the bonobo_sqlalchemy nodes.

    Creates a role and a database on the server described by the
    ``postgres`` fixture, inserts the rows produced by ``extract`` into
    ``TABLE_1``, copies them to ``TABLE_2`` and checks that both tables
    read back as the same ten ``(index, 'value for <index>')`` rows.
    """
    # bonobo.settings.QUIET.set(True)
    db_name = 'my_db'
    port = postgres['NetworkSettings']['Ports']['5432/tcp'][0]['HostPort']
    wait_for_postgres(port)

    root_engine = create_root_engine(port)
    _execute_sql(root_engine, "CREATE ROLE my_user WITH LOGIN PASSWORD '';")
    create_db_sql = (
        'CREATE DATABASE {name} WITH OWNER=my_user TEMPLATE=template0 ENCODING="utf-8"'
        .format(name=db_name)
    )
    _execute_sql(root_engine, create_db_sql)

    engine = create_engine('my_user', db_name, port)
    metadata.create_all(engine)
    services = {'sqlalchemy.engine': engine}

    expected_rows = [((i, 'value for {}'.format(i)), {}) for i in range(10)]

    def read_back(table):
        # Select everything from ``table`` into a fresh buffer and return it.
        sink = Bufferize()
        reader = bonobo.Graph()
        reader.add_chain(
            bonobo_sqlalchemy.Select('SELECT * FROM ' + table),
            sink,
        )
        assert bonobo.run(reader, services=services)
        return sink.buffer

    # Load the extracted rows into the first table.
    loader = bonobo.Graph()
    loader.add_chain(extract, bonobo_sqlalchemy.InsertOrUpdate(TABLE_1))
    assert bonobo.run(loader, services=services)
    assert read_back(TABLE_1) == expected_rows

    # Copy the first table into the second one.
    copier = bonobo.Graph(
        bonobo_sqlalchemy.Select('SELECT * FROM ' + TABLE_1),
        bonobo_sqlalchemy.InsertOrUpdate(TABLE_2),
    )
    assert bonobo.run(copier, services=services)
    assert read_back(TABLE_2) == expected_rows
def get_graph_old_file():
    """Build the graph chaining ``extract_old_file`` through a line formatter.

    The chain is ``extract_old_file`` ->
    ``LineFormatter(old_file_list_file).reformat`` -> ``load_old``.

    :return: bonobo.Graph
    """
    pipeline = bonobo.Graph()
    formatter = LineFormatter(old_file_list_file)
    steps = (extract_old_file, formatter.reformat, load_old)
    pipeline.add_chain(*steps)
    return pipeline
def get_inventory_graph(**options):
    """
    Assemble the graph that processes ``Deckbox-inventory.csv``.

    The chain, named ``main``, reads the CSV, applies a filter whose
    predicate is true when the last positional value of a row is not
    ``'English'``, passes rows to ``inventory`` and finally renames the
    ``Card Number`` and ``Tradelist Count`` fields. ``options`` is accepted
    but not used.

    :return: bonobo.Graph
    """
    reader = bonobo.CsvReader('Deckbox-inventory.csv')
    language_filter = bonobo.Filter(lambda *args: args[-1] != 'English')
    rename_fields = bonobo.Rename(
        Card_Number='Card Number',
        Tradelist_Count='Tradelist Count',
    )

    pipeline = bonobo.Graph()
    pipeline.add_chain(
        reader,
        language_filter,
        inventory,
        rename_fields,
        # Database load step, currently disabled:
        # bonobo_sqlalchemy.InsertOrUpdate(
        #     'cards',
        #     discriminant=(
        #         'Name',
        #         'Edition',
        #         'Card_Number',
        #         'Foil',
        #     ),
        #     engine='cards'),
        _name='main',
    )
    return pipeline
def get_graph(**options):
    """
    Build the graph that loads ``workday-users.csv`` into every engine.

    The main chain reads the CSV from the ``brickftp`` filesystem, runs
    ``employee_active`` and ``find_badge_id``, unpacks item 0 and ends on a
    no-op node. One ``InsertOrUpdate`` node per distinct entry of
    ``options['engine']`` is then attached to that no-op node, writing to
    the table ``options['table_name'] + options['table_suffix']``.

    :return: bonobo.Graph
    """
    fan_out = bonobo.noop  # shared junction every database writer hangs off

    pipeline = bonobo.Graph()
    pipeline.add_chain(
        bonobo.CsvReader('/etl/metrics-insights/workday-users.csv', fs='brickftp'),
        employee_active,
        find_badge_id,
        bonobo.UnpackItems(0),
        fan_out,
    )

    # De-duplicate the engine names so each database is written only once.
    for engine_name in set(options['engine']):
        writer = bonobo_sqlalchemy.InsertOrUpdate(
            table_name=options['table_name'] + options['table_suffix'],
            discriminant=('badgeid', ),
            buffer_size=10,
            engine=engine_name,
        )
        pipeline.add_chain(writer, _input=fan_out)

    return pipeline
def get_cache_graph(**options):
    """
    Build the graph that caches badges read from the ccure export.

    The pipe-delimited file is read from the ``brickftp`` filesystem with
    an explicit field list, then passed through ``badge_active`` and
    ``cache``. ``options`` is accepted but not used.

    :return: bonobo.Graph
    """
    field_names = (
        'badge_id', 'empty1',
        'last_name', 'empty2',
        'first_name', 'empty3',
        'issued_on', 'empty4',
        'disabled', 'empty5',
        'valid_until', 'empty6',
        'flag2', 'empty7',
        'flag3', 'empty8',
        'flag4',
    )
    reader = bonobo.CsvReader(
        '/etl/ccure/uploads/BadgeID/ccure_BadgeID_AllButVendor.txt',
        fields=field_names,
        delimiter='|',
        fs='brickftp',
    )

    pipeline = bonobo.Graph()
    pipeline.add_chain(reader, badge_active, cache)
    return pipeline
def get_bu_graph(**options):
    """
    Build the business-unit graph with two tab-separated outputs.

    The trunk runs ``get_business_units`` -> ``join_cost_centers`` ->
    ``centerstone_BU_SupOrg_Merge_remap`` -> ``centerstone_BussUnit_remap``.
    Two branches hang off the last trunk node: one writes
    ``ProductLineLevel1.txt`` and the other ``TeamLevel3.txt``, each with
    ``options['suffix']`` appended to the path.

    :return: bonobo.Graph
    """
    def tsv_writer(path):
        # Tab-separated writer on the brickftp filesystem for ``path``.
        return bonobo.CsvWriter(path + options['suffix'],
                                lineterminator="\n",
                                delimiter="\t",
                                fs="brickftp")

    pipeline = bonobo.Graph()

    # Trunk shared by both outputs.
    pipeline.add_chain(
        get_business_units,
        join_cost_centers,
        centerstone_BU_SupOrg_Merge_remap,
        centerstone_BussUnit_remap,
    )

    # Branch 1: product lines.
    pipeline.add_chain(
        # bonobo.Limit(3),
        # bonobo.PrettyPrinter(),
        productLineLevel1_remap,
        unique_product_line,
        bonobo.UnpackItems(0),
        bonobo.PrettyPrinter(),
        tsv_writer('/etl/centerstone/downloads/ProductLineLevel1.txt'),
        _input=centerstone_BussUnit_remap,
    )

    # Branch 2: teams.
    pipeline.add_chain(
        teamLevel3_remap,
        bonobo.UnpackItems(0),
        tsv_writer('/etl/centerstone/downloads/TeamLevel3.txt'),
        _input=centerstone_BussUnit_remap,
    )

    return pipeline
def get_graph(**options):
    """
    Build the graph that extracts raw files and processes them.

    Layout, as wired below:

    - a source chain from ``load_config`` to
      ``extract_raw_files_from_folder``;
    - a ``data_copy`` node named ``copy`` with no input of its own;
    - ``process_patient_data`` fed by the source chain, outputting to
      ``copy``;
    - ``process_image`` fed by ``process_patient_data``, outputting to
      ``copy``;
    - ``process_dicom_data`` -> ``upload_text_data`` fed by
      ``process_image``.

    ``options`` is accepted but not used.

    :return: bonobo.Graph
    """
    pipeline = bonobo.Graph()

    source_steps = (
        load_config,
        load_existing_files,
        extract_raw_folders,
        extract_raw_files_from_folder,
    )
    pipeline.add_chain(*source_steps)

    pipeline.add_chain(data_copy, _input=None, _name="copy")

    pipeline.add_chain(
        # bonobo.Limit(30),
        process_patient_data,
        _input=extract_raw_files_from_folder,
        _output="copy",
    )

    pipeline.add_chain(
        # bonobo.Limit(30),
        process_image,
        _input=process_patient_data,
        _output="copy",
    )

    pipeline.add_chain(
        process_dicom_data,
        upload_text_data,
        _input=process_image,
    )

    return pipeline
def _generate_graph(self) -> None:
    """Generate the bonobo.Graph for the current report and store it.

    Wiring rules, as implemented below:
        - a task that lists children is chained to each non-``None`` child
          (``graph >> task >> child``);
        - a task with neither children nor parents is added on its own
          (``graph >> task``).

    The result is stored on ``self._graph``; ``None`` is stored instead
    when the graph ends up with no nodes. Nothing is returned.
    """
    report_graph: bonobo.Graph = bonobo.Graph()
    task_map: Dict[int, Task] = self._tasks

    for task in task_map.values():
        if task.children is None:
            # Leaf task: only add it here when it is also a root;
            # otherwise its parent is responsible for chaining it in.
            if task.parents is None:
                report_graph >> task
            continue

        # NOTE(review): ``graph >> task`` presumably starts from the graph's
        # default cursor, so a task having both parents and children would
        # also be linked to the graph start -- confirm this is intended.
        for child_id in task.children:
            if child_id is not None:
                report_graph >> task >> task_map[child_id]

    if len(report_graph.nodes) > 0:
        self._graph = report_graph
    else:
        self._graph = None
def get_graph(**options):
    """
    Build the graph whose three source chains all feed one extractor.

    A ``DataExtractor`` node named ``extractor`` is added with no input.
    The clinical-files chain, the image-metadata chain and
    ``get_storage_stats`` each output into it. ``options`` is accepted but
    not used.

    :return: bonobo.Graph
    """
    pipeline = bonobo.Graph()
    pipeline.add_chain(DataExtractor(), _input=None, _name="extractor")

    feeder_chains = (
        (list_clinical_files, load_clinical_files),
        (list_image_metadata_files, load_image_metadata_files),
        (get_storage_stats, ),
    )
    for feeder in feeder_chains:
        pipeline.add_chain(*feeder, _output="extractor")

    return pipeline
def build_graph(self, **options):
    """Create ``self.graph`` as an extract -> transform -> load chain.

    ``options`` is accepted but not used.

    :return: the graph stored on ``self.graph``
    """
    self.graph = bonobo.Graph()
    etl_steps = (self.extract, self.transform, self.load)
    self.graph.add_chain(*etl_steps)
    return self.graph
def get_graph_locations_with_mapping_other():
    """Build the graph for the "locations with mapping other" file.

    The chain is ``extract_locations_with_mapping_other`` ->
    ``LineFormatter(file_locations_with_mapping_other).reformatMapping`` ->
    ``load_mapping_other``.

    :return: bonobo.Graph
    """
    pipeline = bonobo.Graph()
    formatter = LineFormatter(file_locations_with_mapping_other)
    steps = (
        extract_locations_with_mapping_other,
        formatter.reformatMapping,
        load_mapping_other,
    )
    pipeline.add_chain(*steps)
    return pipeline
def run(self, services=None, **options):
    '''Run the People bonobo pipeline.

    Steps:
        1. Obtain services via ``self.get_services`` when none (or a falsy
           value) is passed in.
        2. Build and run the main graph.
        3. For each model reported by
           ``self.static_instances.used_instances()``, build and run a
           small graph that serializes its instances.
        4. Dump ``services['people_groups']`` as JSON to
           ``people_groups.json`` under ``settings.pipeline_tmp_path``.
    '''
    print(f'- Limiting to {self.limit} records per file', file=sys.stderr)
    if not services:
        services = self.get_services(**options)

    print('Running graph component...', file=sys.stderr)
    main_graph = self.get_graph(**options, services=services)
    self.run_graph(main_graph, services=services)

    print('Serializing static instances...', file=sys.stderr)
    used = self.static_instances.used_instances()
    for model_name, instances in used.items():
        instance_graph = bonobo.Graph()
        model = self.models[model_name]
        # The result is not used; the call is kept in case it has side effects.
        self.serializer_nodes_for_model(model=model, use_memory_writer=False)
        source = instance_graph.add_chain(GraphListSource(instances.values()))
        self.add_serialization_chain(
            instance_graph,
            source.output,
            model=model,
            use_memory_writer=False,
        )
        self.run_graph(instance_graph, services={})

    print('Writing people-groups mapping data to disk')
    output_path = pathlib.Path(settings.pipeline_tmp_path) / 'people_groups.json'
    with output_path.open('w') as out:
        json.dump(services['people_groups'], out)
def get_costcenter_graph(**options):
    """
    Build the cost-center graph, writing the same rows to two filesystems.

    The chain, named ``main``, runs ``get_cost_centers`` ->
    ``cache_cost_centers`` -> ``centerstone_CostCenter_remap``, unpacks
    item 0, then writes ``CostCenterLevel2.txt`` (with ``options['suffix']``
    appended) through the ``brickftp`` and ``centerstone`` filesystems and
    ends on ``bonobo.count``.

    :return: bonobo.Graph
    """
    writer_settings = {'lineterminator': "\n", 'delimiter': "\t"}

    pipeline = bonobo.Graph()
    pipeline.add_chain(
        get_cost_centers,
        cache_cost_centers,
        centerstone_CostCenter_remap,
        # bonobo.PrettyPrinter(),
        bonobo.UnpackItems(0),
        # Can't skip the header, but must
        bonobo.CsvWriter(
            '/etl/centerstone/downloads/CostCenterLevel2.txt' + options['suffix'],
            fs="brickftp",
            **writer_settings),
        bonobo.CsvWriter(
            'CostCenterLevel2.txt' + options['suffix'],
            fs="centerstone",
            **writer_settings),
        bonobo.count,
        _name="main",
    )
    return pipeline
def get_graph(**options):
    """
    Build the graph that loads AWS accounts into every engine.

    The chain named ``main`` runs ``extract_accounts`` -> ``transform`` ->
    ``valid_aws_account``, unpacks item 0 and ends on a no-op node. One
    ``InsertOrUpdate`` node per distinct entry of ``options['engine']`` is
    attached to that no-op node, writing to the table
    ``options['table_name'] + options['table_suffix']``.

    :return: bonobo.Graph
    """
    fan_out = bonobo.noop  # shared junction every database writer hangs off

    pipeline = bonobo.Graph()
    pipeline.add_chain(
        extract_accounts,
        transform,
        valid_aws_account,
        bonobo.UnpackItems(0),
        fan_out,
        _name="main",
    )

    # De-duplicate the engine names so each database is written only once.
    for engine_name in set(options['engine']):
        writer = bonobo_sqlalchemy.InsertOrUpdate(
            table_name=options['table_name'] + options['table_suffix'],
            discriminant=('linked_account_number', ),
            engine=engine_name,
        )
        pipeline.add_chain(writer, _input=fan_out)

    return pipeline
def main():
    """Run the CSV extract -> transform -> load-to-DW graph."""
    etl_steps = (extract_data_from_csv, transform_data, load_data_to_dw)
    pipeline = bonobo.Graph(*etl_steps)
    bonobo.run(pipeline)
def get_graph(**options):
    """Build a graph where ``transform`` feeds both ``plot`` and ``analytics``.

    ``options`` is accepted but not used.

    :return: bonobo.Graph
    """
    pipeline = bonobo.Graph()
    pipeline.add_chain(extract, transform)
    # Two consumers branch off the transform node.
    for consumer in (plot, analytics):
        pipeline.add_chain(consumer, _input=transform)
    return pipeline
def get_graph(graph=None, *, _limit=(), _print=()):
    """
    Extracts a list of cafes with one euro coffee in Paris, renames the name,
    address and zipcode fields, reorders the fields and formats to json and
    csv files.

    :param graph: optional existing graph to add the nodes to; a new
        ``bonobo.Graph`` is created only when this is ``None``.
    :param _limit: nodes inserted right after the reader (wrapped in
        ``PartialGraph``).
    :param _print: nodes appended at the end of the producer chain (wrapped
        in ``PartialGraph``).
    :return: the graph the nodes were added to
    """
    # Compare against None explicitly: bonobo.Graph supports len(), so an
    # empty graph passed by the caller is falsy and ``graph or Graph()``
    # would silently replace it with a brand new one.
    if graph is None:
        graph = bonobo.Graph()

    producer = (
        graph.get_cursor()
        >> ODSReader(dataset="liste-des-cafes-a-un-euro",
                     netloc="opendata.paris.fr")
        >> PartialGraph(*_limit)
        >> bonobo.UnpackItems(0)
        >> bonobo.Rename(
            name="nom_du_cafe", address="adresse", zipcode="arrondissement")
        >> bonobo.Format(city="Paris", country="France")
        >> bonobo.OrderFields([
            "name", "address", "zipcode", "city", "country", "geometry",
            "geoloc"
        ])
        >> PartialGraph(*_print)
    )

    # Comma separated values.
    graph.get_cursor(producer.output) >> bonobo.CsvWriter(
        "coffeeshops.csv",
        fields=["name", "address", "zipcode", "city"],
        delimiter=",")

    # Standard JSON
    graph.get_cursor(
        producer.output) >> bonobo.JsonWriter(path="coffeeshops.json")

    # Line-delimited JSON
    graph.get_cursor(
        producer.output) >> bonobo.LdjsonWriter(path="coffeeshops.ldjson")

    return graph
def get_raw_threads_graph(day):
    """Build the graph that fetches one day of messages and stores raw threads.

    Messages are fetched for the range from ``day`` to ``day`` plus one
    day, then passed through ``remove_invalid_messages``,
    ``process_channel_message`` and ``add_thread_to_message`` before
    reaching ``db.JsonRawThreadsWriter(day)``.

    :param day: start of the range; must support adding a ``timedelta``
    :return: bonobo.Graph
    """
    pipeline = bonobo.Graph()
    next_day = day + datetime.timedelta(days=1)
    pipeline.add_chain(
        ChannelsSource(day),
        MessagesFetcher(day, next_day),
        remove_invalid_messages,
        process_channel_message,
        add_thread_to_message,
        db.JsonRawThreadsWriter(day),
    )
    return pipeline
def build_graph(self):
    """Return a graph chaining ``generate_data`` -> ``uppercase`` -> ``output``."""
    steps = (self.generate_data, self.uppercase, self.output)
    return bonobo.Graph(*steps)
def __init__(self):
    """Initialize the command with a graph chaining ``read_recs`` into ``write_recs``."""
    steps = (self.read_recs, self.write_recs)
    self.graph = bonobo.Graph(*steps)
def get_graph(**options):
    """Return a graph that selects from ``table`` and upserts into ``table_1``.

    Both nodes use the ``sqlalchemy.pgengine`` service. ``options`` is
    accepted but not used.

    :return: bonobo.Graph
    """
    reader = bonobo_sqlalchemy.Select(
        'SELECT * FROM table',
        engine='sqlalchemy.pgengine',
    )
    writer = bonobo_sqlalchemy.InsertOrUpdate(
        table_name='table_1',
        engine='sqlalchemy.pgengine',
    )
    return bonobo.Graph(reader, writer)
def get_graph(**options):
    """
    Build the AWS accounts graph with JSON, CSV and database outputs.

    Chains added, in order:

    - ``main``: ``extract_accounts`` -> ``transform`` ->
      ``JsonWriter('aws_accounts_ex.json')`` -> ``valid_aws_account``;
    - ``JsonWriter('aws_accounts.json')`` with ``_input="main"``;
    - ``UnpackItems(0)`` -> ``CsvWriter('aws_accounts.csv')`` fed by
      ``valid_aws_account``;
    - ``UnpackItems(0)`` -> ``InsertOrUpdate`` into
      ``'aws_accounts' + options['table_suffix']``, fed by
      ``valid_aws_account``.

    :return: bonobo.Graph
    """
    pipeline = bonobo.Graph()

    pipeline.add_chain(
        extract_accounts,
        transform,
        bonobo.JsonWriter('aws_accounts_ex.json'),
        valid_aws_account,
        _name="main",
    )

    pipeline.add_chain(bonobo.JsonWriter('aws_accounts.json'), _input="main")

    pipeline.add_chain(
        bonobo.UnpackItems(0),
        bonobo.CsvWriter('aws_accounts.csv'),
        _input=valid_aws_account,
    )

    pipeline.add_chain(
        bonobo.UnpackItems(0),
        bonobo_sqlalchemy.InsertOrUpdate(
            table_name='aws_accounts' + options['table_suffix'],
            discriminant=('account_id', ),
            engine='db',
        ),
        _input=valid_aws_account,
    )

    return pipeline
def test_execution():
    """Run a three-chain graph to completion and check each collected output.

    ``extract_nt`` feeds both ``transform_using_args`` and
    ``transform_nt``; ``extract_bt`` feeds ``transform_using_args``. The
    two ``transform_using_args`` chains are expected to collect the same
    rows.
    """
    result_args, result_nt, result_bt = [], [], []

    graph = bonobo.Graph()
    graph.add_chain(extract_nt, transform_using_args, StoreInList(result_args))
    graph.add_chain(transform_nt, StoreInList(result_nt), _input=extract_nt)
    graph.add_chain(extract_bt, transform_using_args, StoreInList(result_bt))

    with GraphExecutionContext(graph) as context:
        context.run_until_complete()

    expected_via_args = [
        (2, "Guido", "guido.py"),
        (4, "Larry", "larry.pl"),
        (6, "Dennis", "dennis.c"),
        (8, "Yukihiro", "yukihiro.rb"),
    ]
    expected_nt = [
        (1, "GUIDO", ".py"),
        (2, "LARRY", ".pl"),
        (3, "DENNIS", ".c"),
        (4, "YUKIHIRO", ".rb"),
    ]

    assert result_args == expected_via_args
    assert result_nt == expected_nt
    assert result_bt == expected_via_args
def get_graph(**options):
    """Build a graph chaining ``extract_fablabs`` -> ``Limit(10)`` -> ``PrettyPrinter``.

    ``options`` is accepted but not used.

    :return: bonobo.Graph
    """
    steps = (
        extract_fablabs,
        bonobo.Limit(10),
        bonobo.PrettyPrinter(),
    )
    pipeline = bonobo.Graph()
    pipeline.add_chain(*steps)
    return pipeline
def get_graph_stg():
    """Build the staging graph: source qualifier -> ``t_transform`` -> load target.

    The source is configured from ``sources['staging_1']`` and the target
    from ``targets['target']``.

    :return: bonobo.Graph
    """
    pipeline = bonobo.Graph()
    reader = source.SourceQualifier(**sources['staging_1'])
    loader = target.LoadTarget(**targets['target'])
    pipeline.add_chain(reader, t_transform, loader)
    return pipeline
def get_graph(**options):
    """Build a graph chaining ``extract_fablabs`` -> ``Limit(10)`` -> ``write_repr_to_file``.

    ``options`` is accepted but not used.

    :return: bonobo.Graph
    """
    steps = (
        extract_fablabs,
        bonobo.Limit(10),
        write_repr_to_file,
    )
    pipeline = bonobo.Graph()
    pipeline.add_chain(*steps)
    return pipeline
def main():
    """Run the xlsx extract -> transform -> load-into-new-xlsx graph."""
    etl_steps = (
        extract_data_from_xlxs,
        transform_data,
        load_into_new_xlsx_file,
    )
    pipeline = bonobo.Graph(*etl_steps)
    bonobo.run(pipeline)
def test_run_graph_noop():
    """Running a one-node no-op graph returns a ``GraphExecutionContext``.

    ``bonobo._api._is_interactive_console`` is patched to report a
    non-interactive console for the duration of the run.
    """
    noop_graph = bonobo.Graph(bonobo.noop)
    assert len(noop_graph) == 1

    console_patch = patch('bonobo._api._is_interactive_console',
                          side_effect=lambda: False)
    with console_patch:
        outcome = bonobo.run(noop_graph)

    assert isinstance(outcome, GraphExecutionContext)
def get_graph():
    """Build a graph that passes the output of ``extract`` to ``print``.

    :return: bonobo.Graph
    """
    steps = (extract, print)
    pipeline = bonobo.Graph()
    pipeline.add_chain(*steps)
    return pipeline