def get_graph(**options):
    """Build a graph where two sources fan in to a shared normalize trunk.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()
    # Trunk: normalize feeds the file writer; _input=None leaves its head
    # open so the source chains below can attach via _output.
    graph.add_chain(normalize, write_repr_to_file, _input=None)
    # Fablabs source, capped at 5 rows, feeding the trunk.
    fablabs_cap = bonobo.Limit(5)
    graph.add_chain(extract_fablabs, fablabs_cap, _output=normalize)
    # JSON source, capped at 10 rows, also feeding the trunk.
    json_cap = bonobo.Limit(10)
    graph.add_chain(json_loader('data.json'), json_cap, _output=normalize, _name="loadjson")
    return graph
def get_graph_options(options):
    """Translate CLI options into graph keyword arguments.

    Pops "limit" and "print" from *options* (mutating it) and wraps each
    enabled one in a single-node tuple ready to be splatted into a chain.
    """
    row_cap = options.pop("limit", None)
    want_print = options.pop("print", False)
    graph_options = {}
    graph_options["_limit"] = (bonobo.Limit(row_cap),) if row_cap else ()
    graph_options["_print"] = (bonobo.PrettyPrinter(),) if want_print else ()
    return graph_options
def get_graph(**options):
    """Build a graph that pretty-prints at most 10 rows from getotherdata."""
    pipeline = (
        getotherdata,
        bonobo.Limit(10),
        bonobo.PrettyPrinter(),
    )
    graph = bonobo.Graph()
    graph.add_chain(*pipeline)
    return graph
def get_graph(**options):
    """Build a graph writing at most 10 fablab records through write_repr_to_file."""
    graph = bonobo.Graph()
    graph.add_chain(extract_fablabs, bonobo.Limit(10), write_repr_to_file)
    return graph
def test_limit_default():
    """Limit() with no argument should pass exactly 10 rows through."""
    context = MagicMock()
    results = []
    with ContextCurrifier(bonobo.Limit()).as_contextmanager(context) as stack:
        for _ in range(20):
            results.extend(stack())
    assert results == [NOT_MODIFIED] * 10
def get_graph_options(options):
    """Pop 'limit'/'print' from *options* and map them to optional chain nodes."""
    max_rows = options.pop('limit', None)
    pretty = options.pop('print', False)
    return {
        '_limit': (bonobo.Limit(max_rows),) if max_rows else (),
        '_print': (bonobo.PrettyPrinter(),) if pretty else (),
    }
def get_graph(**options):
    """Build a graph pretty-printing at most 10 fablab records."""
    graph = bonobo.Graph()
    nodes = [extract_fablabs, bonobo.Limit(10), bonobo.PrettyPrinter()]
    graph.add_chain(*nodes)
    return graph
def get_graph(**options): graph = bonobo.Graph() # # Import authors # graph.add_chain( # bonobo.CsvReader('data/ff-faculty.csv', skip=1), # bonobo.Limit(limit), # create_author_document, # FilterDuplicate(collection="jhu-authors", field='hopkins_id', target='hopkins_id', database=database), # MongoWriter(collection='jhu-authors', database=database), # ) # # # Retreive authors from scopus # graph.add_chain( # extract_author_scopus_ids, # bonobo.Limit(limit), # FilterDuplicate(collection='scopus-authors', database=database), # get_author, # MongoWriter(collection='scopus-authors', database=database), # _input=create_author_document, # ) # Retrieve documents from scopus graph.add_chain( bonobo.CsvReader('data/ff-article-ids-17.csv'), bonobo.Limit(limit), FilterDuplicate(collection='scopus-documents', database=database), get_document, # Keep errata data. Leave it to downstream analysis. Otherwise it'll be repeatedly downloaded and discarded. # remove_errata, MongoWriter(collection='scopus-documents', database=database), ) # Extract serials data from Scopus and load into MongoDB graph.add_chain(lambda args: args['coredata'].get('source-id', None), bonobo.Limit(limit), FilterDuplicate(collection='scopus-serials', database=database), get_serial, MongoWriter(collection='scopus-serials', database=database), _input=get_document) return graph
def get_graph_options(options):
    """Unpack CLI options into optional chain-fragment tuples, with debug logging."""
    logger.debug("Unpacking command line options %s.", options)
    row_cap = options.pop("limit", None)
    pretty = options.pop("print", False)
    graph_options = {
        "_limit": (bonobo.Limit(row_cap),) if row_cap else (),
        "_print": (bonobo.PrettyPrinter(),) if pretty else (),
    }
    logger.debug("Created graph options %s.", graph_options)
    return graph_options
def get_graph(**options):
    """
    This function builds the graph that needs to be executed.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()
    graph.add_chain(
        extract_fablabs,
        bonobo.Limit(10),
        bonobo.PrettyPrinter(),
    )
    return graph
def get_graph(**options):
    """Build the extract → transform → load graph, capped at 10 rows."""
    graph = bonobo.Graph()
    steps = [
        extract,
        bonobo.Limit(10),
        # bonobo.PrettyPrinter() can be re-inserted here for debugging.
        transform,
        load,
    ]
    graph.add_chain(*steps)
    return graph
def get_graph(*, _limit=None, _print=False):
    """Read theaters.json and fan out to JSON and LDJSON writers.

    :param _limit: optional row cap applied right after the reader.
    :param _print: when True, also pretty-print rows to stdout.
    """
    graph = bonobo.Graph()
    head = [bonobo.JsonReader('datasets/theaters.json')]
    if _limit:
        head.append(bonobo.Limit(_limit))
    trunk = graph.add_chain(*head)
    if _print:
        graph.add_chain(bonobo.PrettyPrinter(), _input=trunk.output)
    graph.add_chain(bonobo.JsonWriter('theaters.json', fs='fs.output'), _input=trunk.output)
    graph.add_chain(bonobo.LdjsonWriter('theaters.ldjson', fs='fs.output'), _input=trunk.output)
    return graph
def get_graph(**options):
    """Build the extract → transform → load graph.

    Deletes any report left by a previous run first so every execution
    starts from a clean slate, then caps the flow at 20 rows.

    :return: bonobo.Graph
    """
    # Fix: `os.path.isfile(...) is True` compared a bool to True with
    # identity — redundant and un-idiomatic; the bool is used directly.
    if os.path.isfile(reporte):
        os.remove(reporte)
    graph = bonobo.Graph()
    graph.add_chain(
        extract,
        # Cap how much data flows to the next link in the chain.
        bonobo.Limit(20),
        transform,
        load)
    return graph
def get_graph(*, _limit=None, _print=False):
    """Copy theaters.json from the static fs to the output fs as JSON and LDJSON.

    :param _limit: optional row cap applied right after the reader.
    :param _print: when True, also pretty-print rows to stdout.
    """
    graph = bonobo.Graph()
    reader = bonobo.JsonReader("theaters.json", fs="fs.static")
    capper = (bonobo.Limit(_limit),) if _limit else ()
    trunk = graph.add_chain(reader, *capper)
    if _print:
        graph.add_chain(bonobo.PrettyPrinter(), _input=trunk.output)
    graph.add_chain(bonobo.JsonWriter("theaters.output.json", fs="fs.output"), _input=trunk.output)
    graph.add_chain(bonobo.LdjsonWriter("theaters.output.ldjson", fs="fs.output"), _input=trunk.output)
    return graph
def handle(
    self,
    input_filename,
    output_filename,
    reader=None,
    reader_option=None,
    writer=None,
    writer_option=None,
    option=None,
    limit=None,
    transformation=None,
):
    """Run a reader → transformations → writer graph built from CLI arguments.

    An output filename of ``-`` pretty-prints to stdout instead of
    writing a file. Options in *option* apply to both reader and writer;
    *reader_option* / *writer_option* apply to one side only.
    """
    shared_opts = option or []

    reader_factory = default_registry.get_reader_factory_for(
        input_filename, format=reader)
    reader_kwargs = _resolve_options(shared_opts + (reader_option or []))

    if output_filename == '-':
        # stdout sink: pretty-print, no positional filename argument.
        writer_factory = bonobo.PrettyPrinter
        writer_args = ()
    else:
        writer_factory = default_registry.get_writer_factory_for(
            output_filename, format=writer)
        writer_args = (output_filename,)
    writer_kwargs = _resolve_options(shared_opts + (writer_option or []))

    # Middle of the chain: optional row cap, then user transformations.
    middle = (bonobo.Limit(limit),) if limit else ()
    middle += _resolve_transformations(transformation)

    graph = bonobo.Graph()
    graph.add_chain(
        reader_factory(input_filename, **reader_kwargs),
        *middle,
        writer_factory(*writer_args, **writer_kwargs),
    )

    return bonobo.run(graph, services={'fs': bonobo.open_fs()})
def get_graph(**options):
    """
    This function builds the graph that needs to be executed.

    Fetches documents for 2018 from Scopus (capped at 2), stores them in
    the ``document`` collection, then fans out from each stored document
    to fetch its authors and its serial (journal) record.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()
    graph.add_chain(
        get_docs_by_year(2018, False),
        extract_id,
        bonobo.Limit(2),
        FilterDuplicate(collection='document', field='_id', database='test'),
        get_document,
        MongoWriter(collection='document', database='test'))

    # Author
    graph.add_chain(
        get_authors_from_doc,
        FilterDuplicate(collection='author', field='@auid', database='test'),
        # Pull the bare author id out of each author record.
        lambda args: args['@auid'],
        get_author,
        MongoWriter(collection='author', database='test'),
        # bonobo.JsonWriter('results/authors.json'),
        _input=get_document)

    # Author Affiliation
    # graph.add_chain(
    #     get_author_affl,
    #     bonobo.UnpackItems(0),
    #     bonobo.CsvWriter('results/author-affl.csv'),
    #     _input=get_author
    # )
    #
    # # Affiliations - Skip. Instead, use the affiliation API to retrieve JHU affiliations
    # graph.add_chain(
    #     get_author_affl,
    #     FilterDuplicate(collection='affiliation', field='affiliation'),
    #     lambda args: args['affiliation'],
    #     get_affiliation,
    #     MongoWriter(collection='affiliation'),
    #     _input=get_author
    # )

    # Serial By ID
    # The lambda pulls the source (serial) id from each document's coredata.
    graph.add_chain(lambda args: args['coredata'].get('source-id', None),
                    FilterDuplicate(collection='serial', database='test'),
                    get_serial,
                    MongoWriter(collection='serial', database='test'),
                    _input=get_document)
    return graph
def get_graph(*, _limit=None, _print=False):
    """Pipe coffeeshops.csv through optional limit/printer into the output fs."""
    nodes = [bonobo.CsvReader("coffeeshops.csv")]
    if _limit:
        nodes.append(bonobo.Limit(_limit))
    if _print:
        nodes.append(bonobo.PrettyPrinter())
    nodes.append(bonobo.CsvWriter("coffeeshops.csv", fs="fs.output"))
    return bonobo.Graph(*nodes)
# NOTE(review): stray statement — appears to be the tail of a function
# defined before this chunk; `args` is not defined at module level here.
print(str(args))


def with_opened_file(self, context):
    # Context processor: open the output file once per run and share the
    # handle with every call of the decorated transformation.
    with open('output_csv.txt', 'w+') as f:
        yield f


# decorator is used here: every time we open the file, and append row to the
# existing rows, instead of overwriting it
# Or directly use load (not writing to file)
@use_context_processor(with_opened_file)
def write_repr_to_file(f, *row):
    # Append each row's repr on its own line using the shared handle.
    f.write(repr(row) + "\n")


# if we don't use decorator, only one record will be written (will over-write
# the old records)
def write_to_file_onetime(*row):
    with open('output_csv_trial.txt', 'w+') as f:
        f.write(repr(row) + "\n")


if __name__ == '__main__':
    # Pipeline: fetch prices, transform, cap at 20 rows, write to file.
    graph = bonobo.Graph()
    graph.add_chain(
        get_price,
        transform,
        bonobo.Limit(20),
        write_repr_to_file,
    )
    bonobo.run(graph)
data = pd.read_csv('train.csv', encoding='ISO-8859-1')

# construct category dictionary (map the category to index)
# NOTE: defaultdict() with no default_factory behaves like a plain dict
# (KeyError on missing keys); kept so the printed repr stays the same.
category_dict = defaultdict()
# Fix: replaced the `for i in range(len(...))` index loop with enumerate.
for index, category in enumerate(data.columns.tolist()):
    category_dict[category] = index
print(category_dict)

# construct the terms dictionary (get the dict from data descriptions file)
term_dictionary = getdict()
print(term_dictionary)

# write the terms dictionary into two-column csv
write_dict_to_csv()

# divide the table into 3 sub-tables
# TODO: The index needs to be changed according to the requirements
lotinfo_idx = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# build Bonobo pipeline
graph = bonobo.Graph()
graph.add_chain(
    extract,
    # the transform step will replace the abbr. with its full description
    transform,
    bonobo.Limit(100),
    write_repr_to_file,
)
bonobo.run(graph)
def get_graph(**options):
    """
    This function builds the graph that needs to be executed.

    Reads one AWS billing CSV per month (newest first) from S3, feeds
    them all into a shared "main" chain that writes raw copies, cleans
    and summarizes rows, and upserts the results into the historical
    cost fact table; parsed (pre-summary) rows also go to a raw billing
    table.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()
    graph.add_chain(
        bonobo.CsvWriter('billing.csv'),
        bonobo.JsonWriter('billing.json'),
        invalid_entries,
        fix_numbers,
        parse_dates,
        #bonobo.PrettyPrinter(),
        filter_summary,
        #bonobo.PrettyPrinter(),
        lookup_account_sk,
        lookup_date_sk,
        summarize_costs,
        bonobo.UnpackItems(0),
        bonobo_sqlalchemy.InsertOrUpdate(
            table_name='fact_itsm_aws_historical_cost' + options['table_suffix'],
            discriminant=(
                'productname',
                'date_sk',
                'account_name_sk',
            ),
            engine='database'),
        # Head left open (_input=None): the monthly readers below attach
        # to this chain by its name, "main".
        _name="main",
        _input=None,
    )

    now = options['now']
    # Go to beginning of month
    now += relativedelta(day=1, hour=0, minute=0, second=0, microsecond=0)

    when = now
    for log in range(0, options['months']):
        # Step back one month per iteration.
        when = when + relativedelta(months=-1)
        tstamp = when.strftime("%Y-%m")
        print("# %d Processing %s" % (log, tstamp))

        if options['limit']:
            _limit = (bonobo.Limit(options['limit']), )
        else:
            _limit = ()

        # One S3 reader per month, all feeding the shared "main" chain.
        graph.add_chain(
            AwsBillingReader('%s-aws-cost-allocation-%s.csv' % (options['aws_account_id'], tstamp), fs='s3', skip=1),
            *_limit,
            _output="main",
        )

    # Parsed rows (taken before summarization) also go to the raw
    # billing table.
    graph.add_chain(
        bonobo_sqlalchemy.InsertOrUpdate(
            table_name=options['table'] + options['table_suffix'],
            discriminant=('invoiceid', 'linkedaccountid', 'payeraccountid', 'recordid'),
            engine='database'),
        _input=parse_dates,
    )

    return graph
def get_graph(**options):
    """
    This function builds the graph that needs to be executed.

    Starting from a CSV of author names, resolves each author in Scopus,
    then fans out to fetch their documents, each document's serial
    (journal) record, and each document's co-authors, storing everything
    in MongoDB.

    NOTE(review): relies on module-level ``limit`` and ``database`` — not
    visible in this chunk; confirm where they are defined.

    :return: bonobo.Graph
    """
    graph = bonobo.Graph()

    # Read data from the CSV file and load into MongoDB
    graph.add_chain(
        bonobo.CsvReader('data/biophysics-author-names.csv'),
        bonobo.Limit(limit),
        get_author_by_name,
        create_author_document,
        MongoWriter(collection='jhu-authors', database=database),
    )

    # Extract authors from Scopus and load into MongoDB
    graph.add_chain(
        extract_authors,
        bonobo.Limit(limit),
        extract_id,
        FilterDuplicate(collection='scopus-authors', database=database),
        get_author,
        MongoWriter(collection='scopus-authors', database=database),
        _input=get_author_by_name
    )

    # Extract documents from Scopus and load into MongoDB
    graph.add_chain(
        extract_id,
        get_docs_by_author,
        bonobo.Limit(limit),
        extract_id,
        FilterDuplicate(collection='scopus-documents', field='_id', database=database),
        get_document,
        remove_errata,
        MongoWriter(collection='scopus-documents', database=database),
        _input=extract_authors
    )

    # Extract serials data from Scopus and load into MongoDB
    graph.add_chain(
        # Pull the source (serial) id from each document's coredata.
        lambda args: args['coredata'].get('source-id', None),
        bonobo.Limit(limit),
        FilterDuplicate(collection='scopus-serials', database=database),
        get_serial,
        MongoWriter(collection='scopus-serials', database=database),
        _input=remove_errata
    )

    # Extract co-authors data from Scopus and load into MongoDB
    graph.add_chain(
        get_authors_from_doc,
        bonobo.Limit(limit),
        FilterDuplicate(collection='scopus-authors', field='@auid', database=database),
        # Pull the bare author id out of each co-author record.
        lambda args: args['@auid'],
        get_author,
        MongoWriter(collection='scopus-authors', database=database),
        # bonobo.JsonWriter('results/authors.json'),
        _input=remove_errata
    )

    return graph
def get_graph(*, _limit=None, _print=False):
    """Read coffeeshops.txt as CSV and write coffeeshops.csv to the output fs."""
    chain = [bonobo.CsvReader('datasets/coffeeshops.txt')]
    if _limit:
        chain.append(bonobo.Limit(_limit))
    if _print:
        chain.append(bonobo.PrettyPrinter())
    chain.append(bonobo.CsvWriter('coffeeshops.csv', fs='fs.output'))
    return bonobo.Graph(*chain)
def test_limit():
    """Limit(2) should pass NOT_MODIFIED exactly twice, then go silent."""
    limiter = bonobo.Limit(2)
    results = [token for _ in range(42) for token in limiter()]
    assert results == [NOT_MODIFIED] * 2
def test_limit_not_there():
    """A limit larger than the row count never filters anything out."""
    limiter = bonobo.Limit(42)
    results = []
    for _ in range(10):
        results.extend(limiter())
    assert results == [NOT_MODIFIED] * 10
def get_graph():
    """Build the extract → transform → CSV + Postgres load graph."""
    graph = bonobo.Graph()
    graph.add_chain(
        extract,
        transform,
        load_csv,
        load_postgres,
        # NOTE(review): this limit sits AFTER both loaders, so it caps what
        # reaches the printer, not what gets loaded — confirm intent.
        bonobo.Limit(1000),
        bonobo.PrettyPrinter(),
    )
    return graph