def main(args):
    print(common.jsonpretty(args))

    cmd_options = {}
    cmd_options['preview_mode'] = bool(args.get('--preview'))
    cmd_options['group_target'] = bool(args.get('--group'))
    cmd_options['config'] = bool(args.get('--config'))
    cmd_options['list'] = bool(args.get('ls') or args.get('<command_target>') == 'ls')

    warp_home_dir = os.getcwd()  # default is the current directory
    warp_initfile = 'warp.ini'  # TODO: use a default constant instead of a magic string

    warp_config = None
    with open(warp_initfile) as f:
        warp_config = yaml.safe_load(f)
    if not warp_config:
        raise Exception('empty or unparseable warp.ini config file.')

    warp_home_dir = load_config_var(warp_config['globals']['warp_home'])
    warpfiles_dir = os.path.join(warp_home_dir, 'warpfiles')
    extensions_dir = os.path.join(warp_home_dir, 'extensions')

    loader = CommandLoader(warpfiles_dir)
    extension_mgr = ExtensionManager(warp_config)
    extension_mgr.bind_methods_to_class(WarpCLI)

    cli = WarpCLI(loader, extension_mgr)
    cli.cmdloop()
def write(self, records, **kwargs):
    for raw_record in records:
        record = json.loads(raw_record)
        customer_id = record['customerid']
        order_month = record['month']
        order_year = record['year']
        order_amount = record['order_amount']

        self.active_customers.add(customer_id)

        # update monthly customer orders
        key = (customer_id, order_month, order_year)
        if customer_id == 'cus_1683':  # debug tracing for a single customer
            print('recording order for %s in month %s and year %s. Amount: %s'
                  % (customer_id, order_month, order_year, order_amount))

        self.customer_orders.setdefault(key, []).append(order_amount)

        # calculate total revenue (note: keyed by month only, across all years)
        if order_month not in self.total_monthly_revenue_table:
            self.total_monthly_revenue_table[order_month] = order_amount
        else:
            self.total_monthly_revenue_table[order_month] += order_amount

    print(common.jsonpretty(self.total_monthly_revenue_table))
    print(self.customer_orders_before_month('cus_1683', 4, 2020, self.customer_orders))
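# Illustrative sketch (an assumption, not the source implementation) of the
# customer_orders_before_month() helper invoked above, assuming customer_orders
# maps (customer_id, month, year) tuples to lists of order amounts:
def customer_orders_before_month(customer_id, month, year, customer_orders):
    # gather every order amount recorded for this customer strictly before
    # the given (month, year)
    amounts = []
    for (cust_id, order_month, order_year), orders in customer_orders.items():
        if cust_id == customer_id and (order_year, order_month) < (year, month):
            amounts.extend(orders)
    return amounts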
def _send(self, msg_header, kafka_message, **kwargs):
    log.debug('writing kafka log message to db...')
    log.debug('### kafka_message keys: %s' % '\n'.join(kafka_message.keys()))

    outbound_record = {}
    fact_data = self._schema_mapping_context.get_fact_values(
        kafka_message.get('body'), persistence_manager=self._pmgr)

    print('### OLAP fact data:')
    print(common.jsonpretty(fact_data))

    insert_query_template = '''
    INSERT INTO {fact_table}
    ({field_names})
    VALUES ({data_placeholders});
    '''

    data_placeholder_segment = ', '.join(':%s' % name for name in fact_data.keys())

    print('### initial rendering of insert statement: ')
    iqtemplate_render = insert_query_template.format(
        fact_table=self._schema_mapping_context.fact.table_name,
        field_names=','.join(fact_data.keys()),
        data_placeholders=data_placeholder_segment)
    print(iqtemplate_render)

    insert_statement = text(iqtemplate_render)
    insert_statement = insert_statement.bindparams(**fact_data)

    #dbconnection = self._pmgr.database.engine.connect()
    result = self._dbconnection.execute(insert_statement)
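# Self-contained demo of the named-placeholder technique used in _send() above:
# render an INSERT with :name placeholders, then bind values via
# text().bindparams(). A sketch only; the table and column names are
# hypothetical, and it assumes SQLAlchemy with an in-memory SQLite engine.
from sqlalchemy import create_engine, text

engine = create_engine('sqlite:///:memory:')
with engine.connect() as conn:
    conn.execute(text('CREATE TABLE order_fact (customer_id TEXT, amount REAL)'))
    fact_data = {'customer_id': 'cus_0001', 'amount': 42.5}
    placeholders = ', '.join(':%s' % name for name in fact_data)
    stmt = text('INSERT INTO order_fact (%s) VALUES (%s)'
                % (', '.join(fact_data), placeholders))
    conn.execute(stmt.bindparams(**fact_data))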
def write(self, records, **kwargs):
    for rec in records:
        print('### ready to write record to Kinesis:')
        print('------------------------\n')
        print(common.jsonpretty(rec))
        status = self.kinesis_svc.write(rec, 'apollo_test_stream')
        print('Done with status: %s' % status)
def msg_handler(message, receipt_handle, service_registry):
    s3_svc = service_registry.lookup('s3')
    print('### Inside SQS message handler function.')
    print('### message follows:')
    print(common.jsonpretty(message))

    # unpack the SQS message to get the notification about the S3 file upload
    message_body_raw = message['Body']
    message_body = json.loads(message_body_raw)

    for record in message_body['Records']:
        s3_data = record.get('s3')
        if not s3_data:
            continue
        bucket_name = s3_data['bucket']['name']
        object_key = s3_data['object']['key']
        # TODO: set a limit on file size?
        print('#--- received object upload notification [ bucket: %s, key: %s ]'
              % (bucket_name, object_key))
        s3key = S3Key(bucket_name, object_key)
        jsondata = None
        try:
            jsondata = s3_svc.download_json(bucket_name, object_key)
            print('### JSON payload data:')
            print(common.jsonpretty(jsondata))

            # we use the name of the top-level S3 "folder" to select the action
            # to perform, by keying into the dispatch table
            channel_id = object_key.split('/')[0]
            handler = S3_EVENT_DISPATCH_TABLE.get(channel_id)
            if not handler:
                raise Exception('no handler registered for S3 upload events to bucket %s with key %s'
                                % (bucket_name, object_key))
            handler(service_registry, **jsondata)
        except Exception as err:
            print('Error handling JSON job data from URI %s.' % s3key.uri)
            print(err)
            traceback.print_exc(file=sys.stdout)
            return
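# Hypothetical wiring for the dispatch table msg_handler() keys into: the
# top-level S3 "folder" name is the channel ID, and each channel maps to a
# handler function. The channel name and handler below are illustrative only.
def handle_job_upload(service_registry, **job_data):
    print('handling uploaded job data: %s' % job_data)

S3_EVENT_DISPATCH_TABLE = {
    'jobs': handle_job_upload   # matches object keys like 'jobs/2020/05/job_0001.json'
}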
def helloathena_func(input_data, service_registry, **kwargs):
    athenasvc = service_registry.lookup('athena')
    db_service = service_registry.lookup('postgres')

    encoded_input_query = input_data['input_query']
    input_query = base64.b64decode(encoded_input_query).decode('utf-8')

    s3_output_filename = athenasvc.athena_to_s3(input_query, 8)
    if not s3_output_filename:
        return 'No result from query.'

    s3_svc = service_registry.lookup('s3')
    querydata = s3_svc.download_data(s3_output_filename)

    # this is CSV data, so the first line will be the header
    if querydata.find('\n') > -1:
        query_output_header = querydata.split('\n')[0]
    else:
        query_output_header = querydata

    query_response_fields = [
        token.strip('"') for token in query_output_header.split(',')
    ]

    # query the knowledgebase to get the fields in the test definition
    definition_fields = {}
    obs_def_id = input_data['observation_def_id']
    with db_service.txn_scope() as session:
        print(f'######## OBSERVATION DEF ID: {obs_def_id}')
        ObservationVerification = db_service.Base.classes.issue_mgmt_observationverification
        verification_query = session.query(ObservationVerification).filter(
            ObservationVerification.observation_definition_id == obs_def_id)
        results = verification_query.all()
        for record in results:
            definition_fields[record.key_name] = record.data_type

    print(common.jsonpretty(definition_fields))

    errors = []
    for fieldname in query_response_fields:
        if fieldname not in definition_fields:
            errors.append({
                'error_type': 'undefined_field',
                'error_key': 'field_name',
                'error_value': fieldname
            })

    response = {'ok': True, 'input_query': input_query}
    if len(errors):
        response['ok'] = False
        response['errors'] = errors
    return response
def do_update(self, cmd_args):
    '''Usage:
        update (map | project)
        update map <map_name>
        update project (globals | datasources)
    '''
    print(common.jsonpretty(cmd_args))
def award_job(self, bid_window_id, bidder_array, **kwargs):
    payload = {'window_id': bid_window_id, 'bids': bidder_array}
    print('PAYLOAD for calling /award endpoint:')
    print(common.jsonpretty(payload))
    response = self._call_endpoint(self.award, payload, **kwargs)
    return response
def main(args):
    src_file = args.get('<datafile>')
    null_mode = args.get('--null')
    readable_dict_mode = args.get('--readable_dict')
    readable_line_mode = args.get('--readable_line')

    with open(src_file) as f:
        first_line = f.readline()
    fields = first_line.split('|')

    nb_reporter = dmap.NullByteFilter(delimiter='|', field_names=fields)

    if null_mode:
        null_pairs = nb_reporter.filter_with_null_output(src_file)
        for null_pair in null_pairs:
            print(common.jsonpretty({
                'line_number': null_pair[0],
                'field': null_pair[1]
            }))
    elif readable_dict_mode:
        readable_lines = nb_reporter.filter_with_readable_output(src_file)
        for line in readable_lines:
            if line == first_line:
                continue
            record_dict = {}
            value_array = line.split('|')
            for r_index, field in enumerate(fields):
                record_dict[field] = value_array[r_index]
            print(common.jsonpretty(record_dict))
    elif readable_line_mode:
        proc = Dictionary2CSVProcessor(fields, '|', dmap.WhitespaceCleanupProcessor())
        readable_lines = nb_reporter.filter_with_readable_output(src_file)
        for line in readable_lines:
            if line == first_line:
                continue
            record_dict = {}
            value_array = line.split('|')
            for r_index, field in enumerate(fields):
                record_dict[field] = value_array[r_index]
            proc.process(record_dict)
    else:
        print('Choose an option flag for record info output')
def trigger_arbitration(service_registry, **kwargs):
    current_time = datetime.datetime.now()

    # scan ALL open bidding windows
    api_service = service_registry.lookup('job_mgr_api')
    response = api_service.get_open_bid_windows()
    bid_windows = response.json()['data']['bidding_windows']
    print('###----- Retrieved open bid windows from API endpoint:')
    print(bid_windows)

    # for each open window, see who has bid
    for bwindow in bid_windows:
        job_tag = bwindow['job_tag']
        window_id = bwindow['bidding_window_id']

        if bwindow['policy']['limit_type'] == 'num_bids':
            print('++ Policy limit is %d bids.' % int(bwindow['policy']['limit']))
            json_bidder_data = api_service.get_active_job_bids(job_tag)
            bidding_users = json_bidder_data.json()['data']['bidders']
            num_bids = len(bidding_users)
            policy_limit_bids = int(bwindow['policy']['limit'])

            if num_bids >= policy_limit_bids:
                winners = arbitrate(bidding_users, service_registry)
                if len(winners):
                    print('!!!!!!!!!!! WE HAVE A WINNER !!!!!!!!!!!!!!!!!!')
                    print(common.jsonpretty(winners))
                    api_service.award_job(window_id, winners)
                else:
                    print('### No winner determined in the arbitration round ending %s.'
                          % current_time.isoformat())

        elif bwindow['policy']['limit_type'] == 'time_seconds':
            # see how long the window has been open
            window_opened_at = dateutil.parser.parse(bwindow['open_ts'])
            window_open_duration = (current_time - window_opened_at).seconds
            policy_limit_seconds = int(bwindow['policy']['limit'])

            if window_open_duration >= policy_limit_seconds:
                json_bidder_data = api_service.get_active_job_bids(job_tag)
                bid_data = json_bidder_data.json()['data']['bidders']
                if len(bid_data):
                    winners = arbitrate(bid_data, service_registry)
                    if len(winners):
                        print('!!!!!!!!!!! WE HAVE A WINNER !!!!!!!!!!!!!!!!!!')
                        api_service.award_job(window_id, winners)
                    else:
                        print('### No winner determined in the arbitration round ending %s.'
                              % current_time.isoformat())
                else:
                    print('### No more bidders in this round.')
        else:
            # raise hell; we don't support that
            raise Exception('Unrecognized bidding window policy limit_type: %s'
                            % bwindow['policy']['limit_type'])
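# For reference, the bidding-window record shape implied by the loop above;
# the field values here are illustrative, not taken from a live system:
#
# {
#     "job_tag": "job_0001",
#     "bidding_window_id": 17,
#     "open_ts": "2020-05-01T12:00:00",
#     "policy": {
#         "limit_type": "num_bids",    # or "time_seconds"
#         "limit": "5"
#     }
# }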
def read_msg(self, input_data, **kwargs):
    self.log.info(common.jsonpretty(input_data))
    if input_data.get('Records'):
        self.log.info('### S3 bucket name: %s'
                      % input_data['Records'][0]['s3']['bucket']['name'])
        self.log.info('### new S3 object: %s'
                      % input_data['Records'][0]['s3']['object']['key'])
        return dict(input_data['Records'][0]['s3'])
    return {}
def arbitrate(bidder_list, service_registry):
    # Decide which bidder gets assigned a job, using a simple random selector.
    # This is only for the proof of concept; we will upgrade to smarter (and
    # user-pluggable) arbitration methods once we shake the system out.
    print('#####------- Arbitrating bid data:')
    print(common.jsonpretty(bidder_list))
    random.seed(time.time())
    index = random.randrange(0, len(bidder_list))
    return [bidder_list[index]]
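# Sketch of the "user-pluggable" upgrade path mentioned in the comments above:
# arbitration methods registered by name and selected at call time. The
# registry, the strategy names, and the function below are hypothetical.
import random

ARBITRATION_STRATEGIES = {
    'random': lambda bidders: [random.choice(bidders)],
    'first_come': lambda bidders: [bidders[0]]
}

def arbitrate_with_strategy(bidder_list, strategy_name='random'):
    # look up the registered strategy and let it pick the winner(s)
    return ARBITRATION_STRATEGIES[strategy_name](bidder_list)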
def profile(self, record_generator, service_registry, **kwargs):
    time_log = dmap.TimeLog()
    result_tuple = None
    operation_name = kwargs.get('op_name') or 'profile dataset "%s"' % self.table_name
    with jrnl.stopwatch(operation_name, time_log):
        result_tuple = self._profile(record_generator, service_registry, **kwargs)

    print(common.jsonpretty(time_log.readout))
    return result_tuple
def scrape_olr_condo_listings(soup_parse_tree, html_data):
    data = {
        'street_address': '',
        'neighborhood': '',
        'type': '',
        'tags': [],
        'price': '',
        'monthly_chg': '',
        'beds': None,
        'bathrooms': None,
        'square_footage': None,
        'size_description': ''
    }
    #print(html_data)

    detail_div = soup_parse_tree.find('div', {'class': 'apt_details_left'})

    # this one contains the address
    addr_span = detail_div.find_all('span', {'class': 'txt_gray'})[0]
    address = addr_span.find_all('a')[0].get_text().strip()
    data['street_address'] = unicodedata.normalize('NFKD', address)

    # these spans contain the other features of the listing
    feature_spans = detail_div.find_all('span', {'class': 'txt_black_normal'})

    raw_neighborhood_string = unicodedata.normalize('NFKD',
                                                    feature_spans[0].get_text().strip())
    print('### Raw neighborhood string: %s' % raw_neighborhood_string)
    tokens = [t.strip() for t in raw_neighborhood_string.split('\n')]
    print(tokens)
    data['neighborhood'] = tokens[0]
    data['tags'].extend([t.strip() for t in tokens[1].split('|')])

    raw_pricing_string = unicodedata.normalize('NFKD',
                                               feature_spans[1].get_text().strip())
    print('### Raw pricing string: %s' % raw_pricing_string)
    price_size_fields = [token.strip() for token in raw_pricing_string.split('\n')]
    print(price_size_fields)
    data.update(decode_olr_condo_coop_price_size_fields(price_size_fields))
    '''
    price_fields = [token.strip() for token in price_size_fields[0].split('|')]
    data['price'] = price_fields[0].split('$')[0]
    # skip over the price drop field for now
    size_fields = [token.strip() for token in price_size_fields[2].split('|')]
    data['size_description'] = size_fields[1]
    data['square_footage'] = size_fields[2]
    '''
    print(common.jsonpretty(data))
def main(args):
    if args['--list-codes']:
        print('Supported site codes:')
        print(common.jsonpretty(RE_SITE_CODES))
        print('\n')
        print('Supported neighborhood codes:')
        print(common.jsonpretty(BK_NEIGHBORHOOD_CODES))
        return

    site_code = args['<site-code>']
    if not RE_SITE_CODES.get(site_code):
        print('unrecognized site code "%s".' % site_code)
        print('valid codes:')
        print(common.jsonpretty(RE_SITE_CODES))
        return

    neighborhood_code = args['<neighborhood_code>']
    if not BK_NEIGHBORHOOD_CODES.get(neighborhood_code):
        print('unrecognized neighborhood code "%s".' % neighborhood_code)
        print('valid codes:')
        print(common.jsonpretty(BK_NEIGHBORHOOD_CODES))
        return

    url = RE_SITE_CODES[site_code]

    # create a new Firefox session
    #options = webdriver.FirefoxOptions()
    #options.add_argument('-headless')
    driver = webdriver.Firefox()
    driver.implicitly_wait(30)
    driver.get(url)

    # print('### Issuing search against %s for neighborhood %s...'
    #       % (url, BK_NEIGHBORHOOD_CODES[neighborhood_code]))
    if site_code == 'trulia':
        search_trulia(neighborhood_code, driver)
    elif site_code == 'olr':
        scrape_olr(neighborhood_code, driver)
def __init__(self, warp_yaml_cfg, **kwargs):
    self.registry = {}
    print(common.jsonpretty(warp_yaml_cfg))

    warp_home_dir = load_config_var(warp_yaml_cfg['globals']['warp_home'])
    extensions_dir = os.path.join(warp_home_dir, 'extensions')
    sys.path.append(extensions_dir)

    module_names = []
    should_load_all = False

    # load context from the specified modules
    #module_names = warp_yaml_cfg.get('extensions') or []
    module_names = []

    # otherwise load all modules
    '''
    else:
        should_load_all = True
        module_names = [f[0:-3] for f in os.listdir(extensions_dir) if f.endswith('.py')]
    '''

    for module_name in module_names:
        extensions = {}
        dirmod = __import__('extensions.%s' % module_name)
        extmod = getattr(dirmod, module_name)
        context_loader_function = getattr(extmod, '__load__')

        extension_args = {}
        for param in warp_yaml_cfg['extensions'][module_name]['init_params']:
            extension_args[param['name']] = param['value']

        print('### Extension params: %s' % extension_args)
        extension_context = context_loader_function(warp_home_dir, logger, **extension_args)

        function_names = [
            f[0] for f in getmembers(extmod)
            if isfunction(f[1]) and is_valid_extension_name(f[0])
        ]
        for raw_function_name in function_names:
            function_name = raw_function_name.lstrip('_')
            bound_method_name = '_'.join([CMD_METHOD_PREFIX, module_name, function_name])
            function_obj = getattr(extmod, raw_function_name)
            mx = MethodExtension(function_name, bound_method_name, function_obj)
            extensions[function_name] = mx

        self.registry[module_name] = extensions
def main(args):
    queue_url = args['<queue_url>']
    sendargs = {
        'QueueUrl': queue_url,
        'DelaySeconds': int(args.get('<delay>') or 0),  # docopt stores a missing arg as None
        'MessageAttributes': parse_attributes(args['--attrs'][0]),
        'MessageBody': args['<body>']
    }
    print(common.jsonpretty(sendargs))
    client = boto3.client('sqs')
    response = client.send_message(**sendargs)
    print(response)
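# parse_attributes() is not shown here, but whatever it does, boto3's
# send_message() expects MessageAttributes in this shape (the attribute name
# and values below are illustrative):
#
# {
#     'trace_id': {
#         'DataType': 'String',
#         'StringValue': 'abc-123'
#     }
# }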
def explore(parse_tree):
    print(dir(parse_tree))
    expression_buffer = []
    symbol_table = {}

    for node in parse_tree.iter_subtrees_topdown():
        if node.data == 'form':
            print('### start-of-form')
        if node.data == 'variable':
            symbol_table.update(resolve_var(node))
            print(f'### Symbol table updated: {resolve_var(node)}')
            #print(node.scan_values())
        if node.data == 'expression':
            print(f'### found an expression with {len(node.children)} child nodes.')
            for child in node.children:
                if child.data == 'name':
                    expression_buffer.append(child.children[0])
                if child.data == 'operator':
                    expression_buffer.append(child.children[0])
                if child.data == 'number':
                    expression_buffer.append(child.children[0])
                if child.data == 'expression':
                    continue

    print('### symbols:')
    print(common.jsonpretty(symbol_table))

    pylines = []
    local_lines = []  # renamed from "locals" to avoid shadowing the builtin
    for key, value in symbol_table.items():
        local_lines.append(f'{key} = {value}')

    #pylines.append(f'{" ".join(expression_buffer)})')
    pystmt = ' '.join(expression_buffer)
    print(pystmt)
    print('### can I get a witness?')
    print(eval(pystmt, {}, symbol_table))
def main(args):
    print(args)
    local_env = common.LocalEnvironment('PGSQL_USER', 'PGSQL_PASSWORD')
    local_env.init()
    pgsql_user = local_env.get_variable('PGSQL_USER')
    pgsql_password = local_env.get_variable('PGSQL_PASSWORD')

    yaml_config = common.read_config_file(args['<initfile>'])
    print(common.jsonpretty(yaml_config))

    db_host = yaml_config['globals']['database_host']
    db_name = yaml_config['globals']['database_name']
    pubsub = pgpubsub.connect(host=db_host,
                              user=pgsql_user,
                              password=pgsql_password,
                              database=db_name)

    channel_id = args['<channel>']
    if not yaml_config['channels'].get(channel_id):
        raise NoSuchEventChannel(channel_id)

    handler_module_name = yaml_config['globals']['handler_module']
    project_dir = common.load_config_var(yaml_config['globals']['project_dir'])
    sys.path.append(project_dir)
    handlers = __import__(handler_module_name)

    handler_function_name = yaml_config['channels'][channel_id]['handler_function']
    if not hasattr(handlers, handler_function_name):
        raise NoSuchEventHandler(handler_function_name, handler_module_name)

    handler_function = getattr(handlers, handler_function_name)
    service_objects = common.ServiceObjectRegistry(
        snap.initialize_services(yaml_config, logger))

    pubsub.listen(channel_id)
    print('listening on channel "%s"...' % channel_id)
    for event in pubsub.events():
        print(event.payload)
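# To exercise this listener, issue a NOTIFY from any Postgres session on the
# same channel, e.g. from psql (the channel name and payload are illustrative):
#
#   NOTIFY my_channel, '{"table": "instructors", "op": "insert"}';
#
# pgpubsub then yields the notification via pubsub.events(), and the payload
# string arrives as event.payload in the loop above.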
def sns_receive_func(input_data, service_objects, **kwargs):
    log.info(input_data)
    sns_message_raw = input_data['Message']
    log.info(sns_message_raw)
    sns_message = json.loads(sns_message_raw)
    print(common.jsonpretty(sns_message))

    s3_segment = sns_message['Records'][0]['s3']
    keyname = s3_segment['object']['key']

    s3_svc = service_objects.lookup('s3')
    file_loc = s3_svc.download_object('datalab.mercury', keyname)
    log.info('### Downloaded S3 object to %s.' % file_loc)
    return core.TransformStatus(json.dumps({}))
def write(self, records, **kwargs):
    db_svc = self.service_object_registry.lookup('redshift_svc')
    Listing = db_svc.Base.classes.grailed_listings

    for record in records:
        print('>>> placeholder Redshift data write operation:')
        db_record = json.loads(record)
        with db_svc.txn_scope() as session:
            listing = Listing()
            for key, value in db_record.items():
                # coerce stringified booleans back into real booleans
                if value == 'True':
                    setattr(listing, key, True)
                elif value == 'False':
                    setattr(listing, key, False)
                else:
                    setattr(listing, key, value)
            session.add(listing)
            session.commit()
        print(common.jsonpretty(json.loads(record)))
def _process(self, record):
    print(common.jsonpretty(record))
    return record
def default_event_handler(event, svc_registry):
    print(common.jsonpretty(json.loads(event.payload)))
def handle_instructors_insert(json_obj, svc_object_registry):
    print(common.jsonpretty(json_obj))
def handle_default_error(self, exception, source_record):
    print('Error of type "%s" transforming record: %s'
          % (exception.__class__.__name__, exception), file=sys.stderr)
    print('Offending record:', file=sys.stderr)
    print(common.jsonpretty(source_record), file=sys.stderr)
def main(args):
    print(common.jsonpretty(args))
def main(args):
    #print(common.jsonpretty(args))
    config_filename = args['<configfile>']
    yaml_config = common.read_config_file(config_filename)
    service_object_registry = common.ServiceObjectRegistry(
        snap.initialize_services(yaml_config))
    datastore_registry = DatastoreRegistry(
        initialize_datastores(yaml_config, service_object_registry))

    preview_mode = False
    if args['--preview']:
        preview_mode = True

    limit = -1
    if args.get('--limit') is not None:
        limit = int(args['--limit'])

    list_mode = False
    stream_input_mode = False
    file_input_mode = False

    available_ingest_targets = load_ingest_targets(yaml_config, datastore_registry)

    if args['--target'] and args['<datafile>'] is None:
        stream_input_mode = True
        ingest_target_name = args['<ingest_target>']
        ingest_target = lookup_ingest_target_by_name(ingest_target_name,
                                                     available_ingest_targets)
        buffer = initialize_record_buffer(ingest_target, datastore_registry)
        record_count = 0
        with checkpoint(buffer, interval=ingest_target.checkpoint_interval):
            while True:
                if record_count == limit:
                    break
                raw_line = sys.stdin.readline()
                line = raw_line.strip()
                if not len(line):
                    break
                if not preview_mode:
                    buffer.write(line)
                else:
                    print(line)
                record_count += 1

    elif args['<datafile>']:
        file_input_mode = True
        input_file = args['<datafile>']
        ingest_target_name = args['<ingest_target>']
        ingest_target = lookup_ingest_target_by_name(ingest_target_name,
                                                     available_ingest_targets)
        buffer = initialize_record_buffer(ingest_target, datastore_registry)
        record_count = 0
        with checkpoint(buffer, interval=ingest_target.checkpoint_interval):
            with open(input_file) as f:
                for line in f:
                    if record_count == limit:
                        break
                    if not preview_mode:
                        buffer.write(line)
                    else:
                        print(line)
                    record_count += 1

    elif args['--list']:
        if args['targets']:
            for target in yaml_config['ingest_targets']:
                print('::: Ingest target "%s": ' % target)
                print(common.jsonpretty(yaml_config['ingest_targets'][target]))
        if args['datastores']:
            for dstore in yaml_config['datastores']:
                print('::: Datastore alias "%s": ' % dstore)
                print(common.jsonpretty(yaml_config['datastores'][dstore]))
        if args['globals']:
            print('::: Global settings:')
            print(common.jsonpretty(yaml_config['globals']))
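# Sketch of a checkpoint() context manager consistent with how main() uses it
# above; this is an assumption about its contract (flush the buffer on exit),
# not the source implementation, and the flush() method is hypothetical.
from contextlib import contextmanager

@contextmanager
def checkpoint(record_buffer, interval):
    record_buffer.checkpoint_interval = interval  # hypothetical attribute
    try:
        yield record_buffer
    finally:
        record_buffer.flush()  # assumed flush-on-exit behavior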