def main():
    args = argparser.parse_args()
    LOGGER.info('Process Start', {
        'affiliate': args.affiliate,
        'month': args.month,
        'type': args.type
    })

    # 0. calculate time frame
    (year, month) = map(int, args.month.split("-"))
    start_date = datetime.date(year, month, 1)
    end_date = datetime.date(year, month, calendar.monthrange(year, month)[1])

    # 1. download
    if args.affiliate.upper() == "ALL":
        affiliates = os.listdir(os.path.abspath("affiliates"))
    else:
        affiliates = [args.affiliate + ".py"]

    data = downloader.get_data(affiliates, start_date, end_date, args.type)

    if len(data) > 0:
        # fix log id for coupons that carry a tracking id (5 chars) instead of a log id (9 chars)
        data['ipg:source'] = data.apply(
            lambda x: parse.fix_broken_log_id(x['ipg:source']), axis=1)

        # keep only old-system transactions
        data['old_system'] = data.apply(
            lambda x: ('ipg:sourceAffiliate' in data.columns and
                       x['ipg:sourceAffiliate'] == 'ipricegroup')
            or not parse.is_new_system_source(x['ipg:source']),
            axis=1)
        data = data[data['old_system'] == True]

        data = downloader.process_data(data)

        if args.type == 'transactions':
            LOGGER.info("Calculating basket size data")
            data = calculation.calc_basket_size(data)

        downloader.export_csv(data)

    LOGGER.info('Process Ended', {
        'affiliate': args.affiliate,
        'month': args.month,
        'type': args.type
    })
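
# The module-level `argparser` used by main() above is not shown. A minimal sketch
# of what the code assumes: the flag names and the "YYYY-MM" month format come from
# the usage above; the defaults and help strings are assumptions.
import argparse

argparser = argparse.ArgumentParser(description='Download and export affiliate data.')
argparser.add_argument('--affiliate', default='ALL',
                       help='affiliate module name (without .py), or ALL')
argparser.add_argument('--month', required=True,
                       help='month to process, in YYYY-MM format')
argparser.add_argument('--type', default='transactions',
                       help='type of data to download, e.g. transactions')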
def main(dataset_folder, temporal_folder, experiment):
    """Run the script.

    Args:
        dataset_folder: output folder for the converted dataset.
        temporal_folder: folder where the original dataset will be placed.
        experiment: tuple (<experiment name>, <sequence number>) specifying
            which experiment to download. We use sequences 1, 2, 3, 4 of the
            indoor_flying experiment.
    """
    if not _is_experiment_correctly_defined(experiment):
        raise ValueError('"experiments" are not correctly defined.')

    dataset_folder = os.path.abspath(dataset_folder)
    temporal_folder = os.path.abspath(temporal_folder)
    _make_if_does_not_exist(temporal_folder)
    _make_if_does_not_exist(dataset_folder)
    downloader.TMP_FOLDER = temporal_folder

    experiment_name, experiment_number = experiment
    paths = dataset_constants.experiment_paths(experiment_name,
                                               experiment_number,
                                               dataset_folder)
    dataset_constants.create_folders(paths)

    calibration_data = calibration.Calibration(experiment_name)

    data_path = downloader.get_data(experiment_name, experiment_number)[0]
    data_bag = bag_indexer.get_bag_indexer(data_path)

    gt_bag_path = downloader.get_ground_truth(experiment_name,
                                              experiment_number)[0]
    gt_bag = bag_indexer.get_bag_indexer(gt_bag_path)

    depth_topic_reader = gt_bag.get_topic_reader(TOPICS['depth'])
    focal_length_x_baseline = calibration_data.intrinsic_extrinsic['cam1'][
        'projection_matrix'][0][3]

    synchronization_timestamps = []
    for index, depth_message in enumerate(depth_topic_reader):
        depth_image, timestamp = _load_image_message(depth_message)
        disparity_image = _depth2disparity(depth_image,
                                           focal_length_x_baseline)
        disparity_path = paths['disparity_file'] % index
        cv2.imwrite(disparity_path, disparity_image)
        synchronization_timestamps.append(timestamp)

    np.savetxt(paths['timestamps_file'],
               np.array(synchronization_timestamps),
               fmt='%f',
               header="timestamp")

    distorted_to_rectified = {
        'cam0': calibration_data.left_map,
        'cam1': calibration_data.right_map
    }

    for camera in ['cam0', 'cam1']:
        rectified_to_distorted_x, rectified_to_distorted_y = \
            _rectification_map(calibration_data.intrinsic_extrinsic[camera])
        image_size = calibration_data.intrinsic_extrinsic[camera]['resolution']

        events_topic_reader = data_bag.get_topic_reader(
            TOPICS[camera]['events'])
        images_topic_reader = data_bag.get_topic_reader(
            TOPICS[camera]['image'])

        event_bags_timestamps = _get_bags_timestamps(events_topic_reader)
        image_bags_timestamps = _get_bags_timestamps(images_topic_reader)

        for synchronization_index, synchronization_timestamp in enumerate(
                synchronization_timestamps):
            synchronized_events = _get_synchronized_events(
                synchronization_timestamp, event_bags_timestamps,
                events_topic_reader)
            rectified_synchronized_events = _rectify_events(
                synchronized_events, distorted_to_rectified[camera],
                image_size)
            events_path = paths[camera]['event_file'] % synchronization_index
            np.save(events_path, np.array(rectified_synchronized_events))

            synchronized_image = _get_synchronized_image(
                synchronization_timestamp, image_bags_timestamps,
                images_topic_reader)
            rectified_synchronized_image = cv2.remap(synchronized_image,
                                                     rectified_to_distorted_x,
                                                     rectified_to_distorted_y,
                                                     cv2.INTER_LINEAR)
            image_path = paths[camera]['image_file'] % synchronization_index
            cv2.imwrite(image_path, rectified_synchronized_image)
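
# `_depth2disparity` is not shown above. A minimal sketch of what it likely does,
# using the standard stereo relation disparity = (focal_length * baseline) / depth;
# the invalid-pixel handling and output scaling are assumptions.
import numpy as np

def _depth2disparity(depth_image, focal_length_x_baseline):
    """Convert a metric depth map to a disparity map (assumed behaviour)."""
    depth = np.asarray(depth_image, dtype=np.float64)
    disparity = np.zeros_like(depth)
    valid = np.isfinite(depth) & (depth > 0)  # skip NaN / zero-depth pixels
    disparity[valid] = focal_length_x_baseline / depth[valid]
    return disparity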
###### polynomial trial
# A polynomial of degree 3 was tested and produced results
# driven only by the power-of-1 terms.
from downloader import get_data

df_dict = get_data()
listings = df_dict['listings']
calendar = df_dict['calendar']
reviews = df_dict['reviews']

from clean_data import clean_preprocess_listings

data = clean_preprocess_listings(listings)

## data downloaded, building the model
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

from functions import subtract_elem
from model_functions import Pipeline, sklearn_model, ml_model_setup

pipe1 = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('scaler', StandardScaler()),
    ('regressor', AdaBoostRegressor(n_estimators=200, loss='exponential')),
])
regressor = sklearn_model(model=pipe1)
model = ml_model_setup(data=None, model=regressor)

tmp = pd.get_dummies(
    data,
    columns=['neighbourhood_cleansed', 'bathrooms_type', 'room_type'],
    prefix=['neighbourhoods', 'bathroom', 'room_type'],
    prefix_sep='-')

outcomes = ['annual_earnings']
variables = subtract_elem(tmp.columns, outcomes)

model.set_inputs(data=tmp, outcomes=outcomes, variables=variables)
model.train()
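
# `subtract_elem` comes from the project's functions module; a minimal sketch of
# what its usage above implies (a list difference that preserves column order --
# the exact implementation is an assumption).
def subtract_elem(elements, to_remove):
    """Return `elements` with everything in `to_remove` dropped, keeping order."""
    removed = set(to_remove)
    return [elem for elem in elements if elem not in removed]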
#! /usr/bin/env ipython
'''Main file to execute the project.'''

print('Downloading the data')
# download the data into memory
from datetime import datetime

from downloader import get_data

city = "Boston"
results_path = "results/"

df_dict = get_data(city="Boston",
                   latest_by=datetime.today().strftime("%d-%B-%Y"))
listings = df_dict['listings']
calendar = df_dict['calendar']
reviews = df_dict['reviews']

print('Cleaning and preprocessing the data')
# clean & preprocess the data
from clean_data import clean_preprocess_listings

data = clean_preprocess_listings(listings)

# plot the results from the exploratory analysis
print('Plotting the results')

# results1/plot1: broad understanding of earnings & average price
from plot_functions import plot_graph
from functions import range_without_outliers

inputs = {
    'title': f'Annual earning across Airbnb listings in {city}',
    'kind': 'hist',
    'bins': 600,
    'x_label': 'Earning by a single listing',
def main():
    parser = argparse.ArgumentParser(
        description='Ingest Blue Planet data using the REST API. '
        'Lab username, password and id can also be set as environment variables.')
    parser.add_argument('--lab-host', '-lh',
                        help='Ciena Emulation Cloud, Lab Host')
    parser.add_argument('--lab-username', '-lu',
                        help='Ciena Emulation Cloud, Lab Username')
    parser.add_argument('--lab-password', '-lp',
                        help='Ciena Emulation Cloud, Lab Password')
    parser.add_argument('--lab-id', '-lid',
                        help='Ciena Emulation Cloud, Lab Id')
    parser.add_argument(
        '--data-type', '-dt',
        help='Type of data (alarms, user, device, connections) to pull from MCP',
        required=True,
        choices=PATH_MAP.keys())
    parser.add_argument('--recreate-table', '-r',
                        help='Recreate the table in the database.',
                        required=False,
                        default=True)
    parser.add_argument('--dest-db-type',
                        help='Destination database type (MySQL or Postgres)',
                        default='Postgres')
    parser.add_argument('--mysql-host', '-mh', help='MySQL host to store data')
    parser.add_argument('--mysql-username', '-mu', help='MySQL username')
    parser.add_argument('--mysql-password', '-mp', help='MySQL password')
    parser.add_argument('--mysql-db', '-db', help='MySQL DB')
    parser.add_argument('--mysql-port', '-p', help='MySQL port', default=3306)
    parser.add_argument('--postgres-host', '-ph',
                        help='Postgres host to store data')
    parser.add_argument('--postgres-username', '-pu', help='Postgres username')
    parser.add_argument('--postgres-password', help='Postgres password')
    parser.add_argument('--postgres-db', '-pdb', help='Postgres DB')
    parser.add_argument('--postgres-port', help='Postgres port', default=5432)
    parser.add_argument('--params',
                        default='{}',
                        type=valid_dict,
                        help='URL parameters for MCP APIs. '
                        'Accepts a valid dictionary as a string, '
                        'e.g. \'{"params": "1", "type": "mcp"}\'')
    args = parser.parse_args()

    host = get_environment_data("HOST", args.lab_host, DEFAULT_HOST)
    username = get_environment_data("USERNAME", args.lab_username, default="dev")
    password = get_environment_data("PASSWORD", args.lab_password)
    lab_id = get_environment_data("LABID", args.lab_id, default=254)
    host = "%s/%s" % (host, lab_id)

    # End-to-end data pipeline
    # 1. Authenticate:
    token = authorize_with_MCP(host, username, password)

    # 2. Get data:
    data = get_data(host,
                    get_path(args.data_type, params=args.params),
                    token,
                    key=get_data_key(args.data_type))

    if args.dest_db_type.lower() == 'mysql':
        import mysqlhelper

        # 3. Convert JSON to flat CSV
        data_file = mysqlhelper.write_data_to_file(data, args.data_type)

        mysql_host = get_environment_data("MYSQL_HOST", args.mysql_host,
                                          default='mcp_db')
        mysql_username = get_environment_data("MYSQL_USERNAME",
                                              args.mysql_username,
                                              default="limpid")
        mysql_password = get_environment_data("MYSQL_PASSWORD",
                                              args.mysql_password)
        mysql_db = get_environment_data("MYSQL_DB", args.mysql_db,
                                        default=MYSQL_DB)
        mysql_port = get_environment_data("MYSQL_PORT", args.mysql_port,
                                          default=3306)

        # 4. Recreate the table in the database
        if args.recreate_table:
            mysqlhelper.recreate_table(args.data_type, mysql_host,
                                       mysql_username, mysql_password,
                                       mysql_db, mysql_port)

        mysqlhelper.load_data_in_db(args.data_type, data_file, mysql_host,
                                    mysql_username, mysql_password, mysql_db,
                                    mysql_port)
    elif args.dest_db_type.lower() == 'postgres':
        import postgreshelper

        # 3. Convert JSON to JSONL
        data_file = postgreshelper.write_data_to_file(data, args.data_type)

        postgres_host = get_environment_data("POSTGRES_HOST",
                                             args.postgres_host,
                                             default='mcp_db_pg')
        postgres_username = get_environment_data("POSTGRES_USERNAME",
                                                 args.postgres_username,
                                                 default="limpid")
        postgres_password = get_environment_data("POSTGRES_PASSWORD",
                                                 args.postgres_password)
        postgres_db = get_environment_data("POSTGRES_DB", args.postgres_db,
                                           default=POSTGRES_DB)
        postgres_port = get_environment_data("POSTGRES_PORT",
                                             args.postgres_port,
                                             default=5432)

        # 4. Recreate the table in the database
        if args.recreate_table:
            postgreshelper.recreate_table(args.data_type, postgres_host,
                                          postgres_username, postgres_password,
                                          postgres_db, postgres_port)

        postgreshelper.load_data_in_db(args.data_type, data_file,
                                       postgres_host, postgres_username,
                                       postgres_password, postgres_db,
                                       postgres_port)
    else:
        raise ValueError(
            "%s is not supported currently. Supported databases are MySQL and Postgres."
            % args.dest_db_type)
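
# `valid_dict` (used as the argparse type for --params) is defined elsewhere; a
# minimal sketch of what that usage implies. Parsing via json.loads is an
# assumption; any dict parser with the same error behaviour would do.
import argparse
import json

def valid_dict(value):
    """Parse a dictionary passed as a string, e.g. '{"params": "1"}'."""
    try:
        parsed = json.loads(value)
    except ValueError as error:
        raise argparse.ArgumentTypeError("not a valid dictionary: %s" % error)
    if not isinstance(parsed, dict):
        raise argparse.ArgumentTypeError("%r is not a dictionary" % value)
    return parsed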
def main(argv):
    args = argparser.parse_args()
    LOGGER.info('Sync Process Starts', {
        'month': args.month,
        'affiliate': args.affiliate
    })

    # 0. calculate time frame
    (year, month) = map(int, args.month.split("-"))
    start_date = datetime.date(year, month, 1)
    end_date = datetime.date(year, month, calendar.monthrange(year, month)[1])

    if args.affiliate.upper() == "ALL":
        affiliates = os.listdir(os.path.abspath("affiliates"))
        affiliates.remove('ipricegroup.py')
    else:
        affiliates = [args.affiliate + ".py"]

    dry_run = args.dry_run == 'True'
    testing = args.testing != 'False'

    data = downloader.get_data(affiliates, start_date, end_date, 'transactions')

    if len(data) > 0:
        # fix log id for coupons that carry a tracking id (5 chars) instead of a log id (9 chars)
        data['ipg:source'] = data.apply(
            lambda x: parse.fix_broken_log_id(x['ipg:source']), axis=1)

        # keep only new-system transactions
        data['new_system'] = data.apply(
            lambda x: parse.is_new_system_source(x['ipg:source']), axis=1)
        data = data[data['new_system'] == True]

        if len(data) > 0:
            # enrich information from the log database
            data['ipg:logId'] = data.apply(
                lambda x: parse.detect_log_id(x['ipg:source']), axis=1)
            LOGGER.info("Downloading Sync Log data")
            data = log.get_sync_log_info(data)

            ipricegroup = IpriceGroup()
            existing_conversions = ipricegroup.get_existing_conversions(
                start_date, end_date)

            count = {'created': 0, 'updated': 0, 'processed': 0}
            for index, row in data.iterrows():
                count['processed'] += 1
                conversion_unique_id = 'id:' + row['ipg:affiliateNetwork'] + \
                    '-' + str(row['ipg:originalConversionId'])
                if testing:
                    conversion_unique_id += '-testoffer'
                conversion_datetime = parse.get_datetime_from_timestamp(
                    row['ipg:timestamp'])
                existing_conversion = existing_conversions.get(
                    conversion_unique_id)
                # TODO: handle the case where calling the create/update API
                # repeatedly sometimes doesn't work; HO is not real-time.
                affiliate_id = parse.detect_aff_id(row['ipg:source'])
                try:
                    if existing_conversion is None:
                        fields = {
                            'ad_id': parse.detect_transaction_id(row['ipg:source']),
                            'advertiser_info': row['ipg:advertiserInfo'],
                            'affiliate_id': affiliate_id,
                            'affiliate_info1': 'testoffer' if testing else row['ipg:cc'],
                            'affiliate_info2': '',
                            'affiliate_info3': row['ipg:affCustom'],
                            'affiliate_info4': conversion_unique_id,
                            'affiliate_info5': 'createdByIprice',
                            'country_code': row['ipg:cc'],
                            'currency': row['ipg:currency'],
                            'datetime': conversion_datetime,
                            'ip': row['ipg:ip'],
                            'is_adjustment': '0',
                            'offer_id': parse.detect_offer_id(row['ipg:source']),
                            'payout': row['ipg:commission'],
                            'refer': row['ipg:exitUrl'],
                            'revenue': row['ipg:commission'],
                            'sale_amount': row['ipg:orderValue'],
                            'session_datetime': conversion_datetime,
                            'session_ip': row['ipg:ip'],
                            'source': row['ipg:source'],
                            'status': row['ipg:status'],
                            'user_agent': row['ipg:userAgent']
                        }
                        LOGGER.info(
                            "Create Conversion", {
                                'affiliate_id': affiliate_id,
                                'conversion_id': conversion_unique_id
                            })
                        if dry_run:
                            LOGGER.info("Create dry run fields", fields)
                        else:
                            ipricegroup.upsert_conversion(fields)
                            count['created'] += 1
                            data.loc[index, 'ipg:sync'] = 'created'
                    else:
                        fields = {
                            'ip': row['ipg:ip'],
                            'payout': row['ipg:commission'],
                            'refer': row['ipg:exitUrl'],
                            'revenue': row['ipg:commission'],
                            'sale_amount': row['ipg:orderValue'],
                            'session_datetime': conversion_datetime,
                            'session_ip': row['ipg:ip'],
                            'status': row['ipg:status'],
                            'user_agent': row['ipg:userAgent']
                        }
                        if ipricegroup.is_updated_conversion(existing_conversion,
                                                             fields):
                            LOGGER.info(
                                "Update Conversion", {
                                    'affiliate_id': affiliate_id,
                                    'existing_conversion_id': existing_conversion['id'],
                                    'conversion_id': conversion_unique_id
                                })
                            if dry_run:
                                LOGGER.info("Update dry run fields", fields)
                            else:
                                # If these fields changed we cannot update the conversion,
                                # but we can log it to sync_failed twice:
                                # 1. the old conversion with negative payout, revenue & sale_amount
                                # 2. the new conversion with the new values
                                non_updatable_fields = {
                                    'currency': 'ipg:currency',
                                    'source': 'ipg:source',
                                }
                                for key, field in non_updatable_fields.items():
                                    if str(existing_conversion[key]) != str(row[field]):
                                        raise Exception(
                                            'change of ' + key + ' to ' +
                                            str(row[field]) +
                                            ' is not allowed for an existing conversion: '
                                            + json.dumps(existing_conversion))
                                ipricegroup.upsert_conversion(
                                    fields, existing_conversion['id'])
                                count['updated'] += 1
                                data.loc[index, 'ipg:sync'] = 'updated'
                except Exception as e:
                    data.loc[index, 'ipg:sync'] = 'failed'
                    LOGGER.error(str(e), {'affiliate_id': affiliate_id})

            LOGGER.info("Processed Conversion", {'count': count['processed']})
            LOGGER.info("Created Conversion", {'count': count['created']})
            LOGGER.info("Updated Conversion", {'count': count['updated']})

            data = downloader.process_data(data)
            downloader.export_csv(data)

    LOGGER.info('Sync Process Ended', {
        'month': args.month,
        'affiliate': args.affiliate
    })
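
# `IpriceGroup.is_updated_conversion` is not shown; a sketch of the comparison the
# loop above relies on (string-compare each updatable field against the existing
# conversion). The exact field normalisation is an assumption.
def is_updated_conversion(existing_conversion, fields):
    """Return True if any field value differs from the stored conversion."""
    return any(
        str(existing_conversion.get(key)) != str(value)
        for key, value in fields.items())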
def main():
    mode = input('Mode selection (nodata, update, db_init): ')

    flag_keyboard_interrupt = False
    engine, session = db.connect()
    # db_tickers = db.load_available_tickers(session)  # TODO
    db_tickers = []  # testing
    new_tickers_df = fio.us_equities('resources/generic_backup101518.json')
    # TODO: remove from the list according to the error csv
    # TODO: remove from the list according to the db

    if mode == 'nodata':
        start_date = pd.to_datetime(params.START_DATE)
        for _, ticker_row in new_tickers_df.iterrows():  # loop over all tickers
            metadata = ticker_row.to_dict()
            print('Debugging - printing metadata')  # debugging
            print(metadata)  # debugging
            if flag_keyboard_interrupt:  # interruption logic
                print('Keyboard interrupted - exiting...')
                break
            if ticker_row['ticker'] not in db_tickers:  # skip existing tickers
                try:
                    data = dl.get_data(ticker_row['ticker'],
                                       start_date=start_date)
                    print(f"Inserting {ticker_row['ticker']} into db")  # debugging
                    db.insert_data(session, data, metadata=metadata)
                    session.commit()
                except KeyboardInterrupt:
                    flag_keyboard_interrupt = True
                # except Exception:  # can't debug if errors don't show
                #     fio.append_error_csv(ticker_row)  # TODO
    elif mode == 'update':
        for ticker in db_tickers:
            if flag_keyboard_interrupt:  # ctrl-c break
                break
            last_date = db.last_data_date(ticker)  # TODO
            if last_date < pd.Timestamp.today() - pd.Timedelta(1, 'D'):
                try:
                    data = dl.get_data(ticker, last_date=last_date)
                    db.insert_table(session, data)  # TODO
                except KeyboardInterrupt:
                    flag_keyboard_interrupt = True
    elif mode == 'db_init':
        verify = input('You are about to delete everything '
                       'in the finance_data database (y/n): ')
        if verify == 'y':
            base.metadata.create_all(engine)  # create all tables
            fio.setup_error_csv(verify=True)
    else:
        print('Not a valid mode')
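
# `fio.setup_error_csv` / `fio.append_error_csv` are referenced above but not
# shown; a minimal sketch of what the TODOs imply (a csv of tickers that failed
# to download). The ERROR_CSV_PATH constant and the column layout are assumptions.
import csv
import os

ERROR_CSV_PATH = 'resources/download_errors.csv'  # hypothetical location

def setup_error_csv(verify=False):
    """(Re)create the error csv with a header row."""
    if verify or not os.path.exists(ERROR_CSV_PATH):
        with open(ERROR_CSV_PATH, 'w', newline='') as handle:
            csv.writer(handle).writerow(['ticker'])

def append_error_csv(ticker_row):
    """Append the ticker that failed so it can be skipped on the next run."""
    with open(ERROR_CSV_PATH, 'a', newline='') as handle:
        csv.writer(handle).writerow([ticker_row['ticker']])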