def post(self):
    data = request.json
    logger.info(f'Register Post data {data} [{type(data)}]')
    if data is None:
        return {'status': 'fail', 'message': 'No data passed'}, 400
    try:
        user = User.query.filter(
            or_(User.email == data.get('email'),
                User.username == data.get('username'))).first()
        if user:
            logger.info(f"Register found pre-existing User: {user}")
            return {
                'status': 'fail',
                'message': 'Username/Email already exists!'
            }, 401
        user = User(
            email=data.get('email'),
            username=data.get('username'),
            password=data.get('password'),
        )
        db.session.add(user)
        db.session.commit()
        auth_token = user.encode_auth_token(user.id)
        return {
            'status': 'success',
            'message': 'Successfully registered',
            'auth_token': auth_token.decode()
        }, 201
    except Exception as e:
        logger.error(e)
        return {
            'status': 'fail',
            'message': 'An error has occurred',
        }, 401
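# A minimal client-side sketch of the register endpoint above, assuming the
# resource is mounted at /register; the base URL and credentials below are
# hypothetical placeholders, not taken from the project.
import requests

resp = requests.post(
    'http://localhost:5000/register',
    json={'email': 'user@example.com', 'username': 'user', 'password': 'hunter2'},
)
print(resp.status_code, resp.json())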
def parse(self, response):
    logger.info(f'parsed url {response.url}')
    # SELENIUM DRIVER
    # self.driver.get(response.url)
    # self.driver.implicitly_wait(3)
    # Append data
    self.store = self.store.append({'c': 0, 'd': 2}, ignore_index=True)
def init_db(conn: Connection) -> None:
    """
    Configures the target schema in which the tweets data will be stored,
    creating the schema and the table if they do not exist yet.

    :param conn: SQLAlchemy connection object
    """
    logger.info(f"{Fore.YELLOW}Initializing database ...{Style.RESET_ALL}")
    # Create the specified schema if it does not exist
    if not conn.dialect.has_schema(conn, schema_name):
        logger.info(
            f"{Fore.YELLOW}Schema {schema_name} does not exist, creating it ...{Style.RESET_ALL}"
        )
        conn.execute(schema.CreateSchema(schema_name))
        logger.info(
            f"{Fore.GREEN}Schema {schema_name} successfully created!{Style.RESET_ALL}"
        )
    else:
        logger.info(
            f"{Fore.GREEN}Schema {schema_name} was found, continuing database initialization "
            f"...{Style.RESET_ALL}")
    # Create tables
    Base.metadata.create_all(bind=conn)
    logger.info(
        f"{Fore.GREEN}Schema {schema_name} successfully configured!{Style.RESET_ALL}"
    )
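# A minimal usage sketch for init_db, assuming a PostgreSQL engine (the
# dialect-level has_schema check above is PostgreSQL-specific), SQLAlchemy 1.x
# connection semantics, and that schema_name, Base and logger are defined at
# module level as in the function; the connection URL is a hypothetical placeholder.
from sqlalchemy import create_engine

engine = create_engine('postgresql://user:password@localhost:5432/tweets')
with engine.connect() as conn:
    init_db(conn)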
def __init__(self, USER, HOST, DATABASE, PASSWORD):
    self.USER = USER
    self.HOST = HOST
    self.DATABASE = DATABASE
    self.PASSWORD = PASSWORD
    try:
        conn = mysql.connector.connect(
            user=USER,
            host=HOST,
            database=DATABASE,
            password=PASSWORD,
        )
        self.conn = conn
        logger.info('database connection success')
        print('Connection success')
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            logger.error('something is wrong with the username or password')
            print('something is wrong with the username or password')
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            logger.error('database does not exist')
            print('database does not exist')
        else:
            logger.error(err)
            print(err)
def csv_load_to_db(filename, destination_folder, connection):
    """
    Parse a csv file and execute queries to load it into the database.

    Args:
        1. filename = name of the csv file
        2. destination_folder = directory of the downloaded files
        3. connection = connection(USER, HOST, DATABASE, PASSWORD)
    """
    csv_file = open(destination_folder + filename)
    count_header, count_row = 0, 0
    for row in islice(csv_file, 50001):
        # for row in islice(fl, 1000):
        if count_header < 1:
            columns = row.rstrip().split(',')
            count_header += 1
            print(columns)
        else:
            val = row.rstrip().split(',')
            dt1 = datetime.strptime(val[5], '%m/%d/%Y').date()
            dt2 = datetime.strptime(val[7], '%m/%d/%Y').date()
            val[5] = dt1
            val[7] = dt2
            count_row += 1
            # print(val)
            # This is the part where we take the parsed csv row and insert it
            # into the target database. We do not write new connection code to
            # execute the query; instead we call methods from the connection
            # module to do the job. Since this is called from main.py there is
            # no need to import the connection module here.
            params = val
            insert_sql = '''INSERT INTO sales ({},{}, `{}`, `{}`, `{}`, `{}`, `{}`, `{}`,`{}`, `{}`, `{}`, `{}`, `{}`, `{}`)
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''.format(*columns)
            # Iterable unpacking: * unpacks a list/tuple, ** unpacks a dict
            os.system('clear')
            print('inserting data to database')
            connection.execute_query(insert_sql, params)
    connection.commit()
    logger.info("{} rows loaded into database".format(count_row))
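# A minimal usage sketch for csv_load_to_db, assuming `connection` is the
# mysql helper class whose __init__ is shown above and that it exposes the
# execute_query()/commit() methods called here; credentials, folder and file
# name are hypothetical placeholders.
conn = connection('root', 'localhost', 'sales_records', 'secret')
csv_load_to_db('sales.csv', '/tmp/downloads/', conn)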
def __init__(self, **kwargs):
    # instantiating the chrome selenium driver; start urls feature an 'infinite scroll'
    # behavior that needs to be triggered to view all listed shoes
    self.store = kwargs.get('store', pd.DataFrame())
    self.file_path = kwargs.get(
        'file_path',
        path.join(
            project_path,
            f"results/{self.__class__.__name__}-{datetime.now().strftime('%y-%m-%d-%H-%M')}"
        ))
    if kwargs.get('use_selenium'):
        self.driver = get_driver()
    logger.info(
        f'{TemplateSelSpider.name} init, store: {self.store}, file_path: {self.file_path}'
    )
    super().__init__()
def post(self): data = request.json logger.info(f"Like post data[{type(data)}]: {data}") like = dbLike.query.filter( dbLike.user_id == data['user_id'], dbLike.post_id == data['post_id'], ).first() if not like: like = dbLike(user_id=data['user_id'], post_id=data['post_id'], value=data['value']) else: like.value = data['value'] db.session.add(like) db.session.commit() return { 'status': 'success', 'message': 'post retrieved', }, 200
def get(self):
    data = request.args
    logger.info(f"Post get data[{type(data)}]: {data}")
    try:
        if 'id' in data:
            logger.info('REQUEST BY ID')
            posts = [dbPost.query.get(data['id']), ]
        elif 'feed' in data:
            feed_query = json.loads(data['feed'])
            posts = [post for _, post in dbUserFollow.query.filter(
                dbUserFollow.follower_id == feed_query['userId']
            ).join(
                dbPost, dbUserFollow.followed_id == dbPost.author_id
            ).add_entity(
                dbPost
            ).all()]
        else:
            posts = dbPost.query.join(
                dbUser, dbUser.id == dbPost.author_id
            ).filter_by(
                # **{'id': data.get('id')}
            ).add_entity(
                dbUser
            )
            if data:
                posts = posts.filter_by(
                    **{'id': data.get('author_id')}
                )
            posts = [post for post, user in posts.all()]
        posts = [post.resp_dict() for post in posts]
        return {
            'status': 'success',
            'message': 'post retrieved',
            'posts': posts
        }, 200
    except Exception as e:
        logger.error(e)
        return {
            'status': 'fail',
            'message': 'An error has occurred',
        }, 401
def post(self):
    data = request.json
    logger.info(f'request data [{type(data)}]: {data}')
    email_body = f"FREELANCE SITE INQUIRY:\n" \
                 f"name: {data['name']}\n" \
                 f"email: {data['email']}\n" + \
                 "_" * 12 + \
                 f"\n{data['message']}"
    msg = Message(
        subject="Freelance Site Inquiry",
        body=email_body,
        sender=data['email'],
        recipients=["*****@*****.**"]
    )
    mail.send(msg)
    response = {
        'status': 200,
        'body': 'Success',
    }
    logger.info(f'response: {response}')
    return response
def train(experiment, parameters, method, joint, run_on_test=False):
    experiment_summary = get_experiment_summary(parameters.embedding_size,
                                                parameters.neg_ratio,
                                                parameters.batch_size,
                                                parameters.learning_rate,
                                                parameters.loss_ratio,
                                                parameters.type_ratios)
    module = create_module(experiment, method, joint, parameters)
    logger.info(" \n------------------ \n")
    logger.info("Running model with:\n")
    logger.info(experiment_summary)
    trainer = Trainer(module)
    valid_result = trainer.fit(experiment, parameters)
    print(experiment_summary)
    print_results('validation set', valid_result)
    if run_on_test:
        test_results = trainer.test(experiment, parameters.checkpoint_file)
        print_results('test set', test_results)
    else:
        test_results = None
    return valid_result, test_results
def check_dataset(conn):
    '''
    If the dataset does not exist or needs updating, execute the extraction
    function and return ``str`` 'ready'; otherwise return ``str`` 'up to date'.
    '''
    client = bq.Client()
    rows = client.query('select count(*) from project_four.sales')
    mysql_total_rows = check_database(conn)
    try:
        if mysql_total_rows is None:
            print('something wrong with database')
        else:
            for item in rows:
                dataset_total_rows = item[0]
                if dataset_total_rows == mysql_total_rows:
                    print('bigquery is up to date')
                    logger.info(
                        'bigquery is up to date, extraction process is skipped'
                    )
                    return 'up to date'
                elif dataset_total_rows < mysql_total_rows:
                    print('bq dataset is not up to date with mysql')
                    logger.info(
                        'bq dataset is not up to date with mysql, begin extraction'
                    )
                    mysql_to_pq(conn)
                    return 'ready'
    except GoogleAPIError:
        # the query fails with reason 'notFound' when the dataset/table does not exist yet
        print(rows.error_result)
        logger.info('{}, new dataset created and begin extraction'.format(
            rows.error_result))
        mysql_to_pq(conn)
        return 'ready'
def post(self):
    auth_header = request.headers.get('Authorization')
    if auth_header:
        auth_token = auth_header.split(" ")[1]
        resp = User.decode_auth_token(auth_token)
        if isinstance(resp, str):
            return {'status': 'fail', 'message': resp}, 401
        else:
            data = request.json
            logger.info(f"Comment post data[{type(data)}]: {data}")
            if data is None:
                return {'status': 'fail', 'message': 'No data passed'}, 400
            else:
                try:
                    post_id = data['postId']
                    author_id = data['authorId']
                    comment = data['comment']
                    post = dbPost.query.filter(dbPost.id == post_id).first()
                    new_comment = dbComment(post_id=post_id,
                                            author_id=author_id,
                                            body=comment)
                    post.comments.append(new_comment)
                    db.session.add(new_comment)
                    db.session.add(post)
                    db.session.commit()
                    return {
                        'status': 'success',
                        'message': 'comment submitted',
                    }, 200
                except Exception as e:
                    logger.error(e)
                    return {
                        'status': 'fail',
                        'message': 'An error has occurred',
                    }, 401
    else:
        return {'status': 'fail', 'message': 'Invalid auth provided'}, 401
def post(self): data = request.form logger.info(f"/user post data[{type(data)}]: {data}") user = dbUser.query.filter( dbUser.id == data['userId'] ).first() if 'aviFile' in request.files.keys(): avi_file = request.files['aviFile'] avi_s3_name = f"user-{data['userId']}-avi" s3.upload_file( avi_file, s3.bucket_name, object_name=avi_s3_name ) user.avi_s3_name = avi_s3_name if 'bio' in data.keys(): user.bio = data["bio"] db.session.add(user) db.session.commit() return { 'status': 'success', 'message': 'profile updated', 'user': user.resp_dict() }, 200
def post(self): data = request.form logger.info(f"Post post data[{type(data)}]: {data}") user_id = data['user_id'] s3_name = f"{user_id}-{data['title'].replace(' ', '-')}-{str(uuid.uuid4().hex)}.png" image_file = request.files['image'] # TODO: remove and change image upload to frontend? # Will have to deal with cloudfront at some point anyways... s3.upload_file( image_file, s3.bucket_name, object_name=s3_name ) # s3_url = f"https://{s3.bucket_name}.s3.amazonaws.com/{s3_name}" post = dbPost( author_id=user_id, title=data['title'], desc=data.get('description'), s3_name=s3_name, # s3_url=s3_url ) db.session.add(post) tags = request.form['tags'].split(',') for tag in tags: db_tag = dbTag.query.filter( dbTag.name == tag ).first() if not db_tag: db_tag = dbTag(name=tag) post.tags.append(db_tag) db.session.add(db_tag) db.session.commit() return { 'status': 'success', 'message': 'post uploaded', # TODO: implement to json function in post database class 'post': post.resp_dict() }, 200
def post(self):
    data = request.json
    logger.info(f'Login Post data {data} [{type(data)}]')
    if data is None:
        return {'status': 'fail', 'message': 'No data passed'}, 400
    try:
        # user = User.query.filter_by(
        #     email=data.get('email'),
        #     username=data.get('username')
        # ).first()
        user = User.query.filter(
            (User.username == data.get('username'))
            | (User.email == data.get('email'))).first()
        logger.info(f'Login Post user query result: {user}')
        if user and bcrypt.check_password_hash(user.password,
                                               data.get('password')):
            auth_token = user.encode_auth_token(user.id)
            if auth_token:
                return {
                    'status': 'success',
                    'message': 'Success',
                    'user': user.resp_dict(include_private=True),
                    # 'user': user.resp_dict(),
                    'auth_token': auth_token.decode()
                }, 200
        else:
            return {
                'status': 'fail',
                'message': 'Username or password is invalid!'
            }, 401
    except Exception as e:
        logger.error(e)
        return {
            'status': 'fail',
            'message': 'An error has occurred',
        }, 401
def get(self):
    authenticated = check_auth(request)
    try:
        data = dict(request.args)
        logger.info(f"User get data[{type(data)}]: {data}")
        user = dbUser.query.filter_by(**data).first()
        logger.warning(f'authenticated: {authenticated}')
        if isinstance(authenticated, int):
            same_user = authenticated == user.id
            return {
                'status': 'success',
                'user': user.resp_dict(include_private=same_user),
            }, 200
        else:
            return {
                'status': 'success',
                'user': user.resp_dict()
            }, 200
    except Exception as e:
        logger.error(e)
        return {
            'status': 'fail',
            'message': 'An error has occurred',
        }, 401
def main():
    '''
    Starts the harvest script
    '''
    logger.info('launching main')
    # tz = pytz.timezone('America/Los_Angeles')
    tz = pytz.timezone('EST')
    start = datetime.datetime.now(tz=tz)
    if config['use_proxy']:
        update_proxies()
    # launching crawlers
    store = Store()
    process = CrawlerProcess(custom_settings)
    for spider in spiders:
        logger.info(f'starting {spider.name}')
        process.crawl(spider, store=store)
    process.start()
    process.join()
    end = datetime.datetime.now(tz=tz)
    logger.info(f"runtime: {end - start}")
if __name__ == '__main__':
    with open('config.json', 'r') as json_file:
        config = json.load(json_file)
    # establish target database connection
    conn = mysql.connector.connect(
        user=config['mysql']['USER'],
        host=config['mysql']['HOST'],
        database=config['mysql']['DATABASE'],
        password=config['mysql']['PASSWORD'])
    job = check_dataset(conn)
    if job == 'ready':
        load.load_blob(bucket_name=func_param['bucket_name'],
                       destination_blob=func_param['dest_blob_transform'],
                       source_file_path=func_param['source_transform'])
        logger.info('parquet file successfully loaded into cloudstorage')
        load.pq_gcs_to_bigquery(uri='gs://' + func_param['bucket_name'] + '/' +
                                func_param['dest_blob_transform'],
                                dataset='project_four',
                                table_id='sales',
                                write_disposition='WRITE_APPEND')
        logger.info('dataset updated')
    else:
        logger.info('dataset is up to date')
def mysql_to_pq(conn,
                source_transform=func_param['source_transform'],
                name_of_dataset='project_four',
                by_row_batch=5):
    '''
    Extract the mysql database and save it into a local parquet file
    ``tmp/sales-date.pq``. This function takes the last row of the bq dataset
    and compares it against the current mysql database to avoid duplication,
    so only new data is extracted and loaded from mysql to bq. If the dataset
    does not exist, it will be created using the given name.

    Args:
        1. source_transform = 'path/local/file.pq'
        2. by_row_batch = number of rows you want to extract ``int``

    Return:
        ``str`` of the local pq file path
    '''
    client = bq.Client()
    row_id = client.query(
        'select id from project_four.sales order by id desc limit 1')
    try:
        for i in row_id:
            last_row_id = i[0]
            print('last row in dataset is {}'.format(i[0]))
    except GoogleAPIError:
        # the query fails with reason 'notFound' when the dataset/table does not exist yet
        last_row_id = 0
        print('no dataset.table')
        client.create_dataset(name_of_dataset, exists_ok=True)
        print('new dataset, {} created'.format(name_of_dataset))
    cur = conn.cursor()  # mysql conn
    cur.execute('use sales_records')
    cur.execute('select * from sales where id>={} and id<={}'.format(
        last_row_id + 1, last_row_id + by_row_batch))
    list_row = cur.fetchall()
    rows_of_extracted_mysql = []
    for i in list_row:
        rows_of_extracted_mysql.append(list(i))
    print('extracting from mysql')
    df = pd.DataFrame(rows_of_extracted_mysql,
                      columns=[
                          'id', 'region', 'country', 'item_type',
                          'sales_channel', 'Order Priority', 'order_date',
                          'order_id', 'ship_date', 'units_sold', 'unit_price',
                          'unit_cost', 'total_revenue', 'total_cost',
                          'total_profit'
                      ])
    table = pa.Table.from_pandas(df)
    # df.to_csv('test.csv')
    pq.write_table(table, source_transform)
    # pd.to_parquet is not working for some reason (segmentation fault),
    # so for the time being use the pyarrow lib to create the parquet file
    # df.to_parquet(source_transform, engine='fastparquet')
    logger.info('id {} to {} being extracted'.format(
        last_row_id + 1, last_row_id + by_row_batch))
    print('data extracted from id {} to {}, {} file ready to upload to cloudstorage'
          .format(last_row_id + 1, last_row_id + by_row_batch, source_transform))
    return source_transform