def _crawl_dig_point(next_dig_point):
    """Crawl dig points in batches, pausing between batches.

    Runs _iter_crawl_dig_point up to N_ITER_CRAWL_DIG_POINT times,
    accumulating its per-point results, and stops early once the offset
    dig point has rolled past the current year's block (year part differs
    from the current Gregorian year and the remainder is >= 30000).

    Returns (next_dig_point, results_list).
    """
    collected = {}
    offset_dig_point = next_dig_point

    now = util.get_timestamp()
    current_year = util.timestamp_to_datetime(now).year
    cfg.logger.debug('current_year: %s', current_year)

    for _ in range(N_ITER_CRAWL_DIG_POINT):
        (error_code, next_dig_point, offset_dig_point, iter_results) = \
            _iter_crawl_dig_point(next_dig_point, offset_dig_point)
        collected.update(iter_results)

        # The dig-point id presumably encodes ROC year * 100000 + serial
        # (+1911 converts ROC year to Gregorian) — TODO confirm.
        point_year = offset_dig_point // 100000 + 1911
        point_serial = offset_dig_point % 100000
        cfg.logger.debug(
            'offset_dig_point_year: %s offset_dig_point_mod_100000: %s',
            point_year, point_serial)
        if point_year != current_year and point_serial >= 30000:
            break

        sleep_time = cfg.config.get('time_sleep', 30)
        cfg.logger.debug('to sleep %s', sleep_time)
        time.sleep(sleep_time)

    return (next_dig_point, collected.values())
def _save_img(data, postfix, content_type):
    """Persist an uploaded image and its thumbnail; record both in the db.

    Writes the raw image under /data/img/bee/<YYYY-MM-DD>/ and a generated
    thumbnail under /data/thumbnail/bee/<YYYY-MM-DD>/, inserts one record
    into the 'bee_img' collection, and returns that record (without the
    mongo-assigned '_id').

    Fix: files are opened in binary mode ('wb') — the original used text
    mode 'w', which corrupts binary image bytes on platforms with newline
    translation and fails outright for bytes data on Python 3.

    Params:
        data: raw image bytes.
        postfix: file extension for the image (without the dot).
        content_type: MIME type, stored verbatim in the db record.
    Returns:
        dict: the inserted db record (filename, thumbnail_filename,
        the_id, content_type, save_time).
    """
    the_timestamp = util.get_timestamp()
    the_datetime = util.timestamp_to_datetime(the_timestamp)
    the_id = str(the_timestamp) + "_" + util.uuid()
    date_str = the_datetime.strftime('%Y-%m-%d')

    filename = the_id + '.' + postfix
    img_dir = '/data/img/bee/' + date_str
    util.makedirs(img_dir)
    # binary mode: image bytes must not go through newline translation
    with open(img_dir + '/' + filename, 'wb') as f:
        f.write(data)

    (the_thumbnail, thumbnail_postfix) = _make_thumbnail(data, postfix)
    thumbnail_filename = the_id + '.' + thumbnail_postfix
    thumbnail_dir = '/data/thumbnail/bee/' + date_str
    util.makedirs(thumbnail_dir)
    with open(thumbnail_dir + '/' + thumbnail_filename, 'wb') as f:
        f.write(the_thumbnail)

    db_data = {
        "filename": date_str + '/' + filename,
        "thumbnail_filename": date_str + '/' + thumbnail_filename,
        "the_id": the_id,
        'content_type': content_type,
        'save_time': the_timestamp,
    }
    util.db_insert('bee_img', [db_data])
    # db_insert may add a mongo '_id'; strip it so the result is serializable
    if '_id' in db_data:
        del db_data['_id']
    return db_data
def _crawl_dig_point(next_dig_point):
    """Iteratively crawl dig points until the year block boundary is crossed.

    Returns (next_dig_point, results_list) where results_list holds the
    values accumulated from each _iter_crawl_dig_point call.
    """
    all_results = {}
    offset_dig_point = next_dig_point
    current_year = util.timestamp_to_datetime(util.get_timestamp()).year
    cfg.logger.debug('current_year: %s', current_year)

    iteration = 0
    while iteration < N_ITER_CRAWL_DIG_POINT:
        iteration += 1
        (error_code, next_dig_point, offset_dig_point, iter_results) = \
            _iter_crawl_dig_point(next_dig_point, offset_dig_point)
        all_results.update(iter_results)

        # +1911: ROC-year prefix converted to Gregorian — TODO confirm encoding
        offset_dig_point_year = offset_dig_point // 100000 + 1911
        offset_dig_point_mod_100000 = offset_dig_point % 100000
        cfg.logger.debug(
            'offset_dig_point_year: %s offset_dig_point_mod_100000: %s',
            offset_dig_point_year, offset_dig_point_mod_100000)

        # stop once we have drifted into another year's high-serial range
        if offset_dig_point_year != current_year \
                and offset_dig_point_mod_100000 >= 30000:
            break

        pause = cfg.config.get('time_sleep', 30)
        cfg.logger.debug('to sleep %s', pause)
        time.sleep(pause)

    return (next_dig_point, all_results.values())
def _crawl_dig():
    """Fetch the full New-Taipei-City dig list, store every record, and
    cache the latest timestamp seen."""
    the_url = 'http://61.60.124.185/tpctempdig/InfoAllList.asp'
    # query window: 2000-01-01 (epoch 946684800) through ~one year from now
    begin_dt = util.timestamp_to_datetime(946684800)
    end_dt = util.timestamp_to_datetime(util.get_timestamp() + 86400 * 366)

    params = {
        'sortflag': '',
        'sorttype': '',
        'TargetLB': '',
        'qry2': 1,
        'startyear': begin_dt.year,
        'startmonth': begin_dt.month,
        'startday': begin_dt.day,
        'endyear': end_dt.year,
        'endmonth': end_dt.month,
        'endday': end_dt.day,
    }

    http_data = util.http_multipost({the_url: params})
    (latest_timestamp, dig_data) = _parse_dig(http_data[the_url])
    for each_data in dig_data:
        _put_to_db(each_data)
    util.save_cache('cron_new_taipei_city_latest_dig',
                    {'latest_timestamp': latest_timestamp})
def _is_to_refresh_google_token(user_info):
    """Return True when a google-type user's token refresh deadline
    (token_refresh_timestamp) lies in the past; False otherwise."""
    if user_info.get('user_type', '') != 'google':
        return False
    return user_info.get('token_refresh_timestamp', 0) < util.get_timestamp()
def _create_session_key(user_id=None, offset_timestamp=0):
    """Mint a serialized session key stamped offset_timestamp seconds from
    now; when user_id is given, record the key->user mapping in
    'session_user_map'. Returns the session key string."""
    stamped = util.get_timestamp() + offset_timestamp
    session_key = _serialize_session_key(stamped, util.gen_random_string())
    if user_id:
        util.db_update(
            'session_user_map',
            {"session_key": session_key},
            {"user_id": user_id, "the_timestamp": stamped})
    return session_key
def p_json_handler(data):
    """Normalize each incoming bee record in place (assign the_id and
    save_time, default user_name/address to '', coerce count to int) and
    bulk-insert the whole list into 'bee'."""
    for record in data:
        now = util.get_timestamp()
        record['the_id'] = str(now) + "_" + util.uuid()
        record['save_time'] = now
        record['user_name'] = record.get('user_name', '')
        record['address'] = record.get('address', '')
        record['count'] = util._int(record['count'])
    util.db_insert('bee', data)
    return {"success": True}
def _create_session_key(user_id=None, offset_timestamp=0):
    """Create a new serialized session key, optionally bound to a user.

    The key embeds the current timestamp shifted by offset_timestamp plus a
    random string; with a user_id, the mapping is upserted into
    'session_user_map'.
    """
    key_timestamp = util.get_timestamp() + offset_timestamp
    random_part = util.gen_random_string()
    session_key = _serialize_session_key(key_timestamp, random_part)
    if not user_id:
        return session_key
    util.db_update('session_user_map',
                   {"session_key": session_key},
                   {"user_id": user_id, "the_timestamp": key_timestamp})
    return session_key
def _check_refresh_session(session, session_key, session_key2, user_info):
    """Rotate the session keys when the current key has outlived
    EXPIRE_TIMESTAMP_SESSION_BLOCK: promote (or mint) the secondary key,
    mint a new future-dated secondary, persist the session, and remove the
    old key's session record."""
    now = util.get_timestamp()
    the_user_id = user_info.get('user_id', '')
    (session_timestamp, _session_id) = _deserialize_session_key(session_key)

    # still fresh: nothing to do
    if now - session_timestamp <= EXPIRE_TIMESTAMP_SESSION_BLOCK:
        return

    if not session_key2:
        session_key2 = _create_session_key(user_id=the_user_id)
    session_key3 = _create_session_key(
        user_id=the_user_id, offset_timestamp=OFFSET_TIMESTAMP_SESSION_BLOCK)

    session['value'] = session_key2
    session['value2'] = session_key3
    session.save()

    remove_session(_extract_session_struct_from_session_key(session_key))
def _check_refresh_session(session, session_key, session_key2, user_info):
    """When the primary session key is past its expire window, install the
    secondary key (creating one if absent), mint a future-dated replacement
    secondary, save the session, and drop the stale key's record."""
    current_ts = util.get_timestamp()
    user_id = user_info.get('user_id', '')
    (key_ts, key_id) = _deserialize_session_key(session_key)

    is_expired = (current_ts - key_ts) > EXPIRE_TIMESTAMP_SESSION_BLOCK
    if is_expired:
        promoted_key = session_key2 or _create_session_key(user_id=user_id)
        future_key = _create_session_key(
            user_id=user_id,
            offset_timestamp=OFFSET_TIMESTAMP_SESSION_BLOCK)
        session['value'] = promoted_key
        session['value2'] = future_key
        session.save()
        stale_struct = _extract_session_struct_from_session_key(session_key)
        remove_session(stale_struct)
def p_json_handler(data):
    '''Validate, normalize, and bulk-insert deliver records into 'bee'.

    data: [{deliver_time, deliver_date, ad_versions, geo, count, user_name,
            address, county, town, deliver_status, memo}]
        deliver_date: time in ISO-8601 format (millisecond precision)
        deliver_time: deliver_date as a Unix timestamp (int, seconds);
            derived from deliver_date when absent.
        ad_versions: list of ad version names ("name" in /get/adData)
        geo: geojson; LineString and Point are accepted
        count: int
        user_name: string
        address: string
        county: string, based on app/scripts/services/TWCounties in frontend
        town: string, based on app/scripts/services/TWTown in frontend
        deliver_status: string
        memo: string

    ex: {"town":"東區","count":10,"deliver_time":1398724259,"deliver_date":"2014-04-28T22:30:59.383Z","geo":[{"type":"LineString","coordinates":[[120.99337719999994,24.7905385],[120.99452376365662,24.79139038370729],[120.99501729011536,24.79084493848351]]}],"ad_versions":["鳥籠監督條例"],"county":"新竹市","deliver_status":"test","address":"nthu","user_name":"test_user_name","memo":"test"}

    ex2: {"town":"內湖區","count":3000,"deliver_time":1398164891,"deliver_date":"2014-04-22T11:08:11.835Z","geo":[{"type":"Point","coordinates":[121.61277294158936,25.06670789727661]}],"ad_versions":["20140421_二類電信RE"],"county":"台北市","address":"康寧路三段","user_name":"test_user_name"}
    '''
    for each_record in data:
        # reject the whole request on the first record missing a required key
        for required in _MUST_HAVE_KEYS:
            if required not in each_record:
                return {"success": False,
                        "errorMsg": "no key: key: %s each_data: %s" % (
                            required, util.json_dumps(each_record))}

        now = util.get_timestamp()
        each_record['the_id'] = str(now) + "_" + util.uuid()

        # derive deliver_time from deliver_date when the caller omitted it
        if 'deliver_time' not in each_record:
            (error_code, deliver_time) = _parse_deliver_time(each_record)
            if error_code != S_OK:
                return {"success": False,
                        "error_msg": "deliver_date not fit format: deliver_date: %s each_data: %s" % (
                            each_record.get('deliver_date', ''),
                            util.json_dumps(each_record))}
            each_record['deliver_time'] = deliver_time

        each_record['save_time'] = now
        each_record['user_name'] = each_record.get('user_name', '')
        each_record['address'] = each_record.get('address', '')
        each_record['count'] = util._int(each_record['count'])

    util.db_insert('bee', data)
    return {"success": True}
def p_json_handler(data):
    """Insert road records into 'roadDB', flagging the_id collisions whose
    stored content differs from the incoming record.

    Returns {"success": bool, "error_msg": concatenated mismatch details}.
    """
    error_code = S_OK
    error_msg = ''
    save_timestamp = util.get_timestamp()

    for record in data:
        _infer_columns(record, save_timestamp)
        the_id = record['the_id']
        existing = util.db_find_one('roadDB', {'the_id': the_id})
        # same id but different content: report and skip this record
        if existing and not _is_same(existing, record):
            error_code = S_ERR
            cfg.logger.error(
                'data different: the_id: %s db_result: %s each_data: %s',
                the_id, existing, record)
            error_msg += 'data different: the_id: %s db_result: %s each_data: %s\n' % (
                the_id, existing, record)
            continue
        util.db_insert_if_not_exist('roadDB', {'the_id': the_id}, record)

    return {"success": error_code == S_OK, "error_msg": error_msg}
def _login(client_id, scope, register_uri, authorization_base_url, request,
           params):
    """Begin an OAuth2 login: record the provider state in 'login_info'
    and redirect the browser to the provider's authorization URL."""
    (session_struct, session_struct2) = util_user.process_session(request)
    cfg.logger.debug('session_struct: %s session_struct2: %s',
                     session_struct, session_struct2)

    the_path = params.get('url', '')
    now = util.get_timestamp()
    cfg.logger.debug('params: %s the_path: %s', params, the_path)

    oauth = OAuth2Session(client_id, scope=scope, redirect_uri=register_uri)
    authorization_url, state = oauth.authorization_url(
        authorization_base_url, approval_prompt="auto")

    util.db_insert('login_info', {
        "state": state,
        "the_timestamp": now,
        "params": params,
        "url": the_path,
    })

    # Without a cron job purging stale login_info rows, purge inline here.
    if not cfg.config.get('is_cron_remove_expire', True):
        expire_ms = cfg.config.get(
            'expire_unix_timestamp_session',
            EXPIRE_UNIX_TIMESTAMP_SESSION) * 1000
        util.db_remove('login_info',
                       {"the_timestamp": {"$lt": now - expire_ms}})

    cfg.logger.debug('after authorization_url: authorization_url: %s state: %s',
                     authorization_url, state)
    redirect(authorization_url)
def process_session(request):
    """Ensure the beaker session holds a current and a successor session key.

    On first visit, mints a fresh key pair (the successor stamped 300s in
    the future). On later visits, decodes the stored keys; when the current
    key is 300s or older, promotes the successor and mints a new one.

    Fixes: the original wrote session['value']/['value2'] from
    _construct_session_struct and immediately overwrote both with
    _create_session_key results before save() — those dead stores are
    removed. The py2-only session.has_key(...) is replaced with the
    `in` operator, which behaves identically on dict-like sessions.

    Returns (session_struct, session_struct2) for the two keys.
    NOTE(review): on the first-visit path the returned structs come from
    _construct_session_struct while the stored keys come from
    _create_session_key — they may not describe the same keys; confirm the
    intended source of truth.
    """
    session = request.environ['beaker.session']
    the_timestamp = util.get_timestamp()

    if 'value' not in session:
        session_struct = _construct_session_struct(the_timestamp)
        session_struct2 = _construct_session_struct(the_timestamp + 300)
        session_key = _create_session_key()
        session_key2 = _create_session_key(offset_timestamp=300)
        session['value'] = session_key
        session['value2'] = session_key2
        session.save()
    else:
        session_key = session['value']
        session_key2 = session['value2']
        session_struct = _extract_session_struct_from_session_key(session_key)
        session_struct2 = _extract_session_struct_from_session_key(session_key2)
        session_timestamp = session_struct.get('the_timestamp', 0)
        session_timestamp2 = session_struct2.get('the_timestamp', 0)
        if the_timestamp - util._int(session_timestamp) >= 300:
            # current key expired: shift the successor forward and mint a
            # replacement stamped at least 300s past the old successor
            new_timestamp = max(the_timestamp,
                                util._int(session_timestamp2) + 300)
            session_struct3 = _construct_session_struct(new_timestamp)
            session_struct = session_struct2
            session_struct2 = session_struct3
            session['value'] = session_struct.get('key', '')
            session['value2'] = session_struct2.get('key', '')
            session.save()

    return (session_struct, session_struct2)
def _get_params(params):
    """Resolve the next dig point to crawl.

    With no params, ask the web server for its stored next_dig_point.
    With params, advance to the next year-block of dig points — unless the
    stored point's year is not yet behind the current Taiwan (ROC) year,
    in which case return S_ERR.

    Returns (error_code, {'next_dig_point': int}) or (S_ERR, None).
    """
    if not params:
        server = cfg.config.get('web_server', 'http://106.187.101.193:5346')
        the_url = server + '/get/taipei_city_dig_point_next_dig_point'
        http_result = util.http_multiget([the_url])
        next_dig_point = util._int(
            util.json_loads(http_result.get(the_url, ''), ''),
            START_TAIPEI_CITY_DIG_POINT)
        cfg.logger.debug(
            'after http_multiget: http_result: %s next_dig_point: %s',
            http_result, next_dig_point)
        return (S_OK, {'next_dig_point': next_dig_point})

    next_dig_point = params.get('next_dig_point', START_TAIPEI_CITY_DIG_POINT)
    point_year = next_dig_point // MAX_TAIPEI_CITY_DIG_POINTS_BY_YEAR
    tw_year = util.timestamp_to_tw_year(util.get_timestamp())
    if tw_year <= point_year:
        return (S_ERR, None)
    advanced = (point_year + 1) * MAX_TAIPEI_CITY_DIG_POINTS_BY_YEAR
    return (S_OK, {'next_dig_point': advanced})
def _login(client_id, scope, register_uri, authorization_base_url, request,
           params):
    """Kick off the OAuth2 flow for the given client: persist the CSRF
    state, optionally purge expired login_info rows, then redirect to the
    provider."""
    (session_struct, session_struct2) = util_user.process_session(request)
    cfg.logger.debug('session_struct: %s session_struct2: %s',
                     session_struct, session_struct2)

    return_path = params.get('url', '')
    login_timestamp = util.get_timestamp()
    cfg.logger.debug('params: %s the_path: %s', params, return_path)

    auth_session = OAuth2Session(
        client_id, scope=scope, redirect_uri=register_uri)
    authorization_url, state = auth_session.authorization_url(
        authorization_base_url, approval_prompt="auto")

    login_record = {
        "state": state,
        "the_timestamp": login_timestamp,
        "params": params,
        "url": return_path,
    }
    util.db_insert('login_info', login_record)

    # Expired login_info rows are normally reaped by cron; otherwise do it now.
    is_cron_remove_expire = cfg.config.get('is_cron_remove_expire', True)
    if not is_cron_remove_expire:
        # config value is in seconds; db timestamps appear to be milliseconds
        expire_window = cfg.config.get(
            'expire_unix_timestamp_session',
            EXPIRE_UNIX_TIMESTAMP_SESSION) * 1000
        cutoff = login_timestamp - expire_window
        util.db_remove('login_info', {"the_timestamp": {"$lt": cutoff}})

    cfg.logger.debug('after authorization_url: authorization_url: %s state: %s',
                     authorization_url, state)
    redirect(authorization_url)
def process_session(request):
    """Fetch-or-create the rolling pair of session keys in the beaker
    session and return their decoded structs as a 2-tuple."""
    session = request.environ['beaker.session']
    now = util.get_timestamp()

    if not session.has_key('value'):  # py2-only idiom, kept as-is
        # first visit: build struct pair and store freshly minted keys
        session_struct = _construct_session_struct(now)
        session['value'] = session_struct.get('key', '')
        session_struct2 = _construct_session_struct(now + 300)
        session['value2'] = session_struct2.get('key', '')
        # NOTE(review): the two stores below overwrite the struct-derived
        # keys written just above — mirrored from the original flow; confirm
        # which key source is intended.
        fresh_key = _create_session_key()
        fresh_key2 = _create_session_key(offset_timestamp=300)
        session['value'] = fresh_key
        session['value2'] = fresh_key2
        session.save()
    else:
        stored_key = session['value']
        stored_key2 = session['value2']
        session_struct = _extract_session_struct_from_session_key(stored_key)
        session_struct2 = _extract_session_struct_from_session_key(stored_key2)
        struct_ts = session_struct.get('the_timestamp', 0)
        struct_ts2 = session_struct2.get('the_timestamp', 0)
        if now - util._int(struct_ts) >= 300:
            # primary key expired: promote the secondary and mint a new one
            rolled_ts = max(now, util._int(struct_ts2) + 300)
            rolled_struct = _construct_session_struct(rolled_ts)
            session_struct, session_struct2 = session_struct2, rolled_struct
            session['value'] = session_struct.get('key', '')
            session['value2'] = session_struct2.get('key', '')
            session.save()

    return (session_struct, session_struct2)