def _parse_save_time(x, funnel_dict):
    """Parse the u'時間戳記' (timestamp) field of row *x* into a unix timestamp.

    Expected format: 'yyyy/MM/dd <AM/PM marker> hh:mm:ss', where the marker is
    u'上午' (AM) or u'下午' (PM).  The 12-hour clock is normalized to 24-hour
    before strptime parsing.  *funnel_dict* is unused here (kept for the
    parser-callback interface).
    """
    x = _unicode_dict(x)
    the_str = x.get(u'時間戳記', '')
    the_list = the_str.split(' ')
    the_date = the_list[0]
    the_time = the_list[2]
    am_pm = the_list[1]
    the_time_list = the_time.split(':')
    the_hr = util._int(the_time_list[0])
    if the_hr == 12 and am_pm == u'上午':
        # 12 AM is midnight -> hour 0
        the_time_list[0] = '0'
        the_time = ':'.join(the_time_list)
    if am_pm == u'下午' and the_hr != 12:
        # BUG FIX: the original computed util._int(the_time[0]) + 12 -- the
        # first *character* of the time string -- instead of the hour field
        # the_time_list[0], so PM hours were wrong.
        the_time_list[0] = str(the_hr + 12)
        the_time = ':'.join(the_time_list)
    cfg.logger.debug('the_date: %s the_time: %s', the_date, the_time)
    the_datetime_str = the_date + ' ' + the_time
    the_datetime = datetime.strptime(the_datetime_str, '%Y/%m/%d %H:%M:%S')
    the_timestamp = util.datetime_to_timestamp(the_datetime)
    return the_timestamp
def get_db_results_by_the_timestamp(start_timestamp, end_timestamp):
    """Return roadDB records whose active span overlaps the given range,
    filtered through _is_valid."""
    start_ts = util._int(start_timestamp)
    end_ts = util._int(end_timestamp)
    query = {
        'end_timestamp': {'$gte': start_ts},
        'start_timestamp': {'$lte': end_ts},
    }
    candidates = util.db_find('roadDB', query)
    return [each for each in candidates if _is_valid(each, start_ts, end_ts)]
def _parse_date5(the_str):
    """Parse an 8-digit all-number 'yyyyMMdd' string.

    Any '-...' suffix is stripped first.  Returns (year, month, day), or
    (0, 0, 0) when the remainder is not exactly 8 characters.
    """
    the_str = re.sub('-.*', '', the_str)
    if len(the_str) != 8:
        return (0, 0, 0)
    return (
        util._int(the_str[0:4]),
        util._int(the_str[4:6]),
        util._int(the_str[6:8]),
    )
def _infer_columns(data, save_timestamp):
    """Derive the_id / json_id / beginDate / endDate fields in-place on *data*.

    *save_timestamp* is currently unused (kept for the caller interface).
    Dates are rendered in the Asia/Taipei timezone; an endDate at or beyond
    MAX_TIMESTAMP is treated as open-ended ('').
    """
    data['the_id'] = data.get('the_category', '') + '_' + data.get('the_idx', '')
    data['json_id'] = '_'.join([
        data.get('county_name', ''),
        str(data.get('start_timestamp', 0)),
        str(data.get('end_timestamp', MAX_TIMESTAMP)),
        data.get('the_id', ''),
    ])
    start_timestamp = util._int(data.get('start_timestamp', 0))
    if start_timestamp:
        data['beginDate'] = util.timestamp_to_date_str(start_timestamp, 'Asia/Taipei')
    else:
        data['beginDate'] = ''
    end_timestamp = util._int(data.get('end_timestamp', 0))
    if not end_timestamp or end_timestamp >= MAX_TIMESTAMP:
        data['endDate'] = ''
    else:
        data['endDate'] = util.timestamp_to_date_str(end_timestamp, 'Asia/Taipei')
def _parse_date7(the_str):
    """Parse a 5-digit all-number 'MddYY' string (year offset from 2000).

    Any '-...' suffix is stripped first.  Returns (year, month, day), or
    (0, 0, 0) when the remainder is not exactly 5 characters.
    """
    the_str = re.sub('-.*', '', the_str)
    if len(the_str) != 5:
        return (0, 0, 0)
    month = util._int(the_str[:1])
    day = util._int(the_str[1:3])
    year = 2000 + util._int(the_str[3:5])
    return (year, month, day)
def _parse_date8(the_str):
    """Parse a 6-digit all-number 'YYMMdd' string (year offset from 2000).

    Any '-...' suffix is stripped first.  Returns (year, month, day), or
    (0, 0, 0) when the remainder is not exactly 6 characters.
    """
    the_str = re.sub('-.*', '', the_str)
    if len(the_str) != 6:
        return (0, 0, 0)
    year = 2000 + util._int(the_str[:2])
    month = util._int(the_str[2:4])
    day = util._int(the_str[4:6])
    return (year, month, day)
def _parse_date2(the_str): the_str_match = re.search(ur'(\d+)/(\d+)', the_str, flags=re.UNICODE) # MM-dd if the_str_match: num1 = the_str_match.group(1) num2 = the_str_match.group(2) year = 2014 month = util._int(num1) day = util._int(num2) if day == 22014: day = 20 return (year, month, day) return (0, 0, 0)
def _parse_date(the_date):
    """Convert a 'yyyy/MM/dd' string to a unix timestamp; 0 when malformed."""
    pieces = the_date.split('/')
    if len(pieces) != 3:
        return 0
    (the_year, the_month, the_day) = [util._int(each) for each in pieces]
    cfg.logger.debug('the_date: %s the_year: %s the_month: %s the_day: %s', the_date, the_year, the_month, the_day)
    return util.datetime_to_timestamp(datetime(the_year, the_month, the_day))
def _deserialize_session_key(session_key):
    """Split a 'timestamp@session_id' key into (timestamp, session_id).

    Missing pieces fall back to 0 / ''.
    """
    parts = session_key.split('@')
    if not parts:
        return (0, '')
    the_timestamp = util._int(parts[0]) if parts else 0
    the_id = parts[1] if len(parts) > 1 else ''
    return (the_timestamp, the_id)
def get_db_results_by_the_timestamp(start_timestamp, end_timestamp):
    """Return _is_valid roadDB records overlapping [start, end]."""
    start_timestamp = util._int(start_timestamp)
    end_timestamp = util._int(end_timestamp)
    the_query = {
        'end_timestamp': {'$gte': start_timestamp},
        'start_timestamp': {'$lte': end_timestamp},
    }
    results = []
    for result in util.db_find('roadDB', the_query):
        if _is_valid(result, start_timestamp, end_timestamp):
            results.append(result)
    return results
def _parse_date9(the_str): # Apr. the_str_match = re.search(ur'Apr\.(\d+)', the_str, flags=re.UNICODE) # MM-dd if the_str_match: num1 = the_str_match.group(1) year = 2014 month = 4 day = util._int(num1) return (year, month, day) return (0, 0, 0)
def _get_params():
    """Fetch the next taipei_city_road_case index from the web server."""
    server = cfg.config.get('web_server', 'http://106.187.101.193:5346')
    the_url = server + '/get/taipei_city_road_case_next_road_case'
    http_result = util.http_multiget([the_url])
    raw = util.json_loads(http_result.get(the_url, ''), '')
    next_road_case = util._int(raw, START_TAIPEI_CITY_ROAD_CASE)
    cfg.logger.debug('after http_multiget: http_result: %s next_road_case: %s', http_result, next_road_case)
    return {'next_road_case': next_road_case}
def get_google_address_handler(params):
    """Return up to n bee_csv rows whose address is not yet processed,
    plus the total count of such rows."""
    n_db_result = util._int(params.get('n', 1))
    fields = {
        '_id': False,
        'csv_key': True,
        'google_address': True,
        'address': True,
        'county_and_town': True,
    }
    cursor = util.db_find_it('bee_csv', {'is_processed_address': {'$ne': True}}, fields)
    total = cursor.count()
    rows = list(cursor.limit(n_db_result))
    return {"status": "OK", "total": total, "result": rows}
def g_taipei_city_road_case_next_road_case_handler():
    """Return the highest stored taipei_city_road_case idx, floored to int.

    Falls back to START_TAIPEI_CITY_ROAD_CASE whenever the query yields
    nothing at any stage.
    """
    the_query = {
        'the_category': 'taipei_city_road_case',
        "the_idx": {
            "$gte": MIN_NEXT_TAIPEI_CITY_ROAD_CASE,
            "$lte": MAX_NEXT_TAIPEI_CITY_ROAD_CASE,
        },
    }
    db_results = util.db_find_it('roadDB', the_query, {'_id': False, 'the_idx': True})
    if not db_results:
        return START_TAIPEI_CITY_ROAD_CASE
    db_result = db_results.sort('the_idx', pymongo.DESCENDING).limit(1)
    if not db_result:
        return START_TAIPEI_CITY_ROAD_CASE
    result_list = list(db_result)
    if not result_list:
        return START_TAIPEI_CITY_ROAD_CASE
    the_idx = result_list[0].get('the_idx', START_TAIPEI_CITY_ROAD_CASE)
    # the stored idx may be a float (e.g. 1000.0): floor it, then cast
    return util._int(util._float(the_idx) // 1)
def p_json_handler(data):
    """Stamp each record with an id, save_time and defaults, then bulk-insert
    the whole list into the 'bee' collection."""
    for each_data in data:
        the_timestamp = util.get_timestamp()
        each_data['the_id'] = str(the_timestamp) + "_" + util.uuid()
        each_data['save_time'] = the_timestamp
        each_data.setdefault('user_name', '')
        each_data.setdefault('address', '')
        each_data['count'] = util._int(each_data['count'])
    util.db_insert('bee', data)
    return {"success": True}
def _get_params():
    """Query the web server for the next taipei_city_road_case index."""
    server = cfg.config.get('web_server', 'http://106.187.101.193:5346')
    the_url = server + '/get/taipei_city_road_case_next_road_case'
    http_result = util.http_multiget([the_url])
    next_road_case = util._int(
        util.json_loads(http_result.get(the_url, ''), ''),
        START_TAIPEI_CITY_ROAD_CASE,
    )
    cfg.logger.debug('after http_multiget: http_result: %s next_road_case: %s', http_result, next_road_case)
    return {'next_road_case': next_road_case}
def process_session(request):
    """Maintain a rolling pair of beaker session structs.

    On first visit, two session keys ~300s apart are created and stored in
    session['value'] / session['value2'].  On later visits, once the first
    key is >= 300s old the pair is rotated: key2 becomes key1 and a fresh
    key2 is minted.  Returns (session_struct, session_struct2).
    """
    session = request.environ['beaker.session']
    session_struct = {}
    session_struct2 = {}
    the_timestamp = util.get_timestamp()
    if not session.has_key('value'):
        # brand-new session: build two structs offset by 300 seconds
        session_struct = _construct_session_struct(the_timestamp)
        session['value'] = session_struct.get('key', '')
        session_struct2 = _construct_session_struct(the_timestamp + 300)
        session['value2'] = session_struct2.get('key', '')
        # NOTE(review): the assignments below immediately overwrite the two
        # session['value'] writes above with freshly created keys -- looks
        # redundant, kept as-is; confirm whether _create_session_key() is
        # meant to return the same keys as the structs.
        session_key = _create_session_key()
        session_key2 = _create_session_key(offset_timestamp=300)
        session['value'] = session_key
        session['value2'] = session_key2
        session.save()
    else:
        # existing session: rebuild structs from the stored keys
        session_key = session['value']
        session_key2 = session['value2']
        session_struct = _extract_session_struct_from_session_key(session_key)
        session_struct2 = _extract_session_struct_from_session_key(
            session_key2)
        session_timestamp = session_struct.get('the_timestamp', 0)
        session_timestamp2 = session_struct2.get('the_timestamp', 0)
        if the_timestamp - util._int(session_timestamp) >= 300:
            # first key expired: rotate (struct2 -> struct, new struct3 -> struct2)
            new_timestamp = max(the_timestamp, util._int(session_timestamp2) + 300)
            session_struct3 = _construct_session_struct(new_timestamp)
            session_struct = session_struct2
            session_struct2 = session_struct3
            session['value'] = session_struct.get('key', '')
            session['value2'] = session_struct2.get('key', '')
            session.save()
    return (session_struct, session_struct2)
def _parse_count(x, funnel_dict): x = _unicode_dict(x) the_str = x[u'數量'] the_str_match = re.search(ur'^(\d+)', the_str, flags=re.UNICODE) if not the_str_match: return '' the_str_purify = the_str_match.group(1) #cfg.logger.debug('the_str: (%s, %s) the_str_purify: (%s: %s)', the_str, the_str.__class__.__name__, the_str_purify, the_str_purify.__class__.__name__) return util._int(the_str_purify)
def process_session(request):
    """Keep two beaker session structs, rotated every 300 seconds.

    New sessions get two keys (now and now+300) written into
    session['value'] / session['value2']; returning sessions rotate the
    pair once the first key is at least 300s old.  Returns
    (session_struct, session_struct2).
    """
    session = request.environ['beaker.session']
    session_struct = {}
    session_struct2 = {}
    the_timestamp = util.get_timestamp()
    if not session.has_key('value'):
        # new session: seed both structs, 300 seconds apart
        session_struct = _construct_session_struct(the_timestamp)
        session['value'] = session_struct.get('key', '')
        session_struct2 = _construct_session_struct(the_timestamp + 300)
        session['value2'] = session_struct2.get('key', '')
        # NOTE(review): these overwrite the session['value'] writes just
        # above with keys from _create_session_key() -- apparently
        # intentional, but worth confirming they agree with the structs.
        session_key = _create_session_key()
        session_key2 = _create_session_key(offset_timestamp=300)
        session['value'] = session_key
        session['value2'] = session_key2
        session.save()
    else:
        # returning session: reconstruct structs from the stored keys
        session_key = session['value']
        session_key2 = session['value2']
        session_struct = _extract_session_struct_from_session_key(session_key)
        session_struct2 = _extract_session_struct_from_session_key(session_key2)
        session_timestamp = session_struct.get('the_timestamp', 0)
        session_timestamp2 = session_struct2.get('the_timestamp', 0)
        if the_timestamp - util._int(session_timestamp) >= 300:
            # rotate: drop struct1, promote struct2, mint struct3
            new_timestamp = max(the_timestamp, util._int(session_timestamp2) + 300)
            session_struct3 = _construct_session_struct(new_timestamp)
            session_struct = session_struct2
            session_struct2 = session_struct3
            session['value'] = session_struct.get('key', '')
            session['value2'] = session_struct2.get('key', '')
            session.save()
    return (session_struct, session_struct2)
def _determine_date1(num1_str, num2_str, num3_str):
    """Guess (year, month, day) from three numeric date fields.

    The year is fixed at 2014; the field layout is inferred from the first
    number (>100 -> yyyy-MM-dd, 4 or 5 -> MM-dd-yyyy, otherwise dd-MM-yyyy).
    """
    num1 = util._int(num1_str)
    num2 = util._int(num2_str)
    num3 = util._int(num3_str)
    year = 2014
    if num1 > 100:
        # yyyy-MM-dd layout
        (month, day) = (num2, num3)
    elif num1 in (4, 5):
        # MM-dd-yyyy layout
        (month, day) = (num1, num2)
    else:
        # dd-MM-yyyy layout
        (month, day) = (num2, num1)
    if day == 78:
        # data-cleanup hack for a known bad record
        day = 8
    return (year, month, day)
def g_json_all_handler(params):
    """Page through all roadDB records ordered by json_id.

    params: next_id (pagination cursor), order ('DESC'/'desc' or ascending),
    num_query (page size).
    """
    next_id = params.get('next_id', '')
    is_desc = params.get('order', 'DESC') in ['desc', 'DESC']
    num_query = util._int(params.get('num_query', DEFAULT_NUM_QUERY))
    the_query = {}
    if next_id:
        the_query['json_id'] = {('$lte' if is_desc else '$gte'): next_id}
    db_results = util.db_find_it('roadDB', the_query, {'_id': False, 'extension': False})
    sort_flag = pymongo.DESCENDING if is_desc else pymongo.ASCENDING
    db_results.sort([('json_id', sort_flag)]).limit(num_query)
    return list(db_results)
def get_google_address_handler(params):
    """Fetch up to n bee_csv rows still awaiting address processing."""
    n_db_result = util._int(params.get('n', 1))
    the_query = {'is_processed_address': {'$ne': True}}
    projection = {
        '_id': False,
        'csv_key': True,
        'google_address': True,
        'address': True,
        'county_and_town': True,
    }
    db_result = util.db_find_it('bee_csv', the_query, projection)
    db_result_total = db_result.count()
    db_result = list(db_result.limit(n_db_result))
    return {"status": "OK", "total": db_result_total, "result": db_result}
def p_json_handler(data):
    '''Validate delivery records and bulk-insert them into the `bee` collection.

    data: [{deliver_time, deliver_date, ad_versions, geo, count, user_name, address, county, town, deliver_status, memo}]
        deliver_date: time in iso-8601 format (with millisecond precision)
        deliver_time: deliver_date as timestamp (secs after Unix epoch) in int.
        ad_versions: list of ad_versions. the name of ad is based on "name" in /get/adData
        geo: geojson format. accepting LineString and Point
        count: in int number
        user_name: string
        address: string
        county: string, based on app/scripts/services/TWCounties in frontend
        town: string, based on app/scripts/services/TWTown in frontend
        deliver_status: string
        memo: string

    ex: {"town":"東區","count":10,"deliver_time":1398724259,"deliver_date":"2014-04-28T22:30:59.383Z","geo":[{"type":"LineString","coordinates":[[120.99337719999994,24.7905385],[120.99452376365662,24.79139038370729],[120.99501729011536,24.79084493848351]]}],"ad_versions":["鳥籠監督條例"],"county":"新竹市","deliver_status":"test","address":"nthu","user_name":"test_user_name","memo":"test"}
    ex2: {"town":"內湖區","count":3000,"deliver_time":1398164891,"deliver_date":"2014-04-22T11:08:11.835Z","geo":[{"type":"Point","coordinates":[121.61277294158936,25.06670789727661]}],"ad_versions":["20140421_二類電信RE"],"county":"台北市","address":"康寧路三段","user_name":"test_user_name"}
    '''
    for each_data in data:
        # reject any record missing a required key (first missing key wins)
        for key in _MUST_HAVE_KEYS:
            if key not in each_data:
                return {"success": False, "errorMsg": "no key: key: %s each_data: %s" % (key, util.json_dumps(each_data))}
        the_timestamp = util.get_timestamp()
        each_data['the_id'] = str(the_timestamp) + "_" + util.uuid()
        if 'deliver_time' not in each_data:
            # derive deliver_time from deliver_date when the caller omitted it
            (error_code, deliver_time) = _parse_deliver_time(each_data)
            if error_code != S_OK:
                # NOTE(review): this branch returns 'error_msg' while the
                # missing-key branch above returns 'errorMsg'; kept as-is
                # for client compatibility.
                return {"success": False, "error_msg": "deliver_date not fit format: deliver_date: %s each_data: %s" % (each_data.get('deliver_date', ''), util.json_dumps(each_data))}
            each_data['deliver_time'] = deliver_time
        each_data['save_time'] = the_timestamp
        each_data['user_name'] = each_data.get('user_name', '')
        each_data['address'] = each_data.get('address', '')
        each_data['count'] = util._int(each_data['count'])
    util.db_insert('bee', data)
    return {"success": True}
def get_json_today_by_start_date_handler(start_date, params):
    """Page roadDB records that started by *start_date* and are still active
    today, ordered by json_id."""
    start_timestamp = util.date_to_timestamp(start_date)
    next_id = params.get('next_id', '')
    is_desc = params.get('order', 'DESC') in ['DESC', 'desc']
    num_query = util._int(params.get('num_query', DEFAULT_NUM_QUERY))
    today_timestamp = util.date_to_timestamp(util.date_today())
    cfg.logger.debug('start_timestamp: %s today_timestamp: %s', start_timestamp, today_timestamp)
    the_query = {
        'start_timestamp': {'$lte': start_timestamp},
        'end_timestamp': {'$gte': today_timestamp},
    }
    if next_id:
        the_query['json_id'] = {('$lte' if is_desc else '$gte'): next_id}
    db_results = util.db_find_it('roadDB', the_query, {'_id': False, 'extension': False})
    sort_flag = pymongo.DESCENDING if is_desc else pymongo.ASCENDING
    db_results.sort([('json_id', sort_flag)]).limit(num_query)
    results = list(db_results)
    cfg.logger.debug('start_date: %s next_id: %s num_query: %s', start_date, next_id, num_query)
    for (idx, result) in enumerate(results):
        cfg.logger.debug('idx: %s result: %s', idx, result)
    return results
def get_versions_handler(params):
    """Return up to n bee_csv rows with processed addresses but unprocessed
    ad versions."""
    n_db_result = util._int(params.get("n", 1))
    the_query = {
        "is_processed_address": True,
        "is_processed_ad_version": {"$ne": True},
    }
    projection = {
        "_id": False,
        "csv_key": True,
        "address": True,
        "google_address": True,
        "geo": True,
        "version_text": True,
        "versions": True,
        "deliver_time": True,
    }
    cursor = util.db_find_it("bee_csv", the_query, projection)
    rows = list(cursor.limit(n_db_result))
    return {"status": "OK", "result": rows}
def g_json_all_handler(params):
    """List roadDB records page by page, keyed and ordered by json_id."""
    next_id = params.get('next_id', '')
    sort_order = params.get('order', 'DESC')
    is_desc = sort_order in ['desc', 'DESC']
    num_query = util._int(params.get('num_query', DEFAULT_NUM_QUERY))
    the_query = {}
    if next_id:
        # paginate from the cursor id, inclusive
        query_key = '$lte' if is_desc else '$gte'
        the_query['json_id'] = {query_key: next_id}
    db_results = util.db_find_it(
        'roadDB', the_query, {'_id': False, 'extension': False})
    if is_desc:
        sort_flag = pymongo.DESCENDING
    else:
        sort_flag = pymongo.ASCENDING
    db_results.sort([('json_id', sort_flag)]).limit(num_query)
    results = list(db_results)
    return results
def get_versions_handler(params):
    """Fetch up to n bee_csv rows awaiting ad-version processing (address
    already processed)."""
    n_db_result = util._int(params.get('n', 1))
    db_result = util.db_find_it(
        'bee_csv',
        {'is_processed_address': True, 'is_processed_ad_version': {'$ne': True}},
        {
            '_id': False,
            'csv_key': True,
            'address': True,
            'google_address': True,
            'geo': True,
            'version_text': True,
            'versions': True,
            "deliver_time": True,
        },
    )
    db_result = list(db_result.limit(n_db_result))
    return {"status": "OK", "result": db_result}
def _process_each_data(user_id, each_data, server_timestamp):
    """Upsert one sensor sample into reportDB, keyed by (user_id, timestamp).

    The sample's timestamp is server_timestamp plus the client-supplied
    offset_timestamp; all sensor fields default to 0 when absent.
    """
    offset_timestamp = util._int(each_data.get('offset_timestamp', 0))
    the_timestamp = server_timestamp + offset_timestamp
    key = {'user_id': user_id, 'the_timestamp': the_timestamp}
    val = {name: each_data.get(name, 0)
           for name in ('lat', 'lon', 'yaw', 'pitch', 'roll', 'x', 'y', 'z')}
    cfg.logger.debug('to db_update reportDB: server_timestamp: %s offset_timestamp: %s the_timestamp: %s key: %s val: %s', server_timestamp, offset_timestamp, the_timestamp, key, val)
    util.db_update('reportDB', key, val)
    return {'success': True}
def _get_params(params):
    """Resolve the next taipei_city_dig_point index.

    Without *params*, ask the web server.  With *params*, advance to the next
    year's block of indices; returns (S_ERR, None) once the current ROC year
    has been reached.
    """
    if not params:
        server = cfg.config.get('web_server', 'http://106.187.101.193:5346')
        the_url = server + '/get/taipei_city_dig_point_next_dig_point'
        http_result = util.http_multiget([the_url])
        next_dig_point = util._int(
            util.json_loads(http_result.get(the_url, ''), ''),
            START_TAIPEI_CITY_DIG_POINT)
        cfg.logger.debug('after http_multiget: http_result: %s next_dig_point: %s', http_result, next_dig_point)
        return (S_OK, {'next_dig_point': next_dig_point})
    next_dig_point = params.get('next_dig_point', START_TAIPEI_CITY_DIG_POINT)
    # each year owns a fixed-size block of indices; recover the year part
    year = next_dig_point // MAX_TAIPEI_CITY_DIG_POINTS_BY_YEAR
    tw_year = util.timestamp_to_tw_year(util.get_timestamp())
    if tw_year <= year:
        # caught up with the current ROC year: nothing more to crawl
        return (S_ERR, None)
    return (S_OK, {'next_dig_point': (year + 1) * MAX_TAIPEI_CITY_DIG_POINTS_BY_YEAR})
def _parse_date6(the_str):
    """Parse a 7-digit all-number date string.

    Strings starting with '10' are ROC-era '10yMMdd' (year + 1911, e.g.
    103 -> 2014); everything else is treated as western 'MddYYYY'.
    Any '-...' suffix is stripped first; non-7-char input yields (0, 0, 0).
    """
    the_str = re.sub('-.*', '', the_str)
    if len(the_str) != 7:
        return (0, 0, 0)
    if re.search('^10', the_str):
        return (util._int(the_str[0:3]) + 1911,
                util._int(the_str[3:5]),
                util._int(the_str[5:7]))
    return (util._int(the_str[3:7]),
            util._int(the_str[0:1]),
            util._int(the_str[1:3]))
def _get_params(params):
    """Resolve the next new_taipei_city dig-point year.

    Without *params*, ask the web server (clamped to the current year).
    With *params*, advance one year; returns S_ERR once the stop year is hit.
    """
    if not params:
        server = cfg.config.get('web_server', 'http://106.187.101.193:5346')
        the_url = server + '/get/new_taipei_city_dig_point_next_year'
        http_result = util.http_multiget([the_url])
        next_year = util._int(
            util.json_loads(http_result.get(the_url, ''), ''),
            START_NEW_TAIPEI_CITY_DIG_POINT_YEAR)
        next_year = min(next_year, _get_this_year())
        cfg.logger.debug('after http_multiget: http_result: %s next_year: %s', http_result, next_year)
        return (S_OK, {'next_year': next_year})
    next_year = params.get('next_year', START_NEW_TAIPEI_CITY_DIG_POINT_YEAR)
    if next_year == _get_stop_year():
        return (S_ERR, {'next_year': next_year})
    return (S_OK, {'next_year': next_year + 1})
def do_crawler_new_taipei_city_dig_point(next_year):
    """Run the new-taipei dig-point crawler for *next_year* and dump the
    resulting data to a JSON log file."""
    results = crawler_new_taipei_city_dig_point({'next_year': util._int(next_year)})
    util.to_json(results['data'], 'log.new_taipei_city_dig_point.json')
def _extract_session_struct_from_session_key(session_key):
    """Rebuild a session struct {key, the_timestamp} from a serialized key;
    the session-id part of the key is ignored here."""
    (the_timestamp, _session_id) = _deserialize_session_key(session_key)
    return {"key": session_key, "the_timestamp": util._int(the_timestamp)}
def p_img_handler(data, content_type, idx):
    """Persist an uploaded image and echo its (int) index in the result."""
    postfix = _parse_postfix(content_type)
    result = _save_img(data, postfix, content_type)
    result['the_idx'] = util._int(idx)
    return result
def _sleep():
    """Sleep for the configured 'time_sleep' interval (default: one day)."""
    seconds = util._int(cfg.config.get('time_sleep', 86400))
    cfg.logger.debug('to sleep: time_sleep: %s', seconds)
    time.sleep(seconds)