def get_info_history(session):
    """Crawl geo-tagged posts around every seed coordinate.

    For each of the first ``geo_num`` entries of ``QUERY_COORDINATE_LIST``
    the seed is handed to ``fourtree``, which returns a list of cells
    (dicts with ``'coordinate'`` and ``'geo_range'``).  Every cell is then
    paged through with ``get_weibo_by_coordinate`` and each non-empty page
    is passed to ``save_data_by_db``.

    Paging of a cell stops at the first empty page, at the first page for
    which ``save_data_by_db`` returns a falsy value, or after page 19.

    :param session: HTTP session forwarded unchanged to ``fourtree`` and
        ``get_weibo_by_coordinate``.
    :return: None
    """
    geo_num = 63      # number of seed coordinates that are processed
    page_count = 50   # posts requested per page
    max_page = 20     # exclusive bound: pages 1..19 are requested

    # The last two characters of the converted timestamp are dropped
    # (presumably a trailing ".0" -- confirm against convert_time).
    starttime = str(convert_time('2015', '1', '1', '0'))[:-2]
    endtime = starttime

    for p_id in range(geo_num):
        cells = fourtree(session, QUERY_COORDINATE_LIST[p_id], starttime,
                         DISTANCE, INTER_LAT, INTER_LON)
        for cell in cells:
            coordinate = cell['coordinate']
            geo_range = cell['geo_range']
            for page in range(1, max_page):
                # Random pause before every API request.
                wait_time(random.randint(10, 15))
                info_list = get_weibo_by_coordinate(
                    session, coordinate, starttime, endtime, geo_range,
                    0, page_count, page, 0)
                if not info_list:
                    break
                if not save_data_by_db(info_list):
                    break
        # NOTE(review): the source had lost its indentation; this pause is
        # assumed to run once per seed coordinate -- confirm.
        wait_time(random.randint(10, 20))
def search_info(session, keyword="", start_time="", end_time="", num=1, location=0):
    """Walk the search result pages for *keyword* and process each one.

    Pages ``START_PAGE`` .. ``START_PAGE + TOTAL_PAGE - 1`` are requested
    in order.  Each response body is UTF-8 encoded and passed to
    ``save_catch_page``; the value it returns is checked with ``out_page``
    and, if accepted, handed to ``get_page_info`` together with the
    running counter.

    :param session: HTTP session used for the GET requests; also forwarded
        to ``get_page_info``.
    :param keyword: search term, inserted into the URL path as-is.
    :param start_time: lower bound of the custom time scope, inserted as-is.
    :param end_time: upper bound of the custom time scope, inserted as-is.
    :param num: running counter; replaced by the return value of
        ``get_page_info`` after every processed page.
    :param location: when not ``0`` the ``haslink=1`` filter is added to
        the query; also forwarded to ``get_page_info``.
    :return: the latest value of ``num``.  Returned early as soon as
        ``out_page`` reports a falsy result for a page.
    """
    haslink = "&haslink=1" if location != 0 else ""

    for page in range(START_PAGE, START_PAGE + TOTAL_PAGE):
        # NOTE: the parameter name is "timescope".  Keep it spelled out
        # literally; HTML-decoding "&times" turns it into a multiplication
        # sign and silently disables the time filter.
        url = ('http://s.weibo.com/weibo/' + keyword + '&scope=ori' + haslink
               + '&timescope=custom:' + start_time + ':' + end_time
               + '&page=' + str(page) + '&rd=newTips')

        # Random pause before every request.
        wait_time(random.randint(10, 30))

        page_text = session.get(url).text
        page_text = u'' + page_text  # coerce to a unicode string
        content_text = save_catch_page(page_text.encode('utf-8'))

        if not out_page(content_text):
            return num
        num = get_page_info(content_text, session, location, num)
    return num
def fourtree(session, coordinate, starttime, geo_range, inter_lat, inter_lon):
    """Recursively split a query area into four sub-areas while it has data.

    Page 20 (50 posts per page) is requested for *coordinate* with radius
    *geo_range*.  If that page is empty the area is returned as a single
    cell.  Otherwise the page is saved with ``save_data_by_db`` and the
    area is split into four children whose centres are shifted by half of
    *inter_lat* / *inter_lon*; each child is processed recursively with a
    reduced radius.

    :param session: HTTP session forwarded to ``get_weibo_by_coordinate``.
    :param coordinate: dict with ``'latitude'`` and ``'longitude'`` values
        convertible by ``float``.
    :param starttime: forwarded unchanged to ``get_weibo_by_coordinate``.
    :param geo_range: query radius; must be convertible by ``float``.
    :param inter_lat: latitude spacing of the current grid level.
    :param inter_lon: longitude spacing of the current grid level.
    :return: list of ``{'coordinate': ..., 'geo_range': ...}`` dicts: the
        cells of the four children followed by an entry for *coordinate*
        itself, or just the entry for *coordinate* if it was not split.
    """
    wait_time(random.randint(2, 5))
    info_list = get_weibo_by_coordinate(session, coordinate, starttime, 0,
                                        geo_range, 0, 50, 20, 0)
    if not info_list:
        print(0)
        return [{'coordinate': coordinate, 'geo_range': geo_range}]

    print(geo_range)
    save_data_by_db(info_list)

    half_lat = round(inter_lat / 2, 6)
    half_lon = round(inter_lon / 2, 6)
    if half_lat == 0 and half_lon == 0:
        # Both offsets have rounded to zero: every child would be centred
        # on this very coordinate, so recursing could never terminate.
        return [{'coordinate': coordinate, 'geo_range': geo_range}]

    # Radius shrinks by a factor of 0.65 per level, never below 100.
    child_range = max(int(round(float(geo_range) / 2 * 1.3)), 100)

    lat = float(coordinate['latitude'])
    lon = float(coordinate['longitude'])

    cells = []
    # Latitude is shifted by the latitude step and longitude by the
    # longitude step (the two must not be swapped).
    for lat_sign, lon_sign in ((1, 1), (-1, 1), (1, -1), (-1, -1)):
        child = {
            'latitude': str(round(lat + lat_sign * half_lat, 6)),
            'longitude': str(round(lon + lon_sign * half_lon, 6)),
        }
        cells += fourtree(session, child, starttime, child_range,
                          half_lat, half_lon)

    # The parent itself is reported with the reduced radius.
    cells.append({'coordinate': coordinate, 'geo_range': child_range})
    return cells
def wblogin(username, password):
    """Log in through the Sina SSO endpoints and return the final payload.

    Steps, all performed on the module-level ``session``:

    1. GET ``prelogin.php`` and parse the JSON object embedded in the
       callback response (``servertime``, ``nonce``, ``pubkey``, ``rsakv``
       are read from it).
    2. POST the login form to ``login.php``; the ``sp`` field is produced
       by ``encrypt_passwd``.
    3. GET the URL found inside the ``replace("...")`` call of the login
       response and parse the JSON object embedded in that reply.

    :param username: account name.
    :param password: password, passed to ``encrypt_passwd``.
    :return: the object decoded from the JSON embedded in the last response.
    :raises AttributeError: if a response does not contain the expected
        pattern (the regular expression does not match).
    """
    # b64encode returns bytes on Python 3; decode it so that "%s" inserts
    # the bare base64 text instead of "b'...'".
    encoded_user = base64.b64encode(username.encode('utf-8')).decode('ascii')
    prelogin_url = (
        'http://login.sina.com.cn/sso/prelogin.php?'
        'entry=sso&callback=sinaSSOController.preloginCallBack&'
        'su=%s&rsakt=mod&client=%s' % (encoded_user, WBCLIENT)
    )
    resp = session.get(prelogin_url)
    wait_time(3)

    # The response is "callback({...})"; grab the first {...} group.
    pre_login_str = re.match(r'[^{]+({.+?})', resp.text).group(1)
    pre_login = json.loads(pre_login_str)

    data = {
        'entry': 'weibo',
        'gateway': 1,
        'from': '',
        'savestate': 7,
        'userticket': 1,
        'ssosimplelogin': 1,
        'su': base64.b64encode(requests.utils.quote(username).encode('utf-8')),
        'service': 'miniblog',
        'servertime': pre_login['servertime'],
        'nonce': pre_login['nonce'],
        'vsnf': 1,
        'vsnval': '',
        'pwencode': 'rsa2',
        'sp': encrypt_passwd(password, pre_login['pubkey'],
                             pre_login['servertime'], pre_login['nonce']),
        'rsakv': pre_login['rsakv'],
        'encoding': 'UTF-8',
        'prelt': '115',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.si'
               'naSSOController.feedBackUrlCallBack',
        'returntype': 'META',
    }
    resp = session.post(
        'http://login.sina.com.cn/sso/login.php?client=%s' % WBCLIENT,
        data=data
    )

    # The login reply redirects via location.replace("<url>").
    login_url = re.search(r'replace\([\"\']([^\'\"]+)[\"\']', resp.text).group(1)
    resp = session.get(login_url)
    login_str = re.match(r'[^{]+({.+?}})', resp.text).group(1)
    return json.loads(login_str)
def search_info(session, keyword="", start_time="", end_time="", num=1, location=0):
    """Walk the search result pages for *keyword* and process each one.

    Pages ``START_PAGE`` .. ``START_PAGE + TOTAL_PAGE - 1`` are requested
    in order.  Each response body is UTF-8 encoded and passed to
    ``save_catch_page``; the value it returns is checked with ``out_page``
    and, if accepted, handed to ``get_page_info`` together with the
    running counter.

    :param session: HTTP session used for the GET requests; also forwarded
        to ``get_page_info``.
    :param keyword: search term, inserted into the URL path as-is.
    :param start_time: lower bound of the custom time scope, inserted as-is.
    :param end_time: upper bound of the custom time scope, inserted as-is.
    :param num: running counter; replaced by the return value of
        ``get_page_info`` after every processed page.
    :param location: when not ``0`` the ``haslink=1`` filter is added to
        the query; also forwarded to ``get_page_info``.
    :return: the latest value of ``num``.  Returned early as soon as
        ``out_page`` reports a falsy result for a page.
    """
    haslink = "&haslink=1" if location != 0 else ""

    for page in range(START_PAGE, START_PAGE + TOTAL_PAGE):
        # NOTE: the parameter name is "timescope".  Keep it spelled out
        # literally; HTML-decoding "&times" turns it into a multiplication
        # sign and silently disables the time filter.
        url = ('http://s.weibo.com/weibo/' + keyword + '&scope=ori' + haslink
               + '&timescope=custom:' + start_time + ':' + end_time
               + '&page=' + str(page) + '&rd=newTips')

        # Random pause before every request.
        wait_time(random.randint(10, 30))

        page_text = session.get(url).text
        page_text = u'' + page_text  # coerce to a unicode string
        content_text = save_catch_page(page_text.encode('utf-8'))

        if not out_page(content_text):
            return num
        num = get_page_info(content_text, session, location, num)
    return num
def get_info_history(session):
    """Crawl geo-tagged posts around every seed coordinate.

    For each of the first ``geo_num`` entries of ``QUERY_COORDINATE_LIST``
    the seed is handed to ``fourtree``, which returns a list of cells
    (dicts with ``'coordinate'`` and ``'geo_range'``).  Every cell is then
    paged through with ``get_weibo_by_coordinate`` and each non-empty page
    is passed to ``save_data_by_db``.

    Paging of a cell stops at the first empty page, at the first page for
    which ``save_data_by_db`` returns a falsy value, or after page 19.

    :param session: HTTP session forwarded unchanged to ``fourtree`` and
        ``get_weibo_by_coordinate``.
    :return: None
    """
    geo_num = 63      # number of seed coordinates that are processed
    page_count = 50   # posts requested per page
    max_page = 20     # exclusive bound: pages 1..19 are requested

    # The last two characters of the converted timestamp are dropped
    # (presumably a trailing ".0" -- confirm against convert_time).
    starttime = str(convert_time('2015', '1', '1', '0'))[:-2]
    endtime = starttime

    for p_id in range(geo_num):
        cells = fourtree(session, QUERY_COORDINATE_LIST[p_id], starttime,
                         DISTANCE, INTER_LAT, INTER_LON)
        for cell in cells:
            coordinate = cell['coordinate']
            geo_range = cell['geo_range']
            for page in range(1, max_page):
                # Random pause before every API request.
                wait_time(random.randint(10, 15))
                info_list = get_weibo_by_coordinate(
                    session, coordinate, starttime, endtime, geo_range,
                    0, page_count, page, 0)
                if not info_list:
                    break
                if not save_data_by_db(info_list):
                    break
        # NOTE(review): the source had lost its indentation; this pause is
        # assumed to run once per seed coordinate -- confirm.
        wait_time(random.randint(10, 20))
def fourtree(session, coordinate, starttime, geo_range, inter_lat, inter_lon):
    """Recursively split a query area into four sub-areas while it has data.

    Page 20 (50 posts per page) is requested for *coordinate* with radius
    *geo_range*.  If that page is empty the area is returned as a single
    cell.  Otherwise the page is saved with ``save_data_by_db`` and the
    area is split into four children whose centres are shifted by half of
    *inter_lat* / *inter_lon*; each child is processed recursively with a
    reduced radius.

    :param session: HTTP session forwarded to ``get_weibo_by_coordinate``.
    :param coordinate: dict with ``'latitude'`` and ``'longitude'`` values
        convertible by ``float``.
    :param starttime: forwarded unchanged to ``get_weibo_by_coordinate``.
    :param geo_range: query radius; must be convertible by ``float``.
    :param inter_lat: latitude spacing of the current grid level.
    :param inter_lon: longitude spacing of the current grid level.
    :return: list of ``{'coordinate': ..., 'geo_range': ...}`` dicts: the
        cells of the four children followed by an entry for *coordinate*
        itself, or just the entry for *coordinate* if it was not split.
    """
    wait_time(random.randint(2, 5))
    info_list = get_weibo_by_coordinate(session, coordinate, starttime, 0,
                                        geo_range, 0, 50, 20, 0)
    if not info_list:
        print(0)
        return [{'coordinate': coordinate, 'geo_range': geo_range}]

    print(geo_range)
    save_data_by_db(info_list)

    half_lat = round(inter_lat / 2, 6)
    half_lon = round(inter_lon / 2, 6)
    if half_lat == 0 and half_lon == 0:
        # Both offsets have rounded to zero: every child would be centred
        # on this very coordinate, so recursing could never terminate.
        return [{'coordinate': coordinate, 'geo_range': geo_range}]

    # Radius shrinks by a factor of 0.65 per level, never below 100.
    child_range = max(int(round(float(geo_range) / 2 * 1.3)), 100)

    lat = float(coordinate['latitude'])
    lon = float(coordinate['longitude'])

    cells = []
    # Latitude is shifted by the latitude step and longitude by the
    # longitude step (the two must not be swapped).
    for lat_sign, lon_sign in ((1, 1), (-1, 1), (1, -1), (-1, -1)):
        child = {
            'latitude': str(round(lat + lat_sign * half_lat, 6)),
            'longitude': str(round(lon + lon_sign * half_lon, 6)),
        }
        cells += fourtree(session, child, starttime, child_range,
                          half_lat, half_lon)

    # The parent itself is reported with the reduced radius.
    cells.append({'coordinate': coordinate, 'geo_range': child_range})
    return cells
def get_weibo_by_coordinate(session, coordinate, starttime, endtime, range=2000, sort=0, count=20, page=1, offset=0):
    """Request one page of ``place/nearby_timeline`` around *coordinate*.

    A source key is picked at random from ``APP_SOURCE_LIST`` for every
    attempt.  The request is retried indefinitely: after a 403 response
    (short random pause, or 15600 seconds once every source key has
    produced a 403 during this call) and after any exception raised while
    requesting.

    :param session: HTTP session whose ``get`` method performs the request.
    :param coordinate: dict with ``'latitude'`` and ``'longitude'``.
    :param starttime: sent as the ``starttime`` query parameter.
    :param endtime: accepted for interface compatibility; it is not part
        of the request that is sent.
    :param range: sent as ``range`` (the name shadows the builtin and is
        kept because callers may pass it by keyword).
    :param sort: sent as ``sort``.
    :param count: sent as ``count``.
    :param page: sent as ``page``.
    :param offset: sent as ``offset``.
    :return: the value stored under ``'statuses'`` in the decoded JSON
        reply, or ``None`` if the reply cannot be decoded or has no such
        key.
    """
    # NOTE(review): this compares an int year with a datetime object, which
    # is always unequal, so the log is re-initialised on every call.  Left
    # unchanged because the intended granularity (year/day) is unclear.
    if log_date.log_date.year != datetime.datetime.now():
        log_date.change_log_date()
        init_log()

    # Built once, outside the retry loop: a malformed ``coordinate`` now
    # raises immediately instead of being retried forever by the
    # catch-all handler below.
    endpoint = "http://api.weibo.com/2/place/nearby_timeline.json?source="
    query_tail = (
        "&lat=%s&long=%s&starttime=%s&range=%s&sort=%s"
        "&count=%s&page=%s&offset=%s"
        % (coordinate['latitude'], coordinate['longitude'], starttime,
           range, sort, count, page, offset)
    )

    source_count = len(APP_SOURCE_LIST)
    blocked = [0] * source_count  # 1 for every source key that got a 403
    failures = 0                  # only used in the debug log messages

    while True:
        try:
            app_id = random.randint(0, source_count - 1)
            text = session.get(endpoint + APP_SOURCE_LIST[app_id] + query_tail)
            if text.status_code != 403:
                break
            blocked[app_id] = 1
            if all(blocked):
                sleep_time = 15600
            else:
                sleep_time = random.randint(12, 30)
            wait_time(sleep_time)
        except ConnectionError as exc:
            failures += 1
            lg_warning(exc)
            lg_debug('connect fail' + str(failures))
            # Pass a number, like every other wait_time() call.
            wait_time(random.randint(6, 10))
        except Exception as exc:
            failures += 1
            print('Connection reset by peer error')
            lg_warning(exc)
            lg_debug('Connection reset by peer' + str(failures))
            wait_time(random.randint(10, 20))

    text_list_dict = None
    try:
        text_dict = text.json()
        if 'statuses' in text_dict:
            text_list_dict = text_dict['statuses']
            lg_debug('success catch the info_list')
        else:
            lg_debug("get_weibo_by_coordinate: No Json")
    except Exception as exc:
        lg_warning(exc)
        lg_debug("get_weibo_by_coordinate: No Json")
    return text_list_dict
def wblogin(username, password):
    """Log in through the Sina SSO endpoints and return the final payload.

    Steps, all performed on the module-level ``session``:

    1. GET ``prelogin.php`` and parse the JSON object embedded in the
       callback response (``servertime``, ``nonce``, ``pubkey``, ``rsakv``
       are read from it).
    2. POST the login form to ``login.php``; the ``sp`` field is produced
       by ``encrypt_passwd``.
    3. GET the URL found inside the ``replace("...")`` call of the login
       response and parse the JSON object embedded in that reply.

    :param username: account name.
    :param password: password, passed to ``encrypt_passwd``.
    :return: the object decoded from the JSON embedded in the last response.
    :raises AttributeError: if a response does not contain the expected
        pattern (the regular expression does not match).
    """
    # b64encode returns bytes on Python 3; decode it so that "%s" inserts
    # the bare base64 text instead of "b'...'".
    encoded_user = base64.b64encode(username.encode('utf-8')).decode('ascii')
    prelogin_url = (
        'http://login.sina.com.cn/sso/prelogin.php?'
        'entry=sso&callback=sinaSSOController.preloginCallBack&'
        'su=%s&rsakt=mod&client=%s' % (encoded_user, WBCLIENT)
    )
    resp = session.get(prelogin_url)
    wait_time(3)

    # The response is "callback({...})"; grab the first {...} group.
    pre_login_str = re.match(r'[^{]+({.+?})', resp.text).group(1)
    pre_login = json.loads(pre_login_str)

    data = {
        'entry': 'weibo',
        'gateway': 1,
        'from': '',
        'savestate': 7,
        'userticket': 1,
        'ssosimplelogin': 1,
        'su': base64.b64encode(requests.utils.quote(username).encode('utf-8')),
        'service': 'miniblog',
        'servertime': pre_login['servertime'],
        'nonce': pre_login['nonce'],
        'vsnf': 1,
        'vsnval': '',
        'pwencode': 'rsa2',
        'sp': encrypt_passwd(password, pre_login['pubkey'],
                             pre_login['servertime'], pre_login['nonce']),
        'rsakv': pre_login['rsakv'],
        'encoding': 'UTF-8',
        'prelt': '115',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.si'
               'naSSOController.feedBackUrlCallBack',
        'returntype': 'META',
    }
    resp = session.post(
        'http://login.sina.com.cn/sso/login.php?client=%s' % WBCLIENT,
        data=data
    )

    # The login reply redirects via location.replace("<url>").
    login_url = re.search(r'replace\([\"\']([^\'\"]+)[\"\']', resp.text).group(1)
    resp = session.get(login_url)
    login_str = re.match(r'[^{]+({.+?}})', resp.text).group(1)
    return json.loads(login_str)
def get_weibo_by_coordinate(session, coordinate, starttime, endtime, range=2000, sort=0, count=20, page=1, offset=0):
    """Request one page of ``place/nearby_timeline`` around *coordinate*.

    A source key is picked at random from ``APP_SOURCE_LIST`` for every
    attempt.  The request is retried indefinitely: after a 403 response
    (short random pause, or 15600 seconds once every source key has
    produced a 403 during this call) and after any exception raised while
    requesting.

    :param session: HTTP session whose ``get`` method performs the request.
    :param coordinate: dict with ``'latitude'`` and ``'longitude'``.
    :param starttime: sent as the ``starttime`` query parameter.
    :param endtime: accepted for interface compatibility; it is not part
        of the request that is sent.
    :param range: sent as ``range`` (the name shadows the builtin and is
        kept because callers may pass it by keyword).
    :param sort: sent as ``sort``.
    :param count: sent as ``count``.
    :param page: sent as ``page``.
    :param offset: sent as ``offset``.
    :return: the value stored under ``'statuses'`` in the decoded JSON
        reply, or ``None`` if the reply cannot be decoded or has no such
        key.
    """
    # NOTE(review): this compares an int year with a datetime object, which
    # is always unequal, so the log is re-initialised on every call.  Left
    # unchanged because the intended granularity (year/day) is unclear.
    if log_date.log_date.year != datetime.datetime.now():
        log_date.change_log_date()
        init_log()

    # Built once, outside the retry loop: a malformed ``coordinate`` now
    # raises immediately instead of being retried forever by the
    # catch-all handler below.
    endpoint = "http://api.weibo.com/2/place/nearby_timeline.json?source="
    query_tail = (
        "&lat=%s&long=%s&starttime=%s&range=%s&sort=%s"
        "&count=%s&page=%s&offset=%s"
        % (coordinate['latitude'], coordinate['longitude'], starttime,
           range, sort, count, page, offset)
    )

    source_count = len(APP_SOURCE_LIST)
    blocked = [0] * source_count  # 1 for every source key that got a 403
    failures = 0                  # only used in the debug log messages

    while True:
        try:
            app_id = random.randint(0, source_count - 1)
            text = session.get(endpoint + APP_SOURCE_LIST[app_id] + query_tail)
            if text.status_code != 403:
                break
            blocked[app_id] = 1
            if all(blocked):
                sleep_time = 15600
            else:
                sleep_time = random.randint(12, 30)
            wait_time(sleep_time)
        except ConnectionError as exc:
            failures += 1
            lg_warning(exc)
            lg_debug('connect fail' + str(failures))
            # Pass a number, like every other wait_time() call.
            wait_time(random.randint(6, 10))
        except Exception as exc:
            failures += 1
            print('Connection reset by peer error')
            lg_warning(exc)
            lg_debug('Connection reset by peer' + str(failures))
            wait_time(random.randint(10, 20))

    text_list_dict = None
    try:
        text_dict = text.json()
        if 'statuses' in text_dict:
            text_list_dict = text_dict['statuses']
            lg_debug('success catch the info_list')
        else:
            lg_debug("get_weibo_by_coordinate: No Json")
    except Exception as exc:
        lg_warning(exc)
        lg_debug("get_weibo_by_coordinate: No Json")
    return text_list_dict
starttime = convert_time('2015', '7', '1', '0') endtime = starttime starttime = str(starttime) starttime = starttime[:-2] info_list = get_weibo_by_coordinate(session, QUERY_COORDINATE_LIST[p_id], starttime, endtime, geo_range, 0, page_count, p, 0) save_data_by_db(info_list) if not info_list: continue else: pass length = len(info_list) cmpstr1 = info_list[length - 1]['mid'] cmpstr2 = index_num[p_id] index_num[p_id] = info_list[0]['mid'] while arbitrary_precision_compare(cmpstr1, cmpstr2) == 1: p += 1 info_list = get_weibo_by_coordinate(session, QUERY_COORDINATE_LIST[p_id], starttime, endtime, geo_range, 0, page_count, p, 0) save_data_by_db(info_list) if not info_list: break else: pass length = len(info_list) cmpstr1 = info_list[length - 1]['mid'] if p >= 20: break sleep_time = random.randint(10, 20) wait_time(sleep_time)