def getNewsList(date):
    lasttime = CurrentTime
    appLogger.info("Crawling Xinhuashe Video")
    # base RequestData shared by every item crawled from this page
    rd = RequestData()
    rd.setSouceType('app')
    rd.setMetaID('')
    rd.setTrackSourceID(TRACESOURCEID)
    try:
        url = 'http://pub.zhongguowangshi.com/getRecord?date=' + date
        print url
        newsList = getXinhuaList(url)
    except:
        appLogger.info("fail to crawl page")
        return
    for newsurl in newsList:
        rd, isSuccess = setRdInfo(newsurl, rd)
        if not isSuccess:
            continue
        appLogger.info(rd._clip_title + " is successfully crawled, sending to MQ...")
        if len(rd._source_url) != 0 or len(rd._content) != 0:
            sendToMQ(rd)
        lasttime = rd._publish_time
def getNewsList(channel):
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is " + channel['name'])
    pagenum = 0
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while (cmp(lasttime, TenDaysAgoTime) == 1):
        time.sleep(1)
        print channel['name'], pagenum
        try:
            newsList = id2Url(channel, pagenum)
            if newsList == None or len(newsList) < 1:
                break
        except:
            appLogger.error(traceback.format_exc())
            print("fail to crawl page")
            break
        for newsitem in newsList:
            time.sleep(0.5)
            #print(newsitem["title"])
            rd, isSuccess = setRdInfo(newsitem, rd)
            if not isSuccess:
                continue
            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            if len(rd._source_url) != 0 or len(rd._content) != 0:
                sendToMQ(rd)
            lasttime = rd._publish_time
        # only the first page is crawled; pagenum is never advanced
        break
def getChannelNewsList(channelid):
    print("Crawling Channels ... channelid is " + channelid)
    lasttime = CurrentTime
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    Listurl = id2Url(ChannelIds[channelid])
    newsLists = json.loads(getList(Listurl))
    # local sqlite table records which item ids have already been crawled
    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists tencentnews (id integer primary key,pid text)"
    )
    #print channelid
    if len(newsLists["idlist"][0]["ids"]) == 0:
        domaindb.close()
        return
    for newsitem in newsLists["idlist"][0]["ids"]:
        if cmp(lasttime, TenDaysAgoTime) == -1:
            break
        if not int(newsitem["exist"]) == 1:
            continue
        cursor.execute("select * from tencentnews where pid='" + str(newsitem["id"]) + "'")
        if len(cursor.fetchall()) > 0:
            #print("Newsitem has been crawled before, pass...")
            continue
        try:
            rdi, rdv, isSuccess = setRdInfo(channelid, newsitem, rd)
        except:
            isSuccess = False
        if not isSuccess:
            continue
        print("Newsitem is successfully crawled, sending to MQ...")
        if len(rdi._source_url) != 0:
            #print rdi.hostparseToStr()
            sendToMQ(rdi)
        if rdv._outer_clipurl != "":
            #print rdv.linkparseToStr()
            sendTolinkMQ(rdv)
        domaindb.execute("insert into tencentnews(pid) values('" + str(newsitem["id"]) + "')")
        domaindb.commit()
        lasttime = rd._publish_time
        #print rd._publish_time +"::::::"+ rd._clip_title
        time.sleep(0.1)
    domaindb.close()
def getNewsList(channelid):
    pagenum = 1
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is " + channelid)
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    #get requestdata
    while (cmp(lasttime, TenDaysAgoTime) == 1):
        #print pagenum
        listurl = id2Url(ChannelIds[channelid], pagenum)
        print(listurl)
        time.sleep(1)
        newsLists = getList(listurl)
        if newsLists == None:
            # fetch failed, retry the same page
            continue
        newsLists = json.loads(newsLists)
        if not newsLists.has_key("articles") or len(newsLists["articles"]) == 0:
            break
        for newsitem in newsLists["articles"]:
            if not newsitem.has_key("link") or not newsitem.has_key("time") \
                    or newsitem["link"] == "" or newsitem["link"][0:4] == "chan":
                continue
            try:
                rd, isSuccess = setRdInfo(newsitem, rd)
            except:
                isSuccess = False
            if not isSuccess:
                continue
            if len(rd._source_url) == 0:
                continue
            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ !")
            lasttime = rd._publish_time
        pagenum += 1
def getChannelNewsList(channelid):
    print("Crawling Channels ... channelid is " + channelid)
    pagenum = 1
    lasttime = CurrentTime
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    #print channelid
    while cmp(lasttime, TenDaysAgoTime) == 1:
        #print pagenum
        listurl = id2Url(ChannelIds[channelid], pagenum)
        print listurl
        if ChannelIds[channelid] == 'T1457068979049':
            # this channel keys its list under u'\u89c6\u9891' ("Video")
            newsLists = getList(listurl)[u'\u89c6\u9891']
        else:
            newsLists = getList(listurl)[ChannelIds[channelid]]
        if len(newsLists) == 0:
            break
        for newsitem in newsLists:
            #print json.dumps(newsitem)
            try:
                if newsitem.has_key('mp4_url') or newsitem.has_key('m3u8_url'):
                    rd = setVinfo(newsitem, rd)
                else:
                    rd = setRdInfo(newsitem, rd)
                lasttime = rd._publish_time
                if not rd:
                    continue
                #print rd._publish_time +":::::"+ rd._clip_title
                print("Newsitem is successfully crawled, sending to MQ...")
                #print rd.hostparseToStr()
                rd_json = rd.hostparseToJson()
                rd_base64 = base64.encodestring(json.dumps(rd_json))
                setTask(rd_base64)
            except:
                appLogger.error(traceback.format_exc())
                print("Newsitem crawling failed")
                continue
        pagenum += 1
def getNewsList(channel):
    #print channel
    pagenum = 0
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is %s" % channel['name'])
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while (cmp(lasttime, TenDaysAgoTime) == 1):
        time.sleep(1)
        print channel['name'], pagenum
        clipurl = id2Url(channel, pagenum)
        print clipurl
        try:
            newsLists = json.loads(getList(clipurl))
            newsList = newsLists['list']
            if not newsList:
                break
        except:
            print("fail to crawl page")
            pagenum += 1
            continue
        for newsitem in newsList:
            if not newsitem.has_key("id") or newsitem['id'] == '':
                continue
            print(newsitem["title"])
            rd, rdv, isSuccess = setRdInfo(newsitem, rd)
            if not isSuccess:
                continue
            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            if rd._source_url or rd._content:
                sendToMQ(rd)
            if rdv._source_url or rdv._content:
                sendToMQ(rdv)
            lasttime = rd._publish_time
        pagenum += 1
def getNewsList(channel):
    print channel
    pagenum = 1
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is " + channel['name'])
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while (cmp(lasttime, TenDaysAgoTime) == 1):
        time.sleep(1)
        print channel['id'], pagenum
        clipurl = id2Url(channel, pagenum)
        try:
            newsLists = json.loads(getList(clipurl))
            # flatten the nested dataList/list structure into a single list of items
            newsList = []
            for i in newsLists['dataList']:
                for j in i['list']:
                    newsList.append(j)
            if newsList == None or len(newsList) < 1:
                break
        except:
            appLogger.error(traceback.format_exc())
            break
        for newsitem in newsList:
            rd, isSuccess = setRdInfo(newsitem, rd)
            if not isSuccess:
                continue
            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            if len(rd._source_url) != 0 or len(rd._content) != 0:
                sendToMQ(rd)
            lasttime = rd._publish_time
        pagenum += 1
def crawler(config):
    '''
    return RequestData obj
    '''
    rd = RequestData()
    return rd
def getVideoList():
    pagenum = 1
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is VideoChannel")
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    print("Crawling VideoChannel ....")
    # local sqlite table records which items have already been crawled
    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists sohunews (id integer primary key,pid text)"
    )
    #get requestdata
    while (cmp(lasttime, TenDaysAgoTime) == 1):
        #print pagenum
        listurl = id2Url(36, pagenum)
        time.sleep(1)
        newsLists = json.loads(getList(listurl))
        if not newsLists.has_key("articles") or len(newsLists["articles"]) == 0:
            break
        for newsitem in newsLists["articles"]:
            if not newsitem.has_key("link") or not newsitem.has_key("time") or newsitem["link"] == "":
                continue
            cursor.execute("select * from sohunews where pid='" + newsitem["link"] + "'")
            if len(cursor.fetchall()) > 0:
                #print("Newsitem has been crawled before, pass...")
                continue
            mid = getVideoMid(newsitem["link"])
            clipurl = Videodetail(mid)
            #print clipurl
            newsdetail = getList(clipurl)
            if newsdetail == None:
                continue
            rd.setClipUrl(clipurl)
            rd.setClipTitle(newsitem["title"])
            rd.setCategory('video')
            rd.setPublishTime(DateFormat(int(newsitem["time"]) / 1000))
            rd.setViewCount(int(newsitem["commentNum"]))
            if newsitem.has_key("media"):
                rd.setClipSource(newsitem["media"])
            newsdetail = json.loads(newsdetail)
            # collect every non-empty play url of the clip
            srcurl = []
            for item in newsdetail["message"]["playurl"]:
                if newsdetail["message"]["playurl"][item] != "" and \
                        newsdetail["message"]["playurl"][item] != 0 and \
                        newsdetail["message"]["playurl"][item] != []:
                    srcurl.append(newsdetail["message"]["playurl"][item])
            rd.setSourceUrl(srcurl)
            if len(srcurl) == 0:
                continue
            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ !")
            domaindb.execute("insert into sohunews(pid) values('" + newsitem["link"] + "')")
            domaindb.commit()
            lasttime = rd._publish_time
            #print type(rd._view_count)
        pagenum += 1
    domaindb.close()
def getAlbumList(channelid):
    pagenum = 1
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is " + channelid)
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists sohunews (id integer primary key,pid text)"
    )
    #get requestdata
    while (cmp(lasttime, TenDaysAgoTime) == 1):
        #print pagenum
        listurl = AlbumUrl(ChannelIds[channelid], pagenum)
        time.sleep(1)
        newsLists = json.loads(getList(listurl))
        if not newsLists.has_key("news") or len(newsLists["news"]) == 0:
            break
        for newsitem in newsLists["news"]:
            if not newsitem.has_key("gid"):
                continue
            cursor.execute("select * from sohunews where pid='" + str(newsitem["gid"]) + "'")
            if len(cursor.fetchall()) > 0:
                #print("Newsitem has been crawled before, pass...")
                continue
            clipurl = Albumdetail(str(newsitem["gid"]))
            newsdetail = getList(clipurl)
            if newsdetail == None:
                continue
            rd.setClipUrl(clipurl)
            rd.setClipTitle(newsitem["title"])
            rd.setPublishTime(DateFormat(int(newsitem["time"]) / 1000))
            rd.setViewCount(int(newsitem["commentNum"]))
            rd.setCategory('image')
            newsdetail = xmltodict.parse(newsdetail)
            srcurl = []
            new_srcurl = []
            if newsdetail["root"].has_key("gallery"):
                if type(newsdetail["root"]["gallery"]["photo"]) is types.ListType:
                    for img in newsdetail["root"]["gallery"]["photo"]:
                        srcurl.append(img["pic"])
                else:
                    srcurl.append(newsdetail["root"]["gallery"]["photo"]["pic"])
            if len(srcurl) == 0:
                continue
            #FIX https://seals.vobile.cn/trac/ProjectManagement/ticket/743
            # some pic fields contain several urls joined by ',http'; keep only the first
            for url in srcurl:
                if url.find(',http') > 0:
                    new_srcurl.append(url[:url.find(',http')])
                else:
                    new_srcurl.append(url)
            rd.setSourceUrl(new_srcurl)
            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ !")
            domaindb.execute("insert into sohunews(pid) values('" + str(newsitem["gid"]) + "')")
            domaindb.commit()
            lasttime = rd._publish_time
        pagenum += 1
    domaindb.close()
def load_wechat_page(url):
    success = False
    news = []
    try:
        driver = get_wechat_phantomjs_driver()
        driver.get(url)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "lxml")
        logo_url = soup.find('a', attrs={'uigs': "account_name_0"})
        if not logo_url:
            raise Exception("WeChat User Not Found")
        else:
            i = logo_url['href']
            print i
        driver1 = get_wechat_phantomjs_driver()
        driver1.get(i)
        time.sleep(5)
        page_source = driver1.page_source
        soup1 = BeautifulSoup(page_source, "lxml")
        # msgList is embedded as a javascript variable in the account page
        jsonstr = ''
        for i in soup1.find_all('script', attrs={'type': 'text/javascript'}):
            if i.string and 'document.domain' in i.string:
                pattern = re.compile(r"var msgList = ([^']+)]};")
                for j in pattern.findall(i.string):
                    jsonstr = j + ']}'
                    break
        while jsonstr == '':
            # no msgList means a captcha page: submit the auth code and retry
            print("need input auth code!")
            driver1.get_screenshot_as_file('./vcode1.jpg')
            cut_image()
            elem_code = driver1.find_element_by_id('input')
            elem_code.send_keys(get_vcode())
            driver1.find_element_by_id('bt').click()
            for _ in xrange(40):
                time.sleep(0.5)
            driver1.get_screenshot_as_file('./vcode2.jpg')
            # re-parse the page after the auth code has been submitted
            soup1 = BeautifulSoup(driver1.page_source, "lxml")
            for i in soup1.find_all('script', attrs={'type': 'text/javascript'}):
                if i.string and 'document.domain' in i.string:
                    pattern = re.compile(r"var msgList = ([^']+)]};")
                    for j in pattern.findall(i.string):
                        jsonstr = j + ']}'
                        break
        #print driver1.page_source
        open('./wechatjson', 'w+').write(jsonstr)
        for i in json.loads(jsonstr)['list']:
            news.append(i["app_msg_ext_info"])
            for j in i["app_msg_ext_info"]["multi_app_msg_item_list"]:
                news.append(j)
        success = True
        img_news_list = []
        video_news_list = []
        for news_item in news:
            rd1 = RequestData()
            rd1.setSouceType('wechat')
            rd1.setMetaID('')
            rd = rd1.copyrd()
            rd, rdv, isSuccess = gen_wechat_news(news_item, rd)
            if not isSuccess:
                continue
            print json.dumps(rd)
            print json.dumps(rdv)
            img_news_list.append(rd)
            video_news_list.append(rdv)
        with open('wechat.out', 'w+') as f:
            json.dump(
                {
                    "status": 0,
                    "data": {
                        "img_news": img_news_list,
                        "video_news": video_news_list
                    }
                }, f)
    except:
        time.sleep(20)
        with open('wechat.out', 'w+') as f:
            json.dump({
                "status": -1,
                "data": {
                    "error": traceback.format_exc()
                }
            }, f)
    finally:
        try:
            driver.service.process.send_signal(signal.SIGTERM)
            driver.quit()
            driver1.service.process.send_signal(signal.SIGTERM)
            driver1.quit()
        except:
            pass
    return success, news