Example #1
def getNewsList(channel):
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is " + channel['name'])
    pagenum = 0
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    # Page through the channel until items fall outside the ten-day window.
    while lasttime > TenDaysAgoTime:
        time.sleep(1)
        print("%s %s" % (channel['name'], pagenum))
        try:
            newsList = id2Url(channel, pagenum)
            if not newsList:
                break
        except Exception:
            appLogger.error(traceback.format_exc())
            print("failed to crawl page")
            break

        for newsitem in newsList:
            time.sleep(0.5)
            #print(newsitem["title"])
            rd, isSuccess = setRdInfo(newsitem, rd)
            if not isSuccess:
                continue

            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            if len(rd._source_url) != 0 or len(rd._content) != 0:
                sendToMQ(rd)

            lasttime = rd._publish_time
        break  # only page 0 is ever fetched; the while loop ends after one pass
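
All of these crawlers share one pagination pattern: keep fetching pages while the newest publish time seen is still inside a ten-day window. A minimal self-contained sketch of that loop, assuming ISO-formatted time strings (so string comparison matches chronological order) and a hypothetical fetch_page(pagenum) helper:

import datetime

def crawl_window(fetch_page):
    # fetch_page(pagenum) -> list of dicts with a "publish_time" key (hypothetical).
    now = datetime.datetime.now()
    fmt = "%Y-%m-%d %H:%M:%S"
    lasttime = now.strftime(fmt)                                      # CurrentTime
    ten_days_ago = (now - datetime.timedelta(days=10)).strftime(fmt)  # TenDaysAgoTime

    pagenum = 0
    while lasttime > ten_days_ago:   # ISO strings compare chronologically
        items = fetch_page(pagenum)
        if not items:
            break
        for item in items:
            lasttime = item["publish_time"]
        pagenum += 1
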
Example #2
def getNewsList(date):
    lasttime = CurrentTime

    appLogger.info("Crawling Xinhuashe Video")

    rd = RequestData()
    rd.setSouceType('app')
    rd.setMetaID('')
    rd.setTrackSourceID(TRACESOURCEID)
    try:
        url = 'http://pub.zhongguowangshi.com/getRecord?date=' + date
        print(url)
        newsList = getXinhuaList(url)

    except Exception:
        appLogger.info("failed to crawl page")
        return
    for newsurl in newsList:
        rd, isSuccess = setRdInfo(newsurl, rd)
        if not isSuccess:
            continue

        appLogger.info(rd._clip_title +
                       " is successfully crawled, sending to MQ...")
        if len(rd._source_url) != 0 or len(rd._content) != 0:
            sendToMQ(rd)
        lasttime = rd._publish_time
Example #3
def getChannelNewsList(channelid):
    print("Crawling Channels ... channelid is " + channelid)

    lasttime = CurrentTime

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')

    Listurl = id2Url(ChannelIds[channelid])
    newsLists = json.loads(getList(Listurl))

    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists tencentnews (id integer primary key,pid text)"
    )
    #print channelid
    if len(newsLists["idlist"][0]["ids"]) == 0:
        return
    for newsitem in newsLists["idlist"][0]["ids"]:
        if lasttime < TenDaysAgoTime:
            break
        if int(newsitem["exist"]) != 1:
            continue
        cursor.execute("select * from tencentnews where pid='" +
                       str(newsitem["id"]) + "'")
        if len(cursor.fetchall()) > 0:
            #print("Newsitem has been crawled before, pass...")
            continue
        try:
            rdi, rdv, isSuccess = setRdInfo(channelid, newsitem, rd)
        except Exception:
            isSuccess = False
        if not isSuccess:
            continue
        print("Newsitem is successfully crawled , sending to MQ...")
        if len(rdi._source_url) != 0:
            #print rdi.hostparseToStr()
            sendToMQ(rdi)
        if rdv._outer_clipurl != "":
            #print rdv.linkparseToStr()
            sendTolinkMQ(rdv)
        domaindb.execute("insert into tencentnews(pid) values('" +
                         str(newsitem["id"]) + "')")
        domaindb.commit()
        lasttime = rd._publish_time
        #print rd._publish_time +"::::::"+ rd._clip_title
        time.sleep(0.1)
    domaindb.close()
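
The sqlite3 bookkeeping in Example #3 is a simple seen-set: create the table if needed, skip any pid already recorded, and record the pid once the item has been sent. The same pattern in isolation, using parameterized queries (table and column names taken from the example above):

import sqlite3

def is_new_item(db, pid):
    # True (and pid recorded) the first time, False on every later call.
    cursor = db.cursor()
    cursor.execute(
        "create table if not exists tencentnews (id integer primary key, pid text)")
    cursor.execute("select * from tencentnews where pid=?", (pid,))
    if len(cursor.fetchall()) > 0:
        return False   # crawled before
    db.execute("insert into tencentnews(pid) values(?)", (pid,))
    db.commit()
    return True

db = sqlite3.connect("news.db")
print(is_new_item(db, "12345"))   # True on the first call
print(is_new_item(db, "12345"))   # False afterwards
db.close()
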
Example #4
def getNewsList(channelid):
    pagenum = 1
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is " + channelid)

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    #get requestdata
    while lasttime > TenDaysAgoTime:

        #print pagenum
        listurl = id2Url(ChannelIds[channelid], pagenum)
        print(listurl)
        time.sleep(1)

        newsLists = getList(listurl)

        if newsLists is None:
            continue  # retries the same page after the one-second sleep
        newsLists = json.loads(newsLists)

        if not newsLists.has_key("articles") or len(
                newsLists["articles"]) == 0:
            break
        for newsitem in newsLists["articles"]:
            if not newsitem.has_key("link") or not newsitem.has_key(
                    "time"
            ) or newsitem["link"] == "" or newsitem["link"][0:4] == "chan":
                continue
            try:
                rd, isSuccess = setRdInfo(newsitem, rd)
            except Exception:
                isSuccess = False
            if not isSuccess:
                continue
            if len(rd._source_url) == 0:
                continue

            print(rd._clip_title +
                  " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ !")

            lasttime = rd._publish_time

        pagenum += 1
Example #5
def getNewsList(channel):
    #print channel
    pagenum = 0
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is %s" % channel['name'])

    rd = RequestData()

    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while lasttime > TenDaysAgoTime:
        time.sleep(1)
        print("%s %s" % (channel['name'], pagenum))
        clipurl = id2Url(channel, pagenum)
        print(clipurl)
        try:
            newsLists = json.loads(getList(clipurl))
            newsList = newsLists['list']
            if not newsList:
                break
        except Exception:
            print("failed to crawl page")
            pagenum += 1
            continue

        for newsitem in newsList:
            if not newsitem.has_key("id") or newsitem['id'] == '':
                continue

            print(newsitem["title"])

            rd, rdv, isSuccess = setRdInfo(newsitem, rd)
            if not isSuccess:
                continue

            print(rd._clip_title +
                  " is successfully crawled, sending to MQ...")
            if rd._source_url or rd._content:
                sendToMQ(rd)
            if rdv._source_url or rdv._content:
                sendToMQ(rdv)

            lasttime = rd._publish_time
        pagenum += 1
Example #6
def getChannelNewsList(channelid):
    print("Crawling Channels ... channelid is " + channelid)

    pagenum = 1
    lasttime = CurrentTime

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    #print channelid
    while lasttime > TenDaysAgoTime:
        #print pagenum
        listurl = id2Url(ChannelIds[channelid], pagenum)
        print(listurl)
        if ChannelIds[channelid] == 'T1457068979049':
            # u'\u89c6\u9891' is the literal channel key "视频" ("video").
            newsLists = getList(listurl)[u'\u89c6\u9891']
        else:
            newsLists = getList(listurl)[ChannelIds[channelid]]
        if len(newsLists) == 0:
            break
        for newsitem in newsLists:
            #print json.dumps(newsitem)
            try:
                if 'mp4_url' in newsitem or 'm3u8_url' in newsitem:
                    rd = setVinfo(newsitem, rd)
                else:
                    rd = setRdInfo(newsitem, rd)
                # Check rd before touching its attributes.
                if not rd:
                    continue
                lasttime = rd._publish_time
                #print rd._publish_time +":::::"+ rd._clip_title
                print("Newsitem is successfully crawled , sending to MQ...")
                #print rd.hostparseToStr()
                rd_json = rd.hostparseToJson()
                rd_base64 = base64.encodestring(json.dumps(rd_json))
                setTask(rd_base64)
            except Exception:
                appLogger.error(traceback.format_exc())
                print("Newsitem crawling failed")
                continue
        pagenum += 1
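
Example #6 hands work off by serializing the record to JSON and base64-encoding it before setTask. A round-trip sketch of that payload format with an illustrative payload (base64.encodestring is the Python 2 spelling; Python 3 renamed it encodebytes):

import base64
import json

payload = {"clip_title": "demo", "publish_time": "2016-01-01 00:00:00"}
encoded = base64.encodestring(json.dumps(payload))    # newline-wrapped base64 text
decoded = json.loads(base64.decodestring(encoded))
assert decoded == payload
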
Example #7
def getNewsList(channel):
    print(channel)
    pagenum = 1
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is " + channel['name'])

    rd = RequestData()

    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while lasttime > TenDaysAgoTime:
        time.sleep(1)
        print("%s %s" % (channel['id'], pagenum))
        clipurl = id2Url(channel, pagenum)
        try:
            newsLists = json.loads(getList(clipurl))
            # Flatten the nested dataList/list structure into one list of items.
            newsList = []
            for i in newsLists['dataList']:
                for j in i['list']:
                    newsList.append(j)
            if not newsList:
                break
        except Exception:
            appLogger.error(traceback.format_exc())
            break

        for newsitem in newsList:
            rd, isSuccess = setRdInfo(newsitem, rd)
            if not isSuccess:
                continue

            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            if len(rd._source_url) != 0 or len(rd._content) != 0:
                sendToMQ(rd)

            lasttime = rd._publish_time
        pagenum += 1
Example #8
def getVideoList():
    pagenum = 1
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is VideoChannel")

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')

    print("Crawling VideoChannel ....")

    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists sohunews (id integer primary key,pid text)"
    )

    #get requestdata
    while lasttime > TenDaysAgoTime:

        #print pagenum
        listurl = id2Url(36, pagenum)

        time.sleep(1)
        newsLists = json.loads(getList(listurl))

        if not newsLists.has_key("articles") or len(
                newsLists["articles"]) == 0:
            break
        for newsitem in newsLists["articles"]:
            if not newsitem.has_key("link") or not newsitem.has_key(
                    "time") or newsitem["link"] == "":
                continue
            cursor.execute("select * from sohunews where pid='" +
                           newsitem["link"] + "'")
            if len(cursor.fetchall()) > 0:
                #print("Newsitem has been crawled before, pass...")
                pass
            mid = getVideoMid(newsitem["link"])
            clipurl = Videodetail(mid)
            #print clipurl
            newsdetail = getList(clipurl)
            if newsdetail is None:
                continue
            rd.setClipUrl(clipurl)
            rd.setClipTitle(newsitem["title"])
            rd.setCategory('video')
            rd.setPublishTime(DateFormat(int(newsitem["time"]) / 1000))
            rd.setViewCount(int(newsitem["commentNum"]))
            if newsitem.has_key("media"):
                rd.setClipSource(newsitem["media"])
            newsdetail = json.loads(newsdetail)

            srcurl = []
            for item in newsdetail["message"]["playurl"]:
                playurl = newsdetail["message"]["playurl"][item]
                # Skip empty placeholders: "", 0, or [].
                if playurl != "" and playurl != 0 and playurl != []:
                    srcurl.append(playurl)
            if len(srcurl) == 0:
                continue
            rd.setSourceUrl(srcurl)
            print(rd._clip_title +
                  " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ !")
            domaindb.execute("insert into sohunews(pid) values('" +
                             newsitem["link"] + "')")
            domaindb.commit()
            lasttime = rd._publish_time
            #print type(rd._view_count)
        pagenum += 1
    domaindb.close()
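
The playurl handling in Example #8 keeps only entries whose value is not an empty string, zero, or an empty list. The same filter as a standalone function, with a made-up playurl dict for illustration:

def collect_playurls(playurl):
    # Skip the three empty placeholders the crawler ignores: "", 0, [].
    srcurl = []
    for item in playurl:
        if playurl[item] != "" and playurl[item] != 0 and playurl[item] != []:
            srcurl.append(playurl[item])
    return srcurl

sample = {"hd": "http://example.com/a.mp4", "sd": "", "backup": []}
print(collect_playurls(sample))   # ['http://example.com/a.mp4']
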
Example #9
def getAlbumList(channelid):
    pagenum = 1
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is " + channelid)

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')

    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists sohunews (id integer primary key,pid text)"
    )
    #get requestdata
    while lasttime > TenDaysAgoTime:

        #print pagenum
        listurl = AlbumUrl(ChannelIds[channelid], pagenum)

        time.sleep(1)
        newsLists = json.loads(getList(listurl))

        if not newsLists.has_key("news") or len(newsLists["news"]) == 0:
            break
        for newsitem in newsLists["news"]:
            if not newsitem.has_key("gid"):
                continue
            cursor.execute("select * from sohunews where pid='" +
                           str(newsitem["gid"]) + "'")
            if len(cursor.fetchall()) > 0:
                #print("Newsitem has been crawled before, pass...")
                continue
            clipurl = Albumdetail(str(newsitem["gid"]))
            newsdetail = getList(clipurl)
            if newsdetail is None:
                continue
            rd.setClipUrl(clipurl)
            rd.setClipTitle(newsitem["title"])
            rd.setPublishTime(DateFormat(int(newsitem["time"]) / 1000))
            rd.setViewCount(int(newsitem["commentNum"]))
            rd.setCategory('image')

            newsdetail = xmltodict.parse(newsdetail)

            srcurl = []
            new_srcurl = []
            if newsdetail["root"].has_key("gallery"):
                if type(newsdetail["root"]["gallery"]
                        ["photo"]) is types.ListType:
                    for img in newsdetail["root"]["gallery"]["photo"]:

                        srcurl.append(img["pic"])
                else:
                    srcurl.append(
                        newsdetail["root"]["gallery"]["photo"]["pic"])

            if len(srcurl) == 0:
                continue

            #FIX https://seals.vobile.cn/trac/ProjectManagement/ticket/743
            for url in srcurl:
                if url.find(',http') > 0:
                    new_srcurl.append(url[:url.find(',http')])
                else:
                    new_srcurl.append(url)

            rd.setSourceUrl(new_srcurl)
            print(rd._clip_title +
                  " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ !")
            domaindb.execute("insert into sohunews(pid) values('" +
                             str(newsitem["gid"]) + "')")
            domaindb.commit()
            lasttime = rd._publish_time

        pagenum += 1
    domaindb.close()
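
The ticket-743 fix in Example #9 truncates gallery URLs that arrive with a second URL glued on after a comma. The same logic in isolation:

def strip_joined_url(url):
    # Some gallery entries look like "urlA,urlB"; keep only the first URL.
    cut = url.find(',http')
    return url[:cut] if cut > 0 else url

print(strip_joined_url("http://a.example/1.jpg,http://b.example/2.jpg"))
# -> http://a.example/1.jpg
print(strip_joined_url("http://a.example/1.jpg"))
# -> http://a.example/1.jpg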