Example #1
# Imports used by this example; helpers such as RequestData, getList, id2Url,
# getVideoMid, Videodetail, sendToMQ, DateFormat, CurrentTime, TenDaysAgoTime
# and TRACESOURCEID come from the surrounding crawler module.
import json
import sqlite3
import time


def getVideoList():
    pagenum = 1
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is VideoChannel")

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')

    print("Crawling VideoChannel ....")

    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists sohunews (id integer primary key,pid text)"
    )

    #get requestdata
    # Page through the listing until items fall outside the 10-day window.
    while lasttime > TenDaysAgoTime:

        #print pagenum
        listurl = id2Url(36, pagenum)

        time.sleep(1)
        newsLists = json.loads(getList(listurl))

        if "articles" not in newsLists or len(newsLists["articles"]) == 0:
            break
        for newsitem in newsLists["articles"]:
            if "link" not in newsitem or "time" not in newsitem or \
                    newsitem["link"] == "":
                continue
            cursor.execute("select * from sohunews where pid='" +
                           newsitem["link"] + "'")
            if len(cursor.fetchall()) > 0:
                #print("Newsitem has been crawled before, pass...")
                pass
            mid = getVideoMid(newsitem["link"])
            clipurl = Videodetail(mid)
            #print clipurl
            newsdetail = getList(clipurl)
            if newsdetail is None:
                continue
            rd.setClipUrl(clipurl)
            rd.setClipTitle(newsitem["title"])
            rd.setCategory('video')
            rd.setPublishTime(DateFormat(int(newsitem["time"]) // 1000))  # ms -> s
            rd.setViewCount(int(newsitem["commentNum"]))
            if newsitem.has_key("media"):
                rd.setClipSource(newsitem["media"])
            newsdetail = json.loads(newsdetail)

            srcurl = []
            # Collect every non-empty play URL for this clip.
            for item in newsdetail["message"]["playurl"]:
                playurl = newsdetail["message"]["playurl"][item]
                if playurl != "" and playurl != 0 and playurl != []:
                    srcurl.append(playurl)
            rd.setSourceUrl(srcurl)
            if len(srcurl) == 0:
                continue
            print(rd._clip_title +
                  " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ !")
            domaindb.execute("insert into sohunews(pid) values('" +
                             newsitem["link"] + "')")
            domaindb.commit()
            lasttime = rd._publish_time
            #print type(rd._view_count)
        pagenum += 1
    domaindb.close()
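
Both examples de-duplicate against the same SQLite table before sending anything to the queue. Below is a minimal, self-contained sketch of that pattern using parameterized queries instead of string concatenation; the table name sohunews and the pid column are taken from the examples, while already_crawled and mark_crawled are hypothetical helper names.

import sqlite3


def already_crawled(cursor, pid):
    # True if this pid was recorded by a previous run.
    cursor.execute("select 1 from sohunews where pid=?", (pid,))
    return cursor.fetchone() is not None


def mark_crawled(db, pid):
    # Record the pid so later runs skip the item.
    db.execute("insert into sohunews(pid) values(?)", (pid,))
    db.commit()


db = sqlite3.connect("news.db")
cur = db.cursor()
cur.execute("create table if not exists sohunews "
            "(id integer primary key, pid text)")
if not already_crawled(cur, "http://example.com/some-item"):
    mark_crawled(db, "http://example.com/some-item")
db.close()
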
Example #2
# Imports used by this example; helpers such as RequestData, getList,
# AlbumUrl, Albumdetail, sendToMQ, DateFormat, ChannelIds, CurrentTime,
# TenDaysAgoTime and TRACESOURCEID come from the surrounding crawler module.
import json
import sqlite3
import time

import xmltodict


def getAlbumList(channelid):
    pagenum = 1
    lasttime = CurrentTime

    print("Crawling Channels ... channelid is " + channelid)

    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')

    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists sohunews (id integer primary key,pid text)"
    )
    #get requestdata
    # Page through the listing until items fall outside the 10-day window.
    while lasttime > TenDaysAgoTime:

        #print pagenum
        listurl = AlbumUrl(ChannelIds[channelid], pagenum)

        time.sleep(1)
        newsLists = json.loads(getList(listurl))

        if "news" not in newsLists or len(newsLists["news"]) == 0:
            break
        for newsitem in newsLists["news"]:
            if not newsitem.has_key("gid"):
                continue
            cursor.execute("select * from sohunews where pid='" +
                           str(newsitem["gid"]) + "'")
            if len(cursor.fetchall()) > 0:
                #print("Newsitem has been crawled before, pass...")
                continue
            clipurl = Albumdetail(str(newsitem["gid"]))
            newsdetail = getList(clipurl)
            if newsdetail is None:
                continue
            rd.setClipUrl(clipurl)
            rd.setClipTitle(newsitem["title"])
            rd.setPublishTime(DateFormat(int(newsitem["time"]) // 1000))  # ms -> s
            rd.setViewCount(int(newsitem["commentNum"]))
            rd.setCategory('image')

            newsdetail = xmltodict.parse(newsdetail)

            srcurl = []
            new_srcurl = []
            if newsdetail["root"].has_key("gallery"):
                if type(newsdetail["root"]["gallery"]
                        ["photo"]) is types.ListType:
                    for img in newsdetail["root"]["gallery"]["photo"]:

                        srcurl.append(img["pic"])
                else:
                    srcurl.append(
                        newsdetail["root"]["gallery"]["photo"]["pic"])

            if len(srcurl) == 0:
                continue

            # FIX https://seals.vobile.cn/trac/ProjectManagement/ticket/743
            # Some picture fields hold several comma-joined URLs; keep only
            # the first one (see the sketch after this example).
            for url in srcurl:
                if url.find(',http') > 0:
                    new_srcurl.append(url[:url.find(',http')])
                else:
                    new_srcurl.append(url)

            rd.setSourceUrl(new_srcurl)
            print(rd._clip_title +
                  " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ !")
            domaindb.execute("insert into sohunews(pid) values('" +
                             str(newsitem["gid"]) + "')")
            domaindb.commit()
            lasttime = rd._publish_time

        pagenum += 1
    domaindb.close()
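
The loop tagged with the ticket above trims gallery entries in which several URLs were joined into one comma-separated field. A minimal sketch of that cleanup in isolation, assuming the input is a list of picture URL strings; clean_source_urls is a hypothetical helper name, not part of the crawler.

def clean_source_urls(srcurl):
    # Keep only the first URL when an entry holds several comma-joined
    # URLs, e.g. "http://img1.jpg,http://img2.jpg".
    cleaned = []
    for url in srcurl:
        cut = url.find(',http')
        cleaned.append(url[:cut] if cut > 0 else url)
    return cleaned


print(clean_source_urls(["http://img1.jpg,http://img2.jpg", "http://img3.jpg"]))
# ['http://img1.jpg', 'http://img3.jpg']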