Example #1
    def crawl(self):
        cid = self.key
        channel = CHANNELS[int(cid)]
        page = 1
        pagesize = 30

        while 1:
            try:
                data = api_shows(cid, page, pagesize)
                if data is not None:
                    page += 1
                else:
                    return
            except:
                self.logger.warning(get_exception_info())
                continue

            if not data.get('results'):
                break
            for item in data['results']:
                try:
                    show_id = item['tid']
                    reset = (item['completed'] == 0)
                    data = {
                        'channel': channel,
                        'image': item.get('show_vthumburl_hd') if item.get('show_vthumburl_hd') else item.get('show_thumburl_hd'),
                        'image2': item.get('show_thumburl_hd')
                    }
                    Scheduler.schedule(
                        AlbumCrawler.type, key=show_id, data=data, reset=reset)
                except:
                    self.logger.warning(get_exception_info())
Example #2
    def crawl(self):
        cid = self.key
        page = 1
        pagesize = 30

        while True:
            list_data = api_list(cid, page, pagesize)
            # Results are not empty: consume them, then move to the next page
            if list_data.get('results'):
                for item in list_data.get('results'):
                    # m = re.match("^\d+$", item['tid'])
                    # if m:
                    source_id = item['tid']
                    reset = int(item['completed']) == 0

                    Scheduler.schedule(
                        type=AlbumCrawler.type,
                        key=str(source_id),
                        reset=reset
                    )
                page += 1
            # No results: return once past page 100, otherwise try the next page
            if not list_data.get('results'):
                if page > 100:
                    return
                else:
                    page += 1
Example #3
    def crawl(self):
        cid = self.key
        page = 1
        pagesize = 30

        while True:
            list_data = api_list(cid, page, pagesize)
            # Results are not empty: consume them, then move to the next page
            if list_data.get('results'):
                for item in list_data.get('results'):
                    # m = re.match("^\d+$", item['tid'])
                    # if m:
                    source_id = item['tid']
                    reset = int(item['completed']) == 0

                    Scheduler.schedule(type=AlbumCrawler.type,
                                       key=str(source_id),
                                       reset=reset)
                page += 1
            # No results: return once past page 100, otherwise try the next page
            if not list_data.get('results'):
                if page > 100:
                    return
                else:
                    page += 1
Example #4
def crawler_job(request):
    if request.method == "GET":
        url =  request.GET.get('url', None)
        raw_job = Scheduler.get_job_from_url(url)
        job = {}
        if raw_job:
            job = {
                    'type': raw_job['type'],
                    'key':  raw_job['key'],
                    'priority': raw_job['priority'],
                    'interval': raw_job['interval'],
                    'lastrun': raw_job['lastrun'],
                    'status': raw_job['status'],
                    'to_album_id': raw_job['data'].get('to_album_id'),
                    }
        return job
    else:
        url =  request.GET.get('url', None)
        interval =  request.GET.get('interval', 3600)
        channel =  request.GET.get('channel', None)
        image =  request.GET.get('image', None)
        data = {
                'channel':channel,
                'image': image,
                }
        success = Scheduler.schedule_url(url, data=data, interval=int(interval), reset=True)
        return {'status': int(success)}
Example #5
    def crawl(self):
        cid = self.key
        channel = self.data.get("channel_name")
        itemid = 0
        date = 0
        areaid = 0
        sort = 2  # 1: newest, 2: most popular
        start = 0
        num = 30

        while True:
            list_data = api_list(cid, itemid, date, areaid, sort, start, num,
                                 pcode, version)
            list_data = list_data.get("body")

            if list_data.get("data"):
                for item in list_data['data']:
                    source_id = item.get("id")
                    image = item.get("icon")
                    reset = int(item['isend']) == 0

                    Scheduler.schedule(type=AlbumCrawler.type,
                                       key=source_id,
                                       data={
                                           "channel": channel,
                                           "image": image
                                       },
                                       reset=reset)

                start += 1

            if not list_data.get("data"):
                return
Example #6
    def crawl(self):
        cid = self.key
        channel = self.data.get("channel_name")
        itemid = 0
        date = 0
        areaid = 0
        sort = 2  # 1: newest, 2: most popular
        start = 0
        num = 30

        while True:
            list_data = api_list(
                cid, itemid, date, areaid, sort, start, num, pcode, version)
            list_data = list_data.get("body")

            if list_data.get("data"):
                for item in list_data['data']:
                    source_id = item.get("id")
                    image = item.get("icon")
                    reset = int(item['isend']) == 0

                    Scheduler.schedule(
                        type=AlbumCrawler.type,
                        key=source_id,
                        data={
                            "channel": channel,
                            "image": image
                        },
                        reset=reset
                    )

                start += 1

            if not list_data.get("data"):
                return
Example #7
 def init(conf=None):
     if not conf:
         conf = {}
     for cid in CHANNELS.keys():
         Scheduler.schedule(CategoryCrawler.type,
                            key=str(cid),
                            priority=conf.get('priority', Priority.High),
                            interval=conf.get('interval', 3600))
Example #8
    def crawl(self):
        min_time = self.data.get('updated',
                                 datetime.min) if self.data else datetime.min
        max_time = None
        time = None

        page = 1
        while True:
            url = "http://www.265zy.com/list/?0-%s.html" % page
            hxs = load_html(url)

            for s in hxs.select("//body/.//tr[@class='row']"):
                try:
                    href = s.select("td[1]/a/@href").extract()[0]
                    source_id = re.findall("(\d+)\.html", href)[0]
                    title = clean_title(
                        s.select("td[1]/.//text()").extract()[0])
                    region = s.select("td[2]/.//text()").extract()[0].replace(
                        u"地区", u"")
                    category = s.select("td[3]/.//text()").extract()[0]
                    time = s.select("td[4]/.//text()").extract()[0]
                    time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
                except:
                    continue

                if not max_time:
                    max_time = time
                if time < min_time:
                    break

                data = {
                    "title": title,
                    "time": time,
                    'category': category,
                    'region': region,
                }

                lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                lasttime = lastdata.get(
                    "time", datetime.min) if lastdata else datetime.min
                Scheduler.schedule(type=AlbumCrawler.type,
                                   key=source_id,
                                   data=data,
                                   reset=data['time'] > lasttime)

            if time and time < min_time:
                break

            text = hxs.select("//div[@class='pages']/span/text()").extract()[0]
            page_count = int(re.findall(u"\d+/(\d+)页", text)[0])
            if page >= 5:
                break
            page += 1

        if max_time:
            if not self.data:
                self.data = {}
            self.data['updated'] = max_time
Example #9
 def init(conf=None):
     if not conf:
         conf = {}
     for catecode in CHANNELS.keys():
         Scheduler.schedule(CategoryCrawler.type,
                            key=str(catecode),
                            data={"catecode": catecode},
                            priority=conf.get('priority', Priority.Normal),
                            interval=conf.get('interval', 3600))
Example #10
 def init(conf=None):
     if not conf:
         conf = {}
     for channel in CHANNELS.iterkeys():
         Scheduler.schedule(HistoryCrawler.type,
                            key=channel,
                            data={"year": 1900},
                            priority=conf.get('priority', Priority.Normal),
                            interval=conf.get('interval', 86400))
Example #11
 def init(conf=None):
     if not conf:
         conf = {}
     for id in _CHANNEL_DCT.iterkeys():
         Scheduler.schedule(CategoryCrawler.type,
                            key=str(id),
                            data={"cid": id},
                            priority=conf.get('priority', Priority.High),
                            interval=conf.get('interval', 3600))
Example #12
 def init(conf=None):
     if not conf:
         conf = {}
     Scheduler.schedule(
         ListCrawler.type,  # crawler type
         key="",  # this type has only one instance, so key is set to ""
         priority=conf.get('priority', Priority.High),  # high priority
         data={'updated': datetime.min},  # extra data: update time of the newest video from the last crawl
         interval=conf.get('interval', 3600)  # recrawl interval: 1 hour
     )
Example #13
 def init(conf = None):
     if not conf:
         conf = {}
     Scheduler.schedule(
                        ListCrawler.type, # crawler type
                        key = "", # this type has only one instance, so key is set to ""
                        priority = conf.get('priority', Priority.High), # high priority
                        data = {'updated' : datetime.min}, # extra data: update time of the newest video from the last crawl
                        interval = conf.get('interval', 3600) # recrawl interval: 1 hour
                        )
Example #14
 def init(conf=None):
     if not conf:
         conf = {}
     for id, channel in CHANNELS.iteritems():
         data = {"channel": channel}
         Scheduler.schedule(ListCrawler.type,
                            key=id,
                            data=data,
                            priority=conf.get('priority', Priority.High),
                            interval=conf.get('interval', 3600))
Example #15
 def init(conf = None):
     if not conf:
         conf = {}
     Scheduler.schedule(
                        ListCrawler.type, # crawler type
                        key = "", # this type has only one instance, so key is set to ""
                        priority = conf.get('priority', Priority.High), # high priority
                        data = {'updated' : datetime.min}, # extra data: update time of the newest video from the last crawl
                        interval = conf.get('interval', 3600) # recrawl interval: 1 hour
                        )
Example #16
 def init(conf=None):
     if not conf:
         conf = {}
     for key, data in CHANNELS.iteritems():
         Scheduler.schedule(
             ListCrawler.type,
             key=key,
             data=data,
             priority=conf.get('priority', Priority.High),
             interval=conf.get('interval', 3600)
         )
Example #17
    def crawl(self):
        min_time = self.data.get('updated', datetime.min) if self.data else datetime.min
        max_time = None
        time = None

        page = 1
        while True:
            url = "http://zyqvod.com/?page=%s" % page
            hxs = load_html(url)

            for s in hxs.select("//table[@id='listTable']/tbody/tr"):
                try:
                    source_id = re.findall("id=(\d+)", s.select("td[1]/a/@href").extract()[0])[0]
                    title = clean_title(s.select("td[1]/.//text()").extract()[0])
                    category = s.select("td[2]/.//text()").extract()[0]
                    region = s.select("td[3]/.//text()").extract()[0]
                    completed = s.select("td[4]/.//text()").extract()[0] == u"完结"
                    time = s.select("td[5]/.//text()").extract()[0]
                    time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
                except:
                    continue

                if not max_time:
                    max_time = time
                if time < min_time:
                    break

                data = {
                            "title" : title,
                            "time" : time,
                            'category' : category,
                            'region' : region,
                            'completed' : completed,
                        }

                lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                lasttime = lastdata.get("time", datetime.min) if lastdata else datetime.min
                Scheduler.schedule(type = AlbumCrawler.type, key = source_id, data = data, reset = data['time'] > lasttime)

            if time and time < min_time:
                break

            text = hxs.select("//div[@class='page_num']/text()").extract()[0]
            page_count = int(re.findall(u"\d+/(\d+)页", text)[0])
            if page >= page_count:
                break
            page += 1

        if max_time:
            if not self.data:
                self.data = {}
            self.data['updated'] = max_time
Example #18
 def init(conf=None):
     if not conf:
         conf = {}
     data = api_channel(pcode, version)
     for channel in data['body']['channel']:
         cid = channel.get("cid")
         channel_name = channel.get("name")
         crawl_data = {
             "channel_id": cid,
             "channel_name": channel_name,
         }
         Scheduler.schedule(ListCrawler.type, key=cid, data=crawl_data, priority=conf.get(
             'priority', Priority.High), interval=conf.get('interval', 3600))
Example #19
    def crawl(self):
        min_time = self.data.get('updated', datetime.min) if self.data else datetime.min
        max_time = None
        time = None

        page = 1
        while True:
            url = "http://www.265zy.com/list/?0-%s.html" % page
            hxs = load_html(url)

            for s in hxs.select("//body/.//tr[@class='row']"):
                try:
                    href = s.select("td[1]/a/@href").extract()[0]
                    source_id = re.findall("(\d+)\.html", href)[0]
                    title = clean_title(s.select("td[1]/.//text()").extract()[0])
                    region = s.select("td[2]/.//text()").extract()[0].replace(u"地区", u"")
                    category = s.select("td[3]/.//text()").extract()[0]
                    time = s.select("td[4]/.//text()").extract()[0]
                    time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
                except:
                    continue

                if not max_time:
                    max_time = time
                if time < min_time:
                    break

                data = {
                            "title" : title,
                            "time" : time,
                            'category' : category,
                            'region' : region,
                        }

                lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                lasttime = lastdata.get("time", datetime.min) if lastdata else datetime.min
                Scheduler.schedule(type = AlbumCrawler.type, key = source_id, data = data, reset = data['time'] > lasttime)

            if time and time < min_time:
                break

            text = hxs.select("//div[@class='pages']/span/text()").extract()[0]
            page_count = int(re.findall(u"\d+/(\d+)页", text)[0])
            if page >= 5:
                break
            page += 1

        if max_time:
            if not self.data:
                self.data = {}
            self.data['updated'] = max_time
Example #20
    def crawl(self):
        type = 3
        channel_id = self.key
        channel = self.data['channel']
        start = 0
        num = 16
        params = {
            "order": "times",
            "time": "today"
        }

        while 1:
            list_data = api_list(type, channel_id, start, num, params)
            if start == list_data['num']:
                return
            for item in list_data['data']:
                if channel in LONG_VIDEO_CHANNELS.values():
                    source_id = item['mid']
                    tags = []
                    time = item['public_time']
                    time = datetime.strptime(time, "%Y%m%d")

                    lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                    lasttime = lastdata.get(
                        "time", datetime.min) if lastdata else datetime.min
                    reset = time > lasttime
                else:
                    source_id = item['flvid']
                    tags = item.get("tags").split(",")
                    time = datetime.utcnow()
                    reset = False

                data = {
                    "url": item.get("web_url"),
                    "title": item.get("title"),
                    "image": item.get("bpic"),
                    "image2": item.get("mpic"),
                    "description": item.get("introduce"),
                    "duration": item.get("duration"),
                    "tags": tags,
                    "time": time,
                    "channel": channel
                }

                Scheduler.schedule(
                    AlbumCrawler.type,
                    source_id,
                    data,
                    reset=reset
                )
            start += 1
Example #21
 def init(conf=None):
     if not conf:
         conf = {}
     for id, channel in CHANNELS.iteritems():
         data = {
             "channel": channel
         }
         Scheduler.schedule(
             ListCrawler.type,
             key=id,
             data=data,
             priority=conf.get('priority', Priority.High),
             interval=conf.get('interval', 3600)
         )
Example #22
    def process_album(self, item):
        sites = {}
        fangying_id = re.findall("f_(.+)\.html", item['link'])[0]

        for play in item['plays']:
            site = play['site']
            if site not in SITES:
                continue

            if play["url"].find("fangying.com") != -1:
                stream = []
            else:
                format = "thunder" if site == "thunder" else ""
                stream = [{"url" : play["url"], "format" : format}]

            video = VideoItemModel({
                                    "title" : play["title"],
                                    "url" : play["url"],
                                    "stream" : stream,
                                    })

            if not sites.has_key(site):
                sites[site] = []
            sites[site].append(dict(video))

        model = None
        for site, videos in sites.iteritems():
            model = VideoSourceModel({
                        "source" : self.data['source'],
                        "source_id" : fangying_id,
                        "videos" : videos,
                        "title" : item['title'],
                        "directors" : item['directors'].split("/"),
                        "actors" : item['performers'].split("/"),
                        "description" : item['description'],
                        'categories' : item['genres'].split("/"),
                        'region' : item['countries'].split("/")[0],
                        'duration' : parse_duration(item['duration']),
                        'image' : item['avatar_middle'],
                        'score' : float(item['douban_rating']) if item.get('douban_rating') else None,
                        'url' : item['link'],
                        'price' : 0.0,
                        'pubtime' : parse_pubtime(item['release_time']),
                        'channel' : CHANNELS.get(self.key)
                 })
            export(model)

        if model:
            Scheduler.schedule(RelationCrawler.type, key = fangying_id, data = {'title' : model['title'], 'url' : model['url']})
Example #23
 def crawl(self):
     page = 1
     while (True):
         url = HOT_LIST % page
         video_list = loadurl(url)
         if video_list == None:
             break
         else:
             for videoinfo in video_list:
                 video = videoinfo['video'][0]
                 video['source'] = SOURCE
                 Scheduler.schedule(AlbumCrawler.type,
                                    video.get('id'),
                                    data=video)
             page += 1
Example #24
 def init(conf=None):
     if not conf:
         conf = {}
     data = api_channel(pcode, version)
     for channel in data['body']['channel']:
         cid = channel.get("cid")
         channel_name = channel.get("name")
         crawl_data = {
             "channel_id": cid,
             "channel_name": channel_name,
         }
         Scheduler.schedule(ListCrawler.type,
                            key=cid,
                            data=crawl_data,
                            priority=conf.get('priority', Priority.High),
                            interval=conf.get('interval', 3600))
Example #25
    def crawl(self):
        type = 3
        channel_id = self.key
        channel = self.data['channel']
        start = 0
        num = 16
        params = {"order": "times", "time": "today"}

        while 1:
            list_data = api_list(type, channel_id, start, num, params)
            if start == list_data['num']:
                return
            for item in list_data['data']:
                if channel in LONG_VIDEO_CHANNELS.values():
                    source_id = item['mid']
                    tags = []
                    time = item['public_time']
                    time = datetime.strptime(time, "%Y%m%d")

                    lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                    lasttime = lastdata.get(
                        "time", datetime.min) if lastdata else datetime.min
                    reset = time > lasttime
                else:
                    source_id = item['flvid']
                    tags = item.get("tags").split(",")
                    time = datetime.utcnow()
                    reset = False

                data = {
                    "url": item.get("web_url"),
                    "title": item.get("title"),
                    "image": item.get("bpic"),
                    "image2": item.get("mpic"),
                    "description": item.get("introduce"),
                    "duration": item.get("duration"),
                    "tags": tags,
                    "time": time,
                    "channel": channel
                }

                Scheduler.schedule(AlbumCrawler.type,
                                   source_id,
                                   data,
                                   reset=reset)
            start += 1
Example #26
 def crawl(self):
     page = 1
     while(True):
         url = HOT_LIST % page
         video_list = loadurl(url)
         if video_list == None:
             break
         else:
             for videoinfo in video_list:
                 video = videoinfo['video'][0]
                 video['source'] = SOURCE
                 Scheduler.schedule(
                                    AlbumCrawler.type,
                                    video.get('id'),
                                    data = video
                                    )   
             page+=1               
Example #27
 def crawl(self):
     for channel in CHANNELS:            
         list_url=LIST % (DICT.get(channel), 1)  # go to the first page of the ranking list
         pagenum=int(loadurl(list_url).get('pagenum'))
         for page in range(pagenum):
             page+=1  # pages are counted from 1
             current_url=LIST % (DICT.get(channel), page)
             lists=loadurl(current_url).get('lists')
             for episode in lists:
                 data={
                         'title':episode.get('name'),
                         'image':episode.get('pic'),
                         'category':episode.get('cate'),
                         'channel':channel,
                         'source':SOURCE
                     }
                 Scheduler.schedule(
                                    AlbumCrawler.type,
                                    episode.get('mid'),
                                    data,
                                    reset=True
                                    )
Example #28
    def crawl(self):
        cid = self.key
        channel = CHANNELS[int(cid)]
        page = 1
        pagesize = 30

        while 1:
            try:
                data = api_shows(cid, page, pagesize)
                if data is not None:
                    page += 1
                else:
                    return
            except:
                self.logger.warning(get_exception_info())
                continue

            if not data.get('results'):
                break
            for item in data['results']:
                try:
                    show_id = item['tid']
                    reset = (item['completed'] == 0)
                    data = {
                        'channel':
                        channel,
                        'image':
                        item.get('show_vthumburl_hd')
                        if item.get('show_vthumburl_hd') else
                        item.get('show_thumburl_hd'),
                        'image2':
                        item.get('show_thumburl_hd')
                    }
                    Scheduler.schedule(AlbumCrawler.type,
                                       key=show_id,
                                       data=data,
                                       reset=reset)
                except:
                    self.logger.warning(get_exception_info())
Example #29
    def crawl(self):
        cid = self.data['cid']
        current_time = int(time.time())

        for album_data in self.get_albums(cid):
            try:
                album = extract_album(album_data, self.data['source'])
                if not album:
                    continue
                checkup_time = time.mktime(album['time'].timetuple())

                # can't get video for paid item
                if (not album["price"]) and album.get('source_id'):
                    Scheduler.schedule(
                        type=AlbumCrawler.type,
                        key=album['source_id'],
                        data={"time": album["time"]},
                        reset=(current_time - checkup_time) < 86400)
            except:
                self.logger.warning(get_exception_info())

        self.data['updated'] = current_time
Example #30
    def crawl(self):
        cid = self.data['cid']
        current_time = int(time.time())

        for album_data in self.get_albums(cid):
            try:
                album = extract_album(album_data, self.data['source'])
                if not album:
                    continue
                checkup_time = time.mktime(album['time'].timetuple())

                # can't get video for paid item
                if (not album["price"]) and album.get('source_id'):
                    Scheduler.schedule(
                        type=AlbumCrawler.type,
                        key=album['source_id'],
                        data={"time": album["time"]},
                        reset=(current_time - checkup_time) < 86400
                    )
            except:
                self.logger.warning(get_exception_info())

        self.data['updated'] = current_time
Example #31
def schedule(request):
    response = {}
    type = request.GET.get("type")
    nextrun = request.GET.get("nextrun")
    if type.endswith("album"):
        response['error_info'] = "Type Error."
        response['status'] = False
        return response
    try:
        nextrun = datetime.strptime(nextrun, "%Y-%m-%d-%H-%M-%S")
    except:
        response['error_info'] = "Datetime Error."
        response['status'] = False
        return response
    
    m = Scheduler.monitor_schedule(type, nextrun)
    if m is not None:
        response['error_info'] = ""
        response['status'] = True
    else:
        response['error_info'] = "Type Error."
        response['status'] = False
    return response
Example #32
'''
Crawling is split between two crawlers: one crawls the update list, the other crawls the detail page of a single video.
The list crawler creates the corresponding detail crawlers and passes them the required parameters and data.

Update list URL: http://bdzy.cc/list/?0-1.html
Video detail URL: http://bdzy.cc/detail/?20808.html
'''
import requests, re, HTMLParser 
from datetime import datetime
from scrapy.selector import HtmlXPathSelector
from contentservice.crawler import Crawler, Scheduler, Priority, export
from contentservice.models.video import VideoSourceModel, VideoItemModel
from contentservice.utils.datetimeutil import parse_date
from contentservice.utils.text import split
from contentservice.utils import get_exception_info

'''
Key methods
Scheduler.schedule(type, key, priority, data, reset, interval, timeout)
    type - type of the crawler task
    key - unique identifier of the task within its type (type plus key uniquely identify the crawled content; key is usually the source-site id)
    priority - task priority: High, Normal, Low
    data - extra data (used to persist state associated with this crawler instance); it is saved automatically after every run
    reset - whether to force a re-crawl; by default, tasks that have already completed are not crawled again
    interval - interval between repeated runs of the task; 0 means run only once (default: 0)
    timeout - timeout after which the task is killed automatically

Crawler
Base class for crawlers; every crawler type must inherit from it.
Methods:
    init(conf=None)  initialization (called once on every program start); creates the initial crawler tasks
    crawl()          main entry point of the crawling code
'''
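A minimal sketch of how these pieces fit together, using only the interface described above. The class names (DemoListCrawler, DemoAlbumCrawler), the type strings, the placeholder channel and item ids, and the @staticmethod decorator on init are illustrative assumptions, not part of the framework:

from datetime import datetime
from contentservice.crawler import Crawler, Scheduler, Priority


class DemoAlbumCrawler(Crawler):
    type = "demo_album"  # hypothetical task type name

    def crawl(self):
        # self.key is the source-site id; self.data holds the persisted extra data.
        pass  # fetch, parse and export the detail page here


class DemoListCrawler(Crawler):
    type = "demo_list"  # hypothetical task type name

    @staticmethod  # assumed; the examples in this document omit the decorator
    def init(conf=None):
        # Called once at program start: create one recurring list task per channel.
        if not conf:
            conf = {}
        for cid in ("1", "2"):  # placeholder channel ids
            Scheduler.schedule(DemoListCrawler.type,
                               key=cid,
                               data={'updated': datetime.min},
                               priority=conf.get('priority', Priority.High),
                               interval=conf.get('interval', 3600))

    def crawl(self):
        # A real crawler would page through a listing API here; for each item it
        # schedules a detail task keyed by the source-site id, and uses reset to
        # force a re-crawl of entries that look updated.
        for source_id in ("101", "102"):  # placeholder item ids
            Scheduler.schedule(DemoAlbumCrawler.type,
                               key=source_id,
                               data={'channel': self.key},
                               reset=True)

On startup the framework calls init(conf) once to seed the scheduler; each scheduled task then runs crawl() on an instance whose self.key and self.data correspond to the key and data passed to Scheduler.schedule, as in the examples above.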
Example #33
    def process_album(self, item):
        sites = {}
        fangying_id = re.findall("f_(.+)\.html", item['link'])[0]

        for play in item['plays']:
            site = play['site']
            if site not in SITES:
                continue

            if play["url"].find("fangying.com") != -1:
                stream = []
            else:
                format = "thunder" if site == "thunder" else ""
                stream = [{"url": play["url"], "format": format}]

            video = VideoItemModel({
                "title": play["title"],
                "url": play["url"],
                "stream": stream,
            })

            if not sites.has_key(site):
                sites[site] = []
            sites[site].append(dict(video))

        model = None
        for site, videos in sites.iteritems():
            model = VideoSourceModel({
                "source":
                self.data['source'],
                "source_id":
                fangying_id,
                "videos":
                videos,
                "title":
                item['title'],
                "directors":
                item['directors'].split("/"),
                "actors":
                item['performers'].split("/"),
                "description":
                item['description'],
                'categories':
                item['genres'].split("/"),
                'region':
                item['countries'].split("/")[0],
                'duration':
                parse_duration(item['duration']),
                'image':
                item['avatar_middle'],
                'score':
                float(item['douban_rating'])
                if item.get('douban_rating') else None,
                'url':
                item['link'],
                'price':
                0.0,
                'pubtime':
                parse_pubtime(item['release_time']),
                'channel':
                CHANNELS.get(self.key)
            })
            export(model)

        if model:
            Scheduler.schedule(RelationCrawler.type,
                               key=fangying_id,
                               data={
                                   'title': model['title'],
                                   'url': model['url']
                               })
Example #34
    def crawl(self):
        min_time = self.data['updated']  # update time of the newest video from the last crawl; the lower bound for this crawl
        max_time = None  # update time of the newest video seen in this crawl

        page = 1
        while True:
            url = "http://bdzy.cc/list/?0-%s.html" % page
            hxs = load_html(url)  # fetch the page HTML and return an HtmlXPathSelector

            time = None
            for s in hxs.select("//body/.//tr[@class='row']"):  # parse the HTML rows with XPath
                try:
                    href = s.select("td[1]/a/@href").extract()[0]
                    source_id = re.findall("(\d+)\.html", href)[0]  # source-site id
                    title = clean_title(s.select("td[1]/.//text()").extract()[0])
                    region = s.select("td[2]/.//text()").extract()[0].replace(u"地区", u"")
                    category = s.select("td[3]/.//text()").extract()[0]
                    time = s.select("td[4]/.//text()").extract()[0]
                    time = datetime.strptime(time, "%Y-%m-%d")

                    if not max_time:  # the first row is the most recently updated
                        max_time = time
                    if time < min_time:  # we have reached data already crawled last time
                        break

                    data = {  # extra data for the detail-page crawler task
                        "title" : title,
                        "time" : time,
                        "category" : category,
                        "region" : region,
                        }

                    # Look up the extra data of the existing detail crawler; the time field tells
                    # whether the content has been updated and needs re-crawling. Empty on first creation.
                    lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                    lasttime = lastdata.get("time", datetime.min) if lastdata else datetime.min

                    # Create the corresponding album crawler to fetch the detail page; key is the source-site id.
                    Scheduler.schedule(
                                       AlbumCrawler.type,
                                       source_id,
                                       data,
                                       reset = data["time"] > lasttime  # whether to force a re-crawl
                                       )
                except:
                    self.logger.warning(get_exception_info())  # log the error and continue
                    continue

            if time and time < min_time:  # we have reached data already crawled last time
                break

            # total number of pages
            text = hxs.select("//div[@class='pages']/span/text()").extract()[0]
            page_count = int(re.findall(u"\d+/(\d+)页", text)[0])

            # past the last page
            if page >= page_count:
                break
            page += 1

        if max_time:
            self.data = {'updated' : max_time}  # save the newest update time seen in this crawl
Example #35
    def crawl(self):
        channel_id = self.key
        channel = self.data['c_name']
        list_params = self.data['c_list_param']
        page = list_params['page']
        pagesize = 24
        now = int(time.time())

        params = dict(list_params)  # merge list_params with PARAMS_INFO
        params.update(PARAMS_INFO)

        while True:
            list_data = api_list(
                auto_id=channel_id, page=page, pagesize=pagesize, params=params)
            if list_data['returncode'] != 404:
                if list_data.get('cover'):
                    for item in list_data["cover"]:
                        source_id = item.get("c_cover_id")
                        pubtime = item.get("c_year")
                        checkup_time = datetime.strptime(
                            item['c_checkup_time'], "%Y-%m-%d %H:%M:%S")
                        checkup_time = time.mktime(checkup_time.timetuple())

                        data = {
                            "source_id": source_id,
                            "title": item.get("c_title"),
                            "image": item.get("c_pic"),
                            "actors": item.get("c_actor"),
                            "directors": item.get("c_director"),
                            "categories": item.get("c_subtype"),
                            "channel": channel,
                            "region": item.get("c_area"),
                            "pubtime": pubtime,
                        }

                        Scheduler.schedule(
                            type=AlbumCrawler.type,
                            key=source_id,
                            data=data,
                            # if the checkup time is within three hours, set reset=True
                            reset=(now - checkup_time) < 10800
                        )
                    page += 1

                if list_data.get('video'):
                    for item in list_data["video"]:
                        source_id = item.get("c_vid")
                        pubtime = item.get("c_ctime")

                        data = {
                            "source_id": source_id,
                            "title": item.get("c_title"),
                            "image": item.get("c_pic"),
                            "channel": channel,
                            "pubtime": pubtime,
                        }

                        Scheduler.schedule(
                            type=AlbumCrawler.type,
                            key=source_id,
                            data=data,
                        )
                    page += 1
            else:
                return
Example #36
 def init(conf = None):
     if not conf:
         conf = {}
     Scheduler.schedule(TopCrawler.type, priority = conf.get('priority', Priority.High), interval = conf.get('interval', 86400))
Example #37
 def init(conf=None):
     if not conf:
         conf = {}
     for catecode in CHANNELS.keys():
         Scheduler.schedule(CategoryCrawler.type, key = str(catecode), data = {"catecode" : catecode}, priority = conf.get('priority', Priority.Normal), interval = conf.get('interval', 3600))
Example #38
 def init(conf=None):
     if not conf:
         conf = {}
     for id in _CHANNEL_DCT.iterkeys():
         Scheduler.schedule(CategoryCrawler.type, key=str(id), data={"cid": id}, priority=conf.get(
             'priority', Priority.High), interval=conf.get('interval', 3600))
Example #39
 def init(conf=None):
     if not conf:
         conf = {}
     for channel in CHANNELS.iterkeys():
         Scheduler.schedule(HistoryCrawler.type, key = channel, data = {"year" : 1900}, priority = conf.get('priority', Priority.Normal), interval = conf.get('interval',86400))
Example #40
 def init(conf=None):
     if not conf:
         conf = {}
     for cid in CHANNELS.keys():
         Scheduler.schedule(CategoryCrawler.type, key=str(cid), priority=conf.get(
             'priority', Priority.High), interval=conf.get('interval', 3600))