Example #1
    def crawl(self):
        min_time = self.data.get('updated',
                                 datetime.min) if self.data else datetime.min
        max_time = None
        time = None

        page = 1
        while True:
            url = "http://www.265zy.com/list/?0-%s.html" % page
            hxs = load_html(url)

            for s in hxs.select("//body/.//tr[@class='row']"):
                try:
                    href = s.select("td[1]/a/@href").extract()[0]
                    source_id = re.findall(r"(\d+)\.html", href)[0]
                    title = clean_title(
                        s.select("td[1]/.//text()").extract()[0])
                    region = s.select("td[2]/.//text()").extract()[0].replace(
                        u"地区", u"")
                    category = s.select("td[3]/.//text()").extract()[0]
                    time = s.select("td[4]/.//text()").extract()[0]
                    time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
                except Exception:
                    # Skip rows that do not match the expected layout.
                    continue

                if not max_time:
                    max_time = time  # the first row is the newest on the site
                if time < min_time:
                    break  # reached items already seen in the last crawl

                data = {
                    "title": title,
                    "time": time,
                    'category': category,
                    'region': region,
                }

                lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                lasttime = lastdata.get(
                    "time", datetime.min) if lastdata else datetime.min
                Scheduler.schedule(type=AlbumCrawler.type,
                                   key=source_id,
                                   data=data,
                                   reset=data['time'] > lasttime)

            if time and time < min_time:
                break

            text = hxs.select("//div[@class='pages']/span/text()").extract()[0]
            page_count = int(re.findall(u"\d+/(\d+)页", text)[0])
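            # page_count is parsed here but never used: this crawl is
            # hard-capped at 5 pages regardless of what the site reports.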
            if page >= 5:
                break
            page += 1

        if max_time:
            if not self.data:
                self.data = {}
            self.data['updated'] = max_time
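
Example #1 (and every example below) relies on module-level imports and helpers whose definitions are not part of the snippets. The following sketch shows plausible stand-ins: the names (load_html, clean_title, Scheduler, AlbumCrawler) come from the code itself, but the bodies are assumptions written against the Python 2-era Scrapy HtmlXPathSelector API that the .select()/.extract() calls imply.

    # -*- coding: utf-8 -*-
    # Assumed context for the crawl() methods; bodies are illustrative guesses.
    import re
    import urllib2
    from datetime import datetime

    from scrapy.selector import HtmlXPathSelector  # old Scrapy selector API


    def load_html(url):
        """Fetch url and return an HtmlXPathSelector over its body."""
        body = urllib2.urlopen(url, timeout=30).read()
        return HtmlXPathSelector(text=body.decode("utf-8", "ignore"))


    def clean_title(title):
        """Normalize a listing title (whitespace, bracketed decorations)."""
        return re.sub(ur"[\[\]【】]", u"", title).strip()


    class AlbumCrawler(object):
        """Detail-page crawler; only its type constant is used above."""
        type = "album"


    class Scheduler(object):
        """In-memory stand-in for the real task scheduler (assumed API)."""
        _store = {}

        @classmethod
        def get_data(cls, type, key):
            task = cls._store.get((type, key))
            return task["data"] if task else None

        @classmethod
        def schedule(cls, type, key, data, reset=False):
            cls._store[(type, key)] = {"data": data, "reset": reset}
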
Example #2
    def crawl(self):
        min_time = self.data.get('updated', datetime.min) if self.data else datetime.min
        max_time = None
        time = None

        page = 1
        while True:
            url = "http://zyqvod.com/?page=%s" % page
            hxs = load_html(url)

            for s in hxs.select("//table[@id='listTable']/tbody/tr"):
                try:
                    source_id = re.findall("id=(\d+)", s.select("td[1]/a/@href").extract()[0])[0]
                    title = clean_title(s.select("td[1]/.//text()").extract()[0])
                    category = s.select("td[2]/.//text()").extract()[0]
                    region = s.select("td[3]/.//text()").extract()[0]
                    completed = s.select("td[4]/.//text()").extract()[0] == u"完结"
                    time = s.select("td[5]/.//text()").extract()[0]
                    time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
                except:
                    continue

                if not max_time:
                    max_time = time
                if time < min_time:
                    break

                data = {
                    "title": title,
                    "time": time,
                    "category": category,
                    "region": region,
                    "completed": completed,
                }

                lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                lasttime = lastdata.get("time", datetime.min) if lastdata else datetime.min
                Scheduler.schedule(type=AlbumCrawler.type, key=source_id,
                                   data=data, reset=data['time'] > lasttime)

            if time and time < min_time:
                break

            text = hxs.select("//div[@class='page_num']/text()").extract()[0]
            page_count = int(re.findall(u"\d+/(\d+)页", text)[0])
            if page >= page_count:
                break
            page += 1

        if max_time:
            if not self.data:
                self.data = {}
            self.data['updated'] = max_time
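
Examples #1 and #2 share one incremental-crawl contract: self.data['updated'] is the checkpoint read at the start of a run (min_time) and rewritten at the end (max_time), and listing rows are assumed newest-first. A minimal, network-free sketch of just that contract (the Checkpointed class is hypothetical; only the data/crawl shape comes from the snippets):

    from datetime import datetime

    class Checkpointed(object):
        """Hypothetical host class showing only the checkpoint logic."""
        def __init__(self, data=None):
            self.data = data  # persisted between runs by the scheduler

        def crawl(self, items):  # items: (title, time) pairs, newest first
            min_time = self.data.get('updated', datetime.min) if self.data else datetime.min
            max_time = None
            for title, time in items:
                if not max_time:
                    max_time = time      # first item is the newest
                if time < min_time:
                    break                # reached last run's frontier
                print("would schedule: %s" % title)
            if max_time:
                self.data = {'updated': max_time}

    c = Checkpointed()
    c.crawl([(u"B", datetime(2013, 5, 2)), (u"A", datetime(2013, 5, 1))])  # schedules B, A
    c.crawl([(u"C", datetime(2013, 5, 3)), (u"B", datetime(2013, 5, 2))])  # schedules C, re-schedules B (the boundary check is a strict '<')
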
Example #3
    def crawl(self):
        list_type = 3  # renamed from 'type' to avoid shadowing the builtin
        channel_id = self.key
        channel = self.data['channel']
        start = 0
        num = 16
        params = {
            "order": "times",
            "time": "today"
        }

        while True:
            list_data = api_list(list_type, channel_id, start, num, params)
            if start == list_data['num']:
                return
            for item in list_data['data']:
                if channel in LONG_VIDEO_CHANNELS.values():
                    source_id = item['mid']
                    tags = []
                    time = item['public_time']
                    time = datetime.strptime(time, "%Y%m%d")

                    lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                    lasttime = lastdata.get(
                        "time", datetime.min) if lastdata else datetime.min
                    reset = time > lasttime
                else:
                    source_id = item['flvid']
                    # .get may return None when "tags" is absent; guard it.
                    tags = (item.get("tags") or "").split(",")
                    time = datetime.utcnow()  # no publish time exposed; use crawl time
                    reset = False

                data = {
                    "url": item.get("web_url"),
                    "title": item.get("title"),
                    "image": item.get("bpic"),
                    "image2": item.get("mpic"),
                    "description": item.get("introduce"),
                    "duration": item.get("duration"),
                    "tags": tags,
                    "time": time,
                    "channel": channel
                }

                Scheduler.schedule(
                    AlbumCrawler.type,
                    source_id,
                    data,
                    reset=reset
                )
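            # NOTE: start advances by 1 rather than by num, so api_list
            # presumably treats it as a page index, not an item offset.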
            start += 1
Example #4
    def crawl(self):
        min_time = self.data['updated']  # newest update time seen last run; lower bound for this run
        max_time = None  # update time of the newest video in this run

        page = 1
        while True:
            url = "http://bdzy.cc/list/?0-%s.html" % page
            hxs = load_html(url)  # fetch the page HTML; returns an HtmlXPathSelector

            time = None
            for s in hxs.select("//body/.//tr[@class='row']"):  # walk the rows via XPath
                try:
                    href = s.select("td[1]/a/@href").extract()[0]
                    source_id = re.findall(r"(\d+)\.html", href)[0]  # source-site ID
                    title = clean_title(s.select("td[1]/.//text()").extract()[0])
                    region = s.select("td[2]/.//text()").extract()[0].replace(u"地区", u"")
                    category = s.select("td[3]/.//text()").extract()[0]
                    time = s.select("td[4]/.//text()").extract()[0]
                    time = datetime.strptime(time, "%Y-%m-%d")

                    if not max_time:  # the first row is the most recently updated
                        max_time = time
                    if time < min_time:  # reached data already crawled last time
                        break

                    data = {  # extra data attached to the detail-page crawl task
                        "title": title,
                        "time": time,
                        "category": category,
                        "region": region,
                    }

                    # Fetch the detail-page crawler's stored data; the "time"
                    # field tells whether the content has changed and must be
                    # re-crawled. On first creation there is no stored data.
                    lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                    lasttime = lastdata.get("time", datetime.min) if lastdata else datetime.min

                    # Schedule the album crawler for the detail page; the key
                    # is the source-site ID.
                    Scheduler.schedule(
                        AlbumCrawler.type,
                        source_id,
                        data,
                        reset=data["time"] > lasttime  # force a re-crawl?
                    )
                except Exception:
                    self.logger.warning(get_exception_info())  # log the error and continue
                    continue

            if time and time < min_time:  # reached data already crawled last time
                break

            # get the total page count
            text = hxs.select("//div[@class='pages']/span/text()").extract()[0]
            page_count = int(re.findall(u"\d+/(\d+)页", text)[0])

            # past the last page
            if page >= page_count:
                break
            page += 1

        if max_time:
            self.data = {'updated': max_time}  # checkpoint the newest time seen this run
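
Example #4 is the only variant that logs parse failures instead of silently skipping the row. get_exception_info is not defined in the snippets; a plausible stand-in (an assumption, not the original helper):

    import traceback

    def get_exception_info():
        """Return the current exception's formatted traceback for logging."""
        return traceback.format_exc()
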