Example #1
import json
import logging
import time

import scrapy
from bs4 import BeautifulSoup

# time_utils and BMobUploadHelper are project-local helpers; the import
# paths below are hypothetical placeholders.
import time_utils
from bmob_upload_helper import BMobUploadHelper


class JiepaiHuaBanSpider(scrapy.Spider):
    name = 'jiepai_hua_ban'
    allowed_domains = ['huaban.com']
    start_urls = ['http://huaban.com/favorite/beauty/']

    # start_urls = ['http://huaban.com/boards/24116838/?md=newbn&beauty=']

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.bmob_helper = BMobUploadHelper()

    def parse(self, response):
        bsp = BeautifulSoup(response.body, 'lxml')
        hua_ban_group = bsp.select_one("#waterfall")
        hua_ban_items = hua_ban_group.select(".pin.wfc")

        cur_hua_ban_time = time_utils.get_jie_pai_hua_ban_scrapy_time()
        for hua_ban_item in hua_ban_items:

            if "data-created-at" in hua_ban_item.attrs:
                # time
                hua_ban_item_time_stamp = hua_ban_item["data-created-at"]
                time_array = time.localtime(int(hua_ban_item_time_stamp))
                hua_ban_item_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                                  time_array)
                logging.info("time: " + hua_ban_item_time)

                if hua_ban_item_time < cur_hua_ban_time:
                    logging.info("time is out of date, hua_ban_item_time: " +
                                 hua_ban_item_time)
                    break

                # image
                hua_ban_pic_item = hua_ban_item.select_one(
                    ".img.x.layer-view.loaded > img")
                hua_ban_pic = hua_ban_pic_item["src"]
                split_index = hua_ban_pic.index("_")
                hua_ban_url = "http:" + hua_ban_pic[0:split_index] + "_fw658"
                # hua_ban_url ="http:" + hua_ban_pic

                group_content = self.bmob_helper.get_group_content(
                    hua_ban_url, "")
                group_url = "https://api2.bmob.cn/1/classes/Beauty"
                logging.info("parse_hua_ban_detail group data: " +
                             json.dumps(group_content, ensure_ascii=False))
                point_group_id = self.bmob_helper.upload_to_bmob(
                    group_url, group_content)

        time_utils.save_jie_pai_hua_ban_scrapy_time(
            time_utils.get_next_day_time())
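
All of the spiders here compare timestamps as strings. That only works because "%Y-%m-%d %H:%M:%S" is fixed-width and zero-padded, so lexicographic order matches chronological order. A quick self-contained check:

import time

earlier = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(1500000000))
later = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(1600000000))
assert earlier < later  # string order agrees with chronological order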
Example #2
import json
import logging
import time

import requests
import scrapy

# time_utils and BMobUploadHelper are project-local helpers (see Example #1).


class JiepaiThreeAppSpider(scrapy.Spider):
    name = 'jiepai_three_app'
    allowed_domains = ['app.3ajiepai.com']
    start_urls = [
        'http://app.3ajiepai.com/thread/list?fid=170&page=1&pageSize=20'
    ]

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.cur_time = time_utils.get_jie_pai_three_m_scrapy_time()
        self.cookies_jie_pai = {}
        self.bmob_helper = BMobUploadHelper()

    def start_requests(self):
        jsession_id, jie_pai = self.get_login_info()
        self.cookies_jie_pai = {
            '__cfduid': 'd038136efebfcd498fc25c12f2a9cbad81539412011',
            'JSESSIONID': jsession_id,
            '3ajiepai': jie_pai
        }
        for url in self.start_urls:
            yield scrapy.Request(url, cookies=self.cookies_jie_pai)

    def get_login_info(self):
        login_url = "http://app.3ajiepai.com/wechat/login?code=onQGp1RAFbnzN6m4y259Qma2vMu4"
        response = requests.get(login_url)
        jsession_id = response.cookies["JSESSIONID"]
        jie_pai = response.cookies["3ajiepai"]
        logging.info("response cookies: " + str(response.cookies))
        return jsession_id, jie_pai

    def parse(self, response):
        logging.info("jiepai_three_app response: " + response.text)
        json_content = json.loads(response.text)

        data_array = json_content["data"]
        for data_item in data_array:
            # time
            data_date = data_item["dateline"]
            time_array = time.localtime(int(data_date))
            jie_pai_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
            if jie_pai_time < self.cur_time:
                logging.info("time is out of date, jie_pai_time: " +
                             jie_pai_time)
                return

            # cover
            data_thumb = data_item["thumb"]
            # title
            data_subject = data_item["subject"]
            # id
            data_tid = data_item["tid"]

            detail_url = "http://app.3ajiepai.com/thread/" + str(data_tid)

            yield scrapy.Request(detail_url,
                                 meta={
                                     "data_thumb": data_thumb,
                                     "data_subject": data_subject,
                                 },
                                 cookies=self.cookies_jie_pai,
                                 callback=self.handle_detail)

        # all done; persist the next scrape time
        time_utils.save_jie_pai_three_m_scrapy_time(
            time_utils.get_next_day_time())

    def handle_detail(self, response):
        data_thumb = response.meta["data_thumb"]
        data_subject = response.meta["data_subject"]
        data_detail = json.loads(response.text)
        photos = data_detail["data"]["photos"]
        point_group_id = ""
        sub_pic_url = "https://api2.bmob.cn/1/classes/CardPicBean"
        first_photo = True
        for photo in photos:
            img_url = photo["origin"]
            # first photo: used as the group cover
            if first_photo:
                group_content = self.bmob_helper.get_group_content(
                    img_url, data_subject)
                group_url = "https://api2.bmob.cn/1/classes/CardPicGroup"
                logging.info("parse_wei_bo_detail group data: " +
                             json.dumps(group_content, ensure_ascii=False))
                point_group_id = self.bmob_helper.upload_to_bmob(
                    group_url, group_content)
                first_photo = False
            else:
                detail_content = self.bmob_helper.get_detail_content(
                    "", img_url, point_group_id)
                logging.info("upload sub_pics json: " +
                             json.dumps(detail_content, ensure_ascii=False))
                self.bmob_helper.upload_to_bmob(sub_pic_url, detail_content)
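
A minimal sketch of running one of these spiders programmatically, assuming the spider class and its project-local helpers are importable; the standard CLI (scrapy crawl jiepai_three_app) works just as well:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(JiepaiThreeAppSpider)  # any spider class from these examples
process.start()  # blocks until the crawl finishes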
Example #3
import json
import logging
import re
import time

import scrapy
from bs4 import BeautifulSoup

# time_utils and BMobUploadHelper are project-local helpers (see Example #1).


class JiepaiWeiBoSpider(scrapy.Spider):
    name = 'jiepai_wei_bo'
    allowed_domains = ['weibo.com']
    # start_urls = ['http://photo.weibo.com/1304494805/talbum/index#!/mode/2/page/1']
    start_urls = [
        'https://weibo.com/u/1304494805?is_all=1',  #街拍美
        'https://weibo.com/u/3757458303?is_all=1',  #街拍摄美
        # 'https://weibo.com/tajiepai?is_all=1'#她街拍
    ]

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.bmob_helper = BMobUploadHelper()
        self.key_words = ["四年"]  # posts containing these words are skipped
        self.craw_count = 0

    def hit_key_word(self, word):
        # True when the post text contains any of the configured key words
        return any(my_word in word for my_word in self.key_words)

    def parse(self, response):
        bsp = BeautifulSoup(response.body, 'lxml')
        wei_bo_group = bsp.select_one(".WB_feed.WB_feed_v3.WB_feed_v4")
        time.sleep(1)
        wei_bo_items = wei_bo_group.select(".WB_feed_detail.clearfix")
        cur_wei_bo_time = time_utils.get_jie_pai_wei_bo_scrapy_time()
        point_group_id = ""
        for wei_bo_item in wei_bo_items:

            # time
            wei_bo_time_item = wei_bo_item.select_one(".WB_from.S_txt2 > a")
            wei_bo_time = wei_bo_time_item["title"]

            if wei_bo_time < cur_wei_bo_time:
                logging.info("time is out of date, wei_bo_time: " +
                             wei_bo_time)
                continue

            # title
            wei_bo_title_item = wei_bo_item.select_one(".WB_text.W_f14")
            wei_bo_title = wei_bo_title_item.text
            wei_bo_title = wei_bo_title.replace("\n", "").strip()
            reobj = re.compile(r"\(.*\)")
            wei_bo_title_result, number = reobj.subn("", wei_bo_title)

            # filter out some posts by keyword
            if self.hit_key_word(wei_bo_title):
                logging.info("hit_key_word title: " + wei_bo_title)
                continue
            img_urls = []
            #pic
            wei_bo_pics = wei_bo_item.select(".WB_pic")
            for wei_bo_pic in wei_bo_pics:
                img_item = wei_bo_pic.select_one("img")
                img_url = img_item["src"]
                final_img_url = ""
                if "thumb150" in img_url:
                    final_img_url = "http:" + img_url.replace(
                        "thumb150", "mw690")
                elif "orj360" in img_url:
                    final_img_url = "http:" + img_url.replace(
                        "orj360", "mw690")
                if final_img_url:  # skip sizes we do not recognize
                    img_urls.append(final_img_url)

            if len(img_urls) > 0:
                #cover
                cover_url = img_urls[0]

                #upload cover
                group_content = self.bmob_helper.get_group_content(
                    cover_url, wei_bo_title_result)
                group_url = "https://api2.bmob.cn/1/classes/CardPicGroup"
                logging.info("parse_wei_bo_detail group data: " +
                             json.dumps(group_content, ensure_ascii=False))
                point_group_id = self.bmob_helper.upload_to_bmob(
                    group_url, group_content)

                #upload sub_pics
                sub_pic_url = "https://api2.bmob.cn/1/classes/CardPicBean"
                for index in range(1, len(img_urls)):
                    detail_content = self.bmob_helper.get_detail_content(
                        "", img_urls[index], point_group_id)
                    logging.info(
                        "upload sub_pics json: " +
                        json.dumps(detail_content, ensure_ascii=False))
                    self.bmob_helper.upload_to_bmob(sub_pic_url,
                                                    detail_content)

        self.craw_count += 1

        if self.craw_count == len(self.start_urls):
            # all done; persist the next scrape time
            time_utils.save_jie_pai_weibo_scrapy_time(
                time_utils.get_next_day_time())
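
The loop above upgrades Weibo thumbnails by swapping the size segment of the image URL (thumb150 or orj360 becomes mw690, values taken from the spider itself). Factored into a small helper, it might look like:

def upscale_weibo_image(img_url, sizes=("thumb150", "orj360"), target="mw690"):
    # swap a known thumbnail size segment for the larger one;
    # return None for URLs whose size segment we do not recognize
    for size in sizes:
        if size in img_url:
            return "http:" + img_url.replace(size, target)
    return None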
Example #4
import json
import logging
import time

import scrapy
from scrapy.loader import ItemLoader
from bs4 import BeautifulSoup, NavigableString

# time_utils, BMobUploadHelper and the HCJiePaiGroup item are project-local
# (see Example #1).


class JiepaiSpider(scrapy.Spider):

    name = 'jiepai'
    allowed_domains = [
        'blog.sina.com.cn',
        'www.bucuo.me',
    ]

    start_urls = [
        'http://blog.sina.com.cn/s/articlelist_1340398703_4_1.html',
        # 'https://www.bucuo.me/app/1583407618504778'
    ]


    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.sina = "http://blog.sina.com.cn/s/articlelist_1340398703_4_1.html"
        self.bucuo = "https://www.bucuo.me/app/1583407618504778"
        self.cur_time = time_utils.get_jie_pai_scrapy_time()
        self.bmob_helper = BMobUploadHelper()
        self.point_group_id = ""

    def parse(self, response):
        bsp = BeautifulSoup(response.body, 'lxml')
        cur_url = response.url
        article_class = self.get_article_list_class_by(cur_url)
        article_list = bsp.select(article_class)
        for article in article_list:

            article_time_class = self.get_article_time_class_by(cur_url)
            article_time = article.select(article_time_class)
            if cur_url == self.sina:
                scrap_time = article_time[0].string
            else:
                scrap_time = article_time[0]["title"]
            # if the very first article is already too old, stop right away
            if self.cur_time > scrap_time:
                logging.error("jie_pai_group time is out of date cur_time: " +
                              self.cur_time + " scrap_time: " + scrap_time)
                return
            link_title_class = self.get_title_class_by(cur_url)
            link_title = article.select(link_title_class)
            title = link_title[0].string
            link = link_title[0]['href']
            logging.info('jie_pai_group title: ' + title + ' link: ' + link +
                         " scrap_time: " + scrap_time)
            jie_pai_group_loader = ItemLoader(item=HCJiePaiGroup(),
                                              selector=response)
            jie_pai_group_loader.add_value('jie_pai_title', title)
            time.sleep(3)
            yield jie_pai_group_loader.load_item()

            if cur_url == self.sina:
                call_back = self.parse_sina_detail
                detail_url = link
            else:
                call_back = self.parse_bu_cuo_detail
                detail_url = "https://www.bucuo.me" + link

            yield scrapy.Request(detail_url, meta={"group_title": title},
                                 callback=call_back)
        # all done; persist the next scrape time
        time_utils.save_jie_pai_scrapy_time(time_utils.get_next_day_time())

    def parse_bu_cuo_detail(self, response):
        bsp = BeautifulSoup(response.body, 'lxml')
        for br in bsp('br'):
            br.extract()

        title = response.meta["group_title"]
        point_group_id = ""
        jie_pai_details = bsp.select(".body > p")
        is_first = True
        img_url = ""
        img_desc = ""
        for jie_pai_detail in jie_pai_details:
            img = jie_pai_detail.select('img')
            if img:  # image paragraph: remember the URL
                img_url = img[0]["src"]
                continue
            else:  # text paragraph: caption for the preceding image
                img_desc = jie_pai_detail.string

            # the first image is used as the cover
            if is_first:
                upload_group_content = self.bmob_helper.get_group_content(
                    img_url, title)
                url = "https://api2.bmob.cn/1/classes/CardPicGroup"
                logging.info("parse_bu_cuo_detail group data: " +
                             json.dumps(upload_group_content, ensure_ascii=False))
                # point_group_id = self.upload_to_bmob(url, upload_group_content)
                is_first = False
            else:
                # subsequent images become sub images
                upload_detail_content = self.bmob_helper.get_news_detail_content(
                    img_desc, img_url, point_group_id)
                url = "https://api2.bmob.cn/1/classes/CardPicBean"
                logging.info("parse_bu_cuo_detail detail data: " +
                             json.dumps(upload_detail_content, ensure_ascii=False))
                # self.upload_to_bmob(url, upload_detail_content)


    def parse_sina_detail(self, response):
        bsp = BeautifulSoup(response.body, 'lxml')
        for br in bsp('br'):
            br.extract()

        title = response.meta["group_title"]
        jie_pai_details = bsp.select_one('#sina_keyword_ad_area2')
        jie_pai_detail_links = jie_pai_details.select('a')
        is_first = True
        for jie_pai_detail in jie_pai_detail_links:
            link_content = jie_pai_detail['href']
            if 'photo.blog.sina.com.cn' in link_content:
                result = self.process_detail(is_first, jie_pai_detail, title)
                if result and is_first:
                    is_first = False
            else:
                logging.info('jie_pai_detail end')

    def process_detail(self, is_first, jie_pai_detail, title):
        jie_pai_detail_img = jie_pai_detail.select('img')
        img_width = 0
        img_height = 0
        if "width" in jie_pai_detail_img[0].attrs:
            img_width = int(jie_pai_detail_img[0]['width'])
        if "height" in jie_pai_detail_img[0].attrs:
            img_height = int(jie_pai_detail_img[0]['height'])
        if img_height > img_width:

            img_url = jie_pai_detail_img[0]['real_src']
            img_desc = jie_pai_detail.next_sibling

            # fall back to another way of getting img_desc
            img_desc = self.get_img_desc_if_needed(img_desc, jie_pai_detail)
            logging.info("type: " + str(type(img_desc)))
            # the first image is used as the cover
            if is_first:
                # upload the group
                upload_group_content = self.bmob_helper.get_group_content_with_title(
                    img_url, img_desc, title)
                url = "https://api2.bmob.cn/1/classes/StyleNews"
                logging.info("upload_group_content data: " +
                             json.dumps(upload_group_content, ensure_ascii=False))
                self.point_group_id = self.bmob_helper.upload_to_bmob(
                    url, upload_group_content)
            elif isinstance(img_desc, NavigableString):
                upload_detail_content = self.bmob_helper.get_news_detail_content(
                    img_desc, img_url, self.point_group_id)
                url = "https://api2.bmob.cn/1/classes/StyleDetailItem"
                logging.info("upload json: " +
                             json.dumps(upload_detail_content, ensure_ascii=False))
                self.bmob_helper.upload_to_bmob(url, upload_detail_content)
            # logging.info(
            #     'jie_pai_detail img_width: ' + str(img_width) + ' img_height: ' + str(
            #         img_height) + ' img_src: ' + img_url + ' img_desc: ' + img_desc)
            return True
        else:
            logging.info("jie_pai_detail img_height: " + str(img_height) +
                         " img_width: " + str(img_width))

        return False

    def get_img_desc_if_needed(self, img_desc, jie_pai_detail):
        # skip over bare newline siblings until real content shows up
        while img_desc == '\n':
            img_desc = jie_pai_detail.next_sibling
            jie_pai_detail = jie_pai_detail.next_sibling

        # a div sibling carries the description in its text
        if jie_pai_detail.name == "div":
            img_desc = jie_pai_detail.text

        return img_desc



    def get_article_list_class_by(self, url):
        article_class = ""
        if url == self.sina:
            article_class = '.articleCell.SG_j_linedot1'
        elif url == self.bucuo:
            article_class = ".art-item"

        return article_class

    def get_title_class_by(self, url):
        title_class = ""
        if url == self.sina:
            title_class = '.atc_title > a'
        elif url == self.bucuo:
            title_class = "h2 > a"

        return title_class

    def get_article_time_class_by(self, url):
        article_time_class = ""
        if url == self.sina:
            article_time_class = '.atc_tm.SG_txtc'
        elif url == self.bucuo:
            article_time_class = ".title-info > span"

        return article_time_class


    def get_details_class_by(self, url):
        details_class = ""
        if url == self.sina:
            details_class = '#sina_keyword_ad_area2'
        elif url == self.bucuo:
            details_class = ".art-item > a"

        return details_class
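
The four get_*_class_by helpers repeat the same URL dispatch. A dict-based lookup table is one common alternative; the sketch below is hypothetical and just mirrors the selectors used above:

SELECTORS = {
    "http://blog.sina.com.cn/s/articlelist_1340398703_4_1.html": {
        "article": ".articleCell.SG_j_linedot1",
        "title": ".atc_title > a",
        "time": ".atc_tm.SG_txtc",
        "details": "#sina_keyword_ad_area2",
    },
    "https://www.bucuo.me/app/1583407618504778": {
        "article": ".art-item",
        "title": "h2 > a",
        "time": ".title-info > span",
        "details": ".art-item > a",
    },
}


def get_selector(url, kind):
    # returns "" for unknown URLs or kinds, matching the original helpers
    return SELECTORS.get(url, {}).get(kind, "")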