Example #1
        def xt_pres_date(cls, raw_person):
            # Extract the administration date range from the second table cell
            admin_datestring = Selector(text=raw_person).xpath(
                '//td[2]/text()').extract()[0]
            try:
                if " - " in admin_datestring:
                    start_date = _clean(admin_datestring.split(' - ')[0])
                    end_date = _clean(admin_datestring.split(' - ')[1])

                    start_date = datetime.datetime.strptime(
                        start_date, "%d.%m.%Y").date()
                    end_date = datetime.datetime.strptime(
                        end_date, "%d.%m.%Y").date()
                else:
                    # Open-ended term: only a start date is present
                    start_date = datetime.datetime.strptime(
                        _clean(admin_datestring.replace(' -', '')),
                        "%d.%m.%Y").date()
                    end_date = None
            except (IndexError, ValueError):
                # Catch parse failures explicitly instead of a bare except,
                # and fall back to empty dates rather than dropping into a
                # debugger (the original ipdb.set_trace() would hang a crawl
                # and leave start_date/end_date unbound at the return)
                logger.error(
                    "Couldn't extract date from datestring {}".format(
                        admin_datestring))
                start_date, end_date = None, None

            return (start_date, end_date)
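For reference, a minimal standalone sketch of the same splitting logic, runnable outside the spider; the _clean below is a hypothetical stand-in for the project's own helper:

    import datetime

    def _clean(s):
        # Hypothetical stand-in for the project's _clean helper
        return s.strip()

    def parse_admin_dates(admin_datestring):
        # Mirrors the branch above: "start - end" vs. open-ended "start -"
        if " - " in admin_datestring:
            start, end = (_clean(p) for p in admin_datestring.split(" - ", 1))
            return (datetime.datetime.strptime(start, "%d.%m.%Y").date(),
                    datetime.datetime.strptime(end, "%d.%m.%Y").date())
        start = _clean(admin_datestring.replace(" -", ""))
        return (datetime.datetime.strptime(start, "%d.%m.%Y").date(), None)

    print(parse_admin_dates("01.02.2010 - 03.04.2014"))
    # (datetime.date(2010, 2, 1), datetime.date(2014, 4, 3))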
Example #2
File: Lu.py Project: RainmanRay/VSCODE
    def parse_subPage(self, response):
        item = LuyiluImgItem()

        item['img_url'] = Selector(response).xpath('//img[contains(@src,"images.") and not(@class="thumb")]/@src').extract()
        cur_title = Selector(response).xpath('//h1/text()').extract_first()
        # Strip a trailing "(n)" page counter from the title, if present
        par = re.compile(r'\(\d*\)')
        rst = par.findall(cur_title)
        rst = rst[0] if rst else ''
        item['title'] = cur_title.replace(rst, '')
        item['url'] = response.url
        next_suburl = Selector(response).xpath('//li[@class="next-page"]/a/@href').extract_first(default=None)
        if next_suburl is not None:
            next_pagurl = response.url.replace(response.url.split('/')[-1], next_suburl)
        else:
            next_pagurl = response.url

        yield Request(next_pagurl, callback=self.parse_subPage)
        yield item
        # Follow related links, skipping known off-site/ad domains; the five
        # separate findall counters collapse into one alternation
        rela_url = Selector(response).xpath('//a[contains(@href,"/20")]/@href').extract()
        blocked = re.compile(r'youfanhao|xiurenwang|youmihui|xiachedan|chuchu')
        for url in rela_url:
            if not blocked.search(url):
                yield Request('http://yxpjwnet1.com' + url, callback=self.parse_subPage)
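A quick sanity check of the consolidated blocked-domain pattern used above, with made-up paths:

    import re

    blocked = re.compile(r'youfanhao|xiurenwang|youmihui|xiachedan|chuchu')
    print(bool(blocked.search('/2020/xiurenwang-123.html')))  # True
    print(bool(blocked.search('/2020/gallery-456.html')))     # False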
Example #3
    def person(self, response):
        """Yield detail-page requests for every employee of the current company."""
        # Grab every row of the current table
        mycontinue = True
        tr = Selector(response=response).xpath('//tbody/tr')
        # Total record count, rendered as "(N)" in the links header
        all_date = Selector(response=response).xpath(
            '//div[@class="comp_regstaff_links"]/a[1]/span/text()'
        ).extract_first()
        one_name = Selector(response=response).xpath(
            '//tbody/tr[1]/td[2]/a/text()').extract_first()
        # Strip the parentheses and convert to int
        all_date = all_date.replace(')', '')
        all_date = int(all_date.replace('(', ''))
        if all_date == 0:
            print('---- company has no personnel\n\n')
            return
        if all_date < 26:
            # logging.info('------ personnel fit on one page\n\n')
            # logging.error('------ personnel fit on one page\n\n')
            mycontinue = False
        # Work out how many pages there can be (25 records per page)
        self.page = all_date // 25 + 2
        # Pull the onclick attribute from every person's <a> tag
        for r in tr:
            one_person = r.xpath('./td/a/@onclick').extract_first()
            if one_person is not None:
                person_url = one_person.split("top.window.location.href='")[1]
                person_url = person_url.split("'")[0]
                person_url = self.big_url + person_url
                time.sleep(0.5)
                yield Request(url=person_url, callback=self.person_detailed)

        # Check whether a pagination block exists
        another_page = Selector(
            response=response).xpath('//div[@class="clearfix"]')
        # Skip when the records fit on one page or there is no pagination element
        if another_page and mycontinue:
            for a in range(2, self.page):
                print(a)
                yield scrapy.FormRequest(response.url,
                                         formdata={'$pg': str(a)},
                                         callback=self.person)
            # Paginate only once
            mycontinue = False
Example #4
def get_profile_links(page_content: str) -> List[tuple]:
    profile_selector = "#companies-column > ul > li a"
    profile_links = Selector(text=page_content).css(profile_selector).extract()
    results = []
    for link in profile_links:
        href = Selector(text=link).css("a::attr(href)").extract()[0]
        company_title = Selector(text=link).css("h3::text").extract()[0]
        clean_company_title = escape_html(
            company_title.replace("  ", " ")).lower()
        results.append((clean_company_title, href))
    return results
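A quick way to exercise get_profile_links against a static fragment, assuming the function above is importable; the markup, and using html.escape as a stand-in for the project's escape_html helper, are assumptions for illustration:

    from html import escape as escape_html  # hypothetical stand-in
    from typing import List
    from parsel import Selector

    sample = '''
    <div id="companies-column"><ul>
      <li><a href="/company/acme"><h3>Acme  Corp</h3></a></li>
    </ul></div>
    '''
    print(get_profile_links(sample))
    # [('acme corp', '/company/acme')]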
Example #5
    def parse(self, response):
        # print(response.body)
        video_list = response.xpath(
            "//div[@class='search-video-wrap']/div[@class='video-list clearfix add-quick-recommend']/ul/li"
        ).extract()
        for item in video_list:
            # Create a fresh item per entry: reusing one instance across
            # yields lets later iterations overwrite earlier items
            video_item = VideoItem()
            video_url = Selector(text=item).xpath(
                "//a/div[@class='video-box']/video/@data-original"
            ).extract_first()
            # Drop the "_10s" preview suffix to get the full video URL
            video_url = video_url.replace("_10s", "")
            video_item['video_url'] = 'https:' + video_url

            video_title = Selector(text=item).xpath(
                "//a[@class='video-name fl']/h3/text()").extract_first()
            video_item['video_title'] = video_title

            video_time = Selector(text=item).xpath(
                "//a[@class='video-name fl']/span[@class='video-time']/text()"
            ).extract_first()
            video_item['video_time'] = video_time
            yield video_item
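A plain-dict sketch of why the item is now created inside the loop: yielding one mutable object repeatedly means every consumer sees only the last mutation.

    items = []
    shared = {}
    for url in ['https://a/v1', 'https://a/v2']:
        shared['video_url'] = url   # same dict mutated on every pass
        items.append(shared)
    print([i['video_url'] for i in items])  # ['https://a/v2', 'https://a/v2']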
Example #6
    def google_selector(self, response):
        base_url = "https://www.google.com.mx/"
        snippets = response.xpath("//div[@class='g']").extract()
        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        for snippet in snippets:
            num_snippet = num_snippet + 1
            storage_item = UsmItem()

            title = Selector(text=snippet).xpath("//a/b/text() | //a/text()").extract()
            cite = Selector(text=snippet).xpath("//cite").extract()
            # cite = Selector(text=snippet).xpath("//h3/a/@href").extract()

            text = Selector(text=snippet).xpath("//span[@class='st']").extract()

            if len(title) >= 2:
                title = title[0] + title[1]
            else:
                title = ""

            if len(cite) > 0:
                # cite = cite[0].split("url?q=")[-1]
                cite = cite[0]
                for r in ['<cite>', '</cite>', '<b>', '</b>']:
                    cite = cite.replace(r, '')
            else:
                cite = ""

            if len(text) > 0:
                text = text[0]
                for r in ['<span class="st">', '</span>', '<br>', '</br>', '<b>', '</b>']:
                    text = text.replace(r, '')
            else:
                text = ""

            if cite != "":
                self.log("---------------------------------")
                self.log("--------------TITLE--------------")
                self.log(title)
                self.log("-------------CITE----------------")
                self.log(cite)
                self.log("---------------TEXT--------------")
                self.log(text)
                self.log("------------ID PERSON------------")
                self.log(id_person)
                self.log("------------SEARCH---------------")
                self.log(search)
                self.log("--------------ATTR---------------")
                self.log(base_attr)
                self.log("-----------ENGINE SEARCH---------")
                self.log(self.browser)
                self.log("------------NUMBER SNIPPET-------")
                self.log(num_snippet)

                storage_item['title'] = title
                storage_item['cite'] = cite
                storage_item['text'] = text
                storage_item['id_person'] = id_person
                storage_item['search'] = search
                storage_item['attr'] = base_attr
                storage_item['engine_search'] = self.browser
                storage_item['number_snippet'] = num_snippet

                itemproc.process_item(storage_item, self)

        number = response.xpath("//td/b/text()").extract()
        self.log("-----------NUMBER OF PAGE-----")
        if number:
            self.log(number[0])
        # Guard against an empty result before indexing
        if number and int(number[0]) < 6:
            res = response.xpath("//td[@class='b'][@style='text-align:left']/a[@class='fl']/@href").extract()

            for url in res:
                self.log("--URL TO FOLLOW--")
                self.log(base_url + url)
                request = Request(base_url + url, callback=self.google_selector)
                request.meta['id_person'] = id_person
                request.meta['search'] = search
                request.meta['attr'] = base_attr
                request.meta['num_snip'] = num_snippet
                yield request
Example #7
    def parse_detail_info(self, response):
        detail_info = response.text

        def strip_tags(html):
            # Every branch stripped <a> and <img> tags with the same two
            # substitutions; factored into a local helper
            html = re.sub('</?a[^>]*>', '', html)
            return re.sub('</?img[^>]*>', '', html)

        if ('http://tech.sina.com.cn' in response.url) or (
                'https://tech.sina.com.cn' in response.url):
            url = response.url.strip()
            title = response.meta['title']
            content = ''.join(
                Selector(text=detail_info).xpath(
                    '//div[@id="artibody"]/p').extract())
            content = strip_tags(content)
            cover = response.meta['cover']
            pdf = ''
            if len(Selector(text=detail_info).xpath(
                    '//div[@id="keywords"]/a/text()').extract()) > 0:
                keywords = ','.join(
                    Selector(text=detail_info).xpath(
                        '//div[@id="keywords"]/a/text()').extract())
            else:
                keywords = ','.join(
                    Selector(text=detail_info).xpath(
                        '//p[@class="art_keywords"]/a/text()').extract())
            hot = response.meta['hot']
            type = response.meta['type']
            if type != '快讯':
                type = self.get_type(content)
            if len(Selector(text=detail_info).xpath(
                    '//span[@class="date"]/text()').extract()) > 0:
                update = Selector(text=detail_info).xpath(
                    '//span[@class="date"]/text()').extract()[0].strip()
            else:
                update = Selector(text=detail_info).xpath(
                    '//span[@id="pub_date"]/text()').extract()[0].strip()
            batch = self.batch
            table_name = 'spider.news'
            yield self.save_result(batch, url, title, content, cover, pdf,
                                   keywords, hot, type, update, table_name,
                                   response).load_item()
        elif 'http://report.iresearch.cn' in response.url:
            url = response.url
            title = response.meta['title']
            content = strip_tags(response.meta['content'])
            cover = response.meta['cover']
            pdf = ''
            pdf_price = Selector(text=detail_info).xpath(
                '//li[@class="price"]/text()').extract()[0]
            pdf_url = ('http://report.iresearch.cn/include/ajax/user_ajax.ashx'
                       '?reportid=' + str(url[url.rfind('/') + 1:-6]) +
                       '&work=rdown&url=' + url)
            if '¥0' == pdf_price:
                # Appears to rely on inline-request support: the response is
                # sent back into the generator
                pdf_content = yield Request(pdf_url)
                # self.save_pdf(pdf_content)
                pdf = base64.b64encode(pdf_content.body)
            keywords = response.meta['keywords']
            hot = response.meta['hot']
            type = response.meta['type']
            update = response.meta['update']
            batch = self.batch
            table_name = 'spider.news'
            yield self.save_result(batch, url, title, content, cover, pdf,
                                   keywords, hot, type, update, table_name,
                                   response).load_item()
        elif 'https://new.qq.com' in response.url:  # Parse Tencent Tech detail pages
            url = response.url.strip()
            title = response.meta['title']
            content = ''.join(
                Selector(text=detail_info).xpath(
                    '//div[@class="content-article"]/p').extract())
            if len(content) != 0:
                content = strip_tags(content)
                cover = response.meta['cover']
                pdf = ''
                keywords = Selector(text=detail_info).xpath(
                    '//meta[@name="keywords"]/@content').extract()[0].strip()
                hot = response.meta['hot']
                type = response.meta['type']
                if type != '快讯':
                    type = self.get_type(content)
                update = detail_info.split('pubtime": "')[1].split('",')[0]
                batch = self.batch
                table_name = 'spider.news'
                yield self.save_result(batch, url, title, content, cover, pdf,
                                       keywords, hot, type, update, table_name,
                                       response).load_item()
        elif "https://www.toutiao.com" in response.url:  # Parse Toutiao detail pages
            url = response.url.strip()
            title = Selector(text=detail_info).xpath(
                '//title/text()').extract()[0].strip()
            content = detail_info.split("content: '")[1].split("groupId: '")[0]
            content = content.replace(";',", "")
            # Do not encode to bytes here: re.sub with a str pattern
            # raises TypeError on bytes in Python 3
            content = strip_tags(content)
            cover = response.meta['cover']
            pdf = ''
            keywords = Selector(text=detail_info).xpath(
                '//meta[@name="keywords"]/@content').extract()[0].strip()
            hot = response.meta['hot']
            type = response.meta['type']
            if type != '快讯':
                type = self.get_type(content)
            update = detail_info.split("time: '")[1].split("'")[0]
            batch = self.batch
            table_name = 'spider.news'
            yield self.save_result(batch, url, title, content, cover, pdf,
                                   keywords, hot, type, update, table_name,
                                   response).load_item()
        elif "https://www.36kr.com" in response.url:  # Parse 36Kr detail pages
            url = response.url.strip()
            title = Selector(text=detail_info).xpath(
                '//title/text()').extract()[0].strip()
            title = title.replace('_36氪', '')
            content = ''.join(
                Selector(text=detail_info).xpath(
                    '//div[@class="common-width content articleDetailContent"]/p'
                ).extract())
            content = strip_tags(content)
            cover = response.meta['cover']
            pdf = ''
            keywords = Selector(text=detail_info).xpath(
                '//meta[@name="keywords"]/@content').extract()[0].strip()
            hot = response.meta['hot']
            type = response.meta['type']
            if type != '快讯':
                type = self.get_type(content)
            update = response.meta['update']
            batch = self.batch
            table_name = 'spider.news'
            yield self.save_result(batch, url, title, content, cover, pdf,
                                   keywords, hot, type, update, table_name,
                                   response).load_item()
        elif 'iresearch.cn' in response.url:  # Parse iResearch detail pages
            url = response.url.strip()
            title = Selector(text=detail_info).xpath(
                '//title/text()').extract()[0].strip()
            title = title.replace('_互联网_艾瑞网', '')
            content = ''.join(
                Selector(text=detail_info).xpath(
                    '//div[@class="m-article"]/p').extract())
            content = strip_tags(content)
            cover = response.meta['cover']
            pdf = ''
            keywords = Selector(text=detail_info).xpath(
                '//meta[@name="keywords"]/@content').extract()[0].strip()
            hot = response.meta['hot']
            type = response.meta['type']
            if type != '快讯':
                type = self.get_type(content)
            update = Selector(text=detail_info).xpath(
                '//div[@class="box"]//div[@class="origin"]//em/text()'
            ).extract()[0].strip()
            batch = self.batch
            table_name = 'spider.news'
            yield self.save_result(batch, url, title, content, cover, pdf,
                                   keywords, hot, type, update, table_name,
                                   response).load_item()
        elif 'http://www.sohu.com' in response.url:  # Parse Sohu Tech detail pages
            url = response.url.strip()
            title = Selector(text=detail_info).xpath(
                '//title/text()').extract()[0].strip()
            content = ''.join(
                Selector(text=detail_info).xpath(
                    '//article[@class="article"]/p').extract())
            content = strip_tags(content)
            cover = response.meta['cover']
            pdf = ''
            keywords = Selector(text=detail_info).xpath(
                '//meta[@name="keywords"]/@content').extract()[0].strip()
            hot = response.meta['hot']
            type = response.meta['type']
            if type != '快讯':
                type = self.get_type(content)
            update = Selector(text=detail_info).xpath(
                '//div[@class="article-info"]//span[@class="time"]/text()'
            ).extract()[0].strip()
            batch = self.batch
            table_name = 'spider.news'
            yield self.save_result(batch, url, title, content, cover, pdf,
                                   keywords, hot, type, update, table_name,
                                   response).load_item()
        elif 'http://www.tmtpost.com/' in response.url:  # Parse TMTPost detail pages
            url = response.url.strip()
            title = Selector(text=detail_info).xpath(
                '//title/text()').extract()[0].strip()
            title = title.replace('-钛媒体官方网站', '')
            content = ''.join(
                Selector(text=detail_info).xpath(
                    '//div[@class="inner"]/p').extract())
            content = strip_tags(content)
            cover = response.meta['cover']
            pdf = ''
            keywords = Selector(text=detail_info).xpath(
                '//meta[@name="keywords"]/@content').extract()[0].strip()
            hot = response.meta['hot']
            type = response.meta['type']
            if type != '快讯':
                type = self.get_type(content)
            update = Selector(text=detail_info).xpath(
                '//div[@class="post-info"]//span[@class="time "]/text()'
            ).extract()[0].strip()
            batch = self.batch
            table_name = 'spider.news'
            yield self.save_result(batch, url, title, content, cover, pdf,
                                   keywords, hot, type, update, table_name,
                                   response).load_item()
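The repeated tag-stripping in each branch reduces to the two substitutions in the local strip_tags helper above; a standalone check:

    import re

    def strip_tags(html):
        html = re.sub('</?a[^>]*>', '', html)
        return re.sub('</?img[^>]*>', '', html)

    print(strip_tags('<p>Hi <a href="/x">there</a><img src="y.png"></p>'))
    # <p>Hi there</p>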
Example #8
File: google.py Project: rcln/unoporunoDQN
    def google_selector(self, response):

        if response.status != self.STATUS_OK:
            with open("error.log", "a") as log_file:
                log_file.write(str(response.status) + " " + str(self.browser) + " " + datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
                return

        base_url = "https://www.google.com/"
        snippets = response.xpath("//div[@class='g']").extract()
        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        with open("system_google.log", "a") as log_file:
            log_file.write(str(response.status) + " " + str(self.browser) + " " + str(search) + " " + str(num_snippet) + " " + datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")

        for snippet in snippets:
            storage_item = UsmItem()

            title = Selector(text=snippet).xpath("//a/b/text() | //a/text()").extract()
            cite = Selector(text=snippet).xpath("//cite").extract()
            # cite = Selector(text=snippet).xpath("//h3/a/@href").extract()

            text = Selector(text=snippet).xpath("//span[@class='st']").extract()

            if len(title) >= 2:
                title = title[0] + title[1]
            else:
                title = ""

            if len(cite) > 0:
                # cite = cite[0].split("url?q=")[-1]
                cite = cite[0]
                for r in ['<cite>', '</cite>', '<b>', '</b>', '<cite class="kv">', '</cite class="kv">']:
                    cite = cite.replace(r, '')
            else:
                cite = ""

            if len(text) > 0:
                text = text[0]
                for r in ['<span class="st">', '</span>', '<br>', '</br>', '<b>', '</b>', '<span class="f">',
                          '<span class="nobr">']:
                    text = text.replace(r, '')
            else:
                text = ""

            if cite != "":
                if "facebook" not in cite and "youtube" not in cite:
                    text = Cleaner().clean_reserved_xml(text)
                    text = Cleaner().remove_accent(text)
                    title = Cleaner().clean_reserved_xml(title)
                    title = Cleaner().remove_accent(title)

                    if FeatureFilter.is_lang(text) == 'en':

                        num_snippet = num_snippet + 1
                        self.log("---------------------------------")
                        self.log("--------------TITLE--------------")
                        self.log(title)
                        self.log("-------------CITE----------------")
                        self.log(cite)
                        self.log("---------------TEXT--------------")
                        self.log(text)
                        self.log("------------ID PERSON------------")
                        self.log(id_person)
                        self.log("------------SEARCH---------------")
                        self.log(search)
                        self.log("--------------ATTR---------------")
                        self.log(base_attr)
                        self.log("-----------ENGINE SEARCH---------")
                        self.log(self.browser)
                        self.log("------------NUMBER SNIPPET-------")
                        self.log(num_snippet)

                        storage_item['title'] = title
                        storage_item['cite'] = cite
                        storage_item['text'] = text
                        storage_item['id_person'] = id_person
                        storage_item['search'] = search
                        storage_item['attr'] = base_attr
                        storage_item['engine_search'] = self.browser
                        storage_item['number_snippet'] = num_snippet

                        itemproc.process_item(storage_item, self)

        number = response.xpath("//td/b/text()").extract()
        self.log("-----------NUMBER OF PAGE-----")
        if number:
            self.log(number[0])
        # Guard against an empty result before indexing
        if number and int(number[0]) < 6 and num_snippet < 15:
            res = response.xpath("//td[@class='b'][@style='text-align:left']/a[@class='fl']/@href").extract()

            for url in res:
                self.log("--URL TO FOLLOW--")
                self.log(base_url + url)
                request = Request(base_url + url, callback=self.google_selector)
                request.meta['id_person'] = id_person
                request.meta['search'] = search
                request.meta['attr'] = base_attr
                request.meta['num_snip'] = num_snippet
                yield request
Example #9
    def duck_selector(self, response):

        base_url = "https://duckduckgo.com/"
        snippets = response\
            .xpath("//div[@class='result results_links results_links_deep web-result ']")\
            .extract()

        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        for snippet in snippets:
            storage_item = UsmItem()
            num_snippet = num_snippet + 1

            title = Selector(text=snippet).xpath("//div/h2/a/node()").extract()
            cite = Selector(text=snippet).xpath("//div/a/@href").extract()
            text = Selector(text=snippet).xpath(
                "//div/a[@class='result__snippet']/node()").extract()

            if len(title) > 0:
                tmp = ""
                # Use a distinct loop variable: the original iterated with
                # "for text in title", clobbering the extracted snippet text
                for part in title:
                    for r in ["<b>", "</b>"]:
                        part = part.replace(r, '')
                    tmp = tmp + part
                title = tmp
            else:
                title = ""

            if len(cite) > 0:
                cite = cite[0]
            else:
                cite = ""

            if len(text) > 0:
                tmp = ""
                # Iterate over the snippet text itself, not the title
                for part in text:
                    for r in ["<b>", "</b>"]:
                        part = part.replace(r, '')
                    tmp = tmp + part
                text = tmp
            else:
                text = ""

            if cite != "":
                self.log("---------------------------------")
                self.log("------------TITLE----------------")
                self.log(title)
                self.log("------------CITE-----------------")
                self.log(cite)
                self.log("------------TEXT-----------------")
                self.log(text)
                self.log("-----------ID PERSON-----------------")
                self.log(id_person)
                self.log("-----------SEARCH----------------")
                self.log(search)
                self.log("--------------ATTR---------------")
                self.log(base_attr)
                self.log("-----------ENGINE SEARCH---------")
                self.log(self.browser)
                self.log("------------NUMBER SNIPPET-------")
                self.log(num_snippet)

                storage_item['title'] = title
                storage_item['cite'] = cite
                storage_item['text'] = text
                storage_item['id_person'] = id_person
                storage_item['search'] = search
                storage_item['attr'] = base_attr
                storage_item['engine_search'] = self.browser
                storage_item['number_snippet'] = num_snippet

                itemproc.process_item(storage_item, self)
Example #10
File: bing.py Project: rcln/unoporunoDQN
    def bing_selector(self, response):

        if response.status != self.STATUS_OK:
            with open("error.log", "a") as log_file:
                log_file.write(
                    str(response.status) + " " + str(self.browser) + " " +
                    datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
                return

        base_url = "https://www.bing.com/"
        snippets = response.xpath("//li[@class='b_algo']").extract()
        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        with open("system_bing.log", "a") as log_file:
            log_file.write(
                str(response.status) + " " + str(self.browser) + " " +
                str(search) + " " + str(num_snippet) + " " +
                datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")

        for snippet in snippets:
            storage_item = UsmItem()
            title = Selector(text=snippet).xpath("//h2/a/node()").extract()
            cite = Selector(text=snippet).xpath("//h2/a/@href").extract()
            text = Selector(text=snippet).xpath("//p").extract()

            tmp_title = ""
            for cad in title:
                tmp_title = tmp_title + cad
            for r in ["<strong>", "</strong>"]:
                tmp_title = tmp_title.replace(r, '')
            title = tmp_title

            if len(cite) > 0:
                cite = cite[0]
            else:
                cite = ""

            if len(text) > 0:
                text = text[0]
                for r in [
                        "<p>", "</p>", "<strong>", "</strong>",
                        '<span class="news_dt">', '</span>'
                ]:
                    text = text.replace(r, '')
            else:
                text = ""

            if cite != "":
                if "facebook" not in cite and "youtube" not in cite:
                    text = Cleaner().clean_reserved_xml(text)
                    text = Cleaner().remove_accent(text)
                    title = Cleaner().clean_reserved_xml(title)
                    title = Cleaner().remove_accent(title)

                    if FeatureFilter.is_lang(text) == 'en':
                        num_snippet = num_snippet + 1

                        self.log("------------TITLE----------------")
                        self.log(title)
                        self.log("------------CITE-----------------")
                        self.log(cite)
                        self.log("------------TEXT-----------------")
                        self.log(text)
                        self.log("----------ID PERSON------------------")
                        self.log(id_person)
                        self.log("-----------SEARCH----------------")
                        self.log(search)
                        self.log("--------------ATTR---------------")
                        self.log(base_attr)
                        self.log("-----------ENGINE SEARCH---------")
                        self.log(self.browser)
                        self.log("------------NUMBER SNIPPET-------")
                        self.log(num_snippet)

                        storage_item['title'] = title
                        storage_item['cite'] = cite
                        storage_item['text'] = text
                        storage_item['id_person'] = id_person
                        storage_item['search'] = search
                        storage_item['attr'] = base_attr
                        storage_item['engine_search'] = self.browser
                        storage_item['number_snippet'] = num_snippet

                        itemproc.process_item(storage_item, self)

        number = response.xpath("//li[@class='b_pag']/nav[@role='navigation']"
                                "//a[@class='sb_pagS']/text()").extract()
        self.log("-----------NUMBER OF PAGE-------")
        if len(number) > 0:
            self.log(number[0])
            if int(number[0]) < 6 and num_snippet < 10:
                num = int(number[0]) + 1
                num = str(num)
                res = response.xpath(
                    "//li[@class='b_pag']/nav[@role='navigation']"
                    "//a[@aria-label='Page " + num + "']/@href").extract()
                for url in res:
                    self.log("--URL TO FOLLOW--")
                    self.log(base_url + url)

                    request = Request(base_url + url,
                                      callback=self.bing_selector)
                    request.meta['id_person'] = id_person
                    request.meta['attr'] = base_attr
                    request.meta['search'] = search
                    request.meta['num_snip'] = num_snippet
                    yield request
Example #11
    def duck_selector(self, response):

        if response.status != self.STATUS_OK:
            with open("error.log", "a") as log_file:
                log_file.write(
                    str(response.status) + " " + str(self.browser) + " " +
                    datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
                return

        base_url = "https://duckduckgo.com/"
        snippets = response \
            .xpath("//div[@class='result results_links results_links_deep web-result ']") \
            .extract()

        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        with open("system_duckduckgo.log", "a") as log_file:
            log_file.write(
                str(response.status) + " " + str(self.browser) + " " +
                str(search) + " " + str(num_snippet) + " " +
                datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")

        for snippet in snippets:
            storage_item = UsmItem()

            title = Selector(text=snippet).xpath("//div/h2/a/node()").extract()
            cite = Selector(text=snippet).xpath("//div/a/@href").extract()
            text = Selector(text=snippet).xpath(
                "//div/a[@class='result__snippet']/node()").extract()

            if len(title) > 0:
                tmp = ""
                # Distinct loop variable so the extracted text list survives
                for part in title:
                    for r in ["<b>", "</b>"]:
                        part = part.replace(r, '')
                    tmp = tmp + part
                title = tmp
            else:
                title = ""

            if len(cite) > 0:
                cite = cite[0]
            else:
                cite = ""

            if len(text) > 0:
                tmp = ""
                # Iterate over the snippet text itself, not the title
                for part in text:
                    for r in ["<b>", "</b>"]:
                        part = part.replace(r, '')
                    tmp = tmp + part
                text = tmp
            else:
                text = ""

            if cite != "" and num_snippet < 15:
                if "facebook" not in cite and "youtube" not in cite:
                    text = Cleaner().clean_reserved_xml(text)
                    text = Cleaner().remove_accent(text)
                    title = Cleaner().clean_reserved_xml(title)
                    title = Cleaner().remove_accent(title)

                    if FeatureFilter.is_lang(text) == 'en':
                        num_snippet = num_snippet + 1
                        self.log("---------------------------------")
                        self.log("------------TITLE----------------")
                        self.log(title)
                        self.log("------------CITE-----------------")
                        self.log(cite)
                        self.log("------------TEXT-----------------")
                        self.log(text)
                        self.log("-----------ID PERSON-----------------")
                        self.log(id_person)
                        self.log("-----------SEARCH----------------")
                        self.log(search)
                        self.log("--------------ATTR---------------")
                        self.log(base_attr)
                        self.log("-----------ENGINE SEARCH---------")
                        self.log(self.browser)
                        self.log("------------NUMBER SNIPPET-------")
                        self.log(num_snippet)

                        storage_item['title'] = title
                        storage_item['cite'] = cite
                        storage_item['text'] = text
                        storage_item['id_person'] = id_person
                        storage_item['search'] = search
                        storage_item['attr'] = base_attr
                        storage_item['engine_search'] = self.browser
                        storage_item['number_snippet'] = num_snippet

                        itemproc.process_item(storage_item, self)
    def bing_selector(self, response):
        base_url = "https://www.bing.com/"
        snippets = response.xpath("//li[@class='b_algo']").extract()
        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        for snippet in snippets:
            num_snippet = num_snippet + 1
            storage_item = UsmItem()
            title = Selector(text=snippet).xpath("//h2/a/node()").extract()
            cite = Selector(text=snippet).xpath("//h2/a/@href").extract()
            text = Selector(text=snippet).xpath("//p").extract()

            tmp_title = ""
            for cad in title:
                tmp_title = tmp_title + cad
            for r in ["<strong>", "</strong>"]:
                tmp_title = tmp_title.replace(r, '')
            title = tmp_title

            if len(cite) > 0:
                cite = cite[0]
            else:
                cite = ""

            if len(text) > 0:
                text = text[0]
                for r in ["<p>", "</p>", "<strong>", "</strong>"]:
                    text = text.replace(r, '')
            else:
                text = ""

            if cite != "":
                self.log("------------TITLE----------------")
                self.log(title)
                self.log("------------CITE-----------------")
                self.log(cite)
                self.log("------------TEXT-----------------")
                self.log(text)
                self.log("----------ID PERSON------------------")
                self.log(id_person)
                self.log("-----------SEARCH----------------")
                self.log(search)
                self.log("--------------ATTR---------------")
                self.log(base_attr)
                self.log("-----------ENGINE SEARCH---------")
                self.log(self.browser)
                self.log("------------NUMBER SNIPPET-------")
                self.log(num_snippet)

                storage_item['title'] = title
                storage_item['cite'] = cite
                storage_item['text'] = text
                storage_item['id_person'] = id_person
                storage_item['search'] = search
                storage_item['attr'] = base_attr
                storage_item['engine_search'] = self.browser
                storage_item['number_snippet'] = num_snippet

                itemproc.process_item(storage_item, self)
        number = response.xpath("//li[@class='b_pag']/nav[@role='navigation']"
                                "//a[@class='sb_pagS']/text()").extract()
        self.log("-----------NUMBER OF PAGE-------")
        if len(number) > 0:
            self.log(number[0])
            if int(number[0]) < 5:
                num = int(number[0]) + 1
                num = str(num)
                res = response.xpath(
                    "//li[@class='b_pag']/nav[@role='navigation']"
                    "//a[@aria-label='Page " + num + "']/@href").extract()
                for url in res:
                    self.log("--URL TO FOLLOW--")
                    self.log(base_url + url)

                    request = Request(base_url + url,
                                      callback=self.bing_selector)
                    request.meta['id_person'] = id_person
                    request.meta['attr'] = base_attr
                    request.meta['search'] = search
                    request.meta['num_snip'] = num_snippet
                    yield request