Пример #1
0
    def google_selector(self, response):
        """Store the snippets of a Google results page and follow pagination.

        ``response.meta`` must carry ``id_person``, ``attr``, ``search`` and
        ``num_snip`` (the running snippet counter). Every ``div.g`` result
        that still has a non-empty ``<cite>`` after tag stripping is logged,
        copied into a ``UsmItem`` and pushed to the item pipeline through
        ``itemproc.process_item``.

        Yields:
            A ``Request`` (handled by this same method) for each link matched
            by the pagination XPath, as long as the page marker read from
            ``//td/b`` is below 6. Each request repeats the incoming meta
            values with the updated snippet counter.
        """
        base_url = "https://www.google.com.mx/"
        snippets = response.xpath("//div[@class='g']").extract()
        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        for snippet in snippets:
            # Every result advances the counter, even if it is skipped below.
            num_snippet += 1

            # Parse the fragment once and reuse it for all three queries.
            fragment = Selector(text=snippet)
            title = fragment.xpath("//a/b/text() | //a/text()").extract()
            cite = fragment.xpath("//cite").extract()
            text = fragment.xpath("//span[@class='st']").extract()

            # The title is the first two text nodes glued together; results
            # with fewer than two nodes are stored with an empty title.
            title = title[0] + title[1] if len(title) >= 2 else ""

            cite = cite[0] if cite else ""
            for tag in ('<cite>', '</cite>', '<b>', '</b>'):
                cite = cite.replace(tag, '')

            if cite == "":
                # Without a cite there is nothing worth storing.
                continue

            text = text[0] if text else ""
            for tag in ('<span class="st">', '</span>', '<br>', '</br>',
                        '<b>', '</b>'):
                text = text.replace(tag, '')

            for line in (
                    "---------------------------------",
                    "--------------TITLE--------------", title,
                    "-------------CITE----------------", cite,
                    "---------------TEXT--------------", text,
                    "------------ID PERSON------------", id_person,
                    "------------SEARCH---------------", search,
                    "--------------ATTR---------------", base_attr,
                    "-----------ENGINE SEARCH---------", self.browser,
                    "------------NUMBER SNIPPET-------", num_snippet):
                self.log(line)

            storage_item = UsmItem()
            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet

            # Items are handed to the pipeline directly instead of being
            # yielded from the callback.
            itemproc.process_item(storage_item, self)

        number = response.xpath("//td/b/text()").extract()
        self.log("-----------NUMBER OF PAGE-----")
        if not number:
            # No page marker (e.g. a single-page or empty result set):
            # there is nothing to follow, and indexing would raise.
            return
        self.log(number[0])

        try:
            page = int(number[0])
        except ValueError:
            # The marker is not a number; stop paginating instead of crashing.
            return

        if page < 6:
            res = response.xpath(
                "//td[@class='b'][@style='text-align:left']"
                "/a[@class='fl']/@href").extract()

            for url in res:
                self.log("--URL TO FOLLOW--")
                self.log(base_url + url)
                request = Request(base_url + url,
                                  callback=self.google_selector)
                request.meta['id_person'] = id_person
                request.meta['search'] = search
                request.meta['attr'] = base_attr
                request.meta['num_snip'] = num_snippet
                yield request
Пример #2
0
    def cite_selector(self, response):
        """Store the results of a CiteSeerX page and follow its pager link.

        ``response.meta`` must carry ``id_person``, ``attr``, ``search`` and
        ``num_snip`` (the running snippet counter). Every ``div.result`` that
        has a title link is logged, copied into a ``UsmItem`` and pushed to
        the item pipeline through ``itemproc.process_item``.

        Yields:
            At most one ``Request`` (handled by this same method) for the
            first pager link, while the number parsed from the
            ``result_info`` header is below 60. Nothing is yielded when that
            header is missing, reads "No results found" or cannot be parsed.
        """
        base_url = "http://citeseerx.ist.psu.edu/"
        snippets = response.xpath("//div[@class='result']").extract()
        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        for snippet in snippets:
            # Every result advances the counter, even if it is skipped below.
            num_snippet += 1

            # Parse the fragment once and reuse it for all three queries.
            fragment = Selector(text=snippet)
            title_nodes = fragment.xpath("//h3/a/node()").extract()
            cite = fragment.xpath("//h3/a/@href").extract()
            text = fragment.xpath("//div[@class='snippet']/text()").extract()

            # Strip highlight markup and line breaks node by node, then join.
            cleaned = []
            for node in title_nodes:
                for token in ('<em>', '</em>', '\n'):
                    node = node.replace(token, '')
                cleaned.append(node)
            title = "".join(cleaned).strip()

            if not cite:
                # No title link means no URL to store for this result.
                continue
            cite = base_url + cite[0]
            text = text[0] if text else ""

            for line in (
                    "---------------------------------",
                    "------------TITLE----------------", title,
                    "------------CITE-----------------", cite,
                    "------------TEXT-----------------", text,
                    "------------ID PERSON----------------", id_person,
                    "------------SEARCH---------------", search,
                    "--------------ATTR---------------", base_attr,
                    "-----------ENGINE SEARCH---------", self.browser,
                    "------------NUMBER SNIPPET-------", num_snippet):
                self.log(line)

            storage_item = UsmItem()
            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet

            # Items are handed to the pipeline directly instead of being
            # yielded from the callback.
            itemproc.process_item(storage_item, self)

        num = response.xpath(
            "//div[@id='result_info']/strong/text()").extract()

        if not num or num == ['No results found']:
            # Empty result pages have no counter to parse and no pager.
            return

        fields = num[0].split(' ')
        if len(fields) < 3:
            # Unexpected header layout; stop instead of raising IndexError.
            return
        num = fields[2]

        self.log("----------NUM OF ELEMENTS---------")
        self.log(num)

        try:
            shown = int(num)
        except ValueError:
            # Third token is not a number; do not paginate.
            return

        if shown < 60:
            url = response.xpath("//div[@id='result_info']"
                                 "/div[@id='pager']/a/@href").extract()
            self.log("------------URL TO FOLLOW ------------")
            if url:
                self.log(base_url + url[0])

                request = Request(base_url + url[0],
                                  callback=self.cite_selector)
                request.meta['id_person'] = id_person
                request.meta['search'] = search
                request.meta['attr'] = base_attr
                request.meta['num_snip'] = num_snippet
                yield request
Пример #3
0
    def bing_selector(self, response):
        """Store English Bing results and request the following page.

        A response whose status differs from ``self.STATUS_OK`` is recorded
        in ``error.log`` and otherwise ignored. For an OK response one line
        is appended to ``system_bing.log``; then every ``li.b_algo`` result
        with a link that mentions neither "facebook" nor "youtube", and whose
        cleaned text is detected as English, is logged, copied into a
        ``UsmItem`` and pushed to the pipeline via ``itemproc.process_item``.

        Yields:
            A ``Request`` (handled by this same method) for each link labelled
            "Page <current + 1>", while the current page is below 6 and fewer
            than 10 snippets have been stored.
        """
        if response.status != self.STATUS_OK:
            with open("error.log", "a") as error_file:
                error_file.write(" ".join([
                    str(response.status),
                    str(self.browser),
                    datetime.today().strftime("%y-%m-%d-%H-%M"),
                ]) + "\n")
            return

        base_url = "https://www.bing.com/"
        snippets = response.xpath("//li[@class='b_algo']").extract()
        itemproc = self.crawler.engine.scraper.itemproc

        meta = response.meta
        id_person = meta['id_person']
        base_attr = meta['attr']
        search = meta['search']
        num_snippet = meta['num_snip']

        with open("system_bing.log", "a") as trace_file:
            trace_file.write(" ".join([
                str(response.status),
                str(self.browser),
                str(search),
                str(num_snippet),
                datetime.today().strftime("%y-%m-%d-%H-%M"),
            ]) + "\n")

        for raw in snippets:
            storage_item = UsmItem()
            fragment = Selector(text=raw)
            title_nodes = fragment.xpath("//h2/a/node()").extract()
            hrefs = fragment.xpath("//h2/a/@href").extract()
            paragraphs = fragment.xpath("//p").extract()

            # Join the title nodes first, then drop the highlight tags.
            title = "".join(title_nodes)
            for tag in ("<strong>", "</strong>"):
                title = title.replace(tag, '')

            cite = hrefs[0] if hrefs else ""

            text = ""
            if paragraphs:
                text = paragraphs[0]
                for tag in ("<p>", "</p>", "<strong>", "</strong>",
                            '<span class="news_dt">', '</span>'):
                    text = text.replace(tag, '')

            # Guard clauses: skip results without a link or from the
            # excluded sites before doing any cleaning work.
            if cite == "":
                continue
            if "facebook" in cite or "youtube" in cite:
                continue

            text = Cleaner.clean_reserved_xml(Cleaner(), text)
            text = Cleaner.remove_accent(Cleaner(), text)
            title = Cleaner.clean_reserved_xml(Cleaner(), title)
            title = Cleaner.remove_accent(Cleaner(), title)

            if FeatureFilter.is_lang(text) != 'en':
                continue

            # Only stored snippets advance the counter.
            num_snippet = num_snippet + 1

            for line in (
                    "------------TITLE----------------", title,
                    "------------CITE-----------------", cite,
                    "------------TEXT-----------------", text,
                    "----------ID PERSON------------------", id_person,
                    "-----------SEARCH----------------", search,
                    "--------------ATTR---------------", base_attr,
                    "-----------ENGINE SEARCH---------", self.browser,
                    "------------NUMBER SNIPPET-------", num_snippet):
                self.log(line)

            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet

            itemproc.process_item(storage_item, self)

        number = response.xpath("//li[@class='b_pag']/nav[@role='navigation']"
                                "//a[@class='sb_pagS']/text()").extract()
        self.log("-----------NUMBER OF PAGE-------")
        if not number:
            return

        self.log(number[0] + "")
        current_page = int(number[0])
        if current_page < 6 and num_snippet < 10:
            next_label = str(current_page + 1)
            next_links = response.xpath(
                "//li[@class='b_pag']/nav[@role='navigation']"
                "//a[@aria-label='Page " + next_label + "']/@href").extract()
            for href in next_links:
                target = base_url + href
                self.log("--URL TO FOLLOW--")
                self.log(target)

                request = Request(target, callback=self.bing_selector)
                request.meta['id_person'] = id_person
                request.meta['attr'] = base_attr
                request.meta['search'] = search
                request.meta['num_snip'] = num_snippet
                yield request
Пример #4
0
    def google_selector(self, response):
        """Store English Google results and follow pagination links.

        A response whose status differs from ``self.STATUS_OK`` is recorded
        in ``error.log`` and otherwise ignored. For an OK response one line
        is appended to ``system_google.log``; then every ``div.g`` result
        with a non-empty cite that mentions neither "facebook" nor "youtube",
        and whose cleaned text is detected as English, is logged, copied into
        a ``UsmItem`` and pushed to the pipeline via
        ``itemproc.process_item``.

        ``response.meta`` must carry ``id_person``, ``attr``, ``search`` and
        ``num_snip`` (the running count of stored snippets).

        Yields:
            A ``Request`` (handled by this same method) for each link matched
            by the pagination XPath, while the page marker read from
            ``//td/b`` is below 6 and fewer than 15 snippets are stored.
        """
        if response.status != self.STATUS_OK:
            with open("error.log", "a") as log_file:
                log_file.write(
                    str(response.status) + " " + str(self.browser) + " " +
                    datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
            return

        base_url = "https://www.google.com/"
        snippets = response.xpath("//div[@class='g']").extract()
        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        with open("system_google.log", "a") as log_file:
            log_file.write(
                str(response.status) + " " + str(self.browser) + " " +
                str(search) + " " + str(num_snippet) + " " +
                datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")

        for snippet in snippets:
            # Parse the fragment once and reuse it for all three queries.
            fragment = Selector(text=snippet)
            title = fragment.xpath("//a/b/text() | //a/text()").extract()
            cite = fragment.xpath("//cite").extract()
            text = fragment.xpath("//span[@class='st']").extract()

            # The title is the first two text nodes glued together; results
            # with fewer than two nodes are stored with an empty title.
            title = title[0] + title[1] if len(title) >= 2 else ""

            cite = cite[0] if cite else ""
            for tag in ('<cite>', '</cite>', '<b>', '</b>',
                        '<cite class="kv">'):
                cite = cite.replace(tag, '')

            if cite == "" or "facebook" in cite or "youtube" in cite:
                # Nothing to store, or a site that is deliberately excluded.
                continue

            text = text[0] if text else ""
            for tag in ('<span class="st">', '</span>', '<br>', '</br>',
                        '<b>', '</b>', '<span class="f">',
                        '<span class="nobr">'):
                text = text.replace(tag, '')

            text = Cleaner.clean_reserved_xml(Cleaner(), text)
            text = Cleaner.remove_accent(Cleaner(), text)
            title = Cleaner.clean_reserved_xml(Cleaner(), title)
            title = Cleaner.remove_accent(Cleaner(), title)

            if FeatureFilter.is_lang(text) != 'en':
                continue

            # Only stored snippets advance the counter.
            num_snippet += 1

            for line in (
                    "---------------------------------",
                    "--------------TITLE--------------", title,
                    "-------------CITE----------------", cite,
                    "---------------TEXT--------------", text,
                    "------------ID PERSON------------", id_person,
                    "------------SEARCH---------------", search,
                    "--------------ATTR---------------", base_attr,
                    "-----------ENGINE SEARCH---------", self.browser,
                    "------------NUMBER SNIPPET-------", num_snippet):
                self.log(line)

            storage_item = UsmItem()
            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet

            # Items are handed to the pipeline directly instead of being
            # yielded from the callback.
            itemproc.process_item(storage_item, self)

        number = response.xpath("//td/b/text()").extract()
        self.log("-----------NUMBER OF PAGE-----")
        if not number:
            # No page marker (e.g. a single-page or empty result set):
            # there is nothing to follow, and indexing would raise.
            return
        self.log(number[0])

        try:
            page = int(number[0])
        except ValueError:
            # The marker is not a number; stop paginating instead of crashing.
            return

        if page < 6 and num_snippet < 15:
            res = response.xpath(
                "//td[@class='b'][@style='text-align:left']"
                "/a[@class='fl']/@href").extract()

            for url in res:
                self.log("--URL TO FOLLOW--")
                self.log(base_url + url)
                request = Request(base_url + url,
                                  callback=self.google_selector)
                request.meta['id_person'] = id_person
                request.meta['search'] = search
                request.meta['attr'] = base_attr
                request.meta['num_snip'] = num_snippet
                yield request
Пример #5
0
    def cite_selector(self, response):
        """Store English CiteSeerX results and follow the pager link.

        A response whose status differs from ``self.STATUS_OK`` is recorded
        in ``error.log`` and otherwise ignored. For an OK response one line
        is appended to ``system_citeseer.log`` and the call counter kept in
        ``count_citeseerx.txt`` is advanced (it starts at "0" when the file
        is missing or unreadable as a number). Every ``div.result`` with a
        title link whose cleaned text is detected as English is logged,
        copied into a ``UsmItem`` and pushed to the pipeline via
        ``itemproc.process_item``.

        ``response.meta`` must carry ``id_person``, ``attr``, ``search`` and
        ``num_snip`` (the running count of stored snippets).

        Yields:
            At most one ``Request`` (handled by this same method) for the
            first pager link, while the number parsed from the
            ``result_info`` header is below 60 and fewer than 15 snippets are
            stored. If that header cannot be parsed, the response body is
            written to ``error_num_citeseer.html`` and nothing is yielded.
        """
        if response.status != self.STATUS_OK:
            with open("error.log", "a") as log_file:
                log_file.write(
                    str(response.status) + " " + str(self.browser) + " " +
                    datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
            return

        base_url = "http://citeseerx.ist.psu.edu/"
        snippets = response.xpath("//div[@class='result']").extract()
        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        with open("system_citeseer.log", "a") as log_file:
            log_file.write(
                str(response.status) + " " + str(self.browser) + " " +
                str(search) + " " + str(num_snippet) + " " +
                datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")

        # Advance the on-disk call counter. An empty or corrupt file (for
        # instance after an interrupted write) restarts the count at 0
        # instead of aborting the whole callback with ValueError.
        counter_path = 'count_citeseerx.txt'
        next_count = "0"
        if os.path.isfile(counter_path):
            with open(counter_path, 'r') as counter_file:
                try:
                    next_count = str(int(counter_file.readline()) + 1)
                except ValueError:
                    next_count = "0"
        with open(counter_path, 'w') as counter_file:
            counter_file.write(next_count)

        for snippet in snippets:
            # Parse the fragment once and reuse it for all three queries.
            fragment = Selector(text=snippet)
            title_nodes = fragment.xpath("//h3/a/node()").extract()
            cite = fragment.xpath("//h3/a/@href").extract()
            text = fragment.xpath("//div[@class='snippet']/text()").extract()

            # Strip highlight markup and line breaks node by node, then join.
            cleaned = []
            for node in title_nodes:
                for token in ('<em>', '</em>', '\n'):
                    node = node.replace(token, '')
                cleaned.append(node)
            title = "".join(cleaned).strip()

            if not cite:
                # No title link means no URL to store for this result.
                continue
            cite = base_url + cite[0]
            if "facebook" in cite or "youtube" in cite:
                continue

            text = text[0] if text else ""

            text = Cleaner.clean_reserved_xml(Cleaner(), text)
            text = Cleaner.remove_accent(Cleaner(), text)
            title = Cleaner.clean_reserved_xml(Cleaner(), title)
            title = Cleaner.remove_accent(Cleaner(), title)

            if FeatureFilter.is_lang(text) != 'en':
                continue

            # Only stored snippets advance the counter.
            num_snippet += 1

            for line in (
                    "---------------------------------",
                    "------------TITLE----------------", title,
                    "------------CITE-----------------", cite,
                    "------------TEXT-----------------", text,
                    "------------ID PERSON----------------", id_person,
                    "------------SEARCH---------------", search,
                    "--------------ATTR---------------", base_attr,
                    "-----------ENGINE SEARCH---------", self.browser,
                    "------------NUMBER SNIPPET-------", num_snippet):
                self.log(line)

            storage_item = UsmItem()
            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet

            # Items are handed to the pipeline directly instead of being
            # yielded from the callback.
            itemproc.process_item(storage_item, self)

        num = response.xpath(
            "//div[@id='result_info']/strong/text()").extract()

        if not num or num == ['No results found']:
            return

        # Only the header parsing is guarded. The previous bare ``except``
        # also wrapped the ``yield`` and therefore swallowed GeneratorExit
        # and unrelated errors, writing a misleading dump file.
        self.log("----------NUM OF ELEMENTS---------")
        try:
            num = num[0].split(' ')[2]
            self.log(num)
            shown = int(num)
        except (IndexError, ValueError):
            with open("error_num_citeseer.html", "w") as log_file:
                log_file.write(str(response.body))
            return

        if shown < 60 and num_snippet < 15:
            url = response.xpath("//div[@id='result_info']"
                                 "/div[@id='pager']/a/@href").extract()
            self.log("------------URL TO FOLLOW ------------")
            if url:
                self.log(base_url + url[0])

                request = Request(base_url + url[0],
                                  callback=self.cite_selector)
                request.meta['id_person'] = id_person
                request.meta['search'] = search
                request.meta['attr'] = base_attr
                request.meta['num_snip'] = num_snippet
                yield request
Пример #6
0
    def duck_selector(self, response):
        """Store English DuckDuckGo results found on ``response``.

        A response whose status differs from ``self.STATUS_OK`` is recorded
        in ``error.log`` and otherwise ignored. For an OK response one line
        is appended to ``system_duckduckgo.log``; then, until 15 snippets
        have been stored, every result with a link that mentions neither
        "facebook" nor "youtube", and whose cleaned snippet text is detected
        as English, is logged, copied into a ``UsmItem`` and pushed to the
        pipeline via ``itemproc.process_item``.

        ``response.meta`` must carry ``id_person``, ``attr``, ``search`` and
        ``num_snip`` (the running count of stored snippets). The method
        returns nothing and requests no further pages.
        """
        if response.status != self.STATUS_OK:
            with open("error.log", "a") as log_file:
                log_file.write(
                    str(response.status) + " " + str(self.browser) + " " +
                    datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
            return

        snippets = response \
            .xpath("//div[@class='result results_links results_links_deep web-result ']") \
            .extract()

        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        with open("system_duckduckgo.log", "a") as log_file:
            log_file.write(
                str(response.status) + " " + str(self.browser) + " " +
                str(search) + " " + str(num_snippet) + " " +
                datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")

        for snippet in snippets:
            # Parse the fragment once and reuse it for all three queries.
            fragment = Selector(text=snippet)
            title_nodes = fragment.xpath("//div/h2/a/node()").extract()
            cite = fragment.xpath("//div/a/@href").extract()
            text_nodes = fragment.xpath(
                "//div/a[@class='result__snippet']/node()").extract()

            # Title and text are cleaned independently. The earlier code
            # reused ``text`` as the title loop variable and then rebuilt the
            # text from the title, so the stored text was always a copy of
            # the title.
            title = ""
            for node in title_nodes:
                for tag in ("<b>", "</b>"):
                    node = node.replace(tag, '')
                title += node

            text = ""
            for node in text_nodes:
                for tag in ("<b>", "</b>"):
                    node = node.replace(tag, '')
                text += node

            cite = cite[0] if cite else ""

            if cite == "" or num_snippet >= 15:
                continue
            if "facebook" in cite or "youtube" in cite:
                continue

            text = Cleaner.clean_reserved_xml(Cleaner(), text)
            text = Cleaner.remove_accent(Cleaner(), text)
            title = Cleaner.clean_reserved_xml(Cleaner(), title)
            title = Cleaner.remove_accent(Cleaner(), title)

            if FeatureFilter.is_lang(text) != 'en':
                continue

            # Only stored snippets advance the counter.
            num_snippet += 1

            for line in (
                    "---------------------------------",
                    "------------TITLE----------------", title,
                    "------------CITE-----------------", cite,
                    "------------TEXT-----------------", text,
                    "-----------ID PERSON-----------------", id_person,
                    "-----------SEARCH----------------", search,
                    "--------------ATTR---------------", base_attr,
                    "-----------ENGINE SEARCH---------", self.browser,
                    "------------NUMBER SNIPPET-------", num_snippet):
                self.log(line)

            storage_item = UsmItem()
            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet

            # Items are handed to the pipeline directly instead of being
            # returned from the callback.
            itemproc.process_item(storage_item, self)
Пример #7
0
    def parse_myrg(self, response):
        """Store one item per search-result box found on ``response``.

        For every element whose class contains ``search-box__result-item``
        the span and div text fragments are filtered against two black
        lists, de-duplicated and joined with spaces to form the item text.
        The title is the first link text, or the second one when the first
        is the literal "Source". The stored cite is the URL of the page
        itself, not a per-result link.

        ``response.meta`` must carry ``id_person``, ``attr``, ``search`` and
        ``num_snip``. Items are pushed to the pipeline through
        ``itemproc.process_item``; the method returns nothing.
        """
        black_list = ['1', ' ', '[...]', 'Recommendation', 'Download', 'Recommend', 'Follow', 'Share',
                      'Request full-text']
        black_list2 = ['Referenced in the project:', 'Research from: ', ' ']

        def ordered_unique(values, excluded):
            # Drop excluded and repeated strings while keeping page order.
            # The earlier set difference gave an arbitrary word order that
            # could change from one run to the next.
            seen = set(excluded)
            kept = []
            for value in values:
                if value not in seen:
                    seen.add(value)
                    kept.append(value)
            return kept

        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        for search_box in response.xpath('//div[contains(@class, "search-box__result-item")]'):
            num_snippet += 1

            span_texts = ordered_unique(
                search_box.css('span::text').extract(), black_list)
            div_texts = ordered_unique(
                search_box.css('div::text').extract(), black_list2)
            text = ' '.join(span_texts + div_texts)

            url = response.url
            title_list = search_box.css('a.nova-e-link--theme-bare ::text').extract()

            try:
                # A leading "Source" label is skipped in favour of the next
                # link text.
                title = title_list[1] if title_list[0] == 'Source' else title_list[0]
            except IndexError:
                # No usable link text in this box.
                title = "NO TITLE WAS FOUND, YOUR SCRAPER SUCKS!"

            for line in (
                    "---------------------------------",
                    "--------------TITLE--------------", title,
                    "-------------CITE----------------", url,
                    "---------------TEXT--------------", text,
                    "------------ID PERSON------------", id_person,
                    "------------SEARCH---------------", search,
                    "--------------ATTR---------------", base_attr,
                    "-----------ENGINE SEARCH---------", self.browser,
                    "------------NUMBER SNIPPET-------", num_snippet):
                self.log(line)

            storage_item = UsmItem()
            storage_item['title'] = title
            storage_item['cite'] = url
            storage_item['text'] = text
            storage_item['engine_search'] = self.browser

            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['number_snippet'] = num_snippet

            itemproc.process_item(storage_item, self)
Пример #8
0
    def duck_selector(self, response):
        """Store the DuckDuckGo results found on ``response``.

        Every result block that has a link is logged, copied into a
        ``UsmItem`` and pushed to the item pipeline through
        ``itemproc.process_item``.

        ``response.meta`` must carry ``id_person``, ``attr``, ``search`` and
        ``num_snip`` (the running snippet counter, advanced once per result
        block whether or not it is stored). The method returns nothing and
        requests no further pages.
        """
        snippets = response \
            .xpath("//div[@class='result results_links results_links_deep web-result ']") \
            .extract()

        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        for snippet in snippets:
            num_snippet += 1

            # Parse the fragment once and reuse it for all three queries.
            fragment = Selector(text=snippet)
            title_nodes = fragment.xpath("//div/h2/a/node()").extract()
            cite = fragment.xpath("//div/a/@href").extract()
            text_nodes = fragment.xpath(
                "//div/a[@class='result__snippet']/node()").extract()

            # Title and text are cleaned independently. The earlier code
            # reused ``text`` as the title loop variable and then rebuilt the
            # text from the title, so the stored text was always a copy of
            # the title.
            title = ""
            for node in title_nodes:
                for tag in ("<b>", "</b>"):
                    node = node.replace(tag, '')
                title += node

            text = ""
            for node in text_nodes:
                for tag in ("<b>", "</b>"):
                    node = node.replace(tag, '')
                text += node

            if not cite:
                # No link in this block: nothing to store.
                continue
            cite = cite[0]
            if cite == "":
                continue

            for line in (
                    "---------------------------------",
                    "------------TITLE----------------", title,
                    "------------CITE-----------------", cite,
                    "------------TEXT-----------------", text,
                    "-----------ID PERSON-----------------", id_person,
                    "-----------SEARCH----------------", search,
                    "--------------ATTR---------------", base_attr,
                    "-----------ENGINE SEARCH---------", self.browser,
                    "------------NUMBER SNIPPET-------", num_snippet):
                self.log(line)

            storage_item = UsmItem()
            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet

            # Items are handed to the pipeline directly instead of being
            # returned from the callback.
            itemproc.process_item(storage_item, self)
Пример #9
0
    def bing_selector(self, response):
        """Parse a Bing results page, store its hits and follow pagination.

        Each ``li.b_algo`` result that carries a link is copied into a
        ``UsmItem`` and passed to the scraper's item processor.  While the
        current page number is lower than 5, a request for the next page is
        yielded, carrying the running snippet counter in its ``meta``.

        :param response: Bing results page.  ``response.meta`` must contain
            ``id_person``, ``attr``, ``search`` and ``num_snip``.
        :return: generator yielding the ``Request`` objects for the next page.
        """
        base_url = "https://www.bing.com/"
        snippets = response.xpath("//li[@class='b_algo']").extract()
        itemproc = self.crawler.engine.scraper.itemproc

        id_person = response.meta['id_person']
        base_attr = response.meta['attr']
        search = response.meta['search']
        num_snippet = response.meta['num_snip']

        def strip_markup(value, markup):
            # Remove every literal tag in ``markup`` from ``value``.
            for tag in markup:
                value = value.replace(tag, '')
            return value

        for snippet in snippets:
            # Every snippet is counted, even the ones skipped below.
            num_snippet += 1

            # Parse the snippet once and reuse the selector for all fields.
            snippet_sel = Selector(text=snippet)
            title_nodes = snippet_sel.xpath("//h2/a/node()").extract()
            links = snippet_sel.xpath("//h2/a/@href").extract()
            paragraphs = snippet_sel.xpath("//p").extract()

            cite = links[0] if links else ""
            if not cite:
                # A result without a link is not stored.
                continue

            title = strip_markup("".join(title_nodes),
                                 ["<strong>", "</strong>"])

            if paragraphs:
                text = strip_markup(paragraphs[0], [
                    "<p>", "</p>", "<strong>", "</strong>",
                    '<span class="news_dt">', '</span>'
                ])
            else:
                text = ""

            for banner, value in (
                    ("------------TITLE----------------", title),
                    ("------------CITE-----------------", cite),
                    ("------------TEXT-----------------", text),
                    ("----------ID PERSON------------------", id_person),
                    ("-----------SEARCH----------------", search),
                    ("--------------ATTR---------------", base_attr),
                    ("-----------ENGINE SEARCH---------", self.browser),
                    ("------------NUMBER SNIPPET-------", num_snippet),
            ):
                self.log(banner)
                self.log(value)

            storage_item = UsmItem()
            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet

            itemproc.process_item(storage_item, self)

        number = response.xpath("//li[@class='b_pag']/nav[@role='navigation']"
                                "//a[@class='sb_pagS']/text()").extract()
        self.log("-----------NUMBER OF PAGE-------")
        if number:
            self.log(number[0])
            try:
                current_page = int(number[0])
            except ValueError:
                # Unexpected pagination label: stop paginating instead of
                # aborting the whole callback.
                self.log("--PAGE NUMBER IS NOT NUMERIC--")
                current_page = None

            if current_page is not None and current_page < 5:
                next_page = str(current_page + 1)
                res = response.xpath(
                    "//li[@class='b_pag']/nav[@role='navigation']"
                    "//a[@aria-label='Page " + next_page + "']/@href"
                ).extract()
                for url in res:
                    # base_url already ends with "/", so drop the leading
                    # slash of a root-relative href to avoid "//search".
                    next_url = base_url + url.lstrip("/")
                    self.log("--URL TO FOLLOW--")
                    self.log(next_url)

                    request = Request(next_url,
                                      callback=self.bing_selector)
                    request.meta['id_person'] = id_person
                    request.meta['attr'] = base_attr
                    request.meta['search'] = search
                    request.meta['num_snip'] = num_snippet
                    yield request