Example #1
    def get_contact(self, response: Response) -> dict:
        """
        Gets the contact information.

        :param response: the response object
        :return: a dict containing the phone and email
        """
        contact = {
            'email': '',
            'phone': '',
            'website': response.url,
            'meet': ''
        }
        for result in response.xpath(
                'string(//*[@id="formTechPub1"]/div/table/tr/td[3])').getall():
            phone = extract_phone(result)
            if len(phone) > 0:
                contact['phone'] = phone[0]
        for text in response.xpath(
                '//*[@id="formTechPub1"]/div/table/tr/td[3]//a/@href').getall(
                ):
            if text.startswith('mailto:'):
                contact['email'] = text.split(':')[-1]
                break
        return contact
Example #2
 def response_transform(self, response: Response) -> Iterable[Thread]:
     """
     Makes a list of items from the response
     """
     forum_id = self.extract_forum_id_from_url(response.url)
     thread_strings = response.xpath('//tbody/tr[@class="thread"]/@id'
                                     ).extract()  # gives 'thread#######'
     thread_authors = response.xpath(
         '//tbody//td[@class="author"]/a/text()').extract()
     titles = response.xpath('//a[@class="thread_title"]/text()').extract()
     views = response.xpath('//td[@class="views"]/text()').extract()
     replies = response.xpath('//td[@class="replies"]/text()').extract()
     # parse everything
     for i in range(len(thread_strings)):
         thnum = re.search(r'(\d{7})', thread_strings[i]).group(0)
         author = thread_authors[i]
         title = titles[i]
         vw = views[i]
         reply = replies[i]
         if vw == '-' or reply == '-':  # skip admin threads
             continue
         # print(str([thread_authors,titles,views,replies]))
         item = Thread(int(thnum), title, author, int(vw), int(reply),
                       int(forum_id))
         yield item
Example #3
    def parse(self, response: Response) -> Iterator[Optional[Dict[str, Any]]]:
        """Parse reponse from IseWan Vessel Traffic Service Centre website."""

        reported_date = response.xpath(
            '//div[@class="_inner"]/p/text()').extract_first()
        events = []  # holds the sequential list of vessel lineups for advanced parsing

        table = response.xpath('//table[@class="generalTB"]')
        for row_idx, row in enumerate(table.xpath('.//tr')):
            # first row of source table is always the header
            if row_idx == 0:
                headers = row.xpath('.//th/text()').extract()
                continue

            # subsequent rows are exclusively vessel movements only
            raw_item = row_to_dict(row, headers)

            # contextualise item with meta info
            raw_item.update(provider_name=self.provider,
                            reported_date=reported_date)

            # standardize character width
            for key, value in raw_item.items():
                raw_item[key] = may_strip(_standardize_char_width(value))

            event = normalize.process_item(raw_item)
            if event:
                events.append(event)

        # combine arrival and departure events into a single 'PortCall' datatype
        for event in events:
            yield from normalize.combine_event(event, events)
Example #4
    def parse(self, response: Response):

        all_link = response.xpath('//a/@href')
        links = []
        for link in all_link:
            url = link.extract()
            url = response.urljoin(url)
            links.append(url)
            log.info(url)

        is_dirs = []
        all_text = response.xpath('/html/body/pre/text()')
        for _text in all_text:
            text: str = _text.extract()
            text = text.strip()
            laststr = text.split(' ')[-1]
            print("-->" + laststr + "<--")
            is_dirs.append(laststr == '-')

        for i in range(len(all_link)):
            item = PackagesItem()
            url = links[i]
            if '../' in url:
                continue
            is_dir = is_dirs[i]
            item['url'] = url
            item['is_dir'] = is_dir

            if is_dir:
                yield scrapy.Request(url, callback=self.parse)
            else:
                yield item
Example #5
    def ads_item_parse(self, response: Response):
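        # scrape the post fields, store them in the habr_blog collection, and yield the same dict as an item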
        db = self.data_base_client['db_habr']
        collection = db['habr_blog']
        title = response.xpath(
            '//span[@class="post__title-text"]/text()').extract_first()
        url_stat = response.request.url
        name_autor = response.xpath(
            '//span[contains(@class, "user-info__nickname")]//text()'
        ).extract_first()
        href_autor = response.xpath(
            '//header[contains(@class, "post__meta")]//a//@href'
        ).extract_first()
        images = response.xpath('//img/@src').getall()

        collection.insert_one({
            'title': title,
            'url_stat': url_stat,
            'name_autor': name_autor,
            'href_autor': href_autor,
            'images': images
        })

        yield {
            'title': title,
            'url_stat': url_stat,
            'name_autor': name_autor,
            'href_autor': href_autor,
            'images': images
        }
Example #6
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath(
             "//div[@class='view-content']/div[contains(@class,'views-row')]/div/h3/a"
     ):
         text = link.xpath("text()").get()
         url = link.xpath("@href").get()
         self.log("find technology {}/{}".format(text, url),
                  level=logging.INFO)
         patent_links.append(url)
     # for next page
     next_page = response.xpath("//li[@class='pager-next']/a/@href").get()
     if next_page is not None:
         self.log('process page {}'.format(next_page), level=logging.INFO)
         yield response.follow(
             url=next_page,
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = self.parse_name_from_url(p)
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #7
    def get_contact(self, response: Response) -> dict:
        """
        Gets the contact information.

        :param response: the response object
        :return: a dict containing the phone and email
        """
        contact = {
            'email': '',
            'phone': '',
            'website': response.url,
            'meet': ''
        }
        # for phone number
        for text in response.xpath(
                "//div[@class='information']/ul/li/text()").getall():
            result = extract_phone(text)
            if len(result) > 0:
                contact['phone'] = result[0]
                break
        for text in response.xpath(
                "//div[@class='information']/ul/li/a/@href").getall():
            if text.startswith('mailto:'):
                contact['email'] = text.split(':')[-1]
                break
        return contact
Example #8
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath("//a[@class='lead-in']/@href").getall():
         patent_links.append(link)
     # for next page
     next_page = response.xpath(
         "//div[@class='nav-previous']/a/@href").get()
     if next_page is not None:
         self.log('process page {}'.format(next_page), level=logging.INFO)
         yield response.follow(
             url=next_page,
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = p.split('/')[-2]
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #9
 def extract_table(
         response: Response) -> Generator[Tuple[str, str], None, None]:
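     # versions and links come from parallel table columns; pair them up row by row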
     versions = response.xpath(
         TPLinkGPL.XPATH['table_device_version']).extract()
     links = response.xpath(TPLinkGPL.XPATH['table_device_link']).extract()
     for version, link in zip(versions, links):
         yield version.strip(), link.strip()
Example #10
    def parse_thread(self, response: Response) -> Iterator[Request]:
        """
        Parse a thread, gathering the URLs within.

        A thread consists of:
        - Email URLs
        - Next Page Pagination URL

        See https://marc.info/?t=111957107900001&r=1&w=2

        :param response: Scrapy response
        :return: Request object callback to sub-parsing functions.
        """
        self.logger.info(f"parse_thread: {response.url}")

        email_urls = response.xpath(
            "//a[contains(@href, '&m=')]/@href").extract()
        email_urls = [self.marc_url + u for u in email_urls]

        for url in email_urls:
            yield Request(url, self.parse_email)

        next_page_url = response.xpath(
            "//pre//a[contains(text(), 'Next')][1]//@href").extract()

        if any(next_page_url):
            next_url = self.marc_url + next_page_url[0]
            yield Request(next_url, callback=self.parse_thread)
Example #11
    def response_transform(response: Response):
        """
        Makes a list of items from the response
        """
        forum_id = extract_forum_id_from_url(response.url)
        thread_strings = response.xpath(
            '//tbody/tr[contains(@class,"thread")]/@id').extract(
            )  # gives 'thread#######'
        thread_authors = response.xpath(
            '//tbody/tr[@id]/td[@class="author"]/a/text()').extract()
        thread_author_ids = BQThreadSpider.get_thread_author_ids(response)
        titles = response.xpath('//a[@class="thread_title"]/text()').extract()

        if not (len(titles) == len(thread_author_ids)
                and len(thread_author_ids) == len(thread_authors)
                and len(thread_authors) == len(thread_strings)):
            log.warning(
                "Extracted components do not match on page %s: titles=%d, author ids=%d, authors=%d, thread ids=%d",
                response.url, len(titles), len(thread_author_ids),
                len(thread_authors), len(thread_strings))
        # parse everything
        for i in range(len(thread_strings)):
            thnum = re.search(r'(\d{7})', thread_strings[i]).group(0)
            author = thread_authors[i]
            title = titles[i]
            aid = thread_author_ids[i]
            # print(str([thread_authors,titles,views,replies]))
            item = ThreadAvro(int(forum_id), int(thnum), title, author,
                              int(aid), False)
            yield item
Example #12
    def get_contact(self, response: Response) -> Tuple[dict, dict]:
        """
        Gets the contact information.

        :param response: the response object
        :return: a tuple of two dicts, one for the user and the other for the contact information
        """
        contact = {
            'email': '',
            'phone': '',
            'website': response.url,
            'meet': ''
        }

        # manager
        name = response.xpath("//dd[@class='manager']/a/text()").get()
        link = response.xpath("//dd[@class='manager']/a/@href").get()
        manager = create_user()
        manager['name'] = name
        manager['ref'] = link
        tag = response.xpath("//dd[@class='manager']/div/em[1]/text()").get()
        if tag is not None and isinstance(tag, str):
            manager['tag'] = remove_empty_string_from_array(tag.split(', '))
        contact['phone'] = response.xpath(
            "//dd[@class='manager']/div/em[2]/text()").get()
        manager['contact'] = contact
        manager['contact']['website'] = link
        self.log('find manager {} with contact {}'.format(manager, contact),
                 level=logging.DEBUG)
        return manager, contact
Example #13
    def parse_forum(self, response: Response):
        forum_url_query = urllib.parse.urlparse(response.url).query
        forum_id = int(urllib.parse.parse_qs(forum_url_query)['id'][0])
        forum_title = response.xpath(
            '//div[@id="pun-main"]/h1/span/text()').get()

        section_id = self.forums[forum_id]

        yield ForumItem(id=forum_id, title=forum_title, section_id=section_id)

        for topic in response.xpath(
                '//div[@class="forum"]/div[@class="container"]//tbody//tr/td[@class="tcl"]/div[@class="tclcon"]'
        ):
            topic_url = topic.xpath('a/@href').get()
            topic_url_query = urllib.parse.urlparse(topic_url).query
            topic_id = int(urllib.parse.parse_qs(topic_url_query)['id'][0])

            self.topics[topic_id] = forum_id

            yield response.follow(topic_url, self.parse_topic)

        next_page_url = response.xpath(
            '//div[@class="pagelink"]/a[@class="next"]/@href').get()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse_forum)
Example #14
def scrape_chapter(response: Response):
    book_num = response.meta['book_num']
    file_name = response.url.replace(
        "https://practicalguidetoevil.wordpress.com", "")
    file_name = file_name.strip("/").replace("/", "_")

    chapter_title = response.css("header > h1").xpath("text()").extract_first()
    paragraphs = response.xpath('//*[starts-with(@id, "post")]/div/div/p')
    with open(f"chapters/Book {book_num}/{file_name}.txt",
              "w",
              encoding="utf8") as file:
        # just makes a readable txt file - this removes useful stuff like italics/bold
        file.write(f"---{chapter_title}---")
        file.write("\r\n\r\n")
        for p in paragraphs:
            p_str = " ".join(p.xpath("*//text()").extract())
            if p_str == "":
                p_str = " ".join(p.xpath("text()").extract())
            file.write(p_str)
            file.write("\r\n")

    with open(f"sources/Book {book_num}/{file_name}.html",
              "w",
              encoding="utf8") as file:
        # "HTML" source
        file.write(
            response.xpath(
                '//*[starts-with(@id, "post")]/div/div').extract_first())
    print(f"Scraped Book {book_num}: {chapter_title}")
Example #15
    def parse(self, response: Response):
        for row in response.xpath(
                '//div[@class="usertable"]//div[@class="container"]//tbody/tr'
        ):
            profile_url = row.xpath('td[@class="tcl"]//a/@href').get()
            yield response.follow(profile_url, callback=self.parse_profile)

        next_page_url = response.xpath(
            '//div[@class="pagelink"]/a[@class="next"]/@href').get()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse)
Example #16
 def parse_detail(self, response: Response):
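     # pull the title, course name and video URL from the detail page; file_urls feeds Scrapy's files pipeline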
     XPATH_TITLE = "//div[@class='text']//h4[1]/text()"
     XPATH_COURSE = "//div[@class='childtitle']//p/text()"
     XPATH_VIDEO = "//video/@src"
     title = response.xpath(XPATH_TITLE).get()
     course = response.xpath(XPATH_COURSE).get()
     video_url = response.urljoin(response.xpath(XPATH_VIDEO).get())
     return XdvideoItem(title=title,
                        course=course,
                        file_urls=[video_url],
                        episode=response.meta["n"])
Example #17
 def _get_lyrics_text(response: Response) -> str:
     selectors: List[Selector] = response.xpath("//pre")
     if not selectors:
         try:
             return response.body.decode("latin1")
         except Exception:
             raise OhhlaException(f"Skipping {response.url}; could not decode into 'latin1' encoding.")
     elif len(selectors) > 1:
         raise OhhlaException(f"Skipping {response.url}; non-conformant for a song page.")
     else:
         return response.xpath("//pre")[0].root.text
Example #18
    def parse(self, response: Response, **kwargs):
        if self.url_to_crawl:
            yield response.follow(url=self.url_to_crawl, callback=self.parse_residences)
        else:
            residences = response.xpath("//a[contains(@class,'detalii-proprietate')][contains(.,'Vezi detalii')]/@href").getall()
            residences = list(set(residences))

            yield from response.follow_all(urls=residences, callback=self.parse_residences)

            next_page = response.xpath("//a[@class='inainte butonpaginare']/@href").get()
            if next_page:
                yield response.follow(url=next_page, callback=self.parse)
Example #19
    def single_parse(self, response: Response):
        self.debug(response)
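        # retry the same URL (bypassing the dupe filter) when the product title is missing from the response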
        if not response.xpath('//h1[@id="title"]/*/text()').get('').strip():
            return Request(response.url,
                           self.single_parse,
                           dont_filter=True,
                           errback=self.errors('single_parse'))
        log('product')
        now = dt.now(timezone('Asia/Tokyo'))
        product = AmazonItem()

        product['time'] = now.strftime('%Y-%m-%dT%H-%M-%S')
        product['title'] = response.xpath('//h1[@id="title"]/*/text()').get(
            '').strip()
        product['url'] = response.url
        review = response.css('span#acrCustomerReviewText::text').get('')
        product['review_num'] = review[0:-4] if review else 0
        product['description'] = '\n'.join([
            x.strip() for x in response.css(
                '#feature-bullets > ul > li *::text').getall()
            if x.strip() not in ('', 'モデル番号を入力してください', 'これが適合するか確認:')
        ])
        seller = response.css('a#sellerProfileTriggerId')
        if seller:
            shop_name = seller.css('*::text').get('')
            seller_id = get_query_val(seller.attrib['href'], 'seller')
            shop_url = f'{amazon_url}/sp?seller={seller_id}' if seller_id else ''
        elif response.xpath('//*[@id="merchant-info"]/a'):
            shop_name = 'Amazon.co.jp'
            shop_url = 'https://www.amazon.co.jp/gp/help/customer/display.html?nodeId=202008070'
        else:
            shop_name = '-'
            shop_url = ''
        product['shop_name'] = shop_name
        product['shop_url'] = shop_url
        product['categories'] = ' > '.join([
            el.get().strip() for el in response.css(
                '#wayfinding-breadcrumbs_feature_div > ul > li > span > a::text'
            )
        ])
        if shop_url == 'https://www.amazon.co.jp/gp/help/customer/display.html?nodeId=202008070':
            product['shop_address'] = '〒153-0064 東京都目黒区下目黒1-8-1 日本'
            yield product
        elif shop_url:
            yield Request(shop_url,
                          self.shop_parse,
                          meta={'product': product},
                          dont_filter=True,
                          errback=self.errors('single_parse', response.url))
        else:
            product['shop_address'] = '---'
            yield product
Example #20
    def _document(self, response: Response,
                  **kwargs: dict) -> Generator[dict, None, None]:
        link = kwargs['url']
        title = response.xpath(
            '//h1[@class="documentFirstHeading"]/text()').get()
        paragraphs = response.xpath(
            '//div[@class="field-item even"]//p[string-length(text()) > 3 and not(@class)]/text()'
        ).getall()
        img = response.xpath(
            '//div[@class="field-item even"]//a[not(@class) and @target="_blank"]/img/@src'
        ).get()

        yield {'url': link, 'title': title, 'body': paragraphs, 'img': img}
Example #21
 def parse(self, response: Response, **kwargs):
     a_tags = response.xpath(
         "//div[@class='search-lists-container']//div[@class='car-name-left']/h4/a"
     )
     for a in a_tags:
         url = a.xpath('./@href').get()
         yield Request(url=response.urljoin(url),
                       callback=self.parse_detail)
     next_page_link = response.xpath(
         "(//li[@class='pagination-li pag-next']/a/@href)[1]").get()
     if next_page_link is not None:
         yield Request(url=response.urljoin(next_page_link),
                       callback=self.parse)
Example #22
 def readScript(self, response: Response):
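     # write the script text (HTML tags stripped) to data/script<N>.txt until maxNum scripts are saved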
     if self.num < self.maxNum:
         script = response.xpath(
             "//td[@class='scrtext']/pre").extract_first()
         if not script:
             script = response.xpath(
                 "//td[@class='scrtext']").extract_first()
         if script:
             if not os.path.exists("data"):
                 os.mkdir("data")
             with open("data/script" + str(self.num) + ".txt", "w+") as f:
                 f.write(remove_tags(script))
                 self.num += 1
Example #23
    def get_contact(self, response: Response) -> dict:
        """
        Gets the contact information.

        :param response: the response object
        :return: a dict containing the phone and email
        """
        contact = {'email': '', 'phone': '', 'website': response.url, 'meet': ''}
        contact['email'] = response.xpath("string(//*[@id='email'])").get()
        phone = extract_phone(response.xpath("//*[@id='PhoneNumber']/@onclick").get())
        if len(phone) > 0:
            contact['phone'] = phone[0]
        self.log('Found contact {}'.format(contact), level=logging.DEBUG)
        return contact
Example #24
 def parse_item(self, response: Response):
     item = PhotoSetItem()
     item['title'] = response.xpath(
         '//div[@class="title"]/h1/text()').extract()[0]
     info = response.xpath('//div[@class="info"]/left/text()').extract()[0]
     item['datetime_published'], item['author'], item['source'] = [
         e[3:] for e in re.split(r'\xa0+', info)
     ]
     item['url'] = response.url
     item['tags'] = '/'.join(
         response.xpath(
             '//div[@class="tags"]/a[@target="_blank"]/text()').extract())
     item['description'] = response.xpath(
         '//div[@class="text"]/text()').extract()[0]
     yield item
Example #25
    def add_keywords(self, response: Response) -> list:
        """
        Obtain the keywords of the patent

        :param response: response
        :return list of keywords
        """
        categories = response.xpath(
            "//div[@class='ncd-data otherdata-categories display-block indented ']//a/text()").getall()
        try:
            # keywords may not be present on the page
            categories.extend(response.xpath("//div[@class='ncd-data otherdata-keywords']/p/text()").get().split(', '))
        except AttributeError:
            # .get() returned None, so there are no keywords to add
            pass
        return categories
Example #26
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath(
             '//*[@id="formTechPub1"]/div/table[2]/tr/td/a'):
         text = link.xpath("text()").get()
         url = link.xpath("@href").get()
         self.log("find technology {}/{}".format(text, url),
                  level=logging.INFO)
         patent_links.append(url)
     # for next page
     total_result = self.statictics(response)
     self.page += 1
     if self.page * self.item_per_page < total_result:
         self.log('process page {}'.format(self.page), level=logging.INFO)
         yield response.follow(
             url=self.next_page_template.format(self.page),
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = self.parse_name_from_url(p)
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #27
    def get_contact(self, response: Response) -> dict:
        """
        Gets the contact information.

        :param response: the response object
        :return: the contact information
        """
        contact = {"website": "", "meet": "", "email": "", "phone": ""}
        email = response.xpath("//div[@class='c_tp_contact']/a/text()").get()
        if email is not None:
            contact['email'] = email
        phone = extract_phone(
            response.xpath("string(//div[@class='c_tp_contact'])").get())
        if len(phone) > 0:
            contact['phone'] = phone[0]
        return contact
Example #28
    def parse_content(self, response: Response):
        if response.status == 200:
            contents = response.xpath('//*[@id=$id]//*/text()',
                                      id='text110').extract()
            title = response.meta.get('title')

            return {'title': title, 'contents': contents}
Example #29
    def parse(self, response: Response, **kwargs):
        for script in response.xpath('//script/text()').getall():
            # Look for the specific script tag we want
            if 'INITIAL_STATE' in script:
                # Extract the interesting part from the script tag
                m = re.match(r'window\.INITIAL_STATE\s+=\s+({[\s\S]+});',
                             script)
                if m is None:
                    continue

                # Decode it properly, handling annoying unicode escapes and nonsense from the site renderer
                custom_demjson = CustomJSON(json_options=demjson.json_options(
                    compactly=False))
                decoded = custom_demjson.decode(m.group(1),
                                                encoding='unicode-escape')

                # Write a proper valid JSON file out
                # with open('example.json', 'w', encoding='utf-8') as file:
                #     file.write(custom_demjson.encode(decoded))

                raw_data = decoded['searchData']
                word = Word.from_raw(data=raw_data)

                urls = word.get_urls()
                new = urls - self.queue
                self.queue.update(new)

                if len(new) > 0:
                    print(f'Found {len(new)} more URLs.')
                return response.follow_all(new)
Example #30
 def extract_multi_firmware(
         response: Response) -> Generator[Tuple[str, str], None, None]:
     names = response.xpath(TPLinkGPL.XPATH['device_names_multi']).extract()
     links = response.xpath(TPLinkGPL.XPATH['device_links_multi']).extract()
     for device, link in zip(names, links):
         yield (device.strip(),
                f'https://www.tp-link.com/phppage/gpl-res-list.html{link.strip()}&appPath=de')