Example No. 1
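    # A spider callback: "chain" is assumed to be itertools.chain, and
    # parse_news is assumed to be another callback on the same spider.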
    def parse(self, response: TextResponse):
        latest_articles = response.css(
            'main#content article[class*="expanded-feature"] *[class*="title"] a'
        )
        more_stories = response.css('ol[class*="list"] *[class*="title"] a')
        yield from response.follow_all(chain(latest_articles, more_stories),
                                       callback=self.parse_news)
Example No. 2
    def parse(self, response: TextResponse, **kwargs):
        if self.n_pages == self.max_pages:
            return

        self.n_pages += 1

        links = response.css("#recent p.title > a::attr(href)").extract()

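        # listing pages contain ad links to follow; ad detail pages contain
        # none, so the ad itself is parsed and stored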
        if links:
            for link in links:
                # only follow ads that are not already stored in MongoDB
                if self.db.rentals.count_documents({"_id": link},
                                                   limit=1) == 0:
                    yield response.follow(link, callback=self.parse)
        else:
            ad = parse_ad_page_html(response.body)
            with contextlib.suppress(DuplicateKeyError):
                self.db.rentals.insert_one(asdict(ad))
                yield ad

        next_page = response.css(".next a")

        if next_page:
            yield response.follow(next_page[0], callback=self.parse)
Example No. 3
def run():
    response = requests.get("https://en.wikipedia.org/wiki/Quantum_mechanics")
    response.raise_for_status()
    response = TextResponse(
        body=response.content,
        url="https://en.wikipedia.org/wiki/Quantum_mechanics")

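    # Wrapping the downloaded HTML in a TextResponse makes Scrapy's
    # .css()/.xpath() selectors available outside of a spider.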
    response.css('div.printfooter > a::text').extract_first()
    soup = BeautifulSoup(response.text, 'html.parser')

    # collect the href of every <a> tag on the page
    a_tags = soup.find_all('a')
    s = [tag.get('href') for tag in a_tags]

    # keep only external https links (skip internal /wiki/ and en.wikipedia.org links)
    wikis = []
    for href in s:
        href = str(href)
        if re.search(r'^/wiki/.+', href) or re.search(
                r'^https://en.wikipedia.org.+', href):
            continue
        if re.search(r'^https://.+', href):
            wikis.append(href)
    del wikis[2]

    print(wikis)
Example No. 4
def jobs_scraper(url):
    page = requests.get(url)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")
    companies_name = response.xpath(
        "//p[@class='job_list_company_title']/text()").extract()
    vac_name = response.xpath("//p[@class='font_bold']/text()").extract()
    base_url = "https://staff.am"
    urls = response.xpath(
        "//div[@class='web_item_card hs_job_list_item']/a/@href").extract()
    vacs_url = [base_url + i for i in urls]
    deadline1 = response.css(
        "div[class = 'job-inner job-list-deadline'] >p:not([class='job_location'])"
    )
    deadline2 = [i.css('::text').extract()[1] for i in deadline1]
    deadline = [i.replace('\n', " ") for i in deadline2]
    location1 = response.css(
        "div[class = 'job-inner job-list-deadline'] >p[class='job_location']")
    location2 = [i.css("::text").extract()[1] for i in location1]
    location = [i.replace('\n', "").strip() for i in location2]
    return pd.DataFrame({
        "Companies": companies_name,
        "Vacancies": vac_name,
        'Links': vacs_url,
        "Deadline": deadline,
        'Location': location
    })
Example No. 5
    def parse(self, response: TextResponse):
        # getting data for productdata object
        r_url = response.url
        r_page = response.text
        r_time = datetime.now()

        print("scraping for productData: {}".format(r_url))

        dct = product_data_html_parse(response.css(".prodtable").get())
        remove = False
        try:
            # get price
            r_price = dct["RSP"]
            r_price = "".join(
                re.findall(r"([\d,.])",
                           r_price))  # use regex to remove the currency symbol

            # get brand
            r_brand = dct['Brand']

            # get item name
            r_itemname = response.css(
                ".productpagedetail-inner .prodname::text").get()

            # get item size
            r_size = dct['Pack Size']
            # r_size = "".join(re.findall(r"([\d,.])", r_size))
        except KeyError as e:
            print(e)
            print("Missing data, removing this product entry")
            remove = True

        # Since Product and ProductData have a one-to-one relationship, they share the same url.
        # Iterate through the Product table, find the matching url and create the ProductData
        # SQLAlchemy object linked to that Product object.
        Product_table = crawlProduct.dbSession.query(Product).all()
        for i in Product_table:
            # if there is a matching url, we found the matching product
            if i.url == r_url:
                # if any of the scraped data was missing, delete the product entry
                if remove:
                    crawlProduct.dbSession.query(Product).filter(
                        Product.id == i.id).delete()
                else:
                    product_data_object = ProductData(
                        url=r_url,
                        html=r_page,
                        date=r_time,
                        price=r_price,
                        brand=r_brand,
                        itemName=r_itemname,
                        size=r_size,
                        product=i,
                    )
                    crawlProduct.dbSession.add(product_data_object)
                crawlProduct.dbSession.commit()
Example No. 6
    def parse(self, response):
        links = []
        for url in self.start_urls:
            time.sleep(3)
            self.driver.get(url)

            i = 0
            while i < 60:
                load_more = self.driver.find_element_by_class_name('load-more')
                try:
                    load_more.click()

                except Exception:
                    print("\n\n#######\n", "EXITED", "\n\n#########\n")
                    break
                i = i + 1

            resp = TextResponse(url=self.driver.current_url,
                                body=self.driver.page_source,
                                encoding='utf-8')
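            # the Selenium-rendered page_source is wrapped in a TextResponse so
            # the infinite-scroll feed can be queried with Scrapy selectors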
            links = links + resp.css('.feed-post-link::attr(href)').extract()
            tam = len(links)
            print("\n\n************** ", tam, " **************\n\n")
            if tam >= 1000:
                break

        c = 0
        setLinks = set(links)
        n = len(setLinks)
        for link in setLinks:
            c += 1
            print("**********************", c / (n * 1.0),
                  "**********************", "\n")

            self.driver.get(link)
            resp = TextResponse(url=self.driver.current_url,
                                body=self.driver.page_source,
                                encoding='utf-8')

            noticia = {
                'titulo':
                resp.css('.content-head__title::text').extract_first(),
                'subtitulo':
                resp.css('.content-head__subtitle::text').extract_first(),
                'texto':
                "".join(resp.css('.content-text__container::text').extract()),
            }

            if (noticia['titulo'] is not None
                    and noticia['subtitulo'] is not None):
                self.news.append(noticia)

            yield noticia

        self.driver.close()
Example No. 7
def quote_scraper(url):
    page = requests.get(url)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")
    author = response.css("small.author::text").extract()
    quotes = response.css("div.quote > span.text::text").extract()
    div_tags = response.css("div.tags")
    tags = [i.css("a.tag::text").extract() for i in div_tags]
    base_url = "http://quotes.toscrape.com/"
    rel_hyperlinks = response.css("small.author~a::attr(href)").extract()
    hyperlink = [base_url + i for i in rel_hyperlinks]
    return pd.DataFrame({"quotes": quotes, "author": author, "tags": tags, "author_page": hyperlink})
Example No. 8
    def parse(self, response):
        self.driver.get(response.url)
        website = self.website
        while True:
            if self.page_number > 10000:
                break
            try:
                prefix_size = 0
                next = self.driver.find_element_by_id(website.next_button_id)
                next.click()
                time.sleep(5)
                sel_response = TextResponse(
                    url=self.driver.current_url,
                    body=self.driver.page_source,
                    encoding='utf-8')
                if (prefix_size == 0):
                    suffix_size = get_suffix_size(
                        response.css('a::attr(href)').re(website.url_patterns),
                        sel_response.css('a::attr(href)').re(
                            website.url_patterns))
                # extract all links from current page that respect pattern. Remove common links found previously.
                links = sel_response.css('a::attr(href)').re(
                    website.url_patterns)[prefix_size:-suffix_size]
                prefix_size += len(links)
                links = set(links)
                if (not links):
                    self.logger.info('No link found. Stopping scraper.')
                    return
                self.page_number += 1
                # if website uses relative url, prepend all links with domain name
                if (website.relative_url):
                    links = (response.urljoin(link) for link in links if link)

                # only keep unvisited links
                links = (link for link in links
                         if not Article.objects.filter(url=link))

                for link in links:
                    yield response.follow(
                        link,
                        callback=parse_article,
                        meta={
                            'spider': self,
                            'website': website
                        },
                        priority=1)
            except Exception as e:
                break
        self.driver.close()
        self.driver.quit()
Example No. 9
def booksto_scrape(url,base_url="http://books.toscrape.com/"):
    page = requests.get(url)
    response = TextResponse(body=page.text,url=url,encoding="utf-8")

    book_title = response.css("h3>a::attr(title)").extract()
    book_rating = response.css("p[class^='star-rating']::attr(class)").extract()
    instock = response.css("p.price_color ~ p.instock::attr(class)").extract()
    instock_or_not = [i.replace("availability", " ") for i in instock]
    p_price = response.css("p.price_color::text").extract()
    price = [i.replace("Â", "") for i in p_price]
    book_page_URL = response.css("h3 >a::attr(href)").extract()
    book_picture_URL = response.css("img::attr(src)").extract()
    base_url = "http://books.toscrape.com/catalogue/"
    book_page_URL2 = [base_url + i for i in book_page_URL]
    book_picture_URL2 = [base_url + i for i in book_picture_URL]
    rating = []
    for i in book_rating:
        rating.append(i.replace("star-rating", ""))
    book_genre = []
    book_description = []
    
    for i in book_page_URL2:
        page = requests.get(i)
        response = TextResponse(body=page.text,url=i,encoding="utf-8")

        book_genre.append(response.css("li~li~li > a::text")[0].extract())
        book_description.append(response.css("article[class='product_page'] > p::text").extract_first())
    
    return pd.DataFrame({
        "price": price,
        "book_title": book_title,
        "rating": rating,
        "instock_or_not": instock_or_not,
        "book_genre": book_genre,
        "book_description": book_description,
        "book_page_URL2": book_page_URL2,
        "book_picture_URL2": book_picture_URL2
    })
Example No. 10
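    # The endpoint answers with JSON whose goods['html'] field is an HTML fragment;
    # wrapping that fragment in a TextResponse lets it be parsed with CSS selectors.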
    def parse_unit(self, response: TextResponse):
        resp = TextResponse(
            url=response.url,
            body=json.loads(response.body.decode('utf-8'))['goods']['html'],
            encoding='utf-8')
        base_url = response.request.headers['user_base_url']
        units = []
        for item in (resp.css('table#search_results tr.highlight') +
                     resp.css('table#search_results tr.editors_choise')):
            url = self.strip(item.css('a.t::attr(href)').extract_first())
            title = self.strip(item.css('a.t span::text').extract_first())
            units.append({'base_url': base_url, 'url': url, 'title': title})
        return {'data': units}
Example No. 11
def movie_scraper(url,base_url="https://www.imdb.com/chart/moviemeter/"):
    page = requests.get(url)
    response = TextResponse(body=page.text,url=url,encoding="utf-8")
    title = response.css("td.titleColumn >a ::text").extract()
    release_year = response.css("td >span.secondaryInfo::text").extract()
    year = [i.replace("(" , "").replace(")" , "") for i in release_year]
    ratings_table = response.css("td.ratingColumn.imdbRating")
    rating = [i.css('strong::text').extract_first() for i in ratings_table]
    movurl = response.css("td.titleColumn ::attr(href)").extract()
    hyperlink = 'https://www.imdb.com'
    movie_hyperlink = [hyperlink + i for i in movurl]
    rank_div = response.css("div.velocity")
    rank_ext = [i.css("::text").extract_first() for i in rank_div]
    ranks = [i.replace('\n',"").replace("(no change)","") for i in rank_ext]  
    return pd.DataFrame({"Title":title,  "Year":year, "Ratings":rating, "Movie_Hyperlink": movie_hyperlink , 'Rank':ranks,})
Example No. 12
class Movies: 

    def __init__(self,URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text,url=self.URL,encoding="utf-8")

    def get_movies(self):
        titles = self.response.css("td.titleColumn>a::text").extract()
        year = self.response.css("td.titleColumn>span.secondaryInfo::text").extract()
        ranking = [i.css("div.velocity::text").extract() for i in self.response.css("td.titleColumn")]
        rating_td = self.response.css('td[class = "ratingColumn imdbRating"]')
        rating = [i.css('strong::text').extract() for i in rating_td]
        rel_hyperlinks = self.response.css("td.titleColumn > a::attr(href)").extract()
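        # "base_url" (e.g. "https://www.imdb.com") is assumed to be defined at module level.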
        hyperlink_movie = [base_url + i for i in rel_hyperlinks]
        return pd.DataFrame({"Titles":titles,"Year":year,"Hyperlink_movie":hyperlink_movie,"Ranking":ranking,"Rating":rating})
Example No. 13
class Jobs:
    def __init__(self,URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text,url=self.URL,encoding="utf-8")

    def get_vacancy(self):
        vac = self.response.xpath('//div[@class="job-inner job-item-title"]/p[@class="font_bold"]/text()').extract()
        return vac 

    def get_company(self):
        comp = self.response.xpath('//div[@class="job-inner job-item-title"]/p[@class="job_list_company_title"]/text()').extract()
        return comp

    def get_deadline(self):
        dl1 = self.response.css('div[class="job-inner job-list-deadline"] p::text').extract()
        dl2 = [''.join(x) for x in zip(dl1[0::2], dl1[1::2])]
        del dl2[1::2]
        dl = [i.replace("\n\n", "").replace("\n"," ").strip() for i in dl2]
        return dl 

    def get_location(self):
        loc = self.response.xpath('//div[@class="job-inner job-list-deadline"]/p[@class="job_location"]/text()').extract()
        loc = [i.replace('\n','').strip() for i in loc]
        return loc 

    def get_ind_page(self):
        ind_page = [base_url + i for i in self.response.xpath('//div[@class="list-view"]/div/div/a/@href').extract()]
        return ind_page

    def get_next(self):
        page = self.response.xpath('//ul[@class="pagination"]/li[@class="next"]/a/@href').extract()
        return page
Example No. 14
    def parse(self, response: TextResponse) -> Iterator[Union[Request, YelpService]]:
        """
        This is the default callback used by Scrapy to process downloaded responses, when their
        requests don’t specify a callback.

        The parse method is in charge of processing the response and returning scraped data
        and/or more URLs to follow.

        Args:
            :param response: the response to parse
        """
        # Check whether we are on the search results page
        if response.url.startswith("https://www.yelp.com/search?"):
            info_page_urls = response.css(".biz-name::attr(href)")

            # Check whether the search returned any results
            if info_page_urls:
                for url in info_page_urls[:self.max_results]:
                    # Join the URL found with the domain and yield a new Request for it,
                    # which will also be parsed by this method.
                    info_page = response.urljoin(url.extract())
                    yield Request(info_page)

        # We are on the info page, so the information can be extracted directly
        else:
            yield self._map_response(response)
Example No. 15
    def parse(self, response):
        self.driver.get(response.url)

        selector = TextResponse(url=response.url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        time.sleep(4)
        items = selector.css(
            '#education-EeiiqoQoZTGTLg2SBVVUiQ span::text').extract()
        time.sleep(2)
        print(items)
        '''    
    driver.get("https://resumes.indeed.com/resume/91d7f36dfe695dee?s=l%3Dnoida%26q%3Dpython%2520developer%26searchFields%3D")
    html = driver.page_source
    title = driver.find_element_by_class_name('locality').text
    

    start_urls = [
                'https://resumes.indeed.com/resume/91d7f36dfe695dee?s=l%3Dnoida%26q%3Dpython%2520developer%26searchFields%3D'
        ]

    def start_requests(self):        
        for url in self.start_urls:
            yield scrapy.Request(url=url,callback=self.parse)
        
    def parse(self,response):
        print("respo nse ===============================>",response.status,"<=========================")
        validating = response.xpath('//a')
        print(response.text)
        '''
Example No. 16
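    # This callback gathers the page's hidden <input> values and replays the site's
    # own form POST via FormRequest; self.api_url and parse_unit are assumed to be
    # defined elsewhere on the spider.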
    def parse(self, response: TextResponse):
        input_hidden = response.css('input[type="hidden"]')
        form_data = {
            'command': 'get_goods',
            'page': 'all',
            'store': 'msk-0_1721_1',
            'thumbnail_view': '2',
            'sch_good_id': '',
            'sch_id': '',
            'c_id': input_hidden.css('input[name="c_id"]::attr(value)').extract_first(),
            'fn': input_hidden.css('input[name="fn"]::attr(value)').extract_first(),
            'g_id': input_hidden.css('input[name="g_id"]::attr(value)').extract_first(),
            'def_sort': input_hidden.css('input[name="def_sort"]::attr(value)').extract_first(),
            'sys_all': input_hidden.css('input[name="sys_all"]::attr(value)').extract_first(),
        }
        return FormRequest(self.api_url,
                           headers={'user_base_url': response.url},
                           formdata=form_data,
                           callback=self.parse_unit)
Example No. 17
class Movies:
    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text, url=self.URL, encoding="utf-8")

    def scrape_movies(self):
        "Scrapes the movies, ratings, ranks and the hyperlinks"
        title = self.response.css("td.titleColumn a::text").extract()
        year = [str(i).strip('()') for i in self.response.css(".secondaryInfo:nth-child(2)::text").extract()]
        rank = list(range(1, 101))
        rating = []
        for i in self.response.css(".imdbRating"):
            rating.append(i.css("strong::text").extract_first())
        hyperlink = [base_url + i for i in self.response.css("td.titleColumn a::attr(href)").extract()]
        return title, year, rank, rating, hyperlink
Example No. 18
class Quotes:

    def __init__(self,URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text,url=self.URL,encoding="utf-8")

    def get_quotes(self):
        quotes = self.response.css("span.text::text").extract()
        authors = self.response.css("small.author::text").extract() 
        tags = [i.css("a.tag::text").extract() for i in self.response.css("div.tags")]
        hyperlinks = [base_url+i for i in self.response.css("small.author ~ a::attr(href)").extract()]
        return quotes, authors, tags, hyperlinks

    def get_next(self):
        next_url = self.response.css("li.next a::attr(href)").extract()
        return next_url
Example No. 19
class Book:
    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text,
                                     url=self.URL,
                                     encoding="utf-8")

    def get_next(self):
        "returns the link of the following page, if it exists"
        next_url = self.response.css("li.next a::attr(href)").extract()
        return next_url

    def get_title(self):
        title = self.response.css(
            'article[class="product_pod"] h3 a::attr(title)').extract()
        return title

    def get_rating(self):
        rating = self.response.css(
            'p[class*="star-rating"]::attr(class)').extract()
        rating = [i.replace('star-rating', '').strip() for i in rating]
        return rating

    def get_price(self):
        price = self.response.css(".price_color::text").extract()
        price = [i.replace('£', '') for i in price]
        return price

    def get_book_url(self):
        book_url = [
            temp_url + i for i in self.response.css(
                'article[class="product_pod"] h3 a::attr(href)').extract()
        ]
        return book_url

    def get_img_url(self):
        img_url = [
            temp_url + i
            for i in self.response.css('.thumbnail::attr(src)').extract()
        ]
        return img_url

    def get_inStock(self):
        in_stock = self.response.css('.instock::text').extract()
        in_stock = [i.replace('\n', '').strip() for i in in_stock]
        return in_stock[1::2]

    def get_genre(self):
        book_genre = self.response.css(
            '.breadcrumb li:nth-child(3) a::text').extract()
        return (book_genre)

    def get_desc(self):
        book_desc = self.response.xpath('//article/p/text()').extract()
        return book_desc
Example No. 20
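# The ChromeDriver storage bucket listing is XML whose <Prefix> entries correspond to
# version directories ("headers" and "params" are assumed to be defined elsewhere in the module).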
def get_chromedriver_version(chrome_version: str):
    url = 'https://chromedriver.storage.googleapis.com/'
    response = requests.get(url, headers=headers, params=params)
    response = TextResponse(body=response.content, url=url)

    for item in response.css('Prefix').extract():
        if chrome_version[0:7] in item:
            return re.search(r'(\d.*\d)', item)
Example No. 21
class Book:
    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text,
                                     url=self.URL,
                                     encoding="utf-8")

    def scrape_book(self):
        "Scrapes the book with all its info."
        title = self.response.css(
            'article[class="product_pod"] h3 a::attr(title)').extract()
        rating = self.response.css(
            'p[class*="star-rating"]::attr(class)').extract()
        rating = [i.replace('star-rating', '').strip() for i in rating]
        price = self.response.css(".price_color::text").extract()
        price = [i.replace('£', '') for i in price]
        book_url = [
            self.URL + i for i in self.response.css(
                'article[class="product_pod"] h3 a::attr(href)').extract()
        ]
        img_url = [
            self.URL + i for i in self.response.css('img::attr(src)').extract()
        ]
        in_stock = self.response.css('.instock::text').extract()
        in_stock = [i.replace('\n', '').strip() for i in in_stock]
        return title, rating, price, book_url, img_url, in_stock[1::2]

    def get_next(self):
        next_url = self.response.css("li.next a::attr(href)").extract()
        return next_url
Example No. 22
    def parse_node(self, response, node):
        item = NewsItem()
        item["title"] = node.xpath("title/text()").get()
        item["link"] = node.xpath("link/text()").get()
        item["date"] = datetime.datetime.strptime(
            node.xpath("pubDate/text()").get(), '%a, %d %b %Y %X +%f')
        item["category"] = node.xpath("category/text()").get()
        item["author"] = node.xpath("dc/author/text()").get()
        description = node.xpath("description/text()").get()
        description = TextResponse(response.url,
                                   body=description,
                                   encoding='utf-8')
        item["image"] = description.css("img ::attr('src')").get()
        item["content"] = get_description(
            description.css(":not(script)::text").getall())
        item["resume"] = item["content"]
        yield Request(item["link"], self.parse_item, meta={"item": item})
Example No. 23
        def __init__(self, response: TextResponse):
            self.response = response

            # parse the class table
            self.content_all = [tr.css('td') for tr in response.css('tr')]
            self.content = filter(lambda x: len(x) == 2, self.content_all)  # non-fields have only 1 td tag
            self.fields = {field_name.css('::text').get(): field_value
                           for field_name, field_value in self.content}

            # main fields
            self.class_id = [p for p in response.url.split('/') if len(p) > 0][-1]
            self.department_code = [p for p in response.url.split('/') if len(p) > 0][-2]
            self.course_code = self.department_code + " " + self._get_field("Number")

            # parse all fields
            self.instructor = self._get_field('Instructor', first_line_only=True)
            if self.instructor:
                self.instructor = re.sub(r'[\s-]+$', '', self.instructor)  # clean up
            self.course_title = self._get_course_title()
            self.course_subtitle = self._get_course_subtitle()

            # schedule
            self.scheduled_days = None
            self.scheduled_time_start = None
            self.scheduled_time_end = None
            self.location = None
            date_and_location = self._get_field('Day & Time')
            if date_and_location:
                tmp = date_and_location.split("\n")  # e.g.: TR 4:10pm-5:25pm\n825 Seeley W. Mudd Building
                date_and_time = tmp[0].split()  # e.g.: TR 4:10pm-5:25pm
                self.scheduled_days = date_and_time[0]  # e.g. TR
                t = date_and_time[1]  # e.g. 4:10pm-5:25pm
                self.scheduled_time_start, self.scheduled_time_end = t.split('-')
                self.location = None
                if len(tmp)>1:
                    self.location = tmp[1]

            self.section_key = self._get_field("Section key")

            self.open_to = self._get_field("Open To")
            if self.open_to:
                self.open_to = [s.strip() for s in self.open_to.split(',')]

            self.course_descr = self._get_field("Course Description")
            self.prerequisites = []
            if self.course_descr:
                self.prerequisites = ColumbiaClassListing.get_prerequisites(self.course_descr)
            self.points = self._get_field("Points")
            self.class_type = self._get_field("Type")
            self.method_of_instruction = self._get_field("Method of Instruction")
            self.department = self._get_field("Department")
            self.call_number = self._get_field("Call Number")
            self.campus = self._get_field("Campus")

            enrollment_string = self._get_field('Enrollment')
            self.enrollment_date, self.enrollment_current, self.enrollment_max \
                = self._get_enrollment(enrollment_string)
Example No. 24
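# Scrapes GitHub's rendered blob view of proxy.list, where each table cell holds one
# JSON object describing a proxy; only fast US proxies are kept.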
def get_proxies():
    r = requests.get("https://github.com/fate0/proxylist/blob/master/proxy.list")
    response = TextResponse(r.url, body=r.text, encoding='utf-8')
    res = []
    for i in response.css("td::text").getall():
        ii = json.loads(i)
        country = ii["country"] or ""
        if country == "US" and ii["response_time"] < 5:
            res.append(f"{ii['host']}:{ii['port']}")
    return res
Example No. 25
    def parse(self, response):
        self.driver.get(
            'http://www.hannovermesse.de/en/exhibition/exhibitors-products/advanced-search/'
        )

        wait = WebDriverWait(self.driver, 5)

        wait.until(
            EC.element_to_be_clickable((
                By.XPATH,
                '//*[@id="searchAP:search"]/section/div[6]/div/div/div[2]/div[2]'
            )))
        showallbutton = WebDriverWait(self.driver, 10).until(
            EC.element_to_be_clickable((
                By.XPATH,
                '//*[@id="searchAP:search"]/section/div[6]/div/div/div[2]/div[2]/a'
            )))
        showallbutton.click()
        self.driver.execute_script(
            "document.getElementById('searchAP:zb:442:r').click()")
        self.driver.find_element_by_xpath(
            '//*[@id="searchAP:searchButton2"]').click()

        #searchAP:zb:1:r Agriculture, forestry and fishing
        #searchAP:zb:18:r Mining and extracting rocks and earth
        #searchAP:zb:35:r Manufacturing industry
        #searchAP:zb:220:r Energy supply
        #searchAP:zb:229:r  Water supply, sewage and refuse disposal, sanitation and similar activities
        #searchAP:zb:239:r  Construction/construction industry
        #searchAP:zb:251:r  Sale, maintenance and repair of motor vehicles
        #searchAP:zb:291:r  Transportation and storage
        #searchAP:zb:312:r  Hotels and restaurants/lodging and catering
        #searchAP:zb:320:r Information and communication
        #searchAP:zb:351:r  Provision of financial and insurance services
        #searchAP:zb:357:r  Real estate activities
        #searchAP:zb:361:r Provision of freelanced, scientific and technical services
        #searchAP:zb:379:r Provision of other business activities
        #searchAP:zb:394:r Public adminstration, defence and social security
        #searchAP:zb:404:r  Education
        #searchAP:zb:412:r Human health and social work activities
        #searchAP:zb:425:r Arts, entertainment and recreation
        #searchAP:zb:430:r Recreational, cultural and sporting activities; other
        #searchAP:zb:439:r Households
        #searchAP:zb:440:r  Extra-territorial organisations and bodies
        #searchAP:zb:441:r All sectors, sector independent
        #searchAP:zb:442:r pupils, students

        # Now that the whole page is revealed, Scrapy can collect all the company URLs,
        # i.e. we follow the link for each company to reach its page and extract our data.
        response1 = TextResponse(url=response.url,
                                 body=self.driver.page_source,
                                 encoding='utf-8')
        for href in response1.css('.search-link ::attr(href)'):
            url = response1.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)
Example No. 26
def getimgdec(movies):
    try:
        r = requests.get('http://www.imdb.com/find?ref_=nv_sr_fn&q=' + movies)
        response = TextResponse(r.url, body=r.text, encoding='utf-8')
        spcial = response.css(
            '.findList .findResult .result_text a::attr(href)').extract_first()
        spcial = 'http://www.imdb.com' + spcial
    except (AttributeError, TypeError):
        # no search result found: fall back to the IMDb home page
        spcial = 'http://www.imdb.com/'
    return MoviesNameUrl[movies][1], MoviesNameUrl[movies][0], spcial
Example No. 27
class Quotes:
    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text,
                                     url=self.URL,
                                     encoding="utf-8")

    def get_quotes(self):
        return self.response.css("span.text::text").extract()

    def get_authors(self):
        return self.response.css("small.author::text").extract()

    def get_tags(self):
        "gets the tags all in one list"
        return self.response.css("div.tags > a.tag::text").extract()

    def get_author_link(self):
        return self.response.css("small.author ~ a::attr(href)").extract()
Example No. 28
class Pages:

    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text, url=self.URL, encoding="utf-8")

    def url_scraper(self):
        """
        Getting the urls of all top restaurants in Yerevan
        """
        url = [base_url + i for i in self.response.css('div.wQjYiB7z>span>a._15_ydu6b::attr(href)').extract()]
        return url
      
    def get_next(self):
        """
        If a NEXT button exists,get the next page's URL 
        """
        next_url = self.response.css("div[class='unified pagination js_pageLinks'] a[class='nav next rndBtn ui_button primary taLnk']::attr(href)").extract()
        return next_url
Example No. 29
    def parse(self, response: TextResponse):
        categories = []
        base_url = response.url
        for item in response.css('ul.nix-menu').xpath('li'):
            url = item.css('a::attr(href)').extract_first()
            title = item.css('a::text').extract_first()
            categories.append({
                'base_url': base_url,
                'url': url,
                'title': title
            })
        return {'data': categories}
Example No. 30
def books_scraper(url, base_url="http://books.toscrape.com/"):
    page = requests.get(url)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")
    prices = response.css("p.price_color::text").extract()
    price = [float(i.replace("£", "")) for i in prices]
    book_url = response.css("h3 >a::attr(href)").extract()
    pic_url = response.css("img::attr(src)").extract()
    star = response.css("p[class^='star-rating']::attr(class)").extract()
    star_rating = []
    for i in star:
        star_rating.append(i.replace("star-rating", ""))
    base_url = "http://books.toscrape.com/catalogue/"
    bookurl = [base_url + i for i in book_url]
    picurl = [base_url + i for i in pic_url]
    genere = []
    desc = []
    for i in bookurl:
        page = requests.get(i)
        response = TextResponse(body=page.text, url=i, encoding="utf-8")
        genere.append(response.css("li~li~li > a::text")[0].extract())
        desc.append(
            response.css(
                "article[class='product_page'] > p::text").extract_first())

    return pd.DataFrame({
        "Price": price,
        "BooksHyperlinks": bookurl,
        "PicsHyperlinks": picurl,
        "Star_Ratings": star_rating,
        'Generes': genere,
        "AboutBook": desc
    })
Example No. 31
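    # A parse_node callback (presumably on an XMLFeedSpider): the RSS item's <description>
    # is an HTML fragment, so it is wrapped in a TextResponse to pull out the image and text.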
    def parse_node(self, response, node):
        item = NewsItem()
        item['title'] = node.xpath('title/text()').get()
        item['link'] = node.xpath('link/text()').get()
        description = node.xpath('description/text()').get()
        description = TextResponse(response.url,
                                   body=description,
                                   encoding='utf-8')
        item['image'] = description.css("img ::attr('src')").get()
        item['resume'] = description.css(".K2FeedIntroText strong::text").get()
        item['content'] = description.css(".K2FeedFullText ::text ").getall()
        item['category'] = node.xpath('category/text()').get()
        item['author'] = node.xpath('author/text()').get()
        item['date'] = datetime.datetime.strptime(
            node.xpath("pubDate/text()").get(), '%a, %d %b %Y %X +%f')
        if not item["content"]:
            item["content"] = description.css("::text").get()
        if item["resume"] is None:
            item["resume"] = description.css("::text").get()
        if not item["image"] or '.jpg' not in item["image"]:
            return
        yield item
Example No. 32
    def _extract_service_phone(self, response: TextResponse) -> str:
        """
        Extracts the service phone from the response if it can be found, otherwise
        returns an empty string.

        Args:
            :param response: the response received from a `Request` object

        :return: the service phone if it can be found, otherwise an empty string
        """
        phone = response.css(".biz-phone::text").extract_first()
        if not phone:
            self.log("Cannot find the phone of the service: " + response.url, logging.ERROR)
            return ""
        else:
            return phone.strip()
Example No. 33
    def _extract_service_address(self, response: TextResponse) -> str:
        """
        Extracts the service address from the response if it can be found, otherwise
        returns an empty string.

        Args:
            :param response: the response received from a `Request` object

        :return: the service address if it can be found, otherwise an empty string
        """
        # The address information is formatted by using "<br>" tags, so, we need to extract all
        # items within the "<address>" tag and merge them at the end separated by commas.
        address = response.css(".street-address address::text").extract()
        if not address:
            self.log("Cannot find the address of the service: " + response.url, logging.ERROR)
            return ""
        else:
            return ', '.join(address).strip()