Пример #1
0
    def get_seller_data(self, url):
        """Fetch a seller page and hand the seller record to ``TheAuthour.write_seller``.

        The URL is first discarded from ``visited`` and then fetched with
        ``TheMiner.fetch_page``. When a page comes back, the record
        ``[url, self.website.platform, seller_name, location, website]`` is
        printed and written.

        NOTE(review): nothing in this method assigns ``seller_name``,
        ``location`` or ``website`` from ``soup``; they are only printed. The
        extraction statements appear to have been removed, so as written the
        first ``print`` fails unless ``seller_name`` exists at module level --
        confirm against the original implementation.
        """
        # Caller :: get_artwork_data_slave and get_seller_id
        visited.discard(url)
        soup = TheMiner.fetch_page(url)

        if soup is not None:

            # Seller's Name
            print(seller_name)
            # Code will break if seller's name is not found

            # Location
            # NOTE(review): `location` is only ever assigned in the handler
            # below, so this print reads an unbound local; the resulting error
            # is not an AttributeError and is therefore not caught here.
            try:
                print(location)
            except AttributeError:
                location = None

            # Website
            # NOTE(review): same pattern as `location` above.
            try:
                print(website)
            except AttributeError:
                website = None
            except TypeError:
                website = None

            # Positional order expected by TheAuthour.write_seller.
            bundle = [url, self.website.platform, seller_name, location, website]
            print(bundle)
            TheAuthour.write_seller(*bundle)
Пример #2
0
        def recurr(url):
            """Harvest artist links from one listing page, then crawl its pager links.

            Artist profile links found on the page are appended to
            ``self.artist_listings``; pagination links not seen before are
            appended to ``self.listy``, and ``recurr`` is then mapped over
            ``self.listy`` on a thread pool.

            Relative links are made absolute with ``self.link_maker``. If the
            expected markup is missing (an ``AttributeError`` while walking the
            tree) the URL is discarded from ``visited``.
            """
            soup = TheMiner.fetch_page(url, ghost=True)
            if soup is None:
                return

            # Because singulart keeps blocking ips, we ship everything inside a
            # try-except statement.
            try:
                artist_blocks = soup.find_all('figure', class_='pic-artist')
                print(len(artist_blocks))
                for artist in artist_blocks:
                    link = artist.figcaption.h2.a.get('href')
                    if link is None:
                        # Anchor without an href: nothing to record.
                        continue
                    if self.website.domain not in link:
                        # Fixed: the builtin `list` used to be passed here
                        # instead of the scraped link.
                        link = self.link_maker(link)
                    self.artist_listings.append(link)

                # Next pages
                pager = soup.find('div', class_='pagerfanta').find('nav')
                for anchor in pager.find_all('a'):
                    link = anchor.get('href')
                    if link is None:
                        continue
                    if self.website.domain not in link:
                        link = self.link_maker(link)
                    if link not in self.listy:
                        self.listy.append(link)

                with concurrent.futures.ThreadPoolExecutor() as executor:
                    trig = executor.map(recurr, self.listy)
                # Drain the result iterator so exceptions raised in workers
                # surface here.
                for trigger in trig:
                    pass
            except AttributeError:
                visited.discard(url)
Пример #3
0
    def get_seller_data(self, url):
        """Scrape a seller page and write the seller record.

        Discards ``url`` from ``visited``, fetches the page, and -- when a
        page is returned -- passes ``url``, ``self.website.platform``, the
        seller's name, location and website to ``TheAuthour.write_seller``.
        Location and website fall back to ``None`` when their markup is
        missing; a missing seller name is not handled and raises.
        """
        # Caller :: get_artwork_data_slave
        visited.discard(url)
        soup = TheMiner.fetch_page(url)
        if soup is None:
            return

        seller_box = soup.find('div', id='top-seller')
        # Code will break if seller's name is not found
        seller_name = seller_box.h1.text.strip()

        # Location: the last comma-separated part of the subtitle.
        try:
            subtitle_parts = seller_box.find('p', class_="subtitle").text.strip().split(',')
            location = subtitle_parts[-1].strip()
        except AttributeError:
            location = None

        # Website: href of the first anchor inside the "websites" list.
        try:
            website = str(soup.find('ul', id="websites").a['href']).strip()
        except (AttributeError, TypeError):
            website = None

        TheAuthour.write_seller(
            url, self.website.platform, seller_name, location, website
        )
Пример #4
0
    def key_maker(artist_url):
        """Build the artist key for the artist page at ``artist_url``.

        The URL is discarded from ``visited`` and fetched with
        ``TheMiner.fetch_page``. Name, country and "about" text are scraped
        and packed as ``[name, None, country, about]`` (the second slot is the
        birth year, not scraped here) for ``db.Artist.key_maker``.

        Returns the value produced by ``db.Artist.key_maker``, or ``None``
        when no page is returned. ``country`` and ``about`` are ``None`` when
        their markup is absent.
        """
        visited.discard(artist_url)
        soup = TheMiner.fetch_page(artist_url)
        if soup is None:
            return None

        # If an error occurs here, it is because the page layout has changed
        # and the code needs to be fixed.
        artist_resume = soup.find('div', class_='artist-resume').find('div', class_='artist-resume_text')
        name = artist_resume.h1.text.strip()
        print(name)

        try:
            # First line of the location paragraph, last comma-separated part.
            country = artist_resume.find('p', class_='location').text.strip().split('\n')
            country = country[0].split(',')
            country = country[-1].strip()
            print(country)
        except AttributeError:
            country = None

        # About will either be found and be some text or be None. Previously
        # a missing block raised AttributeError instead of yielding None.
        about_box = soup.find('div', id='about')
        about = about_box.text.strip() if about_box is not None else None

        # pack = [name, born, country, about]
        artist_data_pack = [name, None, country, about]
        return db.Artist.key_maker(artist_data_pack)
Пример #5
0
    def miner(self):
        """Run the full mining pipeline for this module.

        Miner's track: we land on the artwork listings page and pick the
        listings from there; sellers and artists are picked from the artwork
        pages, from which the artworks for sale are fetched.

        Steps, in order: collect artist listings, re-open the links in
        ``self.first_prod_list`` by discarding them from ``visited``, collect
        artwork listings, collect artwork data, then start the image download.
        """
        self.get_artist_listings()

        # So that the pages whose links we discarded can be visited as well.
        for product_link in self.first_prod_list:
            visited.discard(product_link)

        # get_artwork_listings_master -> get_artwork_listings_slave
        #   -> get_artist_data -> write_artist_data
        # After this call we are done with artist data.
        self.get_artwork_listings_master()

        self.get_artwork_data_master()

        # Data collection is complete for this module; download images now.
        TheMiner.sir_image_manager()
Пример #6
0
    def key_maker(artist_url):
        """Render an artist page in headless Firefox, record the artist, return its key.

        The page at ``artist_url`` is loaded with Selenium and parsed with
        BeautifulSoup. The first ``h2.font_2`` heading is taken as the
        artist's name and the second as the country; all ``p.font_8``
        paragraphs are concatenated into the "about" text.

        On success the key from ``db.Artist.key_maker`` is stored in
        ``KEY_INFO[artist_url]``, the artist is written through
        ``TheAuthour.write_artist`` and the key is returned. ``None`` is
        returned when no name heading is found.
        """
        options = Options()
        options.headless = True
        driver = webdriver.Firefox(options=options)
        try:
            visited.discard(artist_url)
            # Fixed: the driver was never pointed at the page, and the URL was
            # being passed as BeautifulSoup's parser argument.
            driver.get(artist_url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
        finally:
            # The soup is self-contained, so release the browser immediately;
            # `finally` also covers failures while loading the page.
            driver.quit()

        headings = soup.find_all('h2', class_='font_2')

        # Artist's name
        # If this fails, the page layout has changed and the code needs fixing.
        try:
            name = headings[0].text.strip()
        except IndexError:
            print(headings)
            return None

        # Country (a missing second heading raises IndexError, not AttributeError)
        try:
            country = headings[1].text.strip()
        except (AttributeError, IndexError):
            country = None

        # About: every paragraph's text followed by a single space.
        try:
            about = "".join(
                f"{paragraph.text.strip()} "
                for paragraph in soup.find_all('p', class_='font_8')
            )
        except (AttributeError, TypeError):
            about = None

        # pack = [name, born, country, about]
        artist_data_pack = [name, None, country, about]
        # Updating KEY_INFO dictionary.
        KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
        key = KEY_INFO.get(artist_url)
        # Updating the dB with artist listings.
        TheAuthour.write_artist(*artist_data_pack)
        return key
Пример #7
0
    def artist_id_slave(self, artist_url):
        """Resolve the artist id for ``artist_url``, scraping the page first.

        The URL is discarded from ``visited`` and fetched; the page is passed
        to ``self.get_artist_data``. The id is then looked up in two steps:
        ``KEY_INFO`` maps the URL to a key and ``ARTIST_INFO`` maps that key
        to the id.

        Returns the artist id, or ``None`` (after printing a diagnostic) when
        the page is not returned or either lookup misses.
        """
        visited.discard(artist_url)
        soup = TheMiner.fetch_page(artist_url)
        if soup is None:
            print("ARTIST_ID_SLAVE : Soup not returned")
            return None

        self.get_artist_data(soup, artist_url)

        # Getting the key from KEY_INFO
        if artist_url not in KEY_INFO.keys():
            print("ARTIST_ID_SLAVE : Could not find artist_id")
            return None
        key = KEY_INFO.get(artist_url)

        # Getting artist_id using the key from ARTIST_INFO
        if key not in ARTIST_INFO.keys():
            print("ARTIST_ID_SLAVE : Artist id not in ARTIST_INFO")
            return None
        return ARTIST_INFO.get(key)
Пример #8
0
    def get_artwork_listings_slave(self, url):
        """Collect unsold artwork links from an artist page, then scrape the artist.

        Artist info and artwork listings are available on the same page. Links
        of artworks whose "meta" text does not contain "Sold" are appended to
        ``self.artwork_listings`` (made absolute with ``self.link_maker`` when
        needed, duplicates skipped). Afterwards the page is passed to
        ``self.get_artist_data``.

        If the artist header is missing, or ``get_artist_data`` raises
        ``AttributeError``, the URL is discarded from ``visited`` and appended
        to ``self.listy`` for a later retry.
        """
        soup = TheMiner.fetch_page(url, ghost=True)
        if soup is None:
            return

        try:
            # Probe only: reading the artist name raises AttributeError when
            # the real page was not returned.
            _ = soup.find('div', class_='artist-intro').find('div', class_='content').h1.text

            containers = soup.find_all('div', class_='artist-container artist-container--details')
            print(f"BLOCK : {len(containers)}")
            try:
                for container in containers:
                    figures = container.find_all('figure', class_='artwork-item artwork-item--details')
                    print(f"ITEMS : {len(figures)}")

                    for figure in figures:
                        meta_text = figure.find('div', class_='meta').text.strip()
                        if "Sold" in str(meta_text):
                            continue
                        artwork_link = figure.find('a')['href']
                        if self.website.domain not in artwork_link:
                            artwork_link = self.link_maker(artwork_link)
                        if artwork_link not in self.artwork_listings:
                            self.artwork_listings.append(artwork_link)
            except AttributeError:
                # Malformed item: stop collecting listings, still scrape the
                # artist below.
                pass

            self.get_artist_data(soup, url)

        except AttributeError:
            print("B")
            # Urls that get blocked are discarded from visited and added to
            # listy for a recall.
            visited.discard(url)
            self.listy.append(url)
Пример #9
0
def get_artwork_data_slave(url):
    """Fetch an artwork page and locate the artist's URL on it.

    The page is fetched with ``TheMiner.fetch_page(url, ghost=True)``. If the
    outer lookup raises ``AttributeError`` the URL is discarded from
    ``visited`` and appended to ``self_listy``.

    NOTE(review): this looks like an excerpt of a nested function. It relies
    on free names (``self_website_domain``, ``self_link_maker``,
    ``self_listy``) and on ``self``, none of which are defined in the visible
    code, and nothing computed here is returned or stored -- confirm against
    the full implementation.
    """
    soup = TheMiner.fetch_page(url, ghost=True)
    if soup is not None:
        # Initiation

        try:
            # Artist_url
            # First holds the list of column divs inside the artwork-focus
            # block; rebound below to the href string.
            artist_url = soup.find('div', class_='artwork-focus').find_all('div', class_='col-md-12 col-lg-6')
            try:
                # The second column's <h2> anchor carries the artist link.
                artist_url = artist_url[1].find('h2').a['href']
                if self_website_domain not in artist_url:
                    artist_url = self_link_maker(artist_url)
            except AttributeError:
                artist_url = None

                # Artist_id
                # NOTE(review): as indented, this runs only inside the except
                # branch, and `self` is not a parameter of this function.
                artist_id = self.artist_id


        except AttributeError:
            # Comes here if the page is not returned by the website.
            visited.discard(url)
            self_listy.append(url)
Пример #10
0
    def get_seller_data(self, url):
        """Scrape a seller (partner) page and write the seller record.

        Discards ``url`` from ``visited``, fetches the page, and -- when the
        partner header with the seller's name is present -- passes ``url``,
        ``self.website.platform``, the name, location and website to
        ``TheAuthour.write_seller``. Nothing is written when the page or the
        name is missing.

        ``location`` is ``None`` when it cannot be read. ``website`` is the
        first anchor href containing "http", or ``None`` when there is none.
        """
        # Caller :: get_artwork_data_slave and get_seller_id
        # We get to here only after we do not find the seller's info in SELLER_INFO
        visited.discard(url)
        soup = TheMiner.fetch_page(url)
        if soup is None:
            return

        # Seller's name: without it there is no record to write.
        try:
            seller_box = soup.find('div', id='jumpto--PartnerHeader')
            seller_name = seller_box.h1.text.strip()
        except AttributeError:
            return

        # Location
        try:
            # NOTE(review): the sibling is *called*; presumably this yields
            # the collection of its nested tags (Beautiful Soup's call
            # shortcut) -- confirm against the library docs.
            locatio = seller_box.h1.nextSibling()
            try:
                location = locatio.text
            except AttributeError:
                # A collection has no `.text`: join each member's text, every
                # piece followed by a space.
                location = "".join(f"{node.text} " for node in locatio)
        except (AttributeError, TypeError):
            location = None

        # Website
        # Fixed: when no "http" link existed, `website` used to stay bound to
        # the whole list of anchors and was written to the DB as such.
        website = None
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if "http" in str(href):
                website = href
                print(href)
                break

        TheAuthour.write_seller(
            url, self.website.platform, seller_name, location, website
        )