示例#1
0
    def get_artist_id(self, artist_url):
        # We go to artist page to pick data we need to make the ARTIST_INFO key.

        artist_id = None
        if artist_url in KEY_INFO.keys():
            key = KEY_INFO.get(artist_url)
            if key in ARTIST_INFO.keys():
                artist_id = ARTIST_INFO.get(key)
            return artist_id
        else:
            # self.artist_id_slave (key_maker) returns the artist_id
            artist_id = self.artist_id_slave(artist_url)
            return artist_id
示例#2
0
    def get_artist_id(self, artist_url):
        # We go to artist page to pick data we need to make the ARTIST_INFO key.

        artist_id = None
        if artist_url in KEY_INFO.keys():
            # print("\n\n\n\nA:\n\n\n\n")
            key = KEY_INFO[artist_url]
            if key in ARTIST_INFO.keys():
                artist_id = ARTIST_INFO[key]
                return artist_id

        # Key maker here, writes the artist data in the db. Makes it much simpler. Nah?
        key = self.key_maker(artist_url)
        if key is not None and artist_url is not None:
            if key in ARTIST_INFO.keys():
                artist_id = ARTIST_INFO.get(key)
                return artist_id
                # print(artist_id)
            else:
                print("FATAL ERROR :: Artist_id not found.")
        else:
            # If it ever comes to here, the page will not have an Artist
            print("FATAL ERROR :: Artist_id not found. Artist_url broken")
        # Let's return None here, and not pick rest of the data if the artist_id is not found.
        # Artist id is used in artworks table only.
        return artist_id
示例#3
0
    def key_maker(artist_url):
        options = Options()
        options.headless = True
        driver = webdriver.Firefox(options=options)
        visited.discard(artist_url)
        soup = BeautifulSoup(driver.page_source, artist_url)
        if soup is not None:

            n_c = soup.find_all('h2', class_='font_2')
            # Artist's name
            try:
                name = n_c[0].text.strip()
            except IndexError:
                print(n_c)
                name = None
            # print(name)
            # If an error occurs here, its because the page layout has changed and thus the code needs to be fixed

            if name is not None:
                # Country
                try:
                    country = n_c[1].text.strip()
                except AttributeError:
                    country = None

                # About
                try:
                    text = soup.find_all('p', class_='font_8')
                    about = ""
                    for t in text:
                        about += t.text.strip()
                        about += " "
                    # print(about)
                except AttributeError:
                    about = None
                except TypeError:
                    about = None
                # About will either be found and be some text or be None.
                # print(about)

                artist_data_pack = [name, None, country, about]
                # artist_data_pack = [name, born, country, about]
                # pack = [name, born, country, about]
                # Updating KEY_INFO dictionary.
                KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
                key = KEY_INFO.get(artist_url)
                # Updating the dB with artist listings.
                TheAuthour.write_artist(*artist_data_pack)

                # key = db.Artist.key_maker(artist_data_pack)
                # pack = [name, born, country, about]
                driver.quit()
                return key
            else:
                driver.quit()
                return None

        else:
            return None
示例#4
0
    def artist_id_slave(self, artist_url):
        visited.discard(artist_url)
        soup = TheMiner.fetch_page(artist_url)
        if soup is not None:
            self.get_artist_data(soup, artist_url)
            # Getting the key from KEY_INFO
            if artist_url in KEY_INFO.keys():
                key = KEY_INFO.get(artist_url)
                # Getting artist_id using the key from ARTIST_INFO
                if key in ARTIST_INFO.keys():
                    artist_id = ARTIST_INFO.get(key)
                    return artist_id
                else:
                    print("ARTIST_ID_SLAVE : Artist id not in ARTIST_INFO")
                    return None
            else:
                print("ARTIST_ID_SLAVE : Could not find artist_id")
                return None

        else:
            print("ARTIST_ID_SLAVE : Soup not returned")
            return None
示例#5
0
    def get_artist_id(self, artist_url):
        # We go to artist page to pick data we need to make the ARTIST_INFO key.
        # print(f"\n\n\n\nARTIST_ID_GET:\n{artist_url}\n{KEY_INFO}\n\n\n\n")

        artist_id = None
        if artist_url in KEY_INFO.keys():
            # print("\n\n\n\nA:\n\n\n\n")
            key = KEY_INFO[artist_url]
            artist_id = ARTIST_INFO[key]
        else:
            key = self.key_maker(artist_url)
            if key is not None and artist_url is not None:
                if key in ARTIST_INFO.keys():
                    artist_id = ARTIST_INFO.get(key)
                    # print(artist_id)
                else:
                    print("FATAL ERROR :: Artist_id not found.")
            else:
                # If it ever comes to here, the page will not have an Artist
                print("FATAL ERROR :: Artist_id not found. Artist_url broken")
            # Let's return None here, and not pick rest of the data if the artist_id is not found.
            # Artist id is used in artworks table only.
        return artist_id
示例#6
0
    def get_artwork_data_slave(self, url, driver):

        driver.get(url)
        soup = BeautifulSoup(driver.page_source, url)
        if soup is not None:

            # Field initiation ::

            artwork = None
            price = None
            type_ = None
            dimensions = None
            frame = None
            authenticity = None
            about = None
            artist_id = None
            image_loc = None
            year = None
            support = None
            signature = None
            # Material to be added to technique
            technique = ""

            seller_id = None
            artist = None
            medium = None

            # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
            # if "/painting/" in str(url):
            #     medium = "Painting"  # (painting or sculpture)
            # elif "/sculpture/" in str(url):
            #     medium = "Sculpture"
            # else:
            #     # So that url leaks don't break the code.
            #     medium = None

            # Seller_id
            try:
                seller_url = soup.find('div', class_='WncCi').find('a')['href']
                seller_id = self.get_seller_id(seller_url)
            except AttributeError or TypeError:
                # Seller doesn't have a page.
                try:
                    seller_url = soup.find('div', class_='WncCi').text.strip()
                    if seller_url in SELLER_INFO.keys():
                        seller_id = SELLER_INFO.get(seller_url)
                    else:
                        # Make a Kazoart style bundle, and write it to obtain a seller_id.
                        # [seller_url, platform_id(from name), Seller's name, Location, website]
                        bundle = [
                            seller_url, self.website.platform,
                            'EMERGINGARTISTPLATFOM', None, None
                        ]
                        # Writing to db.
                        TheAuthour.write_seller(*bundle)
                        # This should generate the seller_id we so desperately desire.
                        # time.sleep(1)
                        seller_id = SELLER_INFO.get(seller_url)
                except AttributeError:
                    pass

            # We'll let the seller name be seller_url if the url is not found.

            # Artist_id
            try:
                artist_url = soup.find('div', class_='WncCi').a.get('href')
                if str(artist_url).endswith(".com"):
                    artist_url = re.sub('.com', "", artist_url)
                    artist_url = re.sub('emergingartistplatform',
                                        'emergingartistplatform.com',
                                        artist_url)
                artist_id = self.get_artist_id(artist_url)

            except AttributeError:
                try:
                    artist_url = soup.find('div', class_='WncCi').text.strip()
                    country = None
                    a = soup.find_all('pre')
                    for b in a:
                        if b.get('data-hook') == 'description':
                            p = b.find_all('p')
                            for j in p:
                                if 'Country' in j.text or 'country' in j.text or 'COUNTRY' in j.text:
                                    title = j.text.split(":")
                                    country = title[-1].strip()

                    artist_data_pack = [artist_url, None, country, None]
                    # artist_data_pack = [name, born, country, about]
                    # pack = [name, born, country, about]
                    # Updating KEY_INFO dictionary.
                    KEY_INFO[artist_url] = db.Artist.key_maker(
                        artist_data_pack)
                    key = KEY_INFO.get(artist_url)
                    # Updating the dB with artist listings.
                    TheAuthour.write_artist(*artist_data_pack)
                    artist_id = ARTIST_INFO[key]
                except AttributeError:
                    artist_id = None

            # Continue fetching data only if seller_id, artist_id and medium are found. (RULE :: 3, 4)
            if seller_id is not None and artist_id is not None:
                try:
                    a = soup.find_all('span')
                    t = ""
                    for b in a:
                        if b.get('data-hook') == "formatted-primary-price":
                            # print(b.text)
                            for p in b.text:
                                if str(p).isnumeric() or str(p) == ".":
                                    t += p
                    price = float(t) * rate
                    # print(price)
                    # Price
                    # print(price)
                except AttributeError:
                    price = None
                except ValueError:
                    price = None

                # RULE : 5
                if price is not None:

                    # Find artist, artwork, year, type_(N/A), dimensions, support, frame, signature, authenticity,
                    # about, image_loc(actual url of the image), and technique

                    # Wish the code to break if either Artist's name or Artwork's name are not found.
                    # Artist
                    artist = soup.find('div', class_='WncCi').text.strip()
                    # print(artist)

                    # Artwork
                    a = soup.find_all('pre')
                    for b in a:
                        if b.get('data-hook') == 'description':
                            p = b.find_all('p')
                            for j in p:
                                if 'Title' in j.text or 'title' in j.text or 'TITLE' in j.text:
                                    title = j.text.split(":")
                                    artwork = title[-1].strip()
                                    if len(artwork) >= 255:
                                        artwork = artwork[0:255]
                                    # print(artwork)

                                if 'Date' in j.text:
                                    date = j.text.split(":")
                                    year = date[-1].strip()
                                    # print(year)

                                if 'Size' in j.text:
                                    dimensions = j.text.split(":")
                                    dimensions = dimensions[-1].strip()
                                    # print(dimensions)

                                if 'Medium' in j.text:
                                    technique = j.text.split(":")
                                    technique = technique[-1].strip()
                                    # print(technique)

                                if len(j.text.split(
                                        ":")) == 1 and about is None:
                                    about = j.text[-1].strip()

                    # Medium (RULE : 3)
                    if "Sculptures" in self.website.start_url:
                        medium = "Sculpture"
                    else:
                        medium = "Painting"

                    # image_loc
                    image = soup.find('div',
                                      class_='main-media-image-wrapper-hook')
                    image = image.find('div', id='get-image-item-id')
                    image_loc = image.get('href')

                    # print(image_loc)

                    artwork_bundle = {
                        "artwork_title": artwork,
                        "artist_name": artist,
                        "year": year,
                        "price": price,
                        "Medium": medium,
                        "Type": type_,
                        "Dimensions": dimensions,
                        "Support": support,
                        "Frame": frame,
                        "Signature": signature,
                        "Authenticity": authenticity,
                        "About": about,
                        "platform": self.website.platform,
                        "image_addr": image_loc,
                        "seller_id": seller_id,
                        "artist_id": artist_id,
                        "url": url,
                        "technique": technique
                    }

                    TheAuthour.write_artwork_price_image(**artwork_bundle)
                else:
                    print(f"Skipping {url}\n PRICE : {price}")
            else:
                print(
                    f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}"
                )
        else:
            print(f"Soup not returned for {url}")