Пример #1
0
    def get_seller_id(self, seller_url) -> int:
        """Look up the id cached for ``seller_url``, writing a seller row first if needed.

        When the URL is not yet a key of ``SELLER_INFO`` a five-item bundle
        ``(seller_url, platform, 'KAZoART', None, seller_url)`` is passed to
        ``TheAuthour.write_seller`` and the cache is consulted once more.

        Args:
            seller_url: Key used for the ``SELLER_INFO`` lookup; may be None.

        Returns:
            The cached id, or None (after printing an error) when the URL is
            None or still missing from ``SELLER_INFO`` after the write.
        """
        if seller_url is None:
            print("FATAL ERROR :: Seller_id not found.")
            return None

        if seller_url not in SELLER_INFO:
            # Unknown seller: persist a placeholder record.
            # NOTE(review): write_seller presumably refreshes SELLER_INFO as a
            # side effect -- confirm in TheAuthour.
            TheAuthour.write_seller(
                seller_url, self.website.platform, 'KAZoART', None, seller_url
            )
            if seller_url not in SELLER_INFO:
                print("FATAL ERROR :: Seller_id not found.")
                return None

        return SELLER_INFO.get(seller_url)
Пример #2
0
    def seller_info(self, soup):
        """Collect seller details from an artwork page and, if needed, the seller page.

        The artwork page's ``div#top-seller`` supplies the seller name, a
        location element and a link. If the ``"<name>_<location>"`` key is
        already in ``SELLER_INFO`` the cached id is returned immediately;
        otherwise the linked seller page is fetched and scraped.

        Args:
            soup: Parsed artwork page exposing ``find`` (presumably a
                BeautifulSoup object -- confirm against the caller).

        Returns:
            A 2-tuple, one of:
            * ``(seller_id, None)`` -- the seller key was already cached;
            * ``(0, [seller_name, location, website])`` -- seller page scraped;
            * ``(1, [])`` -- the link or the seller page could not be read.
        """
        seller_bundle = []

        try:
            top_seller = soup.find('div', id='top-seller')
            anchor = top_seller.find('a')
            seller_name = str(anchor.text).strip()
            # NOTE(review): this stringifies the whole <p> element (markup
            # included), not its text. Left untouched because the cache key
            # format must match whatever populates SELLER_INFO -- confirm.
            location = str(top_seller.find('p', class_='subtitle')).strip()

            seller_key = "_".join([seller_name, location])
            if seller_key in SELLER_INFO:
                # SELLER_INFO is a mapping: it must be indexed, not called.
                # Calling it raised a TypeError that the handler below
                # swallowed, so known sellers were never short-circuited.
                seller_id = SELLER_INFO[seller_key]
                print(f"We have a seller for seller id {seller_id}, named {seller_key}")
                return seller_id, None

            link = anchor['href']
            if 'galeries-d-art' in str(link):
                # Switch the French gallery path to its English equivalent.
                link = re.sub('galeries-d-art', 'art-galleries', link)
        except (AttributeError, TypeError, KeyError):
            # Missing container / anchor / href: there is no seller page to visit.
            link = None

        if link is None:
            return 1, seller_bundle

        # Move on to the seller's own page.
        soup = TheMiner.fetch_page(link)
        if soup is None:
            return 1, seller_bundle

        try:
            top_seller = soup.find('div', id='top-seller')
            seller_name = top_seller.h1.text.strip()
        except AttributeError:
            # Without a name the seller page is unusable.
            return 1, seller_bundle

        try:
            location = top_seller.find('p', class_="subtitle").text.strip()
        except AttributeError:
            location = None

        try:
            website = str(soup.find('ul', id="websites").a['href']).strip()
        except (AttributeError, TypeError):
            website = None

        seller_bundle.extend([seller_name, location, website])
        return 0, seller_bundle
Пример #3
0
 def read_data_sellers(self):
     """Rebuild the ``SELLER_INFO`` cache from every row of the ``sellers`` table.

     ``create_table_sellers`` is called first, then all rows are selected.
     The cache is emptied and refilled with ``row[0] -> int(row[5])``.
     """
     self.create_table_sellers()
     self.my_cursor.execute("""SELECT * FROM sellers""")
     rows = self.my_cursor.fetchall()

     SELLER_INFO.clear()
     for row in rows:
         # NOTE(review): positional columns -- index 0 is used as the URL key
         # and index 5 as the seller id; confirm against the table schema.
         SELLER_INFO[row[0]] = int(row[5])
Пример #4
0
    def insert_data_sellers(self, *args):
        """Insert a seller row unless its URL already exists, and cache its id.

        Args:
            *args: Five positional values bound, in order, to the INSERT's
                placeholders: url, platform_id, seller, location, website.

        Returns:
            None. The outcome is recorded in ``SELLER_INFO`` under the URL.

        Raises:
            IndexError: If called without any positional argument.
        """
        values = [*args]
        seller_url = values[0]

        insert_query = """INSERT INTO sellers(
                                 URL, PLATFORM_ID, SELLER, LOCATION, WEBSITE
                                  )
                                 VALUES(%s, %s, %s, %s, %s)
                                 """

        self.my_cursor.execute("""SELECT * FROM sellers
                WHERE URL = %s""", [seller_url])
        results = list(self.my_cursor.fetchall())

        if not results:
            try:
                self.my_cursor.execute(insert_query, values)
                self.mydb.commit()
                self.my_cursor.execute("""SELECT LAST_INSERT_ID()""")
                seller_id = self.my_cursor.fetchone()
                SELLER_INFO[seller_url] = int(seller_id[0])

            except pymysql.err.IntegrityError:
                print("SELLER ENTRY EXISTS")
                # The row appeared between our SELECT and INSERT. Pause so the
                # other writer can finish, then read the entry back.
                time.sleep(1)
                self.my_cursor.execute("""SELECT * FROM sellers WHERE URL = %s""", [seller_url])
                results = list(self.my_cursor.fetchall())
                print(values)
                print(results)
                if seller_url not in SELLER_INFO.keys():
                    if results:
                        # NOTE(review): index 5 is assumed to hold the seller
                        # id column -- confirm against the table schema.
                        SELLER_INFO[seller_url] = int(results[0][5])
                    else:
                        # The competing row is still not visible to this
                        # connection; report it instead of raising IndexError.
                        print("SELLER ENTRY STILL NOT VISIBLE AFTER INTEGRITY ERROR.")

        elif len(results) == 1:
            print("SELLER ENTRY EXISTS")
            if seller_url not in SELLER_INFO.keys():
                SELLER_INFO[seller_url] = int(results[0][5])
        else:
            print("SIR THE MATRIX HAS GLITCHED . MULTIPLE SELLER ENTRIES FOR A SINGLE URL ARE HERE.")
Пример #5
0
    def get_seller_id(self, seller_url) -> int:
        """Return the id cached for ``seller_url``, trying two ways to create it.

        For an uncached URL, ``self.get_seller_data`` is called first. If the
        URL is still absent from ``SELLER_INFO``, a minimal
        ``(seller_url, platform, 'BAREBONES', None, seller_url)`` bundle is
        passed to ``TheAuthour.write_seller``.

        Args:
            seller_url: Key used for the ``SELLER_INFO`` lookup; may be None.

        Returns:
            The cached id, or None (after printing an error) when no id could
            be obtained. The original comments say a None result stops the
            artwork scraper at its "rule 3" check.
        """
        if seller_url is None:
            print("FATAL ERROR :: Seller_id not found.")
            return None

        if seller_url in SELLER_INFO:
            return SELLER_INFO.get(seller_url)

        seller_id = None

        # First attempt: let the scraper fetch and store the seller's details.
        self.get_seller_data(seller_url)
        if seller_url in SELLER_INFO:
            seller_id = SELLER_INFO.get(seller_url)
        else:
            # Second attempt: write a bare-bones record for the URL.
            TheAuthour.write_seller(
                seller_url, self.website.platform, 'BAREBONES', None, seller_url
            )

        if seller_url in SELLER_INFO:
            seller_id = SELLER_INFO.get(seller_url)
        else:
            print("FATAL ERROR :: Seller_id not found.")

        return seller_id
Пример #6
0
    def get_seller_id(self, seller_url) -> int:
        """Return the id cached for ``seller_url``, creating a record when missing.

        For an uncached value, ``self.get_seller_data`` is called first. If
        that leaves ``SELLER_INFO`` without the key (the original comments note
        this happens when the value is not really a URL), a bundle that reuses
        the value as URL, seller name and website is passed to
        ``TheAuthour.write_seller``.

        Args:
            seller_url: Key used for the ``SELLER_INFO`` lookup; may be None.

        Returns:
            The cached id, or None (after printing an error) when no id could
            be obtained.
        """
        if seller_url is None:
            print("FATAL ERROR :: Seller_id not found.")
            return None

        if seller_url in SELLER_INFO:
            return SELLER_INFO.get(seller_url)

        seller_id = None

        # First attempt: scrape the seller page behind the URL.
        self.get_seller_data(seller_url)
        if seller_url in SELLER_INFO:
            seller_id = SELLER_INFO.get(seller_url)
        else:
            # Fallback bundle layout: [url, platform, seller name, location, website].
            fallback = [seller_url, self.website.platform, seller_url, None, seller_url]
            TheAuthour.write_seller(*fallback)

        if seller_url in SELLER_INFO:
            seller_id = SELLER_INFO.get(seller_url)
        else:
            print("FATAL ERROR :: Seller_id not found.")

        return seller_id
Пример #7
0
    def get_artwork_data_slave(self, url):
        """Scrape one artwork page and pass its fields to ``self.write_artwork_data``.

        The page is fetched with ``TheMiner.fetch_page``; nothing happens when
        that returns None. The artist link doubles as the seller URL and is
        used as the lookup key in both ``SELLER_INFO`` and ``ARTIST_INFO``.
        Lookups of mandatory page elements are deliberately unguarded, so a
        page with an unexpected layout raises instead of storing partial data.

        Args:
            url: Address of the artwork page.

        Returns:
            None.
        """
        soup = TheMiner.fetch_page(url)
        if soup is None:
            return

        # Fields this page layout never provides; forwarded as None.
        year = None
        support = None
        signature = None
        image_loc = None

        seller_id = None
        artist_id = None

        # The page must have a seller: let this raise if the link is missing.
        artist_link = soup.find('div', class_='product-artist').a
        # str() always yields a string, so seller_url can never be None here.
        seller_url = str(artist_link.get('href')).strip()

        # Seller id: cached, or obtained by scraping the seller page.
        if seller_url not in SELLER_INFO:
            self.get_seller_data(seller_url)
            if seller_url in SELLER_INFO:
                seller_id = SELLER_INFO.get(seller_url)
            else:
                print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN SELLER_INFO. WE SHALL BREAK.")
        else:
            seller_id = SELLER_INFO.get(seller_url)
            print(seller_id)

        # Artist id: looked up under the same URL.
        if seller_url in ARTIST_INFO:
            artist_id = ARTIST_INFO.get(seller_url)
            print(artist_id)
        else:
            print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN ARTIST_INFO. WE SHALL BREAK.")

        heading = soup.h1

        # Artist name.
        artist_box = heading.find('div', class_='product-artist')
        artist = str(artist_box.a.text).strip()
        print(artist)

        # Artwork title.
        artwork = str(heading.find('div', class_='product-name').text).strip()
        print(artwork)

        # Price: keep only the digit characters of the displayed text.
        price_box = soup.find('div', class_='product-price')
        price_text = str(price_box.find('div', class_='p-price-container').text).strip()
        price = int("".join(ch for ch in price_text if ch.isdigit()))
        print(price)

        # Technical details: each recognised label fills one field.
        label_to_field = {
            'TECHNIQUE': 'medium',
            'TYPE': 'type_',
            'MATERIAL': 'material',
            'DIMENSIONS': 'dimensions',
            'FRAMING': 'frame',
            'QUALITY GUARANTEE': 'authenticity',
        }
        collected = dict.fromkeys(label_to_field.values())

        product_details_desc = soup.find('div', class_='product-details_desc')
        for detail in product_details_desc.find_all('div', class_='tech-item'):
            label = str(detail.find('div', class_='tech-label').text).strip().upper()
            value = str(detail.find('div', class_='tech-value').text).strip()
            print(label)
            print(value)

            field = label_to_field.get(label)
            if field is not None:
                collected[field] = value

        # Free-text description is optional.
        try:
            about = str(product_details_desc.find('div', class_='desc text-1').text).strip()
        except AttributeError:
            about = None

        # Material is not stored on its own; it is appended to the medium.
        medium = collected['medium']
        material = collected['material']
        if material is not None:
            medium = material if medium is None else medium + " " + material

        artwork_bundle = {
            "artwork_title": artwork,
            "artist_name": artist,
            "year": year,
            "price": price,
            "Medium": medium,
            "Type": collected['type_'],
            "Dimensions": collected['dimensions'],
            "Support": support,
            "Frame": collected['frame'],
            "Signature": signature,
            "Authenticity": collected['authenticity'],
            "About": about,
            "image_addr": image_loc,
            "seller_id": seller_id,
            "artist_id": artist_id,
        }

        self.write_artwork_data(**artwork_bundle)
Пример #8
0
def close_connection(mydb):
    """Empty the ``SELLER_INFO`` cache, then close ``mydb``.

    Args:
        mydb: Object exposing ``close()``; presumably the database
            connection -- confirm against the callers.
    """
    # Drop cached seller ids before the connection goes away.
    SELLER_INFO.clear()
    mydb.close()
Пример #9
0
    def get_artwork_data_slave(self, url, driver):
        """Scrape one artwork page rendered by ``driver`` and store the listing.

        The page is loaded with ``driver.get(url)`` and parsed from
        ``driver.page_source``. A seller id and an artist id are resolved
        first; when the page shows only a name instead of a link, fallback
        records are written using that name as the key. The remaining fields
        are read only when both ids and a price were found, and are then
        passed to ``TheAuthour.write_artwork_price_image``.

        Args:
            url: Address of the artwork page.
            driver: Browser driver exposing ``get(url)`` and ``page_source``
                (presumably a Selenium WebDriver -- confirm against the caller).

        Returns:
            None. Skipped pages are reported with ``print``.
        """
        driver.get(url)
        # The second BeautifulSoup argument selects the parser; the page URL is
        # not a valid parser name.
        # NOTE(review): 'html.parser' is the built-in choice -- switch to the
        # parser used elsewhere in the project if it differs.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        if soup is not None:

            # Field initiation ::
            artwork = None
            price = None
            type_ = None
            dimensions = None
            frame = None
            authenticity = None
            about = None
            artist_id = None
            image_loc = None
            year = None
            support = None
            signature = None
            # Material to be added to technique
            technique = ""

            seller_id = None
            artist = None
            medium = None

            # Seller_id
            try:
                seller_url = soup.find('div', class_='WncCi').find('a')['href']
                seller_id = self.get_seller_id(seller_url)
            except (AttributeError, TypeError):
                # Seller doesn't have a page: either the container is missing
                # (AttributeError) or it holds no <a>, so None['href'] raised
                # TypeError. Both must be caught as a tuple;
                # `except A or B` only catches A.
                try:
                    # Use the seller's displayed name as the key instead.
                    seller_url = soup.find('div', class_='WncCi').text.strip()
                    if seller_url in SELLER_INFO.keys():
                        seller_id = SELLER_INFO.get(seller_url)
                    else:
                        # [seller_url, platform_id(from name), Seller's name, Location, website]
                        bundle = [
                            seller_url, self.website.platform,
                            'EMERGINGARTISTPLATFOM', None, None
                        ]
                        # Writing to db.
                        TheAuthour.write_seller(*bundle)
                        seller_id = SELLER_INFO.get(seller_url)
                except AttributeError:
                    # No seller container at all; seller_id stays None and the
                    # page is skipped below.
                    pass

            # Artist_id
            try:
                artist_url = soup.find('div', class_='WncCi').a.get('href')
                if str(artist_url).endswith(".com"):
                    # Strip the ".com" suffixes, then restore the one belonging
                    # to the site's own domain. The dot is escaped so only a
                    # literal ".com" is removed.
                    artist_url = re.sub(r'\.com', "", artist_url)
                    artist_url = re.sub('emergingartistplatform',
                                        'emergingartistplatform.com',
                                        artist_url)
                artist_id = self.get_artist_id(artist_url)

            except AttributeError:
                try:
                    # No artist link: key the artist by displayed name.
                    artist_url = soup.find('div', class_='WncCi').text.strip()
                    country = None
                    for block in soup.find_all('pre'):
                        if block.get('data-hook') == 'description':
                            for para in block.find_all('p'):
                                if any(tag in para.text
                                       for tag in ('Country', 'country', 'COUNTRY')):
                                    country = para.text.split(":")[-1].strip()

                    # artist_data_pack = [name, born, country, about]
                    artist_data_pack = [artist_url, None, country, None]
                    # Updating KEY_INFO dictionary.
                    KEY_INFO[artist_url] = db.Artist.key_maker(
                        artist_data_pack)
                    key = KEY_INFO.get(artist_url)
                    # Updating the dB with artist listings.
                    TheAuthour.write_artist(*artist_data_pack)
                    artist_id = ARTIST_INFO[key]
                except AttributeError:
                    artist_id = None

            # Continue fetching data only if seller_id and artist_id are found. (RULE :: 3, 4)
            if seller_id is not None and artist_id is not None:
                try:
                    digits = ""
                    for span in soup.find_all('span'):
                        if span.get('data-hook') == "formatted-primary-price":
                            for ch in span.text:
                                if str(ch).isnumeric() or str(ch) == ".":
                                    digits += ch
                    # `rate` is a name from outside this method (presumably a
                    # currency conversion factor -- confirm).
                    price = float(digits) * rate
                except AttributeError:
                    price = None
                except ValueError:
                    # No numeric price text was found.
                    price = None

                # RULE : 5
                if price is not None:

                    # Wish the code to break if either Artist's name or Artwork's name are not found.
                    # Artist
                    artist = soup.find('div', class_='WncCi').text.strip()

                    # Artwork details live in "Label: value" paragraphs of the description block.
                    for block in soup.find_all('pre'):
                        if block.get('data-hook') == 'description':
                            for para in block.find_all('p'):
                                text = para.text
                                if 'Title' in text or 'title' in text or 'TITLE' in text:
                                    artwork = text.split(":")[-1].strip()
                                    if len(artwork) >= 255:
                                        artwork = artwork[0:255]

                                if 'Date' in text:
                                    year = text.split(":")[-1].strip()

                                if 'Size' in text:
                                    dimensions = text.split(":")[-1].strip()

                                if 'Medium' in text:
                                    technique = text.split(":")[-1].strip()

                                if ":" not in text and about is None:
                                    # First non-empty paragraph without a label
                                    # is the description. Store the whole
                                    # paragraph (not just its last character),
                                    # and skip empty paragraphs instead of
                                    # indexing into them.
                                    stripped = text.strip()
                                    if stripped:
                                        about = stripped

                    # Medium (RULE : 3)
                    if "Sculptures" in self.website.start_url:
                        medium = "Sculpture"
                    else:
                        medium = "Painting"

                    # image_loc
                    image = soup.find('div',
                                      class_='main-media-image-wrapper-hook')
                    image = image.find('div', id='get-image-item-id')
                    image_loc = image.get('href')

                    artwork_bundle = {
                        "artwork_title": artwork,
                        "artist_name": artist,
                        "year": year,
                        "price": price,
                        "Medium": medium,
                        "Type": type_,
                        "Dimensions": dimensions,
                        "Support": support,
                        "Frame": frame,
                        "Signature": signature,
                        "Authenticity": authenticity,
                        "About": about,
                        "platform": self.website.platform,
                        "image_addr": image_loc,
                        "seller_id": seller_id,
                        "artist_id": artist_id,
                        "url": url,
                        "technique": technique
                    }

                    TheAuthour.write_artwork_price_image(**artwork_bundle)
                else:
                    print(f"Skipping {url}\n PRICE : {price}")
            else:
                print(
                    f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}"
                )
        else:
            print(f"Soup not returned for {url}")