def get_seller_id(self, seller_url) -> int:
    # Fetches seller_data, writes it in db, and returns seller_id.
    # bundle = [seller_url, self.website.platform, 'KAZoART', None, url]
    seller_id = None
    if seller_url is not None:
        if seller_url in SELLER_INFO.keys():
            seller_id = SELLER_INFO.get(seller_url)
            # print(seller_id)
        else:
            # Process and create the bundle here.
            bundle = [seller_url, self.website.platform, 'KAZoART', None, seller_url]
            # Writing to db.
            TheAuthour.write_seller(*bundle)
            if seller_url in SELLER_INFO.keys():
                seller_id = SELLER_INFO.get(seller_url)
            else:
                print("FATAL ERROR :: Seller_id not found.")
    else:
        print("FATAL ERROR :: Seller_id not found.")
    # Let's return seller_id, even if it's None.
    return seller_id
def seller_info(self, soup):
    seller_bundle = []
    # Seller name
    # Seller's website
    # Seller's location
    # Return seller_id, seller_bundle
    # Write data to table sellers in db.
    try:
        A = soup.find('div', id='top-seller')
        B = A.find('a')
        seller_name = str(B.text).strip()
        location = str(A.find('p', class_='subtitle').text).strip()
        # If seller and location are already recorded in the global seller variable,
        # we fetch the seller_id and return it.
        seller_name = "_".join([seller_name, location])
        if seller_name in SELLER_INFO.keys():
            seller_id = SELLER_INFO.get(seller_name)
            print(f"We have a seller for seller id {seller_id}, named {seller_name}")
            return seller_id, None
        link = B['href']
        if 'galeries-d-art' in str(link):
            link = re.sub('galeries-d-art', 'art-galleries', link)
    except AttributeError:
        link = None
    except TypeError:
        link = None
    if link is not None:
        # Moving to the seller page now!
        # Read the name and location before moving to the next page.
        soup = TheMiner.fetch_page(link)
        # visited.discard(link)
        if soup is not None:
            try:
                A = soup.find('div', id='top-seller')
                seller_name = A.h1.text.strip()
                # print(seller_name)
            except AttributeError:
                return 1, seller_bundle
            try:
                location = A.find('p', class_="subtitle").text.strip()
                # print(location)
            except AttributeError:
                location = None
            try:
                website = str(soup.find('ul', id="websites").a['href']).strip()
                # print(website)
            except AttributeError:
                website = None
            except TypeError:
                website = None
            seller_bundle.append(seller_name)
            seller_bundle.append(location)
            seller_bundle.append(website)
            return 0, seller_bundle
    return 1, seller_bundle
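# A hypothetical caller sketch (the real call site isn't shown in this section); it assumes it
# runs inside the same scraper class. seller_info returns either (seller_id, None) when the
# seller is already cached in SELLER_INFO, or (status, seller_bundle) where status 0 means
# [name, location, website] was scraped and status 1 means nothing usable was found.
def handle_seller(self, soup):
    status, bundle = self.seller_info(soup)
    if bundle is None:
        return status, None            # cached seller: status carries the id
    if status == 0:
        return None, bundle            # fresh bundle, ready for TheAuthour.write_seller
    return None, None                  # status 1: seller block missing or unreadable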
def read_data_sellers(self):
    # Rebuild the in-memory URL -> seller_id cache from the sellers table.
    self.create_table_sellers()
    self.my_cursor.execute("""SELECT * FROM sellers""")
    sellers = self.my_cursor.fetchall()
    SELLER_INFO.clear()
    for seller in sellers:
        url = seller[0]
        seller_id = seller[5]
        SELLER_INFO[url] = int(seller_id)
def insert_data_sellers(self, *args):
    # Return seller_id
    values = [*args]  # args = url, platform_id, seller, location, website
    insert_query = """INSERT INTO sellers(URL, PLATFORM_ID, SELLER, LOCATION, WEBSITE)
                      VALUES(%s, %s, %s, %s, %s)"""
    self.my_cursor.execute("""SELECT * FROM sellers WHERE URL = %s""", [values[0]])
    results = list(self.my_cursor.fetchall())
    if len(results) == 0:
        try:
            self.my_cursor.execute(insert_query, values)
            self.mydb.commit()
            self.my_cursor.execute("""SELECT LAST_INSERT_ID()""")
            seller_id = self.my_cursor.fetchone()
            SELLER_INFO[values[0]] = int(seller_id[0])
        except pymysql.err.IntegrityError:
            print("SELLER ENTRY EXISTS")
            # We don't update SELLER_INFO here. Instead we slow this thread down so that
            # the other thread has time to write the entry.
            time.sleep(1)
            # Fetch the seller entry again.
            self.my_cursor.execute("""SELECT * FROM sellers WHERE URL = %s""", [values[0]])
            results = list(self.my_cursor.fetchall())
            print(values)
            print(results)
            if values[0] not in SELLER_INFO.keys():
                print(values)
                print(results)
                SELLER_INFO[values[0]] = int(results[0][5])
    elif len(results) == 1:
        print("SELLER ENTRY EXISTS")
        if values[0] not in SELLER_INFO.keys():
            SELLER_INFO[values[0]] = int(results[0][5])
        # return results[0][4]
    else:
        print("SIR THE MATRIX HAS GLITCHED. MULTIPLE SELLER ENTRIES FOR A SINGLE URL ARE HERE.")
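# A sketch of an alternative to the sleep-based retry in the IntegrityError branch above: re-read the
# conflicting row straight away, since the competing thread's INSERT has already committed by the time
# the duplicate-key error is raised. It assumes the sellers table has a unique key on URL and an
# auto-increment SELLER_ID column (as the results[0][5] indexing above implies); fetch_or_insert_seller
# is a hypothetical name.
import pymysql


def fetch_or_insert_seller(cursor, mydb, values):
    try:
        cursor.execute(
            """INSERT INTO sellers(URL, PLATFORM_ID, SELLER, LOCATION, WEBSITE)
               VALUES(%s, %s, %s, %s, %s)""", values)
        mydb.commit()
        cursor.execute("""SELECT LAST_INSERT_ID()""")
        return int(cursor.fetchone()[0])
    except pymysql.err.IntegrityError:
        # Another thread won the race; recover its id with a plain SELECT.
        cursor.execute("""SELECT SELLER_ID FROM sellers WHERE URL = %s""", [values[0]])
        row = cursor.fetchone()
        return int(row[0]) if row else None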
def get_seller_id(self, seller_url) -> int:
    # Fetches seller_data, writes it in db, and returns seller_id.
    # bundle = [seller_url, self.website.platform, 'KAZoART', None, url]
    seller_id = None
    if seller_url is not None:
        if seller_url in SELLER_INFO.keys():
            seller_id = SELLER_INFO.get(seller_url)
            # print(seller_id)
        else:
            # If the code reaches here then the entry for the seller doesn't already exist.
            # Let's call get_seller_data again with seller_url.
            self.get_seller_data(seller_url)
            # Wait for a second to make sure that the transaction is smooth. Activate this line if errors are thrown.
            # time.sleep(1)
            # Try to fetch seller data again.
            if seller_url in SELLER_INFO.keys():
                seller_id = SELLER_INFO.get(seller_url)
            else:
                # Make a KAZoART-style bundle, and write it to obtain a seller_id.
                bundle = [seller_url, self.website.platform, 'BAREBONES', None, seller_url]
                # Writing to db.
                TheAuthour.write_seller(*bundle)
                # This should generate the seller_id we so desperately desire.
                # time.sleep(1)
                if seller_url in SELLER_INFO.keys():
                    seller_id = SELLER_INFO.get(seller_url)
                else:
                    print("FATAL ERROR :: Seller_id not found.")
    else:
        print("FATAL ERROR :: Seller_id not found.")
    # Let's return seller_id, even if it's None. This will stop get_artwork_data_slave from gathering
    # data beyond the rule 3 check.
    return seller_id
def get_seller_id(self, seller_url) -> int:
    # Fetches seller_data, writes it in db, and returns seller_id.
    # bundle = [seller_url, self.website.platform, 'KAZoART', None, url]
    # print("GET SELLER ID")
    seller_id = None
    if seller_url is not None:
        if seller_url in SELLER_INFO.keys():
            seller_id = SELLER_INFO.get(seller_url)
            # print(seller_id)
            return seller_id
        else:
            # If the code reaches here then the entry for the seller doesn't already exist.
            # Let's call get_seller_data with seller_url.
            self.get_seller_data(seller_url)
            # Try to fetch seller data again.
            if seller_url in SELLER_INFO.keys():
                seller_id = SELLER_INFO.get(seller_url)
            # If it is not a url, get_seller_data will fail to make an entry. In that case we move to the next part.
            else:
                # Make a KAZoART-style bundle, and write it to obtain a seller_id.
                # bundle = [seller_url, platform, seller's name, location, website]
                bundle = [seller_url, self.website.platform, seller_url, None, seller_url]
                # Writing to db.
                TheAuthour.write_seller(*bundle)
                # This should generate the seller_id we so desperately desire.
                # time.sleep(1)
                if seller_url in SELLER_INFO.keys():
                    # This will always run, unless the program is failing unexpectedly.
                    seller_id = SELLER_INFO.get(seller_url)
                else:
                    print("FATAL ERROR :: Seller_id not found.")
    else:
        print("FATAL ERROR :: Seller_id not found.")
    # Let's return seller_id, even if it's None. This will stop get_artwork_data_slave from gathering
    # data beyond the rule 3 check.
    return seller_id
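# The get_seller_id variants above repeat the same cache-check / scrape / bare-bones-write pattern,
# differing only in the fallback seller name. A consolidation sketch, assuming the same SELLER_INFO
# cache, get_seller_data scraper and TheAuthour.write_seller interface; resolve_seller_id and
# fallback_name are hypothetical names.
def resolve_seller_id(self, seller_url, fallback_name):
    if seller_url is None:
        print("FATAL ERROR :: Seller_id not found.")
        return None
    if seller_url not in SELLER_INFO:
        # Try the platform-specific scrape first, as the variants above do.
        self.get_seller_data(seller_url)
    if seller_url not in SELLER_INFO:
        # Fall back to a bare-bones bundle so a row (and therefore an id) exists.
        TheAuthour.write_seller(seller_url, self.website.platform, fallback_name, None, seller_url)
    return SELLER_INFO.get(seller_url)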
def get_artwork_data_slave(self, url):
    soup = TheMiner.fetch_page(url)
    if soup is not None:
        # Field initiation :: Artwork_title, artist, price, seller_id :: (picked),
        # medium, type, dimension, frame, authenticity, about :: year, support, signature
        # artist_id, Image_loc = None
        seller_id = None
        artist = None
        artwork = None
        price = None
        # Material to be added to medium
        material = None
        medium = None  # (painting or sculpture)
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        artist_id = None
        image_loc = None
        year = None
        support = None
        signature = None
        seller_url = str(soup.find('div', class_='product-artist').a.get('href')).strip()
        # We want the code to break if this entry is not found so that we can fix it.
        # THE PAGE MUST HAVE A SELLER.
        # Seller_id
        if seller_url is not None:
            if seller_url in SELLER_INFO:
                seller_id = SELLER_INFO.get(seller_url)
                print(seller_id)
            else:
                self.get_seller_data(seller_url)
                if seller_url in SELLER_INFO:
                    seller_id = SELLER_INFO.get(seller_url)
                else:
                    if seller_id is None:
                        print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN SELLER_INFO. WE SHALL BREAK.")
        else:
            if seller_id is None:
                print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN SELLER_INFO. WE SHALL BREAK.")
        # Artist_id
        if seller_url is not None:
            if seller_url in ARTIST_INFO:
                artist_id = ARTIST_INFO.get(seller_url)
                print(artist_id)
            else:
                if artist_id is None:
                    print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN ARTIST_INFO. WE SHALL BREAK.")
        else:
            # If it ever comes to here, the page will not have a Seller/Artist.
            if artist_id is None:
                print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN ARTIST_INFO. WE SHALL BREAK.")
        A = soup.h1
        B = A.find('div', class_='product-artist')
        artist = str(B.a.text).strip()  # Artist
        print(artist)
        artwork = str(A.find('div', class_='product-name').text).strip()  # Artwork
        print(artwork)
        price = str(soup.find('div', class_='product-price').find('div', class_='p-price-container').text).strip()
        temp = ""
        for i in price:
            if i.isdigit():
                temp += i
        price = int(temp)  # Price
        print(price)
        product_details_desc = soup.find('div', class_='product-details_desc')
        product_details = product_details_desc.find_all('div', class_='tech-item')
        for detail in product_details:
            label = str(detail.find('div', class_='tech-label').text).strip().upper()
            value = str(detail.find('div', class_='tech-value').text).strip()
            print(label)
            print(value)
            if label == 'TECHNIQUE':
                medium = value
            elif label == 'TYPE':
                type_ = value
            elif label == 'MATERIAL':
                # We don't need material. Adding material to medium??
                material = value
            elif label == 'DIMENSIONS':
                dimensions = value
            elif label == 'FRAMING':
                frame = value
            elif label == 'QUALITY GUARANTEE':
                authenticity = value
            # If that label is not here, it'll throw errors.
            # elif label == ''
        try:
            about = str(product_details_desc.find('div', class_='desc text-1').text).strip()
        except AttributeError:
            about = None
        # If material is None, we don't add it to medium.
        if material is not None:
            # If medium is None, we make it a string before adding material to it.
            if medium is None:
                medium = ""
            else:
                medium += " "
            medium += material
        # self, artwork_title=None, artist_name=None, year=None, price=None, Dimensions=None, Medium=None,
        # Type=None, Support=None, Frame=None, Signature=None, Authenticity=None, About=None,
        # platform=None, image_addr=None, seller_id=None, artist_id=None)
        artwork_bundle = {"artwork_title": artwork, "artist_name": artist, "year": year, "price": price,
                          "Medium": medium, "Type": type_, "Dimensions": dimensions, "Support": support,
                          "Frame": frame, "Signature": signature, "Authenticity": authenticity, "About": about,
                          "image_addr": image_loc, "seller_id": seller_id, "artist_id": artist_id}
        self.write_artwork_data(**artwork_bundle)
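# A small helper sketch mirroring the inline digit-filtering loop above; parse_price is a
# hypothetical name. Like the original loop, it keeps digits only, so decimal separators are
# dropped before the int() conversion.
from typing import Optional


def parse_price(raw_price: str) -> Optional[int]:
    digits = "".join(ch for ch in raw_price if ch.isdigit())
    return int(digits) if digits else None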
def close_connection(mydb):
    SELLER_INFO.clear()
    mydb.close()
def get_artwork_data_slave(self, url, driver):
    driver.get(url)
    # Parse the rendered page with the built-in html.parser.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    if soup is not None:
        # Field initiation ::
        artwork = None
        price = None
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        artist_id = None
        image_loc = None
        year = None
        support = None
        signature = None
        # Material to be added to technique
        technique = ""
        seller_id = None
        artist = None
        medium = None
        # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
        # if "/painting/" in str(url):
        #     medium = "Painting"  # (painting or sculpture)
        # elif "/sculpture/" in str(url):
        #     medium = "Sculpture"
        # else:
        #     # So that url leaks don't break the code.
        #     medium = None
        # Seller_id
        try:
            seller_url = soup.find('div', class_='WncCi').find('a')['href']
            seller_id = self.get_seller_id(seller_url)
        except (AttributeError, TypeError):
            # Seller doesn't have a page.
            try:
                seller_url = soup.find('div', class_='WncCi').text.strip()
                if seller_url in SELLER_INFO.keys():
                    seller_id = SELLER_INFO.get(seller_url)
                else:
                    # Make a KAZoART-style bundle, and write it to obtain a seller_id.
                    # [seller_url, platform_id(from name), Seller's name, Location, website]
                    bundle = [seller_url, self.website.platform, 'EMERGINGARTISTPLATFOM', None, None]
                    # Writing to db.
                    TheAuthour.write_seller(*bundle)
                    # This should generate the seller_id we so desperately desire.
                    # time.sleep(1)
                    seller_id = SELLER_INFO.get(seller_url)
            except AttributeError:
                pass
        # We'll let the seller name be seller_url if the url is not found.
        # Artist_id
        try:
            artist_url = soup.find('div', class_='WncCi').a.get('href')
            if str(artist_url).endswith(".com"):
                artist_url = re.sub(r'\.com', "", artist_url)
                artist_url = re.sub('emergingartistplatform', 'emergingartistplatform.com', artist_url)
            artist_id = self.get_artist_id(artist_url)
        except AttributeError:
            try:
                artist_url = soup.find('div', class_='WncCi').text.strip()
                country = None
                a = soup.find_all('pre')
                for b in a:
                    if b.get('data-hook') == 'description':
                        p = b.find_all('p')
                        for j in p:
                            if 'Country' in j.text or 'country' in j.text or 'COUNTRY' in j.text:
                                title = j.text.split(":")
                                country = title[-1].strip()
                # artist_data_pack = [name, born, country, about]
                artist_data_pack = [artist_url, None, country, None]
                # Updating KEY_INFO dictionary.
                KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
                key = KEY_INFO.get(artist_url)
                # Updating the db with artist listings.
                TheAuthour.write_artist(*artist_data_pack)
                artist_id = ARTIST_INFO[key]
            except AttributeError:
                artist_id = None
        # Continue fetching data only if seller_id, artist_id and medium are found. (RULE :: 3, 4)
        if seller_id is not None and artist_id is not None:
            try:
                a = soup.find_all('span')
                t = ""
                for b in a:
                    if b.get('data-hook') == "formatted-primary-price":
                        # print(b.text)
                        for p in b.text:
                            if str(p).isnumeric() or str(p) == ".":
                                t += p
                price = float(t) * rate  # Price
                # print(price)
            except AttributeError:
                price = None
            except ValueError:
                price = None
            # RULE : 5
            if price is not None:
                # Find artist, artwork, year, type_ (N/A), dimensions, support, frame, signature,
                # authenticity, about, image_loc (actual url of the image), and technique.
                # Wish the code to break if either Artist's name or Artwork's name are not found.
                # Artist
                artist = soup.find('div', class_='WncCi').text.strip()
                # print(artist)
                # Artwork
                a = soup.find_all('pre')
                for b in a:
                    if b.get('data-hook') == 'description':
                        p = b.find_all('p')
                        for j in p:
                            if 'Title' in j.text or 'title' in j.text or 'TITLE' in j.text:
                                title = j.text.split(":")
                                artwork = title[-1].strip()
                                if len(artwork) >= 255:
                                    artwork = artwork[0:255]
                                # print(artwork)
                            if 'Date' in j.text:
                                date = j.text.split(":")
                                year = date[-1].strip()
                                # print(year)
                            if 'Size' in j.text:
                                dimensions = j.text.split(":")
                                dimensions = dimensions[-1].strip()
                                # print(dimensions)
                            if 'Medium' in j.text:
                                technique = j.text.split(":")
                                technique = technique[-1].strip()
                                # print(technique)
                            if len(j.text.split(":")) == 1 and about is None:
                                about = j.text.strip()
                # Medium (RULE : 3)
                if "Sculptures" in self.website.start_url:
                    medium = "Sculpture"
                else:
                    medium = "Painting"
                # image_loc
                image = soup.find('div', class_='main-media-image-wrapper-hook')
                image = image.find('div', id='get-image-item-id')
                image_loc = image.get('href')
                # print(image_loc)
                artwork_bundle = {
                    "artwork_title": artwork,
                    "artist_name": artist,
                    "year": year,
                    "price": price,
                    "Medium": medium,
                    "Type": type_,
                    "Dimensions": dimensions,
                    "Support": support,
                    "Frame": frame,
                    "Signature": signature,
                    "Authenticity": authenticity,
                    "About": about,
                    "platform": self.website.platform,
                    "image_addr": image_loc,
                    "seller_id": seller_id,
                    "artist_id": artist_id,
                    "url": url,
                    "technique": technique
                }
                TheAuthour.write_artwork_price_image(**artwork_bundle)
            else:
                print(f"Skipping {url}\n PRICE : {price}")
        else:
            print(f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}")
    else:
        print(f"Soup not returned for {url}")
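# A helper sketch for the "Key: Value" description parsing done in the loop above; parse_description
# is a hypothetical name. It collects every tagged line into a dict (keys such as title, date, size
# and medium mirror the checks above) and joins the untagged lines into an "about" string.
def parse_description(soup):
    fields, about_lines = {}, []
    for pre in soup.find_all('pre'):
        if pre.get('data-hook') != 'description':
            continue
        for p in pre.find_all('p'):
            text = p.text.strip()
            if ':' in text:
                key, _, value = text.partition(':')
                fields[key.strip().lower()] = value.strip()
            elif text:
                about_lines.append(text)
    return fields, (" ".join(about_lines) or None)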