def get_artist_id(self, artist_url):
    """Return the artist id associated with ``artist_url``.

    A URL already present in KEY_INFO is resolved through ARTIST_INFO, and
    ``None`` is returned when its key has no entry there. A URL that is not
    in KEY_INFO is handed to ``self.artist_id_slave`` and that call's result
    is returned unchanged.
    """
    # NOTE(review): the source formatting was flattened; this assumes the
    # original ``else`` pairs with the KEY_INFO membership test - confirm.
    if artist_url not in KEY_INFO:
        # Unknown URL: the slave visits the artist page and returns the id.
        return self.artist_id_slave(artist_url)

    artist_key = KEY_INFO.get(artist_url)
    if artist_key in ARTIST_INFO:
        return ARTIST_INFO.get(artist_key)
    return None
def get_artist_id(self, artist_url):
    """Return the artist id for ``artist_url``, or ``None`` if it is unknown.

    The cached mappings are consulted first (KEY_INFO for the key, then
    ARTIST_INFO for the id). When they do not yield an id,
    ``self.key_maker`` is asked for a key and ARTIST_INFO is consulted
    again. A message is printed on each failure path.
    """
    # NOTE(review): the source formatting was flattened; this assumes the
    # first ``return`` sat inside the inner ARTIST_INFO check (mirroring the
    # second lookup below), so a cache miss falls through to key_maker.
    if artist_url in KEY_INFO:
        cached_key = KEY_INFO[artist_url]
        if cached_key in ARTIST_INFO:
            return ARTIST_INFO[cached_key]

    # Per the original author's note, key_maker also writes the artist data
    # to the db, which keeps this method simple.
    key = self.key_maker(artist_url)

    if key is None or artist_url is None:
        # Reaching this point means the page did not yield an artist.
        print("FATAL ERROR :: Artist_id not found. Artist_url broken")
        return None

    if key not in ARTIST_INFO:
        print("FATAL ERROR :: Artist_id not found.")
        # None tells the caller not to pick up the rest of the data.
        return None

    return ARTIST_INFO.get(key)
def key_maker(artist_url):
    """Scrape an artist page, register the artist and return its key.

    The page is loaded in a headless Firefox session, the artist's name,
    country and "about" text are read from it, the key produced by
    ``db.Artist.key_maker`` is stored in ``KEY_INFO[artist_url]`` and the
    artist is written through ``TheAuthour.write_artist``.

    Args:
        artist_url: Address of the artist page.

    Returns:
        The key stored in KEY_INFO for ``artist_url``, or ``None`` when the
        URL is ``None`` or no artist name could be found on the page.
    """
    # NOTE(review): there is no ``self`` parameter although callers use
    # ``self.key_maker(...)``; presumably a @staticmethod decorator sits
    # outside this view - confirm.
    visited.discard(artist_url)
    if artist_url is None:
        # Callers test for a None URL after the call; avoid starting a
        # browser for it.
        return None

    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    try:
        # The page has to be loaded before page_source is read; the original
        # never navigated to artist_url.
        driver.get(artist_url)
        # The second argument of BeautifulSoup selects the parser, so the
        # URL must not be passed there.
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always release the browser, including when loading fails.
        driver.quit()

    headings = soup.find_all('h2', class_='font_2')

    # Artist's name. If this fails the page layout has probably changed and
    # the selectors need to be fixed.
    try:
        name = headings[0].text.strip()
    except IndexError:
        print(headings)
        return None

    # Country: a page with a single matching heading raises IndexError here.
    try:
        country = headings[1].text.strip()
    except (AttributeError, IndexError):
        country = None

    # About: every matching paragraph, stripped, each followed by a space.
    try:
        about = "".join(
            f"{paragraph.text.strip()} "
            for paragraph in soup.find_all('p', class_='font_8')
        )
    except (AttributeError, TypeError):
        about = None

    # pack = [name, born, country, about]; born is not collected here.
    artist_data_pack = [name, None, country, about]

    # Updating KEY_INFO dictionary.
    KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
    key = KEY_INFO.get(artist_url)

    # Updating the dB with artist listings.
    TheAuthour.write_artist(*artist_data_pack)
    return key
def artist_id_slave(self, artist_url):
    """Fetch the artist page and return the artist id, or ``None``.

    ``artist_url`` is discarded from ``visited``, the page is fetched with
    ``TheMiner.fetch_page`` and passed to ``self.get_artist_data``. The id
    is then looked up via KEY_INFO and ARTIST_INFO. Each failure path prints
    a message and returns ``None``.
    """
    visited.discard(artist_url)
    page = TheMiner.fetch_page(artist_url)
    if page is None:
        print("ARTIST_ID_SLAVE : Soup not returned")
        return None

    # Presumably this populates KEY_INFO / ARTIST_INFO for the artist;
    # the lookups below depend on it - verify against get_artist_data.
    self.get_artist_data(page, artist_url)

    # Getting the key from KEY_INFO.
    if artist_url not in KEY_INFO:
        print("ARTIST_ID_SLAVE : Could not find artist_id")
        return None
    artist_key = KEY_INFO.get(artist_url)

    # Getting artist_id using the key from ARTIST_INFO.
    if artist_key not in ARTIST_INFO:
        print("ARTIST_ID_SLAVE : Artist id not in ARTIST_INFO")
        return None
    return ARTIST_INFO.get(artist_key)
def get_artist_id(self, artist_url):
    """Return the artist id for ``artist_url``, or ``None`` if not found.

    A URL already present in KEY_INFO is resolved through ARTIST_INFO.
    Otherwise ``self.key_maker`` is asked for a key, which is then looked up
    in ARTIST_INFO. A message is printed whenever no id can be determined.

    Args:
        artist_url: Address of the artist page.

    Returns:
        The artist id, or ``None`` so that the caller can skip the rest of
        the data (the id is used in the artworks table only).
    """
    if artist_url in KEY_INFO:
        # A cached key without an ARTIST_INFO entry used to raise KeyError;
        # it now yields None, as on every other failure path.
        artist_id = ARTIST_INFO.get(KEY_INFO[artist_url])
        if artist_id is None:
            print("FATAL ERROR :: Artist_id not found.")
        return artist_id

    key = self.key_maker(artist_url)
    if key is None or artist_url is None:
        # Reaching this point means the page did not yield an artist.
        print("FATAL ERROR :: Artist_id not found. Artist_url broken")
        return None

    if key not in ARTIST_INFO:
        print("FATAL ERROR :: Artist_id not found.")
        return None
    return ARTIST_INFO.get(key)
def get_artwork_data_slave(self, url, driver):
    """Scrape one artwork page and write its record through TheAuthour.

    The page at ``url`` is loaded with ``driver`` and parsed. The seller id
    and the artist id are resolved first; the page is skipped (with a
    printed message) when either is missing. The price is parsed next and
    the page is skipped when it cannot be parsed. Otherwise the artwork
    fields are collected from the description block and passed to
    ``TheAuthour.write_artwork_price_image``.

    Args:
        url: Address of the artwork page.
        driver: Browser driver providing ``get`` and ``page_source``.

    Returns:
        None.
    """
    driver.get(url)
    # The second argument of BeautifulSoup selects the parser, so the URL
    # must not be passed there.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    if soup is None:
        print(f"Soup not returned for {url}")
        return

    # Fields that this page never provides; they are written as None.
    type_ = None
    frame = None
    authenticity = None
    support = None
    signature = None
    # Fields filled in from the description block further down.
    artwork = None
    year = None
    dimensions = None
    about = None
    technique = ""
    # Only assigned once the price is known; reported as None in the skip
    # message below.
    medium = None

    # Seller_id
    seller_id = None
    try:
        seller_url = soup.find('div', class_='WncCi').find('a')['href']
        seller_id = self.get_seller_id(seller_url)
    except (AttributeError, TypeError):
        # Seller doesn't have a page. Subscripting a missing <a> raises
        # TypeError, which ``except AttributeError or TypeError`` never
        # caught (that expression evaluates to AttributeError alone).
        try:
            # The seller's name stands in for the seller url.
            seller_url = soup.find('div', class_='WncCi').text.strip()
            if seller_url not in SELLER_INFO:
                # Make a Kazoart style bundle and write it to obtain a
                # seller_id:
                # [seller_url, platform_id, seller's name, location, website]
                bundle = [
                    seller_url,
                    self.website.platform,
                    'EMERGINGARTISTPLATFOM',
                    None,
                    None,
                ]
                # Writing to db; presumably this registers the seller in
                # SELLER_INFO - verify against write_seller.
                TheAuthour.write_seller(*bundle)
            seller_id = SELLER_INFO.get(seller_url)
        except AttributeError:
            # No seller block on the page: seller_id stays None and the
            # page is skipped below.
            pass

    # Artist_id
    artist_id = None
    try:
        artist_url = soup.find('div', class_='WncCi').a.get('href')
        if str(artist_url).endswith(".com"):
            # Literal replacement: the former regex '.com' also matched any
            # character followed by "com".
            artist_url = artist_url.replace('.com', "")
            artist_url = artist_url.replace(
                'emergingartistplatform', 'emergingartistplatform.com')
        artist_id = self.get_artist_id(artist_url)
    except AttributeError:
        # No artist link: register the artist from the data on this page.
        try:
            artist_url = soup.find('div', class_='WncCi').text.strip()
            country = None
            for block in soup.find_all('pre'):
                if block.get('data-hook') != 'description':
                    continue
                for paragraph in block.find_all('p'):
                    text = paragraph.text
                    if any(tag in text
                           for tag in ('Country', 'country', 'COUNTRY')):
                        country = text.split(":")[-1].strip()

            # pack = [name, born, country, about]
            artist_data_pack = [artist_url, None, country, None]
            # Updating KEY_INFO dictionary.
            KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
            key = KEY_INFO.get(artist_url)
            # Updating the dB with artist listings.
            TheAuthour.write_artist(*artist_data_pack)
            # .get keeps a missing entry from raising KeyError; the page is
            # then skipped below like any other missing artist_id.
            artist_id = ARTIST_INFO.get(key)
        except AttributeError:
            artist_id = None

    # Continue fetching data only if seller_id and artist_id are found.
    # (RULE :: 3, 4)
    if seller_id is None or artist_id is None:
        print(
            f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}"
        )
        return

    # Price: keep only digits and dots from the formatted price, then scale
    # by the module-level ``rate`` (presumably a currency conversion factor).
    try:
        digits = "".join(
            char
            for span in soup.find_all('span')
            if span.get('data-hook') == "formatted-primary-price"
            for char in span.text
            if str(char).isnumeric() or str(char) == "."
        )
        price = float(digits) * rate
    except (AttributeError, ValueError):
        price = None

    # RULE : 5
    if price is None:
        print(f"Skipping {url}\n PRICE : {price}")
        return

    # Artist. Not guarded: the original author wanted the code to break when
    # the artist's name is missing.
    artist = soup.find('div', class_='WncCi').text.strip()

    # Artwork, year, dimensions, technique and about from the description.
    for block in soup.find_all('pre'):
        if block.get('data-hook') != 'description':
            continue
        for paragraph in block.find_all('p'):
            text = paragraph.text
            parts = text.split(":")
            value = parts[-1].strip()
            if any(tag in text for tag in ('Title', 'title', 'TITLE')):
                # Titles are capped at 255 characters.
                artwork = value[:255]
            if 'Date' in text:
                year = value
            if 'Size' in text:
                dimensions = value
            if 'Medium' in text:
                technique = value
            if len(parts) == 1 and about is None:
                # A paragraph without a colon is free text. The original
                # kept only its last character (``text[-1]``).
                stripped = text.strip()
                if stripped:
                    about = stripped

    # Medium (RULE : 3)
    if "Sculptures" in self.website.start_url:
        medium = "Sculpture"
    else:
        medium = "Painting"

    # image_loc
    image = soup.find('div', class_='main-media-image-wrapper-hook')
    image = image.find('div', id='get-image-item-id')
    image_loc = image.get('href')

    artwork_bundle = {
        "artwork_title": artwork,
        "artist_name": artist,
        "year": year,
        "price": price,
        "Medium": medium,
        "Type": type_,
        "Dimensions": dimensions,
        "Support": support,
        "Frame": frame,
        "Signature": signature,
        "Authenticity": authenticity,
        "About": about,
        "platform": self.website.platform,
        "image_addr": image_loc,
        "seller_id": seller_id,
        "artist_id": artist_id,
        "url": url,
        "technique": technique,
    }
    TheAuthour.write_artwork_price_image(**artwork_bundle)