def get_artist_data(self, soup, url):
    """Read one artist's details from a parsed page and record them.

    Called by self.get_artwork_listings_slave().

    The resulting pack is ``[name, born, country, about]``; ``born`` is not
    available on this page layout and is always ``None``.

    Args:
        soup: Parsed artist page exposing a BeautifulSoup-style ``find``.
        url: Artist page URL; used as the key written into ``KEY_INFO``.

    Raises:
        AttributeError: If the ``artist-resume`` block or its ``h1`` is
            missing. This is deliberate: it signals that the page layout
            has changed and the selectors need fixing.
    """
    artist_resume = soup.find('div', class_='artist-resume').find(
        'div', class_='artist-resume_text')

    # No guard here on purpose: a missing name must break loudly.
    name = artist_resume.h1.text.strip()
    print(name)

    # Country is the last comma-separated token on the first line of the
    # location paragraph.
    try:
        location_lines = artist_resume.find(
            'p', class_='location').text.strip().split('\n')
        country = location_lines[0].split(',')[-1].strip()
        print(country)
    except AttributeError:
        country = None

    # About is either the text of the 'about' block or None. The lookup was
    # previously unguarded and raised AttributeError when the block was
    # absent.
    about_block = soup.find('div', id='about')
    about = about_block.text.strip() if about_block is not None else None

    # pack = [name, born, country, about]
    artist_data_pack = [name, None, country, about]
    KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
    TheAuthour.write_artist(*artist_data_pack)
def get_artist_data(self, soup, url):
    """Template for a site-specific artist scraper.

    Called by self.get_artwork_listings_slave().

    Each "Pick ... here" comment marks where site-specific extraction code
    belongs. Until those sections are filled in, every field stays ``None``
    and nothing is written, so the unfinished template no longer fails with
    NameError on the first ``print``.

    Args:
        soup: Parsed artist page.
        url: Artist page URL; used as the key written into ``KEY_INFO``.
    """
    # pack = [name, born, country, about]
    name = None
    born = None
    country = None
    about = None

    # Name : Pick artist's name here
    print(name)

    # If an error occurs here, it's because the page layout has changed and
    # thus the code needs to be fixed.
    if name is not None:
        try:
            # Pick artist's country here.
            print(country)
        except AttributeError:
            country = None

        try:
            # Pick birth year here.
            print(born)
        except AttributeError:
            born = None

        try:
            # Pick artist's description here.
            print(about)
        except AttributeError:
            about = None

        artist_data_pack = [name, born, country, about]
        # Updating KEY_INFO dictionary.
        KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
        # Updating the dB with artist listings.
        TheAuthour.write_artist(*artist_data_pack)
def key_maker(artist_url):
    """Scrape an artist page in a headless Firefox and register the artist.

    Args:
        artist_url: URL of the artist page. It is discarded from ``visited``
            and used as the key written into ``KEY_INFO``.

    Returns:
        The value stored in ``KEY_INFO`` for ``artist_url``, or ``None``
        when no soup is produced or no artist name heading is found.
    """
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    # try/finally so the browser is closed on every exit path, including
    # the soup-is-None path and unexpected exceptions.
    try:
        visited.discard(artist_url)
        # Load the page before reading page_source, as
        # get_artwork_data_slave does; without this the freshly started
        # browser has nothing loaded.
        driver.get(artist_url)
        # NOTE(review): the second argument is the URL, as elsewhere in this
        # file; confirm which BeautifulSoup callable is in scope here.
        soup = BeautifulSoup(driver.page_source, artist_url)
        if soup is None:
            return None

        # The first 'font_2' heading is the name, the second the country.
        headings = soup.find_all('h2', class_='font_2')

        try:
            name = headings[0].text.strip()
        except IndexError:
            print(headings)
            name = None
        # If the name is missing the page layout has probably changed and
        # the selectors need fixing.
        if name is None:
            return None

        try:
            country = headings[1].text.strip()
        except (AttributeError, IndexError):
            # IndexError: the page has a name heading but no country heading.
            country = None

        # About: every 'font_8' paragraph, each followed by a single space.
        try:
            about = "".join(
                paragraph.text.strip() + " "
                for paragraph in soup.find_all('p', class_='font_8'))
        except (AttributeError, TypeError):
            about = None

        # pack = [name, born, country, about]
        artist_data_pack = [name, None, country, about]
        # Updating KEY_INFO dictionary.
        KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
        key = KEY_INFO.get(artist_url)
        # Updating the dB with artist listings.
        TheAuthour.write_artist(*artist_data_pack)
        return key
    finally:
        driver.quit()
def get_artist_data(self, soup, url):
    """Read name, birth year, country and biography from an artist page.

    Called by self.get_artwork_listings_slave().

    Args:
        soup: Parsed artist page exposing a BeautifulSoup-style ``find``.
        url: Artist page URL; used as the key written into ``KEY_INFO``.

    Raises:
        AttributeError: If the ``biography`` block or its ``h1`` is missing.
            This is deliberate: the code should break if the name goes
            missing.
    """
    biography = soup.find('div', id='biography')

    # Artist's name: no guard on purpose.
    name = biography.h1.text.strip()

    # Born and country both live in the sub-title block.
    sub_title = biography.find('div', class_='sub-title col-sm-9 col-xs-12')

    # Born: every digit found in the birthday span, read as one integer.
    try:
        birthday = sub_title.find('span', class_='birthday-date').text
        born = int("".join(ch for ch in birthday if ch.isdigit()))
    except (AttributeError, ValueError):
        # ValueError: the span exists but holds no digits, so int('') fails.
        born = None

    # Country: text of the first span in the sub-title block.
    try:
        country = sub_title.span.text.strip()
    except AttributeError:
        country = None

    # About: the biography text split on the separator and re-joined with
    # newlines.
    try:
        raw_about = biography.find(
            'div', class_='col-sm-9 col-xs-12 biography').text.strip()
        pieces = raw_about.split(" ")
        # NOTE(review): the final piece is left out, exactly as before;
        # confirm that dropping it is intended.
        about = "\n".join(piece.strip() for piece in pieces[:-1]).strip()
    except AttributeError:
        about = None

    # pack = [name, born, country, about]
    artist_data_pack = [name, born, country, about]
    KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
    TheAuthour.write_artist(*artist_data_pack)
def get_artist_data(self, soup, url):
    """Read name, birth year, country and biography from an artist page.

    Nothing is written when the artist's name cannot be found. The page is
    not fetched here, so no fetch-related safety handling is needed.

    Args:
        soup: Parsed artist page exposing a BeautifulSoup-style ``find``.
        url: Artist page URL; used as the key written into ``KEY_INFO``.
    """
    try:
        name = soup.find('div', class_='artist-intro').find('h1').text
        name = str(name).strip()
    except AttributeError:
        name = None

    if name is None:
        return

    # Born: collect every digit in the 'born' paragraph.
    try:
        born_text = soup.find('p', class_='born').text.strip()
        born = int("".join(ch for ch in born_text if ch.isdigit()))
        if born > 3000:
            # More digits than a single year (presumably several years run
            # together): keep the leading four as an integer year. This
            # previously kept only three characters and left a str behind.
            born = int(str(born)[:4])
    except (AttributeError, ValueError):
        born = None

    # Country: last '|'-separated token of the intro sub-heading.
    try:
        intro = soup.find('div', class_="artist-intro")
        heading_parts = intro.find('div', class_='h2').text.strip().split("|")
        country = str(heading_parts[-1]).strip()
    except AttributeError:
        country = None

    # About
    try:
        bio_section = soup.find('section', class_='artist-bio')
        about = bio_section.find('div', class_='resume').text.strip()
    except AttributeError:
        about = None

    # pack = [name, born, country, about]
    artist_data_pack = [name, born, country, about]
    # Updating KEY_INFO dictionary.
    KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
    # Updating the dB with artist listings.
    TheAuthour.write_artist(*artist_data_pack)
def get_artist_data(self, soup, url):
    """Read name, birth year, country and bio from an artist page.

    Called by self.get_artwork_listings_slave().

    The page gives a nationality adjective ("French", "Dutch", ...) rather
    than a country, so known adjectives are mapped to country names.

    Args:
        soup: Parsed artist page exposing BeautifulSoup-style ``find`` and
            ``find_all``.
        url: Artist page URL; used as the key written into ``KEY_INFO``.

    Raises:
        AttributeError: If the page has no ``h1``. This is deliberate: it
            signals that the page layout has changed.
    """
    # Matched by equality only.
    exact_nationalities = {"American": "USA", "Japanese": "Japan"}
    # Matched by substring; the first hit wins, so order matters.
    nationality_markers = (
        ("French", "France"),
        ("Argentine", "Argentina"),
        ("Dutch", "Netherlands"),
        ("Indian", "India"),
        ("Pakistani", "Pakistan"),
        ("Italian", "Italy"),
        ("English", "UK"),
        ("Chinese", "China"),
        ("Hispanic", "Spain"),
        ("German", "Germany"),
        ("Spanish", "Spain"),
        ("Russian", "Russia"),
        ("British", "UK"),
        ("Mexican", "Mexico"),
        ("Brazilian", "Brazil"),
        ("Canadian", "Canada"),
        ("Belgian", "Belgium"),
        ("Israeli", "Israel"),
        ("Venezuelan", "Venezuela"),
        ("Polish", "Poland"),
    )

    grid_cells = soup.find_all(
        'div',
        class_=re.compile(
            r'Box-sc-15se88d-0 GridColumns__Cell-sc-1g9p6xx-1\.*'))

    # Name: no guard on purpose.
    name = soup.find('h1').text.strip()

    try:
        # The second grid cell's h2 is comma-separated: the first token is
        # used for the nationality and the last for the birth year.
        heading_parts = grid_cells[1].find('h2').text.strip().split(",")

        country = heading_parts[0].strip()
        if country in exact_nationalities:
            country = exact_nationalities[country]
        else:
            for marker, mapped in nationality_markers:
                if marker in country:
                    country = mapped
                    break
            else:
                # Unknown token: if it contains digits it is presumably a
                # date rather than a nationality, so discard it.
                if any(ch.isnumeric() for ch in country):
                    country = None

        try:
            # Keep only the part before the en dash, then its digits.
            first_year = str(heading_parts[-1]).strip().split("–")[0]
            born = int("".join(ch for ch in first_year if ch.isnumeric()))
        except ValueError:
            born = None
    except (AttributeError, IndexError):
        # IndexError: fewer than two grid cells matched.
        # AttributeError: the cell has no h2.
        born = None
        country = None

    # About: the sibling that follows the block whose text is exactly 'Bio'.
    try:
        about = None
        text_blocks = soup.find_all(
            'div',
            class_=re.compile(r'Box-sc-15se88d-0 Text-sc-18gcpao-0\.*'))
        for block in text_blocks:
            if block.text.strip() == 'Bio':
                about = block.nextSibling.text.strip()
                break
    except AttributeError:
        about = None

    # pack = [name, born, country, about]
    artist_data_pack = [name, born, country, about]
    # Updating KEY_INFO dictionary.
    KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
    # Updating the dB with artist listings.
    TheAuthour.write_artist(*artist_data_pack)
def get_artwork_data_slave(self, url, driver):
    """Scrape one artwork page and write the artwork record.

    The record is written only when a seller id, an artist id and a price
    are all found; otherwise a "Skipping" message is printed.

    Args:
        url: Artwork page URL.
        driver: Browser driver used to load the page; it must provide
            ``get`` and ``page_source``.

    Raises:
        AttributeError: If the artist block or the main image wrapper is
            missing once seller, artist and price have been resolved. The
            code is meant to break if the artist's name is not found.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, url)
    if soup is None:
        print(f"Soup not returned for {url}")
        return

    # Field initiation. Fields that this site does not provide stay None.
    artwork = None
    price = None
    type_ = None
    dimensions = None
    frame = None
    authenticity = None
    about = None
    artist_id = None
    year = None
    support = None
    signature = None
    technique = ""
    seller_id = None
    medium = None

    # Seller_id
    try:
        seller_url = soup.find('div', class_='WncCi').find('a')['href']
        seller_id = self.get_seller_id(seller_url)
    except (AttributeError, TypeError):
        # Seller doesn't have a page. TypeError is the no-link case
        # (subscripting None); `except AttributeError or TypeError` only
        # ever caught AttributeError.
        try:
            # Let the seller's name stand in for the missing URL.
            seller_url = soup.find('div', class_='WncCi').text.strip()
            if seller_url in SELLER_INFO:
                seller_id = SELLER_INFO.get(seller_url)
            else:
                # [seller_url, platform_id, seller's name, location, website]
                bundle = [
                    seller_url, self.website.platform,
                    'EMERGINGARTISTPLATFOM', None, None
                ]
                # Writing to the db should populate SELLER_INFO with the id.
                TheAuthour.write_seller(*bundle)
                seller_id = SELLER_INFO.get(seller_url)
        except AttributeError:
            pass

    # Artist_id
    try:
        artist_url = soup.find('div', class_='WncCi').a.get('href')
        if str(artist_url).endswith(".com"):
            # Drop every literal ".com", then restore it on the site domain.
            # The old regex '.com' let the dot match any character, so text
            # such as "lcom" was removed as well.
            artist_url = artist_url.replace('.com', "")
            artist_url = artist_url.replace(
                'emergingartistplatform', 'emergingartistplatform.com')
        artist_id = self.get_artist_id(artist_url)
    except AttributeError:
        # No artist link: register the artist under the displayed name.
        try:
            artist_url = soup.find('div', class_='WncCi').text.strip()
            country = None
            for block in soup.find_all('pre'):
                if block.get('data-hook') == 'description':
                    for para in block.find_all('p'):
                        text = para.text
                        if ('Country' in text or 'country' in text
                                or 'COUNTRY' in text):
                            country = text.split(":")[-1].strip()

            # pack = [name, born, country, about]
            artist_data_pack = [artist_url, None, country, None]
            # Updating KEY_INFO dictionary.
            KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
            key = KEY_INFO.get(artist_url)
            # Updating the dB with artist listings.
            TheAuthour.write_artist(*artist_data_pack)
            artist_id = ARTIST_INFO[key]
        except AttributeError:
            artist_id = None

    # Continue only if both seller_id and artist_id are found. (RULE :: 3, 4)
    if seller_id is None or artist_id is None:
        print(
            f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}"
        )
        return

    # Price: digits and dots of the formatted price, scaled by `rate`.
    try:
        digits = ""
        for span in soup.find_all('span'):
            if span.get('data-hook') == "formatted-primary-price":
                for ch in span.text:
                    if str(ch).isnumeric() or str(ch) == ".":
                        digits += ch
        # An empty string raises ValueError here and leaves price as None.
        price = float(digits) * rate
    except (AttributeError, ValueError):
        price = None

    # RULE : 5
    if price is None:
        print(f"Skipping {url}\n PRICE : {price}")
        return

    # Artist: no guard on purpose.
    artist = soup.find('div', class_='WncCi').text.strip()

    # Artwork details come from "Label: value" paragraphs in the description.
    for block in soup.find_all('pre'):
        if block.get('data-hook') == 'description':
            for para in block.find_all('p'):
                text = para.text
                if 'Title' in text or 'title' in text or 'TITLE' in text:
                    artwork = text.split(":")[-1].strip()
                    if len(artwork) >= 255:
                        artwork = artwork[0:255]
                if 'Date' in text:
                    year = text.split(":")[-1].strip()
                if 'Size' in text:
                    dimensions = text.split(":")[-1].strip()
                if 'Medium' in text:
                    technique = text.split(":")[-1].strip()
                if len(text.split(":")) == 1 and about is None:
                    # First non-empty paragraph without a label is the
                    # description. The old `text[-1]` kept only the last
                    # character and raised IndexError on an empty paragraph.
                    stripped = text.strip()
                    if stripped:
                        about = stripped

    # Medium (RULE : 3)
    if "Sculptures" in self.website.start_url:
        medium = "Sculpture"
    else:
        medium = "Painting"

    # image_loc
    image = soup.find('div', class_='main-media-image-wrapper-hook')
    image = image.find('div', id='get-image-item-id')
    image_loc = image.get('href')

    artwork_bundle = {
        "artwork_title": artwork,
        "artist_name": artist,
        "year": year,
        "price": price,
        "Medium": medium,
        "Type": type_,
        "Dimensions": dimensions,
        "Support": support,
        "Frame": frame,
        "Signature": signature,
        "Authenticity": authenticity,
        "About": about,
        "platform": self.website.platform,
        "image_addr": image_loc,
        "seller_id": seller_id,
        "artist_id": artist_id,
        "url": url,
        "technique": technique
    }
    TheAuthour.write_artwork_price_image(**artwork_bundle)