def get_seller_data(self, url):
    # Caller :: get_artwork_data_slave and get_seller_id
    # Skeleton version: the platform-specific parsing that assigns seller_name, location and
    # website belongs where the print() placeholders stand below.
    visited.discard(url)
    soup = TheMiner.fetch_page(url)
    if soup is not None:
        # Seller's Name
        print(seller_name)  # Code will break if seller's name is not found

        # Location
        try:
            print(location)
        except AttributeError:
            location = None

        # Website
        try:
            print(website)
        except AttributeError:
            website = None
        except TypeError:
            website = None

        bundle = [url, self.website.platform, seller_name, location, website]
        print(bundle)
        TheAuthour.write_seller(*bundle)
def get_seller_data(self, url):
    # Caller :: get_artwork_data_slave
    visited.discard(url)
    soup = TheMiner.fetch_page(url)
    # print("A")
    if soup is not None:
        # print("B")
        # Seller's Name
        A = soup.find('div', id='top-seller')
        seller_name = A.h1.text.strip()
        # print(seller_name)
        # Code will break if seller's name is not found

        # Location
        try:
            location = A.find('p', class_="subtitle").text.strip().split(',')
            location = location[-1].strip()
            # print(location)
        except AttributeError:
            location = None

        # Website
        try:
            website = str(soup.find('ul', id="websites").a['href']).strip()
            # print(website)
        except AttributeError:
            website = None
        except TypeError:
            website = None

        bundle = [url, self.website.platform, seller_name, location, website]
        # print(bundle)
        TheAuthour.write_seller(*bundle)
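# The methods in this section treat TheMiner.fetch_page(url) as returning a BeautifulSoup
# object on success and None on failure. The real implementation is not shown here; the
# sketch below is only an assumed version of that contract, built on requests + BeautifulSoup.
# The User-Agent string and timeout are illustrative choices, not taken from the source.
import requests
from bs4 import BeautifulSoup


class TheMiner:

    @staticmethod
    def fetch_page(url):
        # Return parsed HTML on a 200 response, None otherwise, so callers can
        # guard with `if soup is not None:` as the scrapers in this section do.
        try:
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        return BeautifulSoup(response.text, 'html.parser')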
def get_seller_id(self, seller_url) -> int:
    # Fetches seller_data, writes it in db, and returns seller_id.
    # bundle = [seller_url, self.website.platform, 'KAZoART', None, url]
    seller_id = None
    if seller_url is not None:
        if seller_url in SELLER_INFO.keys():
            seller_id = SELLER_INFO.get(seller_url)
            # print(seller_id)
        else:
            # If code reaches here then the entry for the seller doesn't already exist.
            # Let's call get_seller_data again with seller_url.
            self.get_seller_data(seller_url)
            # Wait for a second to make sure that the transaction is smooth. Activate this line if errors are thrown.
            # time.sleep(1)
            # Try to fetch seller data again.
            if seller_url in SELLER_INFO.keys():
                seller_id = SELLER_INFO.get(seller_url)
            else:
                # Make a KAZoART-style bundle, and write it to obtain a seller_id.
                bundle = [seller_url, self.website.platform, 'BAREBONES', None, seller_url]
                # Writing to db. This should generate the seller_id we so desperately desire.
                TheAuthour.write_seller(*bundle)
                # time.sleep(1)
                if seller_url in SELLER_INFO.keys():
                    seller_id = SELLER_INFO.get(seller_url)
                else:
                    print("FATAL ERROR :: Seller_id not found.")
    else:
        print("FATAL ERROR :: Seller_id not found.")
    # Return seller_id, even if it's None. This will stop get_artwork_data_slave from gathering
    # data beyond the rule 3 check.
    return seller_id
def get_seller_id(self, seller_url) -> int:
    # Fetches seller_data, writes it in db, and returns seller_id.
    # bundle = [seller_url, self.website.platform, 'KAZoART', None, url]
    seller_id = None
    if seller_url is not None:
        if seller_url in SELLER_INFO.keys():
            seller_id = SELLER_INFO.get(seller_url)
            # print(seller_id)
        else:
            # Process and create the bundle here.
            bundle = [seller_url, self.website.platform, 'KAZoART', None, seller_url]
            # Writing to db.
            TheAuthour.write_seller(*bundle)
            if seller_url in SELLER_INFO.keys():
                seller_id = SELLER_INFO.get(seller_url)
            else:
                print("FATAL ERROR :: Seller_id not found.")
    else:
        print("FATAL ERROR :: Seller_id not found.")
    # Let's return seller_id, even if it's None.
    return seller_id
def get_seller_id(self, seller_url) -> int:
    # Fetches seller_data, writes it in db, and returns seller_id.
    # bundle = [seller_url, self.website.platform, 'KAZoART', None, url]
    # print("GET SELLER ID")
    seller_id = None
    if seller_url is not None:
        if seller_url in SELLER_INFO.keys():
            seller_id = SELLER_INFO.get(seller_url)
            # print(seller_id)
            return seller_id
        else:
            # If code reaches here then the entry for the seller doesn't already exist.
            # Let's call get_seller_data with seller_url.
            self.get_seller_data(seller_url)
            # Try to fetch seller data again.
            if seller_url in SELLER_INFO.keys():
                seller_id = SELLER_INFO.get(seller_url)
            # If it is not a url, get_seller_data will fail to make an entry. In that case we move on to the next part.
            else:
                # Make a KAZoART-style bundle, and write it to obtain a seller_id.
                # bundle = [seller_url, platform, seller's name, location, website]
                bundle = [seller_url, self.website.platform, seller_url, None, seller_url]
                # Writing to db. This should generate the seller_id we so desperately desire.
                TheAuthour.write_seller(*bundle)
                # time.sleep(1)
                if seller_url in SELLER_INFO.keys():
                    # This will always run, unless the program is failing unexpectedly.
                    seller_id = SELLER_INFO.get(seller_url)
                else:
                    print("FATAL ERROR :: Seller_id not found.")
    else:
        print("FATAL ERROR :: Seller_id not found.")
    # Return seller_id, even if it's None. This will stop get_artwork_data_slave from gathering
    # data beyond the rule 3 check.
    return seller_id
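# get_seller_id above only works because TheAuthour.write_seller is expected to insert the
# seller row and refresh the module-level SELLER_INFO cache, so the immediate re-lookup of
# seller_url succeeds. That helper is not shown in this section; the sketch below is an
# assumed, simplified stand-in for that contract (in-memory only, no real database). Only
# the names SELLER_INFO, TheAuthour and write_seller come from the source.
SELLER_INFO = {}  # seller_url -> seller_id cache consulted by get_seller_id


class TheAuthour:

    _next_seller_id = 1  # stand-in for an auto-increment primary key

    @staticmethod
    def write_seller(seller_url, platform, seller_name, location, website):
        # Insert (or ignore) the seller row, then make the new id visible in the cache
        # so the caller's follow-up `seller_url in SELLER_INFO` check passes.
        if seller_url in SELLER_INFO:
            return SELLER_INFO[seller_url]
        seller_id = TheAuthour._next_seller_id
        TheAuthour._next_seller_id += 1
        SELLER_INFO[seller_url] = seller_id
        return seller_id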
def get_artwork_data_slave(self, url):
    soup = TheMiner.fetch_page(url)
    if soup is not None:
        # Field initiation :: artwork_title, artist, price, seller_id,
        # medium, type, dimensions, frame, authenticity, about :: year, support, signature,
        # artist_id, image_loc
        seller_id = None
        artist = None
        artwork = None
        price = None
        medium = None  # (painting or sculpture)
        technique = ""  # Material and style
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        artist_id = None
        image_loc = None
        year = None
        support = None
        signature = None

        try:
            # PRICE
            A = soup.find('section', id='informations')
            price = str(A.find('p', class_='media-price price').text).strip()
            number = ''
            for p in price:
                if p == '-':
                    break
                if p.isdigit():
                    number += str(p)
                if p == ".":
                    number += str(p)
            price = float(number)
            # print(price)
        except AttributeError:
            pass
        except TypeError:
            pass

        # Rule : 5
        if price is not None:
            # Seller_id
            try:
                seller_url = soup.find('div', id='top-seller').find('a').get('href')
                if 'galeries-d-art' in str(seller_url):
                    seller_url = re.sub('galeries-d-art', 'art-galleries', seller_url)
                # If seller_url is found.
                seller_id = self.get_seller_id(seller_url)
            except AttributeError:
                # seller_id = None
                # There are pages where the seller has no other page. Then we make the url ourselves.
                seller_url = soup.find('div', id='top-seller').find('p', class_='highlight-title').text
                seller_url = str(seller_url).strip()
                if seller_url in SELLER_INFO:
                    seller_id = SELLER_INFO[seller_url]
                else:
                    location = soup.find('div', id='top-seller').find('p', class_='subtitle').text.strip().split(',')
                    location = str(location[-1]).strip()
                    seller_name = seller_url
                    bundle = [seller_url, self.website.platform, seller_name, location, None]
                    # We write the seller info directly and fetch the seller_id.
                    TheAuthour.write_seller(*bundle)
                    seller_id = SELLER_INFO[seller_url]

            # Artist_id
            try:
                artist_url = soup.find('section', id='informations').find('div', class_='relative').a.get('href')
                if "oeuvres-d-art-contemporain" in artist_url:
                    # Keep the English-language slug (the re.sub result was previously discarded).
                    artist_url = re.sub("oeuvres-d-art-contemporain", "contemporary-artworks", artist_url)
                artist_id = self.get_artist_id(artist_url)
            except AttributeError:
                artist_id = None
                # Debug pause: surface pages whose artist block could not be parsed.
                print("\n\n\n\n\n")
                print(url)
                print("\n\n\n\n\n")
                time.sleep(50)

            # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
            la = str(url).split('/')
            if 'painting' in la:
                medium = "Painting"  # (painting or sculpture)
            elif 'sculpture' in la:
                medium = "Sculpture"
            else:
                # So that url leaks don't break the code.
                medium = None

            # If either seller_id or artist_id is missing, skip the rest (Rule : 3).
            # If medium is neither Painting nor Sculpture, we don't fetch data (Rule : 2).
            if seller_id is not None and artist_id is not None and medium is not None:
                # ______________________________MAIN DATA FETCHING________________________
                A = soup.find('section', id='informations')
                B = A.find('div', class_='relative')

                # ARTIST'S NAME
                artist = B.find('span', class_='primary-title').text.strip()
                # print(artist)

                # ARTWORK'S NAME
                C = B.find('span', class_='secondary-title').text.strip()
                artwork_ = C.split(',')
                artwork_title = ""
                for a in range(len(artwork_) - 1):
                    if a == 0:
                        artwork_title = artwork_[a]
                        continue
                    artwork_title = artwork_title + ", " + artwork_[a].strip()
                artwork = artwork_title
                # print(artwork)

                try:
                    # ARTWORK YEAR
                    year = C.split(',')[-1].strip()
                    # print(year)
                except IndexError:
                    pass
                    # year = None

                try:
                    # Image url
                    B = A.find('div', id='img-container')
                    image_loc = B.find('img', id='img_original')['data-src']
                    # print(image_loc)
                except AttributeError:
                    pass

                # Contains :: image, dimensions, medium, type, frame, support, authenticity, signature
                try:
                    D = soup.find('div', id='tabs-description').ul
                    E = D.find_all('li')
                    for e in E:
                        a = e.text
                        # Dimensions
                        if 'Dimensions' in a and 'About the artwork' not in a and 'Support' not in a:
                            dimensions = e.find('p', class_='pull-right').strong.text.strip()
                            dim = True
                            # print(dimensions)
                            continue
                        # Medium (Sculpture/Painting) and Technique
                        if 'Medium' in a and 'About the artwork' not in a:
                            technique = e.find('p', class_='pull-right').text.split(" ")
                            # print(technique)
                            temp = ""
                            for t in technique:
                                if t != "":
                                    temp += t.strip()
                                    temp += " "
                            # medium = medium[0]
                            # technique = medium[1]
                            technique = temp
                            # print(technique)
                            continue
                        # Type
                        if 'Type' in a and 'About the artwork' not in a:
                            type_ = e.find('p', class_='pull-right text-right').text.strip().split(' ')[0]
                            # print(type_)
                            continue
                        # Support (base)
                        if 'Support' in a and 'About the artwork' not in a:
                            try:
                                f = e.find('p', class_='pull-right text-right').text.strip().split(' ')
                                support = f[0] + '. ' + f[1].strip('\n')
                                f = e.find('p', class_='pull-right text-right').strong.text.strip().strip('\n')
                                support += f
                            except IndexError:
                                support = e.find('p', class_='pull-right text-right').text.strip()
                            # print(support)
                            continue
                        # Framing
                        if 'Framing' in a and 'About the artwork' not in a:
                            frame = e.find('p', class_='pull-right').text.strip()
                            # print(frame)
                            continue
                        # Signature
                        if 'Signature' in a and 'About the artwork' not in a:
                            signature = e.find('p', class_='pull-right').text.strip()
                            # print(signature)
                            continue
                        # Authenticity
                        if 'Authenticity' in a and 'About the artwork' not in a:
                            authenticity = e.find('p', class_='pull-right text-right').text.strip()
                            # print(authenticity)
                            continue
                        # Artwork Description
                        if 'About the artwork' in a:
                            about = e.find('p', class_="marg-bot-10")
                            if about is not None:
                                a = e.find('div', class_="description-catalog see-more text-justify").text.strip()
                                about = about.text.strip()
                                about += a
                            else:
                                about = e.find('p', class_='').text.strip()
                            # print(about)
                            continue
                except AttributeError:
                    pass

                # self, artwork_title=None, artist_name=None, year=None, price=None, Dimensions=None, Medium=None,
                # Type=None, Support=None, Frame=None, Signature=None, Authenticity=None, About=None,
                # platform=None, image_addr=None, seller_id=None, artist_id=None)
                artwork_bundle = {
                    "artwork_title": artwork,
                    "artist_name": artist,
                    "year": year,
                    "price": price,
                    "Medium": medium,
                    "Type": type_,
                    "Dimensions": dimensions,
                    "Support": support,
                    "Frame": frame,
                    "Signature": signature,
                    "Authenticity": authenticity,
                    "About": about,
                    "platform": self.website.platform,
                    "image_addr": image_loc,
                    "seller_id": seller_id,
                    "artist_id": artist_id,
                    "url": url,
                    "technique": technique
                }
                # print(artwork_bundle)
                TheAuthour.write_artwork_price_image(**artwork_bundle)
            else:
                print(f"SELLER ID :: {seller_id},\nARTIST ID :: {artist_id}")
        else:
            # If the price is not available, we skip the entire process.
            print(f"PRICE NOT FOUND : {price} at {url}")
    else:
        print(f"\n\n\n\n\nURL DIDN'T RETURN : {url}\n\n\n\n\n")
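# The price parsing inside get_artwork_data_slave keeps digits and dots and stops at the
# first '-', so a listing such as "1 250.50 EUR - free shipping" becomes 1250.5. The helper
# below is not part of the scraper; it is the same loop factored out purely to show that
# behaviour in isolation (the name parse_price is hypothetical).
def parse_price(price_text):
    number = ''
    for ch in str(price_text).strip():
        if ch == '-':  # anything after a dash (e.g. shipping notes) is ignored
            break
        if ch.isdigit() or ch == '.':
            number += ch
    return float(number) if number else None


# Example: parse_price("1 250.50 EUR - free shipping") -> 1250.5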
def get_seller_data(self, url):
    # Caller :: get_artwork_data_slave and get_seller_id
    # We get here only after the seller's info is not found in SELLER_INFO.
    # print("GET SELLER DATA")
    visited.discard(url)
    soup = TheMiner.fetch_page(url)
    if soup is not None:
        # print("GET SELLER DATA: SOUP RETURNED")
        seller_name = None
        try:
            # Seller's Name
            seller_box = soup.find('div', id='jumpto--PartnerHeader')
            seller_name = seller_box.h1.text.strip()
            # print(seller_name)
            # Code will break if seller's name is not found
        except AttributeError:
            pass

        if seller_name is not None:
            # print(f"SELLER NAME : {seller_name}")
            # Location
            try:
                # Location is not available here.
                location = ""
                locatio = seller_box.h1.nextSibling()
                # print(type(locatio))
                try:
                    location = locatio.text
                except AttributeError:
                    for l in locatio:
                        location += l.text
                        location += " "
                # print(location)
            except AttributeError:
                location = None
            except TypeError:
                location = None

            # Website
            try:
                # Initialise to None so a ResultSet is never written to the db when the
                # page has no external link.
                website = None
                for web in soup.find_all('a'):
                    if "http" in str(web.get('href')):
                        website = web.get('href')
                        print(web.get('href'))
                        break
                # print(website)
            except AttributeError:
                website = None
            except IndexError:
                website = None

            bundle = [url, self.website.platform, seller_name, location, website]
            # print(bundle)
            TheAuthour.write_seller(*bundle)
def get_artwork_data_slave(self, url, driver):
    driver.get(url)
    # Parse the rendered page; the second argument must be a parser name, not the url.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    if soup is not None:
        # Field initiation ::
        artwork = None
        price = None
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        artist_id = None
        image_loc = None
        year = None
        support = None
        signature = None
        technique = ""  # Material to be added to technique
        seller_id = None
        artist = None
        medium = None

        # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
        # if "/painting/" in str(url):
        #     medium = "Painting"  # (painting or sculpture)
        # elif "/sculpture/" in str(url):
        #     medium = "Sculpture"
        # else:
        #     # So that url leaks don't break the code.
        #     medium = None

        # Seller_id
        try:
            seller_url = soup.find('div', class_='WncCi').find('a')['href']
            seller_id = self.get_seller_id(seller_url)
        except (AttributeError, TypeError):
            # Note: "except AttributeError or TypeError" only caught AttributeError; a tuple catches both.
            # Seller doesn't have a page.
            try:
                seller_url = soup.find('div', class_='WncCi').text.strip()
                if seller_url in SELLER_INFO.keys():
                    seller_id = SELLER_INFO.get(seller_url)
                else:
                    # Make a KAZoART-style bundle, and write it to obtain a seller_id.
                    # [seller_url, platform_id(from name), seller's name, location, website]
                    bundle = [seller_url, self.website.platform, 'EMERGINGARTISTPLATFORM', None, None]
                    # Writing to db. This should generate the seller_id we so desperately desire.
                    TheAuthour.write_seller(*bundle)
                    # time.sleep(1)
                    seller_id = SELLER_INFO.get(seller_url)
            except AttributeError:
                pass
                # We'll let the seller name be seller_url if the url is not found.

        # Artist_id
        try:
            artist_url = soup.find('div', class_='WncCi').a.get('href')
            if str(artist_url).endswith(".com"):
                artist_url = re.sub('.com', "", artist_url)
                artist_url = re.sub('emergingartistplatform', 'emergingartistplatform.com', artist_url)
            artist_id = self.get_artist_id(artist_url)
        except AttributeError:
            try:
                artist_url = soup.find('div', class_='WncCi').text.strip()
                country = None
                a = soup.find_all('pre')
                for b in a:
                    if b.get('data-hook') == 'description':
                        p = b.find_all('p')
                        for j in p:
                            if 'Country' in j.text or 'country' in j.text or 'COUNTRY' in j.text:
                                title = j.text.split(":")
                                country = title[-1].strip()
                artist_data_pack = [artist_url, None, country, None]
                # artist_data_pack = [name, born, country, about]
                # Updating KEY_INFO dictionary.
                KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
                key = KEY_INFO.get(artist_url)
                # Updating the dB with artist listings.
                TheAuthour.write_artist(*artist_data_pack)
                artist_id = ARTIST_INFO[key]
            except AttributeError:
                artist_id = None

        # Continue fetching data only if seller_id and artist_id are found (RULE :: 3, 4);
        # medium is derived further down from the start_url.
        if seller_id is not None and artist_id is not None:
            # Price
            try:
                a = soup.find_all('span')
                t = ""
                for b in a:
                    if b.get('data-hook') == "formatted-primary-price":
                        # print(b.text)
                        for p in b.text:
                            if str(p).isnumeric() or str(p) == ".":
                                t += p
                # 'rate' is assumed to be a module-level currency conversion factor.
                price = float(t) * rate
                # print(price)
            except AttributeError:
                price = None
            except ValueError:
                price = None

            # RULE : 5
            if price is not None:
                # Find artist, artwork, year, type_ (N/A), dimensions, support, frame, signature,
                # authenticity, about, image_loc (actual url of the image), and technique.
                # We want the code to break if either the artist's name or the artwork's name is not found.

                # Artist
                artist = soup.find('div', class_='WncCi').text.strip()
                # print(artist)

                # Artwork
                a = soup.find_all('pre')
                for b in a:
                    if b.get('data-hook') == 'description':
                        p = b.find_all('p')
                        for j in p:
                            if 'Title' in j.text or 'title' in j.text or 'TITLE' in j.text:
                                title = j.text.split(":")
                                artwork = title[-1].strip()
                                if len(artwork) >= 255:
                                    artwork = artwork[0:255]
                                # print(artwork)
                            if 'Date' in j.text:
                                date = j.text.split(":")
                                year = date[-1].strip()
                                # print(year)
                            if 'Size' in j.text:
                                dimensions = j.text.split(":")
                                dimensions = dimensions[-1].strip()
                                # print(dimensions)
                            if 'Medium' in j.text:
                                technique = j.text.split(":")
                                technique = technique[-1].strip()
                                # print(technique)
                            if len(j.text.split(":")) == 1 and about is None:
                                # Keep the whole untagged paragraph (previously only the last character was kept).
                                about = j.text.strip()

                # Medium (RULE : 3)
                if "Sculptures" in self.website.start_url:
                    medium = "Sculpture"
                else:
                    medium = "Painting"

                # image_loc
                image = soup.find('div', class_='main-media-image-wrapper-hook')
                image = image.find('div', id='get-image-item-id')
                image_loc = image.get('href')
                # print(image_loc)

                artwork_bundle = {
                    "artwork_title": artwork,
                    "artist_name": artist,
                    "year": year,
                    "price": price,
                    "Medium": medium,
                    "Type": type_,
                    "Dimensions": dimensions,
                    "Support": support,
                    "Frame": frame,
                    "Signature": signature,
                    "Authenticity": authenticity,
                    "About": about,
                    "platform": self.website.platform,
                    "image_addr": image_loc,
                    "seller_id": seller_id,
                    "artist_id": artist_id,
                    "url": url,
                    "technique": technique
                }
                TheAuthour.write_artwork_price_image(**artwork_bundle)
            else:
                print(f"Skipping {url}\n PRICE : {price}")
        else:
            print(f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}")
    else:
        print(f"Soup not returned for {url}")
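# The Selenium-based get_artwork_data_slave above expects the caller to supply a ready
# WebDriver. The snippet below is one assumed way to create such a driver (headless Chrome
# via selenium) and pass it in; `scraper` and `artwork_url` are hypothetical stand-ins for
# the scraper instance and a listing URL, neither of which is shown in this section.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')  # run without opening a browser window
driver = webdriver.Chrome(options=options)
try:
    scraper.get_artwork_data_slave(artwork_url, driver)
finally:
    driver.quit()  # always release the browser session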