def resolve_ean(ean):
    """Look up a product by EAN and scrape its metadata from the result page.

    Requests ``SEARCH_URL`` formatted with ``ean=ean`` and parses the returned
    HTML with lxml.

    Args:
        ean: Value substituted for the ``ean`` field of ``SEARCH_URL``.

    Returns:
        ``None`` if the page contains the "no hits" message. Otherwise a
        mapping with the keys ``type`` (``"book"``, ``"audiobook"`` or
        ``"movie"``), ``author``, ``artists``, ``title``, ``imgurl``,
        ``description``, ``duration``, ``studio``, ``genre`` and ``created``.

    Raises:
        AttributeError: If one of the mandatory elements (category, title,
            cover image, attribute table) is missing from the page.
    """
    import locale

    page = requests.get(SEARCH_URL.format(ean=ean))
    # Check if something was found
    if "Ihre Suche ergab leider keine Treffer" in page.text:
        return None
    html = lxml.html.document_fromstring(page.text)
    # No default factory is passed, so this behaves like a plain dict.
    result = defaultdict()

    def resolve_author():
        # NOTE(review): defNone presumably returns None when its first
        # argument is None and applies the callable otherwise - confirm.
        return defNone(
            html.find('.//span[@class="oAuthorLinked"]'),
            lambda x: x.text_content(),
        )

    # Check media type
    result["type"] = html.find('.//span[@class="noCategory"]').text_content().strip()
    if result["type"].startswith("Buch"):
        result["type"] = "book"
        result["author"] = resolve_author()
        result["artists"] = None
    elif result["type"] == "Hörbuch":
        result["type"] = "audiobook"
        result["author"] = resolve_author()
        result["artists"] = None
    else:
        # Everything that is neither a book nor an audiobook is a movie.
        result["type"] = "movie"
        result["artists"] = [
            elm.text
            for elm in html.findall('.//span[@class="oAuthorLinked"]/a')
        ]
        result["author"] = None

    # Extract simple attributes from the head of the page
    result["title"] = html.find('.//span[@class="oProductTitle"]').text.strip()
    result["imgurl"] = html.find('.//img[@id="elevateZoom"]').attrib["src"]
    result["description"] = defNone(
        html.find('.//dd[@class="cTypeBeschreibung"]'),
        lambda x: x.text_content().strip(),
    )

    # Extract attributes of the dd/dt table next to the article picture.
    # Each <dt> names the attribute whose value is in the following <dd>.
    attr_container = html.find('.//dl[@class="dlCols30_70"]')
    attr_list = dict()
    cur_name = None
    for elm in attr_container:
        if elm.tag == "dt":
            cur_name = elm.text.strip()
        elif elm.tag == "dd" and cur_name is not None:
            # A <dd> that appears before any <dt> has no name and is skipped.
            attr_list[cur_name] = elm.text_content().strip()

    result["duration"] = defNone(
        attr_list.get("Spieldauer"),
        lambda x: int(x.replace("Minuten", "")),
    )
    result["studio"] = attr_list.get("Studio")
    result["genre"] = attr_list.get("Genre")

    # The release date is parsed under the German LC_TIME locale. The previous
    # locale is restored even when parsing fails, so a bad date cannot leave
    # the process-wide locale switched.
    oldlocale = locale.getlocale(locale.LC_TIME)
    locale.setlocale(locale.LC_TIME, "de_DE.utf8")
    try:
        result["created"] = defNone(attr_list.get("Erscheinungsdatum"), interpDate)
    finally:
        locale.setlocale(locale.LC_TIME, oldlocale)
    return result
def resolve_ean(ean):
    """Look up a product by EAN and scrape its metadata from the product page.

    Requests ``SEARCH_URL`` formatted with ``ean``, follows the
    ``productConversion`` link found on that page and parses the linked page.

    Args:
        ean: Value substituted into ``SEARCH_URL``.

    Returns:
        ``None`` if the search page contains no ``productConversion`` link.
        Otherwise a dict with the keys ``title``, ``type``, ``imgurl`` and
        ``created``, plus ``author``, ``artists``, ``description`` and
        ``duration``, which are always ``None`` here.

    Raises:
        AttributeError, IndexError, KeyError: If the product page lacks the
            title, category or cover element, or the category text is not a
            key of ``TYPE_TRANSLATE``.
    """
    page = requests.get(SEARCH_URL.format(ean))
    html = lxml.html.document_fromstring(page.text)

    # Jump further: the search page only links to the actual product page.
    product_link = html.find('.//a[@class="productConversion"]')
    if product_link is None:
        # Without a product link there is nothing to resolve.
        return None
    further_url = "http://www.rebuy.de/" + product_link.attrib["href"]
    page = requests.get(further_url)
    html = lxml.html.document_fromstring(page.text)

    result = dict()
    result["title"] = html.find('.//h1/span[@class="loud"]').text_content()
    category = html.xpath('.//p[contains(@class, "category-icon")]')[0]
    result["type"] = TYPE_TRANSLATE[category.text_content()]
    result["imgurl"] = html.find(".//img[@id='cover']").attrib["src"]

    # The fact list holds "name: value" entries; split each at the first colon.
    attribs = dict()
    for item in html.findall(".//ul[@id='main-info-facts']/li"):
        name, _sep, val = item.text_content().strip().partition(":")
        attribs[name] = val

    # NOTE(review): defNone presumably passes None through and applies the
    # callable otherwise - confirm against its definition.
    result["created"] = defNone(
        attribs.get("Erscheinungsdatum"),
        lambda x: toDBDate(x.strip(), "%d.%m.%Y"),
    )
    # These fields are not extracted from this site.
    result["author"] = None
    result["artists"] = None
    result["description"] = None
    result["duration"] = None
    return result
def resolve_ean(ean):
    """Look up a product by EAN via a POST search and scrape its metadata.

    Posts ``ean`` as the ``form[q]`` field to ``SEARCH_URL`` and parses the
    returned HTML with lxml.

    Args:
        ean: Search term sent as ``form[q]``.

    Returns:
        ``None`` if the page reports that no article was found, or if the
        product has no description element. Otherwise a dict with the keys
        ``type`` (``"audiobook"``, ``"book"`` or ``"movie"``), ``author``,
        ``artists``, ``title``, ``description``, ``duration``, ``created``,
        ``studio`` and ``imgurl``.

    Raises:
        AttributeError: If a mandatory element (variant, headline, attribute
            list, cover image, or the author link for books/audiobooks) is
            missing from the page.
    """
    page = requests.post(SEARCH_URL, data={"form[q]": ean})
    # Check if something was found
    if "keine Artikel gefunden" in page.text:
        return None
    html = lxml.html.document_fromstring(page.text)

    result = dict()
    result["type"] = html.find('.//li[@class="variant"]').text_content().strip()
    if result["type"] == "Audio CD":
        result["type"] = "audiobook"
        result["author"] = html.find('.//a[@class="author"]').text_content().strip()
        result["artists"] = None
    elif result["type"] == "Gebundenes Buch":
        result["type"] = "book"
        result["author"] = html.find('.//a[@class="author"]').text_content().strip()
        result["artists"] = None
    else:
        # Any other variant is treated as a movie.
        result["artists"] = result["author"] = None
        result["type"] = "movie"
    result["title"] = html.find('.//h1[@class="headline"]').text

    # Collect the "name: value" entries of the attribute list.
    attr_field = html.find('.//ul[@class="plain"]')
    attrs = dict()
    for li in attr_field.findall(".//li"):
        data = li.text_content()
        if data:
            title, _sep, val = data.partition(":")
            attrs[title] = val.strip()

    # Extract description
    description_element = html.find(
        './/div[@class="product-description"]/div[2]/div[1]'
    )
    if description_element is None:
        # Ignore this hit if there is no description
        return None
    # Convert <br> elements to newlines so line breaks survive text_content().
    for br in description_element.xpath(".//br"):
        br.tail = ("\n" + br.tail) if br.tail else "\n"
    description = description_element.text_content()
    # Strip the trailing bonus-material section. partition() leaves the text
    # untouched when the marker is absent, whereas slicing with find() == -1
    # would cut off the last character.
    result["description"] = description.partition("Bonusmaterial")[0]

    # The running time is optional; leave it as None when it is not on the page.
    duration_match = re.search(r"Gesamtlaufzeit: (\d+) Min.", page.text)
    if duration_match is not None:
        result["duration"] = int(duration_match.group(1))
    else:
        result["duration"] = None

    # NOTE(review): defNone presumably passes None through and applies the
    # callable otherwise - confirm against its definition.
    result["created"] = defNone(attrs.get("Erscheinungstermin"), interpDate)
    result["studio"] = attrs.get("Hersteller")
    result["imgurl"] = html.find('.//img[@class="cover"]').attrib["src"]
    return result
def resolve_ean(ean):
    """Look up a product by EAN and scrape title, genre and release info.

    Requests ``SEARCH_URL`` formatted with ``ean`` and parses the returned
    HTML with lxml.

    Args:
        ean: Value substituted into ``SEARCH_URL``.

    Returns:
        ``None`` if the page has no ``itemprop='name'`` title element.
        Otherwise a dict with the keys ``title``, ``genre`` and
        ``firstrelease``; ``imdb_link`` is added only when a review link
        containing "imdb" is present.

    Raises:
        AttributeError: If the genre or description element is missing from
            the page.
    """
    page = requests.get(SEARCH_URL.format(ean))
    html = lxml.html.document_fromstring(page.text)
    result = dict()

    title_elm = html.find(".//span[@itemprop='name']")
    # When the title is not found on the page, the product seems to be in the
    # unsorted section of geizhals...
    if title_elm is None:
        return None
    result["title"] = title_elm.text_content()
    result["genre"] = html.find(".//li[@class='ghnavhi']").text_content()

    description = html.find(".//div[@id='gh_proddesc']").text_content()
    # NOTE(review): defNone presumably passes None through and applies the
    # callable otherwise - confirm against its definition.
    result["firstrelease"] = defNone(
        re.search(r"Ersterscheinung: (\d+)", description),
        lambda x: x.group(1),
    )

    # Use the first review link that points to IMDb; anchors without an href
    # are skipped instead of raising KeyError.
    for link in html.findall(".//a[@class='revlink']"):
        href = link.attrib.get("href", "")
        if "imdb" in href:
            result["imdb_link"] = href
            break
    return result