def imdb_direct_info(imdb_id):
    """Fetch the IMDb title page for *imdb_id* and print what is scraped from it.

    The page at ``http://www.imdb.com/title/<imdb_id>/`` is downloaded,
    converted with ``html2xml.translate`` and queried through ``xmlquery``.
    The title, year, genres, rating, vote count, short plot and the people
    listed in the ``div.txt-block`` sections of ``#overview-top`` are printed
    to stdout, one value per line.

    Args:
        imdb_id: Title identifier, concatenated into the URL as-is.

    Returns:
        None in every case; results are only printed.  The function returns
        early (after printing the error) on ``urllib2.HTTPError`` or when the
        page is not valid UTF-8, and silently when ``#overview-top`` is not
        found.  Other download errors are not caught here.
    """
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement used elsewhere in this file.
    print(imdb_id)

    url = 'http://www.imdb.com/title/' + imdb_id + '/'
    try:
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
    except urllib2.HTTPError as e:
        print(e)
        return

    # Rewrite hexadecimal character references (&#xNN;) as %NN escapes so
    # that the unquote() call below decodes them.
    html = re.sub(r'&#x(\w+);', r'%\1', html)
    try:
        html = unicode(html, 'utf-8')
    except UnicodeDecodeError as e:
        # Report the offending title id (the old code referenced an
        # undefined name here and crashed with NameError instead).
        print("UnicodeDecodeError (%s) in %s" % (e, imdb_id))
        return
    html = urllib2.unquote(html)

    xml = html2xml.translate(html)
    page = xmlquery.parse_xml(xml)
    info = page.queryone('#overview-top')
    if not info:
        return

    name = str(info.queryone('h1.header').children[0])
    print(name)

    # The year link is optional; when it is missing the falsy query result
    # itself is printed.
    year = info.queryone('h1.header>span>a')
    if year:
        year = int(year.text)
    print(year)

    genres = [link.text for link in info.query('.infobar>a[href^="/genre/"]')]
    print(genres)

    # NOTE(review): the rating and votes lookups below use the queryone()
    # result without checking it -- presumably this fails if the element is
    # absent; confirm against xmlquery and real pages.
    rating = str(info.queryone('span[class="rating-rating"]').children[0])
    # A rating shown as '-' is reported as 0.
    rating = 0 if rating == '-' else float(rating)
    print(rating)

    # First whitespace-separated token of the ratings link text, with the
    # thousands separators stripped.
    votes_text = info.queryone('a[href="ratings"]').text.split()[0]
    votes = int(re.sub(',', '', votes_text))
    print(votes)

    # The second <p> (if any) carries the plot; keep only its first line.
    paragraphs = info.query('p')
    if len(paragraphs) > 1:
        short_plot = paragraphs[1].text.splitlines()[0]
    else:
        short_plot = None
    print(short_plot)

    # Map each section heading to a list of (name, href) pairs.
    people = {}
    for p_type in info.query('div.txt-block'):
        # Drop the last character of the heading text (presumably a
        # trailing ':' -- confirm against the page markup).
        p_type_text = p_type.queryone('h4.inline').text[:-1]
        people[p_type_text] = [
            (p.text, p.attributes['href'])
            for p in p_type.query('a[href^="/name/"]')
        ]
    print(people)
    return
import xmlquery

# Scratch check of xmlquery: parse a tiny document, run the "*" query on it
# and print the name of every node that comes back.
xml = xmlquery.parse_xml("<a><b /></a>")
c = xml.query("*")

# Disabled experiment, kept for reference:
#c[1].name = "banana"

for n in c:
    # Single-argument print(...) gives the same output as `print n.name`.
    print(n.name)