def module(module_code):
    """
    Retrieves and parses module information from UL site

    @param module_code: Module code to get details for
    @type module_code: String
    @return An OrderedDict containing the module code and name,
            or -1 if match not found
    """
    url = 'http://193.1.101.55/tt_moduledetails_res.asp'
    params = {'T1': module_code}

    rows = common.get_page(url, params).xpath('//table//table/tr')

    # no matches
    if not rows:
        return -1

    data = OrderedDict([
        ('kind', 'module'),
        ('code', module_code),
        ('name', common.tidy_tag(rows[1].xpath('td[2]')[0])),
    ])

    return data
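# Hedged usage sketch for module() above: assumes the surrounding module's imports
# (`common`, collections.OrderedDict) and a reachable UL endpoint; 'EE4617' is an
# illustrative module code only, not guaranteed to exist.
if __name__ == '__main__':
    info = module('EE4617')
    if info == -1:
        print('No matching module found')
    else:
        print('%s: %s' % (info['code'], info['name']))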
def get_sub_cats(self, url):
    page = common.get_page(url)
    common.sleep_random_between(1, 2)
    cat = {}
    for child in page.find_all("div", class_="desktop__view-child"):
        link = child.a["href"]
        name = child.a.text
        child_id = re.findall(self.patt["child_cat_id"], link)
        if len(child_id) == 0:
            logging.info("could not find category child id, passing...")
            continue
        child_id = child_id[0]
        items = {}
        for item in child.find_all("li", class_="category-list__item"):
            item_name = item.a.text
            item_link = item.a["href"]
            item_id = re.findall(self.patt["child_cat_id"], item_link)
            if len(item_id) == 0:
                logging.info("could not find category item id, passing...")
                continue
            item_id = item_id[0]
            items[item_id] = {"name": item_name, "link": item_link}
        cat[child_id] = {"name": name, "link": link, "items": items}
    return cat
def on_get_users(update: Update, context: CallbackContext):
    r"""
    Get users:
     - /get_users
     - get[ _]users
    """
    message = update.effective_message

    query = update.callback_query
    if query:
        query.answer()

    page = get_page(context)
    total_users = db.User.select().count()
    items_per_page = 1

    user = db.User.get_by_page(page=page, items_per_page=items_per_page)[0]
    description = get_user_message_repr(user)
    text = f'Пользователь №{page}:\n{description}'

    reply_text_or_edit_with_keyboard_paginator(
        message, query, text,
        page_count=total_users,
        items_per_page=items_per_page,
        current_page=page,
        data_pattern=fill_string_pattern(PATTERN_GET_USER_BY_PAGE, '{page}'),
    )
def building(building_name):
    """
    Retrieves and parses building information from UL site

    @param building_name: Building name to get details for
    @type building_name: String
    @return An OrderedDict containing the building name, a thumbnail and
            web address of building information page, or -1 if match not found
    """
    url = 'https://www2.ul.ie/web/WWW/Services/Buildings_and_Estates/At_A_Glance/'

    row = common.get_page(url).xpath('//div[@class=\'rc-doc\']/table/tbody[1]/tr'
                                     '[contains(.//strong, \'{0}\')]'
                                     .format(building_name.title()))

    # Handle building does not exist
    if not row:
        return -1

    building_data = row[0].xpath('./td[1]/strong/text()')[0]
    building_image = 'https://www2.ul.ie' + row[0].xpath('./td[2]/a/img/@src')[0]
    building_link = 'https://www2.ul.ie' + row[0].xpath('./td[2]/a/@href')[0]

    data = OrderedDict([
        ('kind', 'building'),
        ('name', building_data),
        ('thumb', building_image),
        ('url', building_link),
    ])

    return data
def course(course_code):
    """
    Retrieves and parses course information from UL site

    @param course_code: Course code to get details for
    @type course_code: String
    @return An OrderedDict containing the original course code, title of
            course and web address of course page, or -1 if match not found
    """
    url = 'http://www3.ul.ie/courses/AlphabeticalList.shtml'

    row = common.get_page(url).xpath('//p//a[contains(., \'{0}\')]'.format(course_code))

    # Handle course does not exist (either now or ever)
    if not row:
        return -1

    text_value = row[0].xpath('./text()')[0]
    link_value = row[0].xpath('./@href')[0]

    # Parse course code and name from combined string using Regex
    course_re = re.match(common.COURSE_NAME_RE, text_value)
    course_data = course_re.group('code', 'name')
    course_url = 'http://www3.ul.ie/courses/' + link_value

    data = OrderedDict([
        ('kind', 'course'),
        ('code', course_data[0]),
        ('name', course_data[1]),
        ('url', course_url),
    ])

    return data
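# Hedged sketch of common.COURSE_NAME_RE, which course() above depends on but which is
# not shown in this excerpt. The only things taken from the code above are the named
# groups 'code' and 'name' (used via course_re.group('code', 'name')); the listing
# format "LM051 - Arts" and the course-code shape are assumptions.
COURSE_NAME_RE_SKETCH = re.compile(r'(?P<code>[A-Z]{2}\d{3})\s*-\s*(?P<name>.+)')

# e.g. COURSE_NAME_RE_SKETCH.match('LM051 - Arts').group('code', 'name')
#      -> ('LM051', 'Arts')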
def scrape(url):
    blocks = get_blocks(get_page(url))
    n = len(blocks)
    (xs, ys) = [normalize(range(n)), map(cost, blocks)]
    (l_bound, r_bound) = classify(zip(xs, ys))
    return '<p>' + '</p>\n<p>'.join(blocks[l_bound:r_bound]) + '</p>'
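# Hedged sketch of the `normalize` helper assumed by scrape() above. It is not shown in
# this excerpt; the call normalize(range(n)) suggests it rescales the block indices into
# [0, 1] before they are paired with per-block costs for classify(). This is an
# assumption, not the original implementation.
def normalize_sketch(values):
    values = list(values)
    lo, hi = min(values), max(values)
    span = float(hi - lo) or 1.0  # guard against a single block
    return [(v - lo) / span for v in values]

# normalize_sketch(range(5)) -> [0.0, 0.25, 0.5, 0.75, 1.0]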
def handler():
    logging.info("Opening URL: %s", url)
    f = StringIO.StringIO(common.get_page(url))
    jobreader = csv.DictReader(f, delimiter=',', quotechar='"')
    for i, row in enumerate(jobreader):
        r = fn_inner(i, row)
        if r is not None:
            yield r
def rottentomatoes_find_id_by_imdb(imdb_id):
    url = u"http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb_id
    logging.info("Searching RT with IMDB ID: '%s'" % url)
    try:
        request_url, page = common.get_page(url)
    except Exception, e:
        logging.error("Got exception while opening page: %s" % e)
        return None
def rottentomatoes_parse_page(rt_id):
    metadata = {}
    try:
        url = u'http://www.rottentomatoes.com/m/%s/' % rt_id
        _, page = common.get_page(url)
    except Exception, e:
        logging.error("Got exception while opening page: %s" % e)
        return None
def lookup_nyt_review(content):
    name = content.simple_name().encode('utf-8')
    title, year = common.detect_title_year(name)

    url = 'http://movies.nytimes.com/gst/movies/msearch.html?%s'
    data = {'query': title}
    url = url % urllib.urlencode(data)

    _, page = common.get_page(url)
    if not page:
        logging.error("Couldn't get NYT search page for '%s'" % content)
        return None

    doc = B(page)
    entertainment_results = doc.findChild(
        'div', attrs={'id': 'entertainment_results'})
    results_container = entertainment_results.findChild(
        'ol') if entertainment_results else None
    results = results_container.findChildren(
        'li', recursive=False) if results_container else []

    for result in results:
        title_header = result.findChild('h3')
        title_link = title_header.findChild('a') if title_header else None
        nyt_title = title_link.string if title_link else None
        if not nyt_title:
            logging.warning("Couldn't find title node for '%s'" % title)
            continue

        # This sucks.
        nyt_title = nyt_title.replace(u'\xa0', ' ')
        nyt_title = nyt_title.encode('utf-8')

        nyt_title, nyt_year = common.detect_title_year(nyt_title)

        if not common.title_match(title, nyt_title):
            try:
                logging.warning(
                    "Skipping NYT title '%s' because it didn't match '%s'" %
                    (nyt_title, title))
            except Exception, e:
                import pdb
                pdb.set_trace()
                print e
            continue

        extra_links = result.findChild('ul')
        if extra_links:
            for link in extra_links.findChildren('a'):
                if link.string == "N.Y.Times Review":
                    return 'http://movies.nytimes.com%s' % link.get('href')
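# Hedged sketch of common.title_match(), used by lookup_nyt_review() above and by the
# IMDb/RT/Metacritic lookups further down. The real helper is not shown; this sketch
# only assumes a case- and punctuation-insensitive comparison, which fits how it is
# called but may not be the actual logic.
def title_match_sketch(a, b):
    canon = lambda s: re.sub(r'[^a-z0-9]+', ' ', s.lower()).strip()
    return canon(a) == canon(b)

# title_match_sketch('Battle Royale', 'battle royale!') -> True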
def calendar(year):
    """
    Retrieves and parses academic calendar from UL calendar site

    @param year: Start year of calendar to retrieve ([year] - [year + 1])
    @type year: String
    @return An OrderedDict containing events for calendar, plus corresponding
            dates, or -1 if match not found
    """
    # Retrieve page and create parser object for table
    year_end = str(int(year) + 1)[2:]
    url = ('http://www2.ul.ie/web/WWW/Services/Academic_Calendar/{0}_-_{1}_'
           'Academic_Calendar').format(year, year_end)

    rows = common.get_page(url).xpath('//div[@class=\'rc-doc\']/table/tbody[1]')

    search_terms = [
        'Autumn Teaching Term',
        'Spring Teaching Term',
        'Autumn Examinations',
        'Examinations Spring'
    ]

    result_names = [
        'autumn',
        'spring',
        'autumn_exam',
        'spring_exam',
    ]

    results = []

    for idx, search_term in enumerate(search_terms):
        data = rows[0].xpath('./tr[./td/div/strong= \'{0}\']'.format(search_term))
        data = data[0].xpath('./td')

        start_date = common.tidy_tag(data[2])
        end_date = common.tidy_tag(data[3])

        result = OrderedDict([
            ('start', start_date),
            ('end', end_date),
        ])

        results.append((result_names[idx], result))

    results = OrderedDict([('kind', 'calendar'), ('items', OrderedDict(results))])

    return results
def main():
    orgs_data = {}
    projects_data = {}
    for year in range(2005, 2009):
        url = developer + '/open-source/gsoc/{yr}/'.format(yr=year)
        loop = asyncio.get_event_loop()
        soup = loop.run_until_complete(get_page(url))
        orgs, projects = get_info(soup)
        orgs_data[year] = orgs
        projects_data[year] = projects
    dumper(orgs_data, "2005-2008.json")
    dumper(projects_data, "2005-2008.json")
def on_get_group_chats_short(update: Update, context: CallbackContext):
    r"""
    Get group chats (short):
     - /get_group_chats_short
     - get group chats short
    """
    message = update.effective_message

    query = update.callback_query
    if query:
        query.answer()

    page = get_page(context)

    # Filter to group chats only
    filters = [db.Chat.type != 'private']

    total_group_chats = db.Chat.select().where(*filters).count()
    items_per_page = ITEMS_PER_PAGE
    start = ((page - 1) * items_per_page) + 1

    chats = db.Chat.get_by_page(
        page=page,
        items_per_page=items_per_page,
        filters=filters,
    )

    items = []
    for i, chat in enumerate(chats, start):
        short_title = chat.get_short_title_for_group()
        short_title = f'{i}. {short_title}'
        items.append(short_title)

    text = f'Чаты ({total_group_chats}):\n' + '\n'.join(items)

    reply_text_or_edit_with_keyboard_paginator(
        message, query, text,
        page_count=total_group_chats,
        items_per_page=items_per_page,
        current_page=page,
        data_pattern=fill_string_pattern(PATTERN_GET_GROUP_CHATS_SHORT_BY_PAGE, '{page}'),
    )
def semester_timetable(student_id):
    """
    Retrieves and parses semester timetable from UL timetable site

    @param student_id: Student ID to get timetable for
    @type student_id: String
    @return An OrderedDict of OrderedDicts containing start and end times,
            module code, class type and room for events, or -1 if match not found
    """
    url = 'http://www.timetable.ul.ie/tt2.asp'
    params = {'T1': student_id}

    rows = common.get_page(url, params).xpath('//div/table/tr[2]/td')

    results = []

    for idx, day in enumerate(rows):
        periods = []
        for idx2, period in enumerate(day.xpath('./p')):
            # Convert mostly unstructured text from within 'p' tag into a list of words.
            # Each word will correspond to a line on the actual timetable page.
            # Example output:
            # [u'15:00', u'-', u'16:00', u'EE4617', u'- LEC -', u'LCO017', u'Wks:1-8,10-14']
            # [u'17:00', u'-', u'18:00', u'CE4218', u'- LAB -', u'2A', u'B2042', u'Wks:1-8,10-14']
            data = filter(None, [x.strip() for x in common.tidy_tag(period).split('\n')])

            # Handle empty data cells
            if not data:
                continue

            periods.append(_parse_timetable_entry(data))

        results.append((idx, periods))

    results = OrderedDict([('kind', 'timetable#day'), ('items', OrderedDict(results))])

    return OrderedDict(results)
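# Hedged sketch: `_parse_timetable_entry` is assumed by semester_timetable() above but
# not shown in this excerpt. Given the token layout in the example output above, one
# plausible version could look like the following (it reuses the OrderedDict import from
# the code above); the field names and the handling of the optional group token
# (e.g. u'2A') are assumptions.
def _parse_timetable_entry_sketch(data):
    return OrderedDict([
        ('start', data[0]),                       # e.g. u'15:00'
        ('end', data[2]),                         # e.g. u'16:00' (data[1] is the '-' separator)
        ('module', data[3]),                      # e.g. u'EE4617'
        ('type', data[4].strip(' -')),            # e.g. u'LEC' from u'- LEC -'
        ('room', data[-2]),                       # room always precedes the 'Wks:' token
        ('weeks', data[-1].replace('Wks:', '')),  # e.g. u'1-8,10-14'
    ])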
def scrape_sell(base_url, action, mode, locs, fp):
    logging.info("Scraping sale mode")
    batch = 32
    data = []
    for k in locs:
        logging.info(f'location: {locs[k]}')
        url = base_url + "/" + action + "/" + mode + locs[k]
        #pages = [p for p in common.get_n_pages(url, 2)]
        for page in common.get_next_page(url):
            posts = sale.get_postings(page)
            if posts is None:
                logging.warning("Posts is None, avoiding")
                continue
            for post in posts:
                p_link = sale.get_post_link(post)
                if p_link is None:
                    logging.warning("Post link is None, avoiding")
                    continue
                if 'proyecto' in p_link:
                    continue
                common.sleep_random_between(2, 4)
                p_link = base_url + p_link
                post_page = common.get_page(p_link)
                try:
                    row = extract_sale_info(post_page)
                    row["url"] = p_link
                    data.append(row)
                    if len(data) % batch == 0:
                        pkl.dump(data, fp)
                        del data[:]
                except Exception as e:
                    logging.error("While extracting sale info", exc_info=True)
    # Flush any remaining rows that did not fill a full batch
    if len(data) > 0:
        pkl.dump(data, fp)
        del data[:]
def extract_posts_info(self, post):
    """ Extract info from post item """
    item_link = post.a["href"]
    page = common.get_page(item_link)
    common.sleep_random_between(1, 2)

    # Get script
    script = page.find_all("script")
    if len(script) == 0:
        logging.info("Post does not have 'script'...")
        return None

    # Get post features
    feat = {
        "item_name": post.text,
        "item_id": self.get_item_id(script),
        "item_price": self.get_item_price(script),
        "local_item_price": self.get_local_item_price(script),
        "available_stock": self.get_available_stock(script),
        "sold_stock": self.get_sold_stock(script),
        "brand": self.get_brand_item(script),
        "model": self.get_model_item(script),
        "item_condition": self.get_condition_item(script),
        "root_category": self.get_root_category(script),
        "path_to_root": self.get_path_to_root(script),
        "seller_id": self.get_seller_id(script),
        "location": self.get_location(page),
        "seller_type": self.get_seller_type(script),
        "reputation_level": self.get_reputation_level(script),
        "seller_status": self.get_seller_status(script),
        "customer_satisfaction": self.get_customer_satisfaction(script),
        "seller_age": self.get_seller_age(script),
        "sales_completed": self.get_sales_completed(script),
        "link": item_link
    }
    return feat
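# Hedged sketch of one of the `self.get_*` helpers used above (here, an item-price
# lookup). The real helpers are not shown in this excerpt; the sketch assumes the price
# is embedded in one of the page's <script> blocks as a JSON-style `"price": <number>`
# field, which is an assumption about the page layout, and it reuses the module's
# existing `re` import.
def get_item_price_sketch(scripts):
    for script in scripts:
        text = script.string or ""
        match = re.search(r'"price"\s*:\s*([0-9]+(?:\.[0-9]+)?)', text)
        if match:
            return float(match.group(1))
    return None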
def on_get_errors_short(update: Update, context: CallbackContext):
    r"""
    Get errors (short):
     - /get_errors_short
     - get[ _]errors[ _]short
    """
    message = update.effective_message

    query = update.callback_query
    if query:
        query.answer()

    page = get_page(context)
    total = db.Error.select().count()
    items_per_page = ERRORS_PER_PAGE
    start = ((page - 1) * items_per_page) + 1

    errors = db.Error.get_by_page(page=page, items_per_page=items_per_page)

    items = []
    for i, error in enumerate(errors, start):
        short_title = error.get_short_title()
        short_title = f'{i}. {short_title}'
        items.append(short_title)

    text = 'Ошибки:\n' + '\n'.join(items)

    reply_text_or_edit_with_keyboard_paginator(
        message, query, text,
        page_count=total,
        items_per_page=items_per_page,
        current_page=page,
        data_pattern=fill_string_pattern(PATTERN_GET_ERRORS_SHORT_BY_PAGE, '{page}'),
    )
def on_get_users_short(update: Update, context: CallbackContext):
    r"""
    Get users (short):
     - /get_users_short
     - get[ _]users[ _]short
    """
    message = update.effective_message

    query = update.callback_query
    if query:
        query.answer()

    page = get_page(context)
    total_users = db.User.select().count()
    items_per_page = ITEMS_PER_PAGE
    start = ((page - 1) * items_per_page) + 1

    users = db.User.get_by_page(page=page, items_per_page=items_per_page)

    items = []
    for i, user in enumerate(users, start):
        short_title = user.get_short_title()
        short_title = f'{i}. {short_title}'
        items.append(short_title)

    text = f'Пользователи ({total_users}):\n' + '\n'.join(items)

    reply_text_or_edit_with_keyboard_paginator(
        message, query, text,
        page_count=total_users,
        items_per_page=items_per_page,
        current_page=page,
        data_pattern=fill_string_pattern(PATTERN_GET_USERS_SHORT_BY_PAGE, '{page}'),
    )
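# Worked check of the pagination arithmetic shared by the *_short handlers above:
# page N starts numbering its items at (N - 1) * items_per_page + 1. The helper name
# and the example value 15 are illustrative only.
def _first_item_number(page, items_per_page):
    return ((page - 1) * items_per_page) + 1

assert _first_item_number(1, 15) == 1
assert _first_item_number(3, 15) == 31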
def get_categories(self, save_path="", cache=True):
    """
    Find categories and return a dictionary with each category information
    """
    if (cache is True) and (self.categories is not None):
        logging.info("cache categories dictionary...")
        return

    url = self.url["category"]
    page = common.get_page(url)
    common.sleep_random_between(1, 2)

    cat_container = page.find_all("div", class_="categories__container")
    cat = {}
    if len(cat_container) == 0:
        logging.info(
            "category container is empty, returning empty dictionary...")
        return cat

    for c in cat_container:
        name = c.h2.text
        link = c.h2.a["href"]
        cat_id = re.findall(self.patt["cat_id"], link)
        if len(cat_id) == 0:
            logging.info("could not find category id, passing...")
            continue
        cat_id = cat_id[0]
        sub = self.get_sub_cats(link)
        cat[cat_id] = {"name": name, "link": link, "sub": sub}

    if len(cat) != 0 and save_path != "":
        common.save_pickle(save_path, cat)

    self.categories = cat
def index():
    response = get_page()
    if not response:
        return wrap_response("fail")

    soup = BeautifulSoup(response.text, 'lxml')
    area_stat = soup.find(id='getAreaStat')
    total_stat = soup.find(id='getStatisticsService')

    area_data = area_stat.text.split('getAreaStat =')[-1].split('}catch')[0]
    area_result = json.loads(area_data)

    overview_data = total_stat.text.split('getStatisticsService =')[-1].split(
        '}catch')[0]
    overview_result = json.loads(overview_data)

    confirmed_cnt = overview_result.get('confirmedCount')
    suspected_cnt = overview_result.get('suspectedCount')
    cured_cnt = overview_result.get('curedCount')
    dead_cnt = overview_result.get('deadCount')

    tz = pytz.timezone('Asia/Shanghai')
    tm = datetime.now(tz=tz).strftime("%Y-%m-%d %H:%M:%S")

    total_view = TotalView(tm, confirmed_cnt, suspected_cnt, dead_cnt, cured_cnt)
    db.session.add(total_view)
    db.session.commit()

    for item in area_result:
        name = item.get('provinceShortName')
        confirmed = item.get('confirmedCount')
        cured = item.get('curedCount')
        dead = item.get('deadCount')
        prov = ProvView(tm, name, confirmed, cured, dead)
        db.session.add(prov)
    db.session.commit()

    return wrap_response("success")
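# Worked example of the string slicing index() uses above to pull the JSON payload out
# of the inlined <script> tags (mock script text; the real page embeds much larger JSON).
mock = 'try { window.getAreaStat = [{"provinceShortName": "Hubei"}]}catch(e){}'
payload = mock.split('getAreaStat =')[-1].split('}catch')[0]
# json.loads(payload) -> [{'provinceShortName': 'Hubei'}]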
years = "%d,%d" % (year - 1, year + 1)
url = u'http://www.imdb.com/List'
url = u'http://www.imdb.com/search/title?'
data = {'title': title, 'release_date': years}
try:
    url = url + urllib.urlencode(data)
except Exception, e:
    logging.error("Could not URL encode %s" % str(data))
    return None

data = None
_, page = common.get_page(url, data)
if page is None:
    logging.info("Couldn't get IMDb search page for '%s'" % name)
    return None

# Cleanup dumbass IMDB stuff
page = page.replace('rate""', 'rate"').replace('"src', '" src')

document = B(page)
results = document.findAll('tr', attrs={'class': re.compile('detailed')})
for result_node in results:
    extras = {}
    stardict_parse)


def etymonline_parse(raw):
    extract = lambda m: map(lambda i: strip_html(m.group(i)), [1, 2])
    first_word = lambda s: re.match(r' *(\w*)', s.lower()).group(1)
    make_block = lambda h, e: (first_word(h), 'Etymonline, ' + h, [e])
    return map(
        lambda m: make_block(*extract(m)),
        re.finditer(r'<dt[^>]*>(.*?)</dt>[^<]*<dd[^>]*>((.|\n)*?)</dd>', raw))


use_etymonline = make_translator(
    lambda w: get_page(
        'http://www.etymonline.com/index.php?allowed_in_frame=0&searchmode=nl&search='
        + get_stem(w)),
    etymonline_parse)


class UrbanDictionaryHTMLParser(HTMLParser):
    def mk_matchers(conds):
        def match_start(self, name, attrs):
            assert (name in conds)
            if name not in self._markers:
                if conds[name](attrs):
                    self._markers[name] = 0 + self._depth
                    return True
            else:
                assert (self._markers[name] - self._depth < 0)
                return True
def lookup_metacritic_metadata(content):
    metadata = {}
    name = content.simple_name()
    title, year = common.detect_title_year(name)

    url_kind_map = {
        models.KIND_MOVIE: 'http://www.metacritic.com/search/movie/%s/results',
        models.KIND_SERIES: 'http://www.metacritic.com/search/tv/%s/results',
        models.KIND_TV: 'http://www.metacritic.com/search/tv/%s/results',
        models.KIND_SEASON: 'http://www.metacritic.com/search/tv/%s/results'
    }
    url = url_kind_map[content.kind]

    # Remove special characters that the regular metacritic search seems to
    # remove anyway.
    title_utf8 = title.encode('utf-8')
    title_stripped = re.sub('[!@#$%^&*();.,?]', '', title_utf8).strip()
    #title.replace('-','').replace(':','').replace('(','').replace(')','')
    title_stripped = re.sub('[:\-\s]', '+', title_stripped)
    #title_stripped = title_stripped.replace(' ', '+')

    # Fake encode the title, strip out the a=
    #title_stripped = re.sub('^a=', '', urllib.urlencode({'a': title_stripped}))

    url = url % title_stripped
    logging.info("Trying to search: %s" % url)

    _, page = common.get_page(url)
    if not page:
        logging.error("Couldn't get metacritic page for '%s'" % content)
        return None

    doc = B(page)

    # Get results
    results = doc.findAll('li', attrs={'class': re.compile('result')})
    for result in results:
        title_node = result.findChild('h3', attrs={'class': re.compile('product_title')})
        title_link = title_node.findChild('a') if title_node else None
        mc_title = title_link.string if title_link else None
        if not title_link or not mc_title:
            logging.warning("Couldn't find MC title link for result.")
            continue

        mc_title = mc_title.strip()
        if not common.title_match(title, mc_title):
            try:
                logging.warning(u"Skipping MC title '%s' because it didn't "
                                "match '%s'" % (mc_title, title))
            except Exception, e:
                traceback.print_exc(e)
            continue

        logging.info("Found a matching title, '%s' for '%s'" % (mc_title, title))

        mc_url = title_link.get('href')
        id_match = re.match('/(?P<type>movie|tv)/(?P<mc_id>.*)', mc_url)
        if not id_match:
            logging.warning("Couldn't find MC id from link '%s'." % mc_url)
            continue

        metadata['mc_uri'] = mc_url
        metadata['mc_id'] = id_match.groupdict()['mc_id']

        metascore_node = result.findChild('span', attrs={'class': re.compile('metascore')})
        metascore = metascore_node.string if metascore_node else None
        if metascore:
            metascore_class = metascore_node.get('class')
            score = 'unknown'
            if 'score_outstanding' in metascore_class:
                score = 'outstanding'
            elif 'score_favorable' in metascore_class:
                score = 'favorable'
            elif 'score_mixed' in metascore_class:
                score = 'mixed'
            elif 'score_unfavorable' in metascore_class:
                score = 'unfavorable'
            elif 'score_terrible' in metascore_class:
                score = 'terrible'
            elif 'score_tbd' in metascore_class:
                score = 'tbd'

            metadata['mc_status'] = score

            try:
                metadata['mc_score'] = int(metascore)
            except:
                logging.error("Couldn't convert metascore '%s' to integer." % metascore)

    return metadata
def imdb_parse_page_metadata(imdb_id):
    url = u'http://www.imdb.com/title/tt%s/combined' % imdb_id
    try:
        logging.info("Looking up '%s'" % url)
        _, page = common.get_page(url)

        # BeautifulSoup can't handle hex entities. Massage them into decimal.
        hexentityMassage = copy.copy(B.MARKUP_MASSAGE)
        hexentityMassage = [(re.compile('&#x([^;]+);'),
                             lambda m: '&#%d;' % int(m.group(1), 16))]
        document = B(page, convertEntities=B.ALL_ENTITIES,
                     markupMassage=hexentityMassage)

        metadata = {}

        # Grab the poster
        poster_node = document.findChild('img', attrs={'id': 'primary-poster'})
        poster_url = poster_node.get('src') if poster_node else None
        if poster_url:
            logging.info("Found IMDb Poster URL: '%s'" % poster_url)
            # IMDb Poster URLs work like this:
            # http://ia.media-imdb.com/images/M/MV5BOTI5ODc3NzExNV5BMl5BanBnXkFtZTcwNzYxNzQzMw@@._V1._SX214_CR0,0,214,314_.jpg
            # Everything after the @@ is a format command.
            #   ._V1              not sure
            #   ._SX214           format width 214 pixels
            #   ._SY214           format height 214 pixels
            #   _CR0,0,214,214_   not sure
            # So to collect our images at X by Y, just replace 'SX\d+' by the
            # desired width. The combined details page defaults to a small
            # thumbnail.

            # Eliminate height restrictions.
            poster_url = re.sub('_SY\d+', '', poster_url)
            desired_width = settings.IMDB_THUMBNAIL_WIDTH
            # Replace the width restriction with our desired width.
            poster_url = re.sub('\._SX\d+', "._SX%d" % desired_width, poster_url)
            metadata['imdb_cover_uri'] = poster_url
            metadata['imdb_cover_width'] = desired_width

        info_nodes = document.findAll('div', attrs={'class': re.compile('^info( stars)?$')})

        # NOTE: reads node_content from the enclosing scope rather than its argument.
        def take_first_string(contents):
            for item in node_content.contents:
                if isinstance(item, basestring):
                    return unicode(item.strip())
            return None

        for node in info_nodes:
            node_title = node.findChild('h5')
            node_title = node_title.string if node_title else None
            node_content = node.findChild(
                'div', attrs={'class': re.compile('^(info-content|starbar-meta)$')})
            if not node_title or not node_content:
                continue
            if node_title == 'User Rating:':
                rating_node = node_content.findChild('b')
                rating_match = re.match("(?P<rating>[0-9.]+)/10", rating_node.string.strip()) \
                    if rating_node and rating_node.string else None
                if rating_match:
                    try:
                        metadata['imdb_rating'] = float(rating_match.groupdict()['rating'])
                    except Exception, e:
                        logging.error("Couldn't parse rating: '%s'" %
                                      rating_match.groupdict()['rating'])
            elif node_title == 'Director:':
                metadata['imdb_directors'] = [unicode(subnode.string)
                                              for subnode in node_content.findAll('a')
                                              if subnode.string]
            elif node_title == 'Writers:':
                metadata['imdb_writers'] = [unicode(subnode.string)
                                            for subnode in node_content.findAll('a')
                                            if subnode.string]
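# Worked example of the poster-URL rewriting above, applied to the sample URL from the
# comments (illustrative only; 512 stands in for settings.IMDB_THUMBNAIL_WIDTH, and the
# format commands are whatever IMDb happened to use at the time).
sample = ('http://ia.media-imdb.com/images/M/MV5BOTI5ODc3NzExNV5BMl5BanBnXkFtZTcwNzYxNzQzMw'
          '@@._V1._SX214_CR0,0,214,314_.jpg')
sample = re.sub(r'_SY\d+', '', sample)            # drop any height restriction
sample = re.sub(r'\._SX\d+', '._SX512', sample)   # ask for a 512px-wide image instead
# sample -> '...@@._V1._SX512_CR0,0,214,314_.jpg'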
def imdb_find_id(title, year=None):
    title = title.decode('utf8')
    url = u'http://www.imdb.com/find?%s'
    data = {'s': 'tt', 'q': title.encode('latin1')}
    try:
        url = url % urllib.urlencode(data)
        logging.info("Executing IMDB regular search for '%s' at '%s'" % (title, url))
        result_url, page = common.get_page(url)

        result_url = result_url.replace('http://www.imdb.com', '')
        result_url_match = imdb_title_pattern.match(result_url)
        if result_url_match:
            # IMDb saw fit to redirect us to the thing we searched for. Let's
            # trust them?
            logging.info("IMDb redirected us to '%s', trusting them." % result_url)
            return result_url_match.groupdict()['imdb_id']

        # BeautifulSoup can't handle hex entities. Massage them into decimal.
        hexentityMassage = copy.copy(B.MARKUP_MASSAGE)
        hexentityMassage = [(re.compile('&#x([^;]+);'),
                             lambda m: '&#%d;' % int(m.group(1), 16))]
        #page = imdb_cleanup_markup(page)
        document = B(page, convertEntities=B.HTML_ENTITIES,
                     markupMassage=hexentityMassage)

        links = document.findAll('a', attrs={'href': re.compile('^/title/tt\d{7}/$')})
        for link in links:
            link_title = link.string
            if not link_title:
                continue
            if not common.title_match(title, link_title):
                logging.info("Skipping IMDB link title '%s' because it didn't match '%s'" %
                             (link_title, title))
                continue

            link_year = link.nextSibling
            if not isinstance(link_year, basestring):
                continue
            link_year = link_year.strip()
            link_year_match = re.match('\((?P<year>\d{4}).*?\)', link_year)
            link_year = link_year_match.groupdict()['year'] if link_year_match else None
            if not link_year:
                continue
            if year and link_year != year:
                logging.info("Link '%s's year '%s' doesn't match '%s'." %
                             (link_title, link_year, year))
                continue

            imdb_url = link.get('href')
            imdb_match = re.match('^/title/tt(?P<imdb_id>\d{7})/', imdb_url)

            logging.info("Found match for '%s (%s)': '%s (%s)'" %
                         (title, year, link_title, link_year))

            # We know this because the nodes were selected with this regex.
            assert imdb_match
            return imdb_match.groupdict()['imdb_id']

        logging.error("Found no matches for '%s'" % title)
    except Exception, e:
        logging.error("Couldn't get IMDB regular search for '%s'" % title)
        traceback.print_exc(e)
def rottentomatoes_find_id(title, year=None, imdb_id=None):
    # Find the content by search.
    url = u"http://www.rottentomatoes.com/search/movie.php?%s"
    title_latin1 = title.encode('latin1')
    data = {'searchby': 'movies', 'search': title_latin1}
    try:
        url = url % urllib.urlencode(data)
        logging.info("Executing RT regular search for '%s' at '%s'" % (title, url))
        result_url, page = common.get_page(url)

        # BeautifulSoup can't handle hex entities. Massage them into decimal.
        hexentityMassage = copy.copy(B.MARKUP_MASSAGE)
        hexentityMassage = [(re.compile('&#x([^;]+);'),
                             lambda m: '&#%d;' % int(m.group(1), 16))]
        #page = imdb_cleanup_markup(page)
        document = B(page, convertEntities=B.HTML_ENTITIES,
                     markupMassage=hexentityMassage)

        results_ul = document.findChild('ul', attrs={'id': re.compile('movie_results_ul')})
        results = (results_ul.findAll('li', attrs={'class': re.compile('media_block')})
                   if results_ul else None)
        if results is None:
            logging.error("Couldn't lookup RT ID for '%s (%s)'" % (title, year))
            return None

        for result_node in results:
            # Scope in on the content div, because otherwise we get the poster
            # image.
            content_div = result_node.findChild(
                'div', attrs={'class': re.compile('media_block_content')})
            link = content_div.findChild('a', attrs={'href': rottentomatoes_id_pattern})
            link_title = link.string if link else None
            if not link_title:
                logging.error("Couldn't find RT result link title. Skipping")
                continue

            titles = []
            # Try the original title
            titles.append(link_title)

            # Rotten Tomatoes annoyingly embeds the AKAs in the title in parens
            # following the head title. For example:
            #   - Batoru rowaiaru II: Chinkonka (Battle Royale II)
            #   - Battle Royale (Batoru Rowaiaru)
            endparen_match = re.search("\(([^\(\)]+)\)$", link_title)
            while endparen_match:
                titles.append(endparen_match.groups()[0])
                # Strip out the ending (title) and any spaces before it.
                link_title = re.sub("\s*\(([^\(\)]+)\)$", '', link_title)
                endparen_match = re.search("\(([^\(\)]+)\)$", link_title)
                # Add the final version of the title with the AKAs removed to
                # the title list.
                if not endparen_match:
                    titles.append(link_title)

            found_title = None
            for aka in titles:
                if not common.title_match(title, aka):
                    try:
                        logging.warning(u"Skipping RT title '%s' because it didn't match '%s'" %
                                        (aka, title))
                    except Exception, e:
                        traceback.print_exc(e)
                    continue
                else:
                    logging.info("Found RT title match '%s' for '%s'" % (aka, title))
                    found_title = aka
                    break

            if not found_title:
                continue

            span_year = result_node.findChild('span', attrs={'class': re.compile('movie_year')})
            link_year = unicode(span_year.string) if span_year and span_year.string else None
            link_year = link_year.strip(' ()')
            if year and link_year != year:
                logging.info("Link '%s's year '%s' doesn't match '%s'." %
                             (link_title, link_year, year))
                continue

            # Get RT ID
            link_href = link.get('href')
            link_match = rottentomatoes_id_pattern.match(link_href)
            assert link_match  # guaranteed
            return link_match.groupdict()['id']
    except Exception, e:
        traceback.print_exc(e)
        logging.error("Couldn't lookup RT ID for '%s (%s)'" % (title, year))
        pass
def handler():
    input_xml = ET.fromstring(common.get_page(url))
    return fn_inner(input_xml)