Python get_page示例，common.get_page Python示例

示例#1

0

显示文件

文件： course.py 项目： stephenfin/ul-rest-api

def module(module_code):
  """
  Retrieve and parse module information from UL site

  @param module_code: Module code to get details for
  @type module_code: String

  @return An OrderedDict containing the module code and name, or -1 if match
  not found
  """
  url = 'http://193.1.101.55/tt_moduledetails_res.asp'

  params = {
    'T1' : module_code
  }

  rows = common.get_page(url, params).xpath('//table//table/tr')

  # The module name is read from the second cell of the second row, so a
  # result with fewer rows than that is reported as "no match" instead of
  # raising IndexError.
  if len(rows) < 2:
    return -1

  name_cells = rows[1].xpath('td[2]')
  if not name_cells:
    return -1

  data = OrderedDict([
    ('kind', 'module'),
    ('code', module_code),
    ('name', common.tidy_tag(name_cells[0])),
  ])

  return data

示例#2

0

显示文件

文件： spider.py 项目： benitesf/ecommerce-scraper

    def get_sub_cats(self, url):
        """
        Collect the child categories listed on the page at *url*.

        Returns a dict keyed by child category id. Each value holds the
        child's "name", its "link" and an "items" dict keyed by item id with
        each item's "name" and "link". Children and items whose id cannot be
        extracted with the ``child_cat_id`` pattern are skipped.
        """
        soup = common.get_page(url)
        common.sleep_random_between(1, 2)
        sub_cats = {}

        for block in soup.find_all("div", class_="desktop__view-child"):
            block_link = block.a["href"]
            block_name = block.a.text
            block_ids = re.findall(self.patt["child_cat_id"], block_link)
            if not block_ids:
                logging.info("could not find category child id, passing...")
                continue

            entries = {}
            for entry in block.find_all("li", class_="category-list__item"):
                entry_name = entry.a.text
                entry_link = entry.a["href"]
                entry_ids = re.findall(self.patt["child_cat_id"], entry_link)
                if not entry_ids:
                    logging.info("could not find category item id, passing...")
                    continue
                # Only the first id found in the link is used as the key.
                entries[entry_ids[0]] = {"name": entry_name, "link": entry_link}

            sub_cats[block_ids[0]] = {
                "name": block_name,
                "link": block_link,
                "items": entries,
            }

        return sub_cats

示例#3

0

显示文件

文件： commands.py 项目： gil9red/telegram__random_bashim_bot

def on_get_users(update: Update, context: CallbackContext):
    r"""
    Get users:
     - /get_users
     - get[ _]users
    """

    message = update.effective_message

    # Acknowledge the callback query when the handler was triggered by one.
    query = update.callback_query
    if query:
        query.answer()

    page = get_page(context)

    # One user is shown per page, so the user count doubles as the page count.
    items_per_page = 1
    total_users = db.User.select().count()

    users_on_page = db.User.get_by_page(page=page, items_per_page=items_per_page)
    description = get_user_message_repr(users_on_page[0])
    text = f'Пользователь №{page}:\n{description}'

    pattern = fill_string_pattern(PATTERN_GET_USER_BY_PAGE, '{page}')
    reply_text_or_edit_with_keyboard_paginator(
        message,
        query,
        text,
        page_count=total_users,
        items_per_page=items_per_page,
        current_page=page,
        data_pattern=pattern,
    )

示例#4

0

显示文件

文件： geolocation.py 项目： stephenfin/ul-rest-api

def building(building_name):
  """
  Look up building information on the UL site

  @param building_name: Building name to get details for; it is title-cased
  before being matched against the page
  @type building_name: String

  @return An OrderedDict containing the building name, a thumbnail and web
  address of building information page, or -1 if match not found
  """
  site = 'https://www2.ul.ie'
  url = 'https://www2.ul.ie/web/WWW/Services/Buildings_and_Estates/At_A_Glance/'

  page = common.get_page(url)

  # Select the table rows whose <strong> text contains the title-cased name
  query = ('//div[@class=\'rc-doc\']/table/tbody[1]/tr'
    '    [contains(.//strong, \'{0}\')]').format(building_name.title())
  matches = page.xpath(query)

  # Handle building does not exist
  if not matches:
    return -1

  # Only the first matching row is used
  first = matches[0]
  name = first.xpath('./td[1]/strong/text()')[0]
  thumb = site + first.xpath('./td[2]/a/img/@src')[0]
  link = site + first.xpath('./td[2]/a/@href')[0]

  return OrderedDict([
    ('kind', 'building'),
    ('name', name),
    ('thumb', thumb),
    ('url', link),
  ])

示例#5

0

显示文件

文件： course.py 项目： stephenfin/ul-rest-api

def course(course_code):
  """
  Retrieve and parse course information from UL site

  @param course_code: Course code to get details for
  @type course_code: String

  @return An OrderedDict containing the original course code, title of course
  and web address of course page, or -1 if match not found
  """
  url = 'http://www3.ul.ie/courses/AlphabeticalList.shtml'

  row = common.get_page(url).xpath('//p//a[contains(., \'{0}\')]'.format(course_code))

  # Handle course does not exist (either now or ever)
  if not row:
    return -1

  text_nodes = row[0].xpath('./text()')
  link_nodes = row[0].xpath('./@href')

  # A matching anchor without its own text or href cannot be parsed
  if not text_nodes or not link_nodes:
    return -1

  # Parse course code and name from combined string using Regex
  course_re = re.match(common.COURSE_NAME_RE, text_nodes[0])

  # re.match returns None when the link text is not in the expected format;
  # report that as "not found" instead of failing on None.group()
  if course_re is None:
    return -1

  code, name = course_re.group('code', 'name')

  course_url = 'http://www3.ul.ie/courses/' + link_nodes[0]

  data = OrderedDict([
    ('kind', 'course'),
    ('code', code),
    ('name', name),
    ('url', course_url),
  ])

  return data

示例#6

0

显示文件

文件： scrape.py 项目： zohl/ScRead

def scrape(url):
    """Fetch *url* and return a contiguous run of its blocks as HTML paragraphs.

    Every block is paired with its normalized position and its ``cost``;
    ``classify`` turns those pairs into the left/right bounds of the slice
    of blocks that is kept.
    """
    blocks = get_blocks(get_page(url))

    positions = normalize(range(len(blocks)))
    costs = map(cost, blocks)
    l_bound, r_bound = classify(zip(positions, costs))

    kept = blocks[l_bound:r_bound]
    return '<p>' + '</p>\n<p>'.join(kept) + '</p>'

示例#7

0

显示文件

文件： csvinput.py 项目： hhagblom/lambda-decorators

 def handler():
     """Yield the non-None results of ``fn_inner`` for each CSV row at ``url``.

     ``url`` and ``fn_inner`` are free variables of this function; the page
     text is parsed as comma-separated data with a header row.
     """
     logging.info("Opening URL: %s", url)
     csv_text = StringIO.StringIO(common.get_page(url))
     records = csv.DictReader(csv_text, delimiter=',', quotechar="\"")
     for index, record in enumerate(records):
         result = fn_inner(index, record)
         if result is None:
             continue
         yield result

示例#8

0

显示文件

def rottentomatoes_find_id_by_imdb(imdb_id):
    """Query the Rotten Tomatoes IMDb-id alias URL for *imdb_id*.

    Returns None when the page cannot be fetched.
    """
    url = u"http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb_id
    logging.info("Searching RT with IMDB ID: '%s'" % url)

    try:
        request_url, page = common.get_page(url)
    # "except ... as" is accepted by Python 2.6+ and Python 3 alike.
    except Exception as e:
        logging.error("Got exception while opening page: %s" % e)
        return None

示例#9

0

显示文件

def rottentomatoes_parse_page(rt_id):
    """Fetch the Rotten Tomatoes movie page for *rt_id*.

    Returns None when the page cannot be fetched.
    """
    metadata = {}

    try:
        url = u'http://www.rottentomatoes.com/m/%s/' % rt_id
        _, page = common.get_page(url)
    # "except ... as" is accepted by Python 2.6+ and Python 3 alike.
    except Exception as e:
        logging.error("Got exception while opening page: %s" % e)
        return None

示例#10

0

显示文件

文件： nyt.py 项目： kazimuth/media-enclave

def lookup_nyt_review(content):
    """Search the NYT movie pages for a review of *content*.

    Returns the URL behind the "N.Y.Times Review" link of the first search
    result whose title matches and that carries such a link, or None when
    the search page cannot be fetched or no result qualifies.
    """
    name = content.simple_name().encode('utf-8')
    title, year = common.detect_title_year(name)

    url = 'http://movies.nytimes.com/gst/movies/msearch.html?%s'
    data = {'query': title}

    url = url % urllib.urlencode(data)
    _, page = common.get_page(url)

    if not page:
        logging.error("Couldn't get NYT search page for '%s'" % content)
        return None

    doc = B(page)

    # Each lookup tolerates the previous node being absent, ending with an
    # empty result list when the expected markup is missing.
    entertainment_results = doc.findChild(
        'div', attrs={'id': 'entertainment_results'})
    results_container = entertainment_results.findChild(
        'ol') if entertainment_results else None
    results = results_container.findChildren(
        'li', recursive=False) if results_container else []

    for result in results:
        title_header = result.findChild('h3')
        title_link = title_header.findChild('a') if title_header else None
        nyt_title = title_link.string if title_link else None

        if not nyt_title:
            logging.warning("Couldn't find title node for '%s'" % title)
            continue

        # Replace non-breaking spaces, then encode the title to UTF-8 bytes
        # before it is passed to detect_title_year.
        nyt_title = nyt_title.replace(u'\xa0', ' ')
        nyt_title = nyt_title.encode('utf-8')

        nyt_title, nyt_year = common.detect_title_year(nyt_title)

        if not common.title_match(title, nyt_title):
            try:
                logging.warning(
                    "Skipping NYT title '%s' because it didn't match '%s'" %
                    (nyt_title, title))
            except Exception as e:
                # Building the message can fail (presumably on an encoding
                # mismatch between the two titles). Report it and move on;
                # never drop into an interactive debugger here.
                logging.error("Couldn't log skipped NYT title: %r" % e)
            continue

        extra_links = result.findChild('ul')
        if extra_links:
            for link in extra_links.findChildren('a'):
                if link.string == "N.Y.Times Review":
                    return 'http://movies.nytimes.com%s' % link.get('href')

    return None

示例#11

0

显示文件

文件： scheduling.py 项目： stephenfin/ul-rest-api

def calendar(year):
  """
  Retrieve and parse academic calendar from UL calendar site

  @param year: Start year of calendar to retrieve ([year] - [year + 1])
  @type year: String

  @return An OrderedDict containing events for calendar, plus corresponding
  dates, or -1 if match not found
  """
  # The page name combines the start year with the following year minus its
  # first two characters (e.g. 2013 -> '14')
  year_end = str(int(year) + 1)[2:]

  url = ('http://www2.ul.ie/web/WWW/Services/Academic_Calendar/{0}_-_{1}_'
    'Academic_Calendar').format(year, year_end)

  tables = common.get_page(url).xpath('//div[@class=\'rc-doc\']/table/tbody[1]')

  # No calendar table on the page
  if not tables:
    return -1

  # (row label searched for on the page, key used in the result)
  events = [
    ('Autumn Teaching Term', 'autumn'),
    ('Spring Teaching Term', 'spring'),
    ('Autumn Examinations', 'autumn_exam'),
    ('Examinations Spring', 'spring_exam'),
  ]

  results = []

  for search_term, result_name in events:
    matches = tables[0].xpath('./tr[./td/div/strong= \'{0}\']'.format(search_term))

    # Skip events whose row is missing rather than failing on matches[0]
    if not matches:
      continue

    cells = matches[0].xpath('./td')

    # Start and end dates are read from the third and fourth cells
    if len(cells) < 4:
      continue

    result = OrderedDict([
      ('start', common.tidy_tag(cells[2])),
      ('end', common.tidy_tag(cells[3])),
    ])
    results.append((result_name, result))

  # None of the expected events were found
  if not results:
    return -1

  return OrderedDict([('kind', 'calendar'),
    ('items', OrderedDict(results))])

示例#12

0

显示文件

def main():
    """Scrape the GSoC pages for 2005-2008 and dump the collected data.

    For every year the page is fetched with ``get_page``, split into
    organisations and projects by ``get_info``, and stored per year.
    """
    orgs_data = {}
    projects_data = {}

    # The event loop is the same for every year, so look it up once.
    loop = asyncio.get_event_loop()

    for year in range(2005, 2009):
        url = developer + '/open-source/gsoc/{yr}/'.format(yr=year)

        soup = loop.run_until_complete(get_page(url))
        orgs, projects = get_info(soup)

        orgs_data[year] = orgs
        projects_data[year] = projects

    # NOTE(review): both dumps are given the same file name. If dumper()
    # writes straight to that name, the second call overwrites the first.
    # Confirm the intended targets.
    dumper(orgs_data, "2005-2008.json")
    dumper(projects_data, "2005-2008.json")

示例#13

0

显示文件

文件： commands.py 项目： gil9red/telegram__random_bashim_bot

def on_get_group_chats_short(update: Update, context: CallbackContext):
    r"""
    Get group chats (short):
     - /get_group_chats_short
     - get group chats short
    """

    message = update.effective_message

    # Acknowledge the callback query when the handler was triggered by one.
    query = update.callback_query
    if query:
        query.answer()

    page = get_page(context)

    # Restrict both the count and the page query to group chats only
    filters = [db.Chat.type != 'private']

    total_group_chats = db.Chat.select().where(*filters).count()
    items_per_page = ITEMS_PER_PAGE

    # Number of the first entry on this page, so numbering runs across pages
    first_number = ((page - 1) * items_per_page) + 1

    chats = db.Chat.get_by_page(
        page=page,
        items_per_page=items_per_page,
        filters=filters,
    )

    lines = [
        f'{i}. {chat.get_short_title_for_group()}'
        for i, chat in enumerate(chats, first_number)
    ]
    text = f'Чаты ({total_group_chats}):\n' + '\n'.join(lines)

    pattern = fill_string_pattern(PATTERN_GET_GROUP_CHATS_SHORT_BY_PAGE,
                                  '{page}')
    reply_text_or_edit_with_keyboard_paginator(
        message,
        query,
        text,
        page_count=total_group_chats,
        items_per_page=items_per_page,
        current_page=page,
        data_pattern=pattern,
    )

示例#14

0

显示文件

文件： scheduling.py 项目： stephenfin/ul-rest-api

def semester_timetable(student_id):
  """
  Retrieve and parse semester timetable from UL timetable site

  @param student_id: Student ID to get timetable for
  @type student_id: String

  @return An OrderedDict of OrderedDicts containing start and end times,
  module code, class type and room for events, or -1 if match not found
  """
  url = 'http://www.timetable.ul.ie/tt2.asp'

  params = {
    'T1' : student_id
  }

  rows = common.get_page(url, params).xpath('//div/table/tr[2]/td')

  # No day cells on the page: report "not found" as documented
  if not rows:
    return -1

  results = []

  for idx, day in enumerate(rows):
    periods = []
    for period in day.xpath('./p'):
      # Convert mostly unstructured text from within 'p' tag into a list of words.
      # Each word will correspond to a line on the actual timetable page.
      # Example output:
      #   [u'15:00', u'-', u'16:00', u'EE4617', u'- LEC -', u'LCO017', u'Wks:1-8,10-14']
      #   [u'17:00', u'-', u'18:00', u'CE4218', u'- LAB -', u'2A', u'B2042', u'Wks:1-8,10-14']
      lines = [x.strip() for x in common.tidy_tag(period).split('\n')]

      # Build a real list: filter() is a lazy, always-truthy object on
      # Python 3, which would make the empty check below dead code
      data = [x for x in lines if x]

      # Handle empty data cells
      if not data:
        continue

      periods.append(_parse_timetable_entry(data))
    results.append((idx, periods))

  return OrderedDict([('kind', 'timetable#day'),
    ('items', OrderedDict(results))])

示例#15

0

显示文件

def scrape_sell(base_url, action, mode, locs, fp):
    """
    Scrape sale postings for every location in *locs* and pickle them to *fp*.

    For each location the listing pages are walked with
    ``common.get_next_page``; every posting link (except those containing
    'proyecto') is fetched and passed to ``extract_sale_info``. Extracted rows
    are written to *fp* in batches, and whatever is left is written at the
    end. Failures while extracting or dumping a row are logged and skipped.
    """
    logging.info("Scraping sale mode")
    batch = 32
    data = []

    for k in locs:
        logging.info(f'location: {locs[k]}')
        url = "/".join((base_url, action, mode + locs[k]))

        for listing in common.get_next_page(url):
            posts = sale.get_postings(listing)
            if posts is None:
                logging.warning("Posts is None, avoiding")
                continue

            for post in posts:
                rel_link = sale.get_post_link(post)
                if rel_link is None:
                    logging.warning("Post link is None, avoiding")
                    continue
                if 'proyecto' in rel_link:
                    continue

                # Pause before every post request
                common.sleep_random_between(2, 4)
                full_link = base_url + rel_link
                post_page = common.get_page(full_link)
                try:
                    row = extract_sale_info(post_page)
                    row["url"] = full_link
                    data.append(row)

                    # Flush to the pickle file each time a batch fills up
                    if len(data) % batch == 0:
                        pkl.dump(data, fp)
                        del data[:]
                except Exception:
                    logging.error("While extracting sale info", exc_info=True)

    # Write the final, partially filled batch
    if data:
        pkl.dump(data, fp)
        del data[:]

示例#16

0

显示文件

文件： spider.py 项目： benitesf/ecommerce-scraper

    def extract_posts_info(self, post):
        """
        Build the feature dictionary for one post item.

        The item page linked from the post is fetched, and its <script>
        tags are handed to the ``get_*`` helpers that fill most fields.
        Returns None when the page has no <script> tags.
        """
        item_link = post.a["href"]
        soup = common.get_page(item_link)
        common.sleep_random_between(1, 2)

        scripts = soup.find_all("script")
        if not scripts:
            logging.info("Post has not have 'script'...")
            return None

        # Every field except the name, the location and the link is read
        # from the script tags.
        return {
            "item_name": post.text,
            "item_id": self.get_item_id(scripts),
            "item_price": self.get_item_price(scripts),
            "local_item_price": self.get_local_item_price(scripts),
            "available_stock": self.get_available_stock(scripts),
            "sold_stock": self.get_sold_stock(scripts),
            "brand": self.get_brand_item(scripts),
            "model": self.get_model_item(scripts),
            "item_condition": self.get_condition_item(scripts),
            "root_category": self.get_root_category(scripts),
            "path_to_root": self.get_path_to_root(scripts),
            "seller_id": self.get_seller_id(scripts),
            "location": self.get_location(soup),
            "seller_type": self.get_seller_type(scripts),
            "reputation_level": self.get_reputation_level(scripts),
            "seller_status": self.get_seller_status(scripts),
            "customer_satisfaction": self.get_customer_satisfaction(scripts),
            "seller_age": self.get_seller_age(scripts),
            "sales_completed": self.get_sales_completed(scripts),
            "link": item_link,
        }

示例#17

0

显示文件

文件： commands.py 项目： gil9red/telegram__random_bashim_bot

def on_get_errors_short(update: Update, context: CallbackContext):
    r"""
    Get errors (short):
     - /get_errors_short
     - get[ _]errors[ _]short
    """

    message = update.effective_message

    # Acknowledge the callback query when the handler was triggered by one.
    query = update.callback_query
    if query:
        query.answer()

    page = get_page(context)

    total = db.Error.select().count()
    items_per_page = ERRORS_PER_PAGE

    # Number of the first entry on this page, so numbering runs across pages
    first_number = ((page - 1) * items_per_page) + 1

    errors = db.Error.get_by_page(page=page, items_per_page=items_per_page)

    lines = [
        f'{i}. {error.get_short_title()}'
        for i, error in enumerate(errors, first_number)
    ]
    text = 'Ошибки:\n' + '\n'.join(lines)

    pattern = fill_string_pattern(PATTERN_GET_ERRORS_SHORT_BY_PAGE,
                                  '{page}')
    reply_text_or_edit_with_keyboard_paginator(
        message,
        query,
        text,
        page_count=total,
        items_per_page=items_per_page,
        current_page=page,
        data_pattern=pattern,
    )

示例#18

0

显示文件

文件： commands.py 项目： gil9red/telegram__random_bashim_bot

def on_get_users_short(update: Update, context: CallbackContext):
    r"""
    Get users (short):
     - /get_users_short
     - get[ _]users[ _]short
    """

    message = update.effective_message

    # Acknowledge the callback query when the handler was triggered by one.
    query = update.callback_query
    if query:
        query.answer()

    page = get_page(context)

    total_users = db.User.select().count()
    items_per_page = ITEMS_PER_PAGE

    # Number of the first entry on this page, so numbering runs across pages
    first_number = ((page - 1) * items_per_page) + 1

    users = db.User.get_by_page(page=page, items_per_page=items_per_page)

    lines = [
        f'{i}. {user.get_short_title()}'
        for i, user in enumerate(users, first_number)
    ]
    text = f'Пользователи ({total_users}):\n' + '\n'.join(lines)

    pattern = fill_string_pattern(PATTERN_GET_USERS_SHORT_BY_PAGE,
                                  '{page}')
    reply_text_or_edit_with_keyboard_paginator(
        message,
        query,
        text,
        page_count=total_users,
        items_per_page=items_per_page,
        current_page=page,
        data_pattern=pattern,
    )

示例#19

0

显示文件

文件： spider.py 项目： benitesf/ecommerce-scraper

    def get_categories(self, save_path="", cache=True):
        """
        Find categories and return a dictionary with each category information.

        The result is keyed by category id; each value holds the category
        "name", its "link" and the "sub" dictionary built by get_sub_cats.

        :param save_path: when non-empty and at least one category was
            found, the dictionary is also pickled to this path.
        :param cache: when True and categories were already loaded, the
            stored dictionary is returned without fetching the page again.
        :return: the categories dictionary (empty when the page has no
            category container; an empty result is not stored on self).
        """
        if (cache is True) and (self.categories is not None):
            logging.info("cache categories dictionary...")
            return self.categories

        url = self.url["category"]
        page = common.get_page(url)
        common.sleep_random_between(1, 2)

        cat_container = page.find_all("div", class_="categories__container")

        cat = {}

        if not cat_container:
            logging.info(
                "category container is empty, returning empty dictionary...")
            return cat

        for c in cat_container:
            name = c.h2.text
            link = c.h2.a["href"]
            cat_id = re.findall(self.patt["cat_id"], link)
            if not cat_id:
                logging.info("could not find category id, passing...")
                continue
            cat_id = cat_id[0]

            sub = self.get_sub_cats(link)

            cat[cat_id] = {"name": name, "link": link, "sub": sub}

        if cat and save_path != "":
            common.save_pickle(save_path, cat)

        self.categories = cat
        return cat

示例#20

0

显示文件

def index():
    """Fetch the statistics page and store one overview row plus one row per area.

    Returns wrap_response("fail") when the page could not be fetched or does
    not contain the two expected data elements, and wrap_response("success")
    once all rows are committed.
    """
    response = get_page()
    if not response:
        return wrap_response("fail")
    soup = BeautifulSoup(response.text, 'lxml')
    area_stat = soup.find(id='getAreaStat')
    total_stat = soup.find(id='getStatisticsService')

    # find() returns None for a missing element; report that as a failed
    # fetch instead of raising AttributeError on `.text` below.
    if area_stat is None or total_stat is None:
        return wrap_response("fail")

    # Each element's text embeds JSON between "<name> =" and "}catch".
    area_data = area_stat.text.split('getAreaStat =')[-1].split('}catch')[0]
    area_result = json.loads(area_data)

    overview_data = total_stat.text.split('getStatisticsService =')[-1].split(
        '}catch')[0]
    overview_result = json.loads(overview_data)

    confirmed_cnt = overview_result.get('confirmedCount')
    suspected_cnt = overview_result.get('suspectedCount')
    cured_cnt = overview_result.get('curedCount')
    dead_cnt = overview_result.get('deadCount')

    # All rows of one run share the same Asia/Shanghai timestamp.
    tz = pytz.timezone('Asia/Shanghai')
    tm = datetime.now(tz=tz).strftime("%Y-%m-%d %H:%M:%S")

    total_view = TotalView(tm, confirmed_cnt, suspected_cnt, dead_cnt,
                           cured_cnt)
    db.session.add(total_view)
    db.session.commit()

    for item in area_result:
        name = item.get('provinceShortName')
        confirmed = item.get('confirmedCount')
        cured = item.get('curedCount')
        dead = item.get('deadCount')
        prov = ProvView(tm, name, confirmed, cured, dead)
        db.session.add(prov)
    db.session.commit()

    return wrap_response("success")

示例#21

0

显示文件

    years = "%d,%d" % (year - 1, year + 1)

    url = u'http://www.imdb.com/List'
    url = u'http://www.imdb.com/search/title?'

    data = {'title': title,
            'release_date': years}

    try:
        url = url + urllib.urlencode(data)
    except Exception, e:
        logging.error("Could not URL encode %s" % str(data))
        return None
    data = None
    _, page = common.get_page(url, data)

    if page is None:
        logging.info("Couldn't get IMDb search page for '%s'" % name)
        return None

    # Cleanup dumbass IMDB stuff
    page = page.replace('rate""', 'rate"').replace('"src', '" src')

    document = B(page)

    results = document.findAll('tr', attrs={'class': re.compile('detailed')})


    for result_node in results:
        extras = {}

示例#22

0

显示文件

    stardict_parse)


def etymonline_parse(raw):
    """Turn every <dt>/<dd> pair found in *raw* into a result block.

    Each block is a tuple ``(first_word, 'Etymonline, ' + heading, [entry])``
    where heading and entry are the pair's contents passed through
    ``strip_html`` and first_word is the leading word of the lower-cased
    heading.
    """
    def to_block(match):
        heading, entry = [strip_html(match.group(i)) for i in (1, 2)]
        first_word = re.match(r' *(\w*)', heading.lower()).group(1)
        return (first_word, 'Etymonline, ' + heading, [entry])

    pairs = re.finditer(
        r'<dt[^>]*>(.*?)</dt>[^<]*<dd[^>]*>((.|\n)*?)</dd>', raw)
    return map(to_block, pairs)


# Translator built from two callables: the first fetches the Etymonline
# search page for the stem of a word, the second is etymonline_parse.
# NOTE(review): presumably make_translator feeds the fetched page to the
# parser; its definition is not visible here, so confirm.
use_etymonline = make_translator(
    lambda w: get_page(
        'http://www.etymonline.com/index.php?allowed_in_frame=0&searchmode=nl&search='
        + get_stem(w)), etymonline_parse)


class UrbanDictionaryHTMLParser(HTMLParser):
    def mk_matchers(conds):
        def match_start(self, name, attrs):
            assert (name in conds)

            if name not in self._markers:
                if conds[name](attrs):
                    self._markers[name] = 0 + self._depth
                    return True
            else:
                assert (self._markers[name] - self._depth < 0)
                return True

示例#23

0

显示文件

文件： metacritic.py 项目： pranjalv123/media-enclave

def lookup_metacritic_metadata(content):
    """Search Metacritic for *content* and return metadata of the first match.

    The search URL is chosen from ``content.kind``. The first result whose
    title matches and whose link yields an id is returned as a dict with
    'mc_uri' and 'mc_id', plus 'mc_status' and (when numeric) 'mc_score' if
    the result shows a metascore. Returns None when the page cannot be
    fetched or no result qualifies.
    """
    metadata = {}
    name = content.simple_name()
    title, year = common.detect_title_year(name)

    url_kind_map = { models.KIND_MOVIE: 'http://www.metacritic.com/search/movie/%s/results',
                     models.KIND_SERIES: 'http://www.metacritic.com/search/tv/%s/results',
                     models.KIND_TV: 'http://www.metacritic.com/search/tv/%s/results',
                     models.KIND_SEASON: 'http://www.metacritic.com/search/tv/%s/results' }

    url = url_kind_map[content.kind]

    # Remove special characters that the regular metacritic search seems to
    # remove anyway, then join the remaining words with '+'.
    title_utf8 = title.encode('utf-8')
    title_stripped = re.sub('[!@#$%^&*();.,?]', '', title_utf8).strip()
    title_stripped = re.sub(r'[:\-\s]', '+', title_stripped)

    url = url % title_stripped
    logging.info("Trying to search: %s" % url)
    _, page = common.get_page(url)

    if not page:
        logging.error("Couldn't get metacritic page for '%s'" % content)
        return None

    doc = B(page)

    # Get results
    results = doc.findAll('li', attrs={'class': re.compile('result')})

    for result in results:
        title_node = result.findChild('h3', attrs={'class': re.compile('product_title')})
        title_link = title_node.findChild('a') if title_node else None
        mc_title = title_link.string if title_link else None

        if not title_link or not mc_title:
            logging.warning("Could't find MC title link for result.")
            continue

        mc_title = mc_title.strip()

        if not common.title_match(title, mc_title):
            try:
                logging.warning(u"Skipping MC title '%s' because it didn't "
                                "match '%s'" % (mc_title, title))
            except Exception:
                # print_exc() reports the exception being handled; its first
                # parameter is a traceback depth limit, not the exception.
                traceback.print_exc()
            continue

        logging.info("Found a matching title, '%s' for '%s'" % (mc_title, title))

        mc_url = title_link.get('href')
        id_match = re.match('/(?P<type>movie|tv)/(?P<mc_id>.*)', mc_url)
        if not id_match:
            logging.warning("Could't find MC id from link '%s'." % mc_url)
            continue

        metadata['mc_uri'] = mc_url
        metadata['mc_id'] = id_match.groupdict()['mc_id']

        metascore_node = result.findChild('span', attrs={'class': re.compile('metascore')})
        metascore = metascore_node.string if metascore_node else None

        if metascore:
            metascore_class = metascore_node.get('class')

            # The first label whose 'score_<label>' marker appears in the
            # class attribute wins; the order matches the original checks.
            score = 'unknown'
            for label in ('outstanding', 'favorable', 'mixed',
                          'unfavorable', 'terrible', 'tbd'):
                if 'score_' + label in metascore_class:
                    score = label
                    break

            metadata['mc_status'] = score

            try:
                metadata['mc_score'] = int(metascore)
            except (TypeError, ValueError):
                logging.error("Couldn't convert metascore '%s' to integer." % metascore)

        # Only the first matching result is used.
        return metadata

    return None

示例#24

0

显示文件

def imdb_parse_page_metadata(imdb_id):
    # Fetch the IMDb "combined" page for imdb_id and collect values into a
    # metadata dict: poster URL/width, user rating, directors and writers
    # are the fields visible in this part of the function.
    # NOTE(review): the listing ends inside the `try:` below; the matching
    # `except` and the return are not visible here.
    url = u'http://www.imdb.com/title/tt%s/combined' % imdb_id

    try:
        logging.info("Looking up '%s'" % url)
        _, page = common.get_page(url)

        # BeautifulSoup can't handle hex entities. Massage them into decimal.
        # NOTE(review): the copy of B.MARKUP_MASSAGE is overwritten by the
        # next assignment, so only the hex-entity rule is applied. Confirm
        # whether the default rules were meant to be kept as well.
        hexentityMassage = copy.copy(B.MARKUP_MASSAGE)
        hexentityMassage = [(re.compile('&#x([^;]+);'),
                             lambda m: '&#%d;' % int(m.group(1), 16))]

        document = B(page, convertEntities=B.ALL_ENTITIES,
                     markupMassage=hexentityMassage)

        metadata = {}

        # Grab the poster
        poster_node = document.findChild('img', attrs={'id': 'primary-poster'})
        poster_url = poster_node.get('src') if poster_node else None
        if poster_url:
            logging.info("Found IMDb Poster URL: '%s'" % poster_url)
            # IMDb Poster URLs work like this:
            # http://ia.media-imdb.com/images/M/MV5BOTI5ODc3NzExNV5BMl5BanBnXkFtZTcwNzYxNzQzMw@@._V1._SX214_CR0,0,214,314_.jpg
            # Everything after the @@ is a format command.
            # ._V1 not sure
            # ._SX214 format width 214 pixels
            # ._SY214 format height 214 pixels
            # _CR0,0,214,214_ not sure

            # So to collect our images at X by Y, just replace 'SX\d+' by the
            # desired width. The combined details page defaults to a small
            # thumbnail.

            # Eliminate height restrictions.
            poster_url = re.sub('_SY\d+', '', poster_url)

            desired_width = settings.IMDB_THUMBNAIL_WIDTH

            # Replace the width restriction with our desired width.
            poster_url = re.sub('\._SX\d+', "._SX%d" % desired_width, poster_url)

            metadata['imdb_cover_uri'] = poster_url
            metadata['imdb_cover_width'] = desired_width

        info_nodes = document.findAll('div', attrs={'class': re.compile('^info( stars)?$')})

        # Return the first plain-string child, stripped, or None.
        # NOTE(review): the `contents` parameter is ignored; the loop reads
        # the enclosing `node_content` variable instead. Confirm which one
        # was intended before changing it (the callers are not visible here).
        def take_first_string(contents):
            for item in node_content.contents:
                if isinstance(item, basestring):
                    return unicode(item.strip())
            return None

        # Each info block pairs an <h5> title with a content <div>; blocks
        # missing either part are skipped.
        for node in info_nodes:
            node_title = node.findChild('h5')
            node_title = node_title.string if node_title else None
            node_content = node.findChild('div', attrs={'class': re.compile('^(info-content|starbar-meta)$')})

            if not node_title or not node_content:
                continue

            if node_title == 'User Rating:':
                # The rating is read from text shaped like "<number>/10".
                rating_node = node_content.findChild('b')
                rating_match = re.match("(?P<rating>[0-9.]+)/10", rating_node.string.strip()) \
                    if rating_node and rating_node.string else None
                if rating_match:
                    try:
                        metadata['imdb_rating'] = float(rating_match.groupdict()['rating'])
                    except Exception, e:
                        logging.error("Couldn't parse rating: '%s'" % rating_match.groupdict()['rating'])
            elif node_title == 'Director:':
                metadata['imdb_directors'] = [unicode(subnode.string)
                                              for subnode in node_content.findAll('a')
                                              if subnode.string]
            elif node_title == 'Writers:':
                metadata['imdb_writers'] = [unicode(subnode.string)
                                            for subnode in node_content.findAll('a')
                                            if subnode.string]

示例#25

0

显示文件

def imdb_find_id(title, year=None):
    """Find the IMDb id for *title* through IMDb's regular title search.

    If IMDb redirects straight to a title page, that id is returned.
    Otherwise the first result link whose title matches (and whose year
    equals *year*, when given) supplies the id. Returns None when nothing
    matches or the lookup fails; failures are logged, not raised.
    """
    title = title.decode('utf8')

    url = u'http://www.imdb.com/find?%s'
    data = {'s': 'tt',
            'q': title.encode('latin1')}

    try:
        url = url % urllib.urlencode(data)
        logging.info("Executing IMDB regular search for '%s' at '%s'" % (title, url))
        result_url, page = common.get_page(url)

        result_url = result_url.replace('http://www.imdb.com', '')
        result_url_match = imdb_title_pattern.match(result_url)
        if result_url_match:
            # IMDb saw fit to redirect us to the thing we searched for. Let's
            # trust them?
            logging.info("IMDb redirected us to '%s', trusting them." % result_url)
            return result_url_match.groupdict()['imdb_id']

        # BeautifulSoup can't handle hex entities. Massage them into decimal.
        # NOTE(review): only this rule is applied, as before; a copy of
        # B.MARKUP_MASSAGE used to be made here and was overwritten at once.
        hexentityMassage = [(re.compile('&#x([^;]+);'),
                             lambda m: '&#%d;' % int(m.group(1), 16))]

        document = B(page, convertEntities=B.HTML_ENTITIES,
                     markupMassage=hexentityMassage)

        links = document.findAll('a', attrs={'href': re.compile(r'^/title/tt\d{7}/$')})
        for link in links:
            link_title = link.string
            if not link_title:
                continue

            if not common.title_match(title, link_title):
                logging.info("Skipping IMDB link title '%s' because it didn't match '%s'" % (link_title, title))
                continue

            # The year is expected as "(YYYY...)" text right after the link.
            link_year = link.nextSibling

            if not isinstance(link_year, basestring):
                continue

            link_year = link_year.strip()
            link_year_match = re.match(r'\((?P<year>\d{4}).*?\)', link_year)
            link_year = link_year_match.groupdict()['year'] if link_year_match else None

            if not link_year:
                continue

            # link_year is text; compare as text so an integer `year` can
            # match as well.
            if year and link_year != str(year):
                logging.info("Link '%s's year '%s' doesn't match '%s'." % (link_title, link_year, year))
                continue

            imdb_url = link.get('href')
            imdb_match = re.match(r'^/title/tt(?P<imdb_id>\d{7})/', imdb_url)
            logging.info("Found match for '%s (%s)': '%s (%s)'" % (title, year, link_title, link_year))
            # We know this because the nodes were selected with this regex.
            assert imdb_match
            return imdb_match.groupdict()['imdb_id']
        logging.error("Found no matches for '%s'" % title)
    except Exception:
        logging.error("Couldn't get IMDB regular search for '%s'" % title)
        # print_exc() reports the exception being handled; its first
        # parameter is a traceback depth limit, not the exception.
        traceback.print_exc()

    return None

示例#26

0

显示文件

def rottentomatoes_find_id(title, year=None, imdb_id=None):
    """Search Rotten Tomatoes for ``title`` and return the matching RT id.

    The search results page is fetched with ``common.get_page`` and parsed
    with BeautifulSoup.  Each result's link title (and any trailing
    parenthesised AKAs) is compared against ``title`` using
    ``common.title_match``; when ``year`` is given, the result's
    ``movie_year`` text must also equal it.

    title   -- movie title to search for (must be encodable as latin1).
    year    -- optional year; compared for equality with the year text
               scraped from the result, so it should be the same type.
    imdb_id -- accepted for interface compatibility; not used here.

    Returns the ``id`` group of ``rottentomatoes_id_pattern`` for the first
    acceptable result, or None when nothing matches or the lookup fails
    (failures are logged, not raised).
    """
    # Find the content by search.
    url = u"http://www.rottentomatoes.com/search/movie.php?%s"
    title_latin1 = title.encode('latin1')
    data = {'searchby': 'movies',
            'search': title_latin1}

    # Matches a trailing "(...)" group with no nested parentheses.
    endparen_pattern = r"\(([^\(\)]+)\)$"

    try:
        url = url % urllib.urlencode(data)
        logging.info("Executing RT regular search for '%s' at '%s'" % (title, url))
        _result_url, page = common.get_page(url)

        # BeautifulSoup can't handle hex entities. Massage them into decimal.
        # Only this rule is passed as the massage list (the previous code
        # copied B.MARKUP_MASSAGE and then immediately discarded the copy).
        hexentityMassage = [(re.compile('&#x([^;]+);'),
                             lambda m: '&#%d;' % int(m.group(1), 16))]

        document = B(page, convertEntities=B.HTML_ENTITIES,
                     markupMassage=hexentityMassage)

        results_ul = document.findChild('ul', attrs={'id': re.compile('movie_results_ul')})
        results = (results_ul.findAll('li', attrs={'class': re.compile('media_block')})
                   if results_ul else None)

        if results is None:
            logging.error("Couldn't lookup RT ID for '%s (%s)'" % (title, year))
            return None

        for result_node in results:
            # Scope in on the content div, because otherwise we get the poster
            # image.
            content_div = result_node.findChild(
                'div', attrs={'class': re.compile('media_block_content')})
            # A result without a content div is treated like one without a
            # link instead of aborting the whole search.
            link = (content_div.findChild('a', attrs={'href': rottentomatoes_id_pattern})
                    if content_div else None)

            link_title = link.string if link else None
            if not link_title:
                logging.error("Couldn't find RT result link title. Skipping")
                continue

            # Try the original title first.
            titles = [link_title]

            # Rotten Tomatoes annoyingly embeds the AKAs in the title in parens following the head title.
            # For example:
            # - Batoru rowaiaru II: Chinkonka (Battle Royale II)
            # - Battle Royale (Batoru Rowaiaru)
            endparen_match = re.search(endparen_pattern, link_title)

            while endparen_match:
                titles.append(endparen_match.group(1))
                # Strip out the ending (title) and any spaces before it.
                link_title = re.sub(r"\s*" + endparen_pattern, '', link_title)
                endparen_match = re.search(endparen_pattern, link_title)

                # Add the final version of the title with the AKAs removed to
                # the title list.
                if not endparen_match:
                    titles.append(link_title)

            found_title = None
            for aka in titles:
                if common.title_match(title, aka):
                    logging.info("Found RT title match '%s' for '%s'" % (aka, title))
                    found_title = aka
                    break
                try:
                    logging.warning(u"Skipping RT title '%s' because it didn't match '%s'" % (aka, title))
                except Exception:
                    # Formatting the message itself may fail (e.g. on mixed
                    # encodings); report it but keep checking other AKAs.
                    traceback.print_exc()

            if not found_title:
                continue

            span_year = result_node.findChild('span', attrs={'class': re.compile('movie_year')})
            # Only strip when a year string is actually present; a missing
            # span leaves link_year as None rather than raising.
            if span_year and span_year.string:
                link_year = unicode(span_year.string).strip(' ()')
            else:
                link_year = None

            if year and link_year != year:
                logging.info("Link '%s's year '%s' doesn't match '%s'." %
                             (link_title, link_year, year))
                continue

            # Get RT ID
            link_href = link.get('href')
            link_match = rottentomatoes_id_pattern.match(link_href)
            if not link_match:
                # Explicit check rather than assert, which is stripped
                # under -O.
                continue
            return link_match.groupdict()['id']
    except Exception:
        # print_exc() takes no exception argument; it reports the exception
        # currently being handled.
        traceback.print_exc()
        logging.error("Couldn't lookup RT ID for '%s (%s)'" % (title, year))

    return None

示例#27

0

显示文件

文件： httpxml_to_s3.py 项目： hhagblom/lambda-decorators

 def handler():
     """Fetch ``url`` with common.get_page, parse it via ET.fromstring and
     return the result of calling ``fn_inner`` on the parsed value.

     ``url`` and ``fn_inner`` are taken from the enclosing scope.
     """
     return fn_inner(ET.fromstring(common.get_page(url)))