예제 #1
0
    def create_article(cls, title=None):
        """Build an article instance from a raw wikiHow page.

        Args:
            title: Optional title forwarded to ``cls.get_raw_wikihow_page``
                as the ``title`` keyword. When ``None`` the page is fetched
                without arguments.

        Returns:
            ``cls(url, title, steps, tips, errors)`` where ``title`` is the
            heading text read from the page (not the argument passed in),
            and ``steps``/``errors``/``tips`` come from ``cls.get_steps`` and
            ``cls.get_tips``.
        """
        if title is None:
            page = cls.get_raw_wikihow_page()
        else:
            page = cls.get_raw_wikihow_page(title=title)

        # The article heading is the text of the first link inside the
        # "firstHeading" <h1>; a leading "wiki" is dropped from it.
        heading = Element(page)("h1.firstHeading a")[0].string
        if heading.startswith("wiki"):
            heading = heading[4:]

        # The URL slug skips the first 7 characters of the heading
        # (presumably the "How to " prefix -- TODO confirm) and uses hyphens
        # in place of spaces.
        slug = heading[7:].replace(' ', '-')
        url = 'http://www.wikihow.com/{}'.format(slug)

        steps, errors = cls.get_steps(page)
        tips = cls.get_tips(page)

        return cls(url, heading, steps, tips, errors)
def _first_content(elements, default=""):
    """Return the ``content`` of the first element, or *default* if empty."""
    return elements[0].content if elements else default


def _joined_contents(containers, tag):
    """Join the ``content`` of all *tag* matches in the first container.

    Values are separated by ';'. Returns an empty string when *containers*
    is empty, i.e. when the page has no such section.
    """
    if not containers:
        return ""
    return ";".join(node.content for node in containers[0].by_tag(tag))


def scrape_movie_page(dom):
    """
    Scrape the IMDB page for a single movie

    Args:
        dom: pattern.web.DOM instance representing the page of 1 single
            movie.

    Returns:
        A tuple of 8 strings representing the following (in order): title,
        duration (digits only, the "min" suffix and whitespace removed),
        genre(s) (semicolon separated if several), director(s)
        (semicolon separated if several), writer(s) (semicolon separated if
        several), actor(s) (semicolon separated if several), rating, number
        of ratings. The release year is NOT extracted by this function.

        Any field whose section cannot be found on the page is returned as
        an empty string instead of raising IndexError.

    This function relies on Element from pattern.web, which allows the page
    to be queried with CSS selectors rather than nested loops over tags.
    """
    element = Element(dom)

    # Title: content of the first element carrying the "itemprop" class.
    title = _first_content(element.by_class("itemprop"))

    # The infobar holds both the duration and the genres; look it up once.
    infobars = dom.by_tag("div.infobar")

    # Duration: if several <time> tags exist, the last one wins. All
    # whitespace (spaces AND newlines) is removed before dropping "min".
    duration = ""
    for infobar in infobars:
        for time_tag in infobar.by_tag("time"):
            duration = "".join(time_tag.content.split()).replace("min", "")

    # Genres: every "itemprop" element inside the first infobar.
    genres = ""
    if infobars:
        genres = ";".join(
            genre.content for genre in infobars[0].by_class("itemprop"))

    # Directors
    # NOTE(review): directors match every <span>, whereas writers and actors
    # only match "span.itemprop" -- confirm whether this is intentional.
    directors = _joined_contents(element('div[itemprop="director"]'), "span")

    # Writers
    writers = _joined_contents(
        element('div[itemprop="creator"]'), "span.itemprop")

    # Actors
    actors = _joined_contents(
        element('div[itemprop="actors"]'), "span.itemprop")

    # Rating
    rating = _first_content(
        element.by_class("titlePageSprite star-box-giga-star")
    ).replace(" ", "")

    # Amount of raters
    n_ratings = _first_content(element('span[itemprop="ratingCount"]'))

    # Return everything of interest for this movie (all strings as specified
    # in the docstring of this function).
    return title, duration, genres, directors, writers, actors, rating, n_ratings