def create_article(cls, title=None):
    """Build an instance of ``cls`` from a wikiHow page.

    Args:
        title: Optional article title forwarded to
            ``cls.get_raw_wikihow_page``. When it is ``None`` the page is
            requested without a title argument.

    Returns:
        ``cls(url, title, steps, tips, errors)`` where ``title`` is the
        heading text read from the page (not the argument passed in),
        ``steps``/``errors`` come from ``cls.get_steps`` and ``tips`` from
        ``cls.get_tips``.
    """
    if title is None:
        raw_page = cls.get_raw_wikihow_page()
    else:
        raw_page = cls.get_raw_wikihow_page(title=title)

    # The article heading is the first link inside the h1.firstHeading tag.
    heading_link = Element(raw_page)("h1.firstHeading a")[0]
    heading = heading_link.string
    if heading.startswith("wiki"):
        # Drop the leading "wiki" (4 characters).
        heading = heading[4:]

    # The URL slug skips the first seven characters of the heading
    # (presumably the "How to " prefix -- confirm against real pages) and
    # uses dashes instead of spaces.
    slug = heading[7:].replace(' ', '-')
    url = 'http://www.wikihow.com/{}'.format(slug)

    steps, errors = cls.get_steps(raw_page)
    tips = cls.get_tips(raw_page)
    return cls(url, heading, steps, tips, errors)
def scrape_movie_page(dom):
    """
    Scrape the IMDB page for a single movie.

    The fields are located with the DOM's ``by_tag``/``by_class`` helpers
    and with CSS selectors applied through an ``Element`` wrapped around
    the page.

    Args:
        dom: pattern.web.DOM instance representing the page of 1 single
            movie.

    Returns:
        A tuple of strings, in this order:
            title,
            duration (text of the <time> tag with spaces, newlines and
                "min" removed; "" when div.infobar holds no <time> tag),
            genre(s) (semicolon separated if several),
            director(s) (semicolon separated if several),
            writer(s) (semicolon separated if several),
            actor(s) (semicolon separated if several),
            rating,
            number of ratings.

    Raises:
        IndexError: if the page has no title element, no div.infobar,
            no director/creator/actors block, no rating star box or no
            ratingCount span (each of these is read with ``[0]``).
    """
    element = Element(dom)

    # Title: first element carrying the "itemprop" class.
    title = element.by_class("itemprop")[0].content

    # Duration: stays "" when no <time> tag is present; if several are
    # found, the last one wins.
    duration = ""
    for infobar in dom.by_tag("div.infobar"):
        for time_tag in infobar.by_tag("time"):
            # Remove real newline characters ("\n"), not the two-character
            # sequence "/n", so the surrounding line breaks in the markup
            # do not end up in the value.
            duration = (
                time_tag.content
                .replace(" ", "")
                .replace("min", "")
                .replace("\n", "")
            )

    # Genres: every "itemprop" element inside the first info bar.
    first_infobar = dom.by_tag("div.infobar")[0]
    genres = ";".join(
        genre.content for genre in first_infobar.by_class("itemprop")
    )

    # Directors: all <span> tags inside the director block.
    director_block = element('div[itemprop="director"]')[0]
    directors = ";".join(
        span.content for span in director_block.by_tag("span")
    )

    # Writers: span.itemprop tags inside the creator block.
    writer_block = element('div[itemprop="creator"]')[0]
    writers = ";".join(
        span.content for span in writer_block.by_tag("span.itemprop")
    )

    # Actors: span.itemprop tags inside the actors block.
    actor_block = element('div[itemprop="actors"]')[0]
    actors = ";".join(
        span.content for span in actor_block.by_tag("span.itemprop")
    )

    # Rating: text of the star box with spaces removed.
    star_box = element.by_class("titlePageSprite star-box-giga-star")[0]
    rating = star_box.content.replace(" ", "")

    # Amount of raters.
    n_ratings = element('span[itemprop="ratingCount"]')[0].content

    # Return everything of interest for this movie (all strings as specified
    # in the docstring of this function).
    return (title, duration, genres, directors, writers, actors, rating,
            n_ratings)