Пример #1
0
def _strip_dd_tags(markup):
    """Remove one exact leading ``<dd>`` and trailing ``</dd>`` from *markup*.

    ``str.strip('<dd>')`` treats its argument as a *set of characters*, so it
    would also eat leading/trailing ``d``, ``<``, ``>`` and ``/`` characters
    that belong to the text itself; this helper only removes the literal tags.
    """
    opening, closing = '<dd>', '</dd>'
    if markup.startswith(opening):
        markup = markup[len(opening):]
    if markup.endswith(closing):
        markup = markup[:-len(closing)]
    return markup


def get_info(comic_num):
    """Scrape the explainxkcd wiki page of a comic into a ``Comic`` object.

    Args:
        comic_num: Comic number; it is converted with ``str()`` to build the
            page URL.

    Returns:
        A ``Comic`` with ``og_title``, ``title``, ``transcript``,
        ``og_ttext``, ``title_text``, ``explanation`` and ``img_url`` set,
        or the string ``"Error: comic <num> not found"`` when ``make_soup``
        returns ``None``.
    """
    URL = "https://www.explainxkcd.com/wiki/index.php/%s" % str(comic_num)
    soup = make_soup(URL)
    if soup is None:
        # %s (not %d) so a string comic number does not raise TypeError here.
        return "Error: comic %s not found" % (comic_num,)

    result = Comic(comic_num)

    # get title
    result.og_title = get_title(comic_num)
    result.title = clean_text(result.og_title.split())

    # get transcript: walk the siblings that follow the "Transcript" heading
    # and collect the children of every <dl> until a <span> sibling is hit.
    try:
        transcript = soup.find("span", {"id": "Transcript"})
        pieces = []
        cur = transcript.parent
        while cur:
            if cur.name == 'dl':
                pieces.extend(_strip_dd_tags(str(dd)) for dd in cur)
            elif cur.name == 'span':
                break
            cur = cur.nextSibling
        result.transcript = clean_text(" ".join(pieces).split())
    except Exception:
        # Best effort: a missing section (find() returned None) or any
        # parsing problem leaves the transcript empty.
        result.transcript = []

    # get title text
    result.og_ttext = get_ttext(comic_num)
    result.title_text = clean_text(result.og_ttext.split())

    # get explanation: same walk, collecting the text of every <p> sibling.
    try:
        explanation = soup.find("span", {"id": "Explanation"})
        paragraphs = []
        cur = explanation.parent
        while cur:
            if cur.name == 'p':
                paragraphs.append(cur.text)
            elif cur.name == 'span':
                break
            cur = cur.nextSibling
        result.explanation = clean_text(" ".join(paragraphs).split())
    except Exception:
        # Best effort, as for the transcript.
        result.explanation = []

    # get image URL
    result.img_url = get_img_url(comic_num)

    return result
Пример #2
0
    def insert_data(self, data):
        """
        Insert or update the ``Comic`` row described by ``data``.

        The row whose ``comic_id`` equals ``data.get('comic_id')`` is updated
        when it exists; otherwise a new ``Comic`` is created. Any failure is
        rolled back and logged instead of being raised to the caller.

        Args:
            data: Object read through ``.get(key)`` for the keys ``title``,
                ``alt``, ``comic_id``, ``source_file_location``,
                ``saved_file_location``, ``posted_at``, ``raw_json``,
                ``time_collected`` and ``transcript`` (presumably a dict;
                verify against the caller).
        """
        # Bound before the try so the error handler can tell whether a
        # session exists; otherwise a failing DBSession() would surface as an
        # UnboundLocalError from the handler and hide the real error.
        db_session = None
        try:
            db_session = DBSession()
            # Check if comic is in database, if so update else create
            try:
                comic = db_session.query(Comic).filter(
                    Comic.comic_id == data.get('comic_id')).one()
            except NoResultFound:
                comic = Comic()

            comic.title = data.get('title')
            comic.alt = data.get('alt')
            comic.comic_id = data.get('comic_id')
            comic.source_file_location = data.get('source_file_location')
            comic.saved_file_location = data.get('saved_file_location')
            comic.posted_at = data.get('posted_at')
            comic.raw_json = data.get('raw_json')
            comic.time_collected = data.get('time_collected')
            comic.transcript = data.get('transcript')

            db_session.add(comic)
            db_session.commit()
            # NOTE(review): the session is never closed here; confirm whether
            # DBSession is a scoped/shared session before adding a close().

        except Exception:
            if db_session is not None:
                db_session.rollback()
            logger.exception("Error adding to db %s", data)