示例#1
0
    def get_memes():
        """Return one meme's long description, or the full listing, as JSON.

        Reads the optional ``id`` query argument from ``request.args``.  When
        it names an existing meme, that meme's long description is returned.
        Otherwise (no ``id``, an ``id`` that is not an integer, or an ``id``
        for which no meme exists) the response is the list of short
        descriptions of every meme, followed by the quiz descriptor 1 and the
        generator descriptors 1 and 2.
        """
        raw_id = request.args.get('id')

        if raw_id is not None:
            try:
                meme_id = int(raw_id)
            except ValueError:
                # A non-numeric id cannot match any meme; fall through to the
                # listing, exactly like an unknown numeric id does, instead
                # of failing the request with an unhandled exception.
                meme_id = None

            if meme_id is not None and Meme.exists(id=meme_id):
                return jsonify(
                    Server._get_meme_long_desc(Meme.get(id=meme_id)))

        memes = [Server._get_meme_short_desc(meme) for meme in Meme.select()]

        memes.append(Server._get_quiz_desc(1))
        memes.append(Server._get_generator_desc(1))
        memes.append(Server._get_generator_desc(2))

        return jsonify(memes)
示例#2
0
def parse_meme(m):
    """Build a ``Meme`` from one scraped entry, or return ``None``.

    Args:
        m: Selector for a single entry.  It is handed to ``parse_content``
            and queried with ``.css(...)`` for the vote and comment counts.

    Returns:
        ``None`` when ``parse_content`` yields no URL; otherwise a ``Meme``
        whose points and comment count are ``None`` whenever the scraped
        text is missing or is not an integer.
    """
    title, url, content, description = parse_content(m)

    if url is None:
        return None

    points = None
    points_text = m.css(".up_votes::text").get()
    try:
        points = int(points_text)
    except (TypeError, ValueError):
        # Missing (None) or non-numeric text: leave the value unknown.
        pass

    comment_count = None
    comments_count_text = m.css(".demot-comments a::text").get()
    try:
        comment_count = int(comments_count_text)
    except (TypeError, ValueError):
        # Missing (None) or non-numeric text: leave the value unknown.
        pass

    return Meme(
        title,
        ROOT + url,
        "/demotywatory/{}".format(find_id_in_url(url)),
        content,
        None,
        None,
        points,
        comment_count,
    )
示例#3
0
def parse_meme(m):
    """Convert one post record into a ``Meme``.

    ``m`` is indexed by the keys ``title``, ``url``, ``commentsCount``,
    ``upVoteCount``, ``downVoteCount``, ``tags``, ``type`` and ``images``.
    The score is up-votes minus down-votes.  Posts of type ``"Animated"``
    get a ``VideoContent`` (or no content at all when no image variant has
    a ``duration``); every other type gets an ``ImageContent`` built from
    the ``image700`` variant.
    """
    meme_title = html.unescape(m["title"])
    meme_url = m["url"]

    n_comments = int(m["commentsCount"])
    score = int(m["upVoteCount"]) - int(m["downVoteCount"])

    meme_tags = []
    for entry in m["tags"]:
        meme_tags.append(Tag(entry["key"], ROOT + entry["url"]))

    media = None
    if m["type"] != "Animated":
        media = ImageContent(m["images"]["image700"]["url"])
    else:
        # No early exit on purpose: when several variants carry a
        # "duration", the last one iterated is the one that is kept.
        for variant in m["images"].values():
            if "duration" in variant:
                media = VideoContent(variant["url"])

    local_path = "/9gag/{}".format(get_last_part_url(meme_url))
    return Meme(
        meme_title,
        meme_url,
        local_path,
        media,
        None,
        meme_tags,
        score,
        n_comments,
    )
示例#4
0
def parse_meme(m):
    """Build a text ``Meme`` from one scraped story.

    Args:
        m: Selector for a single story; queried with ``.css(...)`` for the
            header link, the points and the text of the article section.

    Returns:
        A ``Meme`` carrying a ``TextContent``.  The text is every text node
        under ``article > section`` with ``"\\r\\n"`` removed, joined by
        newlines.  Points are ``None`` when the scraped text is missing or
        is not an integer.
    """
    title = m.css("header.story-header a::text").get()
    url = m.css("header.story-header a::attr(href)").get()

    points = None
    points_text = m.css("span.points::text").get()
    try:
        points = int(points_text)
    except (TypeError, ValueError):
        # Missing (None) or non-numeric text: leave the value unknown.
        pass

    text = "\n".join(
        part.replace("\r\n", "")
        for part in m.css("article > section ::text").getall()
    )
    content = TextContent(text)

    return Meme(
        title,
        url,
        # The local path is the title with every "#" removed.
        "/anonimowe/{}".format(title.replace("#", "")),
        content,
        None,
        None,
        points,
        None,
    )
示例#5
0
def _parse_meme_page(page_url: str):
    """Fetch one meme page and create a ``Meme`` for it if none exists yet.

    The page is downloaded and parsed.  The image URL is the ``href`` of
    the ``photo left wide`` link (falling back to ``photo left ``).  The
    "about" text comes from the second sibling after the ``h2#about``
    heading, and the meme name is the text of that sibling's first child.
    The "origin" text is taken the same way from ``h2#origin`` when that
    heading is present.

    The page is skipped without creating anything when the image link or
    the "about" heading is missing, or when the name contains a character
    that is neither a letter nor one of: space ``. , ? ! - " '``.

    Args:
        page_url: URL of the meme page to fetch.
    """
    response = requests.get(page_url, headers={'user-agent': 'vkhack-bot'})
    parsed_page = BeautifulSoup(response.text, "html.parser")

    image_link = parsed_page.find(name='a', attrs={'class': 'photo left wide'})
    if image_link is None:
        image_link = parsed_page.find(name='a', attrs={'class': 'photo left '})

    about_tag = parsed_page.find(name='h2', attrs={'id': 'about'})

    if image_link is None or about_tag is None:
        # Without these elements the lookups below would dereference None;
        # report the page and move on instead of raising AttributeError.
        logger.warning(
            f'Skip {page_url}: image link or about section not found')
        return

    image = image_link.get('href')
    # Index 1 skips the first sibling after the heading (presumably a
    # whitespace text node; verify against the page markup).
    after_about_tag = list(about_tag.next_siblings)[1]

    origin = None
    origin_tag = parsed_page.find(name='h2', attrs={'id': 'origin'})

    if origin_tag:
        after_origin_tag = list(origin_tag.next_siblings)[1]
        origin = after_origin_tag.text

    name = list(after_about_tag.children)[0].text
    about = after_about_tag.text

    if not all(ch.isalpha() or ch in ' .,?!-\"\'' for ch in name):
        return

    type_tags = parsed_page.find_all(name='a',
                                     attrs={'class': 'entry-type-link'})
    meme_type = ','.join(tag.text for tag in type_tags)

    if not Meme.exists(name=name):
        logger.info(f'Add meme {page_url}')
        # NOTE(review): the instance is not kept; presumably constructing
        # the entity is what registers it for persistence. Confirm.
        Meme(name=name,
             image=image,
             about=about,
             origin=origin,
             type=meme_type)
示例#6
0
def parse_meme(m):
    """Build a ``Meme`` from one scraped entry, or return ``None``.

    Args:
        m: Selector for a single entry.  Its ``data-comments-count``,
            ``data-vote-down`` and ``data-vote-up`` attributes are read, and
            it is queried with ``.css(...)`` for title, URL, tags and author.

    Returns:
        ``None`` when no content could be parsed; otherwise a ``Meme`` with
        its author and tags.  Points (up-votes minus down-votes) and the
        comment count are ``None`` when the attribute text is not an
        integer.
    """
    title = m.css(".content > h2 > a::text").get().strip()
    url = m.css(".content > h2 > a::attr(href)").get().strip()

    tags = [
        Tag(tag.css("::text").get(), tag.attrib["href"])
        for tag in m.css(".content > div > div > a")
    ]

    author_link = m.css("div.user-bar > div > a")
    author_url = author_link.attrib["href"]
    author_name = author_link.css("span.name::text").get()

    comment_count_text = m.attrib["data-comments-count"]
    votes_down_text = m.attrib["data-vote-down"]
    votes_up_text = m.attrib["data-vote-up"]

    points = None
    try:
        points = int(votes_up_text) - int(votes_down_text)
    except (TypeError, ValueError):
        # Non-numeric vote counters: leave the score unknown.
        pass

    comment_count = None
    try:
        comment_count = int(comment_count_text)
    except (TypeError, ValueError):
        # Non-numeric comment counter: leave the count unknown.
        pass

    # URLs containing "/przegladaj" are parsed as galleries from the URL;
    # everything else is parsed from the entry itself.
    if "/przegladaj" in url:
        content = parse_gallery(url)
    else:
        content = parse_simple_content(m)

    if content is None:
        return None

    return Meme(
        title,
        url,
        "/kwejk/{}".format(find_id_in_url(url)),
        content,
        Author(author_name, author_url),
        tags,
        points,
        comment_count,
    )
示例#7
0
    def download_images(self):
        """Download the image of every meme that is not stored locally yet.

        Each image is written to ``self.IMAGE_FOLDER`` under a file named
        after the meme's id.  Memes whose file already exists are skipped,
        and so are downloads that do not answer with HTTP status 200.
        """
        self.logger.info('Downloading images...')

        # exist_ok removes the race between an existence check and the
        # creation; parents also covers a missing parent directory.
        self.IMAGE_FOLDER.mkdir(parents=True, exist_ok=True)

        for meme in Meme.select():
            image = self.IMAGE_FOLDER / str(meme.id)

            if image.exists():
                continue

            # With stream=True the connection stays open until the body is
            # consumed or the response is closed; the context manager
            # releases it even for non-200 answers or failed writes.
            with requests.get(meme.image, stream=True) as r:
                if r.status_code != 200:
                    continue

                with image.open('wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
示例#8
0
def parse_meme(m):
    """Build a ``Meme`` from one scraped entry, or return ``None``.

    Args:
        m: Selector for a single entry; queried with ``.css(...)`` for the
            title link, the vote count, the comment link and the picture.

    Returns:
        ``None`` when the entry has no title; otherwise a ``Meme``.  Points
        are ``None`` when the scraped text is missing or is not an integer.
        The comment count is ``0`` when the comment link text does not match
        ``COMMENT``, and ``None`` when it matches but group 1 cannot be
        converted to an integer.  The content is ``None`` when no picture
        source is found.
    """
    title = m.css("h1.picture > a::text").get()
    if title is None:
        return None

    title = title.strip()
    url = m.css("h1.picture > a::attr(href)").get()

    points = None
    points_text = m.css("span.total_votes_up > span.value::text").get()
    try:
        points = int(points_text)
    except (TypeError, ValueError):
        # Missing (None) or non-numeric text: leave the value unknown.
        pass

    comment_count = None
    # Tabs and newlines are stripped before the text is matched.
    comments_count_text = (m.css("a.lcomment::text").get().replace(
        "\t", "").replace("\n", ""))

    result = COMMENT.match(comments_count_text)
    if result:
        try:
            comment_count = int(result[1])
        except (IndexError, TypeError, ValueError):
            # Group 1 absent, unmatched (None) or non-numeric: unknown count.
            pass
    else:
        comment_count = 0

    content = None
    src = m.css("img.pic::attr(src)").get()
    if src:
        content = ImageContent(ROOT + src)

    return Meme(
        title,
        ROOT + url,
        "/mistrzowie/{}".format(find_id_in_url(url)),
        content,
        None,
        None,
        points,
        comment_count,
    )
示例#9
0
def parse_meme(m):
    """Build a ``Meme`` from one scraped article, or return ``None``.

    Args:
        m: Selector for a single article; queried with ``.css(...)`` for the
            title link, tags, vote count, comment count and the content
            container.

    Returns:
        ``None`` when ``parse_content`` yields no content; otherwise a
        ``Meme`` with its tags.  Points and the comment count are ``None``
        when the scraped text is missing or is not an integer.
    """
    title = m.css(".article-title > a::text").get().strip()
    url = m.css(".article-title > a::attr(href)").get().strip()

    tags = [
        Tag(tag.css("::text").get(), tag.attrib["href"])
        for tag in m.css(".article-tags > a")
    ]

    points = None
    points_text = m.css(".btn-plus span::text").get()
    try:
        points = int(points_text)
    except (TypeError, ValueError):
        # Missing (None) or non-numeric text: leave the value unknown.
        pass

    comment_count = None
    comments_count_text = (
        remove_big_whitespaces_selector(m.css(".article-comments-count"))
        .css("::text")
        .get()
    )
    try:
        comment_count = int(comments_count_text)
    except (TypeError, ValueError):
        # Missing (None) or non-numeric text: leave the value unknown.
        pass

    content = parse_content(m.css(".article-container"))
    if content is None:
        return None

    return Meme(
        title,
        url,
        "/jbzd/{}".format(find_id_in_url(url)),
        content,
        None,
        tags,
        points,
        comment_count,
    )
示例#10
0
文件: crawl.py 项目: rebryk/memeaning
# argparse is required by the parser built below; it was used without being
# imported, which made the module fail with NameError as soon as it loaded.
import argparse
import logging

# NOTE(review): logging is configured before the remaining imports,
# presumably so that anything logged while importing them already uses this
# format. Keep this ordering unless confirmed otherwise.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO)

from crawlers import parse_know_your_meme
from crawlers import parse_imgflip
from pony import orm
from data import Meme

# Command line: --resource selects the crawler. --page_from and --page_to are
# always required, although only the 'know_your_meme' branch reads them.
parser = argparse.ArgumentParser('Parse static.')
parser.add_argument('--resource',
                    type=str,
                    required=True,
                    help='resource to parse')
parser.add_argument('--page_from', type=int, required=True, help='start page')
parser.add_argument('--page_to', type=int, required=True, help='end page')

if __name__ == '__main__':
    args = parser.parse_args()

    if args.resource == 'know_your_meme':
        parse_know_your_meme(args.page_from, args.page_to)
    elif args.resource == 'imgflip':
        # Every stored meme is passed to the imgflip parser by id and name,
        # inside a single database session.
        with orm.db_session():
            for meme in Meme.select():
                parse_imgflip(meme.id, meme.name)
    else:
        raise RuntimeError(f'No such resource: {args.resource}')
示例#11
0
 def build_text_index(self):
     """Feed ``(id, about)`` pairs of every meme to the text searcher."""
     rows = []
     for meme in Meme.select():
         rows.append((meme.id, meme.about))
     self.tsearcher.build_index(rows)