def get_memes():
    """Serve one meme (when a valid ``?id=`` is supplied) or the full listing.

    The listing is every meme's short description plus a quiz entry and two
    generator entries appended at the end.
    """
    meme_id = request.args.get('id')
    if meme_id is not None and Meme.exists(id=int(meme_id)):
        meme = Meme.get(id=int(meme_id))
        return jsonify(Server._get_meme_long_desc(meme))
    listing = [Server._get_meme_short_desc(meme) for meme in Meme.select()]
    listing.append(Server._get_quiz_desc(1))
    listing.extend(Server._get_generator_desc(n) for n in (1, 2))
    return jsonify(listing)
def parse_meme(m):
    """Parse one demotywatory listing entry into a ``Meme``.

    Returns None when the entry has no URL (e.g. a non-meme slot).
    Points and comment count stay None when the counters are missing
    or non-numeric.
    """
    title, url, content, description = parse_content(m)
    if url is None:
        return

    points = None
    points_text = m.css(".up_votes::text").get()
    try:
        points = int(points_text)
    except (TypeError, ValueError):
        # Counter absent (None) or not a number — leave points unknown.
        pass

    comment_count = None
    comments_count_text = m.css(".demot-comments a::text").get()
    try:
        comment_count = int(comments_count_text)
    except (TypeError, ValueError):
        pass

    return Meme(
        title,
        ROOT + url,
        "/demotywatory/{}".format(find_id_in_url(url)),
        content,
        None,
        None,
        points,
        comment_count,
    )
def parse_meme(m):
    """Parse one 9gag API post dict into a ``Meme``.

    Animated posts are mapped to a ``VideoContent`` (the rendition that
    carries a ``duration`` key); everything else uses the ``image700``
    rendition as ``ImageContent``.
    """
    title = html.unescape(m["title"])
    url = m["url"]
    comment_count = int(m["commentsCount"])
    points = int(m["upVoteCount"]) - int(m["downVoteCount"])
    tags = [Tag(tag["key"], ROOT + tag["url"]) for tag in m["tags"]]

    content = None
    if m["type"] == "Animated":
        # Only the values are needed — the rendition names are irrelevant;
        # the video rendition is the one that reports a duration.
        for rendition in m["images"].values():
            if "duration" in rendition:
                content = VideoContent(rendition["url"])
    else:
        content = ImageContent(m["images"]["image700"]["url"])

    return Meme(
        title,
        url,
        "/9gag/{}".format(get_last_part_url(url)),
        content,
        None,
        tags,
        points,
        comment_count,
    )
def parse_meme(m):
    """Parse one anonimowe story selector into a text-only ``Meme``.

    Returns None when the header anchor yields no title (matches the
    None-guard convention used by the other parsers in this project).
    """
    title = m.css("header.story-header a::text").get()
    if title is None:
        # Without a title we cannot build the local URL below.
        return None
    url = m.css("header.story-header a::attr(href)").get()

    points = None
    points_text = m.css("span.points::text").get()
    try:
        points = int(points_text)
    except (TypeError, ValueError):
        # Counter absent (None) or not a number — leave points unknown.
        pass

    text = "\n".join(
        part.replace("\r\n", "")
        for part in m.css("article > section ::text").getall()
    )
    content = TextContent(text)

    return Meme(
        title,
        url,
        "/anonimowe/{}".format(title.replace("#", "")),
        content,
        None,
        None,
        points,
        None,
    )
def _parse_meme_page(page_url: str):
    """Scrape a single meme entry page and persist it as a ``Meme``.

    Fetches the page, extracts the header image, the "about" paragraph,
    the optional "origin" paragraph and the entry-type tags, then creates
    a ``Meme`` row unless one with the same name already exists.
    Returns early (saving nothing) when the extracted name contains
    characters outside a small latin/punctuation alphabet.
    """
    response = requests.get(page_url, headers={'user-agent': 'vkhack-bot'})
    parsed_page = BeautifulSoup(response.text, "html.parser")
    # The header image anchor appears with one of two class spellings —
    # note the trailing space in the second variant, which is how the site
    # actually emits it.
    image = parsed_page.find(name='a', attrs={'class': 'photo left wide'})
    if image is None:
        image = parsed_page.find(name='a', attrs={'class': 'photo left '})
    image = image.get('href')
    about_tag = parsed_page.find(name='h2', attrs={'id': 'about'})
    # Index [1] skips what is presumably the whitespace text node directly
    # after the <h2>; verify against the live page markup.
    after_about_tag = list(about_tag.next_siblings)[1]
    origin = None
    origin_tag = parsed_page.find(name='h2', attrs={'id': 'origin'})
    if origin_tag:
        after_origin_tag = list(origin_tag.next_siblings)[1]
        origin = after_origin_tag.text
    # The meme name is taken from the first child of the about paragraph.
    name = list(after_about_tag.children)[0].text
    about = after_about_tag.text
    # Filter out entries whose names are not plain latin text/punctuation.
    if not all(ch.isalpha() or ch in ' .,?!-\"\'' for ch in name):
        return
    type_tags = parsed_page.find_all(name='a', attrs={'class': 'entry-type-link'})
    meme_type = ','.join(tag.text for tag in type_tags)
    if not Meme.exists(name=name):
        logger.info(f'Add meme {page_url}')
        # Pony ORM entity: instantiation registers the new row in the
        # surrounding db session.
        Meme(name=name, image=image, about=about, origin=origin, type=meme_type)
def parse_meme(m):
    """Parse one kwejk listing selector into a ``Meme``.

    Returns None when no content (image/gallery/etc.) could be extracted.
    Points and comment count stay None when the data-* counters are not
    numeric.
    """
    title = m.css(".content > h2 > a::text").get().strip()
    url = m.css(".content > h2 > a::attr(href)").get().strip()
    tags = [
        Tag(tag.css("::text").get(), tag.attrib["href"])
        for tag in m.css(".content > div > div > a")
    ]

    author_link = m.css("div.user-bar > div > a")
    author_url = author_link.attrib["href"]
    author_name = author_link.css("span.name::text").get()

    # Attribute reads stay outside the try: a missing attribute is a page
    # structure change and should surface as KeyError, not be swallowed.
    comment_count_text = m.attrib["data-comments-count"]
    votes_down_text = m.attrib["data-vote-down"]
    votes_up_text = m.attrib["data-vote-up"]

    points = None
    try:
        points = int(votes_up_text) - int(votes_down_text)
    except (TypeError, ValueError):
        pass

    comment_count = None
    try:
        comment_count = int(comment_count_text)
    except (TypeError, ValueError):
        pass

    # Gallery posts link through /przegladaj and need their own parser.
    if "/przegladaj" in url:
        content = parse_gallery(url)
    else:
        content = parse_simple_content(m)
    if content is None:
        return None

    return Meme(
        title,
        url,
        "/kwejk/{}".format(find_id_in_url(url)),
        content,
        Author(author_name, author_url),
        tags,
        points,
        comment_count,
    )
def download_images(self):
    """Download every meme's image into ``IMAGE_FOLDER``, named by meme id.

    Images already on disk are skipped, and non-200 responses are silently
    ignored so a single bad URL does not abort the whole run.
    """
    self.logger.info('Downloading images...')
    # exist_ok avoids the race between the previous exists() check and
    # mkdir() when another run created the folder in between.
    self.IMAGE_FOLDER.mkdir(exist_ok=True)
    for meme in Meme.select():
        image = self.IMAGE_FOLDER / str(meme.id)
        if image.exists():
            continue
        r = requests.get(meme.image, stream=True)
        if r.status_code != 200:
            continue
        with image.open('wb') as f:
            # Explicit chunk size: iterating the Response directly yields
            # tiny 128-byte chunks, which is needlessly slow for images.
            for chunk in r.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
def parse_meme(m):
    """Parse one mistrzowie listing selector into a ``Meme``.

    Returns None for entries without a title. Comment count is 0 when the
    comment label does not match ``COMMENT``, and None when the label is
    missing entirely.
    """
    title = m.css("h1.picture > a::text").get()
    if title is None:
        return None
    title = title.strip()
    url = m.css("h1.picture > a::attr(href)").get()

    points = None
    points_text = m.css("span.total_votes_up > span.value::text").get()
    try:
        points = int(points_text)
    except (TypeError, ValueError):
        pass

    comment_count = None
    comments_count_text = m.css("a.lcomment::text").get()
    # Guard the None case: .get() returns None when the anchor is absent,
    # and .replace() on None would raise AttributeError.
    if comments_count_text is not None:
        comments_count_text = (
            comments_count_text.replace("\t", "").replace("\n", "")
        )
        result = COMMENT.match(comments_count_text)
        if result:
            try:
                comment_count = int(result[1])
            except (TypeError, ValueError):
                pass
        else:
            comment_count = 0

    content = None
    src = m.css("img.pic::attr(src)").get()
    if src:
        content = ImageContent(ROOT + src)

    return Meme(
        title,
        ROOT + url,
        "/mistrzowie/{}".format(find_id_in_url(url)),
        content,
        None,
        None,
        points,
        comment_count,
    )
def parse_meme(m):
    """Parse one jbzd listing selector into a ``Meme``.

    Returns None when the article container yields no parsable content.
    Points and comment count stay None when their counters are missing or
    non-numeric.
    """
    title = m.css(".article-title > a::text").get().strip()
    url = m.css(".article-title > a::attr(href)").get().strip()
    tags = [
        Tag(tag.css("::text").get(), tag.attrib["href"])
        for tag in m.css(".article-tags > a")
    ]

    points = None
    points_text = m.css(".btn-plus span::text").get()
    try:
        points = int(points_text)
    except (TypeError, ValueError):
        # Counter absent (None) or not a number — leave points unknown.
        pass

    comment_count = None
    comments_count_text = (
        remove_big_whitespaces_selector(m.css(".article-comments-count"))
        .css("::text")
        .get()
    )
    try:
        comment_count = int(comments_count_text)
    except (TypeError, ValueError):
        pass

    content = parse_content(m.css(".article-container"))
    if content is None:
        return None

    return Meme(
        title,
        url,
        "/jbzd/{}".format(find_id_in_url(url)),
        content,
        None,
        tags,
        points,
        comment_count,
    )
import argparse  # was missing: argparse.ArgumentParser is used below
import logging

# Configure logging before importing the crawlers so their module-level
# loggers pick up this format/level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO)

from crawlers import parse_know_your_meme
from crawlers import parse_imgflip
from pony import orm

from data import Meme

parser = argparse.ArgumentParser('Parse static.')
parser.add_argument('--resource', type=str, required=True, help='resource to parse')
parser.add_argument('--page_from', type=int, required=True, help='start page')
parser.add_argument('--page_to', type=int, required=True, help='end page')

if __name__ == '__main__':
    args = parser.parse_args()
    if args.resource == 'know_your_meme':
        parse_know_your_meme(args.page_from, args.page_to)
    elif args.resource == 'imgflip':
        # imgflip is parsed per already-known meme, inside one db session.
        with orm.db_session():
            for meme in Meme.select():
                parse_imgflip(meme.id, meme.name)
    else:
        raise RuntimeError(f'No such resource: {args.resource}')
def build_text_index(self):
    """Rebuild the full-text search index from every meme's about text."""
    corpus = [(meme.id, meme.about) for meme in Meme.select()]
    self.tsearcher.build_index(corpus)