def cutSafe(image, size_factor=2):
    # Download the image into the local cache, then split it into pieces.
    # Animated images are returned untouched; unreadable files yield nothing.
    cached_url.get(image, force_cache=True, mode='b')
    fn = cached_url.getFilePath(image)
    if isAnimated(fn):
        return [fn]
    if not getImg(fn):
        return []
    return list(cut(fn, size_factor=size_factor)) or [fn]

def getAlbum(url, force_cache=True, word_limit=200, paragraph_limit=3,
             append_source=False, append_url=True):
    content = _getArticle(url, force_cache=force_cache).text
    album = AlbumResult()
    for item in content.findAll('img'):
        path = item.get('src')
        if not path:
            continue
        try:
            cached_url.get(path, mode='b', force_cache=True)
            img = Image.open(cached_url.getFilePath(path))
        except:
            continue
        w, h = img.size
        file_size = os.stat(cached_url.getFilePath(path)).st_size
        # Skip known boilerplate header/footer images by file size and dimensions.
        if 36000 < file_size < 36200 and w == 1080 and h == 1080:  # 界面文化 header image
            continue
        if 27000 < file_size < 27300 and w == 640 and h == 640:  # 思想市场 column image
            continue
        if w == 750 and h == 234:  # 界面文化 header image
            continue
        if 6000 < file_size < 9000 and w == 347 and h == 347:  # 界面文化 header image
            continue
        if 87000 < file_size < 91000 and w == 900 and h == 500:  # 美国华人杂谈 header image
            continue
        if 53000 < file_size < 56000 and w == 795 and h == 504:  # WeChat footer image
            continue
        if 57000 < file_size < 61000 and w == 1011 and h == 282:  # 短史记 header image
            continue
        if w * 0.25 < h < w * 4 and min(w, h) > 100 and max(w, h) > 300:
            # print(file_size, w, h)
            album.imgs.append(item.get('src'))
            break
    for tag in ['img', 'br']:
        for item in content.findAll(tag):
            item.replace_with('\n\n')
    for item in content.findAll('p'):
        item.append('\n\n')
    title = '【%s】\n\n' % getTitle(url)
    lines = content.text.split('\n')
    lines = [line.strip() for line in lines]
    lines = [line for line in lines if isGoodLine(line)]
    if paragraph_limit < 5:
        lines = [line for line in lines if not line or len(line) > 20]
    lines = cutCaptionHtml('\n'.join(lines), word_limit).strip().strip('\ufeff').strip()
    lines = lines.split('\n')
    lines = lines[:paragraph_limit * 2]
    album.cap_html_v2 = title + '\n'.join(lines).strip()
    if append_url:
        album.cap_html_v2 += '\n\n' + url
    if append_source:
        album.url = url
    return album

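# Hypothetical usage sketch for getAlbum; the URL is a placeholder, not taken from
# the original code. AlbumResult exposes imgs and cap_html_v2 as used above.
album = getAlbum('https://example.com/some-article', word_limit=200, paragraph_limit=3)
print(album.imgs)         # at most one image URL that survived the boilerplate filters
print(album.cap_html_v2)  # 【title】 plus the trimmed paragraphs and the source URL
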
def isAnimated(path):
    # A still image has only one frame, so seeking to frame 1 raises EOFError.
    cached_url.get(path, force_cache=True, mode='b')
    gif = Image.open(cached_url.getFilePath(path))
    try:
        gif.seek(1)
    except EOFError:
        return False
    else:
        return True

def postVideo(subreddit, post_text, video):
    # Download the video, then submit a self post whose '{video}' placeholder is
    # filled in from inline_media by praw.
    cached_url.get(video, mode='b', force_cache=True)
    title, content = splitText(post_text)
    content += '{video}'
    return subreddit.submit(title, selftext=content, inline_media={
        'video': InlineVideo(path=cached_url.getFilePath(video))})

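# Usage sketch, assuming a configured praw client; all credentials and the
# subreddit name are placeholders. InlineVideo comes from praw.models, and the
# '{video}' placeholder in the selftext is resolved through inline_media.
import praw
from praw.models import InlineVideo
reddit = praw.Reddit(client_id='...', client_secret='...', user_agent='bot/0.1',
                     username='...', password='...')
postVideo(reddit.subreddit('test'), 'Title\n\nBody text',
          'https://example.com/clip.mp4')
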
def getContent(path, force_cache=False):
    if isWeiboArticle(path):
        new_url = ('https://card.weibo.com/article/m/aj/detail?id=' +
                   getWid(path) + '&_t=' + str(int(time.time())))
        # The endpoint returns JSON; yaml.load is used as a lenient JSON parser.
        json = yaml.load(cached_url.get(new_url, headers={'referer': path},
                                        force_cache=force_cache),
                         Loader=yaml.FullLoader)
        return '<div><title>%s</title>%s</div>' % (
            json['data']['title'], json['data']['content'])
    return cached_url.get(path, force_cache=force_cache)

async def sendSingle(client, source_channel, target, post, img_number, new_text):
    video = post.getVideo()
    if video:
        cached_url.get(video, mode='b', force_cache=True)
        await client.send_message(target, new_text,
                                  file=cached_url.getFilePath(video))
        return
    if not img_number:
        await client.send_message(target, new_text)
        return
    fns = await telepost.getImages(source_channel, post.post_id, img_number)
    await client.send_message(target, new_text, file=fns)

def backfill(key, ttl=0, sleep=10, limit=30):
    # Fetch the first page of search results, then keep requesting further pages
    # until a page comes back empty or the page limit is reached.
    base_url = getSearchUrl(key)
    content = cached_url.get(base_url, ttl=ttl, sleep=sleep)
    result_dict = getResultDict(yaml.load(content, Loader=yaml.FullLoader))
    final_result = result_dict
    count = 2
    while result_dict:
        url = base_url + '&page=%d' % count
        content = cached_url.get(url, ttl=ttl, sleep=sleep)
        result_dict = getResultDict(yaml.load(content, Loader=yaml.FullLoader))
        final_result.update(result_dict)
        count += 1
        if count > limit:
            break
    return sortedResult(final_result)

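# Usage sketch; the search keyword is a placeholder, and the shape of the return
# value depends on the sortedResult helper defined elsewhere in this module.
results = backfill('climate', ttl=3600, sleep=5, limit=10)
print(results)
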
def getLikes(link):
    soup = BeautifulSoup(cached_url.get(link), 'html.parser')
    item = soup.find('span', class_='clap').nextSibling
    if item:
        return int(item.text)
    return 0

def getArticleHtml(name, link, index_loc):
    content = None
    if 'bbc' in link:
        content = cached_url.get(link, force_cache=True, sleep=5)
    args = {}
    if 'twreporter.org/' in link:
        args['toSimplified'] = True
    soup = readee.export(link, content=content, **args)
    funcs = [
        lambda x: x.find('div', {'property': 'articleBody'}),
        lambda x: x.find('article'),
        lambda x: x.find('div', {'id': 'story-body'}),
    ]
    for f in funcs:
        new_soup = f(soup)
        if new_soup:
            soup = new_soup
    for item in soup.find_all('h2'):
        new_item = fact().new_tag('h4')
        new_item.string = item.text
        item.replace_with(new_item)
    if len(soup.text) < 100:
        return
    # Link labels: 返回目录 = "back to index", 原文 = "original article".
    return '''
<html>
<body>
<title>%s</title>
<h1>%s</h1>
<div><a href="%s">返回目录</a></div>
%s
<div><br/><a href="%s">原文</a></div>
<div><br/><a href="%s">返回目录</a></div>
</body>
</html>
''' % (name, name, index_loc, str(soup), link, index_loc)

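# Usage sketch: render a single article into a standalone HTML page that links
# back to an index file. The article name, link, and index location are placeholders.
html = getArticleHtml('Some article', 'https://example.com/article', 'index.html')
if html:
    with open('article.html', 'w') as f:
        f.write(html)
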
def enlarge(url):
    # Try the 'large' variant of the image URL; keep it only if it downloads and
    # stays under 4 MiB (1 << 22 bytes), or is a long picture / animation.
    candidate = url.replace('orj360', 'large')
    candidate_content = cached_url.get(candidate, mode='b', force_cache=True)
    if (0 < len(candidate_content) < 1 << 22 or isLongPic(candidate)
            or isAnimated(candidate)):
        return candidate
    return url

def download(url, force_cache=False):
    nid = getNid(url)
    content = cached_url.get(chapter_prefix + nid, force_cache=force_cache)
    content = yaml.load(content, Loader=yaml.FullLoader)
    novel_name = None
    result = []
    for cid in getIds(content):
        raw_content = cached_url.get(detail_prefix % cid, force_cache=True, sleep=1)
        if not novel_name:
            novel_name = getNovelName(raw_content)
            os.system('mkdir download > /dev/null 2>&1')
        result.append(getContent(raw_content, debug_info=detail_prefix % cid))
    with open('download/%s.txt' % novel_name, 'w') as f:
        f.write(compactText(''.join(result)))

def getDoubanNotes(uid):
    link = 'https://m.douban.com/rexxar/api/v2/user/%s/notes?start=0&count=20' % uid
    json = yaml.load(cached_url.get(
        link, headers={'referer': 'https://m.douban.com'}),
        Loader=yaml.FullLoader)
    for note_obj in json['notes']:
        yield note_obj['url'].replace('\\/', '/')

def findResource(source):
    # Scrape a public Telegram channel preview page for telegra.ph links,
    # photo messages, and plain text messages.
    soup = BeautifulSoup(cached_url.get(LINK_PREFIX + source), 'html.parser')
    name = soup.find('meta', {'property': 'og:title'})['content']
    links = {}
    for item in soup.find_all('a', class_='tgme_widget_message_link_preview'):
        if 'telegra.ph' not in item['href']:
            continue
        title = item.find('div', class_='link_preview_title').text
        links[(item['href'], )] = title
    pics = []
    for item in soup.find_all('div', class_='tgme_widget_message_bubble'):
        imgs = []
        for pic in item.find_all('a', class_='tgme_widget_message_photo_wrap'):
            imgs.append('<figure><img src="%s"/></figure>' % findSrc(pic['style']))
        text = item.find('div', class_='tgme_widget_message_text')
        if imgs:
            pics.append((''.join(imgs), text or ''))
    texts = []
    for item in soup.find_all('div', class_='tgme_widget_message_wrap'):
        if item.find('a', class_='tgme_widget_message_photo_wrap'):
            continue
        preview = item.find('a', class_='tgme_widget_message_link_preview')
        if preview:
            preview.name = 'div'
        text = item.find('div', class_='tgme_widget_message_text')
        texts.append((text, preview or ''))
    if len(links) == 0:
        print('no links', name)
        links = findLinks(source)
    if name == 'MengyShare':
        name = '端传媒'
    return name, links, pics, texts

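# Usage sketch; LINK_PREFIX is assumed to be the public channel preview prefix
# ("https://t.me/s/"), so `source` is a channel name. The channel below is a placeholder.
name, links, pics, texts = findResource('some_channel')
print(name, len(links), len(pics), len(texts))
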
def test(url, rotate=False):
    result = web_2_album.get(url)
    suffix = '[source](%s)' % url
    if result.video:
        with open('tmp/video.mp4', 'wb') as f:
            f.write(cached_url.get(result.video, force_cache=True, mode='b'))
        group = [InputMediaVideo(open('tmp/video.mp4', 'rb'),
                                 caption=cutCaption(result.cap, suffix, 1000),
                                 parse_mode='Markdown')]
        return tele.bot.send_media_group(-1001198682178, group, timeout=20 * 60)
    imgs = pic_cut.getCutImages(result.imgs, 9)
    if imgs:
        if rotate:
            for img_path in imgs:
                img = Image.open(img_path)
                img = img.rotate(180)
                img.save(img_path)
        group = [InputMediaPhoto(open(imgs[0], 'rb'),
                                 caption=cutCaption(result.cap, suffix, 1000),
                                 parse_mode='Markdown')] + \
            [InputMediaPhoto(open(x, 'rb')) for x in imgs[1:]]
        return tele.bot.send_media_group(-1001198682178, group, timeout=20 * 60)
    tele.bot.send_message(-1001198682178, cutCaption(result.cap, suffix, 4000),
                          timeout=20 * 60)

def getContent(url, force_cache=False):
    if 'weibo.c' in url:
        wid = getWid(url)
        if matchKey(url, ['card', 'ttarticle']):
            new_url = ('https://card.weibo.com/article/m/aj/detail?id=' + wid +
                       '&_t=' + str(int(time.time())))
            json = yaml.load(cached_url.get(new_url, headers={'referer': url},
                                            force_cache=force_cache),
                             Loader=yaml.FullLoader)
            return '<div><title>%s</title>%s</div>' % (
                json['data']['title'], json['data']['content'])
        return getContentFromAlbum(weibo_2_album.get(url))
    if 'photos.google.com/share' in url:
        return getContentFromAlbum(gphoto_2_album.get(url), noText=True)
    return cached_url.get(url, force_cache=force_cache)

def getStatus(user_id):
    url = 'https://www.douban.com/people/%s' % user_id
    soup = BeautifulSoup(cached_url.get(url, sleep=20), 'html.parser')
    for item in soup.find_all('span', class_='created_at'):
        sub_item = item.find('a')
        if not sub_item:
            continue
        yield sub_item['href']

def get(path):
    content = cached_url.get(path)
    b = readee.export(path, content=content)
    result = Result()
    result.imgs = getImgs(b)
    result.cap = getCap(b)
    result.video = getVideo(b)
    return result

def getImages(content):
    for parts in content.split(pivot):
        link = parts.split(end)[0]
        if '?key=' not in link[:160]:
            continue
        yield getImage(cached_url.get(
            'https://photos.google.com/share/' + link, force_cache=True))

def getTelegraphRaw(link):
    if 'telegra.ph' not in link:
        return link
    b = BeautifulSoup(cached_url.get(link, force_cache=True), 'html.parser')
    try:
        return b.find('address').find('a')['href']
    except:
        return link

def getAllPos(link):
    s = BeautifulSoup(cached_url.get(link + '?embed=1'), 'html.parser')
    result = []
    for a in s.find_all('a', class_='grouped_media_wrap'):
        new_link = a.get('href', '').strip()
        new_link = new_link.split('?')[0]
        result.append(int(new_link.split('/')[-1]))
    return sorted(result)

def get(url):
    r = Result()
    r.url = url
    content = cached_url.get(url, force_cache=True)
    soup = BeautifulSoup(content, 'html.parser')
    r.title = soup.find('meta', {'property': 'og:title'})['content']
    r.cap_html = r.title
    r.imgs = list(getImages(content))
    return r

def parseFreewechat(link):
    if not link or 'freewechat.com' not in link:
        return link
    try:
        b = BeautifulSoup(cached_url.get(link, force_cache=True), 'html.parser')
        return b.find('div', id='about-article').find('a')['href']
    except:
        return link

def check(link):
    try:
        content = cached_url.get(link, force_cache=True)
    except:
        return False
    soup = readee.export(link, content=content)
    return 200 < cnWordCount(soup.text) < 2500

def process(root, total_page):
    for page in range(total_page):
        url = root + '?start=' + str(page * 25)
        soup = BeautifulSoup(cached_url.get(url), 'html.parser')
        for album_url in findAlbumUrl(soup):
            try:
                test(album_url)
            except Exception as e:
                print(album_url, str(e))
            time.sleep(120)

def processTelegraphSingle(url, title, dirname):
    raw_content = cached_url.get(url)
    soup = BeautifulSoup(raw_content, 'html.parser').find('article')
    for tag in ['br', 'p', 'li', 'h4']:
        for item in soup.find_all(tag):
            item.replace_with('\n' + item.text.strip() + '\n')
    content = soup.text
    # str.replace returns a new string, so the result must be reassigned.
    for _ in range(10):
        content = content.replace('\n\n\n', '\n\n')
    with open('%s/%s.md' % (dirname, title), 'w') as f:
        f.write(content.strip())

def getSoup(site):
    soup = BeautifulSoup(cached_url.get(site), 'html.parser')
    for item in soup.find_all('a', rel='author'):
        item.decompose()
    for tag in offtopic_tags:
        for item in soup.find_all(tag):
            item.decompose()
    if 'freewechat.com' in site:
        for item in soup.find_all('div', id='hot-articles'):
            item.decompose()
    return soup

def __init__(self, url):
    content = cached_url.get(url + '?json=1')
    content = yaml.load(content, Loader=yaml.FullLoader)
    self.title = content['title']
    self.soup = BeautifulSoup(content['content'], 'html.parser')
    self.evernote_urls = list(getEvernoteUrls(self.soup))
    self.next_url = self.evernote_urls and self.evernote_urls[0]
    self.text_soup = getTextSoup(content['content'])
    self.raw_text = compactText(self.text_soup.text.replace('~', '.'))
    self.text = clearText(self.raw_text)
    # Count only non-ASCII letters (e.g. CJK characters).
    self.word_count = len(
        [c for c in self.text if c.isalpha() and ord(c) > 255])

def sendVideo(chat, result):
    os.system('mkdir tmp > /dev/null 2>&1')
    with open('tmp/video.mp4', 'wb') as f:
        f.write(cached_url.get(result.video, force_cache=True, mode='b'))
    # Telegram bots cannot upload files larger than 50 MB.
    if os.stat('tmp/video.mp4').st_size > 50 * 1024 * 1024:
        return []
    group = [InputMediaVideo(open('tmp/video.mp4', 'rb'),
                             caption=getCap(result, 1000),
                             parse_mode=result.getParseMode())]
    return chat.bot.send_media_group(chat.id, group, timeout=20 * 60)

def getLinkReplace(url, album):
    if 'telegra.ph' in url and 'douban.com/note/' in album.cap_html:
        return ''
    if 'telegra.ph' in url:
        soup = BeautifulSoup(cached_url.get(url, force_cache=True), 'html.parser')
        title = export_to_telegraph.getTitle(url)
        try:
            return '\n\n【%s】 %s' % (title, soup.find('address').find('a')['href'])
        except:
            return ''
    return '\n\n' + url

def getPosts(name, start):
    content = cached_url.get('https://t.me/s/%s/%d' % (name, start))
    soup = BeautifulSoup(content, 'html.parser')
    for item in soup.find_all('div', class_='tgme_widget_message'):
        post_id = int(item['data-post'].split('/')[-1])
        post_content = item.find('div', class_='tgme_widget_message_text')
        post_content = BeautifulSoup(str(post_content).replace('<br/>', '\n'),
                                     features='lxml')
        content = parseUrl(post_content.text)
        for d in range(10):
            # Ensure numbered list items ("1.") are followed by exactly one space.
            content = content.replace('\n%s.' % d, '\n%s. ' % d)
            content = content.replace('\n%s.  ' % d, '\n%s. ' % d)
        yield post_id, content

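# Usage sketch: iterate one preview page of a public channel starting from a given
# post id. The channel name and start id below are placeholders.
for post_id, text in getPosts('some_channel', 100):
    print(post_id, text[:80])
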