def class_tr_to_str(tr: bs4.element.Tag) -> str:
    for a in tr.find_all("a"):
        a["href"] = BASE_API_URL + a["href"]
    data = (tr.find("td", class_="header"), tr.find("td", class_="description"))
    nameSpan = data[0].find("span", class_="element-name")
    if data[0].find("span", class_="attribute-type") is not None:
        accessType = "param"
        type_ = data[0].find("span", class_="param-type").text.strip()
    else:
        accessType = "func"
    if accessType == "param":
        attributeMode = data[0].find("span", class_="attribute-mode").text
        header = f"`{nameSpan.text} :: {type_}` {attributeMode}"
    else:
        header = f"`{nameSpan.text}`"
    contents = [item for item in data[1].contents if item != " "]
    if len(contents) > 0:
        if len(contents) > 1 and "\n" not in contents[0]:
            description = tomd.convert(
                f"<p>{''.join([str(item) for item in contents[:-1]])}</p>"
            ).strip()
        else:
            description = contents[0].strip()
        return f"{header} - {description}"
    else:
        return header
def getNode(contentSoup):
    node = ArticleNode()
    node.depth = contentSoup['depth']
    node.id = contentSoup['id']
    node.name = contentSoup.find('a', class_='x-wiki-index-item').text.replace('/', ' ')
    node.url = 'https://www.liaoxuefeng.com' + contentSoup.find(
        'a', class_='x-wiki-index-item')['href']
    print(node.toString())
    content = getUrlContent(node.url)
    if content is not None:
        soup = BeautifulSoup(content, 'lxml')
        node.articleHTML = str(soup.find('div', class_="x-wiki-content x-main-content"))
        node.articleMD = tomd.convert(node.articleHTML)
        with open('output/' + node.name + '.md', 'w', encoding='utf-8') as fs:
            fs.write(node.articleMD)
            fs.flush()
    for item in contentSoup.find_all('div', depth=str(int(node.depth) + 1)):
        node.children.append(getNode(item))
    return node
def _get_new_data(self, page_url, soup):
    '''
    Extract the useful data from a page.
    :param page_url: URL of the downloaded page
    :param soup: parsed BeautifulSoup document
    :return: the extracted data
    '''
    data = {}
    dbStore = DataOutput()
    Htmldownloader = HtmlDownloader()
    data['article_url'] = page_url
    html = Htmldownloader.download(page_url)
    ind_soup = BeautifulSoup(html, 'html.parser')
    title = ind_soup.find_all("h1", class_="csdn_top")
    data['title'] = title[0].string
    updated = ind_soup.find_all("span", class_="time")
    data['update_date'] = updated[0].string
    pageview = ind_soup.find_all("span", class_="txt")
    data['pageviewcnt'] = pageview[0].string
    articlecontent = ind_soup.find_all("div", class_="article_content")
    md = tomd.convert(articlecontent[0].prettify())
    # The page content is too long for the MySQL blob column, so it is
    # pickled so it can be saved to a local file instead.
    data['page_content'] = pickle.dumps(md.encode("utf-8"))
    dbStore.store_data(data)
    print(data)
    return data
def get_url_content(url):
    str_html = requests.get(url)
    html = BeautifulSoup(str_html.text, 'html.parser')
    title = html.find('h1').text
    date = html.find('span', {'class': 'time'}).text
    # Strip navigation and decorative elements before conversion.
    html.find('div', {'class': 'toc'}).decompose()
    html.find('svg').decompose()
    html.find('br').decompose()
    clean_tags(html)
    replace_h_tags(html)
    get_image(title, html)
    content = ''.join(map(str, html.find('div', {'id': 'content_views'}).contents))
    print(content)
    md_content = tomd.convert(content)
    path = './content'
    file = title + '.md'
    file_path = path + '/' + file
    if not os.path.exists(path):
        os.makedirs(path)
    with open(file_path, 'w', encoding="utf-8") as f:
        f.write(str(md_content))
def _generate_ics(self):
    data = self.data
    if data is not None:
        c = Calendar()
        days = list(data.keys())
        days.sort()
        for xday in days:
            day_data = data[xday]
            title = day_data[0].text_content()
            week, day = title.split(", ")
            week = w2n.word_to_num(week.lstrip("Week "))
            day = w2n.word_to_num(day.lstrip("Day "))
            offset = (week - 1) * 7 + day
            event_day = self.start_date + timedelta(days=offset)
            event_day = event_day.replace(hour=0, minute=0, second=0, microsecond=0)
            description = "".join([str(html.tostring(el)) for el in day_data])
            description = tomd.convert(description)
            e = Event(name="Magoosh {}".format(title),
                      begin=event_day,
                      end=event_day,
                      description=description)
            e.make_all_day()
            c.events.add(e)
        with open(self.out_file, 'w') as f:
            f.writelines(c)
        print("File written to {}".format(self.out_file))
def parse_content(self):
    """ parse data from html to markdown """
    self.markdown = []
    try:
        for item in self.data:
            if item:
                for i in item:
                    t = str(i)
                    test = re.search("^!", t)
                    if test:
                        self.markdown.append(t)
                    md = tomd.convert(t)
                    self.markdown.append(md)
        filtered = list(
            filter(lambda x: not re.match(r'^\s*$', x), self.markdown))
        filtered = [x.replace('\n', '') for x in filtered]
        filtered = [x.replace('\t', '') for x in filtered]
        self.markdown = filtered
    except RuntimeError as err:
        print(Fore.RED, "[!] Recursion {0}".format(err))
    self.markdown.append('\n' + self.url)
def anno_edit(request, pk):
    '''Edit an announcement.'''
    anno = Announcement.objects.get(pk=pk)
    # When the request is POST, update the data.
    if request.method == "POST":
        form = AnnoForm(request.POST, instance=anno)
        if form.is_valid():
            form.save()
            registerinfo = {
                'title': '修改成功',
                'subtitle': '数据更新成功',
                'status': 'success',
            }
            context = {
                'registerinfo': registerinfo,
                'anno': Announcement.objects.all(),
            }
            return render(request, 'backend/annolist.html', context=context)
        else:
            registerinfo = {
                'title': '错误',
                'subtitle': '数据填写错误',
                'status': 'error',
            }
            context = {
                'form': form,
                'registerinfo': registerinfo,
                'anno': anno,
            }
            return render(request, 'backend/annoedit.html', context=context)
    # When the request is not POST, render the form with the body converted to Markdown.
    else:
        anno.body = tomd.convert(anno.body)
        context = {
            'anno': anno,
        }
        return render(request, 'backend/annoedit.html', context=context)
def get(self, request, pk):
    anno = Announcement.objects.get(pk=pk)
    anno.body = tomd.convert(anno.body)
    context = {"anno": anno}
    return render(request, "my_admin/announcement_detail.html", context=context)
def run():
    print('Fetching content...', end='')
    title, author, description, cta, img_url = get_meta_data()
    article = tomd.convert(get_article(cta))
    print('Done')
    date = datetime.now().strftime('%Y%m%d')
    commitMessage = f'{title} by {author}'
    fileName = os.path.join('blinks', f'{date[:4]}', f'{date}-{title}-{author}.md')
    print('Building output...', end='')
    # Convert to markdown, add source
    output = f'![{title}]({img_url})\n# {title}\n*{author}*\n\n>{description}\n\n{article}\n\nSource: [{commitMessage}](https://www.blinkist.com{cta})'
    print('Done')
    print(f'Committing file {fileName}...', end='')
    g = Github(repoToken)
    repo = g.get_repo(repoName)
    try:
        repo.create_file(fileName, commitMessage, output)
        print('Done')
        return 'OK'
    except GithubException:
        print('already exists')
        return 'File Exists'
async def wiki_updated(name, feed_data, feed, feeds):
    time_latest_entry = feed_data['time_latest_entry']
    for i, entry in enumerate(feed.entries):
        if get_formatted_time(entry) > time_latest_entry:
            info_log('Found new wiki entry made on ' + entry.updated)
            summary = ''
            if re.search(r'<p.*?>.*?</p>', entry.summary):
                summary = html.unescape(
                    tomd.convert(
                        re.search(r'<p.*?>.*?</p>', entry.summary).group()))
                summary = re.sub(r'\((\/\S*)\)', r'(https://wiki.factorio.com\1)', summary)
                summary = re.sub(r'<bdi>|</bdi>', '', summary)
            embed = discord.Embed(
                title=f'{entry.author} changed {entry.title}',
                color=14103594,
                timestamp=datetime.datetime(*entry.updated_parsed[0:6]),
                url=entry.link,
                description=summary)
            channel = client.get_channel(feed_data['channel'])
            await channel.send(embed=embed)
        else:
            break
    feeds[name]['time_latest_entry'] = get_formatted_time(feed.entries[0])
    with open('feeds.json', 'w') as f:
        json.dump(feeds, f)
def getMarkdown(entry, attr, selector, *args, **kwargs):
    if args or kwargs:
        if args and kwargs:
            snippet = soup.find_all(
                entry, attrs={attr: selector})[args[0]].find(
                    kwargs['opt_entry'],
                    attrs={kwargs['opt_attr']: kwargs['opt_selector']})
        elif kwargs and len(args) == 0:
            snippet = soup.find(entry, attrs={attr: selector}).find(
                kwargs['opt_entry'],
                attrs={kwargs['opt_attr']: kwargs['opt_selector']})
        uls = snippet.find('ul')
        if uls is not None:
            snippet.ul.append(soup.new_tag('p'))
        snippet = tomd.convert(str(snippet)).replace('- ', '\n- ').strip()
        return snippet
    return tomd.convert(str(soup.find(entry, attrs={attr: selector})))
def parse_article_content(bsObj, directory, title):
    # 1. Find the article HTML.
    html = bsObj.find('div', {'class': 'article_content'})
    md = tomd.convert(html.prettify())
    # 2. Write it to the file.
    with open('%s/%s.md' % (directory, title), 'w', encoding='utf-8') as f:
        f.write(md)
def __str__(self) -> str:
    html = (
        "<h1> {}</h1>\n<pre><code>Exercise ID {}</code></pre>\n<h2> Assignment </h2>{}\n"
        .format(self.data.title, self.id, self.data.assignment) +
        self.get_pre_exercise_code() + self.get_instructions() +
        self.get_sample_code() + self.get_anwsers() + self.get_hints() +
        self.get_solution())
    return tomd.convert(html)
def get_class_description(soup: bs4.BeautifulSoup, class_: str) -> str:
    a = soup.select(f"tr > td.header > a[href=\"{class_}.html\"]")
    if len(a) == 1:
        descriptionTag = a[0].parent.parent.find("td", class_="description")
        descriptionRaw = "".join(
            [str(content) for content in descriptionTag.contents])
        return tomd.convert(f"<p>{descriptionRaw}</p>").strip()
    return None
def download_statement(url):
    req = requests.get(url)
    filename = _extract_name(url)
    dirname = _extract_dirname(url)
    if req.status_code == 200:
        statement = _extract_statement(req.content)
        markdown = tomd.convert(str(statement))
        _save_file(markdown, filename, dirname)
        print(f'Successful download of {filename}!')
    else:
        print(f'Could not fetch {filename}!')
def extract_text(status: Status) -> str:
    as_html = status["content"]
    text: str = tomd.convert(as_html)
    tags = status["tags"]
    mentions = status["mentions"]
    text = clean_tags(text, tags)
    text = clean_mentions(text, mentions)
    text = text.replace("<br />", "\n")
    text = clean_links(text)
    text = html.unescape(text)
    return text
def get_event_description(div: bs4.element.Tag) -> str:
    for a in div.find_all("a"):
        a["href"] = BASE_API_URL + a["href"]
    data = (div.select("div.element-content > p"),
            div.find("div", class_="detail-content"))
    paragraphs = []
    for p in data[0]:
        contents = p.contents
        if not (len(contents) == 1 and len(contents[0].strip()) == 0):
            paragraphs.append(html.unescape(tomd.convert(str(p))))
    return "\n".join([p.strip().replace("\n", "") for p in paragraphs])
def getArticle(id):
    url = "https://blog-console-api.csdn.net/v1/editor/getArticle?id={}".format(id)
    print(url)
    response = requests.get(url, headers=headers)
    responseJson = json.loads(response.content)
    if responseJson['data']['markdowncontent'] == '':
        # Fall back to converting the HTML content when no Markdown is provided.
        text = tomd.convert(responseJson['data']['content'])
        # parser = Parser(responseJson['data']['content'])
        responseJson['data']["markdowncontent"] = text
        # print(''.join(parser.outputs))
    write_txt(json.dumps(responseJson['data'], ensure_ascii=False),
              pwd + "/articles/" + id + ".txt")
    time.sleep(1)
def getList(entry, attr, selector, element):
    list = soup.find(entry, attrs={attr: selector}).find_all(element)
    list_md = ''
    counter = 0
    for idx, val in enumerate(list):
        if element == 'p':
            if val.get('class') == [u'counter']:
                counter += 1
                list_md += str(counter) + '. ' + val.text.strip() + '\n'
            else:
                list_md += '\n' + tomd.convert(str(val)).strip() + '\n'
        else:
            list_md += '- ' + val.text.strip() + '\n'
    return list_md
def get_meta_data():
    container = get_element_from_request('https://www.blinkist.com/nc/daily',
                                         'div', 'daily-book__container')
    title = container.find('h3', 'daily-book__headline').string.strip()
    author = container.find('div', 'daily-book__author').string.strip()
    # description = container.find('div', 'dailyV2__free-book__description').string.strip()
    description_html = container.find('div', 'book-tabs__content-inner')
    description = tomd.convert(str(description_html).strip())
    # cta = container.find('div', 'dailyV2__free-book__cta').a['href']
    cta = container.find('a', 'daily-book__cta').get('href')
    img_url = container.find('img')['src']
    return title, author, description, cta, img_url
def convert(self):
    # Process images: download each one and point the tag at the saved copy.
    index = 1
    for e in self.contentQ(self.rule.img_tag):
        q = pq(e)
        img_src = self.rule.find_img(q)
        img_src_cur = self.rule.save_pic(self.url, self.title, index, img_src)
        if q[0].tag != "img":
            q.replace_with(pq('<img src="' + img_src_cur + '"/>'))
        else:
            q.attr(src=img_src_cur)
        index += 1
    # Convert to Markdown and save.
    self.rule.save_md(self.title, tomd.convert(self.contentQ))
def extract_news(link):
    page = requests.get(link)
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    article_html = soup.select(".article")[0]
    article = str(article_html.encode('utf-8'))
    # Strip the escaped byte sequences left over from the str(bytes) round trip.
    md = ''.join(
        tomd.convert(article).replace("\\n", "\n").replace("\\xc2", "")
        .replace("\\xa0", "").replace("\n\n\n", "\n").replace("\n\n", "\n")
        .replace("\\xe2\\x80\\x99", "'").split("|"))
    md = md.replace("\\xe2", "").replace("\\x86", "").replace("\\x92", "") \
           .replace("\\x80", "").replace("\\93", "")
    md = extras.get_links(md)
    return md
async def embed_fff(number):
    """Returns a discord.Embed object derived from an fff number"""
    link = f"https://factorio.com/blog/post/fff-{number}"
    response = await get_soup(link)
    if response[0] == 200:
        soup = response[1]
        titleList = soup.find_all("h2")
        em = discord.Embed(title=titleList[0].string.strip(),
                           url=link,
                           colour=discord.Colour.dark_green())
        titleList = titleList[1:]
        if len(titleList) == 0:
            titleList = soup.find_all("h4")
            if len(titleList) == 0:
                titleList = soup.find_all("h3")
        for title in titleList:
            # Check for smaller font tag and append it to the title
            result = fontEx.search(str(title))
            if len([group for group in result.groups() if group is not None]) == 1:
                name = result.group(1)
            else:
                name = result.group(1) + result.group(3)
            content = str(title.next_sibling.next_sibling)
            if "<p>" not in content:
                continue
            if "<ol>" in content:
                itemCount = 1
                while "<li>" in content:
                    content = content.replace("<li>", f"{itemCount}. ", 1)
                    itemCount += 1
            if "<ul>" in content:
                content = content.replace("<li>", "- ")
            for item in ["<ol>", "</ol>", "<ul>", "</ul>", "</li>", "<br/>"]:
                content = content.replace(item, "")
            # Escape Discord formatting characters
            for item in ["*", "_"]:
                content = content.replace(item, "\\" + item)
            content = content.replace("\n\n", "\n")
            em.add_field(name=name.replace("amp;", ""),
                         value=tomd.convert(content).strip())
    else:
        em = discord.Embed(title="Error",
                           description=f"Couldn't find FFF #{number}.",
                           colour=discord.Colour.red())
    return em
def write_md(url, md_count):
    html = urlopen(url)
    bsObj = BeautifulSoup(html, 'html.parser')
    title = bsObj.find('div', {'class': 'article-title-box'})
    title_convert = title.h1.get_text()
    # print(title)
    # title_convert = tomd.convert(title.prettify())
    md = bsObj.find('div', {'class': 'article_content'})
    # print(md.prettify())
    convert = tomd.convert(md.prettify())
    md_name = str(md_count) + '.md'
    with open(md_name, 'w') as f:
        f.write(title_convert)
        f.write(convert)
def getContent(url):
    page = []
    (year, month) = re.search(r'http://scienceblogs.com/evolgen/(\d+)/(\d+)', url).groups()
    print("processing %s %s" % (month, year), end='\r')
    html = requests.get(url=url)
    bs = BeautifulSoup(html.content, 'html.parser')
    header = bs.find('h1', {'class': 'title entry-title'})
    page.append("#" + header.text)
    abbr = bs.find('abbr', {'class': 'published'})
    page.append("Date: %s" % abbr.text)
    content = bs.find('div', {'class': 'content entry-content'})
    for tag in content:
        if tag == "\n":
            continue
        page.append(tomd.convert(str(tag)))
    return page
def define_tr_to_str(tr: bs4.element.Tag) -> str:
    for a in tr.find_all("a"):
        a["href"] = BASE_API_URL + a["href"]
    data = (tr.find("td", class_="header").string,
            tr.find("td", class_="description"))
    contents = data[1].contents
    if len(contents) > 0:
        if len(contents) > 1 and "\n" not in contents[0]:
            description = tomd.convert(
                f"<p>{''.join([str(item) for item in contents[:-1]])}</p>"
            ).strip()
        else:
            description = contents[0].split('\n')[0].strip()
        return f"`{data[0]}` - {description}"
    else:
        return f"`{data[0]}`"
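# A hypothetical driver for the *_tr_to_str helpers above (not part of the
# original projects): walk the rows of a documentation table and collect one
# Markdown line per row. It assumes `soup` is the parsed API page and that the
# relevant rows carry the td.header / td.description cells those helpers expect.
def table_to_markdown(soup: bs4.BeautifulSoup, tr_to_str=define_tr_to_str) -> str:
    lines = []
    for tr in soup.find_all("tr"):
        if tr.find("td", class_="header") is None:
            continue  # skip rows without the expected cells
        lines.append(tr_to_str(tr))
    return "\n".join(lines)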
def parse(package):
    package_name = get_package(package)
    url = 'https://github.com/%s/releases.atom' % package_name
    feed = feedparser.parse(url)
    entries = []
    for item in feed['entries']:
        owner, repo = package_name.split('/')
        version = re.search(r'(?<=Repository/)[0-9]+/(.+)', item['id']).group(1)
        author = (item['authors'][0]['name']
                  if 'authors' in item and item['authors'] and item['authors'][0]
                  and item['authors'][0]['name'] else None)
        content = ""
        for obj in item['content']:
            if obj['type'] == 'text/html':
                content += obj['value']
        content = content.replace("<br />", "\n")
        content = tomd.convert(content[:1024])
        while content.startswith('\n'):
            content = content[1:]
        while content.endswith('\n'):
            content = content[:-1]
        entries.append({
            "embeds": [{
                "title": "New release: %s" % version,
                "description": package_name,
                "url": item['link'],
                "thumbnail": {
                    "url": "https://github.com/%s.png" % owner,
                },
                "author": {
                    "name": author,
                    "url": "https://github.com/%s" % author,
                    "icon_url": "https://github.com/%s.png" % author,
                },
                "fields": [{
                    "name": item['title_detail']['value'],
                    "value": content[:1024],
                }],
                "footer": {
                    "text": time.strftime("%a %d %b, %Y at %I:%M %p", item['updated_parsed']),
                }
            }],
            "version": version,
            "package_name": package_name,
        })
    return entries
def get_wiki_description(soup):
    """ Returns the first paragraph of a wiki page BeautifulSoup """
    if soup.select(".mw-parser-output > p"):
        pNum = 0
        if headerEx.search(
                str(soup.select(
                    ".mw-body-content > #mw-content-text > .mw-parser-output > p")[0])):
            pNum = 1
        return tomd.convert(
            str(soup.select(
                ".mw-body-content > #mw-content-text > .mw-parser-output > p")[pNum])
        ).strip().replace("<br/>", "\n")
    return ""
def anno_edit(request, pk):
    """Edit an announcement."""
    anno = Announcement.objects.get(pk=pk)
    # When the request is POST, update the data.
    if request.method == "POST":
        form = AnnoForm(request.POST, instance=anno)
        if form.is_valid():
            form.save()
            messages.success(request, "数据更新成功", extra_tags="修改成功")
            return HttpResponseRedirect(reverse("sspanel:backend_anno"))
        else:
            messages.error(request, "数据填写错误", extra_tags="错误")
            context = {"form": form, "anno": anno}
            return render(request, "backend/annoedit.html", context=context)
    # When the request is not POST, render the form with the body converted to Markdown.
    else:
        anno.body = tomd.convert(anno.body)
        context = {"anno": anno}
        return render(request, "backend/annoedit.html", context=context)
def convert(self, name="", selector=""): if not name: name = self.title if not selector: selector = self.xpath # 提取文章内容; contentQ = self.rootQ(selector) # 处理图片; index = 1 for e in contentQ(self.rule.img_tag): q = pq(e) img_src = self.rule.find_img(q) img_src_cur = self.rule.save_pic(self.url, name, index, img_src) if q[0].tag != "img": q.replace_with(pq('<img src="' + img_src_cur + '"/>')) else: q.attr(src=img_src_cur) index += 1 # 转换成markdown; self.rule.save_md(name, tomd.convert(contentQ))
<i>italic</i>
<b><i>bold italic</i></b>
<em>em</em>
<strong>strong</strong>
aa <strong> strong </strong> aa
</p>
<hr/>
<table>
    <thead>
        <tr class="1">
            <th>th1</th>
            <th>th2</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>td</td>
            <td>td</td>
        </tr>
        <tr>
            <td>td</td>
            <td>td</td>
        </tr>
    </tbody>
</table>
"""

print(Tomd(string).markdown)
print(tomd.convert(string))
import tomd

# Read an HTML file, convert it to Markdown, and write the result out.
f = open('in.txt', mode='r', encoding='UTF-8')
contents = f.read()
f_out = open('out.md', 'w+', encoding='UTF-8')
markdown = tomd.convert(contents)
f_out.write(markdown)
f.close()
f_out.close()
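# A minimal, self-contained sketch of the pattern most snippets above share:
# fetch a page, isolate the article element, convert it with tomd.convert(),
# and save the result as a Markdown file. The "div.article-content" selector
# and the output layout are illustrative assumptions, not taken from any
# project above; adjust them to the page being scraped.
import os

import requests
import tomd
from bs4 import BeautifulSoup


def html_page_to_markdown(url, selector="div.article-content", out_dir="./content"):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    article = soup.select_one(selector)
    if article is None:
        raise ValueError(f"No element matched {selector!r} on {url}")
    # tomd.convert() takes an HTML string and returns Markdown.
    markdown = tomd.convert(str(article))
    title = soup.find("h1").get_text(strip=True) if soup.find("h1") else "untitled"
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, f"{title}.md")
    with open(path, "w", encoding="utf-8") as f:
        f.write(markdown)
    return path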