def test_br_tags_converted(self):
    """<br> tags render as markdown hard line breaks."""
    cases = [
        ("This text<br>has breaks<br>in it",
         "This text \nhas breaks \nin it"),
        ("<p>A paragraph with a<br>linebreak</p>",
         "A paragraph with a \nlinebreak"),
    ]
    for html, expected in cases:
        self.assertEqual(markdownify(html), expected)
def test_multiple_paragraphs(self):
    """Adjacent <p> elements become text separated by a blank line."""
    for html, expected in (
        ("<p>This text is in a paragraph</p><p>And so is this text</p>",
         "This text is in a paragraph\n\nAnd so is this text"),
        ("<p>This paragraph has\nnewlines in it</p><p>This doesn't</p>",
         "This paragraph has newlines in it\n\nThis doesn't"),
    ):
        self.assertEqual(markdownify(html), expected)
def add_csv(hd, data, distnum, date):
    """Append one CSV row describing a District/Division/Area section.

    Args:
        hd: Section heading; expected to start with 'District', 'Division'
            or 'Area'. Any other heading yields a row with only date/source.
        data: HTML body of the section; converted to markdown for the row.
        distnum: District number placed in the row.
        date: Date string appended to the row.

    Relies on module-level globals ``csv_name`` (output path) and
    ``source_link`` (provenance column).
    """
    row = []
    if hd.startswith('District'):
        row += ['District', distnum, '', '',
                markdownify.markdownify(data).strip()]
    elif hd.startswith('Division'):
        # Last character of the heading is the division letter.
        row += ['Division', distnum, hd[-1:], '',
                markdownify.markdownify(data).strip()]
    elif hd.startswith('Area'):
        # Second-to-last char is the division, last char is the area.
        row += ['Area', distnum, hd[-2:-1], hd[-1:],
                markdownify.markdownify(data).strip()]
    row.append(date)
    row.append(source_link)
    # newline='' is required by the csv module; without it, extra blank
    # rows appear on Windows.
    with open(csv_name, 'a', newline='') as op_file:
        csv.writer(op_file).writerow(row)
def to_markdown(self):
    """Converts the element to a Markdown string."""
    parts = [markdownify(self.title_html[0].decode())]
    parts.extend(
        markdownify(child.decode())
        for child in self.html_content.find("span").contents
    )
    return "".join(parts)
def to_table(row):
    """Render rows of three-value dicts as markdown table rows."""
    def cell(value):
        # Strip pipes (they would break the table) and turn newlines
        # into <br> so each row stays on one line.
        return markdownify(value).replace('|', '').replace("\n", '<br>')

    lines = []
    for entry in row:
        values = list(entry.values())
        lines.append(
            f"|{cell(to_camel_case(values[0]))}|{cell(values[1])}|{cell(values[2])}\n"
        )
    return "".join(lines)
def test_bolding(self):
    """<strong> elements render as **double-asterisk** bold markdown."""
    cases = [
        ("This text contains <strong>bolding</strong>.",
         "This text contains **bolding**."),
        ("<strong>bolding</strong> and <strong>more bolding</strong>",
         "**bolding** and **more bolding**"),
        ("<p>Some <strong>bolded text</strong></p><p>In a paragraph</p>",
         "Some **bolded text**\n\nIn a paragraph"),
    ]
    for html, expected in cases:
        self.assertEqual(markdownify(html), expected)
def test_hyperlinks(self):
    """Anchor tags become [text](href) markdown links."""
    html = '<a href="https://www.pythonmorsels.com">Python Morsels</a>'
    self.assertEqual(markdownify(html),
                     "[Python Morsels](https://www.pythonmorsels.com)")
    html = ('link 1 <a href="http://trey.io">here</a> and '
            'link 2 <a href="http://pypi.io">there</a>!')
    expected = (
        'link 1 [here](http://trey.io) and link 2 [there](http://pypi.io)!'
    )
    self.assertEqual(markdownify(html), expected)
def __sanitizeEvent(self, event: Dict, resource: Dict[str, str], hasRca: bool) -> Dict:
    """Format the RH event data to filter and flatten the structure, and
    convert any HTML content into markdown format.

    Args:
        event (Dict): RH event data.
        resource (Dict[str, str]): Azure resource config data.
        hasRca (bool): True if the event contains Rca, else False.

    Returns:
        Dict: Sanitized RH event data.
    """
    # NOTE(review): if neither branch below matches, summary and
    # recommendedActions remain None and are passed as-is downstream.
    recommendedActions = None
    summary = None
    sanitizedEvent = {}
    if RECOMMENDED_ACTIONS_CONTENT in event[PROPERTIES]:
        self.tracer.info("[%s] event with id=%s has RCA." % (self.fullName, event[ID]))
        # recommendedActionContents is in HTML format. Convert it into markdown.
        recommendedActions = markdownify(
            event[PROPERTIES][RECOMMENDED_ACTIONS_CONTENT])
        # summary is in HTML format if RCA is present.
        summary = markdownify(event[PROPERTIES][SUMMARY])
    elif RECOMMENDED_ACTIONS in event[PROPERTIES]:
        self.tracer.info("[%s] event with id=%s does not have RCA." % (self.fullName, event[ID]))
        # recommendedActions is a list of actions in text format. Convert it
        # into HTML format and then convert to markdown.
        htmlFormattedRecommendedActions = self.__formatToHtml(
            event[PROPERTIES][RECOMMENDED_ACTIONS])
        recommendedActions = markdownify(htmlFormattedRecommendedActions)
        # Without RCA the summary is plain text; no conversion needed.
        summary = event[PROPERTIES][SUMMARY]
    # Add resource related data
    sanitizedEvent[AZ_RESOURCE_ID] = resource[AZ_RESOURCE_ID]
    sanitizedEvent[SID] = resource[SID]
    sanitizedEvent[ARM_TYPE] = resource[ARM_TYPE]
    # Add RH related data.
    sanitizedEvent[ID] = event[ID]
    sanitizedEvent[NAME] = event[NAME]
    sanitizedEvent[RH_TYPE] = event[TYPE]
    # Merge the flattened/parsed properties over the base fields.
    parsedProperties = self.__populateProperties(event, summary, recommendedActions, hasRca)
    sanitizedEvent = {**sanitizedEvent, **parsedProperties}
    return sanitizedEvent
def markdown():
    """Convert the .html file named in the UI entry into Markdown.

    Reads the filename from the ``fileText`` widget, converts the file with
    ATX-style headings, writes the result into the Markdown subfolder and
    reports the outcome in the ``text`` widget.
    """
    filename = fileText.get()
    filepath = os.path.join(cwd + '\\AlHtmlToMarkdown', filename)
    if os.path.exists(filepath):
        extension = os.path.splitext(filepath)[1]
        try:
            if extension.lower() == ".html":
                # with-blocks close the handles even if conversion fails;
                # the original open/close pairs leaked on exceptions.
                with open(filepath, "r") as htmlFile:
                    html = htmlFile.read()
                markDown = markdownify.markdownify(html, heading_style="ATX")
                markdownFileName = filename.replace(extension, '.md')
                markdownFilePath = os.path.join(cwd + '\\AlHtmlToMarkd'
                                                'own\\Markdown',
                                                markdownFileName)
                with open(markdownFilePath, "w") as markdownFile:
                    markdownFile.writelines(markDown)
                text.delete(1.0, END)
                text.insert(1.0, markdownFileName + ' has been saved '
                            'successfully in Markdown folder')
        except Exception as e:
            text.delete(1.0, END)
            print(str(e))
            text.insert(1.0, 'Invalid document, please provide .html '
                        'extension files')
    else:
        text.delete(1.0, END)
        text.insert(1.0, 'Invalid file path')
def main():
    """Export a phpBB sqlite3 dump into per-topic markdown files grouped by forum."""
    parser = argparse.ArgumentParser(description='Fun with phpbb database')
    parser.add_argument('dbfile', type=str,
                        help='phpBB database dump (sqlite3)')
    args = parser.parse_args()
    if not os.path.exists(args.dbfile):
        print('Failed to open file \'{}\''.format(args.dbfile))
        sys.exit(1)
    connection = sqlite3.connect(args.dbfile)
    forums = get_forums(connection)
    topics = get_topics(connection)
    attachments = get_attachments(connection)
    posts = get_posts(connection)
    # One directory per forum; maps forum_id -> directory path.
    forum_paths = create_forums_folders(forums)
    for post in posts:
        post_id = post['post_id']
        topic_id = post['topic_id']
        post_username = post['post_username']
        post_text = post['post_text']
        got_attachment = post['got_attachment']
        post_time = datetime.datetime.fromtimestamp(
            post['post_time']).strftime('%Y-%m-%d %H:%M:%S')
        topic = topics[topic_id]
        topic_name = topic['name']
        forum_id = topic['forum_id']
        base_path = forum_paths[forum_id]
        post_filepath = os.path.join(
            base_path,
            '{id}_{name}.md'.format(id=topic_id, name=slugify(topic_name)))
        # Append mode: all posts of a topic accumulate in one file; a tell()
        # of 0 means the file is new, so emit the topic header first.
        with io.open(post_filepath, 'a') as f:
            if f.tell() == 0:
                # Write header
                f.write('# {}\n\n'.format(topic_name))
            else:
                f.write('\n\n')
            f.write('## {}, posted by: {}\n\n'.format(post_time,
                                                      post_username))
            # Post body is stored as HTML; convert to markdown on the way out.
            f.write(markdownify(post_text))
            if got_attachment:
                attachment_list = [
                    a['real_filename'] for (i, a) in attachments.items()
                    if a['post_id'] == post_id
                ]
                f.write('\n\n### Attachments\n\n')
                for a in attachment_list:
                    f.write(
                        '[{attachment}]({attachment})'.format(attachment=a))
def _clean(self, value):
    """Sanitize an HTML fragment and convert it to markdown; '' for falsy input."""
    if not value:
        return ""
    return markdownify(HTML_CLEANER.clean_html(value))
def render_body(self):
    """Renders standard template with context"""
    body = None
    if self.body_template is not None:
        body = Template(self.body_template).render(self.get_context())
    elif self.template_name is not None:
        body = loader.get_template(self.template_name).render(self.get_context())

    # Prefer a markdown rendering of the HTML body when markdownify is
    # installed; silently skip otherwise.
    try:
        from markdownify import markdownify
    except ImportError:
        pass
    else:
        html_body = self.render_html_body()
        if html_body is not None:
            body = markdownify(html_body, convert=ALLOWED_TAGS)

    if body is None:
        raise MissingBody('The email does not have a body. Either'
                          ' provide a body or template_name or, if you'
                          ' really want to send an email without a'
                          ' body, set the body to an empty string'
                          ' explicitly.')
    return body
def __init__(self, Tree):
    """Index transcript <text> elements by start time; duration is the latest end."""
    self.Tree = Tree
    elements = {}
    for node in self.Tree.findall("text"):
        begin = float(node.attrib["start"])
        length = float(node.attrib["dur"])
        elements[begin] = {
            "start": begin,
            "duration": length,
            "end": round(begin + length, 2),
            "text": node.text,
            "markdown": markdownify(node.text).strip(),
        }
    self.TextElements = elements
    # Latest end time across all elements.
    self.duration = max(entry["end"] for entry in self.TextElements.values())
    self.time = 0.0
    self.current = None
def cleanup_html(cell: str) -> Optional[str]:
    """Convert an HTML table cell to tidied markdown; None passes through."""
    if cell is not None:
        # Drop empty paragraphs and normalize an odd space variant seen in
        # the data (presumably a non-breaking space — TODO confirm).
        cleaned: str = cell.replace("<p> </p>", "").replace(" ", " ")
        markdown: str = markdownify(cleaned).strip()
        # Example Formatting Locator Strings:
        # The Things They Carry Video Series
        # The FBI is offering a reward for information leading to the arrest of Juan Carlos Martinez
        # Seen HTML Tags: ['br', 'div', 'p', 'a', 'li', 'ol', 'ul', 'strong']
        # This cleans newlines up and only allows a max of 1 blank line in between. Not exhaustingly tested
        cleaned_lines: List[str] = []
        counter: int = 0
        for line in markdown.splitlines():
            if line.strip():  # Not An Empty Line
                counter: int = 0
                cleaned_lines.append(line.strip())
            else:  # Empty Line
                # NOTE(review): a blank line is only emitted on the third
                # consecutive empty source line (counter == 2); single blank
                # lines are dropped entirely. Verify this matches the
                # "max of 1 blank line" intent stated above.
                if counter == 2:
                    cleaned_lines.append("")
                counter: int = counter + 1
        # Not entirely sure how one random instance of <p> still escapes cleaning.
        return "\n".join(cleaned_lines).replace("<p>", "")
    return None
def render_body(self):
    """Renders standard template with context"""
    if self.body_template is not None:
        body = Template(self.body_template).render(self.get_context())
    elif self.template_name is not None:
        body = loader.get_template(self.template_name).render(
            self.get_context_data())
    else:
        # Fall back to a statically assigned body attribute, if any.
        body = getattr(self, 'body', None)

    # Prefer a markdown rendering of the HTML body when markdownify is
    # installed; silently skip otherwise.
    try:
        from markdownify import markdownify
    except ImportError:
        pass
    else:
        html_body = self.render_html_body()
        if html_body is not None:
            body = markdownify(html_body, convert=ALLOWED_TAGS)

    if body is None:
        raise MissingBody('The email does not have a body. Either'
                          ' provide a body or template_name or, if you'
                          ' really want to send an email without a'
                          ' body, set the body to an empty string'
                          ' explicitly.')
    return body
def soup_to_md(soup):
    """Convert a BeautifulSoup document to markdown.

    Tags listed in COPY_TAGS are kept as prettified HTML; everything else
    is converted. Output is normalized: no trailing whitespace, at most one
    blank line in a row, tabs to spaces, and non-ASCII characters either
    deleted (DELETE_UNICODE) or escaped as HTML entities.
    """
    # convert top-level elements individually to remove leading space
    chunks = []
    for element in soup.contents:
        if element.name in COPY_TAGS:
            chunks.append(to_pretty_html(element))
        else:
            # convert HTML to markdown, dropping blank lines per element
            piece = markdownify(str(element), heading_style='ATX',
                                bullets='*').strip()
            piece = '\n'.join(ln.rstrip() for ln in piece.splitlines()
                              if ln.strip())
            chunks.append(piece.strip())
    result = '\n\n'.join(chunks)
    result = '\n'.join(ln.rstrip() for ln in result.splitlines())
    result = re.sub('\n\n\n+', '\n\n', result)
    result = result.replace('\t', ' ')
    for codepoint in DELETE_UNICODE:
        result = result.replace(chr(codepoint), '')
    # Escape remaining non-printable-ASCII characters as numeric entities.
    for ch in set(re.findall('[^ -~\n]', result)):
        result = result.replace(ch, f'&#{ord(ch)};')
    return result.strip()
def __init__(self, Tree):
    """Index transcript <text> elements by start time; duration is the latest end."""
    self.Tree = Tree
    elements = {}
    for node in self.Tree.findall('text'):
        begin = float(node.attrib['start'])
        length = float(node.attrib['dur'])
        elements[begin] = {
            'start': begin,
            'duration': length,
            'end': round(begin + length, 2),
            'text': node.text,
            'markdown': markdownify(node.text)
        }
    self.TextElements = elements
    # Latest end time across all elements.
    self.duration = max(entry['end'] for entry in self.TextElements.values())
    self.time = 0.0
    self.current = None
def load_assignment_details(assignment_ids, api_url, api_key, current_course):
    """Dash callback: resolve selected assignment id(s) into detail fields.

    Returns a 15-tuple matching the callback outputs; empty placeholders
    when the selection is an empty list, PreventUpdate when it is None.
    """
    if assignment_ids is None:
        raise PreventUpdate
    elif assignment_ids == []:
        return ([],'','','','','','','','','','',[],[],[],[])
    course = get_course(api_url, api_key, current_course)
    assignments = course.assignments
    assignment_groups = course.assignment_groups
    # TODO: support multiple assignments
    assignments = [x for x in assignments if x.id in assignment_ids]
    # Only the first matching assignment is used below.
    assignment = assignments[0]
    details = assignment.description
    if len(assignments) > 1:
        assignment_groups = []
    else:
        pass
    return (
        [{'label': x.name, 'value': x.id} for x in assignment_groups],  # assignment_groups options
        assignment.assignment_group_id,
        assignment.id,
        assignment.created_at,
        assignment.updated_at,
        assignment.position,
        sum([getattr(assignment,'needs_grading_count',0)]),
        assignment.name,
        assignment.due_at,
        assignment.points_possible,
        # description is HTML in Canvas; shown as markdown in the UI.
        markdownify(details or '_No description set in Canvas_'),
        details,
        [{'label': 'None', 'value': 'None'}],
        assignment.grading_type,
        [x for x in assignment.submission_types],
    )
def map(cls, breach: JSON) -> JSON:
    """Map an HIBP breach record onto a CTIM indicator JSON."""
    indicator: JSON = cls.DEFAULTS.copy()
    indicator['id'] = transient_id(indicator, breach["Name"])
    # `BreachDate` itself is just a date with no time (i.e. YYYY-MM-DD),
    # so make sure to add some time to make the date comply with ISO 8601.
    indicator['valid_time'] = {
        'start_time': breach['BreachDate'] + 'T00:00:00Z'
    }
    verified = breach['IsVerified']
    indicator['confidence'] = 'High' if verified else 'Medium'
    # `Description` contains an overview of the breach represented in HTML,
    # so convert its contents to Markdown to make it comply with CTIM.
    indicator['description'] = markdownify(breach['Description'])
    severe = verified and 'Passwords' in breach['DataClasses']
    indicator['severity'] = 'High' if severe else 'Medium'
    indicator['short_description'] = breach['Title']
    indicator['tags'] = breach['DataClasses']
    indicator['title'] = breach['Name']
    return indicator
async def get_page_contents(self) -> Union[str, Embed]:
    """Fetch the intro extract of the current Wikipedia page as an embed.

    Returns an error embed (instead of raising) on timeout or connection
    problems.
    """
    title = self.pages[self.page_index]
    params = {"action": "query",
              "prop": "extracts",
              "titles": title,
              "format": "json",
              "exintro": "true",
              # BUG FIX: the MediaWiki TextExtracts parameter is
              # `explaintext` — the original "explainttext" was ignored,
              # so extracts came back as HTML instead of plain text.
              "explaintext": "true"}
    # async-with closes the session even when the request fails; the
    # original leaked the session on timeout/connection errors.
    async with ClientSession(timeout=ClientTimeout(20)) as session:
        try:
            async with session.get("https://ru.wikipedia.org/w/api.php",
                                   params=params) as response:
                data = await response.json()
        except (asyncio.TimeoutError, ClientConnectionError):
            return styled_embed_generator.get_embed(
                Style.ERROR, self.tr.translate("search_connection_error"))
    pages = data["query"]["pages"]
    text = data["query"]["pages"][next(iter(pages))]["extract"]
    text = markdownify(text, strip=["img"])
    embed = styled_embed_generator.get_embed(Style.INFO, text, title=title,
                                             author=self.original_author,
                                             guild=self.guild)
    return embed
def __init__(self, Tree: ElementTree):
    """Index transcript <text> elements by start time; duration is the latest end."""
    super().__init__()
    self.Tree = Tree
    elements = {}
    for node in self.Tree.findall("text"):
        begin = float(node.attrib["start"])
        length = float(node.attrib["dur"])
        elements[begin] = {
            "start": begin,
            "duration": length,
            "end": round(begin + length, 2),
            "text": node.text,
            "markdown": markdownify(node.text),
        }
    self.TextElements = elements
    # Latest end time across all elements.
    self.duration = max(entry["end"] for entry in self.TextElements.values())
def __init__(self, Data: str):
    """Parse transcript XML, indexing timed <text> elements by start time."""
    super().__init__()
    self.Tree = ElementTree.fromstring(Data)
    entries = {}
    for node in self.Tree.findall("text"):
        # Elements lacking a "dur" attribute are skipped entirely.
        if "dur" not in node.attrib:
            continue
        begin = float(node.attrib["start"])
        length = float(node.attrib["dur"])
        entries[begin] = {
            "start": begin,
            "duration": length,
            "end": round(begin + length, 2),
            "text": node.text,
            "markdown": markdownify(node.text),
        }
    self.TextElements = entries
    # Latest end time across all elements.
    self.duration = max(entry["end"] for entry in self.TextElements.values())
async def get_diary_embed(dids):
    """Build a Discord embed summarizing Letterboxd diary entries.

    Args:
        dids: Iterable of log-entry ids to fetch and render.

    Returns:
        discord.Embed with one formatted line-group per entry; the
        thumbnail is the poster of the LAST film iterated.
    """
    description = ''
    for did in dids:
        d_entry = await api.api_call(path=f'log-entry/{did}')
        film = d_entry['film']
        description += f"**[{film['name']} ({film['releaseYear']})]"
        description += f'({get_link(d_entry)})**\n'
        if 'diaryDetails' in d_entry:
            description += f"**{d_entry['diaryDetails']['diaryDate']}** "
        if 'rating' in d_entry:
            # Whole stars; a rating string ending in 5 (e.g. "3.5")
            # gets a half-star glyph appended.
            description += ' ' + int(d_entry['rating']) * '★'
            if str(d_entry['rating'])[-1] == '5':
                description += '½ '
        if d_entry['like']:
            description += ' <3'
        # NOTE(review): accesses diaryDetails unconditionally here —
        # confirm the API always includes it when 'like' entries exist.
        if d_entry['diaryDetails']['rewatch']:
            description += ' ↺'
        if 'review' in d_entry:
            if d_entry['review']['containsSpoilers']:
                description += '\n```Contains spoilers```'
            else:
                description += '\n```' + markdownify(d_entry['review']['text'][:1600]) + '```'
        description += '\n'
    embed = discord.Embed(description=description)
    if 'poster' in film:
        embed.set_thumbnail(url=film['poster']['sizes'][-1]['url'])
    return embed
def hyper_markdownify(s):
    """Repeatedly JSON-decode `s` until it stops being a JSON string, then markdownify."""
    decoded = s
    while True:
        try:
            decoded = json.loads(decoded)
        except (json.decoder.JSONDecodeError, TypeError):
            # Not decodable any further — convert what we have.
            return markdownify(decoded)
def dump(self, post: Post) -> None:
    """Write `post` to disk as front-matter markdown (content converted from HTML)."""
    metadata = json.loads(post.json(exclude={"canonical_url", "filepath"}))
    body = markdownify(metadata.pop("content"))
    document = frontmatter.Post(body, **metadata)
    frontmatter.dump(document, post.filepath, encoding="utf-8")
def get_summary(entry: feedparser.FeedParserDict) -> str:
    """Return the entry summary as plain text; '' when the entry has none."""
    try:
        text = entry.summary
        if entry.summary_detail.type in ('text/html', 'text/xml'):
            # HTML/XML summaries: markdownify then strip remaining tags.
            text = remove_html_tags(markdownify(text))
        return text
    except AttributeError:
        return ''
def write_to_file(article, header):
    """Write `header`, a divider and the article (as ATX markdown) to its file.

    Args:
        article: Object with `.title` (used to build the path) and `.html`.
        header: Front-matter/header text written before the divider.
    """
    file_path = _build_path(article.title)
    markdown = markdownify.markdownify(article.html, heading_style='ATX')
    # Single write-mode open replaces the original open('w')/open('a')
    # pair; the bytes written are identical.
    with open(file_path, 'w') as f:
        f.write(header)
        f.write('------')
        f.write('\n')
        f.write(markdown)
def extract_data(html):
    """Parse an hh.ru vacancy page into title / company / markdown body."""
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find('h1', attrs={'data-qa': "vacancy-title"}).text
    body = soup.find('div', attrs={'data-qa': "vacancy-description"})
    company = soup.find('a', class_="vacancy-company-name")
    if company:
        # Replace non-breaking spaces before resolving the company record.
        name = company.text.replace(u'\xa0', u' ')
        company = company_data(company['href'], name)
    return {'title': title, 'company': company, 'body': markdownify(str(body))}
def main():
    """Convert the HTML file at SOURCE_PATH into markdown at TARGET_PATH."""
    # Context managers close the files even if conversion fails; the
    # original manual open/close also shadowed the builtin `file`/`open`.
    with open(SOURCE_PATH) as src:
        source = src.read()
    target = markdownify.markdownify(source)
    with open(TARGET_PATH, "w") as dst:
        dst.write(target)
def request_details(cls, request):
    """Fetch CoinGecko details for the requested ticker.

    Args:
        request: Dict with a "ticker" entry holding "symbol" and "base".

    Returns:
        [payload, ""] — payload is a detail dict, or {} when the CoinGecko
        calls fail (best-effort contract preserved).
    """
    ticker = request.get("ticker")
    try:
        assetData = CoinGecko.connection.get_coin_by_id(
            id=ticker.get("symbol"), localization="false", tickers=False,
            market_data=True, community_data=True, developer_data=True)
        historicData = CoinGecko.connection.get_coin_ohlc_by_id(
            id=ticker.get("symbol"), vs_currency="usd", days=365)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed SystemExit
        # and KeyboardInterrupt. Exception keeps the best-effort behavior.
        return [{}, ""]
    description = markdownify(assetData["description"].get("en", "No description"))
    descriptionParagraphs = description.split("\r\n\r\n")
    # Greedily include paragraphs until the text gets too long:
    # stop past 500 chars (once already over 300) or past 1900 overall.
    textLength = [len(descriptionParagraphs[0])]
    for i in range(1, len(descriptionParagraphs)):
        nextLength = textLength[-1] + len(descriptionParagraphs[i])
        if nextLength > 500 and textLength[-1] > 300 or nextLength > 1900:
            break
        textLength.append(nextLength)
    description = "\n".join(descriptionParagraphs[:len(textLength)]) + \
        "\n[Read more on CoinGecko](https://www.coingecko.com/coins/{})".format(
            ticker.get("symbol"))
    # OHLC rows are [time, open, high, low]; extract yearly highs/lows.
    highs = [e[2] for e in historicData]
    lows = [e[3] for e in historicData]
    payload = {
        "name": "{} ({})".format(assetData["name"], ticker.get("base")),
        "description": description,
        "rank": assetData["market_data"]["market_cap_rank"],
        "supply": {},
        "score": {
            "developer": assetData["developer_score"],
            "community": assetData["community_score"],
            "liquidity": assetData["liquidity_score"],
            "public interest": assetData["public_interest_score"]
        },
        "price": {
            "current": assetData["market_data"]["current_price"].get("usd"),
            "ath": assetData["market_data"]["ath"].get("usd"),
            "atl": assetData["market_data"]["atl"].get("usd")
        },
        "change": {
            "past day": assetData["market_data"]["price_change_percentage_24h_in_currency"].get("usd"),
            "past month": assetData["market_data"]["price_change_percentage_30d_in_currency"].get("usd"),
            "past year": assetData["market_data"]["price_change_percentage_1y_in_currency"].get("usd")
        },
        "sourceText": "Data from CoinGecko",
        "platform": "CoinGecko",
    }
    if assetData["image"]["large"].startswith("http"):
        payload["image"] = assetData["image"]["large"]
    if assetData["links"]["homepage"][0] != "":
        # Compute the de-spaced homepage once instead of three times.
        homepage = assetData["links"]["homepage"][0].replace(" ", "")
        payload["url"] = homepage if homepage.startswith("http") else "https://" + homepage
    if assetData["market_data"]["total_volume"] is not None:
        payload["volume"] = assetData["market_data"]["total_volume"].get("usd")
    if assetData["market_data"]["market_cap"] is not None:
        payload["marketcap"] = assetData["market_data"]["market_cap"].get("usd")
    if assetData["market_data"]["total_supply"] is not None:
        payload["supply"]["total"] = assetData["market_data"]["total_supply"]
    if assetData["market_data"]["circulating_supply"] is not None:
        payload["supply"]["circulating"] = assetData["market_data"]["circulating_supply"]
    if len(highs) != 0:
        payload["price"]["1y high"] = max(highs)
    if len(lows) != 0:
        payload["price"]["1y low"] = min(lows)
    return [payload, ""]
def main(argv):
    """Batch-convert exported HTML articles in argv[0] to markdown in argv[1].

    Extracts the '---' front matter, injects title/author_name fields, and
    renames each converted file to .md. On any failure the output directory
    contents are deleted. argv[2]'s mere presence toggles windows_encoding.
    """
    input_dir = argv[0]
    output_dir = argv[1]
    try:
        windows_encoding = argv[2] is not None
    except IndexError:
        windows_encoding = False
    print("Input directory: " + input_dir)
    print("Output directory: " + output_dir)
    copy_files(input_dir, output_dir)
    print("Copied files to output directory.")
    files = get_html_files(output_dir)
    try:
        for f in files:
            print("Editing file: " + os.path.basename(f))
            with open(f, 'r', encoding='utf-8') as original:
                filedata = original.read()
            # Pull the '---' front-matter block out of the document body.
            font_matter: str = re.search(pattern=r'---[\s\S]*---', string=filedata)[0]
            filedata = re.sub(pattern=r'---[\s\S]*---', repl="", string=filedata)
            soup = BeautifulSoup(filedata, "html.parser")
            # Author comes from the profile card; parenthesized alias removed.
            author_name = getattr(soup.find(class_="profile-usercard-hover"), 'text', '')
            author_name = re.sub(pattern=r"\([a-zA-Z\s]*\)", repl="", string=author_name)
            author_name = "author_name: " + author_name
            # Title is the filename minus extension and a leading date prefix.
            title = os.path.basename(f).replace(".html", "").replace(".md", "")
            title = re.sub(pattern=r"\d*-\d*-\d*-", repl="", string=title)
            title = "title: " + "\"" + title + "\""
            font_matter = (font_matter[0:3] + "\n" +
                           title + "\n" +
                           author_name +
                           font_matter[3:len(font_matter)] +
                           "\n")
            with open(f, 'w', encoding='utf-8') as modified:
                # BUG FIX: markdownify takes `heading_style`, not `header`;
                # the original header='ATX' was silently ignored, so the
                # intended ATX ('#') headings were never produced.
                converted_article = font_matter + markdownify(
                    filedata, bullets='-', heading_style='ATX')
                modified.write(converted_article)
            newname = f.replace('.html', '.md')
            output = os.rename(f, newname)
    except Exception as e:
        print(e)
        print('Batch process failed. Deleting the contents of the output directory.')
        for filename in os.listdir(output_dir):
            filepath = os.path.join(output_dir, filename)
            try:
                shutil.rmtree(filepath)
            except OSError:
                # Not a directory — remove as a plain file.
                os.remove(filepath)
    print('Done.')
    exit()
def get_edit(self, id):
    """
    Get route for Editing an Item
    TODO: Make decorator for "ownership" of item
    """
    item = Item.query.get_or_404(id)
    if current_user.id != item.user_id:
        # Not the owner — bounce back to the read-only item view.
        return redirect(url_for('.item', id=id))
    form = self._commentForm(request)
    form.text.data = markdownify(item.text)
    form.edit.data = True
    return render_template('item/item.html', item=item, form=form,
                           title=item.title, edit=True)
def process(url, start=0, fetch=50):
    """Main processing engine.

    Pages through the Tumblr XML API at `url` (a format string with two %s
    slots for start/fetch), converts each post to markdown and writes one
    file per post into `postFolder`; photos are downloaded into
    `photoFolder` (named by date, slug and content hash).
    """
    pos = start
    # End will be updated during each request with incoming data
    end = pos + fetch
    Console.header("Tumblr Import")
    Console.info("Importing data...")
    Console.indent()
    while pos < end:
        Console.info("Requesting %s-%s of %s" % (pos, pos + fetch - 1, end))
        response = requests.get(url % (pos, fetch))
        if response.status_code != 200:
            # BUG FIX: the original interpolated `r.status` — `r` was
            # undefined, so this path raised NameError instead of the
            # intended error message.
            raise Exception("Error during communication with Tumblr: %s" % response.status_code)
        tree = ElementTree.fromstring(response.content)
        # This element contains all posts
        allPosts = tree.find("posts")
        # Update end pointer
        end = int(allPosts.get("total"))
        # Iterate trough all posts
        for post in allPosts:
            postType = post.get("type")
            postTimeStamp = post.get("unix-timestamp")
            postExportDate = str(datetime.datetime.fromtimestamp(int(postTimeStamp)))
            postSlug = post.get("slug")
            postFormat = post.get("format")
            postDateOnly = postExportDate[0:postExportDate.find(" ")]
            postFileName = "%s-%s" % (postDateOnly, postSlug)
            if postType == "quote":
                quoteText = post.find("quote-text").text
                quoteComment = post.find("quote-source").text
                # Post-process
                quoteText = markdownify.markdownify("<blockquote>" + quoteText + "</blockquote>").rstrip("\n").lstrip("\n")
                quoteComment = markdownify.markdownify(quoteComment).rstrip("\n")
                fileContent = quoteTemplate % (postSlug, postExportDate, quoteText + "\n\n" + quoteComment)
            elif postType == "photo":
                photoText = post.find("photo-caption").text
                try:
                    photoLinkUrl = post.find("photo-link-url").text
                except AttributeError:
                    # Element absent: find() returned None (was a bare except).
                    photoLinkUrl = None
                photoUrl = post.find("photo-url").text
                # Post-process
                photoText = markdownify.markdownify(photoText).rstrip("\n")
                # Downloading image
                photoResponse = requests.get(photoUrl, allow_redirects=True)
                if photoResponse.status_code != 200:
                    Console.error("Unable to load photo. Status: %s; URL: %s", photoResponse.status_code, photoUrl)
                    continue
                # Build extension based on response headers (safer than using file extension)
                photoType = photoResponse.headers["content-type"]
                if "png" in photoType:
                    photoExtension = ".png"
                elif "jpeg" in photoType or "jpg" in photoType:
                    photoExtension = ".jpeg"
                elif "gif" in photoType:
                    photoExtension = ".gif"
                else:
                    Console.error("Unknown photo format: %s; Status: %s; URL: %s", photoType, photoResponse.status_code, photoUrl)
                    continue
                # Generating checksum
                photoHash = hashlib.sha1(photoResponse.content).hexdigest()
                # Generate file name and path from existing data
                photoFileName = "%s-%s-%s%s" % (postDateOnly, postSlug, photoHash[0:10], photoExtension)
                photoPath = os.path.join(photoFolder, photoFileName)
                # Do not repeatly write identical files
                if not os.path.exists(photoPath):
                    # with-block closes the handle even on write errors.
                    with open(photoPath, "wb") as photoFile:
                        photoFile.write(photoResponse.content)
                # Generate basic image tag
                photoAsset = '<img src="{{@asset.url %s/%s/%s}}" alt=""/>' % (projectName, photoAssetFolder, photoFileName)
                # Wrap with a link when it should be link to an external site
                if photoLinkUrl:
                    photoAsset = '<a href="%s">%s</a>' % (photoLinkUrl, photoAsset)
                fileContent = photoTemplate % (postSlug, postExportDate, photoAsset + "\n\n" + photoText)
            elif postType == "link":
                linkUrl = post.find("link-url").text
                try:
                    linkText = post.find("link-text").text
                except AttributeError:
                    # No <link-text> element: fall back to the URL itself.
                    linkText = linkUrl
                # Post-process
                if linkText != linkUrl:
                    linkText = markdownify.markdownify(linkText).rstrip("\n")
                fileContent = linkTemplate % (postSlug, postExportDate, "[%s](%s)" % (linkText, linkUrl))
            elif postType == "video":
                videoCode = post.find("video-source").text
                videoText = post.find("video-caption").text
                # Post-process
                videoText = markdownify.markdownify(videoText).rstrip("\n")
                fileContent = videoTemplate % (postSlug, postExportDate, videoCode + "\n\n" + videoText)
            elif postType == "regular":
                postText = post.find("regular-body").text
                try:
                    postTitle = post.find("regular-title").text
                except AttributeError:
                    # Ignore posts without title
                    Console.warn("Ignoring post without title!")
                    continue
                postText = markdownify.markdownify(postText).rstrip("\n")
                fileContent = regularTemplate % (postSlug, postExportDate, postTitle, postText)
            else:
                Console.warn("Unknown POST-TYPE: %s" % postType)
                print(ElementTree.dump(post))
                continue
            # Write post file
            with open(os.path.join(postFolder, postDateOnly + "-" + postType + "-" + postSlug + ".markdown"), "w") as fileHandle:
                fileHandle.write(fileContent)
        # Update for next requests
        pos = pos + fetch
    Console.outdent()
    Console.info("Successfully imported")
#!/usr/bin/env python
# some comment as a example
from markdownify import markdownify

# Read the whole file at once; simpler than joining readlines().
with open('example.html') as f:
    lines = f.read()

md = markdownify(lines)
# BUG FIX: `print md` is Python 2 statement syntax — a SyntaxError on
# Python 3. Use the print() function.
print(md)


class MyClass(object):
    def __init__(self):
        # TODO: write something!
        pass