def wp2fields(xml, wp_custpost=False):
    """Parse a WordPress export XML file and yield Pelican field tuples.

    Only items whose WordPress status is ``publish`` or ``draft`` are
    yielded.  Each yielded tuple is ``(title, content, filename, date,
    author, categories, tags, status, kind, 'wp-html')``.
    """
    soup = xml_to_soup(xml)
    for entry in soup.rss.channel.findAll('item'):
        raw_status = entry.find('status').string
        if raw_status not in ("publish", "draft"):
            continue

        try:
            # Use HTMLParser due to issues with BeautifulSoup 3
            title = unescape(entry.title.contents[0])
        except IndexError:
            title = 'No title [%s]' % entry.find('post_name').string
            logger.warning('Post "%s" is lacking a proper title', title)

        slug_source = entry.find('post_name').string
        entry_id = entry.find('post_id').string
        filename = get_filename(slug_source, entry_id)

        content = entry.find('encoded').string

        raw_date = entry.find('post_date').string
        if raw_date == u'0000-00-00 00:00:00':
            # WordPress stores this zero sentinel when no date is set.
            date = None
        else:
            parsed = SafeDatetime.strptime(raw_date, '%Y-%m-%d %H:%M:%S')
            date = parsed.strftime('%Y-%m-%d %H:%M')

        author = entry.find('creator').string

        categories = [c.string for c in
                      entry.findAll('category', {'domain': 'category'})]
        tags = [t.string for t in
                entry.findAll('category', {'domain': 'post_tag'})]

        # To publish a post the status should be 'published'; any other
        # WordPress status string is passed through unchanged.
        status = 'published' if raw_status == "publish" else raw_status

        post_type = entry.find('post_type').string
        if post_type == 'page':
            kind = 'page'
        elif wp_custpost and post_type not in ('post', 'attachment'):
            # Custom post types keep their own kind.  'post' and
            # 'attachment' fall through to 'article': the old behaviour
            # named everything that is not a page an article, and
            # attachments should normally have status == inherit and not
            # reach this point anyway.
            kind = post_type
        else:
            kind = 'article'

        yield (title, content, filename, date, author, categories,
               tags, status, kind, 'wp-html')
def posterous2fields(api_token, email, password):
    """Imports posterous posts.

    Pages through the Posterous API and yields one tuple per post:
    ``(title, body, slug, date, author, categories, tags, status, kind,
    'html')``.
    """
    import base64
    from datetime import timedelta

    try:
        # py3k import
        import json
    except ImportError:
        # py2 import
        import simplejson as json

    try:
        # py3k import
        import urllib.request as urllib_request
    except ImportError:
        # py2 import
        import urllib2 as urllib_request

    def get_posterous_posts(api_token, email, password, page=1):
        # base64.b64encode never inserts newlines and exists on both
        # Python 2 and 3.  The previous
        # base64.encodestring(...).replace("\n", "") raised TypeError on
        # Python 3 (str arguments to bytes.replace), and encodestring
        # itself was removed in Python 3.9.
        base64string = base64.b64encode(
            ("%s:%s" % (email, password)).encode("utf-8"))
        url = ("http://posterous.com/api/v2/users/me/sites/primary/"
               "posts?api_token=%s&page=%d") % (api_token, page)
        request = urllib_request.Request(url)
        request.add_header("Authorization",
                           "Basic %s" % base64string.decode())
        handle = urllib_request.urlopen(request)
        return json.loads(handle.read().decode("utf-8"))

    page = 1
    while True:
        # Request each page exactly once; the old loop primed its
        # condition with a page-1 fetch and then immediately re-fetched
        # page 1 inside the loop body.
        posts = get_posterous_posts(api_token, email, password, page)
        if not posts:
            break
        page += 1
        for post in posts:
            slug = post.get("slug")
            if not slug:
                slug = slugify(post.get("title"))
            tags = [tag.get("name") for tag in post.get("tags")]
            raw_date = post.get("display_date")
            # display_date ends with a numeric UTC offset (e.g. " -0500");
            # parse the naive part, then subtract the offset.
            # NOTE(review): hours=offset / 100 mishandles half-hour
            # offsets such as +0530 -- confirm whether Posterous ever
            # emitted those before changing it.
            date_object = SafeDatetime.strptime(
                raw_date[:-6], "%Y/%m/%d %H:%M:%S")
            offset = int(raw_date[-5:])
            delta = timedelta(hours=(offset / 100))
            date_object -= delta
            date = date_object.strftime("%Y-%m-%d %H:%M")
            kind = "article"      # TODO: Recognise pages
            status = "published"  # TODO: Find a way for draft posts

            yield (
                post.get("title"),
                post.get("body_cleaned"),
                slug,
                date,
                post.get("user").get("display_name"),
                [],
                tags,
                status,
                kind,
                "html",
            )
def wp2fields(xml, wp_custpost=False):
    """Opens a wordpress XML file, and yield Pelican fields.

    Only items with WordPress status 'publish' or 'draft' are yielded.
    Each yielded value is a 10-tuple:
    (title, content, filename, date, author, categories, tags, status,
    kind, 'wp-html').
    """
    soup = xml_to_soup(xml)
    items = soup.rss.channel.findAll('item')
    for item in items:
        if item.find('status').string in ["publish", "draft"]:
            try:
                # Use HTMLParser due to issues with BeautifulSoup 3
                title = unescape(item.title.contents[0])
            except IndexError:
                # Item had no title contents; fall back to the post slug.
                title = 'No title [%s]' % item.find('post_name').string
                logger.warning('Post "%s" is lacking a proper title', title)

            post_name = item.find('post_name').string
            post_id = item.find('post_id').string
            filename = get_filename(post_name, post_id)

            content = item.find('encoded').string

            raw_date = item.find('post_date').string
            # WordPress stores an all-zero sentinel when no date is set;
            # map it to None instead of failing to parse.
            if raw_date == u'0000-00-00 00:00:00':
                date = None
            else:
                date_object = SafeDatetime.strptime(
                    raw_date, '%Y-%m-%d %H:%M:%S')
                date = date_object.strftime('%Y-%m-%d %H:%M')

            author = item.find('creator').string

            # Category and tag assignments both arrive as <category>
            # elements, distinguished by their 'domain' attribute.
            categories = [cat.string
                          for cat in item.findAll('category',
                                                  {'domain': 'category'})]
            tags = [tag.string
                    for tag in item.findAll('category',
                                            {'domain': 'post_tag'})]

            # To publish a post the status should be 'published'; any
            # other WordPress status string passes through unchanged.
            status = 'published' if item.find('status').string == "publish" \
                else item.find('status').string

            kind = 'article'
            post_type = item.find('post_type').string
            if post_type == 'page':
                kind = 'page'
            elif wp_custpost:
                if post_type == 'post':
                    pass
                # Old behaviour was to name everything not a page as an
                # article.Theoretically all attachments have status == inherit
                # so no attachments should be here. But this statement is to
                # maintain existing behaviour in case that doesn't hold true.
                elif post_type == 'attachment':
                    pass
                else:
                    kind = post_type

            yield (title, content, filename, date, author, categories,
                   tags, status, kind, 'wp-html')
def posterous2fields(api_token, email, password):
    """Imports posterous posts.

    Pages through the Posterous API and yields one tuple per post:
    ``(title, body, slug, date, author, categories, tags, status, kind,
    'html')``.  Slugs are generated with the site's
    SLUG_REGEX_SUBSTITUTIONS when the API does not supply one.
    """
    import base64
    from datetime import timedelta

    try:
        # py3k import
        import json
    except ImportError:
        # py2 import
        import simplejson as json

    try:
        # py3k import
        import urllib.request as urllib_request
    except ImportError:
        # py2 import
        import urllib2 as urllib_request

    def get_posterous_posts(api_token, email, password, page=1):
        # base64.b64encode never inserts newlines and exists on both
        # Python 2 and 3.  The previous
        # base64.encodestring(...).replace('\n', '') raised TypeError on
        # Python 3 (str arguments to bytes.replace), and encodestring
        # itself was removed in Python 3.9.
        base64string = base64.b64encode(
            ("%s:%s" % (email, password)).encode('utf-8'))
        url = ("http://posterous.com/api/v2/users/me/sites/primary/"
               "posts?api_token=%s&page=%d") % (api_token, page)
        request = urllib_request.Request(url)
        request.add_header('Authorization',
                           'Basic %s' % base64string.decode())
        handle = urllib_request.urlopen(request)
        return json.loads(handle.read().decode('utf-8'))

    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']

    page = 1
    while True:
        # Request each page exactly once; the old loop primed its
        # condition with a page-1 fetch and then immediately re-fetched
        # page 1 inside the loop body.
        posts = get_posterous_posts(api_token, email, password, page)
        if not posts:
            break
        page += 1
        for post in posts:
            slug = post.get('slug')
            if not slug:
                slug = slugify(post.get('title'), regex_subs=subs)
            tags = [tag.get('name') for tag in post.get('tags')]
            raw_date = post.get('display_date')
            # display_date ends with a numeric UTC offset (e.g. " -0500");
            # parse the naive part, then subtract the offset.
            # NOTE(review): hours=offset / 100 mishandles half-hour
            # offsets such as +0530 -- confirm whether Posterous ever
            # emitted those before changing it.
            date_object = SafeDatetime.strptime(
                raw_date[:-6], '%Y/%m/%d %H:%M:%S')
            offset = int(raw_date[-5:])
            delta = timedelta(hours=(offset / 100))
            date_object -= delta
            date = date_object.strftime('%Y-%m-%d %H:%M')
            kind = 'article'      # TODO: Recognise pages
            status = 'published'  # TODO: Find a way for draft posts

            yield (post.get('title'), post.get('body_cleaned'),
                   slug, date, post.get('user').get('display_name'),
                   [], tags, status, kind, 'html')
def posterous2fields(api_token, email, password):
    """Imports posterous posts.

    Pages through the Posterous API and yields one tuple per post:
    ``(title, body, slug, date, author, categories, tags, status, kind,
    'html')``.  Slugs are generated with the site's
    SLUG_REGEX_SUBSTITUTIONS when the API does not supply one.
    """
    import base64
    from datetime import timedelta

    try:
        # py3k import
        import json
    except ImportError:
        # py2 import
        import simplejson as json

    try:
        # py3k import
        import urllib.request as urllib_request
    except ImportError:
        # py2 import
        import urllib2 as urllib_request

    def get_posterous_posts(api_token, email, password, page=1):
        # base64.b64encode never inserts newlines and exists on both
        # Python 2 and 3.  The previous
        # base64.encodestring(...).replace('\n', '') raised TypeError on
        # Python 3 (str arguments to bytes.replace), and encodestring
        # itself was removed in Python 3.9.
        base64string = base64.b64encode(
            ("%s:%s" % (email, password)).encode('utf-8'))
        url = ("http://posterous.com/api/v2/users/me/sites/primary/"
               "posts?api_token=%s&page=%d") % (api_token, page)
        request = urllib_request.Request(url)
        request.add_header('Authorization',
                           'Basic %s' % base64string.decode())
        handle = urllib_request.urlopen(request)
        return json.loads(handle.read().decode('utf-8'))

    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']

    page = 1
    while True:
        # Request each page exactly once; the old loop primed its
        # condition with a page-1 fetch and then immediately re-fetched
        # page 1 inside the loop body.
        posts = get_posterous_posts(api_token, email, password, page)
        if not posts:
            break
        page += 1
        for post in posts:
            slug = post.get('slug')
            if not slug:
                slug = slugify(post.get('title'), regex_subs=subs)
            tags = [tag.get('name') for tag in post.get('tags')]
            raw_date = post.get('display_date')
            # display_date ends with a numeric UTC offset (e.g. " -0500");
            # parse the naive part, then subtract the offset.
            # NOTE(review): hours=offset / 100 mishandles half-hour
            # offsets such as +0530 -- confirm whether Posterous ever
            # emitted those before changing it.
            date_object = SafeDatetime.strptime(
                raw_date[:-6], '%Y/%m/%d %H:%M:%S')
            offset = int(raw_date[-5:])
            delta = timedelta(hours=(offset / 100))
            date_object -= delta
            date = date_object.strftime('%Y-%m-%d %H:%M')
            kind = 'article'      # TODO: Recognise pages
            status = 'published'  # TODO: Find a way for draft posts

            yield (post.get('title'), post.get('body_cleaned'),
                   slug, date, post.get('user').get('display_name'),
                   [], tags, status, kind, 'html')
def blogger2fields(xml):
    """Parse a Blogger export XML file and yield Pelican field tuples.

    Entries of unknown kind are skipped; posts, pages and comments are
    yielded as ``(title, content, filename, date, author, None, tags,
    status, kind, 'html')``.
    """
    kind_by_term = {
        'http://schemas.google.com/blogger/2008/kind#post': 'article',
        'http://schemas.google.com/blogger/2008/kind#comment': 'comment',
        'http://schemas.google.com/blogger/2008/kind#page': 'page',
    }
    soup = xml_to_soup(xml)
    for entry in soup.feed.findAll('entry'):
        term = entry.find(
            'category',
            {'scheme': 'http://schemas.google.com/g/2005#kind'}).get('term')
        kind = kind_by_term.get(term)
        if kind is None:
            continue

        # Prefer the public permalink as the slug; comments (and entries
        # without an alternate link) fall back to the last dotted
        # component of the Atom id.
        try:
            assert kind != 'comment'
            href = entry.find('link', {'rel': 'alternate'})['href']
            filename = os.path.splitext(os.path.basename(href))[0]
        except (AssertionError, TypeError, KeyError):
            filename = entry.find('id').string.split('.')[-1]

        title = entry.find('title').string or ''
        content = entry.find('content').string

        published = entry.find('published').string
        if hasattr(SafeDatetime, 'fromisoformat'):
            parsed = SafeDatetime.fromisoformat(published)
        else:
            parsed = SafeDatetime.strptime(published[:23],
                                           '%Y-%m-%dT%H:%M:%S.%f')
        date = parsed.strftime('%Y-%m-%d %H:%M')

        author = entry.find('author').find('name').string

        # blogger posts only have tags, no category
        tags = [c.get('term') for c in entry.findAll(
            'category', {'scheme': 'http://www.blogger.com/atom/ns#'})]

        # Drafts have <app:control><app:draft>yes</app:draft></app:control>
        status = 'published'
        try:
            if entry.find('control').find('draft').string == 'yes':
                status = 'draft'
        except AttributeError:
            pass

        yield (title, content, filename, date, author, None, tags,
               status, kind, 'html')
def posterous2fields(api_token, email, password):
    """Imports posterous posts.

    Pages through the Posterous API and yields one tuple per post:
    ``(title, body, slug, date, author, categories, tags, kind, 'html')``.
    """
    import base64
    from datetime import timedelta

    try:
        # py3k import
        import json
    except ImportError:
        # py2 import
        import simplejson as json

    try:
        # py3k import
        import urllib.request as urllib_request
    except ImportError:
        # py2 import
        import urllib2 as urllib_request

    def get_posterous_posts(api_token, email, password, page=1):
        # base64.b64encode never inserts newlines (so no stripping is
        # needed) and exists on both Python 2 and 3, whereas
        # base64.encodestring was deprecated and removed in Python 3.9.
        base64string = base64.b64encode(
            ("%s:%s" % (email, password)).encode('utf-8'))
        url = ("http://posterous.com/api/v2/users/me/sites/primary/"
               "posts?api_token=%s&page=%d") % (api_token, page)
        request = urllib_request.Request(url)
        request.add_header("Authorization",
                           "Basic %s" % base64string.decode())
        handle = urllib_request.urlopen(request)
        return json.loads(handle.read().decode('utf-8'))

    page = 1
    while True:
        # Request each page exactly once; the old loop primed its
        # condition with a page-1 fetch and then immediately re-fetched
        # page 1 inside the loop body.
        posts = get_posterous_posts(api_token, email, password, page)
        if not posts:
            break
        page += 1
        for post in posts:
            slug = post.get('slug')
            if not slug:
                slug = slugify(post.get('title'))
            tags = [tag.get('name') for tag in post.get('tags')]
            raw_date = post.get('display_date')
            # display_date ends with a numeric UTC offset (e.g. " -0500");
            # parse the naive part, then subtract the offset.
            # NOTE(review): hours=offset / 100 mishandles half-hour
            # offsets such as +0530 -- confirm whether Posterous ever
            # emitted those before changing it.
            date_object = SafeDatetime.strptime(raw_date[:-6],
                                                "%Y/%m/%d %H:%M:%S")
            offset = int(raw_date[-5:])
            delta = timedelta(hours=offset / 100)
            date_object -= delta
            date = date_object.strftime("%Y-%m-%d %H:%M")
            kind = 'article'  # TODO: Recognise pages

            # NOTE(review): unlike the other importers this variant yields
            # a 9-tuple with no status field; kept as-is for callers.
            yield (post.get('title'), post.get('body_cleaned'),
                   slug, date, post.get('user').get('display_name'),
                   [], tags, kind, "html")
def blogger2fields(xml):
    """Opens a blogger XML file, and yield Pelican fields.

    Entries of unknown kind are skipped; posts, pages and comments are
    yielded as a 10-tuple:
    (title, content, filename, date, author, None, tags, status, kind,
    'html').
    """
    soup = xml_to_soup(xml)
    entries = soup.feed.findAll('entry')
    for entry in entries:
        # The entry kind (post/comment/page) is encoded as the 'term' of
        # a <category> element in Google's #kind scheme.
        raw_kind = entry.find(
            'category', {'scheme': 'http://schemas.google.com/g/2005#kind'}
        ).get('term')
        if raw_kind == 'http://schemas.google.com/blogger/2008/kind#post':
            kind = 'article'
        elif raw_kind == 'http://schemas.google.com/blogger/2008/kind#comment':
            kind = 'comment'
        elif raw_kind == 'http://schemas.google.com/blogger/2008/kind#page':
            kind = 'page'
        else:
            continue

        # Prefer the public permalink as the slug; comments (and entries
        # without an alternate link) fall back to the last dotted
        # component of the Atom id.
        try:
            assert kind != 'comment'
            filename = entry.find('link', {'rel': 'alternate'})['href']
            filename = os.path.splitext(os.path.basename(filename))[0]
        except (AssertionError, TypeError, KeyError):
            filename = entry.find('id').string.split('.')[-1]

        title = entry.find('title').string or ''
        content = entry.find('content').string

        # fromisoformat (Python 3.7+) handles the full timestamp; older
        # interpreters parse a truncated, offset-free prefix instead.
        raw_date = entry.find('published').string
        if hasattr(SafeDatetime, 'fromisoformat'):
            date_object = SafeDatetime.fromisoformat(raw_date)
        else:
            date_object = SafeDatetime.strptime(
                raw_date[:23], '%Y-%m-%dT%H:%M:%S.%f')
        date = date_object.strftime('%Y-%m-%d %H:%M')

        author = entry.find('author').find('name').string

        # blogger posts only have tags, no category
        tags = [tag.get('term') for tag in entry.findAll(
            'category', {'scheme': 'http://www.blogger.com/atom/ns#'})]

        # Drafts have <app:control><app:draft>yes</app:draft></app:control>
        status = 'published'
        try:
            if entry.find('control').find('draft').string == 'yes':
                status = 'draft'
        except AttributeError:
            pass

        yield (title, content, filename, date, author, None, tags,
               status, kind, 'html')
def datetime_from_period(value):
    """Convert an archive "period" into a datetime object.

    Yearly/monthly/daily archive pages supply a "period" describing the
    timespan the page covers.  This turns it into a datetime.datetime
    for further processing.  A bare int is treated as a year.  For a
    tuple, the second element may be an integer month (1=January, ...)
    or a month name; a missing month defaults to January and a missing
    day to the 1st.

    Args
    ----
    value (tuple or int): input period

    Returns
    -------
    datetime.datetime: value converted
    """
    period = (value,) if isinstance(value, int) else value

    if len(period) == 1:
        # Year-only archive: default to January.
        month_name = SafeDatetime(2021, 1, 1).strftime("%B")
    elif isinstance(period[1], int):
        # Numeric month: render it as a month name so a single strptime
        # format below handles both spellings.
        month_name = SafeDatetime(2021, period[1], 1).strftime("%B")
    else:
        month_name = period[1]

    day = str(period[2]) if len(period) >= 3 else "1"
    stamp = "%s %s %s" % (period[0], month_name, day)
    return SafeDatetime.strptime(stamp, "%Y %B %d")
def parse(self):
    """Imports posterous posts.

    Pages through the Posterous API via self._get_posterous_posts and
    yields one blog2pelican Content entity per post.
    """
    from datetime import timedelta

    settings = read_settings()
    subs = settings["SLUG_REGEX_SUBSTITUTIONS"]

    page = 1
    while True:
        # Request each page exactly once; the previous loop primed its
        # condition with a page-1 fetch and then immediately re-fetched
        # page 1 inside the loop body.
        posts = self._get_posterous_posts(page)
        if not posts:
            break
        page += 1
        for post in posts:
            slug = post.get("slug")
            if not slug:
                slug = slugify(post.get("title"), regex_subs=subs)
            tags = [tag.get("name") for tag in post.get("tags")]
            raw_date = post.get("display_date")
            # display_date ends with a numeric UTC offset (e.g. " -0500");
            # parse the naive part, then subtract the offset.
            # NOTE(review): hours=offset / 100 mishandles half-hour
            # offsets such as +0530 -- confirm whether that matters here.
            date_object = SafeDatetime.strptime(
                raw_date[:-6], "%Y/%m/%d %H:%M:%S"
            )
            offset = int(raw_date[-5:])
            delta = timedelta(hours=(offset / 100))
            date_object -= delta
            date = date_object.strftime("%Y-%m-%d %H:%M")
            kind = "article"  # TODO: Recognise pages
            status = "published"  # TODO: Find a way for draft posts
            yield blog2pelican.entities.content.Content(
                title=post.get("title"),
                content=post.get("body_cleaned"),
                slug=slug,
                date=date,
                author=post.get("user").get("display_name"),
                categories=[],
                tags=tags,
                status=status,
                kind=kind,
                markup="html",
            )
def parse(self):
    """Opens a wordpress XML file, and yield Pelican fields.

    Reads the export at self.filepath; only items with WordPress status
    "publish" or "draft" are yielded, each as a blog2pelican Content
    entity with markup "wp-html".
    """
    soup = xml_to_soup(self.filepath)
    items = soup.rss.channel.findAll("item")
    for item in items:
        if item.find("status").string in ["publish", "draft"]:
            try:
                # Use HTMLParser due to issues with BeautifulSoup 3
                title = unescape(item.title.contents[0])
            except IndexError:
                # Item had no title contents; fall back to the post slug.
                title = "No title [%s]" % item.find("post_name").string
                logger.warning('Post "%s" is lacking a proper title', title)

            post_name = item.find("post_name").string
            post_id = item.find("post_id").string
            filename = get_filename(post_name, post_id)

            content = item.find("encoded").string

            raw_date = item.find("post_date").string
            # WordPress stores an all-zero sentinel when no date is set;
            # map it to None instead of failing to parse.
            if raw_date == u"0000-00-00 00:00:00":
                date = None
            else:
                date_object = SafeDatetime.strptime(
                    raw_date, "%Y-%m-%d %H:%M:%S")
                date = date_object.strftime("%Y-%m-%d %H:%M")

            author = item.find("creator").string

            # Category and tag assignments both arrive as <category>
            # elements, distinguished by their "domain" attribute.
            categories = [
                cat.string
                for cat in item.findAll("category", {"domain": "category"})
            ]
            tags = [
                tag.string
                for tag in item.findAll("category", {"domain": "post_tag"})
            ]

            # To publish a post the status should be 'published'
            status = ("published"
                      if item.find("status").string == "publish"
                      else item.find("status").string)

            kind = "article"
            post_type = item.find("post_type").string
            if post_type == "page":
                kind = "page"
            elif self.wp_custpost:
                if post_type == "post":
                    pass
                # Old behaviour was to name everything not a page as an
                # article.Theoretically all attachments have status == inherit
                # so no attachments should be here. But this statement is to
                # maintain existing behaviour in case that doesn't hold true.
                elif post_type == "attachment":
                    pass
                else:
                    kind = post_type

            yield blog2pelican.entities.content.Content(
                title=title,
                content=content,
                slug=filename,
                date=date,
                author=author,
                categories=categories,
                tags=tags,
                status=status,
                kind=kind,
                markup="wp-html",
            )
def parse(self):
    """Parse the Blogger export at self.filepath and yield Pelican
    field tuples.

    Entries of unknown kind are skipped; posts, pages and comments are
    yielded as ``(title, content, filename, date, author, None, tags,
    status, kind, "html")``.
    """
    kind_by_term = {
        "http://schemas.google.com/blogger/2008/kind#post": "article",
        "http://schemas.google.com/blogger/2008/kind#comment": "comment",
        "http://schemas.google.com/blogger/2008/kind#page": "page",
    }
    soup = xml_to_soup(self.filepath)
    for entry in soup.feed.findAll("entry"):
        term = entry.find(
            "category", {"scheme": "http://schemas.google.com/g/2005#kind"}
        ).get("term")
        kind = kind_by_term.get(term)
        if kind is None:
            continue

        # Prefer the public permalink as the slug; comments (and entries
        # without an alternate link) fall back to the last dotted
        # component of the Atom id.
        try:
            assert kind != "comment"
            href = entry.find("link", {"rel": "alternate"})["href"]
            filename = os.path.splitext(os.path.basename(href))[0]
        except (AssertionError, TypeError, KeyError):
            filename = entry.find("id").string.split(".")[-1]

        title = entry.find("title").string or ""
        content = entry.find("content").string

        published = entry.find("published").string
        if hasattr(SafeDatetime, "fromisoformat"):
            parsed = SafeDatetime.fromisoformat(published)
        else:
            parsed = SafeDatetime.strptime(
                published[:23], "%Y-%m-%dT%H:%M:%S.%f"
            )
        date = parsed.strftime("%Y-%m-%d %H:%M")

        author = entry.find("author").find("name").string

        # Blogger entries carry tags only -- there is no category concept.
        tags = [
            c.get("term")
            for c in entry.findAll(
                "category", {"scheme": "http://www.blogger.com/atom/ns#"}
            )
        ]

        # Drafts have <app:control><app:draft>yes</app:draft></app:control>
        status = "published"
        try:
            if entry.find("control").find("draft").string == "yes":
                status = "draft"
        except AttributeError:
            pass

        yield (
            title,
            content,
            filename,
            date,
            author,
            None,
            tags,
            status,
            kind,
            "html",
        )