Example #1
def wp2fields(xml, wp_custpost=False):
    """Opens a wordpress XML file, and yield Pelican fields"""

    soup = xml_to_soup(xml)
    items = soup.rss.channel.findAll('item')
    for item in items:

        if item.find('status').string in ["publish", "draft"]:

            try:
                # Use HTMLParser due to issues with BeautifulSoup 3
                title = unescape(item.title.contents[0])
            except IndexError:
                title = 'No title [%s]' % item.find('post_name').string
                logger.warning('Post "%s" is lacking a proper title', title)

            post_name = item.find('post_name').string
            post_id = item.find('post_id').string
            filename = get_filename(post_name, post_id)

            content = item.find('encoded').string
            raw_date = item.find('post_date').string
            if raw_date == u'0000-00-00 00:00:00':
                date = None
            else:
                date_object = SafeDatetime.strptime(raw_date,
                                                    '%Y-%m-%d %H:%M:%S')
                date = date_object.strftime('%Y-%m-%d %H:%M')
            author = item.find('creator').string

            categories = [
                cat.string
                for cat in item.findAll('category', {'domain': 'category'})
            ]

            tags = [
                tag.string
                for tag in item.findAll('category', {'domain': 'post_tag'})
            ]
            # To publish a post the status should be 'published'
            status = 'published' if item.find('status').string == "publish" \
                else item.find('status').string

            kind = 'article'
            post_type = item.find('post_type').string
            if post_type == 'page':
                kind = 'page'
            elif wp_custpost:
                if post_type == 'post':
                    pass
                # Old behaviour was to name everything not a page as an
                # article. Theoretically all attachments have status == inherit
                # so no attachments should be here. But this statement is to
                # maintain existing behaviour in case that doesn't hold true.
                elif post_type == 'attachment':
                    pass
                else:
                    kind = post_type
            yield (title, content, filename, date, author, categories, tags,
                   status, kind, 'wp-html')
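
A minimal consumption sketch (not part of the example): it assumes wp2fields() and its helpers are importable, and 'wordpress_export.xml' is a hypothetical WXR export path; the field order mirrors the yield above.

for (title, content, filename, date, author, categories, tags,
     status, kind, markup) in wp2fields('wordpress_export.xml'):
    # Each item is one published or draft post converted to Pelican fields.
    print('%s (%s, %s)' % (title, kind, status))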
Example #2
def posterous2fields(api_token, email, password):
    """Imports posterous posts"""
    import base64
    from datetime import timedelta

    try:
        # py3k import
        import json
    except ImportError:
        # py2 import
        import simplejson as json

    try:
        # py3k import
        import urllib.request as urllib_request
    except ImportError:
        # py2 import
        import urllib2 as urllib_request

    def get_posterous_posts(api_token, email, password, page=1):
        base64string = base64.encodestring(
            ("%s:%s" % (email, password)).encode("utf-8")
        ).replace(b"\n", b"")
        url = ("http://posterous.com/api/v2/users/me/sites/primary/"
               "posts?api_token=%s&page=%d") % (api_token, page)
        request = urllib_request.Request(url)
        request.add_header("Authorization", "Basic %s" % base64string.decode())
        handle = urllib_request.urlopen(request)
        posts = json.loads(handle.read().decode("utf-8"))
        return posts

    page = 1
    posts = get_posterous_posts(api_token, email, password, page)
    while len(posts) > 0:
        posts = get_posterous_posts(api_token, email, password, page)
        page += 1

        for post in posts:
            slug = post.get("slug")
            if not slug:
                slug = slugify(post.get("title"))
            tags = [tag.get("name") for tag in post.get("tags")]
            raw_date = post.get("display_date")
            date_object = SafeDatetime.strptime(raw_date[:-6], "%Y/%m/%d %H:%M:%S")
            offset = int(raw_date[-5:])
            delta = timedelta(hours=(offset / 100))
            date_object -= delta
            date = date_object.strftime("%Y-%m-%d %H:%M")
            kind = "article"  # TODO: Recognise pages
            status = "published"  # TODO: Find a way for draft posts

            yield (
                post.get("title"),
                post.get("body_cleaned"),
                slug,
                date,
                post.get("user").get("display_name"),
                [],
                tags,
                status,
                kind,
                "html",
            )
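
Note that base64.encodestring was removed in Python 3.9. A minimal sketch of building the same Basic-auth header with base64.b64encode (Python 3 only; the helper name is hypothetical) avoids both the removed call and the newline stripping:

import base64

def make_basic_auth_header(email, password):
    # b64encode inserts no newlines, so no replace() step is needed.
    token = base64.b64encode(("%s:%s" % (email, password)).encode("utf-8"))
    return "Basic %s" % token.decode("ascii")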
Example #3
def wp2fields(xml, wp_custpost=False):
    """Opens a wordpress XML file, and yield Pelican fields"""

    soup = xml_to_soup(xml)
    items = soup.rss.channel.findAll('item')
    for item in items:

        if item.find('status').string in ["publish", "draft"]:

            try:
                # Use HTMLParser due to issues with BeautifulSoup 3
                title = unescape(item.title.contents[0])
            except IndexError:
                title = 'No title [%s]' % item.find('post_name').string
                logger.warning('Post "%s" is lacking a proper title', title)

            post_name = item.find('post_name').string
            post_id = item.find('post_id').string
            filename = get_filename(post_name, post_id)

            content = item.find('encoded').string
            raw_date = item.find('post_date').string
            if raw_date == u'0000-00-00 00:00:00':
                date = None
            else:
                date_object = SafeDatetime.strptime(
                    raw_date, '%Y-%m-%d %H:%M:%S')
                date = date_object.strftime('%Y-%m-%d %H:%M')
            author = item.find('creator').string

            categories = [cat.string for cat
                          in item.findAll('category', {'domain': 'category'})]

            tags = [tag.string for tag
                    in item.findAll('category', {'domain': 'post_tag'})]
            # To publish a post the status should be 'published'
            status = 'published' if item.find('status').string == "publish" \
                else item.find('status').string

            kind = 'article'
            post_type = item.find('post_type').string
            if post_type == 'page':
                kind = 'page'
            elif wp_custpost:
                if post_type == 'post':
                    pass
                # Old behaviour was to name everything not a page as an
                # article. Theoretically all attachments have status == inherit
                # so no attachments should be here. But this statement is to
                # maintain existing behaviour in case that doesn't hold true.
                elif post_type == 'attachment':
                    pass
                else:
                    kind = post_type
            yield (title, content, filename, date, author, categories,
                   tags, status, kind, 'wp-html')
Example #4
def posterous2fields(api_token, email, password):
    """Imports posterous posts"""
    import base64
    from datetime import timedelta
    try:
        # py3k import
        import json
    except ImportError:
        # py2 import
        import simplejson as json

    try:
        # py3k import
        import urllib.request as urllib_request
    except ImportError:
        # py2 import
        import urllib2 as urllib_request

    def get_posterous_posts(api_token, email, password, page=1):
        base64string = base64.encodestring(
            ("%s:%s" % (email, password)).encode('utf-8')).replace('\n', '')
        url = ("http://posterous.com/api/v2/users/me/sites/primary/"
               "posts?api_token=%s&page=%d") % (api_token, page)
        request = urllib_request.Request(url)
        request.add_header('Authorization', 'Basic %s' % base64string.decode())
        handle = urllib_request.urlopen(request)
        posts = json.loads(handle.read().decode('utf-8'))
        return posts

    page = 1
    posts = get_posterous_posts(api_token, email, password, page)
    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    while len(posts) > 0:
        posts = get_posterous_posts(api_token, email, password, page)
        page += 1

        for post in posts:
            slug = post.get('slug')
            if not slug:
                slug = slugify(post.get('title'), regex_subs=subs)
            tags = [tag.get('name') for tag in post.get('tags')]
            raw_date = post.get('display_date')
            date_object = SafeDatetime.strptime(raw_date[:-6],
                                                '%Y/%m/%d %H:%M:%S')
            offset = int(raw_date[-5:])
            delta = timedelta(hours=(offset / 100))
            date_object -= delta
            date = date_object.strftime('%Y-%m-%d %H:%M')
            kind = 'article'  # TODO: Recognise pages
            status = 'published'  # TODO: Find a way for draft posts

            yield (post.get('title'), post.get('body_cleaned'), slug, date,
                   post.get('user').get('display_name'), [], tags, status,
                   kind, 'html')
Example #5
def posterous2fields(api_token, email, password):
    """Imports posterous posts"""
    import base64
    from datetime import timedelta
    try:
        # py3k import
        import json
    except ImportError:
        # py2 import
        import simplejson as json

    try:
        # py3k import
        import urllib.request as urllib_request
    except ImportError:
        # py2 import
        import urllib2 as urllib_request

    def get_posterous_posts(api_token, email, password, page=1):
        base64string = base64.encodestring(
            ("%s:%s" % (email, password)).encode('utf-8')).replace('\n', '')
        url = ("http://posterous.com/api/v2/users/me/sites/primary/"
               "posts?api_token=%s&page=%d") % (api_token, page)
        request = urllib_request.Request(url)
        request.add_header('Authorization', 'Basic %s' % base64string.decode())
        handle = urllib_request.urlopen(request)
        posts = json.loads(handle.read().decode('utf-8'))
        return posts

    page = 1
    posts = get_posterous_posts(api_token, email, password, page)
    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    while len(posts) > 0:
        posts = get_posterous_posts(api_token, email, password, page)
        page += 1

        for post in posts:
            slug = post.get('slug')
            if not slug:
                slug = slugify(post.get('title'), regex_subs=subs)
            tags = [tag.get('name') for tag in post.get('tags')]
            raw_date = post.get('display_date')
            date_object = SafeDatetime.strptime(
                raw_date[:-6], '%Y/%m/%d %H:%M:%S')
            offset = int(raw_date[-5:])
            delta = timedelta(hours=(offset / 100))
            date_object -= delta
            date = date_object.strftime('%Y-%m-%d %H:%M')
            kind = 'article'      # TODO: Recognise pages
            status = 'published'  # TODO: Find a way for draft posts

            yield (post.get('title'), post.get('body_cleaned'),
                   slug, date, post.get('user').get('display_name'),
                   [], tags, status, kind, 'html')
Example #6
def blogger2fields(xml):
    """Opens a blogger XML file, and yield Pelican fields"""

    soup = xml_to_soup(xml)
    entries = soup.feed.findAll('entry')
    for entry in entries:
        raw_kind = entry.find('category', {
            'scheme': 'http://schemas.google.com/g/2005#kind'
        }).get('term')
        if raw_kind == 'http://schemas.google.com/blogger/2008/kind#post':
            kind = 'article'
        elif raw_kind == 'http://schemas.google.com/blogger/2008/kind#comment':
            kind = 'comment'
        elif raw_kind == 'http://schemas.google.com/blogger/2008/kind#page':
            kind = 'page'
        else:
            continue

        try:
            assert kind != 'comment'
            filename = entry.find('link', {'rel': 'alternate'})['href']
            filename = os.path.splitext(os.path.basename(filename))[0]
        except (AssertionError, TypeError, KeyError):
            filename = entry.find('id').string.split('.')[-1]

        title = entry.find('title').string or ''

        content = entry.find('content').string
        raw_date = entry.find('published').string
        if hasattr(SafeDatetime, 'fromisoformat'):
            date_object = SafeDatetime.fromisoformat(raw_date)
        else:
            date_object = SafeDatetime.strptime(raw_date[:23],
                                                '%Y-%m-%dT%H:%M:%S.%f')
        date = date_object.strftime('%Y-%m-%d %H:%M')
        author = entry.find('author').find('name').string

        # blogger posts only have tags, no category
        tags = [
            tag.get('term') for tag in entry.findAll(
                'category', {'scheme': 'http://www.blogger.com/atom/ns#'})
        ]

        # Drafts have <app:control><app:draft>yes</app:draft></app:control>
        status = 'published'
        try:
            if entry.find('control').find('draft').string == 'yes':
                status = 'draft'
        except AttributeError:
            pass

        yield (title, content, filename, date, author, None, tags, status,
               kind, 'html')
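
As above, a hedged usage sketch: 'blogger_export.xml' is an assumed export path, and the sixth field (categories) is always None for Blogger, as the comment in the example notes.

drafts = [
    title
    for (title, content, filename, date, author, categories, tags,
         status, kind, markup) in blogger2fields('blogger_export.xml')
    if status == 'draft'
]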
Example #7
def posterous2fields(api_token, email, password):
    """Imports posterous posts"""
    import base64
    from datetime import timedelta
    try:
        # py3k import
        import json
    except ImportError:
        # py2 import
        import simplejson as json

    try:
        # py3k import
        import urllib.request as urllib_request
    except ImportError:
        # py2 import
        import urllib2 as urllib_request

    def get_posterous_posts(api_token, email, password, page=1):
        base64string = base64.encodestring(
            ("%s:%s" % (email, password)).encode('utf-8')).replace(b'\n', b'')
        url = "http://posterous.com/api/v2/users/me/sites/primary/posts?api_token=%s&page=%d" % (
            api_token, page)
        request = urllib_request.Request(url)
        request.add_header("Authorization", "Basic %s" % base64string.decode())
        handle = urllib_request.urlopen(request)
        posts = json.loads(handle.read().decode('utf-8'))
        return posts

    page = 1
    posts = get_posterous_posts(api_token, email, password, page)
    while len(posts) > 0:
        posts = get_posterous_posts(api_token, email, password, page)
        page += 1

        for post in posts:
            slug = post.get('slug')
            if not slug:
                slug = slugify(post.get('title'))
            tags = [tag.get('name') for tag in post.get('tags')]
            raw_date = post.get('display_date')
            date_object = SafeDatetime.strptime(raw_date[:-6],
                                                "%Y/%m/%d %H:%M:%S")
            offset = int(raw_date[-5:])
            delta = timedelta(hours=offset / 100)
            date_object -= delta
            date = date_object.strftime("%Y-%m-%d %H:%M")
            kind = 'article'  # TODO: Recognise pages

            yield (post.get('title'), post.get('body_cleaned'), slug, date,
                   post.get('user').get('display_name'), [], tags, kind,
                   "html")
Example #8
def blogger2fields(xml):
    """Opens a blogger XML file, and yield Pelican fields"""

    soup = xml_to_soup(xml)
    entries = soup.feed.findAll('entry')
    for entry in entries:
        raw_kind = entry.find(
            'category', {'scheme': 'http://schemas.google.com/g/2005#kind'}
        ).get('term')
        if raw_kind == 'http://schemas.google.com/blogger/2008/kind#post':
            kind = 'article'
        elif raw_kind == 'http://schemas.google.com/blogger/2008/kind#comment':
            kind = 'comment'
        elif raw_kind == 'http://schemas.google.com/blogger/2008/kind#page':
            kind = 'page'
        else:
            continue

        try:
            assert kind != 'comment'
            filename = entry.find('link', {'rel': 'alternate'})['href']
            filename = os.path.splitext(os.path.basename(filename))[0]
        except (AssertionError, TypeError, KeyError):
            filename = entry.find('id').string.split('.')[-1]

        title = entry.find('title').string or ''

        content = entry.find('content').string
        raw_date = entry.find('published').string
        if hasattr(SafeDatetime, 'fromisoformat'):
            date_object = SafeDatetime.fromisoformat(raw_date)
        else:
            date_object = SafeDatetime.strptime(
                raw_date[:23], '%Y-%m-%dT%H:%M:%S.%f')
        date = date_object.strftime('%Y-%m-%d %H:%M')
        author = entry.find('author').find('name').string

        # blogger posts only have tags, no category
        tags = [tag.get('term') for tag in entry.findAll(
            'category', {'scheme': 'http://www.blogger.com/atom/ns#'})]

        # Drafts have <app:control><app:draft>yes</app:draft></app:control>
        status = 'published'
        try:
            if entry.find('control').find('draft').string == 'yes':
                status = 'draft'
        except AttributeError:
            pass

        yield (title, content, filename, date, author, None, tags, status,
               kind, 'html')
Example #9
def datetime_from_period(value):
    """
    Converts "period" into a datetime object.

    On yearly/monthly/daily archive pages, a "period" object is supplied so you
    know what time period the particular archive page is for. This converts it
    to a datetime.datetime object, so it can be further processed.

    If a month is not provided (i.e. the period is for a yearly archive),
    January is assumed. If a day is not provided (i.e. the period is for a
    yearly or monthly archive), the 1st is assumed.

    You can also generate a tuple of (up to three) integers to get a datetime
    out, using the integer representation for the month (1=January, etc).

    If passed a single integer, it is assumed to represent a year.

    Args
    ----
        value (tuple or int): input period

    Returns
    -------
        datetime.datetime: value converted

    """
    if isinstance(value, int):
        value = (value, )

    if len(value) >= 2 and isinstance(value[1], int):
        placeholder_month = SafeDatetime(2021, value[1], 1).strftime("%B")
    elif len(value) == 1:
        placeholder_month = SafeDatetime(2021, 1, 1).strftime("%B")
    else:
        placeholder_month = value[1]

    new_value = " ".join((
        str(value[0]),
        placeholder_month,
        str(value[2]) if len(value) >= 3 else "1",
    ))
    new_datetime = SafeDatetime.strptime(new_value, "%Y %B %d")
    return new_datetime
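
A few worked calls matching the docstring's rules (assuming an English/"C" locale so that the %B month name round-trips through strptime):

datetime_from_period(2024)           # -> datetime for 2024-01-01 (year only)
datetime_from_period((2024, 6))      # -> datetime for 2024-06-01 (day defaults to the 1st)
datetime_from_period((2024, 6, 15))  # -> datetime for 2024-06-15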
Example #10
    def parse(self):
        """Imports posterous posts"""
        from datetime import timedelta

        page = 1
        posts = self._get_posterous_posts(page)
        settings = read_settings()
        subs = settings["SLUG_REGEX_SUBSTITUTIONS"]
        while len(posts) > 0:
            posts = self._get_posterous_posts(page)
            page += 1

            for post in posts:
                slug = post.get("slug")
                if not slug:
                    slug = slugify(post.get("title"), regex_subs=subs)
                tags = [tag.get("name") for tag in post.get("tags")]
                raw_date = post.get("display_date")
                date_object = SafeDatetime.strptime(
                    raw_date[:-6], "%Y/%m/%d %H:%M:%S"
                )
                offset = int(raw_date[-5:])
                delta = timedelta(hours=(offset / 100))
                date_object -= delta
                date = date_object.strftime("%Y-%m-%d %H:%M")
                kind = "article"  # TODO: Recognise pages
                status = "published"  # TODO: Find a way for draft posts

                yield blog2pelican.entities.content.Content(
                    title=post.get("title"),
                    content=post.get("body_cleaned"),
                    slug=slug,
                    date=date,
                    author=post.get("user").get("display_name"),
                    categories=[],
                    tags=tags,
                    status=status,
                    kind=kind,
                    markup="html",
                )
Example #11
    def parse(self):
        """Opens a wordpress XML file, and yield Pelican fields"""

        soup = xml_to_soup(self.filepath)
        items = soup.rss.channel.findAll("item")
        for item in items:

            if item.find("status").string in ["publish", "draft"]:

                try:
                    # Use HTMLParser due to issues with BeautifulSoup 3
                    title = unescape(item.title.contents[0])
                except IndexError:
                    title = "No title [%s]" % item.find("post_name").string
                    logger.warning('Post "%s" is lacking a proper title',
                                   title)

                post_name = item.find("post_name").string
                post_id = item.find("post_id").string
                filename = get_filename(post_name, post_id)

                content = item.find("encoded").string
                raw_date = item.find("post_date").string
                if raw_date == u"0000-00-00 00:00:00":
                    date = None
                else:
                    date_object = SafeDatetime.strptime(
                        raw_date, "%Y-%m-%d %H:%M:%S")
                    date = date_object.strftime("%Y-%m-%d %H:%M")
                author = item.find("creator").string

                categories = [
                    cat.string
                    for cat in item.findAll("category", {"domain": "category"})
                ]

                tags = [
                    tag.string
                    for tag in item.findAll("category", {"domain": "post_tag"})
                ]
                # To publish a post the status should be 'published'
                status = ("published" if item.find("status").string
                          == "publish" else item.find("status").string)

                kind = "article"
                post_type = item.find("post_type").string
                if post_type == "page":
                    kind = "page"
                elif self.wp_custpost:
                    if post_type == "post":
                        pass
                    # Old behaviour was to name everything not a page as an
                    # article. Theoretically all attachments have status == inherit
                    # so no attachments should be here. But this statement is to
                    # maintain existing behaviour in case that doesn't hold true.
                    elif post_type == "attachment":
                        pass
                    else:
                        kind = post_type

                yield blog2pelican.entities.content.Content(
                    title=title,
                    content=content,
                    slug=filename,
                    date=date,
                    author=author,
                    categories=categories,
                    tags=tags,
                    status=status,
                    kind=kind,
                    markup="wp-html",
                )
Example #12
    def parse(self):
        """Opens a blogger XML file, and yield Pelican fields"""
        soup = xml_to_soup(self.filepath)
        entries = soup.feed.findAll("entry")
        for entry in entries:
            raw_kind = entry.find(
                "category", {"scheme": "http://schemas.google.com/g/2005#kind"}
            ).get("term")
            if raw_kind == "http://schemas.google.com/blogger/2008/kind#post":
                kind = "article"
            elif (
                raw_kind
                == "http://schemas.google.com/blogger/2008/kind#comment"
            ):
                kind = "comment"
            elif (
                raw_kind == "http://schemas.google.com/blogger/2008/kind#page"
            ):
                kind = "page"
            else:
                continue

            try:
                assert kind != "comment"
                filename = entry.find("link", {"rel": "alternate"})["href"]
                filename = os.path.splitext(os.path.basename(filename))[0]
            except (AssertionError, TypeError, KeyError):
                filename = entry.find("id").string.split(".")[-1]

            title = entry.find("title").string or ""

            content = entry.find("content").string
            raw_date = entry.find("published").string
            if hasattr(SafeDatetime, "fromisoformat"):
                date_object = SafeDatetime.fromisoformat(raw_date)
            else:
                date_object = SafeDatetime.strptime(
                    raw_date[:23], "%Y-%m-%dT%H:%M:%S.%f"
                )
            date = date_object.strftime("%Y-%m-%d %H:%M")
            author = entry.find("author").find("name").string

            # blogger posts only have tags, no category
            tags = [
                tag.get("term")
                for tag in entry.findAll(
                    "category", {"scheme": "http://www.blogger.com/atom/ns#"}
                )
            ]

            # Drafts have <app:control><app:draft>yes</app:draft></app:control>
            status = "published"
            try:
                if entry.find("control").find("draft").string == "yes":
                    status = "draft"
            except AttributeError:
                pass

            yield (
                title,
                content,
                filename,
                date,
                author,
                None,
                tags,
                status,
                kind,
                "html",
            )
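
Most of the tuple-yielding importers above share the same ten-field order (the older posterous variant omits status, and the two parse() methods yield Content objects instead), so a single consumer can name the fields once. A small sketch, with fields_to_dict() as a hypothetical helper:

FIELD_NAMES = ('title', 'content', 'filename', 'date', 'author',
               'categories', 'tags', 'status', 'kind', 'markup')

def fields_to_dict(fields):
    # Pair each positional value with its name; 'filename' doubles as the
    # slug for the posterous importers, and 'categories' is None for Blogger.
    return dict(zip(FIELD_NAMES, fields))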