Exemplo n.º 1
0
    def import_item(self, item, wordpress_namespace, out_folder=None):
        """Takes an item from the feed and creates a post file."""
        if out_folder is None:
            out_folder = 'posts'

        title = get_text_tag(item, 'title', 'NO TITLE')
        # link is something like http://foo.com/2012/09/01/hello-world/
        # So, take the path, utils.slugify it, and that's our slug
        link = get_text_tag(item, 'link', None)
        parsed = urlparse(link)
        path = unquote(parsed.path.strip('/'))

        # In python 2, path is a str. slug requires a unicode
        # object. According to wikipedia, unquoted strings will
        # usually be UTF8
        if isinstance(path, utils.bytes_str):
            path = path.decode('utf8')

        # Cut out the base directory.
        if path.startswith(self.base_dir.strip('/')):
            path = path.replace(self.base_dir.strip('/'), '', 1)

        pathlist = path.split('/')
        if parsed.query:  # if there are no nice URLs and query strings are used
            out_folder = os.path.join(*([out_folder] + pathlist))
            slug = get_text_tag(item,
                                '{{{0}}}post_name'.format(wordpress_namespace),
                                None)
            if not slug:  # it *may* happen
                slug = get_text_tag(
                    item, '{{{0}}}post_id'.format(wordpress_namespace), None)
            if not slug:  # should never happen
                LOGGER.error("Error converting post:", title)
                return
        else:
            if len(pathlist) > 1:
                out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
            slug = utils.slugify(pathlist[-1])

        description = get_text_tag(item, 'description', '')
        post_date = get_text_tag(
            item, '{{{0}}}post_date'.format(wordpress_namespace), None)
        try:
            dt = utils.to_datetime(post_date)
        except ValueError:
            dt = datetime.datetime(1970, 1, 1, 0, 0, 0)
            LOGGER.error(
                'Malformed date "{0}" in "{1}" [{2}], assuming 1970-01-01 00:00:00 instead.'
                .format(post_date, title, slug))
            post_date = dt.strftime('%Y-%m-%d %H:%M:%S')

        if dt.tzinfo and self.timezone is None:
            self.timezone = utils.get_tzname(dt)
        status = get_text_tag(item,
                              '{{{0}}}status'.format(wordpress_namespace),
                              'publish')
        content = get_text_tag(
            item, '{http://purl.org/rss/1.0/modules/content/}encoded', '')

        tags = []
        if status == 'trash':
            LOGGER.warn(
                'Trashed post "{0}" will not be imported.'.format(title))
            return
        elif status != 'publish':
            tags.append('draft')
            is_draft = True
        else:
            is_draft = False

        for tag in item.findall('category'):
            text = tag.text
            if text == 'Uncategorized':
                continue
            tags.append(text)

        if '$latex' in content:
            tags.append('mathjax')

        if is_draft and self.exclude_drafts:
            LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
        elif content.strip():
            # If no content is found, no files are written.
            self.url_map[link] = (self.context['SITE_URL'] +
                                  out_folder.rstrip('/') + '/' + slug +
                                  '.html').replace(os.sep, '/')
            if hasattr(self, "separate_qtranslate_content") \
               and self.separate_qtranslate_content:
                content_translations = separate_qtranslate_content(content)
            else:
                content_translations = {"": content}
            default_language = self.context["DEFAULT_LANG"]
            for lang, content in content_translations.items():
                if lang:
                    out_meta_filename = slug + '.meta'
                    if lang == default_language:
                        out_content_filename = slug + '.wp'
                    else:
                        out_content_filename \
                            = utils.get_translation_candidate(self.context,
                                                              slug + ".wp", lang)
                        self.extra_languages.add(lang)
                    meta_slug = slug
                else:
                    out_meta_filename = slug + '.meta'
                    out_content_filename = slug + '.wp'
                    meta_slug = slug
                content = self.transform_content(content)
                self.write_metadata(
                    os.path.join(self.output_folder, out_folder,
                                 out_meta_filename), title, meta_slug,
                    post_date, description, tags)
                self.write_content(
                    os.path.join(self.output_folder, out_folder,
                                 out_content_filename), content)
        else:
            LOGGER.warn('Not going to import "{0}" because it seems to contain'
                        ' no content.'.format(title))
Exemplo n.º 2
0
    def import_item(self, item, wordpress_namespace, out_folder=None):
        """Takes an item from the feed and creates a post file."""
        if out_folder is None:
            out_folder = 'posts'

        title = get_text_tag(item, 'title', 'NO TITLE')
        # link is something like http://foo.com/2012/09/01/hello-world/
        # So, take the path, utils.slugify it, and that's our slug
        link = get_text_tag(item, 'link', None)
        path = unquote(urlparse(link).path)

        # In python 2, path is a str. slug requires a unicode
        # object. According to wikipedia, unquoted strings will
        # usually be UTF8
        if isinstance(path, utils.bytes_str):
            path = path.decode('utf8')
        slug = utils.slugify(path)
        if not slug:  # it happens if the post has no "nice" URL
            slug = get_text_tag(
                item, '{{{0}}}post_name'.format(wordpress_namespace), None)
        if not slug:  # it *may* happen
            slug = get_text_tag(
                item, '{{{0}}}post_id'.format(wordpress_namespace), None)
        if not slug:  # should never happen
            print("Error converting post:", title)
            return

        description = get_text_tag(item, 'description', '')
        post_date = get_text_tag(
            item, '{{{0}}}post_date'.format(wordpress_namespace), None)
        dt = utils.to_datetime(post_date)
        if dt.tzinfo and self.timezone is None:
            self.timezone = utils.get_tzname(dt)
        status = get_text_tag(
            item, '{{{0}}}status'.format(wordpress_namespace), 'publish')
        content = get_text_tag(
            item, '{http://purl.org/rss/1.0/modules/content/}encoded', '')

        tags = []
        if status == 'trash':
            print('Trashed post "{0}" will not be imported.'.format(title))
            return
        elif status != 'publish':
            tags.append('draft')
            is_draft = True
        else:
            is_draft = False

        for tag in item.findall('category'):
            text = tag.text
            if text == 'Uncategorized':
                continue
            tags.append(text)

        if is_draft and self.exclude_drafts:
            print('Draft "{0}" will not be imported.'.format(title))
        elif content.strip():
            # If no content is found, no files are written.
            self.url_map[link] = self.context['SITE_URL'] + '/' + \
                out_folder + '/' + slug + '.html'

            content = self.transform_content(content)

            self.write_metadata(os.path.join(self.output_folder, out_folder,
                                             slug + '.meta'),
                                title, slug, post_date, description, tags)
            self.write_content(
                os.path.join(self.output_folder, out_folder, slug + '.wp'),
                content)
        else:
            print('Not going to import "{0}" because it seems to contain'
                  ' no content.'.format(title))
Exemplo n.º 3
0
    def import_item(self, item, wordpress_namespace, out_folder=None):
        """Takes an item from the feed and creates a post file."""
        if out_folder is None:
            out_folder = 'posts'

        title = get_text_tag(item, 'title', 'NO TITLE')
        # link is something like http://foo.com/2012/09/01/hello-world/
        # So, take the path, utils.slugify it, and that's our slug
        link = get_text_tag(item, 'link', None)
        path = unquote(urlparse(link).path)

        # In python 2, path is a str. slug requires a unicode
        # object. According to wikipedia, unquoted strings will
        # usually be UTF8
        if isinstance(path, utils.bytes_str):
            path = path.decode('utf8')
        # Remove date form the slug
        slug = utils.slugify(path)[8:]
        if not slug:  # it happens if the post has no "nice" URL
            slug = get_text_tag(
                item, '{{{0}}}post_name'.format(wordpress_namespace), None)
        if not slug:  # it *may* happen
            slug = get_text_tag(
                item, '{{{0}}}post_id'.format(wordpress_namespace), None)
        if not slug:  # should never happen
            print("Error converting post:", title)
            return

        description = get_text_tag(item, 'description', '')
        post_date = get_text_tag(
            item, '{{{0}}}post_date'.format(wordpress_namespace), None)
        try:
            dt = utils.to_datetime(post_date)
        except:
            import epdb;epdb.set_trace()
        if dt.tzinfo and self.timezone is None:
            self.timezone = utils.get_tzname(dt)
        status = get_text_tag(
            item, '{{{0}}}status'.format(wordpress_namespace), 'publish')
        content = get_text_tag(
            item, '{http://purl.org/rss/1.0/modules/content/}encoded', '')

        tags = []
        if status == 'trash':
            print('Trashed post "{0}" will not be imported.'.format(title))
            return
        elif status != 'publish':
            tags.append('draft')
            is_draft = True
        else:
            is_draft = False

        for tag in item.findall('category'):
            text = tag.text
            if text == 'Uncategorized':
                continue
            tags.append(text)

        if is_draft and self.exclude_drafts:
            print('Draft "{0}" will not be imported.'.format(title))
        elif content.strip():
            # If no content is found, no files are written.
            self.url_map[link] = self.context['SITE_URL'] + '/' + \
                out_folder + '/' + slug + '.html'

            content = self.transform_content(content)

            self.write_metadata(os.path.join(self.output_folder, out_folder,
                                             slug + '.meta'),
                                title, slug, post_date, description, tags)
            self.write_content(
                os.path.join(self.output_folder, out_folder, slug + '.wp'),
                content)
        else:
            print('Not going to import "{0}" because it seems to contain'
                  ' no content.'.format(title))
Exemplo n.º 4
0
    def import_item(self, item, wordpress_namespace, out_folder=None):
        """Takes an item from the feed and creates a post file."""
        if out_folder is None:
            out_folder = 'posts'

        title = get_text_tag(item, 'title', 'NO TITLE')
        # link is something like http://foo.com/2012/09/01/hello-world/
        # So, take the path, utils.slugify it, and that's our slug
        link = get_text_tag(item, 'link', None)
        path = unquote(urlparse(link).path.strip('/'))

        # In python 2, path is a str. slug requires a unicode
        # object. According to wikipedia, unquoted strings will
        # usually be UTF8
        if isinstance(path, utils.bytes_str):
            path = path.decode('utf8')
        pathlist = path.split('/')
        if len(pathlist) > 1:
            out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
        slug = utils.slugify(pathlist[-1])
        if not slug:  # it happens if the post has no "nice" URL
            slug = get_text_tag(
                item, '{{{0}}}post_name'.format(wordpress_namespace), None)
        if not slug:  # it *may* happen
            slug = get_text_tag(
                item, '{{{0}}}post_id'.format(wordpress_namespace), None)
        if not slug:  # should never happen
            LOGGER.error("Error converting post:", title)
            return

        description = get_text_tag(item, 'description', '')
        post_date = get_text_tag(
            item, '{{{0}}}post_date'.format(wordpress_namespace), None)
        dt = utils.to_datetime(post_date)
        if dt.tzinfo and self.timezone is None:
            self.timezone = utils.get_tzname(dt)
        status = get_text_tag(
            item, '{{{0}}}status'.format(wordpress_namespace), 'publish')
        content = get_text_tag(
            item, '{http://purl.org/rss/1.0/modules/content/}encoded', '')

        tags = []
        if status == 'trash':
            LOGGER.warn('Trashed post "{0}" will not be imported.'.format(title))
            return
        elif status != 'publish':
            tags.append('draft')
            is_draft = True
        else:
            is_draft = False

        for tag in item.findall('category'):
            text = tag.text
            if text == 'Uncategorized':
                continue
            tags.append(text)

        if '$latex' in content:
            tags.append('mathjax')

        if is_draft and self.exclude_drafts:
            LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
        elif content.strip():
            # If no content is found, no files are written.
            self.url_map[link] = (self.context['SITE_URL'] + out_folder + '/'
                                  + slug + '.html')
            if hasattr(self, "separate_qtranslate_content") \
               and self.separate_qtranslate_content:
                content_translations = separate_qtranslate_content(content)
            else:
                content_translations = {"": content}
            default_language = self.context["DEFAULT_LANG"]
            for lang, content in content_translations.items():
                if lang:
                    out_meta_filename = slug + '.meta'
                    if lang == default_language:
                        out_content_filename = slug + '.wp'
                    else:
                        out_content_filename \
                            = utils.get_translation_candidate(self.context,
                                                              slug + ".wp", lang)
                        self.extra_languages.add(lang)
                    meta_slug = slug
                else:
                    out_meta_filename = slug + '.meta'
                    out_content_filename = slug + '.wp'
                    meta_slug = slug
                content = self.transform_content(content)
                self.write_metadata(os.path.join(self.output_folder, out_folder,
                                                 out_meta_filename),
                                    title, meta_slug, post_date, description, tags)
                self.write_content(
                    os.path.join(self.output_folder,
                                 out_folder, out_content_filename),
                    content)
        else:
            LOGGER.warn('Not going to import "{0}" because it seems to contain'
                        ' no content.'.format(title))
Exemplo n.º 5
0
    def import_postpage_item(self, item, wordpress_namespace, out_folder=None, attachments=None):
        """Take an item from the feed and creates a post file."""
        if out_folder is None:
            out_folder = 'posts'

        title = get_text_tag(item, 'title', 'NO TITLE')
        # link is something like http://foo.com/2012/09/01/hello-world/
        # So, take the path, utils.slugify it, and that's our slug
        link = get_text_tag(item, 'link', None)
        parsed = urlparse(link)
        path = unquote(parsed.path.strip('/'))

        try:
            path = path.decode('utf8')
        except AttributeError:
            pass

        # Cut out the base directory.
        if path.startswith(self.base_dir.strip('/')):
            path = path.replace(self.base_dir.strip('/'), '', 1)

        pathlist = path.split('/')
        if parsed.query:  # if there are no nice URLs and query strings are used
            out_folder = os.path.join(*([out_folder] + pathlist))
            slug = get_text_tag(
                item, '{{{0}}}post_name'.format(wordpress_namespace), None)
            if not slug:  # it *may* happen
                slug = get_text_tag(
                    item, '{{{0}}}post_id'.format(wordpress_namespace), None)
            if not slug:  # should never happen
                LOGGER.error("Error converting post:", title)
                return False
        else:
            if len(pathlist) > 1:
                out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
            slug = utils.slugify(pathlist[-1])

        description = get_text_tag(item, 'description', '')
        post_date = get_text_tag(
            item, '{{{0}}}post_date'.format(wordpress_namespace), None)
        try:
            dt = utils.to_datetime(post_date)
        except ValueError:
            dt = datetime.datetime(1970, 1, 1, 0, 0, 0)
            LOGGER.error('Malformed date "{0}" in "{1}" [{2}], assuming 1970-01-01 00:00:00 instead.'.format(post_date, title, slug))
            post_date = dt.strftime('%Y-%m-%d %H:%M:%S')

        if dt.tzinfo and self.timezone is None:
            self.timezone = utils.get_tzname(dt)
        status = get_text_tag(
            item, '{{{0}}}status'.format(wordpress_namespace), 'publish')
        content = get_text_tag(
            item, '{http://purl.org/rss/1.0/modules/content/}encoded', '')
        excerpt = get_text_tag(
            item, '{http://wordpress.org/export/1.2/excerpt/}encoded', None)

        if excerpt is not None:
            if len(excerpt) == 0:
                excerpt = None

        tags = []
        categories = []
        if status == 'trash':
            LOGGER.warn('Trashed post "{0}" will not be imported.'.format(title))
            return False
        elif status == 'private':
            tags.append('private')
            is_draft = False
            is_private = True
        elif status != 'publish':
            tags.append('draft')
            is_draft = True
            is_private = False
        else:
            is_draft = False
            is_private = False

        for tag in item.findall('category'):
            text = tag.text
            type = 'category'
            if 'domain' in tag.attrib:
                type = tag.attrib['domain']
            if text == 'Uncategorized' and type == 'category':
                continue
            self.all_tags.add(text)
            if type == 'category':
                categories.append(type)
            else:
                tags.append(text)

        if '$latex' in content:
            tags.append('mathjax')

        # Find post format if it's there
        post_format = 'wp'
        format_tag = [x for x in item.findall('*//{%s}meta_key' % wordpress_namespace) if x.text == '_tc_post_format']
        if format_tag:
            post_format = format_tag[0].getparent().find('{%s}meta_value' % wordpress_namespace).text
            if post_format == 'wpautop':
                post_format = 'wp'

        if is_draft and self.exclude_drafts:
            LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
            return False
        elif is_private and self.exclude_privates:
            LOGGER.notice('Private post "{0}" will not be imported.'.format(title))
            return False
        elif content.strip() or self.import_empty_items:
            # If no content is found, no files are written.
            self.url_map[link] = (self.context['SITE_URL'] +
                                  out_folder.rstrip('/') + '/' + slug +
                                  '.html').replace(os.sep, '/')
            if hasattr(self, "separate_qtranslate_content") \
               and self.separate_qtranslate_content:
                content_translations = separate_qtranslate_content(content)
            else:
                content_translations = {"": content}
            default_language = self.context["DEFAULT_LANG"]
            for lang, content in content_translations.items():
                try:
                    content, extension, rewrite_html = self.transform_content(content, post_format, attachments)
                except:
                    LOGGER.error(('Cannot interpret post "{0}" (language {1}) with post ' +
                                  'format {2}!').format(os.path.join(out_folder, slug), lang, post_format))
                    return False
                if lang:
                    out_meta_filename = slug + '.meta'
                    if lang == default_language:
                        out_content_filename = slug + '.' + extension
                    else:
                        out_content_filename \
                            = utils.get_translation_candidate(self.context,
                                                              slug + "." + extension, lang)
                        self.extra_languages.add(lang)
                    meta_slug = slug
                else:
                    out_meta_filename = slug + '.meta'
                    out_content_filename = slug + '.' + extension
                    meta_slug = slug
                tags, other_meta = self._create_metadata(status, excerpt, tags, categories,
                                                         post_name=os.path.join(out_folder, slug))
                self.write_metadata(os.path.join(self.output_folder, out_folder,
                                                 out_meta_filename),
                                    title, meta_slug, post_date, description, tags, **other_meta)
                self.write_content(
                    os.path.join(self.output_folder,
                                 out_folder, out_content_filename),
                    content,
                    rewrite_html)

            if self.export_comments:
                comments = []
                for tag in item.findall('{{{0}}}comment'.format(wordpress_namespace)):
                    comment = self._extract_comment(tag, wordpress_namespace)
                    if comment is not None:
                        comments.append(comment)

                for comment in comments:
                    comment_filename = slug + "." + str(comment['id']) + ".wpcomment"
                    self._write_comment(os.path.join(self.output_folder, out_folder, comment_filename), comment)

            return (out_folder, slug)
        else:
            LOGGER.warn(('Not going to import "{0}" because it seems to contain'
                         ' no content.').format(title))
            return False
Exemplo n.º 6
0
    def import_item(self, item, wordpress_namespace, out_folder=None):
        """Takes an item from the feed and creates a post file."""
        if out_folder is None:
            out_folder = "posts"

        title = get_text_tag(item, "title", "NO TITLE")
        # link is something like http://foo.com/2012/09/01/hello-world/
        # So, take the path, utils.slugify it, and that's our slug
        link = get_text_tag(item, "link", None)
        path = unquote(urlparse(link).path.strip("/"))

        # In python 2, path is a str. slug requires a unicode
        # object. According to wikipedia, unquoted strings will
        # usually be UTF8
        if isinstance(path, utils.bytes_str):
            path = path.decode("utf8")
        pathlist = path.split("/")
        if len(pathlist) > 1:
            out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
        slug = utils.slugify(pathlist[-1])
        if not slug:  # it happens if the post has no "nice" URL
            slug = get_text_tag(item, "{{{0}}}post_name".format(wordpress_namespace), None)
        if not slug:  # it *may* happen
            slug = get_text_tag(item, "{{{0}}}post_id".format(wordpress_namespace), None)
        if not slug:  # should never happen
            LOGGER.error("Error converting post:", title)
            return

        description = get_text_tag(item, "description", "")
        post_date = get_text_tag(item, "{{{0}}}post_date".format(wordpress_namespace), None)
        dt = utils.to_datetime(post_date)
        if dt.tzinfo and self.timezone is None:
            self.timezone = utils.get_tzname(dt)
        status = get_text_tag(item, "{{{0}}}status".format(wordpress_namespace), "publish")
        content = get_text_tag(item, "{http://purl.org/rss/1.0/modules/content/}encoded", "")

        tags = []
        if status == "trash":
            LOGGER.warn('Trashed post "{0}" will not be imported.'.format(title))
            return
        elif status != "publish":
            tags.append("draft")
            is_draft = True
        else:
            is_draft = False

        for tag in item.findall("category"):
            text = tag.text
            if text == "Uncategorized":
                continue
            tags.append(text)

        if is_draft and self.exclude_drafts:
            LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
        elif content.strip():
            # If no content is found, no files are written.
            self.url_map[link] = self.context["SITE_URL"] + "/" + out_folder + "/" + slug + ".html"

            content = self.transform_content(content)

            self.write_metadata(
                os.path.join(self.output_folder, out_folder, slug + ".meta"), title, slug, post_date, description, tags
            )
            self.write_content(os.path.join(self.output_folder, out_folder, slug + ".wp"), content)
        else:
            LOGGER.warn('Not going to import "{0}" because it seems to contain' " no content.".format(title))
Exemplo n.º 7
0
    def import_item(self, item, wordpress_namespace, out_folder=None):
        """Takes an item from the feed and creates a post file."""
        if out_folder is None:
            out_folder = 'posts'

        title = get_text_tag(item, 'title', 'NO TITLE')
        # link is something like http://foo.com/2012/09/01/hello-world/
        # So, take the path, utils.slugify it, and that's our slug
        link = get_text_tag(item, 'link', None)
        parsed = urlparse(link)
        path = unquote(parsed.path.strip('/'))

        # In python 2, path is a str. slug requires a unicode
        # object. According to wikipedia, unquoted strings will
        # usually be UTF8
        if isinstance(path, utils.bytes_str):
            path = path.decode('utf8')

        # Cut out the base directory.
        if path.startswith(self.base_dir.strip('/')):
            path = path.replace(self.base_dir.strip('/'), '', 1)

        pathlist = path.split('/')
        if parsed.query:  # if there are no nice URLs and query strings are used
            out_folder = os.path.join(*([out_folder] + pathlist))
            slug = get_text_tag(
                item, '{{{0}}}post_name'.format(wordpress_namespace), None)
            if not slug:  # it *may* happen
                slug = get_text_tag(
                    item, '{{{0}}}post_id'.format(wordpress_namespace), None)
            if not slug:  # should never happen
                LOGGER.error("Error converting post:", title)
                return
        else:
            if len(pathlist) > 1:
                out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
            slug = utils.slugify(pathlist[-1])

        description = get_text_tag(item, 'description', '')
        post_date = get_text_tag(
            item, '{{{0}}}post_date'.format(wordpress_namespace), None)
        try:
            dt = utils.to_datetime(post_date)
        except ValueError:
            dt = datetime.datetime(1970, 1, 1, 0, 0, 0)
            LOGGER.error('Malformed date "{0}" in "{1}" [{2}], assuming 1970-01-01 00:00:00 instead.'.format(post_date, title, slug))
            post_date = dt.strftime('%Y-%m-%d %H:%M:%S')

        if dt.tzinfo and self.timezone is None:
            self.timezone = utils.get_tzname(dt)
        status = get_text_tag(
            item, '{{{0}}}status'.format(wordpress_namespace), 'publish')
        content = get_text_tag(
            item, '{http://purl.org/rss/1.0/modules/content/}encoded', '')

        tags = []
        if status == 'trash':
            LOGGER.warn('Trashed post "{0}" will not be imported.'.format(title))
            return
        elif status != 'publish':
            tags.append('draft')
            is_draft = True
        else:
            is_draft = False

        for tag in item.findall('category'):
            text = tag.text
            if text == 'Uncategorized':
                continue
            tags.append(text)

        if '$latex' in content:
            tags.append('mathjax')

        # Find post format if it's there
        post_format = 'wp'
        format_tag = [x for x in item.findall('*//{%s}meta_key' % wordpress_namespace) if x.text == '_tc_post_format']
        if format_tag:
            post_format = format_tag[0].getparent().find('{%s}meta_value' % wordpress_namespace).text

        if is_draft and self.exclude_drafts:
            LOGGER.notice('Draft "{0}" will not be imported.'.format(title))

        elif content.strip():
            # If no content is found, no files are written.
            self.url_map[link] = (self.context['SITE_URL'] +
                                  out_folder.rstrip('/') + '/' + slug +
                                  '.html').replace(os.sep, '/')
            if hasattr(self, "separate_qtranslate_content") \
               and self.separate_qtranslate_content:
                content_translations = separate_qtranslate_content(content)
            else:
                content_translations = {"": content}
            default_language = self.context["DEFAULT_LANG"]
            for lang, content in content_translations.items():
                if lang:
                    out_meta_filename = slug + '.meta'
                    if lang == default_language:
                        out_content_filename = slug + '.wp'
                    else:
                        out_content_filename \
                            = utils.get_translation_candidate(self.context,
                                                              slug + ".wp", lang)
                        self.extra_languages.add(lang)
                    meta_slug = slug
                else:
                    out_meta_filename = slug + '.meta'
                    out_content_filename = slug + '.wp'
                    meta_slug = slug
                if post_format == 'wp':
                    content = self.transform_content(content)
                self.write_metadata(os.path.join(self.output_folder, out_folder,
                                                 out_meta_filename),
                                    title, meta_slug, post_date, description, tags)
                self.write_content(
                    os.path.join(self.output_folder,
                                 out_folder, out_content_filename),
                    content)
        else:
            LOGGER.warn('Not going to import "{0}" because it seems to contain'
                        ' no content.'.format(title))
Exemplo n.º 8
0
    def import_item(self, item, wordpress_namespace, out_folder=None):
        """Takes an item from the feed and creates a post file."""
        if out_folder is None:
            out_folder = "posts"

        title = get_text_tag(item, "title", "NO TITLE")
        # link is something like http://foo.com/2012/09/01/hello-world/
        # So, take the path, utils.slugify it, and that's our slug
        link = get_text_tag(item, "link", None)
        parsed = urlparse(link)
        path = unquote(parsed.path.strip("/"))

        # In python 2, path is a str. slug requires a unicode
        # object. According to wikipedia, unquoted strings will
        # usually be UTF8
        if isinstance(path, utils.bytes_str):
            path = path.decode("utf8")

        # Cut out the base directory.
        if path.startswith(self.base_dir.strip("/")):
            path = path.replace(self.base_dir.strip("/"), "", 1)

        pathlist = path.split("/")
        if parsed.query:  # if there are no nice URLs and query strings are used
            out_folder = os.path.join(*([out_folder] + pathlist))
            slug = get_text_tag(item, "{{{0}}}post_name".format(wordpress_namespace), None)
            if not slug:  # it *may* happen
                slug = get_text_tag(item, "{{{0}}}post_id".format(wordpress_namespace), None)
            if not slug:  # should never happen
                LOGGER.error("Error converting post:", title)
                return
        else:
            if len(pathlist) > 1:
                out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
            slug = utils.slugify(pathlist[-1])

        description = get_text_tag(item, "description", "")
        post_date = get_text_tag(item, "{{{0}}}post_date".format(wordpress_namespace), None)
        try:
            dt = utils.to_datetime(post_date)
        except ValueError:
            dt = datetime.datetime(1970, 1, 1, 0, 0, 0)
            LOGGER.error(
                'Malformed date "{0}" in "{1}" [{2}], assuming 1970-01-01 00:00:00 instead.'.format(
                    post_date, title, slug
                )
            )
            post_date = dt.strftime("%Y-%m-%d %H:%M:%S")

        if dt.tzinfo and self.timezone is None:
            self.timezone = utils.get_tzname(dt)
        status = get_text_tag(item, "{{{0}}}status".format(wordpress_namespace), "publish")
        content = get_text_tag(item, "{http://purl.org/rss/1.0/modules/content/}encoded", "")

        tags = []
        if status == "trash":
            LOGGER.warn('Trashed post "{0}" will not be imported.'.format(title))
            return
        elif status != "publish":
            tags.append("draft")
            is_draft = True
        else:
            is_draft = False

        for tag in item.findall("category"):
            text = tag.text
            if text == "Uncategorized":
                continue
            tags.append(text)

        if "$latex" in content:
            tags.append("mathjax")

        if is_draft and self.exclude_drafts:
            LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
        elif content.strip():
            # If no content is found, no files are written.
            self.url_map[link] = (self.context["SITE_URL"] + out_folder.rstrip("/") + "/" + slug + ".html").replace(
                os.sep, "/"
            )
            if hasattr(self, "separate_qtranslate_content") and self.separate_qtranslate_content:
                content_translations = separate_qtranslate_content(content)
            else:
                content_translations = {"": content}
            default_language = self.context["DEFAULT_LANG"]
            for lang, content in content_translations.items():
                if lang:
                    out_meta_filename = slug + ".meta"
                    if lang == default_language:
                        out_content_filename = slug + ".wp"
                    else:
                        out_content_filename = utils.get_translation_candidate(self.context, slug + ".wp", lang)
                        self.extra_languages.add(lang)
                    meta_slug = slug
                else:
                    out_meta_filename = slug + ".meta"
                    out_content_filename = slug + ".wp"
                    meta_slug = slug
                content = self.transform_content(content)
                self.write_metadata(
                    os.path.join(self.output_folder, out_folder, out_meta_filename),
                    title,
                    meta_slug,
                    post_date,
                    description,
                    tags,
                )
                self.write_content(os.path.join(self.output_folder, out_folder, out_content_filename), content)
        else:
            LOGGER.warn('Not going to import "{0}" because it seems to contain' " no content.".format(title))
Exemplo n.º 9
0
    def import_postpage_item(self, item, wordpress_namespace, out_folder=None, attachments=None):
        """Take an item from the feed and creates a post file."""
        if out_folder is None:
            out_folder = "posts"

        title = get_text_tag(item, "title", "NO TITLE")
        # link is something like http://foo.com/2012/09/01/hello-world/
        # So, take the path, utils.slugify it, and that's our slug
        link = get_text_tag(item, "link", None)
        parsed = urlparse(link)
        path = unquote(parsed.path.strip("/"))

        try:
            if isinstance(path, utils.bytes_str):
                path = path.decode("utf8", "replace")
            else:
                path = path
        except AttributeError:
            pass

        # Cut out the base directory.
        if path.startswith(self.base_dir.strip("/")):
            path = path.replace(self.base_dir.strip("/"), "", 1)

        pathlist = path.split("/")
        if parsed.query:  # if there are no nice URLs and query strings are used
            out_folder = os.path.join(*([out_folder] + pathlist))
            slug = get_text_tag(item, "{{{0}}}post_name".format(wordpress_namespace), None)
            if not slug:  # it *may* happen
                slug = get_text_tag(item, "{{{0}}}post_id".format(wordpress_namespace), None)
            if not slug:  # should never happen
                LOGGER.error("Error converting post:", title)
                return False
        else:
            if len(pathlist) > 1:
                out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
            slug = utils.slugify(pathlist[-1])

        description = get_text_tag(item, "description", "")
        post_date = get_text_tag(item, "{{{0}}}post_date".format(wordpress_namespace), None)
        try:
            dt = utils.to_datetime(post_date)
        except ValueError:
            dt = datetime.datetime(1970, 1, 1, 0, 0, 0)
            LOGGER.error(
                'Malformed date "{0}" in "{1}" [{2}], assuming 1970-01-01 00:00:00 instead.'.format(
                    post_date, title, slug
                )
            )
            post_date = dt.strftime("%Y-%m-%d %H:%M:%S")

        if dt.tzinfo and self.timezone is None:
            self.timezone = utils.get_tzname(dt)
        status = get_text_tag(item, "{{{0}}}status".format(wordpress_namespace), "publish")
        content = get_text_tag(item, "{http://purl.org/rss/1.0/modules/content/}encoded", "")
        excerpt = get_text_tag(item, "{http://wordpress.org/export/1.2/excerpt/}encoded", None)

        if excerpt is not None:
            if len(excerpt) == 0:
                excerpt = None

        tags = []
        categories = []
        if status == "trash":
            LOGGER.warn('Trashed post "{0}" will not be imported.'.format(title))
            return False
        elif status == "private":
            tags.append("private")
            is_draft = False
            is_private = True
        elif status != "publish":
            tags.append("draft")
            is_draft = True
            is_private = False
        else:
            is_draft = False
            is_private = False

        for tag in item.findall("category"):
            text = tag.text
            type = "category"
            if "domain" in tag.attrib:
                type = tag.attrib["domain"]
            if text == "Uncategorized" and type == "category":
                continue
            if type == "category":
                categories.append(text)
            else:
                tags.append(text)

        if "$latex" in content:
            tags.append("mathjax")

        for i, cat in enumerate(categories[:]):
            cat = self._sanitize(cat, True)
            categories[i] = cat
            self.all_tags.add(cat)

        for i, tag in enumerate(tags[:]):
            tag = self._sanitize(tag, False)
            tags[i] = tag
            self.all_tags.add(tag)

        # Find post format if it's there
        post_format = "wp"
        format_tag = [x for x in item.findall("*//{%s}meta_key" % wordpress_namespace) if x.text == "_tc_post_format"]
        if format_tag:
            post_format = format_tag[0].getparent().find("{%s}meta_value" % wordpress_namespace).text
            if post_format == "wpautop":
                post_format = "wp"

        if is_draft and self.exclude_drafts:
            LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
            return False
        elif is_private and self.exclude_privates:
            LOGGER.notice('Private post "{0}" will not be imported.'.format(title))
            return False
        elif content.strip() or self.import_empty_items:
            # If no content is found, no files are written.
            self.url_map[link] = (self.context["SITE_URL"] + out_folder.rstrip("/") + "/" + slug + ".html").replace(
                os.sep, "/"
            )
            if hasattr(self, "separate_qtranslate_content") and self.separate_qtranslate_content:
                content_translations = separate_qtranslate_content(content)
            else:
                content_translations = {"": content}
            default_language = self.context["DEFAULT_LANG"]
            for lang, content in content_translations.items():
                try:
                    content, extension, rewrite_html = self.transform_content(content, post_format, attachments)
                except:
                    LOGGER.error(
                        ('Cannot interpret post "{0}" (language {1}) with post ' + "format {2}!").format(
                            os.path.join(out_folder, slug), lang, post_format
                        )
                    )
                    return False
                if lang:
                    out_meta_filename = slug + ".meta"
                    if lang == default_language:
                        out_content_filename = slug + "." + extension
                    else:
                        out_content_filename = utils.get_translation_candidate(
                            self.context, slug + "." + extension, lang
                        )
                        self.extra_languages.add(lang)
                    meta_slug = slug
                else:
                    out_meta_filename = slug + ".meta"
                    out_content_filename = slug + "." + extension
                    meta_slug = slug
                tags, other_meta = self._create_metadata(
                    status, excerpt, tags, categories, post_name=os.path.join(out_folder, slug)
                )
                self.write_metadata(
                    os.path.join(self.output_folder, out_folder, out_meta_filename),
                    title,
                    meta_slug,
                    post_date,
                    description,
                    tags,
                    **other_meta
                )
                self.write_content(
                    os.path.join(self.output_folder, out_folder, out_content_filename), content, rewrite_html
                )

            if self.export_comments:
                comments = []
                for tag in item.findall("{{{0}}}comment".format(wordpress_namespace)):
                    comment = self._extract_comment(tag, wordpress_namespace)
                    if comment is not None:
                        comments.append(comment)

                for comment in comments:
                    comment_filename = "{0}.{1}.wpcomment".format(slug, comment["id"])
                    self._write_comment(os.path.join(self.output_folder, out_folder, comment_filename), comment)

            return (out_folder, slug)
        else:
            LOGGER.warn(('Not going to import "{0}" because it seems to contain' " no content.").format(title))
            return False