def parse(self, file_path, provider=None):
    try:
        item = {
            'guid': '{}-{}'.format(file_path, uuid.uuid4()),
            'pubstatus': 'usable',
            'versioncreated': utcnow(),
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            FORMAT: FORMATS.PRESERVED,
        }

        with open(file_path, 'r', encoding='windows-1252') as f:
            data = f.read().replace('\r', '')

        # the file is header, dateline and body, separated by blank lines
        header, dateline_data, body_data = data.split('\n\n', 2)
        self._process_header(item, header)

        # everything from the 'MEDIA RELEASE ' marker onwards is the body
        start_of_body = 'MEDIA RELEASE '
        source, data = data.split(start_of_body, 1)
        data = start_of_body + data

        item['anpa_category'] = [{'qcode': 'j'}]
        item['original_source'] = 'AsiaNet'
        item['word_count'] = get_text_word_count(data)
        item['body_html'] = '<pre>' + to_ascii(html.escape(data)) + '</pre>'

        return item
    except Exception as e:
        raise AAPParserError.AsiaNetParserError(file_path, e)
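
# For context, a minimal driver for the parser above might look like the
# following sketch. The module path and class name (AsiaNetFeedParser) are
# assumptions for illustration; the source only shows the parse() method.
from aap.io.feed_parsers.asianet import AsiaNetFeedParser  # assumed import path

parser = AsiaNetFeedParser()
item = parser.parse('/tmp/asianet_sample.tst')  # hypothetical sample file
print(item['guid'], item['word_count'])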
def broadcast_auto_publish(item, **kwargs):
    """Broadcast auto publish macro.

    :param item:
    :param kwargs:
    :return:
    """
    if item.get(ITEM_TYPE) != CONTENT_TYPE.TEXT or item.get(FORMAT) != FORMATS.HTML:
        return

    formatter = AAPBulletinBuilderFormatter()
    body_text = formatter.get_text_content(formatter.append_body_footer(item))
    word_count = get_text_word_count(body_text)
    max_word_count = config.MIN_BROADCAST_TEXT_WORD_COUNT
    item['genre'] = [{'name': 'Broadcast Script', 'qcode': 'Broadcast Script'}]

    if item[ITEM_STATE] not in {CONTENT_STATE.KILLED, CONTENT_STATE.RECALLED} and \
            not (item.get('flags') or {}).get('marked_for_legal'):
        if word_count > max_word_count and \
                not (item.get('flags') or {}).get('marked_for_legal'):
            # rebuild the body one paragraph at a time and truncate it at the
            # first paragraph that takes it over the broadcast word limit
            lines = body_text.splitlines()
            new_body_html = []
            for line in lines:
                para = line.strip()
                if not para:
                    continue
                new_body_html.append('<p>{}</p>'.format(para))
                word_count = get_text_word_count(''.join(new_body_html))
                if word_count > max_word_count:
                    if len(new_body_html):
                        item['body_html'] = ''.join(new_body_html)
                        item['word_count'] = word_count
                    break
    elif item[ITEM_STATE] in {CONTENT_STATE.KILLED, CONTENT_STATE.RECALLED}:
        lines = body_text.splitlines()
        lines = ['<p>{}</p>'.format(line.strip()) for line in lines if line.strip()]
        # remove the first line/paragraph of kill message
        lines = lines[1:]
        item['body_html'] = ''.join(lines)
        fields_to_remove = ['embargo', 'dateline', 'slugline', 'genre']
        for field in fields_to_remove:
            item.pop(field, None)

    internal_destination_auto_publish(item, **kwargs)
def broadcast_auto_publish(item, **kwargs):
    """Broadcast auto publish macro.

    :param item:
    :param kwargs:
    :return:
    """
    if item.get(ITEM_TYPE) != CONTENT_TYPE.TEXT or item.get(FORMAT) != FORMATS.HTML:
        return

    max_word_count = config.MIN_BROADCAST_TEXT_WORD_COUNT
    item['genre'] = [{'name': 'Broadcast Script', 'qcode': 'Broadcast Script'}]

    if item[ITEM_STATE] not in {CONTENT_STATE.KILLED, CONTENT_STATE.RECALLED} and \
            not (item.get('flags') or {}).get('marked_for_legal'):
        formatter = AAPBulletinBuilderFormatter()
        body_text = formatter.get_text_content(formatter.append_body_footer(item))
        word_count = get_text_word_count(body_text)
        if word_count > max_word_count and \
                not (item.get('flags') or {}).get('marked_for_legal'):
            lines = body_text.splitlines()
            new_body_html = []
            for line in lines:
                para = line.strip()
                if not para:
                    continue
                new_body_html.append('<p>{}</p>'.format(para))
                word_count = get_text_word_count(''.join(new_body_html))
                if word_count > max_word_count:
                    if len(new_body_html):
                        item['body_html'] = ''.join(new_body_html)
                        item['word_count'] = word_count
                    break

    internal_destination_auto_publish(item, **kwargs)
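
# The heart of both broadcast_auto_publish variants above is the loop that
# accumulates <p> paragraphs until the broadcast word limit is crossed. A
# self-contained sketch of just that loop, approximating get_text_word_count()
# with a plain whitespace split (an assumption; the real helper strips markup):
def truncate_to_word_limit(body_text, max_word_count):
    """Keep non-empty lines as <p> paragraphs up to and including the one
    that crosses the word limit, mirroring the loop above."""
    new_body_html = []
    for line in body_text.splitlines():
        para = line.strip()
        if not para:
            continue
        new_body_html.append('<p>{}</p>'.format(para))
        # stand-in word count: tokens inside the accumulated paragraphs
        word_count = sum(len(p[3:-4].split()) for p in new_body_html)
        if word_count > max_word_count:
            break
    return ''.join(new_body_html)

# e.g. truncate_to_word_limit('one two\n\nthree four five\nsix seven', 4)
# returns '<p>one two</p><p>three four five</p>': the paragraph that crossed
# the limit is kept, exactly as in the macro above.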
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    ninjs = {
        'guid': article.get(GUID_FIELD, article.get('uri')),
        'version': str(article.get(config.VERSION, 1)),
        'type': self._get_type(article)
    }

    if article.get('byline'):
        ninjs['byline'] = article['byline']

    located = article.get('dateline', {}).get('located', {})
    if located:
        ninjs['located'] = located.get('city', '')

    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    if 'body_text' not in article and 'alt_text' in article:
        ninjs['body_text'] = article['alt_text']

    if 'title' in article:
        ninjs['headline'] = article['title']

    if article.get('body_html'):
        ninjs['body_html'] = self.append_body_footer(article)

    if article.get('description'):
        ninjs['description_html'] = self.append_body_footer(article)

    if article.get('place'):
        ninjs['place'] = self._format_place(article)

    if article.get('profile'):
        ninjs['profile'] = self._format_profile(article['profile'])

    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
            if article.get(ASSOCIATIONS):
                ninjs[ASSOCIATIONS].update(self._format_related(article, subscriber))
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)

    if article.get(EMBARGO):
        ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

    if article.get('priority'):
        ninjs['priority'] = article['priority']
    else:
        ninjs['priority'] = 5

    if article.get('subject'):
        ninjs['subject'] = self._get_subject(article)

    if article.get('anpa_category'):
        ninjs['service'] = self._get_service(article)

    if article.get('renditions'):
        ninjs['renditions'] = self._get_renditions(article)
    elif 'url' in article:
        ninjs['renditions'] = self._generate_renditions(article)

    # SDPA-317
    if 'abstract' in article:
        abstract = article.get('abstract', '')
        ninjs['description_html'] = abstract
        ninjs['description_text'] = text_utils.get_text(abstract)
    elif article.get('description_text'):
        ninjs['description_text'] = article.get('description_text')

    if article.get('company_codes'):
        ninjs['organisation'] = [{'name': c.get('name', ''),
                                  'rel': 'Securities Identifier',
                                  'symbols': [{'ticker': c.get('qcode', ''),
                                               'exchange': c.get('security_exchange', '')}]}
                                 for c in article['company_codes']]
    elif 'company' in article:
        ninjs['organisation'] = [{'name': article['company']}]

    if article.get('rewrite_of'):
        ninjs['evolvedfrom'] = article['rewrite_of']

    if not ninjs.get('copyrightholder') and not ninjs.get('copyrightnotice') and not ninjs.get('usageterms'):
        ninjs.update(superdesk.get_resource_service('vocabularies').get_rightsinfo(article))

    if 'genre' in article:
        ninjs['genre'] = self._get_genre(article)

    if article.get('flags', {}).get('marked_for_legal'):
        ninjs['signal'] = self._format_signal_cwarn()

    if article.get('attachments'):
        ninjs['attachments'] = self._format_attachments(article)

    if ninjs['type'] == CONTENT_TYPE.TEXT and ('body_html' in ninjs or 'body_text' in ninjs):
        if 'body_html' in ninjs:
            body_html = ninjs['body_html']
            word_count = text_utils.get_word_count(body_html)
            char_count = text_utils.get_char_count(body_html)
            readtime = text_utils.get_reading_time(body_html, word_count, article.get('language'))
        else:
            body_text = ninjs['body_text']
            word_count = text_utils.get_text_word_count(body_text)
            char_count = len(body_text)
            readtime = text_utils.get_reading_time(body_text, word_count, article.get('language'))
        ninjs['charcount'] = char_count
        ninjs['wordcount'] = word_count
        ninjs['readtime'] = readtime

    if article.get('authors'):
        ninjs['authors'] = self._format_authors(article)

    return ninjs
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    ninjs = {
        "guid": article.get(GUID_FIELD, article.get("uri")),
        "version": str(article.get(config.VERSION, 1)),
        "type": self._get_type(article),
    }

    if article.get("byline"):
        ninjs["byline"] = article["byline"]

    located = article.get("dateline", {}).get("located", {})
    if located:
        ninjs["located"] = located.get("city", "")

    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    if "body_text" not in article and "alt_text" in article:
        ninjs["body_text"] = article["alt_text"]

    if "title" in article:
        ninjs["headline"] = article["title"]

    if article.get("body_html"):
        ninjs["body_html"] = self.append_body_footer(article)

    if article.get("description"):
        ninjs["description_html"] = self.append_body_footer(article)

    if article.get("place"):
        ninjs["place"] = self._format_place(article)

    if article.get("profile"):
        ninjs["profile"] = self._format_profile(article["profile"])

    extra_items = None
    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
            if article.get(ASSOCIATIONS):
                associations, extra_items = self._format_related(article, subscriber)
                ninjs[ASSOCIATIONS].update(associations)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)

    if extra_items:
        ninjs.setdefault(EXTRA_ITEMS, {}).update(extra_items)

    if article.get("embargoed"):
        ninjs["embargoed"] = article["embargoed"].isoformat()
    if article.get(EMBARGO):  # embargo set in superdesk overrides ingested one
        ninjs["embargoed"] = get_utc_schedule(article, EMBARGO).isoformat()

    if article.get("priority"):
        ninjs["priority"] = article["priority"]
    else:
        ninjs["priority"] = 5

    if article.get("subject"):
        ninjs["subject"] = self._get_subject(article)

    if article.get("anpa_category"):
        ninjs["service"] = self._get_service(article)

    if article.get("renditions"):
        ninjs["renditions"] = self._get_renditions(article)
    elif "url" in article:
        ninjs["renditions"] = self._generate_renditions(article)

    if "order" in article:
        ninjs["order"] = article["order"]

    # SDPA-317
    if "abstract" in article:
        abstract = article.get("abstract", "")
        ninjs["description_html"] = abstract
        ninjs["description_text"] = text_utils.get_text(abstract)
    elif article.get("description_text"):
        ninjs["description_text"] = article.get("description_text")

    if article.get("company_codes"):
        ninjs["organisation"] = [
            {
                "name": c.get("name", ""),
                "rel": "Securities Identifier",
                "symbols": [{"ticker": c.get("qcode", ""), "exchange": c.get("security_exchange", "")}],
            }
            for c in article["company_codes"]
        ]
    elif "company" in article:
        ninjs["organisation"] = [{"name": article["company"]}]

    if article.get("rewrite_of"):
        ninjs["evolvedfrom"] = article["rewrite_of"]

    if not ninjs.get("copyrightholder") and not ninjs.get("copyrightnotice") and not ninjs.get("usageterms"):
        ninjs.update(superdesk.get_resource_service("vocabularies").get_rightsinfo(article))

    if article.get("genre"):
        ninjs["genre"] = self._get_genre(article)

    if article.get("flags", {}).get("marked_for_legal"):
        ninjs["signal"] = self._format_signal_cwarn()

    if article.get("signal"):
        ninjs.setdefault("signal", []).extend([self._format_signal(signal) for signal in article["signal"]])

    if article.get("attachments"):
        ninjs["attachments"] = self._format_attachments(article)

    if ninjs["type"] == CONTENT_TYPE.TEXT and ("body_html" in ninjs or "body_text" in ninjs):
        if "body_html" in ninjs:
            body_html = ninjs["body_html"]
            word_count = text_utils.get_word_count(body_html)
            char_count = text_utils.get_char_count(body_html)
            readtime = text_utils.get_reading_time(body_html, word_count, article.get("language"))
        else:
            body_text = ninjs["body_text"]
            word_count = text_utils.get_text_word_count(body_text)
            char_count = len(body_text)
            readtime = text_utils.get_reading_time(body_text, word_count, article.get("language"))
        ninjs["charcount"] = char_count
        ninjs["wordcount"] = word_count
        ninjs["readtime"] = readtime

    if article.get("authors"):
        ninjs["authors"] = self._format_authors(article)

    if (article.get("schedule_settings") or {}).get("utc_publish_schedule"):
        ninjs["publish_schedule"] = article["schedule_settings"]["utc_publish_schedule"]

    # set description for custom embed field
    if article.get("extra"):
        ninjs["extra"] = article["extra"]
        for key, value in ninjs["extra"].items():
            if isinstance(value, dict) and "embed" in value:
                value.setdefault("description", "")

    return ninjs
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    ninjs = {
        'guid': article.get(GUID_FIELD, article.get('uri')),
        'version': str(article.get(config.VERSION, 1)),
        'type': self._get_type(article)
    }

    if article.get('byline'):
        ninjs['byline'] = article['byline']

    located = article.get('dateline', {}).get('located', {})
    if located:
        ninjs['located'] = located.get('city', '')

    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    if 'body_text' not in article and 'alt_text' in article:
        ninjs['body_text'] = article['alt_text']

    if 'title' in article:
        ninjs['headline'] = article['title']

    if article.get('body_html'):
        ninjs['body_html'] = self.append_body_footer(article)

    if article.get('description'):
        ninjs['description_html'] = self.append_body_footer(article)

    if article.get('place'):
        ninjs['place'] = self._format_place(article)

    if article.get('profile'):
        ninjs['profile'] = self._format_profile(article['profile'])

    extra_items = None
    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
            if article.get(ASSOCIATIONS):
                associations, extra_items = self._format_related(article, subscriber)
                ninjs[ASSOCIATIONS].update(associations)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)

    if extra_items:
        ninjs.setdefault(EXTRA_ITEMS, {}).update(extra_items)

    if article.get(EMBARGO):
        ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

    if article.get('priority'):
        ninjs['priority'] = article['priority']
    else:
        ninjs['priority'] = 5

    if article.get('subject'):
        ninjs['subject'] = self._get_subject(article)

    if article.get('anpa_category'):
        ninjs['service'] = self._get_service(article)

    if article.get('renditions'):
        ninjs['renditions'] = self._get_renditions(article)
    elif 'url' in article:
        ninjs['renditions'] = self._generate_renditions(article)

    # SDPA-317
    if 'abstract' in article:
        abstract = article.get('abstract', '')
        ninjs['description_html'] = abstract
        ninjs['description_text'] = text_utils.get_text(abstract)
    elif article.get('description_text'):
        ninjs['description_text'] = article.get('description_text')

    if article.get('company_codes'):
        ninjs['organisation'] = [{'name': c.get('name', ''),
                                  'rel': 'Securities Identifier',
                                  'symbols': [{'ticker': c.get('qcode', ''),
                                               'exchange': c.get('security_exchange', '')}]}
                                 for c in article['company_codes']]
    elif 'company' in article:
        ninjs['organisation'] = [{'name': article['company']}]

    if article.get('rewrite_of'):
        ninjs['evolvedfrom'] = article['rewrite_of']

    if not ninjs.get('copyrightholder') and not ninjs.get('copyrightnotice') and not ninjs.get('usageterms'):
        ninjs.update(superdesk.get_resource_service('vocabularies').get_rightsinfo(article))

    if 'genre' in article:
        ninjs['genre'] = self._get_genre(article)

    if article.get('flags', {}).get('marked_for_legal'):
        ninjs['signal'] = self._format_signal_cwarn()

    if article.get('attachments'):
        ninjs['attachments'] = self._format_attachments(article)

    if ninjs['type'] == CONTENT_TYPE.TEXT and ('body_html' in ninjs or 'body_text' in ninjs):
        if 'body_html' in ninjs:
            body_html = ninjs['body_html']
            word_count = text_utils.get_word_count(body_html)
            char_count = text_utils.get_char_count(body_html)
            readtime = text_utils.get_reading_time(body_html, word_count, article.get('language'))
        else:
            body_text = ninjs['body_text']
            word_count = text_utils.get_text_word_count(body_text)
            char_count = len(body_text)
            readtime = text_utils.get_reading_time(body_text, word_count, article.get('language'))
        ninjs['charcount'] = char_count
        ninjs['wordcount'] = word_count
        ninjs['readtime'] = readtime

    if article.get('authors'):
        ninjs['authors'] = self._format_authors(article)

    return ninjs
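
# For reference, the shape of the NinJS document these transforms emit for a
# plain text article looks roughly like this; all field values below are
# illustrative, not taken from the source.
ninjs_example = {
    'guid': 'urn:newsml:localhost:2020-01-01T00:00:00.0:abc123',  # illustrative
    'version': '2',
    'type': 'text',
    'headline': 'Example headline',
    'body_html': '<p>Example body</p>',
    'priority': 5,      # the default applied when the article carries none
    'wordcount': 2,
    'charcount': 12,
    'readtime': 0,
}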
def _process_bunch(self, x):
    # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
    items = []
    for doc in x.findall('dc_rest_docs/dc_rest_doc'):
        try:
            # print(doc.get('href'))
            id = doc.find('dcdossier').get('id')
            # track the highest/lowest id seen, depending on scan direction
            if self._direction:
                if int(id) > self._id:
                    self._id = int(id)
            else:
                if int(id) < self._id:
                    self._id = int(id)

            item = {}
            item['guid'] = doc.find('dcdossier').get('guid')
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            format = self._get_head_value(doc, 'Format')
            if format == 't':
                item[FORMAT] = FORMATS.PRESERVED
            else:
                item[FORMAT] = FORMATS.HTML

            # if the item has been modified in the archive then it is due to a kill;
            # there is an argument that this item should not be imported at all
            if doc.find('dcdossier').get('created') != doc.find('dcdossier').get('modified'):
                # item[ITEM_STATE] = CONTENT_STATE.KILLED
                continue
            else:
                item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

            value = datetime.strptime(self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
            local_tz = pytz.timezone('Australia/Sydney')
            try:
                aus_dt = local_tz.localize(value, is_dst=None)
            except NonExistentTimeError:
                aus_dt = local_tz.localize(value, is_dst=True)
            except AmbiguousTimeError:
                aus_dt = local_tz.localize(value, is_dst=False)
            item['firstcreated'] = aus_dt.astimezone(pytz.utc)
            item['versioncreated'] = item['firstcreated']

            generate_unique_id_and_name(item)
            item['ingest_id'] = id

            last_line = None
            el = doc.find('dcdossier/document/body/BodyText')
            if el is not None:
                story = el.text
                lines = story.split('\n')
                if len(lines) > 0:
                    last_line = lines[-1]
                if item.get(FORMAT) == FORMATS.HTML:
                    story = story.replace('\n ', '<p></p>')
                    story = story.replace('\n', '<br>')
                    item['body_html'] = '<p>' + story + '</p>'
                else:
                    item['body_html'] = '<pre>' + story + '</pre>'
                try:
                    item['word_count'] = get_text_word_count(item['body_html'])
                except Exception:
                    pass
            else:
                # items with no body are ignored
                continue

            item['source'] = self._get_head_value(doc, 'Agency')
            # if the source document contains no agency then by definition it is unknown
            if item['source'] is None:
                item['source'] = 'UNKNOWN'
            else:
                # check if the source of the document was Newscentre
                dc_unique = doc.find('dcdossier').get('unique')
                if dc_unique.startswith('NC.') and last_line is not None:
                    # the AFR summary articles all have agency values 25 chars long
                    if len(item['source']) == 25:
                        item['source'] = 'AAP'
                    # is it a numeric agency
                    elif self._get_head_value(doc, 'Agency').isdigit():
                        sign_off = last_line.split(' ')
                        if len(sign_off) > 0:
                            item['source'] = sign_off[0].upper()
                        else:
                            item['source'] = last_line.upper()
                        # clean up what we have extracted
                        if item['source'].startswith('AAP'):
                            item['source'] = 'AAP'
                        else:
                            # make sure it is one of the known values
                            if item['source'] not in {'AAP', 'AP', 'REUT', 'Asia Pulse', 'DPA', 'AFP', 'RAW',
                                                      'NZA', 'NZPA', 'KRT', 'PA', 'PAA', 'SNI', 'REUTERS'}:
                                print('Source : {}'.format(item['source']))
                                item['source'] = 'UNKNOWN'

            # self._addkeywords('AsiaPulseCodes', doc, item)

            byline = self._get_head_value(doc, 'Byline')
            if byline:
                item['byline'] = byline

            # item['service'] = self._get_head_value(doc, 'Service')

            category = self._get_head_value(doc, 'Category')
            if not category:
                publication_name = self._get_head_value(doc, 'PublicationName')
                if publication_name in pubnames:
                    category = pubnames[publication_name]
            if category:
                anpacategory = {}
                anpacategory['qcode'] = category
                for anpa_category in self._anpa_categories['items']:
                    if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                        anpacategory = {'qcode': anpacategory['qcode'], 'name': anpa_category['name']}
                        break
                item['anpa_category'] = [anpacategory]

            self._addkeywords('CompanyCodes', doc, item)

            item['keyword'] = self._get_head_value(doc, 'Keyword')
            item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')

            original_source = self._get_head_value(doc, 'Author')
            if original_source:
                item['original_source'] = original_source

            item['headline'] = self._get_head_value(doc, 'Headline')

            code = self._get_head_value(doc, 'SubjectRefNum')
            if code and len(code) == 7:
                code = '0' + code
            if code and code in subject_codes:
                item['subject'] = []
                item['subject'].append({'qcode': code, 'name': subject_codes[code]})
                try:
                    process_iptc_codes(item, None)
                except Exception:
                    pass

            slug = self._get_head_value(doc, 'SLUG')
            if slug:
                item['slugline'] = slug
            else:
                item['slugline'] = self._get_head_value(doc, 'Keyword')

            take_key = self._get_head_value(doc, 'Takekey')
            if take_key:
                item['anpa_take_key'] = take_key

            self._addkeywords('Topic', doc, item)
            # self._addkeywords('Selectors', doc, item)

            item['pubstatus'] = 'usable'
            # this is required for the archived service additional lookup
            item['item_id'] = item['guid']
            item[config.VERSION] = 1
            item['flags'] = {'marked_archived_only': True}

            # item['_id'] = ObjectId(id.rjust(24, '0'))
            item['_id'] = ObjectId()
            items.append(item)

            if self._limit:
                self._limit -= 1
            # print(item)
        except Exception:
            print('Exception parsing DC document {}'.format(id))

    try:
        res = superdesk.get_resource_service('archived')
        s = time.time()
        res.post(items)
        print('Posting batch to Superdesk took {:.2f} seconds'.format(time.time() - s))
    except Exception as ex:
        if ex.code == 409:
            print('Key clash exception detected')
            # create a list of the guids we tried to post
            guids = [g['guid'] for g in items]
            # create a query for all those ids; the terms filter takes the flat list of guids
            query = {
                'size': self.BATCH_SIZE,
                'query': {
                    'filtered': {
                        'filter': {
                            'terms': {
                                'guid': guids
                            }
                        }
                    }
                }
            }
            req = ParsedRequest()
            repos = 'archived'
            req.args = {'source': json.dumps(query), 'repo': repos}
            search_res = superdesk.get_resource_service('search')
            existing = search_res.get(req=req, lookup=None)
            existing_guids = [e['guid'] for e in existing]
            not_existing = [g for g in guids if g not in existing_guids]
            for missing_guid in not_existing:
                i = [m for m in items if m['guid'] == missing_guid]
                original = res.find_one(req=None, guid=i[0]['guid'])
                if not original:
                    try:
                        s = time.time()
                        res.post(i)
                        print('Posted single item to Superdesk in {:.2f} seconds'.format(time.time() - s))
                    except Exception:
                        print('Exception posting single item')
        else:
            print('Exception posting batch')
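
# The duplicate-recovery path above builds an elasticsearch filtered query over
# the posted guids. A standalone sketch of that construction (the 'filtered'
# syntax matches the legacy ES 1.x/2.x DSL used here; note the terms filter
# takes the flat list of guids):
import json

def build_guid_query(guids, size):
    return {
        'size': size,
        'query': {
            'filtered': {
                'filter': {
                    'terms': {'guid': guids}  # flat list, not [guids]
                }
            }
        }
    }

# print(json.dumps(build_guid_query(['guid-1', 'guid-2'], size=200), indent=2))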