Example #1
def get_norm_datetime(tree):
    if tree is None:
        return

    try:
        value = datetime.strptime(tree.attrib['norm'], '%Y%m%dT%H%M%S')
    except ValueError:
        value = datetime.strptime(tree.attrib['norm'], '%Y%m%dT%H%M%S%z')

    return utc.normalize(value) if value.tzinfo else value
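
A minimal usage sketch for the snippet above. It assumes the module-level names the snippet relies on (datetime and a pytz utc object); the element and its 'norm' value are illustrative only.

from datetime import datetime
from xml.etree import ElementTree
from pytz import utc

# A hypothetical element carrying the 'norm' attribute the function reads.
tree = ElementTree.fromstring('<date norm="20240101T120000+0900"/>')
print(get_norm_datetime(tree))  # 2024-01-01 03:00:00+00:00, normalized to UTC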
Example #2
    def get_norm_datetime(self, tree):
        if tree is None:
            return

        try:
            value = datetime.strptime(tree.attrib["norm"], "%Y%m%dT%H%M%S")
        except ValueError:
            try:
                value = datetime.strptime(tree.attrib["norm"], "%Y%m%dT%H%M%S%z")
            except ValueError:
                value = dateutil.parser.parse(tree.attrib["norm"])

        return utc.normalize(value) if value.tzinfo else value
Example #3
    def get_norm_datetime(self, tree):
        if tree is None:
            return

        try:
            value = datetime.strptime(tree.attrib["norm"], "%Y%m%dT%H%M%S")
        except ValueError:
            try:
                value = datetime.strptime(tree.attrib["norm"],
                                          "%Y%m%dT%H%M%S%z")
            except ValueError:
                try:
                    value = dateutil.parser.parse(tree.attrib["norm"])
                except ValueError:
                    return

        return utc.normalize(value) if value.tzinfo else value
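
The later variants above extend the same parser with progressively broader fallbacks: first the timezone-aware format, then dateutil, and finally returning None instead of raising. A standalone sketch of those parse paths, assuming utc is pytz.utc; the sample strings are illustrative only.

from datetime import datetime
from pytz import utc
import dateutil.parser

samples = [
    '20240101T120000',        # naive, matches the first format
    '20240101T120000+0900',   # timezone-aware, matches the second format
    '2024-01-01 12:00:00',    # anything else falls through to dateutil
]
for raw in samples:
    try:
        value = datetime.strptime(raw, '%Y%m%dT%H%M%S')
    except ValueError:
        try:
            value = datetime.strptime(raw, '%Y%m%dT%H%M%S%z')
        except ValueError:
            value = dateutil.parser.parse(raw)
    # Timezone-aware values are converted to UTC; naive values pass through.
    print(utc.normalize(value) if value.tzinfo else value)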
Example #4
def normalize_date(naive, tz):
    return utc.normalize(tz.localize(naive))
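
A minimal usage sketch for normalize_date above, assuming utc is pytz.utc and tz is a pytz timezone object; the timezone and wall-clock time are illustrative only.

from datetime import datetime
from pytz import timezone, utc

sydney = timezone('Australia/Sydney')
naive = datetime(2024, 1, 1, 9, 30)   # wall-clock time with no tzinfo
# localize() attaches the zone (DST-aware), utc.normalize() converts to UTC.
print(normalize_date(naive, sydney))  # 2023-12-31 22:30:00+00:00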
Example #5
    def _process_bunch(self, x):
        # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
        for doc in x.findall('dc_rest_docs/dc_rest_doc'):
            print(doc.get('href'))
            id = doc.find('dcdossier').get('id')
            if int(id) < self._id:
                self._id = int(id)
            item = {}
            item['guid'] = doc.find('dcdossier').get('guid')

            # if the item has been modified in the archive then it is due to a kill
            # there is an argument that this item should not be imported at all
            if doc.find('dcdossier').get('created') != doc.find(
                    'dcdossier').get('modified'):
                item[ITEM_STATE] = CONTENT_STATE.KILLED
            else:
                item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

            value = datetime.strptime(
                self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
            item['firstcreated'] = utc.normalize(
                value) if value.tzinfo else value
            item['versioncreated'] = item['firstcreated']

            generate_unique_id_and_name(item)
            item['ingest_id'] = id

            item['source'] = self._get_head_value(doc, 'Agency')

            self._addkeywords('AsiaPulseCodes', doc, item)

            byline = self._get_head_value(doc, 'Byline')
            if byline:
                item['byline'] = byline

            # item['service'] = self._get_head_value(doc,'Service')

            category = self._get_head_value(doc, 'Category')
            if not category:
                publication_name = self._get_head_value(doc, 'PublicationName')
                if publication_name in pubnames:
                    category = pubnames[publication_name]
            if category:
                anpacategory = {}
                anpacategory['qcode'] = category
                for anpa_category in self._anpa_categories['items']:
                    if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                        anpacategory = {
                            'qcode': anpacategory['qcode'],
                            'name': anpa_category['name']
                        }
                        break
                item['anpa_category'] = [anpacategory]

            self._addkeywords('CompanyCodes', doc, item)

            type = self._get_head_value(doc, 'Format')
            if type == 'x':
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            elif type == 't':
                item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
            else:
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT

            item['keyword'] = self._get_head_value(doc, 'Keyword')
            item['ingest_provider_sequence'] = self._get_head_value(
                doc, 'Sequence')

            original_source = self._get_head_value(doc, 'Author')
            if original_source:
                item['original_source'] = original_source

            item['headline'] = self._get_head_value(doc, 'Headline')

            code = self._get_head_value(doc, 'SubjectRefNum')
            if code and len(code) == 7:
                code = '0' + code
            if code and code in subject_codes:
                item['subject'] = []
                item['subject'].append({
                    'qcode': code,
                    'name': subject_codes[code]
                })
                try:
                    process_iptc_codes(item, None)
                except Exception:
                    pass

            slug = self._get_head_value(doc, 'SLUG')
            if slug:
                item['slugline'] = slug
            else:
                item['slugline'] = self._get_head_value(doc, 'Keyword')

            # self._addkeywords('Takekey', doc, item)
            take_key = self._get_head_value(doc, 'Takekey')
            if take_key:
                item['anpa_take_key'] = take_key

            self._addkeywords('Topic', doc, item)

            self._addkeywords('Selectors', doc, item)

            el = doc.find('dcdossier/document/body/BodyText')
            if el is not None:
                story = el.text
                if item[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                    story = story.replace('\n   ', '<br><br>')
                    story = story.replace('\n', '<br>')
                    item['body_html'] = story
                else:
                    item['body_html'] = story
                try:
                    item['word_count'] = get_text_word_count(item['body_html'])
                except Exception:
                    pass

            item['pubstatus'] = 'usable'
            item['allow_post_publish_actions'] = False

            res = superdesk.get_resource_service('published')
            original = res.find_one(req=None, guid=item['guid'])
            if not original:
                item['_id'] = item['guid']
                res.post([item])
            else:
                res.patch(original['_id'], item)

            if self._limit:
                self._limit -= 1
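
The method above leans on superdesk services and project helpers, so it is hard to run in isolation, but the BodyText handling is self-contained. A standalone sketch of that step; the function name is illustrative and not part of the original code.

def body_text_to_html(story):
    # Mirrors the BodyText handling above: a newline followed by three spaces
    # marks a paragraph break, any remaining newline becomes a single <br>.
    story = story.replace('\n   ', '<br><br>')
    story = story.replace('\n', '<br>')
    return story

print(body_text_to_html('First par.\n   Second par.\nStill second par.'))
# -> First par.<br><br>Second par.<br>Still second par.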
Example #6
    def _process_bunch(self, x):
        # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
        for doc in x.findall('dc_rest_docs/dc_rest_doc'):
            print(doc.get('href'))
            id = doc.find('dcdossier').get('id')
            if int(id) < self._id:
                self._id = int(id)
            item = {}
            item['guid'] = doc.find('dcdossier').get('guid')

            # if the item has been modified in the archive then it is due to a kill
            # there is an argument that this item should not be imported at all
            if doc.find('dcdossier').get('created') != doc.find('dcdossier').get('modified'):
                item[ITEM_STATE] = CONTENT_STATE.KILLED
            else:
                item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

            value = datetime.strptime(self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
            item['firstcreated'] = utc.normalize(value) if value.tzinfo else value
            item['versioncreated'] = item['firstcreated']

            item['unique_id'] = doc.find('dcdossier').get('unique')
            item['ingest_id'] = id

            item['source'] = self._get_head_value(doc, 'Agency')

            self._addkeywords('AsiaPulseCodes', doc, item)

            byline = self._get_head_value(doc, 'Byline')
            if byline:
                item['byline'] = byline

            # item['service'] = self._get_head_value(doc,'Service')

            category = self._get_head_value(doc, 'Category')
            if not category:
                publication_name = self._get_head_value(doc, 'PublicationName')
                if publication_name in pubnames:
                    category = pubnames[publication_name]
            if category:
                anpacategory = {}
                anpacategory['qcode'] = category
                for anpa_category in self._anpa_categories['items']:
                    if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                        anpacategory = {'qcode': anpacategory['qcode'], 'name': anpa_category['name']}
                        break
                item['anpa_category'] = [anpacategory]

            self._addkeywords('CompanyCodes', doc, item)

            type = self._get_head_value(doc, 'Format')
            if type == 'x':
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            elif type == 't':
                item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
            else:
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT

            item['keyword'] = self._get_head_value(doc, 'Keyword')
            item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')

            original_source = self._get_head_value(doc, 'Author')
            if original_source:
                item['original_source'] = original_source

            item['headline'] = self._get_head_value(doc, 'Headline')

            code = self._get_head_value(doc, 'SubjectRefNum')
            if code and len(code) == 7:
                code = '0' + code
            if code and code in subject_codes:
                item['subject'] = []
                item['subject'].append({'qcode': code, 'name': subject_codes[code]})
                try:
                    process_iptc_codes(item, None)
                except Exception:
                    pass

            slug = self._get_head_value(doc, 'SLUG')
            if slug:
                item['slugline'] = slug
            else:
                item['slugline'] = self._get_head_value(doc, 'Keyword')

            # self._addkeywords('Takekey', doc, item)
            take_key = self._get_head_value(doc, 'Takekey')
            if take_key:
                item['anpa_take_key'] = take_key

            self._addkeywords('Topic', doc, item)

            self._addkeywords('Selectors', doc, item)

            el = doc.find('dcdossier/document/body/BodyText')
            if el is not None:
                story = el.text
                if item[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                    story = story.replace('\n   ', '<br><br>')
                    story = story.replace('\n', '<br>')
                    item['body_html'] = story
                else:
                    item['body_html'] = story
                try:
                    item['word_count'] = get_text_word_count(item['body_html'])
                except Exception:
                    pass

            item['pubstatus'] = 'usable'
            item['allow_post_publish_actions'] = False

            res = superdesk.get_resource_service('published')
            original = res.find_one(req=None, guid=item['guid'])
            if not original:
                res.post([item])
            else:
                res.patch(original['_id'], item)

            if self._limit:
                self._limit -= 1