def get_norm_datetime(tree):
    """Parse the ``norm`` attribute of *tree* into a :class:`datetime`.

    The naive ``%Y%m%dT%H%M%S`` layout is tried first; on failure the
    timezone-aware ``%Y%m%dT%H%M%S%z`` layout is used.  Aware values are
    normalized to UTC, naive values are returned as-is.  Returns ``None``
    when *tree* is ``None``.
    """
    if tree is None:
        return
    raw = tree.attrib['norm']
    try:
        parsed = datetime.strptime(raw, '%Y%m%dT%H%M%S')
    except ValueError:
        # Fall back to the offset-carrying variant of the same layout.
        parsed = datetime.strptime(raw, '%Y%m%dT%H%M%S%z')
    if parsed.tzinfo:
        return utc.normalize(parsed)
    return parsed
def get_norm_datetime(self, tree):
    """Parse the ``norm`` attribute of *tree* into a :class:`datetime`.

    Tries the naive ``%Y%m%dT%H%M%S`` layout, then the aware
    ``%Y%m%dT%H%M%S%z`` layout, and finally falls back to
    :func:`dateutil.parser.parse` for anything else.  Aware values are
    normalized to UTC, naive values are returned as-is.  Returns ``None``
    when *tree* is ``None``.
    """
    if tree is None:
        return
    raw = tree.attrib["norm"]
    for fmt in ("%Y%m%dT%H%M%S", "%Y%m%dT%H%M%S%z"):
        try:
            value = datetime.strptime(raw, fmt)
            break
        except ValueError:
            continue
    else:
        # Neither fixed layout matched; let dateutil guess the format.
        value = dateutil.parser.parse(raw)
    if value.tzinfo:
        return utc.normalize(value)
    return value
def get_norm_datetime(self, tree):
    """Parse the ``norm`` attribute of *tree* into a :class:`datetime`.

    Tries the naive ``%Y%m%dT%H%M%S`` layout, then the aware
    ``%Y%m%dT%H%M%S%z`` layout, then :func:`dateutil.parser.parse`.
    Returns ``None`` when *tree* is ``None`` or when every parse attempt
    fails.  Aware values are normalized to UTC, naive values returned
    unchanged.
    """
    if tree is None:
        return
    raw = tree.attrib["norm"]
    for fmt in ("%Y%m%dT%H%M%S", "%Y%m%dT%H%M%S%z"):
        try:
            value = datetime.strptime(raw, fmt)
            break
        except ValueError:
            continue
    else:
        # Neither fixed layout matched; let dateutil have a last try,
        # giving up quietly if it also cannot parse the string.
        try:
            value = dateutil.parser.parse(raw)
        except ValueError:
            return
    return utc.normalize(value) if value.tzinfo else value
def normalize_date(naive, tz):
    """Attach timezone *tz* to the naive datetime *naive* and normalize to UTC."""
    localized = tz.localize(naive)
    return utc.normalize(localized)
def _process_bunch(self, x):
    """Import one batch of archive documents from the parsed response *x*.

    For each ``dc_rest_docs/dc_rest_doc`` element a Superdesk item dict is
    built from the dossier attributes and head values, then either posted
    to the ``published`` resource service (new guid) or patched onto the
    existing record.  ``self._id`` tracks the lowest dossier id seen and
    ``self._limit`` is decremented per processed document.

    Fixes over the previous revision: locals no longer shadow the builtins
    ``id``/``type``; bare ``except:`` clauses narrowed to ``Exception`` so
    SystemExit/KeyboardInterrupt propagate; repeated ``dcdossier`` lookups
    hoisted; typo ``orginal_source`` corrected (item key unchanged).
    """
    for doc in x.findall('dc_rest_docs/dc_rest_doc'):
        print(doc.get('href'))
        dossier = doc.find('dcdossier')
        doc_id = dossier.get('id')
        if int(doc_id) < self._id:
            self._id = int(doc_id)

        item = {}
        item['guid'] = dossier.get('guid')

        # If the item has been modified in the archive then it is due to a
        # kill; there is an argument that such items should not be imported
        # at all.
        if dossier.get('created') != dossier.get('modified'):
            item[ITEM_STATE] = CONTENT_STATE.KILLED
        else:
            item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

        value = datetime.strptime(
            self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
        item['firstcreated'] = utc.normalize(value) if value.tzinfo else value
        item['versioncreated'] = item['firstcreated']

        generate_unique_id_and_name(item)
        item['ingest_id'] = doc_id
        item['source'] = self._get_head_value(doc, 'Agency')

        self._addkeywords('AsiaPulseCodes', doc, item)

        byline = self._get_head_value(doc, 'Byline')
        if byline:
            item['byline'] = byline

        category = self._get_head_value(doc, 'Category')
        if not category:
            # Fall back to a category derived from the publication name.
            publication_name = self._get_head_value(doc, 'PublicationName')
            if publication_name in pubnames:
                category = pubnames[publication_name]
        if category:
            anpacategory = {'qcode': category}
            for anpa_category in self._anpa_categories['items']:
                if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                    anpacategory = {
                        'qcode': anpacategory['qcode'],
                        'name': anpa_category['name'],
                    }
                    break
            item['anpa_category'] = [anpacategory]

        self._addkeywords('CompanyCodes', doc, item)

        # Only 't' maps to preformatted; 'x' and anything else is text
        # (the old explicit 'x' branch duplicated the default).
        doc_format = self._get_head_value(doc, 'Format')
        if doc_format == 't':
            item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
        else:
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT

        item['keyword'] = self._get_head_value(doc, 'Keyword')
        item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')

        original_source = self._get_head_value(doc, 'Author')
        if original_source:
            item['original_source'] = original_source

        item['headline'] = self._get_head_value(doc, 'Headline')

        code = self._get_head_value(doc, 'SubjectRefNum')
        if code and len(code) == 7:
            # Pad 7-digit subject reference numbers to the 8-digit form.
            code = '0' + code
        if code and code in subject_codes:
            item['subject'] = []
            item['subject'].append({
                'qcode': code,
                'name': subject_codes[code]
            })
            try:
                process_iptc_codes(item, None)
            except Exception:
                # Best effort only; a bad IPTC code must not abort the import.
                pass

        slug = self._get_head_value(doc, 'SLUG')
        if slug:
            item['slugline'] = slug
        else:
            item['slugline'] = self._get_head_value(doc, 'Keyword')

        take_key = self._get_head_value(doc, 'Takekey')
        if take_key:
            item['anpa_take_key'] = take_key

        self._addkeywords('Topic', doc, item)
        self._addkeywords('Selectors', doc, item)

        el = doc.find('dcdossier/document/body/BodyText')
        if el is not None:
            story = el.text
            if item[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                story = story.replace('\n ', '<br><br>')
                story = story.replace('\n', '<br>')
                item['body_html'] = story
            else:
                item['body_html'] = story
            try:
                item['word_count'] = get_text_word_count(item['body_html'])
            except Exception:
                # Word count is advisory; ignore malformed body markup.
                pass

        item['pubstatus'] = 'usable'
        item['allow_post_publish_actions'] = False

        res = superdesk.get_resource_service('published')
        original = res.find_one(req=None, guid=item['guid'])
        if not original:
            item['_id'] = item['guid']
            res.post([item])
        else:
            res.patch(original['_id'], item)

        if self._limit:
            self._limit -= 1
def _process_bunch(self, x):
    """Import one batch of archive documents from the parsed response *x*.

    For each ``dc_rest_docs/dc_rest_doc`` element a Superdesk item dict is
    built from the dossier attributes and head values, then either posted
    to the ``published`` resource service (new guid) or patched onto the
    existing record.  ``self._id`` tracks the lowest dossier id seen and
    ``self._limit`` is decremented per processed document.  This variant
    carries the dossier's ``unique`` attribute through as ``unique_id``.

    Fixes over the previous revision: locals no longer shadow the builtins
    ``id``/``type``; bare ``except:`` clauses narrowed to ``Exception`` so
    SystemExit/KeyboardInterrupt propagate; repeated ``dcdossier`` lookups
    hoisted; typo ``orginal_source`` corrected (item key unchanged).
    """
    for doc in x.findall('dc_rest_docs/dc_rest_doc'):
        print(doc.get('href'))
        dossier = doc.find('dcdossier')
        doc_id = dossier.get('id')
        if int(doc_id) < self._id:
            self._id = int(doc_id)

        item = {}
        item['guid'] = dossier.get('guid')

        # If the item has been modified in the archive then it is due to a
        # kill; there is an argument that such items should not be imported
        # at all.
        if dossier.get('created') != dossier.get('modified'):
            item[ITEM_STATE] = CONTENT_STATE.KILLED
        else:
            item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

        value = datetime.strptime(
            self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
        item['firstcreated'] = utc.normalize(value) if value.tzinfo else value
        item['versioncreated'] = item['firstcreated']

        item['unique_id'] = dossier.get('unique')
        item['ingest_id'] = doc_id
        item['source'] = self._get_head_value(doc, 'Agency')

        self._addkeywords('AsiaPulseCodes', doc, item)

        byline = self._get_head_value(doc, 'Byline')
        if byline:
            item['byline'] = byline

        category = self._get_head_value(doc, 'Category')
        if not category:
            # Fall back to a category derived from the publication name.
            publication_name = self._get_head_value(doc, 'PublicationName')
            if publication_name in pubnames:
                category = pubnames[publication_name]
        if category:
            anpacategory = {'qcode': category}
            for anpa_category in self._anpa_categories['items']:
                if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                    anpacategory = {
                        'qcode': anpacategory['qcode'],
                        'name': anpa_category['name'],
                    }
                    break
            item['anpa_category'] = [anpacategory]

        self._addkeywords('CompanyCodes', doc, item)

        # Only 't' maps to preformatted; 'x' and anything else is text
        # (the old explicit 'x' branch duplicated the default).
        doc_format = self._get_head_value(doc, 'Format')
        if doc_format == 't':
            item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
        else:
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT

        item['keyword'] = self._get_head_value(doc, 'Keyword')
        item['ingest_provider_sequence'] = self._get_head_value(doc,
                                                                'Sequence')

        original_source = self._get_head_value(doc, 'Author')
        if original_source:
            item['original_source'] = original_source

        item['headline'] = self._get_head_value(doc, 'Headline')

        code = self._get_head_value(doc, 'SubjectRefNum')
        if code and len(code) == 7:
            # Pad 7-digit subject reference numbers to the 8-digit form.
            code = '0' + code
        if code and code in subject_codes:
            item['subject'] = []
            item['subject'].append({'qcode': code, 'name': subject_codes[code]})
            try:
                process_iptc_codes(item, None)
            except Exception:
                # Best effort only; a bad IPTC code must not abort the import.
                pass

        slug = self._get_head_value(doc, 'SLUG')
        if slug:
            item['slugline'] = slug
        else:
            item['slugline'] = self._get_head_value(doc, 'Keyword')

        take_key = self._get_head_value(doc, 'Takekey')
        if take_key:
            item['anpa_take_key'] = take_key

        self._addkeywords('Topic', doc, item)
        self._addkeywords('Selectors', doc, item)

        el = doc.find('dcdossier/document/body/BodyText')
        if el is not None:
            story = el.text
            if item[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                story = story.replace('\n ', '<br><br>')
                story = story.replace('\n', '<br>')
                item['body_html'] = story
            else:
                item['body_html'] = story
            try:
                item['word_count'] = get_text_word_count(item['body_html'])
            except Exception:
                # Word count is advisory; ignore malformed body markup.
                pass

        item['pubstatus'] = 'usable'
        item['allow_post_publish_actions'] = False

        res = superdesk.get_resource_service('published')
        original = res.find_one(req=None, guid=item['guid'])
        if not original:
            res.post([item])
        else:
            res.patch(original['_id'], item)

        if self._limit:
            self._limit -= 1