def import_doc(self, info):
    """Download and store one parliamentary document described by *info*.

    ``info`` is a dict carrying at least 'type' and 'id'; it may also carry
    'update_time', 'subject', 'phases', and other optional fields used below.
    Returns the saved Document, an existing Document when no re-import is
    needed, or None when the document is skipped, unchanged, or fails to parse.
    Raises ParseError when the document date cannot be determined.
    """
    # Build the canonical download URL and record it for later storage.
    url = DOC_DL_URL % (info['type'], info['id'])
    info['info_link'] = url
    self.fix_id_quirks(info)
    if not should_download_doc(info):
        self.logger.warning("skipping %s %s" % (info['type'], info['id']))
        return None
    # The origin id ("TYPE ID") is the natural key for Document rows.
    origin_id = "%s %s" % (info['type'], info['id'])
    try:
        doc = Document.objects.get(origin_id=origin_id)
    except Document.DoesNotExist:
        doc = Document(origin_id=origin_id)
    if 'update_time' in info:
        # Freshness check: if our stored copy is at least as new as the
        # source and we are not forcing a replace, only bump the
        # last-checked timestamp and stop.
        doc.mark_checked()
        if doc.last_modified_time and doc.last_modified_time >= info['update_time'] and not self.replace:
            self.logger.debug("%s %s not updated" % (info['type'], info['id']))
            doc.save(update_fields=['last_checked_time'])
            return None
        else:
            self.logger.debug("%s %s updated %s (checked %s)" % (info['type'], info['id'], info['update_time'], doc.last_modified_time))
    else:
        # No source timestamp available: an already-saved document is
        # reused as-is unless a replace was requested.
        if doc.pk and not self.replace:
            return doc
    doc.type = DOC_TYPES[info['type']]
    doc.name = origin_id
    # Fetch the full processing metadata; this replaces the info dict.
    info = self.fetch_processing_info(info)
    if info['type'] == 'HE':
        self.import_he(info)
    else:
        # SGML import may refuse/fail; in that case abort without saving.
        ret = self.import_sgml_doc(info, current_version=doc.version)
        if not ret:
            return None
    doc.version = info.get('doc_version', None)
    doc.subject = info['subject']
    # Copy over whichever optional scalar fields the source provided.
    for attr_name in ('summary', 'question', 'answer', 'answerer_name', 'answerer_title'):
        if attr_name in info:
            setattr(doc, attr_name, info[attr_name])
    # Clear any previously recorded error when the source reports none.
    if 'error' in info:
        doc.error = info['error']
    else:
        doc.error = None
    # Figure out the document date through the intro stage.
    for st in info['phases']:
        if st['phase'] == 'intro':
            doc.date = st['date']
            break
    if doc.date is None:
        raise ParseError("Document date could not be determined")
    doc.info_link = info['info_link']
    if 'sgml_link' in info:
        doc.sgml_link = info['sgml_link']
    if 'author' in info:
        doc.author = Member.objects.get(origin_id=info['author']['id'])
    doc.mark_modified()
    doc.save()
    # Persist related data; these helpers presumably need doc.pk, hence
    # the save() above — TODO confirm.
    self.save_stages(doc, info)
    self.save_keywords(doc, info)
    if 'signatures' in info:
        self.save_signatures(doc, info)
    # The keywords are saved only at this point. We'll save it again in order
    # to create the proper KeywordActivity objects.
    doc._updated = True
    doc.save()
    return doc
def import_doc(self, info):
    """Import a single document record described by the *info* dict.

    Skips documents that should not be downloaded, reuses up-to-date
    stored copies, and otherwise fetches, parses, and persists the
    document together with its stages, keywords, and signatures.
    Returns the Document instance or None; raises ParseError when no
    document date can be determined.
    """
    download_url = DOC_DL_URL % (info['type'], info['id'])
    info['info_link'] = download_url
    self.fix_id_quirks(info)

    # Guard: some documents are deliberately excluded from download.
    if not should_download_doc(info):
        self.logger.warning("skipping %s %s" % (info['type'], info['id']))
        return None

    origin_id = "%s %s" % (info['type'], info['id'])
    try:
        doc = Document.objects.get(origin_id=origin_id)
    except Document.DoesNotExist:
        doc = Document(origin_id=origin_id)

    if 'update_time' in info:
        doc.mark_checked()
        unchanged = (doc.last_modified_time
                     and doc.last_modified_time >= info['update_time']
                     and not self.replace)
        if unchanged:
            # Nothing new at the source: just record the check time.
            self.logger.debug("%s %s not updated" % (info['type'], info['id']))
            doc.save(update_fields=['last_checked_time'])
            return None
        self.logger.debug("%s %s updated %s (checked %s)" % (
            info['type'], info['id'], info['update_time'], doc.last_modified_time))
    elif doc.pk and not self.replace:
        # No source timestamp: keep the stored copy unless replacing.
        return doc

    doc.type = DOC_TYPES[info['type']]
    doc.name = origin_id
    info = self.fetch_processing_info(info)

    if info['type'] == 'HE':
        self.import_he(info)
    else:
        parsed = self.import_sgml_doc(info, current_version=doc.version)
        if not parsed:
            return None

    doc.version = info.get('doc_version', None)
    doc.subject = info['subject']

    # Optional scalar fields are copied only when present in the source.
    for field in ('summary', 'question', 'answer', 'answerer_name', 'answerer_title'):
        if field in info:
            setattr(doc, field, info[field])
    doc.error = info['error'] if 'error' in info else None

    # Figure out the document date through the intro stage.
    intro = next((phase for phase in info['phases'] if phase['phase'] == 'intro'), None)
    if intro is not None:
        doc.date = intro['date']
    if doc.date is None:
        raise ParseError("Document date could not be determined")

    doc.info_link = info['info_link']
    if 'sgml_link' in info:
        doc.sgml_link = info['sgml_link']
    if 'author' in info:
        doc.author = Member.objects.get(origin_id=info['author']['id'])

    doc.mark_modified()
    doc.save()
    self.save_stages(doc, info)
    self.save_keywords(doc, info)
    if 'signatures' in info:
        self.save_signatures(doc, info)

    # The keywords are saved only at this point. We'll save it again in order
    # to create the proper KeywordActivity objects.
    doc._updated = True
    doc.save()
    return doc
def import_doc(self, info):
    """Download, parse, and persist one document described by *info*.

    ``info`` must carry 'type' and 'id'; optional keys ('summary',
    'error', 'sgml_link', 'author', 'signatures', ...) are stored when
    present. An already-imported document is returned unchanged unless
    ``self.replace`` is set. Returns the saved Document, or None when
    the document is skipped or its SGML import fails. Raises ParseError
    when no document date can be determined from the phase list.
    """
    url = DOC_DL_URL % (info['type'], info['id'])
    info['info_link'] = url
    self.fix_quirks(info)
    if not should_download_doc(info):
        self.logger.warning("skipping %s %s" % (info['type'], info['id']))
        return None
    self.logger.info("downloading %s %s" % (info['type'], info['id']))
    # The origin id ("TYPE ID") is the natural key for Document rows.
    origin_id = "%s %s" % (info['type'], info['id'])
    try:
        doc = Document.objects.get(origin_id=origin_id)
        # Already imported: reuse unless a forced replace was requested.
        if not self.replace:
            return doc
    except Document.DoesNotExist:
        doc = Document(origin_id=origin_id)
    doc.type = DOC_TYPES[info['type']]
    doc.name = origin_id
    # Fetch the full processing metadata; this replaces the info dict.
    info = self.fetch_processing_info(info)
    if info['type'] == 'HE':
        self.import_he(info)
    else:
        # SGML import may refuse/fail; in that case abort without saving.
        ret = self.import_sgml_doc(info)
        if not ret:
            return None
    # NOTE: a dead local ("%s %s" % (type, id)) was removed here; it was
    # computed but never used.
    doc.subject = info['subject']
    if 'summary' in info:
        doc.summary = info['summary']
    # Clear any previously recorded error when the source reports none.
    if 'error' in info:
        doc.error = info['error']
    else:
        doc.error = None
    # Figure out the document date through the intro stage.
    # Each phase is an (index, stage-name, date) tuple here.
    for st in info['phases']:
        (idx, stage, date) = st
        if stage == 'intro':
            doc.date = date
            break
    if doc.date is None:
        raise ParseError("Document date could not be determined")
    doc.info_link = info['info_link']
    if 'sgml_link' in info:
        doc.sgml_link = info['sgml_link']
    if 'author' in info:
        doc.author = Member.objects.get(origin_id=info['author']['id'])
    doc.save()
    # Persist related data; these helpers presumably need doc.pk, hence
    # the save() above — TODO confirm.
    self.save_stages(doc, info)
    self.save_keywords(doc, info)
    if 'signatures' in info:
        self.save_signatures(doc, info)
    # The keywords are saved only at this point. We'll save it again in order
    # to create the proper KeywordActivity objects.
    doc._updated = True
    doc.save()
    return doc