def parse_item(self, response):
    """Persist one crawled response as a ``Document`` row for this site.

    A response whose URL ends in ``.pdf`` is written to a scratch file,
    handed to ``aws.ec2.copy_file_to_S3`` and then removed locally; the
    scratch path is what gets recorded in ``document_html`` and the
    encoding is recorded as ``'PDF'``.  Any other response has its body
    stored inline, re-encoded as UTF-8 with undecodable bytes dropped.

    When exactly one ``Document`` already exists for this site id and URL
    it is overwritten in place; otherwise a new row is inserted.

    Args:
        response: the downloaded response.  Its ``url``, ``status``,
            ``body`` and ``headers`` attributes are read.

    Returns:
        None.  Errors are logged (with traceback) and swallowed rather
        than raised.
    """
    log.msg('[%s] Parsing Start: %s' % (self.id, response.url),
            level=log.INFO, spider=self)
    try:
        item = {
            'urlAddress': response.url,
            'domain': self.allowed_domains,
            'site': Site.objects.get(pk=self.id),
            'response_code': response.status,
            'isUsed': 0,
        }

        if response.url.endswith('.pdf'):
            pdf_dir = '/home/ec2-user/bblio/scraper/pdf/'
            pdf_path = (pdf_dir + str(self.id) + '_'
                        + str(datetime.now().isoformat()) + '.pdf')
            if not os.path.exists(pdf_dir):
                os.makedirs(pdf_dir)
            item.update({
                'document_html': pdf_path,
                'encoding': 'PDF',
            })
            log.msg('PDF path: ' + pdf_path, level=log.INFO)
            try:
                # The ``with`` block closes the file, so the upload below
                # always sees fully flushed contents.
                with open(pdf_path, 'wb') as pdf_file:
                    pdf_file.write(response.body)
                aws.ec2.copy_file_to_S3(response.url, pdf_path)
            finally:
                # Always drop the scratch copy, even when the write or the
                # upload fails, so failed PDFs do not pile up on disk.
                if os.path.exists(pdf_path):
                    os.remove(pdf_path)
        else:
            # NOTE(review): when the header carries no ``charset=`` part the
            # whole content-type value ends up in ``encoding`` -- confirm
            # that is what downstream consumers expect.
            item.update({
                'encoding': response.headers['content-type'].split('charset=')[-1],
                'document_html': response.body.decode('utf-8', 'ignore').encode('utf-8'),
            })

        # Fetch at most two rows in a single query: enough to tell "exactly
        # one match" apart from "none" or "several" without a separate
        # COUNT followed by a second SELECT.
        # NOTE(review): with two or more existing rows a further row is
        # inserted, as before -- confirm whether duplicates should instead
        # be overwritten.
        matches = list(
            Document.objects
            .filter(site_id=self.id)
            .filter(urlAddress=item['urlAddress'])[:2]
        )
        if len(matches) == 1:
            logging.info('[%s] Parsing Doc Overwrite: %s' % (self.id, response.url))
            doc = matches[0]
            doc.document_html = item['document_html']
            doc.encoding = item['encoding']
            doc.domain = item['domain']
            doc.response_code = item['response_code']
            doc.isUsed = 0
            doc.save()
        else:
            doc = Document(**item)
            doc.save()
        logging.info('[%s] Parsing Success: %s' % (self.id, response.url))
    except AttributeError:
        # logging.exception records the exception type and traceback that
        # were previously reduced to a bare type name at INFO level.
        logging.exception('* Cannot parse: %s', response.url)
    except Exception as exc:
        # ``Exception`` rather than a bare ``except`` so SystemExit and
        # KeyboardInterrupt still propagate and the crawl can be stopped.
        logging.exception('* Unexpected error:%s\n%s', type(exc), exc)
return chapter_ordinal title = xpath('//tei:title', xml) author = xpath('//tei:author', xml) id = xpath('/tei:TEI/@xml:id', xml) d = Document(id=id, title=title, author=author, add_date=datetime.now(), pub_date=datetime.now() ) d.save() logging.info("Adding content for id %s" % d.id) chapter_ordinal = 1 # Do we have parts? if len(xml.xpath("//tei:div[@type='part']", namespaces={'tei': TEI})) > 0: part_ordinal = 1 for part in xml.xpath("//tei:div[@type='part']", namespaces={'tei': TEI}): part_id = xpath('@xml:id', part) part_title = xpath('tei:head[1]', part) logging.debug("Adding part", part_title.encode('utf-8')) p = d.part_set.create(id=part_id, title=part_title, ordinal=part_ordinal, label='part')