Пример #1
0
    def parse_item(self, response):
        """Persist a crawled page as a Document row.

        PDF responses are written to a local scratch file, copied to S3,
        and stored by path; all other responses are stored inline as
        UTF-8 text (undecodable bytes ignored). If a Document already
        exists for this site/url it is overwritten in place; otherwise a
        new row is created. All errors are logged and swallowed so a bad
        page never aborts the crawl.
        """
        log.msg('[%s] Parsing Start: %s' % (self.id, response.url), level=log.INFO, spider=self)
        try:
            # Fields common to both the PDF and the HTML branch.
            item = {
                'urlAddress': response.url,
                'domain': self.allowed_domains,
                'site': Site.objects.get(pk=self.id),
                'response_code': response.status,
                'isUsed': 0,
            }

            # True exactly when the last four characters are '.pdf'.
            if '.pdf' in str(response.url[-4:]):
                pdf_name = str(self.id) + '_' + str(datetime.now().isoformat()) + '.pdf'
                path = '/home/ec2-user/bblio/scraper/pdf/'
                if not os.path.exists(path):
                    os.makedirs(path)
                # For PDFs, document_html holds the file path, not the content.
                item.update({
                    'document_html': path + pdf_name,
                    'encoding': 'PDF',
                })
                log.msg('PDF path: ' + path + pdf_name, level=log.INFO)
                # 'with' closes the file on exit; the old explicit f.close()
                # after the block was redundant and has been dropped.
                with open(path + pdf_name, "wb") as f:
                    f.write(response.body)
                aws.ec2.copy_file_to_S3(response.url, path + pdf_name)
                # Local scratch copy is no longer needed once it is on S3.
                os.remove(path + pdf_name)
            else:
                item.update({
                    # e.g. 'text/html; charset=utf-8' -> 'utf-8'
                    'encoding': response.headers['content-type'].split('charset=')[-1],
                    'document_html': (response.body).decode('utf-8', 'ignore').encode('utf-8'),
                })

            # Overwrite the existing document for this site/url rather than
            # inserting a duplicate (count() == 1 keeps the original's exact
            # semantics: any other count falls through to a fresh insert).
            if Document.objects.filter(site_id=self.id).filter(urlAddress=item['urlAddress']).count() == 1:
                logging.info('[%s] Parsing Doc Overwrite: %s' % (self.id, response.url))
                d = Document.objects.filter(site_id=self.id).filter(urlAddress=item['urlAddress'])[0]
                d.document_html = item['document_html']
                d.encoding = item['encoding']
                d.domain = item['domain']
                d.response_code = item['response_code']
                d.isUsed = 0
                d.save()
            else:
                d = Document(**item)
                d.save()

            logging.info('[%s] Parsing Success: %s' % (self.id, response.url))
            return
        except AttributeError:
            logging.info('* Cannot parse: ' + response.url)
            logging.info(sys.exc_info()[0])
            return
        except Exception:
            # Was a bare 'except:', which also trapped SystemExit and
            # KeyboardInterrupt; narrowed to Exception, behavior otherwise kept.
            logging.info('* Unexpected error:' + str(sys.exc_info()[0]) + '\n' + str(sys.exc_info()[1]))
            return
    return chapter_ordinal

# Pull the document metadata out of the TEI header.
title = xpath('//tei:title', xml)
author = xpath('//tei:author', xml)
id = xpath('/tei:TEI/@xml:id', xml)

# Build and persist the Document record. add_date and pub_date are each
# stamped at insert time (two separate now() calls, as before).
d = Document(
    id=id,
    title=title,
    author=author,
    add_date=datetime.now(),
    pub_date=datetime.now(),
)
d.save()

logging.info("Adding content for id %s" % d.id)

# Chapters are numbered from 1 as they are attached below.
chapter_ordinal = 1

# Do we have parts?
if len(xml.xpath("//tei:div[@type='part']", namespaces={'tei': TEI})) > 0:
    part_ordinal = 1
    for part in xml.xpath("//tei:div[@type='part']", namespaces={'tei': TEI}):
        part_id = xpath('@xml:id', part)
        part_title = xpath('tei:head[1]', part) 
        logging.debug("Adding part", part_title.encode('utf-8'))
        p = d.part_set.create(id=part_id,
                              title=part_title,
                              ordinal=part_ordinal,
                              label='part')