Пример #1
0
 def fixLinks(text, parser):
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                 parser=parser)
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         if not abs_url_regex.search(href):
             new_href = re.sub(r'rss/index\.html$', 'rss/index.rss',
                               href)
             new_href = re.sub(r'/index\.html$', '/', new_href)
             new_href = re.sub(r'index\.html$', '.', new_href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     for element in d('link'):
         e = PyQuery(element)
         href = e.attr('href')
         if href is not None and not abs_url_regex.search(href):
             new_href = re.sub(r'rss/index\.html$', 'rss/index.rss',
                               href)
             new_href = re.sub(r'\?v=.*$', '', href)
             e.attr('href', new_href)
             print "\t", href, "removed v =>", new_href
     for element in d('script'):
         e = PyQuery(element)
         href = e.attr('src')
         if href is not None and not abs_url_regex.search(href):
             new_href = re.sub(r'\?v=.*$', '', href)
             e.attr('src', new_href)
             print "\t", href, "removed v =>", new_href
     if parser == 'html':
         return "<!DOCTYPE html>" + d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
Пример #2
0
        def fixLinks(text, parser):
            """Rewrite links for static hosting and swap the site domain.

            First pass: normalize relative <a> hrefs (rss feed extension,
            index.html -> /).  Second pass: for every link-bearing element
            type, replace ``--domain`` with ``--target-domain`` (from the
            docopt ``arguments`` dict) and make URLs protocol-relative.
            Returns the serialized document as UTF-8 bytes.
            """
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                        parser=parser)
            for element in d('a'):
                e = PyQuery(element)
                href = e.attr('href')
                if not abs_url_regex.search(href):
                    new_href = re.sub(r'rss/index\.html$', 'rss/index.rss',
                                      href)
                    new_href = re.sub(r'/index\.html$', '/', new_href)
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href
            # element/attribute combinations that may carry URLs needing
            # the domain swap
            link_element_types = [
                'script', 'meta', 'a', 'link', 'img', 'amp-img'
            ]
            link_attributes = ['href', 'content', 'src', 'url']
            for element_type in link_element_types:
                for element in d(element_type):
                    e = PyQuery(element)
                    for a in link_attributes:
                        old_a = e.attr(a)
                        if old_a:
                            new_a = old_a.replace(arguments['--domain'],
                                                  arguments['--target-domain'])
                            # collapse any scheme to a protocol-relative URL
                            new_a = re.sub(r'^[a-z]+://', '//', new_a)
                            e.attr(a, new_a)
                            print "\t", old_a, "=>", new_a

            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Пример #3
0
        def fix_meta_image_links(text, parser):
            """Point og:image / twitter:image meta tags at the target domain.

            ``target_domain`` has any "/static" suffix stripped before the
            substitution, because image URLs live at the bare host.
            Returns the serialized document as UTF-8 bytes.
            """
            filetext = text.decode('utf8')
            td_regex = re.compile(target_domain + '|')  # NOTE(review): unused below

            assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
            d = PyQuery(bytes(bytearray(filetext, encoding='utf-8')),
                        parser=parser)
            for share_class in [
                    'meta[property="og:image"], meta[name="twitter:image"]'
            ]:
                print "share_class : ", share_class
                for element in d(share_class):
                    e = PyQuery(element)
                    print "element : ", e
                    href = e.attr('content')
                    print "href : ", href
                    print "domain : ", domain
                    # image URLs must not point under /static
                    content_target_domain = target_domain.replace(
                        "/static", "")
                    print "target_domain : ", content_target_domain
                    new_href = re.sub(domain, content_target_domain, href)
                    e.attr('content', new_href)
                    print "\t", href, "=>", new_href
            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Пример #4
0
    def extract(self):
        """Build a ContentItem from the page: title, content and images.

        Scrapes div#rightdiv1 for the article body and pulls the first
        image URL out of the page's ``leftsmallimgurl[1]="..."`` inline
        JavaScript (the image is not in an <img> tag).
        """
        # drop HTML comments before parsing
        self.html = re.sub('<!--.*?-->', '', self.html)
        doc = PyQuery(self.html)
        content_node = doc('div#rightdiv1')
        content_node.remove('span.white12')
        item = ContentItem()
        content_node = content_node.__unicode__()
        img_all = []
        # capture the image URL embedded in inline JS
        img='leftsmallimgurl\[1\]\=\"(.*?)\"\;'
        ob = re.compile(img)
        imgs = ob.findall(doc.__unicode__())
        if not imgs:
            image=''
        else:
            image='<br/><img src="'+imgs[0]+'"/><br/>'
            img_all.append(self.getRealURI(imgs[0]))
        # prepend the image markup to the extracted body
        content_node=image+content_node
        item['image_urls'] = img_all

        item['title'] = self.title = doc('h1').text()
        item['content'] = self.content = content_node

        item['release_time'] = ''
#        item['release_switch_time'] = self.release_switch_time = time.time()
        item['source'] = u"瑞丽服饰网"
        item['author'] = ''
        item['pic_url'] = ''

        self.title = item['title']
        self.content = item['content']

        return item
Пример #5
0
        def fixLinks(text, parser):
            """Normalize relative <a> hrefs and repair doubled .jpg
            extensions inside <img> srcset attributes; returns the
            serialized document as UTF-8 bytes.
            """
            doc = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                          parser=parser)

            for anchor in doc('a'):
                node = PyQuery(anchor)
                link = node.attr('href')
                if abs_url_regex.search(link):
                    continue
                # rss feed gets its .rss extension; index pages collapse to /
                fixed = re.sub(r'rss/index\.html$', 'rss/index.rss', link)
                fixed = re.sub(r'/index\.html$', '/', fixed)
                node.attr('href', fixed)
                print("\t", link, "=>", fixed)

            # repair mangled jpg suffixes in responsive image sets
            for image in doc("img"):
                node = PyQuery(image)
                print("img:", node)
                srcset = node.attr("srcset")
                if not srcset:
                    continue
                repaired = srcset
                for broken in (r"\.jpgg ", r"\.jpgpg ", r"\.jpgjpg "):
                    repaired = re.sub(broken, ".jpg ", repaired)
                node.attr("srcset", repaired)
                print("\t", srcset, "=>", repaired)

            if parser == 'html':
                return doc.html(method='html').encode('utf8')
            return doc.__unicode__().encode('utf8')
Пример #6
0
 def fixLinks(text, parser):
     """Re-root <link>/<a> hrefs from ``domain`` onto ``target_domain``
     and normalize relative index.html anchors.  Returns UTF-8 bytes.
     """
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                 parser=parser)
     for element in d('link'):
         e = PyQuery(element)
         href = e.attr('href')
         if href:
             if href.find(domain) > -1:
                 # keep only the path, then re-root at the target domain
                 new_href = href.split(domain)[-1]
                 new_href = '{}{}'.format(target_domain, new_href)
                 e.attr('href', new_href)
                 print "\t", "fixed link ", href, "=> ", new_href
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         if href:
             if href.find(domain) > -1:
                 # anchors become domain-relative (path only)
                 new_href = href.split(domain)[-1]
                 e.attr('href', new_href)
                 print "\t", "Fixed ", href, "=> ", new_href
         # NOTE(review): checks the original href, not the rewrite above
         if href and not abs_url_regex.search(href):
             new_href = re.sub(r'rss/index\.html$', 'rss/index.rss',
                               href)
             new_href = re.sub(r'/index\.html$', '/', new_href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
Пример #7
0
        def fixLinks(text, parser):
            """Give extensionless relative links a .html suffix and strip
            numeric suffixes flagged by ``bad_url_regex``.

            Output is wrapped in a DOCTYPE/<html> shell and returned as
            bytes; empty input short-circuits to ''.
            """
            if text == '':
                return ''

            d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                        parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue
                if (not abs_url_regex.search(href)) or ('/rss/' in href):
                    # append .html to a trailing extensionless path segment
                    new_href = re.sub(r"/([\w-]+)$", r"/\1.html", href)
                    new_href = re.sub(r"^([\w-]+)$", r"\1.html", new_href)
                    if href != new_href:
                        e.attr('href', new_href)
                        print "\t", href, "=>", new_href

                # re-read: the attribute may have just been rewritten
                href = e.attr('href')
                if bad_url_regex.search(href):
                    # drop a trailing ".N"/".NN" version-like suffix
                    new_href = re.sub(r'(.+)\.[0-9]{1,2}$', r'\1', href)
                    e.attr('href', new_href)
                    print "\t FIX! ", href, "=>", new_href
            if parser == 'html':
                return "<!DOCTYPE html>\n<html>" + d.html(
                    method='html').encode('utf8') + "</html>"
            return "<!DOCTYPE html>\n<html>" + d.__unicode__().encode(
                'utf8') + "</html>"
Пример #8
0
    def extract(self):
        """Extract a pconline article into a ContentItem.

        Pulls the body from div.art_con (minus pagination/nav chrome),
        non-GIF image URLs, the <h1> title (with a fallback selector)
        and a ``20xx年xx月xx日`` release date from the article header.
        """
        item = ContentItem()

        # strip HTML comments before any parsing
        self.html = re.sub('<!--.*?-->', '', self.html)
        content_node = self.hxs.select("//div[@class = 'art_con']").extract()
        content_node = PyQuery(content_node[0])

        # remove pagination, mobile links and header/footer chrome
        content_node.remove('div[class = "pconline_page"]')
        content_node.remove('div[class = "pc3g"]')
        content_node.remove('div[class = "pageTips"]')
        content_node.remove('div[class = "art_nav_box mt10"]')
        content_node.remove('div[class = "art_bottom"]')
        content_node.remove('div[class = "art_con_top"]')



        item['image_urls'] = [self.getRealURI(img.get('src')) for img in content_node('img') if not img.get('src').endswith('.gif')]
        item['title'] = self.title = self.hxs.select("//h1/text()").extract()[0]
        if not item['title']:
            item['title'] = self.title = self.hxs.select("//div[@id = 'UC_newsInfoDetail_lbl_newsTitle']/text()").extract()[0]
        item['content'] = self.content = content_node.__unicode__()
        release_time = self.hxs.select("//div[@class = 'art_con_top']").extract()[0]
        doc_t = PyQuery(release_time)
        release_time = doc_t('span').text()
        # dates look like 20xx年xx月xx日
        p = re.compile(u'20\d\d年\d\d月\d\d日')
        #item['release_time'] = self.release_time = doc('div[class="art_con_top"]').find('span').eq(0).text()
        item['release_time'] = self.release_time = p.search(release_time).group()
        item['source'] = u'pconline'
        item['author'] = ''
        item['pic_url'] = ''

        return item
Пример #9
0
 def fixLinks(text, parser):
     """Normalize relative <a> hrefs and repair doubled .png extensions
     in <img> srcset attributes.

     Returns the serialized document (text; no encoding applied here).
     """
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         if not abs_url_regex.search(href):
             new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', href)
             new_href = re.sub(r'/index\.html$', '/', new_href)
             e.attr('href', new_href)
             print("\t", href, "=>", new_href)

     for element in d('img'):
         e = PyQuery(element)
         srcset = e.attr('srcset')

         # IDIOM FIX: compare with None using ``is not`` (was ``!=``)
         if srcset is not None:
             new_srcset = re.sub(r'\.pngg', '.png', srcset)
             new_srcset = re.sub(r'\.pngng', '.png', new_srcset)
             new_srcset = re.sub(r'\.pngpng', '.png', new_srcset)
             e.attr('srcset', new_srcset)

             # only log when something actually changed
             if srcset != new_srcset:
                 print("\t", srcset, "=>", new_srcset)

     if parser == 'html':
         return d.html(method='html')
     return d.__unicode__()
Пример #10
0
        def fixLinks(text, parser):
            """Normalize <a>/<link> hrefs: drop query strings from
            relative URLs, then point rss links at rss/index.rss and
            collapse index.html.  Returns UTF-8 bytes.
            """
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue

                print '// Drop queryString in included src'
                print 'from: ', href
                result = urlparse(href)

                if result.scheme == 'https':
                    href = href  # absolute https URL: left untouched
                elif result.scheme == '':
                    # relative URL: keep only path (+ fragment), drop query
                    href = result.path + (('#' + result.fragment) if result.fragment != '' else '')
                print 'to: ', href

                new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
                if not abs_url_regex.search(href):
                    new_href = re.sub(r'/index\.html$', '/', new_href)

                if href != new_href:
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href

            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Пример #11
0
    def extract(self):
        """Extract a Tiexue forum post into a ContentItem.

        Picks the original-poster div (skipping a quoted "fromposty"
        block when present), collects non-GIF image URLs, clears the
        near-white inline style used to hide text, and fills in title
        and author.
        """
        item = ContentItem()
        self.html = re.sub('<!--.*?-->', '', self.html)

        tz_title=self.hxs.select("//h1/text()").extract()
        content=self.hxs.select("//ul[@class='content']/li/div").extract()
        tz_content=''
        for con in content:
            if "fromposty" in con:
                # a quoted repost is present: real body is the third div
                tz_content=self.hxs.select("//ul[@class='content']/li/div")[2].extract()
                break
            else:
                tz_content=self.hxs.select("//ul[@class='content']/li/div")[1].extract()

        release_time=self.hxs.select("//div[@class='gray']/text()").extract()

        # harvest image URLs from the raw markup, skipping GIFs
        imgs=PyQuery(tz_content)
        ob=re.compile('src="(.*?)"')
        imgs=ob.findall(imgs.__unicode__())
        img_all=[]
        for img in imgs:
            if ".gif" in img:
                continue
            if ".GIF" in img:
                continue
            else:
                img_all.append(self.getRealURI(img))

        author=self.hxs.select("//td[@class='bbsname']/b/span/a/text()").extract()
        tz_content = PyQuery(tz_content)
        # near-white text is effectively hidden; drop the style attribute
        cont_div = tz_content('div[style = "color:#FCFCCC"]')
        for cont in cont_div:
            cont_div.eq(cont_div.index(cont)).removeAttr('style')
        tz_content = tz_content.__unicode__()
        item['image_urls'] = img_all
        item['title'] = self.title = tz_title[0].strip()
        item['content'] = self.content = tz_content
        item['release_time'] = ''
        item['source'] = u"铁血网"
        item['author'] = author[0]

        item['pic_url'] = ''
#        item['release_switch_time'] = self.release_switch_time = time.mktime(time.strptime(self.release_time,u'%Y-%m-%d %H:%M'))

        return item
Пример #12
0
 def fixLinks(text):
     """Collapse relative ``.../index.html`` anchors to their directory
     (HTML parser only).  Returns the document as UTF-8 bytes.
     """
     d = PyQuery(text, parser='html')
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         if not abs_url_regex.search(href):
             new_href = re.sub(r'/index\.html$', '/', href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     return d.__unicode__().encode('utf8')
Пример #13
0
 def fixLinks(text, parser):
     """Normalize relative <a> hrefs (rss feed extension, index.html -> /)
     and return the serialized document as UTF-8 bytes.
     """
     doc = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for anchor in doc('a'):
         node = PyQuery(anchor)
         link = node.attr('href')
         # absolute URLs are left untouched
         if abs_url_regex.search(link):
             continue
         fixed = re.sub(r'rss/index\.html$', 'rss/index.rss', link)
         fixed = re.sub(r'/index\.html$', '/', fixed)
         node.attr('href', fixed)
         print("\t", link, "=>", fixed)
     if parser == 'html':
         return doc.html(method='html').encode('utf8')
     return doc.__unicode__().encode('utf8')
Пример #14
0
 def fixLinks(text, parser):
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         if not abs_url_regex.search(href):
             new_href = re.sub(r'/index\.html$', '/', href)
             new_href = re.sub(r'index.html', '/', new_href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
Пример #15
0
        def fixLinks(text, parser):
            """Rewrite relative/rss hrefs for clean static URLs and emit a
            full document shell (DOCTYPE/<html> or XML prolog) as bytes.
            """
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                        parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue
                if (not abs_url_regex.search(href)) or ('/rss/' in href):
                    new_href = re.sub(r'rss/$', 'feed.rss', href)
                    new_href = re.sub(r'index\.html$', '', new_href)
                    # also strip the trailing "index.html#" variant
                    new_href = re.sub(r'index\.html\#$', '', new_href)
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href
            if parser == 'html':
                return "<!DOCTYPE html>\n<html>" + d.html(
                    method='html').encode('utf8') + "</html>"
            elif parser == 'xml':
                return "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + d.__unicode__(
                ).encode('utf8')
            return "<!DOCTYPE html>\n<html>" + d.__unicode__().encode(
                'utf8') + "</html>"
Пример #16
0
 def fix_share_links(text,parser):
     """Point social share icon hrefs at the target domain.

     Requires module-level ``domain`` and ``target_domain``; returns
     the serialized document as UTF-8 bytes.
     """
     td_regex = re.compile(target_domain + '|' )  # NOTE(review): unused below

     assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for share_class in ['.icon-twitter','.icon-facebook','.icon-google-plus']:
         for element in d(share_class):
             e = PyQuery(element)
             href = e.attr('href')
             new_href = re.sub(domain, target_domain, href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
Пример #17
0
def fix_links(text, parser):
    """Strip superfluous index.html from relative hyperlinks and give
    the rss index its .rss extension.  Returns UTF-8 bytes.
    """
    # remove superfluous "index.html" from relative hyperlinks found in text
    # scheme-qualified or protocol-relative URLs count as absolute
    abs_url_regex = re.compile(r'^(?:[a-z]+:)?//', flags=re.IGNORECASE)

    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a'):
        e = PyQuery(element)
        href = e.attr('href')
        if not abs_url_regex.search(href):
            new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', href)
            new_href = re.sub(r'/index\.html$', '/', new_href)
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
Пример #18
0
    def extract(self):
        self.html = re.sub('<!--.*?-->', '', self.html)
        doc = PyQuery(self.html)
        content_node = doc('div.kb_zw')
        if not content_node:
#            content_node = doc('div.zw_text')
            content_node = PyQuery(self.hxs.select("//div[@class = 'zw_text']").extract()[0])
        
        content_node.remove('script')
        content_node.remove('style')
        content_node.remove('iframe')
        content_node.remove('div[style = "float:left; width:303px; height:250px; display:inline; margin:10px 10px 10px 10px;"]')
        content_node.remove('input')
        
        

        item = ContentItem()
        item['title'] = self.title = doc('td[align = "center"]')('b').text()
        if item['title'] == None:
            item['title'] = self.title = doc('div.zw_bt').text()
        if item['title'] == None:
            item['title'] = self.title = doc('h1.zw_title').text()
        
        
        item['release_time'] = ''
        
        item['source'] = u"新浪"
        item['author'] = ''
        item['pic_url'] = ''

        imgs = content_node('img')
        image_urls = []
        for img in imgs:
            if ".gif" in img.get('src'):
                continue
            if not img.get('src'):
                continue
            else:
                imgs.eq(imgs.index(img)).before('<br>')
                imgs.eq(imgs.index(img)).append('<br>')
                image_urls.append(self.getRealURI(img.get('src')))
        item['image_urls'] = image_urls

        content = content_node.__unicode__()
        item['content'] = self.content = content
        return item
Пример #19
0
        def fix_share_links(text, parser):
            """Rewrite .share_links anchor hrefs from ``domain`` to
            ``target_domain``.  Returns the document as UTF-8 bytes.
            """
            filetext = text.decode('utf8')
            td_regex = re.compile(target_domain + '|')  # NOTE(review): unused below

            assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
            d = PyQuery(bytes(bytearray(filetext, encoding='utf-8')),
                        parser=parser)
            for share_class in ['.share_links a']:
                for element in d(share_class):
                    e = PyQuery(element)
                    href = e.attr('href')
                    new_href = re.sub(domain, target_domain, href)
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href
            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Пример #20
0
    def extract(self):
        """Extract a 17173 forum first post into a ContentItem.

        Collects title, author, post time (``20xx...xx`` pattern),
        non-GIF images (each isolated with <br> tags), and the .p14 body
        with rating widgets and inline div styles removed.
        """
        self.html = re.sub('<!--.*?-->', '', self.html)
        doc = PyQuery(self.html)
        content_node = doc('.firstTopic')('div')
        # drop scripts and rating/affix widgets from the post body
        content_node.remove('script')
        content_node.remove('.rate')
        content_node.remove('.affixContent')
        content_node.remove('.thread_gold')


        item = ContentItem()
        imgs = content_node('.p14')('img')
        img_all = []
        for img in imgs:
            if".gif" in img.get('src'):
                continue
            else:
                # isolate each image on its own line
                imgs.eq(imgs.index(img)).append('<br>')
                imgs.eq(imgs.index(img)).before('<br>')
                img_all.append(self.getRealURI(img.get('src')))
        item['image_urls'] = img_all

        item['title'] = self.title = doc('#thread_title').text()
        content = content_node('.p14').__unicode__()
        content = PyQuery(content)
        # clear inline styles on divs inside the body
        del_style = content('div')
        for d in del_style:
            if d.get('style'):
                # NOTE(review): pyquery's .attr is normally called or used
                # via attribute access; confirm subscript assignment works.
                del_style.eq(del_style.index(d)).attr['style'] = ''

        content.remove('dl.rate_list')
        content.remove('span[style = "font-size:12px"]')
        content.remove('dl.rate')
        item['content'] = self.content = content.__unicode__()

        release_time=doc('.firstTopic')('.postTime').text()
        # post times start with a 20xx year
        ob=re.compile(u'20\d\d.*\d\d')
        release_time=ob.findall(release_time)

        item['release_time'] = release_time[0]
#        item['release_switch_time'] = self.release_switch_time = time.mktime(time.strptime(release_time[0],u'%Y-%m-%d %H:%M:%S'))
        item['source'] = u"17173论坛"
        item['author'] = doc('.th1').eq(0).text()
        item['pic_url'] = ''

        return item
Пример #21
0
        def fix_meta_url_links(text, parser):
            """Point og:url / twitter:url meta tags at the target domain.
            Returns the serialized document as UTF-8 bytes.
            """
            filetext = text.decode('utf8')
            td_regex = re.compile(target_domain + '|')  # NOTE(review): unused below

            assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
            d = PyQuery(bytes(bytearray(filetext, encoding='utf-8')),
                        parser=parser)
            # NOTE(review): the combined selector already covers the two
            # single selectors, so matching tags are visited again; the
            # repeat substitution finds nothing left to replace.
            for share_class in [
                    'meta[property="og:url"], meta[name="twitter:url"]',
                    'meta[property="og:url"]', 'meta[name="twitter:url"]'
            ]:
                for element in d(share_class):
                    e = PyQuery(element)
                    href = e.attr('content')
                    new_href = re.sub(domain, target_domain, href)
                    e.attr('content', new_href)
                    print "\t meta fixed", href, "=>", new_href
            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Пример #22
0
def fix_href_links(text, parser, page_slug):
    """Normalize relative <a> hrefs, prefix internal '#' links with the
    page slug, and re-root everything under REMOTE_PATH.

    The incoming page_slug is recomputed from the parsed document.
    Returns the serialized document as UTF-8 bytes.
    """
    doc = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    page_slug = find_page_slug(doc)
    for anchor in doc('a'):
        node = PyQuery(anchor)
        href = node.attr('href')
        #print("\thref", href)
        # no href means a named anchor; absolute URLs are left alone
        if href is None or abs_url_regex.search(href):
            continue
        rewritten = re.sub(r'rss/index\.html$', 'rss/index.rss', href)
        rewritten = re.sub(r'/index\.html$', '/', rewritten)
        if '#' in rewritten:
            print("\t\tfound an internal link: ", rewritten)
            rewritten = page_slug + rewritten
        node.attr('href', REMOTE_PATH + rewritten)
        print("\t", href, "=>", node.attr('href'))
    if parser == 'html':
        return doc.html(method='html').encode('utf8')
    return doc.__unicode__().encode('utf8')
Пример #23
0
        def fixLinks(text, parser):
            """Point rss links at rss/index.rss and collapse relative
            index.html links.  Returns the document as UTF-8 bytes.
            """
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue

                # (?<!\.) keeps file names like "foo.rss" from matching
                new_href = re.sub(r'(rss/index\.html)|((?<!\.)rss/?)$', 'rss/index.rss', href)
                if not abs_url_regex.search(href):
                    new_href = re.sub(r'/index\.html$', '/', new_href)

                if href != new_href:
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href

            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Пример #24
0
 def fixLinks(text, parser):
     """Normalize anchors and strip cache-busting query strings.

     - <a>: rss links -> rss/index.rss; relative index.html -> /.
     - <link>: drop "?..." except on Google Fonts URLs (they need it).
     - <script>: drop "?..." from src.
     Returns the serialized document as UTF-8 bytes.
     """
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         print href
         if href is None:
             continue
         new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
         if not abs_url_regex.search(href):
             new_href = re.sub(r'/index\.html$', '/', new_href)
         if href != new_href:
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     # remove ?v=XXXXXXXXX in css
     for element in d('link'):
         e = PyQuery(element)
         href = e.attr('href')
         if href is None:
             continue
         if re.match(r'http://fonts',href) is not None:
             continue  # font URLs rely on their query string
         new_href = re.sub(r'\?.*', '',href)
         if href != new_href:
             e.attr('href',new_href)
             print "\t", href, "=>", new_href
     # remove ?v=XXXXXXXXX in js
     for element in d('script'):
         e = PyQuery(element)
         src = e.attr('src')
         if src is None:
             continue
         new_src = re.sub(r'\?.*', '',src)
         if src != new_src:
             e.attr('src',new_src)
             print "\t", src, "=>", new_src
     ###################
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
Пример #25
0
        def fixLinks(text, parser):
            """Fix doubled image extensions and hardcoded hosts via plain
            string replaces, then normalize relative <a> hrefs.
            Returns the serialized document as UTF-8 bytes.
            """
            #extremely lazy implementation - beware.
            # repair doubled .png extensions (both cases)
            text = text.replace('pngg', 'png')
            text = text.replace('pngng', 'png')
            text = text.replace('pngpng', 'png')

            text = text.replace('PNGG', 'PNG')
            text = text.replace('PNGNG', 'PNG')
            text = text.replace('PNGPNG', 'PNG')

            # repair doubled .jpg / .jpeg extensions
            text = text.replace('jpgg', 'jpg')
            text = text.replace('jpgpg', 'jpg')
            text = text.replace('jpgjpg', 'jpg')

            text = text.replace('jpegg', 'jpeg')
            text = text.replace('jpegeg', 'jpeg')
            text = text.replace('jpegpeg', 'jpeg')

            # swap the local Ghost host for the public blog URL
            text = text.replace('http://localhost:2368/',
                                'https://blog.lucaperic.com/')
            text = text.replace(
                'https://feedly.com/i/subscription/feed/https://blog.lucaperic.com/rss/',
                'https://feedly.com/i/subscription/feed/https://blog.lucaperic.com/rss/index.rss'
            )
            text = text.replace('/author/luca/rss/', '/rss/index.rss')
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                        parser=parser)
            for element in d('a'):
                e = PyQuery(element)
                href = e.attr('href')
                if not abs_url_regex.search(href):
                    new_href = re.sub(r'rss/index\.html$', 'rss/index.rss',
                                      href)
                    new_href = re.sub(r'/index\.html$', '/', new_href)
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href
            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Пример #26
0
    def extract(self):
        """Extract a Tiexue article into a ContentItem.

        Pulls title, body (div.text), a release time matching
        ``20xx...:xx``, and non-GIF image URLs; clears the light-gray
        inline style used to render text nearly invisible.
        """
        item = ContentItem()
        self.html = re.sub('<!--.*?-->', '', self.html)

        tz_title=self.hxs.select("//h1/text()").extract()
        tz_content=self.hxs.select("//div[@class='text']").extract()
        release_time=self.hxs.select("//div[@class='user']/ul/li/text()").extract()
        # post times start with a 20xx year and end with minutes
        ob=re.compile(u'20\d\d.*:\d\d')
        release_time=ob.findall(release_time[0])

        imgs=self.hxs.select("//div[@class='text']/div/div/p/a/img/@src").extract()
        img_all=[]
        for img in imgs:
            if ".gif" in img:
                continue
            if ".GIF" in img:
                continue
            else:
                img_all.append(self.getRealURI(img))

        item['image_urls'] = img_all
        item['title'] = self.title = tz_title[0]

        content = tz_content[0]
        content_html = PyQuery(content)
        # light-gray text is effectively invisible; drop the style
        cont_div = content_html('div[style = "color:#f9f9f9"]')
        for cont in cont_div:
            cont_div.eq(cont_div.index(cont)).removeAttr('style')
        content_html = content_html.__unicode__()
        item['content'] = self.content = content_html
        item['release_time'] = release_time[0]
        item['source'] = u"铁血网"
        item['author'] = ''

        item['pic_url'] = ''
#        item['release_switch_time'] = self.release_switch_time = time.mktime(time.strptime(self.release_time,u'%Y-%m-%d %H:%M'))

        return item