Example #1
def no_fonts (pq):  # yuk - lxml etree and PyQuery objs get confused - nested ones arent removed, this goes only 2 levels
    raise Exception ("yuk - it's a mess, use tidy!")

    pq = PyQuery (pq)
    #print fonts.__class__.__name__
    for font in pq ('font'):
        font = PyQuery (font)
        #font ('a').remove()
        #print font.__class__.__name__
        #print len (font), font [0]
        #print dir (font)
        #import sys
        #sys.exit()

        #inner = innerhtml (font)  # .text() #.replace (':','').strip()
        #print 'Replacing font with:', font.html()
        font.replaceWith (font.html())
        #font.getparent().replace (font, PyQuery (inner))
        print 'font replaced:', font [:60]

        #font = no_fonts (font)

    for font in pq ('font'):
        font = PyQuery (font)
        font.replaceWith (font.html())
        print 'font 2 replaced:', font [:60]

    return pq
Example #2
def ReadURL(url):
  trytime = 0
  pq = None
  while (trytime < 3):
    try:
      pq = PyQuery(url = url)
      break
    except Exception:
      print 'Exception!', url
      trytime += 1
      time.sleep(SLEEP_BETWEEN_REQUEST)  # back off before the next attempt
  if pq is None or pq.html() is None:
    return ''
  return pq.html()
Example #3
def _split(inputfile, outputdir):
    source = open(inputfile, 'r')
    html = source.read()
    source.close()

    if not os.path.isdir(outputdir):
        os.mkdir(outputdir)

    idx_slide=0
    idx_section=0

    parsed = PyQuery(html)
    
    for section in parsed('section'):
        slide = PyQuery(section)        
        if slide.has_class('stack'):
            idx_section+=1
            stack_path = os.path.join(outputdir,'%02d' % idx_section )
            os.mkdir(stack_path)
            for sub_slide in PyQuery(slide.html())('section'):
                idx_slide+=1
                _dump_slide(sub_slide, idx_slide, stack_path)
        else: 
            if not slide.parent().has_class('stack'):
                idx_slide+=1
                _dump_slide(slide, idx_slide, outputdir)                    
Example #4
    def _enhance_text(self):
        """
        Transforms a simplified text into a valid mail.template text.
        :return: mail.template text
        """
        self.ensure_one()
        # Parse and set back the keywords into raw template code
        html_text = PyQuery(self.simplified_text.replace('\n', ''))

        def sort_keywords(kw):
            # Replace first if/for-clauses, then var, then code
            index = kw.position
            if kw.type == 'if' or 'for' in kw.type:
                index += 2*len(self.body_html) * kw.nested_position
                # Take if and for in the appearing order in the text
                index -= kw.position
            elif kw.type == 'var':
                index += len(self.body_html)
            return index

        keywords = self.keyword_ids.sorted(sort_keywords, reverse=True)
        # Replace automatic-generated keywords
        for keyword in keywords:
            keyword_text = html_text('#' + keyword.html_id)
            keyword_text.replace_with(keyword.final_text)

        # Replace user added keywords
        template_text = html_text.html()
        for keyword in keywords.filtered(lambda k: k.type == 'code'):
            to_replace = u"[{}]".format(keyword.short_code)
            template_text = template_text.replace(to_replace, keyword.raw_code)
        final_text = PyQuery(BeautifulSoup(template_text).prettify())
        return final_text('body').html()
Example #5
def test_mount_tag():
    root = PyQuery('<root></root>')
    tag = {'name': 'custom', 'html': '<custom><text>{opts.txt}</text></custom>'}
    dom = vdom.mount_tag(root, tag, {'txt': 'hello world'})
    assert dom and dom.uuid # dom created
    assert vdom.get_dom(dom.uuid) # dom cached
    assert root.html() # mounted something
Example #6
def scrape(slug, url, name, title=None):
    f = urlopen(url)
    doc = f.read()

    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            #'indent':1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # raise Exception, errs
        print errs

    doc = html5lib.parse(doc, treebuilder="lxml")  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    td = jQuery("td#content")
    assert len(td) == 1

    for img in td("img"):
        # print 'img:', PyQuery (img)
        img = PyQuery(img)
        src = img.attr("src")
        # alt = img.attr('alt')

        # if src.startswith ('/image'):
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt

    # td =
    # no_fonts (td)

    # need to fix links here

    content = PyQuery(td[0])
    # content = content.html()
    content = no_namespaces(content.html())

    print slug, content[:60]  # .html()  # [:60]

    if dbteeth:
        # q, created = QuickPage.objects.get_or_create (

        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                # defaults = dict (sortorder = sortorder),
            ),
        )
Example #7
        def fixLinks(text, parser):
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue

                print '// Drop queryString in included src'
                print 'from: ', href
                result = urlparse(href)

                if result.scheme == '':
                    href = result.path + (('#' + result.fragment) if result.fragment != '' else '')
                print 'to: ', href
  
                new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
                if not abs_url_regex.search(href):
                    new_href = re.sub(r'/index\.html$', '/', new_href)

                if href != new_href:
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href

            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Example #8
def sanitize_description(value):
    cleaned = PyQuery(value)
    cleaned.remove('span.playMetaText')
    cleaned.remove('time')
    cleaned.remove('strong')

    return cleaned.html().split('<span>')[-1].replace('</span>', '')
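
A minimal usage sketch for the function above; the input markup is invented for illustration:

from pyquery import PyQuery

html = '<p><span class="playMetaText">3 min</span><span>An actual description.</span></p>'
print(sanitize_description(html))  # -> 'An actual description.'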
Example #9
def sanitize_html2(value):
    soup = PyQuery(value)
    soup.remove("span.playMetaText")
    soup.remove("time")
    soup.remove("strong")

    return soup.html().split("<span>")[-1]
Example #10
    def get_pastes ( self ):
        Logger ().log ( 'Getting pastes', True )
        try:
            page = PyQuery ( url = self.PASTES_URL )
        except KeyboardInterrupt:
            raise
        except:
            return self.CONNECTION_FAIL,None


        """
        There are a set of encoding issues which, coupled with some bugs in etree (such as in the Raspbian packages) can
        trigger encoding exceptions here. As a workaround, we try every possible encoding first, and even if that fails,
        we resort to a very hacky workaround whereby we manually get the page and attempt to encode it as utf-8. It's
        ugly, but it works for now.
        """
        try:
            page_html = page.html ()
        except KeyboardInterrupt:
            raise
        except:
            worked = False
            for enc in all_python_encodings():
                try:
                    page_html = page.html(encoding=enc)
                    worked = True
                    break
                except KeyboardInterrupt:
                    raise
                except:
                    pass
            if not worked:
                # One last try...
                try:
                    f = urllib.request.urlopen(Crawler.PASTES_URL)
                    page_html = PyQuery(str(f.read()).encode('utf8')).html()
                    f.close()
                except KeyboardInterrupt:
                    raise
                except:
                    return self.OTHER_ERROR, None
        if re.search ( r'Pastebin\.com - Access Denied Warning', page_html, re.IGNORECASE ) or 'blocked your IP' in page_html:
            return self.ACCESS_DENIED,None
        else:
            return self.OK,page('.maintable img').next('a')
Example #11
def clean_body(body):
	site = Site.objects.get_current()
	html = PyQuery('<body>' + body + '</body>')
	
	for p in html('p'):
		p = PyQuery(p)
		p.replaceWith('\n\n%s\n\n' % p.html())
	
	html('.alignright').addClass('pull-right').removeClass('alignright')
	html('.alignleft').addClass('pull-left').removeClass('alignleft')
	html('[style="float: left;"]').removeAttr('style').addClass('alignleft')
	html('[style="float: right;"]').removeAttr('style').addClass('alignright')
	
	body = html.html()
	body = body.replace('<br />', '  \n')
	body = body.replace('<br/>', '  \n')
	body = body.replace('<br>', '  \n')
	body = body.replace('\r\n', '\n')
	body = body.replace('\n\r', '\n')
	
	while body.find('\n\n\n') > -1:
		body = body.replace('\n\n\n', '\n\n')
	
	# strip leading/trailing newlines, carriage returns and tabs
	return body.strip('\r\n\t')
Example #12
    def extract(self):
        self.html = re.sub('<!--.*?-->', '', self.html)
        doc = PyQuery(self.html)
        content_node = doc('div#contentText')
        
        content_node.remove('script')
        content_node.remove('style')
        content_node.remove('.line')
        content_node.remove('#shareIn')
        content_node.remove('.tagHotg')
        content_node.remove('.blank8')
        content_node.remove('div[class = "editShare clear"]')
        content_node.remove('select')
        #content_node.remove('table[width = "100%"]')('td[align = "center"]')
        content_node.remove('div[class = "jingbian_travel01_04"]')
        content_node.remove('div[class = "txt2"]')
        content_node.remove('iframe')
        content_node.remove('embed')
        content_node.remove('td[style = "font-size: 14px; font-weight: bold;"]')
        content_node.remove('table[style = "margin-right: 20px;"]')
        content_node.remove('.digi_perpage_bottom')
        content_node.remove('div[class = "extract clear"]')
        content_node.remove('table[bgcolor = "#eeeeee"]')
        content_node.remove('img[alt = "搜狐教育频道"]')
        content_node.remove('table[bgcolor = "#e2e2e2"]')
        content_node.remove('table[bgcolor = "#66ccff"]')
        content_node.remove('div[class = "digi_digest"]')
        item = ContentItem()
        imgs = content_node('img')
        img_all = []
        for img in imgs:
            src = img.get('src') or ''
            if ".gif" in src:
                continue
            imgs.eq(imgs.index(img)).append('<br>')
            imgs.eq(imgs.index(img)).before('<br>')
            img_all.append(self.getRealURI(src))
        item['image_urls'] = img_all
        
        item['title'] = self.title = doc('h1').text()
        item['content'] = self.content = content_node.__unicode__()
        t = re.compile(u'var club_artinputdate = "(.*?)";')
        release_time = t.search(doc.html())
        if release_time:
            item['release_time'] = self.release_time = release_time.group(1)
#        item['release_switch_time'] = time.mktime(time.strptime(t.search(doc.html()).group(1),'%Y-%m-%d %H:%M:%S'))
        item['source'] = u'搜狐'
        author = doc('div[class = "function clear"]')
        self.author = author('div.l')('a').text()
        item['author'] = self.author
        item['pic_url'] = ''
        
        return item
Example #13
def sanitize_description(value):
    cleaned = PyQuery(value)
    cleaned.remove('span.playMetaText')
    cleaned.remove('span.playCount')
    cleaned.remove('time')
    cleaned.remove('strong')

    desc = cleaned.html()

    if desc is None:
        return ""

    return desc.split('<span>')[-1].replace('</span>', '').strip()
Example #14
 def fixLinks(text, parser):
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         if href is None:
             continue
         if not abs_url_regex.search(href):
             new_href = re.sub(r'/index\.html$', '/', href)
             new_href = re.sub(r'index\.html', '/', new_href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
Example #15
 def render_md5(self, post_content):
     config = Config()
     self.body = markdown2.markdown(
         post_content,
         extras=config.mdextras,
     )
     # rewrite relative img-srcs to full paths.
     d = PyQuery(self.body)
     for img in d.find('img'):
         if '/' not in img.attrib['src']:
             img.attrib['src'] = '{}{}/{}'.format(config.blogurl,
                                                  self.outputpath,
                                                  img.attrib['src'])
     self.body = d.html()
Example #16
def plainify(html):
    doc = PyQuery('<body>%s</body>' % html)
    doc('img, audio, video, iframe, embed, object, script').remove()

    for a in doc('a, i, b, strong, em'):
        PyQuery(a).replaceWith(
            PyQuery(a).html()
        )

    for b in doc('blockquote'):
        PyQuery(b).replaceWith(
            PyQuery(b).html()
        )

    for a in doc('h1, h2, h3, h4, h5, h6'):
        PyQuery(a).replaceWith('<p>%s:</p>' % PyQuery(a).text())

    for p in doc('p'):
        t = (PyQuery(p).text() or '').strip()

        if not t:
            PyQuery(p).remove()
            continue

        if t[-1] not in string.punctuation:
            t += '. '

        if t.startswith('http:') or t.startswith('https:'):
            PyQuery(p).remove()
            continue

        if t.startswith('[') and t.endswith(']'):
            PyQuery(p).remove()
            continue

        PyQuery(p).html(t)

    for li in doc('li'):
        t = (PyQuery(li).text() or '').strip()
        if not t:
            PyQuery(li).remove()
            continue

        if t[-1] not in string.punctuation:
            t += '.'

        PyQuery(li).html(t)

    return html2text(
        doc.html()
    )
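
A short usage sketch, assuming string, html2text and PyQuery are imported as the function expects:

# headings become '<p>Intro:</p>' paragraphs and inline tags are unwrapped before html2text runs
print(plainify('<h2>Intro</h2><p>See <a href="/x">the docs</a></p>'))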
Example #17
 def fix_share_links(text,parser):
     td_regex = re.compile(target_domain + '|' )
     
     assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for share_class in ['.icon-twitter','.icon-facebook','.icon-google-plus']:
         for element in d(share_class):
             e = PyQuery(element)
             href = e.attr('href')
             new_href = re.sub(domain, target_domain, href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
Example #18
    def laundry_document(self, html, response):
        html = self.clean_html_document(html)

        #replace a tag to text
        #remove class / id
        cleaner = Cleaner(style=False, links=False, add_nofollow=False,
                          page_structure=True, safe_attrs_only=True)
        html = cleaner.clean_html(html)

        dom = PyQuery(html)
        dom = self.convert_imgs(dom, response)
        dom = self.remove_links(dom)
        html = dom.html()

        #need to remove empty tags
        return html
Example #19
def prepare_html(fileobj):
    """ prepares the html for wordpress pages """
    pq=PyQuery("".join(strip_if_not_pre(fileobj))) 

    out = PyQuery(pq("div.content").outerHtml() )
    # TODO: do we want to extract the title
    # Do we want title at all?
    if out("div.section"):
      out("div.section")[0].set("itemscope","true")
      out("div.section")[0].set("itemtype","http://schema.org/WebPage")
    if out("div.section > p > em"):
      out("div.section > p > em")[0].set("itemprop","author")
    if out("div.section p"):  
      if out("div.section > p > em"):
        out("div.section p")[1].set("itemprop","description") 
      else:  
        out("div.section p")[0].set("itemprop","description") 
    if pq("div.section h1"):
      title = pq("div.section h1")[0].text
      out("div.section h1").css("display", "none")
      # set schema.org microdata for sharing
      out("div.section h1")[0].set("itemprop", "name")
    else:
      title=""

    # TODO: insert toc (??)

    # insert after h1 on 4th line
    # lines = out.split('\n')
    # out = '\n'.join(lines[:4] + [ '[toc]' ] + lines[4:])

    # now various regex
    
    out=out.html()
    # replace .html with / and index.html with simple ./
    pattern = r'(internal" href=".[^"]*)index\.html"'
    out = re.sub(pattern, r'\1"', out)
    pattern = r'internal" href="index\.html"'
    out = re.sub(pattern, 'href="./"', out)
    pattern = r'(internal" href="[^"]*)\.html"'
    out = re.sub(pattern, r'\1/"', out)
    pattern = r'(internal" href="[^"]*)\.html#([^"]*)"'
    out = re.sub(pattern, r'\1/#\2"', out)

    return (out, title)
Example #20
        def fixLinks(text, parser):
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue

                new_href = re.sub(r'(rss/index\.html)|((?<!\.)rss/?)$', 'rss/index.rss', href)
                if not abs_url_regex.search(href):
                    new_href = re.sub(r'/index\.html$', '/', new_href)

                if href != new_href:
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href

            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Example #21
 def get_toc(self):
     def get_last_child(e, level):
         if level==0:
             return e
         if level>0:
             ee = e.children('ul:last-child')
             if not ee:
                 #e.append('<li>no title</li>')
                 e.append('<ul></ul>')
                 ee = e.children('ul:last-child')
             return get_last_child(ee, level-1)
 
     out = PyQuery('<ul></ul>')
     for tag in self.q('h1, h2, h3').items():
         level = int(tag[0].tag[1])-1
         assert(0<=level)
         aname = tag.attr('id')
         pp = get_last_child(out, level)
         pp.append(f'<li><a href="#{aname}">{tag.text()}</a></li>\n')
     return out.html() or ""
Example #22
    def to_xml(self):
        (_tag, contents) = list(self.iteritems())[0]
        pqi = PyQuery('<wrap />')

        def _append_contents(struct, par):
            tag = struct['tag']
            _node = PyQuery('<%s />' % tag)
            if 'attributes' in struct:
                for key in struct['attributes'].keys():
                    _node.attr(key, struct['attributes'][key])
            if 'text' in struct:
                _node.text(struct['text'])
            elif 'children' in struct:
                for (ugh, child) in struct['children'].iteritems():
                    _append_contents(child, _node)
            par.append(_node)

        _append_contents(contents, pqi)
        _xio = StringIO(pqi.html())
        _parsed = etree.parse(_xio)
        return etree.tostring(_parsed, pretty_print=True)
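
The nested-dict structure the walker expects is only implicit in the code; a hypothetical instance (keys invented for illustration; note the Python 2 iteritems() calls):

structure = {
    'note': {
        'tag': 'note',
        'attributes': {'lang': 'en'},
        'children': {
            'to': {'tag': 'to', 'text': 'Alice'},
            'body': {'tag': 'body', 'text': 'Hello'},
        },
    },
}
# to_xml() takes the first (tag, contents) pair from the mapping and emits pretty-printed XML for it.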
Example #23
 def fixLinks(text, parser):
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         print href
         if href is None:
             continue
         new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
         if not abs_url_regex.search(href):
             new_href = re.sub(r'/index\.html$', '/', new_href)
         if href != new_href:
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     # remove ?v=XXXXXXXXX in css
     for element in d('link'):
         e = PyQuery(element)
         href = e.attr('href')
         if href is None:
             continue
         if re.match(r'http://fonts',href) is not None:
             continue
         new_href = re.sub(r'\?.*', '',href)  
         if href != new_href:
             e.attr('href',new_href)
             print "\t", href, "=>", new_href     
     # remove ?v=XXXXXXXXX in js                  
     for element in d('script'):
         e = PyQuery(element)
         src = e.attr('src')
         if src is None:
             continue
         new_src = re.sub(r'\?.*', '',src) 
         if src != new_src:
             e.attr('src',new_src)
             print "\t", src, "=>", new_src
     ################### 
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
Example #24
 def refresh_text(self):
     """
     Save the current text in the if clause and refresh the selected
     clauses.
     """
     text = PyQuery(self.simplified_text)
     for key in self.keyword_ids.filtered(
             lambda k: k.type in ('if', 'for', 'for_ul')):
         text_selector = text('#' + key.html_id)
         current_text = text_selector.html()
         # Save the current text in the correct if clause
         if key.edit_changed % 2 == 0:
             edit_value = key.edit_value
         else:
             edit_value = not key.edit_value
         if current_text is not None:
             key.set_text(current_text, edit_value)
         if key.edit_changed and not self._context.get('save_mode'):
             # Now we fetch the current clause text
             text_selector.html(key.get_text())
             key.write({'edit_changed': 0})
     self.with_context(no_update=True).simplified_text = text.html()
     return True
Example #25
        def fixLinks(text, parser):
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                        parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue
                if (not abs_url_regex.search(href)) or ('/rss/' in href):
                    new_href = re.sub(r'rss/$', 'feed.rss', href)
                    new_href = re.sub(r'index\.html$', '', new_href)
                    new_href = re.sub(r'index\.html\#$', '', new_href)
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href
            if parser == 'html':
                return "<!DOCTYPE html>\n<html>" + d.html(
                    method='html').encode('utf8') + "</html>"
            elif parser == 'xml':
                return "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + d.__unicode__(
                ).encode('utf8')
            return "<!DOCTYPE html>\n<html>" + d.__unicode__().encode(
                'utf8') + "</html>"
Example #27
def GetBrands():
    mysql = pymysql.connect("localhost",
                            "root",
                            "root",
                            "test",
                            charset="utf8")
    url = "https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1%20&brandId=0%20&fctId=0%20&seriesId=0"
    r = GetHtml(url)
    doc = PyQuery(r.text)
    cartree = doc('.cartree')
    cursor = mysql.cursor()
    count = 0
    for pp in doc("ul"):
        pp1 = PyQuery(pp)
        for zipp in pp1('li'):
            a = PyQuery(zipp).find("a")
            title = a.html()
            title = re.findall("/>(.*)<em>", title)
            number = PyQuery(zipp).find("a").find("em").html()
            number = re.findall("[(](.*)[)]", number)
            a = 'https://car.autohome.com.cn' + a.attr("href")

            sql = "insert into brands (brand,count,url) values('%s','%s','%s')" % (
                title[0], number[0], a)
            try:
                # execute the SQL statement
                cursor.execute(sql)
                # commit the transaction
                mysql.commit()
                count = count + 1
            except:
                # roll back on error
                mysql.rollback()

    # close the database connection
    mysql.close()
    return count
Example #28
def build_dict_from_sane_json(elem: PyQuery, already_wrapped=False) -> dict:
    # Find if has children
    elem = PyQuery(elem)
    children = list(elem.contents())
    has_children = len(elem.children()) > 0

    contents = []
    if has_children:
        # Fix unwrapped children
        if not already_wrapped:
            children = fix_unwrapped_text(elem).contents()

        for child in children:
            child_dict = build_dict_from_sane_json(child, already_wrapped=True)
            if child_dict:
                contents.append(child_dict)
    else:
        contents = elem.html()

    extra = {}

    # Only tables need the HTML (to use later for extraction of relevant data)
    if elem.is_("table"):
        extra = {'original_html': str(elem)}

    if 'src' in elem[0].attrib:
        extra['src'] = elem.attr('src')
    if 'href' in elem[0].attrib:
        extra['href'] = elem.attr('href')

    return {
        'type': list(elem)[0].tag,
        'attrs': [],
        'layout': {},
        'contents': contents,
        'extra': extra
    }
Example #29
def prepare_html(fileobj):
    """ prepares the html for wordpress pages """
    pq=PyQuery("".join(strip_if_not_pre(fileobj))) 
    
    pq("a.headerlink").remove()
    # Do we want title at all?
    if pq("div.section h1"):
      title= pq("div.section h1")[0].text
      pq("div.section h1:first").remove()
    else:
      title=""

    # TODO: insert toc (??)

    out = PyQuery(pq("div.content").outerHtml() )
    # insert after h1 on 4th line
    # lines = out.split('\n')
    # out = '\n'.join(lines[:4] + [ '[toc]' ] + lines[4:])

    # now various regex
    
    out=out.html()
    print out
    # replace .html with / and index.html with simple ./
    pattern = r'(internal" href=".[^"]*)index\.html"'
    out = re.sub(pattern, r'\1"', out)
    pattern = r'internal" href="index\.html"'
    out = re.sub(pattern, 'href="./"', out)
    pattern = r'(internal" href="[^"]*)\.html"'
    out = re.sub(pattern, r'\1/"', out)
    pattern = r'(internal" href="[^"]*)\.html#([^"]*)"'
    out = re.sub(pattern, r'\1/#\2"', out)
    pattern = r'(internal" href="[^"]*/)index/#([^"]*)"'
    out = re.sub(pattern, r'\1/#\2"', out)

    return (out, title)
Example #30
    def get_readers_from_html_content(self, fname, html, **kwargs):
        try:
            from pyquery import PyQuery
        except:
            print >>sys.stderr, "could not import pyquery"
            return []

        parsers = []
        pq = PyQuery(html)
        tables = self.find_ideal_tables(pq('table'))

        for table_el in tables:
            try:
                table = PyQuery(table_el)
                p = HTMLTableParser(StringIO(table.html()), fname, **kwargs)
                i = p.get_data_iter()
                consistent, ncols = html_rows_consistent(i())
                if consistent and ncols > 1:
                    parsers.append(i)
            except KeyboardInterrupt:
                pass
            except Exception as e:
                _log.info(traceback.format_exc())
        return parsers
Example #31
        def fix_meta_url_links(text, parser):
            filetext = text.decode('utf8')
            td_regex = re.compile(target_domain + '|')

            assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
            d = PyQuery(bytes(bytearray(filetext, encoding='utf-8')),
                        parser=parser)
            for share_class in [
                    'meta[property="og:url"], meta[name="twitter:url"]'
            ]:
                print "share_class : ", share_class
                for element in d(share_class):
                    e = PyQuery(element)
                    print "element : ", e
                    href = e.attr('content')
                    print "href : ", href
                    print "domain : ", domain
                    print "target_domain : ", target_domain
                    new_href = re.sub(domain, target_domain, href)
                    e.attr('content', new_href)
                    print "\t", href, "=>", new_href
            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Example #32
    exit(1)
# csvwriter.writerow(csv_header)

# Query the first page with location option
url = "https://www.vrbo.com/search/keywords:chapel-hill-nc-usa/@35.874919139908165,-79.113394930114,35.95736065574712,-79.01846618865892,13z?petIncluded=false&ssr=true"
# url = 'https://www.homeaway.com/results/keywords:Chapel%20Hill%2C+NC%2C+USA%29/Page:'

results = [['' for i in range(5)] for j in range(200)]
result_count = 0
page = 1
while page < 2:
    x = PyQuery(url + str(page))
    html = x.html()  # serialize once; calling x.html() repeatedly re-renders the document
    start_location = end_location = 0
    while True:
        property_data = {}
        start_location = html.find('"bathrooms":', end_location) + 12
        end_location = html.find('}', start_location) + 1
        try:
            property_data = json.loads(html[start_location:end_location])
            results[result_count][
                0] = property_data['full'] + property_data['half'] * 0.5
        except:
            break
        start_location = html.find('"bedrooms":', end_location) + 11
        end_location = html.find(',', start_location)
        results[result_count][1] = html[start_location:end_location]

        start_location = html.find('"propertyType":', end_location) + 15
        end_location = html.find(',', start_location)
        results[result_count][2] = re.sub(
            r'"', '',
Example #33
class SummaryPublisherEngine(object):
    def __init__(self, pro, doc, wc, group, organization=None):
        self.project = pro
        self.document = doc
        self.word_count = wc
        self.groups = [group.key, Group.get_worldshare().key]
        self.organization = organization

        self.user = User()
        self.user.groups = self.groups
        self.walker = ConceptPublishWalker(pro)

        if organization:
            self.user.organization = organization.key

        self.html = ''
        self.body = Pq('<span></span>')
        self.con_count = 0
        self.paragraph = None

    def _get_next_concept(self):
        for level in self.walker:
            for concept in level:
                yield concept

    def render(self):
        cur_wc = 0
        concept_count = 0
        processed_concepts = {}

        for concept in self._get_next_concept():
            if concept:
                if not concept.has_permission_read(self.user):
                    continue

                render = True
                if not concept.is_summary_crawlable(document=self.document,
                                                    project=self.project):
                    render = False

                attr = concept.get_attr_by_doc(self.document)
                if attr and attr.is_header():
                    render = False
                if attr and attr.is_image():
                    render = False

                if render:
                    phrase = concept.get_phrasing(doc=self.document,
                                                  return_text=False)
                    wc = phrase.get_word_count()
                    if wc + cur_wc > self.word_count:
                        break
                    concept_count += 1
                    cur_wc += wc

                parent = concept.get_parent()
                if not processed_concepts.get(parent.id):
                    processed_concepts[parent.id] = []
                processed_concepts[parent.id].append(concept)

        paragraph_divider = 300
        paragraph_count = cur_wc // paragraph_divider
        if cur_wc % paragraph_divider > 0:
            paragraph_count += 1
        # guard against an empty summary, which would otherwise divide by zero below
        paragraph_count = max(paragraph_count, 1)

        con_pre_par = (concept_count // paragraph_count) + 1
        self.paragraph = Pq('<p></p>')
        self.body.append(self.paragraph)
        self.con_count = 0
        self._render(self.project, con_pre_par, processed_concepts)
        self.html = self.body.html(method='html')

    def _render(self, parent, con_pre_par, processed_concepts):
        if not processed_concepts.get(parent.id):
            return
        for concept in processed_concepts.get(parent.id):
            render = True
            if not concept.is_summary_crawlable(document=self.document,
                                                project=self.project):
                render = False

            attr = concept.get_attr_by_doc(self.document)
            if attr and attr.is_header():
                render = False
            if attr and attr.is_image():
                render = False

            if render:
                if self.con_count == con_pre_par:
                    self.con_count = 0
                    self.paragraph = Pq('<p></p>')
                    self.body.append(self.paragraph)

                phrase = concept.get_summary_phrasing(document=self.document)
                span = Pq('<span></span>')
                span.append(phrase.text + ' ')
                # span.css('background-color', ChannelToken.generate_color())
                self.paragraph.append(span)
                self.con_count += 1
            self._render(concept, con_pre_par, processed_concepts)
Example #34
 def save_cache(self, content: pq):
     with open(self.get_cache_filename(), 'w+', encoding='utf-8') as f:
         print(content.html(), file=f)
Example #35
def _wrap(elem):
    """ Wrap an element with a span element """
    span = PyQuery('<span></span>')
    span.html(elem)
    return span
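
A usage sketch, assuming the element is passed as a markup string (PyQuery's .html() setter accepts strings):

span = _wrap('<b>hello</b>')
print(span)  # -> <span><b>hello</b></span>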
Example #36
        })) for i, data in enumerate(db.data)
    ]
    state['all'] = len(db.data)
    # threadLoadCont(data, i)
    mReq = threadpool.makeRequests(threadLoadCont, argList)
    [pool.putRequest(req) for req in mReq]
    pool.wait()

    # load the main page


loadPage = LoadPage(hostURL, 'gbk')

# anchor tags
aList = PQ(loadPage.data)('dl.chapterlist a')

# extract every anchor tag and store it in the database
for i in aList:
    aElem = PQ(i)
    title = aElem.html()
    url = hostURL + aElem.attr('href')
    if not (url in db.data):
        db.setData(url, {'title': title, 'url': url, 'isLoad': False})
# save
db.save()
loadCont()
print(
    '\n\nScraped: [%s]\nTotal chapters: %s\nSucceeded: %s\nFailed: %s\nRead from cache: %s\nFetched from network: %s\nFailed URLs:' %
    (hostURL, state['all'], state['success'], state['error'],
     state['forCache'], state['forNet']), state['errList'])
Example #37
    def _generate_translation(self):
        """ Generate child description. """
        desc = PyQuery(HTML_TEMPLATE)

        # 1. Program type only if Home Based + Birthday estimate
        ########################################################
        child = self.child_id
        if child.cdsp_type == "Home Based":
            desc(".program_type").html(
                self.home_based_lang[self.env.lang][child.gender].format(
                    preferred_name=child.preferred_name))
        else:
            desc("#program_type").remove()
        if child.estimated_birthdate:
            desc(".birthday_estimate").html(
                _("* The birthday is an estimation."))
        else:
            desc("#birthday_estimate").remove()

        # 2. Household
        ##############
        household = child.household_id.with_context(active_gender=child.gender)
        live_with = self._live_with()
        desc("#live_with").html(live_with)

        if not household.father_living_with_child:
            f_alive = desc(".father").children(".is_alive")
            f_alive[0].text = _("Father alive")
            f_alive[1].text = household.translate("father_alive")
        else:
            desc(".father").remove()
        self._job(desc(".father_job"), "father")

        if not household.mother_living_with_child:
            m_alive = desc(".mother").children(".is_alive")
            m_alive[0].text = _("Mother alive")
            m_alive[1].text = household.translate("mother_alive")
        else:
            desc(".mother").remove()
        self._job(desc(".mother_job"), "mother")

        if household.nb_brothers:
            desc(".brothers")[0].text = _("Number of brothers")
            desc(".brothers")[1].text = str(household.nb_brothers)
        else:
            desc(".brothers").remove()
        if household.nb_sisters:
            desc(".sisters")[0].text = _("Number of sisters")
            desc(".sisters")[1].text = str(household.nb_sisters)
        else:
            desc(".sisters").remove()

        # 3. Schooling
        ##############
        if child.us_grade_level and child.us_grade_level != "Not Enrolled":
            # Make sure the education level is set
            child.convert_us_grade_to_education_level()
            desc("#school_attending").remove()
            desc(".school_level")[0].text = _("School level")
            desc(".school_level")[1].text = child.translate("education_level")
            if child.major_course_study:
                desc(".school_subject")[0].text = _("Best school subject")
                desc(".school_subject")[1].text = child.translate(
                    "major_course_study")
            else:
                desc("#school_subject").remove()
            if child.vocational_training_type and \
                    child.vocational_training_type.lower() not in (
                    "not enrolled",
                    "other"):
                desc(".vocational_training")[0].text = _("Vocational training")
                desc(".vocational_training")[1].text = child.translate(
                    "vocational_training_type")
            else:
                desc("#vocational_training").remove()
        else:
            desc(".school_attending_title").html(
                self.school_no_lang[self.env.lang][child.gender].format(
                    preferred_name=child.preferred_name))
            desc(".school").remove()

        # 4. House duties
        #################
        if child.duty_ids:
            desc("#house_duties_intro").html(
                self.duties_intro_lang[self.env.lang][child.gender])
            desc("#house_duties_list").html("".join([
                "<li>" + duty.value + "</li>" for duty in child.duty_ids[:3]
            ]))
        else:
            desc(".house_duties").remove()

        # 5. Church activities
        ######################
        if child.christian_activity_ids:
            desc("#church_activities_intro").html(
                self.church_intro_lang[self.env.lang][child.gender])
            desc("#church_activities_list").html("".join([
                "<li>" + activity.value + "</li>"
                for activity in child.christian_activity_ids[:3]
            ]))
        else:
            desc(".church_activities").remove()

        # 6. Hobbies
        ############
        if child.hobby_ids:
            desc("#hobbies_intro").html(
                self.hobbies_intro_lang[self.env.lang][child.gender].format(
                    preferred_name=child.preferred_name))
            desc("#hobbies_list").html("".join([
                "<li>" + hobby.value + "</li>" for hobby in child.hobby_ids[:3]
            ]))
        else:
            desc(".hobbies").remove()

        # 7. Health
        ###########
        if child.physical_disability_ids or child.chronic_illness_ids:
            desc("#handicap_intro").html(
                self.handicap_intro_lang[self.env.lang][child.gender].format(
                    preferred_name=child.preferred_name))
            handicap_list = []
            if child.physical_disability_ids:
                handicap_list.extend([
                    "<li>" + handicap.value + "</li>"
                    for handicap in child.physical_disability_ids
                ])
            if child.chronic_illness_ids:
                handicap_list.extend([
                    "<li>" + illness.value + "</li>"
                    for illness in child.chronic_illness_ids
                ])
            desc("#handicap_list").html("".join(handicap_list))
        else:
            desc(".handicap").remove()

        return desc.html()
Example #38
def main():
    arguments = docopt(__doc__, version='0.1.3')
    if arguments['--dir'] is not None:
        static_path = arguments['--dir']
    else:
        static_path = os.path.join(os.getcwd(), 'static')

    if arguments['--web-url'] is not None:
        web_url = "{}".format(arguments['--web-url'])
    else:
        web_url = None

    domain = arguments['--domain']
    if arguments['generate']:
        command = (
            "wget "
            "--level=0 "  # set level to infinitive
            "--recursive "  # follow links to download entire site
            "--convert-links "  # make links relative
            "--page-requisites "  # grab everything: css/in-lined images
            "--no-parent "  # don't go to parent level
            "--directory-prefix {1} "  # download content to static/folder
            "--no-host-directories "  # don't create domain named folder
            "--restrict-file-name=unix "  # don't escape query string
            "{0}").format(domain, static_path)
        os.system(command)

        command = (
            "wget "
            "--level=0 "  # set level to infinitive
            "--recursive "  # follow links to download entire site
            "--convert-links "  # make links relative
            "--page-requisites "  # grab everything: css/in-lined images
            "--no-parent "  # don't go to parent level
            "--directory-prefix {1} "  # download content to static/folder
            "--no-host-directories "  # don't create domain named folder
            "--restrict-file-name=unix "  # don't escape query string
            "{0}/about/").format(domain, static_path)
        os.system(command)

        # rather do this with sitemap-generator
        """
        # copy sitemap files since Ghost 0.5.7
        base_command = "wget --convert-links --page-requisites --no-parent " \
                       "--directory-prefix {1} --no-host-directories " \
                       "--restrict-file-name=unix {0}/{2}"
        command = base_command.format(domain, static_path, "sitemap.xsl")
        os.system(command)
        command = base_command.format(domain, static_path, "sitemap.xml")
        os.system(command)
        command = base_command.format(domain, static_path, "sitemap-pages.xml")
        os.system(command)
        command = base_command.format(domain, static_path, "sitemap-posts.xml")
        os.system(command)
        command = base_command.format(domain, static_path,
                                      "sitemap-authors.xml")
        os.system(command)
        command = base_command.format(domain, static_path, "sitemap-tags.xml")
        os.system(command)
		"""
        def pullRss(path):
            if path is None:
                baserssdir = os.path.join(static_path, "rss")
                mkdir_p(baserssdir)
                wget_command = ("wget --output-document=" + baserssdir +
                                "/feed.rss {0}/rss/").format(domain)
                os.system(wget_command)
            else:
                for feed in os.listdir(os.path.join(static_path, path)):
                    rsspath = os.path.join(path, feed, "rss")
                    rssdir = os.path.join(static_path, 'rss', rsspath)
                    mkdir_p(rssdir)
                    wget_command = ("wget --output-document=" + rssdir +
                                    "/index.html {0}/" +
                                    rsspath).format(domain)
                    os.system(wget_command)

        #pullRss("tag")
        #pullRss("author")

        # create 404.html file
        path_404 = os.path.join(static_path, "404.html")
        shutil.copyfile(os.path.join(static_path, "index.html"), path_404)

        with open(path_404) as f:
            file_text = f.read()

            d = PyQuery(bytes(bytearray(file_text, encoding='utf-8')),
                        parser='html')

            e = d('main')
            e.replaceWith(
                """<main id="content"> <h2>404: Page not found</h2></main>""")
            text = d.html(method='html')
            text = text.replace('assets/styles/crisp.css',
                                'https://rdrn.me/assets/styles/crisp.css')

            new_text = "<!DOCTYPE html>\n<html>" + text + "</html>"

        with open(path_404, 'w') as f:
            try:
                f.write(new_text)
            except UnicodeEncodeError:
                f.write(new_text.encode('utf-8'))

        # remove query string since Ghost 0.4
        file_regex = re.compile(r'.*?(\?.*)')
        bad_file_regex = re.compile(r'.+\.[0-9]{1,2}$')
        static_page_regex = re.compile(r"^([\w-]+)$")

        for root, dirs, filenames in os.walk(static_path):
            for filename in filenames:
                if file_regex.match(filename):
                    newname = re.sub(r'\?.*', '', filename)
                    print("Rename", filename, "=>", newname)
                    os.rename(os.path.join(root, filename),
                              os.path.join(root, newname))
                if bad_file_regex.match(filename):
                    os.remove(os.path.join(root, filename))

                # if we're inside static_path or static_path/tag, rename
                # extension-less files to filename.html
                if (root == static_path
                    or root == os.path.join(static_path, 'tag'))\
                        and static_page_regex.match(filename)\
                        and filename != 'CNAME' and filename != 'LICENSE':
                    newname = filename + ".html"
                    newpath = os.path.join(root, newname)
                    try:
                        os.remove(newpath)
                    except OSError:
                        pass
                    shutil.move(os.path.join(root, filename), newpath)

        # remove superfluous "index.html" from relative hyperlinks found in text
        abs_url_regex = re.compile(r'^(?:[a-z]+:)?//', flags=re.IGNORECASE)
        bad_url_regex = bad_file_regex

        def fixLinks(text, parser):
            if text == '':
                return ''
            try:
                d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                            parser=parser)
            except UnicodeDecodeError:
                d = PyQuery(bytes(bytearray(text)), parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue
                if (not abs_url_regex.search(href)) or ('/rss/' in href):
                    new_href = re.sub(r"index\.html", r"", href)
                    new_href = re.sub(r"^([\w-]+)$", r"\1.html", new_href)
                    if href != new_href:
                        e.attr('href', new_href)
                        print("\t", href, "=>", new_href)

                if (not abs_url_regex.search(href)) or ('/rss/' in href):
                    new_href = re.sub(r"/([\w-]+)$", r"/\1.html", href)
                    new_href = re.sub(r"^([\w-]+)$", r"\1.html", new_href)
                    if href != new_href:
                        e.attr('href', new_href)
                        print("\t", href, "=>", new_href)

                href = e.attr('href')
                if bad_url_regex.search(href):
                    new_href = re.sub(r'(.+)\.[0-9]{1,2}$', r'\1', href)
                    e.attr('href', new_href)
                    print("\t FIX! ", href, "=>", new_href)
            return "<!DOCTYPE html>\n<html>" + d.html(
                method='html') + "</html>"

        # fix links in all html files
        for root, dirs, filenames in os.walk(static_path):
            for filename in fnmatch.filter(filenames, "*.html"):
                filepath = os.path.join(root, filename)
                parser = 'html'
                if root.endswith("/rss"):  # rename rss index.html to index.rss
                    parser = 'xml'
                    newfilepath = os.path.join(
                        root,
                        os.path.splitext(filename)[0] + ".rss")
                    os.rename(filepath, newfilepath)
                    filepath = newfilepath
                with open(filepath) as f:
                    filetext = f.read()
                print("fixing links in ", filepath)
                newtext = filetext
                if parser == 'html':
                    newtext = fixLinks(filetext, parser)
                with open(filepath, 'w') as f:
                    try:
                        f.write(newtext)
                    except UnicodeEncodeError:
                        f.write(newtext.encode('utf-8'))

        def trans_local_domain(text):
            modified_text = text.replace('http://localhost:2368', web_url)
            modified_text = modified_text.replace('http://', 'https://')
            modified_text = modified_text.replace('https://rdrn.me/', '/')
            modified_text = re.sub(r'(rss\/)[a-z]+(.html)', r'\1index.rss',
                                   modified_text)

            return modified_text

        def remove_v_tag_in_css_and_html(text):
            modified_text = re.sub(r"%3Fv=[\d|\w]+\.css", "", text)
            modified_text = re.sub(r".js%3Fv=[\d|\w]+", ".js", modified_text)
            modified_text = re.sub(r".woff%3[\d|\w]+", ".woff", modified_text)
            modified_text = re.sub(r".ttf%3[\d|\w]+", ".ttf", modified_text)

            modified_text = re.sub(r"css\.html", "css", modified_text)
            modified_text = re.sub(r"png\.html", "png", modified_text)
            modified_text = re.sub(r"jpg\.html", "jpg", modified_text)

            return modified_text

        for root, dirs, filenames in os.walk(static_path):
            for filename in filenames:
                if filename.endswith(
                    ('.html', '.css', '.xsl', '.rss')):  # removed xml
                    filepath = os.path.join(root, filename)
                    with open(filepath) as f:
                        filetext = f.read()
                    print("fixing local domain in ", filepath)
                    newtext = trans_local_domain(filetext)
                    newtext = remove_v_tag_in_css_and_html(newtext)
                    with open(filepath, 'w') as f:
                        f.write(newtext)

    elif arguments['preview']:
        os.chdir(static_path)

        Handler = http.server.SimpleHTTPRequestHandler
        httpd = socketserver.TCPServer(("", 9001), Handler)

        print("Serving at port 9001")
        # gracefully handle interrupt here
        httpd.serve_forever()

    elif arguments['setup']:
        if arguments['--gh-repo']:
            repo_url = arguments['--gh-repo']
        else:
            repo_url = input("Enter the Github repository URL:\n").strip()

        # Create a fresh new static files directory
        if os.path.isdir(static_path):
            confirm = input(
                "This will destroy everything inside static"
                " Are you sure you want to continue? (y/N)").strip()
            if confirm != 'y' and confirm != 'Y':
                sys.exit(0)
            shutil.rmtree(static_path)

        # User/Organization page -> master branch
        # Project page -> gh-pages branch
        branch = 'gh-pages'
        regex = re.compile(r".*[\w-]+\.github\.(?:io|com).*")
        if regex.match(repo_url):
            branch = 'master'

        # Prepare git repository
        repo = Repo.init(static_path)
        git = repo.git

        if branch == 'gh-pages':
            git.checkout(b='gh-pages')
        repo.create_remote('origin', repo_url)

        # Add README
        file_path = os.path.join(static_path, 'README.md')
        with open(file_path, 'w') as f:
            f.write(
                '# Blog\nPowered by [Ghost](http://ghost.org)'
                ' and [Buster](https://github.com/manthansharma/buster/).\n')

        print("All set! You can generate and deploy now.")

    elif arguments['deploy']:
        repo = Repo(static_path)
        repo.git.add('.')

        current_time = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        repo.index.commit('Blog update at {}'.format(current_time))

        origin = repo.remotes.origin
        repo.git.execute(
            ['git', 'push', '-u', origin.name, repo.active_branch.name])
        print("Good job! Deployed to Github Pages.")

    elif arguments['add-domain']:
        repo = Repo(static_path)
        custom_domain = arguments['<domain-name>']

        file_path = os.path.join(static_path, 'CNAME')
        with open(file_path, 'w') as f:
            f.write(custom_domain + '\n')

        print("Added CNAME file to repo. Use `deploy` to deploy")

    else:
        print(__doc__)
Example #39
 def __get_data(self):
     resp = self.session.get(reportURL)
     doc = PyQuery(resp.text)
     html = doc.html()
     tiwen = 36.5 + random.uniform(0, 0.3)
     tiwen = round(tiwen, 1)
     zxMatch = re.findall(r'f8_state={.*?"SelectedValue":"(.+?)"', html)[0]
     gnMatch = re.findall(r'f14_state={.*?"SelectedValue":"(.+?)"', html)[0]
     shengMatch = re.findall(r'f16_state={.+?"SelectedValueArray":\["(.+?)"]', html)[0]
     shiMatch = re.findall(r'f17_state={.*?"F_Items":(.+?),"SelectedValueArray":\["(.+?)"]', html)[0]
     xianMatch = re.findall(r'f18_state={.*?"F_Items":(.+?),"SelectedValueArray":\["(.+?)"]', html)[0]
     # print(shiMatch)
     xxMatch = re.findall(r'f20_state={.*?"Text":"(.+?)"', html)[0]
      F_State = template % (
          self.date, zxMatch, gnMatch, shengMatch, shiMatch[0], shiMatch[1],
          xianMatch[0], xianMatch[1], xxMatch, "否")
     return {
         'F_State': base64.b64encode(F_State.encode()),
         '__VIEWSTATE': doc.find('#__VIEWSTATE').attr('value'),
         '__EVENTTARGET': 'p1$ctl00$btnSubmit',
         '__EVENTARGUMENT': '',
         '__VIEWSTATEGENERATOR': doc.find('#__VIEWSTATEGENERATOR').attr('value'),
         'p1$ChengNuo': 'p1_ChengNuo',
         'p1$BaoSRQ': self.date,
         'p1$DangQSTZK': '良好',
         'p1$TiWen': str(tiwen),
         'F_TARGET': 'p1_ctl00_btnSubmit',
         'p1_Collapsed': 'false',
         'p1$CengFWH_RiQi': '',
         'p1$CengFWH_BeiZhu': '',
         'p1$JieChu_RiQi': '',
         'p1$JieChu_BeiZhu': '',
         'p1$TuJWH_RiQi': '',
         'p1$TuJWH_BeiZhu': '',
         'p1$JiaRen_BeiZhu': '',
         'p1$ZaiXiao': zxMatch,
         "p1$MingTDX": "不到校",
         "p1$MingTJC": "否",
         "p1$BanChe_1$Value": '0',
         "p1$BanChe_1": '不需要乘班车',
         "p1$BanChe_2$Value": '0',
         "p1$BanChe_2": '不需要乘班车',
         'p1$GuoNei': '国内',
         "p1$ddlGuoJia$Value": "-1",
         "p1$ddlGuoJia": "选择国家",
         'p1$ddlSheng$Value': shengMatch,
         'p1$ddlSheng': shengMatch,
         'p1$ddlShi$Value': shiMatch[1],
         'p1$ddlShi': shiMatch[1],
         'p1$ddlXian$Value': xianMatch[1],
         'p1$ddlXian': xianMatch[1],
         'p1$XiangXDZ': xxMatch,
         "p1$FanXRQ": "",
         "p1$WeiFHYY": "",
         "p1$ShangHJZD": "",
         'p1$QueZHZJC$Value': '否',
         'p1$QueZHZJC': '否',
         'p1$DangRGL': '否',  # currently quarantined?
         'p1$DaoXQLYGJ': '',  # countries travelled to
         'p1$DaoXQLYCS': '',  # cities travelled to
         'p1$Address2': '中国',
         'p1$SuiSM': '绿色',  # Suishen health-code color
         'p1$LvMa14Days': '是',  # health code green for 14 consecutive days?
         'p1$GeLDZ': '',
         "p1_SuiSMSM_Collapsed": "false",
         "p1_GeLSM_Collapsed": 'false',
         "p1_SuiSMSM_Collapsed": 'false'
     }
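
A minimal sketch of how this payload would presumably be submitted; it is not part of the original snippet. It assumes self.session is a requests.Session and that the form POSTs back to the same reportURL the snippet GETs:

 def report(self):
     # Hypothetical submit step (assumption: the endpoint accepts the
     # payload assembled by __get_data as a regular form POST).
     data = self.__get_data()
     resp = self.session.post(reportURL, data=data)
     return resp.status_code == 200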
Пример #40
0
    def _generate_translation(self):
        """ Generate project description. """
        desc = PyQuery(HTML_TEMPLATE)

        # 1. Basic Information
        ######################
        project = self.project_id
        desc('.project_name')[0].text = _("Project name")
        desc('.project_name')[1].text = project.name
        desc('.project_closest_city')[0].text = _("Closest city")
        self._show_field(
            desc('.project_closest_city')[1], desc('#project_closest_city'),
            project.closest_city)
        desc('.project_cdsp_number')[0].text = _("Number of children")
        self._show_field(
            desc('.project_cdsp_number')[1], desc('#project_cdsp_number'),
            project.nb_cdsp_kids)
        if project.electrical_power == 'Not Available':
            desc('.project_electricity').html(
                _("The project has no electricity."))
        else:
            desc('#project_electricity').remove()

        # 2. Community
        ##############
        desc('#community_label').html(_("Local community"))
        desc('.community_population')[0].text = _("Population")
        self._show_field(
            desc('.community_population')[1], desc('#community_population'),
            '{:,}'.format(project.community_population).replace(',', "'"))
        desc('.community_language')[0].text = _("Language")
        self._show_field(
            desc('.community_language')[1], desc('#community_language'),
            project.primary_language_id.name)
        if project.primary_adults_occupation_ids:
            desc('.community_job')[0].text = _("Typical job")
            self._show_field(
                desc('.community_job')[1], desc('#community_job'),
                project.primary_adults_occupation_ids[0].value)
        else:
            desc('#community_job').remove()
        if project.chf_income and 10 < project.chf_income < 500:
            desc('.community_income')[0].text = _("Family monthly income")
            desc('.community_income')[1].text = 'CHF {:10.0f}.-'.format(
                project.chf_income)
        else:
            desc('#community_income').remove()
        desc('.community_food')[0].text = _("Typical food")
        if project.primary_diet_ids:
            desc('.community_food')[1].text = project.primary_diet_ids[0].value
        else:
            desc('#community_food').remove()
        desc('.community_school_begins')[0].text = _("School begins in")
        self._show_field(
            desc('.community_school_begins')[1],
            desc('#community_school_begins'),
            project.translate('school_year_begins'))

        # 3. Activities
        ###############
        spiritual = project.get_activities('spiritual_activity', 3)
        physical = project.get_activities('physical_activity', 3)
        cognitive = project.get_activities('cognitive_activity', 3)
        socio = project.get_activities('socio_activity', 3)
        if spiritual or physical or cognitive or socio:
            desc('#activities_label').html(
                _("Project activities for children"))
        else:
            desc('#activities').remove()

        if spiritual:
            desc('.spiritual_activities').html(_("Spiritual activities"))
            desc('#spiritual_activities_list').html(''.join(
                ['<li>' + activity + '</li>' for activity in spiritual]))
        else:
            desc('#spiritual_activities').remove()
        if physical:
            desc('.physical_activities').html(_("Physical activities"))
            desc('#physical_activities_list').html(''.join(
                ['<li>' + activity + '</li>' for activity in physical]))
        else:
            desc('#physical_activities').remove()
        if cognitive:
            desc('.cognitive_activities').html(_("Cognitive activities"))
            desc('#cognitive_activities_list').html(''.join(
                ['<li>' + activity + '</li>' for activity in cognitive]))
        else:
            desc('#cognitive_activities').remove()
        if socio:
            desc('.socio_activities').html(_("Socio-emotional activities"))
            desc('#socio_activities_list').html(''.join(
                ['<li>' + activity + '</li>' for activity in socio]))
        else:
            desc('#socio_activities').remove()
        if project.activities_for_parents:
            desc('.parent_activities').html(
                _("In addition, the project offers special activities for the "
                  "parents such as education courses."))
        else:
            desc('#parent_activities').remove()

        return desc.html()
Пример #41
0
def parse_detail(url, filename):
    if 'genshuixue' not in url:
        #url = 'http://jingyan.baidu.com' + url
        pass
    #html = urllib2.urlopen(url).read()
    content = get_content(url, filename)
    if not content:
        return
    jq = PyQuery(content)
    res_json = {
        'bread': jq('.bread-wrap').text().replace('>','').split()[-2:],
        'title': jq('h1').text().replace('听语音',''),
        'date':  jq('time').text()[:10],
        'source': 'baidu',
        'url': url.replace('\n',''),
        'class': 36,
        'subject': u'经验',
        'data_weight': 0,
    }
    methods = []
    content = [each for each in jq('.exp-content-block')]
    print len(content)
    if not content:
        return None
    elif len(content) == 1:
        _list = []
        for steps in PyQuery(content[0])('ol li'):
            step = PyQuery(steps)
            step_title = step.text()
            image = step.html()
            img = image.split('data-src="')[-1].split('"')[0] if image and '<img' in image else ''
            #print img
            _list.append({
                'img': img,
                'title': step_title,
                'substeps': [],
            })
        methods.append(_list)
        abstract = {}
    else:
        try:
            question_desc_img = PyQuery(content[0])('.content-listblock-image').html().split('data-src="')[-1].split('"')[0]
        except:
            question_desc_img = ''
        abstract = {
            'title': '',
            'steps': [PyQuery(content[0])('p').text(),],
            'img': question_desc_img
        }
        for each in content[1:]:
            method = PyQuery(each)
            title = method('h2').text()
            #print title
            _list = []
            steps_list = [step for step in method('ol li')]
            if not steps_list:
                steps_list = [step for step in method('ul li')]
            for steps in steps_list:
                step = PyQuery(steps)
                step_title = step.text()
                image = step('.content-list-image a').html()
                img = image.split('data-src="')[-1].split('"')[0] if image else ''
                _list.append({
                    'img': img,
                    'title': step_title,
                    'substeps': [],
                })
            if not _list:
                _list.append((method('.content-listblock-text').text()))
            methods.append({'title': title, 'steps': _list})
        # "工具/原料" (tools/materials) section
        if methods[0]['title'] == u'工具/原料':
            prepare = {'title': methods[0]['title'], 'steps': [v['title'] for v in methods[0]['steps']]}
            methods = methods[1:]
            res_json['prepare'] = prepare
        # "注意事项" (precautions) section
        if len(methods) == 0:
            return None

        if methods[-1]['title'] == u'注意事项':
            summary = {'title': methods[-1]['title'], 'steps': [v['title'] for v in methods[-1]['steps']]}
            methods = methods[:-1]
            res_json['summary'] = summary
    res_json['methods'] = methods
    res_json['abstract'] = abstract
    #print json.dumps(res_json)
    return json.dumps(res_json)
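
For reference, the JSON returned by parse_detail has roughly this shape (the values below are illustrative, not real output; prepare and summary only appear when the page has 工具/原料 and 注意事项 sections):

# {
#     "bread": ["...", "..."], "title": "...", "date": "2016-01-01",
#     "source": "baidu", "url": "...", "class": 36, "subject": "经验",
#     "data_weight": 0,
#     "abstract": {"title": "", "steps": ["..."], "img": "..."},
#     "methods": [{"title": "...", "steps": [{"img": "", "title": "...", "substeps": []}]}],
#     "prepare": {...},  # optional
#     "summary": {...}   # optional
# }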
Пример #42
0
def get_users():
    global month
    logging.info('Récupération des membres')
    if config.debug:
        progress = progressbar.NoProgressBar()
    else:
        progress = progressbar.ProgressBar(widgets=[progressbar.SimpleProgress('/'), ' ', progressbar.Bar("#","[","]"), progressbar.Percentage()], maxval=save.nbusers)
    progress.start()
    
    n = len(save.users)
    progress.update(n)

    ids = [i["id"] for i in save.users]

    d = PyQuery(url=config.rooturl+'/admin/index.forum?part=users_groups&sub=users&extended_admin=1&' + tid, opener=fa_opener)

    if "notgetmember_pic.forum?u=" in d.html():
        raise RuntimeError('Forum user page in "import protected" mode - cannot process users...')

    result = re.search('function do_pagination_start\(\)[^\}]*start = \(start > \d+\) \? (\d+) : start;[^\}]*start = \(start - 1\) \* (\d+);[^\}]*\}', d.text())

    try:
        pages = int(result.group(1))
        usersperpages = int(result.group(2))
    except:
        pages = 1
        usersperpages = 0

    currentpage = int(n/usersperpages) if usersperpages else 0

    memberslastpage = save.nbusers % usersperpages if usersperpages else 0
    logging.debug('Utilisateurs : %d pages de %d membres - starting from page %d', pages, usersperpages, currentpage)

    for page in range(currentpage, pages):
        pageNumber = page*usersperpages
        if page == pages-1:
            usersperpages = memberslastpage  # number of members on the last page

        if page >= 1:
            time.sleep(61)
            d = PyQuery(url=config.rooturl + '/admin/index.forum?part=users_groups&sub=users&extended_admin=1&start=' + str(pageNumber) + '&' + tid, opener=fa_opener)
            logging.debug('Récupération membre via url: %s', config.rooturl + '/admin/index.forum?part=users_groups&sub=users&extended_admin=1&start=' + str(pageNumber) + '&' + tid)

        if ("notgetmember_pic.forum?u=" in d.html() or "Liste des Utilisateurs" not in d.text()) :
            raise RuntimeError('Forum user page in "import proteced" mode - cannot process users...')

        alluserinthepage = 0
        for i in d('tbody tr'):
            if alluserinthepage == usersperpages:
                break
            e = PyQuery(i)
            addr = e("td a").eq(0).attr("href")
            if addr != "None":
                alluserinthepage += 1
                id = int(re.search("&u=(\d+)&", e("td a").eq(0).attr("href")).group(1))
                logging.debug('Récupération : membre %d', id)

                date = e("td").eq(3).text().split(" ")
                date = time.mktime(time.struct_time((int(date[2]),month[date[1]],int(date[0]),0,0,0,0,0,0)))

                lastvisit = e("td").eq(4).text()

                if lastvisit != "":
                    lastvisit = lastvisit.split(" ")
                    lastvisit = time.mktime(time.struct_time((int(lastvisit[2]),month[lastvisit[1]],int(lastvisit[0]),0,0,0,0,0,0)))
                else:
                    lastvisit = 0

                if id not in ids:
                    name = e("td a").eq(0).text()
                    save.users.append({'id': id, 'newid': n, 'name': e("td a").eq(0).text(), 'mail': e("td a").eq(1).text(), 'posts': int(e("td").eq(2).text()), 'date': int(date), 'lastvisit': int(lastvisit)})
                    n += 1
                    progress.update(n)
                else:
                    logging.warning('L\'utilisateur %d a déjà été récupéré.', id)

    progress.end()
Пример #43
0
 def content(self):
     d = Pq(self.dom('.article-content').html())
     d('.main-tg-area').remove()
     d('.articleRecommend').remove()
     return self.clearInput(d.html())
Пример #44
0
import json
import os
import sys
from subprocess import call

from jinja2 import Template  # assumed: render(**blocks) matches jinja2's API
from pyquery import PyQuery

assert len(sys.argv) == 2, "Expected one argument: the notebook name!"
NOTEBOOK = sys.argv[1]

parts = NOTEBOOK.split('.')
parts[-1] = "html"
HTML_FILE = ".".join(parts)

# Gather the information from the first cell.
with open(NOTEBOOK) as f:
    res = json.load(f)
blocks = json.loads("".join(res['cells'][0]['source']))

# Convert the notebook. 
call(['ipython', 'nbconvert', NOTEBOOK, '--to', 'html', '--template', 'basic'])

# Remove input cells.
with open(HTML_FILE) as f:
    doc = PyQuery(f.read(), parser='html')
    doc.remove('.input')
    blocks['body'] = doc.html()

# Insert into simple template. 
BASE_DIR = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(BASE_DIR, 'my_template.html')) as f:
    tmpl = f.read()
template = Template(tmpl)

with open(HTML_FILE, 'w') as f:
    f.write(template.render(**blocks))
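
The script takes the notebook path as its only command-line argument (the file name nb2html.py below is an assumption, not from the source):

# Hypothetical invocation: converts analysis.ipynb to analysis.html via
# `ipython nbconvert`, strips the input cells, and re-renders the body
# through my_template.html.
#
#   python nb2html.py analysis.ipynb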
Пример #45
0
    def _generate_translation(self):
        """ Generate project description. """
        desc = PyQuery(HTML_TEMPLATE)

        # 1. Basic Information
        ######################
        project = self.project_id

        # Put country if not the same as Field Office
        if (project.country_id
                and project.country_id != project.field_office_id.country_id):
            desc(".project_country")[0].text = _(
                "The project is located in %s, close to the border."
            ) % project.country_id.name
        else:
            desc("#project_country").remove()

        desc(".project_name")[0].text = _("Project name")
        desc(".project_name")[1].text = project.name
        desc(".project_closest_city")[0].text = _("Closest city")
        self._show_field(
            desc(".project_closest_city")[1],
            desc("#project_closest_city"),
            project.closest_city,
        )
        desc(".project_cdsp_number")[0].text = _("Number of children")
        self._show_field(
            desc(".project_cdsp_number")[1],
            desc("#project_cdsp_number"),
            project.nb_cdsp_kids,
        )
        if project.electrical_power == "Not Available":
            desc(".project_electricity").html(
                _("The project has no electricity."))
        else:
            desc("#project_electricity").remove()

        # 2. Community
        ##############
        desc("#community_label").html(_("Local community"))
        desc(".community_population")[0].text = _("Population")
        self._show_field(
            desc(".community_population")[1],
            desc("#community_population"),
            "{:,}".format(project.community_population).replace(",", "'"),
        )
        desc(".community_language")[0].text = _("Language")
        self._show_field(
            desc(".community_language")[1],
            desc("#community_language"),
            project.primary_language_id.name,
        )
        if project.primary_adults_occupation_ids:
            desc(".community_job")[0].text = _("Typical job")
            self._show_field(
                desc(".community_job")[1],
                desc("#community_job"),
                project.primary_adults_occupation_ids[0].value,
            )
        else:
            desc("#community_job").remove()
        desc(".community_food")[0].text = _("Typical food")
        if project.primary_diet_ids:
            desc(".community_food")[1].text = project.primary_diet_ids[0].value
        else:
            desc("#community_food").remove()
        desc(".community_school_begins")[0].text = _("School begins in")
        self._show_field(
            desc(".community_school_begins")[1],
            desc("#community_school_begins"),
            project.translate("school_year_begins"),
        )

        # 3. Activities
        ###############
        spiritual = project.get_activities("spiritual_activity", 3)
        physical = project.get_activities("physical_activity", 3)
        cognitive = project.get_activities("cognitive_activity", 3)
        socio = project.get_activities("socio_activity", 3)
        if spiritual or physical or cognitive or socio:
            desc("#activities_label").html(
                _("Project activities for children"))
        else:
            desc("#activities").remove()

        if spiritual:
            desc(".spiritual_activities").html(_("Spiritual activities"))
            desc("#spiritual_activities_list").html("".join(
                ["<li>" + activity + "</li>" for activity in spiritual]))
        else:
            desc("#spiritual_activities").remove()
        if physical:
            desc(".physical_activities").html(_("Physical activities"))
            desc("#physical_activities_list").html("".join(
                ["<li>" + activity + "</li>" for activity in physical]))
        else:
            desc("#physical_activities").remove()
        if cognitive:
            desc(".cognitive_activities").html(_("Cognitive activities"))
            desc("#cognitive_activities_list").html("".join(
                ["<li>" + activity + "</li>" for activity in cognitive]))
        else:
            desc("#cognitive_activities").remove()
        if socio:
            desc(".socio_activities").html(_("Socio-emotional activities"))
            desc("#socio_activities_list").html("".join(
                ["<li>" + activity + "</li>" for activity in socio]))
        else:
            desc("#socio_activities").remove()
        if project.activities_for_parents:
            desc(".parent_activities").html(
                _("In addition, the project offers special activities for the "
                  "parents such as education courses."))
        else:
            desc("#parent_activities").remove()

        return desc.html()
Пример #46
0
def scrape_product(url, category_slug):
    f = urlopen(url)
    doc = html5lib.parse(
        f, treebuilder='lxml', namespaceHTMLElements=False
    )  # encoding='utf-8' didn't work here, but the three lines above did
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])
    #content = jQuery ('td#content table').eq(0)
    content = jQuery('td#content')
    content('form').remove()

    # used to do this, but some models (eg blades) don't have tables:
    #content = jQuery ('td#content table td').eq (0)

    #if content.is_('table'):
    #    content = content ('table td').eq (0)

    # nope, this was too simplistic - let's take apart the tables - see below in final save
    # nope, this doesn't work either. I give up.

    skus = find_sku.findall(url)
    sku = skus[0]
    slug = slugify(sku)

    print sku
    '''
    if sku in ['ESERVE',
     'NAS6X',
     'NAS16X',
     'PREMIUM',
     'TWINSERVE',
     'PREMIUM2',
     'SANDYCORE',
     'i7CORE',
     'i7SHORT',]:
        print 'Skipping..'
        return
    #elif testing and sku != 'NAS12':
    #    print 'Skipping due to testing..'
    #    return
    '''
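    # NB: pyquery injects a jQuery-style `this` (the current element) into
    # filter()/each() callbacks, which is what the lambdas below rely on.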

    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "Per single unit, this configuration's price")).remove()
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "The base price with this configuration is")).remove()
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "All eRacks systems come with a Standard")).remove()
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "The price differences between the default")).remove()
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "Contact eRacks to inquire about leasing")).remove()

    content('form').remove()

    content('#pricetext').remove()
    content('#warrantynote').remove()
    content('#closenote').remove()

    xbig = content('.xbig')
    if xbig:
        xbig('a').remove()
        inner = xbig.html().replace(':', '').strip()
        xbig.replaceWith('<h5 class=xbig>%s</h5>' % inner)
        print 'xbig replaced:', inner

    font = content('font[size=4], font[size=5]')
    if font:
        font('a').remove()
        inner = font.text().replace(':', '').strip()
        font.replaceWith('<h5 class="product">%s</h5>' % inner)
        print 'font replaced:', inner

    if testing:
        print
        print sku, 'content:'
        print content.html()

    links = content('a')
    images = content('img')

    for link in links:
        a = PyQuery(link)
        href = a.attr('href')

        if href:
            if '?' in href:
                href = href.split('?')[
                    0]  # doesn't this get rid of all get parms?
                a.attr('href', href)

            linkskus = find_sku.findall(href)  # Is this what it's looking for?!
        else:
            print "Empty Link:", a.html()
            linkskus = []
            print content.html()

        if linkskus:
            linksku = linkskus[0]
            a.attr('href', '/products/%s/%s/' % (category_slug, linksku))
            print 'New link:', a.attr('href')
        elif href and href.startswith('/Legacy'):
            linksku = slugify(href.split('/')[-1])
            a.attr('href', '/products/%s/%s/' % (category_slug, linksku))
            print 'New link:', a.attr('href')
        elif 'ore photos' in a.text():
            print 'Scraping:', href
            scrape_photos(url, href, slug)
            #print 'Removing link (scraped):', href
            #a.remove()
            print 'Updating "more photos" link:', href
            a.attr('href', '#photos')
            a.attr('onclick', '$("#photos-tab").click();')
        elif href and href.endswith('_photos'):
            print 'Scraping:', href
            scrape_photos(url, href, slug)
            print 'Updating "<prod>_photos" link:', href
            a.attr('href', '#photos')
            a.attr('onclick', '$("#photos-tab").click();')

    for image in images:
        img = PyQuery(image)
        src = img.attr('src')
        newsrc = getimage(src, 'products/' + slug)
        img.attr('src', newsrc)
        print 'image:', newsrc

    if dbteeth:
        #prod, created = Product.objects.get_or_create (sku=sku)  # prods are already in the db, silly!
        prod = Product.objects.get(sku=sku)
        prod.comments = prod.comments + '\n\nScraped from Zope as of ' + str(
            datetime.date.today())
        #prod.description = content.text() + '<br>'.join ([PyQuery(c).html() for c in content ('td')])  # content.html()
        prod.description = content.html()
        # save image(s):
        # prod.image =
        # prod.images.add (name, title, src, etc)
        prod.save()
        print '..saved.'
Пример #47
0
def test_pop_html():
    node = PyQuery('<test><h1></h1></test>')
    assert vdom.pop_html(node) == '<h1/>'
    assert not node.html()
Пример #48
0
	def handle(self, *args, **options):
		xml = ElementTree.parse(open(args[0], 'r'))
		channel = xml.find('channel')
		
		def node_text(node, namespace = None, parent = None):
			if namespace:
				item = (parent or channel).find(ns(namespace, node))
			else:
				item = (parent or channel).find(node)
			
			if not item is None:
				return item.text
			
			return None
		
		def ns(n, o):
			return '{%s}%s' % (XML_NS[n], o)
		
		if channel is None:
			raise CommandError('Cannot find <channel> tag')
		
		title = node_text('title')
		if title:
			print(u'Blog title: %s' % title)
		
		link = node_text('link')
		if link:
			print(u'Blog URL: %s' % link)
		
		description = node_text('description')
		if description:
			print(u'Blog description: %s' % description)
		
		mappings = {
			'users': {},
			'posts': {},
			'categories': {},
			'comments': {}
		}
		
		content_type = ContentType.objects.get_for_model(Post)
		site = Site.objects.get_current()
		postmeta = {}
		
		print
		with transaction.commit_manually():
			try:
				for author in channel.findall(ns('wp', 'wp_author')):
					username = node_text('author_login', 'wp', author)
					email = node_text('author_email', 'wp', author)
					display_name = node_text('author_display_name', 'wp', author)
					user = None
					
					if not username:
						continue
					
					if display_name:
						display_name = '%s (%s)' % (username, display_name)
					else:
						display_name = username
					
					try:
						user = User.objects.get(username__iexact = username)
					except User.DoesNotExist:
						if email:
							try:
								user = User.objects.get(email__iexact = email)
							except:
								pass
					
					if not user:
						new_username = raw_input('Map old user %s to a user in your database: ' % display_name)
						if not new_username:
							continue
						
						while True:
							try:
								user = User.objects.get(username__iexact = new_username)
								break
							except User.DoesNotExist:
								new_username = raw_input('User not found. Please try again, or press Enter to ignore: ')
								if not new_username:
									print 'Ignoring user %s' % username
									break
					
					if user:
						mappings['users'][username] = user
						print 'Mapping user %s to %s' % (
							username, user.get_full_name() or user.username
						)
				
				for item in channel.findall('item'):
					id = node_text('post_id', 'wp', item)
					title = node_text('title', parent = item)
					url = node_text('link', parent = item)
					kind = node_text('post_type', 'wp', item)
					parent = node_text('post_parent', 'wp', item)
					published = node_text('status', 'wp', item) == 'publish'
					author = node_text('creator', 'dc', item)
					date = node_text('post_date_gmt', 'wp', item)
					body = node_text('encoded', 'content', item) or u''
					
					try:
						id = int(id)
					except ValueError:
						continue
					
					if not date:
						continue
					
					try:
						date = datetime.strptime(date,
							'%Y-%m-%d %H:%M:%S'
						).replace(
							tzinfo = get_current_timezone()
						)
					except:
						continue
					
					try:
						parent = int(parent)
					except ValueError:
						continue
					
					if parent:
						continue
					
					if not author:
						continue
					
					if not mappings['users'].has_key(author):
						continue
					
					author = mappings['users'][author]
					if not kind in ('post', 'page'):
						continue
					
					if kind == 'post':
						try:
							post = Post.objects.get(title = title, date = date)
							print 'Updating %s "%s"' % (kind, title)
						except Post.DoesNotExist:
							post = Post(
								title = title,
								slug = title and slugify(title) or None,
								date = date,
								published = published,
								broadcast = True,
								author = author
							)
							
							print 'Creating %s "%s"' % (kind, title)
					else:
						continue
					
					post.body = body
					post.save()
					mappings['posts'][id] = post
					
					for category in item.findall('category'):
						domain = category.get('domain')
						slug = category.get('nicename')
						
						if not category.text:
							continue
						
						if domain == 'category':
							if not mappings['categories'].has_key(slug):
								mappings['categories'][slug], created = Category.objects.get_or_create(
									name = category.text,
									slug = slugify(category.text)
								)
								
								if created:
									print '- Created category "%s"' % category.text
							
							post.categories.add(
								mappings['categories'][slug]
							)
						elif domain == 'post_tag':
							if category.text.startswith('"') and category.text.endswith('"'):
								post.tags.add(category.text[1:-1])
							else:
								post.tags.add(category.text)
					
					for comment in item.findall(ns('wp', 'comment')):
						comment_id = node_text('comment_id', 'wp', comment)
						comment_name = node_text('comment_author', 'wp', comment)
						comment_email = node_text('comment_author_email', 'wp', comment)
						comment_url = node_text('comment_author_url', 'wp', comment)
						comment_date = node_text('comment_date_gmt', 'wp', comment)
						comment_type = node_text('comment_type', 'wp', comment)
						comment_body = node_text('comment_content', 'wp', comment)
						comment_parent = node_text('comment_parent', 'wp', comment)
						comment_approved = node_text('comment_approved', 'wp', comment) == '1'
						
						try:
							comment_id = int(comment_id)
						except ValueError:
							continue
						
						try:
							comment_parent = int(comment_parent)
						except ValueError:
							comment_parent = 0
						
						try:
							comment_date = datetime.strptime(
								comment_date, '%Y-%m-%d %H:%M:%S'
							).replace(
								tzinfo = get_current_timezone()
							)
						except:
							continue
						
						if not comment_name:
							continue
						
						if not comment_type or comment_type == 'comment':
							try:
								comment = post.comments.get(
									name = comment_name,
									sent = comment_date
								)
							except Comment.DoesNotExist:
								comment = Comment(
									name = comment_name,
									website = comment_url,
									email = comment_email or '',
									sent = comment_date,
									approved = comment_approved,
									body = comment_body,
									content_type = content_type,
									object_id = post.pk
								)
								
								print '- Comment by %s' % comment_name
							
							comment.save(notify = False)
							mappings['comments'][comment_id] = comment
					
					postmeta[id] = {}
					for meta in item.findall(ns('wp', 'postmeta')):
						meta_key = node_text('meta_key', 'wp', meta)
						meta_value = node_text('meta_value', 'wp', meta)
						postmeta[id][meta_key] = meta_value
					
					ai = 1
					for subitem in channel.findall('item'):
						subid = node_text('post_id', 'wp', subitem)
						subparent_id = node_text('post_parent', 'wp', subitem)
						subtitle = node_text('title', parent = subitem)
						suburl = node_text('link', parent = subitem)
						subkind = node_text('post_type', 'wp', subitem)
						suburl = node_text('attachment_url', 'wp', subitem)
						
						try:
							subparent_id = int(subparent_id)
						except ValueError:
							continue
						
						if not suburl:
							continue
						
						if subkind != 'attachment' or subparent_id != id:
							continue
						
						s, d, p, a, q, f = urlparse(suburl)
						d, s, filename = p.rpartition('/')
						
						try:
							attachment = post.attachments.get(
								title = subtitle or filename
							)
						except Attachment.DoesNotExist:
							print '- Downloading %s' % filename

							response = requests.get(suburl)
							handle, tmp = mkstemp(
								path.splitext(filename)[-1]
							)

							write(handle, response.content)
							close(handle)
							
							attachment = Attachment(
								title = subtitle or filename,
								file = File(open(tmp, 'r'), name = filename),
								content_type = content_type,
								object_id = post.pk
							)
						
							if '_thumbnail_id' in postmeta[id]:
								if unicode(postmeta[id]['_thumbnail_id']) == unicode(subid):
									attachment.featured = True
						
							attachment.save()
							remove(tmp)
						
						if post.body:
							html = PyQuery('<body>' + post.body + '</body>')
							for a in html(
								'a[href="%(url)s"], [src="%(url)s"]' % {
									'url': suburl
								}
							):
								a = PyQuery(a)
								a.replaceWith('\n\n[attachment %d]\n\n' % ai)
							
							post.body = html.html()
						
						ai += 1
					
					if post.body:
						html = PyQuery('<body>' + post.body + '</body>')
						for a in html('a[href]'):
							href = a.get('href')
							if href.startswith(link):
								href = href.replace(link, 'http://%s' % site.domain)
							
							a = PyQuery(a)
							a.attr('href', href)
						
						for p in html('p'):
							p = PyQuery(p)
							p.replaceWith('\n\n%s\n\n' % p.html())
						
						html('.alignright').addClass('pull-right').removeClass('alignright')
						html('.alignleft').addClass('pull-left').removeClass('alignleft')
						
						post.body = html.html()
						
						while '\n\n\n' in post.body:
							post.body = post.body.replace('\n\n\n', '\n\n')
						
						while '\r\r\r' in post.body:
							post.body = post.body.replace('\r\r\r', '\r\r')
						post.body = post.body.replace('<br />', '  \n')
						post.body = post.body.replace('<br/>', '  \n')
						post.body = post.body.replace('<br>', '  \n')
						
						while post.body.startswith('\n'):
							post.body = post.body[1:]
						
						while post.body.endswith('\n'):
							post.body = post.body[:-1]
						
						while post.body.startswith('\r'):
							post.body = post.body[1:]
						
						while post.body.endswith('\r'):
							post.body = post.body[:-1]
						
						while post.body.startswith('\t'):
							post.body = post.body[1:]
						
						post.body = post.body.strip()
					
					post.save()
				
				transaction.commit()
			except:
				transaction.rollback()
				raise
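
The command is then run against a WordPress WXR export file (the command name wpimport below is an assumption; it depends on the file name under management/commands/):

# Hypothetical invocation:
#   python manage.py wpimport wordpress-export.xml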
Пример #49
0
def get_one_page_audio(account_id, page_count):
    # http://www.ximalaya.com/1014267/index_tracks?page=2
    audit_pagination_url = "http://www.ximalaya.com/%s/index_tracks" % account_id
    query_data = {"page": page_count}
    audit_pagination_response = net.http_request(audit_pagination_url,
                                                 method="GET",
                                                 fields=query_data,
                                                 json_decode=True)
    result = {
        "audio_info_list": [],  # track info parsed from the page
        "is_over": False,  # whether this is the last page
    }
    if audit_pagination_response.status == 404:
        raise crawler.CrawlerException("账号不存在")
    elif audit_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(audit_pagination_response.status))
    if not crawler.check_sub_key(
        ("res", "html"), audit_pagination_response.json_data):
        raise crawler.CrawlerException("返回数据'res'或'html'字段不存在\n%s" %
                                       audit_pagination_response.json_data)
    if audit_pagination_response.json_data["res"] is not True:
        raise crawler.CrawlerException("返回数据'res'字段取值不正确\n%s" %
                                       audit_pagination_response.json_data)
    # extract the track info
    audio_list_selector = PQ(audit_pagination_response.json_data["html"]).find(
        "ul.body_list li.item")
    for audio_index in range(0, audio_list_selector.size()):
        audio_info = {
            "audio_id": None,  # track id parsed from the page
            "audio_title": "",  # track title parsed from the page
        }
        audio_selector = audio_list_selector.eq(audio_index)
        # get the track id
        audio_id = audio_selector.find(".content_wrap").attr("sound_id")
        if not crawler.is_integer(audio_id):
            raise crawler.CrawlerException(
                "歌曲信息匹配歌曲id失败\n%s" %
                audio_list_selector.html().encode("UTF-8"))
        audio_info["audio_id"] = str(audio_id)
        # get the track title
        audio_title = audio_selector.find(".sound_title").attr("title")
        if not audio_title:
            raise crawler.CrawlerException(
                "歌曲信息匹配歌曲标题失败\n%s" %
                audio_list_selector.html().encode("UTF-8"))
        audio_info["audio_title"] = str(audio_title.encode("UTF-8").strip())
        result["audio_info_list"].append(audio_info)
    # determine whether this is the last page
    max_page_count = 1
    pagination_list_selector = PQ(
        audit_pagination_response.json_data["html"]).find(
            ".pagingBar_wrapper a.pagingBar_page")
    for pagination_index in range(0, pagination_list_selector.size()):
        pagination_selector = pagination_list_selector.eq(pagination_index)
        data_page = pagination_selector.attr("data-page")
        if data_page is None:
            continue
        if not crawler.is_integer(data_page):
            raise crawler.CrawlerException(
                "分页信息匹配失败\n%s" % audio_list_selector.html().encode("UTF-8"))
        max_page_count = max(max_page_count, int(data_page))
    result["is_over"] = page_count >= max_page_count
    return result
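
A minimal sketch of paging through a whole account with the helper above; the wrapper name get_all_audio is illustrative and not from the source:

def get_all_audio(account_id):
    # Walk index_tracks page by page until the helper reports the last page.
    audio_info_list = []
    page_count = 1
    while True:
        result = get_one_page_audio(account_id, page_count)
        audio_info_list.extend(result["audio_info_list"])
        if result["is_over"]:
            break
        page_count += 1
    return audio_info_list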
Пример #50
0
    def search(self, word):
        response = requests.get(self.URL.format(word=word), headers=headers)
        text = response.text
        # The HTML occasionally contains "𥝱", which breaks processing
        text = text.replace('𥝱', '')

        doc = PyQuery(text)
        results = []
        normal_dict = doc("div.NetDicHead")
        if normal_dict:
            for head in normal_dict:
                result = {'word': word, 'type': 'normal'}
                head = PyQuery(head)
                # If brackets (【】) are present, they hold kanji or a loanword
                match_kakko = re.compile(r"【(.*)】").search(head.text())
                if match_kakko:
                    kakko = match_kakko.group(1)
                    match_gairaigo = re.compile(r"[a-zA-Z]").search(kakko)
                    if match_gairaigo:
                        result['gogen'] = kakko
                        result['kana'] = word
                    else:
                        result['kanji'] = kakko
                        result['kana'] = head('b').text().replace(' ',
                                                                  '').replace(
                                                                      '・', '')
                for accent in head('span'):
                    accent = PyQuery(accent)
                    match_accent = re.compile(r"\[([0-9]*)\]").search(
                        accent.text())
                    if match_accent:
                        result['accent'] = result.get(
                            'accent', '') + match_accent.group(1) + ','
                if 'accent' in result:
                    result['accent'] = result['accent'][:-1]
                body = head.next()
                for a in body('a'):
                    a = PyQuery(a)
                    a.replaceWith(a.html())
                result['meaning'] = body.html()
                # When the word itself is kana-only
                if 'kana' not in result:
                    result['kana'] = word
                results.append(result)

        Jitsu_dict = doc("div.Jtnhj")
        if Jitsu_dict:
            result = {'word': word, 'type': 'Jitsu'}
            match = re.compile(
                r"読み方:<!--\/AVOID_CROSSLINK-->(.*)<br/?><!--AVOID_CROSSLINK-->別表記"
            ).search(Jitsu_dict.html())
            if match:
                result['kana'] = match.group(1)
                if result['kana'].find('<a') != -1:
                    result['kana'] = PyQuery(result['kana']).text()
            else:
                match = re.compile(
                    r"読み方:<!--\/AVOID_CROSSLINK-->(.*)<br/?>").search(
                        Jitsu_dict.html())
                if match:
                    result['kana'] = match.group(1)
                    if result['kana'].find('<a') != -1:
                        result['kana'] = PyQuery(result['kana']).text()

            if Jitsu_dict('.AM'):
                meaning = PyQuery('<div>')
                meaning.html(Jitsu_dict('.AM').nextAll())
            else:
                meaning = Jitsu_dict
            for a in meaning('a'):
                a = PyQuery(a)
                a.replaceWith(a.html())
            result['meaning'] = meaning.text()
            results.append(result)

        IT_dict = doc('div.Binit')
        if IT_dict:
            result = {'word': word, 'type': 'IT'}
            a = IT_dict('a').eq(0)
            if a.text().find('読み方') != -1:
                kana_tag = a.next('a').eq(0)
                result['kana'] = kana_tag.text().replace(' ', "")
            else:
                result['kana'] = word
                if IT_dict.text().find('【') != -1:
                    result['gogen'] = a.eq(0).text()
            for p in IT_dict('p'):
                p = PyQuery(p)
                for a in p('a'):
                    a = PyQuery(a)
                    a.replaceWith(a.html())
                if not p.html():
                    continue
                result['meaning'] = result.get('meaning',
                                               '') + "<p>" + p.html() + "</p>"
            result['kanji'] = IT_dict.prev("h2.midashigo").text()
            results.append(result)

        WIKI = doc('div.Wkpja')
        if WIKI:
            result = {'word': word, 'type': 'WIKI'}
            p = WIKI('p').not_(".WkpjaTs")
            for a in p('a'):
                a = PyQuery(a)
                a.replaceWith(a.html())
            result['meaning'] = p.html()
            result['kanji'] = WIKI.prev("h2.midashigo").text()
            results.append(result)
        if results:
            return {"status": 'success', "results": results}
        else:
            return {"status": 'error', "error_detail": "Nothing found."}
Пример #51
0
class Browser(object):
    def __init__(self, debug=False, opener_handlers=None):
        self.debug = debug
        self.tree = None
        self.current_response = None
        self.current_html = None
        self._pyquery = None
        self.form_manager = FormManager(self)
        self.cookie_jar = cookielib.CookieJar()
        self.opener_handlers = opener_handlers or []  # avoid a shared mutable default

        if self.debug:
            LOG.setLevel(logging.DEBUG)
        else:
            LOG.setLevel(logging.WARNING)

    def _set_response(self, response):
        self.current_response = response
        self.current_html = response.read()
        self.tree = html.fromstring(self.current_html)
        self._pyquery = PyQuery(self.tree)

    def _get_opener(self):
        cookie_processor = urllib2.HTTPCookieProcessor(self.cookie_jar)
        handlers = [cookie_processor] + self.opener_handlers
        return urllib2.build_opener(*handlers)

    def _open(self, request):
        url = self.get_absolute_url(request.get_full_url())
        abs_request = urllib2.Request(url, request.data, request.headers,
                                      request.origin_req_host,
                                      request.unverifiable)
        opener = self._get_opener()
        self._maybe_log_request(request)
        response = opener.open(abs_request)
        self._set_response(response)

    def _maybe_log_request(self, request):
        if not LOG.isEnabledFor(logging.DEBUG):
            return
        message = "HTTP request: (%s) %s" % (request.get_method(),
                                             request.get_full_url())
        if request.get_method() == "POST":
            message += "\n  POSTDATA: %s" % request.get_data()
        LOG.debug(message)

    def visit(self, url):
        self._open(urllib2.Request(url))

    def fill(self, selector_value_dict):
        return self.form_manager.fill(selector_value_dict)

    def submit(self, form_selector=None):
        request = self.form_manager.get_submit_request(form_selector)
        self._open(request)

    def query(self, selector):
        return self._pyquery(selector)


    def html(self, selector=None):
        if selector is None:
            return self._pyquery.html()
        # fall through: return the HTML of the matched selection
        return self.query(selector).html()

    def get_absolute_url(self, relative_url):
        if self.url is None:
            # Browser not used yet
            return relative_url

        return urlparse.urljoin(self.url, relative_url)

    @property
    def url(self):
        if not self.current_response:
            return None

        return self.current_response.url
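
An example session with the Browser above; the URL and selectors are placeholders, and fill() expects a selector-to-value dict handled by FormManager:

browser = Browser(debug=True)
browser.visit('http://example.com/login')
browser.fill({'input[name=username]': 'alice', 'input[name=password]': 's3cret'})
browser.submit('form')
print browser.query('h1').text()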
Пример #52
0
    if response.url != SERVER_URL:  #update host for redirects ex. goo.gl links
        logger.debug('Updating server host to match what was retrieved')
        SERVER_HOST = urlparse(response.url).netloc
    raw_html = response.text
    jQuery = PyQuery(raw_html)
    job_list = None
    for job_css in JOB_CSS_SEARCH_LIST:
        job_list = jQuery(job_css).text()
        if job_list:
            break
    if not job_list:
        logger.info('Running job checker against: ' + SERVER_HOST +
                    ' and could not parse page')
        email_message[
            'Subject'] = "Job Checker could not parse page at: " + SERVER_HOST
        body = "Raw HTML FROM: " + SERVER_HOST + ":\n" + jQuery.html(
            method='html')
        email_message.attach(MIMEText(body, 'plain'))
        send_email(email, email_message)
    else:
        match = JOB_REGEX.search(job_list)
        email_message['Subject'] = "Jobs Available at: " + SERVER_HOST
        body = "Content Retrieved from " + SERVER_HOST + ":\n" + job_list
        email_message.attach(MIMEText(body, 'plain'))
        logger.debug('Found job_list: ' + job_list)
        if not match:
            logger.info('Jobs have appeared so sending email.')
            send_email(email, email_message)
        else:
            logger.info('Found no jobs so not sending email')
except Exception as error:
    logger.error('Found error in email process:', exc_info=True)
Пример #53
0
        "name": text,
        "type": "Function",
        "path": href.replace(url, ""),
        "href": href
    })
    jQuery(item).find("a").attr("href", href.replace(url, ""))


# Step 1: create the docset folder
docsetPath = os.path.join(currentPath, output, "Contents", "Resources", "Documents")
if not os.path.exists(docsetPath):
    os.makedirs(docsetPath)

# Step 2: Copy the HTML Documentation
fin = codecs.open(os.path.join(docsetPath, "index.html"), "w", "utf-8")
newContent = jQuery.html()
fin.write(newContent)
fin.close()

# Step 2.1: Create a page for each function
for result in results:
    dest = os.path.join(docsetPath, result["href"].replace(url, ""))
    if not os.path.exists(dest):
        os.makedirs(dest)
    fin = open(os.path.join(dest, "index.html"), "w")
    fin.write(urllib2.urlopen(result["href"]).read())
    fin.close()

# Step 2.2: Download the CSS and JS
links = [
    "http://www.css88.com/jqapi-1.9/cssstyle/main.min.css",
Пример #54
0
 def index_page(self, response):
     title = response.save['river_name']
     url = response.url
     spider_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))  # crawl timestamp
     type_id = 0
     source = '百度百科'
     if url.find('https://baike.baidu.com/error.html') == 0:
         result = None
     else:
         context = self.filter_page(
             response,
             'body > div.body-wrapper > div.content-wrapper > div > div.main-content'
         )
         for i in ['em', 'span', 'a']:
             context = re.sub(r'<{}[^<>]*>'.format(i), r'', context)  # strip opening tags
             context = re.sub(r'</{}>'.format(i), r'', context)  # strip closing tags
         for i in ['em', 'div', 'span']:  # replace with <p> tags
             context = re.sub(r'<([/]*){}([/]*)>'.format(i), r'<\1p\2>',
                              context)
         context = re.sub(r'<p></p>', r'', context)  # drop empty tags
         context = re.sub(r'<p/>', r'', context)  # drop empty tags
         for i in range(10):  # unwrap layers of redundant nesting
             context = re.sub(r'<p>(<p>(?!<p>|</p>)[^<>]+<p/>)<p/>', r'\1',
                              context)
         context = re.sub(r'<[/]*(?=span|em)[^ \"</>]+[/]*>', r'',
                          context)  # drop span and em
         for count in range(10):
             for i in re.findall(r'<([^ \"<>]+)></\1>', context):  # drop empty tags
                 context = context.replace('<{}></{}>'.format(i, i), '')
                 context = context.replace('<{}/>'.format(i), '')
         context = re.sub(r'<a[^<>]*>', r'', context)  # strip links
         context = re.sub(r'</a>', r'', context)  # strip links
         context = re.sub(r'<div>', r'<p>', context)
         context = re.sub(r'</div>', r'</p>', context)
         context = re.sub(
             r'<p>([^<>]*)<img src=\"([^ \"<>]+)\"/></p>',
             r'<p>\1</p><img src="\2" alt="" class="entry__img"/>', context)
         py = PyQuery(context)
         context = py.html()
         context = re.sub(r'(<img[^<>]+/>)<p>([^<>]+)</p>',
                          r'\1<figcaption>\2</figcaption>', context)
         context = re.sub(r'<p></p>', r'', context)
         py = PyQuery(context)
         context = py.html()
         context = re.sub(r'<p></p>', r'', context)
         context = re.sub(r'<p/>', r'', context)
         context = re.sub(r'(<p>[^<>]*)<img', r'\1</p><img', context)
         context = re.sub(r'(</p>(?!</p>).*)</p>', r'\1', context)
         context = re.sub(r'(<p>(?!</p>).*)<p>', r'\1', context)
         context = re.sub(r'<([/]*)h[1-9]>', r'<\1h3>', context)
         context = re.sub(r'<h3>', r'<h3 class="text-center">', context)
         context = re.sub('^<h3', '<div class=\"entry__article\"><h3',
                          context)  # document starts with a heading
         context = re.sub('</h3>$', '</div></h3>', context)  # ends with a heading
         context = re.sub(
             '^<img', '<div class=\"entry__img-holder text-center\"><img',
             context)  # starts with an image
         context = re.sub('\">$', '\"></div>', context)  # trailing image without a caption
         context = re.sub('</figcaption>$', '</figcaption></div>',
                          context)  # trailing image with a caption
         context = re.sub('^<p>', '<div class=\"entry__article\"><p>',
                          context)  # starts with body text
         context = re.sub('</p>$', '</p></div>', context)  # ends with body text
         context = re.sub(
             '</p><img',
             '</p></div><div class=\"entry__img-holder text-center\"><img',
             context)  # image right after body text
         context = re.sub(
             '</figcaption><p>',
             '</figcaption></div><div class=\"entry__article\"><p>',
             context)  # captioned image before body text
         context = re.sub('\"><p>',
                          '\"></div><div class=\"entry__article\"><p>',
                          context)  # uncaptioned image before body text
         context = re.sub(
             '</h3><img',
             '</h3></div><div class=\"entry__img-holder text-center\"><img',
             context)  # image right after a heading
         context = re.sub(
             '</figcaption><h3',
             '</figcaption></div><div class=\"entry__article\"><h3',
             context)  # captioned image before a heading
         ##############
         context = self.context_css + self.context_html_tmp.format(context)
         context = re.sub(
             r'(<link rel=\"stylesheet\" type=\"text/css\" href=\"css/style.css\"><link rel=\"stylesheet\" type=\"text/css\" href=\"css/bootstrap.css\"><link rel=\"stylesheet\" type=\"text/css\" href=\"css/font-awesome.min.css\"><article class=\"entry\"><div class=\"entry__article\"></div><div class=\"entry__article\">).*<p>[\d]+</p>',
             r'\1', context)
         # with open('/home/mininet/test.txt','w+') as f:
         #     f.write(context)
         # print context
         context = context.replace('data-src', 'src')
         result = [
             title, url, '', context, '', '', type_id, spider_time, source
         ]
     return result
Пример #55
0
    def _generate_translation(self):
        """ Generate child description. """
        desc = PyQuery(HTML_TEMPLATE)

        # 1. Program type only if Home Based + Birthday estimate
        ########################################################
        child = self.child_id
        if child.cdsp_type == 'Home Based':
            desc('.program_type').html(
                self.home_based_lang[self.env.lang][child.gender].format(
                    preferred_name=child.preferred_name))
        else:
            desc('#program_type').remove()
        if child.estimated_birthdate:
            desc('.birthday_estimate').html(
                _("* The birthday is an estimation."))
        else:
            desc('#birthday_estimate').remove()

        # 2. Household
        ##############
        household = child.household_id.with_context(active_gender=child.gender)
        live_with = self._live_with()
        desc('#live_with').html(live_with)

        if not household.father_living_with_child:
            f_alive = desc('.father').children('.is_alive')
            f_alive[0].text = _('Father alive')
            f_alive[1].text = household.translate('father_alive')
        else:
            desc('.father').remove()
        self._job(desc('.father_job'), 'father')

        if not household.mother_living_with_child:
            m_alive = desc('.mother').children('.is_alive')
            m_alive[0].text = _('Mother alive')
            m_alive[1].text = household.translate('mother_alive')
        else:
            desc('.mother').remove()
        self._job(desc('.mother_job'), 'mother')

        if household.nb_brothers:
            desc('.brothers')[0].text = _("Number of brothers")
            desc('.brothers')[1].text = str(household.nb_brothers)
        else:
            desc('.brothers').remove()
        if household.nb_sisters:
            desc('.sisters')[0].text = _("Number of sisters")
            desc('.sisters')[1].text = str(household.nb_sisters)
        else:
            desc('.sisters').remove()

        # 3. Schooling
        ##############
        if child.us_grade_level and child.us_grade_level != 'Not Enrolled':
            # Make sure the education level is set
            child.convert_us_grade_to_education_level()
            desc('#school_attending').html(
                self.school_yes_lang[self.env.lang][child.gender].format(
                    preferred_name=child.preferred_name,
                    level=child.translate('education_level')))
            if child.academic_performance:
                desc('.school_performance')[0].text = _('School performance')
                desc('.school_performance')[1].text = child.translate(
                    'academic_performance')
            else:
                desc('#school_performance').remove()
            if child.major_course_study:
                desc('.school_subject')[0].text = _('Best school subject')
                desc('.school_subject')[1].text = child.translate(
                    'major_course_study')
            else:
                desc('#school_subject').remove()
            if child.vocational_training_type and \
                    child.vocational_training_type.lower() not in (
                        'not enrolled', 'other'):
                desc('.vocational_training')[0].text = _('Vocational training')
                desc('.vocational_training')[1].text = child.translate(
                    'vocational_training_type')
            else:
                desc('#vocational_training').remove()
        else:
            desc('#school_attending').html(
                self.school_no_lang[self.env.lang][child.gender].format(
                    preferred_name=child.preferred_name))
            desc('.school').remove()

        # 4. House duties
        #################
        if child.duty_ids:
            desc('#house_duties_intro').html(
                self.duties_intro_lang[self.env.lang][child.gender])
            desc('#house_duties_list').html(''.join([
                '<li>' + duty.value + '</li>' for duty in child.duty_ids[:3]
            ]))
        else:
            desc('.house_duties').remove()

        # 5. Church activities
        ######################
        if child.christian_activity_ids:
            desc('#church_activities_intro').html(
                self.church_intro_lang[self.env.lang][child.gender])
            desc('#church_activities_list').html(''.join([
                '<li>' + activity.value + '</li>'
                for activity in child.christian_activity_ids[:3]
            ]))
        else:
            desc('.church_activities').remove()

        # 6. Hobbies
        ############
        if child.hobby_ids:
            desc('#hobbies_intro').html(
                self.hobbies_intro_lang[self.env.lang][child.gender].format(
                    preferred_name=child.preferred_name))
            desc('#hobbies_list').html(''.join([
                '<li>' + hobby.value + '</li>' for hobby in child.hobby_ids[:3]
            ]))
        else:
            desc('.hobbies').remove()

        # 7. Health
        ###########
        if child.physical_disability_ids or child.chronic_illness_ids:
            desc('#handicap_intro').html(
                self.handicap_intro_lang[self.env.lang][child.gender].format(
                    preferred_name=child.preferred_name))
            handicap_list = []
            if child.physical_disability_ids:
                handicap_list.extend([
                    '<li>' + handicap.value + '</li>'
                    for handicap in child.physical_disability_ids
                ])
            if child.chronic_illness_ids:
                handicap_list.extend([
                    '<li>' + illness.value + '</li>'
                    for illness in child.chronic_illness_ids
                ])
            desc('#handicap_list').html(''.join(handicap_list))
        else:
            desc('.handicap').remove()

        return desc.html()
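
The template code above applies one pattern throughout: each optional section is a container selected by id, holding a label/value pair of elements selected by class; the pair is filled when data exists, and the whole container is removed when it does not. A minimal self-contained sketch of that fill-or-remove pattern (the markup here is hypothetical, not the actual report template):

from pyquery import PyQuery

# Hypothetical fragment mirroring one optional section of the template.
desc = PyQuery(
    '<div>'
    '  <div id="school_performance">'
    '    <span class="school_performance"></span>'
    '    <span class="school_performance"></span>'
    '  </div>'
    '</div>')

performance = 'Good'  # would come from child.translate('academic_performance')
if performance:
    desc('.school_performance')[0].text = 'School performance'  # label
    desc('.school_performance')[1].text = performance           # value
else:
    desc('#school_performance').remove()  # drop the whole section

print(desc.html())
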
    def _render_span(self,
                     p: Paragraph,
                     pq: PyQuery,
                     bold=False,
                     italic=False,
                     strike=False,
                     underline=False,
                     font_size=None,
                     sub=False,
                     sup=False):
        """
        转换span
        change 19.5.3
            公式转换错误,则直接用图片
        :param pq:
        :return:
        """
        try:
            if pq.attr('data-latex'):  # formula
                omml_str = converter.to_omml(
                    self.mini_trim(pq.attr('data-latex')))
                omml_str = omml_str.replace(
                    '<m:oMath',
                    '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
                )

                pq(p._element).append(omml_str)
                return
            if pq.has_class("math-tex"):  # formula
                if pq.attr('data-latex'):
                    omml_str = pq.attr('data-latex')
                else:
                    omml_str = html.unescape(
                        pq.html()) if pq.html() is not None else ''
                omml_str = omml_str.replace(r'\(', '').replace(r'\)', '')
                omml_str = converter.to_omml(self.mini_trim(omml_str))

                omml_str = omml_str.replace(
                    '<m:oMath',
                    '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
                )

                pq(p._element).append(omml_str)
                return

            # Afanti formula
            if pq.has_class('afanti-latex'):
                metadata = AftQuestion(pq).parse_element()
                if metadata.startswith('^') or metadata.startswith('_'):
                    last_ele = pq(p._element).children()[-1]
                    metadata = last_ele.text[-1] + metadata
                    last_ele.text = last_ele.text[:-1]

                omml_str = converter.to_omml(self.mini_trim(metadata))
                omml_str = omml_str.replace(
                    '<m:oMath',
                    '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
                )

                pq(p._element).append(omml_str)
                return
        except EquationConvertError:
            img = PyQuery('img', pq)
            self._render_img(p, img)
            return

        bold = any([
            bold,
            self._get_pq_style(pq, 'font-weight') == 'bold',
            self._get_pq_style(pq, 'font-weight') == 'bolder'
        ])
        italic = any(
            [italic, self._get_pq_style(pq, 'font-style') == 'italic'])
        strike = any([
            strike,
            self._get_pq_style(pq, 'text-decoration') == 'line-through',
            self._get_pq_style(pq, 'text-decoration-line') == 'line-through'
        ])
        underline = any([
            underline,
            self._get_pq_style(pq, 'text-decoration') == 'underline',
            self._get_pq_style(pq, 'text-decoration-line') == 'underline'
        ])

        if self._get_pq_style(pq, 'font-size'):
            size = self._get_pq_style(pq, 'font-size')
            if size.endswith('px'):
                size = size[:-2]
                size = int(float(size))
                font_size = self.get_pt(size)
            elif size.endswith('pt'):
                size = size[:-2]
                size = float(size)
                font_size = Pt(size)
        # self.__render_inline_element(p, pq, bold=bold, italic=italic, underline=underline, font_size=font_size,
        #                              strike=strike)

        contents = pq.contents()
        for item in contents:
            if isinstance(item, (HtmlElement, _Element)):
                self._render_element(p,
                                     item,
                                     is_root=True,
                                     bold=bold,
                                     italic=italic,
                                     strike=strike,
                                     underline=underline,
                                     font_size=font_size)
                continue
            run = p.add_run(self._clear_text(item))
            self.__force_simsun(run)
            if self._get_pq_style(pq, 'font-name'):
                run.font.name = self._get_pq_style(pq, 'font-name')
            if font_size:
                run.font.size = font_size

            run.underline = underline

            run.bold = bold
            run.italic = italic
            run.font.strike = strike
            run.font.superscript = sup
            run.font.subscript = sub
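
Two mechanical details of _render_span are easy to miss: CSS font sizes arrive as 'px' or 'pt' strings and must be mapped onto python-docx Pt values (the px conversion is delegated to self.get_pt, whose factor is not shown here), and every converted formula has the OMML math namespace injected so the <m:oMath> fragment parses on its own when appended to the paragraph XML. A minimal stand-alone sketch of both steps, assuming the common 96 dpi px-to-pt mapping (an assumption, not confirmed by the source):

from docx.shared import Pt

OMML_NS = 'xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'

def qualify_omath(omml_str):
    # Mirrors the three identical .replace() calls above: qualify the root
    # <m:oMath> element with the math namespace so it is a valid fragment.
    return omml_str.replace('<m:oMath', '<m:oMath ' + OMML_NS)

def css_size_to_pt(size):
    # 'px' -> points assumes 96 dpi, i.e. 1px = 0.75pt (assumption; the
    # original delegates to self.get_pt, whose conversion is not shown).
    if size.endswith('px'):
        return Pt(float(size[:-2]) * 0.75)
    if size.endswith('pt'):
        return Pt(float(size[:-2]))
    return None
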
Example #57
        else:
            newfile = special_chapters[i]
        # TODO: handle appendices and index
        link_replacements[file.replace('trinkethtml/', '')] = newfile

    for i, file in enumerate(files[1:]):  # skip book index
        print("Processing: ", file)
        selector = 'div.columns > ul > li:nth-child(' + str(i + 1) + ')'
        list_items = d(selector)
        list_items('li').eq(0).addClass('has-dropdown')
        list_items('ul').addClass('dropdown')
        toc = PyQuery('<div><ul class="right"></ul></div>')
        toc('ul').html(list_items)
        thisfile = file.replace('trinkethtml/', '')
        newfile = link_replacements[thisfile]
        toc_text = re.sub(thisfile, web_dir + newfile, toc.html(method='html'))
        #print(toc_text)

        # Extract chapter text
        with open(file) as f:
            chapter_raw = f.read()
        chapter_query = PyQuery(chapter_raw)
        chapter_text = chapter_query(".bookchapter").html(method='html')

        # Replace old links
        for old, new in link_replacements.items():
            chapter_text = re.sub(old, web_dir + new, chapter_text)
        # placeholder for tabs and newlines since re.sub will clobber them otherwise
        # print(re.findall(r'^.*?\\[tn].*?$', chapter_text, flags=re.M))
        chapter_text = re.sub(r'\\([tn])',
                              r'shouldbe\g<1>',
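
The raw replacement string and the 'shouldbe' placeholder are worth a quick illustration: re.sub interprets backslash escapes in the replacement argument, so literal '\t'/'\n' sequences in the chapter text are first rewritten to a harmless placeholder, and \g<1> re-inserts the captured letter. A tiny demo of the mechanics:

import re

text = r'indent with \t, break with \n'  # backslash sequences as literal text
protected = re.sub(r'\\([tn])', r'shouldbe\g<1>', text)
print(protected)  # indent with shouldbet, break with shouldben
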
Example #58
class DocumentPublisherEngine(object):
    def __init__(self, pro, doc, group, organization=None):

        self.project = pro
        self.document = doc
        self.groups = [group.key, Group.get_worldshare().key]
        self.organization = organization

        self.user = User()
        self.user.groups = self.groups

        if organization:
            self.user.organization = organization.key

        self.html = ''
        self.body = Pq('<span></span>')

    def render(self):
        self._process_root()
        self._process_parent(self.project)
        self.html = self.body.html(method='html')

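    # Builds the skeleton for the root node: a wrapper <span> holding a
    # children_span (leaf children) and a parent_children_span (children that
    # are themselves parents), optionally nested in a <ul>/<ol> when the
    # document attribute marks the project as a list.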
    def _process_root(self):
        attr = self.project.get_attr_by_doc(self.document)
        self.project.render_object = RenderObject()
        self.project.render_object.span = Pq('<span></span>')
        self.project.render_object.span.add_class('project_span')
        self.project.render_object.children_span = Pq('<span></span>')
        self.project.render_object.children_span.add_class('children_span')
        self.project.render_object.parent_children_span = Pq('<span></span>')
        self.project.render_object.parent_children_span.add_class(
            'parent_children_span')

        if attr and attr.is_unordered_list():
            self.project.render_object.ul = Pq('<ul></ul>')
            self.project.render_object.ul.append(
                self.project.render_object.children_span)
            self.project.render_object.ul.append(
                self.project.render_object.parent_children_span)
            self.project.render_object.span.append(
                self.project.render_object.ul)
            self.project.render_object.cur_attr = attributes.UNORDERED_LIST

        elif attr and attr.is_ordered_list():
            self.project.render_object.ul = Pq('<ol></ol>')
            self.project.render_object.ul.append(
                self.project.render_object.children_span)
            self.project.render_object.ul.append(
                self.project.render_object.parent_children_span)
            self.project.render_object.span.append(
                self.project.render_object.ul)
            self.project.render_object.cur_attr = attributes.ORDERED_LIST

        else:
            self.project.render_object.span.append(
                self.project.render_object.children_span)
            self.project.render_object.span.append(
                self.project.render_object.parent_children_span)
            self.project.render_object.cur_attr = attributes.NONE

        self.body.append(self.project.render_object.span)

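    # Depth-first walk over the concept tree. Children the user cannot read
    # are skipped; leaves seen before the first parent child land in
    # children_span, and everything from the first parent child onwards goes
    # to parent_children_span.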
    def _process_parent(self, parent):
        children = ndb.get_multi(parent.children)
        parent_span = False
        for child in children:
            if not child or not child.has_permission_read(self.user):
                continue
            child.parent_obj = parent
            self._render(child, parent)
            if not child.is_parent() and not parent_span:
                parent.render_object.children_span.append(
                    child.render_object.span.remove())
            else:
                parent_span = True
                parent.render_object.parent_children_span.append(
                    child.render_object.span.remove())
            self._process_parent(child)

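    # Builds one concept's <span> skeleton (id, data- attributes, phrasing
    # span, expand icon for parents, the two child containers), resolves list
    # rendering via explicit attributes or AutoAttributeEngine heuristics,
    # then dispatches on the resolved attribute: header, paragraph, image or
    # none.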
    def _render(self, concept, parent):
        concept.render_object = RenderObject()
        attr = concept.get_attr_by_doc(self.document)

        ordered_list = False
        unordered_list = False

        concept.render_object.span = Pq('<span></span>')
        concept.render_object.span.attr('id', concept.id)
        if attr:
            concept.render_object.span.attr('data-attr',
                                            ' '.join(attr.attributes))
        concept.render_object.span.add_class('concept')

        if not attr or (attr and not attr.is_no_list()):
            if not attr or (attr and not attr.is_unordered_list()):
                auto_list = AutoAttributeEngine.is_ordered_list(parent,
                                                                None,
                                                                self.document,
                                                                user=self.user)
                if attr and attr.is_ordered_list():
                    ordered_list = True
                elif concept.is_parent() and auto_list:
                    ordered_list = True
            if not attr or (attr and not attr.is_ordered_list()):
                auto_list = AutoAttributeEngine.is_unordered_list(
                    parent, None, self.document, user=self.user)
                if attr and attr.is_unordered_list():
                    unordered_list = True
                elif concept.is_parent() and auto_list:
                    unordered_list = True

        concept.render_object.render_as_ordered_list = ordered_list
        concept.render_object.render_as_unordered_list = unordered_list

        attr_str = AutoAttributeEngine.get_attr(concept,
                                                attr,
                                                doc=self.document,
                                                user=self.user)

        concept.render_object.span.attr('data-ordered-list', str(ordered_list))
        concept.render_object.span.attr('data-unordered-list',
                                        str(unordered_list))

        concept.render_object.phr_span = Pq('<span></span>')
        concept.render_object.phr_span.attr('id',
                                            '%s-%s' % (concept.id, 'phr_span'))
        concept.render_object.phr_span.add_class('phr_span')
        concept.render_object.span.append(concept.render_object.phr_span)

        if concept.is_parent():
            concept.render_object.more_icon = Pq('<i></i>')
            concept.render_object.more_icon.attr(
                'id', '%s-%s' % (concept.id, 'more_icon'))
            concept.render_object.more_icon.add_class(
                'fa fa-angle-double-right expand_child_inc move-icon')
            if concept.depth < 0:
                concept.render_object.more_icon.add_class('hidden')
            concept.render_object.span.append(concept.render_object.more_icon)

        concept.render_object.children_span = Pq('<span></span>')
        concept.render_object.children_span.attr(
            'id', '%s-%s' % (concept.id, 'children_span'))
        concept.render_object.children_span.add_class('children_span')
        concept.render_object.span.append(concept.render_object.children_span)
        if concept.depth >= 0:
            concept.render_object.children_span.add_class('hidden')
            concept.render_object.children_span.attr('data-collapsed', 'true')

        concept.render_object.parent_children_span = Pq('<span></span>')
        concept.render_object.parent_children_span.attr(
            'id', '%s-%s' % (concept.id, 'parent_children_span'))
        concept.render_object.parent_children_span.add_class(
            'parent_children_span')
        concept.render_object.span.append(
            concept.render_object.parent_children_span)
        if concept.depth >= 0:
            concept.render_object.parent_children_span.add_class('hidden')
            concept.render_object.parent_children_span.attr(
                'data-collapsed', 'true')

        self._render_text(concept)

        if attr_str == attributes.HEADER:
            self._render_header(concept)
        elif attr_str == attributes.PARAGRAPH:
            self._render_paragraph(concept)
        elif attr_str == attributes.IMAGE:
            self._render_image(concept)
        elif attr_str == attributes.NONE:
            self._render_none(concept)

        if ordered_list:
            self._render_ordered_list(concept)
        elif unordered_list:
            self._render_unordered_list(concept)

        if AutoAttributeEngine.is_list_item(concept,
                                            self.document,
                                            user=self.user):
            self._render_list_item(concept)

        concept.render_object.cur_attr = attr_str

    def _render_none(self, concept):
        pass

    def _render_image(self, concept):
        concept.render_object.img_figure = Pq('<figure></figure>')
        concept.render_object.img_figure.attr(
            'id', '%s-%s' % (concept.id, 'img-figure'))
        concept.render_object.img_figure.add_class('img-figure')
        concept.render_object.phr_span.append(concept.render_object.img_figure)

        concept.render_object.img = Pq('<img>')
        concept.render_object.img.attr('id',
                                       '%s-%s' % (concept.id, 'concept-img'))
        concept.render_object.img.attr('alt',
                                       concept.get_phrasing(doc=self.document))
        concept.render_object.img.attr('src',
                                       '/media/download/%s' % concept.id)
        concept.render_object.img.add_class('concept-img img-full')
        concept.render_object.img_figure.append(concept.render_object.img)

        concept.render_object.img_caption = Pq('<figcaption></figcaption>')
        concept.render_object.img_caption.attr(
            'id', '%s-%s' % (concept.id, 'caption'))
        concept.render_object.img_caption.append(
            concept.render_object.phr_text_span.remove())
        concept.render_object.img_caption.add_class('caption')
        concept.render_object.img_figure.append(
            concept.render_object.img_caption)

        concept.render_object.phr_text_span.remove_class('phr_text_span')
        concept.render_object.phr_text_span.add_class('phr_text_span_img')

    def _render_unordered_list(self, concept):
        concept.render_object.ul = Pq('<ul></ul>')
        concept.render_object.ul.attr('id', '%s-%s' % (concept.id, 'ul'))
        concept.render_object.ul.append(
            concept.render_object.children_span.remove())
        concept.render_object.ul.append(
            concept.render_object.parent_children_span.remove())
        concept.render_object.span.append(concept.render_object.ul)

    def _render_ordered_list(self, concept):
        concept.render_object.ol = Pq('<ol></ol>')
        concept.render_object.ol.attr('id', '%s-%s' % (concept.id, 'ol'))
        concept.render_object.ol.append(
            concept.render_object.children_span.remove())
        concept.render_object.ol.append(
            concept.render_object.parent_children_span.remove())
        concept.render_object.span.append(concept.render_object.ol)

    def _render_list_item(self, concept):
        concept.render_object.li = Pq('<li></li>')
        concept.render_object.li.attr('id', '%s-%s' % (concept.id, 'li'))
        concept.render_object.li.append(
            concept.render_object.phr_span.children().remove())
        concept.render_object.phr_span.append(concept.render_object.li)

        if AutoAttributeEngine.is_ordered_list(concept.get_parent(), None,
                                               self.document):
            concept.render_object.render_as_ordered_list = True
        elif AutoAttributeEngine.is_unordered_list(concept.get_parent(), None,
                                                   self.document):
            concept.render_object.render_as_unordered_list = True

    def _render_paragraph(self, concept):
        concept.render_object.p = Pq('<p></p>')
        concept.render_object.p.attr('id', '%s-%s' % (concept.id, 'p'))
        concept.render_object.p.append(
            concept.render_object.span.children().remove())
        concept.render_object.span.append(concept.render_object.p)
        concept.render_object.span.append(
            concept.render_object.parent_children_span.remove())

    def _render_header(self, concept):
        hl = concept.depth + 1
        if hl > 6:
            hl = 6
        concept.render_object.header = Pq('<h%s></h%s>' % (hl, hl))
        concept.render_object.header.attr('id',
                                          '%s-%s' % (concept.id, 'header'))
        concept.render_object.header.append(
            concept.render_object.phr_text_span.remove())
        concept.render_object.phr_span.append(concept.render_object.header)
        if concept.render_object.more_icon:
            concept.render_object.header.append(
                concept.render_object.more_icon)

    def _render_text(self, concept):
        phrasing_text = concept.get_phrasing(doc=self.document)
        if not phrasing_text:
            # Fall back to the default phrasing when the document-specific
            # one is empty.
            phrasing_text = concept.get_phrasing()

        concept.render_object.phr_text_span = Pq('<span></span>')
        concept.render_object.phr_text_span.attr(
            'id', '%s-%s' % (concept.id, 'phr_text_span'))
        concept.render_object.phr_text_span.add_class('phr_text_span')
        concept.render_object.phr_text_span.append(phrasing_text + ' ')
        concept.render_object.phr_span.append(
            concept.render_object.phr_text_span)
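
A hedged usage sketch of the engine; the project, document and group fixtures come from the surrounding ndb models, so their construction is assumed here rather than shown by the source:

engine = DocumentPublisherEngine(project, document, group)  # fixtures assumed
engine.render()            # walks the concept tree and fills engine.html
published_html = engine.html
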