Exemplo n.º 1
0
def xml_fmt_convert(html):
    """Turn an HTML fragment into single-line, ``<body>``-wrapped markup.

    The fragment is parsed with BeautifulSoup, cleaned up through the
    project helpers ``remove_tag`` / ``unwrap_tag`` (defined elsewhere),
    and serialised back to a string with every ``<img>`` given an explicit
    ``</img>`` closing tag.

    :param html: markup to convert.
    :returns: the converted markup with all CR and LF characters removed.
    """
    doc = bs4.BeautifulSoup(html)

    # <pre> is dropped together with its contents; <br> and <span> are
    # removed but their contents are kept.
    doc = remove_tag(doc, ['pre'])
    doc = unwrap_tag(doc, ['br', 'span'])

    # Every <div> becomes a <p>.
    for div in doc.find_all('div'):
        div.name = 'p'

    # Move the inline 'style' declarations of each <img> onto the tag as
    # plain attributes (with the 'px' unit stripped), then drop 'style'.
    for img in doc.find_all('img'):
        try:
            style_text = img['style'].replace('px', '')
            for attr, val in get_style_couple(style_text):
                img[attr] = val
            del img['style']
        except KeyError:
            # No 'style' attribute on this <img>: nothing to convert.
            pass

    markup = '<body>' + replaceCharEntity(str(doc)) + '</body>'

    # Rewrite <img .../> as <img ...></img>: first drop the self-closing
    # slash everywhere, then append a closing tag after each <img ...>.
    def _close_img(match):
        return match.group() + '</img>'

    markup = markup.replace('/>', '>')
    markup = re.sub('<img[^>]+>', _close_img, markup)

    return markup.replace('\r', '').replace('\n', '')
Exemplo n.º 2
0
def getCatArticle(cat_id=None):
    """Return the contents of all articles in a category as one ``<body>``.

    Each ``NewArticle`` whose ``cat`` matches ``cat_id`` has its content
    passed through ``replaceCharEntity`` and every ``<p>imgN</p>``
    placeholder substituted via the ``find_img`` callback (both defined
    elsewhere). The pieces are concatenated in query order.

    :param cat_id: category identifier; a falsy value yields ``''``.
    :returns: the combined markup with CRLF pairs removed.
    """
    if not cat_id:
        return ''

    img_marker = re.compile(r"<p>img\d+\</p>")

    pieces = ['<body>']
    for article in NewArticle.objects.filter(cat=cat_id):
        cleaned = replaceCharEntity(article.content)
        pieces.append(img_marker.sub(find_img, cleaned))
    pieces.append('</body>')

    return re.sub("\r\n", '', ''.join(pieces))
Exemplo n.º 3
0
def getCatArticle(cat_id=None):
    """Build a single ``<body>`` document from every article of a category.

    For each ``NewArticle`` filtered by ``cat=cat_id`` the content is run
    through ``replaceCharEntity`` and its ``<p>imgN</p>`` placeholders are
    replaced using ``find_img`` (both helpers live outside this block).

    :param cat_id: category identifier; ``''`` is returned when it is falsy.
    :returns: ``<body>...</body>`` markup with CRLF pairs stripped.
    """
    if not cat_id:
        return ''

    placeholder = re.compile(r"<p>img\d+\</p>")
    inner = ''.join(
        placeholder.sub(find_img, replaceCharEntity(entry.content))
        for entry in NewArticle.objects.filter(cat=cat_id)
    )

    document = '<body>' + inner + '</body>'
    return document.replace("\r\n", '')
Exemplo n.º 4
0
def test_xml(id):
    """Build the raw and the converted mobile markup for one event.

    The event content is read from ``NewAppEvent(None, id)`` at
    ``['event_content'][0][1]``, cleaned up with BeautifulSoup and the
    project helpers ``unwrap_tag`` / ``get_style_couple`` /
    ``replaceCharEntity`` / ``find_img`` (all defined elsewhere), and
    wrapped in ``<body>``.

    :param id: event identifier forwarded to ``NewAppEvent``.
    :returns: dict with ``'mobileU'`` (the content with line breaks
        removed) and ``'mobileURL'`` (the converted markup).
    """
    event = {}
    ca = NewAppEvent(None, id)
    cont = ca['event_content'][0][1]
    # Removing LF and CR individually already removes every CRLF pair, so
    # no separate CRLF replacement is needed.
    cont = cont.replace('\n', '').replace('\r', '')
    event['mobileU'] = cont

    soup = BeautifulSoup(cont)

    # remove these tags but leave their contents in place
    soup = unwrap_tag(soup, ['br', 'span', 'pre'])

    # rename tag: div -> p
    for div in soup.find_all('div'):
        div.name = 'p'

    # deal with the 'style' attributes in <img>: copy each declaration
    # onto the tag as a plain attribute (without the 'px' unit)
    for img in soup.find_all('img'):
        try:
            couple = get_style_couple(img['style'].replace('px', ''))
            for attr, val in couple:
                img[attr] = val
            del img['style']
        except KeyError:
            # <img> without a 'style' attribute: leave it untouched
            pass

    # Plain raw string instead of the Python-2-only ur"" prefix: the
    # pattern is pure ASCII and matches the form used elsewhere in the file.
    soup_fmt = re.sub(r"<p>img\d+\</p>", find_img,
                      replaceCharEntity(str(soup)))

    soup_fmt = '<body>' + soup_fmt + '</body>'

    # convert <img.../> into <img...></img>; only <img> tags are touched so
    # other self-closing tags do not receive a stray </img>
    soup_fmt = re.sub(r'(<img\b[^>]*)/>', r'\1></img>', soup_fmt)

    event['mobileURL'] = soup_fmt.replace('\r', '').replace('\n', '')

    return event
Exemplo n.º 5
0
def gettext(strings=None):
    """Convert a content string into quoted, ``<body>``-wrapped markup.

    Steps, in order:

    1. ``replaceCharEntity`` is applied and each ``<p>imgN</p>``
       placeholder is substituted through ``find_img``.
    2. The result is wrapped in ``<body>...</body>``.
    3. Every ``name=value`` token is passed to ``add_quote``.
    4. Every ``:value;`` run is rewritten to ``="value"``.
    5. CRLF pairs are removed.

    ``replaceCharEntity``, ``find_img`` and ``add_quote`` are defined
    elsewhere in the project.

    :param strings: source text; a falsy value yields ``''``.
    :returns: the converted markup.
    """
    if not strings:
        return ''

    img_marker = re.compile(r"<p>img\d+\</p>")
    con = '<body>' + img_marker.sub(find_img, replaceCharEntity(strings)) + '</body>'

    # add quote before and after the attributes
    # Raw string: the former non-raw literal relied on the invalid escape
    # sequence "\w", which newer Python versions warn about. The pattern
    # seen by the regex engine is unchanged.
    attr_pattern = re.compile(r'(?P<pre>\w+=)(?P<post>"?[^ \f\n\r\t\v<>]+"?)')
    con = attr_pattern.sub(add_quote, con)

    con = re.sub(r":(\S+);", r'="\1"', con)

    return re.sub("\r\n", '', con)
Exemplo n.º 6
0
def gettext(strings=None):
    """Replace image placeholders in a content string.

    The text is passed through ``replaceCharEntity`` and every
    ``<p>imgN</p>`` placeholder is substituted using the ``find_img``
    callback (both defined elsewhere).

    :param strings: source text; a falsy value yields ``''``.
    :returns: the text with placeholders substituted.
    """
    if strings:
        return re.sub(r"<p>img\d+\</p>", find_img, replaceCharEntity(strings))
    return ''
Exemplo n.º 7
0
def str_html(strs=None):
    """Flatten simple HTML into CRLF-separated text.

    All spaces, CR and LF characters are removed, ``<p>`` is dropped, and
    ``</p>`` / ``<br>`` / ``<br/>`` each become a CRLF line break. Doubled
    line breaks are collapsed once, and the result is passed through
    ``replaceCharEntity`` (defined elsewhere).

    :param strs: markup to flatten; ``None`` (the default) yields ``''``
        instead of raising ``AttributeError``.
    :returns: the flattened text.
    """
    if strs is None:
        return ''

    # Order matters: spaces and raw line breaks go first, so a "<br />"
    # in the input has already become "<br/>" by the time tags are handled.
    replacements = (
        (' ', ''),
        ('\r', ''),
        ('\n', ''),
        ('<p>', ''),
        ('</p>', '\r\n'),
        ('<br>', '\r\n'),
        ('<br/>', '\r\n'),
        ('\r\n\r\n', '\r\n'),
    )
    for old, new in replacements:
        strs = strs.replace(old, new)

    return replaceCharEntity(strs)
Exemplo n.º 8
0
def str_html(str=None):
    """Flatten simple HTML into CRLF-separated text.

    Spaces, CR and LF characters are stripped, ``<p>`` is removed, and
    ``</p>`` / ``<br>`` / ``<br/>`` are turned into CRLF line breaks.
    Doubled line breaks are collapsed once before the text is handed to
    ``replaceCharEntity`` (defined elsewhere).

    :param str: markup to flatten. The name shadows the builtin but is
        kept because callers may pass it by keyword. ``None`` (the
        default) yields ``''`` instead of raising ``AttributeError``.
    :returns: the flattened text.
    """
    if str is None:
        return ''

    # Work on a local name so the shadowed builtin is not used further.
    text = str.replace(' ', '').replace('\r', '').replace('\n', '')

    # Spaces are already gone, so "<br />" has become "<br/>" by now and
    # needs no separate replacement.
    text = (text.replace('<p>', '')
                .replace('</p>', '\r\n')
                .replace('<br>', '\r\n')
                .replace('<br/>', '\r\n')
                .replace('\r\n\r\n', '\r\n'))

    return replaceCharEntity(text)