Example #1
from bs4 import BeautifulSoup, CData


def get_xml_from_dict(params_dict):
    """
    Convert a dict into an XML string.
    :param params_dict: source dict
    :return: xml_str
    :rtype: str
    """
    soup = BeautifulSoup(features="xml")
    xml = soup.new_tag("xml")
    for k, v in params_dict.items():
        tag = soup.new_tag(k)
        if isinstance(v, int):
            tag.append(soup.new_string(str(v)))
        elif isinstance(v, str):
            tag.append(CData(v))
        else:
            # Nested dict: emit one level of child tags
            for k1, v1 in v.items():
                tag1 = soup.new_tag(k1)
                if isinstance(v1, int):
                    tag1.append(soup.new_string(str(v1)))
                elif isinstance(v1, str):
                    tag1.append(CData(v1))
                tag.append(tag1)  # append inside the loop so every child survives
        xml.append(tag)
    return str(xml)
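
A quick usage sketch for the helper above; the keys are made up for illustration (nested dicts become one level of child tags):

params = {
    'appid': 'wx1234567890',
    'total_fee': 100,
    'scene_info': {'id': 'STORE_001'},
}
print(get_xml_from_dict(params))
# -> <xml><appid><![CDATA[wx1234567890]]></appid><total_fee>100</total_fee>
#    <scene_info><id><![CDATA[STORE_001]]></id></scene_info></xml>   (output shown wrapped)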
Example #2
    def add_alias(self, name, type_, url, updatefreq=1, address=None, descr='', detail=''):
        """Add an alias table."""
        if address is None:
            address = url

        alias = self.doc.new_tag('alias')
        self.add_child_nfo(alias, 'name', str(name))
        self.add_child_nfo(alias, 'type', str(type_))
        self.add_child_nfo(alias, 'url', str(url))
        self.add_child_nfo(alias, 'updatefreq', str(updatefreq))
        self.add_child_nfo(alias, 'address', str(address))
        self.add_child_nfo(alias, 'descr', CData(str(descr)))
        self.add_child_nfo(alias, 'detail', CData(str(detail)))
        self.doc.aliases.append(alias)
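
The method leans on an add_child_nfo helper that isn't shown. A minimal sketch of what it plausibly does, reconstructed from the call sites above (hypothetical, not the original implementation):

    def add_child_nfo(self, parent, name, value):
        # Create <name>, give it the supplied string/CData payload, attach to parent
        child = self.doc.new_tag(name)
        child.append(value)
        parent.append(child)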
Example #3
    def replaceCodeBlocks(self):
        '''This is what the result should look like:
        <ac:structured-macro ac:name="code" ac:schema-version="1" ac:macro-id="168f7514-4b7f-4202-9832-76ca4d2f9650">
            <ac:plain-text-body>
                <![CDATA[first line
second line
third line]]>
            </ac:plain-text-body>
        </ac:structured-macro>

        Code blocks initially arrive as <pre><code>code...</code></pre>.
        '''
        for codeblock in self.soup.find_all('pre'):
            children = codeblock.findChildren()
            if len(children) == 1:
                firstChild = children[0]
                if firstChild.name == 'code':
                    codeText = firstChild.text
                    structuredMacroElement = self.soup.new_tag(
                        'ac:structured-macro', **{
                            'ac:name': 'code',
                            'ac:schema-version': 1
                        })
                    plainTextBodyElement = self.soup.new_tag(
                        'ac:plain-text-body')
                    cdata = CData(codeText)
                    plainTextBodyElement.append(cdata)
                    structuredMacroElement.append(plainTextBodyElement)
                    codeblock.replace_with(structuredMacroElement)
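
A standalone sketch of the same transformation outside the class, assuming an html.parser tree for the input fragment:

from bs4 import BeautifulSoup, CData

soup = BeautifulSoup('<pre><code>first line\nsecond line</code></pre>', 'html.parser')
pre = soup.find('pre')
macro = soup.new_tag('ac:structured-macro', **{'ac:name': 'code', 'ac:schema-version': 1})
body = soup.new_tag('ac:plain-text-body')
body.append(CData(pre.code.text))
macro.append(body)
pre.replace_with(macro)
print(soup)
# roughly: <ac:structured-macro ac:name="code" ac:schema-version="1"><ac:plain-text-body><![CDATA[first line
# second line]]></ac:plain-text-body></ac:structured-macro>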
Example #4
def try_cdata():
    markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
    soup = BeautifulSoup(markup, 'html.parser')
    comment = soup.b.string
    cdata = CData("A CDATA block")
    comment.replace_with(cdata)
    print('*' * 50, 'Try_Cdata', '*' * 50)
    print(type(comment))
    print(comment)
    print(soup.prettify())
Example #5
def fix_xml(pred_xml, gold_xml):
    print(pred_xml, gold_xml)
    with open(gold_xml, 'r') as f:
        gold_soup = BeautifulSoup(f.read(), features='xml')
    gold_text = gold_soup.find('TEXT').string

    print(gold_text.count('\n'))

    with open(pred_xml, 'r') as f:
        pred_soup = BeautifulSoup(f.read(), features='xml')
    pred_soup.find('TEXT').string = CData(gold_text)
    with open(pred_xml, 'w') as f:
        f.write(str(pred_soup))
Example #6
def transform_item_in_feed(item):
    """Transform an <item>"""

    link = item.link.text
    print('Processing {}'.format(link))

    # Ignore empty articles
    if item.description is None or len(item.description.contents) == 0:
        print('Empty article body, ignoring...')
        item.decompose()
        return

    # Ignore articles without title
    if item.title is None or len(item.title) == 0:
        print('Article without title, ignoring...')
        item.decompose()
        return

    # Parse the article content as HTML
    article = read_html_from_string(item.description.contents[0])

    # The creator in the RSS is a username, so try first to parse from the HTML.
    html_authors = _parse_article_authors(article)

    if html_authors is not None:
        item.creator.string = html_authors
        # Remove authors from article text itself
        article.find('div', class_='field-name-field-auteurs').decompose()

    # Get the category
    category_tag = Tag(name='category')
    category_node = article.select_one('div.field-name-field-rubriek a')

    if category_node is not None:
        category_tag.string = category_node.text.strip()
        category_tag['domain'] = category_node['href']
        # Remove category from the article body
        article.find('div', class_='field-name-field-rubriek').decompose()

    item.append(category_tag)

    # Remove edition from article body if present
    edition_node = article.find('div', class_='field-name-field-editie')
    if edition_node is not None:
        edition_node.decompose()

    encoded = article.find('body').decode_contents(formatter='html')
    item.description.contents = [
        CData(htmlmin.minify(encoded, remove_optional_attribute_quotes=False))
    ]
Example #7
def prediction_to_xml(X, preds, text, sents, ind2label_lookup) -> str:
    preds = postprocess_prediction(X, preds, sents, ind2label_lookup)

    soup = BeautifulSoup('<deIdi2b2><TEXT></TEXT><TAGS></TAGS></deIdi2b2>', features='xml')
    soup.find('TEXT').string = CData(text)
    tags = soup.find('TAGS')
    for i, tagged_tokens in enumerate(itertools.chain.from_iterable(preds)):
        tags.append(soup.new_tag(TOKEN_TYPE[tagged_tokens.type],
                                 id=f'P{i}',
                                 start=tagged_tokens.start,
                                 end=tagged_tokens.end,
                                 TYPE=tagged_tokens.type,
                                 text=text[tagged_tokens.start:tagged_tokens.end]))

    return str(soup)
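
The CData wrapper is load-bearing here: the start/end offsets index into the raw text, so the TEXT node presumably has to round-trip without entity escaping. A minimal sketch of the skeleton the function builds:

from bs4 import BeautifulSoup, CData

soup = BeautifulSoup('<deIdi2b2><TEXT></TEXT><TAGS></TAGS></deIdi2b2>', features='xml')
soup.find('TEXT').string = CData('Dr. Smith & the patient.')
print(str(soup))
# ... <TEXT><![CDATA[Dr. Smith & the patient.]]></TEXT><TAGS/> ...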
Example #8
def create_note(note_data, soup):
    """Create an ENEX note element"""

    note = soup.new_tag('note')

    title = soup.new_tag('title')
    title.string = note_data.title
    note.append(title)

    content_inside = BeautifulSoup(features="xml")
    content_inside.append(Doctype('en-export SYSTEM "http://xml.evernote.com/pub/evernote-export3.dtd"'))

    content_inside_note = soup.new_tag('en-note')
    content_inside_note.string = note_data.content
    content_inside.append(content_inside_note)

    # Holy crap this is super hacky and horrible but I don't want to fight with
    # BeautifulSoup to make it not convert all the text to HTML entities, so
    # manually convert everything to < and >
    content_inside_str = str(content_inside).replace('&lt;', '<').replace('&gt;', '>')

    content = soup.new_tag('content')
    content.string = CData(content_inside_str)
    note.append(content)

    created = soup.new_tag('created')
    created.string = str(note_data.created)
    note.append(created)

    updated = soup.new_tag('updated')
    updated.string = str(note_data.updated)
    note.append(updated)

    for single_tag in note_data.tags:
        if single_tag is not None:
            tag = soup.new_tag('tag')
            tag.string = single_tag
            note.append(tag)

    attributes = soup.new_tag('note-attributes')
    author = soup.new_tag('author')
    author.string = "Andrew Heiss"

    attributes.append(author)
    note.append(attributes)

    return note
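
A hedged usage sketch; NoteData is a stand-in for whatever record type the caller passes as note_data, and the snippet's own imports (CData, Doctype) are assumed to be in scope:

from collections import namedtuple
from bs4 import BeautifulSoup

NoteData = namedtuple('NoteData', 'title content created updated tags')

soup = BeautifulSoup(features='xml')
note = create_note(
    NoteData(title='Test note', content='<div>Hello</div>',
             created='20240101T000000Z', updated='20240101T000000Z',
             tags=['example']),
    soup)
print(note)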
Example #9
def transform_item_in_feed(item):
    link = item.link.text
    print('Processing {}'.format(link))

    article = read_html_from_url(link)

    # Remove and ignore articles without title
    title_node = item.title
    if title_node is None or len(title_node.text) == 0:
        item.decompose()
        return

    author_node = item.creator
    author_node.string = _parse_article_authors(article)

    parsed_body = _extract_article_body(article)
    encoded = parsed_body.decode_contents(formatter='html')
    minified = htmlmin.minify(encoded, remove_optional_attribute_quotes=False)
    item.description.contents = [CData(minified)]
Example #10
def tagMake(fileInfo):
    # sys.stdout.write("\r Completed Percent > [ {0}% ] {1:0.2f}".format(str(int(fileIdx / totalFileNum*100+1)), (time.time()-startTime)/60 ))
    # sys.stdout.flush()
    # ['APC', '00', '1000001', '1000001', '1000001', 'C:/dev/1.legalJP/00.Data/JPJ_2000001/DOCUMENT/APC/1000001/1000001/1000001/1000001.SGM']
    global errCnt
    global totalCnt
    global counter
    global strTime
    try:
        if not 'AC' == fileInfo[0] or 'collection' == fileInfo[0]:
            fileNm = '_'.join(fileInfo[0:5])
            dirNm = '_'.join(fileInfo[0:2])
            contents = ''
            reContents = ''
            tifFileList = []
            rmTifFileList = []
            tifDict = {}
            # Data Variables

            # Disposition
            jumun = ''
            # Conclusion
            concluPart = ''
            # Case number / appeal number
            inciNum = ''
            # Case number / appeal number, kept for debugging
            debugInciNum = ''
            # Judgment pronouncement date
            courtDc = ''
            # Court name
            courtNm = ''
            # Title (name of article / invention)
            invenNm = ''
            # Kind
            kindOfItem = ''
            # Official gazette assortment
            offGaze = ''

            finalDp = ''
            divMain = ''
            pbDate = ''
            publicDate = ''
            sysDate = ''
            dateCt = ''

            with counter.get_lock():
                counter.value += 1

            with open(fileInfo[5], 'r', encoding='EUC-JP') as rf:
                for cont in rf:
                    contents += cont

            parser = MyHTMLParser()
            parser.feed(contents)

            # IMG Data
            if fileInfo[6]:
                if os.path.exists(fileInfo[6]):
                    tifFileList = imgParser(fileInfo[6], fileInfo[4])
                    rmTifFileList = list(set(tifFileList))
                    for tif in rmTifFileList:
                        imgNum = re.search(r'_\d+', tif)
                        imNum = int(imgNum.group().replace('_', '')) + 1
                        tifDict[str(imNum).zfill(6)] = tif

            # IMG Data END

            for fTag in parser.fullTagList:
                if '\n]>' == fTag or ']>' == fTag:
                    continue
                else:
                    reContents += fTag
            soup = BeautifulSoup(reContents, 'html.parser')

            if 'CD' == fileInfo[0]:
                # Court decision
                divMain = 'CD'

                # Case number
                inciNumTxt = soup.find('litigation-number')
                if not inciNumTxt is None:
                    inciNum = cdFullToHalf(inciNumTxt.text.replace('\n', ''))
                    debugInciNum = inciNumTxt.text.replace('\n', '')

                # Judgment pronouncement date
                courtDcTxt = soup.find('court-decision-giving-date')

                if not courtDcTxt is None:
                    courtDc = fullToHalf(courtDcTxt.text.replace('\n', ''))

                # Court name
                courtNmTxt = soup.find('belonging')

                if not courtNmTxt is None:
                    courtNm = fullToHalf(courtNmTxt.text.replace('\n', ''))

            else:
                # Appeal decision
                divMain = 'AJ'

                # Appeal number
                inciNumTxt = soup.find('appeal-number')
                if not inciNumTxt is None:
                    inciNum = fullToHalf(inciNumTxt.text.replace('\n', ''))
                    debugInciNum = inciNumTxt.text.replace('\n', '')

                # Judgment pronouncement date
                courtDcTxt = soup.find('appeal-decision-date')
                if not courtDcTxt is None:
                    courtDc = fullToHalf(courtDcTxt.text.replace('\n', ''))

                # Court name
                courtNmTxt = soup.find('publication-country')
                if not courtNmTxt is None:
                    courtNm = fullToHalf(courtNmTxt.text.replace('\n', ''))
                # Official gazette assortment
                offiGazeTxt = soup.find('official-gazette-assortment')
                if not offiGazeTxt is None:
                    offGaze = offiGazeTxt.text.replace('\n', '')

                finalDispTxt = soup.find('final-disposition')
                if not finalDispTxt is None:
                    finalDp = finalDispTxt.text.replace('\n', '')

            for singleTag in soup.findAll():
                tagNm = singleTag.name

                if 'paragraph' == tagNm:
                    singleTag.string = CData(singleTag.text)

                if 'image' == tagNm:
                    if singleTag['file-id']:
                        singleTag['src'] = tifDict[singleTag['file-id']]
                        singleTag.name = 'img'
                        attrsList = []
                        for attr in singleTag.attrs:
                            if not 'src' == attr:
                                attrsList.append(attr)
                        for at in attrsList:
                            del singleTag[at]
                    else:
                        raise Exception(
                            'Image error ] FILE_ID > {0}'.format(fileNm))
                if 'sub-script' == tagNm or 'sup-script' == tagNm:
                    singleTag.decompose()

                if 'kind' == tagNm:
                    # Kind
                    kindOfItem = singleTag.text.replace('\n', '')

                if 'name-of-article' == tagNm or 'title-of-the-invention' == tagNm:
                    invenNm = singleTag.text.replace('\n', '')

                if 'main-part' == tagNm:
                    jumun = singleTag.text

                if 'conclusion-part' == tagNm:
                    concluPart = singleTag.text

            # build the tag string under a new name so divMain stays comparable below
            divMainTag = '<DIV_MAIN>' + divMain + '</DIV_MAIN>' + '\n'
            divSub = '<DIV_SUB>' + fileInfo[0] + '</DIV_SUB>' + '\n'
            caseNum = '<CASENUMBER>' + inciNum + '</CASENUMBER>' + '\n'
            debugCaseNum = '<DEBUG_CASENUMBER>' + debugInciNum + '</DEBUG_CASENUMBER>' + '\n'
            caseNm = '<CASENAME>' + invenNm + '</CASENAME>' + '\n'
            senten = '<SENTENCE>' + jumun + '</SENTENCE>' + '\n'
            conClu = '<CONCLUSION>' + concluPart + '</CONCLUSION>' + '\n'
            fileIdTag = '<ID>' + fileNm + '</ID>' + '\n'
            kindTag = '<KIND>' + kindOfItem + '</KIND>'
            xmlPath = '<XML_PATH>' + '/data/prec_pdf/xml/' + dirNm + '/' + fileNm + '.xml' + '</XML_PATH>' + '\n'
            offGazeTag = '<OFFICIAL_GAZETTE>' + offGaze + '</OFFICIAL_GAZETTE>'
            finalDpTag = '<FINAL_DISPOSIT>' + finalDp + '</FINAL_DISPOSIT>'
            courtName = '<COURTNAME>' + courtNm + '</COURTNAME>' + '\n'

            if courtDc:
                dateCt = dateParsing(courtDc)
                if dateCt:
                    if dateCt[0]:
                        sysDate = '<SYSDATE>' + dateCt[0] + '</SYSDATE>' + '\n'
                    if dateCt[1]:
                        pbDate = dateCt[1].group()
            else:
                sysDate = '<SYSDATE></SYSDATE>' + '\n'

            if 'AJ' == divMain:
                publicDate = '<SENTENCEDATE></SENTENCEDATE>' + '\n' + '<CONCLUSIONDATE>' + pbDate + '</CONCLUSIONDATE>' + '\n'
            else:
                publicDate = '<SENTENCEDATE>' + pbDate + '</SENTENCEDATE>' + '\n' + '<CONCLUSIONDATE></CONCLUSIONDATE>' + '\n'

            allCont = str(soup.prettify(formatter="none")).replace(
                '&minus;', '-')

            templeteXML = ('<?xml version="1.0" encoding="UTF-8"?>\n'
                           '<?xml-stylesheet type="text/xsl" href="/view/prec_jp.xsl"?>\n'
                           '<root xmlns:dc="http://purl.org/dc/elements/1.1/" '
                           'xmlns:dcterms="http://purl.org/dc/terms/" '
                           'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
                           'xsi:noNamespaceSchemaLocation="bx_Dublin.xsd" >\n'
                           + divMainTag + divSub + caseNum + kindTag + offGazeTag
                           + finalDpTag + debugCaseNum + caseNm + senten + publicDate
                           + conClu + fileIdTag + courtName + sysDate + xmlPath
                           + allCont + '</root>')

            with open('C:/dev/4.pCourt/00.Data/' + dirNm + '/' + fileNm +
                      '.xml',
                      'w',
                      encoding='UTF-8') as pf:

                pf.write(templeteXML)
                clickerVal = counter.value % 2
                clicker = ''
                if clickerVal == 0:
                    clicker = '○'
                else:
                    clicker = '●'

                # sys.stdout.write("\r %s Completed Percent > [ %s % ] {2:0.2f} %02d:%02d:%02d" % (clicker,str(int(counter.value / totalCnt*100+1)),h, m, s))
                sys.stdout.write(
                    "\r {1} Completed Percent > [ {0}% ] {2:0.2f}".format(
                        str(int(counter.value / totalCnt * 100 + 1)), clicker,
                        ((time.time() - strTime) / 60)))
                sys.stdout.flush()

    except Exception as e:
        with errCnt.get_lock():
            errCnt.value += 1
        # sys.stdout.write("\r Err Count > [ {0} ] ".format(str(int(errCnt.value))))
        # sys.stdout.flush()

        with open('C:/dev/4.pCourt/err/judge/err.txt', 'a',
                  encoding='UTF-8') as ef:
            # rebuild the file name here: fileNm may be unbound if the failure happened early
            errLog = '_'.join(fileInfo[0:5]) + '>>' + '_'.join(
                fileInfo[0:2]) + '>>' + str(e) + '\n'
            ef.write(errLog)
Example #11
from bs4 import CData, BeautifulSoup

tag = CData('<p>Hello!</p>')
soup = BeautifulSoup('<article><h1>Hi!</h1><gohere /></article>', 'lxml')

soup.find('gohere').replace_with(tag)

res = soup.prettify(formatter='minimal')
print(res)

print('doesn\'t work')
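
CDATA sections are an XML construct, which is presumably why the HTML route above disappoints. A sketch of the same replacement on an XML tree, where the CDATA section serializes as expected:

from bs4 import BeautifulSoup, CData

soup = BeautifulSoup('<article><h1>Hi!</h1><gohere/></article>', 'xml')
soup.find('gohere').replace_with(CData('<p>Hello!</p>'))
print(soup.prettify())
# the placeholder is now rendered as <![CDATA[<p>Hello!</p>]]>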
Example #12
def blockTag(block, soup):
    block_tag = soup.new_tag("block")
    if block[0] != '':
        block_tag['dials'] = block[0]
    block_tag.append(CData(block[1].strip()))
    return block_tag
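
A usage sketch; block is assumed to be a (dials, text) pair, matching how it is indexed above:

from bs4 import BeautifulSoup

soup = BeautifulSoup(features='xml')
print(blockTag(('1 3 5', '  Press one, three or five.  '), soup))
# <block dials="1 3 5"><![CDATA[Press one, three or five.]]></block>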
Example #13
from bs4 import BeautifulSoup, CData

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
cdata = CData("A CDATA block")
# Despite the variable name in the docs example this follows, soup.b.string
# here is ordinary text, not a Comment
b_string = soup.b.string
b_string.replace_with(cdata)

print(soup.b.prettify())
Example #14
del tag['nothing']  # deleting a nonexistent attribute is a no-op in bs4
del tag['id']
print(tag)
# print(tag['class'])  # would raise KeyError
print(tag.get('class'))

print(tag.string, type(tag.string))
# unicode_string = unicode(tag.string)  # Python 2 leftover
# print(unicode_string, type(unicode_string))
tag.string.replace_with("No longer bold")
print(tag)

id_soup = BeautifulSoup('<p id="my id"></p>', 'html.parser')
print(id_soup.p['id'])  # not a multi-valued attribute
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>',
                         'html.parser')
print(rel_soup.a['rel'])
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)
print(rel_soup.a['rel'])

markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, "html.parser")
comment = soup.b.string
print(type(comment), comment)
print(soup.b.prettify())

from bs4 import CData
cdata = CData("*A cdata block*")
comment.replace_with(cdata)
print(soup.b.prettify())
Example #15
def _sync_content(cloud_site_url, cloud_site_username, cloud_site_api_token,
                  cloud_site_page_id, server_site_url, server_site_username,
                  server_site_password, server_site_space_key,
                  server_site_ancestor_id):
    # get cloud page
    resp = cloudsite.get_content(cloud_site_url, cloud_site_username,
                                 cloud_site_api_token, cloud_site_page_id)

    # process storage value
    storage_value = resp['body']['storage']['value']

    # parse storage value
    soup = BeautifulSoup(storage_value, features="html.parser")
    for macro in soup.find_all('ac:structured-macro'):
        # only process plantumlcloud
        if macro.get('ac:name') != 'plantumlcloud':
            continue

        # get data
        cloud_data = ''
        for macro_parameter in macro.find_all('ac:parameter'):
            if macro_parameter.get('ac:name') != 'data':
                continue
            cloud_data = macro_parameter.contents
            if cloud_data:
                cloud_data = cloud_data[0]
        if not cloud_data:
            continue

        # decompress data
        converted_cloud_data = cloudsite.decompress_plantuml_data(cloud_data)

        # modify content
        if macro.get('ac:name') == 'plantumlcloud':
            macro['ac:name'] = 'plantuml'
            new_text_body_tag = soup.new_tag("ac:plain-text-body")
            new_text_body_tag.append(CData(converted_cloud_data))
            macro.append(new_text_body_tag)
    # NB: encode() returns bytes; if add_content JSON-serializes the payload,
    # str(soup) may be needed here instead
    converted_storage_value = soup.encode()

    # convert data
    data_to_create = {
        "title": resp['title'],
        "type": resp['type'],
        "space": {
            "key": server_site_space_key
        },
        "ancestors": [{
            "id": server_site_ancestor_id
        }],
        "body": {
            "storage": {
                "value": converted_storage_value,
                "representation": resp['body']['storage']['representation']
            }
        }
    }

    # add content
    resp = serversite.add_content(server_site_url, server_site_username,
                                  server_site_password, data_to_create)
    print(resp)
    return resp['id']
Example #16
tool_conf_file.close()

license_clauses = json.load(open("clauses.json"))
for tool in tools:
    print(tool)
    tool_group, tool_name = re.sub(r"\.xml$", "", tool).split("/")
    print(tool_group, tool_name)
    if tool_group in license_clauses:
        tool_xml_file = open("../tools/{}".format(tool))
        # parser made explicit here; the original relied on the default
        tool_soup = BeautifulSoup(tool_xml_file, "xml")
        help_tag = tool_soup.find("help")
        if help_tag is not None:

            old_help = ' '.join([str(e) for e in help_tag.contents])
            if "License\n" in old_help:
                new_help = CData(old_help)
            else:
                new_help = CData(old_help + "\nLicense\n-------\n\n" +
                                 license_clauses[tool_group])
        else:
            help_tag = tool_soup.new_tag("help")
            tool_soup.tool.append(help_tag)
            new_help = CData("\nLicense\n-------\n\n" +
                             license_clauses[tool_group])
        help_tag.string = ""
        help_tag.string.replace_with(new_help)
        print(tool_soup.tool)
        tool_xml_file.close()

        # with open("../tools/{}".format(tool), "w") as out_file:
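
For context, clauses.json is evidently keyed by tool group (the first path component). A hypothetical example of its shape, illustrative only:

# clauses.json might look like:
license_clauses = {
    "filters": "This tool is released under the MIT license.",
    "mappers": "This tool is released under the GPLv2 license.",
}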
Example #17
def process_page(title, files):
    page_set = conf_get("content",
        spaceKey=space_key,
        title=title,
        expand="body.storage,version")

    page = page_set['results'][0]
    body = page['body']['storage']['value']


    # Really should use an XML parser here because this default HTML one inserts bollocks HTML
    # and BODY tags. I don't though because the Confluence storage format doesn't explicitly
    # declare its entities so XML parsers choke on e.g. &lsquo; . In the future it might be worth
    # inserting some DOCTYPE metadata that resolves the entities so the XML parser works, but for
    # now all this means is that I need to strip the stupid extra tags below
    soup = BeautifulSoup(body, "lxml")

    headings = soup.find_all(_matched_heading)

    for heading in headings:
        file_hdr = heading.string
        code_macro = [sibling for sibling in heading.next_siblings if _tag_is_code_macro(sibling) ][0]
        code_el = code_macro.find('ac:plain-text-body')
        
        # If the code macro is empty, it won't even have the plain-text-body to insert in to.
        if code_el is None:
            code_el = soup.new_tag('ac:plain-text-body')
            code_macro.append(code_el)

        if file_hdr not in files:
            print(f"  Unmatched header {file_hdr}")
            continue

        try:
            fname = files[file_hdr]
            print(f"  Heading \"{file_hdr}\" matched to file {fname}")

            with open(fname) as f:
                data = CData(f.read())
                code_el.append(data)
        except Exception as e:
            print(f"  Can't find file '{files[file_hdr]}' for heading '{file_hdr}' ({e})")


    # Clean up the page structure, keeping only the keys we want and incrementing the version number
    page_id = page['id']
    required_keys = ['id', 'title', 'type', 'status', 'body']
    new_page = { key: page[key] for key in required_keys }

    # Intentionally swap out the dict here not just increment in-place as the original version dict
    # carries a bunch of cruft
    new_page['version'] = {'number': page['version']['number'] + 1}

    # decode_contents() serializes only the element's children, so no manual slicing is needed
    new_page['body']['storage']['value'] = soup.body.decode_contents()

    # Upload!
    result = conf_put(f"content/{page_id}", data=json.dumps(new_page))
    if 'success' in result and not result['success']:
        raise Exception(str(result))

    print(f"  Success. New version is {result['version']['number']}")
Example #18
def convert_to_dbz(soup):
        #soup is Beautifulsoup
        MLSNumber = soup.find('mlsnumber')
        if MLSNumber is not None:
                #start by creating a parent <property> tag
                dbz_soup = BeautifulSoup('<property></property>', 'html.parser')  # parser pinned; original left it implicit

                property_tag = dbz_soup.property
                #calculate the ref_no
                codes = MLSNumber.text.split('-')
                if codes[0] in TYPE_RENT:
                        type_tag = dbz_soup.new_tag('type')
                        type_tag.append('RP')
                elif codes[0] in TYPE_SALE:
                        type_tag = dbz_soup.new_tag('type')
                        type_tag.append('SP')
                else:
                        #log and email
                        print "incompatible mlsnumber"
                        return None
                ### adding type tag ### 
                property_tag.append(type_tag)
                if codes[1] in APARTMENT:
                        subtype_tag = dbz_soup.new_tag('subtype')
                        subtype_tag.append('AP')
                elif codes[1] in VILLA:
                        subtype_tag = dbz_soup.new_tag('subtype')
                        subtype_tag.append('VI')
                elif codes[1] in SUBTYPE_COMMERCIAL:
                        if codes[2] not in COMMERCIAL_CODES or not codes[2]:
                                print "commercial codes"
                                return None
                        else:
                                subtype_tag = dbz_soup.new_tag('subtype')
                                subtype_tag.append('CO')
                                commercial_tag = dbz_soup.new_tag('commercialtype')
                                commercial_tag.append(codes[2].upper())
                                property_tag.append(commercial_tag)
                elif codes[1] in MULTIPLE_UNITS and codes[0] in TYPE_SALE:
                        subtype_tag = dbz_soup.new_tag('subtype')  # was subtype_tag.new_tag(...), which would raise NameError
                        subtype_tag.append('BU')
                elif codes[1] in LAND_FOR_SALE and codes[0] in TYPE_SALE:
                        subtype_tag = dbz_soup.new_tag('subtype')
                        subtype_tag.append('LA')
                else:
                        print('subtype')
                        return None
                ### adding type tag ### 
                property_tag.append(subtype_tag)
        else:
                print('MLSNumber is empty')
                return None
        ## status tag ##
        status = soup.find('listingstatus').text
        status_tag = dbz_soup.new_tag('status')
        if status == 'Active':
                status_tag.append('vacant')
        else:
                status_tag.append('deleted')
        property_tag.append(status_tag)
        ## ref no tag ##
        ref_no_tag = dbz_soup.new_tag('refno')
        ref_no_tag.append(MLSNumber.text)
        property_tag.append(ref_no_tag)
        ## title tag
        title_tag = dbz_soup.new_tag('title')
        title = soup.find('streetname').text
        if title:
                title_tag.append(title)
                property_tag.append(title_tag)


        ## CDATA description tag
        description_tag = dbz_soup.new_tag('description')
        description = soup.find('publicremark')
        if description:
                description = CData(description.text)
                description_tag.append(description)
                property_tag.append(description_tag)
        else: return None # description is required field

        ## city tag ##
        city_tag = dbz_soup.new_tag('city')
        city = soup.find('city')
        if city:
                city = city.text
                city = city.lower()
                city_code = CITY_CODES[city]
                city_tag.append(str(city_code))
                property_tag.append(city_tag)

        ## size ##
        size_tag = dbz_soup.new_tag('size')
        size = soup.find('squarefeet')
        if size:
                size_value = size.text
                size_tag.append(size_value)
                property_tag.append(size_tag)
        else: return None

        ## price ##
        price_tag = dbz_soup.new_tag('price')
        price = soup.find('listprice')
        if price:
                price_tag.append(price.text)
                property_tag.append(price_tag)

        ## location ##
        location_text_tag = dbz_soup.new_tag('locationtext')
        location_text = soup.find('listingarea')
        if location_text:
                location_text_tag.append(location_text.text)
                property_tag.append(location_text_tag)

        ## building ##
        building_tag = dbz_soup.new_tag('building')
        building = soup.find('buildingfloor')
        if building:
                building_tag.append(building.text)
                property_tag.append(building_tag)

        ## lastupdate ##
        lastupdated_tag = dbz_soup.new_tag('lastupdated')
        lastupdated = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        lastupdated_tag.append(lastupdated)
        property_tag.append(lastupdated_tag)

        ## contactemail ##
        email = soup.find('email')
        if email:
                email_tag = dbz_soup.new_tag('contactemail')
                email_tag.append(email.text)
                property_tag.append(email_tag)

        ## contactnumber ##
        contactnumber_tag = dbz_soup.new_tag('contactnumber')
        cellphone = soup.find('cellphone')
        if cellphone:
                contactnumber_tag.append(cellphone.text)
                property_tag.append(contactnumber_tag)

        ## images ##

        images = soup.find_all('picture')
        if images:
                print('calling build_images')
                image_urls = build_images(images, refno=MLSNumber.text)
                if image_urls:
                        image_tag = dbz_soup.new_tag('photos')
                        print('creating image tag')
                        # pipe-separate the URLs; join() covers the single- and multi-image cases
                        image_tag.append('|'.join(image_urls))
                        property_tag.append(image_tag)

        
        ## bedrooms ##
        bedrooms = soup.find('bedrooms')
        if codes[1] in VILLA or codes[1] in APARTMENT:  # compare the code, not the Tag object
            if bedrooms is not None and bedrooms.text:
                bedrooms_tag = dbz_soup.new_tag('bedrooms')
                bedrooms_tag.append(bedrooms.text)
                property_tag.append(bedrooms_tag)
            else: return None

        ## bathrooms ##
        bathrooms = soup.find('bathtotal')
        if bathrooms is not None and bathrooms.text:
            bathrooms_tag = dbz_soup.new_tag('bathrooms')
            bathrooms_tag.append(bathrooms.text)
            property_tag.append(bathrooms_tag)

        ## ameneties ##
        amenities = []
        parking = soup.find('parking')
        try:
            parking_contents = parking.contents
            for content in parking_contents:
                if content.text == 'Yes':
                    amenities.append('CP')
                    break
        except:
            pass

        ac = soup.find('cooling')
        if ac is not None and ac.text:
            amenities.append('AC')

        features = soup.find_all('feature')
        if features:
            for feature in features:
                if feature.text in DBZ_AMENITIES:
                    amenities.append(DBZ_AMENITIES[feature.text])

        # build the amenities tag first: new_tag(...).append(...) returns None
        if codes[1] in VILLA or codes[1] in APARTMENT:
            amenities_tag = dbz_soup.new_tag('privateamenities')
            amenities_tag.append('|'.join(amenities))  # separator assumed, as for photos
            property_tag.append(amenities_tag)
        elif codes[1] in SUBTYPE_COMMERCIAL:
            amenities_tag = dbz_soup.new_tag('commercialamenities')
            amenities_tag.append('|'.join(amenities))
            property_tag.append(amenities_tag)

        return dbz_soup
Example #19
print(type(tag.string))  # strings inside a tag are wrapped in NavigableString
tag.string.replace_with('no longer bold')
print(tag)
tag.name = 'r'
print(tag)

# The third object type: BeautifulSoup
print(soup2.name)

# The fourth object type: Comment, a special kind of NavigableString for comment nodes
markup = "<b><!--hey, want to buy a used parser?--></b>"
soup3 = BeautifulSoup(markup, 'lxml')
comment = soup3.b.string
print(comment)
print(soup3.b.prettify())
cdata = CData('A CDATA block')
comment.replace_with(cdata)
print(soup3.b.prettify())

print('----------------------------- Traversing the document tree -----------------------------------')
soup4 = BeautifulSoup(html_doc, 'lxml')
print(soup4.body.p.b)
head_tag = soup4.head
print(head_tag.contents)  # .contents lists a tag's direct children
title_tag = head_tag.contents[0]
print(title_tag)
print(title_tag.contents)

print(len(soup4.contents))
print(soup4.contents[0].name)
Example #20
print(soup.name)

# Comment objects
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, 'html.parser')  # parser pinned; original left it implicit
comment = soup.b.string
print(type(comment))

# The extracted comment text
print(comment)

print(soup.b.prettify())

from bs4 import CData

cdata = CData('A CDATA block')
comment.replace_with(cdata)
print(soup.b.prettify())

soup = BeautifulSoup(html_doc, 'html.parser')  # parser pinned; original left it implicit
## Traversing the document tree: tag names
print(soup.head)
print(soup.name)

print(soup.body.b)

# Dotted attribute access only returns the first tag with that name
print(soup.a)
print(soup.find_all('a'))

# .contents lists a tag's child nodes (direct children only)
Example #21
# coding=utf8
from bs4 import CData
from bs4 import BeautifulSoup

xml = r'''<?xml version="1.0" ?>
<foo>
    <bar><![CDATA[!@#$%^&*()_+{}|:"<>?,./;'[]\-=]]></bar>
</foo>
'''
soup = BeautifulSoup(xml, "xml")
soup.foo.bar.string = CData(soup.foo.bar.string)
pretty = soup.prettify(formatter="xml")
print(pretty)
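
For reference, the prettified output should contain the round-tripped CDATA section, roughly:

# <?xml version="1.0" encoding="utf-8"?>
# <foo>
#  <bar>
#   <![CDATA[!@#$%^&*()_+{}|:"<>?,./;'[]\-=]]>
#  </bar>
# </foo>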