from bs4 import BeautifulSoup, CData


def get_xml_from_dict(params_dict):
    """
    Convert a dict to an XML string.

    :param params_dict: source dict
    :return: xml_str
    :rtype: str
    """
    soup = BeautifulSoup(features="xml")
    xml = soup.new_tag("xml")
    for k, v in params_dict.items():
        tag = soup.new_tag(k)
        if isinstance(v, int):
            tag.append(soup.new_string(str(v)))
        elif isinstance(v, str):
            # Wrap string values in CDATA so they survive XML escaping.
            tag.append(CData(v))
        else:
            # Nested dict: build one child tag per key.
            for k1, v1 in v.items():
                tag1 = soup.new_tag(k1)
                if isinstance(v1, int):
                    tag1.append(soup.new_string(str(v1)))
                elif isinstance(v1, str):
                    tag1.append(CData(v1))
                tag.append(tag1)
        xml.append(tag)
    return str(xml)
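# A quick usage sketch for get_xml_from_dict above; the payload is
# illustrative, not from the original project.
params = {
    'appid': 10001,
    'body': 'T-shirt & mug',
    'detail': {'goods': 'T-shirt', 'qty': 2},
}
print(get_xml_from_dict(params))
# Expected shape (wrapped here for readability):
# <xml><appid>10001</appid><body><![CDATA[T-shirt & mug]]></body>
# <detail><goods><![CDATA[T-shirt]]></goods><qty>2</qty></detail></xml>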
def add_alias(self, name, type_, url, updatefreq=1, address=None, descr='', detail=''):
    """Add an alias table."""
    if address is None:
        address = url
    alias = self.doc.new_tag('alias')
    self.add_child_nfo(alias, 'name', str(name))
    self.add_child_nfo(alias, 'type', str(type_))
    self.add_child_nfo(alias, 'url', str(url))
    self.add_child_nfo(alias, 'updatefreq', str(updatefreq))
    self.add_child_nfo(alias, 'address', str(address))
    # descr and detail may contain markup, so wrap them in CDATA.
    self.add_child_nfo(alias, 'descr', CData(str(descr)))
    self.add_child_nfo(alias, 'detail', CData(str(detail)))
    self.doc.aliases.append(alias)
def replaceCodeBlocks(self):
    '''
    This is what the result should look like:

    <ac:structured-macro ac:name="code" ac:schema-version="1"
                         ac:macro-id="168f7514-4b7f-4202-9832-76ca4d2f9650">
      <ac:plain-text-body>
        <![CDATA[first line
    second line
    third line]]>
      </ac:plain-text-body>
    </ac:structured-macro>

    Code blocks are initially created as <pre><code>code...</code></pre>.
    '''
    for codeblock in self.soup.findAll('pre'):
        children = codeblock.findChildren()
        if len(children) == 1:
            firstChild = children[0]
            if firstChild.name == 'code':
                codeText = firstChild.text
                structuredMacroElement = self.soup.new_tag(
                    'ac:structured-macro',
                    **{'ac:name': 'code', 'ac:schema-version': 1})
                plainTextBodyElement = self.soup.new_tag('ac:plain-text-body')
                cdata = CData(codeText)
                plainTextBodyElement.append(cdata)
                structuredMacroElement.append(plainTextBodyElement)
                codeblock.replaceWith(structuredMacroElement)
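# A standalone sketch of the same <pre><code> -> Confluence code-macro rewrite,
# assuming an html.parser soup; the input snippet is illustrative.
from bs4 import BeautifulSoup, CData

soup = BeautifulSoup('<pre><code>first line\nsecond line</code></pre>', 'html.parser')
pre = soup.find('pre')
macro = soup.new_tag('ac:structured-macro',
                     **{'ac:name': 'code', 'ac:schema-version': 1})
body = soup.new_tag('ac:plain-text-body')
body.append(CData(pre.code.text))
macro.append(body)
pre.replace_with(macro)
print(soup)
# <ac:structured-macro ac:name="code" ac:schema-version="1"><ac:plain-text-body>
# <![CDATA[first line
# second line]]></ac:plain-text-body></ac:structured-macro>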
def try_cdata():
    markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
    soup = BeautifulSoup(markup, 'html.parser')
    comment = soup.b.string
    cdata = CData("A CDATA block")
    comment.replace_with(cdata)
    print('*' * 50, 'Try_Cdata', '*' * 50)
    print(type(comment))
    print(comment)
    print(soup.prettify())
def fix_xml(pred_xml, gold_xml):
    print(pred_xml, gold_xml)
    with open(gold_xml, 'r') as f:
        gold_soup = BeautifulSoup(f.read(), features='xml')
    gold_text = gold_soup.find('TEXT').string
    print(gold_text.count('\n'))
    with open(pred_xml, 'r') as f:
        pred_soup = BeautifulSoup(f.read(), features='xml')
    # Copy the gold TEXT over verbatim, wrapped in CDATA so whitespace survives.
    pred_soup.find('TEXT').string = CData(gold_text)
    with open(pred_xml, 'w') as f:
        f.write(str(pred_soup))
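# A minimal round-trip sketch of the CData assignment used above: serialized
# as <![CDATA[...]]>, characters such as < and & come back unescaped. The
# document here is illustrative.
from bs4 import BeautifulSoup, CData

demo = BeautifulSoup('<root><TEXT></TEXT></root>', features='xml')
demo.find('TEXT').string = CData('BP 120/80 <stable> & improving\nline two')
print(str(demo))
# <?xml version="1.0" encoding="utf-8"?>
# <root><TEXT><![CDATA[BP 120/80 <stable> & improving
# line two]]></TEXT></root>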
def transform_item_in_feed(item):
    """Transform an <item>"""
    link = item.link.text
    print('Processing {}'.format(link))

    # Ignore empty articles
    if item.description is None or len(item.description.contents) == 0:
        print('Empty article body, ignoring...')
        item.decompose()
        return

    # Ignore articles without title
    if item.title is None or len(item.title) == 0:
        print('Article without title, ignoring...')
        item.decompose()
        return

    # Parse the article content as HTML
    article = read_html_from_string(item.description.contents[0])

    # The creator in the RSS is a username, so first try to parse authors from the HTML.
    html_authors = _parse_article_authors(article)
    if html_authors is not None:
        item.creator.string = html_authors
        # Remove the authors from the article text itself
        article.find('div', class_='field-name-field-auteurs').decompose()

    # Get the category
    category_tag = Tag(name='category')
    category_node = article.select_one('div.field-name-field-rubriek a')
    if category_node is not None:
        category_tag.string = category_node.text.strip()
        category_tag['domain'] = category_node['href']
        # Remove the category from the article body
        article.find('div', class_='field-name-field-rubriek').decompose()
    item.append(category_tag)

    # Remove the edition from the article body if present
    edition_node = article.find('div', class_='field-name-field-editie')
    if edition_node is not None:
        edition_node.decompose()

    encoded = article.find('body').decode_contents(formatter='html')
    # CDATA keeps the minified HTML from being entity-escaped in the feed.
    item.description.contents = [
        CData(htmlmin.minify(encoded, remove_optional_attribute_quotes=False))
    ]
def prediction_to_xml(X, preds, text, sents, ind2label_lookup) -> str:
    preds = postprocess_prediction(X, preds, sents, ind2label_lookup)
    soup = BeautifulSoup('<deIdi2b2><TEXT></TEXT><TAGS></TAGS></deIdi2b2>',
                         features='xml')
    # The raw document text goes into TEXT as CDATA so character offsets stay valid.
    soup.find('TEXT').string = CData(text)
    tags = soup.find('TAGS')
    for i, tagged_tokens in enumerate(itertools.chain.from_iterable(preds)):
        tags.append(soup.new_tag(TOKEN_TYPE[tagged_tokens.type],
                                 id=f'P{i}',
                                 start=tagged_tokens.start,
                                 end=tagged_tokens.end,
                                 TYPE=tagged_tokens.type,
                                 text=text[tagged_tokens.start:tagged_tokens.end]))
    return str(soup)
def create_note(note_data, soup):
    """Create an ENEX note element"""
    note = soup.new_tag('note')

    title = soup.new_tag('title')
    title.string = note_data.title
    note.append(title)

    content_inside = BeautifulSoup(features="xml")
    content_inside.append(Doctype('en-export SYSTEM "http://xml.evernote.com/pub/evernote-export3.dtd"'))
    content_inside_note = soup.new_tag('en-note')
    content_inside_note.string = note_data.content
    content_inside.append(content_inside_note)

    # Holy crap this is super hacky and horrible but I don't want to fight with
    # BeautifulSoup to make it not convert all the text to HTML entities, so
    # manually convert everything back to < and >
    content_inside_str = str(content_inside).replace('&lt;', '<').replace('&gt;', '>')

    content = soup.new_tag('content')
    content.string = CData(content_inside_str)
    note.append(content)

    created = soup.new_tag('created')
    created.string = str(note_data.created)
    note.append(created)

    updated = soup.new_tag('updated')
    updated.string = str(note_data.updated)
    note.append(updated)

    for single_tag in note_data.tags:
        if single_tag is not None:
            tag = soup.new_tag('tag')
            tag.string = single_tag
            note.append(tag)

    attributes = soup.new_tag('note-attributes')
    author = soup.new_tag('author')
    author.string = "Andrew Heiss"
    attributes.append(author)
    note.append(attributes)

    return note
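# Why the CDATA wrapper: the ENEX <content> element carries a whole XML
# document as text. A minimal sketch (the note body is illustrative):
from bs4 import BeautifulSoup, CData

enex = BeautifulSoup(features='xml')
content = enex.new_tag('content')
content.string = CData('<?xml version="1.0"?><en-note>Hello <b>world</b></en-note>')
print(content)
# <content><![CDATA[<?xml version="1.0"?><en-note>Hello <b>world</b></en-note>]]></content>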
def transform_item_in_feed(item):
    link = item.link.text
    print('Processing {}'.format(link))
    article = read_html_from_url(link)

    # Remove and ignore articles without title
    title_node = item.title
    if title_node is None or len(title_node.text) == 0:
        item.decompose()
        return

    author_node = item.creator
    author_node.string = _parse_article_authors(article)

    parsed_body = _extract_article_body(article)
    encoded = parsed_body.decode_contents(formatter='html')
    minified = htmlmin.minify(encoded, remove_optional_attribute_quotes=False)
    # CDATA keeps the minified HTML body intact inside the feed item.
    item.description.contents = [CData(minified)]
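# The same CDATA-wrapping pattern in isolation, assuming an xml-parsed feed;
# the sample item and body are illustrative.
from bs4 import BeautifulSoup, CData

feed = BeautifulSoup('<rss><channel><item><description/></item></channel></rss>',
                     features='xml')
feed.find('description').string = CData('<p>Article <b>body</b> as raw HTML</p>')
print(feed.find('item'))
# <item><description><![CDATA[<p>Article <b>body</b> as raw HTML</p>]]></description></item>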
def tagMake(fileInfo):
    # fileInfo example:
    # ['APC', '00', '1000001', '1000001', '1000001',
    #  'C:/dev/1.legalJP/00.Data/JPJ_2000001/DOCUMENT/APC/1000001/1000001/1000001/1000001.SGM']
    global errCnt
    global totalCnt
    global counter
    global strTime
    try:
        # Skip 'AC' and 'collection' files. (The original condition,
        # `not 'AC' == fileInfo[0] or 'collection' == fileInfo[0]`, only skipped 'AC'.)
        if fileInfo[0] not in ('AC', 'collection'):
            fileNm = '_'.join(fileInfo[0:5])
            dirNm = '_'.join(fileInfo[0:2])
            contents = ''
            reContents = ''
            tifFileList = []
            rmTifFileList = []
            tifDict = {}

            # Data variables
            jumun = ''         # ruling (main text)
            concluPart = ''    # conclusion
            inciNum = ''       # case number / appeal number
            debugInciNum = ''  # case number / appeal number, unnormalized (for debugging)
            courtDc = ''       # decision date
            courtNm = ''       # court name
            invenNm = ''       # title
            kindOfItem = ''    # kind
            offGaze = ''       # official gazette type
            finalDp = ''
            divMain = ''
            pbDate = ''
            publicDate = ''
            sysDate = ''
            dateCt = ''

            with counter.get_lock():
                counter.value += 1

            with open(fileInfo[5], 'r', encoding='EUC-JP') as rf:
                for cont in rf:
                    contents += cont

            parser = MyHTMLParser()
            parser.feed(contents)

            # IMG data
            if fileInfo[6]:
                if os.path.exists(fileInfo[6]):
                    tifFileList = imgParser(fileInfo[6], fileInfo[4])
                    rmTifFileList = list(set(tifFileList))
                    for tif in rmTifFileList:
                        imgNum = re.search(r'_\d+', tif)
                        imNum = int(imgNum.group().replace('_', '')) + 1
                        tifDict[str(imNum).zfill(6)] = tif
            # IMG data END

            for fTag in parser.fullTagList:
                if '\n]>' == fTag or ']>' == fTag:
                    continue
                else:
                    reContents += fTag

            soup = BeautifulSoup(reContents, 'html.parser')
            if 'CD' == fileInfo[0]:
                # court decision
                divMain = 'CD'
                # case number
                inciNumTxt = soup.find('litigation-number')
                if inciNumTxt is not None:
                    inciNum = cdFullToHalf(inciNumTxt.text.replace('\n', ''))
                    debugInciNum = inciNumTxt.text.replace('\n', '')
                # decision date
                courtDcTxt = soup.find('court-decision-giving-date')
                if courtDcTxt is not None:
                    courtDc = fullToHalf(courtDcTxt.text.replace('\n', ''))
                # court name
                courtNmTxt = soup.find('belonging')
                if courtNmTxt is not None:
                    courtNm = fullToHalf(courtNmTxt.text.replace('\n', ''))
            else:
                # appeal decision
                divMain = 'AJ'
                # appeal number
                inciNumTxt = soup.find('appeal-number')
                if inciNumTxt is not None:
                    inciNum = fullToHalf(inciNumTxt.text.replace('\n', ''))
                    debugInciNum = inciNumTxt.text.replace('\n', '')
                # decision date
                courtDcTxt = soup.find('appeal-decision-date')
                if courtDcTxt is not None:
                    courtDc = fullToHalf(courtDcTxt.text.replace('\n', ''))
                # court name
                courtNmTxt = soup.find('publication-country')
                if courtNmTxt is not None:
                    courtNm = fullToHalf(courtNmTxt.text.replace('\n', ''))

            # official gazette type
            offiGazeTxt = soup.find('official-gazette-assortment')
            if offiGazeTxt is not None:
                offGaze = offiGazeTxt.text.replace('\n', '')
            finalDispTxt = soup.find('final-disposition')
            if finalDispTxt is not None:
                finalDp = finalDispTxt.text.replace('\n', '')

            for singleTag in soup.findAll():
                tagNm = singleTag.name
                if 'paragraph' == tagNm:
                    # Paragraph text may contain markup, so wrap it in CDATA.
                    singleTag.string = CData(singleTag.text)
                if 'image' == tagNm:
                    if singleTag['file-id']:
                        singleTag['src'] = tifDict[singleTag['file-id']]
                        singleTag.name = 'img'
                        attrsList = []
                        for attr in singleTag.attrs:
                            if not 'src' == attr:
                                attrsList.append(attr)
                        for at in attrsList:
                            del singleTag[at]
                    else:
                        raise Exception('Image error ] FILE_ID > {0}'.format(fileNm))
                if 'sub-script' == tagNm or 'sup-script' == tagNm:
                    singleTag.decompose()
                if 'kind' == tagNm:
                    kindOfItem = singleTag.text.replace('\n', '')
                if 'name-of-article' == tagNm or 'title-of-the-invention' == tagNm:
                    invenNm = singleTag.text.replace('\n', '')
                if 'main-part' == tagNm:
                    jumun = singleTag.text
                if 'conclusion-part' == tagNm:
                    concluPart = singleTag.text

            # Keep divMain ('CD'/'AJ') for the comparison below; the original
            # overwrote it with the tag string, so `'AJ' == divMain` never matched.
            divMainTag = '<DIV_MAIN>' + divMain + '</DIV_MAIN>' + '\n'
            divSub = '<DIV_SUB>' + fileInfo[0] + '</DIV_SUB>' + '\n'
            caseNum = '<CASENUMBER>' + inciNum + '</CASENUMBER>' + '\n'
            debugCaseNum = '<DEBUG_CASENUMBER>' + debugInciNum + '</DEBUG_CASENUMBER>' + '\n'
            caseNm = '<CASENAME>' + invenNm + '</CASENAME>' + '\n'
            senten = '<SENTENCE>' + jumun + '</SENTENCE>' + '\n'
            conClu = '<CONCLUSION>' + concluPart + '</CONCLUSION>' + '\n'
            fileIdTag = '<ID>' + fileNm + '</ID>' + '\n'
            kindTag = '<KIND>' + kindOfItem + '</KIND>'
            xmlPath = '<XML_PATH>' + '/data/prec_pdf/xml/' + dirNm + '/' + fileNm + '.xml' + '</XML_PATH>' + '\n'
            offGazeTag = '<OFFICIAL_GAZETTE>' + offGaze + '</OFFICIAL_GAZETTE>'
            finalDpTag = '<FINAL_DISPOSIT>' + finalDp + '</FINAL_DISPOSIT>'
            courtName = '<COURTNAME>' + courtNm + '</COURTNAME>' + '\n'

            if courtDc:
                dateCt = dateParsing(courtDc)
                if dateCt:
                    if dateCt[0]:
                        sysDate = '<SYSDATE>' + dateCt[0] + '</SYSDATE>' + '\n'
                    if dateCt[1]:
                        pbDate = dateCt[1].group()
            else:
                sysDate = '<SYSDATE></SYSDATE>' + '\n'

            if 'AJ' == divMain:
                publicDate = ('<SENTENCEDATE></SENTENCEDATE>' + '\n'
                              + '<CONCLUSIONDATE>' + pbDate + '</CONCLUSIONDATE>' + '\n')
            else:
                publicDate = ('<SENTENCEDATE>' + pbDate + '</SENTENCEDATE>' + '\n'
                              + '<CONCLUSIONDATE></CONCLUSIONDATE>' + '\n')

            allCont = str(soup.prettify(formatter="none")).replace('−', '-')
            templeteXML = ('<?xml version="1.0" encoding="UTF-8"?>\n'
                           '<?xml-stylesheet type="text/xsl" href="/view/prec_jp.xsl"?>' + '\n'
                           + '<root xmlns:dc="http://purl.org/dc/elements/1.1/" '
                             'xmlns:dcterms="http://purl.org/dc/terms/" '
                             'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
                             'xsi:noNamespaceSchemaLocation="bx_Dublin.xsd" >' + '\n'
                           + divMainTag + divSub + caseNum + kindTag + offGazeTag + finalDpTag
                           + debugCaseNum + caseNm + senten + publicDate + conClu + fileIdTag
                           + courtName + sysDate + xmlPath + allCont + '</root>')

            with open('C:/dev/4.pCourt/00.Data/' + dirNm + '/' + fileNm + '.xml',
                      'w', encoding='UTF-8') as pf:
                pf.write(templeteXML)

            # Alternate the spinner character on every other file.
            clicker = '○' if counter.value % 2 == 0 else '●'
            sys.stdout.write(
                "\r {1} Completed Percent > [ {0}% ] {2:0.2f}".format(
                    str(int(counter.value / totalCnt * 100 + 1)), clicker,
                    ((time.time() - strTime) / 60)))
            sys.stdout.flush()
    except Exception as e:
        with errCnt.get_lock():
            errCnt.value += 1
        with open('C:/dev/4.pCourt/err/judge/err.txt', 'a', encoding='UTF-8') as ef:
            errLog = fileNm + '>>' + '_'.join(fileInfo[0:2]) + '>>' + str(e) + '\n'
            ef.write(errLog)
from bs4 import CData, BeautifulSoup

tag = CData('<p>Hello!</p>')
soup = BeautifulSoup('<article><h1>Hi!</h1><gohere /></article>', 'lxml')
soup.find('gohere').replace_with(tag)
res = soup.prettify(formatter='minimal')
print(res)
print("doesn't work")
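# A likely reason the snippet above disappoints: HTML has no real CDATA
# sections, so although BeautifulSoup writes the CData node out literally, an
# HTML parser reading the result treats <![CDATA[...]]> as a bogus comment.
# With the xml tree builder the section round-trips (document is illustrative):
from bs4 import BeautifulSoup, CData

soup = BeautifulSoup('<article><gohere/></article>', 'xml')
soup.find('gohere').replace_with(CData('<p>Hello!</p>'))
out = str(soup)
print(out)                       # ...<article><![CDATA[<p>Hello!</p>]]></article>
reparsed = BeautifulSoup(out, 'xml')
print(reparsed.article.string)   # <p>Hello!</p>, recovered verbatim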
def blockTag(block, soup):
    # block is a (dials, text) pair; wrap the text in CDATA so markup survives.
    block_tag = soup.new_tag("block")
    if block[0] != '':
        block_tag['dials'] = block[0]
    block_tag.append(CData(block[1].strip()))
    return block_tag
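# A usage sketch for blockTag, assuming an xml soup; the sample block tuple is
# illustrative.
from bs4 import BeautifulSoup

soup = BeautifulSoup(features='xml')
print(blockTag(('1 2', ' Press <1> to continue '), soup))
# <block dials="1 2"><![CDATA[Press <1> to continue]]></block>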
from bs4 import BeautifulSoup, CData
import lxml

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Replace the string inside <b> (a plain NavigableString here, not a Comment)
# with a CDATA block.
cdata = CData("A CDATA block")
comment = soup.b.string
comment.replace_with(cdata)
print(soup.b.prettify())
del tag['nothing']  # doesn't matter
del tag['id']
print(tag)
# print(tag['class'])  # KeyError once deleted
print(tag.get('class'))

print(tag.string, type(tag.string))
# unicode_string = unicode(tag.string)  # Python 2 only
# print(unicode_string, type(unicode_string))
tag.string.replace_with("No longer bold")
print(tag)

id_soup = BeautifulSoup('<p id="my id"></p>', 'html.parser')
print(id_soup.p['id'])  # not a multi-valued attribute

rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>', 'html.parser')
print(rel_soup.a['rel'])
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)
print(rel_soup.a['rel'])

markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, "html.parser")
comment = soup.b.string
print(type(comment), comment)
print(soup.b.prettify())

from bs4 import CData
cdata = CData("*A cdata block*")
comment.replace_with(cdata)
print(soup.b.prettify())
def _sync_content(cloud_site_url, cloud_site_username, cloud_site_api_token,
                  cloud_site_page_id, server_site_url, server_site_username,
                  server_site_password, server_site_space_key,
                  server_site_ancestor_id):
    # get cloud page
    resp = cloudsite.get_content(cloud_site_url, cloud_site_username,
                                 cloud_site_api_token, cloud_site_page_id)

    # process storage value
    storage_value = resp['body']['storage']['value']

    # parse storage value
    soup = BeautifulSoup(storage_value, features="html.parser")
    for macro in soup.find_all('ac:structured-macro'):
        # only process plantumlcloud
        if macro.get('ac:name') != 'plantumlcloud':
            continue

        # get data
        cloud_data = ''
        for macro_parameter in macro.find_all('ac:parameter'):
            if macro_parameter.get('ac:name') != 'data':
                continue
            cloud_data = macro_parameter.contents
            if cloud_data:
                cloud_data = cloud_data[0]
        if not cloud_data:
            continue

        # decompress data
        converted_cloud_data = cloudsite.decompress_plantuml_data(cloud_data)

        # modify content
        if macro.get('ac:name') == 'plantumlcloud':
            macro['ac:name'] = 'plantuml'
            new_text_body_tag = soup.new_tag("ac:plain-text-body")
            new_text_body_tag.append(CData(converted_cloud_data))
            macro.append(new_text_body_tag)

    converted_storage_value = soup.encode()

    # convert data
    data_to_create = {
        "title": resp['title'],
        "type": resp['type'],
        "space": {"key": server_site_space_key},
        "ancestors": [{"id": server_site_ancestor_id}],
        "body": {
            "storage": {
                "value": converted_storage_value,
                "representation": resp['body']['storage']['representation'],
            }
        },
    }

    # add content
    resp = serversite.add_content(server_site_url, server_site_username,
                                  server_site_password, data_to_create)
    print(resp)
    return resp['id']
tool_conf_file.close()

license_clauses = json.load(open("clauses.json"))

for tool in tools:
    print(tool)
    tool_group, tool_name = re.sub(r"\.xml$", "", tool).split("/")
    print(tool_group, tool_name)
    if tool_group in license_clauses:
        tool_xml_file = open("../tools/{}".format(tool))
        tool_soup = BeautifulSoup(tool_xml_file)
        help_tag = tool_soup.find("help")
        if help_tag is not None:
            old_help = ' '.join([str(e) for e in help_tag.contents])
            if "License\n" in old_help:
                new_help = CData(old_help)
            else:
                new_help = CData(old_help + "\nLicense\n-------\n\n" + license_clauses[tool_group])
        else:
            help_tag = tool_soup.new_tag("help")
            tool_soup.tool.append(help_tag)
            new_help = CData("\nLicense\n-------\n\n" + license_clauses[tool_group])
        help_tag.string = ""
        help_tag.string.replace_with(new_help)
        print(tool_soup.tool)
        tool_xml_file.close()
        # with open("../tools/{}".format(tool), "w") as out_file:
def process_page(title, files):
    page_set = conf_get("content", spaceKey=space_key, title=title,
                        expand="body.storage,version")
    page = page_set['results'][0]
    body = page['body']['storage']['value']

    # Really should use an XML parser here because this default HTML one inserts bollocks
    # HTML and BODY tags. I don't though because the Confluence storage format doesn't
    # explicitly declare its entities, so XML parsers choke on e.g. &lsquo;. In the future
    # it might be worth inserting some DOCTYPE metadata that resolves the entities so the
    # XML parser works, but for now all this means is that I need to strip the stupid
    # extra tags below.
    soup = BeautifulSoup(body, "lxml")
    headings = soup.find_all(_matched_heading)
    for heading in headings:
        file_hdr = heading.string
        code_macro = [sibling for sibling in heading.next_siblings
                      if _tag_is_code_macro(sibling)][0]
        code_el = code_macro.find('ac:plain-text-body')
        # If the code macro is empty, it won't even have the plain-text-body to insert in to.
        if code_el is None:
            code_el = soup.new_tag('ac:plain-text-body')
            code_macro.append(code_el)
        if file_hdr not in files:
            print(f"  Unmatched header {file_hdr}")
            continue
        try:
            fname = files[file_hdr]
            print(f"  Heading \"{file_hdr}\" matched to file {fname}")
            with open(fname) as f:
                data = CData(f.read())
            code_el.append(data)
        except Exception as e:
            print(f"  Can't find file '{files[file_hdr]}' for heading '{file_hdr}' ({e})")

    # Clean up the page structure, keeping only the keys we want and incrementing the
    # version number
    page_id = page['id']
    required_keys = ['id', 'title', 'type', 'status', 'body']
    new_page = {key: page[key] for key in required_keys}
    # Intentionally swap out the dict here, not just increment in-place, as the original
    # version dict carries a bunch of cruft
    new_page['version'] = {'number': page['version']['number'] + 1}
    # There must be a better way to strip the body tags from this string?
    new_page['body']['storage']['value'] = str(soup.body)[6:-7]

    # Upload!
    result = conf_put(f"content/{page_id}", data=json.dumps(new_page))
    if 'success' in result and not result['success']:
        raise Exception(str(result))
    print(f"  Success. New version is {result['version']['number']}")
def convert_to_dbz(soup):
    # soup is a BeautifulSoup document for one MLS listing
    MLSNumber = soup.find('mlsnumber')
    if MLSNumber is not None:
        # start by creating a parent <property> tag (xml parser made explicit)
        dbz_soup = BeautifulSoup('<property></property>', 'xml')
        property_tag = dbz_soup.property

        # calculate the ref_no
        codes = MLSNumber.text.split('-')
        if codes[0] in TYPE_RENT:
            type_tag = dbz_soup.new_tag('type')
            type_tag.append('RP')
        elif codes[0] in TYPE_SALE:
            type_tag = dbz_soup.new_tag('type')
            type_tag.append('SP')
        else:
            # log and email
            print("incompatible mlsnumber")
            return None
        ### adding type tag ###
        property_tag.append(type_tag)

        if codes[1] in APARTMENT:
            subtype_tag = dbz_soup.new_tag('subtype')
            subtype_tag.append('AP')
        elif codes[1] in VILLA:
            subtype_tag = dbz_soup.new_tag('subtype')
            subtype_tag.append('VI')
        elif codes[1] in SUBTYPE_COMMERCIAL:
            if codes[2] not in COMMERCIAL_CODES or not codes[2]:
                print("commercial codes")
                return None
            else:
                subtype_tag = dbz_soup.new_tag('subtype')
                subtype_tag.append('CO')
                commercial_tag = dbz_soup.new_tag('commercialtype')
                commercial_tag.append(codes[2].upper())
                property_tag.append(commercial_tag)
        elif codes[1] in MULTIPLE_UNITS and codes[0] in TYPE_SALE:
            subtype_tag = dbz_soup.new_tag('subtype')  # was subtype_tag.new_tag(...), a bug
            subtype_tag.append('BU')
        elif codes[1] in LAND_FOR_SALE and codes[0] in TYPE_SALE:
            subtype_tag = dbz_soup.new_tag('subtype')  # was subtype_tag.new_tag(...), a bug
            subtype_tag.append('LA')
        else:
            print('subtype')
            return None
        ### adding subtype tag ###
        property_tag.append(subtype_tag)
    else:
        print('MLSNumber is empty')
        return None

    ## status tag ##
    status = soup.find('listingstatus').text
    status_tag = dbz_soup.new_tag('status')
    if status == 'Active':
        status_tag.append('vacant')
    else:
        status_tag.append('deleted')
    property_tag.append(status_tag)

    ## ref no tag ##
    ref_no_tag = dbz_soup.new_tag('refno')
    ref_no_tag.append(MLSNumber.text)
    property_tag.append(ref_no_tag)

    ## title tag ##
    title_tag = dbz_soup.new_tag('title')
    title = soup.find('streetname').text
    if title:
        title_tag.append(title)
    property_tag.append(title_tag)

    ## CDATA description tag ##
    description_tag = dbz_soup.new_tag('description')
    description = soup.find('publicremark')
    if description:
        description = CData(description.text)
        description_tag.append(description)
        property_tag.append(description_tag)
    else:
        return None  # description is a required field

    ## city tag ##
    city_tag = dbz_soup.new_tag('city')
    city = soup.find('city')
    if city:
        city = city.text.lower()
        city_code = CITY_CODES[city]
        city_tag.append(str(city_code))
        property_tag.append(city_tag)

    ## size ##
    size_tag = dbz_soup.new_tag('size')
    size = soup.find('squarefeet')
    if size:
        size_value = size.text
        size_tag.append(size_value)
        property_tag.append(size_tag)
    else:
        return None

    ## price ##
    price_tag = dbz_soup.new_tag('price')
    price = soup.find('listprice')
    if price:
        price_tag.append(price.text)
        property_tag.append(price_tag)

    ## location ##
    location_text_tag = dbz_soup.new_tag('locationtext')
    location_text = soup.find('listingarea')
    if location_text:
        location_text_tag.append(location_text.text)
        property_tag.append(location_text_tag)

    ## building ##
    building_tag = dbz_soup.new_tag('building')
    building = soup.find('buildingfloor')
    if building:
        building_tag.append(building.text)
        property_tag.append(building_tag)

    ## lastupdated ##
    lastupdated_tag = dbz_soup.new_tag('lastupdated')
    lastupdated = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lastupdated_tag.append(lastupdated)
    property_tag.append(lastupdated_tag)

    ## contactemail ##
    email = soup.find('email')
    if email:
        email_tag = dbz_soup.new_tag('contactemail')
        email_tag.append(email.text)
        property_tag.append(email_tag)

    ## contactnumber ##
    contactnumber_tag = dbz_soup.new_tag('contactnumber')
    cellphone = soup.find('cellphone')
    if cellphone:
        contactnumber_tag.append(cellphone.text)
        property_tag.append(contactnumber_tag)

    ## images ##
    images = soup.find_all('picture')
    if images:
        print('calling build_images')
        image_urls = build_images(images, refno=MLSNumber.text)
        if image_urls:
            image_tag = dbz_soup.new_tag('photos')
            print('creating image tag')
            # '|'-separated list of URLs (same output as the original loop)
            image_tag.append('|'.join(image_urls))
            property_tag.append(image_tag)

    ## bedrooms ##
    bedrooms = soup.find('bedrooms')
    if codes[1] in VILLA or codes[1] in APARTMENT:  # was `subtype_tag in ...`, always False
        if bedrooms.text:
            bedrooms_tag = dbz_soup.new_tag('bedrooms')
            bedrooms_tag.append(bedrooms.text)
            property_tag.append(bedrooms_tag)
        else:
            return None

    ## bathrooms ##
    bathrooms = soup.find('bathtotal')
    if bathrooms.text:
        bathrooms_tag = dbz_soup.new_tag('bathrooms')
        bathrooms_tag.append(bathrooms.text)
        property_tag.append(bathrooms_tag)

    ## amenities ##
    amenities = []
    parking = soup.find('parking')
    try:
        parking_contents = parking.contents
        for content in parking_contents:
            if content.text == 'Yes':
                amenities.append('CP')
                break
    except AttributeError:  # no <parking> element
        pass
    ac = soup.find('cooling')
    if ac.text:
        amenities.append('AC')
    features = soup.find_all('feature')
    if features:
        for feature in features:
            if feature.text in DBZ_AMENITIES:
                amenities.append(DBZ_AMENITIES[feature.text])
    # The original chained .append(), which returns None, so nothing was ever
    # attached; build the tag first, then append the joined codes (the '|'
    # separator is an assumption, mirroring the photos field above).
    if codes[1] in VILLA or codes[1] in APARTMENT:
        amenities_tag = dbz_soup.new_tag('privateamenities')
        amenities_tag.append('|'.join(amenities))
        property_tag.append(amenities_tag)
    elif codes[1] in SUBTYPE_COMMERCIAL:
        amenities_tag = dbz_soup.new_tag('commercialamenities')
        amenities_tag.append('|'.join(amenities))
        property_tag.append(amenities_tag)

    return dbz_soup
print(type(tag.string))  # NavigableString wraps the string inside a tag
tag.string.replace_with('no longer bold')
print(tag)
tag.name = 'r'
print(tag)

# The third object type: BeautifulSoup
print(soup2.name)

# The fourth object type: Comment, a special NavigableString for the
# comment sections of a document
markup = "<b><!--hey, want to buy a used parser?--></b>"
soup3 = BeautifulSoup(markup, 'lxml')
comment = soup3.b.string
print(comment)
print(soup3.b.prettify())
cdata = CData('A CDATA block')
comment.replace_with(cdata)
print(soup3.b.prettify())

print('----------------------- Traversing the document tree -----------------------------------')
soup4 = BeautifulSoup(html_doc, 'lxml')
print(soup4.body.p.b)
head_tag = soup4.head
print(head_tag.contents)  # a tag's children, output as a list
title_tag = head_tag.contents[0]
print(title_tag)
print(title_tag.contents)
print(len(soup4.contents))
print(soup4.contents[0].name)
print(soup.name)

# Comments
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, 'html.parser')
comment = soup.b.string
print(type(comment))
# the extracted comment text
print(comment)
print(soup.b.prettify())

from bs4 import CData
cdata = CData('A CDATA block')
comment.replace_with(cdata)
print(soup.b.prettify())

soup = BeautifulSoup(html_doc, 'html.parser')
## Traversing the document tree: tag names
print(soup.head)
print(soup.name)
print(soup.body.b)
# Dot access only returns the first tag with that name
print(soup.a)
print(soup.find_all('a'))
# contents outputs a tag's children as a list; direct children only
#coding=utf8
from bs4 import CData
from bs4 import BeautifulSoup
from bs4.builder import LXMLTreeBuilderForXML

xml = '''
<?xml version="1.0" ?>
<foo>
    <bar><![CDATA[!@#$%^&*()_+{}|:"<>?,./;'[]\-=]]></bar>
</foo>
'''

builder = LXMLTreeBuilderForXML()  # unused; passing "xml" below selects the same builder
soup = BeautifulSoup(xml, "xml")
print(soup.new_string)  # bound method; leftover debug print
# lxml collapses the CDATA section into plain text on parse,
# so re-wrap the string in CData before serializing.
soup.foo.bar.string = CData(soup.foo.bar.string)
soup = soup.prettify(formatter="xml")
print(soup)