def strip_elements(self, elements):
    """
    Remove list of elements (named by tag) from the node.
    (Wrapper for etree.strip_elements())
    """
    # ``elements`` may be a single tag name or a list of tag names,
    # matching lxml.etree.strip_elements() semantics; operates in place
    # on self.node.
    etree.strip_elements(self.node, elements)
def append_body_footer(self, article):
    """
    Checks if the article has any Public Service Announcements and
    if available appends each of them to the body.

    :param article: article dict; reads body_html/description/body_footer
    :return: body with public service announcements.
    """
    # Normalize void <br> tags so the HTML parser below sees valid XHTML.
    try:
        article['body_html'] = article['body_html'].replace('<br>', '<br/>')
    except KeyError:
        pass
    body = ''
    # Text-like items carry their content in body_html; media items in description.
    if article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
        body = article.get('body_html', '')
    elif article[ITEM_TYPE] in [CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO]:
        body = article.get('description', '')
    if body and article.get(FORMAT, '') == FORMATS.PRESERVED:
        # Preserved-format output uses CRLF line endings; collapse any
        # accidental double-CR introduced by the replacement.
        body = body.replace('\n', '\r\n').replace('\r\r', '\r')
        parsed = parse_html(body, content='html')
        # Turn each <br> into a literal CRLF carried by the tail, then
        # drop the <br> elements themselves (tails are kept).
        for br in parsed.xpath('//br'):
            br.tail = '\r\n' + br.tail if br.tail else '\r\n'
        etree.strip_elements(parsed, 'br', with_tail=False)
        body = etree.tostring(parsed, encoding="unicode")
    if body and article.get('body_footer'):
        footer = article.get('body_footer')
        # Preserved format gets the footer as plain text on its own line.
        if article.get(FORMAT, '') == FORMATS.PRESERVED:
            body = '{}\r\n{}'.format(body, get_text(footer))
        else:
            body = '{}{}'.format(body, footer)
    return body
async def extractor(self, html):
    """Parse raw forum/page HTML and yield (post_id, raw_links, name) tuples.

    Parsing runs in a thread-pool executor so the event loop is not
    blocked; on a parse failure the generator simply ends.
    """
    # Strip HTML comment markers; parsers can choke on malformed comments.
    html = html.replace('<!--', '').replace('-->', '')
    loop = asyncio.get_event_loop()
    try:
        root = await loop.run_in_executor(
            None, partial(fromstring, html, parser=self.parser),
        )
    except (ParserError, XMLSyntaxError):
        return
    content = root.xpath(self.settings['path']['content'])
    if not content:
        # Nothing matched: back off before the caller polls again.
        await asyncio.sleep(self.settings['cooldown'])
        return
    # Newest-first: iterate matched posts in reverse document order.
    for tag in reversed(content):
        post_id = tag.attrib.get('id')
        if not post_id:
            raise SystemExit('Post ID not found')
        links = tag.xpath(self.settings['path']['links'])
        raw_links = self.dl_links(links)
        # Drop anchors before extracting the display name text.
        strip_elements(tag, 'a')
        names = (el.strip() for el in tag.xpath(self.settings['path']['name']))
        name = ' '.join(el for el in names if el)
        yield post_id, raw_links, name
def _get_wiki_content(self, url, retry=1):
    """
    Fetch ``url`` and strip off the useless tags (table, javascript and
    so on, see EXCLUDE_TAGS).

    Uses a 5 second timeout and ``retry`` additional attempts on request
    failure.

    :param url: input url (String)
    :param retry: number of retries after a failed request
    :return: tuple (cleaned_html, final_url), or (None, None) on failure
    """
    while retry >= 0:
        try:
            r = requests.get(url, timeout=5)
        except requests.exceptions.RequestException:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            self.logger.exception(
                "Exception while requesting {0}".format(url))
            retry -= 1
            continue
        if r.status_code == 200:
            self.logger.info("Successful {0} status code {1}".format(
                url, r.status_code))
            # Raw string avoids the invalid '\s' escape warning.
            # NOTE(review): r.content is bytes on Python 3; this code
            # assumes a str-compatible body (Python 2) — confirm runtime.
            trimmed_content = re.sub(r'\s+', ' ', r.content)
            dom_xml = html.fromstring(trimmed_content)
            etree.strip_elements(dom_xml, self.EXCLUDE_TAGS)
            cleaned_html = html.tostring(dom_xml)
            return cleaned_html, r.url
        else:
            # warning() replaces the deprecated Logger.warn() alias.
            self.logger.warning("Failed {0} status code {1}".format(
                url, r.status_code))
            return None, None
    return None, None
def crawl_link_to_index(inp):
    # Fetch a web page, strip <script> content, and index the remaining
    # text in Elasticsearch under the given document id.  (Python 2 code.)
    idx, link = inp
    print idx, link
    try:
        print link
        response = urllib.urlopen(link)
        # Back off while the upstream server reports 502 Bad Gateway.
        while response.getcode() == 502:
            time.sleep(60)
            response = urllib.urlopen(link)
        page_content = response.read()
        tree = etree.HTML(page_content, parser=html_parser)
        # Remove <script> elements (content and all), then unwrap any
        # stray <script> tags that remain.
        etree.strip_elements(tree, 'script')
        etree.strip_tags(tree, 'script')
        text_data = "\n".join(filter(lambda chunk: chunk != '',
                                     [t.strip() for t in tree.itertext()]))
        page_title = tree.find(".//title").text
        es.index(index=index_name, doc_type="page", id=idx, body={
            "url": link,
            "title": page_title,
            "page_text": text_data
        })
        print "-" * 10
    except Exception, e:
        # Broad catch: log-and-continue crawling style; errors only printed.
        print e
def clean(text):
    """Parse *text* as HTML, drop all <pre> and <code> elements (content,
    tags, and their tails) and return the serialized result as bytes."""
    from lxml import etree
    from lxml.etree import strip_elements, tostring

    parsed = etree.fromstring(text, parser=etree.HTMLParser())
    for unwanted in ('pre', 'code'):
        strip_elements(parsed, unwanted)
    return tostring(parsed, with_tail=False)
def update_range(config, ranges):
    """Rebuild the <range> records under *config* from *ranges*.

    *ranges* maps season -> iterable of episode numbers.  Consecutive
    episodes are collapsed into <range season start end> children of the
    <downloaded> node (created if missing).  Returns *config*.
    """
    # Drop all previously recorded <range> elements before rebuilding.
    etree.strip_elements(config, 'range')
    downloaded = config.find('./downloaded')
    if downloaded is None:
        downloaded = etree.SubElement(config, 'downloaded')

    def add(season, start, end):
        # Emit one collapsed run; XML attributes must be strings.
        etree.SubElement(downloaded, 'range', season=str(season),
                         start=str(start), end=str(end))

    for season, rng in ranges.items():
        rng = sorted(rng)
        end = start = rng[0]
        for i, ep in enumerate(rng[1:]):
            if ep == end + 1 and i != len(rng) - 2:
                # Episode extends the current run (and is not the last one).
                end = ep
            elif ep == end + 1:
                # Last episode extends the run: close it out immediately.
                end = ep
                add(season, start, end)
            else:
                # Gap found: emit the finished run and start a new one.
                add(season, start, end)
                start = end = ep
        if start == end:
            # Trailing single-element run (or a single-episode season).
            add(season, start, end)
    return config
def replace_links(tree):
    """Replace descendent anchors with their contents.

    >>> xml = ''.join([
    ...     '<p>The <a class="reference internal" href="#module-doctest" title="do',
    ...     'ctest: Test pieces of code within docstrings."><code class="xref py p',
    ...     'y-mod docutils literal"><span class="pre">doctest</span></code></a> m',
    ...     'odule searches for pieces of text that look like interactive Python s',
    ...     'essions, and then executes those sessions to verify that they work ex',
    ...     'actly as shown. There are several common ways to use doctest:</p>'
    ... ])
    >>> root = html.fromstring(xml)
    >>> etree.tostring(replace_links(root), encoding='unicode', method='xml')
    '<p>The <c..."><s...="pre">doctest</span></code> module ... doctest:</p>'
    >>> etree.tostring(replace_links(html.fromstring(
    ...     '<html>nice <a>test</a></html>')), encoding='unicode', method='xml')
    '<html><body><p>nice test</p></body></html>'
    >>> etree.tostring(replace_links(html.fromstring(
    ...     '<a>test</a>')), encoding='unicode', method='xml')
    '<a>test</a>'
    """
    for a in tree.xpath('.//a'):
        if a.text:
            prev = a.getprevious()
            if prev is None:
                # Anchor is the first child: its text joins the parent's
                # leading text.
                p = a.getparent()
                p.text = (p.text if p.text else '') + a.text
            else:
                # BUG FIX: when the anchor has a preceding sibling,
                # appending to the parent's .text would move the anchor
                # text before that sibling; it belongs in the sibling's
                # tail to preserve document order.
                prev.tail = (prev.tail if prev.tail else '') + a.text
        # Move the anchor's children in front of it; their tails travel
        # with them, keeping the inline content intact.
        for e in a.iterchildren():
            a.addprevious(e)
    # with_tail=False keeps the text that followed each anchor.
    etree.strip_elements(tree, 'a', with_tail=False)
    return tree  # Not necessary but makes chaining easier.
def test1():
    # Scratch test: parse a JATS-style paragraph and strip the <sup>
    # citation markers (with their tails) before serializing.
    xmlstr = """ <root><p id="sec1"> The ecstasy of discovering a new hit from screening can lead to a highly productive research effort to discover new bioactive compounds. However, in too many cases this ecstasy is followed by the agony of realizing that the compounds are not active against the desired target. Many of these false hits are Pan Assay INterference compoundS (PAINS) <sup> <xref ref-type="bibr" rid="ref1">1</xref> </sup> or colloidal aggregators. <sup> <xref ref-type="bibr" rid="ref2">2</xref> </sup> Whether the screen is conducted in silico or in the laboratory and whether screening libraries, natural products, or drugs are used, all discovery efforts that rely on some form of screening to identify bioactivity are susceptible to this phenomenon. Studies that omit critical controls against experimental artifacts caused by PAINS may waste years of research effort as useless compounds are progressed. <sup> <xref ref-type="bibr" rid="ref3">3</xref> − <xref ref-type="bibr" rid="ref8">8</xref> </sup> The American Chemical Society (ACS) is eager to alert the scientific community to this problem and to recommend protocols that will eliminate the publication of research articles based on compounds with artificial activity. This editorial aims to summarize relevant concepts and to set the framework by which relevant ACS journals will address this issue going forward. </p> </root> """
    root = etree.fromstring(xmlstr)
    # etree.strip_tags(root, 'xref')
    # Remove <sup> elements entirely, keeping the text that follows them.
    etree.strip_elements(root, 'sup', with_tail=False)
    # stuff = handle_paragrap('1111', root.find('p'))
    print(etree.tostring(root, pretty_print=True))
def hangzhou_modifier(response):
    # Normalize a Hangzhou exchange-centre announcement page: rebuild the
    # javascript download links as direct URLs, absolutize links, strip
    # scripts/styles, and return the cleaned main-content HTML.
    body = lxml.html.fromstring(response)
    # Blank out the page-title block; it is rendered elsewhere.
    del_ele = body.xpath('//div[@class="MainTitle"]')
    for ele in del_ele:
        ele.clear()
    # Attachments are exposed via onclick="DownLoad(name, path)"; pair
    # each captured argument tuple with its placeholder anchor in order.
    elements = re.findall('onclick="DownLoad\((.*?)\)"', response)
    node_elems = body.xpath('//div/ul/li/a[@href="javascript:void(0);"]')
    for element, node_elem in zip(elements, node_elems):
        str_list = element.replace('\'', '')
        link_str_1, link_str_2 = str_list.split(',')
        href = 'http://file.hzctc.cn/UService/DownLoadFile.aspx?dirtype=3&filepath={}&showName={}'.format(
            link_str_2, link_str_1)
        # Collapse any whitespace that leaked into the constructed URL.
        href = re.sub(r'\s+', '', href)
        node_elem.set('href', href)
    body.make_links_absolute('http://www.hzctc.cn/')
    etree.strip_elements(body, "script", "style", "title")
    # Redirect external-window links to the application portal.
    for ele in body.xpath('//a[@target="_blank"]'):
        ele.set('href', 'http://app1.hzctc.cn/')
    try:
        element_2 = body.xpath('//div[@class="content"]')[0]
        content = etree.tostring(element_2, encoding='utf-8').decode('utf-8')
        data = cleaner.clean_html(content)
    except Exception as e:
        raise e
    return data
def extract_content(tree, include_tables=False):
    '''Find the main content of a page using a set of XPath expressions,
    then extract relevant elements, strip them of unwanted subparts and
    convert them'''
    sure_thing = False
    result_body = etree.Element('body')
    # iterate over the known content XPaths until one yields a usable body
    for expr in BODY_XPATH:
        # select tree if the expression has been found
        subtree = tree.xpath(expr)
        if len(subtree) == 0:
            continue
        subtree = subtree[0]
        # prune obviously unwanted subtrees first
        subtree = discard_unwanted(subtree)
        # remove list elements dominated by links (likely navigation)
        for elem in subtree.iter('list'):
            if link_density_test(elem) is True:
                elem.getparent().remove(elem)
                continue
            elem.attrib.clear()
            #for subelem in elem.iter('item'):
            #    subelem.attrib.clear()
        etree.strip_tags(subtree, 'a', 'link', 'span')
        # define iteration strategy
        potential_tags = set(TAG_CATALOG)  # + 'span'?
        if include_tables is True:
            potential_tags.add('table')
        # no paragraphs containing text: widen the scan to <div> elements
        if len(subtree.xpath('//p//text()')) == 0:
            potential_tags.add('div')
        LOGGER.debug(sorted(potential_tags))
        # extract content
        processed_elems = [
            handle_textelem(elem, potential_tags)
            for elem in subtree.xpath('.//*')
        ]
        result_body.extend(list(filter(None.__ne__, processed_elems)))
        # exit the loop if the result has children
        if len(result_body) > 0:
            sure_thing = True
            LOGGER.debug(expr)
            break
    # try parsing wild <p> elements if nothing found or text too short
    temp_text = trim(' '.join(result_body.itertext()))
    len_text = len(temp_text)
    if len(result_body) == 0 or len_text < MIN_EXTRACTED_SIZE:
        result_body = recover_wild_paragraphs(tree, result_body)
        temp_text = trim(' '.join(result_body.itertext()))
        len_text = len(temp_text)
    # filter output: drop processing markers, unwrap leftover divs
    etree.strip_elements(result_body, 'done')
    etree.strip_tags(result_body, 'div')
    return result_body, temp_text, len_text, sure_thing
def crawl_proxyGoubanjia(self):
    """
    Fetch proxies from http://www.goubanjia.com/

    :return: yields each proxy address as a string
    """
    start_url = "http://www.goubanjia.com/"
    html = get_page(start_url)
    if html:
        htmlEle = etree.HTML(html, etree.HTMLParser())
        result = htmlEle.xpath("//td[@class='ip']")
        for td in result:
            # The site hides junk digits inside display:none <p> elements
            # to defeat naive scraping; strip them before reading the text.
            etree.strip_elements(td, 'p')
            text = td.xpath(".//text()")
            yield ''.join(text)
def update_definitions(self):
    """
    Replace the existing Definition elements with a fresh set.

    A sequence of changes is recorded in self.changes.  The position for
    the inserts is determined by walking past all of the elements which
    precede the Definition elements.  Then the sequence of Definition
    nodes to be inserted is reversed, so we can perform all of the
    insertions using the same position.
    """
    vals = self.Values(self.root, "Definition/DefinitionText")
    if vals.dups:
        # Pluralize only when more than one duplicate was eliminated.
        what = "definition" + (vals.dups > 1 and "s" or "")
        self.changes.add("%d duplicate %s eliminated" % (vals.dups, what))
    nodes = []
    for d in self.concept.definitions:
        if d.source == 'NCI':
            key = Concept.normalize(d.text)
            if key not in vals.used:
                # New or changed definitions come in as "Unreviewed";
                # only an unchanged existing definition stays "Reviewed".
                status = "Reviewed"
                original = vals.original.get(key)
                if original is None:
                    status = "Unreviewed"
                elif original != d.text:
                    status = "Unreviewed"
                    vals.updated += 1
                nodes.append(d.convert(status))
                vals.used.add(key)
    etree.strip_elements(self.root, "Definition")
    position = self.find_position(Concept.Definition.SKIP)
    # Insert in reverse so every node can use the same index.
    for node in reversed(nodes):
        self.root.insert(position, node)
    vals.record_definition_changes(self.changes)
def extract_page(local_url, remote_url):
    """Fetch a locally served docs page and split it into per-section records.

    Returns ``(page_results, href)`` where each result maps ``url``/
    ``text``/``title`` and *href* is the next-page link (the anchor with
    ``accesskey='n'``) or an empty list when absent.
    """
    response = requests.get(local_url)
    response.raise_for_status()
    response.encoding = "utf-8"

    document = lxml.html.fromstring(response.content)
    # Remove non-content elements before extracting text.
    for tag_name in ("script", "style"):
        etree.strip_elements(document, tag_name)

    page_results = []
    for section in document.xpath("//div[contains(@class, 'section')]"):
        # Skip generated JSON-expansion sections.
        if "expandjson" in section.attrib["class"]:
            continue
        text, section_id = extract_section(section)
        # Page title is everything before the em-dash in <title>.
        title = document.xpath("//title/text()")[0].split("—")[0].strip()
        heading = section.xpath("h1|h2|h3|h4|h5")[0]
        section_title = heading.text_content().rstrip("¶")
        if title != section_title:
            title = f"{title} - {section_title}"
        page_results.append({
            "url": f"{remote_url}#{section_id}",
            "text": text,
            "title": title,
        })

    href = document.xpath("//a[@accesskey='n']/@href")
    if href:
        href = href[0]
    return page_results, href
def fuyang_modifier(response):
    # Clean a Fuyang exchange-centre announcement page: strip scripts and
    # iframes, rewrite javascript attachment links to direct URLs, and
    # return the main content block as a unicode HTML string.
    body = lxml.html.fromstring(response)
    body.make_links_absolute('http://www.hzfyggzy.org.cn/')
    etree.strip_elements(body, "script", "style", "title", 'iframe')
    # Blank out the page-title block; it is rendered elsewhere.
    del_ele = body.xpath('//div[@class="MainTitle"]')
    for ele in del_ele:
        ele.clear()
    # Attachments are exposed via onclick="DownLoad(name, path)"; pair
    # each captured argument tuple with its placeholder anchor in order.
    elements = re.findall('onclick="DownLoad\((.*?)\)"', response)
    node_elems = body.xpath('//div/ul/li/a[@href="javascript:void(0);"]')
    for element, node_elem in zip(elements, node_elems):
        str_list = element.replace('\'', '')
        link_str_1, link_str_2 = str_list.split(',')
        href = 'http://218.108.176.14:8002/DService/UpLoadFiles/ProAfficheAccessory/{}'.format(
            link_str_2)
        # Collapse any whitespace that leaked into the constructed URL.
        href = re.sub(r'\s+', '', href)
        node_elem.set('href', href)
    for ele in body.xpath('//a[@target="_blank"]'):
        ele.set('title', '投标人平台用户请登录投标人平台下载,其他用户请到交易中心窗口领取')
    try:
        element = body.xpath('//div[@class="content"]')[0]
    except Exception as e:
        raise e
    content = etree.tostring(element, encoding='utf-8')
    data = content.decode('utf-8')
    return data
def processData(baseURL, url, tree):
    # Persist all RSS <item> entries from *tree* to the scraperwiki store,
    # and (for the feed's own URL) the item-stripped feed root as well.
    # Returns False when any item lacks the required pubDate.
    # TODO: Determine if all we did was replace existing data this round to abort early
    # Collect all entries
    entries = []
    for entry in tree.findall('.//item'):
        # Collect ID: prefer <guid>, fall back to the mandatory <link>.
        id = entry.find('guid')  # optional
        if id is None:
            id = entry.find('link')  # mandatory
            id = 'link:' + id.text
        else:
            id = 'guid:' + id.text
        # Collect date - we require it, although RSS makes it optional.
        date = entry.find('pubDate')
        if date is None:
            scraperwiki.util.log("*** URL: %s reports items without pubDate - considering failure" % baseURL)
            return False
        date = dateutil.parser.parse(date.text)
        data = etree.tostring(entry)
        entries.append({"baseURL": baseURL, "id": id, "date": date, "data": data})
    scraperwiki.sqlite.save(unique_keys=["baseURL", "id"], data=entries, table_name="rss_item")
    if baseURL == url:
        # Strip the items and preserve the remaining feed-level metadata.
        etree.strip_elements(tree, 'item')
        data = etree.tostring(tree)
        entry = {"baseURL": baseURL, "data": data}
        scraperwiki.sqlite.save(unique_keys=["baseURL"], data=[entry], table_name="rss_root")
    return True
def test4():
    # Scratch test: parse nested <list> structures inside a paragraph and
    # collect item texts together with the surrounding text/tails, then
    # remove the first list and re-serialize.
    xmlstr = """ <root> <p>test before list 1 <list list-type="simple" id="l1"> <list-item><p>item 1.1</p></list-item> <list-item><p>item 1.2</p></list-item> </list>text after list 1 or before list 2 <list list-type="simple" id="l2"> <list-item><p>item 2.1</p></list-item> <list-item><p>item 2.2</p></list-item> </list></p>text after para </root> """
    root = etree.fromstring(xmlstr)
    # NOTE(review): this sample contains no <xref> elements, so the two
    # xref-stripping calls below are no-ops here.
    etree.strip_tags(root, 'xref')
    etree.strip_elements(root, 'xref', with_tail=True)
    p = root.find('p')
    result = []
    result.append(clean_string(p.text))
    for l in p:
        # Gather each list item's text, then the text trailing the list.
        for li in l:
            result.append(get_clean_text(li))
        result.append(clean_string(l.tail))
    result.append(clean_string(p.tail))
    print(result)
    print
    n = root.find(('p/list'))
    n.getparent().remove(n)
    print(etree.tostring(root, pretty_print=True))
def parseQuestionContentToList(body, title):
    """Tokenize a question's HTML body plus its title into a word list.

    <code> blocks are removed, remaining markup is unwrapped, the text is
    lowercased and tokenized, pure-punctuation tokens are dropped, and
    punctuation wrapped around the remaining words is stripped.

    :param body: question body as an HTML string
    :param title: question title
    :return: list of cleaned word tokens
    """
    root = etree.HTML(body)
    # Drop code samples entirely; they are noise for text analysis.
    etree.strip_elements(root, 'code', with_tail=False)
    # Unwrap every remaining tag, keeping only the text.
    etree.strip_tags(root, '*')
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    text = str(etree.tostring(root, pretty_print=True)[10:-11])[1:].lower()\
        .replace('\\n', ' ')\
        .replace("\\", '')\
        .replace("?", ' ')
    title = title.lower().replace("?", " ")
    text += " " + title
    tokens = nltk.word_tokenize(text)
    # Keep only tokens that contain at least one alphanumeric character.
    filtered = [w for w in tokens if nonPunct.match(w)]
    # Get rid of the punctuation left around the words.
    # BUG FIX: the original loop rebound its loop variable, which never
    # updated the list; strip() over the punctuation set is equivalent to
    # the intended leading/trailing removal and actually takes effect.
    filtered = [w.strip(string.punctuation) for w in filtered]
    return filtered
def _parse_tweet_text(self, text_element, tweet):
    # Extract plain text from a tweet's HTML, inlining emoji alt-text and
    # replacing anchor markup with raw hrefs; stores result in tweet['text'].
    # hacky way to include Emojis: push the alt text into the tail.
    for emoj in text_element.cssselect('img.Emoji'):
        emoj.tail = emoj.get('alt') + emoj.tail if emoj.tail else emoj.get('alt')
    # Modify Urls so they are correct
    for url in text_element.cssselect('a.twitter-timeline-link'):
        # Twitter truncates long display URLs with a Unicode ellipsis.
        is_truncated = u'\u2026' in url.text_content()
        url_disp = url.cssselect('span.js-display-url')
        if len(url_disp) > 0:
            url_disp_text = url_disp[0].text_content()
            if is_truncated:
                url_disp_text = url_disp_text + u'\u2026'
            # store for later extraction
            url.attrib['xtract-display-url'] = url_disp_text
        elif 'pic.twitter.com' in url.text:
            url.attrib['xtract-display-url'] = url.text
        # Replace the anchor's inner markup with the raw href.
        strip_elements(url, ['*'])
        url.text = url.attrib['href']
    tmp = str(text_element.text_content())
    # add a space before urls where required
    # NOTE(review): each insertion shifts later match offsets from the
    # original finditer pass by one — confirm multi-URL tweets behave.
    for m in re.finditer(r'(?<!\s)(?<!\\n)(http|https)://', tmp):
        tmp = tmp[:m.start()] + ' ' + tmp[m.start():]
    tweet['text'] = tmp
def parse_xml(self, filename, use_objectify=False, elements=None, tags=None):
    """
    Parse and clean the supplied file by removing any elements or tags we don't use.

    :param filename: The filename of the xml file to parse. Str
    :param use_objectify: Use the objectify parser rather than the etree parser. (Bool)
    :param elements: A tuple of element names (Str) to remove along with their content.
    :param tags: A tuple of element names (Str) to remove, preserving their content.
    :return: The root element of the xml document, or None on I/O failure
    """
    try:
        with open(filename, 'rb') as import_file:
            # NOTE: We don't need to do any of the normal encoding detection here, because lxml does it's own
            # encoding detection, and the two mechanisms together interfere with each other.
            if not use_objectify:
                tree = etree.parse(import_file, parser=etree.XMLParser(recover=True))
            else:
                tree = objectify.parse(import_file, parser=objectify.makeparser(recover=True))
            if elements or tags:
                self.wizard.increment_progress_bar(
                    translate('BiblesPlugin.OsisImport', 'Removing unused tags (this may take a few minutes)...'))
            if elements:
                # Strip tags we don't use - remove content
                etree.strip_elements(tree, elements, with_tail=False)
            if tags:
                # Strip tags we don't use - keep content
                etree.strip_tags(tree, tags)
            return tree.getroot()
    except OSError as e:
        self.log_exception('Opening {file_name} failed.'.format(file_name=e.filename))
        critical_error_message_box(
            title='An Error Occured When Opening A File',
            message='The following error occurred when trying to open\n{file_name}:\n\n{error}'
            .format(file_name=e.filename, error=e.strerror))
    return None
def _strip_elements_from_node(node, omit_list):
    """Return a deep copy of *node* with every tag named in *omit_list*
    removed (their trailing text is preserved); *node* itself is untouched."""
    pruned = copy.deepcopy(node)
    for tag_name in omit_list:
        etree.strip_elements(pruned, tag_name, with_tail=False)
    return pruned
def interpolate_wiki_links(self, elem):
    # Convert <page> child elements into markdown links, then return the
    # element's stripped text content, or None when it ends up empty.
    if len(elem) > 0:
        # Work on a copy so the original tree is left untouched.
        elem_copy = copy.deepcopy(elem)
        for child in elem_copy:
            if child.tag == "page":
                page = child.text_content().strip()
                link = "/gmod/" + page.replace(" ", "%20")
                if "text" in child.attrib:
                    # Explicit display text wins over derived names.
                    link_text = "[" + child.attrib["text"].strip(
                    ) + "](" + link + ")"
                elif page.startswith("Enums/"):
                    link_text = "[" + page[len("Enums/"
                                              ):] + "](" + link + ")"
                elif page in self.LINKS:
                    link_text = "[" + self.LINKS[page] + "](" + link + ")"
                else:
                    link_text = "[" + page + "](" + link + ")"
                # Stash the rendered link in the tail so it survives the
                # element stripping below.
                child.tail = link_text + (child.tail or '')
        strip_elements(elem_copy, "*", with_tail=False)
        text = elem_copy.text_content()
    else:
        text = elem.text_content()
    text = text.strip()
    if len(text) > 0:
        return text
def get_lyrics(self): element = self.element # Replace <br> tags with \n (prepend it with \n and then remove all # occurrences of <br>) for br in element.cssselect('br'): br.tail = '\n' + br.tail if br.tail else '\n' etree.strip_elements(element, 'br', with_tail=False) # Remove unneeded tags bad_tags = element.cssselect('.rtMatcher') + \ element.cssselect('.lyricsbreak') for tag in bad_tags: tag.drop_tree() # Remove HTML comments real_string = etree.tostring(element, encoding=unicode) cleaned_html = clean_html(real_string) # -KMS Modification- # Add try/except block to prevent script from crashing when # run from applescript try: print u'{0}'.format( html.fragment_fromstring(cleaned_html).text_content() ).encode('utf-8').strip() except UnicodeError: print u'{0}'.format( html.fragment_fromstring(cleaned_html).text_content() ).encode('utf-8').strip() return 0
def append_body_footer(self, article):
    """
    Checks if the article has any Public Service Announcements and
    if available appends each of them to the body.

    :param article: article dict; reads body_html/description/body_footer
    :return: body with public service announcements.
    """
    # Normalize void <br> tags so the HTML parser below sees valid XHTML.
    try:
        article["body_html"] = article["body_html"].replace("<br>", "<br/>")
    except KeyError:
        pass
    body = ""
    # Text-like items carry content in body_html; media items in description.
    if article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
        body = article.get("body_html", "")
    elif article[ITEM_TYPE] in [CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO]:
        body = article.get("description", "")
    if body and article.get(FORMAT, "") == FORMATS.PRESERVED:
        # Preserved-format output uses CRLF; collapse accidental double-CR.
        body = body.replace("\n", "\r\n").replace("\r\r", "\r")
        parsed = parse_html(body, content="html")
        # Turn each <br> into a literal CRLF carried by its tail, then
        # drop the <br> elements themselves (tails are kept).
        for br in parsed.xpath("//br"):
            br.tail = "\r\n" + br.tail if br.tail else "\r\n"
        etree.strip_elements(parsed, "br", with_tail=False)
        body = etree.tostring(parsed, encoding="unicode")
    if body and article.get("body_footer"):
        footer = article.get("body_footer")
        # Preserved format gets the footer as plain text on its own line.
        if article.get(FORMAT, "") == FORMATS.PRESERVED:
            body = "{}\r\n{}".format(body, get_text(footer))
        else:
            body = "{}{}".format(body, footer)
    return body
def crawl_link_to_index(inp):
    # Fetch a web page, strip <script> content, and index the remaining
    # text in Elasticsearch under the given document id.  (Python 2 code;
    # duplicate of the variant above.)
    idx, link = inp
    print idx, link
    try:
        print link
        response = urllib.urlopen(link)
        # Back off while the upstream server reports 502 Bad Gateway.
        while response.getcode() == 502:
            time.sleep(60)
            response = urllib.urlopen(link)
        page_content = response.read()
        tree = etree.HTML(page_content, parser=html_parser)
        # Remove <script> elements (content and all), then unwrap any
        # stray <script> tags that remain.
        etree.strip_elements(tree, 'script')
        etree.strip_tags(tree, 'script')
        text_data = "\n".join(filter(lambda chunk: chunk != '',
                                     [t.strip() for t in tree.itertext()]))
        page_title = tree.find(".//title").text
        es.index(index=index_name, doc_type="page", id=idx, body={
            "url": link,
            "title": page_title,
            "page_text": text_data
        })
        print "-" * 10
    except Exception, e:
        # Broad catch: log-and-continue crawling style; errors only printed.
        print e
def get_lyrics(self):
    # Download the lyrics page and return song info plus the cleaned
    # lyrics text as a unicode string.  (Python 2: ``unicode`` builtin.)
    response = requests.get(self.url)
    page_html = html.document_fromstring(response.text)
    element = page_html.cssselect(self.CSS_SELECTOR)[0]
    # Replace <br> tags with \n (prepend it with \n and then remove all
    # occurrences of <br>)
    for br in element.cssselect('br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
    etree.strip_elements(element, 'br', with_tail=False)
    # Remove unneeded tags
    bad_tags = element.cssselect('.rtMatcher') + \
        element.cssselect('.lyricsbreak')
    for tag in bad_tags:
        tag.drop_tree()
    # Remove HTML comments
    real_string = etree.tostring(element, encoding=unicode)
    cleaned_html = clean_html(real_string)
    info_output = format_song_info(self.json['artist'], self.json['song'])
    lyric_output = html.fragment_fromstring(cleaned_html).text_content()
    return u'{}{}'.format(info_output, lyric_output)
def longwan_modifier(response, url):
    # Clean a Longwan exchange announcement page and rewrite its
    # attachment download links to direct attachment.jspx URLs.
    body = lxml.html.fromstring(response)
    body.make_links_absolute('http://61.164.128.8:6081/')
    etree.strip_elements(body, "script", "style", "title", 'iframe')
    try:
        element = body.xpath('//div[@class="Content-Main FloatL"]')[0]
    except Exception as e:
        raise e
    # Keep only the table whose text mentions downloads; clear the rest.
    check_table = None
    for table in element.xpath('.//table'):
        check_str = ''.join(table.xpath('.//text()'))
        if '下载' in check_str:
            check_table = table
        else:
            table.clear()
    if check_table is not None:
        # Content id comes from the page URL, e.g. /12345.htm -> 12345.
        cid = re.findall(r'/(\d+)\.htm', url)[0]
        n = len(check_table.xpath('.//tr')) - 1
        attachment_url = 'http://61.164.128.8:6081/lwcms/attachment_url.jspx?cid={}&n={}'.format(cid, n)
        try:
            attachment_urls = requests.get(attachment_url, headers=HEADERS)
        except Exception as e:
            raise e
        i = 0
        # SECURITY NOTE(review): eval() on a remote response body executes
        # arbitrary code if the server is compromised — prefer json.loads
        # or ast.literal_eval here.
        for td_ele, href in zip(check_table.xpath('.//tr//td//a[@title="文件下载"]'), eval(attachment_urls.text)):
            final_href = "http://61.164.128.8:6081/lwcms/attachment.jspx?cid={}&i={}{}".format(cid, i, href)
            td_ele.set('href', final_href)
            i += 1
    content = etree.tostring(element, encoding='utf-8')
    data = content.decode('utf-8')
    return data
def get_lyrics(self):
    # Download the lyrics page and return song info plus the cleaned
    # lyrics text.  (Variant that serializes with an explicit UTF-8 encoding.)
    response = requests.get(self.url)
    page_html = html.document_fromstring(response.text)
    element = page_html.cssselect(self.CSS_SELECTOR)[0]
    # Replace <br> tags with \n (prepend it with \n and then remove all
    # occurrences of <br>)
    for br in element.cssselect('br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
    etree.strip_elements(element, 'br', with_tail=False)
    # Remove unneeded tags
    bad_tags = element.cssselect('.rtMatcher') + \
        element.cssselect('.lyricsbreak')
    for tag in bad_tags:
        tag.drop_tree()
    # Remove HTML comments
    real_string = etree.tostring(element, encoding="UTF-8")
    cleaned_html = clean_html(real_string)
    info_output = format_song_info(self.json['artist'], self.json['song'])
    lyric_output = html.fragment_fromstring(cleaned_html).text_content()
    return u'{}{}'.format(info_output, lyric_output)
def __merge_runs(p):
    # Merge adjacent <w:r> runs with identical run properties inside a
    # paragraph, concatenating their <w:t> texts.  Repeats full passes
    # until no further merge happens.
    while True:
        cont = False
        for run in p.iterchildren(W + 'r'):
            last_run = run.getprevious()
            if last_run is None or last_run.tag != W + 'r':
                continue
            run_props = __get_first_child(run, W + 'rPr')
            last_run_props = __get_first_child(last_run, W + 'rPr')
            # Exactly one of the two runs has properties: not mergeable.
            if (run_props is None and last_run_props is not None) or (
                    run_props is not None and last_run_props is None):
                continue
            # Mergeable when both lack properties, or both serialize to
            # identical XML.
            if (run_props is None and last_run_props is None) or (etree.tostring(
                    run_props, encoding='utf-8', with_tail=False) == etree.tostring(
                    last_run_props, encoding='utf-8', with_tail=False)):
                last_wt = __get_first_child(last_run, W + 't')
                wt = __get_first_child(run, W + 't')
                if last_wt is not None and wt is not None:
                    last_wt.text += wt.text or ''
                    # Leading/trailing spaces are significant in OOXML
                    # only with xml:space="preserve".
                    if len(last_wt.text) > 0 and (last_wt.text[0] == ' ' or
                                                  last_wt.text[-1] == ' '):
                        last_wt.set(XML + 'space', 'preserve')
                    # Renaming (instead of removing in-loop) avoids
                    # mutating the tree while iterating over it.
                    run.tag = 'TO_BE_REMOVED'
                    cont = True
        etree.strip_elements(p, 'TO_BE_REMOVED')
        if not cont:
            break
def xml(self):
    """Filtered and stripped serialized document.

    Lazily computed and cached on self._xml.
    """
    if not hasattr(self, "_xml"):
        try:
            xml = etree.tostring(self.doc.resolved, encoding="utf-8")
            parser = etree.XMLParser(remove_blank_text=True)
            root = etree.fromstring(xml, parser)
            # Keep only the first SummaryMetaData/MainTopics node.
            first = True
            for node in root.findall("SummaryMetaData/MainTopics"):
                if first:
                    first = False
                else:
                    parent = node.getparent()
                    parent.remove(node)
            # Remove nodes matched by the CHANGES XPath expression.
            for node in root.xpath(self.CHANGES):
                parent = node.getparent()
                parent.remove(node)
            # Drop every tag listed in STRIP, then the PdqKey attributes.
            etree.strip_elements(root, with_tail=False, *self.STRIP)
            etree.strip_attributes(root, "PdqKey")
            opts = dict(pretty_print=True, encoding="unicode")
            self._xml = etree.tostring(root, **opts)
        except:
            # Broad catch is deliberate at this boundary: log the failure
            # and bail() reports it to the caller.
            logger.exception("failure processing XML")
            bail("failure processing XML")
    return self._xml
def inject_chapter_metadata(self, bits_xml, chapter, chapter_settings, submission, custom_meta=None):
    """
    Generates the metadata for the chapter

    :param custom_meta: Dict containing entries which will be added as <custom-meta> tags.
    :param bits_xml: ElementTree object containing bits2 meta-data for a submission chapter.
    :param chapter: Chapter row object.
    :param chapter_settings: OMPSettings object containing the chapter settings
    :param submission: Submission row object, to which the chapter belongs.
    :return: Updated ElementTree object with new metadata from OMP db.
    """
    # Chapter sequence is 0-based in the DB; BITS ids/seq are 1-based.
    chapter_no = chapter.chapter_seq + 1
    book_part_xml = bits_xml.xpath('/book-part')[0]
    book_part_xml.set('id', 'b{}_ch_{}'.format(submission.submission_id, chapter_no))
    book_part_xml.set('seq', str(chapter_no))
    # Two-letter language code derived from the submission locale.
    book_part_xml.set(LANG_ATTR, submission.locale[:2])
    # TODO How to distinguish other types?
    book_part_xml.set('book-part-type', 'chapter')
    book_part_meta_xml = book_part_xml.xpath('book-part-meta')[0]
    book_part_meta_xml.xpath('title-group/title')[0].text = chapter_settings.getLocalizedValue(
        'title', submission.locale)
    contrib_group_xml = book_part_meta_xml.xpath('contrib-group')[0]
    for contrib in self.dal.getAuthorsByChapter(chapter.chapter_id):
        contrib_group_xml.append(self.build_contrib_xml(contrib, contrib_group_xml, submission.locale))
    if custom_meta:
        custom_meta_group_xml = book_part_meta_xml.xpath('custom-meta-group')[0]
        # Clear old custom-meta tags before writing the fresh set.
        etree.strip_elements(custom_meta_group_xml, 'custom-meta')
        for meta_name, meta_value in list(custom_meta.items()):
            custom_meta_xml = etree.SubElement(custom_meta_group_xml, 'custom-meta', {'specific-use': meta_name})
            etree.SubElement(custom_meta_xml, 'meta-name').text = meta_name
            etree.SubElement(custom_meta_xml, 'meta-value').text = meta_value
    return bits_xml
def insert_submenus(self, submenus): """Insère les sous-menus dans l'arborescence existante. """ # Effacer les sous-menus actuels, si existants (niveau 1 et 2) if self.menu[self.cursor[-1]].find('Submenu') is not None: etree.strip_elements(self.menu[self.cursor[-1]], 'Submenu') # Popule le nouveau sous-menu (niveau 1) etree.SubElement(self.menu[self.cursor[-1]], 'Submenu') for menu in submenus: # Créé le sous-menu (niveau 2) #logging.debug(u'menu = ({}, {}, {}, {})'.format(menu[0], menu[1], menu[2], menu[3])) etree.SubElement(self.menu[self.cursor[-1]].find('Submenu'), menu[0]) # Nomme le sous-menu (niveau 2) if menu[1] is not None: etree.SubElement(self.menu[self.cursor[-1]].find('Submenu').find(menu[0]), 'Title') try: self.menu[self.cursor[-1]].find('Submenu').find(menu[0]).find('Title').text = menu[1] except ValueError: # All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters self.menu[self.cursor[-1]].find('Submenu').find(menu[0]).find('Title').text = u"Caractère(s) invalide(s)" # Affecte l'action et la commande du sous-menu (niveau 2) if menu[2] is not None and menu[3] is not None: etree.SubElement(self.menu[self.cursor[-1]].find('Submenu').find(menu[0]), menu[2]) self.menu[self.cursor[-1]].find('Submenu').find(menu[0]).find(menu[2]).text = menu[3]
def __merge_runs(p):
    # Merge adjacent <a:r> runs with identical run properties inside a
    # paragraph, concatenating their <a:t> texts.  (DrawingML variant of
    # the WordprocessingML helper above.)  Repeats until no merge happens.
    while True:
        cont = False
        for run in p.iterchildren(A + 'r'):
            last_run = run.getprevious()
            if last_run is None or last_run.tag != A + 'r':
                continue
            run_props = __get_first_child(run, A + 'rPr')
            last_run_props = __get_first_child(last_run, A + 'rPr')
            # Exactly one of the two runs has properties: not mergeable.
            if (run_props is None and last_run_props is not None) or (
                    run_props is not None and last_run_props is None):
                continue
            # Mergeable when both lack properties, or both serialize to
            # identical XML.
            if (run_props is None and last_run_props is None) or (
                    etree.tostring(run_props, encoding='utf-8', with_tail=False) ==
                    etree.tostring(last_run_props, encoding='utf-8', with_tail=False)):
                last_wt = __get_first_child(last_run, A + 't')
                wt = __get_first_child(run, A + 't')
                if last_wt is not None and wt is not None:
                    last_wt.text += wt.text or ''
                    # Preserve significant leading/trailing spaces.
                    if len(last_wt.text) > 0 and (last_wt.text[0] == ' ' or
                                                  last_wt.text[-1] == ' '):
                        last_wt.set(XML + 'space', 'preserve')
                    # Renaming (instead of removing in-loop) avoids
                    # mutating the tree while iterating over it.
                    run.tag = 'TO_BE_REMOVED'
                    cont = True
        etree.strip_elements(p, 'TO_BE_REMOVED')
        if not cont:
            break
def getPage(id):
    # Fetch a Digital Spy article page and extract its metadata and body
    # text.  (Python 2: urllib2 and print statements.)
    url = "http://www.digitalspy.co.uk/news/a%s" % id
    try:
        html = urllib2.urlopen(url).read()
    except IOError:
        print "Skipping: %s" % url
        return None
    html = clean_html(html)
    # Convert break markup to newlines before parsing.
    html = BREAKS.sub("\n", html)
    doc = fromstring(html)
    article = doc.cssselect("div.article_body")[0]
    # Drop inline image containers, then any remaining bare <img> elements.
    for image in article.xpath(
            '//div[@class="image"]|//div[@class="imgcaption"]'):
        image.getparent().remove(image)
    strip_elements(article, 'img')
    return {
        'url': url,
        'title': doc.cssselect("div.article_header h1")[0].text_content().encode(
            'utf-8'),
        'published':
            doc.cssselect("span.time")[0].text_content().encode('utf-8'),
        'authors': ",".join(editor.text_content().encode('utf-8')
                            for editor in doc.cssselect("span.editors a")),
        'text': article.text_content().strip().encode('utf-8')
    }
def clean_image_block(block_tree):
    """
    Cleans up an image block to assure that it has the correct structure.

    Keeps only the first <img> (plus its wrapping <a>, if any) and the
    first <span> caption; every other descendant is emptied and unwrapped.
    Blocks without an image are marked invalid and stripped.
    """
    image = None
    img_wrapper = None
    caption = None
    image_found = False
    caption_found = False
    ## We get all the block descendants using lxml (should be "depth-first")
    ## in order to get image and caption elements, if any.
    for des in block_tree.iterdescendants():
        ## We only take the first img element found.
        if des.tag == 'img' and not image_found:
            image_found = True
            ## We set the image element.
            image = des
            ## If the img element is wrapped by a link
            ## we set the image_wrapper too.
            if des.getparent().tag == 'a':
                img_wrapper = des.getparent()
                ## If the class has been modified we put the correct one.
                img_wrapper.attrib['class'] = 'image-link'
        ## We only take the first span element (caption) found.
        if des.tag == 'span' and not caption_found:
            caption_found = True
            ## We set the caption element.
            caption = des
            ## If the class has been modified we put the correct one.
            caption.attrib['class'] = 'image-caption'
    ## If the image block has no image inside
    ## then it's invalid and we remove it.
    if image is None:
        block_tree.tag = 'invalid_image_block'
        ## NOTE(review): lxml's strip_elements never removes the element it
        ## is called on, only matching descendants — this presumably relies
        ## on the caller stripping the renamed tag from the parent tree;
        ## confirm against the call sites.
        etree.strip_elements(block_tree, 'invalid_image_block')
        return
    ## Sanitazing the caption, we strip out every element inside the span
    ## preserving the content and thus all the texts present.
    if caption is not None:
        etree.strip_tags(caption, '*')
    ## We go through the descendants again to mark invalid elements.
    for des in block_tree.iterdescendants():
        ## Invalid elements are all those elements which are neither the image
        ## nor the caption, nor the image_wrapper.
        if des is image or des is img_wrapper or des is caption:
            continue
        ## We remove invalid tags texts.
        des.text = ''
        ## We mark invalid tags for removal.
        des.tag = 'tag_to_be_stripped_out'
    ## We finally strip out tags marked as invalid
    ## now the image block should have the correct structure.
    etree.strip_tags(block_tree, 'tag_to_be_stripped_out')
def extract_content(tree, include_tables=False, deduplicate=False, config=None):
    '''Find the main content of a page using a set of XPath expressions,
    then extract relevant elements, strip them of unwanted subparts and
    convert them.

    Returns a tuple (result_body, extracted_text, text_length, sure_thing),
    where sure_thing is False when the wild-paragraph fallback was used.
    '''
    sure_thing = False
    result_body = etree.Element('body')
    # iterate over the candidate container expressions until one yields content
    for expr in BODY_XPATH:
        # select tree if the expression has been found
        subtree = tree.xpath(expr)
        if not subtree:
            continue
        subtree = subtree[0]
        # prune
        subtree = discard_unwanted(subtree)
        # remove elements by link density
        subtree = delete_by_link_density(subtree, 'div', backtracking=True)
        subtree = delete_by_link_density(subtree, 'list', backtracking=False)
        subtree = delete_by_link_density(subtree, 'p', backtracking=False)
        # define iteration strategy
        potential_tags = set(TAG_CATALOG)  # + 'span'?
        if include_tables is True:
            potential_tags.add('table')
            # drop tables that are mostly links
            for elem in subtree.iter('table'):
                if link_density_test_tables(elem) is True:
                    elem.getparent().remove(elem)
        # skip if empty tree
        if len(subtree) == 0:
            continue
        # no paragraphs containing text: widen the search to divs as well
        if not subtree.xpath('//p//text()'):
            potential_tags.add('div')
        LOGGER.debug(sorted(potential_tags))
        etree.strip_tags(subtree, 'link', 'span')  # 'a',
        # etree.strip_tags(subtree, 'lb') # BoingBoing-Bug
        # extract content
        # list(filter(None.__ne__, processed_elems))
        result_body.extend([e for e in
                            [handle_textelem(e, potential_tags, deduplicate, config)
                             for e in subtree.xpath('.//*')]
                            if e is not None])
        # remove trailing titles
        while len(result_body) > 0 and result_body[-1].tag in ('fw', 'head'):  # and result_body[-1].tail is None:
            result_body[-1].getparent().remove(result_body[-1])
        # exit the loop if the result has children
        if len(result_body) > 1:  # try to change this to 0 or 2
            LOGGER.debug(expr)
            break
    temp_text = trim(' '.join(result_body.itertext()))
    # try parsing wild <p> elements if nothing found or text too short
    if len(result_body) == 0 or len(temp_text) < config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE'):
        result_body = recover_wild_paragraphs(tree, result_body, deduplicate=deduplicate, config=config)
        temp_text = trim(' '.join(result_body.itertext()))
    else:
        sure_thing = True
    # filter output
    etree.strip_elements(result_body, 'done')
    etree.strip_tags(result_body, 'div')
    # return
    return result_body, temp_text, len(temp_text), sure_thing
def remove_elements(tei, params):
    """Strip disabled elements from a TEI tree.

    Every key in *params* whose value is False names a TEI element to be
    removed together with its content (tail text is kept).  Returns the
    modified tree.
    """
    # TEI namespace used to qualify the element names below.
    tei_ns = '{http://www.tei-c.org/ns/1.0}'
    # Unpack items directly instead of the old param[0]/params[param[0]]
    # double lookup; the unused `namespaces` local was dropped.
    for name, enabled in params.items():
        # `== False` kept on purpose: falsy-but-not-False values (0, '')
        # matched the original comparison the same way.
        if enabled == False:
            etree.strip_elements(tei, tei_ns + name, with_tail=False)
    return tei
def internal_lemmas(self):
    """
    Return the lemma nodes (<lm>) of this block, excluding any that
    live under a <lemVersions> element.
    """
    # Strip on a throwaway copy so self.node is never mutated.
    scratch = copy.deepcopy(self.node)
    etree.strip_elements(scratch, 'lemVersions')
    return scratch.findall('.//lm')
def htmlTree(self):
    """Parse the current HTML selection into an lxml tree with ruby
    furigana markup removed; returns None for an empty selection.
    """
    markup = unicode(self.selectedHtml())
    if markup == '':
        return None
    parsed = lxml.html.fromstring(markup)
    # Drop the readings entirely, then unwrap the ruby containers.
    etree.strip_elements(parsed, 'rt')
    etree.strip_tags(parsed, 'ruby', 'rb')
    return parsed
def parse_body_text(self, response):
    """Yield the page body as a single whitespace-normalised text string,
    after stripping comments, scripts, head content and links.
    """
    document = lh.fromstring(response.body)
    le.strip_elements(document, le.Comment, 'script', 'head', 'a')
    text = lh.tostring(document, method="text", encoding=unicode)
    # Remove newlines first, then collapse remaining whitespace runs.
    text = re.sub(r'\s+', ' ', text.strip().replace('\n', ''))
    yield text
def xml_to_articles(filepath):
    """
    Parses an xml and returns plain text versions of the individual chapters
    (i.e. articles, reviews, ...) in the file.
    Only articles are kept that are:
        - recognized as 'nl' by the language detector
        - longer than 500 characters
    (The old docstring claimed a .30 probability threshold and a 200-char
    minimum, which the code never implemented.)
    Returns None when the file cannot be parsed.
    """
    articles = []
    # Close the handle deterministically (the old bare codecs.open leaked it).
    with codecs.open(os.path.abspath(filepath), 'r', 'utf8') as infile:
        xml_str = unicode(BeautifulSoup(infile.read()))  # remove entities from the xml
    # get rid of nasty pagebreak (pb), breaking up tokens across pages:
    xml_str = re.sub(nasty_pagebreaks, '', xml_str)
    # attempt to parse the tree:
    try:
        tree = etree.fromstring(xml_str)
    except etree.XMLSyntaxError:
        return None
    # remove cf-elements (they don't contain actual text):
    for element in tree.xpath(".//cf"):
        element.getparent().remove(element)
    # individual articles etc. are div's with the type-attribute 'chapter':
    chapter_nodes = [node for node in tree.findall('.//div')
                     if node.attrib.get('type') == 'chapter']
    for chapter_node in chapter_nodes:
        # all text in the articles is contained under p-elements:
        paragraphs = []
        for p_node in chapter_node.findall('.//p'):
            # remove elements that contain meta text, in one pass
            # (note that we exclude all notes!):
            etree.strip_elements(p_node, 'note', 'figure', 'table',
                                 with_tail=False)
            # collect the actual text (extra whitespace added to be safe):
            paragraphs.append("".join(p_node.itertext()) + " ")
        # collapse all whitespace to single spaces:
        article_text = re.sub(whitespace, ' ', "".join(paragraphs)).strip()
        if len(article_text) > 500 and detect(article_text) == 'nl':
            articles.append(article_text)
    return articles
def xml_to_articles(filepath):
    """
    Parses an xml and returns plain text versions of the individual chapters
    (i.e. articles, reviews, ...) in the file.
    Only articles are kept that are:
        - recognized as 'nl' by the language detector
        - longer than 500 characters
    (The old docstring claimed a .30 probability threshold and a 200-char
    minimum, which the code never implemented.)
    Returns None when the file cannot be parsed.
    """
    articles = []
    # Close the handle deterministically (the old bare codecs.open leaked it).
    with codecs.open(os.path.abspath(filepath), 'r', 'utf8') as infile:
        xml_str = unicode(BeautifulSoup(infile.read()))  # remove entities from the xml
    # get rid of nasty pagebreak (pb), breaking up tokens across pages:
    xml_str = re.sub(nasty_pagebreaks, '', xml_str)
    # attempt to parse the tree:
    try:
        tree = etree.fromstring(xml_str)
    except etree.XMLSyntaxError:
        return None
    # remove cf-elements (they don't contain actual text):
    for element in tree.xpath(".//cf"):
        element.getparent().remove(element)
    # individual articles etc. are div's with the type-attribute 'chapter':
    chapter_nodes = [node for node in tree.findall('.//div')
                     if node.attrib.get('type') == 'chapter']
    for chapter_node in chapter_nodes:
        # all text in the articles is contained under p-elements:
        paragraphs = []
        for p_node in chapter_node.findall('.//p'):
            # remove elements that contain meta text, in one pass
            # (note that we exclude all notes!):
            etree.strip_elements(p_node, 'note', 'figure', 'table',
                                 with_tail=False)
            # collect the actual text (extra whitespace added to be safe):
            paragraphs.append("".join(p_node.itertext()) + " ")
        # collapse all whitespace to single spaces:
        article_text = re.sub(whitespace, ' ', "".join(paragraphs)).strip()
        if len(article_text) > 500 and detect(article_text) == 'nl':
            articles.append(article_text)
    return articles
def tei5reader_fulldocs(inpath, outfolder):
    """Script for reading selected text from TEI P5 files.

    Globs *inpath*, strips markup/notes/heads from each TEI file, and
    writes the body text of each document to <outfolder>/<name>.txt.
    """
    print("\nLaunched tei5reader_fulldocs.")

    import re
    import os
    import glob
    from lxml import etree

    if not os.path.exists(outfolder):
        os.makedirs(outfolder)

    # Loop variable renamed: `file` shadowed the builtin.  The old
    # `with open(file, "r"):` opened a handle that was never used
    # (etree.parse reads the file itself), so it has been removed.
    for teifile in glob.glob(inpath):
        filename = os.path.basename(teifile)[:-4]  # strip the ".xml" suffix
        # A recovering parser helps with parsing errors in the input.
        parser = etree.XMLParser(recover=True)
        xml = etree.parse(teifile, parser)
        # The TEI P5 files do have a default namespace.
        namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
        # Removes tags but conserves their text content.
        etree.strip_tags(xml, "{http://www.tei-c.org/ns/1.0}hi")
        # Removes elements and their text content.
        etree.strip_elements(xml, "{http://www.tei-c.org/ns/1.0}note")
        etree.strip_elements(xml, "{http://www.tei-c.org/ns/1.0}head")
        # XPath selecting the body text only.
        xp_bodytext = "//tei:body//text()"
        text = xml.xpath(xp_bodytext, namespaces=namespaces)
        text = "\n".join(text)
        # Some cleaning up.
        # NOTE(review): this removes every plain space — presumably the
        # pattern was once a non-breaking space; confirm before changing.
        text = re.sub(" ", "", text)
        text = re.sub("\n{1,6}", " ", text)
        text = re.sub("\n \n", "\n", text)
        text = re.sub("\t\n", "", text)
        outtext = str(text)
        # os.path.join works whether or not outfolder has a trailing slash
        # (plain concatenation produced "outfoldername.txt" without one).
        outfile = os.path.join(outfolder, filename + ".txt")
        with open(outfile, "w") as output:
            output.write(outtext)
    print("Done.")
def tei5reader_fulldocs(inpath, outfolder):
    """Script for reading selected text from TEI P5 files.

    Globs *inpath*, strips markup/notes/heads from each TEI file, and
    writes the body text of each document to <outfolder>/<name>.txt.
    """
    print("\nLaunched tei5reader_fulldocs.")

    import re
    import os
    import glob
    from lxml import etree

    if not os.path.exists(outfolder):
        os.makedirs(outfolder)

    # Loop variable renamed: `file` shadowed the builtin.  The old
    # `with open(file, "r"):` opened a handle that was never used
    # (etree.parse reads the file itself), so it has been removed.
    for teifile in glob.glob(inpath):
        filename = os.path.basename(teifile)[:-4]  # strip the ".xml" suffix
        # A recovering parser helps with parsing errors in the input.
        parser = etree.XMLParser(recover=True)
        xml = etree.parse(teifile, parser)
        # The TEI P5 files do have a default namespace.
        namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
        # Removes tags but conserves their text content.
        etree.strip_tags(xml, "{http://www.tei-c.org/ns/1.0}hi")
        # Removes elements and their text content.
        etree.strip_elements(xml, "{http://www.tei-c.org/ns/1.0}note")
        etree.strip_elements(xml, "{http://www.tei-c.org/ns/1.0}head")
        # XPath selecting the body text only.
        xp_bodytext = "//tei:body//text()"
        text = xml.xpath(xp_bodytext, namespaces=namespaces)
        text = "\n".join(text)
        # Some cleaning up.
        # NOTE(review): this removes every plain space — presumably the
        # pattern was once a non-breaking space; confirm before changing.
        text = re.sub(" ", "", text)
        text = re.sub("\n{1,6}", " ", text)
        text = re.sub("\n \n", "\n", text)
        text = re.sub("\t\n", "", text)
        outtext = str(text)
        # os.path.join works whether or not outfolder has a trailing slash
        # (plain concatenation produced "outfoldername.txt" without one).
        outfile = os.path.join(outfolder, filename + ".txt")
        with open(outfile, "w") as output:
            output.write(outtext)
    print("Done.")
def __symbolize_mathml(self, element, start_id):
    """Replace each MathML <math> descendant of *element* with a textual
    placeholder ("MATH_<pname>_<n>") prepended to its tail, remember the
    original elements' ids, then strip the <math> elements themselves.

    Returns (collected_ids, next_available_id).
    """
    collected = []
    math_nodes = element.xpath(".//*[local-name() = 'math']")
    for offset, node in enumerate(math_nodes):
        placeholder = "MATH_%s_%s" % (self.__pname, start_id + offset)
        existing_tail = node.tail if node.tail is not None else ""
        node.tail = "%s%s" % (placeholder, existing_tail)
        collected.append(node.attrib["id"])
    start_id += len(math_nodes)
    # Remove the math elements but keep the placeholder tails just written.
    etree.strip_elements(element,
                         "{http://www.w3.org/1998/Math/MathML}math",
                         with_tail=False)
    return collected, start_id
def handle_metypesetdeleted(self, keep):
    """Process <meTypesetDeleted> markers in the document tree.

    When *keep* is truthy the wrapper tags are unwrapped (their content
    survives); otherwise the elements and their content are removed,
    keeping any tail text.  The tree is saved back afterwards.
    """
    deleted_tag = '{http://www.tei-c.org/ns/1.0}meTypesetDeleted'
    tree = self.load_dom_tree()
    if keep:
        # Unwrap: drop the tag but keep its text and children.
        etree.strip_tags(tree, deleted_tag)
    else:
        # Remove the element together with its content; with_tail=False
        # preserves the text that followed the element.
        etree.strip_elements(tree, deleted_tag, with_tail=False)
    self.save_tree(tree)
    self.debug.print_debug(self, u'Handled deleted text')
def root(self):
    """Fetch and parse the document XML, cleaned of unwanted markup.

    The parsed tree is cached on the instance; subsequent calls return
    the cached value without touching the database.
    """
    if hasattr(self, "_root"):
        return self._root
    query = db.Query("document", "xml")
    query.where(query.Condition("id", self.cdr_id))
    document_xml = query.execute(self.__control.cursor).fetchone().xml
    parsed = etree.fromstring(document_xml.encode("utf-8"))
    # Drop whole elements first, then unwrap the remaining markup.
    etree.strip_elements(parsed, *self.DROP, with_tail=False)
    etree.strip_tags(parsed, *self.STRIP)
    self._root = parsed
    return self._root
def find_content_blocks(tree, min_length=None):
    """
    Iterate over content blocks (russian version)

    Returns the list of text blocks at least *min_length* characters long
    (no length filter when min_length is None) that pass the trash-ratio
    and word-length heuristics.
    """
    from lxml.html import tostring
    from lxml.etree import strip_tags, strip_elements, Comment

    # First, make a copy of DOM-tree to not harm external code
    tree = deepcopy(tree)
    # Completely remove content of following tags
    nondata_tags = ['head', 'style', 'script']
    strip_elements(tree, *nondata_tags)
    # Remove comment nodes (keep tail text)
    strip_tags(tree, Comment)
    # Unwrap links, inline and media markup, keeping their text.
    # ('a' was previously stripped twice — once alone and once in this
    # list — the redundant call has been removed.)
    inline_tags = ('br', 'hr', 'p', 'b', 'i', 'strong', 'em', 'a',
                   'span', 'font')
    strip_tags(tree, *inline_tags)
    media_tags = ('img',)
    strip_tags(tree, *media_tags)

    body = tostring(tree, encoding='utf-8').decode('utf-8')
    # Normalize spaces
    body = normalize_space(body)
    # Collapse every remaining tag to an empty '<>' marker so the text
    # between markers can be matched below.
    re_tag = re.compile(r'<[^>]+>')
    body = re_tag.sub(r'<>', body)

    # Find text blocks between the markers.
    block_rex = re.compile(r'[^<>]+')
    blocks = []
    for match in block_rex.finditer(body):
        block = match.group(0)
        if min_length is None or len(block) >= min_length:
            # Skip blocks dominated by punctuation/markup noise.
            if _trash_ratio(block) < 0.05:
                words = block.split()
                # Very long "words" are almost certainly not prose.
                if not any(len(x) > 50 for x in words):
                    blocks.append(block)
    return blocks
def test(self, recipients, audId=1):
    """POST a test blast to *recipients* for audience *audId*.

    *recipients* may be a comma-separated string or any iterable of
    address strings.  Returns the raw connection response.
    """
    url = '/audiences/%s/blasts/test.xml' % str(audId)
    headers = {'Content-type': 'application/xml'}
    # Accept either a ready-made string or an iterable of addresses.
    # isinstance() also covers str subclasses, unlike the old type() check.
    if not isinstance(recipients, str):
        recipients = ','.join(recipients)
    # Replace audience targeting with the explicit recipient list.
    toNode = self.xml.xpath('/blast/to')[0]
    etree.strip_elements(toNode, 'audience-id', 'include-lists')
    toNode.text = recipients
    # send request
    data = self.tostring()
    return connection._request(url, 'POST', data, headers)
def parseQuestionContentToList(content):
    """Tokenize an HTML question body into lowercase word tokens.

    <code> blocks are dropped entirely, all remaining markup is unwrapped,
    and tokens containing no alphanumeric character are filtered out.
    """
    root = etree.HTML(content)
    # Drop code content entirely; unwrap every other tag.
    etree.strip_elements(root, 'code', with_tail=False)
    etree.strip_tags(root, '*')
    # Matches any token containing at least one alphanumeric character.
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    # NOTE(review): the slicing/replace chain below peels the serialized
    # wrapper out of the bytes repr; fragile, but kept as-is.  The stray
    # debug print() that used to sit here has been removed.
    text = str(etree.tostring(root, pretty_print=True)[10:-11])[1:].lower()\
        .replace('\\n', ' ')\
        .replace("\\", '')\
        .replace("?", "")
    tokens = nltk.word_tokenize(text)
    return [w for w in tokens if nonPunct.match(w)]
def reformatVerseElement(element, keepTags=('q',)):
    """
    Updates various values of the specified verse Element.

    Strips discardable elements, unwraps every tag not in *keepTags*,
    turns each <q>'s 'who' attribute into a CSS class, and logs any
    surviving tag that is not in knownTags.
    """
    etree.strip_elements(element, *discardTags, with_tail=False)
    # Collect the tags to unwrap before mutating the tree.  A set avoids
    # redundant strip_tags calls for duplicates, and the tuple default
    # replaces the old mutable-list default argument.
    doomed = {child.tag for child in element.iter() if child.tag not in keepTags}
    for tag in doomed:
        etree.strip_tags(element, tag)
    for child in element.iter('q'):
        child.set('class', child.get('who', 'unknown').lower())
    # Log anything we did not expect to survive the cleanup.
    for child in element.iter():
        if child.tag not in knownTags:
            logger.debug('unhandled tag: %s %s', child.tag, etree.tostring(child))
def emit_tag(self, tag=None, as_initiator=False):
    """
    Emit a subtree for an entity, using the supplied tag for the root
    element. If the identity is the Initiator's, emit the CUC as well
    (and force the root tag to 'InitgPty').
    """
    if as_initiator:
        tag = 'InitgPty'
    root = etree.Element(tag)
    # Name
    if hasattr(self, 'name'):
        name = etree.SubElement(root, 'Nm')
        name.text = self.name
    # Address (never emitted for the initiator)
    if hasattr(self, 'address') and not as_initiator:
        root.append(self.address.__tag__())
    # ID
    idtag = etree.SubElement(root, 'Id')
    if self.private:
        id_container = 'PrvtId'
    else:
        id_container = 'OrgId'
    orgid = etree.SubElement(idtag, id_container)
    #### Remove the Id/OrgId node for the creditor
    # NOTE(review): when not as_initiator the Id subtree is stripped here,
    # yet the now-detached `orgid` may still receive tax-code/code children
    # below, which therefore never reach the output — confirm intended.
    if not as_initiator:
        etree.strip_elements(root, 'Id', with_tail=False)
    # CUC (mandatory for the initiator)
    if as_initiator:
        if not hasattr(self, 'cuc'):
            raise MissingCUCError
        orgid.append(emit_id_tag(self.cuc, 'CBI'))
    # Tax code
    if not as_initiator:
        if hasattr(self, 'cf'):
            orgid.append(emit_id_tag(self.cf, 'CBI'))
    # if hasattr(self, 'cf'):
    #     orgid.append(emit_id_tag(self.cf, 'ADE'))
    if hasattr(self, 'code'):
        orgid.append(emit_id_tag(self.code, None))
    # Country of residence (never emitted for the initiator)
    if not as_initiator and hasattr(self, 'country'):
        etree.SubElement(root, 'CtryOfRes').text = self.country
    return root
def getReleaseNoteDetail(tDetail): thisScreen = [] opener = urllib2.build_opener() opener.addheader = [('User-Agent','Mozilla/5.0')] resp = opener.open(tDetail) if resp.code == 200: data = resp.read() elif resp.code == 404: print "Page do not exist" exit() else: print "Can not open page" exit() parser = etree.HTMLParser() tree = etree.parse(StringIO(data), parser) comments = tree.xpath('//comment()') for c in comments: p = c.getparent() p.remove(c) #etree.strip_tags(tree,'p') etree.strip_tags(tree,'i') etree.strip_tags(tree,'a') etree.strip_elements(tree,'iframe') result = etree.tostring(tree.getroot(), pretty_print=True, method="html", encoding='utf-8') mTitle = '' titles = tree.xpath("//h1[@id='id_title']") for title in titles: if title.text is not None: mTitle = title.text break Screen2 = [] Screen2.append(clrTx(mTitle,'YELLOW')) Screen2.append(repeatStr('-',78)) articles = tree.xpath("//div[@class='articleBody']/p") for article in articles: if article.text is not None: for line in _wrap.wrap(article.text): Screen2.append(' '+line) Screen2.append(repeatStr('-',78)) option = '' while option is not 'b': for item in Screen2: print item print "b" option = raw_input()
def eat(self, fname):
    """Parse *fname*, scrub the tree of comments/helpers/text content,
    tag every element with its source filename, and append the matching
    elements to self.woot.
    """
    root = etree.parse(fname, self._p).getroot()
    etree.strip_elements(root, "comment", "code-helper")
    etree.strip_tags(root, "virtual-methods")
    for node in root.iter():
        # Residual XML comment nodes are dropped outright; real elements
        # have all their text erased and their origin recorded.
        if isinstance(node, etree._Comment):
            node.getparent().remove(node)
            continue
        node.tail = None
        node.text = None
        try:
            node.set("xmlfilename", fname)
        except TypeError:
            # Some node types do not accept attributes; skip them.
            pass
    self.woot.extend(etx('*')(root))
def parse_text(wdir, txtFolder):
    """
    This function opens the file, reads it as xml and deletes some elements.
    The other functions of this file use it. For example:
        content = parse_text(wdir, txtFolder)

    Returns the remaining text content of the document as a single string.
    """
    # Parse the text as xml (renamed from `file`, which shadowed the builtin).
    filepath = wdir + txtFolder + ".xml"
    xml_tree = etree.parse(filepath)
    # Namespaces are specified
    specific_namespaces = {'tei': 'http://www.tei-c.org/ns/1.0',
                           'xi': 'http://www.w3.org/2001/XInclude'}
    # Back, front, teiHeader and heads are deleted — strip_elements accepts
    # several tag names at once, so a single pass replaces four calls.
    etree.strip_elements(xml_tree,
                         "{http://www.tei-c.org/ns/1.0}back",
                         "{http://www.tei-c.org/ns/1.0}front",
                         "{http://www.tei-c.org/ns/1.0}teiHeader",
                         "{http://www.tei-c.org/ns/1.0}head",
                         with_tail=False)
    # Only text is kept and returned as a string
    content = xml_tree.xpath("//text()", namespaces=specific_namespaces)
    return ''.join(content)
def format(self, article, subscriber, codes=None):
    """Render *article* as an NITF document for *subscriber*.

    Returns a one-element list holding the publish sequence number and
    the formatted payload; any failure is wrapped in a FormatterError.
    """
    try:
        subscribers_service = superdesk.get_resource_service('subscribers')
        pub_seq_num = subscribers_service.generate_sequence_number(subscriber)
        nitf = self.get_nitf(article, subscriber, pub_seq_num)
        # The body.end element is not wanted in the output document.
        strip_elements(nitf, 'body.end')
        nitf_string = etree.tostring(nitf, encoding='utf-8').decode()
        headers = ['<?xml version="1.0" encoding="UTF-8"?>',
                   '<!-- <!DOCTYPE nitf SYSTEM "./nitf-3-3.dtd"> -->']
        payload = '{}\r\n{}'.format("\r\n".join(headers), nitf_string)
        payload = payload.replace(' \n', self.line_ender)
        return [{'published_seq_num': pub_seq_num,
                 'formatted_item': payload}]
    except Exception as ex:
        raise FormatterError.nitfFormatterError(ex, subscriber)
def __init__(self, url, ppmm):
    """Load an SVG file, remember its layer groups, and build a scaled,
    group-free copy of the tree.

    url  -- path to the SVG file
    ppmm -- pixels per millimetre used to scale the width/height attributes
    """
    # Close the file promptly instead of leaking the handle.
    with open(url, 'rb') as svg_file:
        self.tree = etree.fromstring(svg_file.read())
    self.layers = list(self.tree.iter(G))
    self.ppmm = ppmm
    bt = copy.deepcopy(self.tree)
    bt.attrib['width'] = str(float(bt.attrib['width']) * self.ppmm)
    bt.attrib['height'] = str(float(bt.attrib['height']) * self.ppmm)
    self.blank_tree = bt
    # Bug fix: strip_elements takes tag names as varargs; the old call
    # passed the one-element list [G] as a single (non-matching) tag.
    etree.strip_elements(self.blank_tree, G)