def crawl_link_to_index(inp):
    idx, link = inp
    print idx, link
    try:
        print link
        response = urllib.urlopen(link)
        while response.getcode() == 502:
            time.sleep(60)
            response = urllib.urlopen(link)
        page_content = response.read()
        tree = etree.HTML(page_content, parser=html_parser)
        etree.strip_elements(tree, 'script')
        etree.strip_tags(tree, 'script')
        text_data = "\n".join(filter(lambda chunk: chunk != '',
                                     [t.strip() for t in tree.itertext()]))
        page_title = tree.find(".//title").text
        es.index(index=index_name, doc_type="page", id=idx, body={
            "url": link,
            "title": page_title,
            "page_text": text_data
        })
        print "-" * 10
    except Exception, e:
        print e

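# A standalone sketch (lxml only) of why the function above calls
# strip_elements before strip_tags: strip_elements drops <script> together
# with its text, whereas strip_tags alone would merely unwrap the tag and
# leak "var x = 1;" into the extracted page text.
from lxml import etree

page = etree.HTML("<p>before<script>var x = 1;</script>after</p>")
etree.strip_elements(page, 'script', with_tail=False)  # drop element + content
print(etree.tostring(page))   # ...<p>beforeafter</p>...

page = etree.HTML("<p>before<script>var x = 1;</script>after</p>")
etree.strip_tags(page, 'script')                       # unwrap tag, keep text
print(etree.tostring(page))   # ...<p>beforevar x = 1;after</p>...
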
def transform(self, data):
    raw, xml = data
    self._tag_texts(xml)
    self._identify_extra_p_tags(xml)
    self._tag_text_in_body(xml)
    etree.strip_tags(xml, "REMOVE_P")
    return data

def email(self, alerts):
    node = alert_node()
    dismisseds = [
        a.message_id
        for a in mAlert.objects.filter(dismiss=True, node=node)
    ]
    msgs = []
    for alert in alerts:
        if alert.getId() not in dismisseds:
            """
            This is all to allow <a> tags in alert messages. We need to
            strip out all the tags so we can send a plain text email.
            """
            msg = unicode(alert).encode('utf8')
            msgnode = etree.fromstring('<msg>{}</msg>'.format(msg))
            for i in msgnode.xpath('//a'):
                new = etree.Element('span')
                new.text = '{} ({})'.format(i.text, i.attrib['href'])
                msgnode.replace(i, new)
            etree.strip_tags(msgnode, '*')
            msgs.append(msgnode.text)
    if len(msgs) == 0:
        return
    hostname = socket.gethostname()
    send_mail(
        subject='%s: %s' % (
            hostname,
            _("Critical Alerts").encode('utf8'),
        ),
        text='\n'.join(msgs))

def write_xml_file(self, xml_file, root):
    tree = root.getroottree()
    # Strip the merge tag
    etree.strip_tags(tree, 'merge')
    # The context manager closes the file; no explicit close() is needed
    with open(xml_file, 'w+') as f:
        f.write(etree.tostring(tree, pretty_print=True, encoding='utf-8'))

def _remove_element_or_comment(node):
    parent = node.getparent()
    if parent is not None:
        if node.tail:
            text = (node.text or "").strip() + node.tail
            previous = node.getprevious()
            if previous is not None:
                if not previous.tail:
                    previous.tail = ""
                previous.tail += text
            else:
                if not parent.text:
                    parent.text = ""
                parent.text += text
        removed = node.tag
        try:
            node.tag = "REMOVE_NODE"
        except AttributeError:
            parent.remove(node)
        else:
            if node.getchildren():
                etree.strip_tags(parent, "REMOVE_NODE")
            else:
                parent.remove(node)
        return removed

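# A minimal self-contained sketch of the rename-then-strip pattern used in
# _remove_element_or_comment above: renaming a node to a sentinel tag and
# stripping that tag unwraps the node while keeping its children and text.
# The sentinel name "REMOVE_NODE" is assumed not to occur in real input.
from lxml import etree

def unwrap(node):
    """Remove `node` from the tree but keep its children and text."""
    parent = node.getparent()
    if parent is None:
        return
    node.tag = "REMOVE_NODE"  # sentinel tag
    etree.strip_tags(parent, "REMOVE_NODE")

root = etree.fromstring("<doc><wrap>a<b>c</b>d</wrap></doc>")
unwrap(root.find("wrap"))
print(etree.tostring(root))  # b'<doc>a<b>c</b>d</doc>'
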
def parse_vote_page(self, response):
    lxs = LxmlSelector(response)
    item = response.meta["item"]
    etree.strip_tags(lxs.xmlNode, "b", "font", "i", "sup")
    meta = self.meta_as_dict(lxs)
    date_txt = lxs.xpath("//text()").re(r"[DUdu\s:]+(\d+/\d+/\d+)")
    if date_txt:
        item["date"] = datetime.strptime(date_txt[0], "%d/%m/%Y").isoformat()
    else:
        page_text = "".join(lxs.xpath("//text()").extract())
        page_text = page_text.replace(u"\u00A0", " ")
        page_text = page_text.encode("utf-8")
        date_txt = re.search(r"du[:\s]+(\d+)[er]*\s+(.+?)\s+(\d+)", page_text)
        if date_txt:
            date_txt = " ".join(date_txt.groups())
            item["date"] = datetime.strptime(date_txt, "%d %B %Y").isoformat()
        else:
            # no recognizable date found on the page
            raise ValueError("unable to parse vote date")
    if lxs.css("#analyse p.nomgroupe"):
        item["votes"] = self.parse_vote_first_layout(lxs, response)
    else:
        # 2nd layout!
        item["votes"] = self.parse_vote_second_layout(lxs)
    if item.get("file_href"):
        yield Request(
            url=item["file_href"],
            callback=self.parse_info_page,
            meta={"item": item},
        )
    else:
        yield item

def reprocess_definition(self, definition):
    etree.strip_tags(definition, self.STRIP_TAGS_LIST)
    definition = etree.tostring(definition, encoding='unicode')
    definition = definition.replace('<div class="rakibolana-definition">', '')
    definition = definition.replace('\n Ahitsio\n </div>\n\n', '')
    definition = ''.join(definition.split(':')[1:]).strip()
    # Segment phrases
    for char in 'ABDEFGHIJKLMNOPRSTVZ':
        definition = definition.replace(' ' + char, '. ' + char)
    for char in '.;:?':
        definition = definition.replace(char, '##')
    # fix OCR errors as much as possible
    definition = definition.replace('u', 'v')
    definition = definition.replace('-', '')
    definition = definition.replace('Y ', 'y ')
    definition = '$$'.join(definition.split('##')).strip()
    print(definition)
    return definition

def filter_types(tree):
    for el in tree.iterfind("//EM"):
        if 'TIPO' in el.attrib and len(el.attrib['TIPO']) > 0:
            el.attrib['CATEG'] = el.attrib['CATEG'] + '_' + el.attrib['TIPO']
        else:
            el.tag = "to_strip"
    etree.strip_tags(tree, 'to_strip')

def _clean(self):
    """
    Removes some extraneous tags to make parsing easier
    """
    etree.strip_tags(self.tree, 'strong')
    for xx in self.tree.find_class('pydocx-tab'):
        xx.drop_tag()

def __init__(self, xml):
    if isinstance(xml, getattr(etree, "_Element")):
        self._tree = etree.parse(six.StringIO(etree.tostring(xml)))
    else:
        self._tree = etree.parse(xml)
    etree.strip_tags(self._tree, etree.Comment)
    self._container = {}

def clean_proprietary(self):
    p = etree.XMLParser(remove_blank_text=True, resolve_entities=False)
    tree = etree.parse(self.gv.word_document_xml, p)
    omml = tree.xpath(
        '//m:oMath',
        namespaces={
            'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math'
        })
    for omml_paragraph in omml:
        omml_paragraph.tag = '{http://www.w3.org/1998/Math/MathML}math'
    etree.strip_tags(
        tree,
        '{http://schemas.openxmlformats.org/officeDocument/2006/math}oMathPara'
    )
    omml = tree.xpath(
        '//m:oMathParaPr',
        namespaces={
            'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math'
        })
    for omml_paragraph in omml:
        omml_paragraph.getparent().remove(omml_paragraph)
    tree.write(self.gv.word_document_xml)

def due(pdfpath):
    ntree = uniform_cm(pdfpath)
    etree.strip_tags(ntree, 'textline')
    # Search for all "textbox" elements
    for textbox in ntree.xpath('//textbox'):
        new_line = etree.Element("new_line")
        previous_bb = None
        # From a given textbox element, iterate over all the "text" elements
        for x in textbox.iter("text"):
            # Get the current bounding-box value
            bb = getBBoxFirstValue(x)
            # Check that the current and previous values aren't empty
            if bb is not None and previous_bb is not None and (bb - previous_bb) > 20:
                # Insert the newline into the parent tag
                x.getparent().insert(x.getparent().index(x), new_line)
                # Start a new "new_line" element
                new_line = etree.Element("new_line")
            # Append the current element to the new_line element
            new_line.append(x)
            # Keep the latest non-empty BBox first value
            if bb is not None:
                previous_bb = bb
        # Add the last new_line element
        textbox.append(new_line)
    return ntree

def _parse_search_results(self, log, orig_title, orig_authors, root, matches, timeout, isbn):
    max_results = self.prefs[Moly_hu.KEY_MAX_BOOKS]
    results = root.xpath('//a[@class="book_selector"]')
    log.info('Found %d possible books (max: %d)' % (len(results), max_results))
    i = 0
    for result in results:
        book_urls = result.xpath('@href')
        etree.strip_tags(result, 'strong')
        author_n_title = result.text
        author_n_titles = author_n_title.split(':', 1)
        author = author_n_titles[0].strip(' \r\n\t')
        title = author_n_titles[1].strip(' \r\n\t')
        log.info('Orig: %s, target: %s' % (self.strip_accents(orig_title), self.strip_accents(title)))
        if orig_title:
            if (orig_title.lower() not in title.lower()
                    and self.strip_accents(orig_title) not in self.strip_accents(title)):
                continue
        if orig_authors:
            author1 = orig_authors[0]
            authorsplit = author1.split(" ")
            author2 = author1
            if len(authorsplit) > 1:
                author2 = '%s %s' % (authorsplit[1], authorsplit[0])
            if (author1.lower() not in author.lower()
                    and self.strip_accents(author1) not in self.strip_accents(author)
                    and author2.lower() not in author.lower()
                    and self.strip_accents(author2) not in self.strip_accents(author)):
                continue
        for book_url in book_urls:
            result_url = Moly_hu.BASE_URL + book_url
            if result_url not in matches:
                matches.append(result_url)
                i += 1
                if i >= max_results:
                    return

def stringify_children(node):
    from lxml.etree import tostring, strip_tags
    strip_tags(node, '*')
    text = tostring(node, method='text', encoding=unicode)
    return text

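# Usage sketch for stringify_children above (Python 2, to match the
# `encoding=unicode` argument): strip_tags(node, '*') unwraps every child
# element, so tostring(method='text') returns the flattened inner text.
from lxml import etree

node = etree.fromstring('<p>Hello <b>big <i>bad</i></b> world</p>')
text = stringify_children(node)  # u'Hello big bad world'
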
def analysis_section(notice, child):
    # Create the section element
    section_elm = Element('analysisSection')

    # Add the title element
    title_elm = SubElement(section_elm, 'title')
    title_elm.text = child['title']

    # Add paragraphs
    for paragraph in child['paragraphs']:
        paragraph_number = child['paragraphs'].index(paragraph)
        paragraph_footnotes = [
            fn for fn in child['footnote_refs']
            if fn['paragraph'] == paragraph_number]
        text = self.resolve_footnotes(notice, paragraph, paragraph_footnotes)
        paragraph_elm = fromstring(
            '<analysisParagraph>' + text + '</analysisParagraph>')
        # Make sure to strip out elements that don't belong
        strip_tags(paragraph_elm, 'EM')
        section_elm.append(paragraph_elm)

    # Construct an analysis section for any children.
    try:
        map(lambda c: section_elm.append(analysis_section(notice, c)),
            child['children'])
    except:
        print("Failed to write analysis for", child['title'])

    return section_elm

def ntcir_topic_read_xhtml(filename):
    with open(filename, 'rt') as f:
        xhtml_tokens = f.read()
    xml_document = unicode_to_tree(mathmlcan(xhtml_tokens))
    for topic_element in xml_document.xpath(
            '//ntcir-math:topic | //mathml:topic', namespaces=XML_NAMESPACES):
        topic_number_elements = topic_element.xpath(
            './/ntcir-math:num | .//mathml:num', namespaces=XML_NAMESPACES)
        assert len(topic_number_elements) == 1
        topic_number_element = topic_number_elements[0]
        topic_number = topic_number_element.text
        tokens = []
        for math_element in topic_element.xpath(
                './/ntcir-math:formula/mathml:math | .//mathml:formula/mathml:math',
                namespaces=XML_NAMESPACES):
            etree.strip_tags(
                math_element, '{{{}}}semantics'.format(XML_NAMESPACES['mathml']))
            math_element = remove_namespaces(copy(math_element))
            math_token = Math(tree_to_unicode(math_element))
            tokens.append(math_token)
        for keyword_element in topic_element.xpath(
                './/ntcir-math:keyword | .//mathml:keyword',
                namespaces=XML_NAMESPACES):
            tokens.append(Text(keyword_element.text))
        yield (topic_number, tokens)

def handle_lists(element, dedupbool):
    '''Process list elements'''
    processed_element = etree.Element(element.tag)
    for child in element.iter('item'):
        newchildelem = etree.Element('item')
        if len(child) == 0:
            processed_child = process_node(child)
            if processed_child is not None:
                newchildelem.text, newchildelem.tail = processed_child.text, processed_child.tail
                processed_element.append(newchildelem)
        else:
            # proceed with iteration, fix for nested elements
            for subelem in child.iter():
                processed_subchild = handle_textnode(subelem, comments_fix=False, deduplicate=dedupbool)
                # add child element to processed_element
                if processed_subchild is not None:
                    subchildelem = etree.SubElement(newchildelem, processed_subchild.tag)
                    subchildelem.text, subchildelem.tail = processed_subchild.text, processed_subchild.tail
                subelem.tag = 'done'
            etree.strip_tags(newchildelem, 'item')
            if newchildelem.text or len(newchildelem) > 0:
                processed_element.append(newchildelem)
        child.tag = 'done'  # avoid double tags??
    if len(processed_element) > 0:  # if it has children
        # test if it has text
        if text_chars_test(''.join(processed_element.itertext())) is True:
            return processed_element
    return None

def _extract_cases_from_html(self, html):
    """Build list of data dictionaries, one dictionary per case (table row)."""
    # Strip inconsistently placed <font> and <br>
    # tags that make stable coverage almost impossible
    etree.strip_tags(html, 'font', 'br')
    for ul in html.xpath('//table[@id="AutoNumber1"]/tr[2]/td/table/tr/td//ul'):
        preceding = ul.xpath('./preceding::*[1]')[0]
        preceding_text = ' '.join(preceding.text_content().split()).strip(':')
        if preceding_text and not preceding_text.lower().endswith('future date'):
            # Below will fail if they change up strings or date formats
            case_date = convert_date_string(preceding_text.split()[-1])
            for element in ul.xpath('./li | ./a'):
                if element.tag == 'li':
                    text = normalize_dashes(' '.join(element.text_content().split()))
                    if not text:
                        continue
                    anchor = element.xpath('.//a')[0]
                elif element.tag == 'a':
                    # Malformed html, see connappct_example.html
                    anchor = element
                    glued = '%s %s' % (anchor.text_content(), anchor.tail)
                    text = normalize_dashes(' '.join(glued.split()))
                self.cases.append({
                    'date': case_date,
                    'url': anchor.xpath('./@href')[0],
                    'docket': text.split('-')[0].replace('Concurrence', '').replace('Dissent', ''),
                    'name': text.split('-', 1)[1],
                })

def reprocess_definition(self, definition):
    etree.strip_tags(definition, self.STRIP_TAGS_LIST)
    definition = etree.tostring(definition, encoding='unicode')
    definition = definition.replace('<td class="main">', '')
    definition = definition.replace('</td>', '')
    definition = definition.replace('\n', '')
    return definition

def cleanup_address_p(paragraph):
    """Function for dealing with the somewhat messy paragraphs inside an
    address block. This deals with the potential lack of spaces in the XML,
    extra E tags, and strange characters up front."""
    if paragraph.text:
        ended_with_space = paragraph.text.endswith(' ')
    else:
        ended_with_space = True
    # Inside baseball -- adds spaces to tags that don't have them
    for child in paragraph.getchildren():
        if not child.text:
            continue
        if not ended_with_space:
            child.text = ' ' + child.text
        if child.tail and not child.tail.startswith(' '):
            child.text = child.text + ' '
        if child.tail:
            ended_with_space = child.tail.endswith(' ')
        else:
            ended_with_space = child.text.endswith(' ')
    etree.strip_tags(paragraph, 'E')
    txt = paragraph.text.strip()
    while txt and not (txt[0] in string.letters or txt[0] in string.digits):
        txt = txt[1:]
    return txt

def __call__(self, document, url):
    """Filter article content from raw html"""
    # turn the raw html into an etree
    root = html.fromstring(document)
    # so that links will continue to work
    root.make_links_absolute(url)
    content = []
    # if we have a selector and a whitelist then we can work
    if self.css and self.whitelist:
        etree.strip_tags(root, 'img', etree.Comment)  # remove img/comment tags
        try:
            # get each matching block that we want the text from
            for block in self.css(root):
                # parse out the text and whitelisted html
                para = self.__parse(block)
                # if we got something, keep it
                if para:
                    content.append(para)
        except Exception as e:
            log.exception('{} in filter at: {}'.format(type(e), url))
    # return a list of paragraphs
    return content

def dump_article_text(file_path, xpath_str, filter_tags=filter_tag_list, remove_stop_words=True):
    """
    This method is designed to extract all text from xml documents.
    Every document has specific tags that are stripped in order to produce
    clean text output for downstream processing.

    Keyword arguments:
    file_path -- the file path for the xml document
    xpath_str -- the xpath string to extract tags from the xml document
    filter_tags -- the list of tags to strip from the xml document
    remove_stop_words -- a flag to indicate if stop words should be removed
    """
    tree = ET.parse(open(file_path, "rb"), parser=parser)

    # Process xml without specified tags
    ET.strip_tags(tree, *filter_tags)

    root = tree.getroot()
    all_tags = root.xpath(xpath_str)
    text = list(map(lambda x: list(x.itertext()), list(all_tags)))

    # Remove stop words
    if remove_stop_words:
        text = list(map(lambda x: remove_stopwords(re.sub("\n", "", "".join(x))), text))
    else:
        text = list(map(lambda x: re.sub("\n", "", "".join(x)), text))

    return text

def stenogramma(request):
    u = url + '/video/view.php'
    if request.method == "GET" and "t" in request.GET:
        u = u + '?t=%s' % request.GET['t']
        # o, l, r = get_page(u)
        r = requests.get(u)
        o = html.fromstring(r.text.encode('UTF-8'))
        h1 = o.xpath('//h1')[0].text_content()
        text = o.xpath('//div[@class="body"]')[0]
        etree.strip_tags(text, 'font')
        for s in text.xpath('//a'):
            etree.strip_tags(s, 'b')
            # tt = s.text.rstrip()
            # s.append(tt)
        n = etree.tostring(text, encoding='unicode')
        cs = []  # get_comments(o, u)
        return render(request, "text.html", {
            'h1': h1,
            'n': n,
            'cs': cs,
            'u': u,
        })
    return redirect('https://catalog.oper.ru/')

def enclose_and_change_self_size(self, outer_xpath, size_attribute, tag, change_tag):
    tree = self.load_dom_tree()
    # search the tree and grab the parent
    for child in tree.xpath(outer_xpath,
                            namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
        self.debug.print_debug(
            self, u'Enclosing and changing size: {0} to {1}'.format(
                child.tag, change_tag))
        new_element = etree.Element(tag)
        child.attrib[u'meTypesetSize'] = size_attribute
        if child.tag == '{http://www.tei-c.org/ns/1.0}' + change_tag:
            child.tag = 'REMOVE'
        else:
            for sub_element in child:
                if sub_element.tag == '{http://www.tei-c.org/ns/1.0}' + change_tag:
                    child.tag = 'REMOVE'
        if child.tag != 'REMOVE':
            child.tag = change_tag
        child.addnext(new_element)
        Manipulate.append_safe(new_element, child, self)
        if child.tag == 'REMOVE':
            etree.strip_tags(child.getparent(), 'REMOVE')
        if not (child.attrib['rend'] is None):
            if u'bold' in child.attrib[u'rend']:
                child.attrib[u'rend'] = child.attrib[u'rend'].replace(u'bold', u'')
    self.save_tree(tree)

def transform(self, data):
    raw, xml = data
    for tag in self.TAGS:
        nodes = xml.findall(".//" + tag)
        if len(nodes) > 0:
            etree.strip_tags(xml, tag)
    return data

def header_to_xml(header_lines, book, output_xml_path):
    header_lines = [[y for y in x] for x in group_ranges(header_lines)]
    # Map the first line of each range to its last line
    d = {x[0]: x[-1] for x in header_lines}
    # Delete section tags
    ET.strip_tags(book, "section")
    for from_line, to_line in d.items():
        f = book.find('.//line[@num="' + str(from_line) + '"]')
        new_element = ET.Element('header')
        prev = f.getprevious()
        if prev is not None:
            for line_num in range(from_line, to_line + 1):
                e = book.find('.//line[@num="' + str(line_num) + '"]')
                new_element.append(e)
            prev.addnext(new_element)
        else:
            parent = f.getparent()
            for line_num in range(from_line, to_line + 1):
                e = book.find('.//line[@num="' + str(line_num) + '"]')
                new_element.append(e)
            parent.insert(0, new_element)
    ET.strip_tags(book, "line")
    # Write to file
    with open(output_xml_path, 'wb') as f:
        f.write(ET.tostring(book, pretty_print=True))

def _load_from(self, data):
    try:
        self._xmp = parse(BytesIO(data))
    except XMLSyntaxError:
        data = re_xml_illegal_bytes.sub(b'', data)
        try:
            self._xmp = parse(BytesIO(data))
        except XMLSyntaxError as e:
            if (str(e).startswith("Start tag expected, '<' not found")
                    or str(e).startswith("Document is empty")):
                # This is usually triggered by processing instructions
                # in an otherwise empty document, or empty documents,
                # which we consider safe to coerce to a well-formed
                # XMP. For harder cases like truncated XMP, we want to
                # raise the exception so that someone is alerted.
                self._xmp = parse(BytesIO(XMP_EMPTY))
            else:
                raise PdfError() from e
    pis = self._xmp.xpath('/processing-instruction()')
    for pi in pis:
        etree.strip_tags(self._xmp, pi.tag)
    try:
        self._get_rdf_root()
    except ValueError:
        if self._xmp.find('.', self.NS).tag == '{adobe:ns:meta/}xmpmeta':
            # Looks like: <x:xmpmeta></x:xmpmeta>, so reload with a template
            # that includes <rdf:RDF>
            return self._load_from(XMP_EMPTY)
        else:
            raise  # Probably not XMP

def analysis_section(notice, child):
    # Create the section element
    section_elm = Element('analysisSection')

    # Add the title element
    title_elm = SubElement(section_elm, 'title')
    title_elm.text = child['title']

    # Add paragraphs
    for paragraph in child['paragraphs']:
        paragraph_number = child['paragraphs'].index(paragraph)
        paragraph_footnotes = [
            fn for fn in child['footnote_refs']
            if fn['paragraph'] == paragraph_number
        ]
        text = self.resolve_footnotes(notice, paragraph, paragraph_footnotes)
        paragraph_elm = fromstring(
            '<analysisParagraph>' + text + '</analysisParagraph>')
        # Make sure to strip out elements that don't belong
        strip_tags(paragraph_elm, 'EM')
        section_elm.append(paragraph_elm)

    # Construct an analysis section for any children.
    map(lambda c: section_elm.append(analysis_section(notice, c)),
        child['children'])

    return section_elm

def xmltotxt(xmloutput):
    '''Convert to plain text format'''
    returnlist = []
    etree.strip_tags(xmloutput, 'hi')
    for element in xmloutput.iter():
        # process text
        if element.text is None and element.tail is None:
            # newlines for textless elements
            if element.tag in ('row', 'table'):
                returnlist.append('\n')
            continue
        if element.text is not None and element.tail is not None:
            textelement = ' '.join([element.text, element.tail])
        elif element.text is not None and element.tail is None:
            textelement = element.text
        else:
            textelement = element.tail
        if element.tag in ('code', 'fw', 'head', 'lb', 'p', 'quote', 'row', 'table'):
            returnlist.extend(['\n', textelement, '\n'])
        elif element.tag == 'item':
            returnlist.extend(['\n- ', textelement, '\n'])
        elif element.tag == 'cell':
            returnlist.extend(['|', textelement, '|'])
        elif element.tag == 'comments':
            returnlist.append('\n\n')
        else:
            returnlist.extend([textelement, ' '])
    return sanitize(''.join(returnlist))

def evaluate(self, pred, true):
    '''Computes TEDS score between the prediction and the ground truth of a
    given sample'''
    if (not pred) or (not true):
        return 0.0
    parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
    pred = html.fromstring(pred, parser=parser)
    true = html.fromstring(true, parser=parser)
    if pred.xpath('body/table') and true.xpath('body/table'):
        pred = pred.xpath('body/table')[0]
        true = true.xpath('body/table')[0]
        if self.ignore_nodes:
            etree.strip_tags(pred, *self.ignore_nodes)
            etree.strip_tags(true, *self.ignore_nodes)
        n_nodes_pred = len(pred.xpath(".//*"))
        n_nodes_true = len(true.xpath(".//*"))
        n_nodes = max(n_nodes_pred, n_nodes_true)
        tree_pred = self.load_html_tree(pred)
        tree_true = self.load_html_tree(true)
        distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
        return 1.0 - (float(distance) / n_nodes)
    else:
        return 0.0

def compare_xml_content(f1: str, f2: str) -> bool:
    """
    Compare the contents of two XML files and report whether the contents
    are the same, ignoring comments.

    :param f1: File 1
    :param f2: File 2
    :return: Is the file content, minus comments, the same?
    """
    file1 = pathlib.Path(f1)
    file2 = pathlib.Path(f2)
    if file1.is_file() and file2.is_file():
        parser = etree.XMLParser(remove_blank_text=True)
        root = etree.parse(str(file1), parser).getroot()
        etree.strip_tags(root, etree.Comment)
        f1_hash = hashlib.sha512(etree.tostring(root)).hexdigest()

        parser = etree.XMLParser(remove_blank_text=True)
        root = etree.parse(str(file2), parser).getroot()
        etree.strip_tags(root, etree.Comment)
        f2_hash = hashlib.sha512(etree.tostring(root)).hexdigest()
        return f1_hash == f2_hash
    return False

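# Hypothetical usage of compare_xml_content (file names are illustrative):
# two files that differ only in comments and insignificant whitespace hash
# identically once comments are stripped with etree.strip_tags, so this
# should return True.
with open('a.xml', 'w') as fh:
    fh.write('<root><child/><!-- note --></root>')
with open('b.xml', 'w') as fh:
    fh.write('<root>\n  <child/>\n</root>')
print(compare_xml_content('a.xml', 'b.xml'))  # True
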
def preprocess(tree, filename, strip_tags):
    """
    Wrapper function that helps apply the parser to different pages.

    :param tree: Document tree that contains the individual page trees as children
    :param filename: Filename of the output HTML file
    :param strip_tags: Tag names to be flattened
    :return: All words and equations from this file (with coordinate information)
    """
    etree.strip_tags(tree, *strip_tags)
    words = []
    equations = []
    for page_tree in tree:
        generate_rawtext_from_ocrx(page_tree)
        remove_ocr_img_for_non_img(page_tree)
        img_segment_clean_up(page_tree)
        split_paragraph(page_tree)
        words += [*get_all_words_with_coordinates(page_tree)]
        equations += list(get_equation(page_tree))
        remove_ocr_elements(page_tree)
        add_name(page_tree)
    with open(filename, 'wb') as out_html:
        out_html.write(etree.tostring(tree, pretty_print=True))
    return words, equations

def import_from_html(filename_or_fobj, encoding='utf-8', index=0,
                     ignore_colspan=True, preserve_html=False,
                     row_tag='tr', column_tag='td|th', *args, **kwargs):
    # TODO: unescape before returning: html_parser.unescape(html)
    # TODO: lxml -> unicode?
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    html = fobj.read().decode(encoding)
    html_tree = document_fromstring(html)
    tables = html_tree.xpath('//table')
    table = tables[index]
    strip_tags(table, 'thead')
    strip_tags(table, 'tbody')
    row_elements = table.xpath(row_tag)
    if not preserve_html:
        table_rows = [[value_element.text_content().strip()
                       for value_element in row.xpath(column_tag)]
                      for row in row_elements]
    else:
        table_rows = [[_get_content(value_element)
                       for value_element in row.xpath(column_tag)]
                      for row in row_elements]
    max_columns = max(len(row) for row in table_rows)
    if ignore_colspan:
        table_rows = filter(lambda row: len(row) == max_columns, table_rows)
    meta = {'imported_from': 'html', 'filename': filename}
    return create_table(table_rows, meta=meta, *args, **kwargs)

def sanitize_tree(tree, include_formatting=False):
    '''Convert and sanitize the output from the generic algorithm (post-processing)'''
    # delete unnecessary elements
    for elem in tree.xpath(SANITIZED_XPATH):
        elem.getparent().remove(elem)
    etree.strip_tags(tree, MANUALLY_STRIPPED + ['a', 'span'])
    tree = prune_html(tree)
    # convert
    cleaned_tree = convert_tags(tree, include_formatting)
    for elem in cleaned_tree.iter('td', 'th', 'tr'):
        # elem.text, elem.tail = trim(elem.text), trim(elem.tail)
        # finish table conversion
        if elem.tag == 'tr':
            elem.tag = 'row'
        elif elem.tag in ('td', 'th'):
            if elem.tag == 'th':
                elem.set('role', 'head')
            elem.tag = 'cell'
    # sanitize
    sanitization_list = list()
    for tagname in [element.tag for element in set(cleaned_tree.iter())]:
        if tagname not in TEI_VALID_TAGS:
            sanitization_list.append(tagname)
        # if tagname in ('article', 'content', 'link', 'main', 'section', 'span'):
        #     for element in cleaned_tree.iter(tagname):
        #         merge_with_parent(element)
        # else:
        #     print(tagname)
    etree.strip_tags(cleaned_tree, sanitization_list)
    text = trim(' '.join(cleaned_tree.itertext()))
    return cleaned_tree, text, len(text)

def get_text(elements, itemize=False):
    paragraphs = []
    highlight_elements = ['varname', 'parameter']
    strip_elements = [
        'returnvalue', 'command', 'link', 'footnote', 'simpara',
        'footnoteref', 'function'
    ] + highlight_elements
    for element in elements:
        etree.strip_tags(element, "application")
        # put "Since MPD version..." in parentheses
        for e in element.xpath("footnote/simpara"):
            e.text = "(" + e.text.strip() + ")"
        for e in element.xpath("|".join(highlight_elements)):
            e.text = "*" + e.text.strip() + "*"
        etree.strip_tags(element, *strip_elements)
        if itemize:
            initial_indent = " * "
            subsequent_indent = " "
        else:
            initial_indent = " "
            subsequent_indent = " "
        wrapper = TextWrapper(subsequent_indent=subsequent_indent,
                              initial_indent=initial_indent)
        text = element.text.replace("\n", " ").strip()
        text = re.subn(r'\s+', ' ', text)[0]
        paragraphs.append(wrapper.fill(text))
    return "\n\n".join(paragraphs)

def xmltostring(xml):
    etree.strip_tags(xml, '*', etree.Comment)
    if not xml.text:
        return ''
    string = " ".join(xml.text.split())
    string = reencode(string)
    return string

def cleanup(table):
    etree.strip_tags(table, 'span', 'strong', 'div', 'tbody')
    for tag in table.iter():
        for att in tag.attrib.keys():
            tag.attrib.pop(att)
        if tag.tag == "table":
            tag.set('border', '1')
    return table

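# Hypothetical usage of cleanup above: presentation tags are unwrapped,
# every attribute is dropped, and border="1" is re-added on the <table>.
from lxml import html, etree

table = html.fromstring(
    '<table class="x"><tbody><tr><td><span>v</span></td></tr></tbody></table>')
print(etree.tostring(cleanup(table)))
# b'<table border="1"><tr><td>v</td></tr></table>'
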
def extract_comments(tree, dedupbool):
    '''Try and extract comments out of potential sections in the HTML'''
    comments_body = etree.Element('body')
    # define iteration strategy
    potential_tags = set(TAG_CATALOG)  # 'span'
    # potential_tags.add('div') trouble with <div class="comment-author meta">
    for expr in COMMENTS_XPATH:
        # select tree if the expression has been found
        subtree = tree.xpath(expr)
        if not subtree:
            continue
        subtree = subtree[0]
        # prune
        subtree = discard_unwanted_comments(subtree)
        etree.strip_tags(subtree, 'a', 'link', 'span')
        # extract content
        # for elem in subtree.xpath('.//*'):
        #     processed_elem = process_comments_node(elem, potential_tags)
        #     if processed_elem is not None:
        #         comments_body.append(processed_elem)
        processed_elems = [
            process_comments_node(elem, potential_tags, dedupbool)
            for elem in subtree.xpath('.//*')
        ]
        comments_body.extend(list(filter(None.__ne__, processed_elems)))
        # control
        if len(comments_body) > 0:  # if it has children
            LOGGER.debug(expr)
            # remove corresponding subtree
            subtree.getparent().remove(subtree)
            break
    # lengths
    temp_comments = trim(' '.join(comments_body.itertext()))
    return comments_body, temp_comments, len(temp_comments), tree

def parse_xml(self, filename, use_objectify=False, elements=None, tags=None):
    """
    Parse and clean the supplied file by removing any elements or tags we don't use.

    :param filename: The filename of the xml file to parse. Str
    :param use_objectify: Use the objectify parser rather than the etree parser. (Bool)
    :param elements: A tuple of element names (Str) to remove along with their content.
    :param tags: A tuple of element names (Str) to remove, preserving their content.
    :return: The root element of the xml document
    """
    try:
        with open(filename, 'rb') as import_file:
            # NOTE: We don't need to do any of the normal encoding detection here, because lxml does its own
            # encoding detection, and the two mechanisms together interfere with each other.
            if not use_objectify:
                tree = etree.parse(import_file, parser=etree.XMLParser(recover=True))
            else:
                tree = objectify.parse(import_file, parser=objectify.makeparser(recover=True))
            if elements or tags:
                self.wizard.increment_progress_bar(
                    translate('BiblesPlugin.OsisImport',
                              'Removing unused tags (this may take a few minutes)...'))
            if elements:
                # Strip tags we don't use - remove content
                etree.strip_elements(tree, elements, with_tail=False)
            if tags:
                # Strip tags we don't use - keep content
                etree.strip_tags(tree, tags)
            return tree.getroot()
    except OSError as e:
        self.log_exception('Opening {file_name} failed.'.format(file_name=e.filename))
        critical_error_message_box(
            title='An Error Occurred When Opening A File',
            message='The following error occurred when trying to open\n{file_name}:\n\n{error}'
                    .format(file_name=e.filename, error=e.strerror))
    return None

def fix_corresp_label(root):
    global output
    for corresp in root.xpath("//corresp"):
        if corresp.xpath("label"):
            etree.strip_tags(corresp, "label")
            output += "correction: removed label tag from corresp " + corresp.attrib["id"] + "\n"
    return root

def spaces_then_remove(el, tag_str):
    """FR's XML tends to not add spaces where needed, which leads to the
    removal of tags sometimes smashing together words."""
    for tag in el.xpath('.//' + tag_str):
        prepost_pend_spaces(tag)
    etree.strip_tags(el, tag_str)
    return el

def parseQuestionContentToList(body, title):
    root = etree.HTML(body)
    etree.strip_elements(root, 'code', with_tail=False)
    etree.strip_tags(root, '*')
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    text = str(etree.tostring(root, pretty_print=True)[10:-11])[1:].lower() \
        .replace('\\n', ' ') \
        .replace("\\", '') \
        .replace("?", ' ')
    title = title.lower().replace("?", " ")
    text += " " + title
    tokens = nltk.word_tokenize(text)
    filtered = [w for w in tokens if nonPunct.match(w)]
    # get rid of the punctuation that got left around the words;
    # write the trimmed word back into the list by index
    for idx, word in enumerate(filtered):
        front = 0
        back = 0
        for letter in word:
            if letter not in string.punctuation:
                break
            front += 1
        for letter in reversed(word):
            if letter not in string.punctuation:
                break
            back -= 1
        if back == 0:
            back = None
        filtered[idx] = word[front:back]
    return filtered

def xml_text_only(elem):
    '''Return inner text of element with tags stripped'''
    etree.strip_tags(elem, '*')
    inner_text = elem.text
    if inner_text:
        return inner_text.strip()
    return None

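# Usage sketch for xml_text_only above: after strip_tags(elem, '*') all
# child markup is unwrapped, so elem.text holds the full inner text.
from lxml import etree

elem = etree.fromstring('<note> A <b>bold</b> claim </note>')
print(xml_text_only(elem))  # 'A bold claim'
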
def update_oed(self, **kwargs):
    valid_links_only = kwargs.get('validLinksOnly', False)
    tree = etree.parse(self.oed_in)
    for entry in tree.findall('./link'):
        oed_id = entry.get('sourceID', None)
        oed_label_text = self.oed_index.find(oed_id, field='label') or LinkUpdater.error_message
        source_label = entry.find('./sourceLabel')
        etree.strip_tags(source_label, 'i', 'sup', 'sub', 'hm')
        source_label.text = oed_label_text

        lexid = entry.get('targetID', None)
        ode_label_text = self.odo_index.headword_by_id(lexid) or LinkUpdater.error_message
        target_label = entry.find('./targetLabel')
        etree.strip_tags(target_label, 'i', 'sup', 'sub', 'hm')
        target_label.text = ode_label_text

        if (valid_links_only and
                (oed_id is None or
                 lexid is None or
                 source_label.text == LinkUpdater.error_message or
                 target_label.text == LinkUpdater.error_message or
                 not check_match(source_label.text, target_label.text))):
            entry.getparent().remove(entry)
    with open(self.oed_out, 'w') as filehandle:
        filehandle.write(etree.tostring(tree, pretty_print=True, encoding='unicode'))

def search_wiki(math_knowledge, math_map, mcom_map, roots, math_exp_rev, old_new_math_map):
    ws = WikiPageSearcher(solr_wiki_math, solr_wiki_doc)
    na = norm_attribute()
    for mid, vals in math_knowledge.iteritems():
        # mid = "MATH_C04-1197_15"
        mml = etree.tostring(math_map[mid])
        mml = na.normalize(mml)
        mml_comp = etree.tostring(mcom_map[mid])
        mml_comp = na.normalize(mml_comp)
        lst_dct_weighted_nps = []
        lst_dct_weighted_nps.append(vals["nps"])
        if "children" in vals:
            for v, vt in vals["children"]:
                if vt is Link_Types.comp or vt is Link_Types.simcomp:
                    continue
                # text = u"%s %s" % (text, math_knowledge[v]["paragraph"])
                lst_dct_weighted_nps.append(math_knowledge[v]["nps"])
        agg_nps = nps_aggregration(lst_dct_weighted_nps)
        mathdb, docdb = ws.search_wikipedia_pages(mml_comp, agg_nps)
        is_root = old_new_math_map[math_exp_rev[mid]] in roots
        is_root = str(is_root)
        mml_to_print = etree.fromstring(etree.tostring(math_map[mid]))
        etree.strip_tags(mml_to_print, "*")
        print "\t".join((is_root, mid, encode(mml_to_print.text),
                         print_docs_score(mathdb), print_docs_score(docdb)))

def update_odo(self, **kwargs):
    valid_links_only = kwargs.get('validLinksOnly', False)
    tree = etree.parse(self.odo_in)
    for entry in tree.findall('./e'):
        lexid = entry.get('lexid', None)
        odo_label = entry.find('./label')
        odo_label_text = self.odo_index.headword_by_id(lexid) or LinkUpdater.error_message
        etree.strip_tags(odo_label, 'i', 'sup', 'sub', 'hm')
        odo_label.text = odo_label_text

        link = entry.find('./linkSet/link')
        if link is not None:
            refentry = link.get('refentry', '0')
            refid = link.get('refid', '0')
            oed_label_text = self.oed_index.find(refentry, field='label') or LinkUpdater.error_message
            etree.strip_tags(link, 'i', 'sup', 'sub', 'hm')
            link.text = oed_label_text

        if (valid_links_only and
                (link is None or
                 link.text == LinkUpdater.error_message or
                 odo_label.text == LinkUpdater.error_message or
                 not check_match(link.text, odo_label.text))):
            entry.getparent().remove(entry)
    with open(self.odo_out, 'w') as filehandle:
        filehandle.write(etree.tostring(tree, pretty_print=True, encoding='unicode'))

def _extract_cases_from_html(self, html):
    """Build list of data dictionaries, one dictionary per case (table row)."""
    # Strip inconsistently placed <font> and <br>
    # tags that make stable coverage almost impossible
    etree.strip_tags(html, 'font', 'br')
    path = '//table[@id="AutoNumber1"]//ul'
    for ul in html.xpath(path):
        preceding = ul.xpath('./preceding::*[1]')[0]
        preceding_text = ' '.join(preceding.text_content().split()).strip(':')
        # Skip sections that are marked to be published at a future date
        if preceding_text and not preceding_text.lower().endswith(' date'):
            # Below will fail if they change up the string format
            date_string = preceding_text.split()[-1]
            case_date = convert_date_string(date_string)
            for element in ul.xpath('./li | ./a'):
                if element.tag == 'li':
                    text = normalize_dashes(' '.join(element.text_content().split()))
                    if not text:
                        continue
                    anchor = element.xpath('.//a')[0]
                elif element.tag == 'a':
                    # Malformed html, see connappct_example.html
                    anchor = element
                    glued = '%s %s' % (anchor.text_content(), anchor.tail)
                    text = normalize_dashes(' '.join(glued.split()))
                self.cases.append({
                    'date': case_date,
                    'url': anchor.xpath('./@href')[0],
                    'docket': text.split('-')[0].replace('Concurrence', '').replace('Dissent', ''),
                    'name': text.split('-', 1)[1],
                })

def fix_article_title_tags(root):
    global output
    title = root.xpath("//title-group/article-title")[0]
    if title.xpath("//named-content"):
        etree.strip_tags(title, "named-content")
        output += "correction: removed named-content tags from article title\n"
    return root

def processFile(first_docid, filename):
    with file(filename, 'rb') as xmlfile:
        xml = "<root>%s</root>" % xmlfile.read()
    parser = etree.XMLParser(recover=True)
    tree = etree.fromstring(xml, parser=parser)
    etree.strip_tags(tree, "a")

    def inclusion_filter(doc):
        return (not doc.get('url', '').endswith(u'_(disambiguation)')
                and not u'List_of_' in doc.get('url', '')
                and len(doc.text.strip()) >= args.mindoclen)

    (kept, dropped) = bifurcate(inclusion_filter, tree.xpath('//doc'))
    print "Dropped %s of %s documents" % (len(dropped), len(kept) + len(dropped))
    docs = list(enumerate(kept, start=first_docid))
    if len(docs) == 0:
        return first_docid
    for (docid, doc) in docs:
        text = doc.text.encode('utf-8').split("\n")
        attrs = {
            'id': doc.get("id"),
            'title': text[1],
            'text': "\n".join(text[2:]),
            'source': 'Wikipedia',
            'date': TODAY,
            'url': doc.get("url", "").replace("http://it.wikipedia.org",
                                              "http://en.wikipedia.org")
        }
        add_document(attrs, docid)
    print "Processed %s docs from %s" % (len(docs), filename)
    return docs[-1][0]

def email(self, alerts):
    node = alert_node()
    dismisseds = [a.message_id for a in mAlert.objects.filter(node=node)]
    msgs = []
    for alert in alerts:
        if alert.getId() not in dismisseds:
            """
            This is all to allow <a> tags in alert messages. We need to
            strip out all the tags so we can send a plain text email.
            """
            msg = str(alert)
            msgnode = etree.fromstring('<msg>{}</msg>'.format(msg))
            for i in msgnode.xpath('//a'):
                new = etree.Element('span')
                new.text = '{} ({})'.format(i.text, i.attrib['href'])
                msgnode.replace(i, new)
            etree.strip_tags(msgnode, '*')
            msgs.append(msgnode.text)
    if len(msgs) == 0:
        return
    hostname = socket.gethostname()
    send_mail(
        subject='%s: %s' % (
            hostname,
            _("Critical Alerts"),
        ),
        text='\n'.join(msgs)
    )

def word_lookup(word):
    request_url = api_url + word + '?key=' + api_key
    try:
        doc = etree.parse(request_url)
    except IOError:
        print 'Failed to connect to API.'
        return
    except etree.XMLSyntaxError:
        print 'Invalid XML response when looking up "' + word + '".'
        return
    entries = doc.xpath('//entry_list/entry')
    suggestions = doc.xpath('//entry_list/suggestion')
    if entries:
        print '\n\tInput: ' + word
        for entry in entries:
            print '========================='
            print entry.find('ew').text
            for definition in entry.xpath('def/dt'):
                etree.strip_tags(definition, "*")
                print '=> ' + definition.text.replace(':', '', 1)
            print '========================='
    elif suggestions:
        print 'The word "' + word + '" isn\'t in the dictionary.\nSuggestions:'
        for suggestion in suggestions:
            print suggestion.text
    else:
        print 'No results found for "' + word + '".'

def clean_image_block(block_tree):
    """
    Cleans up an image block to ensure that it has the correct structure.
    """
    image = None
    img_wrapper = None
    caption = None
    image_found = False
    caption_found = False
    ## We get all the block descendants using lxml (should be "depth-first")
    ## in order to get image and caption elements, if any.
    for des in block_tree.iterdescendants():
        ## We only take the first img element found.
        if des.tag == 'img' and not image_found:
            image_found = True
            ## We set the image element.
            image = des
            ## If the img element is wrapped by a link
            ## we set the image_wrapper too.
            if des.getparent().tag == 'a':
                img_wrapper = des.getparent()
                ## If the class has been modified we put the correct one.
                img_wrapper.attrib['class'] = 'image-link'
        ## We only take the first span element (caption) found.
        if des.tag == 'span' and not caption_found:
            caption_found = True
            ## We set the caption element.
            caption = des
            ## If the class has been modified we put the correct one.
            caption.attrib['class'] = 'image-caption'
    ## If the image block has no image inside
    ## then it's invalid and we remove it.
    if image is None:
        block_tree.tag = 'invalid_image_block'
        etree.strip_elements(block_tree, 'invalid_image_block')
        return
    ## Sanitizing the caption: we strip out every element inside the span,
    ## preserving the content and thus all the texts present.
    if caption is not None:
        etree.strip_tags(caption, '*')
    ## We go through the descendants again to mark invalid elements.
    for des in block_tree.iterdescendants():
        ## Invalid elements are all those elements which are neither the image
        ## nor the caption, nor the image_wrapper.
        if des is image or des is img_wrapper or des is caption:
            continue
        ## We remove invalid tags' texts.
        des.text = ''
        ## We mark invalid tags for removal.
        des.tag = 'tag_to_be_stripped_out'
    ## We finally strip out tags marked as invalid;
    ## now the image block should have the correct structure.
    etree.strip_tags(block_tree, 'tag_to_be_stripped_out')

def _clean_article(self, article):
    if len(article.cssselect('h1')) > 0:
        article.remove(article.cssselect('h1')[0])
    for e in article.cssselect('p,br,ul,li'):
        e.tail = '\n' + (e.tail if e.tail else '')
    etree.strip_tags(article, '*')
    text = unicode(article.text_content()).strip()
    self.text = re.sub(r'\W*\n\W*', '\n\n', text)

def parse_info_page(self, response):
    def get_text_formatted(node):
        from lxml.html import fromstring
        etree.strip_tags(node.xmlNode, "a")
        txt = node.extract()
        txt = txt.replace("<br/>", "\n")
        txt = txt.replace(u"\u00A0", " ")
        txt = fromstring(txt).text_content()
        txt = re.sub(r"\n[ \t]+", "\n", txt)
        return txt.strip()

    def get_text(node, regexp=None, invert=False):
        etree.strip_tags(node.xmlNode, "a")
        txt = ""
        for line in node.xpath(".//text()").extract():
            line = line.replace(u"\u00A0", " ")
            line = line.strip()
            if not line:
                continue
            match = True
            if regexp:
                match = regexp.search(line) and True or False
            if (match and not invert) or (not match and invert):
                if line[0] != line[0].lower():
                    txt += ". "
                txt += " %s " % line
        txt = re.sub(r"(\s\.+\s)+", ".", txt)
        txt = re.sub(r"[\s]+", " ", txt)
        txt = re.sub(r"[\.]+", ".", txt)
        txt = re.sub(r"^. ", "", txt)
        txt = txt.strip()
        return txt

    lxs = LxmlSelector(response)
    item = response.meta["item"]
    meta = self.meta_as_dict(lxs)
    etree.strip_tags(lxs.xmlNode, "b", "font", "i")
    info_node = lxs.xpath("//a[@name = 'PDT']/ancestor::td[1]")
    if info_node:
        item["info"] = get_text_formatted(info_node[0])
    amendments_node = lxs.xpath("//a[@name = 'PAC']/ancestor::td[1]")
    if amendments_node:
        item["amendments"] = get_text_formatted(amendments_node[0])
    summary_node = lxs.xpath("//a[@name = 'ECRCM']/ancestor::td[1]")
    if summary_node:
        item["summary"] = get_text_formatted(summary_node[0])
    file_href = meta.get("URL_DOSSIER") or None
    if file_href:
        file_href = urljoin(response.url, file_href)
    item["law"] = LawItem(
        title=meta.get("LOI_PROMULGUEE", ""),
        href=meta.get("LIEN_LOI_PROMULGUEE", ""),
        file_href=file_href,
    )
    yield item