Example #1
    def strip_elements(self, elements):
        """
        Remove list of elements (named by tag) from the node.

        (Wrapper for etree.strip_elements())
        """
        etree.strip_elements(self.node, elements)
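A minimal sketch, not taken from any of the examples, of what the wrapped call does: etree.strip_elements removes the named elements together with their content, and by default also drops each element's tail text unless with_tail=False is passed. The tag names below are made up for illustration.

from lxml import etree

root = etree.fromstring('<root><a>x</a> tail <b>y</b></root>')
etree.strip_elements(root, 'a', with_tail=False)  # element and its text go, ' tail ' stays
print(etree.tostring(root))  # b'<root> tail <b>y</b></root>'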
Example #2
    def append_body_footer(self, article):
        """
        Checks if the article has any Public Service Announcements and if available appends each of them to the body.

        :return: body with public service announcements.
        """
        try:
            article['body_html'] = article['body_html'].replace('<br>', '<br/>')
        except KeyError:
            pass

        body = ''
        if article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
            body = article.get('body_html', '')
        elif article[ITEM_TYPE] in [CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO]:
            body = article.get('description', '')

        if body and article.get(FORMAT, '') == FORMATS.PRESERVED:
            body = body.replace('\n', '\r\n').replace('\r\r', '\r')
            parsed = parse_html(body, content='html')

            for br in parsed.xpath('//br'):
                br.tail = '\r\n' + br.tail if br.tail else '\r\n'

            etree.strip_elements(parsed, 'br', with_tail=False)
            body = etree.tostring(parsed, encoding="unicode")

        if body and article.get('body_footer'):
            footer = article.get('body_footer')
            if article.get(FORMAT, '') == FORMATS.PRESERVED:
                body = '{}\r\n{}'.format(body, get_text(footer))
            else:
                body = '{}{}'.format(body, footer)
        return body
Example #3
    async def extractor(self, html):
        html = html.replace('<!--', '').replace('-->', '')
        loop = asyncio.get_event_loop()

        try:
            root = await loop.run_in_executor(
                None,
                partial(fromstring, html, parser=self.parser),
            )
        except (ParserError, XMLSyntaxError):
            return

        content = root.xpath(self.settings['path']['content'])
        if not content:
            await asyncio.sleep(self.settings['cooldown'])
            return

        for tag in reversed(content):
            post_id = tag.attrib.get('id')
            if not post_id:
                raise SystemExit('Post ID not found')

            links = tag.xpath(self.settings['path']['links'])
            raw_links = self.dl_links(links)
            strip_elements(tag, 'a')
            names = (el.strip()
                     for el in tag.xpath(self.settings['path']['name']))
            name = ' '.join(el for el in names if el)

            yield post_id, raw_links, name
Example #4
    def _get_wiki_content(self, url, retry=1):
        """
        makes the http request, then strips off the useless tags (table, javascript and so on; see EXCLUDE_TAGS)
        it has a 5 sec timeout and 1 retry
        :param url: input url (String)
        :return:
        """
        while retry >= 0:
            try:
                r = requests.get(url, timeout=5)
            except:
                self.logger.exception(
                    "Exception while requesting {0}".format(url))
                retry -= 1
                continue

            if r.status_code == 200:
                self.logger.info("Successful {0} status code {1}".format(
                    url, r.status_code))
                trimmed_content = re.sub('\s+', ' ', r.content)
                dom_xml = html.fromstring(trimmed_content)
                etree.strip_elements(dom_xml, self.EXCLUDE_TAGS)
                cleaned_html = html.tostring(dom_xml)
                return cleaned_html, r.url
            else:
                self.logger.warn("Failed {0} status code {1}".format(
                    url, r.status_code))
                return None, None
        return None, None
Example #5
        def crawl_link_to_index(inp):
            idx, link = inp
            print idx, link
            try:
                print link
                response = urllib.urlopen(link)

                while response.getcode() == 502:
                    time.sleep(60)
                    response = urllib.urlopen(link)
                page_content = response.read()

                tree = etree.HTML(page_content, parser=html_parser)
                etree.strip_elements(tree, 'script')
                etree.strip_tags(tree, 'script')
                text_data = "\n".join(filter(lambda chunk: chunk != '',
                                             [t.strip() for t in tree.itertext()]))

                page_title = tree.find(".//title").text

                es.index(index=index_name,
                         doc_type="page",
                         id=idx,
                         body={
                             "url": link,
                             "title": page_title,
                             "page_text": text_data
                         })
                print "-" * 10
            except Exception, e:
                print e
Example #6
def clean(text):
    from lxml import etree
    from lxml.etree import strip_elements, tostring
    tree = etree.fromstring(text, parser=etree.HTMLParser())
    code_tags = ['pre', 'code']
    strip_elements(tree, *code_tags)
    return tostring(tree, with_tail=False)
Example #7
def update_range(config, ranges):
    etree.strip_elements(config, 'range')

    downloaded = config.find('./downloaded')
    if downloaded is None:
        downloaded = etree.SubElement(config, 'downloaded')

    def add(season, start, end):
        etree.SubElement(downloaded,
                         'range',
                         season=str(season),
                         start=str(start),
                         end=str(end))

    for season, rng in ranges.items():
        rng = sorted(rng)
        end = start = rng[0]
        for i, ep in enumerate(rng[1:]):
            if ep == end + 1 and i != len(rng) - 2:
                end = ep
            elif ep == end + 1:
                end = ep
                add(season, start, end)
            else:
                add(season, start, end)
                start = end = ep
        if start == end:
            add(season, start, end)
    return config
Example #8
def replace_links(tree):
    """Replace descendent anchors with their contents.

    >>> xml = ''.join([
    ... '<p>The <a class="reference internal" href="#module-doctest" title="do',
    ... 'ctest: Test pieces of code within docstrings."><code class="xref py p',
    ... 'y-mod docutils literal"><span class="pre">doctest</span></code></a> m',
    ... 'odule searches for pieces of text that look like interactive Python s',
    ... 'essions, and then executes those sessions to verify that they work ex',
    ... 'actly as shown.  There are several common ways to use doctest:</p>'
    ... ])
    >>> root = html.fromstring(xml)
    >>> etree.tostring(replace_links(root), encoding='unicode', method='xml')
    '<p>The <c..."><s...="pre">doctest</span></code> module ... doctest:</p>'

    >>> etree.tostring(replace_links(html.fromstring(
    ... '<html>nice <a>test</a></html>')), encoding='unicode', method='xml')
    '<html><body><p>nice test</p></body></html>'

    >>> etree.tostring(replace_links(html.fromstring(
    ... '<a>test</a>')), encoding='unicode', method='xml')
    '<a>test</a>'
    """
    for a in tree.xpath('.//a'):
        if a.text:
            p = a.getparent()
            p.text = (p.text if p.text else '') + a.text
        for e in a.iterchildren():
            a.addprevious(e)
    etree.strip_elements(tree, 'a', with_tail=False)
    return tree  # Not necessary but makes chaining easier.
Example #9
def test1():
    xmlstr = """
<root><p id="sec1">
The ecstasy of discovering a new hit from screening can lead to a highly productive research effort to discover new bioactive compounds. However, in too many cases this ecstasy is followed by the agony of realizing that the compounds are not active against the desired target. Many of these false hits are Pan Assay INterference compoundS (PAINS)
<sup>
<xref ref-type="bibr" rid="ref1">1</xref>
</sup>
or colloidal aggregators.
<sup>
<xref ref-type="bibr" rid="ref2">2</xref>
</sup>
Whether the screen is conducted in silico or in the laboratory and whether screening libraries, natural products, or drugs are used, all discovery efforts that rely on some form of screening to identify bioactivity are susceptible to this phenomenon. Studies that omit critical controls against experimental artifacts caused by PAINS may waste years of research effort as useless compounds are progressed.
<sup>
<xref ref-type="bibr" rid="ref3">3</xref>
−
<xref ref-type="bibr" rid="ref8">8</xref>
</sup>
The American Chemical Society (ACS) is eager to alert the scientific community to this problem and to recommend protocols that will eliminate the publication of research articles based on compounds with artificial activity. This editorial aims to summarize relevant concepts and to set the framework by which relevant ACS journals will address this issue going forward.
</p>
</root>
"""
    root = etree.fromstring(xmlstr)
    #etree.strip_tags(root,'xref')
    etree.strip_elements(root, 'sup', with_tail=False)
    #stuff=handle_paragrap('1111',root.find('p'))
    print(etree.tostring(root, pretty_print=True))
Example #10
def hangzhou_modifier(response):
    body = lxml.html.fromstring(response)

    del_ele = body.xpath('//div[@class="MainTitle"]')
    for ele in del_ele:
        ele.clear()

    elements = re.findall('onclick="DownLoad\((.*?)\)"', response)
    node_elems = body.xpath('//div/ul/li/a[@href="javascript:void(0);"]')
    for element, node_elem in zip(elements, node_elems):
        str_list = element.replace('\'', '')
        link_str_1, link_str_2 = str_list.split(',')
        href = 'http://file.hzctc.cn/UService/DownLoadFile.aspx?dirtype=3&filepath={}&showName={}'.format(
            link_str_2, link_str_1)
        href = re.sub(r'\s+', '', href)
        node_elem.set('href', href)

    body.make_links_absolute('http://www.hzctc.cn/')
    etree.strip_elements(body, "script", "style", "title")
    for ele in body.xpath('//a[@target="_blank"]'):
        ele.set('href', 'http://app1.hzctc.cn/')

    try:
        element_2 = body.xpath('//div[@class="content"]')[0]
        content = etree.tostring(element_2, encoding='utf-8').decode('utf-8')
        #print(content.decode('utf-8'))
        data = cleaner.clean_html(content)
    except Exception as e:
        raise e
    #print(data)

    return data
Example #11
def extract_content(tree, include_tables=False):
    '''Find the main content of a page using a set of XPath expressions,
       then extract relevant elements, strip them of unwanted subparts and
       convert them'''
    sure_thing = False
    result_body = etree.Element('body')
    # iterate
    for expr in BODY_XPATH:
        # select tree if the expression has been found
        subtree = tree.xpath(expr)
        if len(subtree) == 0:
            continue
        subtree = subtree[0]
        # prune
        subtree = discard_unwanted(subtree)
        # remove elements by link density
        for elem in subtree.iter('list'):
            if link_density_test(elem) is True:
                elem.getparent().remove(elem)
                continue
            elem.attrib.clear()
            #for subelem in elem.iter('item'):
            #    subelem.attrib.clear()
        etree.strip_tags(subtree, 'a', 'link', 'span')
        # define iteration strategy
        potential_tags = set(TAG_CATALOG)  # + 'span'?
        if include_tables is True:
            potential_tags.add('table')
        # no paragraphs containing text
        if len(subtree.xpath('//p//text()')) == 0:
            potential_tags.add('div')
        LOGGER.debug(sorted(potential_tags))
        # etree.strip_tags(subtree, 'lb') # BoingBoing-Bug
        # print(html.tostring(subtree, pretty_print=True, encoding='unicode'))
        # extract content
        processed_elems = [
            handle_textelem(elem, potential_tags)
            for elem in subtree.xpath('.//*')
        ]
        result_body.extend(list(filter(None.__ne__, processed_elems)))
        # exit the loop if the result has children
        if len(result_body) > 0:
            sure_thing = True
            LOGGER.debug(expr)
            break
    # try parsing wild <p> elements if nothing found or text too short
    temp_text = trim(' '.join(result_body.itertext()))
    len_text = len(temp_text)
    if len(result_body) == 0 or len_text < MIN_EXTRACTED_SIZE:
        result_body = recover_wild_paragraphs(tree, result_body)
        #search_tree = discard_unwanted(tree)
        #search_tree = prune_html(search_tree)
        #result_body, _, _ = baseline(search_tree)
        temp_text = trim(' '.join(result_body.itertext()))
        len_text = len(temp_text)
    # filter output
    etree.strip_elements(result_body, 'done')
    etree.strip_tags(result_body, 'div')
    # return
    return result_body, temp_text, len_text, sure_thing
Example #12
    def crawl_proxyGoubanjia(self):
        """
        Fetch proxies from http://www.goubanjia.com/
        :return:
        """
        start_url = "http://www.goubanjia.com/"
        html = get_page(start_url)

        # root = etree.strip_elements()
        # print(html)
        if html:
            htmlEle = etree.HTML(html, etree.HTMLParser())
            result = htmlEle.xpath("//td[@class='ip']")
            # print(result)
            # print(len(result))

            for td in result:
                # p = td.xpath(".//p[@style='display:none;']")
                # p1 = td.xpath(".//p[@style='display: none;']")
                etree.strip_elements(td, 'p')
                # print(etree.tostring(td))
                # print(len(p))
                # print(len(p1))

                text = td.xpath(".//text()")
                # print(text)
                # print(''.join(text))
                yield ''.join(text)
Example #13
    def update_definitions(self):
        """
        Replace the existing Definition elements with a fresh set.

        A sequence of changes is recorded in self.changes.
        The position for the inserts is determined by walking past
        all of the elements which precede the Definition elements.
        Then the sequence of Definition nodes to be inserted is
        reversed, so we can perform all of the insertions using
        the same position.
        """

        vals = self.Values(self.root, "Definition/DefinitionText")
        if vals.dups:
            what = "definition" + (vals.dups > 1 and "s" or "")
            self.changes.add("%d duplicate %s eliminated" % (vals.dups, what))
        nodes = []
        for d in self.concept.definitions:
            if d.source == 'NCI':
                key = Concept.normalize(d.text)
                if key not in vals.used:
                    status = "Reviewed"
                    original = vals.original.get(key)
                    if original is None:
                        status = "Unreviewed"
                    elif original != d.text:
                        status = "Unreviewed"
                        vals.updated += 1
                    nodes.append(d.convert(status))
                    vals.used.add(key)
        etree.strip_elements(self.root, "Definition")
        position = self.find_position(Concept.Definition.SKIP)
        for node in reversed(nodes):
            self.root.insert(position, node)
        vals.record_definition_changes(self.changes)
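A short illustrative sketch, assuming nothing beyond lxml itself, of the reversed-insert trick described in the docstring above: inserting the new nodes in reverse order at one fixed index leaves them in their original order.

from lxml import etree

root = etree.fromstring('<root><keep/></root>')
nodes = [etree.Element('Definition', n=str(i)) for i in range(3)]
position = 1  # fixed position, right after <keep/>
for node in reversed(nodes):
    root.insert(position, node)
# -> <root><keep/><Definition n="0"/><Definition n="1"/><Definition n="2"/></root>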
Example #14
def extract_page(local_url, remote_url):
    response = requests.get(local_url)
    response.raise_for_status()
    response.encoding = "utf-8"

    document = lxml.html.fromstring(response.content)

    for element in ("script", "style"):
        etree.strip_elements(document, element)

    page_results = []
    for section in document.xpath("//div[contains(@class, 'section')]"):
        if "expandjson" in section.attrib["class"]:
            continue

        text, section_id = extract_section(section)

        title = document.xpath("//title/text()")[0].split("—")[0].strip()
        section_title = section.xpath(
            "h1|h2|h3|h4|h5")[0].text_content().rstrip("¶")

        if title != section_title:
            title = f"{title} - {section_title}"

        page_results.append({
            "url": f"{remote_url}#{section_id}",
            "text": text,
            "title": title,
        })

    href = document.xpath("//a[@accesskey='n']/@href")
    if href:
        href = href[0]

    return page_results, href
Example #15
def fuyang_modifier(response):
    body = lxml.html.fromstring(response)
    body.make_links_absolute('http://www.hzfyggzy.org.cn/')
    etree.strip_elements(body, "script", "style", "title", 'iframe')
    del_ele = body.xpath('//div[@class="MainTitle"]')
    for ele in del_ele:
        ele.clear()
    elements = re.findall('onclick="DownLoad\((.*?)\)"', response)
    node_elems = body.xpath('//div/ul/li/a[@href="javascript:void(0);"]')
    for element, node_elem in zip(elements, node_elems):
        str_list = element.replace('\'', '')
        link_str_1, link_str_2 = str_list.split(',')
        href = 'http://218.108.176.14:8002/DService/UpLoadFiles/ProAfficheAccessory/{}'.format(
            link_str_2)
        href = re.sub(r'\s+', '', href)
        node_elem.set('href', href)
    for ele in body.xpath('//a[@target="_blank"]'):
        ele.set('title', '投标人平台用户请登录投标人平台下载,其他用户请到交易中心窗口领取')
    try:
        element = body.xpath('//div[@class="content"]')[0]
    except Exception as e:
        raise e
    content = etree.tostring(element, encoding='utf-8')
    #print(content.decode('utf-8'))
    data = content.decode('utf-8')
    return data
Example #16
def processData(baseURL, url, tree):
    # TODO: Determine if all we did was replace existing data this round to abort early
    # Collect all entries
    entries = []
    for entry in tree.findall('.//item'):
        # Collect ID
        id = entry.find('guid') # optional
        if id is None:
            id = entry.find('link') # mandatory
            id = 'link:' + id.text
        else:
            id = 'guid:' + id.text
        # Collect date - we require, but RSS makes optional
        date = entry.find('pubDate')
        if date is None:
            scraperwiki.util.log("*** URL: %s reports items without pubDate - considering failure" % baseURL)
            return False

        date = dateutil.parser.parse(date.text)
        data = etree.tostring(entry)
        entries.append({"baseURL":baseURL, "id":id, "date":date,"data":data})

    scraperwiki.sqlite.save(unique_keys = ["baseURL", "id"], data = entries, table_name = "rss_item")

    if baseURL == url:
        # Strip and preserve the root data
        etree.strip_elements(tree, 'item')
        data = etree.tostring(tree)
        entry = {"baseURL":baseURL, "data":data}
        scraperwiki.sqlite.save(unique_keys = ["baseURL"], data = [entry], table_name = "rss_root")

    return True
Example #17
def test4():
    xmlstr = """
<root>
	<p>test before list 1
		<list list-type="simple" id="l1">
			<list-item><p>item 1.1</p></list-item>
			<list-item><p>item 1.2</p></list-item>
		</list>text after list 1 or before list 2
		<list list-type="simple" id="l2">
			<list-item><p>item 2.1</p></list-item>
			<list-item><p>item 2.2</p></list-item>
		</list></p>text after para
</root>
"""
    root = etree.fromstring(xmlstr)
    etree.strip_tags(root, 'xref')
    etree.strip_elements(root, 'xref', with_tail=True)
    p = root.find('p')
    result = []
    result.append(clean_string(p.text))
    for l in p:
        for li in l:
            result.append(get_clean_text(li))
        result.append(clean_string(l.tail))
    result.append(clean_string(p.tail))
    print(result)
    print
    n = root.find(('p/list'))
    n.getparent().remove(n)
    print(etree.tostring(root, pretty_print=True))
Example #18
def parseQuestionContentToList(body,title):
	root = etree.HTML(body)
	etree.strip_elements(root,'code',with_tail=False)
	etree.strip_tags(root,'*')
	nonPunct = re.compile('.*[A-Za-z0-9].*')
	text = str(etree.tostring(root,pretty_print = True)[10:-11])[1:].lower()\
	.replace('\\n',' ')\
	.replace("\\",'')\
	.replace("?",' ')
	title = title.lower().replace("?"," ")
	text += " " + title
	tokens = nltk.word_tokenize(text)
	filtered = [w for w in tokens if nonPunct.match(w)]
	#get rid of the punctuation that got left around the words
	for word in filtered:
		front = 0
		back = 0
		for letter in word:
			if letter not in string.punctuation:
				break
			front += 1
		for letter in reversed(word):
			if letter not in string.punctuation:
				break
			back -= 1
		if back == 0 :
			back = None
		word  = word[front:back]

	return filtered
Example #19
    def _parse_tweet_text(self, text_element, tweet):
           
        #hacky way to include Emojis
        for emoj in text_element.cssselect('img.Emoji'):
            emoj.tail = emoj.get('alt') + emoj.tail if emoj.tail else emoj.get('alt')
        
        #Modify Urls so they are correct
        for url in text_element.cssselect('a.twitter-timeline-link'):
            is_truncated = u'\u2026' in url.text_content()

            url_disp = url.cssselect('span.js-display-url')
            if len(url_disp) > 0:
                url_disp_text =  url_disp[0].text_content()
                if is_truncated:
                    url_disp_text = url_disp_text + u'\u2026'
                url.attrib['xtract-display-url'] = url_disp_text # store for later extraction
            elif 'pic.twitter.com' in url.text:
                url.attrib['xtract-display-url'] = url.text
            strip_elements(url, ['*'])      
            url.text = url.attrib['href']

        tmp = str(text_element.text_content())
        for m in re.finditer(r'(?<!\s)(?<!\\n)(http|https)://', tmp): #add a space before urls where required
            tmp = tmp[:m.start()] + ' ' + tmp[m.start():]

        tweet['text'] = tmp
Example #20
 def parse_xml(self, filename, use_objectify=False, elements=None, tags=None):
     """
     Parse and clean the supplied file by removing any elements or tags we don't use.
     :param filename: The filename of the xml file to parse. Str
     :param use_objectify: Use the objectify parser rather than the etree parser. (Bool)
     :param elements: A tuple of element names (Str) to remove along with their content.
     :param tags: A tuple of element names (Str) to remove, preserving their content.
     :return: The root element of the xml document
     """
     try:
         with open(filename, 'rb') as import_file:
             # NOTE: We don't need to do any of the normal encoding detection here, because lxml does its own
             # encoding detection, and the two mechanisms together interfere with each other.
             if not use_objectify:
                 tree = etree.parse(import_file, parser=etree.XMLParser(recover=True))
             else:
                 tree = objectify.parse(import_file, parser=objectify.makeparser(recover=True))
             if elements or tags:
                 self.wizard.increment_progress_bar(
                     translate('BiblesPlugin.OsisImport', 'Removing unused tags (this may take a few minutes)...'))
             if elements:
                 # Strip tags we don't use - remove content
                 etree.strip_elements(tree, elements, with_tail=False)
             if tags:
                 # Strip tags we don't use - keep content
                 etree.strip_tags(tree, tags)
             return tree.getroot()
     except OSError as e:
         self.log_exception('Opening {file_name} failed.'.format(file_name=e.filename))
         critical_error_message_box(
             title='An Error Occured When Opening A File',
             message='The following error occurred when trying to open\n{file_name}:\n\n{error}'
             .format(file_name=e.filename, error=e.strerror))
     return None
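A small sketch, with made-up markup, of the elements/tags distinction that the docstring above relies on: strip_elements drops the named elements with their content, while strip_tags drops only the tags and keeps the content in place.

from lxml import etree

root = etree.fromstring('<verse>In the <note>gloss</note> beginning</verse>')
etree.strip_elements(root, 'note', with_tail=False)
print(etree.tostring(root))  # b'<verse>In the  beginning</verse>'

root = etree.fromstring('<verse>In the <note>gloss</note> beginning</verse>')
etree.strip_tags(root, 'note')
print(etree.tostring(root))  # b'<verse>In the gloss beginning</verse>'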
Example #21
def _strip_elements_from_node(node, omit_list):
    node_stripped = copy.deepcopy(node)
    for tag in omit_list:
        etree.strip_elements(node_stripped,
                             tag,
                             with_tail=False)
    return node_stripped
Example #22
    def interpolate_wiki_links(self, elem):
        if len(elem) > 0:
            elem_copy = copy.deepcopy(elem)

            for child in elem_copy:
                if child.tag == "page":
                    page = child.text_content().strip()
                    link = "/gmod/" + page.replace(" ", "%20")
                    if "text" in child.attrib:
                        link_text = "[" + child.attrib["text"].strip(
                        ) + "](" + link + ")"
                    elif page.startswith("Enums/"):
                        link_text = "[" + page[len("Enums/"
                                                   ):] + "](" + link + ")"
                    elif page in self.LINKS:
                        link_text = "[" + self.LINKS[page] + "](" + link + ")"
                    else:
                        link_text = "[" + page + "](" + link + ")"

                    child.tail = link_text + (child.tail or '')

            strip_elements(elem_copy, "*", with_tail=False)

            text = elem_copy.text_content()
        else:
            text = elem.text_content()

        text = text.strip()
        if len(text) > 0:
            return text
Example #23
    def get_lyrics(self):
        element = self.element

        # Replace <br> tags with \n (prepend it with \n and then remove all
        # occurrences of <br>)
        for br in element.cssselect('br'):
            br.tail = '\n' + br.tail if br.tail else '\n'
        etree.strip_elements(element, 'br', with_tail=False)

        # Remove unneeded tags
        bad_tags = element.cssselect('.rtMatcher') + \
            element.cssselect('.lyricsbreak')
        for tag in bad_tags:
            tag.drop_tree()

        # Remove HTML comments
        real_string = etree.tostring(element, encoding=unicode)
        cleaned_html = clean_html(real_string)

        # -KMS Modification-
        # Add try/except block to prevent script from crashing when
        # run from applescript
        try:
            print u'{0}'.format(
                html.fragment_fromstring(cleaned_html).text_content()
            ).encode('utf-8').strip()
        except UnicodeError:
            print u'{0}'.format(
                html.fragment_fromstring(cleaned_html).text_content()
            ).encode('utf-8').strip()
        return 0
Example #24
    def append_body_footer(self, article):
        """
        Checks if the article has any Public Service Announcements and if available appends each of them to the body.

        :return: body with public service announcements.
        """
        try:
            article["body_html"] = article["body_html"].replace("<br>", "<br/>")
        except KeyError:
            pass

        body = ""
        if article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
            body = article.get("body_html", "")
        elif article[ITEM_TYPE] in [CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO]:
            body = article.get("description", "")

        if body and article.get(FORMAT, "") == FORMATS.PRESERVED:
            body = body.replace("\n", "\r\n").replace("\r\r", "\r")
            parsed = parse_html(body, content="html")

            for br in parsed.xpath("//br"):
                br.tail = "\r\n" + br.tail if br.tail else "\r\n"

            etree.strip_elements(parsed, "br", with_tail=False)
            body = etree.tostring(parsed, encoding="unicode")

        if body and article.get("body_footer"):
            footer = article.get("body_footer")
            if article.get(FORMAT, "") == FORMATS.PRESERVED:
                body = "{}\r\n{}".format(body, get_text(footer))
            else:
                body = "{}{}".format(body, footer)
        return body
Example #25
            def crawl_link_to_index(inp):
                idx, link = inp
                print idx, link                
                try:
                    print link
                    response= urllib.urlopen(link)
                    
                    while response.getcode()==502:
                        time.sleep(60)
                        response= urllib.urlopen(link)
                    page_content = response.read()

                
                    tree = etree.HTML(page_content, parser=html_parser)
                    etree.strip_elements(tree, 'script')
                    etree.strip_tags(tree, 'script')
                    text_data = "\n".join(filter(lambda chunk: chunk != '',
                                [t.strip() for t in tree.itertext()]))
                
                    page_title = tree.find(".//title").text
                   
                    es.index(index = index_name,
                             doc_type = "page",
                             id = idx,
                             body = {
                                 "url": link,
                                 "title": page_title,
                                 "page_text": text_data
                             })
                    print "-" * 10
                except Exception, e:
                    print e
Example #26
    def get_lyrics(self):
        response = requests.get(self.url)
        page_html =  html.document_fromstring(response.text)
        element = page_html.cssselect(self.CSS_SELECTOR)[0]

        # Replace <br> tags with \n (prepend it with \n and then remove all
        # occurrences of <br>)
        for br in element.cssselect('br'):
            br.tail = '\n' + br.tail if br.tail else '\n'
        etree.strip_elements(element, 'br', with_tail=False)

        # Remove unneeded tags
        bad_tags = element.cssselect('.rtMatcher') + \
            element.cssselect('.lyricsbreak')
        for tag in bad_tags:
            tag.drop_tree()

        # Remove HTML comments
        real_string = etree.tostring(element, encoding=unicode)
        cleaned_html = clean_html(real_string)

        info_output = format_song_info(self.json['artist'], self.json['song'])
        lyric_output = html.fragment_fromstring(cleaned_html).text_content()

        return u'{}{}'.format(info_output, lyric_output)
Example #27
def longwan_modifier(response,url):
    body = lxml.html.fromstring(response)
    body.make_links_absolute('http://61.164.128.8:6081/')
    etree.strip_elements(body,"script","style","title",'iframe')

    try:
        element = body.xpath('//div[@class="Content-Main FloatL"]')[0]
    except Exception as e:
        raise e
    check_table = None
    for table in element.xpath('.//table'):
        check_str = ''.join(table.xpath('.//text()'))
        if '下载' in check_str:
            check_table = table
        else:
            table.clear()
    if check_table is not None:
        cid = re.findall(r'/(\d+)\.htm',url)[0]
        n = len(check_table.xpath('.//tr')) - 1
        attachment_url = 'http://61.164.128.8:6081/lwcms/attachment_url.jspx?cid={}&n={}'.format(cid,n)
        try:
            attachment_urls = requests.get(attachment_url,headers=HEADERS)
        except Exception as e:
            raise e
        #print(type(attachment_urls.text))
        i = 0
    
        for td_ele,href in zip(check_table.xpath('.//tr//td//a[@title="文件下载"]'),eval(attachment_urls.text)):
            final_href = "http://61.164.128.8:6081/lwcms/attachment.jspx?cid={}&i={}{}".format(cid,i,href)
            td_ele.set('href',final_href)
            i +=1
    content = etree.tostring(element,encoding='utf-8')
    #print(content.decode('utf-8'))
    data = content.decode('utf-8')
    return data
Example #28
    def get_lyrics(self):
        response = requests.get(self.url)
        page_html = html.document_fromstring(response.text)
        element = page_html.cssselect(self.CSS_SELECTOR)[0]

        # Replace <br> tags with \n (prepend it with \n and then remove all
        # occurrences of <br>)
        for br in element.cssselect('br'):
            br.tail = '\n' + br.tail if br.tail else '\n'
        etree.strip_elements(element, 'br', with_tail=False)

        # Remove unneeded tags
        bad_tags = element.cssselect('.rtMatcher') + \
            element.cssselect('.lyricsbreak')
        for tag in bad_tags:
            tag.drop_tree()

        # Remove HTML comments
        real_string = etree.tostring(element, encoding="UTF-8")
        cleaned_html = clean_html(real_string)

        info_output = format_song_info(self.json['artist'], self.json['song'])
        lyric_output = html.fragment_fromstring(cleaned_html).text_content()

        return u'{}{}'.format(info_output, lyric_output)
Example #29
def __merge_runs(p):
    while True:
        cont = False
        for run in p.iterchildren(W + 'r'):
            last_run = run.getprevious()
            if last_run is None or last_run.tag != W + 'r':
                continue
            run_props = __get_first_child(run, W + 'rPr')
            last_run_props = __get_first_child(last_run, W + 'rPr')
            if (run_props is None and last_run_props is not None) or (
                    run_props is not None and last_run_props is None):
                continue
            if (run_props is None
                    and last_run_props is None) or (etree.tostring(
                        run_props, encoding='utf-8',
                        with_tail=False) == etree.tostring(last_run_props,
                                                           encoding='utf-8',
                                                           with_tail=False)):
                last_wt = __get_first_child(last_run, W + 't')
                wt = __get_first_child(run, W + 't')
                if last_wt is not None and wt is not None:
                    last_wt.text += wt.text or ''
                    if len(last_wt.text) > 0 and (last_wt.text[0] == ' '
                                                  or last_wt.text[-1] == ' '):
                        last_wt.set(XML + 'space', 'preserve')
                    run.tag = 'TO_BE_REMOVED'
                    cont = True
        etree.strip_elements(p, 'TO_BE_REMOVED')
        if not cont:
            break
Example #30
    def xml(self):
        """Filtered and stripped serialized document."""

        if not hasattr(self, "_xml"):
            try:
                xml = etree.tostring(self.doc.resolved, encoding="utf-8")
                parser = etree.XMLParser(remove_blank_text=True)
                root = etree.fromstring(xml, parser)
                first = True
                for node in root.findall("SummaryMetaData/MainTopics"):
                    if first:
                        first = False
                    else:
                        parent = node.getparent()
                        parent.remove(node)
                for node in root.xpath(self.CHANGES):
                    parent = node.getparent()
                    parent.remove(node)
                etree.strip_elements(root, with_tail=False, *self.STRIP)
                etree.strip_attributes(root, "PdqKey")
                opts = dict(pretty_print=True, encoding="unicode")
                self._xml = etree.tostring(root, **opts)
            except:
                logger.exception("failure processing XML")
                bail("failure processing XML")
        return self._xml
Example #31
    def inject_chapter_metadata(self, bits_xml, chapter, chapter_settings, submission, custom_meta=None):
        """
        Generates the metadata for the chapter

        :param custom_meta: Dict containing entries which will be added as <custom-meta> tags.
        :param bits_xml: ElementTree object containing bits2 meta-data for a submission chapter.
        :param chapter: Chapter row object.
        :param chapter_settings: OMPSettings object containing the chapter settings
        :param submission: Submission row object, to which the chapter belongs.
        :return: Updated ElementTree object with new metadata from OMP db.
        """
        chapter_no = chapter.chapter_seq + 1
        book_part_xml = bits_xml.xpath('/book-part')[0]
        book_part_xml.set('id', 'b{}_ch_{}'.format(submission.submission_id, chapter_no))
        book_part_xml.set('seq', str(chapter_no))
        book_part_xml.set(LANG_ATTR, submission.locale[:2])
        # TODO How to distinguish other types?
        book_part_xml.set('book-part-type', 'chapter')
        book_part_meta_xml = book_part_xml.xpath('book-part-meta')[0]
        book_part_meta_xml.xpath('title-group/title')[0].text = chapter_settings.getLocalizedValue(
            'title', submission.locale)
        contrib_group_xml = book_part_meta_xml.xpath('contrib-group')[0]
        for contrib in self.dal.getAuthorsByChapter(chapter.chapter_id):
            contrib_group_xml.append(self.build_contrib_xml(contrib, contrib_group_xml, submission.locale))
        if custom_meta:
            custom_meta_group_xml = book_part_meta_xml.xpath('custom-meta-group')[0]
            # Clear old custom-meta tags
            etree.strip_elements(custom_meta_group_xml, 'custom-meta')
            for meta_name, meta_value in list(custom_meta.items()):
                custom_meta_xml = etree.SubElement(custom_meta_group_xml, 'custom-meta', {'specific-use': meta_name})
                etree.SubElement(custom_meta_xml, 'meta-name').text = meta_name
                etree.SubElement(custom_meta_xml, 'meta-value').text = meta_value
        return bits_xml
Example #32
    def insert_submenus(self, submenus):
        """Insère les sous-menus dans l'arborescence existante.
        """

        # Clear the current submenus, if any (levels 1 and 2)
        if self.menu[self.cursor[-1]].find('Submenu') is not None:
            etree.strip_elements(self.menu[self.cursor[-1]], 'Submenu')

        # Populate the new submenu (level 1)
        etree.SubElement(self.menu[self.cursor[-1]], 'Submenu')
        for menu in submenus:

            # Create the submenu (level 2)
            #logging.debug(u'menu = ({}, {}, {}, {})'.format(menu[0], menu[1], menu[2], menu[3]))
            etree.SubElement(self.menu[self.cursor[-1]].find('Submenu'), menu[0])

            # Name the submenu (level 2)
            if menu[1] is not None:
                etree.SubElement(self.menu[self.cursor[-1]].find('Submenu').find(menu[0]), 'Title')

                try:
                    self.menu[self.cursor[-1]].find('Submenu').find(menu[0]).find('Title').text = menu[1]

                except ValueError:
                    # All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
                    self.menu[self.cursor[-1]].find('Submenu').find(menu[0]).find('Title').text = u"Caractère(s) invalide(s)"

            # Set the submenu's action and command (level 2)
            if menu[2] is not None and menu[3] is not None:
                etree.SubElement(self.menu[self.cursor[-1]].find('Submenu').find(menu[0]), menu[2])
                self.menu[self.cursor[-1]].find('Submenu').find(menu[0]).find(menu[2]).text = menu[3]
Example #33
    def append_body_footer(self, article):
        """
        Checks if the article has any Public Service Announcements and if available appends each of them to the body.

        :return: body with public service announcements.
        """
        try:
            article['body_html'] = article['body_html'].replace('<br>', '<br/>')
        except KeyError:
            pass

        body = ''
        if article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
            body = article.get('body_html', '')
        elif article[ITEM_TYPE] in [CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO]:
            body = article.get('description', '')

        if body and article.get(FORMAT, '') == FORMATS.PRESERVED:
            body = body.replace('\n', '\r\n').replace('\r\r', '\r')
            parsed = parse_html(body, content='html')

            for br in parsed.xpath('//br'):
                br.tail = '\r\n' + br.tail if br.tail else '\r\n'

            etree.strip_elements(parsed, 'br', with_tail=False)
            body = etree.tostring(parsed, encoding="unicode")

        if body and article.get('body_footer'):
            footer = article.get('body_footer')
            if article.get(FORMAT, '') == FORMATS.PRESERVED:
                body = '{}\r\n{}'.format(body, get_text(footer))
            else:
                body = '{}{}'.format(body, footer)
        return body
Example #34
def __merge_runs(p):
    while True:
        cont = False
        for run in p.iterchildren(A + 'r'):
            last_run = run.getprevious()
            if last_run is None or last_run.tag != A + 'r':
                continue
            run_props = __get_first_child(run, A + 'rPr')
            last_run_props = __get_first_child(last_run, A + 'rPr')
            if (run_props is None and last_run_props is not None) or (run_props is not None and last_run_props is None):
                continue
            if (run_props is None and last_run_props is None) or (
                        etree.tostring(run_props, encoding='utf-8', with_tail=False) == etree.tostring(last_run_props,
                                                                                                       encoding='utf-8',
                                                                                                       with_tail=False)):
                last_wt = __get_first_child(last_run, A + 't')
                wt = __get_first_child(run, A + 't')
                if last_wt is not None and wt is not None:
                    last_wt.text += wt.text or ''
                    if len(last_wt.text) > 0 and (last_wt.text[0] == ' ' or last_wt.text[-1] == ' '):
                        last_wt.set(XML + 'space', 'preserve')
                    run.tag = 'TO_BE_REMOVED'
                    cont = True
        etree.strip_elements(p, 'TO_BE_REMOVED')
        if not cont:
            break
Example #35
def getPage(id):
    url = "http://www.digitalspy.co.uk/news/a%s" % id
    try:
        html = urllib2.urlopen(url).read()
    except IOError:
        print "Skipping: %s" % url
        return None
    html = clean_html(html)
    html = BREAKS.sub("\n", html)
    doc = fromstring(html)
    article = doc.cssselect("div.article_body")[0]
    for image in article.xpath(
            '//div[@class="image"]|//div[@class="imgcaption"]'):
        image.getparent().remove(image)
    strip_elements(article, 'img')
    return {
        'url':
        url,
        'title':
        doc.cssselect("div.article_header h1")[0].text_content().encode(
            'utf-8'),
        'published':
        doc.cssselect("span.time")[0].text_content().encode('utf-8'),
        'authors':
        ",".join(editor.text_content().encode('utf-8')
                 for editor in doc.cssselect("span.editors a")),
        'text':
        article.text_content().strip().encode('utf-8')
    }
Example #36
def clean_image_block(block_tree):
    """ Cleans up an image block to assure that
        it has the correct structure.
    """
    image = None
    img_wrapper = None
    caption = None
    image_found = False
    caption_found = False

    ## We get all the block descendants using lxml (should be "depth-first")
    ## in order to get image and caption elements, if any.
    for des in block_tree.iterdescendants():
        ## We only take the first img element found.
        if des.tag == 'img' and not image_found:
            image_found = True
            ## We set the image element.
            image = des
            ## If the img element is wrapped by a link
            ## we set the image_wrapper too.
            if des.getparent().tag == 'a':
                img_wrapper = des.getparent()
                ## If the class has been modified we put the correct one.
                img_wrapper.attrib['class'] = 'image-link'

        ## We only take the first span element (caption) found.
        if des.tag == 'span' and not caption_found:
            caption_found = True
            ## We set the caption element.
            caption = des
            ## If the class has been modified we put the correct one.
            caption.attrib['class'] = 'image-caption'

    ## If the image block has no image inside
    ## then it's invalid and we remove it.
    if image is None:
        block_tree.tag = 'invalid_image_block'
        etree.strip_elements(block_tree, 'invalid_image_block')
        return

    ## Sanitazing the caption, we strip out every element inside the span
    ## preserving the content and thus all the texts present.
    if caption is not None:
        etree.strip_tags(caption, '*')

    ## We go through the descendants again to mark invalid elements.
    for des in block_tree.iterdescendants():
        ## Invalid elements are all those elements which are neither the image
        ## nor the caption, nor the image_wrapper.
        if des is image or des is img_wrapper or des is caption:
            continue
        ## We remove invalid tags texts.
        des.text = ''
        ## We mark invalid tags for removal.
        des.tag = 'tag_to_be_stripped_out'

    ## We finally strip out tags marked as invalid
    ## now the image block should have the correct structure.
    etree.strip_tags(block_tree, 'tag_to_be_stripped_out')
Example #37
def extract_content(tree, include_tables=False, deduplicate=False, config=None):
    '''Find the main content of a page using a set of XPath expressions,
       then extract relevant elements, strip them of unwanted subparts and
       convert them'''
    sure_thing = False
    result_body = etree.Element('body')
    # iterate
    for expr in BODY_XPATH:
        # select tree if the expression has been found
        subtree = tree.xpath(expr)
        if not subtree:
            continue
        subtree = subtree[0]
        # prune
        subtree = discard_unwanted(subtree)
        # remove elements by link density
        subtree = delete_by_link_density(subtree, 'div', backtracking=True)
        subtree = delete_by_link_density(subtree, 'list', backtracking=False)
        subtree = delete_by_link_density(subtree, 'p', backtracking=False)
        # define iteration strategy
        potential_tags = set(TAG_CATALOG)  # + 'span'?
        if include_tables is True:
            potential_tags.add('table')
            for elem in subtree.iter('table'):
                if link_density_test_tables(elem) is True:
                    elem.getparent().remove(elem)
        # skip if empty tree
        if len(subtree) == 0:
            continue
        # no paragraphs containing text
        if not subtree.xpath('//p//text()'):
            potential_tags.add('div')
        LOGGER.debug(sorted(potential_tags))
        etree.strip_tags(subtree, 'link', 'span') # 'a',
        # etree.strip_tags(subtree, 'lb') # BoingBoing-Bug
        # extract content
        # list(filter(None.__ne__, processed_elems))
        result_body.extend([e for e in
                            [handle_textelem(e, potential_tags, deduplicate, config) for e in subtree.xpath('.//*')]
                            if e is not None])
        # remove trailing titles
        while len(result_body) > 0 and result_body[-1].tag in ('fw', 'head'): # and result_body[-1].tail is None:
            result_body[-1].getparent().remove(result_body[-1])
        # exit the loop if the result has children
        if len(result_body) > 1: # try to change this to 0 or 2
            LOGGER.debug(expr)
            break
    temp_text = trim(' '.join(result_body.itertext()))
    # try parsing wild <p> elements if nothing found or text too short
    if len(result_body) == 0 or len(temp_text) < config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE'):
        result_body = recover_wild_paragraphs(tree, result_body, deduplicate=deduplicate, config=config)
        temp_text = trim(' '.join(result_body.itertext()))
    else:
        sure_thing = True
    # filter output
    etree.strip_elements(result_body, 'done')
    etree.strip_tags(result_body, 'div')
    # return
    return result_body, temp_text, len(temp_text), sure_thing
Example #38
def remove_elements(tei, params):
    namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    for param in params.items():
        if params[param[0]] == False:
            etree.strip_elements(tei,
                                 "{http://www.tei-c.org/ns/1.0}" + param[0],
                                 with_tail=False)
    return tei
Example #39
 def internal_lemmas(self):
     """
     Return a list of nodes for lemmas (<lm>) within the block
     (excluding lemVersions)
     """
     node_stripped = copy.deepcopy(self.node)
     etree.strip_elements(node_stripped, 'lemVersions')
     return node_stripped.findall('.//lm')
Example #40
 def htmlTree(self):
     tree = None
     text = self.selectedHtml()
     unitext = unicode(text)
     if unitext != '':
       tree = lxml.html.fromstring(unitext)
       etree.strip_elements(tree, 'rt')
       etree.strip_tags(tree, 'ruby', 'rb')
     return tree
Example #41
    def parse_body_text(self, response):

        root = lh.fromstring(response.body)
        le.strip_elements(root, le.Comment, 'script', 'head', 'a')
        fulltext = lh.tostring(root, method="text", encoding=unicode)
        fulltext = fulltext.strip().replace('\n', '')
        fulltext = re.sub(r'\s+', ' ', fulltext)

        yield fulltext
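A tiny sketch, not from the source, of the comment-stripping call used above: lxml accepts the etree.Comment factory alongside ordinary tag names, so comment nodes are removed together with the listed elements. The markup below is invented for illustration.

from lxml import etree

root = etree.fromstring('<p>keep<!-- drop me --><script>x()</script></p>')
etree.strip_elements(root, etree.Comment, 'script', with_tail=False)
print(etree.tostring(root))  # b'<p>keep</p>'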
Example #42
def xml_to_articles(filepath):
    """
    Parses an xml and returns plain text versions of the individual chapters (i.e. articles, reviews, ...)
    in the file. Only files are let through that are
        - recognized as 'nl', with a probability above .30.
        - longer than 200 characters
    """

    articles = []

    xml_str = codecs.open(os.path.abspath(filepath), 'r', 'utf8').read()
    xml_str = unicode(BeautifulSoup(xml_str)) # remove entities from the xml

    # get rid of nasty pagebreak (pb), breaking up tokens across pages:
    xml_str = re.sub(nasty_pagebreaks, '', xml_str)

    # attempt to parse the tree:
    try:
        tree = etree.fromstring(xml_str)
    except etree.XMLSyntaxError:
        return None

    # remove cf- and note-elements (don't contain actual text):
    for element in tree.xpath(".//cf"):
        element.getparent().remove(element)
    
    # individual articles etc. are represented as div's which have the type-attribute set to 'chapter':
    chapter_nodes = [node for node in tree.findall('.//div')
                        if node.attrib and \
                           'type' in node.attrib and \
                           node.attrib['type'] == 'chapter']

    for chapter_node in chapter_nodes:
        # all text in the articles is contained under p-elements:
        article_text = ""
        for p_node in chapter_node.findall('.//p'):
            # remove elements that contain meta text (note that we exclude all notes!)
            for tag_name in ('note', 'figure', 'table'):
                etree.strip_elements(p_node, tag_name, with_tail=False)

            # collect the actual text:
            p_text = "".join(p_node.itertext())

            # add the article (and add some whitespace to be safe):
            article_text += p_text+" "

        # collapse all whitespace to single spaces:
        article_text = re.sub(whitespace, ' ', article_text).strip()

        if len(article_text) > 500:
            if detect(article_text) == 'nl':
                articles.append(article_text)
            #else:
            #    print(article_text[:200])
            #    print(detect(article_text))

    return articles
Example #43
    def parse_body_text(self, response):

        root = lh.fromstring(response.body)
        le.strip_elements(root, le.Comment, 'script', 'head', 'a')
        fulltext = lh.tostring(root, method="text", encoding=unicode)
        fulltext = fulltext.strip().replace('\n', '')
        fulltext = re.sub(r'\s+', ' ', fulltext)

        yield fulltext
Example #44
def xml_to_articles(filepath):
    """
    Parses an xml and returns plain text versions of the individual chapters (i.e. articles, reviews, ...)
    in the file. Only files are let through that are
        - recognized as 'nl', with a probability above .30.
        - longer than 200 characters
    """

    articles = []

    xml_str = codecs.open(os.path.abspath(filepath), 'r', 'utf8').read()
    xml_str = unicode(BeautifulSoup(xml_str))  # remove entities from the xml

    # get rid of nasty pagebreak (pb), breaking up tokens across pages:
    xml_str = re.sub(nasty_pagebreaks, '', xml_str)

    # attempt to parse the tree:
    try:
        tree = etree.fromstring(xml_str)
    except etree.XMLSyntaxError:
        return None

    # remove cf- and note-elements (don't contain actual text):
    for element in tree.xpath(".//cf"):
        element.getparent().remove(element)

    # individual articles etc. are represented as div's which have the type-attribute set to 'chapter':
    chapter_nodes = [node for node in tree.findall('.//div')
                        if node.attrib and \
                           'type' in node.attrib and \
                           node.attrib['type'] == 'chapter']

    for chapter_node in chapter_nodes:
        # all text in the articles is contained under p-elements:
        article_text = ""
        for p_node in chapter_node.findall('.//p'):
            # remove elements that contain meta text (note that we exclude all notes!)
            for tag_name in ('note', 'figure', 'table'):
                etree.strip_elements(p_node, tag_name, with_tail=False)

            # collect the actual text:
            p_text = "".join(p_node.itertext())

            # add the article (and add some whitespace to be safe):
            article_text += p_text + " "

        # collapse all whitespace to single spaces:
        article_text = re.sub(whitespace, ' ', article_text).strip()

        if len(article_text) > 500:
            if detect(article_text) == 'nl':
                articles.append(article_text)
            #else:
            #    print(article_text[:200])
            #    print(detect(article_text))

    return articles
Example #45
def tei5reader_fulldocs(inpath, outfolder):
    """Script for reading selected text from TEI P5 files."""
    print("\nLaunched tei5reader_fulldocs.")

    import re
    import os
    import glob
    from lxml import etree
    #print("Using LXML version: ", etree.LXML_VERSION)

    if not os.path.exists(outfolder):
        os.makedirs(outfolder)
     
    for file in glob.glob(inpath):
        with open(file, "r"):
            filename = os.path.basename(file)[:-4]
            #print(filename[:5]) # = idno

            ### The following options may help with parsing errors.
            #parser = etree.XMLParser(collect_ids=False, recover=True)
            parser = etree.XMLParser(recover=True)
            xml = etree.parse(file, parser)
            
            ### The TEI P5 files do have a default namespace.
            namespaces = {'tei':'http://www.tei-c.org/ns/1.0'}

            ### Removes tags but conserves their text content.
            etree.strip_tags(xml, "{http://www.tei-c.org/ns/1.0}hi")

            ### Removes elements and their text content.
            #etree.strip_elements(xml, "speaker")
            etree.strip_elements(xml, "{http://www.tei-c.org/ns/1.0}note")
            #etree.strip_elements(xml, "stage")
            etree.strip_elements(xml, "{http://www.tei-c.org/ns/1.0}head")

            ### XPath defining which text to select
            xp_bodytext = "//tei:body//text()"
            #xp_alltext = "//text()"

            ### Applying one of the above XPaths
            text = xml.xpath(xp_bodytext, namespaces=namespaces)
            text = "\n".join(text)

            ### Some cleaning up
            text = re.sub("  ", "", text)
            #text = re.sub("    ", "", text)
            text = re.sub("\n{1,6}", " ", text)
            #text = re.sub("\n{1,6}", "\n", text)
            text = re.sub("\n \n", "\n", text)
            text = re.sub("\t\n", "", text)

            outtext = str(text)
            outfile = outfolder + filename + ".txt"
        with open(outfile,"w") as output:
            output.write(outtext)
    print("Done.")
Example #46
def tei5reader_fulldocs(inpath, outfolder):
    """Script for reading selected text from TEI P5 files."""
    print("\nLaunched tei5reader_fulldocs.")

    import re
    import os
    import glob
    from lxml import etree
    #print("Using LXML version: ", etree.LXML_VERSION)

    if not os.path.exists(outfolder):
        os.makedirs(outfolder)

    for file in glob.glob(inpath):
        with open(file, "r"):
            filename = os.path.basename(file)[:-4]
            #print(filename[:5]) # = idno

            ### The following options may help with parsing errors.
            #parser = etree.XMLParser(collect_ids=False, recover=True)
            parser = etree.XMLParser(recover=True)
            xml = etree.parse(file, parser)

            ### The TEI P5 files do have a default namespace.
            namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}

            ### Removes tags but conserves their text content.
            etree.strip_tags(xml, "{http://www.tei-c.org/ns/1.0}hi")

            ### Removes elements and their text content.
            #etree.strip_elements(xml, "speaker")
            etree.strip_elements(xml, "{http://www.tei-c.org/ns/1.0}note")
            #etree.strip_elements(xml, "stage")
            etree.strip_elements(xml, "{http://www.tei-c.org/ns/1.0}head")

            ### XPath defining which text to select
            xp_bodytext = "//tei:body//text()"
            #xp_alltext = "//text()"

            ### Applying one of the above XPaths
            text = xml.xpath(xp_bodytext, namespaces=namespaces)
            text = "\n".join(text)

            ### Some cleaning up
            text = re.sub("  ", "", text)
            #text = re.sub("    ", "", text)
            text = re.sub("\n{1,6}", " ", text)
            #text = re.sub("\n{1,6}", "\n", text)
            text = re.sub("\n \n", "\n", text)
            text = re.sub("\t\n", "", text)

            outtext = str(text)
            outfile = outfolder + filename + ".txt"
        with open(outfile, "w") as output:
            output.write(outtext)
    print("Done.")
Example #47
0
    def __symbolize_mathml(self, element, start_id):
        # Replace each MathML <math> element with a text placeholder
        # (MATH_<pname>_<n>) and collect the ids of the removed elements.
        infty_ids = []
        for mt in element.xpath(".//*[local-name() = 'math']"):
            mid = "MATH_%s_%s" % (self.__pname, start_id)
            mt.tail = "%s%s" % (mid, mt.tail if mt.tail is not None else "")

            infty_ids.append(mt.attrib["id"])
            start_id += 1
        etree.strip_elements(element, "{http://www.w3.org/1998/Math/MathML}math", with_tail=False)
        return infty_ids, start_id
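A standalone sketch of the substitution this method performs, using the same MathML namespace and placeholder scheme (the input fragment and the name 'paper1' are made up):

from lxml import etree

MATHML = "{http://www.w3.org/1998/Math/MathML}"
frag = etree.fromstring(
    '<p xmlns:m="http://www.w3.org/1998/Math/MathML">'
    'Let <m:math id="m0"><m:mi>x</m:mi></m:math> be real.</p>')
mt = frag.find(MATHML + 'math')
mt.tail = "MATH_paper1_0" + (mt.tail or "")
etree.strip_elements(frag, MATHML + 'math', with_tail=False)
print("".join(frag.itertext()))   # -> Let MATH_paper1_0 be real.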
Example #48
0
    def handle_metypesetdeleted(self, keep):
        tree = self.load_dom_tree()

        if keep:
            etree.strip_tags(tree, '{http://www.tei-c.org/ns/1.0}meTypesetDeleted')
        else:
            etree.strip_elements(tree, '{http://www.tei-c.org/ns/1.0}meTypesetDeleted', with_tail=False)

        self.save_tree(tree)
        self.debug.print_debug(self, u'Handled deleted text')
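The keep flag above switches between the two lxml helpers: strip_tags() unwraps the element but keeps its text, while strip_elements() drops the element together with its content. A minimal standalone illustration (not taken from the original project):

from lxml import etree

src = '<p>kept <del>dropped?</del> tail</p>'

a = etree.fromstring(src)
etree.strip_tags(a, 'del')                          # -> <p>kept dropped? tail</p>

b = etree.fromstring(src)
etree.strip_elements(b, 'del', with_tail=False)     # -> <p>kept  tail</p>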
Example #49
0
    def root(self):
        """Fetch and parse xml and clean up unwanted elements/markup"""
        if not hasattr(self, "_root"):
            query = db.Query("document", "xml")
            query.where(query.Condition("id", self.cdr_id))
            xml = query.execute(self.__control.cursor).fetchone().xml
            self._root = etree.fromstring(xml.encode("utf-8"))
            etree.strip_elements(self._root, *self.DROP, with_tail=False)
            etree.strip_tags(self._root, *self.STRIP)
        return self._root
Example #50
0
def find_content_blocks(tree, min_length=None):
    """
    Extract text content blocks from the page (Russian version)
    """
    import re
    from copy import deepcopy
    from lxml.html import tostring
    from lxml.etree import strip_tags, strip_elements, Comment

    # First, make a copy of DOM-tree to not harm external code
    tree = deepcopy(tree)

    # Completely remove content of following tags
    nondata_tags = ['head', 'style', 'script']
    strip_elements(tree, *nondata_tags)

    # Remove comment nodes (keep tail text)
    strip_tags(tree, Comment)

    # Remove links
    strip_tags(tree, 'a')

    # Drop inline tags
    inline_tags = ('br', 'hr', 'p', 'b', 'i', 'strong', 'em', 'a',
                   'span', 'font')
    strip_tags(tree, *inline_tags)

    # Drop media tags
    media_tags = ('img',)
    strip_tags(tree, *media_tags)

    body = tostring(tree, encoding='utf-8').decode('utf-8')

    # Normalize spaces
    body = normalize_space(body)

    # Collapse every remaining tag to an empty '<>' marker so that the text
    # between tags can be located with a simple regex
    re_tag = re.compile(r'<[^>]+>')
    body = re_tag.sub(r'<>', body)

    #with open('/tmp/log.html', 'w') as out:
        #out.write(body.encode('utf-8'))
    #return

    # Find text blocks
    block_rex = re.compile(r'[^<>]+')

    blocks = []
    for match in block_rex.finditer(body):
        block = match.group(0)
        if min_length is None or len(block) >= min_length:
            ratio = _trash_ratio(block)
            if ratio < 0.05:
                words = block.split()
                if not any(len(x) > 50 for x in words):
                    blocks.append(block)
    return blocks
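A usage sketch for the function above, assuming the helpers it references (normalize_space, _trash_ratio) are defined elsewhere in the same module:

from lxml.html import fromstring

with open('page.html', encoding='utf-8') as f:      # hypothetical input file
    tree = fromstring(f.read())
for block in find_content_blocks(tree, min_length=100):
    print(block[:80])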
Example #51
0
	def test(self, recipients, audId=1):
		url = '/audiences/%s/blasts/test.xml' % str(audId)
		headers = {'Content-type':'application/xml'}
		# handle recipients
		if type(recipients) != str: recipients = ','.join(recipients)
		toNode = self.xml.xpath('/blast/to')[0]
		etree.strip_elements(toNode, 'audience-id', 'include-lists')
		toNode.text = recipients
		# send request
		##data = etree.tostring(self.xml, with_comments=False)
		data = self.tostring()
		resp = connection._request(url, 'POST', data, headers)
		return resp
Example #52
0
def parseQuestionContentToList(content):
	root = etree.HTML(content)
	etree.strip_elements(root,'code',with_tail=False)
	etree.strip_tags(root,'*')
	nonPunct = re.compile('.*[A-Za-z0-9].*')
	text = str(etree.tostring(root,pretty_print = True)[10:-11])[1:].lower()\
	.replace('\\n',' ')\
	.replace("\\",'')\
	.replace("?","")
	tokens = nltk.word_tokenize(text)
	filtered = [w for w in tokens if nonPunct.match(w)]
	return filtered
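A hedged usage sketch (the exact tokens depend on nltk's tokenizer; the point is that <code> blocks are stripped and the remaining text is lowercased, tokenized and filtered to tokens containing alphanumerics):

question = '<p>How to merge two <code>dict</code> objects in Python?</p>'
words = parseQuestionContentToList(question)
# roughly: ['how', 'to', 'merge', 'two', 'objects', 'in', 'python']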
Example #53
0
def reformatVerseElement(element, keepTags=['q', ]):
    """ Updates various values of the specified verse Element.

    """
    etree.strip_elements(element, *discardTags, with_tail=False)
    tags = (child.tag for child in element.iter())
    for tag in [tag for tag in tags if tag not in keepTags]:
        etree.strip_tags(element, tag)
    for child in element.iter('q'):
        child.set('class', child.get('who', 'unknown').lower())
    for child in element.iter():
        if child.tag not in knownTags:
            logger.debug('unhandled tag: %s %s', child.tag, etree.tostring(child))
Example #54
0
    def emit_tag(self, tag=None, as_initiator=False):
        """
        Emit a subtree for an entity, using the supplied tag for the root
        element. If the identity is the Initiator's, emit the CUC as well.
        """
        if as_initiator:
            tag = 'InitgPty'
        root = etree.Element(tag)

        # Name
        if hasattr(self, 'name'):
            name = etree.SubElement(root, 'Nm')
            name.text = self.name

        # Address
        if hasattr(self, 'address') and not as_initiator:
            root.append(self.address.__tag__())

        # ID
        idtag = etree.SubElement(root, 'Id')
        if self.private:
            id_container = 'PrvtId'
        else:
            id_container = 'OrgId'
        orgid = etree.SubElement(idtag, id_container)

#### Remove the Id/OrgId node for the creditor
        if not as_initiator:
            etree.strip_elements(root, 'Id', with_tail=False)

        # CUC
        if as_initiator:
            if not hasattr(self, 'cuc'):
                raise MissingCUCError
            orgid.append(emit_id_tag(self.cuc, 'CBI'))

        # Tax code
        if not as_initiator:
            if hasattr(self, 'cf'):
                orgid.append(emit_id_tag(self.cf, 'CBI'))
#        if hasattr(self, 'cf'):
#            orgid.append(emit_id_tag(self.cf, 'ADE'))

        if hasattr(self, 'code'):
            orgid.append(emit_id_tag(self.code, None))

        if not as_initiator and hasattr(self, 'country'):
            etree.SubElement(root, 'CtryOfRes').text = self.country

        return root
Example #55
0
def getReleaseNoteDetail(tDetail):
	thisScreen = []
	opener = urllib2.build_opener()
	opener.addheaders = [('User-Agent','Mozilla/5.0')]
	resp = opener.open(tDetail)
	if resp.code == 200:
		data = resp.read()
	elif resp.code == 404:
		print "Page do not exist"
		exit()
	else:
		print "Can not open page"
		exit()
	parser = etree.HTMLParser()
	tree = etree.parse(StringIO(data), parser)

	comments = tree.xpath('//comment()')
	for c in comments:
		p = c.getparent()
		p.remove(c)

	#etree.strip_tags(tree,'p')
	etree.strip_tags(tree,'i')
	etree.strip_tags(tree,'a')
	etree.strip_elements(tree,'iframe')
	result = etree.tostring(tree.getroot(), pretty_print=True, method="html", encoding='utf-8')

	mTitle = ''
	titles = tree.xpath("//h1[@id='id_title']")
	for title in titles:
		if title.text is not None:
			mTitle = title.text
			break

	Screen2 = []
	Screen2.append(clrTx(mTitle,'YELLOW'))
	Screen2.append(repeatStr('-',78))
	articles = tree.xpath("//div[@class='articleBody']/p")
	for article in articles:
		if article.text is not None:
			for line in _wrap.wrap(article.text):
				Screen2.append('    '+line)
	Screen2.append(repeatStr('-',78))

	option = ''
	while option != 'b':
		for item in Screen2:
			print item
		print "b"
		option = raw_input()
Example #56
0
    def eat(self, fname):
        # Parse the file, drop helper elements, clear all text, and tag every
        # node with its source filename before collecting the elements.
        t = etree.parse(fname, self._p).getroot()
        etree.strip_elements(t, "comment", "code-helper")
        etree.strip_tags(t, "virtual-methods")
        for e in t.iter():
            if isinstance(e, etree._Comment):
                e.getparent().remove(e)
            else:
                e.tail = e.text = None
            try:
                e.set("xmlfilename", fname)
            except TypeError:
                pass
        self.woot.extend(etx('*')(t))
Example #57
0
def parse_text(wdir, txtFolder):
    """
    This function opens the file, reads it as xml and delete some elements.
    The other funcionts of this file use it. For example:
        content =  parse_text(wdir, txtFolder)
    """
    # We parse the text as xml
    file = wdir+txtFolder+".xml"
        
    xml_tree = etree.parse(file)
    
    # Let's print it to see if everything is ok    
    # print(etree.tostring(xml_tree, pretty_print=True, encoding="unicode"))

    # Namespaces are specified
    specific_namespaces = {'tei':'http://www.tei-c.org/ns/1.0','xi':'http://www.w3.org/2001/XInclude'}

    # Back, front, teiHeader and heads are deleted
    etree.strip_elements(xml_tree, "{http://www.tei-c.org/ns/1.0}back", with_tail=False)
    etree.strip_elements(xml_tree, "{http://www.tei-c.org/ns/1.0}front", with_tail=False)
    etree.strip_elements(xml_tree, "{http://www.tei-c.org/ns/1.0}teiHeader", with_tail=False)
    etree.strip_elements(xml_tree, "{http://www.tei-c.org/ns/1.0}head", with_tail=False)

    # Only text is kept and saved as string
    content = xml_tree.xpath("//text()", namespaces=specific_namespaces)
    content = ''.join(content)
    
    #print(content)
    #print(type(content))

    return content
Example #58
0
    def format(self, article, subscriber, codes=None):
        try:
            pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
            nitf = self.get_nitf(article, subscriber, pub_seq_num)
            strip_elements(nitf, 'body.end')
            nitf_string = etree.tostring(nitf, encoding='utf-8').decode()
            headers = ['<?xml version=\"1.0\" encoding=\"UTF-8\"?>',
                       '<!-- <!DOCTYPE nitf SYSTEM \"./nitf-3-3.dtd\"> -->']
            return [{
                'published_seq_num': pub_seq_num,
                'formatted_item': '{}\r\n{}'.format("\r\n".join(headers), nitf_string).
                    replace('&#13;\n', self.line_ender)}]
        except Exception as ex:
            raise FormatterError.nitfFormatterError(ex, subscriber)
Example #59
0
    def __init__(self, url, ppmm):
        with open(url, 'rb') as svg_file:
            self.tree = etree.fromstring(svg_file.read())
        self.layers = []
        for g in self.tree.iter(G):
            self.layers.append(g)

        self.ppmm = ppmm
        bt = copy.deepcopy(self.tree)
        bt.attrib['width'] = str(float(bt.attrib['width']) * self.ppmm)
        bt.attrib['height'] = str(float(bt.attrib['height']) * self.ppmm)

        self.blank_tree = bt
        etree.strip_elements(self.blank_tree, G)
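G is defined elsewhere in the original module; it is presumably the namespace-qualified SVG group tag, along the lines of this assumption:

G = '{http://www.w3.org/2000/svg}g'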