Пример #1
0
    def fetch_daily_feeds(self, day):
        """Download the feed published for ``day`` and return its XML root.

        ``day`` is formatted as ``dd-mm-YYYY`` and substituted into
        ``self.FEED_URL``.  The response body is parsed with a parser that
        also understands the named HTML entities, and the root element
        produced by ``ElementTree.fromstring`` is returned.

        Raises ``ValueError`` when ``FEED_PASSWORD`` has not been set; HTTP
        error statuses are raised by ``raise_for_status()``.
        """
        from htmlentitydefs import name2codepoint
        from xml.etree import ElementTree

        if self.FEED_PASSWORD is None:
            raise ValueError("%s.FEED_PASSWORD must be set." % self.__class__.__name__)

        # The credentials travel as plain request headers rather than through
        # requests' ``auth=`` argument (HTTP basic auth).
        credentials = {
            'PHP_AUTH_USER': self.FEED_USER,
            'PHP_AUTH_PW': self.FEED_PASSWORD,
        }
        feed_url = self.FEED_URL % day.strftime('%d-%m-%Y')

        # NOTE(review): verify=False switches off TLS certificate checking.
        response = requests.get(feed_url,
                                headers=credentials,
                                verify=False,
                                timeout=60)
        response.raise_for_status()

        # Teach the parser every named HTML entity so references such as
        # &nbsp; do not abort parsing.
        xml_parser = ElementTree.XMLParser()
        xml_parser.parser.UseForeignDTD(True)
        xml_parser.entity.update(
            (name, unichr(codepoint))
            for name, codepoint in name2codepoint.iteritems())

        return ElementTree.fromstring(response.text.encode('utf-8'),
                                      parser=xml_parser)
Пример #2
0
    def fetch_daily_feeds(self, day):
        """Download the feed published for ``day`` and return its XML root.

        ``day`` is formatted as ``dd-mm-YYYY`` and substituted into
        ``self.FEED_URL``.  The response body is parsed with a parser that
        also understands the named HTML entities, and the root element
        produced by ``ElementTree.fromstring`` is returned.

        Raises ``ValueError`` when ``FEED_PASSWORD`` has not been set; HTTP
        error statuses are raised by ``raise_for_status()``.
        """
        from htmlentitydefs import name2codepoint
        from xml.etree import ElementTree

        if self.FEED_PASSWORD is None:
            raise ValueError("%s.FEED_PASSWORD must be set." % self.__class__.__name__)

        # The credentials travel as plain request headers rather than through
        # requests' ``auth=`` argument (HTTP basic auth).
        credentials = {
            'PHP_AUTH_USER': self.FEED_USER,
            'PHP_AUTH_PW': self.FEED_PASSWORD,
        }
        feed_url = self.FEED_URL % day.strftime('%d-%m-%Y')

        # NOTE(review): verify=False switches off TLS certificate checking.
        response = requests.get(feed_url,
                                headers=credentials,
                                verify=False,
                                timeout=60)
        response.raise_for_status()

        # Teach the parser every named HTML entity so references such as
        # &nbsp; do not abort parsing.
        xml_parser = ElementTree.XMLParser()
        xml_parser.parser.UseForeignDTD(True)
        xml_parser.entity.update(
            (name, unichr(codepoint))
            for name, codepoint in name2codepoint.iteritems())

        return ElementTree.fromstring(response.text.encode('utf-8'),
                                      parser=xml_parser)
Пример #3
0
def fromName2codepoint(string):
    '''Rewrite named (X)HTML entities as numeric character references.

    Every ``&name;`` whose name is a key of ``name2codepoint`` is replaced
    by ``&#N;``, where N is the decimal code point of that entity.  Unknown
    names and references that are already numeric are left untouched.

    >>> fromName2codepoint('This &amp; that.')
    'This &#38; that.'

    '''
    import re

    def to_numeric(match):
        # Leave anything that is not a known entity name exactly as written.
        codepoint = name2codepoint.get(match.group(1))
        if codepoint is None:
            return match.group(0)
        return '&#%d;' % codepoint

    # A single pass over the text instead of one str.replace per known entity.
    return re.sub(r'&(\w+);', to_numeric, string)
Пример #4
0
def get_xml_tree(html_string):
    """Parse ``html_string`` as XML and return the root element.

    The parser's entity table is filled from ``name2codepoint`` so that
    named HTML entities (``&nbsp;`` and friends) are accepted.

    If anything goes wrong the input is handed to ``dump_html`` and the
    original exception is re-raised unchanged.
    """
    try:
        parser = ET.XMLParser()
        # Make expat act as if an external DTD were present; with that,
        # ElementTree resolves otherwise undefined entities via parser.entity.
        parser.parser.UseForeignDTD(True)
        parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems())
        tree = ET.fromstring(html_string, parser=parser)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt/SystemExit
        # pass straight through without triggering a dump.
        dump_html(html_string)
        raise
    return tree
Пример #5
0
def get_xml_tree(html_string):
    """Parse ``html_string`` as XML and return the root element.

    The parser's entity table is filled from ``name2codepoint`` so that
    named HTML entities (``&nbsp;`` and friends) are accepted.

    If anything goes wrong the input is handed to ``dump_html`` and the
    original exception is re-raised unchanged.
    """
    try:
        parser = ET.XMLParser()
        # Make expat act as if an external DTD were present; with that,
        # ElementTree resolves otherwise undefined entities via parser.entity.
        parser.parser.UseForeignDTD(True)
        parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems())
        tree = ET.fromstring(html_string, parser=parser)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt/SystemExit
        # pass straight through without triggering a dump.
        dump_html(html_string)
        raise
    return tree
Пример #6
0
def get_xml_tree(html_string):
    """Parse ``html_string`` as XML and return the root element.

    The parser's entity table is filled from ``name2codepoint`` so that
    named HTML entities (``&nbsp;`` and friends) are accepted.

    If anything goes wrong the input is handed to ``dump_html`` and the
    original exception is re-raised unchanged.
    """
    try:
        parser = ET.XMLParser()
        # Make expat act as if an external DTD were present; with that,
        # ElementTree resolves otherwise undefined entities via parser.entity.
        parser.parser.UseForeignDTD(True)
        parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems())
        tree = ET.fromstring(html_string, parser=parser)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt/SystemExit
        # pass straight through without triggering a dump.
        dump_html(html_string)
        raise
    return tree
Пример #7
0
def get_xml_tree(html_string):
    """Parse ``html_string`` as XML and return the root element.

    The parser's entity table is filled from ``name2codepoint`` so that
    named HTML entities (``&nbsp;`` and friends) are accepted.

    If anything goes wrong the input is handed to ``dump_html`` and the
    original exception is re-raised unchanged.
    """
    try:
        parser = ET.XMLParser()
        # Make expat act as if an external DTD were present; with that,
        # ElementTree resolves otherwise undefined entities via parser.entity.
        parser.parser.UseForeignDTD(True)
        parser.entity.update(
            (x, unichr(i)) for x, i in name2codepoint.iteritems())
        tree = ET.fromstring(html_string, parser=parser)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt/SystemExit
        # pass straight through without triggering a dump.
        dump_html(html_string)
        raise
    return tree
Пример #8
0
def main():
    res = []
    qlist = ints_sample(284)
    qcount = 0
    appcount = 0
    parser = etree.XMLParser()
    parser.parser.UseForeignDTD(True)
    parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems())
    raw_data = etree.parse(sys.argv[1], parser=parser)
    elms = raw_data.getroot()
    mclist = elms.findall('mc')
    qlist = ints_sample(len(mclist))
    for i in qlist:
        mc = mclist[i]
        qset = mc.find('question')
        mc_id = qset.attrib['id']
        lc = qset.find('leftcontext').text
        bl = qset.find('blank')
        rc = qset.find('rightcontext').text
        question = (lc.strip() if lc else "") + ' _____ ' + (rc.strip()
                                                             if rc else "")
        qtype = bl.attrib['type']
        context = mc.find('contextart').text.strip()
        cands = []
        for cand in mc.findall('choice'):
            cand_id = cand.attrib['idx']
            cand_cor = cand.attrib['correct']
            cand_stat = cand.text.strip()
            cand_str = "{0}) {1}".format(cand_id, cand_stat)
            if cand_cor == "true":
                cand_str = "*" + cand_str
            cands.append(cand_str)
        answer = "\n".join(cands)
        questions = [{
            'question': question,
            'answer': answer,
            'question_type': qtype
        }]
        res.append({
            'id': 'wdw_{0:03d}'.format(len(res)),
            'original_id': mc_id,
            'contents': {
                'context': context,
                'questions': questions
            }
        })

    print 'Extracted (passage,question) pairs:', len(res)
    return res
Пример #9
0
    def getAntragText(self,html):
        """Return the plain text of an Antrag given its XHTML source.

        The markup is parsed as XML (with the named HTML entities made known
        to the parser) and the text of the second XHTML ``<body>`` element is
        returned, one text node per line.

        If the markup is not well-formed, everything after the Aspose.Words
        generator ``<meta>`` marker is taken instead and its tags are
        stripped with a regular expression.
        """
        parser = ET.XMLParser()
        parser.parser.UseForeignDTD(True)
        parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems())

        # Non-ASCII characters become numeric character references so the
        # parser receives pure ASCII. Parsing happens in memory: the previous
        # implementation wrote a delete=False temp file on every call and
        # never removed it.
        markup = html.encode('ascii', 'xmlcharrefreplace')
        try:
            root = ET.fromstring(markup, parser=parser)
            bodys = root.findall('.//{http://www.w3.org/1999/xhtml}body')
            text = '\n'.join(bodys[1].itertext())
        except ET.ParseError:
            chunk = html.split('<meta name="generator" content="Aspose.Words for .NET')[1]
            text = re.sub('<[^>]*?>','',chunk)
        return text
Пример #10
0
    def extractantraege(self,scrapedpage,parteien):
	"""extract the Antrag data from an ALLRis page"""
	
	parser = ET.XMLParser()
	parser.parser.UseForeignDTD(True)
	parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems())
	
	tf = tempfile.NamedTemporaryFile(delete=False)
	name = tf.name  
	tf.write(scrapedpage.encode('ascii', 'xmlcharrefreplace'))
	tf.close()  
	
	try:
	    tree = ET.parse(name, parser=parser)   	    
	except ET.ParseError:
	    print "parse error in", name, "for", parteien
	    return []
	root = tree.getroot()  
	zl11 = root.findall(".//{http://www.w3.org/1999/xhtml}tr[@class='zl11']")
	zl12 = root.findall(".//{http://www.w3.org/1999/xhtml}tr[@class='zl12']")    
	trs = zl11 + zl12
	antraege = []
	for tr in trs: 
	    tds = tr.findall('{http://www.w3.org/1999/xhtml}td') 
	    words = tds[1].find('{http://www.w3.org/1999/xhtml}a').text.split() 
	    dsnr = words[0]
	    title = ' '.join(words[1:]) 
	    href = self.baseurl+tds[1].find('{http://www.w3.org/1999/xhtml}a').attrib['href']
	    #partei = tds[3].text
	    typ = tds[5].text
	    antrag = Antrag(bezirk , dsnr, url=href, parteien=parteien)
	    antrag.title = unicode(title)  
	    antrag.typ = unicode(typ)  
	    antrag.html = self.getAntragHTML(href,bezirk)
	    antrag.text = self.getAntragText(antrag.html)
	    antrag.status = self.getStatus(antrag.html)
	    antrag.ausschuss = self.getAusschussFields(antrag.html)
	    antrag.updateLengths()
	    antraege.append(antrag)
	return antraege
Пример #11
0
 def replaceEntities(s):
     s = s.groups()[0]
     try:
         if s[0] == "#":
             s = s[1:]
             if s[0] in ['x', 'X']:
                 c = int(s[1:], 16)
             else:
                 c = int(s)
             return unichr(c)
     except ValueError:
         return '&#' + s + ';'
     else:
         from htmlentitydefs import entitydefs, name2codepoint
         if entitydefs is None:
             # entitydefs = \
             entitydefs = {'apos': u"'"}
             for k, v in name2codepoint.iteritems():
                 entitydefs[k] = unichr(v)
         try:
             return entitydefs[s]
         except KeyError:
             return '&' + s + ';'
Пример #12
0
 def replaceEntities(s):
     s = s.groups()[0]
     try:
         if s[0] == "#":
             s = s[1:]
             if s[0] in ['x', 'X']:
                 c = int(s[1:], 16)
             else:
                 c = int(s)
             return unichr(c)
     except ValueError:
         return '&#' + s + ';'
     else:
         from htmlentitydefs import entitydefs, name2codepoint
         if entitydefs is None:
             # entitydefs = \
             entitydefs = {'apos': u"'"}
             for k, v in name2codepoint.iteritems():
                 entitydefs[k] = unichr(v)
         try:
             return entitydefs[s]
         except KeyError:
             return '&' + s + ';'
Пример #13
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# tweetokenize: Regular expression based tokenizer for Twitter
# Copyright: (c) 2013, Jared Suttles. All rights reserved.
# License: BSD, see LICENSE for details.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import re
from os import path
from itertools import imap
from htmlentitydefs import name2codepoint

# Entity name (e.g. "amp") -> the unicode character it stands for.
html_entities = {k: unichr(v) for k, v in name2codepoint.iteritems()}
# One entity reference: "&", an optional "#", word characters, then ";".
html_entities_re = re.compile(r"&#?\w+;")
# (first, last) character pairs delimiting the emoji blocks.
# NOTE(review): presumably both ends are inclusive -- confirm at the use site.
emoji_ranges = ((u'\U0001f300', u'\U0001f5ff'), (u'\U0001f600', u'\U0001f64f'),
                (u'\U0001f680', u'\U0001f6c5'), (u'\u2600', u'\u26ff'),
                (u'\U0001f170', u'\U0001f19a'))
# Flag emoji, each written as a pair of regional-indicator symbols
# (JP, KR, DE, CN, US, FR, ES, IT, RU, GB).
emoji_flags = {
    u'\U0001f1ef\U0001f1f5', u'\U0001f1f0\U0001f1f7', u'\U0001f1e9\U0001f1ea',
    u'\U0001f1e8\U0001f1f3', u'\U0001f1fa\U0001f1f8', u'\U0001f1eb\U0001f1f7',
    u'\U0001f1ea\U0001f1f8', u'\U0001f1ee\U0001f1f9', u'\U0001f1f7\U0001f1fa',
    u'\U0001f1ec\U0001f1e7'
}


def _converthtmlentities(msg):
    def replace_entities(s):
        s = s.group(0)[1:-1]  # remove & and ;
        if s[0] == '#':
            try:
                return unichr(int(s[2:], 16) if s[1] in 'xX' else int(s[1:]))
Пример #14
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# tweetokenize: Regular expression based tokenizer for Twitter
# Copyright: (c) 2013, Jared Suttles. All rights reserved.
# License: BSD, see LICENSE for details.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import re
from os import path
from itertools import imap
from htmlentitydefs import name2codepoint

# Entity name (e.g. "amp") -> the unicode character it stands for.
html_entities = {k: unichr(v) for k, v in name2codepoint.iteritems()}
# One entity reference: "&", an optional "#", word characters, then ";".
html_entities_re = re.compile(r"&#?\w+;")
# (first, last) character pairs delimiting the emoji blocks.
# NOTE(review): presumably both ends are inclusive -- confirm at the use site.
emoji_ranges = ((u'\U0001f300', u'\U0001f5ff'), (u'\U0001f600', u'\U0001f64f'), (u'\U0001f680', u'\U0001f6c5'),
                (u'\u2600', u'\u26ff'), (u'\U0001f170', u'\U0001f19a'))
# Flag emoji, each written as a pair of regional-indicator symbols
# (JP, KR, DE, CN, US, FR, ES, IT, RU, GB).
emoji_flags =  {u'\U0001f1ef\U0001f1f5', u'\U0001f1f0\U0001f1f7', u'\U0001f1e9\U0001f1ea',
                u'\U0001f1e8\U0001f1f3', u'\U0001f1fa\U0001f1f8', u'\U0001f1eb\U0001f1f7',
                u'\U0001f1ea\U0001f1f8', u'\U0001f1ee\U0001f1f9', u'\U0001f1f7\U0001f1fa',
                u'\U0001f1ec\U0001f1e7'}


def _converthtmlentities(msg):
    def replace_entities(s):
        s = s.group(0)[1:-1] # remove & and ;
        if s[0] == '#':
            try:
                return unichr(int(s[2:],16) if s[1] in 'xX' else int(s[1:]))
            except ValueError:
                return '&#' + s + ';'
        else:
Пример #15
0
import sys
from cStringIO import StringIO
from xml.etree import ElementTree as ET
from htmlentitydefs import name2codepoint

source = StringIO("""<html>
<body>
<p>Less than "&lt;"</p>
<p>Non-breaking space "&nbsp;"</p>
</body>
</html>""")

parser = ET.XMLParser()
parser.parser.UseForeignDTD(True)
parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems())
etree = ET.ElementTree()

tree = etree.parse(source, parser=parser)
for p in tree.findall('.//p'):
     print ET.tostring(p, encoding='UTF-8')
Пример #16
0
import re, codecs
from htmlentitydefs import name2codepoint, codepoint2name


encode_table={unichr(code): "&%s;"%name for code, name in codepoint2name.iteritems()}
encode_regex=re.compile("(%s)"%"|".join(map(re.escape, encode_table.keys())))

decode_table={"&%s;"%name: unichr(code) for name, code in name2codepoint.iteritems()}
decode_regex=re.compile("(?:&#(\d{1,5});)|(?:&#x(\d{1,5});)|(&\w{1,8};)")


class HtmlCodec(codecs.Codec):
	"""Stateless codec translating between characters and HTML entities."""

	def encode(self, input, errors='strict'):
		"""Replace every character found in ``encode_table`` by its ``&name;``.

		Returns ``(output, consumed)``, where ``consumed`` is the length of
		``input`` as the codec interface requires.  ``errors`` is accepted
		for interface compatibility and not used.
		"""
		output=encode_regex.sub(lambda match: encode_table[match.group(0)], input)
		return output, len(input)

	def decode(self, input, errors='strict'):
		"""Replace decimal, hexadecimal and named references by characters.

		Named references missing from ``decode_table`` are left as they
		are.  Returns ``(output, consumed)``, where ``consumed`` is the
		length of ``input``.  ``errors`` is accepted for interface
		compatibility and not used.
		"""
		def substitute(match):
			code, xcode, entity=match.group(1, 2, 3)
			if code:
				return unichr(int(code))
			if xcode:
				return unichr(int(xcode, 16))
			return decode_table.get(entity, entity)
		output=decode_regex.sub(substitute, input)
		return output, len(input)


class HtmlIncrementalEncoder(codecs.IncrementalEncoder):
	"""Incremental encoder stub: ``encode`` always raises NotImplementedError."""
	
	def encode(self, input, final=False):
		# Incremental encoding is not provided; every call fails.
		raise NotImplementedError 
Пример #17
0
def parse(file1, file2,
          path='/Users/apple/Downloads/wdw_script/who_did_what/Strict/'):
    """Convert a multiple-choice XML file into anonymised cloze records.

    ``file1`` is read from ``path`` and ``file2`` is written to ``path``.

    First pass: for every item, the text of the question's ``<blank>`` is
    given a fresh `` @entityN `` marker, each wrong ``<choice>`` gets a
    marker of its own, and the correct choice is mapped to the marker of
    the blank.

    Second pass: for every item the question is rebuilt with
    ``@placeholder`` in place of the blank, and the ``<contextart>`` text has
    the correct answer and the blank text replaced by their markers.  Items
    in which neither occurs in the document are skipped; otherwise the other
    choices are replaced too and three lines (question, answer marker,
    document) followed by an empty line are written.

    Raises ``IOError`` if the input file cannot be opened.
    """
    parser = ET.XMLParser()
    parser.parser.UseForeignDTD(True)
    parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems())
    etree = ET.ElementTree()
    # Open the input directly (and close it again) instead of scanning the
    # directory listing; a missing file now fails here with IOError rather
    # than later with a NameError on `root`.
    with open(os.path.join(path, file1), 'r') as data:
        root = etree.parse(data, parser=parser)

    # question:question, answer:choice=True, document
    ind = 1  # running number shared by all entity markers
    entity_lists = []  # per item: wrong-choice text -> marker
    plus = []          # per item: blank text -> marker
    correct = []       # per item: correct-choice text -> marker of the blank
    for mc in root:
        entity_lists.append({})
        plus.append({})
        correct.append({})
        memory = None
        for child in mc:
            if child.tag == 'question':
                for grandchildren in child:
                    if grandchildren.tag == 'blank':
                        t = clean_str(grandchildren.text)
                        memory = ' @entity' + str(ind) + ' '
                        plus[-1][t] = memory
                        ind += 1
                        break
            elif child.tag == 'choice':
                t = clean_str(child.text)
                if child.attrib['correct'] == 'true':
                    correct[-1][t] = memory
                else:
                    entity_lists[-1][t] = ' @entity' + str(ind) + ' '
                    ind += 1

    # The with-block guarantees the output is flushed and closed.
    with open(os.path.join(path, file2), 'w') as f:
        for mc, dic, pl, co in zip(root, entity_lists, plus, correct):
            for child in mc:
                if child.tag == 'question':
                    question = ''
                    for grandchildren in child:
                        if grandchildren.tag == 'blank':
                            t = '@placeholder '
                        else:
                            t = grandchildren.text
                            if not t:
                                continue
                            t = clean_str(t) + ' '
                        question += t
                elif child.tag == 'choice':
                    if child.attrib['correct'] == 'true':
                        t = clean_str(child.text)
                        answer = co[t]
                elif child.tag == 'contextart':
                    document = clean_str(child.text)
            # flag stays True when neither the answer nor the blank text
            # occurs in the document; such items are not written.
            flag = True
            for key, value in co.items():
                if key in document:
                    flag = False
                    document = document.replace(key, value)
            for key, value in pl.items():
                if key in document:
                    flag = False
                    document = document.replace(key, value)
            if flag:
                continue
            for key, value in dic.items():
                document = document.replace(key, value)
            f.write(question + '\n')
            f.write(answer + '\n')
            f.write(document + '\n\n')