def fetch_daily_feeds(self, day):
    """
    Download the feed for +day+ and return it parsed by ElementTree.

    The URL is ``self.FEED_URL`` filled in with +day+ formatted as
    ``dd-mm-YYYY``.  ``self.FEED_USER`` and ``self.FEED_PASSWORD`` are sent
    as the ``PHP_AUTH_USER`` / ``PHP_AUTH_PW`` request headers.

    Raises ValueError when ``self.FEED_PASSWORD`` is None, and whatever
    ``raise_for_status()`` raises for an HTTP error response.
    """
    from xml.etree import ElementTree
    from htmlentitydefs import name2codepoint

    if self.FEED_PASSWORD is None:
        raise ValueError("%s.FEED_PASSWORD must be set."
                         % self.__class__.__name__)

    auth_headers = {'PHP_AUTH_USER': self.FEED_USER,
                    'PHP_AUTH_PW': self.FEED_PASSWORD}
    feed_url = self.FEED_URL % day.strftime('%d-%m-%Y')
    # NOTE(review): verify=False switches off TLS certificate checking for
    # this request -- confirm that this is intentional.
    response = requests.get(feed_url,
                            headers=auth_headers,
                            verify=False,
                            timeout=60)
    response.raise_for_status()

    # Register the named HTML entities with the parser so that references
    # such as &nbsp; in the feed can be resolved.
    xml_parser = ElementTree.XMLParser()
    xml_parser.parser.UseForeignDTD(True)
    for entity_name, codepoint in name2codepoint.iteritems():
        xml_parser.entity[entity_name] = unichr(codepoint)

    return ElementTree.fromstring(response.text.encode('utf-8'),
                                  parser=xml_parser)
def fromName2codepoint(string):
    '''Rewrite named (X)HTML entities as numeric character references.

    Every ``&name;`` whose name is a key of ``name2codepoint`` is replaced
    by ``&#N;``, where N is the decimal code point.  Unknown names and all
    other text are returned unchanged.

    >>> fromName2codepoint('This &amp; that.')
    'This &#38; that.'
    >>> fromName2codepoint('&bogus; stays')
    '&bogus; stays'
    '''
    import re

    def to_numeric(match):
        # Leave references alone when the name is not a known entity.
        codepoint = name2codepoint.get(match.group(1))
        if codepoint is None:
            return match.group(0)
        return '&#%d;' % codepoint

    # One pass over the text instead of one str.replace() per known entity.
    return re.sub(r'&([A-Za-z0-9]+);', to_numeric, string)
def get_xml_tree(html_string):
    """Parse ``html_string`` with ElementTree and return the root element.

    The parser's entity table is filled from ``name2codepoint`` so that
    named HTML entities resolve.  If anything goes wrong the input is
    passed to ``dump_html`` and the original exception is re-raised.
    """
    try:
        xml_parser = ET.XMLParser()
        xml_parser.parser.UseForeignDTD(True)
        for entity_name, codepoint in name2codepoint.iteritems():
            xml_parser.entity[entity_name] = unichr(codepoint)
        return ET.fromstring(html_string, parser=xml_parser)
    except:
        # Keep a copy of the offending markup, then let the error propagate.
        dump_html(html_string)
        raise
def get_xml_tree(html_string):
    """Return the ElementTree root parsed from ``html_string``.

    Named HTML entities are made known to the parser through its entity
    table.  On any failure ``dump_html(html_string)`` is called and the
    exception is re-raised unchanged.
    """
    try:
        entity_parser = ET.XMLParser()
        entity_parser.parser.UseForeignDTD(True)
        html_entities = dict(
            (name, unichr(code)) for name, code in name2codepoint.iteritems())
        entity_parser.entity.update(html_entities)
        root = ET.fromstring(html_string, parser=entity_parser)
    except:  # noqa FIXME: figure out what we expect this to throw.
        dump_html(html_string)
        raise
    else:
        return root
def get_xml_tree(html_string):
    """Build an element tree from ``html_string``.

    The XML parser is taught the HTML named entities listed in
    ``name2codepoint`` before parsing.  Whatever exception occurs, the
    markup is first handed to ``dump_html`` and the exception is then
    re-raised.
    """
    try:
        html_aware_parser = ET.XMLParser()
        html_aware_parser.parser.UseForeignDTD(True)
        known_entities = html_aware_parser.entity
        for name, code in name2codepoint.iteritems():
            known_entities[name] = unichr(code)
        parsed = ET.fromstring(html_string, parser=html_aware_parser)
        return parsed
    except:  # noqa FIXME: figure out what we expect this to throw.
        dump_html(html_string)
        raise
def main(): res = [] qlist = ints_sample(284) qcount = 0 appcount = 0 parser = etree.XMLParser() parser.parser.UseForeignDTD(True) parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems()) raw_data = etree.parse(sys.argv[1], parser=parser) elms = raw_data.getroot() mclist = elms.findall('mc') qlist = ints_sample(len(mclist)) for i in qlist: mc = mclist[i] qset = mc.find('question') mc_id = qset.attrib['id'] lc = qset.find('leftcontext').text bl = qset.find('blank') rc = qset.find('rightcontext').text question = (lc.strip() if lc else "") + ' _____ ' + (rc.strip() if rc else "") qtype = bl.attrib['type'] context = mc.find('contextart').text.strip() cands = [] for cand in mc.findall('choice'): cand_id = cand.attrib['idx'] cand_cor = cand.attrib['correct'] cand_stat = cand.text.strip() cand_str = "{0}) {1}".format(cand_id, cand_stat) if cand_cor == "true": cand_str = "*" + cand_str cands.append(cand_str) answer = "\n".join(cands) questions = [{ 'question': question, 'answer': answer, 'question_type': qtype }] res.append({ 'id': 'wdw_{0:03d}'.format(len(res)), 'original_id': mc_id, 'contents': { 'context': context, 'questions': questions } }) print 'Extracted (passage,question) pairs:', len(res) return res
def getAntragText(self,html):
    """Return the text content of an Antrag page given as (X)HTML.

    The page is parsed as XML with the HTML named entities registered on
    the parser.  The result is the text of the second XHTML ``<body>``
    element found in the document, one text node per line.

    If the page is not well-formed, the text is taken from everything
    after the Aspose.Words generator ``<meta>`` marker with all tags
    stripped by a regular expression.

    Raises IndexError when the parsed document has fewer than two
    ``<body>`` elements, or when parsing fails and the generator marker is
    absent.
    """
    parser = ET.XMLParser()
    parser.parser.UseForeignDTD(True)
    parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems())
    # Non-ASCII characters become numeric character references, so the
    # parser only ever sees plain ASCII.
    data = html.encode('ascii', 'xmlcharrefreplace')
    try:
        # Parse straight from memory.  The previous round trip through a
        # NamedTemporaryFile(delete=False) left one stray file per call.
        root = ET.fromstring(data, parser=parser)
    except ET.ParseError:
        chunk = html.split('<meta name="generator" content="Aspose.Words for .NET')[1]
        return re.sub('<[^>]*?>','',chunk)
    bodys = root.findall('.//{http://www.w3.org/1999/xhtml}body')
    return '\n'.join(bodys[1].itertext())
def extractantraege(self,scrapedpage,parteien): """extract the Antrag data from an ALLRis page""" parser = ET.XMLParser() parser.parser.UseForeignDTD(True) parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems()) tf = tempfile.NamedTemporaryFile(delete=False) name = tf.name tf.write(scrapedpage.encode('ascii', 'xmlcharrefreplace')) tf.close() try: tree = ET.parse(name, parser=parser) except ET.ParseError: print "parse error in", name, "for", parteien return [] root = tree.getroot() zl11 = root.findall(".//{http://www.w3.org/1999/xhtml}tr[@class='zl11']") zl12 = root.findall(".//{http://www.w3.org/1999/xhtml}tr[@class='zl12']") trs = zl11 + zl12 antraege = [] for tr in trs: tds = tr.findall('{http://www.w3.org/1999/xhtml}td') words = tds[1].find('{http://www.w3.org/1999/xhtml}a').text.split() dsnr = words[0] title = ' '.join(words[1:]) href = self.baseurl+tds[1].find('{http://www.w3.org/1999/xhtml}a').attrib['href'] #partei = tds[3].text typ = tds[5].text antrag = Antrag(bezirk , dsnr, url=href, parteien=parteien) antrag.title = unicode(title) antrag.typ = unicode(typ) antrag.html = self.getAntragHTML(href,bezirk) antrag.text = self.getAntragText(antrag.html) antrag.status = self.getStatus(antrag.html) antrag.ausschuss = self.getAusschussFields(antrag.html) antrag.updateLengths() antraege.append(antrag) return antraege
def replaceEntities(s):
    """Return the replacement text for one matched entity reference.

    ``s`` is a regex match object whose first group is the reference
    without the leading ``&`` and trailing ``;``, e.g. ``#65``, ``#x41``
    or ``amp``.

    * Numeric references (decimal or hex) become the corresponding
      unicode character.
    * Named references are looked up in ``name2codepoint``; ``apos`` is
      handled in addition because that table does not list it.
    * Anything that cannot be resolved is returned as the original
      ``&...;`` text.
    """
    from htmlentitydefs import name2codepoint

    s = s.groups()[0]
    if s[:1] == "#":
        digits = s[1:]
        try:
            if digits[:1] in ('x', 'X'):
                c = int(digits[1:], 16)
            else:
                c = int(digits)
            return unichr(c)
        except (ValueError, OverflowError):
            # Not a number, or not a valid code point: give the text back.
            return '&#' + digits + ';'

    # Named entity.  The name is resolved through name2codepoint so the
    # result is always a unicode character; htmlentitydefs.entitydefs
    # holds Latin-1 byte strings and "&#N;" strings instead.
    if s == 'apos':
        return u"'"
    try:
        return unichr(name2codepoint[s])
    except KeyError:
        return '&' + s + ';'
#!/usr/bin/env python # -*- coding: utf-8 -*- # # tweetokenize: Regular expression based tokenizer for Twitter # Copyright: (c) 2013, Jared Suttles. All rights reserved. # License: BSD, see LICENSE for details. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - import re from os import path from itertools import imap from htmlentitydefs import name2codepoint html_entities = {k: unichr(v) for k, v in name2codepoint.iteritems()} html_entities_re = re.compile(r"&#?\w+;") emoji_ranges = ((u'\U0001f300', u'\U0001f5ff'), (u'\U0001f600', u'\U0001f64f'), (u'\U0001f680', u'\U0001f6c5'), (u'\u2600', u'\u26ff'), (u'\U0001f170', u'\U0001f19a')) emoji_flags = { u'\U0001f1ef\U0001f1f5', u'\U0001f1f0\U0001f1f7', u'\U0001f1e9\U0001f1ea', u'\U0001f1e8\U0001f1f3', u'\U0001f1fa\U0001f1f8', u'\U0001f1eb\U0001f1f7', u'\U0001f1ea\U0001f1f8', u'\U0001f1ee\U0001f1f9', u'\U0001f1f7\U0001f1fa', u'\U0001f1ec\U0001f1e7' } def _converthtmlentities(msg): def replace_entities(s): s = s.group(0)[1:-1] # remove & and ; if s[0] == '#': try: return unichr(int(s[2:], 16) if s[1] in 'xX' else int(s[1:]))
#!/usr/bin/env python # -*- coding: utf-8 -*- # # tweetokenize: Regular expression based tokenizer for Twitter # Copyright: (c) 2013, Jared Suttles. All rights reserved. # License: BSD, see LICENSE for details. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - import re from os import path from itertools import imap from htmlentitydefs import name2codepoint html_entities = {k: unichr(v) for k, v in name2codepoint.iteritems()} html_entities_re = re.compile(r"&#?\w+;") emoji_ranges = ((u'\U0001f300', u'\U0001f5ff'), (u'\U0001f600', u'\U0001f64f'), (u'\U0001f680', u'\U0001f6c5'), (u'\u2600', u'\u26ff'), (u'\U0001f170', u'\U0001f19a')) emoji_flags = {u'\U0001f1ef\U0001f1f5', u'\U0001f1f0\U0001f1f7', u'\U0001f1e9\U0001f1ea', u'\U0001f1e8\U0001f1f3', u'\U0001f1fa\U0001f1f8', u'\U0001f1eb\U0001f1f7', u'\U0001f1ea\U0001f1f8', u'\U0001f1ee\U0001f1f9', u'\U0001f1f7\U0001f1fa', u'\U0001f1ec\U0001f1e7'} def _converthtmlentities(msg): def replace_entities(s): s = s.group(0)[1:-1] # remove & and ; if s[0] == '#': try: return unichr(int(s[2:],16) if s[1] in 'xX' else int(s[1:])) except ValueError: return '&#' + s + ';' else:
import sys from cStringIO import StringIO from xml.etree import ElementTree as ET from htmlentitydefs import name2codepoint source = StringIO("""<html> <body> <p>Less than "<"</p> <p>Non-breaking space " "</p> </body> </html>""") parser = ET.XMLParser() parser.parser.UseForeignDTD(True) parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems()) etree = ET.ElementTree() tree = etree.parse(source, parser=parser) for p in tree.findall('.//p'): print ET.tostring(p, encoding='UTF-8')
import re
import codecs
from htmlentitydefs import name2codepoint, codepoint2name

# character -> "&name;" for every character that has a named HTML entity
encode_table = {unichr(code): "&%s;" % name
                for code, name in codepoint2name.items()}
encode_regex = re.compile(
    "(%s)" % "|".join(map(re.escape, encode_table.keys())))

# "&name;" -> character
decode_table = {"&%s;" % name: unichr(code)
                for name, code in name2codepoint.items()}
# group 1: decimal reference, group 2: hex reference, group 3: named entity.
# The hex branch accepts hex digits (a-f), not only decimal digits.
decode_regex = re.compile(
    r"(?:&#(\d{1,7});)|(?:&#[xX]([0-9a-fA-F]{1,6});)|(&\w{1,8};)")


class HtmlCodec(codecs.Codec):
    """Codec that maps characters to and from HTML entity references."""

    def encode(self, input, errors='strict'):
        """Replace every character that has a named entity by ``&name;``.

        Returns ``(output, length_consumed)`` where the length is that of
        ``input``.  ``errors`` is accepted for interface compatibility and
        not used.
        """
        output = encode_regex.sub(
            lambda match: encode_table[match.group(0)], input)
        return output, len(input)

    def decode(self, input, errors='strict'):
        """Replace decimal, hex and named references by their characters.

        Unknown names and numeric references outside the valid code point
        range are left unchanged.  Returns ``(output, length_consumed)``
        where the length is that of ``input``.  ``errors`` is not used.
        """
        def substitute(match):
            code, xcode, entity = match.group(1, 2, 3)
            if entity:
                return decode_table.get(entity, entity)
            try:
                return unichr(int(code) if code else int(xcode, 16))
            except (ValueError, OverflowError):
                # Not a representable code point: keep the reference text.
                return match.group(0)

        output = decode_regex.sub(substitute, input)
        return output, len(input)


class HtmlIncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder placeholder; encoding is not implemented."""

    def encode(self, input, final=False):
        raise NotImplementedError
def parse(file1, file2):
    """Convert the multiple-choice XML file ``file1`` to the text file ``file2``.

    Both names are relative to the hard-coded ``path`` directory.  Each
    child of the XML root (``mc``) is read in two passes:

    1. Assign ``' @entityN '`` markers: the text of the question's blank
       gets a fresh marker, the correct choice reuses that marker, and
       every other choice gets a fresh marker of its own.
    2. Build the question (blank replaced by ``@placeholder``), look up
       the answer marker, substitute the markers into the context
       document and write question, answer and document to ``file2``.
       Items whose document contains neither the correct choice nor the
       blank text are skipped.

    NOTE(review): the nesting of the second pass was reconstructed from a
    whitespace-mangled copy of this function -- verify it against the
    original layout.
    """
    path = '/Users/apple/Downloads/wdw_script/who_did_what/Strict/'
    # Register HTML named entities so the XML parser can resolve them.
    parser = ET.XMLParser()
    parser.parser.UseForeignDTD(True)
    parser.entity.update((x, unichr(i)) for x, i in name2codepoint.iteritems())
    etree = ET.ElementTree()
    # NOTE(review): if file1 is not in `path`, `root` is never bound and
    # the loops below raise NameError.  `data` is never closed explicitly.
    for filename in os.listdir(path):
        if filename == file1:
            data = open(path + filename, 'r')
            root = etree.parse(data, parser=parser)
            break
    # question:question, answer:choice=True, document
    ind = 1  # running number for the next @entity marker
    entity_lists = []  # per mc: incorrect choice text -> marker
    plus = []  # per mc: blank text -> marker
    correct = []  # per mc: correct choice text -> marker of the blank
    for mc in root:
        entity_lists.append({})
        plus.append({})
        correct.append({})
        memory = None  # marker given to this item's blank, if seen
        for child in mc:
            if child.tag == 'question':
                for grandchildren in child:
                    if grandchildren.tag == 'blank':
                        t = grandchildren.text
                        t = clean_str(t)
                        plus[-1][t] = ' @entity' + str(ind) + ' '
                        ind += 1
                        memory = plus[-1][t]
                        break
            elif child.tag == 'choice':
                t = clean_str(child.text)
                if child.attrib['correct'] == 'true':
                    # The correct choice shares the blank's marker.
                    correct[-1][t] = memory
                else:
                    entity_lists[-1][t] = ' @entity' + str(ind) + ' '
                    ind += 1
    i = -1
    # NOTE(review): the output file is never closed explicitly.
    f = open(path + file2, 'w')
    for mc in root:
        i += 1
        dic = entity_lists[i]
        pl = plus[i]
        co = correct[i]
        for child in mc:
            if child.tag == 'question':
                question = ''
                for grandchildren in child:
                    # print grandchildren.text
                    if grandchildren.tag == 'blank':
                        t = '@placeholder '
                    else:
                        t = grandchildren.text
                        if not t:
                            continue
                        t = clean_str(t) + ' '
                    question += t
            if child.tag == 'choice':
                if child.attrib['correct'] == 'true':
                    t = clean_str(child.text)
                    answer = co[t]
            if child.tag == 'contextart':
                document = clean_str(child.text)
        # flag stays True when the document mentions neither the correct
        # choice nor the blank text; such items are not written.
        flag = True
        for key, value in co.items():
            if key in document:
                flag = False
                document = document.replace(key, value)
        for key, value in pl.items():
            if key in document:
                flag = False
                document = document.replace(key, value)
        if flag:
            continue
        for key, value in dic.items():
            document = document.replace(key, value)
        f.write(question + '\n')
        f.write(answer + '\n')
        f.write(document + '\n\n')