def __convert_drug(self, node):
    """Build a ``Drug`` from ``node`` and print it if it has an FDA product id.

    The name, indication, National Drug Code Directory identifier and
    Wikipedia URL are read from ``node`` through XPath lookups.  Nothing is
    printed when the FDA product id lookup yields a falsy value.

    NOTE(review): every visible path returns ``None``; the drug is only
    printed.  Confirm whether it was meant to be returned or stored.
    """
    drug = Drug()
    drug.name = xpath.findvalue('name', node)
    drug.indication = xpath.findvalue('indication', node)
    drug.fda_product_id = xpath.findvalue(
        'external-identifiers/external-identifier'
        '[starts-with(resource, "National Drug Code Directory")]/identifier',
        node)
    drug.wikipedia = xpath.findvalue(
        'external-links/external-link'
        '[starts-with(resource, "Wikipedia")]/url',
        node)

    # Drugs without an FDA product id are skipped.
    if not drug.fda_product_id:
        return

    # Call form: prints the same text under Python 2 and also parses as
    # Python 3, unlike the bare ``print`` statement.
    print(str(drug))
def parse_response(self, resp):
    """Parse an XML import response and pull out its status and counters.

    ``resp`` is parsed with ``parseString``.  The ``//status`` value and the
    ``imported`` / ``ignored`` / ``updated`` attributes selected by
    ``//dataValueCount[1]`` are then looked up with XPath.

    Returns ``(False, message)`` when any of those steps raises.

    NOTE(review): the visible code builds ``resp_dict`` but never returns
    it, so the success path falls through and yields ``None``.  Confirm
    against the rest of the file.
    """
    resp_dict = {"imported": 0, "ignored": 0, "updated": 0}
    try:
        doc = parseString(resp)
        status = xpath.findvalue('//status', doc)
        imported = xpath.findvalue('//dataValueCount[1]/@imported', doc)
        ignored = xpath.findvalue('//dataValueCount[1]/@ignored', doc)
        updated = xpath.findvalue('//dataValueCount[1]/@updated', doc)
        #conflicts = xpath.find('//conflict', doc)
        resp_dict = {"status": status,
                     "imported": imported,
                     "ignored": ignored,
                     "updated": updated}
    except Exception as e:
        # ``as`` form is accepted by Python 2.6+ and Python 3; the old
        # ``except Exception, e`` spelling is Python-2-only.
        return False, "%s" % str(e)
def get_games(stage):
    """Return one game dict per ``tbody/tr`` row of every group in the stage.

    Each dict carries ``id``, ``group``, ``time`` and ``location`` read from
    the row, plus ``home_team`` / ``away_team`` as produced by ``get_team``.
    """
    fixtures = []
    for group_node in get_dom(stage):
        caption = xpath.findvalue('caption', group_node)
        for row in xpath.find('tbody/tr', group_node):
            match_number = xpath.findvalue('td[contains(@class,"mNum")]', row)

            # The kick-off time is the second comma-separated part of the
            # span's title attribute.
            title = xpath.findvalue('td[contains(@class,"dt")]/span/@title', row)
            kickoff = timestring_to_datetime(title.split(",")[1].encode('utf-8'))

            venue = xpath.findvalue('td/a[contains(@href,"destination")]', row).strip()

            game = {
                'id': match_number,
                'group': caption,
                'time': kickoff,
                'location': venue,
            }
            for side in ('home', 'away'):
                game[side + '_team'] = get_team(row, side)
            fixtures.append(game)
    return fixtures
def test_render(self):
    # Rendering a field with an explicit value must echo that value, carry
    # no ``readonly`` attribute and expose the widget's ``classname``.
    class F(Form):
        fields = [
            Field('name',
                  conv=convs.Char(),
                  widget=self.widget(classname="cls")),
        ]

    markup = '<p>Paragraph</p>'
    widget = F(self.env).get_field('name').widget
    html = self.parse(widget.render(markup))
    self.assertEqual(self.get_value(html), markup)

    tag_path = './/*:%s' % self.tag
    self.assertEqual(xpath.findvalue(tag_path + '/@readonly', html), None)
    self.assertEqual(xpath.findvalue(tag_path + '/@class', html), 'cls')
def parse_response(self, resp):
    """Parse an XML import response and pull out its status and counters.

    ``resp`` is parsed with ``parseString``.  The ``//status`` value and the
    ``imported`` / ``ignored`` / ``updated`` attributes selected by
    ``//dataValueCount[1]`` are then looked up with XPath.

    Returns ``(False, message)`` when any of those steps raises.

    NOTE(review): the visible code builds ``resp_dict`` but never returns
    it, so the success path falls through and yields ``None``.  Confirm
    against the rest of the file.
    """
    resp_dict = {"imported": 0, "ignored": 0, "updated": 0}
    try:
        doc = parseString(resp)
        status = xpath.findvalue('//status', doc)
        imported = xpath.findvalue('//dataValueCount[1]/@imported', doc)
        ignored = xpath.findvalue('//dataValueCount[1]/@ignored', doc)
        updated = xpath.findvalue('//dataValueCount[1]/@updated', doc)
        #conflicts = xpath.find('//conflict', doc)
        resp_dict = {
            "status": status,
            "imported": imported,
            "ignored": ignored,
            "updated": updated
        }
    except Exception as e:
        # ``as`` form is accepted by Python 2.6+ and Python 3; the old
        # ``except Exception, e`` spelling is Python-2-only.
        return False, "%s" % str(e)
def getValue(self, location, context=None):
    """Evaluate ``location`` and return a single value from the document.

    When ``context`` is omitted the document element of ``self._doc`` is
    used.  The lookup runs inside ``with self:``.
    """
    with self:
        start = self._doc.documentElement if context is None else context
        return xpath.findvalue(location, start, originalContext=[start])
def get_team(match, home_or_away):
    """Describe the home or away side of a match row.

    Returns a dict with ``name``, ``flag`` and ``href`` when the team cell
    contains a link.  Otherwise returns a dict with a single ``reference``
    entry: a ``{'rank', 'game_ref'}`` dict for placeholders such as ``1A``,
    ``W25`` or ``L25``, or the raw cell text when no pattern applies.
    """
    cell_path = 'td[contains(@class,"' + home_or_away + 'Team")]'
    link = xpath.findvalue(cell_path + '/a/@href', match)
    label = xpath.findvalue(cell_path, match)

    # A linked cell is a known team.
    if link is not None:
        flag_src = xpath.findvalue('td/a[@href="' + link + '"]/img/@src', match)
        return {'name': label, 'flag': flag_src, 'href': link}

    # Otherwise the cell holds a placeholder for a yet-unknown team.
    if re.match(r'^[12][A-H]$', label):
        reference = {'rank': int(label[0]), 'game_ref': "Group " + label[1]}
    elif re.match(r'W([0-9]+)$', label):
        reference = {'rank': 1, 'game_ref': "KO " + label[1:]}
    elif re.match(r'L([0-9]+)$', label):
        reference = {'rank': 2, 'game_ref': "KO " + label[1:]}
    else:
        reference = label
    return {'reference': reference}
def get_games(stage):
    """Build the list of game dicts for ``stage`` from its match nodes.

    Each dict has ``id``, ``group``, ``time``, ``location`` plus
    ``home_team`` / ``away_team`` from ``get_team``.

    If a match node has no kick-off text or no match link, the node is
    dumped to stdout and an empty list is returned, discarding any games
    collected so far.
    """
    games = []
    for matchdom in get_dom(stage):
        # Group name: try the "gname" span, fall back to "rname", else "".
        group_name = xpath.findvalue(
            'tr/td/div/span[@class="gname"]//a', matchdom)
        if group_name is None:
            group_name = xpath.findvalue(
                'tr/td/div/span[@class="rname"]//a', matchdom)
        group_name = "" if group_name is None else group_name.strip(' \t\n\r')

        # The day of month is the first space-separated token of the date span.
        dayvalue = xpath.findvalue(
            'tr/td/div/span[@class="b dateT"]',
            matchdom).strip().split(' ')[0].encode('utf-8')

        # Kick-off text is either inside a link in the score cell or the
        # cell text itself.
        hourstr = xpath.findvalue('tr/td[@class="c b score nob"]//a', matchdom)
        if hourstr is None:
            hourstr = xpath.findvalue('tr/td[@class="c b score nob"]', matchdom)
        if hourstr is None:
            # Call form prints the same text on Python 2 and also parses
            # as Python 3.
            print(matchdom.toprettyxml(encoding='utf-8'))
            return []
        hourvalue = hourstr.strip().encode('utf-8')

        # Text is "<hour>.<minute>"; split once and reuse both parts.
        hour_parts = hourvalue.split('.')
        # NOTE(review): the year/month are hard-coded to June 2012 and the
        # hour is shifted by -2 -- presumably a local-time to UTC
        # conversion; confirm.
        matchtime = datetime(2012, 6, int(dayvalue),
                             int(hour_parts[0]) - 2,
                             int(hour_parts[1]), 0, 0)

        # The stadium is whatever follows the last comma after "Stadium:".
        stadium = re.match(
            r".*Stadium:.*,(.*)",
            xpath.findvalue('tr[@class="referee_stadium"]/td',
                            matchdom).encode('utf-8').strip(),
            flags=re.DOTALL).group(1).strip()

        hournode = xpath.findvalue(
            'tr/td[@class="c b score nob"]//a/@href', matchdom)
        if hournode is None:
            print(matchdom.toprettyxml(encoding='utf-8'))
            return []

        # NOTE(review): 2003318 looks like an offset mapping site match
        # numbers onto small local ids -- confirm.
        matchid = int(
            re.match(
                r"/uefaeuro/season=2012/matches/round=[0-9]+/match=([0-9]+)/index.html",
                hournode).group(1)) - 2003318

        new_game = {
            'id': matchid,
            'group': group_name,
            'time': matchtime,
            'location': stadium,
        }
        for home_or_away in ['home', 'away']:
            new_game[home_or_away + '_team'] = get_team(matchdom, home_or_away)
        games.append(new_game)
    return games
def get_team(match, home_or_away):
    """Describe the home or away side of a match node.

    Returns a dict with ``name``, ``flag`` and ``href`` when the team cell
    contains a link.  Otherwise returns a dict with a single ``reference``
    entry: a ``{'rank', 'game_ref'}`` dict for placeholders such as ``1A``,
    ``W25`` or ``L25``, or the stripped cell text when no pattern applies.
    """
    cell_path = 'tr/td[contains(@class,"' + home_or_away + '")]'
    link = xpath.findvalue(cell_path + '/a/@href', match)
    label = xpath.findvalue(cell_path, match).strip()

    # A linked cell is a known team; its flag image sits in an anchor with
    # the same href.
    if link is not None:
        flag_src = xpath.findvalue(
            '//tr/td/a[@href="' + link + '"]/img/@src', match)
        return {'name': label, 'flag': flag_src, 'href': link}

    # Otherwise the cell holds a placeholder for a yet-unknown team.
    if re.match(r'^[12][A-H]$', label):
        reference = {
            'rank': int(label[0]),
            'game_ref': "Group " + label[1],
        }
    elif re.match(r'W([0-9]+)$', label):
        reference = {'rank': 1, 'game_ref': "KO " + label[1:]}
    elif re.match(r'L([0-9]+)$', label):
        reference = {'rank': 2, 'game_ref': "KO " + label[1:]}
    else:
        reference = label
    return {'reference': reference}
def get_games(stage):
    """Return one game dict per ``tbody/tr`` row of every group in the stage.

    Each dict carries ``id``, ``group``, ``time`` and ``location`` read from
    the row, plus ``home_team`` / ``away_team`` as produced by ``get_team``.
    """
    collected = []
    for group_node in get_dom(stage):
        caption = xpath.findvalue('caption', group_node)
        for row in xpath.find('tbody/tr', group_node):
            match_number = xpath.findvalue('td[contains(@class,"mNum")]', row)

            # The kick-off time is the second comma-separated part of the
            # span's title attribute.
            raw_title = xpath.findvalue(
                'td[contains(@class,"dt")]/span/@title', row)
            kickoff = timestring_to_datetime(
                raw_title.split(",")[1].encode('utf-8'))

            venue = xpath.findvalue(
                'td/a[contains(@href,"destination")]', row).strip()

            entry = {
                'id': match_number,
                'group': caption,
                'time': kickoff,
                'location': venue,
            }
            for side in ('home', 'away'):
                entry[side + '_team'] = get_team(row, side)
            collected.append(entry)
    return collected
def test_render_readonly(self):
    # A field declared with permissions="r" must render the raw form value
    # and carry readonly="readonly".
    class F(Form):
        fields = [
            Field('name',
                  conv=convs.Char(),
                  widget=self.widget(),
                  permissions="r"),
        ]

    expected = '<p>Paragraph</p>'
    form = F(self.env)
    form.raw_data = MultiDict({'name': expected})
    rendered = form.get_field('name').widget.render()
    html = self.parse(rendered)
    self.assertEqual(self.get_value(html), expected)

    readonly_path = './/*:%s/@readonly' % self.tag
    self.assertEqual(xpath.findvalue(readonly_path, html), 'readonly')
def extractMenu(doc):
    """Extract up to five daily menu texts from the ``menyn`` div of ``doc``.

    ``//font[$day]`` is probed for ``day`` = 2..19.  A probe counts as a
    menu when its value is longer than seven characters; a trailing weekday
    name is stripped from it.  Probing stops once five menus are collected.

    Returns the list of five menu strings.

    Raises ``LunchrParseException`` when the ``menyn`` div is missing or
    when fewer than five menus are found within the probed range.
    """
    weekdays = ["Måndag", "Tisdag", "Onsdag", "Torsdag", "Fredag", "Lördag", "Söndag"]
    # Loop-invariant, so built once: matches a weekday name at the end of
    # the text.
    regexp = r'(' + '|'.join(weekdays) + ')$'

    menudoc = xpath.find("//div[@class='menyn']", doc)
    if not menudoc:
        # Previously surfaced as a bare IndexError from ``menudoc[0]``.
        raise LunchrParseException("Couldn't find any lunch menu.")

    menu = []
    c = 2
    while len(menu) < 5 and c < 20:
        weekmenu = xpath.findvalue("//font[$day]", menudoc[0], day=c)
        c = c + 1
        if weekmenu and len(weekmenu) > 7:
            menu.append(re.sub(regexp, '', weekmenu))

    # Decide on the number of menus found, not on the probe counter.  The
    # old ``c == 20`` check also fired when the fifth menu was found on the
    # very last probe, discarding a complete result.
    if len(menu) < 5:
        raise LunchrParseException("Couldn't find any lunch menu.")
    return menu
def get_games(stage):
    """Build the list of game dicts for ``stage`` from its match nodes.

    Each dict has ``id``, ``group``, ``time``, ``location`` plus
    ``home_team`` / ``away_team`` from ``get_team``.

    If a match node has no kick-off text or no match link, the node is
    dumped to stdout and an empty list is returned, discarding any games
    collected so far.
    """
    games = []
    for matchdom in get_dom(stage):
        # Group name: try the "gname" span, fall back to "rname", else "".
        group_name = xpath.findvalue('tr/td/div/span[@class="gname"]//a', matchdom)
        if group_name is None:
            group_name = xpath.findvalue('tr/td/div/span[@class="rname"]//a', matchdom)
        group_name = "" if group_name is None else group_name.strip(' \t\n\r')

        # The day of month is the first space-separated token of the date span.
        dayvalue = xpath.findvalue('tr/td/div/span[@class="b dateT"]', matchdom).strip().split(' ')[0].encode('utf-8')

        # Kick-off text is either inside a link in the score cell or the
        # cell text itself.
        hourstr = xpath.findvalue('tr/td[@class="c b score nob"]//a', matchdom)
        if hourstr is None:
            hourstr = xpath.findvalue('tr/td[@class="c b score nob"]', matchdom)
        if hourstr is None:
            # Call form prints the same text on Python 2 and also parses
            # as Python 3.
            print(matchdom.toprettyxml(encoding='utf-8'))
            return []
        hourvalue = hourstr.strip().encode('utf-8')

        # Text is "<hour>.<minute>"; split once and reuse both parts.
        hour_parts = hourvalue.split('.')
        # NOTE(review): the year/month are hard-coded to June 2012 and the
        # hour is shifted by -2 -- presumably a local-time to UTC
        # conversion; confirm.
        matchtime = datetime(2012, 6, int(dayvalue), int(hour_parts[0]) - 2, int(hour_parts[1]), 0, 0)

        # The stadium is whatever follows the last comma after "Stadium:".
        stadium_text = xpath.findvalue('tr[@class="referee_stadium"]/td', matchdom).encode('utf-8').strip()
        stadium = re.match(r".*Stadium:.*,(.*)", stadium_text, flags=re.DOTALL).group(1).strip()

        hournode = xpath.findvalue('tr/td[@class="c b score nob"]//a/@href', matchdom)
        if hournode is None:
            print(matchdom.toprettyxml(encoding='utf-8'))
            return []

        # NOTE(review): 2003318 looks like an offset mapping site match
        # numbers onto small local ids -- confirm.
        matchid = int(re.match(r"/uefaeuro/season=2012/matches/round=[0-9]+/match=([0-9]+)/index.html", hournode).group(1)) - 2003318

        new_game = {'id': matchid,
                    'group': group_name,
                    'time': matchtime,
                    'location': stadium,
                    }
        for home_or_away in ['home', 'away']:
            new_game[home_or_away + '_team'] = get_team(matchdom, home_or_away)
        games.append(new_game)
    return games
def parse(self, node):
    """Return True when the value selected by ``self.selector`` is truthy text.

    The value is lower-cased and compared against ``true``, ``1``, ``yes``
    and ``y``.  A selector that yields ``None`` is treated as ``False``
    rather than failing on ``None.lower()``.
    """
    selection = xpath.findvalue(self.selector, node)
    if selection is None:
        return False
    return selection.lower() in ("true", "1", "yes", "y")
def get_options(self, html):
    """Return ``(value, label, checked)`` for every ``input`` under ``html``.

    ``value`` is the input's ``value`` attribute, ``label`` the text of a
    ``label`` child of the input's parent node, and ``checked`` whether the
    input has a ``checked`` attribute.
    """
    options = []
    for input_node in xpath.find('.//*:input', html):
        value = input_node.getAttribute('value')
        label = xpath.findvalue('./*:label/text()', input_node.parentNode)
        checked = input_node.hasAttribute('checked')
        options.append((value, label, checked))
    return options
def get_value(self, html):
    """Return the ``value`` attribute of the ``self.tag`` element in ``html``."""
    value_path = './/*:%s/@value' % self.tag
    return xpath.findvalue(value_path, html)
def check_multiple(self, html):
    """Assert that the ``input`` found in ``html`` has type ``checkbox``."""
    input_type = xpath.findvalue('.//*:input/@type', html)
    self.assertEqual(input_type, 'checkbox')
# Stream the wiki dump as pull-DOM events and process <page> elements one
# at a time.
events = pulldom.parse(NLWIKI_FILE)

# Stop words: the project's own list merged with NLTK's Dutch list.
nltk_stopwords = nltk.corpus.stopwords.words('dutch')
with open("data/stopwords.txt") as f:
    # One entry per line.  A set comprehension with its own loop name
    # cannot leak into module scope, so it never clobbers the ``x``
    # counter used below.
    stopwords = {line.strip() for line in f}
stopwords.update(nltk_stopwords)

for event, node in events:
    if event == 'START_ELEMENT' and node.tagName == 'page':
        x += 1
        events.expandNode(node)  # node now contains a dom fragment

        title = xpath.findvalue('title', node)
        # Drop text running from a "(" or "|" up to the next ")".  Raw
        # literal: same pattern, without invalid ``\(`` string escapes.
        title = re.sub(r"[\(|].*?[\)]", "", title).strip().lower()
        # Keep single-word titles that contain none of the bad fragments.
        if len(title.split()) > 1 or any(bad in title for bad in bad_list):
            continue
        title = re.sub(r'[\W]+', "", title)

        revision = xpath.findvalue('revision', node)
        text = xpath.findvalues('revision/text', node)

        # Work on the first section of the first text value only, with
        # tables, templates and tags removed from its string form.
        wiki_parsed = wtp.parse(text[0]).sections[0]
        wiki_parsed_str = str(wiki_parsed)
        for table in wiki_parsed.tables:
            wiki_parsed_str = wiki_parsed_str.replace(str(table), "")
        for tmpl in wiki_parsed.templates:
            wiki_parsed_str = wiki_parsed_str.replace(str(tmpl), "")
        for ref in wiki_parsed.get_tags():
            wiki_parsed_str = wiki_parsed_str.replace(str(ref), '')
def test_compiled_expr_argument(self):
    # ``findvalue`` must accept a pre-compiled ``xpath.XPath`` object in
    # place of an expression string.
    expr = xpath.XPath('//item[3]')
    result = xpath.findvalue(expr, self.doc)
    # ``assertEqual`` replaces the deprecated ``failUnlessEqual`` alias.
    self.assertEqual(result, 'parrot')
def parse(self, node):
    """Return the value that ``self.selector`` selects relative to ``node``."""
    selected = xpath.findvalue(self.selector, node)
    return selected
def check_not_multiple(self, html):
    """Assert that the ``select`` in ``html`` yields no ``multiple`` attribute."""
    multiple_attr = xpath.findvalue('.//*:select/@multiple', html)
    self.assertEqual(multiple_attr, None)
def _findvalue(root, xpath_expr):
    """Call ``xpath.findvalue`` with the arguments in (node, expression) order."""
    value = xpath.findvalue(xpath_expr, root)
    return value
def test_compiled_expr_argument(self):
    # ``findvalue`` must accept a pre-compiled ``xpath.XPath`` object in
    # place of an expression string.
    compiled = xpath.XPath('//item[3]')
    self.assertEqual(xpath.findvalue(compiled, self.doc), 'parrot')
def check_not_multiple(self, html):
    """Assert that the ``input`` found in ``html`` has type ``radio``."""
    input_type = xpath.findvalue('.//*:input/@type', html)
    self.assertEqual(input_type, 'radio')