def add_class(original_html, crn1, crn2=''):
    html = lxml.html.fromstring(original_html)
    html.get_element_by_id("crn_id1").value = crn1
    html.get_element_by_id("crn_id2").value = crn2
    form = html.forms[1]
    # return form.form_values().append(('REG_BTN', 'Submit Changes'))
    # would return None (list.append returns None); build the list first
    values = form.form_values()
    values.append(('REG_BTN', 'Submit Changes'))
    return values
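# Usage sketch (not from the original source): the (name, value) pairs that
# add_class() returns can be form-posted directly with requests, which is one
# plausible way to submit them; REGISTRATION_URL is a hypothetical endpoint.
import lxml.html
import requests

REGISTRATION_URL = 'https://example.edu/registration'  # placeholder

def submit_add(original_html, crn):
    values = add_class(original_html, crn)
    # requests accepts a list of (name, value) tuples as form data,
    # preserving field order and duplicate names
    return requests.post(REGISTRATION_URL, data=values)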
def getCountData(self, html):
    result_text = html.get_element_by_id('content').xpath(
        'form/div/div/div/span/text()')[0].strip()
    results_more = '+' in result_text
    result_count = int(
        result_text.split(' ')[0].strip('+').replace(',', ''))
    return result_count, results_more
def get_athletes():
    req = urllib2.Request(
        "http://www.thepowerof10.info/athletes/athleteslookup.aspx?surname=&firstname=&club=beckenham"
    )
    req.add_header("User-Agent", "Mozilla")
    req.add_header("Host", "www.thepowerof10.info")
    req.add_header("Accept", "*/*")
    r = urllib2.urlopen(req)
    text = r.read()
    html = lxml.html.document_fromstring(text)
    # Hurts my eyes to do this, fragile as hell way
    # to get the athlete links via auto-generated ASP.NET ids
    d = html.get_element_by_id("ctl00_cphBody_pnlResults")
    athlete_rows = list(d.cssselect("table#ctl00_cphBody_dgAthletes"))[0]
    for athlete in athlete_rows:
        firstname = athlete.cssselect("td")[0].text_content()
        secondname = athlete.cssselect("td")[1].text_content()
        el = athlete.cssselect("td")[7][0]
        racelink = el.get("href")
        print racelink
        # skip the header row, whose first cell is "First"
        if "First" not in firstname:
            athletes.append(racelink)
def __enter__(self):
    # Similar to assertContains(), we verify the status code
    self.test_case.assertEqual(self.response.status_code, self.status_code)
    # TODO consider validating self.response['Content-Type']
    # Parse the response as HTML
    html = lxml.html.fromstring(self.response.content.decode('utf-8'))
    if self.selector is not None:
        # Use cssselect to filter the elements
        elements = html.cssselect(self.selector)
        # Ensure some data exists
        if len(elements) == 0:
            raise SelectorNotFound(
                'No selector matches found for {0}'.format(self.selector)
            )
        return elements
    if self.element_id is not None:
        try:
            return html.get_element_by_id(self.element_id)
        except KeyError:
            raise ElementIDNotFound(
                'Element with id, {0}, not present'.format(self.element_id)
            )
    # No filtering defined, return the entire parsed HTML document
    return html
def calc(phenny, input):
    """Google calculator."""
    if not input.group(2):
        return phenny.reply("Nothing to calculate.")
    q = input.group(2).encode('utf-8')
    q = q.replace('\xcf\x95', 'phi')  # utf-8 U+03D5
    q = q.replace('\xcf\x80', 'pi')   # utf-8 U+03C0
    uri = 'http://www.google.com/search?q='
    bytes = web.get(uri + web.urllib.quote(q))
    html = lxml.html.fromstring(bytes)
    try:
        answer = html.get_element_by_id("cwos").text_content().strip()
    except KeyError:
        try:
            answer = lxml.etree.tostring(html.find_class("vk_ans")[0])
            answer = answer[answer.find('>') + 1:answer.rfind('<')]
        except IndexError:
            answer = None
    if answer:
        answer = web.decode(answer)
        answer = answer.replace(u'\xc2\xa0', ',')
        answer = answer.replace('<sup>', '^(')
        answer = answer.replace('</sup>', ')')
        answer = answer.encode('utf-8')
        phenny.say(answer)
    else:
        phenny.say('Sorry, no result.')
def scrape_department_list(html):
    """
    Given an html etree, return a list of (code, name) tuples for departments.
    """
    select = html.get_element_by_id("subj_id")
    return [(child.attrib["value"], child.text.strip())
            for child in select if child.tag == "option"]
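# Illustrative call (assumed setup, not from the original source): parse a
# saved copy of the search page and list the departments it offers.
import lxml.html

doc = lxml.html.fromstring(open("course_search.html").read())  # hypothetical file
for code, name in scrape_department_list(doc):
    print("%s: %s" % (code, name))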
def clear_html(source_html):
    html = lxml.html.fromstring(source_html)
    table0 = html.get_element_by_id("GlobalTable")
    table1 = html.cssselect("#GlobalTable tbody tr td table")[1]
    table2 = table1.cssselect("table")[0]
    return lxml.html.tostring(table2, encoding="unicode", pretty_print=True)
def get_results(race):
    req = urllib2.Request(parkrun_url % (race))
    req.add_header("User-Agent", "Mozilla")
    req.add_header("Host", "www.parkrun.org.uk")
    req.add_header("Accept", "*/*")
    r = urllib2.urlopen(req)
    text = r.read()
    html = lxml.html.document_fromstring(text)
    # Hurts my eyes to do this, fragile as hell way
    # to get the race ID and date
    d = html.get_element_by_id("dnn_ctr4953_ModuleContent")
    racename = list(d.find_class("normal")[0])[1].text_content()
    raceid = racename.split('\n')[1].strip().replace("-", "").strip()
    racedate = dateutil.parser.parse(racename.split('\n')[2].strip(),
                                     dayfirst=True)
    assert (int(raceid) == race)
    results_rows = list(html.get_element_by_id("results"))[1]
    racers = []
    for row in results_rows:
        vals = [td.text_content() for td in row]
        racer = dict()
        # Some racers have no times, skip them
        if not vals[2]:
            continue
        racer['position'] = int(vals[0])
        racer['name'] = vals[1]
        min, sec = [int(d) for d in vals[2].split(":")]
        racer['time'] = min * 60 + sec
        racer['agegroup'] = vals[3]
        racer['gender'] = vals[5]
        racer['genderpos'] = int(vals[6])
        racer['race'] = int(race)
        racer['racedate'] = racedate
        racers.append(racer)
    print race, len(racers)
    return racers
def get_element_data(school_url):
    print "connecting to " + school_url
    try:
        page = urllib2.urlopen(school_url).read()
    except:
        print "couldn't open url: " + school_url
        na_data = {'Type': 'n/a', 'Address': 'n/a',
                   'Phone': 'n/a', 'District': 'n/a'}
        return na_data
    try:
        html = lxml.html.fromstring(page)
        element = html.get_element_by_id(
            'ctl00_ContentPlaceHolder1_SchoolInfoDisplay')
        basic_info = [i for i in element.itertext()]
        data = {}
        # The type of school (Private, Public, Catholic) is at the second position
        data["Type"] = basic_info[1]
        # Assemble the address, stripping PO box / general delivery noise
        data["Address"] = re.sub(
            "\sBox\s+\d+|\sC.?P.?\s+\d+|\sGeneral Delivery", "",
            basic_info[2].strip() + ", " + basic_info[3].strip() + ', Canada')
        # Getting the phone number
        data["Phone"] = basic_info[4].split(":")[1].strip()
        # Getting the school district
        data["District"] = basic_info[5].split(":")[1].strip()
    except:
        print "Error trying to parse basic info for: " + school_url
        na_data = {'Type': 'n/a', 'Address': 'n/a',
                   'Phone': 'n/a', 'District': 'n/a'}
        return na_data
    try:
        element = html.get_element_by_id(
            'ctl00_ContentPlaceHolder1_detailedReportCard_SchoolProperties1_tblProps')
        counter = 0
        header_tmp = ''
        # Property cells alternate header, value, header, value, ...
        for e in element.iter('td'):
            if e.keys()[0] == 'width':
                if counter % 2 == 0:
                    header_tmp = e.text
                else:
                    data[header_tmp] = e.text
                counter += 1
    except:
        print "Error trying to parse additional info for: " + school_url
    return data
def search(id, function, filter1=None, filter2=None):
    payload = {
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '_ctl0:ContentPlaceHolder1:chklInstances:' + id: 'on',
        '_ctl0:ContentPlaceHolder1:btnSearch': 'Search',
        '_ctl0:ContentPlaceHolder1:ddlFunctions': function,
    }
    if filter1:
        payload['_ctl0:ContentPlaceHolder1:chklCourtsOfAppeal:' + filter1] = 'on'
    if filter2:
        payload['_ctl0:ContentPlaceHolder1:chklCourts:' + filter2] = 'on'
    # for letter in string.ascii_uppercase:
    payload['_ctl0:ContentPlaceHolder1:txtSearchKenmerken'] = '*'
    session = requests.session()
    url = search_url
    while True:
        response = session.post(url, data=payload)
        print response.status_code
        html = parse(response.content)
        for tr in result_selector(html):
            row = [None] * 3
            for i, td in enumerate(tr):
                if i:
                    row[i] = td.text_content().strip()
                else:
                    link = td[0]
                    # row[0] = link.get('id')
                    row[0] = link.text.strip()
            yield row
        next = html.get_element_by_id('_ctl0_ContentPlaceHolder1_lbNext', None)
        if next is None:
            break
        print 'next'
        payload.clear()
        payload = {
            '__EVENTTARGET': '_ctl0$ContentPlaceHolder1$lbNext',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': html.get_element_by_id('__VIEWSTATE', '').get('value'),
        }
        print payload
        url = results_url
def drop_classes(original_html, crn_list):
    html = lxml.html.fromstring(original_html)
    course_table = html.find_class("datadisplaytable")[0]
    # make a list of crns in the order on the website to match them up
    # with the enumerated form values
    infosu_crns = []
    for element in course_table.find_class('dddefault'):
        try:
            crn = element.getchildren()[1]._value__get()
            if crn:
                infosu_crns.append(crn)
        except:
            pass
    # for element in [element.getchildren()[0]._value__get()
    #                 for element in course_table.find_class('dddefault')
    #                 if type(element.getchildren()[0]) is lxml.html.InputElement]:
    #     if 'DUMMY' in element and len(temp_list) > 0:
    #         course_list.append(temp_list)
    #         temp_list = []
    #     elif 'DUMMY' not in element:
    #         temp_list.append(element)
    # with the list of crns from the website in order, we can make a list
    # of form values to submit and drop
    action_id_list = []
    for crn in crn_list:
        for index, infosu_crn in enumerate(infosu_crns):
            if crn == infosu_crn:
                # action_id starts at 1
                action_id_list.append("action_id" + str(index + 1))
                break
    # set each course to drop in the dropdown boxes (from value '' to 'DX')
    for action_id in action_id_list:
        html.get_element_by_id(action_id)._value__set('DX')  # set to drop
    form = html.forms[1]
    values = form.form_values()
    values.append(('REG_BTN', 'Submit Changes'))
    return values
def parse_results(filename):
    content = open(filename).read()
    html = lxml.html.fromstring(content)
    table = html.get_element_by_id('tournamentTable')
    tbody = list(table.iterchildren())[1]
    rows = tbody.iterchildren()
    results = []
    for row in rows:
        result = parse_row(row)
        if result is not None:
            results.append(result)
    return results
def load_upload_form(self):
    url = submit_url(2)
    with self.client.get(url, allow_redirects=False,
                         catch_response=True) as response:
        if response.status_code == 200:
            html = lxml.html.fromstring(response.content)
            return html.get_element_by_id('create-addon')
        else:
            more_info = ''
            if response.status_code in (301, 302):
                more_info = 'Location: {}'.format(
                    response.headers['Location'])
            response.failure('Unexpected status: {}; {}'.format(
                response.status_code, more_info))
def load_upload_form(self):
    url = helpers.submit_url('upload-unlisted')
    response = self.client.get(url, allow_redirects=False, catch_response=True)
    if response.status_code == 200:
        response.success()
        html = lxml.html.fromstring(response.content)
        return html.get_element_by_id('create-addon')
    else:
        more_info = ''
        if response.status_code in (301, 302):
            more_info = 'Location: {}'.format(response.headers['Location'])
        response.failure(
            f'Unexpected status: {response.status_code}; {more_info}')
def get_domain_num(self, ip):
    """
    Get num of pages.

    @rtype : int
    @param ip: the ip you want to search.
    """
    try:
        r = requests.get("http://dns.aizhan.com/?q=%s" % ip, timeout=5)
        r.close()
        html = lxml.html.fromstring(r.text)
        num = html.get_element_by_id("yhide").text
        return int(num)
    except Exception, e:
        print "[!]ERROR: %s" % e.message
        sys.exit(0)
def sort_by_options(self):
    """
    Validates an HTML <ul> snippet and transforms it into a list of
    (id, label) 2-tuples
    """
    html = self._sortable_fields()
    html = lxml.html.fromstring(html)
    ul = html.get_element_by_id('sortable_fields')
    assert ul.tag.lower() == 'ul'
    _sort_by_options = []
    for li in ul:
        key = li.get('id')
        _sort_by_options.append((key, li.text))
    return _sort_by_options
def fetch(self):
    self.search()
    if not self.source:
        return None
    response = requests.get(self.source)
    self.html = response.text
    html = lxml.html.fromstring(self.html)
    if not self.check():
        return False
    log.info('[FOUND] %s' % self.source)
    div = html.get_element_by_id('lyrics-body')
    lyrics = re.sub(r'\n\[ From: .*? \]', '', div.text_content())
    self.lyrics = lyrics.strip()
    return True
def reserve_tickets(self, tickets):
    # Make sure we have a clean session
    self.client.cookies.clear()
    self.client.get('/')
    resp = self.client.get('/tickets')
    html = lxml.html.fromstring(resp.content)
    form = html.get_element_by_id('choose_tickets')
    amounts = {i.label.text_content(): i.name
               for i in form.inputs if i.name.endswith('-amount')}
    data = dict(**form.fields)
    for display_name, count in tickets.items():
        data[amounts[display_name]] = count
    self.client.post('/tickets', data)
    raise StopLocust()
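# The amounts dict above maps each input's visible <label> text to its form
# field name. A self-contained sketch of that labels-to-inputs trick, using
# made-up HTML (lxml resolves input.label via the matching for= attribute):
import lxml.html

snippet = """
<form id="choose_tickets">
  <label for="t0">Full ticket</label>
  <input id="t0" name="tickets-0-amount" value="0">
</form>"""
form = lxml.html.fromstring(snippet)
amounts = {i.label.text_content(): i.name
           for i in form.inputs if i.name.endswith('-amount')}
assert amounts == {'Full ticket': 'tickets-0-amount'}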
def get_lyrics(azlyrics_url):
    debug("Getting lyrics...")
    r = requests.get(azlyrics_url)
    html = lxml.html.fromstring(r.text)
    main = html.get_element_by_id("main")
    # Finding the artist
    artist = main.find("h2").text
    # Getting rid of the last word, which is always "LYRICS"
    artist = ' '.join(artist.split(' ')[:-1])
    artist = "Artist: " + artist.title()
    # Finding the title
    title = main.find("b").text
    # Getting rid of the beginning and ending quotes
    title = title[1:-1]
    title = "Title: " + title.title()
    main_divs = main.findall("div")
    lyrics_div = None
    for div in main_divs:
        div_comments = filter(lambda x: type(x) == lxml.html.HtmlComment,
                              div.getchildren())
        if ' start of lyrics ' in map(lambda x: x.text, div_comments):
            lyrics_div = div
            break
    if lyrics_div is None:
        raise NoLyricsDivFound(
            "Could not find div containing lyrics in the page")
    debug(" Done!")
    result = "\n".join([artist, title, lyrics_div.text_content()])
    return result
def parse_webpage(inner_html):
    ret = []
    html = lxml.html.fromstring(inner_html)
    search_result_html = html.get_element_by_id("search-results-table")
    result_list = search_result_html.xpath('//table/tbody/tr')
    result_list.pop(0)  # drop the header row
    for result in result_list:
        obj = {}
        card_name = result.xpath("./td/strong/a/text()")[0]
        card_name = card_name.replace("\n", "")
        card_name = card_name.strip()
        obj["name"] = card_name
        set_name = result.xpath("./td[2]/div[1]/text()")[0]
        set_name = set_name.replace("\n", "")
        set_name = set_name.strip()
        splited_set_name = set_name.split(" ")
        set_name = splited_set_name[0]
        obj["set"] = set_name
        if len(splited_set_name) > 1:
            obj["foil"] = True
        else:
            obj["foil"] = False
        price = result.xpath("./td[3]/div[2]/strong/text()")[0]
        price = price.replace("\n", "")
        price = price.strip()
        price = price.split(" ")[0]
        obj["tix"] = price
        ret.append(obj)
    return ret
def get_runs():
    req = urllib2.Request("http://www.parkrun.org.uk/results/firstfinishers/")
    req.add_header("User-Agent", "Mozilla")
    req.add_header("Host", "www.parkrun.org.uk")
    req.add_header("Accept", "*/*")
    r = urllib2.urlopen(req)
    text = r.read()
    html = lxml.html.document_fromstring(text)
    # Hurts my eyes to do this, fragile as hell way
    # to get the run names and links
    d = html.get_element_by_id("content")
    runs_rows = list(d.cssselect("table.sortable"))[0][1]
    for row in runs_rows:
        name = row.cssselect("td")[0].text_content()
        el = row.cssselect("td a")[0]
        linkparts = el.get('href').split('/')
        racelink = linkparts[len(linkparts) - 2]
        runs.append(racelink)
        runnames.append(name)
def get_results(athleteurl):
    print "Getting athlete info for " + athleteurl
    # The original read atheleteinfo_url.athleteurl (attribute access);
    # % interpolation is presumably what was intended
    req = urllib2.Request(atheleteinfo_url % athleteurl)
    req.add_header("User-Agent", "Mozilla")
    req.add_header("Host", "www.thepowerof10.info")
    req.add_header("Accept", "*/*")
    r = urllib2.urlopen(req)
    text = r.read()
    html = lxml.html.document_fromstring(text)
    # Hurts my eyes to do this, fragile as hell way
    # to get the athlete name
    d = html.get_element_by_id("ctl00_cphBody_pnlMain")
    athletename = d.cssselect("h2")[0].text_content()
    # racerealname = racename.split('#')[0].strip()
    # raceid = racename.split('\n')[1].strip().replace("-", "").strip()
    # assert(int(raceid) == race)
    # print racename
    # results_rows = list(html.get_element_by_id("results"))[1]
def __enter__(self):
    # Parse the rendered template as HTML
    html = lxml.html.fromstring(self.rendered_template)
    if self.selector is not None:
        # Use cssselect to filter the elements
        elements = html.cssselect(self.selector)
        # Ensure some data exists
        if len(elements) == 0:
            raise SelectorNotFound(
                'No selector matches found for {0}'.format(self.selector)
            )
        return elements
    if self.element_id is not None:
        try:
            return html.get_element_by_id(self.element_id)
        except KeyError:
            raise ElementIDNotFound(
                'Element with id, {0}, not present'.format(self.element_id)
            )
    # No filtering defined, return the entire parsed HTML document
    return html
"MDPV", "Amphetamin", "Amphetami", "Coffein", "MDDMA", "4-Fa", "Buflomedil", "Amoxicillin", "m-CCP", "4-F-A", ]) if __name__ == "__main__": results = [] html = html.fromstring(open("tmp/page.html").read()) names = map(lambda _: _.text_content(), html.get_element_by_id("cc").findall("h2")) for i, tag in enumerate(html.find_class("inhalt_pillen")): infos = dict(name=names[i]) images = map(lambda _: _.get("src"), tag.findall("img")) for image in images: image_filename = os.path.join("tmp/pictures", os.path.basename(image)) if not os.path.isfile(image_filename): r = requests.get("http://www.mindzone.info/%s" % (image), stream=True) with open(image_filename, 'wb') as f: for chunk in r.iter_content(chunk_size=1024): if chunk: # filter keep-alive new chunks f.write(chunk) f.flush() infos["images"] = map(lambda _: os.path.basename(_), images) if not infos["images"]: continue for line in etree.tostring(tag).split("\n"):
def by_id(html, id_name):
    return (html.get_element_by_id(id_name)
            .text_content().replace("\n", " ").replace("\t", " "))
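# by_id() flattens newlines and tabs, which keeps assertions on rendered text
# stable. A tiny self-contained check (example markup, not from the source):
import lxml.html

doc = lxml.html.fromstring("<div><p id='greeting'>Hello\nworld</p></div>")
assert by_id(doc, "greeting") == "Hello world"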
data = {
    "rank": [],
    "name": [],
    "current_players": [],
    "peak_players": [],
    "hours_played": []
}

# Gather our data from the internet
print(
    "Gathering Steam Game Statistics (Last 30 Days). This may take a few moments."
)
for i in range(1, 41):
    r = requests.get("https://steamcharts.com/top/p." + str(i))
    html = lxml.html.fromstring(r.text)
    top_games_table = html.get_element_by_id("top-games")
    tableElements = top_games_table.getchildren()[1].getchildren()
    # Import our data into Pandas DataFrame
    for element in tableElements:
        cells = element.getchildren()
        data["rank"].append(
            int(str(cells[0].text_content().strip()).replace(".", "")))
        data["name"].append(cells[1].text_content().strip())
        data["current_players"].append(int(cells[2].text_content().strip()))
        data["peak_players"].append(int(cells[4].text_content().strip()))
        data["hours_played"].append(int(cells[5].text_content().strip()))

# Create the Database
db = sqlite3.connect(":memory:")
cursor = db.cursor()
cursor.execute(
    formV = extractForm(html.forms[0])
    formV['p_kch'] = course_id
    formV['page'] = '-1'
    formV['m'] = 'rxSearch'
except Exception as e:
    print 'pass.'

while True:
    try:
        data = s.post('http://zhjwxk.cic.tsinghua.edu.cn/xkBks.vxkBksXkbBs.do',
                      data=formV).text
        if data.find('table_t') < 0:
            print "Session timeout!"
            break
        html = lxml.html.fromstring(data)
        tbRes = html.get_element_by_id('table_t')
        ind = 0
        for td in tbRes.cssselect('td'):
            ind += 1
            if ind == 5:
                left = td.text_content()
                break
        if left != "0":
            html = lxml.html.fromstring(data)
            formV = extractForm(html.forms[0])
            formV['m'] = 'saveRxKc'
            formV['p_rx_id'] = "%s;%s;%s;" % (sem_num, course_id, sub_id)
            data = s.post('http://zhjwxk.cic.tsinghua.edu.cn/xkBks.vxkBksXkbBs.do',
                          data=formV).text
            print re.search(r'showMsg\((.*)\)', data).group(1)
            exit(0)
def get_logstatus(html):
    html = lxml.html.fromstring(bytes(html, 'utf8'))
    html = html.get_element_by_id('document-navigation')
    links = html.cssselect('li>a')
    status = links[1].text_content()
    return status
"MDDMA", "4-Fa", "Buflomedil", "Amoxicillin", "m-CCP", "4-F-A", "Metham", "Butylon", "MDPV", "DMA", ]) if __name__ == "__main__": html_parser = HTMLParser.HTMLParser() results = [] html = html.fromstring(open("tmp/page.html").read()) names = map(lambda _: _.text_content(), html.get_element_by_id("cc").findall("h2")) for i, tag in enumerate(html.find_class("inhalt_pillen")): infos = dict(name=names[i]) images = map(lambda _: _.get("src"), tag.findall("img")) for image in images: image_filename = os.path.join("tmp/pictures", os.path.basename(image)) if not os.path.isfile(image_filename): r = requests.get("http://www.mindzone.info/%s" % (image), stream=True) with open(image_filename, 'wb') as f: for chunk in r.iter_content(chunk_size=1024): if chunk: # filter keep-alive new chunks f.write(chunk) f.flush() infos["images"] = map(lambda _: os.path.basename(_), images) if not infos["images"]: continue for line in etree.tostring(tag).split("\n"):
page = urllib2.urlopen(sourceUrl)
if page.geturl() == sourceUrl:  # i.e. we haven't been redirected
    data = {}
    data["combinationId"] = combinationId
    data["sourceUrl"] = sourceUrl
    html = lxml.html.parse(page).getroot()
    data["drugClass"] = html.get_element_by_id(
        "ctl00_ContentPlaceHolder1_FormView1_lblClass").text
    data["drugName"] = html.get_element_by_id(
        "ctl00_ContentPlaceHolder1_FormView1_lblDrugName").text
    data["HIVDrugName"] = html.get_element_by_id(
        "ctl00_ContentPlaceHolder1_FormView1_lblHIVDrugName").text
    data["warning"] = html.get_element_by_id(
        "ctl00_ContentPlaceHolder1_FormView1_lblIntearctionName").text
    data["evidence"] = html.get_element_by_id(
        "ctl00_ContentPlaceHolder1_FormView1_lblGradeName").text
    data["summary"] = html.get_element_by_id(
        "ctl00_ContentPlaceHolder1_FormView1_lblSummary").text
    descriptionElement = html.get_element_by_id(
        "ctl00_ContentPlaceHolder1_FormView1_lblTitle")
    data["description"] = descriptionElement.text
    try:
        # This could be vulnerable if there were many references, or none
        data["reference"] = descriptionElement[1].text
    except:
        pass
    data["HIVDrugURL"] = html.get_element_by_id(
        "ctl00_ContentPlaceHolder1_FormView1_hypInteractionsList").get("href")
    scraperwiki.sqlite.save(unique_keys=["combinationId"], data=data)
def parse_html(html):
    table = html.get_element_by_id('ctl00_ContentPlaceHolder3_grwIzvestaji')
    rows = iter(table)
    rows.next()  # skip the header row
    for row in rows:
        yield row.text_content()
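# Hypothetical driver for parse_html (the filename is illustrative only).
# Note rows.next() above makes this Python 2 code, hence the print statement.
import lxml.html

doc = lxml.html.fromstring(open("izvestaji.html").read())
for row_text in parse_html(doc):
    print row_text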
def get_csrf_token(self, token_for):
    rv = self.app.get('/' + token_for)
    html = lxml.html.document_fromstring(rv.data)
    return html.get_element_by_id('csrf_token').value
def class_search(original_html, dep, num):
    html = lxml.html.fromstring(original_html)

    ###################################
    # Get course desc / title via regex
    title_regex = compile('\n\s+([\w\s/,-]+)\r\s+\(\d\)\.')
    # also handles bacc core classes
    description_regex = compile(
        '\(\d\)\.\s*?(<img .*?>)?\s*?(<img .*?>)?\s*?</h3>\s+([\w\s/,\.-]+)')
    try:
        title = title_regex.findall(original_html)[0]
    except:
        title = None
    try:
        description = description_regex.findall(original_html)[0][2]
    except:
        description = None
    ###################################

    try:
        table_element = html.get_element_by_id(
            'ctl00_ContentPlaceHolder1_SOCListUC1_gvOfferings')
    except:
        return []
    table_elements = table_element.getchildren()[1:]
    elements_to_int = ['WL Cap', 'Weeks', 'CRN', 'WL Curr', 'WL Avail', 'Cr']
    classes = []
    row_headers = []
    for header in table_element.getchildren()[0].getchildren():
        row_headers.append(header.text_content())
    for row in table_elements:
        one_class = {}
        cells = row.getchildren()
        for index, cell in enumerate(cells):
            content = cell.text_content().strip()
            if row_headers[index] == 'Restrictions' and content != '':
                content = content.split(':')[1].strip().replace(
                    '\n', '').replace('\r', '').replace(' ', '')
            if content != 'TBA':
                if row_headers[index] == 'Day/Time/Date':
                    fields = content.split(' ')
                    days = list(fields[0])
                    try:
                        times = fields[1][:9].split('-')
                        for inner_index, time in enumerate(times):
                            times[inner_index] = time[:2] + ':' + time[2:]
                    except:
                        times = ''
                    try:
                        dates = fields[1][8:]
                    except:
                        dates = ''
                    content = {"days": days, "time": times, "dates": dates}
                if row_headers[index] == 'Day/Time/Date':
                    one_class['days'] = content['days']
                    one_class['times'] = content['time']
                    one_class['duration'] = content['dates']
                else:
                    one_class[str.lower(row_headers[index])] = content
            else:
                one_class['days'] = 'TBA'
                one_class['times'] = 'TBA'
                one_class['duration'] = 'TBA'
                one_class['location'] = 'TBA'
        # Change dict keys to standardize and remove spaces for django to access
        keys_to_change = ('cr', 'wl avail', 'avail', 'wl cap', 'wl curr')
        keys_to_use = ('credits', 'wl_available', 'available', 'wl_cap',
                       'wl_curr')
        for key_orig, key_replace in zip(keys_to_change, keys_to_use):
            if key_orig in one_class:
                one_class[key_replace] = one_class[key_orig]
        one_class['department'] = dep
        one_class['number'] = num
        if description:
            one_class['description'] = description
        if title:
            one_class['title'] = title
        classes.append(one_class)
    return classes
import os
import glob
import codecs
import lxml.html
import re
import csv

writer = csv.writer(open("exams_raw.csv", "w"))
writer2 = csv.writer(open("exams.csv", "w"))
exams_set = list()
for i, infile in enumerate(glob.glob(os.path.join("data/", "*.html"))):
    print i
    with codecs.open(infile, "r", "utf-8") as f:
        source = f.read()
    html = lxml.html.fromstring(source)
    if not len(html.xpath("//table[@id='UCResultsTable_resultsTbl']")):
        continue
    exams = html.get_element_by_id("UCResultsTable_resultsTbl").xpath(
        "//tr[td/@class = 'results noprint']")
    for exam in exams:
        # details = ["<a onClick='event_add($(this))'>%s</a>" % re.sub("<br>|\n", " ", re.sub("</?td( class=\"b\")?>|</?(div|input).*>|\t", "", lxml.html.tostring(detail))) for detail in exam.xpath("td[not(input)]")]
        details = [re.sub("<br>|\n", " ",
                          re.sub("</?td( class=\"b\")?>|</?(div|input).*>|\t",
                                 "", lxml.html.tostring(detail)))
                   for detail in exam.xpath("td[not(input)]")]
        if details not in exams_set:
            writer.writerow(details)
            writer2.writerow(["<a onClick='event_add($(this))'>%s</a>" % detail
                              for detail in details])
            exams_set.append(details)
def get_movie_info_from_home(url):
    try:
        html = get_html(url)
        movie = None
        try:
            movie = html.get_element_by_id('movieEColl')
        except Exception as exception:
            # log_error('Exception:%s', exception)
            # log_error('SEARCH_MOVIE NOT MOVIEECOLL')
            pass
        if movie is None:
            return None
        title_tag = movie.get_element_by_id('movieTitle')
        a_tag = title_tag.find('a')
        href = a_tag.attrib['href']
        title = a_tag.find('b').text_content()
        # 2019-08-09
        tmp = title_tag.text_content()
        # tmp_year = movie_year
        tmp_year = ''
        match = re.compile(r'(?P<year>\d{4})\s%s' % u'제작').search(tmp)
        more = {}
        if match:
            tmp_year = match.group('year')
            more['eng_title'] = tmp.replace(title, '').replace(
                tmp_year, '').replace(u'제작', '').replace(u',', '').strip()
        country_tag = movie.xpath('//div[3]/div/div[1]/div[2]/dl[1]/dd[2]')
        country = ''
        if country_tag:
            country = country_tag[0].text_content().split('|')[0].strip()
            logger.debug(country)
        more['poster'] = movie.xpath(
            '//*[@id="nmovie_img_0"]/a/img')[0].attrib['src']
        more['title'] = movie.xpath(
            '//*[@id="movieTitle"]/span')[0].text_content()
        tmp = movie.xpath('//*[@id="movieEColl"]/div[3]/div/div[1]/div[2]/dl')
        more['info'] = []
        # for t in tmp:
        #     more['info'].append(t.text_content().strip())
        # more['info'].append(tmp[0].text_content().strip())
        more['info'].append(country_tag[0].text_content().strip())  # 2019-09-07
        logger.debug(more['info'][0])
        tmp = more['info'][0].split('|')
        if len(tmp) == 5:
            more['country'] = tmp[0].replace(u'외', '').strip()
            more['genre'] = tmp[1].replace(u'외', '').strip()
            more['date'] = tmp[2].replace(u'개봉', '').strip()
            more['rate'] = tmp[3].strip()
            more['during'] = tmp[4].strip()
        elif len(tmp) == 4:
            more['country'] = tmp[0].replace(u'외', '').strip()
            more['genre'] = tmp[1].replace(u'외', '').strip()
            more['date'] = ''
            more['rate'] = tmp[2].strip()
            more['during'] = tmp[3].strip()
        elif len(tmp) == 3:
            more['country'] = tmp[0].replace(u'외', '').strip()
            more['genre'] = tmp[1].replace(u'외', '').strip()
            more['date'] = ''
            more['rate'] = ''
            more['during'] = tmp[2].strip()
        daum_id = href.split('=')[1]
        return {
            'movie': movie,
            'title': title,
            'daum_id': daum_id,
            'year': tmp_year,
            'country': country,
            'more': more
        }
    except Exception as exception:
        log_error('Exception:%s', exception)
        log_error(traceback.format_exc())
def get_csrf_token(app, token_for):
    rv = app.get('/' + token_for)
    html = lxml.html.document_fromstring(rv.data)
    return html.get_element_by_id('csrf_token').value
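# Typical use in a test, assuming a WSGI/Flask test client named `app` and a
# '/login' page that renders the csrf_token field (both assumptions, not
# shown in the source):
token = get_csrf_token(app, 'login')
rv = app.post('/login', data={'csrf_token': token, 'username': 'alice'})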
def htmlChampionsGeaendert(html, champListeAngepasst, patchNummer):
    # Champion changes
    listeChampionsVeraendert = list()
    # Look for the patch-champions sections
    elementIds = ["patch-champions", "patch-minor-changes-and-bugfixes",
                  "patch-simple-champion-changes", "patch-fighters",
                  "patch-mages-and-assassins", "patch-marksmen",
                  "patch-supports", "patch-juggernauts",
                  "patch-marksman-updates", "patch-major-mage-updates",
                  "patch-minor-mage-updates", "patch-tank-updates",
                  "patch-reksai-kindred-malzahar",
                  "patch-mid-lane-worlds-balance", "patch-top-lane",
                  "patch-mid-lane", "patch-bot-lane", "patch-rammus",
                  "patch-duskblade-champions", "patch-simple-buffs",
                  "patch-simple-nerfs", "patch-tanks",
                  "patch-akali-and-pyke", "patch-kalista,-ornn-and-sejuani",
                  "patch-simple-changes", "patch-simple-buffs-and-nerfs"]
    for elementId in elementIds:
        try:
            if patchNummer == '6.18':
                htmlChampions = html.get_element_by_id(elementId)
            elif patchNummer == '8.9':
                htmlChampions = html.xpath(
                    "//h2[contains(@id, '" + elementId + "')]")[0].getparent()
            else:
                htmlChampions = html.get_element_by_id(elementId).getparent()
            while True:
                htmlChampions = htmlChampions.getnext()
                if htmlChampions.tag == 'p':
                    continue
                if htmlChampions.tag != 'div':
                    break
                for child in htmlChampions[0][0].getchildren():
                    if child.get("id") is not None:
                        listeChampionsVeraendert.append(child.get("id"))
        except:
            pass
    # Filter
    pdChampionsVeraendert = pd.DataFrame(
        {'Champions': [champ[6:].replace('wukong', 'monkeyking')
                       for champ in listeChampionsVeraendert]})
    pdChampionsVeraendert = pdChampionsVeraendert.loc[
        pdChampionsVeraendert['Champions'].isin(champListeAngepasst)]
    if patchNummer == '6.9':
        # Mage update
        mageZusatz = ['vladimir', 'malzahar', 'cassiopeia', 'zyra', 'brand',
                      'velkoz', 'anivia', 'annie', 'fiddlesticks', 'kennen',
                      'swain', 'syndra', 'veigar', 'viktor', 'xerath', 'ziggs']
        pdChampionsVeraendert = pdChampionsVeraendert.append(
            pd.DataFrame({'Champions': mageZusatz}))
    elif patchNummer == '6.21':
        # Assassin update
        assassinZusatz = ['talon', 'katarina', 'leblanc', 'rengar', 'fizz',
                          'khazix', 'akali', 'zed', 'ekko', 'shaco']
        pdChampionsVeraendert = pdChampionsVeraendert.append(
            pd.DataFrame({'Champions': assassinZusatz}))
    # Look for skins
    skinsHtml = html.xpath("//h4[contains(@class, 'skin-title')]")
    skinsListe = list()
    if len(skinsHtml):
        for skin in skinsHtml:
            # skin = skinsHtml[0]
            # skinString = tostring(skin).decode('utf8')
            # skinsListe.append(re.search("href=\".*\"", skinString).group(0)
            #                   .split("\"")[1].split('/')[-1].split('_')[0])
            try:
                skinString = tostring(skin[0]).decode('utf8')
                # print(skinString)
                skinAuswahl = [
                    champ for champ in champListe
                    if champ.lower().replace('dr. mundo', 'mundo')
                                    .replace('miss fortune', 'fortune')
                    in re.search('>.*<', skinString).group(0)
                         .lower().replace('’', '\'')]
                if len(skinAuswahl) == 1:
                    skinsListe.append(skinAuswahl[0])
                elif len(skinAuswahl) > 1:
                    skinsListe.append(skinAuswahl[np.argmax(
                        [len(skinSel) for skinSel in skinAuswahl])])
            except:
                try:
                    skinAuswahl = [
                        champ for champ in champListe
                        if champ.lower().replace('dr. mundo', 'mundo')
                                        .replace('miss fortune', 'fortune')
                        in re.search('>.*$', skinString).group(0)
                             .lower().replace('’', '\'')]
                    if len(skinAuswahl) == 1:
                        skinsListe.append(skinAuswahl[0])
                    elif len(skinAuswahl) > 1:
                        skinsListe.append(skinAuswahl[np.argmax(
                            [len(skinSel) for skinSel in skinAuswahl])])
                except:
                    pass
    # Add exceptions
    if patchNummer == '5.4':
        skinsListe.append('Sona')
    elif patchNummer == '8.12':
        skinsListe.append('Rammus')
    elif patchNummer == '8.14':
        skinsListe.append('Ezreal')
        skinsListe.append('Gnar')
        skinsListe.append('Rakan')
        skinsListe.append('Taliyah')
        skinsListe.append('Xayah')
    elif patchNummer == '8.23':
        skinsListe.append('Soraka')
        skinsListe.append('Miss Fortune')
        skinsListe.append('Ezreal')
        skinsListe.append('Lulu')
        skinsListe.append('Lux')
    elif patchNummer == '9.6':
        skinsListe.append('Yorick')
    elif patchNummer == '9.8':
        skinsListe.append('Camille')
        skinsListe.append('Fiora')
        skinsListe.append('Irelia')
        skinsListe.append('Kai\'sa')
        skinsListe.append('Rakan')
        skinsListe.append('LeBlanc')
    # elif patchNummer == '5.14':
    #     skinsListe.append('Miss Fortune')
    pdSkins = pd.DataFrame({'Skins': skinsListe}).drop_duplicates() \
        .reset_index(drop=True)
    if patchNummer == '6.15':
        pdSkins = pdSkins.drop(
            np.where(pdSkins['Skins'] == 'Lux')[0]).reset_index(drop=True)
        pdSkins = pdSkins.drop(
            np.where(pdSkins['Skins'] == 'LeBlanc')[0]).reset_index(drop=True)
    elif patchNummer == '6.19':
        pdSkins = pdSkins.drop(
            np.where(pdSkins['Skins'] == 'Fiora')[0]).reset_index(drop=True)
        pdSkins = pdSkins.drop(
            np.where(pdSkins['Skins'] == 'Teemo')[0]).reset_index(drop=True)
    elif patchNummer == '6.21':
        pdSkins = pdSkins.drop(
            np.where(pdSkins['Skins'] == 'Vladimir')[0]).reset_index(drop=True)
        pdSkins = pdSkins.drop(
            np.where(pdSkins['Skins'] == 'Sion')[0]).reset_index(drop=True)
    elif patchNummer == '6.22':
        pdSkins = pdSkins.drop(
            np.where(pdSkins['Skins'] == 'Twitch')[0]).reset_index(drop=True)
        pdSkins = pdSkins.drop(
            np.where(pdSkins['Skins'] == 'Maokai')[0]).reset_index(drop=True)
    elif patchNummer == '7.9':
        pdSkins = pdSkins.drop(
            np.where(pdSkins['Skins'] == 'Sejuani')[0]).reset_index(drop=True)
        pdSkins = pdSkins.drop(
            np.where(pdSkins['Skins'] == 'Maokai')[0]).reset_index(drop=True)
    elif patchNummer == '7.23':
        pdSkins = pdSkins.drop(
            np.where(pdSkins['Skins'] == 'Garen')[0]).reset_index(drop=True)
    elif patchNummer == '7.24':
        pdSkins = pdSkins.drop(
            np.where(pdSkins['Skins'] == 'Amumu')[0]).reset_index(drop=True)
    # Patch date
    htmlStringAlles = tostring(html).decode('utf8')
    if patchNummer == '5.16':
        datum = '20.08.2015'
    else:
        # Dates in %d.%m.%Y format; take the latest one on the page
        datum = max([
            datetime.strptime(
                datum.split('.')[0].lstrip("0") + '.' +
                datum.split('.')[1].lstrip("0") + '.' + datum.split('.')[2],
                "%d.%m.%Y")
            for datum in re.findall('\d{1,2}[.]\d{1,2}[.]\d{4}',
                                    htmlStringAlles)]).strftime("%d.%m.%Y")
    # datumHtml = html.xpath("//h3[contains(@class, change-title)]")
    # # datumHtml = html.xpath("//abbr[contains(@title, '')]")
    # datumAuswahl = list()
    # if len(datumHtml):
    #     for datumElement in datumHtml:
    #         datumAuswahl.append(re.search(
    #             'title=".*,', tostring(datumElement).decode('utf8'))
    #             .group(0).split('"')[1][:-1])
    #     # Take the most frequent element as the date
    #     datum = modus(datumAuswahl)
    # Dict with the results
    datenGesammelt = {'Champions': pdChampionsVeraendert, 'Skins': pdSkins,
                      'Datum': datum, 'Patchnummer': patchNummer}
    return datenGesammelt
def get_results(index):
    print "Getting results for " + runnames[index]
    req = urllib2.Request(parkrun_url % (runs[index]))
    req.add_header("User-Agent", "Mozilla")
    req.add_header("Host", "www.parkrun.org.uk")
    req.add_header("Accept", "*/*")
    r = urllib2.urlopen(req)
    code = r.getcode()
    # print code
    text = r.read()
    html = lxml.html.document_fromstring(text)
    # Hurts my eyes to do this, fragile as hell way
    # to get the race ID and date
    ab = 0
    try:
        d = html.get_element_by_id("dnn_ContentPane")
        ab = len(d.text_content())
    except:
        pass
    # print ab
    if ab == 0:
        # print "New Style"
        d = html.get_element_by_id("content")
    racename = d.cssselect("h2")[0].text_content()
    racerealname = racename.split('#')[0].strip()
    raceid = racename.split('\n')[1].strip().replace("-", "").strip()
    racedate = dateutil.parser.parse(racename.split('\n')[2].strip(),
                                     dayfirst=True)
    # assert(int(raceid) == race)
    # print racename
    results_rows = list(html.get_element_by_id("results"))[1]
    racers = []
    # print results_rows
    for row in results_rows:
        vals = [td.text_content() for td in row]
        racer = dict()
        # Some racers have no times, skip them
        if not vals[2]:
            continue
        racer['position'] = int(vals[0])
        racer['name'] = vals[1]
        # min, sec = [int(d) for d in vals[2].split(":")]
        # racer['time'] = min + ":" + sec
        racer['time'] = vals[2]
        racer['agegroup'] = vals[3]
        racer['agegrade'] = vals[4]
        racer['gender'] = vals[5]
        racer['genderpos'] = int(vals[6])
        racer['club'] = vals[7]
        racer['note'] = vals[8]
        racer['totalruns'] = vals[9]
        racer['racename'] = racename
        racer['racedate'] = racedate
        # print racer
        if "Beckenham RC" in racer['club']:
            # http://www.parkrun.org.uk/athleteresultshistory?athleteNumber=46999
            el = row.cssselect("td a")[0]
            # get href attribute and strip out the athlete number
            athnumber = el.get('href').split('=')[1].strip()
            racer['id'] = athnumber
            # print athnumber
            racers.append(racer)
    print "Finished getting results for " + racename
    # print racers
    # print racedate, len(racers)
    scraperwiki.sqlite.save(keys, racers, table_name="races")