def parse_html(html):
    """Parse a court-judgment ("wenshu") HTML page.

    Returns a 4-tuple (title, time, court, content), all UTF-8 byte strings:
    title from div#wsTitle, time as the last 10 chars of div#wsTime's span
    (presumably a YYYY-MM-DD date — TODO confirm), court as the first line of
    the extracted body text, and content as the full body text.

    NOTE(review): Python 2 str/unicode handling here is fragile — `content`
    is built from already-encoded chunks and then re-encoded; with non-ASCII
    text the trailing .encode('utf-8') calls on byte strings can raise
    UnicodeDecodeError. Confirm against real inputs before changing.
    """
    soup = soupparser.fromstring(html)
    body = soup.xpath("body")[0]
    div_wenshu = body.xpath("div[@id='wrap']/div[@id='wenshu']")[0]
    inner_table = div_wenshu.xpath("table[1]/tbody[1]/tr[1]/td[1]/div[@id='ws']/table")[0]
    try:
        # Layout variant 1: exactly four <tr> rows; body text lives in an
        # embedded document starting at "<HTML>".
        first_tr, second_tr, third_tr, fourth_tr = inner_table.xpath("tr")
        idx = html.find("<HTML>")
        html = html[idx:]
        soup = soupparser.fromstring(html)
        contents = soup.xpath("//text()")[:-50]  # drop trailing boilerplate nodes
        contents = [content.encode('utf-8') for content in contents]
        content = "\n".join(contents).strip().encode('utf-8')
    except:
        # Layout variant 2 (unpacking above failed — more than 4 rows):
        # body text is delimited around the "PrintArea" marker.
        # The magic offsets -74/+3 widen the slice to include the opening
        # tag and the closing "td>" — assumption from the markup, confirm.
        first_tr, second_tr, third_tr, fourth_tr = inner_table.xpath("tr")[:4]
        idx = html.find("PrintArea")
        last_idx = html.find("td", idx)
        html = html[idx-74:last_idx+3].encode("utf-8")
        soup = soupparser.fromstring(html)
        contents = soup.xpath("//text()")
        contents = [content.encode('utf-8').strip() for content in contents if len(content.strip()) != 0]
        content = "\n".join(contents).strip().encode('utf-8')
    title = first_tr.xpath("td/div[@id='wsTitle']/text()")[0].encode('utf-8')
    time = second_tr.xpath("td/div[@id='wsTime']/span/text()")[0].encode('utf-8')[-10:]
    court = content.split('\n')[0].encode('utf-8')
    return title, time, court, content
def getElementFromTemplateInHtml(html, knownHtml, knownTextDict):
    """Locate, for each key in knownTextDict, the element in `html` that
    corresponds (via the template `knownHtml`) to the known text value.

    :param html: the page to search, as an HTML string
    :param knownHtml: the reference/template page, as an HTML string
    :param knownTextDict: mapping of result-key -> known text to locate
    :return: dict with the same keys, values from
             getElementFromTemplateInHtmlElem
    """
    # Fix: the original re-parsed both documents on *every* dict key;
    # parse each once and reuse the trees.
    html_root = fromstring(html)
    known_root = fromstring(knownHtml)
    return dict(
        (key, getElementFromTemplateInHtmlElem(html_root, known_root, text))
        for key, text in knownTextDict.items()
    )
def get_deviations(args):
    """Walk the logged-in user's deviantART gallery-management pages and
    download every deviation via get_deviation().

    Relies on the module-level `opener` (already authenticated) and the
    sibling `get_deviation` helper. Paginates by following the "next"
    link until it has no href.
    """
    resp = opener.open('http://my.deviantart.com/gallery/?offset=0')
    print "Opened your dA gallery management page"
    got_all_devs=False
    dev_page = soupparser.fromstring(resp.read())
    print dev_page.xpath('//tbody')  # debug: show the table bodies found
    while not got_all_devs:
        # each gallery row holds one deviation; its first link is the target
        for dev in dev_page.xpath('//tbody/tr'):
            dev_link_cell=dev.xpath('./td/a')[0]
            try:
                get_deviation(dev_link_cell.attrib['href'],args)
            except IndexError:
                print ("Probably tried to get a deviation that"
                       " didn't have download of the original enabled")
        # the "next" button keeps its <a> but loses the href on the last page
        next_page=dev_page.cssselect('.pagination ul.pages li.next a')
        if not ('href' in next_page[0].attrib.keys()):
            got_all_devs=True
        else:
            print "proceeding to {}".format(next_page[0].attrib['href'])
            resp=opener.open(next_page[0].attrib['href'])
            dev_page = soupparser.fromstring(resp.read())
def test_delete_msgid(self):
    """Delete the '${cnt} cats' message via the management UI and verify
    it no longer appears in the message listing."""

    def locate_cats_message():
        # Load the German message listing and return the result link whose
        # text is '${cnt} cats', or None if it is absent.
        self.browser.go('http://localhost/portal/portal_i18n/manage_messages?lang=de')
        dom = fromstring(re.sub(r'\s+', ' ', self.browser.get_html()))
        for anchor in dom.xpath('//table[@id="message_results"]/tr/td/small/a'):
            if anchor.text == '${cnt} cats':
                return anchor
        return None

    target = locate_cats_message()
    self.browser.go(target.attrib['href'])
    form = self.browser.get_form('translate_message')
    self.browser.clicked(form, self.browser.get_form_field(form, 'translation:utf8:ustring'))
    self.browser.submit(fieldname='manage_delMessage:method')
    self.assertTrue(locate_cats_message() is None)
def test_all_sorts_result_in_one_arrow_present(self):
    """Assert only one sort class is present in the decision list view"""
    # Assumes CSS will be correctly displaying the sort status
    decision_list = DecisionList()
    sort_options = deepcopy(decision_list.sort_table_headers)
    for header_list in sort_options.values():
        for header in decision_list.unsortable_fields:
            header_list.remove(header)
    self.create_decisions_with_different_statuses()

    def check_single_arrow(page, sort_query, query_prefix, css_class, direction):
        # Fetch the list view with the given sort and assert exactly one
        # arrow element carrying css_class is rendered.
        response = self.client.get(
            reverse('publicweb_item_list', args=[self.bettysorg.slug, page]),
            {'sort': query_prefix + sort_query})
        arrows = CSSSelector('table.summary-list .' + css_class)(fromstring(response.content))
        self.assertEquals(
            len(arrows), 1,
            'Number of ' + direction + ' sort arrows should be 1. But is '
            + str(len(arrows)) + ' for page=' + page + ' sort_query=' + sort_query)

    # Test Ascending Sort
    for page, sort_queries in sort_options.iteritems():
        for sort_query in sort_queries:
            check_single_arrow(page, sort_query, '', 'sort-asc', 'ascending')
    # Test Descending Sort
    for page, sort_queries in sort_options.iteritems():
        for sort_query in sort_queries:
            check_single_arrow(page, sort_query, '-', 'sort-desc', 'descending')
def getFormatHtml(htmlContent):
    """Parse an HTML string into an lxml tree, cleaning it first if the
    initial parse fails.

    :param htmlContent: raw HTML string
    :return: parsed document root

    Fixes: the original bound the parse result to two different unused
    locals (`dom` in the try, `doc` in the handler) and always returned
    None — the parsed tree is now returned on both paths.
    """
    try:
        return soupparser.fromstring(htmlContent)
    except Exception:
        # Parsing failed on the raw markup: scrub it and retry.
        cleaner = Cleaner()
        htmlContent = cleaner.clean_html(htmlContent)
        return soupparser.fromstring(htmlContent)
def __get_paper_from_acm (self, entry_url):
    """Scrape an ACM DL citation page into a Paper object.

    :param entry_url: URL of the ACM citation page
    :return: Paper with title, abstract and authors (name + affiliation)
    """
    resp_body = self.op.open (entry_url).read ()
    root = sp.fromstring (resp_body)
    divmain = root.xpath ("//div[@id='divmain']")[0]
    title = divmain.xpath ("div/h1/strong")[0].text
    # use regex to extract abstract link
    abst_url = re.compile (r"tab.abstract.cfm[^']*").search (resp_body).group (0)
    abst_url = 'http://dl.acm.org/' + abst_url
    abst_body = self.op.open (abst_url).read ()
    # extract all text nodes from the abstract dom tree
    abst = ''.join (sp.fromstring (abst_body).xpath ('//div/p/div/p/descendant-or-self::*/text()'))
    # instantiate a Paper class
    paper = Paper (title, abst)
    # locate the author table block (second nested table under divmain)
    author_table = divmain.xpath ("table/tr/td/table")[1]
    # add each author; one row per author, link text = name,
    # <small> under the link = affiliation
    for author_row in author_table.xpath ('tr'):
        name = author_row.xpath ('td/a/text()')[0]
        affn = author_row.xpath ('td/a/small/text()')[0]
        paper.add_author (Author (name, affn))
    return paper
def GetUserInfo(self,html):
    """Parse a Weibo user-search result page into self.result['users'].

    Each user dict carries: Imgurl, nickname, uid, sex, addr, num, intro.
    Returns True on success, False when the search is empty, the account
    state is abnormal, or parsing fails entirely.

    :param html: raw search-result page HTML
    """
    try:
        # "search result is empty" marker — user does not exist
        if '搜索结果为空' in html:
            print (u'weibo用户不存在!')
            return False
        # "account state abnormal" marker
        if '您当前访问的用户状态异常' in html:
            return False
        # extract the embedded pl_user_feedList panel markup
        html = self.GetHtmlInfo(html, '{\"pid\":\"pl_user_feedList\"')
        root = fromstring(html)
        usersDivise = root.xpath("//div[@class='list_person clearfix']")
        if len(usersDivise) > 0:
            users = []
            for div in usersDivise:
                # initialise an empty dict for this user
                user={}
                # re-parse the single card so the absolute //-xpaths below
                # only match within this card
                div = tostring(div , encoding='utf-8')
                div = fromstring(div)
                try:
                    iu_node = div.xpath("//div[@class='person_pic']/a/img")[0]
                    user['Imgurl'] = iu_node.get("src")
                    user['nickname'] = div.xpath("//div[@class='person_detail']/p[@class='person_name']")[0].text_content()
                    user['uid'] = iu_node.get("uid")
                    sex_node = div.xpath("//div[@class='person_detail']/p[@class='person_addr']/span[@class='male m_icon']")
                    sex = ''
                    if sex_node:
                        sex = sex_node[0].get('title')
                    user['sex'] = sex
                    addr_node = div.xpath("//div[@class='person_detail']/p[@class='person_addr']")
                    addr = ''
                    if addr_node:
                        addr = addr_node[0].text_content()
                    user['addr'] = addr
                    num_node = div.xpath("//div[@class='person_detail']/p[@class='person_num']")
                    num = ''
                    if num_node:
                        num = num_node[0].text_content()
                    user['num'] = num
                    intro_node = div.xpath("//div[@class='person_detail']/div[@class='person_info']")
                    intro = ''
                    if intro_node:
                        intro = intro_node[0].text_content()
                    user['intro'] = intro
                    users.append(user)
                except:
                    # skip malformed cards, keep the rest
                    pass
            self.result['users'] = users
        else:
            return False
    except Exception:
        s=sys.exc_info()
        msg = (u"GetUserInfo Error %s happened on line %d" % (s[1],s[2].tb_lineno))
        loggerSearch.error(msg)
        return False
    return True
def getUserInfo(self, html):
    """Parse a Weibo user-search result page into self.result['users'].

    Near-duplicate of GetUserInfo with abbreviated keys:
    iu=image url, sn=screen name, uid, sx=sex, ad=address, num, de=description.
    Returns True on success, False on empty/abnormal result or total failure.

    :param html: raw search-result page HTML
    """
    try:
        # "search result is empty" marker — user does not exist
        if '搜索结果为空' in html:
            return False
        # "account state abnormal" marker
        if '您当前访问的用户状态异常' in html:
            return False
        # extract the embedded pl_user_feedList panel markup
        html = self.getPanelInfo(html, '{\"pid\":\"pl_user_feedList\"')
        root = fromstring(html)
        user_divs = root.xpath("//div[@class='list_person clearfix']")
        if len(user_divs) > 0:
            users = []
            for div in user_divs:
                user = {}
                # re-parse the single card so the absolute //-xpaths below
                # only match within this card
                div = tostring(div , encoding='utf-8')
                div = fromstring(div)
                try:
                    iu_node = div.xpath("//div[@class='person_pic']/a/img")[0]
                    user['iu'] = iu_node.get("src")
                    user['sn'] = div.xpath("//div[@class='person_detail']/p[@class='person_name']")[0].text_content()
                    user['uid'] = iu_node.get("uid")
                    sx_node = div.xpath("//div[@class='person_detail']/p[@class='person_addr']/span[@class='male m_icon']")
                    sx = ''
                    if sx_node:
                        sx = sx_node[0].get('title')
                    user['sx'] = sx
                    ad_node = div.xpath("//div[@class='person_detail']/p[@class='person_addr']")
                    ad = ''
                    if ad_node:
                        ad = ad_node[0].text_content()
                    user['ad'] = ad
                    num_node = div.xpath("//div[@class='person_detail']/p[@class='person_num']")
                    num = ''
                    if num_node:
                        num = num_node[0].text_content()
                    user['num'] = num
                    de_node = div.xpath("//div[@class='person_detail']/div[@class='person_info']")
                    de = ''
                    if de_node:
                        de = de_node[0].text_content()
                    user['de'] = de
                    users.append(user)
                except:
                    # skip malformed cards, keep the rest
                    pass
            self.result['users'] = users
        else:
            return False
    except Exception:
        s=sys.exc_info()
        msg = (u"getUserMsgInfo Error %s happened on line %d" % (s[1],s[2].tb_lineno))
        logger.error(msg)
        return False
    return True
def main(argv=None): if argv is None: argv = sys.argv # print argv if len(argv) < 2: pass f1 = open("50dbb0570e45771ca4d7c2204cd2649f") f2 = open("38d54af2d8f7d8628acfae40933675a1") d1 = f1.read() d2 = f2.read() print getElementFromTemplateInHtmlElem( fromstring(d2), fromstring(d1), "Alarms are wrongly mapped or reported in BTS when using CTU2D" ) print getElementFromTemplateInHtml(d2, d1, {"res": "Alarms are wrongly mapped or reported in BTS when using CTU2D"})
def scrape(resource, **args):
    """Scrape eRepublik pages for the requested resource.

    :param resource: 'uid' (expects args['query'], returns a citizen id
        string) or 'citizen.profile' (expects args['citizenId'], returns a
        nested dict of profile data scraped from the profile page).
    Falls through (returns None) for any other resource.
    """
    session = login()
    if resource == 'uid':
        # we effectively only need the first user, so don't scrape all pages
        search = session.get(
            'http://www.erepublik.com/en/main/search/%s/' % args['query'].replace(' ', '_')
        )
        doc = fromstring(search.text)
        # the first result's profile href ends with the numeric uid
        uid = doc.xpath('//div[@class="nameholder"]/a/@href')[0].split('/')[-1].strip()
        return uid
    elif resource == 'citizen.profile':
        profile = session.get(
            'http://www.erepublik.com/en/citizen/profile/%s' % args['citizenId']
        )
        doc = fromstring(profile.text)
        # a "dead_citizen" state icon marks deceased citizens
        citizen_state = doc.xpath('//div[@class="citizen_state"]/div[@class="is"]/span/img/@src')
        is_dead = citizen_state and 'dead_citizen' in citizen_state[0]
        profile = {
            'general': {
                # avatar URL is embedded in an inline style: url(<here>)
                'avatar': doc.xpath('//img[@class="citizen_avatar"]/@style')[0].split('(')[1].split(')')[0],
                'level': doc.xpath('//*[@class="citizen_level"]')[0].text,
                # "current / next" — keep the current value, strip thousands commas
                'experience_points': doc.xpath('//*[@class="citizen_experience"]/div/p')[0].text.split(' / ')[0].replace(',', ''),
                'name': doc.xpath('//*[@class="citizen_profile_header auth"]/h2')[0].text_content().strip(),
                'is_alive': str(int(not is_dead)),
                'birthDay': doc.xpath('//div[@class="citizen_second"]/p')[1].text.strip(),
                'nationalRank': doc.xpath('//div[@class="citizen_second"]/small/strong')[0].text,
            },
            'location': {
                'citizenship_country_initials': doc.xpath('//div[contains(@class, "citizen_info")]/a/@href')[2].split('/')[-1],
                'residence_country_name': doc.xpath('//div[contains(@class, "citizen_info")]/a/@title')[0],
                'residence_region_name': doc.xpath('//div[contains(@class, "citizen_info")]/a/@title')[1],
            },
            'party': {
                'name': doc.xpath('//div[@class="citizen_activity"]/div/div/span/a')[0].text.strip(),
            },
            'militaryUnit': {
                'id': doc.xpath('//div[@class="citizen_activity"]/div/div/a/@href')[0].split('/')[-1],
                'name': doc.xpath('//div[@class="citizen_activity"]/div/div/a/span')[0].text.strip(),
            },
            'militaryAttributes': {
                'strength': doc.xpath('//div[@class="citizen_military"]/h4')[0].text.replace(',', '').strip(),
                'rank_points': doc.xpath('//div[@class="stat"]/small/strong')[1].text.split(' / ')[0].replace(',', ''),
            },
        }
        return profile
def test_lang_in_path(self):
    """An unknown language path (/de) renders the error page; a configured
    language path (/es) renders a document whose root carries lang='es'.

    Fix: replaced the long-deprecated `dict.has_key` with the `in`
    operator (same behavior, removed in Python 3).
    """
    self.browser_do_login('admin', '')
    self.portal.gl_add_site_language('es', 'Spanish')
    import transaction
    transaction.commit()
    self.browser.go('http://localhost/portal/de')
    doc = fromstring(re.sub(r'\s+', ' ', self.browser.get_html()))
    self.assertEqual(doc.xpath('//div[@id="middle_port"]/h1')[0].text, 'Error page')
    self.browser.go('http://localhost/portal/es')
    doc = fromstring(re.sub(r'\s+', ' ', self.browser.get_html()))
    self.assertTrue('lang' in doc.attrib)
    self.assertEqual(doc.attrib['lang'], 'es')
def scrape(self):
    """Scrape the site for book prices.

    Loads the initial page, enlarges any 'per page' form inputs so a single
    page holds all available books, re-submits if needed, then runs every
    configured parser class over the final page.

    :return: list of tagged book objects (self.tag applied to each parse result)
    :raises AssertionError: when the total-books figure cannot be parsed
    """
    self.log('starting %s', self.__class__.__name__)
    self.open()
    root = soupparser.fromstring(self.body())
    # get total available books
    totals = TotalAvailableBooksParser()
    total_books = totals.get_value(root)
    total_books_str = str(total_books)
    if not total_books:
        raise AssertionError('Could not parse for total books in %s' % self._url)
    self.log('total books = %d', total_books)
    perpage_inputs = root.cssselect('input[name^="%s"]' % self.perpage_input_name)
    if len(perpage_inputs):
        form = {}
        for ppi in perpage_inputs:
            name = ppi.attrib['name']
            try:
                value = int(ppi.attrib['value'])
            except (TypeError, ValueError):
                continue  # non-numeric input value: leave it untouched
            if value < total_books:
                form[name] = total_books_str
        # if a hidden 'per page' input is changed this means there are
        # more than 1 page of results, otherwise all available books are
        # already in this 1 initial page
        if form:
            # load all books and reparse response
            self.submit(form)
            root = soupparser.fromstring(self.body())
    self.log('scraping for book prices')
    books = []
    for cls in self.parser_classes:
        self.log('... using %s', cls.__name__)
        parser = cls()
        pbooks = [self.tag(b) for b in parser.parse(root)]
        books.extend(pbooks)
        self.log('... found %d books (total = %d)', len(pbooks), len(books))
    return books
def downloadlz(self, type='txt', start=1, end=5):
    """Download the thread starter's posts as text or images.

    Downloads pages `start`..`end` (first 5 pages by default), capped at
    the thread's actual page count. Exits the process on invalid ranges.

    :param type: 'txt' to save post text, 'photo' to save images
    :param start: first page (1-based)
    :param end: last page (inclusive)
    """
    self.__mkdirfile(self.tienum)
    num = int(self.getnum())
    print u'本帖子楼主共有%d页发表!' % num
    if start > end:
        print u"结束页超过起始页,请检查参数!\n"
        sys.exit()
    elif start > num:
        print u"起始页面超过上限!本帖子一共有 %d 页\n" % num
        sys.exit()
    # clamp the requested end page to the thread's real page count
    num = num if num < end else end
    for i in xrange(start - 1, num):
        soup = soupparser.fromstring(requests.get(self.url + str(i + 1), verify=False).content)
        if type == "txt":
            self.__get_lz_txt(i + 1, soup)
        elif type == "photo":
            self.__get_lz_jpg(i + 1, soup)
        else:
            print u"输入的参数有误,只能输入'txt'或者'photo'"
def parseRefeed(self, node): node = fromstring(tostring(node)) #ui userNode = node.xpath(self.config.get("RT_USER_XPATH")) if userNode: userNode = userNode[0] ui = userNode.get("usercard", "").replace("id=", "") sn = userNode.get("nick-name", " ") un = userNode.get("href", "").replace("/", "") else: return {} rtmsg = self.parseCommon(node, "rtmsg") if rtmsg: rtmsg[COLUMN_USERID] = ui rtmsg[COLUMN_USERNAME] = un rtmsg[COLUMN_SCRENNAME] = sn #转发消息URL muNode = node.xpath("//div/div/div/div[@class='WB_from']/a[@title]") mu = "" mid = "" if muNode: mu = muNode[0].get("href", "").split("/")[-1] mid = sinaWburl2ID(mu) rtmsg[COLUMN_ID] = mid rtmsg[COLUMN_MSGURL] = mu return rtmsg
def test_dork_links(self):
    """Objective: Test if a random link from the dork page exists in the database.
    Input: A random link from a created dork page.
    Expected Results: The path of the link should be at least once in the db.
    Notes: Links have the parameters truncated, so multiple entries are likely."""
    pages_dir = tempfile.mkdtemp()
    try:
        (db, engine, dork_generator) = self.dork_generator_chain('sql', pages_dir)
        dork_generator.regular_generate_dork(0)
        sample_file = choice(dork_generator.get_current_pages())
        print "Randomly selected dork page:", sample_file.rsplit('/', 1)[1]
        with open(sample_file, 'r') as sample_data:
            # NOTE(review): fromstring is given the open file object, not
            # its contents — presumably the soupparser/BeautifulSoup backend
            # accepts file-likes; confirm which fromstring is imported here.
            data = fromstring(sample_data)
            links = data.cssselect('a')
            test_link_path = choice(links).get('href')
            print "Randomly selected path:", test_link_path
            from_livedb = db.select_entry(test_link_path)
            # the test database has below 100 entries, so it will seeded from the dorkdb
            from_dorkdb = db.get_dork_list('inurl', starts_with=test_link_path)
            result_count = len(from_livedb) + len(from_dorkdb)
            print "Done searching for the entry."
            self.assertTrue(result_count > 0)
            print "The dork db returned:",
            print "{0} entries,".format(result_count),
            print "which equates our expectation."
    finally:
        if os.path.isdir(pages_dir):
            shutil.rmtree(pages_dir)
def assertTitle(self, url_lang_sufix, value):
    """Assert that the page at http://localhost/portal/<url_lang_sufix>
    has a span.page_title whose text equals `value`."""
    self.browser.go('http://localhost/portal/' + url_lang_sufix)
    # collapse all whitespace runs so text comparison is layout-independent
    flattened = re.sub(r'\s+', ' ', self.browser.get_html())
    doc = fromstring(flattened)
    title_text = CSSSelector("span.page_title")(doc)[0].text_content()
    self.assertEqual(title_text, value)
def test_dork_page_content(self):
    """Objective: Testing the attack surfaces content.
    Input: An attack surface sample. The structure is defined in a template.
    Expected Results: The attack surface should be a HTML page containing text and links.
    Notes: We extract and count the elements in the HTML document."""
    pages_dir = tempfile.mkdtemp()
    try:
        dork_generator = self.dork_generator_chain('sql', pages_dir)[2]
        dork_generator.regular_generate_dork(0)
        sample_file = choice(dork_generator.get_current_pages())
        with open(sample_file, 'r') as sample_data:
            # NOTE(review): fromstring is given the open file object —
            # presumably the soupparser backend accepts file-likes; confirm.
            data = fromstring(sample_data)
            self.assertTrue(len(data.cssselect('a')) > 0)
            self.assertTrue(len(data.cssselect('title')) > 0)
            self.assertTrue(len(data.cssselect('form')) > 0)
            print "The content analysis of a random HTML page returned:"
            print len(data.cssselect('a')), 'links (<a href=""></a>)',
            print len(data.cssselect('title')), 'page title (<title />)',
            print len(data.cssselect('form')), 'form field (<form />)'
            print "which equates our expectation."
    finally:
        if os.path.isdir(pages_dir):
            shutil.rmtree(pages_dir)
def import_day(self, day, month, year):
    """Import all events listed for one calendar day.

    Scrapes the day's listing page, builds an Event per title node with
    start time, venue (geocoded via GooglePlacesLookup) and Location, and
    saves it unless it is a duplicate.
    """
    tree = fromstring(self._get_html(self._url(day, month, year)))
    # event titles are bold pink font cells in the listing table
    titlenodes = tree.xpath("//td[@class='size2']/font[@color='#CD076A']/b")
    for titlenode in titlenodes:
        event = Event()
        event.name = titlenode.text_content()
        # time text looks like "20:00 Uhr"; the page carries no date, so
        # the date is patched in from the arguments afterwards
        time = titlenode.xpath("./ancestor::table/parent::td/table[2]/tr/td[2]/font[1]/text()")[0]
        event.date_start = datetime.strptime(time, "%H:%M Uhr")
        event.date_start = event.date_start.replace(year=year, day=day, month=month)
        venue = titlenode.xpath("./ancestor::table/parent::td/table[2]/tr[2]/td[2]/font/descendant::text()")[0]
        address = titlenode.xpath("./ancestor::table/parent::td/table[2]/tr[2]/td[2]/text()[preceding-sibling::br]")[0].strip()
        # city = whatever follows the 5-digit postal code
        p = re.search(".*[0-9]{5} (.*)$", address)
        city = p.group(1)
        # normalise venue name/coordinates through the places lookup
        geodata = GooglePlacesLookup.find_geo_data_for_venue(venue, city)
        venue = geodata['name']
        lat = geodata['lat']
        lon = geodata['lon']
        location, created = Location.objects.get_or_create(name=venue, city=city, latitude=lat, longitude=lon)
        event.location = location
        if not self.is_duplicate_event(event):
            event.save()
def ElementFromFile(path, use_html_parser=False):
    """
    Creates a new Element object from an XML file at the given path.

    @param path: The path to the XML file
    @type path: string
    @param use_html_parser: when True, try the lenient HTML parser first
        and fall back to the soup parser if parsing/serialization fails
    @return: Element (or None when the file reads as None)

    Fixes: file handle managed by a `with` block; bare `except` narrowed
    to `except Exception` so KeyboardInterrupt/SystemExit propagate.
    """
    with open(path) as tfile:
        text = tfile.read()
    if text is None:
        return None
    if use_html_parser:
        try:
            root = html.fromstring(text)
            # round-trip through the serializer: if the parsed tree cannot
            # be serialized, treat the parse as failed and fall back
            html.tostring(root, encoding=unicode)
            return root
        except Exception:
            return fromstring(text)
    else:
        return etree.fromstring(text)
def __get_paperentry_from_acm (self, title, authors):
    """Search the ACM DL advanced-search form for a paper.

    :param title: all-of-these-words title query
    :param authors: author names for the people field
    :return: absolute URL of the first matching citation page, or "" when
        the search reports that nothing was found
    """
    QUERY_URL = 'http://dl.acm.org/advsearch.cfm'
    self.br.open (self.__wrapper (QUERY_URL))
    # this query form does not have id attribute, nr=0 means the first form
    self.br.select_form (nr=0)
    # termzone is a multi choice dropdown menu, the value should be a list
    self.br.form['termzone'] = ['Title']
    self.br.form['allofem'] = title
    self.br.form['peoplezone'] = ['Author']
    self.br.form['people'] = authors
    self.br.form['peoplehow'] = ['and']
    resp_body = self.__deljs_html (self.br.submit ().read ())
    # check if the search result is not empty
    if resp_body.find ('was not found') == -1:
        root = sp.fromstring (resp_body)
        # select the first entry in search result
        entry_url = root.xpath ("//a[@class='medium-text' and @target='_self' and \
            starts-with (@href, 'citation.cfm')]/@href")[0]
        return 'http://dl.acm.org/' + entry_url
    else:
        return ""
def initializeUi(self):
    """Populate the games list view from GameFAQs search-result HTML.

    Parses self.html for the results table, collects (system, name, url)
    triples, and fills self.ui.listViewGames with checkable items — or a
    single "No game was found" placeholder. Also resets self.urls,
    self.checked and disables the OK button.
    """
    # search results
    html = str(self.html)
    systems = []
    names = []
    urls = []
    doc = fromstring(html)
    el = doc.xpath("//table[@class='results']")
    for table in el:
        rows = table.getchildren()[2:]  # skip the two header rows
        for row in rows:
            system = row.getchildren()[0].text.strip()
            # an empty system cell means "same system as the previous row"
            if system == "":
                system = systems[-1]
            systems.append(system)
            names.append(row.getchildren()[1].findtext("a"))
            urls.append(GAMEFAQS_URL + row.getchildren()[1].getchildren()[0].attrib["href"])
    # Displaying search results
    model = QStandardItemModel()
    if len(systems) > 0:
        for i in range(0, len(systems)):
            item = QStandardItem("(" + systems[i] + ") " + names[i])
            item.setFlags(Qt.ItemIsUserCheckable | Qt.ItemIsEnabled)
            item.setData(Qt.Unchecked, Qt.CheckStateRole)
            model.appendRow(item)
        model.itemChanged.connect(self.on_item_changed)
    else:
        item = QStandardItem("No game was found")
        model.appendRow(item)
    self.ui.listViewGames.setModel(model)
    self.urls = urls
    self.checked = 0
    self.ui.pushButtonOk.setEnabled(False)
def cleaner(infile='', outfile='', soup=True):
    '''Clean the HTML in `infile` and write the result to `outfile`.

    Strips span/div/font/u wrappers, scripts, embedded content, meta and
    style information, keeping only a whitelist of attributes, then
    squashes paragraph attributes.

    :param infile: path of the HTML file to read
    :param outfile: path to write the cleaned HTML (binary mode)
    :param soup: parse with the soup parser (True) or lxml.html (False)
    '''
    fl = open(infile)
    doc = fl.read()
    fl.close()
    # add a space after each closing tag so removing the tags below does
    # not glue adjacent words together
    for element in ['span', 'div', 'font', 'u']:
        doc = doc.replace("%s>" % element, "%s> " % element)
    if soup:
        doc = soupparser.fromstring(doc)
    else:
        # fallback parser
        doc = html.fromstring(doc)
    # NOTE(review): module-level clean.defs.safe_attrs is mutated here and
    # never restored (the old value is saved but unused) — side effect on
    # other Cleaner users; confirm intent.
    safe_attrs = clean.defs.safe_attrs
    clean.defs.safe_attrs = frozenset(['href', 'alt', 'id', 'src', 'width', 'height'])
    c = clean.Cleaner(scripts=True, embedded=True, meta=True, style=True, remove_tags = ['span', 'div', 'font', 'u'], safe_attrs_only=True)
    c.safe_attrs=frozenset(['href', 'alt', 'id', 'src', 'width', 'height'])
    d2 = c.clean_html(doc)
    d2 = squash_paragraph_attributes(d2)
    flout = open(outfile, 'wb')
    flout.write(etree.tostring(d2, method="html", encoding='utf-8'))
    flout.close()
def __get_author_from_ms (self, entry_url): author_id = entry_url.split ('/')[-2] entry_url = "http://academic.research.microsoft.com/io.ashx?authorID=%s" % author_id resp_body = self.op.open (entry_url).read () json_obj = json.loads (resp_body) name = json_obj['DisplayName'] if json_obj['Affiliation']: affn = json_obj['Affiliation']['FullName'] else: affn = '' print "Finished author" return Author (name, affn) # OLD ONE: FETCH OFFICIAL AUTHOR PAGE root = sp.fromstring (resp_body) name = root.xpath ("//span[@id='ctl00_MainContent_AuthorItem_authorName']")[0].text aff_nodes = root.xpath ("//a[@id='ctl00_MainContent_AuthorItem_affiliation']") # make sure the author page has affiliation if len (aff_nodes) > 0: affn = aff_nodes[0].text else: affn = "" return Author (name, affn)
def extend_crumbs(crumbs_html, editor_url, extra_crumbs):
    """Append extra entries to a rendered breadcrumb trail.

    The current "last" crumb is demoted to a plain crumb linking to
    editor_url, each (title, href) in extra_crumbs is appended as a plain
    crumb, and the final crumb is promoted to the text-only "last" style.
    Returns the serialized trail.
    """
    from lxml.html.soupparser import fromstring
    from lxml.html import tostring
    from lxml.builder import E

    trail = fromstring(crumbs_html).find('div[@class="breadcrumbtrail"]')

    # demote the current last crumb: plain style, text wrapped in a link
    current_last = trail.find('div[@class="breadcrumbitemlast"]')
    current_last.attrib["class"] = "breadcrumbitem"
    link = E.a(current_last.text, href=editor_url)
    current_last.text = ""
    current_last.append(link)

    # append the extra crumbs as plain linked items
    for title, href in extra_crumbs:
        trail.append(E.div(E.a(title, {"href": href}), {"class": "breadcrumbitem"}))

    # promote the now-final crumb: strip the link, keep only its text
    final = trail.xpath('div[@class="breadcrumbitem"]')[-1]
    final_text = final.find("a").text
    final.clear()
    final.attrib["class"] = "breadcrumbitemlast"
    final.text = final_text
    return tostring(trail)
def getRandoAtUrl(url):
    """Scrape one hike ("rando") page into a flat dict.

    Extracts name, start point, description, itinerary, and the
    characteristics table (place, altitude, rise/descent, duration,
    difficulty, best period) plus road-access directions. French label
    prefixes are stripped from the raw cell text.
    """
    print "syncing " + url
    response = urllib2.urlopen(url)
    html = response.read()
    root = fromstring(html)
    content = root.xpath("//td[@class='col_content']")[0]
    name = content.xpath("string(span[@class='rando_titre']/text())")
    # strip the "Au départ de" prefix
    start = content.xpath("string(span[@class='rando_depart']/text())").replace(u"Au d\u00E9part de","")
    description = content.xpath("string(p[@class='rando_description']/text())")
    # some pages have no description paragraph; fall back to the itinerary
    if description=="":
        description = content.xpath("string(span[@class='rando_itineraire']/text())")
    itinerary = content.xpath("string(span[@class='rando_itineraire']/text())")
    # the "Caractéristiques" header's sibling row holds the properties table
    propertiesTable = root.xpath(u"//th[starts-with(.,'Caract\u00E9ristiques')]/../following-sibling::tr/td/table")[0]
    props = propertiesTable.xpath(".//tr/td[2]")
    # fixed row order assumed: place, place info, start altitude, rise,
    # descent, duration, difficulty, best period — confirm against site
    place = props[0].xpath("string(.)")
    placeInfo = props[1].xpath("string(.)")
    startAltitude = props[2].xpath("string(.)").replace(u"Alt. au d\u00E9p.","")
    rise = props[3].xpath("string(.)").replace(u"Mont\u00E9e","")
    descent = props[4].xpath("string(.)").replace("Descente","")
    duration = props[5].xpath("string(.)").replace(u"Dur\u00E9e","")
    difficulty = props[6].xpath("string(.)").replace(u"Difficult\u00E9e","")
    bestPeriod = props[7].xpath("string(.)").replace(u"P\u00E9riode conseill\u00E9e","")
    howToGetThere = root.xpath(u"string(//th[starts-with(.,'Acc\u00E8s Routier')]/../following-sibling::tr/td[@class='module_texte']/text())")
    rando = {"url":url, "name":name, "start":start, "description":description,
             "itinerary":itinerary, "place":place, "placeinfo":placeInfo,
             "startaltitude":startAltitude, "rise":rise, "descent":descent,
             "duration":duration, "difficulty":difficulty,
             "bestperiod":bestPeriod, "howtogetthere":howToGetThere}
    return rando
def to_doc(text, parser=scraper.LXML_HTML, whole_doc=True):
    """Parse an HTML text. Return value: lxml.html.HtmlElement document.

    parser: which parser to use (scraper.LXML_HTML, scraper.HTML5PARSER,
        or scraper.BEAUTIFULSOUP).
    whole_doc: parse to complete HTML document (with <html> around), or
        parse just a fragment of HTML. Only honoured by LXML_HTML; the
        html5lib and soup branches always build a full document.
    """
    doc = None
    if parser == scraper.LXML_HTML:
        if whole_doc:
            doc = html.document_fromstring(text)
        else:
            doc = html.fromstring(text)
    elif parser == scraper.HTML5PARSER:
        # html5parser was broken for me, bug report is here: https://bugs.launchpad.net/lxml/+bug/780642
        #if whole_doc:
        #    doc = html5parser.document_fromstring(text)
        #else:
        #    doc = html5parser.fromstring(text)
        # Here is my workaround:
        # (note: this rebinds the `parser` argument to the html5lib parser)
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
        etree_doc = parser.parse(text)  # returns an ElementTree
        doc = html.document_fromstring(elementtree_to_string(etree_doc))
        # ^ this double conversion makes it slow ^
    elif parser == scraper.BEAUTIFULSOUP:
        # soupparser has no document_fromstring method
        doc = soupparser.fromstring(text)
    else:
        print >>sys.stderr, "Warning: you want to use an unknown parser in lx.py."
        # doc is None
    return doc  # lxml.html.HtmlElement
def load_from_sitemap(self, url):
    """Fetch a sitemap and append every <loc> URL it lists to self.urls.

    :param url: sitemap URL; an invalid URL is logged and skipped

    Fix: the original logged the warning but then fell through and used
    the unbound `res`, raising NameError — now returns early instead.
    """
    try:
        res = urlopen(url)
    except ValueError:
        logging.warning('Sitemap URL is not valid')
        return
    content = fromstring(res.read())
    self.urls.extend(content.xpath('//loc/text()'))
def prev_and_next_ep(url):
    """Next episode, followed by previous episode.

    Scrapes the "Episode Info" box on the page at `url` and returns a
    (next, prev) tuple of concatenated text fragments (each <h2>'s text
    nodes minus the first one).
    """
    # NOTE(review): `parser` is created but never used — html5lib parsing
    # presumably happened in an earlier revision; fromstring is used instead.
    parser = html5lib.HTMLParser()
    tag_soup = urllib2.urlopen(url).read()
    root = fromstring(tag_soup)
    # round-trip through lxml to normalise the markup before BeautifulSoup
    string = tostring(root, pretty_print=True)
    soup = BeautifulSoup(string)
    div = soup.findAll("div", "grid_7_5 box margin_top_bottom")
    lst = []
    prev_and_next = []
    next = ""
    prev = ""
    # keep only the box titled "Episode Info"
    for item in div:
        x = item.find("span", "content_title")
        if x:
            if x.find(text=True) == "Episode Info":
                lst.append(item)
    # the box's <h2> elements hold next/prev episode text fragments
    y = lst[0].findAll("h2")
    for item in y:
        prev_and_next.append((item.findAll(text=True)))
    for item in prev_and_next[0][1:]:
        next += str(item)
    for item in prev_and_next[1][1:]:
        prev += str(item)
    return (next, prev)
def parseFeedlist(self, html):
    """Parse a user's feed-list HTML into messages.

    :param html: feed page markup
    :return: (hasMore, feedmsgLst, max_id) — hasMore is 1 when a
        "more" node is present, feedmsgLst holds message dicts (retweets
        appended right after their message), max_id is the mid of the
        last successfully parsed message ('' when none).
    """
    feedDoc = fromstring(html)
    self.config = self.xpathconfig.getIndexConfig('v1')
    nodeLst = feedDoc.xpath(self.config.get("USER_FEEDLIST_XPATH"))
    moreNode = feedDoc.xpath(self.config.get("MORE_FEEDLIST_XPATH"))
    feedmsgLst = []
    hasMore = 0
    max_id = ""
    for node in nodeLst:
        try:
            msg,rtmsg = self.parseFeed(node)
            if msg:
                max_id = msg.get("mid")
                feedmsgLst.append(msg)
            if rtmsg:
                feedmsgLst.append(rtmsg)
        except:
            # skip nodes that fail to parse, keep the rest
            continue
    if moreNode:
        # a "more" link exists: further pages remain to be parsed
        hasMore = 1
    return hasMore,feedmsgLst,max_id
def from_chocolatey():
    """Yield (version, {'date': iso_date}) pairs for VMware Workstation
    releases scraped from the Chocolatey package page."""
    page = requests.get(
        'https://chocolatey.org/packages/vmwareworkstation').content
    version_re = re.compile('(?P<version>\d{1,2}\..*)', re.IGNORECASE)
    for row in fromstring(page).findall('.//tr'):
        # third cell: release date; first cell: release title (link or span)
        date = row.xpath('string(td[3])').strip()
        release = row.xpath('string(td[1]/a|td[1]/span)')
        match = version_re.search(release)
        if match and date:
            parsed = datetime.datetime.strptime(date, "%A, %B %d, %Y")
            yield match.group('version'), {'date': parsed.date().isoformat()}
def from_clamav():
    """Yield (version, {'date': iso_date}) pairs for ClamAV releases
    scraped from the current and previous-stable download pages."""
    pages = ['https://www.clamav.net/downloads',
             'https://www.clamav.net/previous_stable_releases']
    version_re = re.compile('clamav-(?P<version>\d{1,2}\..*)\.tar\.gz$',
                            re.IGNORECASE)
    for page_url in pages:
        root = fromstring(requests.get(page_url).content)
        for row in root.findall('.//tr'):
            # second cell: timestamp; first cell: tarball name
            date = row.xpath('string(td[2])').strip()
            archive = row.xpath('string(td[1])').strip()
            match = version_re.search(archive)
            if match and date:
                stamp = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S UTC")
                yield match.group('version'), {'date': stamp.date().isoformat()}
def fetch_xici(self): ''' www.xicidaili.com/nn/ first page ''' self.opener = urllib2.build_opener() ua = [('User-Agent', random.choice(settings.USER_AGENTS))] self.opener.addheaders = ua try: page = self.opener.open(self.websites_url['xici']).read() doc = soupparser.fromstring(page) proxy_sels = doc.xpath('//table[@id="ip_list"]//tr') for proxy_sel in proxy_sels[1:-1]: params = proxy_sel.xpath('./td') # print params proxy = dict() proxy['ip'] = params[2].xpath('./text()')[0] proxy['port'] = params[3].xpath('./text()')[0] proxy['type'] = params[6].xpath('./text()')[0] self.proxies.append(proxy) except urllib2.URLError as e: print e
def download_one(self,problem_name):
    """Download the accepted C++ submission for `problem_name` into
    self.folder/<problem_name>.cpp, skipping files that already exist.

    NOTE(review): as captured here the trailing `except:` has no handler
    body — the source appears truncated and will not compile as-is;
    recover the original handler before touching this function.
    """
    try:
        print 'downloading',problem_name
        if os.path.isfile(problem_name):
            print 'exist,skip'
            return
        self.update_referer(self.PROBLEM_URL)
        print self.SUBMISSION_URL%problem_name
        submission_page = self.opener.open(self.SUBMISSION_URL%problem_name).read()
        results = soupparser.fromstring(submission_page).xpath(self.SUBMISSION_XPATH)
        if not results:
            print 'accepted submission cannot be found on the first page'
            return
        self.update_referer(self.SUBMISSION_URL%problem_name)
        detail_page = self.opener.open(self.SITE_URL+results[0]).read()
        match_results = re.search(self.CPP_REGEX,detail_page)
        # submissions are stored escaped; decode back to source text
        code = match_results.group(1).decode('unicode-escape')
        with open(self.folder+'/'+problem_name+'.cpp','w') as w:
            w.write(code)
    except:
def get_rates(country, date=None):
    """Retrieve the VAT rates for the specified country.

    Returns a Rates object on success, or in case of error raises
    an exception.

    :param country: key into the module-level msa_map
    :param date: effective date; defaults to today
    :raises TICHTTPException: on a non-200 HTTP response
    :raises TICException: when the standard rate cannot be parsed
    """
    if date is None:
        date = datetime.date.today()
    req = urllib.request.Request(
        url=TIC_VATRATESEARCH,
        headers={'Content-Type': 'application/x-www-form-urlencoded'})
    req.method = 'POST'
    req.data = urllib.parse.urlencode([('listOfMsa', msa_map[country]),
                                       ('listOfTypes', 'Standard'),
                                       ('listOfTypes', 'Reduced'),
                                       ('listOfTypes', 'Category'),
                                       ('dateFilter', format_date(date))])
    f = urllib.request.urlopen(req)
    status = f.getcode()
    if status != 200:
        raise TICHTTPException(status, f.info(), f.read())
    body = f.read()
    xml = soupparser.fromstring(body)
    # first row of the national-rates table; second cell holds the
    # standard rate as text like "21 %"
    row = xml.find('.//div[@id="national"]/table/tbody/tr')
    std_rate = ''.join(row[1].itertext()).strip()
    m = _percent_re.match(std_rate)
    if not m:
        raise TICException("didn't understand rate %s" % std_rate)
    rate = Rate(D(m.group(1)), date)
    # only the Standard rate is populated; reduced/category maps left empty
    rates = Rates({'Standard': rate}, {}, {})
    return rates
def from_apache():
    """Yield (version, {'date': iso_date}) pairs for Apache httpd releases
    scraped from the archive.apache.org directory listing."""
    listing = requests.get('https://archive.apache.org/dist/httpd/').content
    anchors = fromstring(listing).xpath(
        './/a[starts-with(@href, "apache_") or starts-with(@href, "httpd-")]')
    version_re = re.compile(
        r'(apache_|httpd-)(?P<version>\d\.\d.\d{1,2})\.[^\d]*', re.IGNORECASE)
    for anchor in anchors:
        name = anchor.text
        # the timestamp follows the link in the listing's tail text
        date = anchor.tail.strip().rsplit(' ', 1)[0].strip()
        match = version_re.search(name)
        if match and date:
            stamp = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M")
            yield match.group('version'), {'date': stamp.date().isoformat()}
def __get_msg_reply(self, url):
    """
    Collect the parameters needed to POST a reply, as observed with the
    browser's network inspector.
    The only questionable parameter is mouse_pwd; on my machine this
    value lets the post go through.
    If posting stops working, adjust these values to match what your
    browser actually sends.
    :param url: thread URL to collect the parameters from
    :return: dict of post parameters
    """
    dictory = {}
    text = requests.get(url=url, allow_redirects=False, verify=False).content
    # Second request fetches the sign-list JSON (cache-busted by timestamp).
    text2 = requests.get(url="http://tieba.baidu.com/f/user/sign_list?t="
                         + str(int(time.time() * 10000)),
                         allow_redirects=False, verify=False).content
    soup = soupparser.fromstring(text)
    # First hidden input on the page carries the forum keyword.
    msg = soup.xpath(".//*[@type='hidden']")[0]
    dictory['kw'] = msg.attrib['value']
    # Remaining ids are scraped from inline JS on the thread page.
    dictory['floor_num'] = re.findall("reply_num:([0-9]*),", text)[0]
    dictory['tid'] = re.findall("thread_id:([0-9]*),", text)[0]
    dictory['fid'] = re.findall('"forum_id":([0-9]*),', text)[0]
    dictory['tbs'] = re.findall('"tbs": "([\w]*)",', text)[0]
    dictory["sign_id"] = json.loads(text2.decode("gbk"))['data']['used_id']
    dictory["mouse_pwd_t"] = int(time.time())
    return dictory
def highlightcallback(code):
    """Replace a <code> element's content with Pygments-highlighted HTML.

    *code* is an lxml element and is mutated in place. The lexer comes
    from the element's ``lang`` attribute, falling back to guessing from
    the serialized markup.
    """
    try:
        lexer = get_lexer_by_name(code.attrib['lang'])
    except Exception:
        lexer = guess_lexer(etree.tostring(code))
    output = code.text_content()  # same as `etree.tostring(code, method='text')` afaict
    output = highlight(output, lexer, HtmlFormatter())
    # NOTE: emitting the styles like this doesn't feel right
    # if you have multiple entries with source code -> redundant style tags
    # plus, all this style info doesn't really belong in the html
    output = '<style>' + HtmlFormatter().get_style_defs(
        '.highlight') + '</style>' + output
    # newElement has html tags around the actual content!
    newElement = fromstring(output)
    # lxml insists of wrapping with <html>..</html> tags, so page source would look like:
    # <code><html><style...
    # the easiest fix is just changing the html to div, we get rid of the html tag mid-document
    # and having a wrapping div tag is harmless.
    newElement.tag = 'div'
    code.clear()
    code.append(newElement)
def get_content(self, url):
    """Fetch *url* and split its readable article into text/image parts.

    :param url: page URL to fetch
    :return: tuple (title, parts) where parts is a list of
        ['0', image_src] or ['1', text_fragment] entries
    """
    rt_result = []
    # Matches any HTML tag; used to strip markup from text fragments.
    dr = re.compile(r'<[^>]+>', re.S)
    html = urllib.urlopen(url).read()
    cur_title = Document(html).short_title().replace(' ', '')
    readable_article = Document(html).summary()
    print readable_article.encode('utf8')
    readable_article = readable_article.replace(' ', '')
    # Treat each closing </p> as a paragraph break before splitting.
    cur_list = readable_article.replace('</p>', '\n').split('\n')
    for item in cur_list:
        if '<img' in item and 'src=' in item:
            #print item.split('src=')[1].split('"')[1]
            dom = soupparser.fromstring(item)
            if len(dom) > 0:
                img_path = dom[0].xpath('.//img')
                for img in img_path:
                    # '0' tags an image entry.
                    rt_result.append(['0', img.get('src')])
        else:
            use_item = dr.sub('', item).replace(' ', '')
            # Skip very short fragments (likely markup leftovers).
            if len(use_item) > 10:
                # '1' tags a text entry.
                rt_result.append(['1', use_item])
    return cur_title, rt_result
def open_xml(self):
    """
    Opens the XML file and reads raw string, uses soupparse to encode.
    Removes some text that has no relevance for XML files, such as HTML tags,
    LaTeX entities

    Note that soupparser.fromstring is called by both open_xml and parse_xml.
    open_xml uses soupparser.fromstring because the html and LaTex cleanup
    needs a string, not a parse tree

    :return: semi-parsed XML content (also stored on ``self.raw_xml``)
    :raises Exception: re-raised wrapper around any error during read/parse
    """
    raw_xml = None
    try:
        logger.debug('Opening the file: {0}'.format(self.file_input))
        with open(self.file_input, 'rb') as fp:
            raw_xml = fp.read()
        # use soupparser to properly encode file contents
        # it could be utf-8, iso-8859, etc.
        parsed_content = soupparser.fromstring(raw_xml)
        # convert to string for ease of clean-up, convert html and LaTeX entities
        raw_xml = tostring(parsed_content)
        # strip editor body markers, then decode the known entities
        raw_xml = re.sub('(<!-- body|endbody -->)', '', raw_xml)
        raw_xml = edef.convertentities(raw_xml)
        # drop malformed CDATA processing instructions
        raw_xml = re.sub('<\?CDATA.+?\?>', '', raw_xml)
        logger.debug('reading')
        logger.debug('Opened file, trying to massage the input.')
        logger.debug('XML file opened successfully')
        self.raw_xml = raw_xml
    except Exception as err:
        logger.error('Error: {0}'.format(err))
        raise Exception(err)
    return raw_xml
def from_bucardo():
    """Yield (version, {'date': iso_date}) pairs scraped from
    bucardo.org's list of all PostgreSQL releases."""
    doc = fromstring(
        requests.get('https://bucardo.org/postgres_all_versions.html').content)
    pattern = re.compile(
        '^(?P<version>\d{1,2}\..*) \((?P<date>[\d]{4}-[\d]{2}-[\d]{2})\)$',
        re.IGNORECASE)
    for cell in doc.findall('.//td'):
        # A cell may list several releases, one per line.
        for line in cell.text_content().splitlines():
            hit = pattern.search(line)
            if not hit:
                continue
            when = datetime.datetime.strptime(hit.group('date'), "%Y-%m-%d")
            yield hit.group('version'), {'date': when.date().isoformat()}
def set_tieba(self, name): """ 修改正在操作的贴吧名字 :param name: 贴吧名 :return: """ self.name = name url_items = [MAIN_URL, "f?kw=", name, "&fr=home"] self.url = ''.join(url_items) req = requests.get(self.url, headers=HEADERS, allow_redirects=False, verify=False) if int(req.status_code) != 200: raise TiebaError, 'The tieba: "%s" have not exist!' % self.name self.html = req.content try: self.soup = soupparser.fromstring(self.html) except ValueError: self.set_html_by_js()
def from_virten():
    """Yield (version, {'date': iso_date}) pairs for VMware Workstation
    releases listed on virten.net."""
    doc = fromstring(
        requests.get(
            'https://www.virten.net/vmware/workstation-release-and-build-number-history/'
        ).content)
    version_re = re.compile('(?P<version>(\d{1,2}\.?){2,3})', re.IGNORECASE)
    for row in doc.xpath('.//tr'):
        name = row.xpath('string(td[1]/text())')
        when = row.xpath('string(td[3]/text())')
        hit = version_re.search(name)
        if not (hit and when):
            continue
        parsed = datetime.datetime.strptime(when, "%Y-%m-%d")
        yield hit.group('version'), {'date': parsed.date().isoformat()}
def run(self):
    """Fetch this worker's URL, spawn a parser thread for parsable
    content, and hand the annotated result to ProcessChecked().

    Stores the response object under self.obj['obj'] and the HTTP status
    under self.obj['sta']; an HTTPError is recorded as the status
    instead of propagating.
    """
    temp_status = None
    temp_object = None
    try:
        temp_req = Request(self.obj['url'], headers=request_headers)
        temp_res = urlopen(temp_req)
        temp_code = temp_res.getcode()
        temp_type = temp_res.info()["Content-Type"]
        temp_status = temp_res.getcode()
        temp_object = temp_res
        if temp_code == 200:
            # NOTE(review): `types` is defined elsewhere in the module;
            # it appears to be a Content-Type substring filter — confirm.
            if types in temp_type:
                temp_content = temp_res.read()
                #var_dump(temp_content)
                try:
                    temp_data = fromstring(temp_content)
                    # Parse the fetched document on its own thread; the
                    # thread is tracked in the module-level link_threads.
                    temp_thread = threading.Thread(
                        target=ParseThread,
                        args=(self.obj['url'], temp_data))
                    link_threads.append(temp_thread)
                    temp_thread.start()
                except (RuntimeError, TypeError, NameError, ValueError):
                    print ('Content could not be parsed, perhaps it is XML? We do not support that yet.')
                    #var_dump(temp_content)
                    pass
    except HTTPError as e:
        # Record the HTTP error code as the final status.
        temp_status = e.code
        pass
    self.obj['obj'] = temp_object
    self.obj['sta'] = temp_status
    ProcessChecked(self.obj)
def check_all_text_translated(self, view, args):
    """Render *view* under the mocked French translation and assert every
    translatable element carries the "XXX " marker prefix.

    :param view: view name passed to reverse()
    :param args: positional args for reverse()
    """
    old_lang = translation.get_language()
    self.mock_get_text_functions_for_french()
    translation.activate("fr")
    response = self.client.get(reverse(view, args=args), follow=True)
    html = response.content  # pylint: disable=E1103
    root = fromstring(html)
    sel = CSSSelector('*')
    for element in sel(root):
        if self.has_translatable_text(element):
            try:
                self.assertTrue(
                    self.contains(element.text, "XXX "),
                    "No translation for element " + element.tag +
                    " with text '" + element.text + "' from view '" +
                    view + "'")
            finally:
                # NOTE(review): the original language is restored after the
                # FIRST translatable element; later iterations run with it
                # already reactivated — confirm this is intended.
                translation.activate(old_lang)
def getDetailsFromUrl(address):
    """Scrape one film's detail page from lesvisiteursdusoir.com.

    :param address: path component appended to the site root URL
    :return: dict with title, duration, country, year, synopsis,
        director, url, poster and allocine keys (scalars extracted from
        the xpath result lists via firstValue()).
    """
    url = 'http://lesvisiteursdusoir.com' + address
    response = urllib2.urlopen(url)
    html = response.read()
    root = fromstring(html)
    # Each field lives in a Drupal field wrapper with a distinctive class.
    titleArray = root.xpath("//h1[@class='title']/text()")
    yearArray = root.xpath(
        "//div[@class='field field-name-field-annee-realisation field-type-date field-label-inline clearfix']//span/text()"
    )
    durationArray = root.xpath(
        "string(//div[@class='field field-name-field-duree-txt field-type-text field-label-inline clearfix']//div[@class='field-item even']/text())"
    )
    countryArray = root.xpath(
        "//div[@class='field field-name-field-pays field-type-text field-label-inline clearfix']//div[@class='field-item even']/text()"
    )
    directorArray = root.xpath(
        "//div[@class='field field-name-field-realisateur field-type-text field-label-inline clearfix']//div[@class='field-item even']/text()"
    )
    synopsisArray = root.xpath(
        "//div[@class='field field-name-field-synopsis field-type-text-long field-label-above']//p/text()"
    )
    imageArray = root.xpath(
        "//div[@class='field field-name-field-affiche field-type-image field-label-hidden']//img/@src"
    )
    allocineShortcutArray = root.xpath(
        "//div[@class='region region-content']//div[@class='content']/a/@href")
    #print title, duration, country, director, synopsis, image, allocineShortcut
    details = {
        "title": firstValue(titleArray),
        "duration": str(durationArray),
        "country": firstValue(countryArray),
        "year": firstValue(yearArray),
        "synopsis": firstValue(synopsisArray),
        "director": firstValue(directorArray),
        "url": url,
        "poster": firstValue(imageArray),
        "allocine": firstValue(allocineShortcutArray)
    }
    return details
def _check_response(self, response):
    '''Checks response and maps HTTP errors to appngizer exceptions.

    :param requests.Response response: response object
    :raises appngizer.errors.HttpClientBadRequest: on 400
    :raises appngizer.errors.HttpElementConflict: on 409
    :raises appngizer.errors.HttpElementForbidden: on 403
    :raises appngizer.errors.HttpElementNotFound: on 404
    :raises appngizer.errors.HttpServerError: on 500
    :raises appngizer.errors.ClientError: on other errors or an
        unexpected Content-Type
    :return: bool (True if valid, False if not valid)
    '''
    response_ct = response.headers.get('Content-Type')
    if not response.ok:
        if response.status_code == 400:
            raise appngizer.errors.HttpClientBadRequest('400 - Bad request ({})'.format(response.url))
        if response.status_code == 409:
            raise appngizer.errors.HttpElementConflict('409 - Conflict ({})'.format(response.url))
        if response.status_code == 403:
            raise appngizer.errors.HttpElementForbidden('403 - Forbidden ({})'.format(response.url))
        if response.status_code == 404:
            raise appngizer.errors.HttpElementNotFound('404 - Not found ({})'.format(response.url))
        if response.status_code == 500:
            # try to get exception message from html error page if exist
            if response.text:
                html_error = soupparser.fromstring(response.text)
                pre_childs = html_error.xpath('//pre[contains(text(),"Exception")]')
                pre_texts = []
                for pre_text in pre_childs:
                    pre_texts.append(pre_text.text)
                raise appngizer.errors.HttpServerError('500 - Server error ({}): {}'.format(response.url, ' '.join(pre_texts)))
            else:
                raise appngizer.errors.HttpServerError('500 - Server error ({})'.format(response.url))
        else:
            raise appngizer.errors.ClientError(response.raise_for_status())
    else:
        if self.content_type != response_ct:
            # A 204 on DELETE and a body-less 200 legitimately carry no
            # Content-Type; anything else is unexpected.
            if response.status_code == 204 and response.request.method == 'DELETE':
                return True
            # Fixed: identity check (`is None`) instead of `== None`.
            if response.status_code == 200 and response_ct is None:
                return True
            else:
                raise appngizer.errors.ClientError('Unexpected response Content-Type: {0}'.format(response_ct))
    return True
def parse_state_schools(schools_html):
    """Parse the school-picker form markup into a list of school dicts.

    :param schools_html: HTML of the page containing the selectHsForm form
    :return: list of dicts with 'hs_code', 'high_school_name', 'city'
        and 'state' (two-letter code)
    """
    doc = fromstring(schools_html)
    form = doc.cssselect('form[name="selectHsForm"]')[0]
    results = []
    for tr in form.cssselect('tr'):
        code_inputs = tr.cssselect('input[name="hsCode"]')
        if not code_inputs:
            # Header/spacer rows carry no hsCode input.
            continue
        tds = tr.cssselect('td')
        results.append({
            'hs_code': code_inputs[0].get('value'),
            'high_school_name': tds[1].text_content().strip(),
            'city': tds[2].text_content().strip(),
            'state': tds[3].text_content().strip()[:2],
        })
    return results
def extract(self, url, html): """根据xpath从DOM树中抽取出感兴趣的内容""" # extract person's url result = {} # extract city from url city = re.search(r'city_py=\w+', url).group(0) if len(city) > 8: city = city[8:] result['city'] = city # from lxml import etree import lxml.html.soupparser as soupparser dom = soupparser.fromstring(html) for name, xpath in self.xpaths.items(): result[name] = [] r = dom.xpath(xpath) for item in r: try: uid = re.search(r'\d+', item.strip()).group(0) result[name].append(uid) self.fetched_num += 1 except Exception, e: # print 'Error occurs: =>', url, ' ==> ', item pass # always item = '/user/'
def from_wikipedia():
    """Yield (version, {'date': iso_date}) pairs from Wikipedia's Google
    Chrome version-history table."""
    doc = fromstring(
        requests.get(
            'https://en.wikipedia.org/wiki/Google_Chrome_version_history').
        content)
    version_re = re.compile('(?P<version>\d{1,2}\.[0-9.]*)', re.IGNORECASE)
    for row in doc.findall('.//tbody/tr'):
        name = row.xpath('string(td[1])').strip()
        # Keep only the first date occurrence; the rest are per-OS details.
        when = row.xpath('string(td[2]/text())').strip().split(' ', 2)[0]
        hit = version_re.search(name)
        if not (hit and when):
            continue
        parsed = datetime.datetime.strptime(when, "%Y-%m-%d")
        yield hit.group('version'), {'date': parsed.date().isoformat()}
def from_oracle():
    """Yield (version, info) pairs from Oracle's Java release-date table.

    The version is normalized to the "1.<major>.0_<minor>" scheme; info
    carries 'version_major' and the raw 'date' text.
    """
    doc = fromstring(
        requests.get(
            'https://java.com/en/download/faq/release_dates.xml').content)
    java_re = re.compile(
        'java (?P<version_major>\d*) update (?P<version_minor>\d*)',
        re.IGNORECASE)
    for row in doc.findall('.//tr'):
        name = row.xpath('string(td[1]/text())')
        when = row.xpath('string(td[2]/text())')
        hit = java_re.search(name)
        if not hit:
            continue
        major = hit.group('version_major')
        full = "1.%s.0_%s" % (major.strip(),
                              hit.group('version_minor').strip())
        yield full, {'version_major': major, 'date': when.strip()}
def from_string(self, string, isHTML=False, encoding=None, remove_blank_text=False):
    """Parse *string* into an lxml tree, handling encoding detection.

    :param string: markup to parse; None yields None
    :param isHTML: parse as HTML (with soupparser fallback) instead of XML
    :param encoding: explicit encoding; when None, UnicodeDammit sniffs it
    :param remove_blank_text: for XML, use the blank-stripping parser
    :return: parsed element tree, or None for None input
    """
    if string is None:
        return None
    if encoding is None:  # fixed: identity check instead of `== None`
        # Let UnicodeDammit sniff the encoding, then normalize to UTF-8.
        ud = UnicodeDammit(str(string), isHTML=isHTML)
        markup = ud.markup.encode('utf-8')
    else:
        markup = str(string).encode(encoding)
    if isHTML:
        try:
            return html.fromstring(markup, parser=html_parser)
        # Fixed: bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        except Exception:
            self._core.log_exception(
                'Error parsing with lxml, falling back to soupparser')
            return soupparser.fromstring(string)
    else:
        return etree.fromstring(
            markup, parser=(xml_parser if remove_blank_text else None))
def scrape(self, i):
    """Fetch the listing page for index *i* and build NasdaqCompany
    records from its companies table.

    :param i: key into ``self.indexes``; also recorded as the segment
    :return: list of NasdaqCompany instances
    """
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
    }
    page = requests.get(self.indexes[i], headers=headers)
    doc = fromstring(page.text)
    rows = doc.xpath(
        "//article[@class='nordic-our-listed-companies']//tbody/tr")
    listed = []
    for row in rows:
        cells = row.findall("td")
        listed.append(
            NasdaqCompany(name=cells[0].find("a").text,
                          ticker=cells[1].text,
                          currency=cells[2].text,
                          category=cells[4].text,
                          segment=i))
    return listed
def search(results,encodedTitle,title,searchTitle,siteNum,lang,searchByDateActor,searchDate,searchSiteID):
    """Search the site for *searchTitle* and append scored matches to
    *results* (Plex MetadataSearchResult objects).

    Scoring subtracts the Levenshtein distance from 100, comparing dates
    when *searchDate* is given, otherwise titles.
    """
    if searchSiteID != 9999:
        siteNum = searchSiteID
    searchString = searchTitle.replace(" ","-")
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
    try:
        searchResults = HTML.ElementFromURL(PAsearchSites.getSearchSearchURL(searchSiteID) + searchString)
    except:
        # Plex's fetcher failed; retry with urllib forcing TLSv1.
        request = urllib.Request(PAsearchSites.getSearchSearchURL(searchSiteID) + searchString, headers=headers)
        response = urllib.urlopen(request, context=ssl.SSLContext(ssl.PROTOCOL_TLSv1))
        htmlstring = response.read()
        searchResults = fromstring(htmlstring)
    for searchResult in searchResults.xpath('//div[contains(@class,"postTag")]'):
        titleNoFormatting = searchResult.xpath('.//div[@class="nazev"]//h2//a')[0].text_content()
        Log('title: ' + titleNoFormatting)
        # Relative link becomes the scene id ("./x" -> "_x").
        curID = searchResult.xpath('.//a')[0].get('href').replace("./","_")
        Log('curID: ' + curID)
        releaseDate = parse(searchResult.xpath('.//div[@class="datum"]')[0].text_content().strip()).strftime('%Y-%m-%d')
        Log('releaseDate: ' + releaseDate)
        actors = searchResult.xpath('.//div[@class="nazev"]//div[@class="featuring"]//a')
        actorList = []
        for actor in actors:
            actorName = actor.text_content()
            actorList.append(actorName)
        actorsPrint = ", ".join(actorList)
        Log("actors: " + actorsPrint)
        if searchDate:
            score = 100 - Util.LevenshteinDistance(searchDate, releaseDate)
        else:
            score = 100 - Util.LevenshteinDistance(searchTitle.lower(), titleNoFormatting.lower())
        results.Append(MetadataSearchResult(id = curID + "|" + str(siteNum), name = actorsPrint + " in " + titleNoFormatting + " [" + PAsearchSites.getSearchSiteName(siteNum) + "] " + releaseDate, score = score, lang = lang))
        Log(curID + "|" + str(siteNum) + " // " + titleNoFormatting + " [" + PAsearchSites.getSearchSiteName(siteNum) + "] " + releaseDate + " // " + str(score))
    return results
def get_incidents(year): """ Gets crime incidents from the DC Government XML. """ print 'Downloading year: %s' % year # Build URL from year. # If the year is 2007-2011, download the XML straight from ... my S3 account. if year in range(2007, 2011): url = 'http://wapo-projects.s3.amazonaws.com/techathon/scraperwiki/xml/crime_incidents_%s_plain.xml' % year # If the year is 2012, get it from the DC government. This is NOT the whole year. if year == 2012: url = 'http://data.octo.dc.gov/feeds/crime_incidents/crime_incidents_current.xml' # Request the data using the Requests library. request = requests.get(url) unzipped_request = request.content # Parse the XML using lxml's BeautifulSoup parser. crime_xml_parsed = fromstring(unzipped_request) # Return the parsed Element() objects by grabbing the xpath for <entry> tags. return crime_xml_parsed.xpath('//entry')
def test_save_bookmarks(self):
    """Round-trip test: reduce a Netscape-format bookmark tree, save it,
    and compare the written file against the source text.

    NOTE(review): the exact interior whitespace of this literal was
    mangled by source flattening — verify against version control.
    """
    expected = """<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
It will be read and overwritten.
DO NOT EDIT! -->
<H1>Bookmarks Menu</H1>

<DL><p>
<DT><H3 ADD_DATE="1518129521" LAST_MODIFIED="1518129615">Subfolder</H3>
<DL><p>
<DT><A ADD_DATE="1518129612" HREF="http://www.sub.level.html" LAST_MODIFIED="1518129612"> Sub level link</A>
</DL>
<DT><A ADD_DATE="1518129612" HREF="http://www.top.level.html" LAST_MODIFIED="1518129612"> Top level link</A>
</DL>
"""
    tree = bs.reduce_tree(fromstring(expected))
    with tempfile.TemporaryDirectory() as fpd:
        filepath = os.path.join(fpd, 'merged.html')
        bs.save_bookmarks(tree, filepath)
        # Read back before the temporary directory is cleaned up.
        actual = self.file_to_string(filepath)
        self.assertEqual(actual, expected)
def from_wikipedia():
    """Yield (version, info) pairs from Wikipedia's Java version history.

    Versions are normalized to the "1.x" scheme; info carries
    'version_major' and the raw 'date' text.
    """
    doc = fromstring(
        requests.get(
            'https://en.wikipedia.org/wiki/Java_version_history').content)
    # Up to Java 8, rows read "Java SE <major> Update <minor>";
    # Java 9+ rows use dotted "Java SE <major>.<minor>" naming.
    legacy_re = re.compile(
        'java se (?P<version_major>\d*) update (?P<version_minor>.*)',
        re.IGNORECASE)
    modern_re = re.compile(
        'java se (?P<version_major>\d*?)\.(?P<version_minor>.*)',
        re.IGNORECASE)
    for row in doc.findall('.//tbody/tr'):
        name = row.xpath('string(td[1]/text())')
        when = row.xpath('string(td[2]/text())')
        hit = legacy_re.search(name)
        if hit:
            full = "1.%s.0_%s" % (hit.group('version_major').strip(),
                                  hit.group('version_minor').strip())
            yield full, {'version_major': hit.group('version_major'),
                         'date': when.strip()}
        hit = modern_re.search(name)
        if hit:
            minor = hit.group('version_minor').strip().replace('.', '_')
            full = "1.%s.%s" % (hit.group('version_major').strip(), minor)
            yield full, {'version_major': hit.group('version_major'),
                         'date': when.strip()}
def from_chocolatey():
    """Yield (version, {'date': iso_date}) pairs for VLC releases listed
    on the Chocolatey package page."""
    raw = requests.get(
        'https://chocolatey.org/packages/vlc').content.decode('utf-8')
    # Drop characters outside the XML 1.0 valid range before parsing.
    cleaned = re.sub(
        r'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+',
        '', raw)
    doc = fromstring(cleaned)
    version_re = re.compile('(?P<version>\d{1,2}\..*)', re.IGNORECASE)
    for row in doc.findall('.//tr'):
        when = row.xpath('string(td[3])').strip()
        name = row.xpath('string(td[1]/a|td[1]/span)')
        hit = version_re.search(name)
        if not (hit and when):
            continue
        parsed = datetime.datetime.strptime(when, "%A, %B %d, %Y")
        yield hit.group('version'), {'date': parsed.date().isoformat()}
def handle_sncf_message(self, message):
    """Parse an SNCF booking e-mail into a calendar event and write it
    to ``my.ics``.

    :param message: an email.message.Message with the booking HTML in
        its second MIME part (quoted-printable, latin1)
    """
    payload = list(message.walk())[1].get_payload()
    # Decode quoted-printable, then flatten whitespace the HTML uses
    # for layout.
    root = fromstring(
        quopri.decodestring(payload).decode("latin1").replace(
            "\t", "").replace("\n", "").replace('\\xa0', ' '))
    # Fixed positions in the deeply nested layout tables.
    departure_city, _, arrival_city, _, seat_info, duration, _ = [
        r.replace("\xa0", " ") for r in root.xpath(
            "//table/tr/td/table/tr/td/table/tr/td/span/text()")
    ]
    departure_time, train_id, ticket_id, arrival_time = [
        r.replace("\xa0", " ") for r in root.xpath(
            "//table/tr/td/table/tr/td/table/tr/td/span/b/text()")
    ]
    # NOTE(review): departure_date stays a LIST here, so the "%s" below
    # interpolates its repr — confirm dateparser copes with that.
    departure_date = [
        r.replace("\xa0", " ") for r in root.xpath(
            "//html/body/table/tr/td/table/tr/td/span/text()")
    ]
    c = Calendar()
    e = Event()
    e.name = "%s: %s -> %s [%s]" % (train_id, departure_city, arrival_city,
                                    ticket_id)
    e.begin = dateparser.parse("%s %s CEST" % (departure_date,
                                               departure_time),
                               languages=["fr"])
    e.end = dateparser.parse("%s %s CEST " % (departure_date, arrival_time),
                             languages=["fr"])
    e.location = departure_city
    e.description = "%s" % seat_info
    c.events.add(e)
    # NOTE(review): bare `c.events` has no effect — looks like a leftover.
    c.events
    with open('my.ics', 'w') as f:
        f.writelines(c)
def fix_links(site, text):
    """Rewrite <img src> and <a href> links in *text* to local site URLs.

    Every link encountered is appended to /tmp/links.txt for auditing.

    :param site: site object passed through to the link resolvers
    :param text: HTML fragment to rewrite
    :return: the rewritten HTML as a unicode string
    """
    from lxml.html.soupparser import fromstring
    e = fromstring(text)
    # `with` guarantees the audit log is closed even if a resolver raises
    # (the original leaked the handle on error).
    with open('/tmp/links.txt', 'a+') as f:
        for img in e.xpath('//img'):
            src = img.get('src')
            f.write((src or '').encode('utf-8') + "\n")
            image = get_image_from_link(site, src)
            # A string result means "leave as-is"; only rewrite when an
            # actual image object was resolved.
            if not isinstance(image, basestring) and image is not None:
                url = localize(image, site) + "/@@images/image"
                logger.info("Change image link %s to %s", src, url)
                img.set('src', url)
        for a in e.xpath('//a'):
            href = a.get('href')
            f.write((href or '').encode('utf-8') + "\n")
            if href is None:
                continue
            res = fix_inner_link(site, href)
            if not res:
                continue
            if href != res:
                if not isinstance(res, basestring):
                    res = res.absolute_url()
                logger.info("Change link %s to %s", href, res)
                a.set('href', res)
    return lxml.html.tostring(e, encoding='unicode', pretty_print=True)