def get_threads(url):
    """Collect thread links from a paginated forum listing at *url*.

    Reads the paginator label ("Strona 1 z N" — Polish for "Page 1 of N")
    to find the page count, scrapes every page, and returns the unique
    threads as ``{'href': ..., 'title': ...}`` dicts, de-duplicated by
    href (last occurrence wins).

    :param url: base URL of the forum section (pages are ``url + '/pageX'``)
    :return: dict-values view of unique thread dicts
    :raises ValueError: if a page yields mismatched title/href counts
    """
    tree = lxml.html.parse(url)
    xpatheval = etree.XPathDocumentEvaluator(tree)
    web_pages_num_xpath = xpatheval(
        '//a[@class="popupctrl" and contains(text(),"Strona 1 z")]/text()')
    if not web_pages_num_xpath:
        # No paginator on the page: everything fits on a single page.
        threads_web_pages = 1
    else:
        prefix_len = len('Strona 1 z ')
        threads_web_pages = int(web_pages_num_xpath[0][prefix_len:].strip())
    # One shared predicate keeps the titles and hrefs queries in lockstep.
    link_predicate = ("//a[contains(@class,'title') and "
                      "not( contains(@href,'http'))]")
    topics = []
    for page in range(1, threads_web_pages + 1):
        tree = lxml.html.parse(url + '/page' + str(page))
        xpatheval = etree.XPathDocumentEvaluator(tree)
        titles = xpatheval(link_predicate + "/text()")
        hrefs = xpatheval(link_predicate + "/@href")
        # Explicit check instead of `assert`, which vanishes under `-O`.
        if len(titles) != len(hrefs):
            raise ValueError(
                'title/href count mismatch on page %d of %s' % (page, url))
        for href, title in zip(hrefs, titles):
            topics.append({'href': href, 'title': title})
    print('loading thread from ' + url + ' done. ')
    # De-duplicate by href; dict insertion keeps the last duplicate.
    return {v['href']: v for v in topics}.values()
def test_propfind_all_names(self):
    # A PROPFIND request with a <propname/> body should answer
    # 207 Multi-Status, listing property NAMES only — the expected body
    # below asserts empty property elements, no values.
    self.sub_object.get_descendants.return_value += [self.sub_object]
    request = Mock(META={})
    path = 'collection/sub_object'
    v = DavView(base_url='/base/', path=path, request=request,
                acl_class=FullAcl, xml_pretty_print=True)
    # Inject the mocked resource straight into the instance dict —
    # presumably to bypass the view's normal resource resolution
    # (TODO confirm against DavView.resource).
    v.__dict__['resource'] = self.sub_object
    # Hand propfind a pre-built XPath evaluator over the <propname/>
    # request document, namespaced for WebDAV.
    resp = v.propfind(request, path, etree.XPathDocumentEvaluator(ElementTree(
        D.propfind(
            D.propname()
        )
    ), namespaces=WEBDAV_NSMAP)
    )
    self.assertEqual(resp.status_code, 207)
    # Expected multistatus: one <response> for the resource with every
    # known property name (empty elements) and a 200 propstat status.
    self.assertEqual(resp.content, etree.tostring(D.multistatus(
        D.response(
            D.href('/base/collection/sub_object'),
            D.propstat(
                D.prop(
                    D.getcontentlength(),
                    D.creationdate(),
                    D.getlastmodified(),
                    D.resourcetype(),
                    D.displayname(),
                ),
                D.status("HTTP/1.1 200 OK")
            )
        ),
    ), pretty_print=True, xml_declaration=True, encoding='utf-8')
    )
def __init__(self, *args, **kwargs):
    """Run the parent initialization, then attach the XPath machinery
    this lookup type needs: a document evaluator over the parsed tree
    and the compiled lemma-match query.
    """
    super(CustomLookupType, self).__init__(*args, **kwargs)
    evaluator = etree.XPathDocumentEvaluator(self.tree)
    self.xpath_evaluator = evaluator
    self.lemma = self.prepare_xpath(self.lemma_match_query)
def dispatch(self, request, path, *args, **kwargs):
    """Entry point for every request reaching this WebDAV view.

    Splits the incoming URL into ``self.path`` and ``self.base_url``,
    parses an XML request body (when one is declared) into an XPath
    evaluator passed on as ``xbody``, routes to the handler method named
    after the HTTP verb, and maps raised exceptions to HTTP responses.

    :param request: incoming HTTP request
    :param path: resource path below the view's mount point
    :param args: extra positional arguments forwarded to the handler
    :param kwargs: extra keyword arguments forwarded (``xbody`` injected)
    :return: the handler's response with Allow/Date/Server headers filled in
    """
    if path:
        self.path = path
        # Everything before the resource path is the mount prefix.
        self.base_url = request.META['PATH_INFO'][:-len(self.path)]
    else:
        self.path = '/'
        self.base_url = request.META['PATH_INFO']
    self.user = request.user
    meta = request.META.get
    self.xbody = kwargs['xbody'] = None
    # Only parse a body for non-PUT requests that declare XML content
    # with a positive, non-empty Content-Length.
    if (request.method.lower() != 'put' and
            "/xml" in meta('CONTENT_TYPE', '') and
            meta('CONTENT_LENGTH', 0) != '' and
            int(meta('CONTENT_LENGTH', 0)) > 0):
        # Parse XML using defusedxml's parse function.
        # NOTE(review): resolve_entities=True re-enables entity expansion;
        # confirm defusedxml still blocks external entities with this flag.
        self.xbody = kwargs['xbody'] = etree.XPathDocumentEvaluator(
            parse(request, etree.XMLParser(ns_clean=True,
                                           resolve_entities=True)),
            namespaces=WEBDAV_NSMAP)
    if request.method.upper() in self._allowed_methods():
        handler = getattr(self, request.method.lower(),
                          self.http_method_not_allowed)
    else:
        handler = self.http_method_not_allowed
    try:
        resp = handler(request, self.path, *args, **kwargs)
    except ResponseException as e:
        print(e)
        resp = e.response
    except PermissionDenied as pe:
        print(pe)
        resp = HttpResponseForbidden()
    except ValidationError as ve:
        print(ve)
        resp = HttpResponseBadRequest()
    if 'Allow' not in resp:
        methods = self._allowed_methods()
        if methods:
            resp['Allow'] = ", ".join(methods)
    if 'Date' not in resp:
        resp['Date'] = rfc1123_date(now())
    if self.server_header:
        resp['Server'] = self.server_header
    return resp
def safe_names_to_file():
    """Scrape Polish first names from ksiegaimion.com and write them,
    ASCII-transliterated via unidecode, one per line to
    ``../files/names_pl.txt``.
    """
    webpage_url = 'http://www.ksiegaimion.com/spis-imion'
    tree = lxml.html.parse(webpage_url)
    xpatheval = etree.XPathDocumentEvaluator(tree)
    # The original wrapped this list in another list and read names[0];
    # keep the single flat result instead.
    names = xpatheval('//div[@class="table"]//a/text()')
    time.sleep(1)  # be polite to the server
    # `with` closes the file; the explicit f.close() was redundant.
    with open('../files/names_pl.txt', 'w') as f:
        for name in names:
            f.write('%s\n' % unidecode.unidecode(name.strip()))
def safe_surnames_to_file():
    """Scrape Polish surnames (one page per initial letter) from
    genealogiapolska.pl and write them, ASCII-transliterated via
    unidecode, to ``../files/surnames_pl.txt``.

    Surnames of length <= 2 are skipped, matching the original filter.
    """
    first_letter_string = 'wertuiopasdfghjklzcbnm'
    webpage_url = 'http://genealogiapolska.pl/surnames-oneletter.php?firstchar='
    surnames = []
    # Iterate the letters directly instead of range(0, len(...)).
    for letter in first_letter_string:
        surnames_webpage_url = webpage_url + letter.upper()
        tree = lxml.html.parse(surnames_webpage_url)
        xpatheval = etree.XPathDocumentEvaluator(tree)
        surnames.append(
            xpatheval('//table[@class="sntable"]//a[@href]/text()'))
        time.sleep(1)  # throttle requests between letters
    # `with` closes the file; the explicit f.close() was redundant.
    with open('../files/surnames_pl.txt', 'w', encoding='utf-8') as f:
        for surname_on_letter in surnames:
            for surname in surname_on_letter:
                if len(surname) > 2:
                    f.write('%s\n' % unidecode.unidecode(surname.strip()))
def split_subjects_xpath(webpage_url):
    """Scrape per-post metadata from *webpage_url*.

    :param webpage_url: forum page URL to parse
    :return: list of dicts with 'answer', 'users', 'date' and 'time'
        keys, or ``None`` when the page cannot be parsed.

    NOTE(review): the original indexed undefined names ``users`` and
    ``subjects`` (a NameError at runtime); this version reads
    ``user_names`` and builds ``subjects`` locally — confirm no caller
    relied on a module-level ``subjects``.
    """
    try:
        tree = lxml.html.parse(webpage_url)
        xpatheval = etree.XPathDocumentEvaluator(tree)
    except Exception as exc:  # was a bare except: keep best-effort, but log why
        print('ERROR!')
        print(webpage_url)
        print(exc)
        return
    user_names = xpatheval('//span[@class="memname"]/text()')
    # NOTE(review): answers and time reuse the "memname" xpath — this
    # looks like a copy-paste slip; confirm the intended class names
    # before changing the queries.
    answers = xpatheval('//span[@class="memname"]/text()')
    date = xpatheval('//span[@class="date"]/text()')
    time_values = xpatheval('//span[@class="memname"]/text()')
    subjects = []
    for idx, val in enumerate(answers):
        subjects.append({
            'answer': val,
            'users': user_names[idx],
            'date': date[idx],
            'time': time_values[idx],
        })
    return subjects
def safe_eng_names_to_file():
    """Scrape English given names from behindthename.com and write the
    unique set, ASCII-transliterated via unidecode, to
    ``../files/names_eng.txt``.

    Names of length <= 2 are skipped, matching the original filter.
    """
    num_of_pages = 13
    webpage_url = 'http://www.behindthename.com/names/usage/english'
    names = []
    # NOTE(review): range(1, num_of_pages) fetches pages 1..12; if a
    # page 13 exists it is skipped — confirm whether the upper bound
    # should be inclusive.
    for page in range(1, num_of_pages):
        tree = lxml.html.parse(webpage_url + '/' + str(page))
        xpatheval = etree.XPathDocumentEvaluator(tree)
        names.extend(xpatheval('//div[@class="browsename"]/a[@href]/text()'))
        time.sleep(1)  # throttle requests between pages
    # Drop duplicates; the original's dead `names2 = names` alias removed.
    names = list(set(names))
    # `with` closes the file; the explicit f.close() was redundant.
    with open('../files/names_eng.txt', 'w', encoding='utf-8') as f:
        for name in names:
            if len(name) > 2:
                f.write('%s\n' % unidecode.unidecode(name.strip()))
def dispatch(self, request, path, *args, **kwargs):
    """Route an incoming request to the handler named after its HTTP verb.

    Splits the URL into ``self.path``/``self.base_url``, parses a
    declared XML body into an XPath evaluator passed on as ``xbody``,
    and fills in Allow/Date/Server response headers.

    :param request: incoming HTTP request
    :param path: resource path below the view's mount point
    :return: the handler's response
    """
    if path:
        self.path = path
        # Everything before the resource path is the mount prefix.
        self.base_url = request.META['PATH_INFO'][:-len(self.path)]
    else:
        self.path = '/'
        self.base_url = request.META['PATH_INFO']
    meta = request.META.get
    self.xbody = kwargs['xbody'] = None
    if (request.method.lower() != 'put' and
            "/xml" in meta('CONTENT_TYPE', '') and
            meta('CONTENT_LENGTH', 0) != '' and
            int(meta('CONTENT_LENGTH', 0)) > 0):
        # SECURITY(review): etree.parse on a client-supplied body is
        # vulnerable to XXE/entity-expansion attacks — consider parsing
        # with defusedxml instead.
        self.xbody = kwargs['xbody'] = etree.XPathDocumentEvaluator(
            etree.parse(request, etree.XMLParser(ns_clean=True)),
            namespaces=WEBDAV_NSMAP)
    if request.method.upper() in self._allowed_methods():
        handler = getattr(self, request.method.lower(),
                          self.http_method_not_allowed)
    else:
        handler = self.http_method_not_allowed
    try:
        resp = handler(request, self.path, *args, **kwargs)
    except ResponseException as e:
        resp = e.response
    if 'Allow' not in resp:
        methods = self._allowed_methods()
        if methods:
            resp['Allow'] = ", ".join(methods)
    if 'Date' not in resp:
        resp['Date'] = rfc1123_date(now())
    if self.server_header:
        resp['Server'] = self.server_header
    return resp
news = feedparser.parse("https://news.google.com/news?q=apple&output=rss") print news.feed.title print news.feed.link print news.feed.description print news.feed.published print news.feed.published_parsed print "total ", len(news.entries) for news in news.entries: print "\t", news.title print "\t\t", news.id print "\t\t", news.link # print "\t\t", news.description # print "\t\t", news.summary_detail print "\t\t", news.published print "\t\t", news.published_parsed url = urlparse(news.link) qs = parse_qs(url.query) print "\t\t", qs['url'][0] dom = lxml.html.parse(qs['url'][0]) xpath = etree.XPathDocumentEvaluator(dom) links = xpath("//p/text()") print links # break
except Exception, e: print print " *** ** ** ** ** ** * ***" print " *** ERROR parsing %s" % filename print " *** ** ** ** ** ** * ***" print print " Check the compilation process... " print " Is the file empty?" print " Saxon errors?" print sys.exit(2) else: self.tree = PARSED_TREES[filename] else: self.tree = tree self.xpath_evaluator = etree.XPathDocumentEvaluator(self.tree) # Initialize XPath queries _re_pos_match = """re:match(%(pos)s, $pos, "i")""" % xpaths self.lemmaStartsWith = etree.XPath( ".//e[starts-with(%(pos)s, $lemma)]" % xpaths ) self.lemma = etree.XPath('.//e[lg/l/text() = $lemma]') self.lemmaPOS = etree.XPath( './/e[lg/l/text() = $lemma and ' + _re_pos_match + ']', namespaces={'re': regexpNS})