示例#1
0
def get_threads(url):
    """Collect the thread links listed under a forum section URL.

    The page at ``url`` is parsed first to read the page count from the
    "Strona 1 z N" pager link; when that link is absent a single page is
    assumed. Each ``url + '/page<i>'`` page is then parsed, and every anchor
    whose class contains ``title`` and whose href does not contain ``http``
    contributes one ``{'href': ..., 'title': ...}`` entry.

    Returns the values view of a dict keyed by href, so entries that share
    an href collapse into the one seen last.
    """
    pager_prefix = 'Strona 1 z '

    first_page = etree.XPathDocumentEvaluator(lxml.html.parse(url))
    pager_texts = first_page(
        '//a[@class="popupctrl" and contains(text(),"Strona 1 z")]/text()')
    if pager_texts:
        # The text after the prefix is the total number of pages.
        page_count = int(pager_texts[0][len(pager_prefix):].strip())
    else:
        page_count = 1

    by_href = {}
    for page_no in range(1, page_count + 1):
        page = etree.XPathDocumentEvaluator(
            lxml.html.parse(url + '/page' + str(page_no)))
        titles = page(
            "//a[contains(@class,'title') and not( contains(@href,'http'))]/text()"
        )
        hrefs = page(
            "//a[contains(@class,'title') and not( contains(@href,'http'))]/@href"
        )
        # Titles and hrefs are paired by position, so they must line up.
        assert len(titles) == len(hrefs)
        for title, href in zip(titles, hrefs):
            by_href[href] = {'href': href, 'title': title}

    print('loading thread from ' + url + ' done. ')
    return by_href.values()
示例#2
0
 def test_propfind_all_names(self):
     """A ``propname`` PROPFIND on a sub-object answers 207 with its property names."""
     # Make the sub-object appear among its own descendants.
     self.sub_object.get_descendants.return_value += [self.sub_object]

     path = 'collection/sub_object'
     request = Mock(META={})
     view = DavView(
         base_url='/base/',
         path=path,
         request=request,
         acl_class=FullAcl,
         xml_pretty_print=True,
     )
     # Inject the resource directly into the instance dict.
     view.__dict__['resource'] = self.sub_object

     # Request body: <propfind><propname/></propfind>
     propfind_body = etree.XPathDocumentEvaluator(
         ElementTree(D.propfind(D.propname())),
         namespaces=WEBDAV_NSMAP,
     )
     resp = view.propfind(request, path, propfind_body)

     self.assertEqual(resp.status_code, 207)

     expected_props = D.prop(
         D.getcontentlength(),
         D.creationdate(),
         D.getlastmodified(),
         D.resourcetype(),
         D.displayname(),
     )
     expected_tree = D.multistatus(
         D.response(
             D.href('/base/collection/sub_object'),
             D.propstat(expected_props, D.status("HTTP/1.1 200 OK")),
         ),
     )
     expected_xml = etree.tostring(
         expected_tree,
         pretty_print=True,
         xml_declaration=True,
         encoding='utf-8',
     )
     self.assertEqual(resp.content, expected_xml)
示例#3
0
    def __init__(self, *args, **kwargs):
        """Run the parent initializer, then override two attributes.

        All positional and keyword arguments are forwarded unchanged to the
        parent class. Afterwards ``xpath_evaluator`` is rebuilt from
        ``self.tree`` and ``lemma`` is replaced by the query prepared from
        ``self.lemma_match_query``.
        """

        # NOTE(review): presumably the parent initializer sets self.tree;
        # confirm against the parent class.
        super(CustomLookupType, self).__init__(*args, **kwargs)

        # Document-level XPath evaluator over the tree.
        self.xpath_evaluator = etree.XPathDocumentEvaluator(self.tree)
        # Kept after the evaluator assignment: prepare_xpath is not visible
        # here and may rely on it.
        self.lemma = self.prepare_xpath(self.lemma_match_query)
示例#4
0
文件: views.py 项目: arcli/djangodav
    def dispatch(self, request, path, *args, **kwargs):
        """
        Basic dispatch handler for all requests coming to the view.

        Stores the requested path, the base URL and the user on the view,
        parses an XML request body (if there is one) into an XPath evaluator
        that is handed to the handler as ``xbody``, calls the handler for the
        HTTP method and finally fills in the ``Allow``, ``Date`` and
        ``Server`` response headers.

        :param request: the incoming request
        :param path: resource path; when falsy, ``'/'`` is used
        :param args: extra positional arguments forwarded to the handler
        :param kwargs: extra keyword arguments forwarded to the handler;
            ``xbody`` is always (re)set here
        :return: the handler's response; the response carried by a
            ``ResponseException``; ``HttpResponseForbidden`` on
            ``PermissionDenied``; ``HttpResponseBadRequest`` on
            ``ValidationError``
        """
        if path:
            self.path = path
            # Everything in PATH_INFO before the resource path is the base.
            self.base_url = request.META['PATH_INFO'][:-len(self.path)]
        else:
            self.path = '/'
            self.base_url = request.META['PATH_INFO']

        self.user = request.user

        meta = request.META.get
        self.xbody = kwargs['xbody'] = None
        # PUT bodies are never parsed here, even when they are XML.
        if (request.method.lower() != 'put'
                and "/xml" in meta('CONTENT_TYPE', '')
                and meta('CONTENT_LENGTH', 0) != ''
                and int(meta('CONTENT_LENGTH', 0)) > 0):

            # parse XML using defusedxml's parse function.
            # The body is untrusted client input, so entity resolution is
            # switched off to avoid XXE / entity-expansion attacks.
            self.xbody = kwargs['xbody'] = etree.XPathDocumentEvaluator(
                parse(request,
                      etree.XMLParser(ns_clean=True, resolve_entities=False)),
                namespaces=WEBDAV_NSMAP)

        if request.method.upper() in self._allowed_methods():
            handler = getattr(self, request.method.lower(),
                              self.http_method_not_allowed)
        else:
            handler = self.http_method_not_allowed
        try:
            resp = handler(request, self.path, *args, **kwargs)
        except ResponseException as e:
            print(e)
            resp = e.response
        except PermissionDenied as pe:
            print(pe)
            resp = HttpResponseForbidden()
        except ValidationError as ve:
            print(ve)
            resp = HttpResponseBadRequest()

        # Only fill in headers the handler did not set itself.
        if 'Allow' not in resp:
            methods = self._allowed_methods()
            if methods:
                resp['Allow'] = ", ".join(methods)
        if 'Date' not in resp:
            resp['Date'] = rfc1123_date(now())
        if self.server_header:
            resp['Server'] = self.server_header
        return resp
示例#5
0
def safe_names_to_file():
    """Download the name index from ksiegaimion.com and save it to a file.

    The text of every link inside the ``div`` with class ``table`` is
    stripped, transliterated with ``unidecode`` and written on its own line
    of ``../files/names_pl.txt``.
    """
    webpage_url = 'http://www.ksiegaimion.com/spis-imion'
    tree = lxml.html.parse(webpage_url)
    xpatheval = etree.XPathDocumentEvaluator(tree)
    names = xpatheval('//div[@class="table"]//a/text()')
    # Pause kept from the original after the single page request.
    time.sleep(1)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('../files/names_pl.txt', 'w') as f:
        for name in names:
            f.write('%s\n' % unidecode.unidecode(name.strip()))
示例#6
0
def safe_surnames_to_file():
    """Download surnames from genealogiapolska.pl and save them to a file.

    One page is requested per letter in ``first_letter_string`` (upper-cased
    in the URL), with a one-second pause after each request. Link texts from
    the ``sntable`` table that are longer than two characters are stripped,
    transliterated with ``unidecode`` and written one per line to
    ``../files/surnames_pl.txt``.
    """
    first_letter_string = 'wertuiopasdfghjklzcbnm'
    webpage_url = 'http://genealogiapolska.pl/surnames-oneletter.php?firstchar='
    surnames = []
    for letter in first_letter_string:
        tree = lxml.html.parse(webpage_url + letter.upper())
        xpatheval = etree.XPathDocumentEvaluator(tree)
        surnames.extend(xpatheval('//table[@class="sntable"]//a[@href]/text()'))
        time.sleep(1)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('../files/surnames_pl.txt', 'w', encoding='utf-8') as f:
        for surname in surnames:
            # The length filter is applied to the raw text, before stripping.
            if len(surname) > 2:
                f.write('%s\n' % unidecode.unidecode(surname.strip()))
示例#7
0
def split_subjects_xpath(webpage_url):
    """Fill the ``subjects`` entries with data scraped from ``webpage_url``.

    Parses the page and, for each value returned by the "answers" query,
    stores ``answer``, ``users``, ``date`` and ``time`` on the entry of
    ``subjects`` at the same index.

    Returns ``subjects``, or ``None`` (after printing the URL) when the page
    cannot be loaded or parsed.
    """
    try:
        tree = lxml.html.parse(webpage_url)
        xpatheval = etree.XPathDocumentEvaluator(tree)
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate.
        print('ERROR!')
        print(webpage_url)
        return
    # NOTE(review): three of the four queries below use the same "memname"
    # XPath; this looks like copy-paste. Confirm the intended selectors.
    user_names = xpatheval('//span[@class="memname"]/text()')
    answers = xpatheval('//span[@class="memname"]/text()')
    date = xpatheval('//span[@class="date"]/text()')
    post_times = xpatheval('//span[@class="memname"]/text()')

    # NOTE(review): ``subjects`` and ``users`` are not defined in this
    # function and must come from an enclosing scope. ``user_names`` above is
    # never used; presumably ``users[idx]`` was meant to read from it. Verify.
    for idx, val in enumerate(answers):
        subjects[idx]['answer'] = val
        subjects[idx]['users'] = users[idx]
        subjects[idx]['date'] = date[idx]
        subjects[idx]['time'] = post_times[idx]
    return subjects
示例#8
0
def safe_eng_names_to_file():
    """Download English names from behindthename.com and save them to a file.

    Requests the numbered listing pages with a one-second pause after each,
    collects the link texts from the ``browsename`` divs and de-duplicates
    them. Names longer than two characters are stripped, transliterated with
    ``unidecode`` and written one per line to ``../files/names_eng.txt``.
    """
    num_of_pages = 13
    webpage_url = 'http://www.behindthename.com/names/usage/english'
    names = []
    # NOTE(review): range(1, num_of_pages) visits pages 1..12, not 13.
    # Confirm whether the last page is skipped on purpose.
    for page in range(1, num_of_pages):
        page_url = webpage_url + '/' + str(page)

        tree = lxml.html.parse(page_url)
        xpatheval = etree.XPathDocumentEvaluator(tree)
        names.extend(xpatheval('//div[@class="browsename"]/a[@href]/text()'))
        time.sleep(1)
    # De-duplicate; the resulting order is whatever the set yields.
    names = list(set(names))
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('../files/names_eng.txt', 'w', encoding='utf-8') as f:
        for name in names:
            if len(name) > 2:
                f.write('%s\n' % unidecode.unidecode(name.strip()))
示例#9
0
文件: views.py 项目: TZanke/djangodav
    def dispatch(self, request, path, *args, **kwargs):
        """Route a request to the handler for its HTTP method.

        Stores the requested path and the base URL on the view, parses an
        XML request body (if there is one) into an XPath evaluator that is
        handed to the handler as ``xbody``, calls the handler and finally
        fills in the ``Allow``, ``Date`` and ``Server`` response headers.

        :param request: the incoming request
        :param path: resource path; when falsy, ``'/'`` is used
        :param args: extra positional arguments forwarded to the handler
        :param kwargs: extra keyword arguments forwarded to the handler;
            ``xbody`` is always (re)set here
        :return: the handler's response, or the response carried by a
            ``ResponseException`` raised by the handler
        """
        if path:
            self.path = path
            # Everything in PATH_INFO before the resource path is the base.
            self.base_url = request.META['PATH_INFO'][:-len(self.path)]
        else:
            self.path = '/'
            self.base_url = request.META['PATH_INFO']

        meta = request.META.get
        self.xbody = kwargs['xbody'] = None
        # PUT bodies are never parsed here, even when they are XML.
        if (request.method.lower() != 'put'
                and "/xml" in meta('CONTENT_TYPE', '')
                and meta('CONTENT_LENGTH', 0) != ''
                and int(meta('CONTENT_LENGTH', 0)) > 0):
            # The body is untrusted client input, so entity resolution is
            # switched off to avoid XXE / entity-expansion attacks.
            self.xbody = kwargs['xbody'] = etree.XPathDocumentEvaluator(
                etree.parse(request, etree.XMLParser(ns_clean=True,
                                                     resolve_entities=False)),
                namespaces=WEBDAV_NSMAP)

        if request.method.upper() in self._allowed_methods():
            handler = getattr(self, request.method.lower(),
                              self.http_method_not_allowed)
        else:
            handler = self.http_method_not_allowed
        try:
            resp = handler(request, self.path, *args, **kwargs)
        except ResponseException as e:
            resp = e.response
        # Only fill in headers the handler did not set itself.
        if 'Allow' not in resp:
            methods = self._allowed_methods()
            if methods:
                resp['Allow'] = ", ".join(methods)
        if 'Date' not in resp:
            resp['Date'] = rfc1123_date(now())
        if self.server_header:
            resp['Server'] = self.server_header
        return resp
示例#10
0
news = feedparser.parse("https://news.google.com/news?q=apple&output=rss")

print news.feed.title
print news.feed.link
print news.feed.description
print news.feed.published
print news.feed.published_parsed

print "total ", len(news.entries)

for news in news.entries:
    print "\t", news.title
    print "\t\t", news.id
    print "\t\t", news.link
    # print "\t\t", news.description
    # print "\t\t", news.summary_detail
    print "\t\t", news.published
    print "\t\t", news.published_parsed

    url = urlparse(news.link)
    qs = parse_qs(url.query)
    print "\t\t", qs['url'][0]

    dom = lxml.html.parse(qs['url'][0])
    xpath = etree.XPathDocumentEvaluator(dom)

    links = xpath("//p/text()")

    print links

    # break
示例#11
0
                except Exception, e:
                    print
                    print " *** ** ** ** ** ** * ***"
                    print " *** ERROR parsing %s" % filename
                    print " *** ** ** ** ** ** * ***"
                    print
                    print " Check the compilation process... "
                    print " Is the file empty?"
                    print " Saxon errors?"
                    print
                    sys.exit(2)
            else:
                self.tree = PARSED_TREES[filename]
        else:
            self.tree = tree
        self.xpath_evaluator = etree.XPathDocumentEvaluator(self.tree)

        # Initialize XPath queries

        _re_pos_match = """re:match(%(pos)s, $pos, "i")""" % xpaths


        self.lemmaStartsWith = etree.XPath(
            ".//e[starts-with(%(pos)s, $lemma)]" % xpaths
        )

        self.lemma = etree.XPath('.//e[lg/l/text() = $lemma]')

        self.lemmaPOS = etree.XPath(
            './/e[lg/l/text() = $lemma and ' + _re_pos_match + ']',
            namespaces={'re': regexpNS})