Example #1
 def doWeb(self, doc, url):
     # requires at module level: import re, lxml.etree
     # and: from BSXPath import BSXPathEvaluator, XPathResult
     if isinstance(doc, basestring):  # raw HTML string, not a BeautifulSoup document
         # lxml parse left in from debugging; `links` is never used below
         tree = lxml.etree.fromstring(doc, lxml.etree.HTMLParser())
         links = tree.xpath(
             "/html/body/div[1]/div/div[@id='sdBody']/div/div[@id='rightCol']"
             "/div/div[@id='searchResults']/div[@id='bodyMainResults']")
         document = BSXPathEvaluator(doc)
     else:
         document = doc
     result_sets = []
     if document.evaluate('//*[contains(@src, "exportarticle_a.gif")]',
                          document, None, XPathResult.ANY_TYPE, None):
         articles = []
         if self.detectWeb(doc, url) == "multiple":
             # search page: collect one link per result row
             items = {}
             if url.count("_ob=PublicationURL") > 0:
                 xpath = '//table[@class="resultRow"]/tbody/tr/td[2]/a'
             else:
                 xpath = '//div[@class="font3"][@id="bodyMainResults"]/table/tbody/tr/td[2]/a'
             rows = document.evaluate(xpath, document, None,
                                      XPathResult.ANY_TYPE, None)
             while True:
                 try:
                     next_row = rows.iterateNext()
                 except IndexError:
                     break  # iterator exhausted
                 title = next_row.text  # was the placeholder "some title here"
                 link = next_row.get('href')  # was the placeholder "some href here"
                 if not re.match(r"PDF \(", title) and not re.match(
                         "Related Articles", title):
                     items[link] = title
             # items = zotero.SelectItems(items)
             # let's assume we want all of them
             articles.extend(items)
             result_sets = [{'article': article} for article in articles]
         else:
             articles = [url]
             result_sets = [{"currentdoc": doc}]  # was `return_sets`, a NameError at return
         if len(articles) == 0:
             print "ERROR: no items were found"
             return
         print "articles = ", articles
         print "result_sets = ", result_sets
     # all articles, or the current doc, in dicts describing what to grab
     return result_sets
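
For reference, a minimal sketch of the evaluate()/iterateNext() pattern doWeb relies on. The markup is a made-up stand-in for a ScienceDirect result list, and the IndexError-as-exhaustion convention is taken from the loop above:

# Sketch only: made-up markup, same consumption pattern as doWeb above.
# Assumes: from BSXPath import BSXPathEvaluator, XPathResult
html = """<html><body><div class="font3" id="bodyMainResults"><table><tbody>
<tr><td>1.</td><td><a href="/article/1">First result</a></td></tr>
<tr><td>2.</td><td><a href="/article/2">Second result</a></td></tr>
</tbody></table></div></body></html>"""
document = BSXPathEvaluator(html)
rows = document.evaluate(
    '//div[@id="bodyMainResults"]/table/tbody/tr/td[2]/a',
    document, None, XPathResult.ANY_TYPE, None)
items = {}
while True:
    try:
        node = rows.iterateNext()
    except IndexError:  # doWeb treats IndexError as end-of-iteration
        break
    items[node.get('href')] = node.string
print items  # hrefs mapped to link titles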
Example #2
 def detectWeb(self, doc, url):
     if isinstance(doc, basestring):
         doc = BSXPathEvaluator(doc)
     # export/download pages and the login page carry nothing to save
     if url.count("_ob=DownloadURL") != 0 or doc.title == "ScienceDirect Login":
         return False
     # re.search, not re.match: "pdf" can appear anywhere in the URL, while
     # re.match only anchors at the start and so never fires on a full URL
     if ((not re.search("pdf", url)) and url.count("_ob=ArticleURL") == 0 and
             url.count("/article/") == 0) or url.count("/journal/") != 0:
         return "multiple"
     elif not re.search("pdf", url):
         return "journalArticle"
     return False
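
Taken together, detectWeb classifies a page and doWeb scrapes it. A sketch of that flow, where the translator class name and the URL are hypothetical stand-ins:

# Hypothetical driver: ScienceDirectTranslator is a stand-in name for the
# class holding the detectWeb/doWeb methods above; assumes import urllib2.
url = "http://www.sciencedirect.com/science?_ob=ArticleURL&..."
html = urllib2.urlopen(url).read()
translator = ScienceDirectTranslator()
kind = translator.detectWeb(html, url)  # False, "multiple", or "journalArticle"
if kind:
    result_sets = translator.doWeb(html, url)
    # [{'article': url}, ...] for a search page, or [{'currentdoc': html}]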
Example #3
def test():
    global document, options, DEFAULT_TESTDIR, url_data

    def nodesStr(nodes):
        def tagstr(node):
            try:
                strs = ['<' + node.name]
                i = node.get('id')
                c = node.get('class')
                if i:
                    strs.append('id=' + i)
                if c:
                    strs.append('class=' + c)
                return escapeStr(' '.join(strs) + '>')
            except Exception:  # not a Tag; fall back to its string form
                return escapeStr(unicode(node))

        if isinstance(nodes, list):
            return ' '.join([tagstr(node) for node in nodes])
        elif getattr(nodes, 'nodeType', None) or isinstance(nodes, basestring):
            return escapeStr(unicode(nodes))
        else:
            return nodes

    if options.web:
        fp = urllib2.urlopen(url_data)
        dirdoc = BSXPathEvaluator(fp.read())
        files = [node.get('href')
                 for node in dirdoc.getItemList('//li/a[@href!="../"]')]
    else:
        if options.path:
            testdir = options.path
        else:
            testdir = DEFAULT_TESTDIR
        files = os.listdir(testdir)

    tnames = ','.join(options.names).split(',') if options.names else None
    tnumbers = ','.join(options.numbers).split(',') if options.numbers else None
    for name in files:
        if tnames:
            fname = re.sub(r'\..*$', '', name)
            if fname not in tnames: continue
        target = url_data + '/' + name if options.web else os.path.join(
            testdir, name)
        data = parseTestData(target, options.web)
        print '[%s]\n%s\n' % (name, data.comment)
        document = BSXPathEvaluator(data.html)
        context = document.evaluate(data.contextExpr, document, None,
                                    XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
                                    None).snapshotItem(0)
        tests = data.tests
        for cnt, test in enumerate(tests, 1):
            if tnumbers and str(cnt) not in tnumbers:
                continue
            print u'No.%d' % cnt
            expr = test.expr
            print u'expr  : %s' % (expr)

            # `elapsed` (was `time`) avoids shadowing the stdlib time module
            (nodes, elapsed, resultType) = document.applyXPath(context, expr)

            print u'time  : %d.%06d sec' % (elapsed.seconds, elapsed.microseconds)
            print u'result: %s' % nodesStr(nodes)
            print u'expect: %s' % (test.data)

            judge = testNodes(nodes, test.data)

            print u'judge : %s (%s)' % (judge.status, judge.detail)
            print u''

        print u''
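
test() above reads only snapshotItem(0) to pick its context node. A minimal sketch of walking a full snapshot, assuming BSXPath mirrors the DOM XPathResult interface (snapshotLength is assumed from that interface, not confirmed in this file):

# Sketch only: snapshotLength is an assumption borrowed from the DOM
# XPathResult interface that BSXPath mirrors.
document = BSXPathEvaluator('<html><body><ul><li>a</li><li>b</li></ul></body></html>')
result = document.evaluate('//li', document, None,
                           XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, None)
for i in range(result.snapshotLength):
    print result.snapshotItem(i)  # each <li> node in document order
# getItemList, used in the options.web branch above, is the list-returning
# shorthand for the same query:
# document.getItemList('//li')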