import os
import re
import urllib2

from BSXPath import BSXPathEvaluator, XPathResult


def doWeb(self, doc, url):
    if isinstance(doc, basestring):  # raw HTML string, not a BeautifulSoup tree
        # An earlier lxml.etree probe of the results list, left disabled:
        # tree = lxml.etree.fromstring(doc, lxml.etree.HTMLParser())
        # links = tree.xpath(
        #     "/html/body/div[1]/div/div[@id='sdBody']/div/div[@id='rightCol']"
        #     "/div/div[@id='searchResults']/div[@id='bodyMainResults']")
        document = BSXPathEvaluator(doc)
    else:
        document = doc
    # Only pages that carry the "export article" button are worth scraping.
    if document.evaluate('//*[contains(@src, "exportarticle_a.gif")]',
                         document, None, XPathResult.ANY_TYPE, None):
        articles = []
        result_sets = []
        if self.detectWeb(doc, url) == "multiple":
            # Search-results page: walk every result row, collecting
            # title/link pairs and skipping the PDF and related-article links.
            items = {}
            if url.count("_ob=PublicationURL") > 0:
                xpath = '//table[@class="resultRow"]/tbody/tr/td[2]/a'
            else:
                xpath = ('//div[@class="font3"][@id="bodyMainResults"]'
                         '/table/tbody/tr/td[2]/a')
            rows = document.evaluate(xpath, document, None,
                                     XPathResult.ANY_TYPE, None)
            while True:
                try:
                    next_row = rows.iterateNext()
                except IndexError:
                    break
                title = next_row.text
                link = next_row.get('href')
                if (not re.match(r"PDF \(", title)
                        and not re.match("Related Articles", title)):
                    items[link] = title
            # items = zotero.SelectItems(items)
            # Let's assume we want all of them.
            articles.extend(items)
            for article in articles:
                result_sets.append({'article': article})
        else:
            articles = [url]
            result_sets = [{"currentdoc": doc}]
        if len(articles) == 0:
            print "ERROR: no items were found"
            return
        print "articles = ", articles
        print "result_sets = ", result_sets
        # Return all articles, or the current doc, in dicts for later scraping.
        return result_sets
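
# A minimal sketch of the evaluate()/iterateNext() pattern doWeb relies on,
# run against inline HTML. The markup and the //a expression are invented
# for illustration, and catching IndexError at exhaustion merely mirrors
# what doWeb assumes about BSXPath; treat this as a sketch, not a spec.
def _demo_iterate():
    html = ('<html><body>'
            '<a href="/a1">First hit</a>'
            '<a href="/a2">Second hit</a>'
            '</body></html>')
    demo_doc = BSXPathEvaluator(html)
    result = demo_doc.evaluate('//a', demo_doc, None,
                               XPathResult.ANY_TYPE, None)
    while True:
        try:
            node = result.iterateNext()
        except IndexError:
            break
        # .get('href') and .text are the same accessors the scraper uses.
        print node.get('href'), node.text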
def detectWeb(self, doc, url):
    if isinstance(doc, basestring):
        doc = BSXPathEvaluator(doc)
    # Compare the <title> tag's string; a Tag never equals a plain str.
    if url.count("_ob=DownloadURL") != 0 or (
            doc.title and doc.title.string == "ScienceDirect Login"):
        return False
    # re.search, not re.match: "pdf" can appear anywhere in the URL.
    if ((not re.search("pdf", url) and url.count("_ob=ArticleURL") == 0
            and url.count("/article/") == 0) or url.count("/journal/") != 0):
        return "multiple"
    elif not re.search("pdf", url):
        return "journalArticle"
    return False
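
# detectWeb never touches self, so it can be smoke-tested directly when the
# functions sit at module level as shown here. The URLs are hypothetical
# ScienceDirect-style addresses chosen to hit each branch; the expected
# values follow from the checks above.
def _demo_detect():
    html = '<html><head><title>An article</title></head><body></body></html>'
    print detectWeb(None, html, 'http://www.example.com/science?_ob=ArticleURL&key=1')
    # expected: 'journalArticle'
    print detectWeb(None, html, 'http://www.example.com/science/journal/01234567')
    # expected: 'multiple'
    print detectWeb(None, html, 'http://www.example.com/science?_ob=DownloadURL')
    # expected: False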

def test():
    global document, options, DEFAULT_TESTDIR, url_data

    def nodesStr(nodes):
        def tagstr(node):
            try:
                strs = ['<' + node.name]
                i = node.get('id')
                c = node.get('class')
                if i:
                    strs.append('id=' + i)
                if c:
                    strs.append('class=' + c)
                return escapeStr(' '.join(strs) + '>')
            except:
                return escapeStr(unicode(node))

        if isinstance(nodes, list):
            return ' '.join([tagstr(node) for node in nodes])
        elif getattr(nodes, 'nodeType', None) or isinstance(nodes, basestring):
            return escapeStr(unicode(nodes))
        else:
            return nodes

    if options.web:
        # Read the list of test files from a web directory listing.
        fp = urllib2.urlopen(url_data)
        dirdoc = BSXPathEvaluator(fp.read())
        files = map(lambda node: node.get('href'),
                    dirdoc.getItemList('//li/a[@href!="../"]'))
    else:
        if options.path:
            testdir = options.path
        else:
            testdir = DEFAULT_TESTDIR
        files = os.listdir(testdir)

    tnames = ','.join(options.names).split(',') if options.names else None
    tnumbers = ','.join(options.numbers).split(',') if options.numbers else None

    for name in files:
        if tnames:
            fname = re.sub(r'\..*$', '', name)
            if fname not in tnames:
                continue
        target = (url_data + '/' + name if options.web
                  else os.path.join(testdir, name))
        data = parseTestData(target, options.web)
        print '[%s]\n%s\n' % (name, data.comment)

        document = BSXPathEvaluator(data.html)
        context = document.evaluate(
            data.contextExpr, document, None,
            XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, None).snapshotItem(0)

        tests = data.tests
        cnt = 0
        for test in tests:
            cnt = cnt + 1
            if tnumbers:
                if not str(cnt) in tnumbers:
                    continue
            print u'No.%d' % cnt
            expr = test.expr
            print u'expr : %s' % (expr)
            (nodes, time, resultType) = document.applyXPath(context, expr)
            print u'time : %d.%06d sec' % (time.seconds, time.microseconds)
            print u'result: %s' % nodesStr(nodes)
            print u'expect: %s' % (test.data)
            judge = testNodes(nodes, test.data)
            print u'judge : %s (%s)' % (judge.status, judge.detail)
            print u''
        print u''
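
# One plausible wiring for the globals test() consumes (options, url_data,
# DEFAULT_TESTDIR). The option names mirror the attributes read above
# (web, path, names, numbers), but the real definitions live elsewhere in
# this script, so this is an illustrative guess rather than the original
# setup.
from optparse import OptionParser

DEFAULT_TESTDIR = 'data'                 # assumed local test-data directory
url_data = 'http://example.com/data'     # assumed base URL for --web mode

parser = OptionParser()
parser.add_option('-w', '--web', action='store_true', default=False,
                  help='read test data from url_data instead of a local dir')
parser.add_option('-p', '--path', help='local test-data directory')
parser.add_option('-n', '--names', action='append', default=None,
                  help='comma-separated test-file names to run')
parser.add_option('-N', '--numbers', action='append', default=None,
                  help='comma-separated test numbers to run')
(options, args) = parser.parse_args()

if __name__ == '__main__':
    test()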