import re

import lxml.etree
from BSXPath import BSXPathEvaluator, XPathResult  # assumed import path for the BSXPath library


# doWeb() is a method of a translator-style class that also defines detectWeb().
def doWeb(self, doc, url):
    if isinstance(doc, basestring):  # a raw string was passed, not a BeautifulSoup document
        tree = lxml.etree.fromstring(doc, lxml.etree.HTMLParser())
        links = tree.xpath("/html/body/div[1]/div/div[@id='sdBody']/div"
                           "/div[@id='rightCol']/div/div[@id='searchResults']"
                           "/div[@id='bodyMainResults']")
        #print "links = ", links
        document = BSXPathEvaluator(doc)
    else:
        document = doc
    if document.evaluate('//*[contains(@src, "exportarticle_a.gif")]',
                         document, None, XPathResult.ANY_TYPE, None):
        articles = []
        if self.detectWeb(doc, url) == "multiple":  # search page
            items = {}
            if url.count("_ob=PublicationURL") > 0:
                xpath = '//table[@class="resultRow"]/tbody/tr/td[2]/a'
            else:
                xpath = '//div[@class="font3"][@id="bodyMainResults"]/table/tbody/tr/td[2]/a'
            rows = document.evaluate(xpath, document, None, XPathResult.ANY_TYPE, None)
            print rows
            # iterate until the node set is exhausted; BSXPath raises IndexError
            # past the last node (replaces the broken isTrue-flag loop, which
            # processed a stale row once after exhaustion)
            while True:
                try:
                    next_row = rows.iterateNext()
                except IndexError:
                    break
                print next_row.__dict__
                title = "some title here"  # next_row.text
                link = "some href here"    # next_row.href
                if not re.match(r"PDF \(", title) and not re.match("Related Articles", title):
                    items[link] = title
            #items = zotero.SelectItems(items)
            articles.extend(items)  # let's assume we want all of them
            result_sets = [{'article': article} for article in articles]
        else:
            articles = [url]
            result_sets = [{"currentdoc": doc}]  # was misspelled "return_sets", breaking the return below
        if len(articles) == 0:
            print "ERROR: no items were found"
            return
        print "articles = ", articles
        print "result_sets = ", result_sets
        # return all articles or the currentdoc in a dict for stuff that we want to grab
        return result_sets
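# A minimal, self-contained sketch (not part of the original translator) of the
# evaluate()/iterateNext() pattern doWeb() uses above. The HTML string and the
# demo_iterate name are invented for illustration; it assumes the library is
# importable as BSXPath, and it mirrors the original code in treating
# IndexError as the end-of-iteration signal.
from BSXPath import BSXPathEvaluator, XPathResult

def demo_iterate(html='<html><body><a href="#1">one</a><a href="#2">two</a></body></html>'):
    document = BSXPathEvaluator(html)
    rows = document.evaluate('//a', document, None, XPathResult.ANY_TYPE, None)
    while True:
        try:
            node = rows.iterateNext()
        except IndexError:  # node set exhausted
            break
        print node.get('href')  # -> #1, then #2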
import os
import re
import urllib2

from BSXPath import BSXPathEvaluator, XPathResult  # assumed import path for the BSXPath library


# escapeStr(), parseTestData() and testNodes() are helpers defined elsewhere in
# this test script, as are the options, DEFAULT_TESTDIR and url_data globals.
def test():
    global document, options, DEFAULT_TESTDIR, url_data

    def nodesStr(nodes):
        def tagstr(node):
            try:
                strs = ['<' + node.name]
                i = node.get('id')
                c = node.get('class')
                if i:
                    strs.append('id=' + i)
                if c:
                    strs.append('class=' + c)
                return escapeStr(' '.join(strs) + '>')
            except:
                return escapeStr(unicode(node))

        if isinstance(nodes, list):
            return ' '.join([tagstr(node) for node in nodes])
        elif getattr(nodes, 'nodeType', None) or isinstance(nodes, basestring):
            return escapeStr(unicode(nodes))
        else:
            return nodes

    if options.web:
        fp = urllib2.urlopen(url_data)
        dirdoc = BSXPathEvaluator(fp.read())
        files = [node.get('href') for node in dirdoc.getItemList('//li/a[@href!="../"]')]
    else:
        testdir = options.path if options.path else DEFAULT_TESTDIR
        files = os.listdir(testdir)
    tnames = ','.join(options.names).split(',') if options.names else None
    tnumbers = ','.join(options.numbers).split(',') if options.numbers else None
    for name in files:
        if tnames:
            fname = re.sub(r'\..*$', '', name)
            if fname not in tnames:
                continue
        target = url_data + '/' + name if options.web else os.path.join(testdir, name)
        data = parseTestData(target, options.web)
        print '[%s]\n%s\n' % (name, data.comment)
        document = BSXPathEvaluator(data.html)
        context = document.evaluate(data.contextExpr, document, None,
                                    XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
                                    None).snapshotItem(0)
        tests = data.tests
        cnt = 0
        for test in tests:
            cnt = cnt + 1
            if tnumbers and str(cnt) not in tnumbers:
                continue
            print u'No.%d' % cnt
            expr = test.expr
            print u'expr : %s' % expr
            (nodes, time, resultType) = document.applyXPath(context, expr)
            print u'time : %d.%06d sec' % (time.seconds, time.microseconds)
            print u'result: %s' % nodesStr(nodes)
            print u'expect: %s' % test.data
            judge = testNodes(nodes, test.data)
            print u'judge : %s (%s)' % (judge.status, judge.detail)
            print u''
        print u''
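# A minimal sketch (not from the test harness) of the snapshot-style evaluation
# test() relies on: ORDERED_NODE_SNAPSHOT_TYPE materialises the matches in
# document order and snapshotItem(0) picks the context node. The HTML string
# and the demo_snapshot name are invented for illustration.
from BSXPath import BSXPathEvaluator, XPathResult

def demo_snapshot():
    document = BSXPathEvaluator('<html><body><p id="a">x</p><p id="b">y</p></body></html>')
    result = document.evaluate('//p', document, None,
                               XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, None)
    context = result.snapshotItem(0)  # first match in document order
    print context.get('id')  # -> a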
import json
import re
import socket
import sys
import time
from urllib2 import URLError

from BeautifulSoup import BeautifulSoup
from BSXPath import BSXPathEvaluator, XPathResult  # assumed import path for the BSXPath library

# usr, debug, browser, logger, stages and UnicodeWriter are module-level
# objects defined elsewhere in this script.


def collectFriendsEmails():
    """collectFriendsEmails()
    uses official facebook api to get list of friends
    uses list of friends to manually access info page of each
    saves each contact information in csv
    """
    global usr, debug, browser
    startTime = time.time()  # save current time for calculation of elapsed time
    logger.info("%s launching CONTACT-DATA COLLECTION" % stages[2])
    try:  # get access token
        res = browser.open('http://developers.facebook.com/docs/reference/api')
        html = res.read()
        if debug:
            print "%s fetching access token..." % stages[2]
            open('referenceAPI', 'w').write(BeautifulSoup(html).prettify())
        match = re.search('access_token=(.*?)"', html)
        acc = match.group(1)
        if debug:
            print 'access token: ' + acc
        # get friends
        res = browser.open('https://graph.facebook.com/me/friends?access_token=%s' % acc)
        html = res.read()
        friends = json.loads(html)
    except Exception as e:
        logger.error("%s could not get list of friends. Are you executing multiple instances with these credentials?: %s" % (stages[2], str(e)))
        if debug:
            print sys.exc_info()
        return
    # create csv writer
    f = open('%s.csv' % usr, 'ab')
    writer = UnicodeWriter(f)
    #writer = csv.writer(open('%s.csv' % usr, 'ab'), delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for acc in friends['data']:  # for each dataset in JSON data
        friend_id = acc['id']
        friend_name = acc['name']
        # open profile url
        try:
            res = browser.open('http://m.facebook.com/profile.php?id=%s&v=info&refid=17' % friend_id, timeout=4.0)
            html = res.read()
            document = BSXPathEvaluator(html)
            resume = True
            i = 1
            contact_infos = [friend_id, friend_name]
            while resume:  # while further contact data available
                # look for line in table of contact details and extract contact detail
                result = document.evaluate('//div[@id="contact"]//table//tr[%d]' % i,
                                           document, None, XPathResult.STRING_TYPE, None)
                contact_info = result.stringValue
                i += 1
                if len(contact_info) == 0:
                    resume = False
                else:
                    contact_info = contact_info.replace('&#64;', '@')  # replace html character code
                    contact_info = contact_info.replace('%40', '@')  # replace url encoding
                    if 'Website' not in contact_info:
                        contact_infos.append(contact_info)  # append contact info to list of infos
            logger.info(contact_infos)
            writer.writerow(contact_infos)  # write to csv
        except URLError as e:
            logger.error('%s a URL TIMEOUT occurred while fetching data for %s: %s' % (stages[2], friend_name, str(e)))
        except socket.error as e:
            logger.error('%s a SOCKET ERROR occurred while fetching data for %s: %s' % (stages[2], friend_name, str(e)))
        except:
            logger.error('%s an error occurred while fetching data for %s: %s' % (stages[2], friend_name, sys.exc_info()))
    endTime = time.time()  # set end time for calculation of 'time elapsed'
    logger.info('%s fetched data of %d friends in %d seconds' % (stages[2], len(friends['data']), endTime - startTime))
    logger.info('%s saved collection of contact data in %s.csv!\nprogram will exit when crawling is finished...' % (stages[2], usr))
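# A minimal sketch (not from the crawler) of the STRING_TYPE lookup used above:
# under XPath 1.0 semantics the node set is converted to the string value of
# its first node, so result.stringValue carries the row text and an
# out-of-range row index yields an empty string, which is the loop's stop
# condition. The HTML string and the demo_string_value name are invented.
from BSXPath import BSXPathEvaluator, XPathResult

def demo_string_value():
    document = BSXPathEvaluator(
        '<div id="contact"><table><tr><td>Email: someone@example.com</td></tr></table></div>')
    result = document.evaluate('//div[@id="contact"]//table//tr[1]',
                               document, None, XPathResult.STRING_TYPE, None)
    print result.stringValue  # -> Email: someone@example.com
    empty = document.evaluate('//div[@id="contact"]//table//tr[2]',
                              document, None, XPathResult.STRING_TYPE, None)
    print len(empty.stringValue)  # -> 0, no such row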