Example #1
 def doWeb(self, doc, url):
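     """Collect article links from a search/results page (the XPaths target a
     ScienceDirect-style layout: ids such as 'sdBody' and 'bodyMainResults').

     Returns result_sets: {'article': url} dicts when the page lists multiple
     results, or a single {'currentdoc': doc} entry otherwise. Assumes re,
     lxml.etree, BSXPathEvaluator and XPathResult are imported at module level.
     """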
     if isinstance(doc, basestring):  # raw HTML string rather than a parsed BeautifulSoup document
         tree = lxml.etree.fromstring(doc, lxml.etree.HTMLParser())
         links = tree.xpath(
             "/html/body/div[1]/div/div[@id='sdBody']/div/div[@id='rightCol']/div/div[@id='searchResults']/div[@id='bodyMainResults']"
         )
         #print "links = ", links
         #for each in links:
         #    print type(links[0])
         document = BSXPathEvaluator(doc)
     else:
         document = doc
     result_sets = []  # defined up front so the final return always has something to hand back
     if document.evaluate("//*[contains(@src, \"exportarticle_a.gif\")]",
                          document, None, XPathResult.ANY_TYPE, None):
         articles = []
         if (self.detectWeb(doc, url) == "multiple"):
             #search page
             items = {}
             xpath = None
             if (url.count("_ob=PublicationURL") > 0):
                 xpath = '//table[@class="resultRow"]/tbody/tr/td[2]/a'
             else:
                 xpath = '//div[@class="font3"][@id="bodyMainResults"]/table/tbody/tr/td[2]/a'
             rows = document.evaluate(xpath, document, None,
                                      XPathResult.ANY_TYPE, None)
             print rows
             # walk the XPath result set until it is exhausted
             while True:
                 try:
                     next_row = rows.iterateNext()
                 except IndexError:
                     break  # no more rows in the result set
                 print next_row.__dict__
                 title = "some title here"  #next_row.text
                 link = "some href here"  #next_row.href
                 if not re.match("PDF \(", title) and not re.match(
                         "Related Articles", title):
                     items[link] = title
             #items = zotero.SelectItems(items)
             #let's assume we want all of them
             articles.extend(items)
             result_sets = []
             for article in articles:
                 result_sets.append({'article': article})
         else:
             articles = [url]
             result_sets = [{"currentdoc": doc}]
         if len(articles) == 0:
             print "ERROR: no items were found"
             return
         print "articles = ", articles
         print "result_sets = ", result_sets
     return result_sets  #return all articles or the currentdoc in a dict for stuff that we want to grab
Example #3
def test():
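  """Run XPath test cases against BSXPathEvaluator.

  Test-data files are read from a local directory (or from a web listing when
  options.web is set); each expression is evaluated against its context node,
  and the result, the expected data and the testNodes() verdict are printed.
  """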
  global document,options,DEFAULT_TESTDIR,url_data
  
  def nodesStr(nodes):
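    # Render an evaluate()/applyXPath() result -- a node list, a single node,
    # a string or a plain value -- as a short printable string.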
    def tagstr(node):
      try:
        strs=['<'+node.name]
        i=node.get('id')
        c=node.get('class')
        if i:
          strs.append('id='+i)
        if c:
          strs.append('class='+c)
        return escapeStr(' '.join(strs)+'>')
      except Exception:  # not a tag node; fall back to its string form
        return escapeStr(unicode(node))
    
    if isinstance(nodes,list):
      return ' '.join([tagstr(node) for node in nodes])
    elif getattr(nodes,'nodeType',None) or isinstance(nodes,basestring):
      return escapeStr(unicode(nodes))
    else:
      return nodes
  
  if options.web:
    fp=urllib2.urlopen(url_data)
    dirdoc=BSXPathEvaluator(fp.read())
    files=map(lambda node:node.get('href'),dirdoc.getItemList('//li/a[@href!="../"]'))
  else:
    if options.path:
      testdir=options.path
    else:
      testdir=DEFAULT_TESTDIR
    files=os.listdir(testdir)
  
  tnames=','.join(options.names).split(',') if options.names else None
  tnumbers=','.join(options.numbers).split(',') if options.numbers else None
  for name in files:
    if tnames:
      fname=re.sub(r'\..*$','',name)
      if fname not in tnames: continue
    target=url_data+'/'+name if options.web else os.path.join(testdir,name)
    data=parseTestData(target,options.web)
    print '[%s]\n%s\n' % (name,data.comment)
    document=BSXPathEvaluator(data.html)
    context=document.evaluate(data.contextExpr,document,None,XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,None).snapshotItem(0)
    tests=data.tests
    cnt=0
    for test in tests:
      cnt=cnt+1
      if tnumbers:
        if str(cnt) not in tnumbers: continue
      print u'No.%d' % cnt
      expr=test.expr
      print u'expr  : %s' % (expr)
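      # applyXPath returns (matched nodes, elapsed time as a timedelta, result type)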
      
      (nodes,time,resultType)=document.applyXPath(context,expr)
      
      print u'time  : %d.%06d sec' % (time.seconds,time.microseconds)
      print u'result: %s' % nodesStr(nodes)
      print u'expect: %s' % (test.data)
      
      judge=testNodes(nodes,test.data)
      
      print u'judge : %s (%s)' % (judge.status,judge.detail)
      print u''
    
    print u''
Example #4
def collectFriendsEmails():
    """collectFriendsEmails()
        uses official facebook api to get list of friends
        uses list of friends to manually access info page of each
        saves each contact information in csv
    """
    global usr, debug, browser
    startTime = time.time() #save current time for calculation of elapsed time

    logger.info("%s launching CONTACT-DATA COLLECTION" % stages[2])


    try:  # get access token
        res = browser.open('http://developers.facebook.com/docs/reference/api')
        html = res.read()

        if debug: print "%s fetching access token..." % stages[2]
        if debug: open('referenceAPI', 'w').write(BeautifulSoup(html).prettify())

        match = re.search('access_token=(.*?)"', html)
        acc = match.group(1)

        if debug: print 'access token: ' + acc

        #get friends
        res = browser.open('https://graph.facebook.com/me/friends?access_token=%s' % acc)
        html = res.read()
        friends = json.loads(html)
    except Exception as e:
        logger.error("%s could not get list of friends. Are you executing multiple instances with these credentials?: %s"%(stages[2],str(e)))
        if debug: print sys.exc_info()
        return

    #create csv writer
    f = open('%s.csv' % usr, 'ab')
    writer = UnicodeWriter(f)

    #writer = csv.writer(open('%s.csv' % usr, 'ab'), delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    #logger.info('%s******************LIST OF CONTACTS******************' %stages[2])

    for friend in friends['data']: #for each dataset in JSON data
        friend_id = friend['id']
        friend_name = friend['name']

        #open profile url
        try:
            res = browser.open('http://m.facebook.com/profile.php?id=%s&v=info&refid=17' % friend_id,timeout=4.0)
            html = res.read()

            document = BSXPathEvaluator(html)

            #output_line=friend_id.encode('utf-8')+' | '+friend_name.encode('utf-8')
            resume=True
            i = 1
            contact_infos = [friend_id,friend_name]

            while resume: #while further contact data available
                #look for a line in the table of contact details and extract the contact detail
                result = document.evaluate('//div[@id="contact"]//table//tr[%d]'%i,document,None,XPathResult.STRING_TYPE,None)
                contact_info = result.stringValue
                i+=1
                if len(contact_info)==0:
                    resume=False
                else:
                    contact_info=contact_info.replace('&#064;','@') #replace html character code
                    contact_info=contact_info.replace('%40', '@') #replace url encoding
                    if 'Website' not in contact_info:
                        contact_infos.append(contact_info) #append contact info to list of infos
                        #output_line+= " | "+contact_info.encode('utf-8')
            #if len(contact_infos)>2: #if contact info apart from id and name was found
            #logger.info(
                #stages[2]+'****************************************************\n'+
                #stages[2]+'** '+output_line+'\n'+
                #stages[2]+'****************************************************'
            #)
            logger.info(contact_infos)

            writer.writerow(contact_infos) #write to csv
        except URLError as e:
            logger.error('%s a URL TIMEOUT occurred while fetching data for %s: %s' % (stages[2],friend_name,str(e)))
        except socket.error as e:
            logger.error('%s a SOCKET ERROR occurred while fetching data for %s: %s' % (stages[2],friend_name,str(e)))
        except Exception:
            logger.error('%s an error occurred while fetching data for %s: %s' % (stages[2],friend_name,sys.exc_info()))

    endTime = time.time() #set end time for calculation of 'time elapsed'
    logger.info('%s fetched data of %d friends in %d seconds' %(stages[2],len(friends['data']),endTime-startTime))
    logger.info('%s saved collection of contact data in %s.csv! \n program will exit when crawling is finished...' % (stages[2], usr))
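
A minimal, self-contained sketch of the two BSXPathEvaluator query styles the examples above rely on; the sample HTML, the XPaths and the BSXPath import line are illustrative assumptions, not taken from any of the projects shown:

from BSXPath import BSXPathEvaluator, XPathResult  # assumed import path

html = '<html><body><ul><li><a href="/a1">First</a></li><li><a href="/a2">Second</a></li></ul></body></html>'
document = BSXPathEvaluator(html)

# Convenience helper (see Example #3's directory listing): matched nodes as a plain list.
for link in document.getItemList('//li/a'):
    print link.get('href'), link.string

# DOM-style interface used throughout: evaluate() plus snapshotItem().
first = document.evaluate('//li/a', document, None,
                          XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, None).snapshotItem(0)
print first.get('href')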