Example #1
import urlparse
from BSXPath import BSXPathEvaluator

def getinfo(url, html):
  document = BSXPathEvaluator(html)
  setting = {}
  # match links whose anchor text reads "next chapter" / "next page" (in Chinese)
  setting['next_xpath'] = u"//a[contains(text(),'下章') or contains(text(),'下一章') or contains(text(),'下一页') or contains(text(),'下页')]"
  setting['title_xpath'] = "//title"
  next_link = document.getFirstItem(setting['next_xpath'])['href']  # grab the next-page link
  next_url = urlparse.urljoin(url, next_link)  # resolve it to an absolute URL
  title = document.getFirstItem(setting['title_xpath']).string
  #site=root=urlparse.urlparse(url).netloc
  return title, next_url
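A minimal driver for getinfo, as a sketch: it assumes pages are fetched with urllib2 and that getinfo is defined as above; the start URL and page limit are placeholders. Note that this version of getinfo raises if no next-chapter link is found, while the variant in Example #4 below returns None, which the loop checks for.

import urllib2

def follow_chapters(start_url, limit=5):
  # walk the "next chapter" links from start_url, printing each page title
  url = start_url
  for _ in xrange(limit):
    html = urllib2.urlopen(url).read()
    title, next_url = getinfo(url, html)
    print title, '->', next_url
    if not next_url or next_url == url:
      break
    url = next_url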
Example #2
File: main.py  Project: dreampuf/mmacgoo
def get(self):
    from google.appengine.api import urlfetch
    from BeautifulSoup import BeautifulSoup
    from BSXPath import BSXPathEvaluator, XPathResult
    result = urlfetch.fetch(url="http://www.u17.com/comic_list/le!_th99_gr99_ca99_ss99_ob0_m0_p1.html",
                            headers={'dd': 'dd'})
    if result.status_code == 200:
        doc = BSXPathEvaluator(result.content)  # /OL[20]/DIV[1]/A[1]/IMG[1]
        r = doc.getFirstItem('/html[1]/BODY[1]/DIV[8]/DIV[3]/DIV[2]/DIV[12]')
        self.response.out.write(r)
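The get() method above is a request handler; a minimal sketch of how it might be mounted with the classic google.appengine.ext.webapp framework (the handler class name and the '/comics' route are hypothetical):

from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app

class ComicListHandler(webapp.RequestHandler):
    def get(self):
        pass  # body as in the example above

application = webapp.WSGIApplication([('/comics', ComicListHandler)], debug=True)

def main():
    run_wsgi_app(application)

if __name__ == '__main__':
    main()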
Example #3
from BSXPath import BSXPathEvaluator

def parse_document(document):
    BSXdocument = BSXPathEvaluator(document)
    
    XPath_table = './/*[@id="main"]/p[2]/table'
    XPath_table_body = '%s/tbody' % (XPath_table)
    XPath_table_header = '%s/tr[1]' % (XPath_table_body)
    XPath_table_lines = '%s/tr' % (XPath_table_body)
    rows = BSXdocument.getItemList(XPath_table_lines)[1:]
    
    for row_counter in xrange(len(rows)):
        row = rows[row_counter]
        # print row
        # print "======"
        # re-parse the row's HTML as its own document so cell XPaths stay simple
        rowDoc = BSXPathEvaluator('%s' % row)

        # the XPaths below are evaluated from the root of this per-row document
        XPath_table_row = '/'

        XPath_table_row_cell_category = '%s/td[%d]/text()' % (XPath_table_row, 1)
        cell_category = rowDoc.getFirstItem(XPath_table_row_cell_category)
        
        XPath_table_row_cell_type = '%s/td[%d]/text()' % (XPath_table_row, 2)
        cell_type = rowDoc.getFirstItem(XPath_table_row_cell_type)

        XPath_table_row_cell_time = '%s/td[%d]/text()' % (XPath_table_row, 3)
        cell_time = rowDoc.getFirstItem(XPath_table_row_cell_time)

        XPath_table_row_cell_level = '%s/td[%d]/text()' % (XPath_table_row, 4)
        cell_level = rowDoc.getFirstItem(XPath_table_row_cell_level)

        XPath_table_row_cell_message = '%s/td[%d]/text()' % (XPath_table_row, 5)
        cell_message = rowDoc.getFirstItem(XPath_table_row_cell_message)
        
        print "======", row_counter, "======"
        print "Category:",cell_category
        print "Type:",cell_type
        print "Time:",cell_time
        print "Level:",cell_level
        print "Message:",cell_message
        
    return rows
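A small driver for parse_document, as a sketch: it assumes the log page has already been saved to a local HTML file (the filename is a placeholder).

if __name__ == '__main__':
    with open('report.html') as f:
        html = f.read()
    rows = parse_document(html)
    print "parsed %d data rows" % len(rows)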
Example #4
import urlparse
from BSXPath import BSXPathEvaluator

def getinfo(url, html):
  document = BSXPathEvaluator(html)
  setting = {}
  # match links whose anchor text reads "next chapter" / "next page" / "next section" (in Chinese)
  setting['next_xpath'] = u"//a[contains(text(),'下章') or contains(text(),'下一章') or contains(text(),'下一页') or contains(text(),'下页') or contains(text(),'下一节')]"
  setting['title_xpath'] = "//title"
  title = '' + document.getFirstItem(setting['title_xpath']).string
  next_link = document.getItemList(setting['next_xpath'])
  if len(next_link) == 0:
    return title, None
  next_url = urlparse.urljoin(url, next_link[0]['href'])  # resolve to an absolute URL
  #site=root=urlparse.urlparse(url).netloc
  return title, next_url

import json
import re
import datetime
from BSXPath import BSXPathEvaluator
# also relies on project-level helpers not shown here: db, DEBUG, get_log_year,
# is_this_category, get_sitecloud_name, is_excluded, is_in_buf

def parse_document(document):
    BSXdocument = BSXPathEvaluator(document)

    XPath_table = './/*[@id="main"]/p[2]/table'
    XPath_table_body = '%s/tbody' % (XPath_table)
    XPath_table_header = '%s/tr[1]' % (XPath_table_body)
    XPath_table_lines = '%s/tr' % (XPath_table_body)
    rows = BSXdocument.getItemList(XPath_table_lines)[1:]  # drop the header row
    # get the cloud/site name mapping
    with open('panda_queues.json', 'r') as fjson:
        dic = json.loads(fjson.read())
    
    records = []
    ex_record = []
    exist_records = []
    in_buf_records = []
    # resume state from the DB: highest stored record id and the time of the last update
    maxId = db.get_max_id()
    last_time = db.get_last_updated_time()
    if last_time is None:
        db.first_last_updated_time()
        last_time = db.get_last_updated_time()
    this_time = None
    skip_time = None
    set_last = None
    this_year = datetime.date.today().year
    if maxId is None:
        maxId = 0
    processed_rows = 0
    
    for row_counter in xrange(len(rows)):
        record = ()
        ex_rec = ()
        SHIFT=0
        
        row = rows[row_counter]

        # re-parse the row's HTML as its own document so cell XPaths stay simple
        rowDoc = BSXPathEvaluator('%s' % row)

        #XPath_table_row = '%s/tr[%d]' % (XPath_table_body, row_counter+1)
        XPath_table_row = '/'
        """
        XPath_table_row_cell_category = '%s/td[%d]/text()' % (XPath_table_row, 1)
        cell_category = BSXdocument.getItemList(XPath_table_row_cell_category)
        if len(cell_category)>0:
            cell_category = cell_category[0]
        
        XPath_table_row_cell_type = '%s/td[%d]/text()' % (XPath_table_row, 2)
        cell_type = BSXdocument.getItemList(XPath_table_row_cell_type)
        if len(cell_type)>0:
            cell_type = cell_type[0]
        """
        XPath_table_row_cell_time = '%s/td[%d]/text()' % (XPath_table_row, 3)
        cell_time = rowDoc.getFirstItem(XPath_table_row_cell_time)
        #if len(cell_time)>0:
            #cell_time = cell_time[0]
        """
        XPath_table_row_cell_level = '%s/td[%d]/text()' % (XPath_table_row, 4)
        cell_level = BSXdocument.getItemList(XPath_table_row_cell_level)
        if len(cell_level)>0:
            cell_level = cell_level[0]
        """
        XPath_table_row_cell_message = '%s/td[%d]/text()' % (XPath_table_row, 5)
        cell_message = rowDoc.getFirstItem(XPath_table_row_cell_message)
        #if len(cell_message)>0:
            #cell_message = cell_message[0]
        
        
        message_category="no.category"
        message_date = ""
        message_time = ""
        message_dn = ""
        message_jobset ="no.jobset"
        message_jobdef = "no.jobdef"
        message_action = ""
        message_site="no.site"
        message_reason="no.reason"
        message_weight="no.weight"
        
        message_datetime = str(cell_time).split(' ')
        message_date = message_datetime[0].strip()
        message_time = message_datetime[1].strip()
        
        # Skip the leading, still-incomplete minute
        log_year = get_log_year(this_year, message_date, message_time)
        this_time = "%s-%s %s" % (log_year, message_date, message_time)

        if skip_time is None or skip_time == this_time:
            skip_time = this_time
            continue
        # remember the newest timestamp once the leading skip is done (records are in time-descending order)
        if set_last is None:
            # written to the DB only after everything is done
            set_last = this_time
        
        # Break when reach the last_time
        if (last_time is not None) and (this_time <= last_time):
            break
               
        # print 'Debug:',message_date,message_time,row_counter,cell_message
        processed_rows += 1
        
        # messages (as parsed below) look like: "DN=<user> : jobset=<n> jobdef=<n> : <action details>"
        tmp_message = str(cell_message.replace('&nbsp;', ' ')).split(' : ')
        message_dn = tmp_message[0].split('=')[1].replace("\\\'","").strip().replace(' ','_')
        tmp_job = tmp_message[1].split(' ')
        if len(tmp_job) > 1:
            message_jobset = tmp_job[0].split('=')[1].strip() 
            message_jobdef = tmp_job[1].split('=')[1].strip()
        else:
            if is_this_category(tmp_job[0],'jobset'):
                message_jobset = tmp_job[0].split('=')[1].strip()
            if is_this_category(tmp_job[0],'jobdef'):
                message_jobdef = tmp_job[0].split('=')[1].strip()
        ###print;print;print
        #print u'DEBUG: date time=', message_date, message_time
        #print u'DEBUG: dn=', message_dn
        #print u'DEBUG: jobset=', message_jobset
        #print u'DEBUG: jobdef=', message_jobdef
        #print u'DEBUG: ln113: tmp_message[1]=', tmp_message[1]
        #print u'DEBUG: ln113: tmp_message[2]=', tmp_message[2]
        
        ## skip
        if is_this_category(cell_message, ' action=skip '):
            # continue # try to speed up
            message_category = "D"
            message_skip = tmp_message[2].split(' ')
            message_action = message_skip[0].split('=')[1].strip()
            message_site = message_skip[1].split('=')[1].strip()
            message_reason = message_skip[2].split('=')[1].strip()
            if re.search('=',message_skip[4]):
                message_weight = message_skip[4].split('=')[1].strip()
            else:
                message_reason = '_'.join(message_skip[3:]).strip('_')
        
        # exclude: added 2011-10-26
        elif is_this_category(cell_message, ' action=exclude '):
            message_category = "E"
            message_skip = tmp_message[2].split(' ')
            message_action = message_skip[0].split('=')[1].strip()
            message_site = message_skip[1].split('=')[1].strip()
            message_reason = message_skip[2].split('=')[1].strip()
            if re.search('=',message_skip[4]):
                message_weight = message_skip[4].split('=')[1].strip()
            else:
                message_reason = '_'.join(message_skip[3:]).strip('_')
            site_name,cloud = get_sitecloud_name(dic,message_site)
            if is_excluded(ex_record,message_dn,message_jobset,site_name):
                message_category = "D" # skip if excluded by other jobdef of same jobset
            else:
                ex_rec = (message_dn, message_jobset, site_name)
                ex_record.insert(0, ex_rec)
        
        ## choose
        elif is_this_category(cell_message, ' action=choose '):
            message_category = "C"
            message_choose = tmp_message[2].split(' ')
            message_action = message_choose[0].split('=')[1].strip()
            message_site = message_choose[1].split('=')[1].strip()
            message_reason = message_choose[2].split('=')[1].strip()
            if re.search('=',message_choose[5]):
                message_weight = message_choose[5].split('=')[1].strip()
            else:
                message_reason = '_'.join(message_choose[3:]).strip('_')
        
        ## action=use: added 2011-10-26
        elif is_this_category(cell_message, ' action=use '):
            #message_category = "C"
            message_choose = tmp_message[2].split(' ')
            message_action = message_choose[0].split('=')[1].strip()
            message_site = message_choose[1].split('=')[1].strip()
            # message_reason = message_choose[2].split('=')[1].strip()
            message_reason = '_'.join(message_choose[3:]).strip('_')
            if is_this_category(message_reason, 'site'):
                message_category = "A"
            if is_this_category(message_reason, 'cloud'):
                message_category = "B"
        
        ## use site or cloud
        elif is_this_category(cell_message, ' use '):
            message_use = tmp_message[2].split(' ')
            message_action = message_use[0].strip()
            message_site = message_use[1].strip()
            message_reason = '_'.join(message_use[3:]).strip('_')
            if is_this_category(message_reason, 'site'):
                message_category = "A"
            if is_this_category(message_reason, 'cloud'):
                message_category = "B"
                
        ## other actions
        elif is_this_category(cell_message, ' action='):
            message_buf = tmp_message[2].split(' ')
            message_action = message_buf[0].split('=')[1].strip()
            print "WARNING: action=%s is not processed!"%message_action
        
        ## append to the record list it belongs to
        if message_category in ['A','B','C','E']:
            logDate = str("%s-%s"%(log_year, message_date))
            rec_idx = None
            site_name,cloud = get_sitecloud_name(dic,message_site)
            dailyLogId = db.is_exist_item(logDate, message_category, site_name, message_dn)
            if dailyLogId is None:
                rec_idx = is_in_buf(records, logDate, message_category, site_name, message_dn)
                
            if dailyLogId is not None:
                exist_records.append([dailyLogId])
            elif rec_idx is not None:
                record = (logDate, message_category, site_name, message_dn)
                in_buf_records.append(record)
            else:
                maxId += 1
                count = 1               
                record = (maxId, logDate, message_category, site_name, \
                  cloud, message_dn, count)
                records.append(record)
        
        if DEBUG==1:
            print "========="
            print "DEBUG:",message_category,": ",row
            print "========="

    db.set_last_updated_time(set_last) # set when all done.
    if (this_time is not None) and not (this_time <= last_time):
        print "Error: === NOT Reach the last updated time (%s -> %s) ==="%(this_time,last_time)
        
    return (processed_rows,records, exist_records, in_buf_records)
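parse_document relies on several project helpers that are not shown (db, get_log_year, is_this_category, get_sitecloud_name, is_excluded, is_in_buf). Two plausible minimal sketches follow, assuming is_this_category is a simple substring/regex test and is_in_buf matches buffered records on the (date, category, site, dn) key; the project's real implementations may differ.

import re

def is_this_category(text, key):
    # sketch: True when `key` occurs in the stringified cell text
    return re.search(key, str(text)) is not None

def is_in_buf(records, logDate, category, site_name, dn):
    # sketch: return the index of a buffered record sharing the same
    # (logDate, category, site, dn) key, or None if there is none.
    # Record layout follows the tuples built in parse_document:
    # (id, logDate, category, site, cloud, dn, count)
    for idx, rec in enumerate(records):
        if (rec[1], rec[2], rec[3], rec[5]) == (logDate, category, site_name, dn):
            return idx
    return None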