def doWeb(self, doc, url):
    if type(doc) == type("huh"):  # then it's not BeautifulSoup
        tree = lxml.etree.fromstring(doc, lxml.etree.HTMLParser())
        links = tree.xpath(
            "/html/body/div[1]/div/div[@id='sdBody']/div/div[@id='rightCol']/div/div[@id='searchResults']/div[@id='bodyMainResults']"
        )
        #print "links = ", links
        #for each in links:
        #    print type(links[0])
        document = BSXPathEvaluator(doc)
    else:
        document = doc
    if document.evaluate("//*[contains(@src, \"exportarticle_a.gif\")]", document, None, XPathResult.ANY_TYPE, None):
        articles = []
        if (self.detectWeb(doc, url) == "multiple"):  # search page
            items = {}
            xpath = None
            if (url.count("_ob=PublicationURL") > 0):
                xpath = '//table[@class="resultRow"]/tbody/tr/td[2]/a'
            else:
                xpath = '//div[@class="font3"][@id="bodyMainResults"]/table/tbody/tr/td[2]/a'
            rows = document.evaluate(xpath, document, None, XPathResult.ANY_TYPE, None)
            print rows
            next_row = None
            #for next_row in rows.iterateNext():
            isTrue = True
            next_row = rows
            while isTrue:
                try:
                    next_row = rows.iterateNext()
                except IndexError:
                    isTrue = False
                print next_row.__dict__
                title = "some title here"  # next_row.text
                link = "some href here"  # next_row.href
                if not re.match("PDF \(", title) and not re.match("Related Articles", title):
                    items[link] = title
            #items = zotero.SelectItems(items)
            # let's assume we want all of them
            [articles.append(i) for i in items]
            result_sets = []
            for article in articles:
                result_sets.append({'article': article})
        else:
            articles = [url]
            return_sets = [{"currentdoc": doc}]
        if len(articles) == 0:
            print "ERROR: no items were found"
            return
        print "articles = ", articles
        print "result_sets = ", result_sets
        # return all articles, or the currentdoc, in a dict for the stuff that we want to grab
        return result_sets
def getinfo(url, html):
    document = BSXPathEvaluator(html)
    setting = {}
    setting['next_xpath'] = u"//a[contains(text(),'下章') or contains(text(),'下一章') or contains(text(),'下一页') or contains(text(),'下页')]"
    setting['title_xpath'] = "//title"
    next_link = document.getFirstItem(setting['next_xpath'])['href']  # get the URL of the next page
    next_url = urlparse.urljoin(url, next_link)  # resolve it to an absolute URL
    title = document.getFirstItem(setting['title_xpath']).string
    #site = root = urlparse.urlparse(url).netloc
    return title, next_url
def get(self):
    from google.appengine.api import urlfetch
    from BeautifulSoup import BeautifulSoup
    from BSXPath import BSXPathEvaluator, XPathResult

    result = urlfetch.fetch(
        url="http://www.u17.com/comic_list/le!_th99_gr99_ca99_ss99_ob0_m0_p1.html",
        headers={'dd': 'dd'})
    if (result.status_code == 200):
        doc = BSXPathEvaluator(result.content)  # /OL[20]/DIV[1]/A[1]/IMG[1]
        r = doc.getFirstItem('/html[1]/BODY[1]/DIV[8]/DIV[3]/DIV[2]/DIV[12]')
        self.response.out.write(r)
def getinfo(url, html):
    document = BSXPathEvaluator(html)
    setting = {}
    setting['next_xpath'] = u"//a[contains(text(),'下章') or contains(text(),'下一章') or contains(text(),'下一页') or contains(text(),'下页') or contains(text(),'下一节')]"
    setting['title_xpath'] = "//title"
    title = '' + document.getFirstItem(setting['title_xpath']).string
    next_link = document.getItemList(setting['next_xpath'])
    if len(next_link) == 0:
        return title, None
    next_url = urlparse.urljoin(url, next_link[0]['href'])  # resolve it to an absolute URL
    #site = root = urlparse.urlparse(url).netloc
    return title, next_url
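# A minimal driver sketch (not part of the original snippets) showing how the
# getinfo() above might be used to follow a chain of "next chapter" links.
# The urllib2 fetch, the page limit, and the example URL are assumptions for
# illustration only; real code would also need to handle page encodings.
import urllib2


def walk_chapters(start_url, max_pages=10):
    url = start_url
    titles = []
    for _ in xrange(max_pages):
        html = urllib2.urlopen(url).read()
        title, next_url = getinfo(url, html)
        titles.append(title)
        if next_url is None:  # no "next chapter/page" link was found
            break
        url = next_url
    return titles

# usage (hypothetical URL): walk_chapters('http://example.com/book/chapter1.html')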
def get_error_apache(self, document):
    ### init
    apache_error = ''
    ### xpath terms
    xpath_apache_error = './/title/text()'
    ### get the error title
    BSXdocument = BSXPathEvaluator(document)
    err = BSXdocument.getItemList(xpath_apache_error)
    if len(err) > 0:
        apache_error = '%s' % err[0]
    ### cleanup
    del err
    del BSXdocument
    ### return the error title
    return apache_error
def get_error_from_django(self, document):
    ### init
    error_title = error_description = ''
    ### xpath terms
    xpath_title = './/div[@id="summary"]/h1/text()'
    xpath_description = './/div[@id="summary"]/pre/text()'
    ### get the title and description
    BSXdocument = BSXPathEvaluator(document)
    title = BSXdocument.getItemList(xpath_title)
    if len(title) > 0:
        error_title = '%s' % title[0]
    description = BSXdocument.getItemList(xpath_description)
    if len(description) > 0:
        error_description = '%s' % description[0]
    ### cleanup
    del title
    del description
    del BSXdocument
    ### return error title and description
    return (error_title, error_description)
def detectWeb(self, doc, url):
    if type(doc) == type(""):
        doc = BSXPathEvaluator(doc)
    if url.count("_ob=DownloadURL") != 0 or doc.title == "ScienceDirect Login":
        return False
    if ((not re.match("pdf", url)) and url.count("_ob=ArticleURL") == 0
            and url.count("/article/") == 0) or url.count("/journal/") != 0:
        return "multiple"
    elif not re.match("pdf", url):
        return "journalArticle"
    return False
def parse_catalog(catalog_url, parser):
    fetch_result = urlfetch.fetch(catalog_url, allow_truncated=True)
    html = fetch_result.content.decode(parser.site_coding, "ignore")
    document = BSXPathEvaluator(html, convertEntities=BeautifulSoup.HTML_ENTITIES)  # convert HTML entities
    parse_result = {}
    vol_list = document.getItemList(parser.vol_and_chapter_xpath)
    chapter_url_list = []
    chapter_title_list = []
    if parser.url_remove_prefix_re:
        # compile once for speed, since it is reused repeatedly below
        url_remove_prefix_re = re.compile(parser.url_remove_prefix_re)
    for i in vol_list:
        if i.name != "a":
            # check whether we have reached a VIP volume
            if not parser.vol_vip_string or unicode(i).find(parser.vol_vip_string) == -1:
                chapter_url_list.append("")  # the datastore list cannot hold None
                chapter_title_list.append(get_all_contents(i))
            else:
                chapter_url_list.append("")  # the datastore list cannot hold None
                chapter_title_list.append(parser.vol_vip_string)
                break
        else:
            url = i["href"]
            if parser.url_remove_prefix_re:
                url = url_remove_prefix_re.sub("", url)
            chapter_url_list.append(url)
            chapter_title_list.append(get_all_contents(i))
    put_into_dict(parse_result, "chapter_url_list", chapter_url_list)
    put_into_dict(parse_result, "chapter_title_list", chapter_title_list)
    return parse_result
def parse_document(document):
    BSXdocument = BSXPathEvaluator(document)
    XPath_table = './/*[@id="main"]/p[2]/table'
    XPath_table_body = '%s/tbody' % (XPath_table)
    XPath_table_header = '%s/tr[1]' % (XPath_table_body)
    XPath_table_lines = '%s/tr' % (XPath_table_body)
    rows = BSXdocument.getItemList(XPath_table_lines)[1:]
    for row_counter in xrange(len(rows)):
        row = rows[row_counter]
        # print row
        # print "======"
        rowDoc = BSXPathEvaluator('%s' % row)
        XPath_table_row = '/'
        XPath_table_row_cell_category = '%s/td[%d]/text()' % (XPath_table_row, 1)
        cell_category = rowDoc.getFirstItem(XPath_table_row_cell_category)
        XPath_table_row_cell_type = '%s/td[%d]/text()' % (XPath_table_row, 2)
        cell_type = rowDoc.getFirstItem(XPath_table_row_cell_type)
        XPath_table_row_cell_time = '%s/td[%d]/text()' % (XPath_table_row, 3)
        cell_time = rowDoc.getFirstItem(XPath_table_row_cell_time)
        XPath_table_row_cell_level = '%s/td[%d]/text()' % (XPath_table_row, 4)
        cell_level = rowDoc.getFirstItem(XPath_table_row_cell_level)
        XPath_table_row_cell_message = '%s/td[%d]/text()' % (XPath_table_row, 5)
        cell_message = rowDoc.getFirstItem(XPath_table_row_cell_message)
        print "======", row_counter, "======"
        print "Category:", cell_category
        print "Type:", cell_type
        print "Time:", cell_time
        print "Level:", cell_level
        print "Message:", cell_message
    return rows
def parse_document(document):
    BSXdocument = BSXPathEvaluator(document)
    XPath_table = './/*[@id="main"]/p[2]/table'
    XPath_table_body = '%s/tbody' % (XPath_table)
    XPath_table_header = '%s/tr[1]' % (XPath_table_body)
    XPath_table_lines = '%s/tr' % (XPath_table_body)
    rows = BSXdocument.getItemList(XPath_table_lines)[1:]
    records = []
    for row_counter in xrange(len(rows)):
        record = ()
        SHIFT = 0
        row = rows[row_counter]
        XPath_table_row = '%s/tr[%d]' % (XPath_table_body, row_counter + 1)

        XPath_table_row_cell_category = '%s/td[%d]/text()' % (XPath_table_row, 1)
        cell_category = BSXdocument.getItemList(XPath_table_row_cell_category)
        if len(cell_category) > 0:
            cell_category = cell_category[0]
        XPath_table_row_cell_type = '%s/td[%d]/text()' % (XPath_table_row, 2)
        cell_type = BSXdocument.getItemList(XPath_table_row_cell_type)
        if len(cell_type) > 0:
            cell_type = cell_type[0]
        XPath_table_row_cell_time = '%s/td[%d]/text()' % (XPath_table_row, 3)
        cell_time = BSXdocument.getItemList(XPath_table_row_cell_time)
        if len(cell_time) > 0:
            cell_time = cell_time[0]
        XPath_table_row_cell_level = '%s/td[%d]/text()' % (XPath_table_row, 4)
        cell_level = BSXdocument.getItemList(XPath_table_row_cell_level)
        if len(cell_level) > 0:
            cell_level = cell_level[0]
        XPath_table_row_cell_message = '%s/td[%d]/text()' % (XPath_table_row, 5)
        cell_message = BSXdocument.getItemList(XPath_table_row_cell_message)
        if len(cell_message) > 0:
            cell_message = cell_message[0]

        # defaults for the fields parsed out of the message column
        message_category = ""
        message_date = ""
        message_time = ""
        message_dataset = ""
        message_site = "no.site"
        message_reason = "no.reason"
        message_weight = "no.weight"
        message_weight_val = 0
        message_weight_0 = 0
        message_weight_1 = 0
        message_weight_2 = 0
        message_weight_3 = 0
        message_weight_4 = 0
        message_weight_5 = 0
        message_treshold = "no.treshold"
        message_treshold_current = 0
        message_treshold_expected = 0

        ## SKIPPED
        if is_this_category(cell_message, ' - action=SKIPPED '):
            tmp_message = re.sub(r'\s+', ';', str(cell_message.replace(' ', ' '))).split(';')
            message_category = "SKIPPED"
            message_date = tmp_message[0]
            message_time = tmp_message[1]
            message_dataset = tmp_message[5].split('=')[1]
            message_reason = tmp_message[4].split('=')[1]
            ### SKIPPED_REASONS = ['TOO_MANY_T2_REPLICAS', 'TOO_MANY_T2_SUBSCRIPTIONS']
            if message_reason == "TOO_MANY_T2_REPLICAS":
                try:
                    message_treshold_current = re.sub(r"[)(>]", '', str(tmp_message[13])).split('=')[0]
                    message_treshold_expected = re.sub(r"[)(>]", '', str(tmp_message[13])).split('=')[1]
                except:
                    message_treshold_current = -1
                    message_treshold_expected = -1
            elif message_reason == "TOO_MANY_T2_SUBSCRIPTIONS":
                try:
                    message_treshold_current = re.sub(r"[)(>]", '', str(tmp_message[12])).split('=')[0]
                    message_treshold_expected = re.sub(r"[)]", '', str(tmp_message[12])).split('=')[1]
                except:
                    message_treshold_current = -1
                    message_treshold_expected = -1

        ## triggered
        if is_this_category(cell_message, ' - triggered '):
            tmp_message = re.sub(r'\s+', ';', str(cell_message.replace(' ', ' '))).split(';')
            message_category = "triggered"
            message_date = tmp_message[0]
            message_time = tmp_message[1]
            message_dataset = tmp_message[6]

        ## UNSELECTEDT2
        if is_this_category(cell_message, ' - action=UNSELECTEDT2 '):
            tmp_message = re.sub(r'\s+', ';', str(cell_message.replace(' ', ' '))).split(';')
            message_category = "UNSELECTED"
            message_date = tmp_message[0]
            message_time = tmp_message[1]
            try:
                message_dataset = tmp_message[6].split('=')[1]
            except IndexError:
                for tmp_item in tmp_message:
                    if re.search('^dataset=', tmp_item):
                        message_dataset = tmp_item.split('=')[1]
                        break
            message_site = tmp_message[4].split('=')[1]
            message_weight = tmp_message[5].split('=')[1]
            if message_weight == WEIGHT_NA_STRING:
                message_weight = message_weight_0 = message_weight_1 = message_weight_2 = message_weight_3 = message_weight_4 = message_weight_5 = WEIGHT_NA_VALUE
                message_weight_val = WEIGHT_NA_VALUE
            else:
                message_weight_params = re.sub(r"[()]", '', re.sub(r"[+/*]", ';', str(message_weight))).split(';')
                message_weight_0 = message_weight_params[0]
                message_weight_1 = message_weight_params[1]
                message_weight_2 = message_weight_params[2]
                message_weight_3 = message_weight_params[3]
                message_weight_4 = message_weight_params[4]
                message_weight_5 = message_weight_params[5]
                try:
                    message_weight_val = (float(message_weight_0) + float(message_weight_1) / float(message_weight_2)) * float(message_weight_3) / float(message_weight_4) / float(message_weight_5)
                except:
                    message_weight_val = -1

        ## SELECTEDT1
        if is_this_category(cell_message, ' - action=SELECTEDT1 '):
            tmp_message = re.sub(r'\s+', ';', str(cell_message.replace(' ', ' '))).split(';')
            message_category = "SELECTEDT1"
            message_date = tmp_message[0]
            message_time = tmp_message[1]
            try:
                message_dataset = tmp_message[6].split('=')[1]
            except IndexError:
                for tmp_item in tmp_message:
                    if re.search('^dataset=', tmp_item):
                        message_dataset = tmp_item.split('=')[1]
                        break
            message_site = tmp_message[4].split('=')[1]
            # no weight formula for T1 selections; use the fixed value
            message_weight = message_weight_0 = message_weight_1 = message_weight_2 = message_weight_3 = message_weight_4 = message_weight_5 = message_weight_val = WEIGHT_T1_VALUE

        ## SELECTEDT2
        if is_this_category(cell_message, ' - action=SELECTEDT2 '):
            tmp_message = re.sub(r'\s+', ';', str(cell_message.replace(' ', ' '))).split(';')
            message_category = "SELECTEDT2"
            message_date = tmp_message[0]
            message_time = tmp_message[1]
            try:
                message_dataset = tmp_message[6].split('=')[1]
            except IndexError:
                for tmp_item in tmp_message:
                    if re.search('^dataset=', tmp_item):
                        message_dataset = tmp_item.split('=')[1]
                        break
            message_site = tmp_message[4].split('=')[1]
            message_weight = tmp_message[5].split('=')[1]
            message_weight_params = re.sub(r"[()]", '', re.sub(r"[+/*]", ';', str(message_weight))).split(';')
            message_weight_0 = message_weight_params[0]
            message_weight_1 = message_weight_params[1]
            message_weight_2 = message_weight_params[2]
            message_weight_3 = message_weight_params[3]
            message_weight_4 = message_weight_params[4]
            message_weight_5 = message_weight_params[5]
            try:
                message_weight_val = (float(message_weight_0) + float(message_weight_1) / float(message_weight_2)) * float(message_weight_3) / float(message_weight_4) / float(message_weight_5)
            except:
                message_weight_val = -1

        ## SELECTEDT2_T1MOU
        if is_this_category(cell_message, ' - action=SELECTEDT2_T1MOU '):
            tmp_message = re.sub(r'\s+', ';', str(cell_message.replace(' ', ' '))).split(';')
            message_category = "SELECTEDT2_T1MOU"
            message_date = tmp_message[0]
            message_time = tmp_message[1]
            try:
                message_dataset = tmp_message[6].split('=')[1]
            except IndexError:
                for tmp_item in tmp_message:
                    if re.search('^dataset=', tmp_item):
                        message_dataset = tmp_item.split('=')[1]
                        break
            message_site = tmp_message[4].split('=')[1]
            # no weight formula for T1-MoU selections; use the fixed value
            message_weight = message_weight_0 = message_weight_1 = message_weight_2 = message_weight_3 = message_weight_4 = message_weight_5 = message_weight_val = WEIGHT_T2_T1MOU_VALUE

        ## SELECTEDT2_T2MOU
        if is_this_category(cell_message, ' - action=SELECTEDT2_T2MOU '):
            tmp_message = re.sub(r'\s+', ';', str(cell_message.replace(' ', ' '))).split(';')
            message_category = "SELECTEDT2_T2MOU"
            message_date = tmp_message[0]
            message_time = tmp_message[1]
            try:
                message_dataset = tmp_message[6].split('=')[1]
            except IndexError:
                for tmp_item in tmp_message:
                    if re.search('^dataset=', tmp_item):
                        message_dataset = tmp_item.split('=')[1]
                        break
            message_site = tmp_message[4].split('=')[1]
            # no weight formula for T2-MoU selections; use the fixed value
            message_weight = message_weight_0 = message_weight_1 = message_weight_2 = message_weight_3 = message_weight_4 = message_weight_5 = message_weight_val = WEIGHT_T2_T2MOU_VALUE

        record = (message_date, message_time, message_category, message_dataset, message_site,
                  message_reason, message_weight,
                  message_weight_val, message_weight_0, message_weight_1,
                  message_weight_2, message_weight_3,
                  message_weight_4, message_weight_5,
                  message_treshold_current, message_treshold_expected)
        records.append(record)
    return records
def parse_document(document):
    BSXdocument = BSXPathEvaluator(document)
    XPath_table = './/*[@id="main"]/p[2]/table'
    XPath_table_body = '%s/tbody' % (XPath_table)
    XPath_table_header = '%s/tr[1]' % (XPath_table_body)
    XPath_table_lines = '%s/tr' % (XPath_table_body)
    rows = BSXdocument.getItemList(XPath_table_lines)[1:]

    # get cloud names from the queue dump
    fjson = open('panda_queues.json', 'r')
    data = fjson.read()
    dic = json.loads(data)
    fjson.close()

    records = []
    ex_record = []
    exist_records = []
    in_buf_records = []
    maxId = db.get_max_id()
    last_time = db.get_last_updated_time()
    if last_time is None:
        db.first_last_updated_time()
        last_time = db.get_last_updated_time()
    this_time = None
    skip_time = None
    set_last = None
    this_year = datetime.date.today().year
    if maxId is None:
        maxId = 0
    processed_rows = 0

    for row_counter in xrange(len(rows)):
        record = ()
        ex_rec = ()
        SHIFT = 0
        row = rows[row_counter]
        rowDoc = BSXPathEvaluator('%s' % row)
        XPath_table_row = '/'

        XPath_table_row_cell_time = '%s/td[%d]/text()' % (XPath_table_row, 3)
        cell_time = rowDoc.getFirstItem(XPath_table_row_cell_time)
        XPath_table_row_cell_message = '%s/td[%d]/text()' % (XPath_table_row, 5)
        cell_message = rowDoc.getFirstItem(XPath_table_row_cell_message)

        message_category = "no.category"
        message_date = ""
        message_time = ""
        message_dn = ""
        message_jobset = "no.jobset"
        message_jobdef = "no.jobdef"
        message_action = ""
        message_site = "no.site"
        message_reason = "no.reason"
        message_weight = "no.weight"

        message_datetime = str(cell_time).split(' ')
        message_date = message_datetime[0].strip()
        message_time = message_datetime[1].strip()

        # skip the leading uncompleted minute
        log_year = get_log_year(this_year, message_date, message_time)
        this_time = "%s-%s %s" % (log_year, message_date, message_time)
        if skip_time is None or skip_time == this_time:
            skip_time = this_time
            continue
        # set the last updated time once the skip is done (records are in time DESC order)
        if set_last is None:
            # saved to the db when everything is done
            set_last = this_time
        # break when we reach last_time
        if (last_time is not None) and (this_time <= last_time):
            break

        processed_rows += 1
        tmp_message = str(cell_message.replace(' ', ' ')).split(' : ')
        message_dn = tmp_message[0].split('=')[1].replace("\\\'", "").strip().replace(' ', '_')
        tmp_job = tmp_message[1].split(' ')
        if len(tmp_job) > 1:
            message_jobset = tmp_job[0].split('=')[1].strip()
            message_jobdef = tmp_job[1].split('=')[1].strip()
        else:
            if is_this_category(tmp_job[0], 'jobset'):
                message_jobset = tmp_job[0].split('=')[1].strip()
            if is_this_category(tmp_job[0], 'jobdef'):
                message_jobdef = tmp_job[0].split('=')[1].strip()

        ## skip
        if is_this_category(cell_message, ' action=skip '):
            # continue  # possible speed-up: ignore these rows entirely
            message_category = "D"
            message_skip = tmp_message[2].split(' ')
            message_action = message_skip[0].split('=')[1].strip()
            message_site = message_skip[1].split('=')[1].strip()
            message_reason = message_skip[2].split('=')[1].strip()
            if re.search('=', message_skip[4]):
                message_weight = message_skip[4].split('=')[1].strip()
            else:
                message_reason = '_'.join(message_skip[3:]).strip('_')
        ## exclude: added 2011-10-26
        elif is_this_category(cell_message, ' action=exclude '):
            message_category = "E"
            message_skip = tmp_message[2].split(' ')
            message_action = message_skip[0].split('=')[1].strip()
            message_site = message_skip[1].split('=')[1].strip()
            message_reason = message_skip[2].split('=')[1].strip()
            if re.search('=', message_skip[4]):
                message_weight = message_skip[4].split('=')[1].strip()
            else:
                message_reason = '_'.join(message_skip[3:]).strip('_')
            site_name, cloud = get_sitecloud_name(dic, message_site)
            if is_excluded(ex_record, message_dn, message_jobset, site_name):
                message_category = "D"  # skip if already excluded by another jobdef of the same jobset
            else:
                ex_rec = (message_dn, message_jobset, site_name)
                ex_record.insert(0, ex_rec)
        ## choose
        elif is_this_category(cell_message, ' action=choose '):
            message_category = "C"
            message_choose = tmp_message[2].split(' ')
            message_action = message_choose[0].split('=')[1].strip()
            message_site = message_choose[1].split('=')[1].strip()
            message_reason = message_choose[2].split('=')[1].strip()
            if re.search('=', message_choose[5]):
                message_weight = message_choose[5].split('=')[1].strip()
            else:
                message_reason = '_'.join(message_choose[3:]).strip('_')
        ## action=use: added 2011-10-26
        elif is_this_category(cell_message, ' action=use '):
            message_choose = tmp_message[2].split(' ')
            message_action = message_choose[0].split('=')[1].strip()
            message_site = message_choose[1].split('=')[1].strip()
            message_reason = '_'.join(message_choose[3:]).strip('_')
            if is_this_category(message_reason, 'site'):
                message_category = "A"
            if is_this_category(message_reason, 'cloud'):
                message_category = "B"
        ## use site or cloud
        elif is_this_category(cell_message, ' use '):
            message_use = tmp_message[2].split(' ')
            message_action = message_use[0].strip()
            message_site = message_use[1].strip()
            message_reason = '_'.join(message_use[3:]).strip('_')
            if is_this_category(message_reason, 'site'):
                message_category = "A"
            if is_this_category(message_reason, 'cloud'):
                message_category = "B"
        ## other actions
        elif is_this_category(cell_message, ' action='):
            message_buf = tmp_message[2].split(' ')
            message_action = message_buf[0].split('=')[1].strip()
            print "WARNING: action=%s is not processed!" % message_action

        ## append to the records it belongs to
        if message_category in ['A', 'B', 'C', 'E']:
            logDate = str("%s-%s" % (log_year, message_date))
            rec_idx = None
            site_name, cloud = get_sitecloud_name(dic, message_site)
            dailyLogId = db.is_exist_item(logDate, message_category, site_name, message_dn)
            if dailyLogId is None:
                rec_idx = is_in_buf(records, logDate, message_category, site_name, message_dn)
            if dailyLogId is not None:
                exist_records.append([dailyLogId])
            elif rec_idx is not None:
                record = (logDate, message_category, site_name, message_dn)
                in_buf_records.append(record)
            else:
                maxId += 1
                count = 1
                record = (maxId, logDate, message_category, site_name,
                          cloud, message_dn, count)
                records.append(record)
            if DEBUG == 1:
                print "========="
                print "DEBUG:", message_category, ": ", row
                print "========="

    db.set_last_updated_time(set_last)  # set when all rows are done
    if (this_time is not None) and not (this_time <= last_time):
        print "Error: === NOT Reach the last updated time (%s -> %s) ===" % (this_time, last_time)
    return (processed_rows, records, exist_records, in_buf_records)
def test():
    global document, options, DEFAULT_TESTDIR, url_data

    def nodesStr(nodes):
        def tagstr(node):
            try:
                strs = ['<' + node.name]
                i = node.get('id')
                c = node.get('class')
                if i:
                    strs.append('id=' + i)
                if c:
                    strs.append('class=' + c)
                return escapeStr(' '.join(strs) + '>')
            except:
                return escapeStr(unicode(node))
        if isinstance(nodes, list):
            return ' '.join([tagstr(node) for node in nodes])
        elif getattr(nodes, 'nodeType', None) or isinstance(nodes, basestring):
            return escapeStr(unicode(nodes))
        else:
            return nodes

    if options.web:
        fp = urllib2.urlopen(url_data)
        dirdoc = BSXPathEvaluator(fp.read())
        files = map(lambda node: node.get('href'),
                    dirdoc.getItemList('//li/a[@href!="../"]'))
    else:
        if options.path:
            testdir = options.path
        else:
            testdir = DEFAULT_TESTDIR
        files = os.listdir(testdir)
    tnames = ','.join(options.names).split(',') if options.names else None
    tnumbers = ','.join(options.numbers).split(',') if options.numbers else None
    for name in files:
        if tnames:
            fname = re.sub(r'\..*$', '', name)
            if not fname in tnames:
                continue
        target = url_data + '/' + name if options.web else os.path.join(testdir, name)
        data = parseTestData(target, options.web)
        print '[%s]\n%s\n' % (name, data.comment)
        document = BSXPathEvaluator(data.html)
        context = document.evaluate(data.contextExpr, document, None,
                                    XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, None).snapshotItem(0)
        tests = data.tests
        cnt = 0
        for test in tests:
            cnt = cnt + 1
            if tnumbers:
                if not str(cnt) in tnumbers:
                    continue
            print u'No.%d' % cnt
            expr = test.expr
            print u'expr  : %s' % (expr)
            (nodes, time, resultType) = document.applyXPath(context, expr)
            print u'time  : %d.%06d sec' % (time.seconds, time.microseconds)
            print u'result: %s' % nodesStr(nodes)
            print u'expect: %s' % (test.data)
            judge = testNodes(nodes, test.data)
            print u'judge : %s (%s)' % (judge.status, judge.detail)
            print u''
        print u''
def collectFriendsEmails():
    """collectFriendsEmails()
    uses the official facebook api to get the list of friends,
    uses that list to manually access the info page of each friend,
    and saves each contact's information in a csv
    """
    global usr, debug, browser
    startTime = time.time()  # save current time for calculation of elapsed time
    logger.info("%s launching CONTACT-DATA COLLECTION" % stages[2])
    try:  # get access token
        res = browser.open('http://developers.facebook.com/docs/reference/api')
        html = res.read()
        if debug:
            print "%s fetching access token..." % stages[2]
        if debug:
            open('referenceAPI', 'w').write(BeautifulSoup(html).prettify())
        match = re.search('access_token=(.*?)"', html)
        acc = match.group(1)
        if debug:
            print 'access token: ' + acc
        # get friends
        res = browser.open('https://graph.facebook.com/me/friends?access_token=%s' % acc)
        html = res.read()
        friends = json.loads(html)
    except Exception as e:
        logger.error("%s could not get list of friends. Are you executing multiple instances with these credentials?: %s" % (stages[2], str(e)))
        if debug:
            print sys.exc_info()
        return
    # create csv writer
    f = open('%s.csv' % usr, 'ab')
    writer = UnicodeWriter(f)
    for acc in friends['data']:  # for each dataset in the JSON data
        friend_id = acc['id']
        friend_name = acc['name']
        # open profile url
        try:
            res = browser.open('http://m.facebook.com/profile.php?id=%s&v=info&refid=17' % friend_id, timeout=4.0)
            html = res.read()
            document = BSXPathEvaluator(html)
            resume = True
            i = 1
            contact_infos = [friend_id, friend_name]
            while resume:  # while further contact data is available
                # look for a row in the table of contact details and extract the contact detail
                result = document.evaluate('//div[@id="contact"]//table//tr[%d]' % i,
                                           document, None, XPathResult.STRING_TYPE, None)
                contact_info = result.stringValue
                i += 1
                if len(contact_info) == 0:
                    resume = False
                else:
                    contact_info = contact_info.replace('@', '@')  # replace html character code
                    contact_info = contact_info.replace('%40', '@')  # replace url encoding
                    if 'Website' not in contact_info:
                        contact_infos.append(contact_info)  # append contact info to the list of infos
            logger.info(contact_infos)
            writer.writerow(contact_infos)  # write to csv
        except URLError as e:
            logger.error('%s a URL TIMEOUT occured while fetching data for %s: %s' % (stages[2], friend_name, str(e)))
        except socket.error as e:
            logger.error('%s a SOCKET ERROR occured while fetching data for %s: %s' % (stages[2], friend_name, str(e)))
        except:
            logger.error('%s an error occured while fetching data for %s: %s' % (stages[2], friend_name, sys.exc_info()))
    endTime = time.time()  # set end time for calculation of 'time elapsed'
    logger.info('%s fetched data of %d friends in %d seconds' % (stages[2], len(friends['data']), endTime - startTime))
    logger.info('%s saved collection of contact data in %s.csv! \n program will exit when crawling is finished...' % (stages[2], usr))