# -*- coding: utf-8 -*-
# Standard-library and lxml imports used by the parsers below. Project-specific
# helpers (BaseParse, BaseDb, DQueue, Record, Categoryids, getRedis, format,
# base, drugstoreurl, suggest_url) are assumed to come from the crawler's own
# modules and are not shown here.
import re
import json
import sys
import signal
import urlparse

from lxml import etree
class Parse_json(BaseParse):

    def __init__(self, list):
        self.parser = etree.HTMLParser(encoding='utf-8')
        self.db = BaseDb()
        self.db.connectdb()
        self.url_list = list['url']
        self.url_set = list['url_set']

    def parse(self, url, page, threadName):
        search = urlparse.parse_qs(url, True).get('q')[0]
        try:
            # The response is non-standard JSON (bare, unquoted keys);
            # quote the keys before handing it to json.loads.
            re_item = re.compile(r'(?<=[{,])\w+')
            page = re_item.sub(r'"\g<0>"', page)
            result = json.loads(page)
        except:
            print 'json parse error, search:%s, page:%s' % (search, page)
            return True
        for item in result.get('result'):
            if (item is None) or (item[0] is None) or item[0] == '':
                continue
            word = format.strip_tags(item[0])
            try:
                self.save2db(search.decode('utf-8'), word, item[1])
            except:
                print 'save to database failed, search word:%s' % search
            self.add2redis(word)
        return True

    def save2db(self, search, word, count):
        sql = self.db.sql_insert(search, word, count)
        if self.db.execsql(sql):
            print 'save word:【%s】 to database, success!' % word
        else:
            print 'save word:【%s】 to database, failed' % word

    def add2redis(self, word):
        try:
            # Expand the suggestion into new suggest-API urls and queue them.
            s = format.extract(word)
            for w in s:
                tmp = suggest_url % w
                self.url_list.push(tmp)
        except:
            print 'put to redis error, word:%s' % word
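# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the key-quoting step used in
# Parse_json.parse above. The suggest-API payload below is invented for
# illustration; only the regex and the json.loads call are taken from the
# class. Note that the lookbehind also quotes bare numbers that follow a
# comma, so the counts arrive as strings.
# ---------------------------------------------------------------------------
import re
import json

raw = '{q:"aspirin",result:[["aspirin enteric tablets",120],["aspirin cream",45]]}'
fixed = re.compile(r'(?<=[{,])\w+').sub(r'"\g<0>"', raw)
data = json.loads(fixed)
for word, count in data['result']:
    print word, count   # -> aspirin enteric tablets 120 / aspirin cream 45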
def Rules(self):
    # linkbase: redis db 2 holds the url queue, the crawled set and the
    # category -> link mapping.
    linkbase = getRedis(2)
    db = BaseDb()
    db.connectdb()
    db.getAllCategorys()
    category_links = Categoryids(linkbase)
    url_list = DQueue(linkbase, 'url_news')
    for store in drugstoreurl:
        url_set = Record(linkbase, store)
        # Seed the queue with one search url per category when it is empty.
        if url_list.len() == 0:
            for item in base.category_ids:
                if store == 'http://search.jianke.com/prod':
                    url = store + '?wd=' + item['name'] + '&catagoryid=' + str(item['id'])
                elif store == 'http://www.jxdyf.com/search':
                    url = store + '/' + item['name'] + '.html?catagoryid=' + str(item['id'])
                elif store == 'http://search.360kad.com':
                    url = store + '?pageText=' + item['name'] + '&catagoryid=' + str(item['id'])
                elif store == 'http://www.ehaoyao.com/search':
                    url = store + '/search/' + item['name'] + '?catagoryid=' + str(item['id'])
                elif store == 'http://www.yaofang.cn/n/public/search':
                    url = store + '?s_words=' + item['name'] + '&sort=interrelated&catagoryid=' + str(item['id'])
                url_list.push(url)
    base.url_maps = get_Maps()
    # Hook a custom signal so the url maps can be reloaded at runtime.
    signal.signal(60, self.reload_handler)
    list = {
        'url': url_list,
        'url_set': url_set,
        'category_links': category_links
    }
    self.AddRules(list, 'Parse_url', 'url', 10)
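# ---------------------------------------------------------------------------
# A hedged illustration of what the seed urls produced by Rules() look like
# for a single category. The id/name pair is invented; the url shapes mirror
# the if/elif chain above (note the 'catagoryid' spelling, which
# Parse_url.geturlparam also expects).
# ---------------------------------------------------------------------------
item = {'id': 101, 'name': 'vitamin'}
print 'http://search.jianke.com/prod?wd=' + item['name'] + '&catagoryid=' + str(item['id'])
print 'http://www.jxdyf.com/search/' + item['name'] + '.html?catagoryid=' + str(item['id'])
print 'http://search.360kad.com?pageText=' + item['name'] + '&catagoryid=' + str(item['id'])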
class Parse_url(BaseParse):

    def __init__(self, list):
        self.parser = etree.HTMLParser(encoding='utf-8')
        self.db = BaseDb()
        self.db.connectdb()
        self.category_links = list['category_links']
        self.url_list = list['url']
        self.url_set = list['url_set']

    def parse(self, url, page, threadName):
        # Match extraction rules for this url.
        rule = self.findrules(url)
        if not rule:
            print 'Not found xpath:' + url
            self.db.exception(url, 1)  # no rule matched
            return False
        # Extract information with regular expressions.
        if 'page_re' in rule:
            self.extract_re(rule, url, page)
            return True
        # Build the DOM of the html page.
        try:
            self.html = etree.HTML(page, self.parser)
        except:
            print sys.exc_info()[0], url
            return False
        self.crawled = 'crawled_set'
        if 'crawled' in rule:
            self.crawled = rule['crawled']
        # Extract information with xpath.
        if 'link_xpath' in rule:
            self.extract_link(rule, url)
        if 'page_xpath' in rule:
            if not self.extract_page_xpath(rule, url):
                return True
        if 'extra' in rule:
            self.doextra(rule, url)
        self.url_set.crawled(url, self.crawled)
        return True

    def doextra(self, rule, url):
        try:
            tmp = eval(rule['extra'])
            tmp += '&table=%s&url=%s' % (rule['table'], url)
            self.url_list.lpush(tmp)
        except:
            return None

    def geturlparam(self, url, param):
        result = urlparse.urlparse(url)
        return urlparse.parse_qs(result.query, True)[param][0]

    def extract_link(self, rule, url):
        categoryid = self.geturlparam(url, 'catagoryid')
        pre_url = ''
        if 'pre_url' in rule:
            pre_url = rule['pre_url']
        for lx in rule['link_xpath']:
            urls = self.html.xpath(lx)
            for i in urls:
                # Normalize the link and save it to the queue of linkbase.
                if i[0] == '/':
                    tmp = pre_url + i
                elif i.find('http://') == 0:
                    tmp = i
                else:
                    tmp = pre_url + '/' + i
                if not self.url_set.exist(tmp, self.crawled):
                    print 'push into url_list:' + tmp
                    self.url_list.push(tmp)
                    self.category_links.set(categoryid, tmp)
                    self.url_set.insert(tmp, self.crawled)

    def extract_page_xpath(self, rule, url):
        self.item.clear()
        self.item['url'] = url
        try:
            for k, v in rule['page_xpath'].items():
                self.add_xpath2(k, v)
            if self.item['name'] == '0':
                if 'page_xpath2' in rule:
                    # Fall back to the second set of xpaths.
                    self.item.clear()
                    self.item['url'] = url
                    for k, v in rule['page_xpath2'].items():
                        self.add_xpath2(k, v)
                    if self.item['name'] == '0':
                        self.db.exception(url, 2)
                        return False
                else:
                    self.db.exception(url, 2)
                    return False
        except:
            info = sys.exc_info()
            print info[0], ":---", info[1]
            self.db.exception(url, 2)
            return False
        # Save the information into mysql.
        print self.category_links.get(url)
        print url
        if 'type' in rule:
            if rule['type'] == 'product':
                self.item['drugstore'] = rule['drugstore']
                self.item['category_id'] = self.category_links.get(url)
                self.item['store'] = rule['store']
                self.item['price'] = self.search(r'\d+(\.\d*)?', str(self.item['price']))
                self.item['original_price'] = self.search(r'\d+(\.\d*)?', str(self.item['original_price']))
                print self.item
        if self.url_set.isnewpage(url, self.crawled):  # insert
            sql = self.db.sql_insert(rule['table'], self.item)
            if self.db.execsql(sql):
                return True
            else:
                self.db.exception(url, 3)
                return False
        else:  # update
            sql = self.db.sql_update(rule['table'], self.item)
            if self.db.execsql(sql):
                return True
            else:
                self.db.exception(url, 4)
                return False

    def extract_re(self, rule, url, page):
        self.item.clear()
        try:
            self.item['url'] = self.geturlparam(url, 'url')
            table = self.geturlparam(url, 'table')
        except:
            self.db.exception(url, 5)
            return False
        for k, v in rule['page_re'].items():
            self.item[k] = self.search(v, page)
        sql = self.db.sql_update(table, self.item)
        if not self.db.execsql(sql):
            self.db.exception(url, 5)

    def search(self, pe, str):
        result = re.search(pe, str)
        if result:
            return result.group()
        else:
            return None
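# ---------------------------------------------------------------------------
# A hedged illustration of the rule structure that findrules(url) is assumed
# to return for Parse_url.parse above. The concrete xpaths, table name and
# store values are made up; only the key names come from the code that reads
# them (pre_url, crawled, link_xpath, page_xpath, table, type, drugstore,
# store). The lxml check at the end runs the page_xpath part on a tiny
# invented page, independent of the crawler classes.
# ---------------------------------------------------------------------------
from lxml import etree

sample_rule = {
    'pre_url':    'http://search.jianke.com',            # prefix for relative links
    'crawled':    'crawled_set',                         # redis set used for dedup
    'link_xpath': ["//a[@class='product-link']/@href"],  # follow-up links to queue
    'page_xpath': {                                      # field -> xpath on detail pages
        'name':  "//h1[@class='product-name']/text()",
        'price': "//span[@class='price']/text()",
    },
    'table':      'products',                            # mysql table for sql_insert/sql_update
    'type':       'product',
    'drugstore':  'jianke',
    'store':      'http://search.jianke.com/prod',
}

page = ("<html><body><h1 class='product-name'>Aspirin</h1>"
        "<span class='price'>12.50</span></body></html>")
doc = etree.HTML(page)
for field, xp in sample_rule['page_xpath'].items():
    print field, ':', doc.xpath(xp)   # -> name : ['Aspirin'] / price : ['12.50']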
class Parse_url(BaseParse):

    def __init__(self, list):
        self.parser = etree.HTMLParser(encoding='utf-8')
        self.db = BaseDb()
        self.db.connectdb()
        self.url_list = list['url']
        self.url_set = list['url_set']

    def parse(self, url, page, threadName):
        # Match extraction rules
        rule = self.findrules(url)
        if not rule:
            print 'Not found xpath:' + url
            self.db.exception(url, 1)  # no rule matched
            return False
        # Extract information with regular expressions
        if 'page_re' in rule:
            self.extract_re(rule, url, page)
            return True
        # Make DOM of html page
        try:
            self.html = etree.HTML(page, self.parser)
        except:
            print sys.exc_info()[0], url
            return False
        self.crawled = 'crawled_set'
        if 'crawled' in rule:
            self.crawled = rule['crawled']
        # Extract information with xpath
        if 'link_xpath' in rule:
            self.extract_link(rule)
        if 'page_xpath' in rule:
            if not self.extract_page_xpath(rule, url):
                return True
        if 'extra' in rule:
            self.doextra(rule, url)
        self.url_set.crawled(url, self.crawled)
        return True

    def doextra(self, rule, url):
        try:
            tmp = eval(rule['extra'])
            tmp += '&table=%s&url=%s' % (rule['table'], url)
            self.url_list.lpush(tmp)
        except:
            return None

    def geturlparam(self, url, param):
        result = urlparse.urlparse(url)
        return urlparse.parse_qs(result.query, True)[param][0]

    def extract_link(self, rule):
        pre_url = ''
        if 'pre_url' in rule:
            pre_url = rule['pre_url']
        for lx in rule['link_xpath']:
            urls = self.html.xpath(lx)
            for i in urls:
                # Save the url to the queue of linkbase
                if i[0] == '/':
                    tmp = pre_url + i
                elif i.find('http://') == 0:
                    tmp = i
                else:
                    tmp = pre_url + '/' + i
                if not self.url_set.exist(tmp, self.crawled):
                    print 'push into url_list:' + tmp
                    self.url_list.push(tmp)
                    self.url_set.insert(tmp, self.crawled)

    def extract_page_xpath(self, rule, url):
        self.item.clear()
        self.item['url'] = url
        try:
            for k, v in rule['page_xpath'].items():
                self.add_xpath2(k, v)
            if self.item['name'] == '0':
                if 'page_xpath2' in rule:
                    self.item.clear()
                    self.item['url'] = url
                    for k, v in rule['page_xpath2'].items():
                        self.add_xpath2(k, v)
                    if self.item['name'] == '0':
                        self.db.exception(url, 2)
                        return False
                else:
                    self.db.exception(url, 2)
                    return False
        except:
            info = sys.exc_info()
            print info[0], ":---", info[1]
            self.db.exception(url, 2)
            return False
        # Save the information into mysql
        if self.url_set.isnewpage(url, self.crawled):  # insert
            sql = self.db.sql_insert(rule['table'], self.item)
            if self.db.execsql(sql):
                return True
            else:
                self.db.exception(url, 3)
                return False
        else:  # update
            sql = self.db.sql_update(rule['table'], self.item)
            if self.db.execsql(sql):
                return True
            else:
                self.db.exception(url, 4)
                return False

    def extract_re(self, rule, url, page):
        self.item.clear()
        try:
            self.item['url'] = self.geturlparam(url, 'url')
            table = self.geturlparam(url, 'table')
        except:
            self.db.exception(url, 5)
            return False
        for k, v in rule['page_re'].items():
            self.item[k] = self.search(v, page)
        sql = self.db.sql_update(table, self.item)
        if not self.db.execsql(sql):
            self.db.exception(url, 5)

    def search(self, pe, str):
        result = re.search(pe, str)
        if result:
            return result.group(1)
        else:
            return None
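# ---------------------------------------------------------------------------
# A small, standalone sketch of the link normalization done inside
# extract_link above: root-relative links get the rule's pre_url prefix,
# absolute http:// links pass through unchanged, and anything else is treated
# as relative to pre_url. The example hrefs are invented.
# ---------------------------------------------------------------------------
def normalize_link(pre_url, href):
    if href[0] == '/':
        return pre_url + href
    elif href.find('http://') == 0:
        return href
    else:
        return pre_url + '/' + href

for href in ['/product/123.html', 'http://www.jxdyf.com/product/456.html', 'product/789.html']:
    print normalize_link('http://www.jxdyf.com', href)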