def Get_URL_SRC(self,URL_Tuple,Flag,que): #得到一个页面的源码,que,Flag if Flag == 2: Url = urllib2.Request(url=URL_Tuple,headers=self.__headers__) Mid_SRC = urllib2.urlopen(Url).read() return Mid_SRC else: if re.search('.*(png|jpg)$', URL_Tuple,re.MULTILINE): print u'不是网页,获取不到源码\n',URL_Tuple else: try: Url = urllib2.Request(url=URL_Tuple,headers=self.__headers__) Mid_SRC = urllib2.urlopen(Url).read() Url_SRC = re.search('<html lang="zh_CN">(.*?)在周四更新</p>', Mid_SRC, re.S).group(1) # Url_dict = {URL_Tuple:Url_SRC} if Flag == 1: Data = Url_SRC.replace('\'', '').replace('\n', '').replace('\r', '') mmy = My_Save('xiaozhang') mmy.command("insert into Content(content) values('%s')"%Data) # if que.put(Url_SRC): # print u'failed' # print URL_Tuple else: return Url_SRC except urllib2.HTTPError,e: print '\n_________________________________\n',e,u'\n丢弃' except urllib2.URLError,e: print u'\n打开失败,稍后重新获取\n' except AttributeError,e: print u'\n过滤',URL_Tuple,'\n'
def Find_exist(value,FF):
    """Queue value in adddate when the url table has no row for it.

    value: URL string interpolated into both SQL statements.
    FF: running counter supplied by the caller.

    Returns 0 when the lookup in ``url`` yields a truthy result; otherwise
    inserts value into ``adddate`` and returns FF incremented by one.
    """
    db = My_Save('xiaozhang')
    found = db.command("select Flag from url where url = '%s'" % value, type='return')
    if found:
        return 0

    # A new My_Save is constructed before each command, as the original does.
    db = My_Save('xiaozhang')
    db.command("insert into adddate(url) values('%s')" % value)
    FF += 1
    return FF
def Find_Key_value(Key):
    """Copy adddate URLs containing the URL-encoded keyword into temp.

    Key: the search keyword; it is URL-encoded via urllib.urlencode and the
         encoded value is used inside a SQL LIKE '%...%' filter.

    Returns a unicode message built from the result of
    ``select count(*) from temp``.
    """
    db = My_Save('xiaozhang')
    # Bare print statement: emits an empty line before the work starts.
    print
    encoded_pair = urllib.urlencode({'w': Key})
    # Keep only what follows the '=' of the encoded "w=<value>" pair.
    keyword = re.search('=(.*?)$', encoded_pair).group(1)
    copy_sql = "insert into xiaozhang.temp(url) select url from adddate where url like '%%%s%%'" % keyword
    db.command(copy_sql, type='save')

    # A new My_Save is constructed before the second command, as the original does.
    db = My_Save('xiaozhang')
    total = db.command('select count(*) from temp', type='return')
    return u'查找到%s' % total
def Load_Data(Key,Table):
    """Run ``select <Key> from <Table>`` and return whatever the command yields.

    Key: column expression interpolated into the SELECT list.
    Table: table name interpolated into the FROM clause.
    """
    db = My_Save('xiaozhang')
    query = "select %s from %s" % (Key, Table)
    return db.command(query, type='return')
# Interactive entry point: asks whether to update the stored URLs, whether to
# search by keyword, and whether to analyse page sources, then calls
# Down_load() and truncates the Content, temp and adddate tables.
# NOTE(review): this block arrived with its line breaks and indentation
# collapsed; the nesting below is the most plausible reconstruction -- confirm
# against the original file.
if __name__ == '__main__':
    # Module-level working state. Tuple_All, Que2, ttt, FF and U1 are not
    # referenced again inside this block; presumably other functions read
    # them as globals -- verify before removing any of them.
    Tuple_All = []
    URL_Tuple = []
    U1 = Analyse(Url)
    Que1 = Queue()
    Que2 = Queue()
    ttt = []
    FF = 0
    # print U1.Get_URL_SRC(urls, 0, Que1)
    choice = raw_input(u'检查更新y/n:') # update the links stored in the database
    if choice == 'y':
        Update_Url(URL_Tuple)
    choice = raw_input(u'查找y/n:')
    if choice =='y':
        # The returned message is kept in Value but not used later in this block.
        Value = Find_Key_value(raw_input(u'输入关键字:'))
    choice = raw_input(u'是否开始分析源码y/n:') # take the links from the database
    if choice == 'y':
        Many_process_analyse_web(URL_Tuple,Que1)
        # Drain the queue: strip single quotes and CR/LF from each item, then
        # insert it into Content using a new My_Save per row.
        while not Que1.empty():
            Data = Que1.get().replace('\'', '').replace('\n', '').replace('\r', '')
            mmy = My_Save('xiaozhang')
            mmy.command("insert into Content(content) values('%s')"%Data)
    # NOTE(review): the answer to this prompt is stored but never tested, so
    # Down_load() and the truncations below run whatever the user types --
    # confirm this is intended.
    choice = raw_input(u'获取数据y/n:')
    Down_load()
    # Clear the working tables; a new My_Save is constructed for each statement.
    M = My_Save('xiaozhang')
    M.command('truncate table Content;',type='save')
    M = My_Save('xiaozhang')
    M.command('truncate table temp;',type='save')
    M = My_Save('xiaozhang')
    M.command('truncate table adddate;',type='save')