def analyseurl(urls):
    """Fetch a page, archive it if it matches the keyword, and list its links.

    The page at ``urls`` is downloaded and, when a charset is declared in a
    ``content="text/html;charset=..."`` attribute, decoded with it. If the
    module-level ``keyinsys`` pattern is found in the page, one row
    ``(urls, key, htmls)`` is inserted into the ``keyofhtml`` table of the
    sqlite database at ``options.dbfile``. The table is created on demand.
    Archiving is best-effort: a failure is logged and link extraction still
    runs.

    Args:
        urls: Address of the page to fetch (a single URL despite the name).

    Returns:
        A list of dicts ``{'name': link_text, 'urls': href}``, one per
        ``<a>`` tag whose href starts with ``http``. Links without usable
        text get the name ``'NoName'``.

    Raises:
        Whatever ``urllib2.urlopen`` / ``read`` raise on network failure;
        download errors are not swallowed.
    """
    returns = []
    print(urls)

    html = urllib2.urlopen(urls, timeout=50)
    try:
        # Read outside the archive step so ``data`` is always bound before
        # the link extraction below.
        data = html.read()
    finally:
        html.close()

    # Accept hyphenated charset names (utf-8, iso-8859-1), optional
    # whitespace after the semicolon, and any letter case.
    rr = re.compile(r"""content=["']text/html;\s*charset=([\w-]+)["']""", re.I)
    m = rr.search(data)
    if m:
        try:
            data = data.decode(m.group(1))
        except (LookupError, UnicodeError):
            # Unknown or wrong charset: keep the raw bytes and carry on.
            logging2.error('reading ' + urls + ' decode error')

    try:
        conn = sqlite3.connect(options.dbfile)
        try:
            cor = conn.cursor()
            cor.execute('create table if not exists keyofhtml( id integer primary key,urls text,key text,htmls text)')
            if re.compile(keyinsys).search(data):
                # Bound parameters need no quote mangling, so the stored
                # HTML stays intact. Byte strings are turned into text first
                # because sqlite3 rejects non-ASCII byte strings as
                # parameters on Python 2.
                row = [
                    value.decode('utf-8', 'replace')
                    if isinstance(value, bytes) else value
                    for value in (urls, keyinsys, data)
                ]
                cor.execute(
                    'insert into keyofhtml(urls,key,htmls) values(?,?,?)',
                    row)
                conn.commit()
        finally:
            # Release the connection on the error path as well.
            conn.close()
        logging2.debug('reading ' + urls)
    except Exception as exc:
        # Best-effort archive: report the cause and continue with the links.
        print('error')
        logging2.error('error ong reading ' + urls + ': ' + repr(exc))

    soup = BeautifulSoup.BeautifulSoup(data)
    temp = soup.findAll('a', href=re.compile(r'http.*'))
    logging2.debug('analysing ' + urls)
    for tt in temp:
        hrefs = tt['href']
        # The regex above can match "http" anywhere in the href; keep only
        # absolute links.
        if hrefs.startswith('http'):
            returns.append({'name': tt.string or 'NoName', 'urls': hrefs})
    return returns
def analyseurl(urls):
    """Fetch a page, archive it if it matches the keyword, and list its links.

    The page at ``urls`` is downloaded and, when a charset is declared in a
    ``content="text/html;charset=..."`` attribute, decoded with it. If the
    module-level ``keyinsys`` pattern is found in the page, one row
    ``(urls, key, htmls)`` is inserted into the ``keyofhtml`` table of the
    sqlite database at ``options.dbfile``. The table is created on demand.
    Archiving is best-effort: a failure is logged and link extraction still
    runs.

    Args:
        urls: Address of the page to fetch (a single URL despite the name).

    Returns:
        A list of dicts ``{'name': link_text, 'urls': href}``, one per
        ``<a>`` tag whose href starts with ``http``. Links without usable
        text get the name ``'NoName'``.

    Raises:
        Whatever ``urllib2.urlopen`` / ``read`` raise on network failure;
        download errors are not swallowed.
    """
    returns = []
    print(urls)

    html = urllib2.urlopen(urls, timeout=50)
    try:
        # Read outside the archive step so ``data`` is always bound before
        # the link extraction below.
        data = html.read()
    finally:
        html.close()

    # Accept hyphenated charset names (utf-8, iso-8859-1), optional
    # whitespace after the semicolon, and any letter case.
    rr = re.compile(r"""content=["']text/html;\s*charset=([\w-]+)["']""", re.I)
    m = rr.search(data)
    if m:
        try:
            data = data.decode(m.group(1))
        except (LookupError, UnicodeError):
            # Unknown or wrong charset: keep the raw bytes and carry on.
            logging2.error('reading ' + urls + ' decode error')

    try:
        conn = sqlite3.connect(options.dbfile)
        try:
            cor = conn.cursor()
            cor.execute('create table if not exists keyofhtml( id integer primary key,urls text,key text,htmls text)')
            if re.compile(keyinsys).search(data):
                # Bound parameters need no quote mangling, so the stored
                # HTML stays intact. Byte strings are turned into text first
                # because sqlite3 rejects non-ASCII byte strings as
                # parameters on Python 2.
                row = [
                    value.decode('utf-8', 'replace')
                    if isinstance(value, bytes) else value
                    for value in (urls, keyinsys, data)
                ]
                cor.execute(
                    'insert into keyofhtml(urls,key,htmls) values(?,?,?)',
                    row)
                conn.commit()
        finally:
            # Release the connection on the error path as well.
            conn.close()
        logging2.debug('reading ' + urls)
    except Exception as exc:
        # Best-effort archive: report the cause and continue with the links.
        print('error')
        logging2.error('error ong reading ' + urls + ': ' + repr(exc))

    soup = BeautifulSoup.BeautifulSoup(data)
    temp = soup.findAll('a', href=re.compile(r'http.*'))
    logging2.debug('analysing ' + urls)
    for tt in temp:
        hrefs = tt['href']
        # The regex above can match "http" anywhere in the href; keep only
        # absolute links.
        if hrefs.startswith('http'):
            returns.append({'name': tt.string or 'NoName', 'urls': hrefs})
    return returns
data = data.replace("'",'"')#纠结的单引号怎么处理? sqls = "insert into keyofhtml(urls,key,htmls) values('%s','%s','%s')" try: cor.execute(sqls%(urls,keyinsys,data)) except UnicodeDecodeError,e: #print e cor.execute(sqls%(urls,keyinsys,'decode error')) logging2.error('reading '+urls+' decode error') conn.commit() #print 'donessss' conn.close() logging2.debug('reading '+urls) logging2.info('what should i write here') logging2.warning('a warning here') logging2.error('a error test here') logging2.critical('what is a critical??') #print 'reading' #except: #print 'error' #logging2.error('error ong reading '+urls) return returns def main(): i = 0 th = threading2.ThreadPool(workQueue,resultQueue,options.number) td = threading2.MyThread2(workQueue,resultQueue,i,10)#屏幕打印进程 while i <= options.deep:#层次循环 if i == 0:
data = data.replace("'", '"') #纠结的单引号怎么处理? sqls = "insert into keyofhtml(urls,key,htmls) values('%s','%s','%s')" try: cor.execute(sqls % (urls, keyinsys, data)) except UnicodeDecodeError, e: #print e cor.execute(sqls % (urls, keyinsys, 'decode error')) logging2.error('reading ' + urls + ' decode error') conn.commit() #print 'donessss' conn.close() logging2.debug('reading ' + urls) logging2.info('what should i write here') logging2.warning('a warning here') logging2.error('a error test here') logging2.critical('what is a critical??') return returns def main(): """ 执行入口,层次判断,任务转移. >>> main() 时间 深度 当前完成 待完成 """ i = 0 th = threading2.ThreadPool(workQueue, resultQueue, options.number) td = threading2.MyThread2(workQueue, resultQueue, i, 10) #屏幕打印进程