def main():
    """Entry point: run a dork search, derive host names, and log both sets."""
    # Turn the search keyword (a URL dork) into the set of real result URLs.
    urls = capterURL.capture("inurl:/index.php/module/action/param1/")
    # Reduce the captured URLs to their domain names.
    hosts = capterURL.getAllDomainName(urls)
    log.log(str(urls), log.INFO)
    log.log(str(hosts), log.INFO)
def captureUrlMultThread(keyword, start, end, step, Type=BAIDU):
    """Capture search-result URLs concurrently, one thread per page index.

    Each thread runs capterUrl(keyword, index, Type), which adds resolved
    URLs into the shared module-level ``realUrlSet`` (set.add is atomic
    under the GIL, so no explicit lock is used here).

    :param keyword: search keyword/dork passed through to capterUrl
    :param start: first page index (inclusive)
    :param end: last page index (exclusive)
    :param step: index stride between pages
    :param Type: search engine selector (defaults to BAIDU)
    :return: the populated ``realUrlSet``
    """
    global realUrlSet
    realUrlSet = set()  # reset shared accumulator for this run
    threadpool = []
    startTime = time.time()
    for index in range(start, end, step):
        th = threading.Thread(target=capterUrl, args=(keyword, index, Type))
        threadpool.append(th)
    for th in threadpool:
        th.start()
    # Fix: join each instance with th.join() instead of the unbound-method
    # form threading.Thread.join(th), which is unidiomatic and fails on
    # Python 3 (unbound methods no longer exist).
    for th in threadpool:
        th.join()
    endTime = time.time()
    log.log("%fs used to capter url." % (endTime - startTime), log.INFO)
    return realUrlSet
def getUnRedirectUrl(url):
    """Resolve *url* through any HTTP redirects and return the final URL.

    Returns None when the request fails (timeout, connection error, ...);
    the failure is logged at DEBUG level rather than raised.
    """
    # Enable low-level HTTP debug output for the connection.
    httplib.HTTPConnection.debuglevel = 1
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
    }
    req = urllib2.Request(url, headers=headers)
    try:
        resp = urllib2.build_opener().open(req, timeout=5)
    except Exception as e:
        log.log("Url:%s Exception:%s" % (url, str(e)), log.DEBUG)
        return None
    # resp.url is the post-redirect address.
    return resp.url
def capterHost(keyword, index, Type):
    """Fetch one search-result page and count host matches (incomplete stub).

    Fetches the Baidu result page for *keyword* at page *index*, decodes it,
    and runs a host-extraction regex over it; only the match count is logged.
    Nothing is returned or accumulated yet.

    Fixes vs. original: removed the unused local ``realUrlList``, the unused
    ``global realUrlSet`` declaration, and a redundant trailing ``pass``.

    :param keyword: search keyword/dork
    :param index: result-page index (Baidu pages step by 10)
    :param Type: search engine selector (BAIDU or GOOGLE)
    """
    log.log("keyword: " + keyword + " start: " + str(index), log.INFO)
    if Type == BAIDU:
        html = baiduSearch(keyword, index)
        content = unicode(html, 'utf-8', 'ignore')
        # NOTE(review): this pattern is a placeholder (Chinese for "a regex
        # that can extract the Host") — host extraction is not implemented.
        regex = u"能够取得Host的正则表达式"
        arrList = getList(regex, content)
        log.log("There are %d scores." % (len(arrList)), log.VERBOSE)
    elif Type == GOOGLE:
        # Google support is not implemented yet.
        log.log("google no realize", log.VERBOSE)
    log.log("keyword: " + keyword + " end: " + str(index + 10), log.INFO)
def capterUrl(keyword, index, Type=BAIDU):
    """Scrape one Baidu result page and add de-redirected URLs to realUrlSet.

    For each ``result c-container`` block on the page: pull the redirect
    href, resolve it via getUnRedirectUrl(), and add the real URL to the
    shared module-level ``realUrlSet``. Entries whose redirect could not be
    resolved (None) are skipped.

    Fixes vs. original: ``None != realUrl`` replaced with the idiomatic
    ``is not None``; local ``id`` renamed to ``resultId`` so it no longer
    shadows the builtin; removed ``realUrlList``, which was appended to but
    never read.

    :param keyword: search keyword/dork
    :param index: result-page index (Baidu pages step by 10)
    :param Type: search engine selector (defaults to BAIDU)
    """
    log.log("keyword: " + keyword + " start: " + str(index), log.INFO)
    global realUrlSet
    if Type == BAIDU:
        html = baiduSearch(keyword, index)
        content = unicode(html, 'utf-8', 'ignore')
        regex = u"<div class=\"result c-container[\s\S]*?<\/a>"
        arrList = getList(regex, content)
        log.log("There are %d scores." % (len(arrList)), log.VERBOSE)
        for item in arrList:
            # Strip the 'href = "' prefix and trailing quote from the match.
            regex = u"href = \"[\s\S]*?\""
            url = getMatch(regex, item)[8:-1]
            realUrl = getUnRedirectUrl(url)
            regex = u"id=\"[\s\S]*?\""
            resultId = getMatch(regex, item)
            if realUrl is not None:
                realUrlSet.add(realUrl)
                log.log("%s %s %s" % (resultId, realUrl, url), log.VERBOSE)
    elif Type == GOOGLE:
        # Google support is not implemented yet.
        log.log("google no realize", log.VERBOSE)
    log.log("keyword: " + keyword + " end: " + str(index + 10), log.INFO)
# 状态信息 写入日志 status_dict_ = { 'DATE_ID': partition_value, 'QA_ID': __test_case_dict['test_case_id'], 'PROC_FILE': os.path.basename(__file__), 'TARGET_DB': target_db_.upper(), 'TARGET_TABLE': target_table_.upper(), 'STEP_STATUS': '0', 'START_TIME': utime, 'END_TIME': end_time, 'DURING': '0', 'OUTPUT': 'initialize success!', 'CREATE_BY': 'FU1098', 'CREATE_TIME': end_time } # 插入日志 mysql_log.insert(log_table, status_dict_, 'insert') log('<===qa00001001.py begin at %s===>' % (utime)) try: main() except: err_detail(log_file) status_dict_['STEP_STATUS'], status_dict_['OUTPUT'], status_dict_['END_TIME'], status_dict_['DURING'] = \ '-1', "ERROR: message in %s " % (log_file), end_time, dur_time mysql_log.insert(log_table, status_dict_, 'insert') end_time, dur_time = get_time(utime) log('<===qa00001001.py end at %s===>' % (end_time))