Пример #1
0
def main():
    """Capture URLs for a fixed search keyword and log them with their hosts.

    Passes a hard-coded ``inurl:`` query to ``capterURL.capture``, hands the
    result to ``capterURL.getAllDomainName``, and writes the string form of
    both results to the log at INFO level. Returns nothing.
    """
    searchKeyword = "inurl:/index.php/module/action/param1/"

    # Keyword -> URLs, then URLs -> hosts.
    capturedUrls = capterURL.capture(searchKeyword)
    hostNames = capterURL.getAllDomainName(capturedUrls)

    # URLs are logged first, hosts second.
    for collected in (capturedUrls, hostNames):
        log.log(str(collected), log.INFO)
Пример #2
0
def captureUrlMultThread(keyword,start,end,step,Type=BAIDU):
    """Run ``capterUrl`` once per index of ``range(start, end, step)``, one thread each.

    The module-level ``realUrlSet`` is rebound to a fresh empty set before
    any worker is created. All workers are created first, then all are
    started, then all are joined. The elapsed wall-clock time is logged at
    INFO level.

    Args:
        keyword: Passed unchanged to every ``capterUrl`` call.
        start, end, step: Arguments of the ``range`` that yields the indexes.
        Type: Passed unchanged to every ``capterUrl`` call.

    Returns:
        The module-level ``realUrlSet`` as it stands once every thread has
        been joined.
    """
    global realUrlSet
    realUrlSet = set()

    startedAt = time.time()
    workers = [
        threading.Thread(target=capterUrl, args=(keyword, offset, Type))
        for offset in range(start, end, step)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    elapsed = time.time() - startedAt

    log.log("%fs used to capter url." % (elapsed), log.INFO)
    return realUrlSet
Пример #3
0
def getUnRedirectUrl(url):
    """Open *url* and return the URL of the resource that was actually retrieved.

    The request is sent with browser-like headers through a default
    ``urllib2`` opener and a 5 second timeout. The ``url`` attribute of the
    response is returned; with urllib2's default handlers this is the final
    URL after any redirects were followed.

    Args:
        url: The address to open.

    Returns:
        The response's ``url`` on success, or ``None`` when opening the
        request (or closing the response) raised any ``Exception``; the
        failure is logged at DEBUG level.
    """
    # NOTE(review): this assigns a class attribute, so it applies to every
    # HTTPConnection in the process, not just this request - confirm it is
    # still wanted outside of debugging.
    httplib.HTTPConnection.debuglevel = 1
    send_headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection':'keep-alive'
    }
    request = urllib2.Request(url,headers=send_headers)
    opener = urllib2.build_opener()
    try:
        response = opener.open(request,timeout=5)
        try:
            return response.url
        finally:
            # Only the URL is needed; close the response explicitly so the
            # underlying socket is released now rather than whenever the
            # object happens to be garbage-collected.
            response.close()
    except Exception as e:
        # Best-effort lookup: any failure is reported as "no URL".
        log.log("Url:%s Exception:%s" %(url, str(e)), log.DEBUG)
        return None
Пример #4
0
def capterHost(keyword,index,Type):
    """Fetch one search page for *keyword* and log how many entries matched.

    For ``BAIDU`` the page returned by ``baiduSearch`` is decoded as UTF-8
    (undecodable bytes ignored), matched with ``getList`` against the host
    pattern, and the number of matches is logged at VERBOSE level. For
    ``GOOGLE`` only a "not implemented" message is logged. A start and an
    end message are logged at INFO level in every case. Returns nothing.

    Args:
        keyword: Search keyword; concatenated into the log messages.
        index: Offset passed to ``baiduSearch``; the end message reports
            ``index + 10``.
        Type: Search engine selector, compared against ``BAIDU`` / ``GOOGLE``.
    """
    keywordLabel = "keyword: " + keyword
    log.log(keywordLabel + " start: " + str(index), log.INFO)

    if Type == BAIDU:
        page = unicode(baiduSearch(keyword,index), 'utf-8','ignore')
        # Placeholder pattern: the text reads "a regular expression that can
        # extract the Host" and has not been replaced by a real regex.
        hostPattern = u"能够取得Host的正则表达式"
        matches = getList(hostPattern, page)
        log.log("There are %d scores." % len(matches), log.VERBOSE)
    elif Type == GOOGLE:
        log.log("google no realize", log.VERBOSE)

    log.log(keywordLabel + " end: " + str(index+10) , log.INFO)
Пример #5
0
def capterUrl(keyword,index,Type=BAIDU):
    """Fetch one search page for *keyword* and add the resolved result URLs to ``realUrlSet``.

    For ``BAIDU`` the page returned by ``baiduSearch`` is decoded as UTF-8
    (undecodable bytes ignored) and split into result blocks with
    ``getList``. For each block the ``href`` value is extracted, resolved
    through ``getUnRedirectUrl`` and, when that call did not return ``None``,
    added to the module-level ``realUrlSet``. For ``GOOGLE`` only a
    "not implemented" message is logged. Returns nothing.

    Args:
        keyword: Search keyword; concatenated into the log messages.
        index: Offset passed to ``baiduSearch``; the end message reports
            ``index + 10``.
        Type: Search engine selector, compared against ``BAIDU`` / ``GOOGLE``.
    """
    global realUrlSet
    log.log("keyword: " + keyword + " start: " + str(index), log.INFO)
    if Type == BAIDU:
        html = baiduSearch(keyword,index)
        content = unicode(html, 'utf-8','ignore')
        # The patterns do not change per result, so define them once up front.
        resultRegex = u"<div class=\"result c-container[\s\S]*?<\/a>"
        hrefRegex = u"href = \"[\s\S]*?\""
        idRegex = u"id=\"[\s\S]*?\""
        arrList = getList(resultRegex, content)
        log.log("There are %d scores." %(len(arrList)), log.VERBOSE)
        for item in arrList:
            # Drop the leading 'href = "' (8 characters) and the closing quote.
            # NOTE(review): assumes getMatch always returns a sliceable match
            # here - confirm what it returns when the pattern is not found.
            url = getMatch(hrefRegex,item)[8:-1]
            realUrl = getUnRedirectUrl(url)
            resultId = getMatch(idRegex,item)
            # getUnRedirectUrl returns None on failure; only keep real URLs.
            if realUrl is not None:
                realUrlSet.add(realUrl)
            log.log("%s %s %s" % (resultId ,realUrl, url), log.VERBOSE)
    elif Type == GOOGLE:
        log.log("google no realize", log.VERBOSE)
    log.log("keyword: " + keyword + " end: " + str(index+10) , log.INFO)
Пример #6
0
    # Status information for this run, to be written to the log table.
    # Every value that is not a literal (partition_value, __test_case_dict,
    # target_db_, target_table_, utime, end_time) is bound before this
    # fragment, outside the visible code.
    status_dict_ = {
        'DATE_ID': partition_value,
        'QA_ID': __test_case_dict['test_case_id'],
        'PROC_FILE': os.path.basename(__file__),
        'TARGET_DB': target_db_.upper(),
        'TARGET_TABLE': target_table_.upper(),
        'STEP_STATUS': '0',
        'START_TIME': utime,
        'END_TIME': end_time,
        'DURING': '0',
        'OUTPUT': 'initialize success!',
        'CREATE_BY': 'FU1098',
        'CREATE_TIME': end_time
    }

    # Insert the initial status record into the log table.
    mysql_log.insert(log_table, status_dict_, 'insert')

    log('<===qa00001001.py begin at %s===>' % (utime))

    # Run the job. On any failure, record the error details and insert a
    # second status record with STEP_STATUS '-1'; the exception is not
    # re-raised, so execution continues with the end-of-run logging below.
    # NOTE(review): a bare except also catches SystemExit and
    # KeyboardInterrupt - confirm that is intended.
    # NOTE(review): end_time and dur_time are read in the handler, but
    # get_time(utime) is only called after the try/except. end_time still
    # holds the value used when the dict was built, and dur_time is not
    # assigned anywhere in this visible fragment - confirm both are bound
    # earlier with the intended values.
    try:
        main()
    except:
        err_detail(log_file)
        status_dict_['STEP_STATUS'], status_dict_['OUTPUT'], status_dict_['END_TIME'], status_dict_['DURING'] = \
            '-1', "ERROR: message in %s " % (log_file), end_time, dur_time
        mysql_log.insert(log_table, status_dict_, 'insert')

    # Compute the end time and duration from the start time, then log the end marker.
    end_time, dur_time = get_time(utime)
    log('<===qa00001001.py end at %s===>' % (end_time))