Exemplo n.º 1
0
def doTask(task, param):
    """Crawl the URL of one task through randomly chosen proxies.

    Copies the task fields into ``param`` ('id' from task[0] as a string,
    'url' from task[1], 'ename' from task[2]) and calls ``rp.crawlGetUrl``
    up to three times, drawing a fresh random proxy from proxy.txt for every
    attempt.  An attempt is treated as successful when the returned mapping
    has a truthy 'logic' entry.  On success ``completeTask`` is called with
    task[3]; otherwise the record id is printed.  The connection is committed
    in both cases.

    Args:
        task: indexable record; positions 0 to 3 are read.
        param: dict carrying at least 'conn'; it is updated in place.

    Raises:
        ValueError: if proxy.txt yields no proxies.
    """
    proxyList = jTool.getProxy("proxy.txt")
    if not proxyList:
        # Picking a random proxy needs at least one entry; fail with a clear
        # message instead of an opaque "empty range" error.
        raise ValueError("no proxies available in proxy.txt")

    param["id"] = str(task[0])
    param["url"] = task[1]
    param["ename"] = task[2]

    conn = param["conn"]
    cursor = conn.cursor()
    try:
        # One initial attempt plus at most two retries, each on a new proxy.
        for _ in range(3):
            proxy = str(random.choice(proxyList)).strip()
            result = rp.crawlGetUrl(conn, param, proxy)
            if result["logic"]:
                break

        if result["logic"]:
            completeTask(param, task[3])
        else:
            print("error record:" + str(param["id"]))
        conn.commit()
    finally:
        # The cursor is not used by this function itself; release it even
        # when the crawl raises.
        cursor.close()
Exemplo n.º 2
0
def mainLoop(num, start, end):
    """Read tasks with ids between start and end and run doTask on each.

    Builds the parameter dict with ``makeParam``, selects id, url and
    enterprise_name from the item_url_task table for ``start <= id <= end``,
    picks one random proxy from proxy.txt for the whole run, and feeds the
    rows one at a time to ``doTask(task, param, proxy)``.  A truthy result
    leads to ``completeTask(param, id)``, a falsy one to
    ``rollbackTask(param, id)``.  The running count of handled rows is
    printed after every task.  The cursor and the connection are closed when
    the loop ends, including when it ends with an exception.

    Args:
        num: stored in param['num'] as a string.
        start: lowest task id to read.
        end: highest task id in the query.

    Raises:
        ValueError: if proxy.txt yields no proxies.
    """
    param = makeParam()
    param['taskTable'] = 'item_url_task'
    param['num'] = str(num)
    querySql = ('select id, url, enterprise_name from ' + param['taskTable']
                + ' where id >= ' + str(start) + ' and id <= ' + str(end))

    conn = param['conn']
    cursor = conn.cursor()
    count = 0
    try:
        cursor.execute(querySql)
        proxyList = jTool.getProxy('proxy.txt')
        if not proxyList:
            raise ValueError('no proxies available in proxy.txt')
        proxy = random.choice(proxyList)

        # NOTE(review): the query bounds are inclusive, but this loop runs at
        # most (end - start) times, so the last row of a gap-free id range is
        # never handled -- confirm whether that is intended.
        while start < end:
            record = cursor.fetchone()
            if record is None:
                # The result set can be shorter than the loop count (for
                # example when ids are missing); stop instead of crashing.
                break
            task = list(record)
            print('Get new task on ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
            result = doTask(task, param, proxy)
            count += 1
            print(count)
            if result:
                completeTask(param, task[0])
            else:
                rollbackTask(param, task[0])
            start += 1
    finally:
        cursor.close()
        conn.close()
Exemplo n.º 3
0
def doTask(task, param):
    """POST-crawl the page of one task through randomly chosen proxies.

    Copies the task fields into ``param`` ('id' from task[0] as a string,
    'url' as param['preUrlip'] + task[1], 'ename' from task[2], 'iid' from
    task[3]) and calls ``rp.crawlPostUrl`` up to three times, drawing a fresh
    random proxy from proxy.txt for every attempt.  An attempt is treated as
    successful when the returned mapping has a truthy 'logic' entry.

    When every attempt fails, the record id, url, enterprise name and the
    last proxy are attached to result['rtData'] and the record id is printed;
    this function does not persist that data anywhere.  The connection is
    committed in both cases.

    Args:
        task: indexable record; positions 0 to 3 are read.
        param: dict carrying at least 'conn' and 'preUrlip'; updated in place.

    Raises:
        ValueError: if proxy.txt yields no proxies.
    """
    proxyList = jTool.getProxy('proxy.txt')
    if not proxyList:
        # Picking a random proxy needs at least one entry; fail with a clear
        # message instead of an opaque "empty range" error.
        raise ValueError('no proxies available in proxy.txt')

    param['id'] = str(task[0])
    param['url'] = param['preUrlip'] + task[1]
    param['ename'] = task[2]
    param['iid'] = task[3]

    conn = param['conn']
    cursor = conn.cursor()
    try:
        # One initial attempt plus at most two retries, each on a new proxy.
        for _ in range(3):
            proxy = str(random.choice(proxyList)).strip()
            result = rp.crawlPostUrl(conn, param, proxy)
            if result['logic']:
                break

        if not result['logic']:
            failure = result['rtData']
            failure['eid'] = param['id']
            failure['url'] = param['url']
            failure['ename'] = param['ename']
            failure['proxy'] = proxy
            print('error record:' + str(param['id']))
        conn.commit()
    finally:
        # The cursor is not used by this function itself; release it even
        # when the crawl raises.
        cursor.close()
Exemplo n.º 4
0
def doTask(eid, url, ename, param):
    """Store the task fields in ``param`` and hand the crawl to rp.crawlUrl.

    Sets param['eid'] (as a string), param['ename'] and param['url']
    (param['preUrlip'] followed by ``url``), loads the proxy list from
    proxy.txt and returns whatever ``rp.crawlUrl`` returns.

    NOTE(review): the earlier description of this function said the URL is
    tried up to six times, the content is stored in the remote database, and
    True is returned on success, False otherwise.  None of that is visible
    here; it would have to happen inside rp.crawlUrl -- confirm there.
    """
    param.update(eid=str(eid), ename=ename)
    # The IP-based prefix is used for the request URL.
    target = param['preUrlip'] + url
    param['url'] = target
    proxies = jTool.getProxy('proxy.txt')
    outcome = rp.crawlUrl(param['conn'], param, proxies)
    return outcome
Exemplo n.º 5
0
def mainLoop(start, end):
    """Fetch page fields for every unprocessed base_page_list row in an id range.

    For each id from ``start`` to ``end`` (inclusive) the row with that id
    and status = 0 is looked up.  Ids without such a row are skipped.  For a
    found row, ``getPageField`` is called up to two times, each time with a
    freshly drawn random proxy from proxy.txt; as soon as it returns a truthy
    value the row's status is set to '1' through ``jTool.updateData``.  The
    connection is committed after every found row.  Both cursors and the
    connection are closed when the loop ends, including when it ends with an
    exception.

    Args:
        start: first id to look at.
        end: last id to look at.

    Raises:
        ValueError: if proxy.txt yields no proxies.
    """
    proxyList = jTool.getProxy('proxy.txt')
    if not proxyList:
        # Picking a random proxy needs at least one entry; fail with a clear
        # message instead of an opaque "empty range" error.
        raise ValueError('no proxies available in proxy.txt')

    # Request headers passed unchanged to getPageField.
    # NOTE(review): the list carries two 'Cookie' entries; which one takes
    # effect depends on how getPageField applies them -- confirm.
    head = ['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset:GBK,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding:gzip,deflate,sdch',
            'Accept-Language:zh-CN,zh;q=0.8',
            'Cache-Control:max-age=0',
            'Connection:keep-alive',
            'Cookie:ASP.NET_SessionId=t3isah45gu5kb4454qyxkhzy; lzstat_uv=6061202253430616218|2529639; lzstat_ss=953382219_1_1373621147_2529639; _gscu_374314293=7359234405sddy11; _gscs_374314293=73592344d74zfy11|pv:3; _gscbrs_374314293=1; ECStaticSession=ECS81',
            'Host:www.zjcredit.gov.cn:8000',
            'Pragma:no-cache',
            'Cookie:_gscu_374314293=73631708ff8h1y17; lzstat_uv=106813037832225946|2529639; ECStaticSession=ECS80; ASP.NET_SessionId=5dhxxl45gr4d0aexnf1uiu55; _gscbrs_374314293=1; lzstat_ss=815622537_1_1374448570_2529639; _gscs_374314293=t74419759zee6a318|pv:2',
            'Origin:http://www.zjcredit.gov.cn:8000',
            'Referer:http://www.zjcredit.gov.cn:8000/ListPrompts.aspx?sectionID=01&tableID=CourtNotCarryOut&associateID=00000000000000000&hasPromptHistroy=False',
            'User-Agent:Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36'
            ]

    conn = jTool.initCursor('localhost', 'root', 'root', 'rawData')
    cursor = conn.cursor()
    cursor2 = conn.cursor()
    try:
        while start <= end:
            sql = 'select *  from base_page_list where id = ' + str(start) + ' and status = 0 limit 1'
            cursor.execute(sql)
            record = cursor.fetchone()
            if record:
                corpName = record[1]
                rowID = record[3]
                print(corpName + ', ' + str(start))
                # At most two attempts, each through a newly drawn proxy.
                for _ in range(2):
                    proxy = str(random.choice(proxyList)).strip()
                    rt = getPageField(conn, proxy, head, str(start), rowID, corpName)
                    if rt:
                        print('id' + str(start) + ' ok')
                        jTool.updateData(cursor2, ' where id = ' + str(start) + ' ', 'base_page_list', {'status': '1'})
                        break
                conn.commit()
            start += 1
    finally:
        cursor.close()
        cursor2.close()
        conn.close()
Exemplo n.º 6
0
def operLog(logFileName):
    """Re-check the records named in a log file and report missing ones.

    Every line with more than two comma-separated fields is parsed as
    ``key:value`` pairs that are written into the parameter dict (a value
    that itself contains colons, such as a URL with a port, is rebuilt from
    its first three pieces).  For each of the tables enterprise_raw_0 to
    enterprise_raw_9 the record id is then looked up:

    * if ``jTool.exsitsRecord`` reports nothing, a row is inserted into
      error_log and the id is printed;
    * if it reports a hit other than the string 'error', the postContent
      column of that same table is read and an error_log row is inserted
      when the column is empty.

    The connection is committed after every parsed line.  At the end the
    number of lines read is printed.

    NOTE(review): a record is reported once for every enterprise_raw_N table
    that lacks it; confirm that this is the intended behaviour.

    Args:
        logFileName: path of the log file to read.
    """
    with open(logFileName) as logFile:
        param = makeParam()
        param['taskTable'] = 'item_url_task'
        param['num'] = '999'
        conn = param['conn']
        cursor = conn.cursor()
        cursor2 = conn.cursor()
        cursor3 = conn.cursor()
        count = 0
        try:
            for line in logFile:
                count += 1
                tmp = line.split(',')
                if len(tmp) <= 2:
                    continue

                for t in tmp:
                    tt = t.split(':')
                    if len(tt) > 1:
                        # Always store under the stripped key so a multi-colon
                        # value replaces the truncated one instead of landing
                        # under a second, whitespace-padded key.
                        key = tt[0].strip()
                        if len(tt) > 3:
                            param[key] = tt[1] + ':' + tt[2] + ':' + tt[3].strip('\n')
                        else:
                            param[key] = tt[1]

                for i in range(10):
                    # Use one table name for both the existence check and the
                    # content check so they refer to the same table.
                    table = 'enterprise_raw_' + str(i)
                    ext = jTool.exsitsRecord(cursor, table, 'eid', param['id'])
                    if not ext:
                        jTool.insertDatai(cursor, 'error_log', {'eid': param['id'], 'url': param['url'], 'ename': param['ename']})
                        print(str(param['id']))
                    elif ext != 'error':
                        try:
                            jTool.getField(cursor2, table, 'postContent', ' where eid = ' + str(param['id']))
                            val = cursor2.fetchone()
                            # No row at all is skipped silently.
                            if val is not None and not val[0]:
                                jTool.insertDatai(cursor3, 'error_log', {'eid': param['id'], 'url': param['url'], 'ename': param['ename']})
                                print(str(param['id']))
                        except Exception as e:
                            # Best effort: report the failure and move on to
                            # the next table instead of hiding it.
                            print('operLog: postContent check failed for ' + table + ': ' + repr(e))
                conn.commit()
        finally:
            cursor.close()
            cursor2.close()
            cursor3.close()
            conn.close()
    print('line error :' + str(count))
Exemplo n.º 7
0
def operLog(logFileName):
    """Replay the 'G' lines of a log file through rp.crawlUrl.

    Lines whose first character is 'G' are split on commas and each field is
    parsed as ``key:value`` into the parameter dict (a value that itself
    contains colons, such as a URL with a port, is rebuilt from its first
    three pieces).  param['id'] is then set from param['eid'] and
    ``rp.crawlUrl`` is called with one proxy that is drawn at random from
    proxy.txt once for the whole run.  All other lines are ignored.

    Args:
        logFileName: path of the log file to read.

    Raises:
        ValueError: if proxy.txt yields no proxies.
        KeyError: if a 'G' line does not provide an 'eid' field and none was
            set by an earlier line.
    """
    with open(logFileName) as logFile:
        param = makeParam()
        param['taskTable'] = 'item_url_task'
        param['num'] = '999'
        proxyList = jTool.getProxy('proxy.txt')
        if not proxyList:
            # Picking a random proxy needs at least one entry; fail with a
            # clear message instead of an opaque "empty range" error.
            raise ValueError('no proxies available in proxy.txt')
        proxy = random.choice(proxyList)

        for line in logFile:
            if line[0] != 'G':
                continue
            for t in line.split(','):
                tt = t.split(':')
                if len(tt) > 1:
                    # Always store under the stripped key so a multi-colon
                    # value replaces the truncated one.
                    key = tt[0].strip()
                    # tt[3] is only present when the field has at least three
                    # colons; shorter fields keep the plain tt[1] value.
                    if len(tt) > 3:
                        param[key] = tt[1] + ':' + tt[2] + ':' + tt[3].strip('\n')
                    else:
                        param[key] = tt[1]
            param['id'] = param['eid']
            rp.crawlUrl(param['conn'], param, proxy)
Exemplo n.º 8
0
def makeParam():
    """Return the crawler settings dict together with an open DB handle.

    The dict holds the station name, the site's entry/query/detail URLs
    (host-name and IP-address variants), the default POST form fields and the
    database settings.  The object returned by ``jTool.initCursor`` for those
    database settings is stored under 'conn'.
    """
    settings = {
        'station': '信用浙江',
        'begin_url': 'http://www.zjcredit.gov.cn:8000/CreditQuery.aspx?sectionID=02',
        'query_url': 'http://www.zjcredit.gov.cn:8000/ListQuery.aspx',
        'post_data_dic': {'isIntermediary': 'False', 'isOpen': 'False', 'pageLength': '20', 'recordTotal': '1778190', 'sectionID': '02', 'sortDirection': '1', 'sortField': 'CreditID'},
        'preUrl': 'http://www.zjcredit.gov.cn:8000/EnterpriseInfo.aspx?creditID=',
        'preUrlip': 'http://218.108.28.28:8000/EnterpriseInfo.aspx?creditID=',
        'basePostUrl': 'http://www.zjcredit.gov.cn:8000/GetInfoByDataSupplier.aspx',
        'basePostUrlip': 'http://218.108.28.28:8000/GetInfoByDataSupplier.aspx',
        'dbHost': 'localhost',
        'dbUser': '******',
        'dbPasswd': 'root',
        'rdb': 'rawData',
    }
    # Open the database handle from the settings just assembled.
    settings['conn'] = jTool.initCursor(
        settings['dbHost'],
        settings['dbUser'],
        settings['dbPasswd'],
        settings['rdb'],
    )
    return settings

# Ad-hoc run at module level: crawl one hard-coded enterprise page through a
# proxy drawn at random from proxy.txt.
param = makeParam()
param.update({'taskTable': 'item_url_task', 'num': '999'})

proxyList = jTool.getProxy('proxy.txt')
pcount = len(proxyList) - 1  # highest valid index into proxyList
proxy = proxyList[random.randint(0, pcount)]

# Identity of the single record to crawl.
param.update({
    'url': 'http://www.zjcredit.gov.cn:8000/EnterpriseInfo.aspx?creditID=F651B7F17FEEDA7A',
    'eid': '69999',
    'id': '69999',
    'ename': '湖州市邮政局千金邮电所',
})
rp.crawlUrl(param['conn'], param, proxy)