Exemplo n.º 1
0
def operLog(logFileName):
    '''
    Re-check the enterprises listed in a log file against enterprise_raw_0..9.

    Every line with more than two comma-separated pieces is parsed as
    ``key:value`` pairs into the dict returned by makeParam().  For each of
    the ten ``enterprise_raw_<i>`` tables a row is then added to ``error_log``
    when jTool.exsitsRecord() reports no match for the id, or when the
    matching row's ``postContent`` is empty.  Work is committed once per
    parsed line.

    logFileName -- path of the log file to read.

    Prints the id of every enterprise written to error_log and, at the end,
    the number of lines read.  The cursors and the connection held in
    param['conn'] are closed before returning, also when an error occurs.
    '''
    with open(logFileName) as logFile:
        param = makeParam()
        param['taskTable'] = 'item_url_task'
        param['num'] = '999'
        # Keep a private reference: a log line carrying a "conn" key would
        # otherwise replace the connection stored in param.
        conn = param['conn']
        cursor = conn.cursor()
        cursor2 = conn.cursor()
        cursor3 = conn.cursor()
        count = 0
        try:
            for line in logFile:
                count += 1
                if _parseLogLine(line, param):
                    _checkEnterprise(param, cursor, cursor2, cursor3)
                    conn.commit()
        finally:
            cursor.close()
            cursor2.close()
            cursor3.close()
            conn.close()
    print('line error :'+str(count))


def _parseLogLine(line, param):
    '''Copy the key:value pairs of one log line into param; return False when the line has fewer than three pieces.'''
    pieces = line.split(',')
    if len(pieces) <= 2:
        return False
    for piece in pieces:
        parts = piece.split(':')
        if len(parts) < 2:
            continue
        # Always store under the stripped key so " url:..." updates 'url'.
        key = parts[0].strip()
        if len(parts) > 3:
            # The value itself contained colons; re-join its first three
            # pieces.  NOTE(review): anything after the fourth piece is
            # dropped, and a value with exactly one embedded colon keeps only
            # its first piece -- confirm this matches the log format.
            param[key] = parts[1]+':'+parts[2]+':'+parts[3].strip('\n')
        else:
            param[key] = parts[1]
    return True


def _checkEnterprise(param, cursor, cursor2, cursor3):
    '''Record in error_log every enterprise_raw_<i> table where param['id'] is missing or has empty postContent.'''
    for i in range(10):
        # Use one table name for both the existence check and the content
        # lookup so they always refer to the same table.
        table = 'enterprise_raw_'+str(i)
        ext = jTool.exsitsRecord(cursor, table, 'eid', param['id'])
        if not ext:
            _logMissing(cursor, param)
            continue
        if ext == 'error':
            continue
        try:
            jTool.getField(cursor2, table, 'postContent', ' where eid = '+str(param['id']))
            val = cursor2.fetchone()
            if val is not None and not val[0]:
                _logMissing(cursor3, param)
        except Exception as err:
            # Best effort: report the failure and carry on with the next table.
            print('check postContent fail, eid:'+str(param['id'])+', '+str(err))


def _logMissing(cursor, param):
    '''Insert the current enterprise into error_log and print its id.'''
    jTool.insertDatai(cursor, 'error_log', {'eid': param['id'], 'url': param['url'], 'ename': param['ename']})
    print(str(param['id']))
Exemplo n.º 2
0
def crawlPostUrl(conn, param, proxy):
    '''
    POST to param['basePostUrlip'] through a proxy to fetch an enterprise's detail records.

    conn  -- database connection; cursors are created from it.
    param -- dict read for the keys 'id', 'ename', 'url', 'rid' and
             'basePostUrlip'.
    proxy -- proxy passed to jTool.getContentByProxy (concatenated into a
             log message, so presumably a string -- TODO confirm).

    Returns {'logic': True} when the enterprise_raw row with id = param['rid']
    already holds a postContent longer than 10 characters.
    NOTE(review): as written here the function ends after the except handler
    and implicitly returns None on every other path.
    '''
    cursor = conn.cursor()
    # NOTE(review): cursor1 is neither used nor closed in the code shown here.
    cursor1 = conn.cursor()    
    dataSupplier = "法院记录/工商记录/国税记录/质监记录/经信记录/安监记录/统计记录/环保记录/民政记录/司法记录/劳动记录/建设记录/国土记录/交通记录/发改记录/信息产业/科技记录/农业记录/林业记录/海洋渔业/物价记录/食品药品/文化记录/出版记录/广电记录/公安记录/外贸记录/外汇记录/海关记录/检验检疫/人防记录/证监记录/银监记录/保监记录/金融记录/其他记录/行业协会/机构评级/社会中介/阿里巴巴/企业自报/投诉记录/异议记录"
    print 'eid:'+str(param['id'])
    # Form fields sent with the POST request.
    post_data_dic = {'corpName': param['ename'], 'creditID': param['id'], 'dataSupplier': dataSupplier, 'isAllInfo': 'False', 'organizeCode': '', 'returnFunction': 'parent.putDatasAndLoad'}
    # NOTE(review): recEntDetailDic is filled in but not used in the code shown here.
    recEntDetailDic = {}
    recEntDetailDic['url'] = param['url']
    recEntDetailDic['eid'] = param['id']
    recEntDetailDic['enterprise_name'] = param['ename'].strip()
    recEntDetailDic['records'] = ' '
    recEntDetailDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    contentRecord = None
    # Look up the postContent already stored for this row (matched by id = param['rid']).
    jTool.getField(cursor, 'enterprise_raw', 'postContent', ' where id = '+str(param['rid']))
    postContent =cursor.fetchone()
    cursor.close()
    # NOTE(review): the fetchone() result is indexed without a None check.
    if len(postContent[0])>10:
        # Stored content longer than 10 characters: print the skip message and stop.
        print '企业细节记录已存在,跳过'
        return {'logic': True}
    print 'post url:'+str(param['basePostUrlip'])+', '+param['ename'].strip()+',id:'+param['id']+', proxy:'+proxy
    try:
        contentRecord = jTool.getContentByProxy(proxy, str(param['basePostUrlip']), post_data_dic)
    except Exception, e:
        # Any failure of the request is only printed; nothing is returned here.
        print '页面未包含详细记录或获取失败', __name__, e
Exemplo n.º 3
0
        except Exception, e:
            print 'insert error ', __name__, e
            print 'Fail to insert record '+ ', id is '+str(param['id'])+', proxy:'+proxy
            return {'logic': False, 'rtData': {'type': 'insertPageError', 'postContent': ' '}}

    dataSupplier = "法院记录/工商记录/国税记录/质监记录/经信记录/安监记录/统计记录/环保记录/民政记录/司法记录/劳动记录/建设记录/国土记录/交通记录/发改记录/信息产业/科技记录/农业记录/林业记录/海洋渔业/物价记录/食品药品/文化记录/出版记录/广电记录/公安记录/外贸记录/外汇记录/海关记录/检验检疫/人防记录/证监记录/银监记录/保监记录/金融记录/其他记录/行业协会/机构评级/社会中介/阿里巴巴/企业自报/投诉记录/异议记录"
    print 'eid:'+str(param['id'])
    post_data_dic = {'corpName': param['ename'], 'creditID': param['id'], 'dataSupplier': dataSupplier, 'isAllInfo': 'False', 'organizeCode': '', 'returnFunction': 'parent.putDatasAndLoad'}
    recEntDetailDic = {}
    recEntDetailDic['url'] = param['url']
    recEntDetailDic['eid'] = param['id']
    recEntDetailDic['enterprise_name'] = param['ename'].strip()
    recEntDetailDic['records'] = ' '
    recEntDetailDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    contentRecord = None
    jTool.getField(cursor6, 'enterprise_raw', 'postContent', ' where eid = '+str(param['id']))
    postContent =cursor6.fetchone()
    cursor6.close()
    if len(postContent[0])>10:
        print '企业细节记录已存在,跳过'
        return {'logic': True}
    print 'post url:'+str(param['basePostUrlip'])+', '+param['ename'].strip()+',id:'+param['id']+', proxy:'+proxy
    try:
        contentRecord = jTool.getContentByProxy(proxy, str(param['basePostUrlip']), post_data_dic)
    except Exception, e:
        print '页面未包含详细记录或获取失败', __name__, e
        return {'logic': False, 'rtData': {'type': 'postContentError', 'postContent': ' '}}
    
    try:
        contentRecord = cutContent(contentRecord)
        print '    Get post content OK'
Exemplo n.º 4
0
def crawlUrl(conn, param, proxy):
    '''
    Fetch an enterprise's base-info page and its credit-record page and store both in enterprise_raw.

    conn  -- database connection; cursors are created from it as needed and
             always closed again.
    param -- dict read for the keys 'id', 'ename', 'url' and 'basePostUrlip'.
    proxy -- proxy passed to jTool.getContentByProxy.

    Returns False when the base-info page cannot be fetched, parsed or
    inserted, or when handling the record page raises.  Returns True in all
    other cases: the records were stored, were already present, the POST
    request failed, or the POST result was not a record page.
    '''
    cursor4 = conn.cursor()
    try:
        # NOTE(review): the helper is named "notExsitsRecord" but its result is
        # treated as "record exists" below -- confirm against jTool.
        recordExists = jTool.notExsitsRecord(cursor4, 'enterprise_raw', 'eid', param['id'])
    finally:
        cursor4.close()
    if recordExists:
        print('企业'+param['ename']+'基本信息已存在,直接尝试获取记录信息 \npage url:'+param['url'])
    elif not _crawlBaseInfo(conn, param, proxy):
        return False

    cursor6 = conn.cursor()
    try:
        jTool.getField(cursor6, 'enterprise_raw', 'postContent', ' where eid = '+str(param['id']))
        postContent = cursor6.fetchone()
    finally:
        cursor6.close()
    # fetchone() may yield no row, and the column may be NULL; only a stored
    # value longer than 10 characters counts as "already crawled".
    if postContent and postContent[0] and len(postContent[0])>10:
        print('企业细节记录已存在,跳过')
        return True

    dataSupplier = "法院记录/工商记录/国税记录/质监记录/经信记录/安监记录/统计记录/环保记录/民政记录/司法记录/劳动记录/建设记录/国土记录/交通记录/发改记录/信息产业/科技记录/农业记录/林业记录/海洋渔业/物价记录/食品药品/文化记录/出版记录/广电记录/公安记录/外贸记录/外汇记录/海关记录/检验检疫/人防记录/证监记录/银监记录/保监记录/金融记录/其他记录/行业协会/机构评级/社会中介/阿里巴巴/企业自报/投诉记录/异议记录"
    post_data_dic = {'corpName': param['ename'], 'creditID': param['id'], 'dataSupplier': dataSupplier, 'isAllInfo': 'False', 'organizeCode': '', 'returnFunction': 'parent.putDatasAndLoad'}
    print('post url:'+str(param['basePostUrlip'])+', '+param['ename'].strip()+',id:'+param['id'])
    try:
        contentRecord = jTool.getContentByProxy(proxy, str(param['basePostUrlip']), post_data_dic)
    except Exception:
        print('页面未包含详细记录或获取失败')
        jTool.logError('\nPost get None, ename:'+param['ename'].strip()+',id:'+param['id']+',url:'+ param['url'])
        return True

    try:
        contentRecord = cutContent(contentRecord)
        print('Get post content OK')
        jTool.logit(str(contentRecord), 'errorContent.log')
        if not contentRecord:
            print('POST得到页面非企业详细记录页面')
            jTool.logError('\nPage content error, method:post, ename:'+param['ename']+', id:'+param['id']+',url:'+ param['url']+', proxy:'+proxy)
            return True
        cursor5 = conn.cursor()
        try:
            # NOTE(review): the where-clause is built by string concatenation
            # and no conn.commit() follows the update -- presumably the caller
            # or jTool.updateData commits; confirm.
            jTool.updateData(cursor5, ' where eid = '+param['id'], 'enterprise_raw', {'postContent': str(contentRecord).decode('utf-8', 'ignore')})
        finally:
            cursor5.close()
    except Exception:
        print('解析页面详细记录失败')
        jTool.logError('\nParse page content fail, method:post, ename:'+param['ename']+', id:'+param['id']+',url:'+ param['url']+', proxy:'+proxy)
        return False
    print('Fetch and insert successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return True


def _crawlBaseInfo(conn, param, proxy):
    '''Fetch, parse and insert the enterprise's base-info row; return True on success, False otherwise.'''
    print('get url:'+param['url']+', '+param['ename'].strip()+',id:'+param['id'])
    content = jTool.getContentByProxy(proxy, param['url'])
    if not content:
        print('获取基本信息页面内容为空或失败')
        jTool.logError('\nGet page content fail, method:get, ename:'+param['ename']+',id:'+param['id']+',url:'+ param['url'])
        return False

    try:
        recEntBaseDic = getEntBase(content)
    except Exception:
        print('解析企业基本信息页面失败')
        jTool.logError('\nParse page content fail, method:get, ename:'+param['ename']+',id:'+param['id']+',url:'+ param['url'])
        return False
    if not recEntBaseDic:
        return False

    recEntBaseDic['url'] = param['url']
    recEntBaseDic['eid'] = param['id']
    # Single-space placeholder; the record-page step later overwrites it.
    recEntBaseDic['postContent'] = ' '
    recEntBaseDic['enterprise_name'] = param['ename'].strip()
    recEntBaseDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    cursor2 = conn.cursor()
    try:
        jTool.insertData(cursor2, 'enterprise_raw', recEntBaseDic)
        conn.commit()
        print('    Insert enterprise baseinfo successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    except Exception:
        print('Fail to insert record '+ ', id is '+str(param['id']))
        jTool.logError('Fail to insert record on '+ 'url is '+param['url']+', id is '+str(param['id']))
        return False
    finally:
        cursor2.close()
    return True