def _applyLogFields(param, fields):
    """Copy the ``key:value`` pairs of one comma-split log line into ``param``."""
    for field in fields:
        parts = field.split(':')
        if len(parts) < 2:
            continue
        key = parts[0].strip()
        if len(parts) > 3:
            # The value itself contained colons: its first three pieces are
            # re-joined and a trailing newline is dropped.  The key is
            # stripped in this case too, so both cases write the same entry.
            param[key] = parts[1] + ':' + parts[2] + ':' + parts[3].strip('\n')
        else:
            param[key] = parts[1]


def _recheckEnterprise(param, cursor, cursor2, cursor3):
    """Probe tables ``enterprise_raw_0``..``enterprise_raw_9`` and log gaps.

    A row is added to ``error_log`` when the existence probe is falsy, or
    when the probe succeeded (and is not the string ``'error'``) but the
    fetched ``postContent`` value is empty.
    """
    # NOTE(review): the statement nesting was reconstructed from a
    # whitespace-collapsed source; confirm that the checks below are meant to
    # run once per table rather than once after the loop.
    for i in range(10):
        ext = jTool.exsitsRecord(cursor, 'enterprise_raw_' + str(i), 'eid', param['id'])
        # NOTE(review): the lookup below reads table ``i + 1`` while the
        # existence probe used table ``i``; the offset is kept as found --
        # confirm whether both should address the same table.
        i += 1
        if not ext:
            jTool.insertDatai(cursor, 'error_log',
                              {'eid': param['id'], 'url': param['url'], 'ename': param['ename']})
            print(str(param['id']))
        elif ext != 'error':
            try:
                jTool.getField(cursor2, 'enterprise_raw_' + str(i), 'postContent',
                               ' where eid = ' + str(param['id']))
                val = cursor2.fetchone()
                if not val[0]:
                    jTool.insertDatai(cursor3, 'error_log',
                                      {'eid': param['id'], 'url': param['url'], 'ename': param['ename']})
                    print(str(param['id']))
            except Exception as e:
                # Best effort: one failed lookup must not stop the scan, but
                # it should be visible instead of vanishing silently.
                print('check postContent fail, eid:' + str(param['id']) + ', ' + repr(e))


def operLog(logFileName):
    """Re-check the enterprises named in a log file and queue the bad ones.

    Every line of the file is split on commas into ``key:value`` fields,
    which are copied into the parameter dict returned by ``makeParam()``
    (``id``, ``url`` and ``ename`` are the keys read afterwards).  Lines with
    fewer than three comma-separated fields are counted but otherwise
    ignored.  For the remaining lines the ``enterprise_raw_<n>`` tables are
    probed and rows are written to ``error_log``; the connection is committed
    after each processed line.

    The file, the three cursors and the connection are released even when
    processing raises.  The number of lines read is printed at the end.

    Args:
        logFileName: path of the log file to read.
    """
    with open(logFileName) as logFile:
        param = makeParam()
        param['taskTable'] = 'item_url_task'
        param['num'] = '999'
        conn = param['conn']
        cursor = conn.cursor()
        cursor2 = conn.cursor()
        cursor3 = conn.cursor()
        count = 0
        try:
            for line in logFile:
                fields = line.split(',')
                if len(fields) > 2:
                    _applyLogFields(param, fields)
                    _recheckEnterprise(param, cursor, cursor2, cursor3)
                    conn.commit()
                count += 1
        finally:
            cursor.close()
            cursor2.close()
            cursor3.close()
            conn.close()
    print('line error :' + str(count))
def crawlPostUrl(conn, param, proxy):
    """POST for an enterprise's detail records unless they are already stored.

    Reads ``postContent`` from ``enterprise_raw`` for the row whose ``id`` is
    ``param['rid']``.  If that value is longer than 10 characters the function
    returns ``{'logic': True}`` without fetching anything.  Otherwise it posts
    the form data to ``param['basePostUrlip']`` through ``proxy``.

    NOTE(review): the visible definition ends right after the fetch attempt.
    ``cursor1``, ``recEntDetailDic`` and ``contentRecord`` are never used and
    nothing is returned on that path, so the rest of the function is
    presumably outside this view -- confirm against the full file.
    """
    cursor = conn.cursor()
    cursor1 = conn.cursor()
    # Slash-separated list of record categories, sent as the 'dataSupplier' form field.
    dataSupplier = "法院记录/工商记录/国税记录/质监记录/经信记录/安监记录/统计记录/环保记录/民政记录/司法记录/劳动记录/建设记录/国土记录/交通记录/发改记录/信息产业/科技记录/农业记录/林业记录/海洋渔业/物价记录/食品药品/文化记录/出版记录/广电记录/公安记录/外贸记录/外汇记录/海关记录/检验检疫/人防记录/证监记录/银监记录/保监记录/金融记录/其他记录/行业协会/机构评级/社会中介/阿里巴巴/企业自报/投诉记录/异议记录"
    print 'eid:'+str(param['id'])
    # Form fields for the POST request.
    post_data_dic = {'corpName': param['ename'], 'creditID': param['id'], 'dataSupplier': dataSupplier, 'isAllInfo': 'False', 'organizeCode': '', 'returnFunction': 'parent.putDatasAndLoad'}
    # Detail-record row skeleton; not used in the visible part of the function.
    recEntDetailDic = {}
    recEntDetailDic['url'] = param['url']
    recEntDetailDic['eid'] = param['id']
    recEntDetailDic['enterprise_name'] = param['ename'].strip()
    recEntDetailDic['records'] = ' '
    recEntDetailDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    contentRecord = None
    # Look up the stored postContent by primary id (param['rid']).
    jTool.getField(cursor, 'enterprise_raw', 'postContent', ' where id = '+str(param['rid']))
    postContent =cursor.fetchone()
    cursor.close()
    # More than 10 characters of stored content is treated as "already fetched".
    # NOTE(review): this raises if no row was returned or the column is NULL.
    if len(postContent[0])>10:
        # Message: "enterprise detail record already exists, skipping".
        print '企业细节记录已存在,跳过'
        return {'logic': True}
    print 'post url:'+str(param['basePostUrlip'])+', '+param['ename'].strip()+',id:'+param['id']+', proxy:'+proxy
    try:
        contentRecord = jTool.getContentByProxy(proxy, str(param['basePostUrlip']), post_data_dic)
    except Exception, e:
        # Message: "page contains no detail records or the fetch failed".
        print '页面未包含详细记录或获取失败', __name__, e
except Exception, e: print 'insert error ', __name__, e print 'Fail to insert record '+ ', id is '+str(param['id'])+', proxy:'+proxy return {'logic': False, 'rtData': {'type': 'insertPageError', 'postContent': ' '}} dataSupplier = "法院记录/工商记录/国税记录/质监记录/经信记录/安监记录/统计记录/环保记录/民政记录/司法记录/劳动记录/建设记录/国土记录/交通记录/发改记录/信息产业/科技记录/农业记录/林业记录/海洋渔业/物价记录/食品药品/文化记录/出版记录/广电记录/公安记录/外贸记录/外汇记录/海关记录/检验检疫/人防记录/证监记录/银监记录/保监记录/金融记录/其他记录/行业协会/机构评级/社会中介/阿里巴巴/企业自报/投诉记录/异议记录" print 'eid:'+str(param['id']) post_data_dic = {'corpName': param['ename'], 'creditID': param['id'], 'dataSupplier': dataSupplier, 'isAllInfo': 'False', 'organizeCode': '', 'returnFunction': 'parent.putDatasAndLoad'} recEntDetailDic = {} recEntDetailDic['url'] = param['url'] recEntDetailDic['eid'] = param['id'] recEntDetailDic['enterprise_name'] = param['ename'].strip() recEntDetailDic['records'] = ' ' recEntDetailDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) contentRecord = None jTool.getField(cursor6, 'enterprise_raw', 'postContent', ' where eid = '+str(param['id'])) postContent =cursor6.fetchone() cursor6.close() if len(postContent[0])>10: print '企业细节记录已存在,跳过' return {'logic': True} print 'post url:'+str(param['basePostUrlip'])+', '+param['ename'].strip()+',id:'+param['id']+', proxy:'+proxy try: contentRecord = jTool.getContentByProxy(proxy, str(param['basePostUrlip']), post_data_dic) except Exception, e: print '页面未包含详细记录或获取失败', __name__, e return {'logic': False, 'rtData': {'type': 'postContentError', 'postContent': ' '}} try: contentRecord = cutContent(contentRecord) print ' Get post content OK'
def _storeEntBaseInfo(conn, param, proxy):
    """GET the enterprise page, parse it and insert the base-info row.

    Returns True when the row was inserted and committed, False when the
    page could not be fetched, parsed or stored.
    """
    print('get url:'+param['url']+', '+param['ename'].strip()+',id:'+param['id'])
    content = jTool.getContentByProxy(proxy, param['url'])
    if not content:
        print('获取基本信息页面内容为空或失败')
        jTool.logError('\nGet page content fail, method:get, ename:'+param['ename']+',id:'+param['id']+',url:'+ param['url'])
        return False

    try:
        recEntBaseDic = getEntBase(content)
    except Exception:
        print('解析企业基本信息页面失败')
        jTool.logError('\nParse page content fail, method:get, ename:'+param['ename']+',id:'+param['id']+',url:'+ param['url'])
        return False
    if not recEntBaseDic:
        return False

    recEntBaseDic['url'] = param['url']
    recEntBaseDic['eid'] = param['id']
    # Placeholder; the detail content is filled in later by crawlUrl.
    recEntBaseDic['postContent'] = ' '
    # Surrounding whitespace (e.g. a trailing newline) is not part of the name.
    recEntBaseDic['enterprise_name'] = param['ename'].strip()
    recEntBaseDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    cursor = conn.cursor()
    try:
        jTool.insertData(cursor, 'enterprise_raw', recEntBaseDic)
        conn.commit()
        print(' Insert enterprise baseinfo successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    except Exception:
        print('Fail to insert record '+ ', id is '+str(param['id']))
        jTool.logError('Fail to insert record on '+ 'url is '+param['url']+', id is '+str(param['id']))
        return False
    finally:
        cursor.close()
    return True


def crawlUrl(conn, param, proxy):
    """Fetch an enterprise's base info and credit-record details and store them.

    The base-info page (``param['url']``) is fetched and inserted into
    ``enterprise_raw`` unless the existence check reports the enterprise as
    already present.  The detail records are then requested with a POST to
    ``param['basePostUrlip']`` and written to the row's ``postContent``
    column, unless that column already holds more than 10 characters.

    Args:
        conn: open database connection; cursors are created and closed here.
        param: task dict; reads ``id``, ``ename``, ``url``, ``basePostUrlip``.
        proxy: proxy passed to ``jTool.getContentByProxy``.

    Returns:
        True when the details were stored or were already present, and also
        when the POST request failed or returned a page without detail
        records.  False when fetching, parsing or inserting the base info
        failed, or when processing or storing the detail content raised.
    """
    cursor = conn.cursor()
    try:
        # NOTE(review): the helper is called "notExsitsRecord", yet a truthy
        # result is treated as "record already exists" -- verify its polarity.
        recordExists = jTool.notExsitsRecord(cursor, 'enterprise_raw', 'eid', param['id'])
    finally:
        cursor.close()

    if recordExists:
        print('企业'+param['ename']+'基本信息已存在,直接尝试获取记录信息 \npage url:'+param['url'])
    elif not _storeEntBaseInfo(conn, param, proxy):
        return False

    # Slash-separated list of record categories, sent as a POST form field.
    dataSupplier = "法院记录/工商记录/国税记录/质监记录/经信记录/安监记录/统计记录/环保记录/民政记录/司法记录/劳动记录/建设记录/国土记录/交通记录/发改记录/信息产业/科技记录/农业记录/林业记录/海洋渔业/物价记录/食品药品/文化记录/出版记录/广电记录/公安记录/外贸记录/外汇记录/海关记录/检验检疫/人防记录/证监记录/银监记录/保监记录/金融记录/其他记录/行业协会/机构评级/社会中介/阿里巴巴/企业自报/投诉记录/异议记录"
    post_data_dic = {
        'corpName': param['ename'],
        'creditID': param['id'],
        'dataSupplier': dataSupplier,
        'isAllInfo': 'False',
        'organizeCode': '',
        'returnFunction': 'parent.putDatasAndLoad',
    }

    cursor = conn.cursor()
    try:
        jTool.getField(cursor, 'enterprise_raw', 'postContent', ' where eid = '+str(param['id']))
        postContent = cursor.fetchone()
    finally:
        cursor.close()
    # A missing row or NULL column counts as "no details stored yet".
    storedDetail = postContent[0] if postContent else None
    if storedDetail and len(storedDetail) > 10:
        print('企业细节记录已存在,跳过')
        return True

    print('post url:'+str(param['basePostUrlip'])+', '+param['ename'].strip()+',id:'+param['id'])
    try:
        contentRecord = jTool.getContentByProxy(proxy, str(param['basePostUrlip']), post_data_dic)
    except Exception:
        print('页面未包含详细记录或获取失败')
        jTool.logError('\nPost get None, ename:'+param['ename'].strip()+',id:'+param['id']+',url:'+ param['url'])
        return True

    try:
        contentRecord = cutContent(contentRecord)
        print('Get post content OK')
        jTool.logit(str(contentRecord), 'errorContent.log')
        if not contentRecord:
            print('POST得到页面非企业详细记录页面')
            jTool.logError('\nPage content error, method:post, ename:'+param['ename']+', id:'+param['id']+',url:'+ param['url']+', proxy:'+proxy)
            return True
        cursor = conn.cursor()
        try:
            # NOTE(review): unlike the base-info insert, no commit follows
            # this update here -- confirm it is committed elsewhere.
            jTool.updateData(cursor, ' where eid = '+param['id'], 'enterprise_raw',
                             {'postContent': str(contentRecord).decode('utf-8', 'ignore')})
        finally:
            cursor.close()
    except Exception:
        print('解析页面详细记录失败')
        jTool.logError('\nParse page content fail, method:post, ename:'+param['ename']+', id:'+param['id']+',url:'+ param['url']+', proxy:'+proxy)
        return False

    print('Fetch and insert successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return True