def enterprise_record_raw_1_function(conn, start, end):
    '''
    Lightly extract fields from the raw ``content`` column and store them
    back on the same row.

    Rule (from the original note): see the field-conversion table.  Every
    ``enterprise_record_raw_1`` row with ``start <= id <= end`` has its
    ``content`` split into ``publisher``, ``category`` and ``records``,
    which are written back through ``jTool.updateData``.

    Args:
        conn: open database connection.  It is committed once after the
            loop and always closed before this function returns or raises.
        start: first row id to process (inclusive); must be accepted by
            ``int()``.
        end: last row id to process (inclusive); must be accepted by
            ``int()``.

    Returns:
        True once every row in the range has been visited.  Rows whose
        header carries no category are counted but left untouched.
    '''
    tableName = 'enterprise_record_raw_1'
    # Normalise the bounds once so the SQL text and the in-loop comparison
    # both work on integers, whatever type the caller passed.
    firstId = int(start)
    lastId = int(end)
    cursor = conn.cursor()
    cursor2 = conn.cursor()
    try:
        qSql = ("select id, eid, enterprise_name, content from " + tableName
                + ' where id >=' + str(firstId) + ' and id <=' + str(lastId))
        cursor.execute(qSql)
        print("start extract data in "
              + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        count = 0
        while True:
            record = cursor.fetchone()
            # End of the result set: leave the loop.  A ``continue`` here
            # would call fetchone() forever once the rows are exhausted.
            if not record or record[0] > lastId:
                break
            count += 1
            rowId = record[0]
            eid = record[1]
            # A NULL content column is treated as empty text; such a row has
            # no category and is skipped below.
            content = (record[3] or '').strip()
            clist = content.split('r_obj = t_obj.add_record(true);\n')
            print('企业名称:' + record[2] + ', eid:' + str(eid)
                  + ', id:' + str(rowId))
            # The header chunk is split on a full-width colon: the text
            # before it ends with the publisher, the text after it starts
            # with the category.
            header = clist[0].split(':')
            publisher = header[0].split(',')[-1].strip()
            print(publisher)
            if len(header) < 2:
                # No category part in the header: skip this row.
                continue
            category = header[1].split(',')[0].strip()
            print(category)
            if len(clist) < 2:
                recs = clist[0]
            else:
                recs = clist[1].replace(', false);', '')
            # Drop the first comma-separated field of every line and glue
            # the remaining fields of that line together.
            lines = [''.join(line.split(',')[1:]) for line in recs.split('\n')]
            records = jTool.clearX(['(', ')', 'true'], ';'.join(lines)).strip()
            print(records)
            print('*' * 30)
            values = {
                'publisher': publisher,
                'category': category,
                'records': records,
            }
            jTool.updateData(cursor2, ' where id = ' + str(rowId),
                             tableName, values)
        conn.commit()
        print("complete extract " + str(count) + " records in "
              + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        return True
    finally:
        # Release database resources on every exit path.
        cursor2.close()
        cursor.close()
        conn.close()
def cutContent(content):
    '''
    Cut the interesting part out of a page obtained by POST.

    Looks up the positions of ``function my_init()`` and ``</script>`` in
    ``content`` via ``jTool.getStrIndex``; when both positions are greater
    than zero, the span is taken with ``jTool.getBetween`` and single quotes
    are removed with ``jTool.clearX``.

    Args:
        content: page text; any falsy value yields the placeholder.

    Returns:
        The cleaned snippet, or a single space ``' '`` when ``content`` is
        empty or either marker position is not greater than zero.
    '''
    placeholder = ' '
    marker = 'function my_init()'
    if not content:
        return placeholder
    begin = jTool.getStrIndex(marker, content)
    finish = jTool.getStrIndex('</script>', content)
    if not (begin > 0 and finish > 0):
        return placeholder
    snippet = jTool.getBetween(begin, finish, marker, content)
    return jTool.clearX(["'"], snippet)
def crawlUrl(conn, param, proxyList):
    '''
    Visit the enterprise URL, fetch its basic info and credit records, and
    store both in the database.

    The page at ``param['url']`` is fetched through a randomly chosen proxy
    and parsed with ``getEntBase``; the result is inserted into
    ``enterprise_raw``.  A POST to ``param['basePostUrlip']`` is then made,
    its response is split with ``getEntDetail``, and each piece is inserted
    into ``enterprise_record_raw`` with a commit after every insert.

    Args:
        conn: open database connection.  It is committed but NOT closed
            here; only the cursors created by this function are closed.
        param: mapping with the keys ``'url'``, ``'ename'``, ``'eid'`` and
            ``'basePostUrlip'``.
        proxyList: non-empty sequence of proxies to choose from.

    Returns:
        False when the page cannot be fetched, ``getEntBase`` yields nothing,
        or the base-info insert fails.  True otherwise, including the case
        where the detail POST returns no content.
    '''
    timeFormat = "%Y-%m-%d %H:%M:%S"
    cursor2 = conn.cursor()
    cursor3 = conn.cursor()
    try:
        proxy = random.choice(proxyList)
        content = jTool.getContentByProxy(proxy, param['url'])
        if not content:
            jTool.logError('Fail to get page content, url:' + param['url'])
            return False

        recEntBaseDic = getEntBase(content)
        if not recEntBaseDic:
            return False
        recEntBaseDic['url'] = param['url']
        recEntBaseDic['eid'] = param['eid']
        # strip() with no argument trims whitespace; strip('') trims nothing.
        recEntBaseDic['enterprise_name'] = param['ename'].strip()
        recEntBaseDic['ctime'] = time.strftime(timeFormat)
        try:
            jTool.insertData(cursor2, 'enterprise_raw', recEntBaseDic)
            conn.commit()
            print(' Insert enterprise baseinfo successfully in '
                  + time.strftime(timeFormat))
        except Exception as e:
            print('Fail to insert record ' + ', eid is ' + str(param['eid']))
            # Report the enterprise id; a bare ``id`` here would be the
            # builtin function, not a record id.
            jTool.logError('Fail to insert record on ' + 'url is '
                           + param['url'] + ', id is ' + str(param['eid'])
                           + ', error: ' + str(e))
            return False

        recEntDetailDic = {
            'url': param['url'],
            'eid': param['eid'],
            'enterprise_name': param['ename'].strip(),
            'records': ' ',
            'ctime': time.strftime(timeFormat),
        }

        dataSupplier = "法院记录/工商记录/国税记录/质监记录/经信记录/安监记录/统计记录/环保记录/民政记录/司法记录/劳动记录/建设记录/国土记录/交通记录/发改记录/信息产业/科技记录/农业记录/林业记录/海洋渔业/物价记录/食品药品/文化记录/出版记录/广电记录/公安记录/外贸记录/外汇记录/海关记录/检验检疫/人防记录/证监记录/银监记录/保监记录/金融记录/其他记录/行业协会/机构评级/社会中介/阿里巴巴/企业自报/投诉记录/异议记录"
        post_data_dic = {
            'corpName': param['ename'],
            'creditID': param['eid'],
            'dataSupplier': dataSupplier,
            'isAllInfo': 'False',
            'organizeCode': '',
            'returnFunction': 'parent.putDatasAndLoad',
        }
        # The 'basePostUrlip' entry is the endpoint in use; a
        # param['basePostUrl'] alternative was left disabled by the author.
        detailRecUrl = param['basePostUrlip']
        contentRecord = jTool.getContentByProxy(proxy, detailRecUrl,
                                                post_data_dic)
        if not contentRecord:
            # Base info is already stored; having no detail records still
            # counts as success.
            return True

        for scount, rec in enumerate(getEntDetail(contentRecord), 1):
            rec = jTool.clearX(["'"], rec)
            recEntDetailDic['content'] = str(rec)
            jTool.insertData(cursor3, 'enterprise_record_raw',
                             recEntDetailDic)
            conn.commit()
            print(' Insert enterprise detail info No. ' + str(scount)
                  + ' successfully in ' + time.strftime(timeFormat))
        print('Fetch and insert successfully in ' + time.strftime(timeFormat))
        return True
    finally:
        # The connection belongs to the caller; only release our cursors.
        cursor2.close()
        cursor3.close()
def extractPostContent(conn, table, start, end):
    '''
    Extract the ``postContent`` column of ``table`` into
    ``enterprise_record_raw``.

    Each source row holds several kinds of credit record and each kind holds
    several entries.  Ids from ``int(start)`` to ``int(end)`` (inclusive) are
    looked up one at a time and missing ids are skipped.  For every piece
    returned by ``rp.getEntDetail`` that ``contentToRecords`` can parse, one
    row is inserted and committed.

    Args:
        conn: open database connection; closed before returning.
        table: name of the source table.
        start: first id to process (inclusive).
        end: last id to process (inclusive).

    Returns:
        None.
    '''
    reader = conn.cursor()
    writer = conn.cursor()
    firstId = int(start)
    lastId = int(end)
    queryHead = 'select eid, enterprise_name, url, postContent, id from '

    currentId = firstId
    while currentId <= lastId:
        print('*' * 30)
        query = ''.join([queryHead, str(table), ' where id =', str(currentId)])
        # Advance before the lookup so a missing row cannot stall the loop.
        currentId += 1
        reader.execute(query)
        row = reader.fetchone()
        if not row:
            continue

        details = rp.getEntDetail(row[3])
        base = {
            'url': row[2],
            'eid': row[0],
            'enterprise_name': row[1].strip(),
        }
        print('id:' + str(row[4]) + ', ' + base['enterprise_name'])

        for detail in details:
            parsed = contentToRecords(str(detail).strip())
            if not parsed:
                continue
            cleaned = jTool.clearX(["'"], detail)
            entry = dict(base)
            entry['content'] = str(cleaned).strip()
            entry['records'] = ' '
            entry['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime()))
            # Parsed fields take precedence over the base columns.
            merged = dict(entry, **parsed)
            jTool.insertData(writer, 'enterprise_record_raw', merged)
            conn.commit()

    reader.close()
    writer.close()
    conn.close()
except Exception, e: print __name__, e return tmpDic del clist[0] ccount = len(clist) recs = '' for c in range(ccount): recs = recs+''.join(clist[c].split(', false);')) tmp = recs.split('\n') for i in range(len(tmp)): tt = tmp[i].split(',') del tt[0] tmp[i] = ''.join(tt) records = ';'.join(tmp) records = jTool.clearX(['(', ')', 'true'], records).strip() tmpDic['publisher'] = publisher tmpDic['category'] = category tmpDic['records'] = records return tmpDic def extractPostContent(conn, table, start, end): ''' 提取enterprise_raw表中的postContent字段到enterprse_record_raw中 每条包括多种信用记录,每种信用记录包括多条 ''' cursor = conn.cursor() cursor2 = conn.cursor() id = int(start) while id>=int(start) and id<=int(end):