예제 #1
0
def enterprise_record_raw_1_function(conn, start, end):
    '''
    Lightly extract fields from the raw ``content`` column of
    ``enterprise_record_raw_1`` and write them back into the same rows.
    The parsing rules follow the field conversion table.

    For every row whose id lies in [start, end] the content is split into
    ``publisher``, ``category`` and ``records`` and the row is updated via
    ``jTool.updateData``; the update is committed row by row. Rows whose
    header contains no ':'-separated category are skipped without an update
    (they are still included in the printed record count).

    conn  -- open DB connection; it is closed before this function returns,
             even when an error is raised.
    start -- first id to process (inclusive); converted with int().
    end   -- last id to process (inclusive); converted with int().

    Returns True once the whole id range has been processed.
    '''
    tableName = 'enterprise_record_raw_1'
    # Normalise the bounds once so that the SQL text and the id comparison
    # below both work on integers, as extractPostContent does.
    first_id = int(start)
    last_id = int(end)
    cursor = conn.cursor()
    cursor2 = conn.cursor()
    count = 0
    try:
        qSql = ("select id, eid, enterprise_name, content from " + tableName
                + ' where id >=' + str(first_id) + ' and id <=' + str(last_id))
        cursor.execute(qSql)
        print("start extract data in "+str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        while True:
            record = cursor.fetchone()
            # fetchone() yields None when the result set is exhausted; the
            # loop has to stop here (a `continue` would spin forever).
            if not record or record[0] > last_id:
                break
            count += 1
            row_id = record[0]
            eid = record[1]
            content = record[3].strip()
            clist = content.split('r_obj = t_obj.add_record(true);\n')

            print('企业名称:'+ record[2]+', eid:'+str(eid)+', id:'+str(row_id))
            header = clist[0].split(':')
            publisher = header[0].split(',')[-1].strip()
            print(publisher)
            # No ':' in the header means there is no category part to read.
            if len(header) < 2:
                continue
            category = header[1].split(',')[0].strip()
            print(category)

            # NOTE(review): only the first add_record chunk (clist[1]) is
            # used; any further chunks are ignored -- confirm this is intended.
            if len(clist) < 2:
                recs = clist[0]
            else:
                recs = clist[1].replace(', false);', '')
            # Drop the first comma-separated field of every line and glue the
            # remaining fields together without a separator.
            lines = [''.join(line.split(',')[1:]) for line in recs.split('\n')]
            records = jTool.clearX(['(', ')', 'true'], ';'.join(lines)).strip()
            print(records)
            print('*'*30)

            tmpDic = {
                'publisher': publisher,
                'category': category,
                'records': records,
            }
            jTool.updateData(cursor2, ' where id = '+str(row_id), tableName, tmpDic)
            conn.commit()
        print("complete extract "+str(count)+" records in "+str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    finally:
        # Release DB resources on both the success and the error path.
        cursor2.close()
        cursor.close()
        conn.close()
    return True
예제 #2
0
def cutContent(content):
    '''
    Cut the ``function my_init()`` section out of page content obtained
    via POST.

    content -- page text; may be empty or None.

    Returns a single space when content is falsy or when either marker
    ('function my_init()' / '</script>') is not reported at a positive
    index by jTool.getStrIndex. Otherwise returns what jTool.getBetween
    extracts for those two indexes, with single quotes removed through
    jTool.clearX.
    '''
    if not content:
        return ' '

    marker = 'function my_init()'
    start = jTool.getStrIndex(marker, content)
    end = jTool.getStrIndex('</script>', content)
    # Both positions must be strictly positive for the cut to happen.
    if not (start > 0) or not (end > 0):
        return ' '

    snippet = jTool.getBetween(start, end, marker, content)
    return jTool.clearX(["'"], snippet)
예제 #3
0
def crawlUrl(conn, param, proxyList):
    '''
    Visit an enterprise url through a randomly chosen proxy, then store the
    enterprise's basic info and its credit records in the database.

    conn      -- open DB connection; used for inserts and commits, and left
                 open for the caller.
    param     -- dict; the keys read here are 'url', 'ename', 'eid' and
                 'basePostUrlip'.
    proxyList -- non-empty sequence of proxies accepted by
                 jTool.getContentByProxy.

    Returns False when the page cannot be fetched, when getEntBase yields
    nothing, or when inserting the basic info fails. Returns True otherwise,
    including the case where the basic info was stored but the detail
    request returned no content.
    '''
    def now():
        # Timestamp format shared by the ctime columns and progress output.
        return str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    cursor2 = conn.cursor()
    cursor3 = conn.cursor()
    try:
        proxy = proxyList[random.randint(0, len(proxyList) - 1)]
        content = jTool.getContentByProxy(proxy, param['url'])
        if not content:
            jTool.logError('Fail to get page content, url:'+ param['url'])
            return False

        dataSupplier = "法院记录/工商记录/国税记录/质监记录/经信记录/安监记录/统计记录/环保记录/民政记录/司法记录/劳动记录/建设记录/国土记录/交通记录/发改记录/信息产业/科技记录/农业记录/林业记录/海洋渔业/物价记录/食品药品/文化记录/出版记录/广电记录/公安记录/外贸记录/外汇记录/海关记录/检验检疫/人防记录/证监记录/银监记录/保监记录/金融记录/其他记录/行业协会/机构评级/社会中介/阿里巴巴/企业自报/投诉记录/异议记录"
        post_data_dic = {'corpName': param['ename'], 'creditID': param['eid'], 'dataSupplier': dataSupplier, 'isAllInfo': 'False', 'organizeCode': '', 'returnFunction': 'parent.putDatasAndLoad'}

        recEntBaseDic = getEntBase(content)
        if not recEntBaseDic:
            return False
        recEntBaseDic['url'] = param['url']
        recEntBaseDic['eid'] = param['eid']
        # strip() with no argument trims whitespace; strip('') trimmed nothing
        # and left this name inconsistent with the detail rows below.
        recEntBaseDic['enterprise_name'] = param['ename'].strip()
        recEntBaseDic['ctime'] = now()
        try:
            jTool.insertData(cursor2, 'enterprise_raw', recEntBaseDic)
            conn.commit()
            print('    Insert enterprise baseinfo successfully in ' + now())
        except Exception:
            print('Fail to insert record '+ ', eid is '+str(param['eid']))
            # Log the enterprise id; the bare name `id` is the builtin function.
            jTool.logError('Fail to insert record on '+ 'url is '+param['url']+', id is '+str(param['eid']))
            return False

        recEntDetailDic = {
            'url': param['url'],
            'eid': param['eid'],
            'enterprise_name': param['ename'].strip(),
            'records': ' ',
            'ctime': now(),
        }
        detailRecUrl = param['basePostUrlip']
        contentRecord = jTool.getContentByProxy(proxy, detailRecUrl, post_data_dic)
        if not contentRecord:
            # Basic info is already stored, so this still counts as success.
            return True

        for scount, rec in enumerate(getEntDetail(contentRecord), 1):
            rec = jTool.clearX(["'"], rec)
            recEntDetailDic['content'] = str(rec)
            jTool.insertData(cursor3, 'enterprise_record_raw', recEntDetailDic)
            conn.commit()
            print('       Insert enterprise detail info No. '+str(scount)+' successfully in ' + now())
        print('Fetch and insert successfully in ' + now())
        return True
    finally:
        # The cursors are local to this call; the connection stays open.
        cursor3.close()
        cursor2.close()
예제 #4
0
def extractPostContent(conn, table, start, end):
    '''
    Extract the postContent field of rows in ``table`` (the enterprise_raw
    table) into enterprise_record_raw.

    Each source row holds several kinds of credit records and each kind may
    hold several entries; every entry that contentToRecords can parse is
    inserted as one enterprise_record_raw row and committed immediately.

    conn  -- open DB connection; it is closed before this function returns,
             even when an error is raised.
    table -- name of the source table to read from.
    start -- first id to process (inclusive); converted with int().
    end   -- last id to process (inclusive); converted with int().

    Returns None.
    '''
    first_id = int(start)
    last_id = int(end)
    cursor = conn.cursor()
    cursor2 = conn.cursor()
    try:
        row_id = first_id
        while row_id <= last_id:
            print('*'*30)
            sql = 'select eid, enterprise_name, url, postContent, id from '+str(table)+' where id ='+str(row_id)
            row_id += 1
            cursor.execute(sql)
            record = cursor.fetchone()
            if not record:
                # No row with this id; move on to the next one.
                continue

            # Columns shared by every detail row produced from this record.
            base = {
                'url': record[2],
                'eid': record[0],
                'enterprise_name': record[1].strip(),
            }
            print('id:'+str(record[4])+', '+base['enterprise_name'])

            for rec in rp.getEntDetail(record[3]):
                rDic = contentToRecords(str(rec).strip())
                if not rDic:
                    continue
                rec = jTool.clearX(["'"], rec)
                fDic = dict(base)
                fDic['content'] = (str(rec)).strip()
                fDic['records'] = ' '
                fDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
                # Parsed fields take precedence over the defaults set above.
                fDic.update(rDic)
                jTool.insertData(cursor2, 'enterprise_record_raw', fDic)
                conn.commit()
    finally:
        # Release DB resources on both the success and the error path.
        cursor.close()
        cursor2.close()
        conn.close()
예제 #5
0
    except Exception, e:
        print __name__, e
        return tmpDic
        
    del clist[0]
    ccount = len(clist)
    recs = ''
    for c in range(ccount):
        recs = recs+''.join(clist[c].split(', false);'))
    tmp = recs.split('\n')
    for i in range(len(tmp)):
        tt = tmp[i].split(',')
        del tt[0]
        tmp[i] = ''.join(tt)
    records = ';'.join(tmp)
    records = jTool.clearX(['(', ')', 'true'], records).strip()
    
    tmpDic['publisher'] = publisher
    tmpDic['category'] = category
    tmpDic['records'] = records
    return tmpDic   

def extractPostContent(conn, table, start, end):
    '''
    提取enterprise_raw表中的postContent字段到enterprse_record_raw中
    每条包括多种信用记录,每种信用记录包括多条
    '''
    cursor = conn.cursor()
    cursor2 = conn.cursor()
    id = int(start)
    while id>=int(start) and id<=int(end):