Пример #1
0
def qichachaFromIndustry(f, t):
    """Crawl qichacha listing pages for a range of industry codes.

    Every integer ``code`` in ``[f, t]`` (both ends inclusive) is turned into
    a letter with ``chr(code + 65)`` (0 -> 'A', 1 -> 'B', ...) and sent as the
    ``industryCode`` query parameter, with the integer itself sent as
    ``industryorder``. The resulting page is fetched, the ``<a>`` links
    inside its second ``.filter-tag`` element are resolved against the base
    URL, and each resolved URL is handed to ``qichachaFromIndustPageUrl``.

    An industry page that has no second ``.filter-tag`` element, or no links
    inside it, is logged and skipped. Any other exception raised while
    handling one industry is logged and the loop moves on to the next one.

    :param f: first industry index (inclusive).
    :param t: last industry index (inclusive).
    """
    myLogging.info('start from %s to %s ', f, t)
    indBaseUrl = 'http://www.qichacha.com/gongsi_industry?industryCode='
    conn, csor = getComConnCsor()
    for code in range(f, t + 1):
        industCode = chr(code + 65)
        industOrder = code
        inductBasePageUrl = indBaseUrl + industCode + '&industryorder=' + str(
            industOrder)

        try:
            # The crawl of the base page itself is disabled; only the
            # start/end log markers are emitted for it.
            myLogging.info('start indust base pages, %s', inductBasePageUrl)
            myLogging.info('end indust base pages, %s', inductBasePageUrl)

            myLogging.info('start indust subIndust pages, %s',
                           inductBasePageUrl)
            pageContent = getQichachaHtml(inductBasePageUrl)
            pageSoup = getSoupByStrEncode(pageContent, 'utf-8')

            # Check the match count before indexing: a page with fewer than
            # two '.filter-tag' elements has no link container to read.
            filterTags = pageSoup.select('.filter-tag')
            subLinks = filterTags[1].select('a') if len(filterTags) > 1 else []
            if not subLinks:
                myLogging.error('no subUrls, skipped, %s', inductBasePageUrl)
                continue

            for tag in subLinks:
                subUri = tag['href']
                subUrl = urlparse.urljoin(indBaseUrl, subUri)

                myLogging.info('start sub indust base pages, %s', subUrl)
                qichachaFromIndustPageUrl(subUrl, conn, csor)
                myLogging.info('end sub indust base pages, %s', subUrl)
        except Exception as e:
            myLogging.error('indust error, industCode: %s url: %s; error: %s ',
                            industCode, inductBasePageUrl, e)
Пример #2
0
def insertInvestList(uid, content):
    """Insert one ``(uid, investList)`` row into ``com_invest`` and commit.

    Works on the module-level ``conn``/``csor`` pair; when either of them is
    falsy a new pair is obtained from ``getComConnCsor()`` and stored back
    into the module globals.

    :param uid: value bound to the ``uid`` column.
    :param content: value bound to the ``investList`` column.
    """
    global conn, csor
    if not (conn and csor):
        conn, csor = getComConnCsor()
    insertSql = 'insert ignore com_invest (uid, investList) values (%s, %s)'
    rowValues = (uid, content)
    csor.execute(insertSql, rowValues)
    conn.commit()
Пример #3
0
def loadComNameByLength(nameLength):
    """Return all ``companyName`` rows of ``com_base_copy`` with a given length.

    Works on the module-level ``conn``/``csor`` pair; when either of them is
    falsy a new pair is obtained from ``getComConnCsor()`` and stored back
    into the module globals.

    NOTE(review): the filter uses SQL ``length()``; on MySQL that counts
    bytes rather than characters -- confirm this is the intended measure.

    :param nameLength: value compared against ``length(companyName)``.
    :return: whatever ``csor.fetchall()`` yields for the query.
    """
    global conn, csor
    if not (conn and csor):
        conn, csor = getComConnCsor()
    query = 'select companyName from com_base_copy where length(companyName) = %s '
    csor.execute(query, (nameLength, ))
    return csor.fetchall()
Пример #4
0
def getQichachaInvestDigests():
    """Build a filter holding every ``uid`` already present in ``com_invest``.

    A fresh object is taken from ``getBloom()``, the first column of every
    row returned by ``select uid from com_invest`` is added to it, and the
    filled object is returned.

    :return: the object produced by ``getBloom()`` after all uids are added.
    """
    uidFilter = getBloom()
    dbConn, cursor = getComConnCsor()
    cursor.execute('select uid from com_invest')
    for row in cursor.fetchall():
        uidFilter.add(row[0])
    myLogging.info('load exists ids ok')

    return uidFilter
Пример #5
0
def crawlBaseInfo(begin, end):
    print 'start from ', begin, ' to ', end
    baseUrl = 'http://www.tianyancha.com/IcpList/'
    conn, csor = getComConnCsor()
    seq = range(begin, end)

    random.shuffle(seq)
    for id in seq:
        try:
            dealById(baseUrl, conn, csor, id)
        except Exception as e:
            print id, ':  ', e
Пример #6
0
def qichachaFromProvs(provs):
    """Walk pages 1..500 of the qichacha area listing for each province.

    For every entry of ``provs`` the URLs
    ``http://www.qichacha.com/gongsi_area_prov_<prov>_p_<page>.shtml`` are
    fetched for pages 1 through 500, parsed, and passed to
    ``dealUIDsBySoup``. An exception on one page is logged and the next page
    is tried.

    :param provs: iterable of province identifiers; each is concatenated
        into the URL, so each must be a string.
    """
    myLogging.info('start: provs %s', str(provs))
    catBaseIrl = 'http://www.qichacha.com/gongsi_area_prov_'
    conn, csor = getComConnCsor()
    for prov in provs:
        urlPrefix = catBaseIrl + prov + '_p_'
        for pageNo in range(1, 501):
            pageUrl = ''.join([urlPrefix, str(pageNo), '.shtml'])
            try:
                html = getQichachaHtml(pageUrl)
                soup = getSoupByStrEncode(html, 'utf-8')
                dealUIDsBySoup(conn, csor, pageNo, soup, prov)
            except Exception as err:
                myLogging.error('page ' + str(pageNo) + ' error %s', err)
Пример #7
0
def insertWithUid(conn2, csor2, prv, uid):

    if uid in idBloom:
        print 'already crawled uid:', uid
        return

    # idBloom.add(uid)

    global conn, csor
    if not conn or (not csor):
        conn2, csor2 = getComConnCsor()

    com_base_info_str = getBaseInfoById(prv, uid)
    com_base_info_json = json.loads(com_base_info_str)
    if com_base_info_json['status'] != 1:
        print 'json int not succ , uid: ', uid, ' content:', com_base_info_str
        return
    data = com_base_info_json['data']['Company']
    companyType = data['EconKind']
    # webName = data['webName']
    companyName = data['Name']
    liscense = data['No']
    if not liscense:
        liscense = data['OrgNo']
    examineDate = ''
    if data['CheckDate']:
        examineDate = data['CheckDate'].strip()
        # webSite = ','.join(data['webSite'])
        # sql = """insert ignore into com_base (id,companyName,companyType,examineDate,liscense,source,webSite,webName) values (%s,%s,%s,%s,%s,%s,%s,%s);""" % (str(id), companyName, companyType,examineDate, liscense, "tianyacha",webSite,webName)

    global staticInsertTotolCount, staticInsertTotolTime, staticInsertCarry
    startTime = time.time()

    try:
        csor2.execute(
            """insert ignore into com_base_copy (id,companyName,companyType,examineDate,liscense,source,src_content)
            values (%s,%s,%s,%s,%s,%s,%s);""",
            (uid, companyName, companyType, examineDate, liscense, "qichacha",
             com_base_info_str))
        conn2.commit()
        myLogging.info('comOk, uid: %s, comName: %s', uid,
                       unicode(companyName).encode('utf-8'))
        endTime = time.time()
        thisSpentTime = endTime - startTime

        statisMysqlInsert(staticInsertCarry, thisSpentTime)

    except Exception as e:
        myLogging.error('insert error, uid: %s, error:%s', uid, e)
Пример #8
0
def fromInvestInt():
    """Run ``getInvestListByNameId`` for rows selected from ``com_base_copy``.

    The query is pinned to a single hard-coded ``id`` and limited to 10 rows.
    Rows whose ``companyName`` is falsy are logged with a warning and
    skipped.

    Works on the module-level ``conn``/``csor`` pair; when either of them is
    falsy a new pair is obtained from ``getComConnCsor()`` and stored back
    into the module globals.
    """
    global conn, csor
    if not (conn and csor):
        conn, csor = getComConnCsor()
    csor.execute(
        "select id,companyName from com_base_copy where id = '6bc7e7ccdb755391651316a0227c059b' and companyName is not Null  limit 10;"
    )
    for row in csor.fetchall():
        uid, cName = row[0], row[1]
        if cName:
            getInvestListByNameId(uid, cName)
        else:
            myLogging.warning('no comName skip, uid: %s', uid)
Пример #9
0
def getQichachaDigests():
    """Return a filter of known company ids, from a dump file or the database.

    ``loadBloomFromFile('local/qichachaUIDs')`` is tried first; a truthy
    result is returned as-is. Otherwise a new object is created with
    ``getBloom(2000 * 10000)``, filled with the first column of every row of
    ``select id from com_base_copy``, written out with ``dumpBloomToFile``
    under the same file name, and returned.

    :return: the loaded or freshly built filter object.
    """
    dumpPath = 'local/qichachaUIDs'

    idbloom = loadBloomFromFile(dumpPath)
    if idbloom:
        myLogging.info('load bloom from file succ, no need load from db')
        return idbloom

    myLogging.info('no dump bloom file,  load from db')
    idbloom = getBloom(2000 * 10000)
    dbConn, cursor = getComConnCsor()
    cursor.execute('select id from com_base_copy')
    for row in cursor.fetchall():
        idbloom.add(row[0])
    myLogging.info('load exists ids ok, generate dump bloom file')
    dumpBloomToFile(idbloom, fileName=dumpPath)
    return idbloom