예제 #1
0
def parse_base(item):
    """Assemble the base company record for one crawled item.

    ``item`` is indexed by ``"key"`` and ``"content"``; the content mapping is
    read for ``website``, ``name``, ``desc``, ``score``, ``url_android`` and
    ``url_ios``.

    Returns ``None`` when ``item`` is ``None``; otherwise a dict holding the
    company fields plus an ``artifacts`` list with one entry per usable link.
    """
    if item is None:
        return None

    company_key = item["key"]
    content = item["content"]

    found = []

    # The website is recorded only when util.get_market() reports a type for it.
    website = util.norm_url(content["website"])
    website_type, _ = util.get_market(website)
    if website_type is not None:
        found.append({
            "type": website_type,
            "name": content["name"],
            "desc": content["desc"],
            "link": website,
        })

    # Store links: every non-None URL is recorded under its fixed type code.
    for url_field, type_code in (("url_android", 4050), ("url_ios", 4040)):
        if content[url_field] is None:
            continue
        found.append({
            "type": type_code,
            "name": content["name"],
            "desc": content["desc"],
            "link": util.norm_url(content[url_field]),
        })

    record = {
        "shortName": content["name"],
        "fullName": None,
        "productName": content["name"],
        "description": None,
        "brief": content["desc"],
        "round": 0,
        "roundDesc": "",
        "companyStatus": 2010,
        "fundingType": 0,
        "locationId": 0,
        "establishDate": None,
        "logo": None,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": None,
        "type": 41020,
        "score": content["score"],
        "artifacts": found,
    }
    return record
예제 #2
0
def parse_artifact(item):
    """Extract the website artifacts listed on a crawled company page.

    Args:
        item: mapping whose ``"content"`` entry is the page HTML, or ``None``.

    Returns:
        ``None`` when ``item`` is ``None``; otherwise a list of dicts with the
        keys ``type`` (always 4010), ``name``, ``desc`` and ``link`` -- one per
        product entry labelled "网站" that carries a non-empty href.
    """
    if item is None:
        return None

    artifacts = []
    d = pq(item["content"])

    logger.info("*** artifact ***")
    for anchor in d('ul.list-prod> li> a'):
        entry = pq(anchor)
        label = entry('h4> span').text().strip()
        if label != u"网站":
            # TODO: entries labelled "app" are not handled yet.  The planned
            # mapping was: itunes.apple.com ".../app/..." links -> type 4040
            # (rewritten to https://itunes.apple.com/cn/app/<id>), and
            # www.wandoujia.com/apps/ links -> type 4050.
            continue

        # attr() yields None for an anchor without href; skip it instead of
        # failing on None.strip().
        href = entry.attr("href")
        if href is None:
            continue
        link = href.strip()
        if link == "":
            continue
        link = util.norm_url(link)

        name = entry('h4> b').text().strip()
        desc = entry('p').text().strip()
        logger.info("type: %s, name: %s, link: %s, desc: %s",
                    4010, name, link, desc)
        # NOTE(review): the link is normalised a second time, exactly as
        # before; drop this once util.norm_url is confirmed to be idempotent.
        link = util.norm_url(link)
        artifacts.append({
            "type": 4010,
            "name": name,
            "desc": desc,
            "link": link,
        })

    logger.info("")
    return artifacts
예제 #3
0
def parse_base(item):
    """Parse the base company profile out of a crawled company page.

    Args:
        item: mapping with ``"key"`` (stored as ``sourceId``) and ``"content"``
            (the page HTML), or ``None``.

    Returns:
        ``None`` when ``item`` is ``None`` or when neither a short name nor a
        full company name could be extracted; otherwise a dict with the
        company fields and a single website entry under ``artifacts``.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    d = pq(item["content"])

    # The title reads "product / company"; without a slash the product name
    # doubles as the company short name.
    company_short_name = ""
    product_name = d('div.line-title> span> b').clone().children().remove(
    ).end().text().strip()
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s", product_name)
    logger.info("company short name: %s", company_short_name)

    details = d('div.des-more> div')

    company_name = details.eq(0).text().strip().replace("公司全称:", "")
    if company_name in ("暂无", "暂未收录"):
        company_name = ""
    company_name = util.norm_company_name(company_name)
    logger.info("company name: %s", company_name)

    if company_short_name == "" and company_name == "":
        return None

    establish_date = None
    text = details.eq(1).text().strip().replace("成立时间:", "")
    # NOTE(review): the separator "." is unescaped, so it matches any
    # character; pattern left as-is to keep matching behaviour unchanged.
    result = util.re_get_result(r'(\d*?).(\d*?)$', text)
    if result is not None:
        (year, month) = result
        try:
            if int(month) > 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        try:
            establish_date = datetime.datetime.strptime(
                "%s-%s-1" % (year, month), '%Y-%m-%d')
        except ValueError:
            # strptime rejects e.g. an empty year or month 0; treat the date
            # as unknown rather than aborting the whole parse.
            logger.warning("unparseable establish date: %s", text)
    logger.info("establish date: %s", establish_date)

    location_id = 0
    text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s", province, city)

        conn = db.connect_torndb()
        try:
            # Prefer the city; fall back to the province when it is unknown.
            for location_name in (city, province):
                row = conn.get(
                    "select * from location where locationName=%s",
                    location_name)
                if row is not None:
                    location_id = row["locationId"]
                    break
        finally:
            # Release the connection even when a query raises.
            conn.close()
    logger.info("locationId: %d", location_id)

    company_status = 2010
    if details.eq(2).text().strip() == "已关闭":
        company_status = 2020
    logger.info("company_status: %d", company_status)

    funding_type = 0
    text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s", text)
    # NOTE(review): both labels map to 8020, as in the original code --
    # confirm that "寻求收购" is not meant to use a different code.
    if text in ("融资需求 · 需要融资", "融资需求 · 寻求收购"):
        funding_type = 8020
    logger.info("funding_type=%d", funding_type)

    scopes = d("span.scope.c-gray-aset> a")
    field = scopes.eq(0).text().strip()
    logger.info("field: %s", field)

    sub_field = scopes.eq(1).text().strip()
    logger.info("sub field: %s", sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(
        " ", ",")
    logger.info("tags: %s", tags)

    desc = d("div.des").text().strip()
    logger.info("desc: %s", desc)

    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # attr() yields None when the link is missing; treat that like an empty
    # website instead of failing on None.strip().
    href = d('div.link-line> a').attr("href")
    website = href.strip() if href is not None else ""
    if website == "http://%e6%9a%82%e6%97%a0":
        website = ""
    website = util.norm_url(website)
    logger.info("website: %s", website)

    artifacts = [{
        "type": 4010,
        "name": product_name,
        "desc": desc,
        "link": website
    }]

    # Funding status.
    # NOTE(review): the parsed round is only logged; the returned record still
    # carries round=0 / roundDesc="" as before -- confirm whether intended.
    round_text = d('span.t-small.c-green').text().replace("(", "").replace(
        ")", "").replace("获投状态:", "").strip()
    funding_round, round_text = itjuzi_helper.getFundingRound(round_text)
    logger.info("获投状态: %d, %s", funding_round, round_text)

    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name,
        "productName": product_name,
        "description": desc,
        "brief": "",
        "round": 0,
        "roundDesc": "",
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": location_id,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "artifacts": artifacts
    }
예제 #4
0
def parse_company(company_key):
    """Parse a stored company page and persist it as a source company.

    Looks the page up in ``fromdb.company`` for the module-level ``source``,
    extracts the profile, inserts it via ``parser_util.insert_source_company``,
    runs the artifact/member/develop/job parsers for it and finally sends a
    ``{"type": "company", "id": ...}`` message to the ``parser_v2`` topic.

    Args:
        company_key: key of the company within ``source``.

    Returns:
        None.  Returns early, without writing anything, when the page is not
        stored, when the name or full name is missing, or when the page has
        no usable company introduction.
    """
    company = fromdb.company.find_one({"source": source, "company_key":company_key})
    if company is None:
        return

    content = company["content"]

    d = pq(content)

    logo_url = d('.top_info_wrap > img').attr('src')
    logo_id = None
    if logo_url is not None:
        logo_id = parser_util.get_logo_id(source, company_key, 'company', logo_url)

    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    link = title_link.attr('href')
    # NOTE(review): the normalised website is computed but not used below.
    website = util.norm_url(link)
    fullName = title_link.attr('title')

    if name is None or fullName is None:
        return

    if len(name) > len(fullName):
        name = fullName

    brief = d('.company_word').text()
    desc_text = d('.company_intro_text').text()

    if u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10 :
        return

    desc_html = d('.company_intro_text > .company_content').html()
    if desc_html is None:
        # No inner content node: html() yields None, so fall back to the
        # container's plain text instead of failing on None.replace().
        desc = desc_text
    else:
        # Drop the "expand" toggle before stripping the markup.
        desc_html = desc_html.replace('<span class="text_over">展开</span>', '')
        desc = BeautifulSoup(desc_html).getText()

    logger.info(desc)

    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        field = d('#basic_container > .item_content >ul > li:eq(0) > span').text()
        stage = d('#basic_container > .item_content >ul > li:eq(1) > span').text()
        headCount = d('#basic_container > .item_content >ul > li:eq(2) > span').text()
        location = d('#basic_container > .item_content >ul > li:eq(3) > span').text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
        # Cut the head count at the "人" suffix.  A missing suffix used to
        # raise inside this block and silently skip location and address;
        # now the text is simply kept as-is.
        suffix_at = headCount.find(u'人')
        if suffix_at >= 0:
            headCount = headCount[0:suffix_at]
    except Exception:
        # Basic info stays best-effort, but the failure is no longer hidden.
        logger.exception("failed to read basic info, company_key=%s", company_key)

    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        # "a-b" gives both bounds; anything else is used as the minimum only.
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            min_staff = staffarr[0]
            max_staff = None

    # Map the stage label to a round code; an unrecognised label is passed
    # through unchanged.
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110

    location_id = parser_util.get_location_id(location)

    source_company = {"name": name,
                      "fullName": fullName,
                      "description": desc,
                      "brief": brief,
                      "round": stage,
                      "roundDesc": None,
                      "companyStatus": 2010,
                      'fundingType':funding_type,
                      "locationId": location_id,
                      "address": address,
                      "phone": None,
                      "establishDate": None,
                      "logo": logo_id,
                      "source": source,
                      "sourceId": company_key,
                      "field": field,
                      "subField": None,
                      "tags": None,
                      "headCountMin": min_staff,
                      "headCountMax": max_staff
                      }
    source_company_id = parser_util.insert_source_company(source_company)

    parse_artifact(d, source_company_id)

    parser_member(d, company_key, source_company_id)

    parser_develop(d, company_key, source_company_id)

    parser_job(company_key, source_company_id)

    msg = {"type":"company", "id":source_company_id}
    logger.info(msg)
    kafka_producer.send_messages("parser_v2", json.dumps(msg))
예제 #5
0
def parseCompany(source, company_key):
    """Parse one crawled company page and persist everything found on it.

    Loads the raw page for (source, company_key) from ``fromdb.company``,
    extracts the company profile via ``pq``, then inserts or updates
    ``source_company`` and its related rows (``source_artifact``,
    ``source_footprint``, ``source_funding``, ``source_investor``,
    ``source_member`` and the two relation tables) through the module-level
    ``conn``.  News entries are handed to ``parseNews`` and a
    ``{"type": "company", "id": ...}`` message is finally sent to the
    ``parser_v2`` topic.

    Returns early, before anything is written, when the page is not stored or
    when short name, full name and website are all empty.

    NOTE(review): the handler belonging to the function-wide ``try:`` is not
    part of this excerpt.
    """
    logger.info("*****************************************")
    logger.info("parseComany, company_key=%s" % company_key)
    try:
        item = fromdb.company.find_one({"source":source, "company_key":company_key})
        if item is None:
            return

        html = item["content"]
        #doc = lxml.html.fromstring(html)
        d = pq(html)

        # The title reads "product / company"; without a slash the product
        # name doubles as the company short name.
        company_short_name = ""
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
        temps = product_name.split("/",1)
        if len(temps) == 2:
            product_name = temps[0].strip()
            company_short_name = temps[1].strip()
        if company_short_name == "":
            company_short_name = product_name
        logger.info("product name: " + product_name)
        logger.info("company short name: " + company_short_name)

        # Placeholder values for "no full name" are mapped to "".
        company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:","")
        if company_name == "暂无" or company_name == "暂未收录":
            company_name = ""
        company_name = util.norm_company_name(company_name)
        logger.info("company name: " + company_name)

        # NOTE(review): attr("href") is dereferenced without a None check.
        website = d('div.link-line> a.weblink').attr("href").strip()
        if website=="http://%e6%9a%82%e6%97%a0":
            website = ""
        logger.info("website: " + website)

        if company_short_name == "" and company_name == "" and website == "":
            return

        # NOTE(review): the "." in this pattern is unescaped (the footprint
        # pattern further down uses "\."), and unlike the footprint section
        # year/month are not range-checked before strptime.
        establish_date = None
        str = d('div.des-more> div').eq(1).text().strip().replace("成立时间:","")
        result = util.re_get_result('(\d*?).(\d*?)$',str)
        if result != None:
            (year, month) = result
            establish_date = datetime.datetime.strptime("%s-%s-1" % (year,month), '%Y-%m-%d')
        logger.info("establish date: %s", establish_date)

        # Location text is split at "·"; the city is looked up first and the
        # province only when the city has no row.
        locationId=0
        str = d('span.loca').text().strip()
        #logger.info(str)
        result = util.re_get_result(u'(.*?)·(.*?)$',str)
        if result != None:
            (province, city) = result
            province = province.strip()
            city = city.strip()
            logger.info("location: %s-%s" % (province, city))

            locationId = 0
            result = conn.get("select * from location where locationName=%s", city)
            if result != None:
                locationId = result["locationId"]
            else:
                result = conn.get("select * from location where locationName=%s", province)
                if result != None:
                    locationId = result["locationId"]

        logger.info("locationId: %d" % locationId)

        # Status defaults to 2010 and becomes 2020 when the page says "已关闭".
        company_status = 2010
        str = d('div.des-more> div').eq(2).text().strip()
        if str == "已关闭":
            company_status = 2020
        logger.info("company_status: %d" % company_status)

        # NOTE(review): both funding-need labels map to the same code 8020.
        funding_type = 0
        str = d("span.tag.bg-c").text().strip()
        logger.info(str)
        if str == "融资需求 · 需要融资":
            funding_type = 8020
        elif str == "融资需求 · 寻求收购":
            funding_type = 8020
        logger.info("funding_type=%d" % funding_type)

        field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
        logger.info("field: " + field)

        sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
        logger.info("sub field: " + sub_field)

        # tags ends up as one string with spaces replaced by commas.
        tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ",",")
        logger.info(tags)

        desc = d("div.des").text().strip()
        logger.info("desc: " + desc)

        #logo
        # The logo is downloaded and stored via imgfs only when no logo is
        # recorded for this company yet; otherwise the stored id is reused.
        logo_id = None
        source_company = conn.get("select * from source_company where source=%s and sourceId=%s", source, company_key)
        if source_company == None or source_company["logo"] == None or source_company["logo"] == "":
            log_url = d("div.pic >img").attr("src")
            if log_url is not None and len(log_url.strip()) > 0:
                logger.info(log_url)
                image_value = my_request.get_image(logger,log_url)
                if image_value != None:
                    logo_id = imgfs.put(image_value, content_type='jpeg', filename='company_%s_%s.jpg' % (source, company_key))
                    pass
        else:
            logo_id = source_company["logo"]
        logger.info("gridfs logo_id=%s" % logo_id)

        # Insert the company when it is new, otherwise update the stored row.
        # NOTE(review): the insert passes SOURCE where the rest of the function
        # uses the `source` parameter -- presumably a module-level constant;
        # verify.  Also, `tags` is already a comma-separated string, so
        # ",".join(tags) puts a comma between every character.  The update
        # branch does not write tags at all.
        if source_company == None:
            source_company_id = conn.insert("insert source_company(name,fullName,description,brief,\
                        round,roundDesc,companyStatus,fundingType,locationId,establishDate,logo,\
                        source,sourceId,createTime,modifyTime,\
                        field,subField,tags) \
                        values(%s,%s,%s,%s,\
                        %s,%s,%s,%s,%s,%s,%s,\
                        %s,%s,now(),now(),\
                        %s,%s,%s)",
                        product_name, company_name, desc, '',
                        0,'',company_status,funding_type,locationId,establish_date,logo_id,
                        SOURCE,company_key,
                        field,sub_field,",".join(tags)
                        )
        else:
            source_company_id = source_company["id"]
            conn.update("update source_company set \
                        name=%s,fullName=%s,description=%s, \
                        companyStatus=%s,fundingType=%s,locationId=%s,establishDate=%s,logo=%s, \
                        field=%s,subField=%s,\
                        modifyTime=now() \
                        where id=%s",
                        product_name, company_name, desc,
                        company_status,funding_type,locationId,establish_date,logo_id,
                        field,sub_field,
                        source_company_id
                        )

        #artifact
        # Product entries labelled "网站" are stored as type-4010 artifacts,
        # once per (company, link).  Note that `desc` is rebound inside the
        # loop and the rebound value is what the website insert below uses.
        logger.info("*** artifact ***")
        lis = d('ul.list-prod> li> a')
        for li in lis:
            l = pq(li)
            type = l('h4> span').text().strip()
            if type == "网站":
                link = l.attr("href").strip()
                name = l('h4> b').text().strip()
                desc = l('p').text().strip()
                logger.info("name: %s, link: %s, desc: %s" % (name,link,desc))
                if link == "":
                    continue
                link = util.norm_url(link)
                source_artifact = conn.get("select * from source_artifact where sourceCompanyId=%s and type=4010 and link=%s",
                        source_company_id, link)
                if source_artifact is None:
                    sql = "insert source_artifact(sourceCompanyId,`name`,`description`,`link`,`type`,createTime,modifyTime) \
                          values(%s,%s,%s,%s,4010,now(),now())"
                    conn.insert(sql, source_company_id,name,desc,link)

        # The company website itself is stored as a type-4010 artifact too.
        # NOTE(review): unlike the links above it is not passed through
        # util.norm_url before the lookup and insert.
        if website != "":
            source_artifact = conn.get("select * from source_artifact where sourceCompanyId=%s and type=4010 and link=%s",
                            source_company_id, website)
            if source_artifact is None:
                sql = "insert source_artifact(sourceCompanyId,name,description,link,type,createTime,modifyTime) \
                      values(%s,%s,%s,%s,4010,now(),now())"
                logger.info("name: %s, link: %s, desc: %s" % (product_name,website,desc))
                conn.insert(sql,source_company_id,product_name,desc,website)

        #footprint
        # Milestones: entries without description or a "year.month" date are
        # skipped; out-of-range months fall back to 1 and years to 1970.
        logger.info("*** footprint ***")
        lis = d('ul.list-milestone> li')
        for li in lis:
            l = pq(li)
            footDesc = l('p').eq(0).text().strip()
            if footDesc is None or footDesc == "":
                continue
            footDateText = l('p> span').text().strip()
            if footDateText is None or footDateText == "":
                continue
            result = util.re_get_result('(\d*?)\.(\d*?)$',footDateText)
            if result == None:
                continue
            (year, month) = result
            year = int(year)
            try:
                month = int(month)
            except:
                month = 1

            if month<=0 or month>12:
                month = 1
            if year < 1970 or year > 3000:
                year = 1970
            footDate = datetime.datetime.strptime("%s-%s-1" % (year,month), '%Y-%m-%d')
            logger.info(footDate)
            logger.info(footDesc)

            # Insert only when the same (company, date, description) is absent.
            fp = conn.get("select * from source_footprint where sourceCompanyId=%s and footDate=%s and description=%s",
                              source_company_id, footDate, footDesc)
            if fp == None:
                conn.insert("insert source_footprint(sourceCompanyId,footDate,description,createTime,modifyTime) \
                            values(%s,%s,%s,now(),now())",
                            source_company_id, footDate, footDesc)

        # funding
        # One table row per funding round.
        logger.info("*** funding ***")
        lis = d('table.list-round-v2> tr')
        for li in lis:
            l = pq(li)
            dateStr = l('td> span.date').text().strip()
            result = util.re_get_result('(\d*?)\.(\d*?)\.(\d*?)$',dateStr)
            fundingDate = None
            if result != None:
                (year, month, day) = result
                fundingDate = datetime.datetime.strptime("%s-%s-%s" % (year,month,day), '%Y-%m-%d')
            logger.info(fundingDate)

            # Map the round label (with "轮" removed) to a round code by
            # prefix; "种子" is stored under the "天使" description, and an
            # unrecognised label keeps code 0.
            roundStr = l('td.mobile-none> span.round> a').text().strip().replace("轮","")
            logger.info(roundStr)
            fundingRound = 0
            if roundStr.startswith("种子"):
                fundingRound = 1010
                roundStr = "天使"
            elif roundStr.startswith("天使"):
                fundingRound = 1010
            elif roundStr.startswith("Pre-A"):
                fundingRound = 1020
            elif roundStr.startswith("A"):
                fundingRound = 1030
            elif roundStr.startswith("B"):
                fundingRound = 1040
            elif roundStr.startswith("Pre-B"):
                fundingRound = 1040
            elif roundStr.startswith("C"):
                fundingRound = 1050
            elif roundStr.startswith("D"):
                fundingRound = 1060
            elif roundStr.startswith("E"):
                fundingRound = 1070
            elif roundStr.startswith("F"):
                fundingRound = 1100
            elif roundStr.startswith("IPO"):
                fundingRound = 1110
            elif roundStr.startswith("收购"):
                fundingRound = 1120
            logger.info("fundingRound=%d" % fundingRound)

            moneyStr = l('td> span.finades> a').text().strip()
            (currency, investment, precise) = parseMoney(moneyStr)
            logger.info("%s - %s - %s" % (currency, investment, precise))

            # A funding row is identified by (company, roundDesc); an existing
            # row has its amount and date refreshed, its round code is kept.
            source_funding = conn.get("select * from source_funding where sourceCompanyId=%s and roundDesc=%s",
                                          source_company_id, roundStr)
            if source_funding == None:
                source_funding_id = conn.insert("insert source_funding(sourceCompanyId,investment,round,roundDesc, currency, precise, fundingDate,createTime,modifyTime) \
                                                values(%s,%s,%s,%s,%s,%s,%s,now(),now())",
                                                source_company_id, investment, fundingRound, roundStr,
                                                currency, precise,fundingDate)
            else:
                source_funding_id = source_funding["id"]
                conn.update("update source_funding set investment=%s,currency=%s, precise=%s, fundingDate=%s, modifyTime=now() \
                            where id=%s",
                            investment, currency, precise, fundingDate, source_funding_id
                                )

            # Investors of this round, taken from the fourth table cell.
            # NOTE(review): the one-element unpacking assumes re_get_result
            # returned a match for the investor URL -- confirm what happens
            # for links of a different shape.
            hs = l('td:eq(3)> a')
            for h in hs:
                h = pq(h)
                investor_name = h.text().strip()
                investor_url = h.attr("href").strip()
                (investor_key,) = util.re_get_result(r"http://www.itjuzi.com/investfirm/(\d*)$", investor_url)
                logger.info(investor_name)
                logger.info(investor_url)
                logger.info(investor_key)

                # `item` is rebound here; the company document is not needed
                # any more at this point.
                item = fromdb.investor.find_one({"source":source, "investor_key":investor_key})
                inv = parseInvestor(item)

                if inv is not None:
                    (name, logo, website, stage, field, desc) = inv
                    source_investor = conn.get("select * from source_investor where source=%s and sourceId=%s",
                                               source, investor_key)
                    # NOTE(review): unlike the company logo above, image_value
                    # is handed to imgfs.put without a None check.
                    logo_id = None
                    if source_investor == None or source_investor["logo"] == None or source_investor["logo"] == "":
                        if logo is not None and logo != "":
                            image_value = my_request.get_image(logger,logo)
                            logo_id = imgfs.put(image_value, content_type='jpeg', filename='investor_%s_%s.jpg' % (source, investor_key))
                            logger.info("gridfs logo_id=%s" % logo_id)
                    else:
                        logo_id = source_investor["logo"]

                    # Upsert the investor (type is always written as 10020).
                    if source_investor is None:
                        sql = "insert source_investor(name,website,description,logo,stage,field,type, \
                        source,sourceId,createTime,modifyTime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())"
                        source_investor_id = conn.insert(sql,
                            name,website,desc,logo_id,stage,field,10020,source,investor_key)
                    else:
                        source_investor_id = source_investor["id"]
                        sql = "update source_investor set name=%s,website=%s,description=%s,logo=%s,stage=%s,\
                        field=%s,type=%s,modifyTime=now() where id=%s"
                        conn.update(sql,
                            name,website,desc,logo_id,stage,field,10020, source_investor_id)

                    # Link funding and investor once.
                    source_funding_investor_rel = conn.get("select * from source_funding_investor_rel where \
                            sourceFundingId=%s and sourceInvestorId=%s",
                            source_funding_id, source_investor_id)
                    if source_funding_investor_rel is None:
                        conn.insert("insert source_funding_investor_rel(sourceFundingId, sourceInvestorId, \
                                    createTime,modifyTime) \
                                    values(%s,%s, now(),now())", source_funding_id, source_investor_id)

        # members
        # Team members: name and position come from the company page, the
        # remaining details from the stored member document via parseMember.
        logger.info("*** member ****")
        lis = d('ul.list-prodcase> li')
        for li in lis:
            l = pq(li)
            member_name = l('h4> a> b> span.c').text().strip()
            position = l('h4> a> b> span.c-gray').text().strip()
            str = l('h4> a').attr("href").strip()
            (member_key,) = util.re_get_result(r'person/(\d*?)$',str)
            logger.info("member_key: %s, member_name: %s, position: %s" % (member_key, member_name, position))

            item = fromdb.member.find_one({"source":source, "member_key":member_key})
            m = parseMember(item)

            if m is not None:
                (weibo, introduction, education, work, location, role, pictureUrl) = m

                source_member = conn.get("select * from source_member where source=%s and sourceId=%s",
                                                   source, member_key)
                # NOTE(review): image_value is stored without a None check
                # here as well.
                logo_id = None
                if source_member == None or source_member["photo"] == None or source_member["photo"] == "":
                    if pictureUrl is not None and pictureUrl != "":
                        image_value = my_request.get_image(logger,pictureUrl)
                        logo_id = imgfs.put(image_value, content_type='jpeg', filename='member_%s_%s.jpg' % (source, member_key))
                        logger.info("gridfs logo_id=%s" % logo_id)
                else:
                    logo_id = source_member["photo"]

                # Upsert the member row.
                if source_member is None:
                    sql = "insert source_member(name,photo,weibo,location,role,description,\
                    education,work,source,sourceId,createTime,modifyTime) \
                    values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())"
                    source_member_id = conn.insert(sql,
                        member_name,logo_id,weibo,location,role,introduction,
                        education,work,source,member_key)
                else:
                    source_member_id = source_member["id"]
                    sql = "update source_member set name=%s,photo=%s,weibo=%s,location=%s,role=%s,description=%s,\
                    education=%s,work=%s,modifyTime=now() where id=%s"
                    conn.update(sql,
                        member_name,logo_id,weibo,location,role,introduction,
                        education,work,source_member_id)

                # Link company and member once; an existing relation is left
                # untouched, so a changed position is not written back.
                source_company_member_rel = conn.get("select * from source_company_member_rel where \
                        sourceCompanyId=%s and sourceMemberId=%s",
                        source_company_id, source_member_id)
                if source_company_member_rel is None:
                    conn.insert("insert source_company_member_rel(sourceCompanyId, sourceMemberId, \
                                position,type,createTime,modifyTime) \
                                values(%s,%s,%s,%s, now(),now())",
                                source_company_id, source_member_id,position,0)

        #news
        # Each news entry is parsed on its own; a failure is logged and the
        # loop moves on to the next entry.
        logger.info("*** news ***")
        lis = d('ul.list-news> li')
        for li in lis:
            try:
                l = pq(li)
                news_url = l('p.title> a').attr("href").strip()
                (news_key,) = util.re_get_result(r"http://www.itjuzi.com/overview/news/(\d*)$", news_url)

                item = fromdb.news.find_one({"source":source, "company_key":company_key, "news_key":news_key})
                parseNews(item)
            except Exception,ex:
                logger.exception(ex)

        # Announce the parsed company on the "parser_v2" topic.
        msg = {"type":"company", "id":source_company_id}
        kafkaProducer.send_messages("parser_v2", json.dumps(msg))
예제 #6
0
def aggregate(source_company_id):
    """Merge one source_company row into the canonical company tables.

    Steps, in order:
      1. Load the source_company row; return silently if it does not exist.
      2. Resolve the canonical company with find_company(). When there is
         no match a new company row is inserted, unless the source row's
         companyStatus is 2020, in which case nothing is created and the
         function returns.
      3. Store the resolved companyId back on the source_company row.
      4. Register the source row's fullName as a company alias, then copy
         missing source_domain rows into domain (adding organizer aliases
         where the organizerType matches).
      5. Link every not-yet-linked source_artifact to an artifact row,
         creating the artifact when needed.
      6. Publish {"type": "company", "id": company_id} on the
         "aggregator_v2" topic, retrying every 60 seconds until it succeeds.

    :param source_company_id: value of source_company.id to aggregate.
    :return: None.
    """
    logger.info("source_company_id: %s", source_company_id)
    s = conn.get("select * from source_company where id=%s", source_company_id)
    if s is None:
        return

    # company
    company_id = find_company(s)
    if company_id is not None:
        logger.info("Update company: %s", s["name"])
    else:
        logger.info("New company: %s", s["name"])
        if s["companyStatus"] == 2020:
            # Source rows in status 2020 never create a new company.
            return
        company_id = _aggregate_insert_company(s)

    logger.info("companyId=%s", company_id)
    conn.update("update source_company set companyId=%s where id=%s",
                company_id, source_company_id)

    # company_alias
    add_company_alias(company_id, s["fullName"])

    _aggregate_domains(company_id, source_company_id)
    _aggregate_artifacts(company_id, source_company_id)
    _aggregate_notify(company_id)


# Market code (second value returned by util.get_market) -> regex whose single
# group captures the Android package identifier from the artifact link.
_AGGREGATE_ANDROID_PATTERNS = {
    16030: 'wandoujia.com/apps/(.*)',  # wandoujia
    16040: 'apkName=(.*)',
}


def _aggregate_insert_company(s):
    """Insert a company row copied from source_company row *s*.

    Returns whatever conn.insert() returns, which the caller uses as the new
    company id. The company `type` column is always written as 41020 and
    `active` as 'Y'.
    """
    code = get_company_code(s["name"])
    sql = ("insert company(code,name,fullName,description,brief,"
           "productDesc,modelDesc,operationDesc,teamDesc,marketDesc,"
           "compititorDesc,advantageDesc,planDesc,"
           "round,roundDesc,companyStatus,fundingType,preMoney,currency,"
           "locationId,address,phone,establishDate,logo,type,"
           "headCountMin,headCountMax,"
           "active,createTime,modifyTime) "
           "values(%s,%s,%s,%s,%s,"
           "%s,%s,%s,%s,%s,%s,%s,%s,"
           "%s,%s,%s,%s,%s,%s,"
           "%s,%s,%s,%s,%s,41020,"
           "%s,%s,"
           "%s,now(),now())")
    # The *Desc columns are read with .get() so a row lacking them yields None.
    return conn.insert(sql, code, s["name"], s["fullName"],
                       s["description"], s["brief"],
                       s.get("productDesc"), s.get("modelDesc"),
                       s.get("operationDesc"), s.get("teamDesc"),
                       s.get("marketDesc"),
                       s.get("compititorDesc"),
                       s.get("advantageDesc"), s.get("planDesc"),
                       s["round"], s["roundDesc"],
                       s["companyStatus"], s["fundingType"],
                       s["preMoney"], s["currency"],
                       s["locationId"], s["address"], s["phone"],
                       s["establishDate"], s["logo"],
                       s["headCountMin"], s["headCountMax"], 'Y')


def _aggregate_domains(company_id, source_company_id):
    """Copy source_domain rows of the source company into domain if missing."""
    source_domains = conn.query(
        "select * from source_domain where sourceCompanyId=%s",
        source_company_id)
    for sd in source_domains:
        if sd["organizerType"] == "企业":
            add_company_alias(company_id, sd["organizer"])

        if sd["organizer"] is not None:
            # NOTE(review): unlike the query below this one has no `limit 1`;
            # if conn.get() rejects multi-row results it can raise here --
            # confirm against the conn implementation.
            domain = conn.get(
                "select * from domain where companyId=%s and domain=%s and organizer=%s",
                company_id, sd["domain"], sd["organizer"])
        else:
            domain = conn.get(
                "select * from domain where companyId=%s and domain=%s limit 1",
                company_id, sd["domain"])
        if domain is not None:
            continue

        sql = ("insert domain(companyId,domain,organizer,organizerType,"
               "beianhao,mainBeianhao,"
               "websiteName,homepage,beianDate,expire,"
               "active,createTime,modifyTime) "
               "values(%s,%s,%s,%s,%s,%s,"
               "%s,%s,%s,%s,"
               "'Y',now(),now())")
        conn.insert(sql,
                    company_id,
                    sd["domain"], sd["organizer"], sd["organizerType"],
                    sd["beianhao"], sd["mainBeianhao"],
                    sd["websiteName"], sd["homepage"],
                    sd["beianDate"], sd["expire"])
        # TODO: handle the `expire` value


def _aggregate_artifacts(company_id, source_company_id):
    """Link each unlinked source_artifact of the source company to an artifact."""
    sas = conn.query("select * from source_artifact where sourceCompanyId=%s",
                     source_company_id)
    for sa in sas:
        if sa["artifactId"] is not None:
            # Already linked by an earlier run.
            continue

        kind = sa["type"]
        if kind == 4010:  # website
            _aggregate_website_artifact(company_id, sa)
        elif kind == 4040:  # itunes
            result = util.re_get_result(r'id(\d*)', sa["link"])
            if result is None:
                continue
            app_id, = result
            _aggregate_app_artifact(company_id, sa, 4040, app_id)
        elif kind == 4050:  # android
            package = _aggregate_android_package(sa["link"])
            if package is None:
                continue
            _aggregate_app_artifact(company_id, sa, 4050, package)
        # Any other artifact type is left unlinked.


def _aggregate_website_artifact(company_id, sa):
    """Link a type-4010 source artifact, matching by company and name or link."""
    if sa["link"] is None or sa["link"] == "":
        return

    link = util.norm_url(sa["link"])
    try:
        domain = util.get_domain(link)
    except Exception:
        # Best effort: a link whose domain cannot be extracted is skipped.
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.
        return

    a = conn.get(
        "select * from artifact where companyId=%s and type=4010 and (name=%s or link=%s) limit 1",
        company_id, sa["name"], link)
    if a is None:
        sql = ("insert artifact(companyId,name,description,link,domain,type,"
               "active,createTime,modifyTime) "
               "values(%s,%s,%s,%s,%s,4010,'Y',now(),now())")
        artifact_id = conn.insert(sql, company_id, sa["name"],
                                  sa["description"], link, domain)
    else:
        artifact_id = a["id"]
    _aggregate_link_artifact(sa, artifact_id)


def _aggregate_android_package(link):
    """Return the package captured from an Android market link, or None.

    None is returned when the link's market has no known pattern or the
    pattern does not match.
    """
    _, market = util.get_market(link)
    pattern = _AGGREGATE_ANDROID_PATTERNS.get(market)
    if pattern is None:
        return None
    result = util.re_get_result(pattern, link)
    if result is None:
        return None
    package, = result
    return package


def _aggregate_app_artifact(company_id, sa, artifact_type, domain):
    """Link an app source artifact to the artifact with this type and domain.

    The lookup is not restricted to company_id: an existing artifact with the
    same type and domain is reused whichever company owns it. A new artifact
    is inserted for company_id only when no such row exists.
    """
    a = conn.get("select * from artifact where type=%s and domain=%s",
                 artifact_type, domain)
    if a is None:
        sql = ("insert artifact(companyId,name,description,link,domain,type,"
               "active,createTime,modifyTime) "
               "values(%s,%s,%s,%s,%s,%s,'Y',now(),now())")
        artifact_id = conn.insert(sql, company_id, sa["name"],
                                  sa["description"], sa["link"],
                                  domain, artifact_type)
    else:
        artifact_id = a["id"]
    _aggregate_link_artifact(sa, artifact_id)


def _aggregate_link_artifact(sa, artifact_id):
    """Record on the source_artifact row which artifact it resolved to."""
    conn.update("update source_artifact set artifactId=%s where id=%s",
                artifact_id, sa["id"])


def _aggregate_notify(company_id):
    """Publish the aggregated company id, retrying until the send succeeds."""
    # Serialized once, outside the retry loop: a serialization failure would
    # never succeed on retry, so it is allowed to propagate.
    payload = json.dumps({"type": "company", "id": company_id})
    while True:
        try:
            kafkaProducer.send_messages("aggregator_v2", payload)
            return
        except Exception as e:
            logger.exception(e)
            # Back off for a minute before trying again.
            time.sleep(60)