def add_company_alias(company_id, full_name):
    """Record ``full_name`` as an alias of ``company_id`` if not yet stored.

    The name is normalized with ``util.norm_company_name`` first. A row is
    inserted into ``company_alias`` (type 12010, active 'Y') only when no row
    with the same companyId and normalized name exists.

    Args:
        company_id: Id of the company the alias belongs to.
        full_name: Alias to record; None or an empty string is ignored.

    Returns:
        None.
    """
    if full_name in (None, ""):
        return

    normalized = util.norm_company_name(full_name)
    existing = conn.get(
        "select * from company_alias where companyId=%s and name=%s",
        company_id, normalized)
    if existing is not None:
        # Alias already known for this company; nothing to add.
        return

    insert_sql = "insert company_alias(companyId,name,type,active,createTime) \
                values(%s,%s,%s,%s,now())"
    conn.insert(insert_sql, company_id, normalized, 12010, 'Y')
def find_company_by_full_name(full_name):
    """Look up a company id by its (normalized) full name.

    The ``company`` table is checked first via ``fullName``; if nothing is
    found there, ``company_alias`` rows of type 12010 are checked via ``name``.

    Args:
        full_name: Company full name; None or an empty string yields None.

    Returns:
        The matching company id, or None when neither table has a match.
    """
    if full_name in (None, ""):
        return None

    normalized = util.norm_company_name(full_name)

    direct_hit = conn.get("select * from company where fullName=%s",
                          normalized)
    if direct_hit is not None:
        return direct_hit["id"]

    alias_hit = conn.get(
        "select * from company_alias where type=12010 and name=%s", normalized)
    return None if alias_hit is None else alias_hit["companyId"]
示例#3
0
def parse_company(company_key):
    """Map one crawled company document onto source_* records.

    Loads the raw document for ``company_key`` (and the module-level
    ``source``) from ``fromdb.company``, builds a ``source_company`` record
    from it, passes that to ``parser_util.insert_source_company``, registers
    one artifact per non-empty link field, hands the finance and member
    sections to ``parseFinance`` / ``parseMember`` and finally publishes a
    ``parser_v2`` message carrying the new source company id.

    Args:
        company_key: Key of the company within the crawled source.

    Returns:
        None. Returns early, before anything is written, when the document
        is missing or its company status is ``"INIT"``.
    """
    item = fromdb.company.find_one({
        "source": source,
        "company_key": company_key
    })
    if item is None:
        return

    # company basic info
    c = item["company_base"]["data"]["company"]

    if c["status"] == "INIT":
        return

    tags_str = ",".join(
        [tag["name"] for tag in item["company_base"]["data"]["tags"]])

    logo_id = None
    logo_url = c["logo"]
    if logo_url != '':
        logo_id = parser_util.get_logo_id(source, company_key, 'company',
                                          logo_url)

    establish_date = None
    if "startDate" in c:
        # startDate is divided by 1000 before conversion, so it is presumably
        # an epoch timestamp in milliseconds -- TODO confirm.
        d = time.localtime(c["startDate"] / 1000)
        establish_date = datetime.datetime(d.tm_year, d.tm_mon, d.tm_mday)

    address1 = c.get("address1")
    address2 = c.get("address2")

    # address2 is tried first; address1 is only consulted while no location
    # id has been resolved yet.
    location_id = 0
    for address in (address2, address1):
        if location_id != 0:
            break
        if address is None:
            continue
        city = kr36_cities.get(str(address))
        if city is not None:
            location_id = parser_util.get_location_id(formCityName(city))

    fullName = c["fullName"].replace("_", "")
    # Drop anything that trails the last occurrence of the company suffix.
    idx = fullName.rfind(u"公司")
    if idx != -1:
        fullName = fullName[:(idx + len(u"公司"))]
    fullName = util.norm_company_name(fullName)

    def stripped(key, default=None):
        # Optional text field: stripped when present, ``default`` otherwise.
        return c[key].strip() if key in c else default

    source_company = {
        "name": c["name"],
        "fullName": fullName,
        "description": stripped("intro", ""),
        "productDesc": stripped("projectAdvantage"),
        "modelDesc": stripped("projectPlan"),
        "operationDesc": stripped("dataLights"),
        "teamDesc": stripped("story"),
        "marketDesc": None,
        "compititorDesc": stripped("competitor"),
        "advantageDesc": None,
        "planDesc": None,
        "brief": c["brief"],
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo_id,
        "source": source,
        "sourceId": company_key,
        "field": c.get("industry"),
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None
    }

    source_company_id = parser_util.insert_source_company(source_company)

    # artifact: (field holding the link, artifact type code), in the order
    # the artifacts are inserted.
    artifact_fields = (
        ("website", 4010),
        ("weibo", 4030),
        ("weixin", 4020),
        ("iphoneAppstoreLink", 4040),
        ("ipadAppstoreLink", 4040),
    )
    for field_name, artifact_type in artifact_fields:
        # ``or ""`` also covers a key that is present with a null value,
        # which would otherwise fail on ``.strip()``.
        link = (c.get(field_name) or "").strip()
        if link == "":
            continue
        parser_util.insert_source_artifact({
            "sourceCompanyId": source_company_id,
            "name": c["name"],
            "description": None,
            "link": link,
            "type": artifact_type
        })

    # funding / past_finance
    parseFinance(source_company_id, item["past_finance"]["data"]["data"])

    # members: each section is passed with its own member type code
    parseMember(source_company_id, 5010, item["founders"]["data"]["data"])
    parseMember(source_company_id, 5030, item["employees"]["data"]["data"])
    parseMember(source_company_id, 5040,
                item["former_members"]["data"]["data"])

    msg = {"type": "company", "id": source_company_id}
    kafka_producer.send_messages("parser_v2", json.dumps(msg))
示例#4
0
def parse_base(item):
    """Extract a company's basic profile from a crawled detail page.

    Args:
        item: Crawled record with the page HTML under ``"content"`` and the
            source-side company key under ``"key"``; may be None.

    Returns:
        A dict with the keys ``shortName``, ``fullName``, ``productName``,
        ``description``, ``brief``, ``round``, ``roundDesc``,
        ``companyStatus``, ``fundingType``, ``locationId``,
        ``establishDate``, ``logo``, ``sourceId``, ``field``, ``subField``,
        ``tags`` and ``artifacts``; or None when ``item`` is None or neither
        a short name nor a full name could be extracted.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    d = pq(html)

    # The title reads "<product>/<company short name>"; without a slash the
    # product name doubles as the short name.
    company_short_name = ""
    product_name = d('div.line-title> span> b').clone().children().remove(
    ).end().text().strip()
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    company_name = d('div.des-more> div').eq(0).text().strip().replace(
        "公司全称:", "")
    # Placeholder texts stand for "no full name available".
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""
    company_name = util.norm_company_name(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return

    establish_date = None
    established_text = d('div.des-more> div').eq(1).text().strip().replace(
        "成立时间:", "")
    result = util.re_get_result(r'(\d*?).(\d*?)$', established_text)
    if result is not None:
        (year, month) = result
        # A missing or out-of-range month falls back to January.
        try:
            month_number = int(month)
        except (TypeError, ValueError):
            month_number = 1
        if month_number <= 0 or month_number > 12:
            month_number = 1
        try:
            establish_date = datetime.datetime.strptime(
                "%s-%s-1" % (year, month_number), '%Y-%m-%d')
        except ValueError:
            # An unusable year must not abort the whole page; the date
            # simply stays unknown.
            logger.info("unparseable establish date: %s", established_text)
    logger.info("establish date: %s" % establish_date)

    locationId = 0
    location_text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', location_text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))

        conn = db.connect_torndb()
        try:
            # Prefer the city; fall back to the province.
            for location_name in (city, province):
                row = conn.get(
                    "select * from location where locationName=%s",
                    location_name)
                if row is not None:
                    locationId = row["locationId"]
                    break
        finally:
            # Release the connection even when a query raises.
            conn.close()
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    status_text = d('div.des-more> div').eq(2).text().strip()
    if status_text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    funding_text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % funding_text)
    # NOTE(review): both labels map to the same code 8020 -- confirm that the
    # second label is not meant to use a different code.
    if funding_text in ("融资需求 · 需要融资", "融资需求 · 寻求收购"):
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(
        " ", ",")
    logger.info("tags: %s" % tags)

    desc = d("div.des").text().strip()
    logger.info("desc: %s" % desc)

    # logo
    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # ``attr`` yields None when no anchor matches; treat that as "no site".
    website = (d('div.link-line> a').attr("href") or "").strip()
    # This href is the URL-encoded placeholder used when no site is listed.
    if website == "http://%e6%9a%82%e6%97%a0":
        website = ""
    website = util.norm_url(website)
    logger.info("website: %s" % website)

    artifacts = [{
        "type": 4010,
        "name": product_name,
        "desc": desc,
        "link": website
    }]

    # funding status
    # NOTE(review): the round resolved here is only logged; the returned dict
    # carries fixed "round": 0 / "roundDesc": "" -- confirm this is intended.
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(
        ")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name,
        "productName": product_name,
        "description": desc,
        "brief": "",
        "round": 0,
        "roundDesc": "",
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "artifacts": artifacts
    }
示例#5
0
def parse_query(source_company_id, html):
    """Store the domain rows found in a filing-query result page.

    Every ``<tr bgcolor='#FFFFFF'>`` of the page is examined. Three-cell rows
    are stored with the domain only; rows with at least eight cells are
    stored with organizer, filing number, site name, homepage, review date
    and an ``expire`` flag ('Y' when the row has six or more ``<del>``
    cells). Rows of any other width are ignored. A row is inserted into
    ``source_domain`` only when the lookup for it finds no existing row.

    A failure while handling one row is logged and does not stop the
    remaining rows from being processed.

    Args:
        source_company_id: Id stored as ``sourceCompanyId`` on new rows.
        html: HTML text of the query result page.

    Returns:
        None.
    """
    doc = lxml.html.fromstring(html)
    dms = doc.xpath("//tr[@bgcolor='#FFFFFF']")
    for dm in dms:
        try:
            temps = dm.xpath("td")
            if len(temps) == 3:
                # Not filed: only the domain name is available.
                domain_name = temps[1].xpath("a/text()")[0].strip()
                logger.info("%s 未备案", domain_name)
                domain = conn.get("select * from source_domain where domain=%s limit 1", domain_name)
                if domain is None:
                    conn.insert("insert source_domain(sourceCompanyId,domain,createTime,modifyTime) \
                                values(%s,%s,now(),now())",
                            source_company_id,domain_name)
                continue

            if len(temps) < 8:
                continue

            idx = temps[0].xpath("text()")[0].strip()
            domain_name = temps[1].xpath("a/text()")[0].strip()

            # Rows with six or more <del> cells are stored as expired.
            dels = dm.xpath("td/del")
            expire = 'Y' if len(dels) >= 6 else 'N'

            if expire == 'N':
                # The organizer text sits either inside a <font> or directly
                # inside the link.
                organizer_texts = (temps[2].xpath("a/font/text()")
                                   or temps[2].xpath("a/text()"))
                if not organizer_texts:
                    # Without this guard the organizer of a previously
                    # processed row would be reused for this domain.
                    logger.info("%s: organizer not found, row skipped", domain_name)
                    continue
                organizer_name = organizer_texts[0].strip()
                organizer_type = temps[3].xpath("text()")[0].strip()
                beianhao = temps[4].xpath("a/text()")[0].strip()
                if beianhao == "":
                    # The number is split between a <font> part and the text
                    # that follows it.
                    beianhao = temps[4].xpath("a/font/text()")[0].strip() + temps[4].xpath("a/text()")[1].strip()
                website_name = temps[5].xpath("a/text()")[0].strip()
                website_homepage = temps[6].xpath("text()")[0].strip()
                review_date = temps[7].xpath("text()")[0].strip()
            else:
                organizer_name = dels[0].xpath("a/text()")[0].strip()
                organizer_type = dels[1].xpath("text()")[0].strip()
                beianhao = dels[2].xpath("a/text()")[0].strip()

                website_name = dels[3].xpath("a/text()")[0].strip()
                website_homepage = dels[4].xpath("text()")[0].strip()
                review_date = dels[5].xpath("text()")[0].strip()
            main_beianhao = get_main_beianhao(beianhao)
            organizer_name = util.norm_company_name(organizer_name)
            logger.info("%s, %s, %s, %s, %s, %s, %s, %s" %
                        (idx, domain_name, organizer_name, organizer_type, beianhao,website_name,website_homepage,review_date))

            domain = conn.get("select * from source_domain where domain=%s and organizer=%s limit 1", domain_name, organizer_name)
            if domain is None:
                conn.insert("insert source_domain(sourceCompanyId,domain,organizer,organizerType,\
                                        beianhao,mainBeianhao,websiteName,homepage,beianDate,expire,\
                                        createTime,modifyTime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())",
                            source_company_id,domain_name,organizer_name,organizer_type,
                            beianhao,main_beianhao,website_name,website_homepage,review_date,expire
                            )
        except Exception as ex:
            # Best effort per row: log the failure and move on to the next.
            logger.exception(ex)
示例#6
0
def parseCompany(source, company_key):
    logger.info("*****************************************")
    logger.info("parseComany, company_key=%s" % company_key)
    try:
        item = fromdb.company.find_one({"source":source, "company_key":company_key})
        if item is None:
            return

        html = item["content"]
        #doc = lxml.html.fromstring(html)
        d = pq(html)

        company_short_name = ""
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
        temps = product_name.split("/",1)
        if len(temps) == 2:
            product_name = temps[0].strip()
            company_short_name = temps[1].strip()
        if company_short_name == "":
            company_short_name = product_name
        logger.info("product name: " + product_name)
        logger.info("company short name: " + company_short_name)

        company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:","")
        if company_name == "暂无" or company_name == "暂未收录":
            company_name = ""
        company_name = util.norm_company_name(company_name)
        logger.info("company name: " + company_name)

        website = d('div.link-line> a.weblink').attr("href").strip()
        if website=="http://%e6%9a%82%e6%97%a0":
            website = ""
        logger.info("website: " + website)

        if company_short_name == "" and company_name == "" and website == "":
            return

        establish_date = None
        str = d('div.des-more> div').eq(1).text().strip().replace("成立时间:","")
        result = util.re_get_result('(\d*?).(\d*?)$',str)
        if result != None:
            (year, month) = result
            establish_date = datetime.datetime.strptime("%s-%s-1" % (year,month), '%Y-%m-%d')
        logger.info("establish date: %s", establish_date)

        locationId=0
        str = d('span.loca').text().strip()
        #logger.info(str)
        result = util.re_get_result(u'(.*?)·(.*?)$',str)
        if result != None:
            (province, city) = result
            province = province.strip()
            city = city.strip()
            logger.info("location: %s-%s" % (province, city))

            locationId = 0
            result = conn.get("select * from location where locationName=%s", city)
            if result != None:
                locationId = result["locationId"]
            else:
                result = conn.get("select * from location where locationName=%s", province)
                if result != None:
                    locationId = result["locationId"]

        logger.info("locationId: %d" % locationId)

        company_status = 2010
        str = d('div.des-more> div').eq(2).text().strip()
        if str == "已关闭":
            company_status = 2020
        logger.info("company_status: %d" % company_status)

        funding_type = 0
        str = d("span.tag.bg-c").text().strip()
        logger.info(str)
        if str == "融资需求 · 需要融资":
            funding_type = 8020
        elif str == "融资需求 · 寻求收购":
            funding_type = 8020
        logger.info("funding_type=%d" % funding_type)

        field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
        logger.info("field: " + field)

        sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
        logger.info("sub field: " + sub_field)

        tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ",",")
        logger.info(tags)

        desc = d("div.des").text().strip()
        logger.info("desc: " + desc)

        #logo
        logo_id = None
        source_company = conn.get("select * from source_company where source=%s and sourceId=%s", source, company_key)
        if source_company == None or source_company["logo"] == None or source_company["logo"] == "":
            log_url = d("div.pic >img").attr("src")
            if log_url is not None and len(log_url.strip()) > 0:
                logger.info(log_url)
                image_value = my_request.get_image(logger,log_url)
                if image_value != None:
                    logo_id = imgfs.put(image_value, content_type='jpeg', filename='company_%s_%s.jpg' % (source, company_key))
                    pass
        else:
            logo_id = source_company["logo"]
        logger.info("gridfs logo_id=%s" % logo_id)

        if source_company == None:
            source_company_id = conn.insert("insert source_company(name,fullName,description,brief,\
                        round,roundDesc,companyStatus,fundingType,locationId,establishDate,logo,\
                        source,sourceId,createTime,modifyTime,\
                        field,subField,tags) \
                        values(%s,%s,%s,%s,\
                        %s,%s,%s,%s,%s,%s,%s,\
                        %s,%s,now(),now(),\
                        %s,%s,%s)",
                        product_name, company_name, desc, '',
                        0,'',company_status,funding_type,locationId,establish_date,logo_id,
                        SOURCE,company_key,
                        field,sub_field,",".join(tags)
                        )
        else:
            source_company_id = source_company["id"]
            conn.update("update source_company set \
                        name=%s,fullName=%s,description=%s, \
                        companyStatus=%s,fundingType=%s,locationId=%s,establishDate=%s,logo=%s, \
                        field=%s,subField=%s,\
                        modifyTime=now() \
                        where id=%s",
                        product_name, company_name, desc,
                        company_status,funding_type,locationId,establish_date,logo_id,
                        field,sub_field,
                        source_company_id
                        )

        #artifact
        logger.info("*** artifact ***")
        lis = d('ul.list-prod> li> a')
        for li in lis:
            l = pq(li)
            type = l('h4> span').text().strip()
            if type == "网站":
                link = l.attr("href").strip()
                name = l('h4> b').text().strip()
                desc = l('p').text().strip()
                logger.info("name: %s, link: %s, desc: %s" % (name,link,desc))
                if link == "":
                    continue
                link = util.norm_url(link)
                source_artifact = conn.get("select * from source_artifact where sourceCompanyId=%s and type=4010 and link=%s",
                        source_company_id, link)
                if source_artifact is None:
                    sql = "insert source_artifact(sourceCompanyId,`name`,`description`,`link`,`type`,createTime,modifyTime) \
                          values(%s,%s,%s,%s,4010,now(),now())"
                    conn.insert(sql, source_company_id,name,desc,link)

        if website != "":
            source_artifact = conn.get("select * from source_artifact where sourceCompanyId=%s and type=4010 and link=%s",
                            source_company_id, website)
            if source_artifact is None:
                sql = "insert source_artifact(sourceCompanyId,name,description,link,type,createTime,modifyTime) \
                      values(%s,%s,%s,%s,4010,now(),now())"
                logger.info("name: %s, link: %s, desc: %s" % (product_name,website,desc))
                conn.insert(sql,source_company_id,product_name,desc,website)

        #footprint
        logger.info("*** footprint ***")
        lis = d('ul.list-milestone> li')
        for li in lis:
            l = pq(li)
            footDesc = l('p').eq(0).text().strip()
            if footDesc is None or footDesc == "":
                continue
            footDateText = l('p> span').text().strip()
            if footDateText is None or footDateText == "":
                continue
            result = util.re_get_result('(\d*?)\.(\d*?)$',footDateText)
            if result == None:
                continue
            (year, month) = result
            year = int(year)
            try:
                month = int(month)
            except:
                month = 1

            if month<=0 or month>12:
                month = 1
            if year < 1970 or year > 3000:
                year = 1970
            footDate = datetime.datetime.strptime("%s-%s-1" % (year,month), '%Y-%m-%d')
            logger.info(footDate)
            logger.info(footDesc)

            fp = conn.get("select * from source_footprint where sourceCompanyId=%s and footDate=%s and description=%s",
                              source_company_id, footDate, footDesc)
            if fp == None:
                conn.insert("insert source_footprint(sourceCompanyId,footDate,description,createTime,modifyTime) \
                            values(%s,%s,%s,now(),now())",
                            source_company_id, footDate, footDesc)

        # funding
        logger.info("*** funding ***")
        lis = d('table.list-round-v2> tr')
        for li in lis:
            l = pq(li)
            dateStr = l('td> span.date').text().strip()
            result = util.re_get_result('(\d*?)\.(\d*?)\.(\d*?)$',dateStr)
            fundingDate = None
            if result != None:
                (year, month, day) = result
                fundingDate = datetime.datetime.strptime("%s-%s-%s" % (year,month,day), '%Y-%m-%d')
            logger.info(fundingDate)

            roundStr = l('td.mobile-none> span.round> a').text().strip().replace("轮","")
            logger.info(roundStr)
            fundingRound = 0
            if roundStr.startswith("种子"):
                fundingRound = 1010
                roundStr = "天使"
            elif roundStr.startswith("天使"):
                fundingRound = 1010
            elif roundStr.startswith("Pre-A"):
                fundingRound = 1020
            elif roundStr.startswith("A"):
                fundingRound = 1030
            elif roundStr.startswith("B"):
                fundingRound = 1040
            elif roundStr.startswith("Pre-B"):
                fundingRound = 1040
            elif roundStr.startswith("C"):
                fundingRound = 1050
            elif roundStr.startswith("D"):
                fundingRound = 1060
            elif roundStr.startswith("E"):
                fundingRound = 1070
            elif roundStr.startswith("F"):
                fundingRound = 1100
            elif roundStr.startswith("IPO"):
                fundingRound = 1110
            elif roundStr.startswith("收购"):
                fundingRound = 1120
            logger.info("fundingRound=%d" % fundingRound)

            moneyStr = l('td> span.finades> a').text().strip()
            (currency, investment, precise) = parseMoney(moneyStr)
            logger.info("%s - %s - %s" % (currency, investment, precise))

            source_funding = conn.get("select * from source_funding where sourceCompanyId=%s and roundDesc=%s",
                                          source_company_id, roundStr)
            if source_funding == None:
                source_funding_id = conn.insert("insert source_funding(sourceCompanyId,investment,round,roundDesc, currency, precise, fundingDate,createTime,modifyTime) \
                                                values(%s,%s,%s,%s,%s,%s,%s,now(),now())",
                                                source_company_id, investment, fundingRound, roundStr,
                                                currency, precise,fundingDate)
            else:
                source_funding_id = source_funding["id"]
                conn.update("update source_funding set investment=%s,currency=%s, precise=%s, fundingDate=%s, modifyTime=now() \
                            where id=%s",
                            investment, currency, precise, fundingDate, source_funding_id
                                )

            hs = l('td:eq(3)> a')
            for h in hs:
                h = pq(h)
                investor_name = h.text().strip()
                investor_url = h.attr("href").strip()
                (investor_key,) = util.re_get_result(r"http://www.itjuzi.com/investfirm/(\d*)$", investor_url)
                logger.info(investor_name)
                logger.info(investor_url)
                logger.info(investor_key)

                item = fromdb.investor.find_one({"source":source, "investor_key":investor_key})
                inv = parseInvestor(item)

                if inv is not None:
                    (name, logo, website, stage, field, desc) = inv
                    source_investor = conn.get("select * from source_investor where source=%s and sourceId=%s",
                                               source, investor_key)
                    logo_id = None
                    if source_investor == None or source_investor["logo"] == None or source_investor["logo"] == "":
                        if logo is not None and logo != "":
                            image_value = my_request.get_image(logger,logo)
                            logo_id = imgfs.put(image_value, content_type='jpeg', filename='investor_%s_%s.jpg' % (source, investor_key))
                            logger.info("gridfs logo_id=%s" % logo_id)
                    else:
                        logo_id = source_investor["logo"]

                    if source_investor is None:
                        sql = "insert source_investor(name,website,description,logo,stage,field,type, \
                        source,sourceId,createTime,modifyTime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())"
                        source_investor_id = conn.insert(sql,
                            name,website,desc,logo_id,stage,field,10020,source,investor_key)
                    else:
                        source_investor_id = source_investor["id"]
                        sql = "update source_investor set name=%s,website=%s,description=%s,logo=%s,stage=%s,\
                        field=%s,type=%s,modifyTime=now() where id=%s"
                        conn.update(sql,
                            name,website,desc,logo_id,stage,field,10020, source_investor_id)

                    source_funding_investor_rel = conn.get("select * from source_funding_investor_rel where \
                            sourceFundingId=%s and sourceInvestorId=%s",
                            source_funding_id, source_investor_id)
                    if source_funding_investor_rel is None:
                        conn.insert("insert source_funding_investor_rel(sourceFundingId, sourceInvestorId, \
                                    createTime,modifyTime) \
                                    values(%s,%s, now(),now())", source_funding_id, source_investor_id)

        # members
        logger.info("*** member ****")
        lis = d('ul.list-prodcase> li')
        for li in lis:
            l = pq(li)
            member_name = l('h4> a> b> span.c').text().strip()
            position = l('h4> a> b> span.c-gray').text().strip()
            str = l('h4> a').attr("href").strip()
            (member_key,) = util.re_get_result(r'person/(\d*?)$',str)
            logger.info("member_key: %s, member_name: %s, position: %s" % (member_key, member_name, position))

            item = fromdb.member.find_one({"source":source, "member_key":member_key})
            m = parseMember(item)

            if m is not None:
                (weibo, introduction, education, work, location, role, pictureUrl) = m

                source_member = conn.get("select * from source_member where source=%s and sourceId=%s",
                                                   source, member_key)
                logo_id = None
                if source_member == None or source_member["photo"] == None or source_member["photo"] == "":
                    if pictureUrl is not None and pictureUrl != "":
                        image_value = my_request.get_image(logger,pictureUrl)
                        logo_id = imgfs.put(image_value, content_type='jpeg', filename='member_%s_%s.jpg' % (source, member_key))
                        logger.info("gridfs logo_id=%s" % logo_id)
                else:
                    logo_id = source_member["photo"]

                if source_member is None:
                    sql = "insert source_member(name,photo,weibo,location,role,description,\
                    education,work,source,sourceId,createTime,modifyTime) \
                    values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())"
                    source_member_id = conn.insert(sql,
                        member_name,logo_id,weibo,location,role,introduction,
                        education,work,source,member_key)
                else:
                    source_member_id = source_member["id"]
                    sql = "update source_member set name=%s,photo=%s,weibo=%s,location=%s,role=%s,description=%s,\
                    education=%s,work=%s,modifyTime=now() where id=%s"
                    conn.update(sql,
                        member_name,logo_id,weibo,location,role,introduction,
                        education,work,source_member_id)

                source_company_member_rel = conn.get("select * from source_company_member_rel where \
                        sourceCompanyId=%s and sourceMemberId=%s",
                        source_company_id, source_member_id)
                if source_company_member_rel is None:
                    conn.insert("insert source_company_member_rel(sourceCompanyId, sourceMemberId, \
                                position,type,createTime,modifyTime) \
                                values(%s,%s,%s,%s, now(),now())",
                                source_company_id, source_member_id,position,0)

        #news
        logger.info("*** news ***")
        lis = d('ul.list-news> li')
        for li in lis:
            try:
                l = pq(li)
                news_url = l('p.title> a').attr("href").strip()
                (news_key,) = util.re_get_result(r"http://www.itjuzi.com/overview/news/(\d*)$", news_url)

                item = fromdb.news.find_one({"source":source, "company_key":company_key, "news_key":news_key})
                parseNews(item)
            except Exception,ex:
                logger.exception(ex)

        msg = {"type":"company", "id":source_company_id}
        kafkaProducer.send_messages("parser_v2", json.dumps(msg))