def add_company_alias(company_id, full_name):
    """Store ``full_name`` as an alias of ``company_id`` if it is not stored yet.

    The name is normalised with ``util.norm_company_name`` before the lookup.
    A row is inserted into ``company_alias`` with type 12010 and active 'Y'
    only when no row exists for the same company id and normalised name.
    Nothing happens (and None is returned) when ``full_name`` is None or "".
    """
    if full_name is not None and full_name != "":
        normalized = util.norm_company_name(full_name)
        existing = conn.get(
            "select * from company_alias where companyId=%s and name=%s",
            company_id, normalized)
        if existing is None:
            insert_sql = "insert company_alias(companyId,name,type,active,createTime) \
                values(%s,%s,%s,%s,now())"
            conn.insert(insert_sql, company_id, normalized, 12010, 'Y')
def find_company_by_full_name(full_name):
    """Look up a company id by its (normalised) full name.

    The ``company`` table is searched first on ``fullName``; when nothing
    matches, ``company_alias`` rows of type 12010 are searched on ``name``.

    Returns the company id, or None when ``full_name`` is None/"" or when
    neither table has a match.
    """
    if full_name is None or full_name == "":
        return None

    normalized = util.norm_company_name(full_name)

    company_row = conn.get("select * from company where fullName=%s",
                           normalized)
    if company_row is not None:
        return company_row["id"]

    alias_row = conn.get(
        "select * from company_alias where type=12010 and name=%s",
        normalized)
    return None if alias_row is None else alias_row["companyId"]
def parse_company(company_key):
    """Convert one crawled company document into source_* records.

    Loads the document stored in ``fromdb.company`` for the module-level
    ``source`` and the given ``company_key``, builds the source-company
    record, stores it together with its link artifacts, delegates funding
    and member parsing to ``parseFinance`` / ``parseMember`` and finally
    publishes a ``parser_v2`` message carrying the new source-company id.

    Returns None in every case.  Nothing is written when the document is
    missing or when the company status is still ``"INIT"``.
    """
    item = fromdb.company.find_one({
        "source": source,
        "company_key": company_key
    })
    if item is None:
        return

    # company basic info
    base = item["company_base"]["data"]
    c = base["company"]
    if c["status"] == "INIT":
        return

    tags_str = ",".join([tag["name"] for tag in base["tags"]])

    logo_id = None
    logo_url = c["logo"]
    if logo_url != '':
        logo_id = parser_util.get_logo_id(source, company_key, 'company',
                                          logo_url)

    establish_date = None
    if "startDate" in c:
        # The value is divided by 1000 before being handed to time.localtime
        # (presumably epoch milliseconds -- TODO confirm against the crawler).
        d = time.localtime(c["startDate"] / 1000)
        establish_date = datetime.datetime(d.tm_year, d.tm_mon, d.tm_mday)

    # address2 is tried first; address1 is only a fallback while no
    # location has been resolved.
    location_id = 0
    for address in (c.get("address2"), c.get("address1")):
        if location_id != 0 or address is None:
            continue
        city = kr36_cities.get(str(address), None)
        if city is not None:
            location_id = parser_util.get_location_id(formCityName(city))

    # Drop underscores and cut everything after the last u"公司" before
    # normalising the full name.
    fullName = c["fullName"].replace("_", "")
    idx = fullName.rfind(u"公司")
    if idx != -1:
        fullName = fullName[:(idx + len(u"公司"))]
    fullName = util.norm_company_name(fullName)

    def stripped(key, default=None):
        # Stripped value of c[key], or `default` when the key is absent.
        return c[key].strip() if key in c else default

    source_company = {
        "name": c["name"],
        "fullName": fullName,
        "description": stripped("intro", ""),
        "productDesc": stripped("projectAdvantage"),
        "modelDesc": stripped("projectPlan"),
        "operationDesc": stripped("dataLights"),
        "teamDesc": stripped("story"),
        "marketDesc": None,
        "compititorDesc": stripped("competitor"),
        "advantageDesc": None,
        "planDesc": None,
        "brief": c["brief"],
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo_id,
        "source": source,
        "sourceId": company_key,
        "field": c.get("industry"),
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None
    }
    source_company_id = parser_util.insert_source_company(source_company)

    # artifact: (key in the company dict, artifact type code), in the order
    # the records are inserted.
    artifact_links = (
        ("website", 4010),
        ("weibo", 4030),
        ("weixin", 4020),
        ("iphoneAppstoreLink", 4040),
        ("ipadAppstoreLink", 4040),
    )
    for key, artifact_type in artifact_links:
        # `or ""` also covers a key that is present but holds None, which
        # would otherwise fail on .strip().
        link = (c.get(key) or "").strip()
        if link == "":
            continue
        parser_util.insert_source_artifact({
            "sourceCompanyId": source_company_id,
            "name": c["name"],
            "description": None,
            "link": link,
            "type": artifact_type
        })

    # funding / past_finance
    parseFinance(source_company_id, item["past_finance"]["data"]["data"])

    # members
    parseMember(source_company_id, 5010, item["founders"]["data"]["data"])
    parseMember(source_company_id, 5030, item["employees"]["data"]["data"])
    parseMember(source_company_id, 5040,
                item["former_members"]["data"]["data"])

    msg = {"type": "company", "id": source_company_id}
    kafka_producer.send_messages("parser_v2", json.dumps(msg))
def parse_base(item):
    """Extract the basic company fields from one crawled company page.

    ``item`` is a mapping with at least ``"key"`` (the source company key)
    and ``"content"`` (the page HTML).  The HTML is queried with pyquery
    selectors; the location is resolved against the ``location`` table
    through a short-lived connection from ``db.connect_torndb()``.

    Returns a dict with the keys ``shortName``, ``fullName``,
    ``productName``, ``description``, ``brief``, ``round``, ``roundDesc``,
    ``companyStatus``, ``fundingType``, ``locationId``, ``establishDate``,
    ``logo``, ``sourceId``, ``field``, ``subField``, ``tags`` and
    ``artifacts``.  Returns None when ``item`` is None or when neither a
    short name nor a full name could be extracted.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    d = pq(html)

    # The title reads "<product> / <company short name>"; without a slash the
    # product name doubles as the short name.
    company_short_name = ""
    product_name = d('div.line-title> span> b').clone().children().remove(
    ).end().text().strip()
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    company_name = d('div.des-more> div').eq(0).text().strip().replace(
        "公司全称:", "")
    # Placeholder texts used by the page when no full name is known.
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""
    company_name = util.norm_company_name(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return None

    establish_date = None
    text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    result = util.re_get_result(r'(\d*?).(\d*?)$', text)
    if result is not None:
        (year, month) = result
        # Anything that is not a month number in 1..12 falls back to January.
        try:
            if not 1 <= int(month) <= 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        try:
            establish_date = datetime.datetime.strptime(
                "%s-%s-1" % (year, month), '%Y-%m-%d')
        except ValueError:
            # e.g. an empty year: keep the date unknown instead of aborting
            # the whole page.
            logger.warning("unparsable establish date: %s", text)
    logger.info("establish date: %s" % establish_date)

    locationId = 0
    text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))
        conn = db.connect_torndb()
        try:
            # The city is looked up first, the province only as a fallback.
            row = conn.get("select * from location where locationName=%s",
                           city)
            if row is None:
                row = conn.get(
                    "select * from location where locationName=%s",
                    province)
            if row is not None:
                locationId = row["locationId"]
        finally:
            # Release the connection even when a query raises.
            conn.close()
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    text = d('div.des-more> div').eq(2).text().strip()
    if text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % text)
    # NOTE(review): both labels map to the same code 8020 -- confirm that
    # this is intended.
    if text == "融资需求 · 需要融资":
        funding_type = 8020
    elif text == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(
        " ", ",")
    logger.info("tags: %s" % tags)

    desc = d("div.des").text().strip()
    logger.info("desc: %s" % desc)

    # logo
    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # attr() may return None when no link element matches; treat that like
    # an empty link instead of failing on .strip().
    website = (d('div.link-line> a').attr("href") or "").strip()
    # URL-encoded placeholder the page uses for "no website".
    if website == "http://%e6%9a%82%e6%97%a0":
        website = ""
    website = util.norm_url(website)
    logger.info("website: %s" % website)

    artifacts = [{
        "type": 4010,
        "name": product_name,
        "desc": desc,
        "link": website
    }]

    # funding status
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(
        ")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)
    logger.info("")

    # NOTE(review): the funding round computed above is only logged; the
    # result still carries round 0 and an empty roundDesc -- confirm.
    return {
        "shortName": company_short_name,
        "fullName": company_name,
        "productName": product_name,
        "description": desc,
        "brief": "",
        "round": 0,
        "roundDesc": "",
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "artifacts": artifacts
    }
def parse_query(source_company_id,html):
    """Store the domains listed in a query-result page as source_domain rows.

    Every ``<tr bgcolor='#FFFFFF'>`` of ``html`` is treated as one domain:

    * rows with exactly 3 cells are logged as unregistered and stored with
      only the domain name (unless a row for that domain already exists);
    * rows with fewer than 8 cells are otherwise ignored;
    * remaining rows are stored with organizer, registration number, website
      name, homepage and review date.  When the row has at least 6 ``<del>``
      elements the values are read from those and ``expire`` is 'Y'.

    A row is inserted only when no source_domain row exists for the same
    domain and organizer.  Errors while handling a row are logged and do not
    stop the remaining rows.  Returns None.
    """
    doc = lxml.html.fromstring(html)
    dms = doc.xpath("//tr[@bgcolor='#FFFFFF']")
    for dm in dms:
        try:
            temps = dm.xpath("td")
            if len(temps) == 3:
                # domain without a registration record
                idx = temps[0].xpath("text()")[0].strip()
                domain_name = temps[1].xpath("a/text()")[0].strip()
                logger.info("%s 未备案", domain_name)
                domain = conn.get("select * from source_domain where domain=%s limit 1", domain_name)
                if domain is None:
                    conn.insert("insert source_domain(sourceCompanyId,domain,createTime,modifyTime) \
                        values(%s,%s,now(),now())",
                        source_company_id,domain_name)
                continue

            if len(temps) < 8:
                continue

            idx = temps[0].xpath("text()")[0].strip()
            domain_name = temps[1].xpath("a/text()")[0].strip()

            dels = dm.xpath("td/del")
            expire = 'Y' if len(dels) >= 6 else 'N'

            # Reset for every row so that a missing organizer can never pick
            # up the value parsed for the previous row.
            organizer_name = None
            if expire == 'N':
                temp = temps[2].xpath("a/font/text()")
                if len(temp) == 0:
                    temp = temps[2].xpath("a/text()")
                if len(temp) > 0:
                    organizer_name = temp[0].strip()
                if organizer_name is None:
                    logger.warning("%s: organizer name not found, row skipped", domain_name)
                    continue
                organizer_type = temps[3].xpath("text()")[0].strip()
                beianhao = temps[4].xpath("a/text()")[0].strip()
                if beianhao == "":
                    # The number is split between a <font> child and the
                    # second text node of the link.
                    beianhao = temps[4].xpath("a/font/text()")[0].strip() + temps[4].xpath("a/text()")[1].strip()
                website_name = temps[5].xpath("a/text()")[0].strip()
                website_homepage = temps[6].xpath("text()")[0].strip()
                review_date = temps[7].xpath("text()")[0].strip()
            else:
                organizer_name = dels[0].xpath("a/text()")[0].strip()
                organizer_type = dels[1].xpath("text()")[0].strip()
                beianhao = dels[2].xpath("a/text()")[0].strip()
                website_name = dels[3].xpath("a/text()")[0].strip()
                website_homepage = dels[4].xpath("text()")[0].strip()
                review_date = dels[5].xpath("text()")[0].strip()

            main_beianhao = get_main_beianhao(beianhao)
            organizer_name = util.norm_company_name(organizer_name)
            logger.info("%s, %s, %s, %s, %s, %s, %s, %s" %
                        (idx, domain_name, organizer_name, organizer_type,
                         beianhao,website_name,website_homepage,review_date))

            domain = conn.get("select * from source_domain where domain=%s and organizer=%s limit 1",
                              domain_name, organizer_name)
            if domain is None:
                conn.insert("insert source_domain(sourceCompanyId,domain,organizer,organizerType,\
                    beianhao,mainBeianhao,websiteName,homepage,beianDate,expire,\
                    createTime,modifyTime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())",
                    source_company_id,domain_name,organizer_name,organizer_type,
                    beianhao,main_beianhao,website_name,website_homepage,review_date,expire
                    )
        except Exception as ex:
            # Best effort: one malformed row must not abort the others.
            logger.exception(ex)
def parseCompany(source, company_key): logger.info("*****************************************") logger.info("parseComany, company_key=%s" % company_key) try: item = fromdb.company.find_one({"source":source, "company_key":company_key}) if item is None: return html = item["content"] #doc = lxml.html.fromstring(html) d = pq(html) company_short_name = "" product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip() temps = product_name.split("/",1) if len(temps) == 2: product_name = temps[0].strip() company_short_name = temps[1].strip() if company_short_name == "": company_short_name = product_name logger.info("product name: " + product_name) logger.info("company short name: " + company_short_name) company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:","") if company_name == "暂无" or company_name == "暂未收录": company_name = "" company_name = util.norm_company_name(company_name) logger.info("company name: " + company_name) website = d('div.link-line> a.weblink').attr("href").strip() if website=="http://%e6%9a%82%e6%97%a0": website = "" logger.info("website: " + website) if company_short_name == "" and company_name == "" and website == "": return establish_date = None str = d('div.des-more> div').eq(1).text().strip().replace("成立时间:","") result = util.re_get_result('(\d*?).(\d*?)$',str) if result != None: (year, month) = result establish_date = datetime.datetime.strptime("%s-%s-1" % (year,month), '%Y-%m-%d') logger.info("establish date: %s", establish_date) locationId=0 str = d('span.loca').text().strip() #logger.info(str) result = util.re_get_result(u'(.*?)·(.*?)$',str) if result != None: (province, city) = result province = province.strip() city = city.strip() logger.info("location: %s-%s" % (province, city)) locationId = 0 result = conn.get("select * from location where locationName=%s", city) if result != None: locationId = result["locationId"] else: result = conn.get("select * from location where locationName=%s", 
province) if result != None: locationId = result["locationId"] logger.info("locationId: %d" % locationId) company_status = 2010 str = d('div.des-more> div').eq(2).text().strip() if str == "已关闭": company_status = 2020 logger.info("company_status: %d" % company_status) funding_type = 0 str = d("span.tag.bg-c").text().strip() logger.info(str) if str == "融资需求 · 需要融资": funding_type = 8020 elif str == "融资需求 · 寻求收购": funding_type = 8020 logger.info("funding_type=%d" % funding_type) field = d("span.scope.c-gray-aset> a").eq(0).text().strip() logger.info("field: " + field) sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip() logger.info("sub field: " + sub_field) tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ",",") logger.info(tags) desc = d("div.des").text().strip() logger.info("desc: " + desc) #logo logo_id = None source_company = conn.get("select * from source_company where source=%s and sourceId=%s", source, company_key) if source_company == None or source_company["logo"] == None or source_company["logo"] == "": log_url = d("div.pic >img").attr("src") if log_url is not None and len(log_url.strip()) > 0: logger.info(log_url) image_value = my_request.get_image(logger,log_url) if image_value != None: logo_id = imgfs.put(image_value, content_type='jpeg', filename='company_%s_%s.jpg' % (source, company_key)) pass else: logo_id = source_company["logo"] logger.info("gridfs logo_id=%s" % logo_id) if source_company == None: source_company_id = conn.insert("insert source_company(name,fullName,description,brief,\ round,roundDesc,companyStatus,fundingType,locationId,establishDate,logo,\ source,sourceId,createTime,modifyTime,\ field,subField,tags) \ values(%s,%s,%s,%s,\ %s,%s,%s,%s,%s,%s,%s,\ %s,%s,now(),now(),\ %s,%s,%s)", product_name, company_name, desc, '', 0,'',company_status,funding_type,locationId,establish_date,logo_id, SOURCE,company_key, field,sub_field,",".join(tags) ) else: source_company_id = source_company["id"] conn.update("update 
source_company set \ name=%s,fullName=%s,description=%s, \ companyStatus=%s,fundingType=%s,locationId=%s,establishDate=%s,logo=%s, \ field=%s,subField=%s,\ modifyTime=now() \ where id=%s", product_name, company_name, desc, company_status,funding_type,locationId,establish_date,logo_id, field,sub_field, source_company_id ) #artifact logger.info("*** artifact ***") lis = d('ul.list-prod> li> a') for li in lis: l = pq(li) type = l('h4> span').text().strip() if type == "网站": link = l.attr("href").strip() name = l('h4> b').text().strip() desc = l('p').text().strip() logger.info("name: %s, link: %s, desc: %s" % (name,link,desc)) if link == "": continue link = util.norm_url(link) source_artifact = conn.get("select * from source_artifact where sourceCompanyId=%s and type=4010 and link=%s", source_company_id, link) if source_artifact is None: sql = "insert source_artifact(sourceCompanyId,`name`,`description`,`link`,`type`,createTime,modifyTime) \ values(%s,%s,%s,%s,4010,now(),now())" conn.insert(sql, source_company_id,name,desc,link) if website != "": source_artifact = conn.get("select * from source_artifact where sourceCompanyId=%s and type=4010 and link=%s", source_company_id, website) if source_artifact is None: sql = "insert source_artifact(sourceCompanyId,name,description,link,type,createTime,modifyTime) \ values(%s,%s,%s,%s,4010,now(),now())" logger.info("name: %s, link: %s, desc: %s" % (product_name,website,desc)) conn.insert(sql,source_company_id,product_name,desc,website) #footprint logger.info("*** footprint ***") lis = d('ul.list-milestone> li') for li in lis: l = pq(li) footDesc = l('p').eq(0).text().strip() if footDesc is None or footDesc == "": continue footDateText = l('p> span').text().strip() if footDateText is None or footDateText == "": continue result = util.re_get_result('(\d*?)\.(\d*?)$',footDateText) if result == None: continue (year, month) = result year = int(year) try: month = int(month) except: month = 1 if month<=0 or month>12: month = 1 if year < 
1970 or year > 3000: year = 1970 footDate = datetime.datetime.strptime("%s-%s-1" % (year,month), '%Y-%m-%d') logger.info(footDate) logger.info(footDesc) fp = conn.get("select * from source_footprint where sourceCompanyId=%s and footDate=%s and description=%s", source_company_id, footDate, footDesc) if fp == None: conn.insert("insert source_footprint(sourceCompanyId,footDate,description,createTime,modifyTime) \ values(%s,%s,%s,now(),now())", source_company_id, footDate, footDesc) # funding logger.info("*** funding ***") lis = d('table.list-round-v2> tr') for li in lis: l = pq(li) dateStr = l('td> span.date').text().strip() result = util.re_get_result('(\d*?)\.(\d*?)\.(\d*?)$',dateStr) fundingDate = None if result != None: (year, month, day) = result fundingDate = datetime.datetime.strptime("%s-%s-%s" % (year,month,day), '%Y-%m-%d') logger.info(fundingDate) roundStr = l('td.mobile-none> span.round> a').text().strip().replace("轮","") logger.info(roundStr) fundingRound = 0 if roundStr.startswith("种子"): fundingRound = 1010 roundStr = "天使" elif roundStr.startswith("天使"): fundingRound = 1010 elif roundStr.startswith("Pre-A"): fundingRound = 1020 elif roundStr.startswith("A"): fundingRound = 1030 elif roundStr.startswith("B"): fundingRound = 1040 elif roundStr.startswith("Pre-B"): fundingRound = 1040 elif roundStr.startswith("C"): fundingRound = 1050 elif roundStr.startswith("D"): fundingRound = 1060 elif roundStr.startswith("E"): fundingRound = 1070 elif roundStr.startswith("F"): fundingRound = 1100 elif roundStr.startswith("IPO"): fundingRound = 1110 elif roundStr.startswith("收购"): fundingRound = 1120 logger.info("fundingRound=%d" % fundingRound) moneyStr = l('td> span.finades> a').text().strip() (currency, investment, precise) = parseMoney(moneyStr) logger.info("%s - %s - %s" % (currency, investment, precise)) source_funding = conn.get("select * from source_funding where sourceCompanyId=%s and roundDesc=%s", source_company_id, roundStr) if source_funding == None: 
source_funding_id = conn.insert("insert source_funding(sourceCompanyId,investment,round,roundDesc, currency, precise, fundingDate,createTime,modifyTime) \ values(%s,%s,%s,%s,%s,%s,%s,now(),now())", source_company_id, investment, fundingRound, roundStr, currency, precise,fundingDate) else: source_funding_id = source_funding["id"] conn.update("update source_funding set investment=%s,currency=%s, precise=%s, fundingDate=%s, modifyTime=now() \ where id=%s", investment, currency, precise, fundingDate, source_funding_id ) hs = l('td:eq(3)> a') for h in hs: h = pq(h) investor_name = h.text().strip() investor_url = h.attr("href").strip() (investor_key,) = util.re_get_result(r"http://www.itjuzi.com/investfirm/(\d*)$", investor_url) logger.info(investor_name) logger.info(investor_url) logger.info(investor_key) item = fromdb.investor.find_one({"source":source, "investor_key":investor_key}) inv = parseInvestor(item) if inv is not None: (name, logo, website, stage, field, desc) = inv source_investor = conn.get("select * from source_investor where source=%s and sourceId=%s", source, investor_key) logo_id = None if source_investor == None or source_investor["logo"] == None or source_investor["logo"] == "": if logo is not None and logo != "": image_value = my_request.get_image(logger,logo) logo_id = imgfs.put(image_value, content_type='jpeg', filename='investor_%s_%s.jpg' % (source, investor_key)) logger.info("gridfs logo_id=%s" % logo_id) else: logo_id = source_investor["logo"] if source_investor is None: sql = "insert source_investor(name,website,description,logo,stage,field,type, \ source,sourceId,createTime,modifyTime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())" source_investor_id = conn.insert(sql, name,website,desc,logo_id,stage,field,10020,source,investor_key) else: source_investor_id = source_investor["id"] sql = "update source_investor set name=%s,website=%s,description=%s,logo=%s,stage=%s,\ field=%s,type=%s,modifyTime=now() where id=%s" conn.update(sql, 
name,website,desc,logo_id,stage,field,10020, source_investor_id) source_funding_investor_rel = conn.get("select * from source_funding_investor_rel where \ sourceFundingId=%s and sourceInvestorId=%s", source_funding_id, source_investor_id) if source_funding_investor_rel is None: conn.insert("insert source_funding_investor_rel(sourceFundingId, sourceInvestorId, \ createTime,modifyTime) \ values(%s,%s, now(),now())", source_funding_id, source_investor_id) # members logger.info("*** member ****") lis = d('ul.list-prodcase> li') for li in lis: l = pq(li) member_name = l('h4> a> b> span.c').text().strip() position = l('h4> a> b> span.c-gray').text().strip() str = l('h4> a').attr("href").strip() (member_key,) = util.re_get_result(r'person/(\d*?)$',str) logger.info("member_key: %s, member_name: %s, position: %s" % (member_key, member_name, position)) item = fromdb.member.find_one({"source":source, "member_key":member_key}) m = parseMember(item) if m is not None: (weibo, introduction, education, work, location, role, pictureUrl) = m source_member = conn.get("select * from source_member where source=%s and sourceId=%s", source, member_key) logo_id = None if source_member == None or source_member["photo"] == None or source_member["photo"] == "": if pictureUrl is not None and pictureUrl != "": image_value = my_request.get_image(logger,pictureUrl) logo_id = imgfs.put(image_value, content_type='jpeg', filename='member_%s_%s.jpg' % (source, member_key)) logger.info("gridfs logo_id=%s" % logo_id) else: logo_id = source_member["photo"] if source_member is None: sql = "insert source_member(name,photo,weibo,location,role,description,\ education,work,source,sourceId,createTime,modifyTime) \ values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())" source_member_id = conn.insert(sql, member_name,logo_id,weibo,location,role,introduction, education,work,source,member_key) else: source_member_id = source_member["id"] sql = "update source_member set 
name=%s,photo=%s,weibo=%s,location=%s,role=%s,description=%s,\ education=%s,work=%s,modifyTime=now() where id=%s" conn.update(sql, member_name,logo_id,weibo,location,role,introduction, education,work,source_member_id) source_company_member_rel = conn.get("select * from source_company_member_rel where \ sourceCompanyId=%s and sourceMemberId=%s", source_company_id, source_member_id) if source_company_member_rel is None: conn.insert("insert source_company_member_rel(sourceCompanyId, sourceMemberId, \ position,type,createTime,modifyTime) \ values(%s,%s,%s,%s, now(),now())", source_company_id, source_member_id,position,0) #news logger.info("*** news ***") lis = d('ul.list-news> li') for li in lis: try: l = pq(li) news_url = l('p.title> a').attr("href").strip() (news_key,) = util.re_get_result(r"http://www.itjuzi.com/overview/news/(\d*)$", news_url) item = fromdb.news.find_one({"source":source, "company_key":company_key, "news_key":news_key}) parseNews(item) except Exception,ex: logger.exception(ex) msg = {"type":"company", "id":source_company_id} kafkaProducer.send_messages("parser_v2", json.dumps(msg))