def get_android_domain(app_market, app_id):
    """Return the domain identifier to record for an Android app link.

    For markets 16010 and 16020 the identifier is the ``apkname`` of the
    record returned by ``parser_db_util.find_android_market``; ``None`` is
    returned when that lookup yields nothing.  For any other market the
    ``app_id`` is returned unchanged.
    """
    if app_market not in (16010, 16020):
        return app_id

    market_record = parser_db_util.find_android_market(app_market, app_id)
    if not market_record:
        return None
    return market_record["apkname"]
def parse_company(item):
    """Build a source-company dict from a crawled company record.

    ``item`` is read as a dict providing:

    * ``item["postdata"]["id"]`` -- used as the ``sourceId``;
    * ``item["data"]["basic"]`` -- the company fields (``tags``, ``icon``,
      ``company``, ``product`` are required; ``open_time``, ``city``,
      ``province``, ``desc``, ``yewu``, ``gw_link`` and ``source_gw_link``
      are optional);
    * ``item["data"]["productinfos"]`` -- optional list of dicts whose
      ``link`` is collected as a candidate website;
    * ``item["url"]`` -- the crawled URL, never emitted as an artifact.

    Returns ``{"status": "No_Name"}`` when the product name or the
    normalized company name is missing.  Otherwise returns the company dict;
    its ``artifacts`` list has one entry per link that
    ``url_helper.get_market`` classifies as 4010, 4040 or 4050 and for which
    a domain could be determined (4010 entries are kept even when the domain
    is ``None``).
    """
    logger.info("parse_company")
    company_key = item["postdata"]["id"]

    # Company basic info.
    c = item["data"]["basic"]

    tags_str = c["tags"].replace("|", ",")

    logo = c["icon"]
    if logo.find("product_default.png") >= 0:
        # Presumably the site's default product image -- treated as "no logo".
        logo = None

    establish_date = None
    if "open_time" in c:
        try:
            establish_date = datetime.datetime.strptime(c["open_time"], "%Y-%m-%d")
        except (ValueError, TypeError):
            # Unparseable or non-string date: leave the date unset.
            pass

    # Resolve the location from the city first; fall back to the province
    # only while no non-zero location id has been found.
    location_id = 0
    for place in (c.get("city"), c.get("province")):
        if location_id != 0:
            break
        if place is None or place.strip() == "":
            continue
        location = parser_db_util.get_location(place)
        if location is not None:
            location_id = location["locationId"]

    fullName = c["company"]
    if fullName is None or fullName.strip() == "":
        fullName = None
    else:
        fullName = fullName.replace("_", "")
        # Cut the name right after the last occurrence of the company suffix.
        suffix = u"公司"
        idx = fullName.rfind(suffix)
        if idx != -1:
            fullName = fullName[:idx + len(suffix)]
        fullName = name_helper.company_name_normalize(fullName)

    name = c["product"]
    desc = c["desc"].strip() if "desc" in c else ""
    brief = c["yewu"].strip() if "yewu" in c else ""

    if name is None or fullName is None:
        return {
            "status": "No_Name",
        }

    # Collect the distinct, non-blank candidate links in source order.
    websites = []
    candidate_links = [c.get("gw_link"), c.get("source_gw_link")]
    if "productinfos" in item["data"]:
        candidate_links.extend(pi.get("link") for pi in item["data"]["productinfos"])
    for link in candidate_links:
        # Missing / None links are skipped instead of failing on .strip().
        if link and link.strip() != "" and link not in websites:
            websites.append(link)

    artifacts = []
    for website in websites:
        artifact_type, app_market, app_id = url_helper.get_market(website)
        if artifact_type == 4010:
            # Ignore the crawled page itself and links back to the source site.
            if item["url"] == website or website.find("qimingpian.com") != -1:
                continue
            flag, domain = url_helper.get_domain(website)
            if flag is None:
                continue
            if flag is False:
                domain = None
        elif artifact_type == 4040:
            domain = app_id
            if domain is None:
                continue
        elif artifact_type == 4050:
            if app_market == 16010 or app_market == 16020:
                android_app = parser_db_util.find_android_market(app_market, app_id)
                domain = android_app["apkname"] if android_app else None
            else:
                domain = app_id
            if domain is None:
                continue
        else:
            # Types 4020 / 4030 never had a domain in the original code, so
            # they (and unrecognised types) produce no artifact.
            continue

        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": brief,
            "link": website,
            "domain": domain
        })

    return {
        "name": name,
        "fullName": fullName,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "otherDesc": None,
        "brief": brief,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "artifacts": artifacts,
    }
def parse_company(item):
    """Parse a crawled company page into a source-company dict.

    ``item`` is read as a dict with ``key`` (used as ``sourceId``),
    ``content`` (the page HTML) and ``url`` (the crawled URL, never emitted
    as a website artifact).

    Returns ``{"status": "No_Name"}`` when the page yields no company name,
    ``{"status": "No_Data"}`` when the number of ``h4`` section headings is
    not exactly one less than the number of content blocks, and otherwise the
    company dict with ``"status": 1``.  ``fundings`` holds one dict per
    funding-history row; ``round`` / ``roundDesc`` describe the row with the
    highest round value.  ``tags`` is always ``None``.
    """
    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    page = pq(html)

    logo = page('.logo-block > img').attr('src')
    if logo == 'http://assets3.chuangyepu.com/chuangyepu/images/big-screenshot.png':
        logo = None

    basic_info = pq(page('div.col-md-9> div> table> tr> td').eq(1))
    name = (basic_info('div.name').text() or "").strip()
    brief = (basic_info('div.desc').eq(0).text() or "").strip()
    # The original tested ``name is None`` after ``.strip()``, which could
    # never be true; an empty name is what "no name" looks like here.
    if not name:
        return {
            "status": "No_Name",
        }

    try:
        website = basic_info('div.desc').eq(1)('a').text().strip()
    except Exception:
        # Best effort: a page without a website link still yields a company.
        website = None

    main_blocks = page('div.col-md-9> div.col-sm-12')
    h4s = page('div.col-md-9> h4')
    logger.info("main: %d, h4: %d", len(main_blocks), len(h4s))

    # Sections: product introduction / team members / media coverage /
    # funding history.  Heading i describes content block i + 1.
    if len(h4s) != len(main_blocks) - 1:
        return {
            "status": "No_Data",
        }

    desc = None
    latest_round = None
    latest_round_desc = None
    source_fundings = []

    for i in range(len(h4s)):
        heading = h4s.eq(i).text().strip()
        section = main_blocks.eq(i + 1)

        if heading == "产品介绍":
            desc = section('div.content> div> p.desc').text().strip()
        elif heading == "融资历史":
            for row in section('table> tr'):
                line = pq(row)
                if line.text().find("时间") >= 0:
                    # Rows containing this column title are skipped
                    # (presumably the table header).
                    continue

                # Dates are given as year/month; pin them to the first day.
                date = line('td.investment_date> span').text().strip() + "/01"
                try:
                    fundingDate = datetime.datetime.strptime(date, '%Y/%m/%d')
                except ValueError:
                    fundingDate = None

                roundStr = line('td.investment-round').text().strip()
                fundingRound, roundStr = chuangyepu_helper.getFundingRound(roundStr)

                moneyStr = line('td.money').text().strip()
                (currency, investment, precise) = chuangyepu_helper.getMoney(moneyStr)

                investors = []
                for anchor in line('td').eq(3)('p> a'):
                    iv = pq(anchor)
                    investor_url = iv.attr("href")
                    investor_name = iv.text().strip()
                    # Only links whose URL contains "institutions" are kept;
                    # the last path segment is used as the investor key.
                    if investor_url and investor_url.find("institutions") >= 0:
                        investors.append({
                            "name": investor_name,
                            "key": investor_url.strip().split("/")[-1]
                        })

                source_fundings.append({
                    "investment": investment,
                    "precise": precise,
                    "round": fundingRound,
                    "roundDesc": roundStr,
                    "currency": currency,
                    "fundingDate": fundingDate,
                    "investors": investors
                })

                if latest_round is None or latest_round < fundingRound:
                    latest_round = fundingRound
                    latest_round_desc = roundStr
        # The team-member and media-coverage sections are intentionally not
        # parsed (the original notes the member info is not accurate).

    if desc is None:
        desc = brief
    if brief is not None:
        # Measure the length in characters; only byte strings need decoding.
        brief_text = brief.decode('utf-8') if isinstance(brief, bytes) else brief
        if len(brief_text) > 200:
            brief = None

    artifacts = []
    artifact_type, app_market, app_id = url_helper.get_market(website)
    domain = None
    keep = False
    if artifact_type == 4010:
        if item["url"] != website:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                keep = True
    elif artifact_type == 4040:
        domain = app_id
        keep = domain is not None
    elif artifact_type == 4050:
        if app_market == 16010 or app_market == 16020:
            android_app = parser_db_util.find_android_market(app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        keep = domain is not None

    if keep:
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": desc,
            "link": website,
            "domain": domain
        })

    source_company = {
        "name": name,
        "fullName": None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": latest_round,
        "roundDesc": latest_round_desc,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": None,
        "address": None,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": None,
        "headCountMin": None,
        "headCountMax": None,
        "artifacts": artifacts,
        "fundings": source_fundings,
        "status": 1
    }

    return source_company
def parse_company(item):
    """Parse a crawled company page into a source-company dict.

    ``item`` is ``None`` (in which case ``None`` is returned) or a dict with
    ``key`` (used as ``sourceId``), ``content`` (the page HTML) and ``url``
    (the crawled URL, never emitted as a website artifact).

    Returns ``{"status": "No_Name"}`` when the page is the "under
    construction" placeholder, when no usable name is found, or when
    ``name_helper.name_check`` does not confirm the full name.  Otherwise
    returns the company dict with ``"status": 1``.

    ``headCountMin`` / ``headCountMax`` are integers or ``None``.  ``round``
    and ``roundDesc`` are always ``None``; the stage text on the page only
    influences ``fundingType``.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)

    # logo_id is processed in parser_db_util, not here.

    page_text = html.decode("utf-8") if isinstance(html, bytes) else html
    if page_text.find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }

    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    link = title_link.attr('href')
    fullName = name_helper.company_name_normalize(title_link.attr('title'))
    if name is None or fullName is None or name.find("拉勾") >= 0:
        return {
            "status": "No_Name",
        }
    # Prefer the full name when the short name is longer than it or blank.
    if len(name) > len(fullName):
        name = fullName
    if name.strip() == "":
        name = fullName

    _chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }

    logo = d('.top_info_wrap > img').attr('src')
    # A page without a logo image yields None; leave it as None instead of
    # failing on the string operations below.
    if logo is not None:
        if not logo.startswith("http"):
            # Scheme-relative URL ("//host/path"): add a scheme.
            logo = "http:" + logo
        if logo.find("logo_default") >= 0:
            logo = None

    brief = d('.company_word').text()
    desc = None
    desc_text = d('.company_intro_text').text() or ""
    if not (u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10):
        desc_html = d('.company_intro_text > .company_content').html()
        if desc_html is not None:
            # Drop the "expand" toggle markup, then reduce the HTML to text.
            desc_html = desc_html.replace('<span class="text_over">展开</span>', '')
            desc = BeautifulSoup(desc_html, "lxml").getText()

    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        field = d('#basic_container > .item_content >ul > li:eq(0) > span').text()
        stage = d('#basic_container > .item_content >ul > li:eq(1) > span').text()
        headCount = d('#basic_container > .item_content >ul > li:eq(2) > span').text()
        # Keep only the part before the unit character.  partition() leaves
        # the text untouched when the unit is absent, so a missing unit no
        # longer aborts the location / address extraction below.
        headCount = headCount.partition(u'人')[0]
        location = d('#basic_container > .item_content >ul > li:eq(3) > span').text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except Exception:
        # Best effort: keep whatever was extracted before the failure.
        pass

    headCount = (headCount or '').replace("people", "")

    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        # "<min>-<max>" or a single number; anything unparseable is None.
        bounds = headCount.split('-')
        try:
            min_staff = int(bounds[0].strip())
        except ValueError:
            min_staff = None
        max_staff = None
        if len(bounds) > 1:
            try:
                max_staff = int(bounds[1].strip())
            except ValueError:
                max_staff = None

    # NOTE(review): the original also mapped the stage text to a round code
    # but never returned it ("round" is hard-coded to None below); only the
    # funding type derived from the stage is used.
    funding_type = 0
    if stage == '不需要融资':
        funding_type = 8010

    location_id = 0
    location_new = parser_db_util.get_location(location)
    if location_new is not None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)

    artifacts = []
    artifact_type, app_market, app_id = url_helper.get_market(website)
    domain = None
    keep = False
    if artifact_type == 4010:
        # Ignore the crawled page itself and links back to lagou.com.
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                keep = True
    elif artifact_type == 4040:
        domain = app_id
        keep = domain is not None
    elif artifact_type == 4050:
        if app_market == 16010 or app_market == 16020:
            android_app = parser_db_util.find_android_market(app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        keep = domain is not None
    # Types 4020 / 4030 never had a domain in the original code, so they
    # produce no artifact.

    if keep:
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": desc,
            "link": website,
            "domain": domain
        })

    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": location_id,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "status": 1
    }

    return source_company
def parse_base(item):
    """Parse the basic company information from a crawled company page.

    ``item`` is ``None`` (in which case ``None`` is returned) or a dict with
    ``key`` (used as ``sourceId``) and ``content`` (the page HTML).

    Returns ``None`` when neither a short name nor a full company name could
    be extracted.  Otherwise returns a dict with the names, description,
    brief, funding round (from ``itjuzi_helper.getFundingRound``), status,
    location id, establish date, logo, field / sub-field, tags and the list
    of ``artifacts`` built from the page's link section.  Each distinct link
    contributes at most one artifact.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    d = pq(html)

    # The title text (without its child elements) is "<product>" or
    # "<product>/<company short name>"; fall back from <h1> to <b>.
    company_short_name = ""
    product_name = d('div.line-title> span> h1').clone().children().remove().end().text().strip()
    if product_name == "":
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    # These two literals are treated as "no company name available".
    company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:", "")
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""

    if company_name.strip() == "":
        try:
            company_name = d('div.des-more> h2').text().strip()
        except Exception:
            # Best effort: keep the empty name when the fallback is missing.
            pass
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""

    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return None

    # Establish date is given as "<year>.<month>"; pin it to the first day.
    establish_date = None
    established_text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    result = util.re_get_result(r'(\d*)\.(\d*)', established_text)
    if result is not None:
        (year, month) = result
        try:
            if int(month) > 12:
                month = "1"
        except (ValueError, TypeError):
            month = "1"
        try:
            establish_date = datetime.datetime.strptime("%s-%s-1" % (year, month), '%Y-%m-%d')
        except ValueError:
            # e.g. an empty year or month 0: leave the date unset rather
            # than abort the whole parse.
            establish_date = None
    logger.info("establish date: %s" % establish_date)

    # Location text is "<province>·<city>"; try the city, then the province.
    locationId = 0
    location_text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', location_text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))

        location = parser_db_util.get_location(city)
        if location is None:
            location = parser_db_util.get_location(province)
        if location is not None:
            locationId = location["locationId"]

    # Last resort: derive the location from the company name.
    if locationId == 0:
        loc1, _loc2 = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            location = parser_db_util.get_location(loc1)
            if location is not None:
                locationId = location["locationId"]
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    status_text = d('div.des-more> div').eq(2).text().strip()
    if status_text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    # NOTE(review): both funding-need labels map to the same code 8020 in the
    # original; kept as is -- confirm whether that is intended.
    funding_type = 0
    funding_text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % funding_text)
    if funding_text == "融资需求 · 需要融资" or funding_text == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    try:
        brief = d("h2.seo-slogan").text().strip()
    except Exception:
        brief = ""
    logger.info("brief: %s" % brief)

    # str.find returns -1 (truthy) when the marker is absent, so the original
    # bare ``if brief.find(...)`` cleared nearly every brief.  Only clear it
    # when the marker is actually present.
    if brief.find("暂未收录") >= 0:
        brief = ""

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ", ",")
    logger.info("tags: %s" % tags)

    desc = d("div.des,div.desc,div.introduction,div.abstract,div.summary").text()
    desc = desc.replace("购买数据请联系", "").replace('*****@*****.**', "")
    desc = desc.replace("itjuzi是一家数据服务公司", "").strip()
    logger.info("********desc: %s" % desc)

    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    artifacts = []

    def add_artifact(artifact_type, description, link, domain):
        artifacts.append({
            "type": artifact_type,
            "name": product_name,
            "desc": description,
            "link": link,
            "domain": domain
        })

    # Links whose href / text equal one of these are treated as empty.
    placeholder_links = ("http://%e6%9a%82%e6%97%a0", "http://tt")
    # The original looped three times but ran the second selector twice,
    # emitting duplicate artifacts; each selector now runs once and every
    # normalized link is processed only once.
    selectors = (
        'div.link-line> a',
        'div.link-line> span.weblink,span.webTink> a',
    )
    seen_websites = set()
    for selector in selectors:
        for wa in d(selector):
            anchor = pq(wa)
            webs = []
            # Both the href and the visible text may carry the URL.
            for read in (lambda a: a.attr("href"), lambda a: a.text()):
                try:
                    website = read(anchor).strip()
                    if website in placeholder_links:
                        website = ""
                    website = url_helper.url_normalize(website)
                    logger.info("website: %s" % website)
                    webs.append(website)
                except Exception:
                    # Best effort: anchors lacking an href / text are skipped.
                    pass

            for website in webs:
                if website in seen_websites:
                    continue
                seen_websites.add(website)

                artifact_type, app_market, app_id = url_helper.get_market(website)
                if artifact_type == 4010:
                    flag, domain = url_helper.get_domain(website)
                    if flag is not None:
                        if flag is False:
                            domain = None
                        add_artifact(4010, desc, website, domain)
                elif artifact_type == 4020:
                    if app_id is not None:
                        # The link itself is stored as the domain for 4020.
                        add_artifact(4020, None, website, website)
                elif artifact_type == 4030:
                    if app_id is not None:
                        add_artifact(4030, None, website, None)
                elif artifact_type == 4040:
                    if app_id is not None:
                        add_artifact(4040, desc, website, app_id)
                elif artifact_type == 4050:
                    domain = None
                    if app_market == 16010 or app_market == 16020:
                        android_app = parser_db_util.find_android_market(app_market, app_id)
                        if android_app:
                            domain = android_app["apkname"]
                    else:
                        domain = app_id
                    if domain is not None:
                        add_artifact(4050, desc, website, domain)

    # Funding status.
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name if company_name is not None and company_name.strip() != "" else None,
        "productName": product_name,
        "description": desc,
        "brief": brief,
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type": 41010,
        "artifacts": artifacts
    }
def parse_artifact(item):
    """Extract the product artifacts listed on a crawled company page.

    ``item`` is ``None`` (in which case ``None`` is returned) or a dict whose
    ``content`` holds the page HTML.

    Only product entries tagged as website or ``app`` are considered.
    Entries without a link are skipped.  Returns a list of dicts with the
    keys ``type``, ``name``, ``desc``, ``link`` and ``domain``:

    * website entries classified as 4010 get a normalized link and the
      domain from ``url_helper.get_domain`` (``None`` when it reports
      ``False``; the entry is dropped when it reports ``None``);
    * every other entry needs a domain (app id / apk name) unless its type
      is 4020 or 4030, otherwise it is dropped.
    """
    if item is None:
        return None

    artifacts = []
    d = pq(item["content"])

    logger.info("*** artifact ***")
    for li in d('ul.list-prod> li> div.on-edit-hide'):
        entry = pq(li)
        strtype = entry('h4> span.tag').text().strip()
        if strtype != u"网站" and strtype != "app":
            continue

        # An entry without an <a href> yields None here; skip it instead of
        # failing on .strip().
        link = entry('h4> b> a').attr("href")
        if link is None:
            continue
        link = link.strip()
        if link == "":
            continue

        domain = None
        # Classify once; the original called get_market a second time on the
        # same, unchanged link.
        artifact_type, app_market, app_id = url_helper.get_market(link)
        if strtype == u"网站" and artifact_type == 4010:
            link = url_helper.url_normalize(link)
            flag, domain = url_helper.get_domain(link)
            if flag is None:
                continue
            if flag is False:
                domain = None
        else:
            if artifact_type == 4040:
                domain = app_id
            elif artifact_type == 4050:
                if app_market == 16010 or app_market == 16020:
                    android_app = parser_db_util.find_android_market(app_market, app_id)
                    if android_app:
                        domain = android_app["apkname"]
                else:
                    domain = app_id

            # Only types 4020 / 4030 are allowed to lack a domain.
            if domain is None and artifact_type != 4030 and artifact_type != 4020:
                continue

        name = entry('h4> b').text().strip()
        desc = entry('p').text().strip()
        logger.info("type: %s, name: %s, link: %s, desc: %s" % (artifact_type, name, link, desc))
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "desc": desc,
            "link": link,
            "domain": domain
        })

    logger.info("")
    return artifacts