def parse_base(item):
    """Build the base company record from an already-structured crawl item.

    ``item`` is read as a mapping with ``"key"`` (returned as ``sourceId``)
    and ``"content"``. ``content`` supplies ``name``, ``desc``, ``score``,
    ``website``, ``url_android`` and ``url_ios``.

    Returns ``None`` when ``item`` is ``None``; otherwise a dict describing
    the company, whose ``"artifacts"`` list holds one entry per usable link.
    """
    if item is None:
        return None

    source_id = item["key"]
    payload = item["content"]
    found = []

    # The website only becomes an artifact when util.get_market assigns it
    # a type.
    homepage = util.norm_url(payload["website"])
    homepage_type, _market = util.get_market(homepage)
    if homepage_type is not None:
        found.append({
            "type": homepage_type,
            "name": payload["name"],
            "desc": payload["desc"],
            "link": homepage
        })

    # Store links carry fixed artifact types: 4050 for url_android and
    # 4040 for url_ios.
    for store_type, url_field in ((4050, "url_android"), (4040, "url_ios")):
        if payload[url_field] is None:
            continue
        found.append({
            "type": store_type,
            "name": payload["name"],
            "desc": payload["desc"],
            "link": util.norm_url(payload[url_field])
        })

    return {
        "shortName": payload["name"],
        "fullName": None,
        "productName": payload["name"],
        "description": None,
        "brief": payload["desc"],
        "round": 0,
        "roundDesc": "",
        "companyStatus": 2010,
        "fundingType": 0,
        "locationId": 0,
        "establishDate": None,
        "logo": None,
        "sourceId": source_id,
        "field": None,
        "subField": None,
        "tags": None,
        "type": 41020,
        "score": payload["score"],
        "artifacts": found
    }
def parse_artifact(item):
    """Extract website artifacts from a crawled company page.

    ``item`` is read as a mapping whose ``"content"`` entry holds the page
    HTML. Every ``ul.list-prod> li> a`` anchor labelled u"网站" with a
    non-empty ``href`` yields one artifact of type 4010.

    Returns ``None`` when ``item`` is ``None``; otherwise a (possibly empty)
    list of ``{"type", "name", "desc", "link"}`` dicts.
    """
    if item is None:
        return None

    artifacts = []
    d = pq(item["content"])

    logger.info("*** artifact ***")
    for node in d('ul.list-prod> li> a'):
        entry = pq(node)
        kind = entry('h4> span').text().strip()
        # Only website entries are emitted. "app" entries are recognised on
        # the page but intentionally skipped for now.
        # TODO: map app links (itunes.apple.com "/app/" -> 4040,
        # www.wandoujia.com/apps/ -> 4050).
        if kind != u"网站":
            continue

        # attr() returns None when the anchor has no href; treat that like
        # an empty link instead of crashing on None.strip().
        link = (entry.attr("href") or "").strip()
        if link == "":
            continue

        artifact_type = 4010
        link = util.norm_url(link)

        name = entry('h4> b').text().strip()
        desc = entry('p').text().strip()
        logger.info("type: %s, name: %s, link: %s, desc: %s" % (artifact_type, name, link, desc))

        # NOTE(review): the link is normalised a second time, exactly as the
        # previous implementation did; presumably redundant if
        # util.norm_url is idempotent - confirm before removing.
        link = util.norm_url(link)
        artifacts.append({"type": artifact_type, "name": name, "desc": desc, "link": link})

    logger.info("")
    return artifacts
def parse_base(item):
    """Parse the basic company profile out of a crawled detail page.

    ``item`` is read as a mapping with the page HTML under ``"content"`` and
    the source company key under ``"key"`` (returned as ``sourceId``).

    Returns ``None`` when ``item`` is ``None`` or when neither a short name
    nor a full company name could be extracted. Otherwise returns a dict
    with the company fields and a single website artifact (type 4010).

    A location lookup opens a database connection through
    ``db.connect_torndb()``; the connection is always closed, even when a
    query raises.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    d = pq(item["content"])

    # The title reads "<product> / <company short name>". Child nodes of the
    # <b> element are removed from a clone so only its own text is used.
    company_short_name = ""
    product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    title_parts = product_name.split("/", 1)
    if len(title_parts) == 2:
        product_name = title_parts[0].strip()
        company_short_name = title_parts[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:", "")
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""
    company_name = util.norm_company_name(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return None

    establish_date = None
    date_text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    # NOTE(review): the separator is an unescaped "." (any character), kept
    # as it was; the pattern is only turned into a raw string.
    result = util.re_get_result(r'(\d*?).(\d*?)$', date_text)
    if result is not None:
        (year, month) = result
        # An unparsable or out-of-range month falls back to January.
        try:
            if not 1 <= int(month) <= 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        try:
            establish_date = datetime.datetime.strptime("%s-%s-1" % (year, month), '%Y-%m-%d')
        except ValueError:
            # The year group may be empty or not a 4-digit year; leave the
            # date unset instead of aborting the whole parse.
            logger.info("invalid establish date: %s" % date_text)
    logger.info("establish date: %s" % establish_date)

    locationId = 0
    location_text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', location_text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))

        conn = db.connect_torndb()
        try:
            # Prefer the city; fall back to the province when the city is
            # not a known location.
            row = conn.get("select * from location where locationName=%s", city)
            if row is None:
                row = conn.get("select * from location where locationName=%s", province)
        finally:
            conn.close()
        if row is not None:
            locationId = row["locationId"]
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    status_text = d('div.des-more> div').eq(2).text().strip()
    if status_text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    funding_text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % funding_text)
    # NOTE(review): both labels map to 8020, as before - confirm that the
    # second label is not meant to use a different code.
    if funding_text == "融资需求 · 需要融资":
        funding_type = 8020
    elif funding_text == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ", ",")
    logger.info("tags: %s" % tags)

    desc = d("div.des").text().strip()
    logger.info("desc: %s" % desc)

    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # attr() returns None when there is no link element; treat that as an
    # empty website instead of crashing on None.strip().
    website = (d('div.link-line> a').attr("href") or "").strip()
    if website == "http://%e6%9a%82%e6%97%a0":
        website = ""
    website = util.norm_url(website)
    logger.info("website: %s" % website)

    artifacts = [{
        "type": 4010,
        "name": product_name,
        "desc": desc,
        "link": website
    }]

    # Funding status.
    # NOTE(review): the parsed round is only logged; the returned record
    # still carries round=0 / roundDesc="" as before - confirm intent.
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)
    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name,
        "productName": product_name,
        "description": desc,
        "brief": "",
        "round": 0,
        "roundDesc": "",
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "artifacts": artifacts
    }
def _company_page_text(d, selector):
    """Return the text matched by ``selector``, or '' when unavailable."""
    try:
        return d(selector).text() or ''
    except Exception as ex:
        # Extraction stays best-effort per field, but failures are logged
        # instead of being swallowed silently.
        logger.exception(ex)
        return ''


def _company_head_count_bounds(label):
    """Turn a head-count label such as '15-50' into ``(min, max)`` ints.

    Either bound is ``None`` when it is missing or not a number.
    """
    if label == "少于15":
        return 1, 15

    def to_int(text):
        try:
            return int(text.strip())
        except (AttributeError, TypeError, ValueError):
            return None

    bounds = label.split('-')
    low = to_int(bounds[0])
    high = to_int(bounds[1]) if len(bounds) > 1 else None
    return low, high


def _company_round_and_funding(stage):
    """Map a stage label to ``(round code, funding type)``; unknown -> (0, 0)."""
    # Compared with == in sequence rather than through a dict lookup, so the
    # comparison semantics of the original if/elif chain are kept.
    known_stages = (
        ('不需要融资', 0, 8010),
        ('未融资', 0, 0),
        ('天使轮', 1010, 0),
        ('A轮', 1030, 0),
        ('B轮', 1040, 0),
        ('C轮', 1050, 0),
        ('D轮及以上', 1060, 0),
        ('上市公司', 1110, 0),
    )
    for label, round_code, funding_type in known_stages:
        if stage == label:
            return round_code, funding_type
    return 0, 0


def parse_company(company_key):
    """Parse one stored company page and persist the resulting records.

    Loads the page for ``company_key`` from ``fromdb.company``, extracts the
    company profile, stores it through
    ``parser_util.insert_source_company``, runs the artifact / member /
    develop / job sub-parsers and finally publishes a ``"company"`` message
    on the ``parser_v2`` topic.

    Returns ``None`` in every case. Nothing is stored when the page is
    missing, when the name or full name is absent, or when the page has no
    usable description (placeholder text or fewer than 10 characters).
    """
    company = fromdb.company.find_one({"source": source, "company_key": company_key})
    if company is None:
        return

    d = pq(company["content"])

    logo_url = d('.top_info_wrap > img').attr('src')
    logo_id = None
    if logo_url is not None:
        logo_id = parser_util.get_logo_id(source, company_key, 'company', logo_url)

    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    fullName = title_link.attr('title')
    if name is None or fullName is None:
        return
    # Use the shorter of the displayed name and the title attribute as name.
    if len(name) > len(fullName):
        name = fullName

    brief = d('.company_word').text()
    desc_text = d('.company_intro_text').text()
    if u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10:
        return

    desc_html = d('.company_intro_text > .company_content').html()
    if desc_html is None:
        # No inner content node: fall back to the plain text read above
        # instead of crashing on None.replace().
        desc = desc_text
    else:
        # Drop the "expand" toggle markup before stripping the tags.
        desc_html = desc_html.replace('<span class="text_over">展开</span>', '')
        desc = BeautifulSoup(desc_html).getText()
    logger.info(desc)

    basic_item = '#basic_container > .item_content >ul > li:eq(%d) > span'
    field = _company_page_text(d, basic_item % 0)
    stage = _company_page_text(d, basic_item % 1)
    headCount = _company_page_text(d, basic_item % 2)
    location = _company_page_text(d, basic_item % 3)
    address = _company_page_text(d, '.con_mlist_ul > li:eq(0) > p:eq(1)')

    # Keep only the part in front of the u'人' unit. A label without the
    # unit is used as-is and no longer prevents location/address from
    # being read.
    unit_pos = headCount.find(u'人')
    if unit_pos >= 0:
        headCount = headCount[:unit_pos]
    min_staff, max_staff = _company_head_count_bounds(headCount)

    round_code, funding_type = _company_round_and_funding(stage)

    location_id = parser_util.get_location_id(location)

    source_company = {"name": name,
                      "fullName": fullName,
                      "description": desc,
                      "brief": brief,
                      "round": round_code,
                      "roundDesc": None,
                      "companyStatus": 2010,
                      'fundingType': funding_type,
                      "locationId": location_id,
                      "address": address,
                      "phone": None,
                      "establishDate": None,
                      "logo": logo_id,
                      "source": source,
                      "sourceId": company_key,
                      "field": field,
                      "subField": None,
                      "tags": None,
                      "headCountMin": min_staff,
                      "headCountMax": max_staff
                      }

    source_company_id = parser_util.insert_source_company(source_company)

    parse_artifact(d, source_company_id)
    parser_member(d, company_key, source_company_id)
    parser_develop(d, company_key, source_company_id)
    parser_job(company_key, source_company_id)

    msg = {"type": "company", "id": source_company_id}
    logger.info(msg)
    kafka_producer.send_messages("parser_v2", json.dumps(msg))
def _pc_month_start(year, month):
    """Return the first day of ``year``/``month`` as a datetime, or None.

    A month that is not an integer in 1..12 falls back to January; a year
    that ``%Y`` cannot parse yields ``None`` instead of raising.
    """
    try:
        month = int(month)
    except (TypeError, ValueError):
        month = 1
    if month <= 0 or month > 12:
        month = 1
    try:
        return datetime.datetime.strptime("%s-%s-1" % (year, month), '%Y-%m-%d')
    except ValueError:
        return None


def _pc_image_id(existing, column, url, filename):
    """Reuse the image id stored in ``existing[column]`` or store a download.

    Returns ``None`` when there is neither a stored id nor a downloadable
    image.
    """
    if existing is not None and existing[column] is not None and existing[column] != "":
        return existing[column]
    if url is None or url.strip() == "":
        return None
    logger.info(url)
    image_value = my_request.get_image(logger, url)
    # get_image may yield nothing; never hand None to imgfs.put.
    if image_value is None:
        return None
    return imgfs.put(image_value, content_type='jpeg', filename=filename)


def _pc_location_id(location_text):
    """Resolve "province·city" text to a locationId (city first); 0 if unknown."""
    result = util.re_get_result(u'(.*?)·(.*?)$', location_text)
    if result is None:
        return 0
    (province, city) = result
    province = province.strip()
    city = city.strip()
    logger.info("location: %s-%s" % (province, city))
    row = conn.get("select * from location where locationName=%s", city)
    if row is None:
        row = conn.get("select * from location where locationName=%s", province)
    if row is None:
        return 0
    return row["locationId"]


def _pc_save_company(d, source, company_key):
    """Insert or update the source_company row for the page.

    Returns ``(source_company_id, product_name, website, desc)``, or
    ``None`` when the page carries no name and no website.
    """
    company_short_name = ""
    product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s", product_name)
    logger.info("company short name: %s", company_short_name)

    company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:", "")
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""
    company_name = util.norm_company_name(company_name)
    logger.info("company name: %s", company_name)

    # attr() returns None when the link is missing.
    website = (d('div.link-line> a.weblink').attr("href") or "").strip()
    if website == "http://%e6%9a%82%e6%97%a0":
        website = ""
    logger.info("website: %s", website)

    if company_short_name == "" and company_name == "" and website == "":
        return None

    establish_date = None
    date_text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    # NOTE(review): the separator is an unescaped "." (any character), kept
    # as it was.
    result = util.re_get_result(r'(\d*?).(\d*?)$', date_text)
    if result is not None:
        (year, month) = result
        establish_date = _pc_month_start(year, month)
    logger.info("establish date: %s", establish_date)

    locationId = _pc_location_id(d('span.loca').text().strip())
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    if d('div.des-more> div').eq(2).text().strip() == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    funding_text = d("span.tag.bg-c").text().strip()
    logger.info(funding_text)
    if funding_text == "融资需求 · 需要融资":
        funding_type = 8020
    elif funding_text == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s", field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s", sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ", ",")
    logger.info(tags)

    desc = d("div.des").text().strip()
    logger.info("desc: %s", desc)

    source_company = conn.get("select * from source_company where source=%s and sourceId=%s",
                              source, company_key)
    logo_id = _pc_image_id(source_company, "logo", d("div.pic >img").attr("src"),
                           'company_%s_%s.jpg' % (source, company_key))
    logger.info("gridfs logo_id=%s" % logo_id)

    if source_company is None:
        # The row is stored under the same ``source`` the lookup above uses,
        # and ``tags`` is already a comma separated string (joining it
        # again would put a comma between every character).
        source_company_id = conn.insert(
            "insert source_company(name,fullName,description,brief,"
            "round,roundDesc,companyStatus,fundingType,locationId,establishDate,logo,"
            "source,sourceId,createTime,modifyTime,"
            "field,subField,tags) "
            "values(%s,%s,%s,%s,"
            "%s,%s,%s,%s,%s,%s,%s,"
            "%s,%s,now(),now(),"
            "%s,%s,%s)",
            product_name, company_name, desc, '',
            0, '', company_status, funding_type, locationId, establish_date, logo_id,
            source, company_key,
            field, sub_field, tags)
    else:
        source_company_id = source_company["id"]
        conn.update(
            "update source_company set "
            "name=%s,fullName=%s,description=%s, "
            "companyStatus=%s,fundingType=%s,locationId=%s,establishDate=%s,logo=%s, "
            "field=%s,subField=%s,"
            "modifyTime=now() "
            "where id=%s",
            product_name, company_name, desc,
            company_status, funding_type, locationId, establish_date, logo_id,
            field, sub_field,
            source_company_id)

    return source_company_id, product_name, website, desc


def _pc_add_website_artifact(source_company_id, name, description, link):
    """Insert a type-4010 source_artifact unless the link is already stored.

    Returns True when a row was inserted.
    """
    existing = conn.get(
        "select * from source_artifact where sourceCompanyId=%s and type=4010 and link=%s",
        source_company_id, link)
    if existing is not None:
        return False
    conn.insert(
        "insert source_artifact(sourceCompanyId,`name`,`description`,`link`,`type`,createTime,modifyTime) "
        "values(%s,%s,%s,%s,4010,now(),now())",
        source_company_id, name, description, link)
    return True


def _pc_save_artifacts(d, source_company_id, product_name, website, company_desc):
    """Store the listed website products and the company website itself."""
    logger.info("*** artifact ***")
    for node in d('ul.list-prod> li> a'):
        entry = pq(node)
        if entry('h4> span').text().strip() != "网站":
            continue
        link = (entry.attr("href") or "").strip()
        name = entry('h4> b').text().strip()
        # Kept in its own variable so the company description used for the
        # website artifact below is not overwritten.
        product_desc = entry('p').text().strip()
        logger.info("name: %s, link: %s, desc: %s" % (name, link, product_desc))
        if link == "":
            continue
        _pc_add_website_artifact(source_company_id, name, product_desc, util.norm_url(link))

    if website != "":
        if _pc_add_website_artifact(source_company_id, product_name, company_desc, website):
            logger.info("name: %s, link: %s, desc: %s" % (product_name, website, company_desc))


def _pc_save_footprints(d, source_company_id):
    """Store the milestones that carry both a description and a date."""
    logger.info("*** footprint ***")
    for node in d('ul.list-milestone> li'):
        entry = pq(node)
        footDesc = entry('p').eq(0).text().strip()
        if footDesc is None or footDesc == "":
            continue
        footDateText = entry('p> span').text().strip()
        if footDateText is None or footDateText == "":
            continue
        result = util.re_get_result(r'(\d*?)\.(\d*?)$', footDateText)
        if result is None:
            continue
        (year, month) = result
        # An unparsable year is treated like an out-of-range one.
        try:
            year = int(year)
        except (TypeError, ValueError):
            year = 1970
        if year < 1970 or year > 3000:
            year = 1970
        footDate = _pc_month_start(year, month)
        if footDate is None:
            continue
        logger.info(footDate)
        logger.info(footDesc)

        fp = conn.get(
            "select * from source_footprint where sourceCompanyId=%s and footDate=%s and description=%s",
            source_company_id, footDate, footDesc)
        if fp is None:
            conn.insert(
                "insert source_footprint(sourceCompanyId,footDate,description,createTime,modifyTime) "
                "values(%s,%s,%s,now(),now())",
                source_company_id, footDate, footDesc)


def _pc_funding_round(roundStr):
    """Map a round label to ``(round code, label)``; unknown labels give 0."""
    # A seed round is stored as an angel round.
    if roundStr.startswith("种子"):
        return 1010, "天使"
    prefixes = (
        ("天使", 1010),
        ("Pre-A", 1020),
        ("A", 1030),
        ("B", 1040),
        ("Pre-B", 1040),
        ("C", 1050),
        ("D", 1060),
        ("E", 1070),
        ("F", 1100),
        ("IPO", 1110),
        ("收购", 1120),
    )
    for prefix, code in prefixes:
        if roundStr.startswith(prefix):
            return code, roundStr
    return 0, roundStr


def _pc_save_investor(source, investor_key):
    """Insert or update one source_investor; return its id, or None."""
    item = fromdb.investor.find_one({"source": source, "investor_key": investor_key})
    inv = parseInvestor(item)
    if inv is None:
        return None
    (name, logo, website, stage, field, desc) = inv

    source_investor = conn.get("select * from source_investor where source=%s and sourceId=%s",
                               source, investor_key)
    logo_id = _pc_image_id(source_investor, "logo", logo,
                           'investor_%s_%s.jpg' % (source, investor_key))
    logger.info("gridfs logo_id=%s" % logo_id)

    if source_investor is None:
        return conn.insert(
            "insert source_investor(name,website,description,logo,stage,field,type, "
            "source,sourceId,createTime,modifyTime) "
            "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())",
            name, website, desc, logo_id, stage, field, 10020, source, investor_key)

    source_investor_id = source_investor["id"]
    conn.update(
        "update source_investor set name=%s,website=%s,description=%s,logo=%s,stage=%s,"
        "field=%s,type=%s,modifyTime=now() where id=%s",
        name, website, desc, logo_id, stage, field, 10020, source_investor_id)
    return source_investor_id


def _pc_save_fundings(d, source, source_company_id):
    """Store every funding row together with its investors."""
    logger.info("*** funding ***")
    for node in d('table.list-round-v2> tr'):
        row = pq(node)
        dateStr = row('td> span.date').text().strip()
        result = util.re_get_result(r'(\d*?)\.(\d*?)\.(\d*?)$', dateStr)
        fundingDate = None
        if result is not None:
            (year, month, day) = result
            try:
                fundingDate = datetime.datetime.strptime("%s-%s-%s" % (year, month, day), '%Y-%m-%d')
            except ValueError:
                # Incomplete or impossible date: store the funding undated.
                fundingDate = None
        logger.info(fundingDate)

        roundStr = row('td.mobile-none> span.round> a').text().strip().replace("轮", "")
        logger.info(roundStr)
        fundingRound, roundStr = _pc_funding_round(roundStr)
        logger.info("fundingRound=%d" % fundingRound)

        moneyStr = row('td> span.finades> a').text().strip()
        (currency, investment, precise) = parseMoney(moneyStr)
        logger.info("%s - %s - %s" % (currency, investment, precise))

        source_funding = conn.get(
            "select * from source_funding where sourceCompanyId=%s and roundDesc=%s",
            source_company_id, roundStr)
        if source_funding is None:
            source_funding_id = conn.insert(
                "insert source_funding(sourceCompanyId,investment,round,roundDesc, currency, precise, "
                "fundingDate,createTime,modifyTime) "
                "values(%s,%s,%s,%s,%s,%s,%s,now(),now())",
                source_company_id, investment, fundingRound, roundStr, currency, precise, fundingDate)
        else:
            source_funding_id = source_funding["id"]
            conn.update(
                "update source_funding set investment=%s,currency=%s, precise=%s, fundingDate=%s, "
                "modifyTime=now() where id=%s",
                investment, currency, precise, fundingDate, source_funding_id)

        for anchor in row('td:eq(3)> a'):
            investor = pq(anchor)
            investor_name = investor.text().strip()
            investor_url = (investor.attr("href") or "").strip()
            result = util.re_get_result(r"http://www.itjuzi.com/investfirm/(\d*)$", investor_url)
            if result is None:
                # Not an investor profile link: nothing to look up.
                continue
            (investor_key,) = result
            logger.info(investor_name)
            logger.info(investor_url)
            logger.info(investor_key)

            source_investor_id = _pc_save_investor(source, investor_key)
            if source_investor_id is None:
                continue
            rel = conn.get(
                "select * from source_funding_investor_rel where "
                "sourceFundingId=%s and sourceInvestorId=%s",
                source_funding_id, source_investor_id)
            if rel is None:
                conn.insert(
                    "insert source_funding_investor_rel(sourceFundingId, sourceInvestorId, "
                    "createTime,modifyTime) "
                    "values(%s,%s, now(),now())",
                    source_funding_id, source_investor_id)


def _pc_save_members(d, source, source_company_id):
    """Store the listed team members and link them to the company."""
    logger.info("*** member ****")
    for node in d('ul.list-prodcase> li'):
        entry = pq(node)
        member_name = entry('h4> a> b> span.c').text().strip()
        position = entry('h4> a> b> span.c-gray').text().strip()
        member_url = (entry('h4> a').attr("href") or "").strip()
        result = util.re_get_result(r'person/(\d*?)$', member_url)
        if result is None:
            # No person profile link: the member cannot be keyed.
            continue
        (member_key,) = result
        logger.info("member_key: %s, member_name: %s, position: %s" % (member_key, member_name, position))

        item = fromdb.member.find_one({"source": source, "member_key": member_key})
        m = parseMember(item)
        if m is None:
            continue
        (weibo, introduction, education, work, location, role, pictureUrl) = m

        source_member = conn.get("select * from source_member where source=%s and sourceId=%s",
                                 source, member_key)
        photo_id = _pc_image_id(source_member, "photo", pictureUrl,
                                'member_%s_%s.jpg' % (source, member_key))
        logger.info("gridfs logo_id=%s" % photo_id)

        if source_member is None:
            source_member_id = conn.insert(
                "insert source_member(name,photo,weibo,location,role,description,"
                "education,work,source,sourceId,createTime,modifyTime) "
                "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())",
                member_name, photo_id, weibo, location, role, introduction,
                education, work, source, member_key)
        else:
            source_member_id = source_member["id"]
            conn.update(
                "update source_member set name=%s,photo=%s,weibo=%s,location=%s,role=%s,description=%s,"
                "education=%s,work=%s,modifyTime=now() where id=%s",
                member_name, photo_id, weibo, location, role, introduction,
                education, work, source_member_id)

        rel = conn.get(
            "select * from source_company_member_rel where "
            "sourceCompanyId=%s and sourceMemberId=%s",
            source_company_id, source_member_id)
        if rel is None:
            conn.insert(
                "insert source_company_member_rel(sourceCompanyId, sourceMemberId, "
                "position,type,createTime,modifyTime) "
                "values(%s,%s,%s,%s, now(),now())",
                source_company_id, source_member_id, position, 0)


def _pc_parse_news(d, source, company_key):
    """Run parseNews on every listed news item; failures are logged per item."""
    logger.info("*** news ***")
    for node in d('ul.list-news> li'):
        try:
            entry = pq(node)
            news_url = entry('p.title> a').attr("href").strip()
            (news_key,) = util.re_get_result(r"http://www.itjuzi.com/overview/news/(\d*)$", news_url)
            item = fromdb.news.find_one({"source": source, "company_key": company_key, "news_key": news_key})
            parseNews(item)
        except Exception as ex:
            logger.exception(ex)


def parseCompany(source, company_key):
    """Parse a stored company page and persist everything found on it.

    Loads the page for ``(source, company_key)`` from ``fromdb.company`` and
    writes, through the module-level ``conn``: the source_company row, its
    website artifacts, footprints, fundings with investors, and members.
    News items are handed to ``parseNews``. A ``"company"`` message is then
    published on the ``parser_v2`` topic.

    Returns ``None``. Nothing is written when the page is missing or when it
    carries neither a name nor a website.
    """
    logger.info("*****************************************")
    logger.info("parseComany, company_key=%s" % company_key)
    try:
        item = fromdb.company.find_one({"source": source, "company_key": company_key})
        if item is None:
            return

        d = pq(item["content"])

        saved = _pc_save_company(d, source, company_key)
        if saved is None:
            return
        (source_company_id, product_name, website, desc) = saved

        _pc_save_artifacts(d, source_company_id, product_name, website, desc)
        _pc_save_footprints(d, source_company_id)
        _pc_save_fundings(d, source, source_company_id)
        _pc_save_members(d, source, source_company_id)
        _pc_parse_news(d, source, company_key)

        msg = {"type": "company", "id": source_company_id}
        kafkaProducer.send_messages("parser_v2", json.dumps(msg))
    except Exception as ex:
        # NOTE(review): the handler that belonged to this try block was not
        # present in the reviewed source; logging the failure is assumed
        # here (it matches the per-news-item handling) - confirm.
        logger.exception(ex)
def _agg_insert_company(s):
    """Insert a company row copied from source_company row ``s``; return its id."""
    code = get_company_code(s["name"])
    sql = ("insert company(code,name,fullName,description,brief,"
           "productDesc, modelDesc, operationDesc, teamDesc, marketDesc, compititorDesc, advantageDesc, planDesc, "
           "round,roundDesc,companyStatus,fundingType,preMoney,currency,"
           "locationId,address,phone,establishDate,logo,type,"
           "headCountMin,headCountMax,"
           "active,createTime,modifyTime) "
           "values(%s,%s,%s,%s,%s,"
           "%s,%s,%s,%s,%s,%s,%s,%s, "
           "%s,%s,%s,%s,%s,%s,"
           "%s,%s,%s,%s,%s,41020,"
           "%s,%s,"
           "%s,now(),now())")
    return conn.insert(
        sql,
        code, s["name"], s["fullName"], s["description"], s["brief"],
        s.get("productDesc"), s.get("modelDesc"), s.get("operationDesc"), s.get("teamDesc"),
        s.get("marketDesc"), s.get("compititorDesc"), s.get("advantageDesc"), s.get("planDesc"),
        s["round"], s["roundDesc"], s["companyStatus"], s["fundingType"], s["preMoney"], s["currency"],
        s["locationId"], s["address"], s["phone"], s["establishDate"], s["logo"],
        s["headCountMin"], s["headCountMax"],
        'Y')


def _agg_sync_domains(company_id, source_company_id):
    """Copy source_domain rows to domain and register organizer aliases."""
    source_domains = conn.query("select * from source_domain where sourceCompanyId=%s",
                                source_company_id)
    for sd in source_domains:
        if sd["organizerType"] == "企业":
            add_company_alias(company_id, sd["organizer"])

        if sd["organizer"] is not None:
            domain = conn.get(
                "select * from domain where companyId=%s and domain=%s and organizer=%s",
                company_id, sd["domain"], sd["organizer"])
        else:
            domain = conn.get(
                "select * from domain where companyId=%s and domain=%s limit 1",
                company_id, sd["domain"])
        if domain is None:
            sql = ("insert domain(companyId,domain,organizer,organizerType,beianhao,mainBeianhao,"
                   "websiteName,homepage,beianDate,expire,"
                   "active,createTime,modifyTime) "
                   "values(%s,%s,%s,%s,%s,%s,"
                   "%s,%s,%s,%s,"
                   "'Y',now(),now())")
            conn.insert(sql,
                        company_id, sd["domain"], sd["organizer"], sd["organizerType"],
                        sd["beianhao"], sd["mainBeianhao"],
                        sd["websiteName"], sd["homepage"], sd["beianDate"], sd["expire"])
        # TODO: handle expired domains.


def _agg_android_package(link):
    """Return the package name embedded in an android market link, or None."""
    _artifact_type, market = util.get_market(link)
    if market == 16030:  # wandoujia
        pattern = r'wandoujia.com/apps/(.*)'
    elif market == 16040:
        pattern = r'apkName=(.*)'
    else:
        return None
    result = util.re_get_result(pattern, link)
    if result is None:
        return None
    package, = result
    return package


def _agg_link_artifacts(company_id, source_company_id):
    """Attach every still-unlinked source_artifact to an artifact row."""
    insert_sql = ("insert artifact(companyId,name,description,link,domain,type,active,createTime,modifyTime) "
                  "values(%s,%s,%s,%s,%s,%s,'Y',now(),now())")
    sas = conn.query("select * from source_artifact where sourceCompanyId=%s", source_company_id)
    for sa in sas:
        if sa["artifactId"] is not None:
            continue
        link = sa["link"]
        # Every branch below needs a link to derive its lookup key from.
        if link is None or link == "":
            continue

        artifact_type = sa["type"]
        if artifact_type == 4010:  # website
            link = util.norm_url(link)
            try:
                key = util.get_domain(link)
            except Exception as ex:
                # A link without a usable domain is skipped, as before, but
                # the reason is now logged.
                logger.exception(ex)
                continue
            existing = conn.get(
                "select * from artifact where companyId=%s and type=4010 and (name=%s or link=%s) limit 1",
                company_id, sa["name"], link)
        elif artifact_type == 4040:  # itunes
            result = util.re_get_result(r'id(\d*)', link)
            if result is None:
                continue
            key, = result
            existing = conn.get("select * from artifact where type=4040 and domain=%s", key)
        elif artifact_type == 4050:  # android
            key = _agg_android_package(link)
            if key is None:
                continue
            existing = conn.get("select * from artifact where type=4050 and domain=%s", key)
        else:
            continue

        if existing is None:
            artifact_id = conn.insert(insert_sql, company_id, sa["name"], sa["description"],
                                      link, key, artifact_type)
        else:
            artifact_id = existing["id"]
        conn.update("update source_artifact set artifactId=%s where id=%s", artifact_id, sa["id"])


def aggregate(source_company_id):
    """Merge one source_company row into the company tables.

    Finds the matching company through ``find_company`` or inserts a new
    one, links the source row to it, registers aliases, copies domains,
    attaches unlinked artifacts and finally publishes a ``"company"``
    message on the ``aggregator_v2`` topic.

    Returns ``None``. Nothing happens when the source row does not exist, or
    when no company matches and the source row has companyStatus 2020.
    Publishing is retried every 60 seconds until it succeeds, so this call
    blocks while the producer keeps failing.
    """
    logger.info("source_company_id: %s" % source_company_id)
    s = conn.get("select * from source_company where id=%s", source_company_id)
    if s is None:
        return

    company_id = find_company(s)
    if company_id is not None:
        logger.info("Update company: %s" % s["name"])
    else:
        logger.info("New company: %s" % s["name"])
        # A company row is never created for status 2020.
        if s["companyStatus"] == 2020:
            return
        company_id = _agg_insert_company(s)

    logger.info("companyId=%s", company_id)
    conn.update("update source_company set companyId=%s where id=%s", company_id, source_company_id)

    add_company_alias(company_id, s["fullName"])
    _agg_sync_domains(company_id, source_company_id)
    _agg_link_artifacts(company_id, source_company_id)

    msg = {"type": "company", "id": company_id}
    while True:
        try:
            kafkaProducer.send_messages("aggregator_v2", json.dumps(msg))
            break
        except Exception as e:
            logger.exception(e)
            time.sleep(60)