def insert_funding(sid, roundstr, inv, fundingDate, investor):
    """Normalize one funding record and save it with its single investor.

    The amount text ``inv`` has all whitespace removed and, when it is one of
    the known approximate wordings, is replaced by a canonical amount string
    before being handed to ``itjuzi_helper.getMoney``. Two truncated round
    labels ("re-A轮", "re-IPO") are repaired before being handed to
    ``itjuzi_helper.getFundingRound``.

    Args:
        sid: stored as ``sourceCompanyId`` of the funding.
        roundstr: round label text.
        inv: amount text (must support ``.split()``).
        fundingDate: stored unchanged as ``fundingDate`` of the funding.
        investor: investor name; its ``util.md5str`` is used as ``sourceId``.

    Returns:
        None. The function is best-effort: any ``Exception`` raised while
        normalizing or saving is logged together with the inputs and
        swallowed.
    """
    # (canonical amount, approximate wordings) -- checked in this order,
    # first match wins.
    amount_aliases = (
        ("1000万人民币",
         ["超千万人民币", "千万人民币", "近千万人民币", "过千万人民币",
          "上千万人民币", "1千万人民币"]),
        ("1亿人民币",
         ["超亿人民币", "近亿人民币", "过亿人民币", "上亿人民币", "亿人民币",
          "一亿人民币", "亿人民币及以上人民币"]),
        ("1000万美元",
         ["超千万美元", "千万美元", "近千万美元", "过千万美元", "上千万美元",
          "1千万美元"]),
        ("100万美元",
         ["百万美元", "近百万美元", "过百万美元", "上百万美元", "1百万美元"]),
        ("100万人民币",
         ["百万人民币", "近百万人民币", "过百万人民币", "上百万人民币",
          "1百万人民币"]),
    )
    # Round labels that arrive with their leading "P" missing.
    round_fixes = {"re-A轮": "Pre-A", "re-IPO": "Pre-IPO"}

    try:
        inv = "".join(inv.split())
        for canonical, aliases in amount_aliases:
            if inv in aliases:
                inv = canonical
                break

        roundstr = round_fixes.get(roundstr, roundstr)

        # ``unicode`` is the Python 2 builtin; this module targets Python 2.
        fundingRound, roundStr = itjuzi_helper.getFundingRound(
            unicode(roundstr))
        currency, investment, precise = itjuzi_helper.getMoney(unicode(inv))

        source_funding = {
            "sourceCompanyId": sid,
            "preMoney": None,
            "postMoney": None,
            "investment": investment,
            "precise": precise,
            "round": fundingRound,
            "roundDesc": roundStr,
            "currency": currency,
            "fundingDate": fundingDate,
            "newsUrl": None
        }
        source_investors = [{
            "name": investor,
            "website": None,
            "description": None,
            "logo_url": None,
            "stage": None,
            "field": None,
            "type": 10020,
            "source": 13100,
            "sourceId": util.md5str(investor)
        }]

        # ``download_crawler`` is read from module scope, as in the original.
        parser_db_util.save_funding_standard(source_funding, download_crawler,
                                             source_investors)
    except Exception:
        # The handler used to reference an undefined name (``fdate``), which
        # turned every failure into a NameError; log the real argument and
        # keep the traceback.
        logger.exception("%s/%s/%s/%s", roundstr, inv, fundingDate, investor)
def parse(item):
    """Extract one funding event from a crawled funding page.

    Args:
        item: mapping whose ``"content"`` entry is the page HTML, or None.

    Returns:
        * None when ``item`` is None or no ``source_company`` row matches the
          company key found on the page;
        * -1 when the page has no ``a.name`` link to take the company key
          from;
        * otherwise a dict with keys ``sourceCompanyId``, ``fundingDate``,
          ``fundingRound``, ``roundStr``, ``currency``, ``investment``,
          ``precise`` and ``investors`` (a list of dicts with ``name``,
          ``key``, ``url``, ``type``).
    """
    if item is None:
        return None

    d = pq(item["content"])
    logger.info("*** funding ***")

    company_href = d("a.name").attr("href")
    if company_href is None:
        return -1
    # The company key is the last path segment of the company link.
    company_key = company_href.strip().split("/")[-1]
    logger.info("company_key: %s", company_key)

    conn = db.connect_torndb()
    try:
        source_company = conn.get(
            "select * from source_company where source=%s and sourceId=%s",
            SOURCE, company_key)
    finally:
        # Release the connection even when the query raises.
        conn.close()
    if source_company is None:
        logger.info("this source company doesn't exist yet")
        return None

    source_company_id = source_company["id"]
    logger.info("sourceComapnyId: %s", source_company_id)

    dateStr = d(
        'div.block> div.titlebar-center> p> span.date').text().strip()
    result = util.re_get_result(r'(\d*?)\.(\d*?)\.(\d*?)$', dateStr)
    fundingDate = None
    if result is not None:
        (year, month, day) = result
        y = int(year)
        if 2100 <= y <= 2109:
            # Years 2100-2109 are remapped onto 2010-2019.
            year = 2010 + y % 10
        m = int(month)
        if m > 12:
            m = 12
            month = "12"
        # Clamp the day to the last valid day of the month.
        # NOTE(review): assumes itjuzi_helper.isRunnian returns a bool
        # (leap-year test) -- confirm.
        if m in (4, 6, 9, 11):
            max_day = 30
        elif m == 2:
            max_day = 29 if itjuzi_helper.isRunnian(int(year)) else 28
        else:
            max_day = 31
        if int(day) > max_day:
            day = max_day
        fundingDate = datetime.datetime.strptime(
            "%s-%s-%s" % (year, month, day), '%Y-%m-%d')
    logger.info(fundingDate)

    roundStr = d('span.round').text().strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("fundingRound=%d, roundStr=%s", fundingRound, roundStr)

    moneyStr = d('span.fina').text().strip()
    (currency, investment, precise) = itjuzi_helper.getMoney(moneyStr)
    logger.info("%s - %s - %s", currency, investment, precise)

    investors = []
    for node in d('h4.person-name> b >a.title'):
        link = pq(node)
        investor_name = link.text().strip()
        if investor_name == "":
            continue
        investor_url = link.attr("href")
        if investor_url:
            # Linked investor: key is the last path segment of its URL.
            investor_key = investor_url.strip().split("/")[-1]
            investors.append({
                "name": investor_name,
                "key": investor_key,
                "url": investor_url,
                "type": 38001
            })
            logger.info("Investor: %s, %s, %s", investor_key, investor_name,
                        investor_url)
        else:
            # No link: the text may hold several names separated by ";".
            investor_key = None
            for name in investor_name.split(";"):
                name = name.strip()
                if name == "":
                    continue
                investors.append({
                    "name": name,
                    "key": None,
                    "url": None,
                    "type": 38001
                })
                logger.info("Investor: %s, %s, %s", investor_key, name,
                            investor_url)

    return {
        "sourceCompanyId": source_company_id,
        "fundingDate": fundingDate,
        "fundingRound": fundingRound,
        "roundStr": roundStr,
        "currency": currency,
        "investment": investment,
        "precise": precise,
        "investors": investors
    }
def parseFinance_save(source_company_id, item, sourceId, download_crawler):
    """Parse every funding row of a crawled page and save each one.

    For each ``.funding-info tbody tr`` row the round label (first cell),
    amount (``.amount``), date (``.date``, ``%Y-%m-%d``) and investors
    (``.investor a``) are read and passed to
    ``parser_db_util.save_funding_standard``.

    Args:
        source_company_id: stored as ``sourceCompanyId`` of each funding.
        item: mapping whose ``"content"`` entry is the UTF-8 encoded page,
            or None (in which case nothing is done).
        sourceId: only used in log messages.
        download_crawler: forwarded to ``save_funding_standard``.

    Returns:
        None. The process is terminated via ``exit()`` when an amount has no
        recognised currency prefix or parses to 0.
    """
    logger.info("parseFinance_save")
    if item is None:
        return None

    d = pq(html.fromstring(item['content'].decode("utf-8")))
    for finance in d('.funding-info tbody tr'):
        row = d(finance)

        roundStr = row('td:nth-child(1)').text()
        fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)

        # Turn the currency prefix into the suffix form getMoney is given.
        fundingInvestment = row('.amount').text()
        if '¥ ' in fundingInvestment:
            fundingInvestment = fundingInvestment.replace('¥ ', '') + '人民币'
        elif '$ ' in fundingInvestment:
            fundingInvestment = fundingInvestment.replace('$ ', '') + '美元'
        else:
            logger.info('not RMB:%s %s', sourceId, fundingInvestment)
            # todo
            # NOTE(review): exit() here stops the whole process on an
            # unrecognised currency; kept as in the original -- confirm this
            # is still wanted.
            exit()

        fundingCurrency, fundingInvestment, precise = itjuzi_helper.getMoney(
            fundingInvestment)
        fundingDate = datetime.datetime.strptime(
            row('.date').text(), '%Y-%m-%d')

        source_funding = {
            "sourceCompanyId": source_company_id,
            "preMoney": None,
            "postMoney": None,
            "investment": fundingInvestment,
            "precise": precise,
            "round": fundingRound,
            "roundDesc": roundStr,
            "currency": fundingCurrency,
            "fundingDate": fundingDate,
            "newsUrl": None
        }

        if fundingInvestment == 0:
            logger.info("new invest case: %s", sourceId)
            exit()

        logger.info("%s, %s, %s, %s", roundStr, fundingRound,
                    fundingInvestment, fundingCurrency)

        source_investors = []
        for investor in row('.investor a'):
            link = d(investor)
            entityName = link.text().strip()
            logger.info(entityName)
            # Investor id is whatever follows "startups/" in the link.
            entityId = str(link.attr('href').split('startups/')[-1])
            source_investors.append({
                "name": entityName,
                "website": None,
                "description": None,
                "logo_url": None,
                "stage": None,
                "field": None,
                "type": 10020,
                "source": SOURCE,
                "sourceId": entityId
            })
        logger.info(
            json.dumps(source_investors, ensure_ascii=False,
                       cls=util.CJsonEncoder))

        try:
            parser_db_util.save_funding_standard(source_funding,
                                                 download_crawler,
                                                 source_investors)
        except Exception:
            # Saving stays best-effort per row, but the failure is now
            # recorded instead of being silently discarded.
            logger.exception("failed to save funding for %s", sourceId)
def parse(item):
    """Extract one funding event from an already-structured funding record.

    Args:
        item: mapping with ``"key"`` (funding key, logged only) and
            ``"content"`` (a mapping read for ``com_id``, ``date``,
            ``round``, ``money``, ``currency`` and optionally
            ``invsest_with``), or None.

    Returns:
        None when ``item`` is None or ``parser_db_util.get_company`` finds no
        company (the numeric company key is then remembered in the
        module-level ``nokeys`` list); otherwise a dict with keys
        ``sourceCompanyId``, ``fundingDate``, ``fundingRound``, ``roundStr``,
        ``currency``, ``investment``, ``precise`` and ``investors``.
    """
    if item is None:
        return None

    funding_key = item["key"]
    logger.info("funding_key: %s", funding_key)

    data = item["content"]
    logger.info("*** funding ***")

    company_key = data["com_id"]
    logger.info("company_key: %s", company_key)

    source_company = parser_db_util.get_company(13030, company_key)
    if source_company is None:
        logger.info("this source company doesn't exist yet")
        missing_key = int(company_key)
        if missing_key not in nokeys:
            nokeys.append(missing_key)
        return None

    source_company_id = source_company["id"]
    logger.info("sourceComapnyId: %s", source_company_id)

    fundingDate = datetime.datetime.strptime(data["date"], '%Y-%m-%d')
    logger.info(fundingDate)

    roundStr = data["round"]
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("fundingRound=%d, roundStr=%s", fundingRound, roundStr)

    moneyStr = data["money"] + data["currency"]
    (currency, investment, precise) = itjuzi_helper.getMoney(moneyStr)
    logger.info("%s - %s - %s", currency, investment, precise)

    investors = []
    # "invsest_with" (sic) is the key spelling used by the incoming data.
    if "invsest_with" in data and isinstance(data["invsest_with"], dict):
        for f in data["invsest_with"].values():
            investor_name = f["invst_name"]
            # NOTE(review): under Python 2 this compares against a byte
            # string literal; if names arrive as unicode the "未透露" check
            # may never match -- confirm.
            if investor_name == "" or investor_name == "未透露":
                continue
            # This record format carries no investor URL, so every investor
            # is stored without key/url; the name may hold several
            # investors separated by ";".
            investor_url = None
            investor_key = None
            for name in investor_name.split(";"):
                name = name.strip()
                if name == "":
                    continue
                investors.append({
                    "name": name,
                    "key": None,
                    "url": None,
                    "type": 38001
                })
                logger.info("Investor: %s, %s, %s", investor_key, name,
                            investor_url)

    return {
        "sourceCompanyId": source_company_id,
        "fundingDate": fundingDate,
        "fundingRound": fundingRound,
        "roundStr": roundStr,
        "currency": currency,
        "investment": investment,
        "precise": precise,
        "investors": investors
    }
def parse(item):
    """Extract one funding event from a crawled funding page.

    Args:
        item: mapping with ``"key"`` (funding key, logged only) and
            ``"content"`` (the page HTML), or None.

    Returns:
        * None when ``item`` is None or ``parser_db_util.get_company`` finds
          no company for the key on the page (the numeric key is then
          remembered in the module-level ``nokeys`` list);
        * -1 when the page has no ``a.name`` link to take the company key
          from;
        * otherwise a dict with keys ``sourceCompanyId``, ``fundingDate``,
          ``fundingRound``, ``roundStr``, ``currency``, ``investment``,
          ``precise`` and ``investors``.
    """
    if item is None:
        return None

    funding_key = item["key"]
    logger.info("funding_key: %s", funding_key)

    d = pq(item["content"])
    logger.info("*** funding ***")

    company_href = d("a.name").attr("href")
    if company_href is None:
        return -1
    # The company key is the last path segment of the company link.
    company_key = company_href.strip().split("/")[-1]
    logger.info("company_key: %s", company_key)

    source_company = parser_db_util.get_company(SOURCE, company_key)
    if source_company is None:
        logger.info("this source company doesn't exist yet")
        missing_key = int(company_key)
        if missing_key not in nokeys:
            nokeys.append(missing_key)
        return None

    source_company_id = source_company["id"]
    logger.info("sourceComapnyId: %s", source_company_id)

    dateStr = d('div.title> h1> span').text().strip()
    result = util.re_get_result(r'(\d*?)\.(\d*?)\.(\d*?)$', dateStr)
    fundingDate = None
    if result is not None:
        (year, month, day) = result
        y = int(year)
        if 2100 <= y <= 2109:
            # Years 2100-2109 are remapped onto 2010-2019.
            year = 2010 + y % 10
        m = int(month)
        if m > 12:
            m = 12
            month = "12"
        # Clamp the day to the last valid day of the month.
        # NOTE(review): assumes itjuzi_helper.isRunnian returns a bool
        # (leap-year test) -- confirm.
        if m in (4, 6, 9, 11):
            max_day = 30
        elif m == 2:
            max_day = 29 if itjuzi_helper.isRunnian(int(year)) else 28
        else:
            max_day = 31
        if int(day) > max_day:
            day = max_day
        fundingDate = datetime.datetime.strptime(
            "%s-%s-%s" % (year, month, day), '%Y-%m-%d')
    logger.info(fundingDate)

    roundStr = d(
        'div.block-inc-fina> table> tbody> tr> td> span.round').text().strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("fundingRound=%d, roundStr=%s", fundingRound, roundStr)

    moneyStr = d(
        'div.block-inc-fina> table> tbody> tr> td> span.fina').text().strip()
    (currency, investment, precise) = itjuzi_helper.getMoney(moneyStr)
    logger.info("%s - %s - %s", currency, investment, precise)

    investors = []
    # Investor links are taken from the third matched cell of the
    # financing-history table.
    investor_cell = pq(d('div.pad.finan-history> table >tr> td').eq(2))
    for node in investor_cell('span> a'):
        link = pq(node)
        investor_name = link.text().strip()
        if investor_name == "":
            continue
        investor_url = link.attr("href")
        if investor_url:
            # Linked investor: key is the last path segment of its URL.
            investor_key = investor_url.strip().split("/")[-1]
            investors.append({
                "name": investor_name,
                "key": investor_key,
                "url": investor_url,
                "type": 38001
            })
            logger.info("Investor: %s, %s, %s", investor_key, investor_name,
                        investor_url)
        else:
            # No link: the text may hold several names separated by ";".
            investor_key = None
            for name in investor_name.split(";"):
                name = name.strip()
                if name == "":
                    continue
                investors.append({
                    "name": name,
                    "key": None,
                    "url": None,
                    "type": 38001
                })
                logger.info("Investor: %s, %s, %s", investor_key, name,
                            investor_url)

    return {
        "sourceCompanyId": source_company_id,
        "fundingDate": fundingDate,
        "fundingRound": fundingRound,
        "roundStr": roundStr,
        "currency": currency,
        "investment": investment,
        "precise": precise,
        "investors": investors
    }