def crawlDivident(code):
    link = "https://stock.xueqiu.com/v5/stock/f10/us/bonus.json?symbol=%s&size=10000&page=1&extend=true" % code
    session = HTMLSession()
    r = session.get(link, headers=HEADERS, cookies=COOKIES)
    content = json.dumps(json.loads(r.content))
    path = "C:/project/stockdata/USDivident/%s.json" % code
    write2File(path, content)
    FileLogger.info("get dividend of code: %s in size: %d" % (code, len(content)))
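# write2File is a shared helper defined elsewhere in the project. A minimal
# sketch of what it plausibly does; the optional "mode" keyword is inferred
# from the call in gettodayStock below:
def write2File_sketch(path, content, mode="w"):
    # create parent directories on first use, then write UTF-8 text
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, mode, encoding="utf-8") as f:
        f.write(content)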
def crawlCashflow(code):
    link = "https://stock.xueqiu.com/v5/stock/finance/us/cash_flow.json?symbol=%s&type=all&is_detail=true&count=1000&timestamp=1616585707592" % code
    session = HTMLSession()
    r = session.get(link, headers=HEADERS, cookies=COOKIES)
    content = json.dumps(json.loads(r.content))
    path = "C:/project/stockdata/USCashflow/%s.json" % code
    write2File(path, content)
    FileLogger.info("get cashflow of code: %s in size: %d" % (code, len(content)))
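# The "timestamp" query parameter above is hard-coded to a fixed epoch value
# in milliseconds (1616585707592, roughly 2021-03-24). If the endpoint is
# meant to receive the current time instead, a sketch of generating it:
def currentMillis():
    # epoch milliseconds, same unit as the hard-coded value in the URL
    return int(time.time() * 1000)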
def crawlUSStocks():
    # query: select ts_code from usstock.stocklist;
    stockList = pd.read_csv("C:/project/Tushare/usstock/code.csv").to_numpy()
    for code in stockList:
        FileLogger.info("running on code: " + code[0])
        try:
            crawlHistory(code[0])
            time.sleep(1)
        except Exception as ex:
            FileLogger.error(ex)
            FileLogger.error("crawl error on code: %s" % code[0])
            time.sleep(5)
def parseIncomeBase(code):
    FileLogger.info("running on code: %s" % code)
    path = "C:/project/stockdata/USIncome/%s.json" % code
    text = readFile(path)
    if text:
        jsonObj = json.loads(text)
        jsonData = jsonObj['data']
        del jsonData['list']
        jsonData['ts_code'] = code
        global incomeBaseDF
        incomeBaseDF = incomeBaseDF.append([jsonData], ignore_index=True)
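# Note: DataFrame.append (used above and in retrieveAnualQuarterlyReport
# below) was deprecated in pandas 1.4 and removed in 2.0. A sketch of the
# pd.concat equivalent, should the project move to a newer pandas:
def appendRow_sketch(df, rowDict):
    # returns a new DataFrame with rowDict appended as a single row
    return pd.concat([df, pd.DataFrame([rowDict])], ignore_index=True)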
def gettodayStock():
    curDate = time.strftime("%Y%m%d", time.localtime())
    tryagain = True
    while tryagain:
        try:
            content = crawlLatestUsStocks()
            if content:
                path = "C:/project/stockdata/USDay/%s.txt" % curDate
                write2File(path, content, mode="w")
                FileLogger.info("crawl stock list successfully on date: " + curDate)
                tryagain = False
            else:
                time.sleep(60)
        except Exception as ex:
            FileLogger.error(ex)
            FileLogger.error("crawl stock list error, retry in 60 seconds")
            time.sleep(60)
def crawlHistory(code) -> bool:
    link = "https://stock.xueqiu.com/v5/stock/chart/kline.json?symbol=%s&begin=1616585707592&period=day&type=before&count=-100000&indicator=kline,pe,pb,ps,pcf,market_capital,agt,ggt,balance" % code
    session = HTMLSession()
    r = session.get(link, headers=HEADERS, cookies=COOKIES)
    jsonObj = json.loads(r.content)
    if jsonObj['error_code'] != 0 or "column" not in jsonObj["data"] or "item" not in jsonObj["data"]:
        FileLogger.error("get content error from: %s" % code)
        return False
    columns = jsonObj["data"]["column"]
    items = jsonObj["data"]["item"]
    if len(items) > 0:
        path = "C:/project/stockdata/UShistory/%s.csv" % code
        save2csv(columns, items, path)
        FileLogger.info("get %d lines from code: %s" % (len(items), code))
        return True
    return False
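# save2csv is another assumed shared helper. A minimal sketch: the kline
# payload gives parallel "column" names and "item" rows, which map directly
# onto a DataFrame:
def save2csv_sketch(columns, items, path):
    # one row per kline item, header taken from the "column" field
    pd.DataFrame(items, columns=columns).to_csv(path, index=False)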
def crawlStockNotices(code, orgId):
    records = []
    link = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    session = HTMLSession()
    data = POSTDATA.copy()
    data["stock"] = "%s,%s" % (code, orgId)
    r = session.post(link, data=data, headers=HEADERS)
    if r.content:
        jsonContent = json.loads(r.content)
        totalpages = jsonContent["totalpages"]
        announcements = jsonContent["announcements"]
        records.extend(announcements)
        FileLogger.info("get records on code: %s of totalPages: %d" % (code, totalpages))
        # page 1 was fetched above; one page past totalpages is also requested,
        # and empty pages are skipped by the check below
        for pageNum in range(2, totalpages + 2):
            time.sleep(0.1)
            data["pageNum"] = pageNum
            r = session.post(link, data=data, headers=HEADERS)
            if r.content:
                jsonContent = json.loads(r.content)
                announcements = jsonContent["announcements"]
                if announcements is not None and len(announcements) > 0:
                    records.extend(announcements)
                    FileLogger.info("get records on pageNum: %d" % pageNum)
    FileLogger.info("get %d records on code: %s" % (len(records), code))
    if len(records) != 0:
        content = json.dumps(records)
        path = "C:/project/stockdata/StockNotices/%s.json" % code
        write2File(path, content)
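# POSTDATA is defined elsewhere in the project. A hedged sketch of the form
# fields the cninfo hisAnnouncement/query endpoint is commonly called with;
# the values below are assumptions, and only "stock" and "pageNum" are
# overwritten by crawlStockNotices above:
POSTDATA_sketch = {
    "pageNum": 1,
    "pageSize": 30,
    "column": "szse",       # market column
    "tabName": "fulltext",  # full-text announcement search
    "stock": "",            # filled in as "code,orgId"
    "searchkey": "",
    "category": "",
    "seDate": "",           # date range filter; empty = full history
}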
def crawlBalance(code, companyType):
    records = []
    for date in DATES:
        link = "http://f10.eastmoney.com/NewFinanceAnalysis/zcfzbAjaxNew?companyType=%d&reportDateType=0&reportType=1&dates=%s&code=%s" % (
            companyType, date, code)
        session = HTMLSession()
        r = session.get(link, headers=HEADERS)
        jsonContent = json.loads(r.content)
        if "data" not in jsonContent:
            FileLogger.info("no more data on %s at dates: %s" % (code, date))
            break
        for obj in jsonContent["data"]:
            records.append(obj)
        FileLogger.info("get balance of code: %s in size: %d" % (code, len(jsonContent["data"])))
        # time.sleep(0.5)
    if len(records) != 0:
        content = json.dumps(records)
        path = "C:/project/stockdata/EastMoneyBalance/%s.json" % code
        write2File(path, content)
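# DATES is defined elsewhere. The zcfzbAjaxNew endpoint takes a comma-joined
# batch of report dates per request; a sketch of building such batches of
# quarter-end dates (the batch size of 5 and the year range are assumptions):
def buildDates_sketch(startYear=2000, endYear=2021, batch=5):
    quarters = ["03-31", "06-30", "09-30", "12-31"]
    # newest dates first, matching the "no more data -> break" loop above
    allDates = ["%d-%s" % (y, q) for y in range(endYear, startYear - 1, -1)
                for q in reversed(quarters)]
    # each element becomes one value for the "dates=..." query parameter
    return [",".join(allDates[i:i + batch]) for i in range(0, len(allDates), batch)]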
if __name__ == "__main__":
    stockList = getJsonFromFile("C:/project/stockdata/StockNotices/stock.json")
    stockList = stockList["stockList"]
    # stockList = [{"orgId": "9900002701", "category": "A股", "code": "002127", "pinyin": "njds", "zwjc": "南极电商"}]
    for stock in stockList:
        FileLogger.info("running on stock: %s(%s)" % (stock["zwjc"], stock["code"]))
        filePath = "C:/project/stockdata/StockNotices/%s.json" % stock['code']
        if os.path.exists(filePath):
            continue
        try:
            crawlStockNotices(stock["code"], stock["orgId"])
            time.sleep(1)
        except Exception as ex:
            FileLogger.error(ex)
            FileLogger.error("crawl notices error on code: %s" % stock["code"])
            time.sleep(3)
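# getJsonFromFile is an assumed helper, the read-side counterpart of
# write2File. A minimal sketch:
def getJsonFromFile_sketch(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)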
stockdf = pd.read_csv("C:/project/Tushare/eastmoney/codewithcompanytype.csv")
stockList = stockdf[['ts_code', 'companytype']].to_numpy()
stockList = [['SZ000002', 4]]
# stockList = [['SZ300144', 4]]

# add the base info into DB
for item in stockList:
    code = item[0]
    companyType = item[1]
    # need to process companyType 1-3
    if companyType != 4:
        continue
    FileLogger.info("running on code: %s" % code)
    # try:
    incomedf = dataGetter.getDataFromIncome(code)
    incomedf = incomedf.set_index("REPORT_DATE")
    incomedf = processor.keepOnlyYearData(incomedf).fillna(0)
    balancedf = dataGetter.getDataFromBalance(code)
    balancedf = balancedf.set_index("REPORT_DATE")
    balancedf = processor.keepOnlyYearData(balancedf).fillna(0)

    # rate = getIncomeYoY(code, incomedf)
    # rate = getGrossProfitRate(code, incomedf)
    # rate = getNetProfitRate(code, incomedf)
    # rate = getOperateProfitRate(code, incomedf)
    # rate = getProfitRate(code, incomedf)
    # rate = getOperateTaxRate(code, incomedf)
    # rate = getSalesRate(code, incomedf)
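# The rate helpers commented out above are not shown in this section. A
# hedged sketch of one of them, getGrossProfitRate; the eastmoney income
# column names used here (OPERATE_INCOME, OPERATE_COST) are assumptions:
def getGrossProfitRate_sketch(code, incomedf):
    # gross margin per report date: (revenue - cost of sales) / revenue
    rate = (incomedf["OPERATE_INCOME"] - incomedf["OPERATE_COST"]) / incomedf["OPERATE_INCOME"]
    FileLogger.info("gross profit rate of %s:\n%s" % (code, rate))
    return rate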
def retrieveAnualQuarterlyReport():
    stockList = getJsonFromFile("C:/project/stockdata/StockNotices/stock.json")
    stockList = stockList["stockList"]
    stockList = [{"orgId": "9900002701", "category": "A股", "code": "002127", "pinyin": "njds", "zwjc": "南极电商"}]
    # stockList = [{"orgId": "gssz0000002", "category": "A股", "code": "000002", "pinyin": "njds", "zwjc": "万科A"}]
    for stock in stockList:
        FileLogger.info("running on stock: %s(%s)" % (stock["zwjc"], stock["code"]))
        try:
            filePath = "C:/project/stockdata/StockNotices/%s.json" % stock['code']
            jsonList = getJsonFromFile(filePath)
            annualDf = None
            for jsonObj in jsonList:
                announcementType = jsonObj['announcementType']
                fileType = jsonObj['adjunctType']
                # map the announcement-type code to a report type:
                # '01030101' annual report, '01030301' semi-annual report,
                # '01030501' Q1 report, '01030701' Q3 report
                noticeType = None
                if announcementType.find("01030101") != -1:
                    noticeType = "年报"
                elif announcementType.find("01030701") != -1:
                    noticeType = "三季度报"
                elif announcementType.find("01030301") != -1:
                    noticeType = "半年报"
                elif announcementType.find("01030501") != -1:
                    noticeType = "一季度报"
                if noticeType is not None and fileType in ('PDF', 'PDF ', 'pdf'):
                    FileLogger.info("downloading file: %s" % jsonObj["announcementTitle"])
                    noticeDay = jsonObj['adjunctUrl'][10:20]
                    url = "http://www.cninfo.com.cn/new/announcement/download?bulletinId=%s&announceTime=%s" % (jsonObj['announcementId'], noticeDay)
                    annualData = {
                        'code': jsonObj['secCode'],
                        'name': jsonObj['secName'],
                        'announcementId': jsonObj['announcementId'],
                        'title': jsonObj['announcementTitle'],
                        'noticeDay': noticeDay,
                        'fileType': jsonObj['adjunctType'],
                        'url': url,
                        'Type': noticeType,
                        # an annual report published in year N covers year N-1
                        'year': int(noticeDay[0:4]) - 1 if noticeType == "年报" else int(noticeDay[0:4])
                    }
                    if annualDf is None:
                        annualDf = pd.DataFrame(columns=annualData.keys())
                    annualDf = annualDf.append(annualData, ignore_index=True)
            # save to DB
            from sqlalchemy import create_engine
            ENGINE = create_engine("mysql+pymysql://root:4401821211@localhost:3306/eastmoney?charset=utf8")
            annualDf.to_sql(name="reportbasic", con=ENGINE, if_exists="append")
        except Exception as ex:
            FileLogger.error(ex)
            FileLogger.error("retrieve error on code: %s" % stock["code"])
            time.sleep(3)
if __name__ == "__main__":
    # http://www.cninfo.com.cn/new/announcement/bulletin_detail?announceId=13519195&flag=true&announceTime=2004-01-17
    # retrieveAnualQuarterlyReport()
    stockdf = pd.read_csv("C:/project/stockdata/StockNoticesFile/annualreportlist.csv",
                          dtype={'code': str, 'year': str})
    # stockdf = stockdf[stockdf['code'] == '000002']
    stockList = stockdf[['code', 'name', 'year', 'announcementId', 'url']].to_numpy()
    # stockList = stockList[1:3]
    fileName = ""
    try:
        for stock in stockList:
            fileName = "[%s]%s年报-%s" % (stock[1], stock[2], stock[3])
            savePath = "C:/project/stockdata/StockNoticesFile/pdf/%s.pdf" % fileName
            # make sure it's a valid Windows path: no \/:?*"<>| characters
            savePath = savePath.replace("*", "")
            unresolvedPath = "C:/project/stockdata/StockNoticesFile/unresolved/%s.pdf" % fileName
            url = stock[4]
            if os.path.exists(savePath) or os.path.exists(unresolvedPath):
                FileLogger.info("file %s exists, skip!" % fileName)
            else:
                FileLogger.info("downloading file: %s" % fileName)
                downloadFile(url, savePath)
    except Exception as ex:
        FileLogger.error(ex)
        FileLogger.error("download error on file: %s" % fileName)
        time.sleep(3)
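# downloadFile is assumed to be defined elsewhere in the project. A minimal
# streaming sketch using requests (the chunk size is an arbitrary choice):
import requests

def downloadFile_sketch(url, savePath):
    with requests.get(url, headers=HEADERS, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(savePath, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)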