import csv
import json
from datetime import datetime
from os import walk

import bs4

# Project-local imports (Dish, DishPrice, Index, ScrapyService, MiracleService,
# logHelper) are assumed to be provided elsewhere in the package.


def collectDishBasic(self, pair_id, href, title):
    # href example: "/equities/celesio-mu-historical-data"
    basic_url = "https://cn.investing.com" + href
    # basic_url = "https://cn.investing.com/equities/boeing-co"  # debug override; do not ship
    print("basic_url = " + basic_url)

    dish = Dish()
    dish.setCompanyName(title)
    dish.setCompany(href.replace("/equities/", ""))

    service = ScrapyService()
    basic_response = service.callGetRequst(basic_url, "")
    basic_soup = bs4.BeautifulSoup(basic_response.text, "html.parser")

    # The equity page embeds structured data as <script type="application/ld+json">;
    # the ticker symbol comes from there.
    scripts_elements = basic_soup.findAll("script", {"type": "application/ld+json"})
    if len(scripts_elements) == 0:
        return None
    script_json = json.loads(scripts_elements[0].string)
    dish.setTicker(script_json["tickersymbol"])

    # Collect the price history via the "历史数据" ("Historical Data") sub-tab.
    self.collectHistoryDish(
        basic_soup.select('#pairSublinksLevel2')[0].findAll("li"),
        pair_id, dish.ticker)

    # The overview table holds attribute/value pairs (market cap, P/E, ...).
    overview_element = basic_soup.select('.overviewDataTable')[0]
    basic_elements = overview_element.findAll("div", {"class": "inlineblock"})
    for basic_ele in basic_elements:
        basic_attr = basic_ele.select('.float_lang_base_1')[0].text
        basic_value = basic_ele.select('.float_lang_base_2')[0].text
        self.setDishBasicInfo(dish, basic_attr, basic_value)
    return dish
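# A small defensive helper (a sketch, not part of the original code) for the
# repeated `soup.select(selector)[0]` pattern used above and below: return the
# first match, or None instead of raising IndexError when investing.com
# changes its markup.
def select_first(soup, selector):
    matches = soup.select(selector)
    return matches[0] if matches else None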
def collectIndexHistory(self, obj):
    # obj is one entry of self.dishes: {"ticker", "curr_id", "smlID", "url", ...}.
    now = datetime.now()
    if not self.isHistory:
        # Incremental run: only fetch from the start of the current year.
        self.st_date = now.strftime("%Y/01/01")
    # The price history is served by POSTing to
    # https://cn.investing.com/instruments/HistoricalDataAjax, handled inside
    # ScrapyService.getDishItemHistoryService.
    service = ScrapyService()
    dish_price_list = service.getDishItemHistoryService(
        obj.get("ticker"), obj.get("curr_id"), obj.get("smlID"), self.st_date)

    miracleService = MiracleService()
    indexItem = Index()
    indexItem.setTicker(obj.get("ticker"))
    indexItem.setDescription(obj.get("description"))
    miracleService.saveIndexItem(indexItem)
    miracleService.saveIndexPriceHistory(dish_price_list)
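# Hypothetical driver (not in the original code): collectIndexHistory takes one
# self.dishes entry at a time, so a full run would look roughly like this.
# Entries lacking curr_id/smlID cannot use the Ajax endpoint and are skipped.
def collectAllIndexHistories(self):
    for obj in self.dishes:
        if "curr_id" in obj and "smlID" in obj:
            self.collectIndexHistory(obj)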
def findSmlId(self, historical_url):
    # The historical-data page embeds
    #   window.histDataExcessInfo = {pairId: ..., smlId: ...}
    # in an inline <script>; find it and coerce it into valid JSON.
    service = ScrapyService()
    historical_response = service.callGetRequst(historical_url, "")
    historical_soup = bs4.BeautifulSoup(historical_response.text, "html.parser")
    for script in historical_soup.find_all('script'):
        if 'window.histDataExcessInfo' in script.prettify():
            json_string = script.prettify().replace('<script>', '').replace('</script>', '')
            # Keep only the right-hand side of the assignment and quote the bare keys.
            json_string = json_string[json_string.find('=') + 1:].strip().replace(
                'pairId', '"pairId"').replace('smlId', '"smlId"')
            return json.loads(json_string)
    return None
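# A more defensive variant of the extraction above (a sketch, not a drop-in
# replacement): pull pairId/smlId out of the inline assignment with a regex,
# so the parse does not depend on string replaces or on the position of '='.
import re

def parse_hist_data_excess_info(script_text):
    # Matches e.g. "window.histDataExcessInfo = {pairId: 169, smlId: 2030170}".
    match = re.search(
        r'window\.histDataExcessInfo\s*=\s*\{\s*pairId\s*:\s*(\d+)\s*,'
        r'\s*smlId\s*:\s*(\d+)\s*\}',
        script_text)
    if match is None:
        return None
    return {'pairId': int(match.group(1)), 'smlId': int(match.group(2))}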
def collectHistoryDish(self, historyLIs, pair_id, ticker):
    for li in historyLIs:
        # '历史数据' is the "Historical Data" sub-tab label on the Chinese site.
        if li.find('a') is not None and li.find('a').text == '历史数据':
            history_href = li.find('a').get('href')
            historical_url = 'https://cn.investing.com' + history_href
            print(historical_url)
            json_o = self.findSmlId(historical_url)
            pairId = json_o['pairId']
            smlId = json_o['smlId']
            print(str(pairId) + " : " + str(smlId))
            service = ScrapyService()
            dish_price_list = service.getDishItemHistoryService(ticker, pairId, smlId)
            miracleService = MiracleService()
            miracleService.saveDishPriceHistory(dish_price_list)
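# For reference, a sketch of the POST that getDishItemHistoryService presumably
# issues under the hood. The form-field names here are assumptions based on how
# the HistoricalDataAjax endpoint has commonly been observed to work, not taken
# from ScrapyService itself.
import requests

def fetch_history_page(pair_id, sml_id, st_date, end_date):
    resp = requests.post(
        "https://cn.investing.com/instruments/HistoricalDataAjax",
        data={
            "curr_id": pair_id,
            "smlID": sml_id,
            "st_date": st_date,        # e.g. "2020/01/01"
            "end_date": end_date,
            "interval_sec": "Daily",
            "action": "historical_data",
        },
        headers={"X-Requested-With": "XMLHttpRequest",
                 "User-Agent": "Mozilla/5.0"})
    return resp.text  # an HTML fragment containing the #curr_table rows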
def __init__(self):
    self.workDirectory = '/Users/eyin/index_history'
    self.miracleService = MiracleService()
    self.scrapyService = ScrapyService()
    # Indices to scrape. curr_id/smlID are investing.com internal identifiers
    # needed by the HistoricalDataAjax endpoint; entries without them can only
    # be scraped from the rendered page.
    self.dishes = [
        {"ticker": "DJI", "curr_id": "169", "smlID": "2030170",
         "url": "https://www.investing.com/indices/us-30-historical-data"},
        {"ticker": "HSI", "curr_id": "179", "smlID": "2030179",
         "url": "https://www.investing.com/indices/hang-sen-40-historical-data"},
        {"ticker": "SPX", "curr_id": "166", "smlID": "2030167",
         "url": "https://www.investing.com/indices/us-spx-500-historical-data"},
        {"ticker": "IXIC", "curr_id": "14958", "smlID": "2035302",
         "url": "https://www.investing.com/indices/nasdaq-composite-historical-data"},
        {"ticker": "NDX", "curr_id": "20", "smlID": "2030165",
         "url": "https://www.investing.com/indices/nq-100-historical-data"},
        {"ticker": "SSE50", "curr_id": "995204", "smlID": "2144337",
         "url": "https://www.investing.com/indices/shanghai-se-50-historical-data"},
        {"ticker": "SSEC", "curr_id": "40820", "smlID": "2057370",
         "url": "https://www.investing.com/indices/shanghai-composite-historical-data"},
        {"ticker": "CSI300", "curr_id": "940801", "smlID": "2065987",
         "url": "https://www.investing.com/indices/csi300-historical-data"},
        {"ticker": "CNT", "curr_id": "945512", "smlID": "2073873",
         "url": "https://www.investing.com/indices/chinext-price-historical-data"},
        {"ticker": "NYSE", "url": "https://www.investing.com/indices/nyse-composite-historical-data"},
        {"ticker": "S&P", "url": "https://www.investing.com/indices/s-p-citic50-historical-data"},
        {"ticker": "NYA", "url": "https://www.investing.com/indices/nyse-composite-historical-data"},
        {"ticker": "SZI", "url": "https://www.investing.com/indices/szse-component-historical-data"},
    ]
    self.module = 'scrapy'
    self.logger = logHelper(self.module)
def collectDishes(self, index_id):
    # StocksFilter lists the constituents of one index; index_id="all"
    # requests every covered equity instead.
    baseUrl = ("https://cn.investing.com/equities/StocksFilter?noconstruct=1"
               "&smlID=800&sid=&tabletype=price&index_id=" + index_id)
    print("dishUrl= " + baseUrl)
    service = ScrapyService()
    response = service.callGetRequst(baseUrl, "")
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    # Each constituent row carries an id like "pair_12345".
    trs_element = soup.select('#cross_rate_markets_stocks_1')[0].select('tr[id*="pair_"]')
    for tr in trs_element:
        pair_id = tr.get('id')
        href = tr.select('a[href]')[0].get('href')
        title = tr.select('a[href]')[0].get('title')
        dish = self.collectDishBasic(pair_id, href, title)
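# Illustrative usage (a sketch; "all" mirrors the alternative noted above that
# requests every listed equity rather than one index's constituents):
#
#     scraper.collectDishes("all")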
class SectionMenuScrapy():

    def __init__(self):
        self.workDirectory = '/Users/eyin/index_history'
        self.miracleService = MiracleService()
        self.scrapyService = ScrapyService()
        # Indices to scrape. curr_id/smlID are investing.com internal
        # identifiers needed by the HistoricalDataAjax endpoint; entries
        # without them can only be scraped from the rendered page.
        self.dishes = [
            {"ticker": "DJI", "curr_id": "169", "smlID": "2030170",
             "url": "https://www.investing.com/indices/us-30-historical-data"},
            {"ticker": "HSI", "curr_id": "179", "smlID": "2030179",
             "url": "https://www.investing.com/indices/hang-sen-40-historical-data"},
            {"ticker": "SPX", "curr_id": "166", "smlID": "2030167",
             "url": "https://www.investing.com/indices/us-spx-500-historical-data"},
            {"ticker": "IXIC", "curr_id": "14958", "smlID": "2035302",
             "url": "https://www.investing.com/indices/nasdaq-composite-historical-data"},
            {"ticker": "NDX", "curr_id": "20", "smlID": "2030165",
             "url": "https://www.investing.com/indices/nq-100-historical-data"},
            {"ticker": "SSE50", "curr_id": "995204", "smlID": "2144337",
             "url": "https://www.investing.com/indices/shanghai-se-50-historical-data"},
            {"ticker": "SSEC", "curr_id": "40820", "smlID": "2057370",
             "url": "https://www.investing.com/indices/shanghai-composite-historical-data"},
            {"ticker": "CSI300", "curr_id": "940801", "smlID": "2065987",
             "url": "https://www.investing.com/indices/csi300-historical-data"},
            {"ticker": "CNT", "curr_id": "945512", "smlID": "2073873",
             "url": "https://www.investing.com/indices/chinext-price-historical-data"},
            {"ticker": "NYSE", "url": "https://www.investing.com/indices/nyse-composite-historical-data"},
            {"ticker": "S&P", "url": "https://www.investing.com/indices/s-p-citic50-historical-data"},
            {"ticker": "NYA", "url": "https://www.investing.com/indices/nyse-composite-historical-data"},
            {"ticker": "SZI", "url": "https://www.investing.com/indices/szse-component-historical-data"},
        ]
        self.module = 'scrapy'
        self.logger = logHelper(self.module)

    def loopAllHistoryFiles(self):
        # Import locally exported CSVs named "<TICKER>_<description>.csv".
        _, _, filenames = next(walk(self.workDirectory))
        for file in filenames:
            if not file.endswith("csv"):
                continue
            dish_prices = []
            indexModel = Index()
            ticker = file[:file.index('_')]
            description = file[file.index('_') + 1:file.index('.csv')]
            indexModel.setTicker(ticker)
            indexModel.setDescription(description)
            with open(self.workDirectory + '/' + file) as csv_file:
                csv_reader = csv.reader(csv_file, delimiter=',')
                line_count = 0
                for row in csv_reader:
                    if line_count != 0:  # skip the header row
                        dishModel = self.packRow2Model(row, ticker)
                        dish_prices.append(dishModel.toJson())
                    line_count = line_count + 1
            self.miracleService.saveIndexItem(indexModel)
            self.miracleService.saveIndexPriceHistory(dish_prices)

    def packRow2Model(self, row, ticker):
        dishModel = DishPrice()
        dishModel.setTicker(ticker)
        if len(row) == 7:
            # investing.com exports dates as "Jan 01, 2020"; normalize to MM/DD/YYYY.
            dishModel.setDate(datetime.strptime(row[0], '%b %d, %Y').strftime('%m/%d/%Y'))
            dishModel.setClose(row[1])
            dishModel.setOpen(row[2])
            dishModel.setHigh(row[3])
            dishModel.setLow(row[4])
            dishModel.setVolume(row[5])
            dishModel.setRange(row[6])
        self.logger.debug(dishModel.toJson())
        return dishModel

    def scrapyIndexHistory(self):
        for indexObject in self.dishes:
            self.logger.info(indexObject["url"])
            ticker = indexObject["ticker"]
            response = self.scrapyService.callGetRequst(indexObject["url"], "")
            soup = bs4.BeautifulSoup(response.text, "html.parser")
            # select() returns a list and is never None, so the original
            # `is not None` check was always true; test truthiness instead.
            if soup.select('#curr_table'):
                table_element = soup.select('#curr_table')[0]
                trs_element = table_element.select('tr')
                dist_list = []
                for tr in trs_element:
                    dishModel = self.packDishPriceFromTr(tr, ticker)
                    if dishModel is not None:
                        dist_list.append(dishModel.toJson())
                        self.logger.debug(dishModel.toJson())
                self.miracleService.saveTickerIndexPriceHistory(ticker, dist_list)

    def packDishPriceFromTr(self, tr, ticker):
        dishModel = DishPrice()
        dishModel.setTicker(ticker)
        tds = tr.select('td')
        if len(tds) == 7:
            dishModel.setDate(datetime.strptime(tds[0].text, '%b %d, %Y').strftime('%m/%d/%Y'))
            dishModel.setClose(tds[1].text)
            dishModel.setOpen(tds[2].text)
            dishModel.setHigh(tds[3].text)
            dishModel.setLow(tds[4].text)
            dishModel.setVolume(tds[5].text)
            dishModel.setRange(tds[6].text)
            return dishModel
        else:
            # Header rows (th cells) and malformed rows are skipped.
            return None

    def run(self, history):
        # history=True replays the local CSV exports; otherwise scrape the
        # live index pages.
        if history:
            self.loopAllHistoryFiles()
        else:
            self.scrapyIndexHistory()
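# Minimal entry point (an assumption, not part of the original file); the
# --history flag name is illustrative.
if __name__ == '__main__':
    import sys
    SectionMenuScrapy().run('--history' in sys.argv)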