import csv
import json
from datetime import datetime
from os import walk

import bs4

# Project-local imports (Dish, DishPrice, Index, ScrapyService, MiracleService,
# logHelper) are assumed to be provided elsewhere in the package.


def collectDishBasic(self, pair_id, href, title):
    # href example: "/equities/celesio-mu-historical-data"
    basic_url = "https://cn.investing.com" + href
    # basic_url = "https://cn.investing.com/equities/boeing-co"  # debug override; do not ship
    print("basic_url = " + basic_url)

    dish = Dish()
    dish.setCompanyName(title)
    dish.setCompany(href.replace("/equities/", ""))

    service = ScrapyService()
    basic_response = service.callGetRequst(basic_url, "")
    basic_soup = bs4.BeautifulSoup(basic_response.text, "html.parser")

    # The equity page embeds structured data as <script type="application/ld+json">;
    # the ticker symbol comes from there.
    scripts_elements = basic_soup.findAll("script", {"type": "application/ld+json"})
    if len(scripts_elements) == 0:
        return None
    script_json = json.loads(scripts_elements[0].string)
    dish.setTicker(script_json["tickersymbol"])

    # Collect the price history via the "历史数据" ("Historical Data") sub-tab.
    self.collectHistoryDish(
        basic_soup.select('#pairSublinksLevel2')[0].findAll("li"),
        pair_id, dish.ticker)

    # The overview table holds attribute/value pairs (market cap, P/E, ...).
    overview_element = basic_soup.select('.overviewDataTable')[0]
    basic_elements = overview_element.findAll("div", {"class": "inlineblock"})
    for basic_ele in basic_elements:
        basic_attr = basic_ele.select('.float_lang_base_1')[0].text
        basic_value = basic_ele.select('.float_lang_base_2')[0].text
        self.setDishBasicInfo(dish, basic_attr, basic_value)
    return dish
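# A small defensive helper (a sketch, not part of the original code) for the
# repeated `soup.select(selector)[0]` pattern used above and below: return the
# first match, or None instead of raising IndexError when investing.com
# changes its markup.
def select_first(soup, selector):
    matches = soup.select(selector)
    return matches[0] if matches else None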
def collectIndexHistory(self, obj):
    # obj is one entry of self.dishes: {"ticker", "curr_id", "smlID", "url", ...}.
    now = datetime.now()
    if not self.isHistory:
        # Incremental run: only fetch from the start of the current year.
        self.st_date = now.strftime("%Y/01/01")
    # The price history is served by POSTing to
    # https://cn.investing.com/instruments/HistoricalDataAjax, handled inside
    # ScrapyService.getDishItemHistoryService.
    service = ScrapyService()
    dish_price_list = service.getDishItemHistoryService(
        obj.get("ticker"), obj.get("curr_id"), obj.get("smlID"), self.st_date)

    miracleService = MiracleService()
    indexItem = Index()
    indexItem.setTicker(obj.get("ticker"))
    indexItem.setDescription(obj.get("description"))
    miracleService.saveIndexItem(indexItem)
    miracleService.saveIndexPriceHistory(dish_price_list)
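# Hypothetical driver (not in the original code): collectIndexHistory takes one
# self.dishes entry at a time, so a full run would look roughly like this.
# Entries lacking curr_id/smlID cannot use the Ajax endpoint and are skipped.
def collectAllIndexHistories(self):
    for obj in self.dishes:
        if "curr_id" in obj and "smlID" in obj:
            self.collectIndexHistory(obj)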
def findSmlId(self, historical_url):
    # The historical-data page embeds
    #   window.histDataExcessInfo = {pairId: ..., smlId: ...}
    # in an inline <script>; find it and coerce it into valid JSON.
    service = ScrapyService()
    historical_response = service.callGetRequst(historical_url, "")
    historical_soup = bs4.BeautifulSoup(historical_response.text, "html.parser")
    for script in historical_soup.find_all('script'):
        if 'window.histDataExcessInfo' in script.prettify():
            json_string = script.prettify().replace('<script>', '').replace('</script>', '')
            # Keep only the right-hand side of the assignment and quote the bare keys.
            json_string = json_string[json_string.find('=') + 1:].strip().replace(
                'pairId', '"pairId"').replace('smlId', '"smlId"')
            return json.loads(json_string)
    return None
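# A more defensive variant of the extraction above (a sketch, not a drop-in
# replacement): pull pairId/smlId out of the inline assignment with a regex,
# so the parse does not depend on string replaces or on the position of '='.
import re

def parse_hist_data_excess_info(script_text):
    # Matches e.g. "window.histDataExcessInfo = {pairId: 169, smlId: 2030170}".
    match = re.search(
        r'window\.histDataExcessInfo\s*=\s*\{\s*pairId\s*:\s*(\d+)\s*,'
        r'\s*smlId\s*:\s*(\d+)\s*\}',
        script_text)
    if match is None:
        return None
    return {'pairId': int(match.group(1)), 'smlId': int(match.group(2))}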
def collectHistoryDish(self, historyLIs, pair_id, ticker):
    for li in historyLIs:
        # '历史数据' is the "Historical Data" sub-tab label on the Chinese site.
        if li.find('a') is not None and li.find('a').text == '历史数据':
            history_href = li.find('a').get('href')
            historical_url = 'https://cn.investing.com' + history_href
            print(historical_url)
            json_o = self.findSmlId(historical_url)
            pairId = json_o['pairId']
            smlId = json_o['smlId']
            print(str(pairId) + " : " + str(smlId))
            service = ScrapyService()
            dish_price_list = service.getDishItemHistoryService(ticker, pairId, smlId)
            miracleService = MiracleService()
            miracleService.saveDishPriceHistory(dish_price_list)
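# For reference, a sketch of the POST that getDishItemHistoryService presumably
# issues under the hood. The form-field names here are assumptions based on how
# the HistoricalDataAjax endpoint has commonly been observed to work, not taken
# from ScrapyService itself.
import requests

def fetch_history_page(pair_id, sml_id, st_date, end_date):
    resp = requests.post(
        "https://cn.investing.com/instruments/HistoricalDataAjax",
        data={
            "curr_id": pair_id,
            "smlID": sml_id,
            "st_date": st_date,        # e.g. "2020/01/01"
            "end_date": end_date,
            "interval_sec": "Daily",
            "action": "historical_data",
        },
        headers={"X-Requested-With": "XMLHttpRequest",
                 "User-Agent": "Mozilla/5.0"})
    return resp.text  # an HTML fragment containing the #curr_table rows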
def __init__(self):
    self.workDirectory = '/Users/eyin/index_history'
    self.miracleService = MiracleService()
    self.scrapyService = ScrapyService()
    # Indices to scrape. curr_id/smlID are investing.com internal identifiers
    # needed by the HistoricalDataAjax endpoint; entries without them can only
    # be scraped from the rendered page.
    self.dishes = [
        {"ticker": "DJI", "curr_id": "169", "smlID": "2030170",
         "url": "https://www.investing.com/indices/us-30-historical-data"},
        {"ticker": "HSI", "curr_id": "179", "smlID": "2030179",
         "url": "https://www.investing.com/indices/hang-sen-40-historical-data"},
        {"ticker": "SPX", "curr_id": "166", "smlID": "2030167",
         "url": "https://www.investing.com/indices/us-spx-500-historical-data"},
        {"ticker": "IXIC", "curr_id": "14958", "smlID": "2035302",
         "url": "https://www.investing.com/indices/nasdaq-composite-historical-data"},
        {"ticker": "NDX", "curr_id": "20", "smlID": "2030165",
         "url": "https://www.investing.com/indices/nq-100-historical-data"},
        {"ticker": "SSE50", "curr_id": "995204", "smlID": "2144337",
         "url": "https://www.investing.com/indices/shanghai-se-50-historical-data"},
        {"ticker": "SSEC", "curr_id": "40820", "smlID": "2057370",
         "url": "https://www.investing.com/indices/shanghai-composite-historical-data"},
        {"ticker": "CSI300", "curr_id": "940801", "smlID": "2065987",
         "url": "https://www.investing.com/indices/csi300-historical-data"},
        {"ticker": "CNT", "curr_id": "945512", "smlID": "2073873",
         "url": "https://www.investing.com/indices/chinext-price-historical-data"},
        {"ticker": "NYSE", "url": "https://www.investing.com/indices/nyse-composite-historical-data"},
        {"ticker": "S&P", "url": "https://www.investing.com/indices/s-p-citic50-historical-data"},
        {"ticker": "NYA", "url": "https://www.investing.com/indices/nyse-composite-historical-data"},
        {"ticker": "SZI", "url": "https://www.investing.com/indices/szse-component-historical-data"},
    ]
    self.module = 'scrapy'
    self.logger = logHelper(self.module)
def collectDishes(self, index_id):
    # StocksFilter lists the constituents of one index; index_id="all"
    # requests every covered equity instead.
    baseUrl = ("https://cn.investing.com/equities/StocksFilter?noconstruct=1"
               "&smlID=800&sid=&tabletype=price&index_id=" + index_id)
    print("dishUrl= " + baseUrl)
    service = ScrapyService()
    response = service.callGetRequst(baseUrl, "")
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    # Each constituent row carries an id like "pair_12345".
    trs_element = soup.select('#cross_rate_markets_stocks_1')[0].select('tr[id*="pair_"]')
    for tr in trs_element:
        pair_id = tr.get('id')
        href = tr.select('a[href]')[0].get('href')
        title = tr.select('a[href]')[0].get('title')
        dish = self.collectDishBasic(pair_id, href, title)
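# Illustrative usage (a sketch; "all" mirrors the alternative noted above that
# requests every listed equity rather than one index's constituents):
#
#     scraper.collectDishes("all")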
class SectionMenuScrapy():

    def __init__(self):
        self.workDirectory = '/Users/eyin/index_history'
        self.miracleService = MiracleService()
        self.scrapyService = ScrapyService()
        # Indices to scrape. curr_id/smlID are investing.com internal
        # identifiers needed by the HistoricalDataAjax endpoint; entries
        # without them can only be scraped from the rendered page.
        self.dishes = [
            {"ticker": "DJI", "curr_id": "169", "smlID": "2030170",
             "url": "https://www.investing.com/indices/us-30-historical-data"},
            {"ticker": "HSI", "curr_id": "179", "smlID": "2030179",
             "url": "https://www.investing.com/indices/hang-sen-40-historical-data"},
            {"ticker": "SPX", "curr_id": "166", "smlID": "2030167",
             "url": "https://www.investing.com/indices/us-spx-500-historical-data"},
            {"ticker": "IXIC", "curr_id": "14958", "smlID": "2035302",
             "url": "https://www.investing.com/indices/nasdaq-composite-historical-data"},
            {"ticker": "NDX", "curr_id": "20", "smlID": "2030165",
             "url": "https://www.investing.com/indices/nq-100-historical-data"},
            {"ticker": "SSE50", "curr_id": "995204", "smlID": "2144337",
             "url": "https://www.investing.com/indices/shanghai-se-50-historical-data"},
            {"ticker": "SSEC", "curr_id": "40820", "smlID": "2057370",
             "url": "https://www.investing.com/indices/shanghai-composite-historical-data"},
            {"ticker": "CSI300", "curr_id": "940801", "smlID": "2065987",
             "url": "https://www.investing.com/indices/csi300-historical-data"},
            {"ticker": "CNT", "curr_id": "945512", "smlID": "2073873",
             "url": "https://www.investing.com/indices/chinext-price-historical-data"},
            {"ticker": "NYSE", "url": "https://www.investing.com/indices/nyse-composite-historical-data"},
            {"ticker": "S&P", "url": "https://www.investing.com/indices/s-p-citic50-historical-data"},
            {"ticker": "NYA", "url": "https://www.investing.com/indices/nyse-composite-historical-data"},
            {"ticker": "SZI", "url": "https://www.investing.com/indices/szse-component-historical-data"},
        ]
        self.module = 'scrapy'
        self.logger = logHelper(self.module)

    def loopAllHistoryFiles(self):
        # Import locally exported CSVs named "<TICKER>_<description>.csv".
        _, _, filenames = next(walk(self.workDirectory))
        for file in filenames:
            if not file.endswith("csv"):
                continue
            dish_prices = []
            indexModel = Index()
            ticker = file[:file.index('_')]
            description = file[file.index('_') + 1:file.index('.csv')]
            indexModel.setTicker(ticker)
            indexModel.setDescription(description)
            with open(self.workDirectory + '/' + file) as csv_file:
                csv_reader = csv.reader(csv_file, delimiter=',')
                line_count = 0
                for row in csv_reader:
                    if line_count != 0:  # skip the header row
                        dishModel = self.packRow2Model(row, ticker)
                        dish_prices.append(dishModel.toJson())
                    line_count = line_count + 1
            self.miracleService.saveIndexItem(indexModel)
            self.miracleService.saveIndexPriceHistory(dish_prices)

    def packRow2Model(self, row, ticker):
        dishModel = DishPrice()
        dishModel.setTicker(ticker)
        if len(row) == 7:
            # investing.com exports dates as "Jan 01, 2020"; normalize to MM/DD/YYYY.
            dishModel.setDate(datetime.strptime(row[0], '%b %d, %Y').strftime('%m/%d/%Y'))
            dishModel.setClose(row[1])
            dishModel.setOpen(row[2])
            dishModel.setHigh(row[3])
            dishModel.setLow(row[4])
            dishModel.setVolume(row[5])
            dishModel.setRange(row[6])
        self.logger.debug(dishModel.toJson())
        return dishModel

    def scrapyIndexHistory(self):
        for indexObject in self.dishes:
            self.logger.info(indexObject["url"])
            ticker = indexObject["ticker"]
            response = self.scrapyService.callGetRequst(indexObject["url"], "")
            soup = bs4.BeautifulSoup(response.text, "html.parser")
            # select() returns a list and is never None, so the original
            # `is not None` check was always true; test truthiness instead.
            if soup.select('#curr_table'):
                table_element = soup.select('#curr_table')[0]
                trs_element = table_element.select('tr')
                dist_list = []
                for tr in trs_element:
                    dishModel = self.packDishPriceFromTr(tr, ticker)
                    if dishModel is not None:
                        dist_list.append(dishModel.toJson())
                        self.logger.debug(dishModel.toJson())
                self.miracleService.saveTickerIndexPriceHistory(ticker, dist_list)

    def packDishPriceFromTr(self, tr, ticker):
        dishModel = DishPrice()
        dishModel.setTicker(ticker)
        tds = tr.select('td')
        if len(tds) == 7:
            dishModel.setDate(datetime.strptime(tds[0].text, '%b %d, %Y').strftime('%m/%d/%Y'))
            dishModel.setClose(tds[1].text)
            dishModel.setOpen(tds[2].text)
            dishModel.setHigh(tds[3].text)
            dishModel.setLow(tds[4].text)
            dishModel.setVolume(tds[5].text)
            dishModel.setRange(tds[6].text)
            return dishModel
        else:
            # Header rows (th cells) and malformed rows are skipped.
            return None

    def run(self, history):
        # history=True replays the local CSV exports; otherwise scrape the
        # live index pages.
        if history:
            self.loopAllHistoryFiles()
        else:
            self.scrapyIndexHistory()
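# Minimal entry point (an assumption, not part of the original file); the
# --history flag name is illustrative.
if __name__ == '__main__':
    import sys
    SectionMenuScrapy().run('--history' in sys.argv)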