def parse(self, response: scrapy.http.response.Response):
    # The financials table: the first row holds the period dates, the first
    # column of every other row holds the row label.
    base_css = "[data-test=qsp-financial] tbody "
    datetimes = response.css(
        base_css + 'tr:first-child td:not(:first-child) span::text').extract()
    labels = response.css(
        base_css + 'tr:not(:first-child) td:first-child:not([colspan]) span::text'
    ).extract()
    values = response.css(
        base_css + 'tr:not(:first-child) td:not(:first-child) ::text').extract()

    if not datetimes or not labels:
        return

    # Replace slashes in the period dates so they are safe as CSV column headers.
    datetimes = [dt.replace('/', '-') for dt in datetimes]
    symbol = response.request.url.split('=')[1]
    target_file = os.path.join(YahooFinanceSpider.target_dir, symbol + '.csv')

    # Values come row by row: every len(datetimes) cells belong to one label.
    datetimes_len = len(datetimes)
    current_label_index = -1
    current_datetime_index = -1
    df = pd.DataFrame(index=labels, columns=datetimes)
    for i in range(len(values)):
        current_datetime_index += 1
        if i % datetimes_len == 0:
            current_label_index += 1
            current_datetime_index = 0
        # Strip placeholder dashes and thousands separators.
        val = str(values[i]).replace('-', '').replace(',', '')
        if val != '':
            val = int(float(val) * 1000)  # TODO check if all numbers are in thousands
        # Assign via .loc with row and column to avoid chained assignment.
        df.loc[labels[current_label_index],
               datetimes[current_datetime_index]] = val

    mode = 'w'
    header = True
    if os.path.isfile(target_file):
        mode = 'a'
        header = False
    if df.shape[0] != 0 and df.shape[1] != 0:
        with open(target_file, mode) as f:
            df.to_csv(f, header=header)
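# For quick inspection, the per-symbol CSV written by parse() above can be read
# back with pandas. This helper is only a usage sketch, not part of the spider;
# it assumes the same target_dir the spider writes to and the pandas/os imports
# used above.
def load_symbol_csv(target_dir: str, symbol: str):
    # e.g. load_symbol_csv(YahooFinanceSpider.target_dir, 'AAPL')
    return pd.read_csv(os.path.join(target_dir, symbol + '.csv'), index_col=0)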
def parse(self, response: scrapy.http.response.Response):
    print('gathering links', response.url)
    self.company_links[response.url] = response.css(
        '.qTableFull tr td:first-child a::attr(href)').extract()

    # Continue only once company links from every start URL have been gathered.
    for start_url in self.start_urls:
        if start_url not in self.company_links:
            print('Not all company links gathered yet', response.url)
            return

    print('All links gathered. Proceeding.')

    # Arrange the links in the same order as start_urls.
    company_links = []
    for start_url in self.start_urls:
        company_links += self.company_links[start_url]

    # Schedule one request per company; giving earlier links higher priority
    # keeps the crawl roughly in the original order.
    links_len = len(company_links)
    for i, link in enumerate(company_links):
        yield scrapy.Request(self.url_base + link + self.suffix,
                             self.parse_company_page,
                             priority=links_len - i)
    print('Scheduled all requests. Total', links_len)
def parse(self, response: scrapy.http.response.Response):
    # Neighborhood keywords (Wangjing-area names) to look for in listing titles.
    key_words = ['望京', '望馨花园', '望馨园', '东湖渠']
    send = SendEmail()

    # Load links that were already emailed so the same listing is not sent twice.
    # Newlines are stripped so the membership check below matches exact links.
    with open('history.txt') as f:
        history = [line.strip() for line in f if line.strip()]
    if not history:
        self.log('History is empty', level=logging.WARNING)

    for row in response.css('td.title'):
        title = row.css('a::text').extract_first().strip()
        link = row.css('a::attr(href)').extract_first()
        self.log('Listing title: {0}'.format(title), level=logging.WARNING)
        self.log('Listing link: {0}'.format(link), level=logging.WARNING)
        email_message = 'Listing title: {0}\nListing link: {1}'.format(title, link)
        for keyword in key_words:
            if keyword in title and link not in history:
                # QQ Mail limits how often messages can be sent, so until a
                # better approach is found, just sleep between sends.
                time.sleep(10)
                send.send_email('', email_message)
                history.append(link)

    with open('history.txt', 'w') as f:
        f.writelines(line + '\n' for line in history)
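# The SendEmail helper used above is defined elsewhere in the project. Below is
# only a minimal sketch of what it might look like, assuming it wraps smtplib
# over QQ Mail's SMTP server and that send_email(subject, body) sends a
# plain-text message; the account, password, and recipient are placeholders.
import smtplib
from email.mime.text import MIMEText


class SendEmail:
    def __init__(self, host='smtp.qq.com', user='me@example.com',
                 password='app-password', to='me@example.com'):
        self.host = host
        self.user = user
        self.password = password
        self.to = to

    def send_email(self, subject, body):
        msg = MIMEText(body, 'plain', 'utf-8')
        msg['Subject'] = subject
        msg['From'] = self.user
        msg['To'] = self.to
        with smtplib.SMTP_SSL(self.host, 465) as server:
            server.login(self.user, self.password)
            server.sendmail(self.user, [self.to], msg.as_string())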
def parse_ticker_page(self, response: scrapy.http.response.Response):
    self.parse_price_page(response)
    # Follow the "next page" link, if any, to walk through the remaining price pages.
    next_page_href = response.css('.pages_right::attr(href)').extract()
    if len(next_page_href) > 0:
        time.sleep(1)  # crude throttle between paginated requests
        return scrapy.Request(self.url_base + next_page_href[0],
                              self.parse_ticker_page)
def parse_subpage(self, response: scrapy.http.response.Response):
    # The filing index links to report pages named R1.htm, R2.htm, ...; only
    # those are fetched.
    pattern = re.compile(r"^R\d+\.htm$")
    links = response.css("#main-content a::attr(href)").extract()
    for link in links:
        filename = link.rsplit('/', 1)[-1]
        if pattern.match(filename):
            yield scrapy.Request(self.url_base + link, self.get_data)
def get_data(self, response: scrapy.http.response.Response):
    document_type = response.css('th.tl strong').extract_first()
    period_label = response.css('th.th::text').extract_first()
    dt = response.css('th.th div::text').extract_first()
    if period_label is None or document_type is None or dt is None:
        return

    document_type = document_type.lower()
    period_label = period_label.lower()

    # Only annual statements from the three core financial documents are kept.
    period_labels = ['12 months ended']
    document_types = {
        'income_statement': 'consolidated statements of income',
        'balance_sheet': 'consolidated balance sheets',
        'cash_flow': 'consolidated statements of cash flows'
    }
    is_period_important = any(p in period_label for p in period_labels)
    is_document_important = any(d in document_type for d in document_types.values())
    if not (is_period_important and is_document_important):
        return

    # The document heading states the unit of the reported figures.
    if "thousand" in document_type:
        multiplier = 1000
    elif "million" in document_type:
        multiplier = 1000000
    elif "billion" in document_type:
        multiplier = 1000000000
    else:
        raise RuntimeError('No multiplier defined in ' + response.url +
                           '. Document heading: ' + document_type)

    year = dt[-4:]
    cik = response.url.rsplit('/')[-3]
    fin_dict = {'cik': cik}

    for record in response.css('tr'):
        record_title = record.css('td.pl a::text').extract_first()
        if not record_title:
            continue
        record_title = record_title.replace(',', '')
        value = record.css('td.nump::text').extract_first()
        if not value:
            continue
        # Pull out the numeric part, drop thousands separators, and scale by the
        # unit stated in the heading.
        digit_val = re.findall(r'[\d,]+', value)[0]
        if digit_val:
            digit_val = float(digit_val.replace(',', '')) * multiplier
            fin_dict[record_title] = str(digit_val)

    file_path = os.path.join(self.output_dir, year + '.csv')
    mode = 'a' if os.path.isfile(file_path) else 'w'
    with open(file_path, mode) as f:
        print('Saving output to ' + file_path)
        # FIXME sort before saving
        w = csv.DictWriter(f, fin_dict.keys())
        # if mode == 'w':
        w.writeheader()
        w.writerow(fin_dict)
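# One possible way to address the FIXME above: write each row with a stable,
# sorted column order so values line up better when rows are appended across
# filings. This is only a sketch under the same fin_dict/file layout as
# get_data(); it is not part of the original spider.
def write_sorted_row(file_path: str, fin_dict: dict) -> None:
    fields = sorted(fin_dict.keys())
    write_header = not os.path.isfile(file_path)
    with open(file_path, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        if write_header:
            writer.writeheader()
        writer.writerow(fin_dict)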
def parse(self, response: scrapy.http.response.Response): links = response.css("#main-content a::attr(href)").extract() for link in links: yield scrapy.Request(self.url_base + link, self.parse_subpage)
def parse(self, response: scrapy.http.response.Response): links = response.css( ".list a[href^=\/files\/dera\/data\/financial-statement-data-sets\/]::attr(href)" ).extract() for link in links: yield scrapy.Request(self.url_base + link, self.get_data)