    def parse(self, response: scrapy.http.response.Response):

        base_css = "[data-test=qsp-financial] tbody "
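        # The financials table on the quote page sits under
        # [data-test=qsp-financial]: the first row holds the report dates, the
        # first cell of every later row holds the line-item label, and the
        # remaining cells hold the values.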

        datetimes = response.css(
            base_css +
            'tr:first-child td:not(:first-child) span::text').extract()
        labels = response.css(
            base_css +
            'tr:not(:first-child) td:first-child:not([colspan]) span::text'
        ).extract()
        values = response.css(
            base_css +
            'tr:not(:first-child) td:not(:first-child) ::text').extract()

        datetimes = list(map(lambda x: x.replace('/', '-'), datetimes))

        symbol = response.request.url.split('=')[1]
        target_file = os.path.join(YahooFinanceSpider.target_dir,
                                   symbol + '.csv')
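        # One CSV per ticker; the symbol is taken from the URL query string.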

        df = pd.DataFrame(index=labels, columns=datetimes)

        datetimes_len = len(datetimes)
        for i, raw_value in enumerate(values):
            # Cells are emitted row-major: one line item per table row,
            # one column per report date.
            label = labels[i // datetimes_len]
            dt = datetimes[i % datetimes_len]

            # Remove '-' (empty-cell placeholder) and ',' thousands separators.
            val = raw_value.replace('-', '').replace(',', '')
            if val != '':
                val = int(float(val) *
                          1000)  #TODO check if all numbers are in thousands

            # Assign via a single .loc call; the previous chained indexing
            # required silencing pandas' SettingWithCopyWarning.
            df.loc[label, dt] = val

        mode = 'w'
        header = True
        if os.path.isfile(target_file):
            mode = 'a'
            header = False

        if df.shape[0] != 0 and df.shape[1] != 0:
            with open(target_file, mode) as f:
                df.to_csv(f, header=header)
Example #2
    def parse(self, response: scrapy.http.response.Response):
        print(response.url)

        print('gathering links', response.url)
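        # Remember which start URL produced which company links so we can tell
        # below when every index page has been parsed.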
        self.company_links[response.url] = response.css(
            '.qTableFull tr td:first-child a::attr(href)').extract()

        # Continue only when all company_links are gathered
        can_continue = True
        for start_url in self.start_urls:
            if start_url not in self.company_links:
                print('Not all company links yet gathered', response.url)
                can_continue = False
                break

        if can_continue:

            print('All links gathered. Proceeding.')

            company_links = []
            # Organize links in correct order (same as start_urls)
            for start_url in self.start_urls:
                company_links += self.company_links[start_url]

            links_len = len(company_links)
            for i, link in enumerate(company_links):
                # print(self.url_base + link + self.suffix)
                yield scrapy.Request(self.url_base + link + self.suffix,
                                     self.parse_company_page,
                                     priority=links_len - i)
            print('Scheduled all requests. Total', links_len)
Example #3
    def parse(self, response: scrapy.http.response.Response):

        key_words = ['望京', '望馨花园', '望馨园', '东湖渠']

        send = SendEmail()

        history = []

        # Previously notified links, one per line.  Strip the trailing
        # newlines so the membership test against `link` below can match.
        with open('history.txt') as f:
            tmp = [line.strip() for line in f.readlines()]
            if len(tmp):
                history.extend(tmp)
            else:
                self.log('History file is empty', level=logging.WARNING)

        page = response.css('td.title')
        for i in page:
            title = i.css('a::text').extract_first().strip()
            link = i.css('a::attr(href)').extract_first()
            self.log('Rental listing title: {0}'.format(title), level=logging.WARNING)
            self.log('Rental listing link: {0}'.format(link), level=logging.WARNING)
            email_message = 'Rental listing title: {0}\nRental listing link: {1}'.format(
                title, link)
            for j in key_words:
                if j in title and link not in history:
                    # QQ Mail limits how often messages can be sent, so until a
                    # better approach is found, simply sleep between sends.
                    time.sleep(10)
                    send.send_email('', email_message)
                    history.append(link)
                    with open('history.txt', 'w') as f:
                        f.write('\n'.join(history) + '\n')
Example #4
    def parse_ticker_page(self, response: scrapy.http.response.Response):

        self.parse_price_page(response)
        next_page_href = response.css('.pages_right::attr(href)').extract()

        if len(next_page_href) > 0:
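            # Crude throttle; note that time.sleep blocks Scrapy's reactor,
            # and the DOWNLOAD_DELAY setting is the usual way to space requests.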
            time.sleep(1)
            return scrapy.Request(self.url_base + next_page_href[0],
                                  self.parse_ticker_page)
Example #5
    def parse_subpage(self, response: scrapy.http.response.Response):
        links = response.css("#main-content a::attr(href)").extract()

        # Report pages are named R1.htm, R2.htm, ...; compile the pattern once
        # outside the loop and use a raw string for the regex.
        pattern = re.compile(r"^R\d+\.htm$")

        for link in links:
            filename = link.rsplit('/', 1)[-1]
            if pattern.match(filename):
                yield scrapy.Request(self.url_base + link, self.get_data)
Example #6
    def get_data(self, response: scrapy.http.response.Response):

        document_type = response.css('th.tl strong').extract_first()
        period_label = response.css('th.th::text').extract_first()
        dt = response.css('th.th div::text').extract_first()

        if period_label is None or document_type is None or dt is None:
            # print(period_label)
            # print(document_type)
            # print(dt)
            return

        document_type = document_type.lower()
        period_label = period_label.lower()

        period_labels = ['12 months ended']
        document_types = {
            'income_statement': 'consolidated statements of income',
            'balance_sheet': 'consolidated balance sheets',
            'cash_flow': 'consolidated statements of cash flows'
        }
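        # Only annual ("12 months ended") versions of these three statements
        # are processed; any other document in the filing is skipped.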

        is_period_important = False
        is_document_important = False

        for p_label in period_labels:
            if p_label in period_label:
                is_period_important = True
                break

        for slug, d_type in document_types.items():
            if d_type in document_type:
                is_document_important = True
                break

        if is_period_important and is_document_important:
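            # The statement heading names the reporting unit (thousands,
            # millions, or billions); every extracted figure is scaled by it.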
            if "thousand" in document_type:
                multiplier = 1000
            elif "million" in document_type:
                multiplier = 1000000
            elif "billion" in document_type:
                multiplier = 1000000000
            else:
                raise RuntimeError('No multiplier defined in ' + response.url +
                                   '. Document heading: ' + document_type)

            year = dt[-4:]
            cik = response.url.rsplit('/')[-3]

            fin_dict = {'cik': cik}

            records = response.css('tr')
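            # Each table row pairs a line-item label (td.pl) with its numeric
            # value (td.nump).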
            for record in records:
                record_title = record.css('td.pl a::text').extract_first()
                if record_title:
                    record_title = record_title.replace(',', '')
                    value = record.css('td.nump::text').extract_first()
                    # print(record_title, value)
                    if value:
                        # Keep only the leading run of digits and commas,
                        # e.g. "1,234" out of "1,234)".
                        digit_val = re.findall(r'[\d,]+', value)[0]
                        if digit_val:
                            # Commas are thousands separators; drop them before
                            # scaling by the statement's reporting multiplier.
                            digit_val = float(digit_val.replace(',', '')) * multiplier
                            fin_dict[record_title] = str(digit_val)

            file_path = os.path.join(self.output_dir, year + '.csv')
            mode = 'w'
            if os.path.isfile(file_path):
                mode = 'a'
            with open(file_path, mode) as f:
                print('Saving output to ' + file_path)
                #FIXME sort before saving
                w = csv.DictWriter(f, fin_dict.keys())
                # if mode == 'w':
                w.writeheader()
                w.writerow(fin_dict)
Example #7
    def parse(self, response: scrapy.http.response.Response):

        links = response.css("#main-content a::attr(href)").extract()
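        # Follow every link under #main-content; parse_subpage filters the
        # results down to the R<N>.htm report pages.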
        for link in links:
            yield scrapy.Request(self.url_base + link, self.parse_subpage)
Example #8
    def parse(self, response: scrapy.http.response.Response):
        # Quote the attribute value: slashes are not valid in an unquoted
        # CSS attribute selector.
        links = response.css(
            '.list a[href^="/files/dera/data/financial-statement-data-sets/"]'
            '::attr(href)').extract()
        for link in links:
            yield scrapy.Request(self.url_base + link, self.get_data)