예제 #1
0
    def get_company_data(self, parser, ticker, type, t):
        """Scrape all pages of financial reports for one company.

        Pages are fetched one at a time with ``self.scrape_company_data`` and
        fed to ``parser``. After each feed the module-level ``htmlData`` token
        list is consumed: first the report dates (cells delimited by ``'th'``
        markers), then one table row per financial field (cells delimited by
        ``'td'`` markers). A ``FinancialReport`` is appended to the
        module-level ``reports`` list for every date found, and each field
        value that ``Helper.isReal`` accepts is stored on the matching report
        with all whitespace removed.

        Scraping stops when a page yields fewer than 50 tokens or contains no
        report header at all.

        Args:
            parser: Object exposing ``feed(html)``; presumably its callbacks
                fill the module-level ``htmlData`` list (not visible here).
            ticker: Company identifier, forwarded to
                ``self.scrape_company_data`` and printed in progress messages.
            type: Report type, forwarded to ``self.scrape_company_data``.
                The name shadows the builtin but is kept so existing keyword
                callers continue to work.
            t: Forwarded unchanged to ``self.scrape_company_data``.

        Returns:
            None. Results are appended to the module-level ``reports`` list.

        Raises:
            ValueError: If an expected ``'th'``/``'td'`` marker is missing
                from the token stream.
            IndexError: If a marker is the last token, so no value follows it.
        """
        # Rebound below for every page, so the name has to be declared global.
        global htmlData

        # Table rows in the order they are consumed from the token stream;
        # each name is also the FinancialReport attribute that gets assigned.
        report_fields = (
            'revenue_from_sale_of_merchandise_and_raw_materials',
            'profit_from_operating_activities',
            'gross_profit',
            'net_profit',
            'net_flow',
            'net_operating_flow',
            'net_investments_flow',
            'net_financial_flow',
            'assets',
            'liabilities_and_provision_for_liabilities',
            'non_current_liabilities',
            'current_liabilities',
            'owners_equity',
            'share_capital',
            'number_of_shares',
            'book_worth_per_share',
            'profit_per_share',
            'diluted_number_of_shares',
            'diluted_book_worth_per_share',
            'diluted_profit_per_share',
            'dividend_per_share',
        )

        # At most this many report columns are read from a page, and the
        # offset passed to the scraper advances by the same amount.
        reports_per_page = 4

        counter = 0

        print ("Started scraping for ", ticker, " ", type)

        while True:
            htmlData = []

            # parser should be a composition in this class, doesn't work though
            parser.feed(self.scrape_company_data(ticker, type, t, counter))

            # A page with fewer than 50 tokens is treated as the end of data.
            if len(htmlData) < 50:
                break

            # Find report dates and create one report object per header cell.
            # References to the new objects are kept so the field rows below
            # fill exactly these reports. Counting them via the list (rather
            # than the loop index) also keeps a 3-report page distinct from a
            # full 4-report page.
            page_reports = []
            for _ in range(reports_per_page):
                if 'th' not in htmlData:
                    break
                index = htmlData.index('th')
                htmlData.pop(index)
                report = FinancialReport()
                reports.append(report)
                page_reports.append(report)
                # With the opening marker removed, the date sits at its index.
                report.date = htmlData[index]
                # Drop the marker that follows the date cell.
                index = htmlData.index('th')
                htmlData.pop(index)

            # Nothing to parse on this page: stop instead of requesting
            # further pages indefinitely.
            if not page_reports:
                break

            # Every field occupies one table row holding one cell per report.
            for field in report_fields:
                for report in page_reports:
                    index = htmlData.index('td')
                    htmlData.pop(index)
                    cell = htmlData[index]
                    if Helper.isReal(cell):
                        # Remove all whitespace (e.g. thousands separators).
                        setattr(report, field, ''.join(cell.split()))
                    # Drop the marker that follows the cell value.
                    index = htmlData.index('td')
                    htmlData.pop(index)

            counter += reports_per_page

        print ("Finished scraping successfully for ", ticker, " ", type)