Python clean_name示例

编程语言: Python

命名空间/包名称: util.cleaner

方法/功能: clean_name

hotexamples.com的示例: 2

Python clean_name - 共找到 2 个示例。以下是从开源项目中提取的、评分最高的 util.cleaner.clean_name 真实 Python 示例。您可以为示例评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： parser.py 项目： kudva/Secalpha

    def get_10k_docs(self, urls, number):
        """
        Fetch 10-K filing index pages and store every company not yet in the backend.

        Steps, for each entry in ``urls``:
        Get the 10K index document for the URL from the index file
        Find the company name and addresses in the 10k filing
        Look for an exhibit 21 document within the 10k
        If the exhibit 21 exists, get that, then pull the subsidiaries out and add them
            to the newly created company

        urls   -- iterable of index-file paths; the last four characters of each
                  path are dropped before '-index.htm' is appended (presumably a
                  '.txt' style suffix -- confirm against the caller)
        number -- the loop stops once the count of handled URLs exceeds this
                  value, so up to ``number + 1`` URLs are handled

        Returns None; each new company is handed to ``self.backend.add``.
        """

        def parse_index_page(contents):
            # Parse one filing index page and return (soup, raw company name).
            # Raises IndexError when the page holds no 'companyName' span.
            soup = BeautifulSoup(contents, convertEntities=BeautifulSoup.HTML_ENTITIES)
            name_spans = soup.findAll('span', attrs={'class' : 'companyName'})
            # Shortest run of text between a '>' and the following '<' in the first span
            name = re.search('(?<=>).*?(?=<)', str(name_spans[0]), re.DOTALL).group(0)
            return soup, name

        for count_num, url in enumerate(urls, 1):
            company = Company()
            full = BASE_URL + '/Archives/%s-index.htm' % url[:len(url) - 4]
            company.url = full
            contents = self.downloader.get_url(full)

            try:
                soup, company_name = parse_index_page(contents)
            except IndexError:
                # The page we got had no company-name span: drop it through the
                # downloader and fetch it once more. A second failure propagates.
                self.downloader.purge(full)
                contents = self.downloader.get_url(full)
                soup, company_name = parse_index_page(contents)

            company.name = clean_name(company_name)

            existing = self.backend.get_company(company.name)
            if not existing:
                for address in soup.findAll('div', attrs={'class' : 'mailer'}):
                    if "business address" in address.text.lower():
                        company.business_address = clean_addr(self.extract_address(address))
                    if "mailing address" in address.text.lower():
                        company.mailing_address = clean_addr(self.extract_address(address))

                for row in soup.findAll('tr'):
                    # Only rows whose text matches EXHIBIT21_WORDS are candidates
                    # for the exhibit 21 (subsidiaries) document.
                    if not re.search(EXHIBIT21_WORDS, row.text, re.IGNORECASE):
                        continue
                    link = row.find('a')
                    if link is None:
                        # Matching row without a hyperlink: nothing to download.
                        continue
                    ex21_url = link.get('href')
                    if ex21_url is None:
                        # Anchor without an href: BASE_URL + None would raise TypeError.
                        continue
                    try:
                        company.subsidiaries = self.get_exhibit21(BASE_URL + ex21_url)
                    except TypeError:
                        # Best effort: a TypeError raised while loading the
                        # exhibit leaves the subsidiaries untouched for this row.
                        pass

                self.backend.add(company)
            if count_num > number:
                return

示例#2

显示文件

文件： parser.py 项目： rozap/corpcrawl-dead

    def get_subsidiary(self, lot):
        """
        Build a Company describing one subsidiary from a group of snippets.

        lot -- iterable of objects exposing a ``.text`` attribute

        Location: set from every snippet text for which ``self.get_state``
        returns a truthy value, so the last such snippet wins. If the location
        is still falsy afterwards, the same is done with ``self.get_country``.
        Name: the cleaned text of the first snippet accepted by
        ``self.is_company``; not assigned when no snippet qualifies.

        Returns the Company instance.
        """
        subsidiary = Company()
        texts = [snippet.text for snippet in lot]
        # Build a real list rather than relying on filter(): it is truth-tested
        # and indexed below, which a lazy filter object does not support.
        company_texts = [text for text in texts if self.is_company(text)]

        for text in texts:
            state = self.get_state(text)
            if state:
                subsidiary.location = first_letter_caps(state)

        if not subsidiary.location:
            # No state found: fall back to a country lookup.
            for text in texts:
                country = self.get_country(text)
                if country:
                    subsidiary.location = first_letter_caps(country)

        if company_texts:
            subsidiary.name = clean_name(company_texts[0])
        return subsidiary