示例#1
0
    def _parse_registry_block(self, registry_txt):
        """Parse one registry entry into a ``reg.Business``.

        The first line is tried against ``self.name_pattern_1`` and then
        ``self.name_pattern_2``; the matched text becomes the name, or the whole
        first line when neither pattern matches.  Address, bracket and SIC
        fields are read from the entry with its newlines removed, and the city
        is copied from ``self.current_city``.
        """
        business = reg.Business()

        first_line = registry_txt.split('\n')[0]
        flat_txt = registry_txt.replace('\n', '')

        # Name: first pattern wins, second pattern is the fallback.
        name_match = (re.match(self.name_pattern_1, first_line)
                      or re.match(self.name_pattern_2, first_line))
        if name_match:
            business.name = name_match.group(0)
            # Strip the name so it cannot interfere with the later searches.
            flat_txt = flat_txt.replace(business.name, '')
        else:
            # No pattern matched: fall back to the entire first line.
            business.name = first_line

        # Address and bracket come from the same match.
        address_match = re.search(self.address_pattern, flat_txt)
        if address_match:
            business.address, business.bracket = address_match.group(1, 2)
            flat_txt = flat_txt.replace(address_match.group(0), '')

        # Each SIC hit is a (description, number) pair.
        for description, number in self.sic_pattern.findall(flat_txt):
            business.category.append(number)
            business.cat_desc.append(description)

        # Entries inherit the city currently being processed.
        business.city = self.current_city

        return business
示例#2
0
    def _parse_registry_block(self, registry_txt):
        """Parse a registry entry (works for registries from 1975 onward).

        Args:
            registry_txt: Raw text of one entry.  The first line is used as the
                business name and the second line, when present, as the address.

        Returns:
            A ``reg.Business`` with ``name`` set, ``address`` set when a second
            line exists, and ``city`` / ``emp`` set when the corresponding
            patterns match.
        """
        business = reg.Business()

        lines = registry_txt.split("\n")

        business.name = lines[0]
        # A single-line block has no address line; leave the address untouched
        # rather than raising IndexError.
        if len(lines) > 1:
            business.address = lines[1]

        city_match = self.city_pattern.search(registry_txt)
        if city_match:
            city = city_match.group(0)
            # Perform spell check and confirm this is a city.
            matched_city = self._city_detector.match_to_cities(city)
            if matched_city:
                if matched_city != city:
                    print("Imperfect city match: %s matched to %s" % (city, matched_city))
                business.city = matched_city

        emp_match = self.emp_pattern.search(registry_txt)
        if emp_match:
            # Keep only the first run of digits from the matched text.
            digits = re.search(r"\d+", emp_match.group(0))
            if digits:
                business.emp = digits.group(0)

        return business
示例#3
0
    def _process_contour(self, contour_txt, contour_font_attrs, header_str):
        """Handle one contour: either a registry entry or a city header.

        Text containing at least one newline is treated as a registry entry: it
        is parsed, tagged with ``header_str`` as its category and with the
        current city/zip when those are non-empty, passed to
        ``geo.geocode_business`` and returned.  Single-line text is checked as a
        city header (optionally ending in a 5-digit zip); a recognised city
        updates ``self.current_city`` and ``self.current_zip`` and a fresh
        ``reg.Business()`` is returned.
        """
        if contour_txt.count("\n") == 0:
            # Single line: check whether it is a city header.
            head, _, tail = contour_txt.rpartition(" ")
            zip_code = ""

            # A trailing 5-digit token is taken as the zip and dropped from the name.
            if tail.isdigit() and len(tail) == 5:
                zip_code = tail
                contour_txt = head

            city = self._city_detector.match_to_cities(contour_txt)
            if city:
                self.current_city = city
                self.current_zip = zip_code
            return reg.Business()

        # Two or more lines: treat the contour as a registry entry.
        business = self._parse_registry_block(contour_txt)
        business.category = header_str
        if len(self.current_city):
            business.city = self.current_city
        if len(self.current_zip):
            business.zip = self.current_zip

        geo.geocode_business(business)
        return business
示例#4
0
    def _parse_registry_block(self, registry_txt):
        """Parse one registry entry into a ``reg.Business``.

        Args:
            registry_txt: Raw text of one entry; the first line is the name.

        Returns:
            A ``reg.Business`` with ``name`` set and, when the corresponding
            patterns match, ``address``, ``city``, ``zip``, ``category``,
            ``cat_desc``, ``emp`` and ``sales``.  ``category`` and ``cat_desc``
            are lists in both the multi-SIC and the single-SIC case.
        """
        business = reg.Business()

        lines = registry_txt.split('\n')

        business.name = lines[0]

        # Collect lines containing a 2+ digit number, stopping at the first
        # such line that also matches the phone pattern.
        full_address = ""
        for line in lines:
            start = re.search(r'[0-9]{2,}', line)
            end = self.phone_pattern.search(line)
            if start:
                if end:
                    break
                full_address += ' ' + line

        match = self.paren_pattern.search(full_address)
        if match:
            # Parenthesised form: address from the paren match, city/zip from
            # the "bad address" pattern.
            business.address = match.group(1)
            match = self.bad_address_pattern.search(full_address)
            if match:
                business.city = match.group(1)
                business.zip = match.group(2)
        else:
            match = self.good_address_pattern.search(full_address)
            if match:
                business.address = match.group(1)
                business.city = match.group(2)
                business.zip = match.group(3)

        matches = self.sic_pattern.findall(registry_txt)
        if matches:
            # Split the first SIC hit into its 4-digit codes and descriptions.
            business.category = re.findall(r'\d{4}', matches[0])
            business.cat_desc = re.findall(r'[^\:0-9\n]+[\n]*[^0-9\:]*',
                                           matches[0])
        else:
            # Fallback for a single "NNNN: description" entry.  The original
            # pattern used '/d' and '/s' (forward slashes), so it could never
            # match a real code; the duplicated retry of the same search was
            # dead code and has been removed.
            match = re.search(r'(\d{4}):\s+(.*)', registry_txt, re.DOTALL)
            if match:
                # One-element lists keep the shape consistent with the
                # findall() branch above.
                business.category = [match.group(1)]
                business.cat_desc = [match.group(2)]

        match = self.emp_pattern.search(registry_txt)
        if match:
            business.emp = match.group(1)

        match = self.sales_pattern.search(registry_txt)
        if match:
            business.sales = match.group(1)

        return business
示例#5
0
    def _process_contour(self, contour_txt, contour_font_attrs):
        """Parse a registry contour, or record a city header.

        When ``self.registry_pattern`` is found in the text the parsed business
        is returned.  Otherwise, if ``self.city_pattern`` is found, its first
        group becomes ``self.current_city``.  Every non-registry contour yields
        a fresh ``reg.Business()``.
        """
        registry_hit = self.registry_pattern.search(contour_txt)
        city_hit = self.city_pattern.search(contour_txt)

        if not registry_hit:
            if city_hit:
                # Remember the city for the entries that follow.
                self.current_city = city_hit.group(1)
            return reg.Business()

        return self._parse_registry_block(contour_txt)
示例#6
0
    def _process_contour(self, contour_txt, contour_font_attrs):
        """Append a tagged copy of a registry contour to ``self.registry_txt``.

        A contour qualifies when it matches ``self.registry_pattern``, does not
        match ``self.sic_pattern`` and has at least two lines.  Its first line
        is wrapped in name markers, its second in address markers, and the
        space-joined lines are wrapped in business markers.  A fresh
        ``reg.Business()`` is returned in every case.
        """
        registry_match = self.registry_pattern.match(contour_txt)
        sic_match = self.sic_pattern.match(contour_txt)

        if not registry_match or sic_match:
            return reg.Business()

        lines = contour_txt.split("\n")
        if len(lines) < 2:
            return reg.Business()

        # Wrap the name and address lines in their start/end markers.
        lines[0] = " ".join(
            (self._start(self.name_prefix), lines[0], self._end(self.name_prefix)))
        lines[1] = " ".join(
            (self._start(self.address_prefix), lines[1], self._end(self.address_prefix)))

        # Business markers go on their own lines around the joined entry.
        self.registry_txt += "".join(("\n", self._start(self.bus_prefix), "\n"))
        self.registry_txt += " ".join(lines)
        self.registry_txt += "".join(("\n", self._end(self.bus_prefix), "\n"))

        return reg.Business()
示例#7
0
    def _process_contour(self, contour_txt, contour_font_attrs):
        """Parse a registry contour, or record a SIC header.

        A contour matching ``self.registry_pattern`` is parsed, given
        ``self.current_sic`` as its category, passed to
        ``geo.geocode_business`` and returned.  Otherwise a contour matching
        ``self.sic_pattern`` updates ``self.current_sic``; all non-registry
        contours yield a fresh ``reg.Business()``.
        """
        registry_hit = self.registry_pattern.match(contour_txt)
        sic_hit = self.sic_pattern.match(contour_txt)

        if not registry_hit:
            if sic_hit:
                # Remember the SIC header for the entries that follow.
                self.current_sic = sic_hit.group(0)
            return reg.Business()

        business = self._parse_registry_block(contour_txt)
        business.category = self.current_sic

        geo.geocode_business(business)
        return business
示例#8
0
    def _parse_registry_block(self, registry_txt):
        """Parse a registry entry (works for registries from 2005).

        The first line is the name.  Address text is built from the lines that
        contain a digit, up to the first such line that also contains "Phone".
        Category-description text is built from every line before the first
        one containing "Employs".  Both are concatenated without separators and
        then searched with the instance patterns; SIC, employee and sales
        values are searched for in the full entry text.
        """
        business = reg.Business()

        lines = registry_txt.split('\n')
        business.name = lines[0]

        # Address lines: digit-bearing lines before the "Phone" line.
        address_parts = []
        for text in lines:
            if not re.search(r'[0-9]+', text):
                continue
            if re.search(r'Phone', text):
                break
            address_parts.append(text)
        full_address = "".join(address_parts)

        # Category description lines: everything before "Employs".
        desc_parts = []
        for text in lines:
            if re.search(r'Employs', text):
                break
            desc_parts.append(text)
        cat_desc = "".join(desc_parts)

        address_match = self.address_pattern.search(full_address)
        if address_match:
            business.address, business.zip = address_match.group(1, 2)

        cat_desc_match = self.cat_desc_pattern.search(cat_desc)
        if cat_desc_match:
            business.cat_desc = cat_desc_match.group(1)

        # The remaining fields each take group 1 of a search over the whole entry.
        whole_entry_fields = (
            ("category", self.sic_pattern),
            ("emp", self.emp_pattern),
            ("sales", self.sales_pattern),
        )
        for attr, pattern in whole_entry_fields:
            hit = pattern.search(registry_txt)
            if hit:
                setattr(business, attr, hit.group(1))

        return business
示例#9
0
    def _parse_registry_block(self, registry_txt):
        """Parse one registry entry into a ``reg.Business``.

        The first line is the name.  Text captured after a colon (letters and
        horizontal whitespace) is removed from the entry before the address and
        zip are searched; newlines are then removed before the SIC and bracket
        searches.  The city is copied from ``self.current_city``.
        """
        business = reg.Business()

        # The first line is the business name.
        business.name = registry_txt.split('\n')[0]

        # Remove the text listing managers/presidents/administrators.
        for person in re.findall(r':\s([A-Za-z \t\r\f\v]+)', registry_txt):
            registry_txt = registry_txt.replace(person, '')

        # Address and zip are searched while newlines are still present.
        address_hit = re.search(self.address_pattern, registry_txt)
        if address_hit:
            business.address = address_hit.group(1)
        zip_hit = re.search(self.zip_pattern, registry_txt)
        if zip_hit:
            business.zip = zip_hit.group(1)

        # The remaining searches work on the entry with newlines removed.
        flat_txt = registry_txt.replace('\n', '')

        # Each SIC hit is a (description, number) pair.
        for description, number in self.sic_pattern.findall(flat_txt):
            business.category.append(number)
            business.cat_desc.append(description)

        bracket_hit = re.search(self.bracket_pattern, flat_txt)
        if bracket_hit:
            business.bracket = bracket_hit.group(1)

        # Entries inherit the city currently being processed.
        business.city = self.current_city

        return business
示例#10
0
    def _parse_registry_block(self, registry_txt):
        """Parse a registry entry (works for registries from 1953-1975).

        Args:
            registry_txt: Raw text of one entry.  The first line is used as the
                business name and the second line, when present, as the address
                line.

        Returns:
            A ``reg.Business`` with ``name`` set; ``address`` (and ``zip`` when
            ``self.zip_pattern`` matches) set when a second line exists; and
            ``emp`` set when ``self.emp_pattern`` matches.
        """
        business = reg.Business()

        lines = registry_txt.split("\n")

        # The first line is the name.
        business.name = lines[0]

        # A single-line block has no address line; skip address parsing rather
        # than raising IndexError.
        if len(lines) > 1:
            address_line = lines[1]

            match = self.zip_pattern.search(address_line)
            if match:
                # Split the line into its named "zip" and "address" groups.
                business.zip = match.group("zip")
                address_line = match.group("address")

            business.address = address_line

        match = self.emp_pattern.search(registry_txt)
        if match:
            # Only the last character of the matched text is kept.
            # NOTE(review): presumably a one-character employment code — confirm
            # against emp_pattern.
            business.emp = match.group(0)[-1]

        return business