def test_chou_rouge():
    """Two entities sharing the word "vinaigre" must both be matched."""
    names = ["choux rouge au vinaigre", "vinaigre balsamique"]
    matches = Entities(names).match("choux rouge au vinaigre balsamique")
    assert len(matches) == 2
    assert matches[0].text == names[0]
    assert matches[1].text == names[1]
def test_chou_rave():
    """Overlapping entities ("rave" is shared) are both reported, in order."""
    names = ["choux rave", "rave party"]
    matches = Entities(names).match("choux rave party")
    assert len(matches) == 2
    assert matches[0].text == names[0]
    assert matches[1].text == names[1]
def test_transform():
    """A transform normalizes entity/query text for matching, but the
    reported match text stays the original query spelling."""
    lowered = Entities(["Data-Publica"], transform=lambda t: t.lower())
    found = lowered.match("data-publica")
    assert len(found) == 1
    assert found[0].text == "data-publica"

    lowered = Entities(["data-publica"], transform=lambda t: t.lower())
    found = lowered.match("Data-Publica")
    assert len(found) == 1
    assert found[0].text == "Data-Publica"
def __init__(self, cache_results=False, check_supported=True, **kwargs):
    """Load per-country localization resources (cities, street types, regexes).

    Scans ``resources/localization``; every sub-directory is treated as one
    country and its resource files are parsed into an attribute container
    stored in ``self.localization_cache``.

    :param cache_results: (boolean) if true, the result caching mechanism
        is enabled
    :param check_supported: (boolean) stored on the instance — presumably
        controls rejection of unsupported countries later (TODO confirm
        against the rest of the class)
    :raises DeprecationWarning: if the removed ``fvoies`` kwarg is passed
    :raises NotImplementedError: if the resources directory is missing
    """
    # Stdlib-only; imported locally so this fix is self-contained.
    from types import SimpleNamespace

    self.logger = logging.getLogger("textmining:address_detecter")
    self.logger.setLevel(logging.INFO)

    if "fvoies" in kwargs:
        raise DeprecationWarning(
            "fvoies is deprecated.\n"
            "Please use detect_address(..., fvoies=X)")

    resources_dir = os.path.join(LIB_PATH, "resources/localization")
    self.check_supported = check_supported
    if not os.path.exists(resources_dir):
        raise NotImplementedError("No resources")

    # Cache where country specific resources are cached
    self.localization_cache = {}

    # Iterating over country specific resources: every directory in
    # resources_dir represents one country.
    for country_name in os.listdir(resources_dir):
        country_path = os.path.join(resources_dir, country_name)
        if not os.path.isdir(country_path):
            continue

        # BUGFIX: the original code aliased voies/cities/zipcodes to a
        # single shared set() and abused a namedtuple *class* as a mutable
        # attribute holder (leaving `streets_matcher` unset when there is
        # no streets.txt). Use an explicit namespace with independent,
        # fully-initialized fields instead.
        country = SimpleNamespace(
            cities=None,          # Entities built from cities.csv
            zipcodes=set(),       # zipcodes read from cities.csv
            voies=set(),          # normalized street-type words
            main_regex=None,
            secondary_regex=None,
            streets=None,         # hardcoded street list, when provided
            streets_matcher=None,
        )

        # For some countries (like France) there's no need for a hardcoded
        # street list, so streets.txt is optional.
        street_path = os.path.join(country_path, "streets.txt")
        if os.path.exists(street_path):
            with open(street_path, "r") as f:
                country.streets = set(f.read().splitlines())
            country.streets_matcher = Matcher()
            country.streets_matcher.set_words(country.streets)

        # main.regex is mandatory for a country directory.
        with open(os.path.join(country_path, "main.regex"), "r") as f:
            regex = f.read().strip()
            country.main_regex = re.compile(regex, re.S | re.U | re.I)

        try:
            with open(os.path.join(country_path, "secondary.regex"),
                      "r") as f:
                regex = f.read().strip()
                country.secondary_regex = re.compile(
                    regex, re.S | re.U | re.I)
        except IOError:
            # Does not exist OR no read permissions
            country.secondary_regex = None
            self.logger.warning("Unable to open file secondary.regex")

        try:
            with open(os.path.join(country_path, "cities.csv"), "r") as f:
                reader = csv.reader(f, delimiter=",")
                cities = set()
                for zipcode, city in reader:
                    # str(int(...)) ensures consistency when a zipcode
                    # begins with 0
                    country.zipcodes.add(str(int(zipcode)))
                    cities.add(normalize_text(city))
                country.cities = Entities(cities)
        except IOError:
            # Does not exist OR no read permissions
            country.cities = None
            self.logger.warning("Unable to open file cities.csv")

        try:
            # Populating voies set with resource file
            with open(os.path.join(country_path, "voies.csv"), "r") as f:
                for row in f:
                    parts = row.strip().lower().split(",")
                    country.voies.update(map(normalize_text, parts))
        except IOError:
            # Does not exist OR no read permissions
            country.voies = None
            self.logger.warning("Unable to open file voies.csv")

        self.localization_cache[country_name] = country

    self.results_cache = None
    if cache_results:
        # Caches matched strings in a set for bad results, and a dict for
        # the ones that yielded a good result
        self.empty_cache()
def _fetch(self, params="api/entities"):
    """
    Fetches entities recorded for the given url

    :param params: API path appended to the base url
    :return: mapping from MatchMethod to its entity container
    :raises CannotFetchUrlException: on non-200 response or empty payload
    """
    self.logger.info("Fetching url [%s] " % self.api.base_url + "/" + params)

    # fetching...
    response = self.api.fetch(params)
    if not response.status_code == requests.codes.ok:
        self.logger.warning(
            "Fetching the API doesn't give an 200 HTTP response...")
        raise CannotFetchUrlException()

    results = response.json()
    if len(results) == 0:
        raise CannotFetchUrlException()

    # One container per matching method. (defaultdict with no factory
    # behaves like a plain dict; kept for compatibility with callers.)
    entities_list = defaultdict()
    entities_list[MatchMethod.EXACT] = Entities(
        [], tokenizer=self.tokenizer, transform=identity)
    entities_list[MatchMethod.SHORT_LABEL] = Entities(
        [], tokenizer=self.tokenizer, transform=normalization)
    # Long labels are matched with an Aho-Corasick trie
    entities_list[MatchMethod.LONG_LABEL] = aho.Trie()

    accepted = 0
    rejected = list()
    for record in results:
        record_id = record["id"]
        record_type = record["type"]
        for label_entry in record["labels"]:
            method = label_entry["method"]
            text = label_entry["label"]
            if method in ["SHORT_LABEL", "EXACT"]:
                if admit(text, self.bl):
                    entities_list[MatchMethod(method)].add_entity(
                        Entity(text, (record_id, record_type)))
                    accepted += 1
                else:
                    rejected.append(text)
                    self.logger.warning(
                        "blacklisting short-label %s" % text)
            elif method == "LONG_LABEL":
                if admit_label(text, self.bl):
                    entities_list[MatchMethod(method)].add_word(
                        ascii_normalization(text),
                        (record_id, record_type))
                    accepted += 1
                else:
                    rejected.append(text)
                    self.logger.warning(
                        "blacklisting long-label %s" % text)
            else:
                raise UnkownMatchingMethod(
                    "method [%s] is not implemented." % method)

    # computing automaton to prepare ahocorasick on long-label entities
    entities_list[MatchMethod("LONG_LABEL")].make_automaton()

    self.logger.info("total items retrieved : %d" % accepted)
    self.logger.warning("Has blacklisted %d items." % len(rejected))
    return entities_list
def test_accents_in_entities():
    """An accented entity is found inside a larger text."""
    found = Entities(["abcéefg"]).match("abc abcéefg abc")
    assert len(found) == 1
def test_choux_de_bruxelles():
    """The longest entity wins: shorter prefixes must not also match."""
    names = ["Choux de Bruxelles", "Choux", "Choux Fleur"]
    found = Entities(names).match(names[0])
    assert len(found) == 1
    assert found[0].text == names[0]
def test_deleted_spaces():
    """Spacing around the dash does not prevent a match."""
    found = Entities(["data - publica"]).match("data-publica")
    assert len(found) == 1
def test_oud():
    """A split token ("ou d") must not match the entity "oud"."""
    ents = Entities(["oud"])
    assert len(ents.match("ou d")) == 0
    assert len(ents.match("oud")) == 1