def test_bootstrap(self):
    """
    Checks that the registry holds both date parser timedelta phrase
    parsers, and that each one is exactly the expected pyparsing type.
    """
    expected_types = (
        ('DP_pre_timedelta_phrases', pyparsing.And),
        ('DP_post_timedelta_phrases', pyparsing.Or),
    )

    # Presence first, for both keys, then the exact-type comparisons.
    for key, _ in expected_types:
        self.assertTrue(registry.test(key))

    for key, parser_type in expected_types:
        self.assertEqual(type(registry.get(key)), parser_type)
def test_bootStrap(self, get_prepositions_mock):
    """
    Bootstraps the measurement parser against the test config and checks
    the unit/system lookups and parsers it leaves in the registry.

    :param get_prepositions_mock: stand-in for the prepositions loader
        (presumably injected by a patch decorator not visible here)
    """
    get_prepositions_mock.return_value = ['of']

    MeasurementParser.bootstrap(TestConfig)

    expected_units = {
        'acres': 'imperial_area',
        'yards': 'imperial_length',
        'yard': 'imperial_length',
        'acre': 'imperial_area',
    }
    self.assertEqual(expected_units, registry.get('MP_units'))

    expected_systems = {
        'imperial_length': ('Imperial', 'Length'),
        'imperial_area': ('Imperial', 'Area'),
    }
    self.assertEqual(expected_systems, registry.get('MP_systems'))

    # Both registered parsers are expected to be pyparsing.And instances.
    for parser_key in ('MP_preposition_parser', 'MP_measurement_parser'):
        self.assertIsInstance(registry.get(parser_key), pyparsing.And)

    get_prepositions_mock.assert_called_once_with()
def test_bootstrapCarriesOutAsExpected(self):
    """
    Bootstraps the programming parser and checks that the bootstrap mock
    received the config and that the keyword collections in the registry
    have the expected sizes.
    """
    config = TestConfig()

    ProgrammingParser.bootstrap(config)

    self.bootstrapMock.assert_called_with(config)

    expected_sizes = (
        ('PP_all_keywords', 486),
        ('PP_language_keywords', 11),
    )
    for key, size in expected_sizes:
        self.assertEqual(size, len(registry.get(key)))
def __init__(self, config):
    """
    Initialises the base parser as "Programming" and loads the keyword
    data this parser reads from the registry.

    :param config: cahoots config
    :type config: cahoots.config.BaseConfig
    """
    BaseParser.__init__(self, config, "Programming", 0)

    # Keyword data is looked up in the registry under the PP_ keys.
    all_keywords = registry.get("PP_all_keywords")
    language_keywords = registry.get("PP_language_keywords")

    self.all_keywords = all_keywords
    self.language_keywords = language_keywords
def __init__(self, config):
    """
    Initialises the base parser as "Programming" and loads the keyword
    data this parser reads from the registry.

    :param config: cahoots config
    :type config: cahoots.config.BaseConfig
    """
    BaseParser.__init__(self, config, "Programming", 0)

    # Keyword data is looked up in the registry under the PP_ keys.
    all_keywords = registry.get('PP_all_keywords')
    language_keywords = registry.get('PP_language_keywords')

    self.all_keywords = all_keywords
    self.language_keywords = language_keywords
def parse(self, data_string):
    """
    parses for dates

    Three strategies are tried in order; the generator stops after the
    first one that produces a date:

    1. the whole string is a date
    2. <number> <timescale> <prepositions> <datetime>
    3. <datetime> <plus/minus> <number> <timescale>

    Strings shorter than 3 or longer than 50 characters (after
    stripping) are never parsed.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; confirm the placement of the ``return``
    statements against the original file.

    :param data_string: the string we want to parse
    :type data_string: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    data_string = data_string.strip()

    if len(data_string) < 3 or len(data_string) > 50:
        return

    # Just date detection
    # date_parse's result is used as a pair: [0] becomes the result
    # subtype and [1] the result value.
    parsed_date = self.date_parse(data_string)
    if parsed_date:
        yield self.result(parsed_date[0], 100, parsed_date[1])
        return

    # Looking for <number> <timescale> <prepositions> <datetime>
    pre_timedelta_phrases = registry.get('DP_pre_timedelta_phrases')
    try:
        pre_delta = pre_timedelta_phrases.parseString(data_string)
    except ParseException:
        # Not in this format; fall through to the next strategy.
        pass
    else:
        # pre_delta[1] is handed to date_parse and pre_delta[0] is added
        # to the parsed value (presumably a timedelta - TODO confirm).
        parsed_date = self.date_parse(pre_delta[1])
        if parsed_date:
            try:
                yield self.result(
                    "Number Timescale Preposition Date",
                    100,
                    parsed_date[1] + pre_delta[0]
                )
            except OverflowError:
                # The addition overflowed; no result is yielded.
                pass
            return

    # Looking for <datetime> <plus/minus> <number> <timescale>
    post_timedelta_phrases = registry.get('DP_post_timedelta_phrases')
    post_deltas = \
        [t for t in post_timedelta_phrases.scanString(data_string)]

    # Only handled when the phrase occurs exactly once in the string.
    if len(post_deltas) == 1:
        for token, start, _ in post_deltas:
            # Everything before the matched phrase is treated as the date.
            parsed_date = self.date_parse(data_string[0:start].strip())
            if parsed_date:
                try:
                    yield self.result(
                        "Date Operator Number Timescale",
                        100,
                        parsed_date[1] + token.pop()
                    )
                except OverflowError:
                    # The addition overflowed; no result is yielded.
                    pass
                return
def __init__(self, config):
    """
    Initialises the base parser as "Measurement" and loads the lookup
    data and parsers this parser reads from the registry.

    :param config: cahoots config
    :type config: cahoots.config.BaseConfig
    """
    BaseParser.__init__(self, config, "Measurement", 100)

    # Lookup tables stored in the registry under the MP_ keys.
    units = registry.get('MP_units')
    systems = registry.get('MP_systems')
    self.units = units
    self.systems = systems

    # Parser objects stored in the registry under the MP_ keys.
    preposition_parser = registry.get('MP_preposition_parser')
    measurement_parser = registry.get('MP_measurement_parser')
    self.preposition_parser = preposition_parser
    self.measurement_parser = measurement_parser
def __init__(self, config):
    """
    Initialises the base parser as "Measurement" and loads the lookup
    data and parsers this parser reads from the registry.

    :param config: cahoots config
    :type config: cahoots.config.BaseConfig
    """
    BaseParser.__init__(self, config, "Measurement", 100)

    # Lookup tables stored in the registry under the MP_ keys.
    units = registry.get('MP_units')
    systems = registry.get('MP_systems')
    self.units = units
    self.systems = systems

    # Parser objects stored in the registry under the MP_ keys.
    preposition_parser = registry.get('MP_preposition_parser')
    measurement_parser = registry.get('MP_measurement_parser')
    self.preposition_parser = preposition_parser
    self.measurement_parser = measurement_parser
def test_bootstrap(self):
    """
    Checks that the registry holds both date parser timedelta phrase
    parsers, and that each one is exactly the expected pyparsing type.
    """
    expected_types = (
        ('DP_pre_timedelta_phrases', pyparsing.And),
        ('DP_post_timedelta_phrases', pyparsing.Or),
    )

    # Presence first, for both keys, then the exact-type comparisons.
    for key, _ in expected_types:
        self.assertTrue(registry.test(key))

    for key, parser_type in expected_types:
        self.assertEqual(type(registry.get(key)), parser_type)
def parse(self, data_string):
    """
    parses for dates

    Three strategies are tried in order; the generator stops after the
    first one that produces a date:

    1. the whole string is a date
    2. <number> <timescale> <prepositions> <datetime>
    3. <datetime> <plus/minus> <number> <timescale>

    Strings shorter than 3 or longer than 50 characters (after
    stripping) are never parsed.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; confirm the placement of the ``return``
    statements against the original file.

    :param data_string: the string we want to parse
    :type data_string: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    data_string = data_string.strip()

    if len(data_string) < 3 or len(data_string) > 50:
        return

    # Just date detection
    # date_parse's result is used as a pair: [0] becomes the result
    # subtype and [1] the result value.
    parsed_date = self.date_parse(data_string)
    if parsed_date:
        yield self.result(parsed_date[0], 100, parsed_date[1])
        return

    # Looking for <number> <timescale> <prepositions> <datetime>
    pre_timedelta_phrases = registry.get('DP_pre_timedelta_phrases')
    try:
        pre_delta = pre_timedelta_phrases.parseString(data_string)
    except ParseException:
        # Not in this format; fall through to the next strategy.
        pass
    else:
        # pre_delta[1] is handed to date_parse and pre_delta[0] is added
        # to the parsed value (presumably a timedelta - TODO confirm).
        parsed_date = self.date_parse(pre_delta[1])
        if parsed_date:
            try:
                yield self.result("Number Timescale Preposition Date", 100, parsed_date[1] + pre_delta[0])
            except OverflowError:
                # The addition overflowed; no result is yielded.
                pass
            return

    # Looking for <datetime> <plus/minus> <number> <timescale>
    post_timedelta_phrases = registry.get('DP_post_timedelta_phrases')
    post_deltas = \
        [t for t in post_timedelta_phrases.scanString(data_string)]

    # Only handled when the phrase occurs exactly once in the string.
    if len(post_deltas) == 1:
        for token, start, _ in post_deltas:
            # Everything before the matched phrase is treated as the date.
            parsed_date = self.date_parse(data_string[0:start].strip())
            if parsed_date:
                try:
                    yield self.result("Date Operator Number Timescale", 100, parsed_date[1] + token.pop())
                except OverflowError:
                    # The addition overflowed; no result is yielded.
                    pass
                return
def parse(self, data):
    """
    parses data to determine if this is a location

    Each coordinate format is tried in turn against the stripped input;
    only the first format that matches produces a result.

    :param data: the string we want to parse
    :type data: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    candidate = data.strip()

    for reg_key, format_func, fmt, subtype, confidence in \
            self.get_coordinate_test_parameters():
        match = registry.get(reg_key).match(candidate)
        if not match:
            continue

        # The format argument is only forwarded when one was provided.
        if fmt:
            res = format_func(match, fmt)
        else:
            res = format_func(match)

        # Prepping processed data with better metadata
        result, add_data = self.generate_result_data(res.to_string())
        yield self.result(subtype, confidence, result, add_data)

        # Only one format is ever matched, so we are done.
        return
def parse(self, data):
    """
    parses data to determine if this is a location

    Each coordinate format is tried in turn against the stripped input;
    only the first format that matches produces a result.

    :param data: the string we want to parse
    :type data: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    candidate = data.strip()

    for reg_key, format_func, fmt, subtype, confidence in \
            self.get_coordinate_test_parameters():
        match = registry.get(reg_key).match(candidate)
        if not match:
            continue

        # The format argument is only forwarded when one was provided.
        if fmt:
            res = format_func(match, fmt)
        else:
            res = format_func(match)

        # Prepping processed data with better metadata
        result, add_data = self.generate_result_data(res.to_string())
        yield self.result(subtype, confidence, result, add_data)

        # Only one format is ever matched, so we are done.
        return
def prepare_landmark_datastring(self, data):
    """
    Cleans up and validates the datastring

    The text matched by the 'LP_the_regex' registry entry is removed and
    the remainder stripped. The cleaned string is rejected when it is
    longer than 75 characters, fails the name parser's basic validation,
    or contains a character outside whitespace, ASCII letters, digits
    and '.,-:'.

    :param data: data we want to check for being a location
    :type data: str
    :return: the cleaned up datastring, or None if it was rejected
    :rtype: str
    """
    cleaned = registry.get('LP_the_regex').sub('', data).strip()

    if len(cleaned) > 75:
        return None

    if not NameParser(self.config).basic_validation(cleaned.split()):
        return None

    allowed_chars = ''.join((
        string.whitespace,
        string.ascii_letters,
        string.digits,
        '.,-:',
    ))

    if any(char not in allowed_chars for char in cleaned):
        return None

    return cleaned
def parse(self, data):
    """
    Determines if the data is a name or not

    :param data: the string we want to parse
    :type data: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    # Without at least one match of the uppercase regex this is no name.
    if not registry.get('NP_upper_alpha_regex').search(data):
        return

    words = data.split()

    # Phrases of seven or more words are not considered names.
    if len(words) >= 7:
        return

    # Every word has to pass the basic per-word validation.
    if not self.basic_validation(words):
        return

    self.detect_prefix_or_suffix(words)
    self.calculate_confidence(words)

    if self.confidence > 0:
        yield self.result("Name", min(100, self.confidence))
def test_bootstrapSetsUpClassifierAsExpected(self):
    """
    Bootstraps the bayesian classifier against the test config and checks
    the zip file calls made, the tokenizer that was set, the classifier
    stored in the registry, and the languages recorded by the stub.
    """
    ProgrammingBayesianClassifier.bootstrap(TestConfig)

    expected_zip_calls = [
        'init-trainers.zip-r',
        'namelist',
        'read-foo.def',
        'read-bar.def',
    ]
    self.assertEqual(ZipFileStub.called, expected_zip_calls)

    # The tokenizer may be either a bound/unbound method or a function.
    tokenizer = SimpleBayesStub.Tokenizer
    self.assertTrue(ismethod(tokenizer) or isfunction(tokenizer))

    self.assertIsInstance(registry.get('PP_bayes'), SimpleBayesStub)

    expected_languages = {
        'foo': 'foo.def-text',
        'bar': 'bar.def-text',
    }
    self.assertEqual(SimpleBayesStub.Languages, expected_languages)
def parse(self, data):
    """
    Determines if the data is a name or not

    :param data: the string we want to parse
    :type data: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    # Without at least one match of the uppercase regex this is no name.
    if not registry.get('NP_upper_alpha_regex').search(data):
        return

    words = data.split()

    # Phrases of seven or more words are not considered names.
    if len(words) >= 7:
        return

    # Every word has to pass the basic per-word validation.
    if not self.basic_validation(words):
        return

    self.detect_prefix_or_suffix(words)
    self.calculate_confidence(words)

    if self.confidence > 0:
        yield self.result("Name", min(100, self.confidence))
def parse(self, data):
    """
    parses for potential address

    :param data: the string we want to parse
    :type data: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    data = data.strip()

    # Too long, or not a single digit anywhere: cannot be an address.
    if len(data) > 100 or not any(char.isdigit() for char in data):
        return

    # Splitting the data string and dropping the empty pieces.
    splitter = registry.get('AP_split_regex')
    tokens = [piece for piece in splitter.split(data) if piece]

    # At least 4 words and one of the words should start with a number
    if len(tokens) <= 3:
        return
    if not any(piece[:1].isdigit() for piece in tokens):
        return

    # A falsy result from generate_result_data means nothing was found.
    generated = self.generate_result_data(data, tokens)
    results, token_count, tokens = generated or (None, None, None)

    if token_count:
        # Subtracting a little confidence for each token that wasn't found
        self.confidence -= 5 * (len(tokens) - token_count)
        yield self.result(self.subtype, min(100, self.confidence), results)
def parse(self, data):
    """
    parses for potential address

    :param data: the string we want to parse
    :type data: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    data = data.strip()

    # Too long, or not a single digit anywhere: cannot be an address.
    if len(data) > 100 or not any(char.isdigit() for char in data):
        return

    # Splitting the data string and dropping the empty pieces.
    splitter = registry.get('AP_split_regex')
    tokens = [piece for piece in splitter.split(data) if piece]

    # At least 4 words and one of the words should start with a number
    if len(tokens) <= 3:
        return
    if not any(piece[:1].isdigit() for piece in tokens):
        return

    # A falsy result from generate_result_data means nothing was found.
    generated = self.generate_result_data(data, tokens)
    results, token_count, tokens = generated or (None, None, None)

    if token_count:
        # Subtracting a little confidence for each token that wasn't found
        self.confidence -= 5 * (len(tokens) - token_count)
        yield self.result(self.subtype, min(100, self.confidence), results)
def test_flush(self):
    """
    Stores a value, confirms the registry storage is non-empty, then
    checks that flush() leaves the storage empty.
    """
    registry.set('test', 'foo')

    stored = registry.get('test')
    self.assertEqual('foo', stored)

    size_before_flush = len(registry.storage)
    self.assertNotEqual(0, size_before_flush)

    registry.flush()

    size_after_flush = len(registry.storage)
    self.assertEqual(0, size_after_flush)
def get_preposition_literals():
    """
    Generates the prepositions parser and returns it

    The parser is an ``Or`` over one ``CaselessLiteral`` per preposition.
    It is built once and cached in the registry under 'DP_prepositions';
    later calls return the cached object.

    :return: the prepositions parser
    """
    registry_key = 'DP_prepositions'

    if registry.test(registry_key):
        return registry.get(registry_key)

    literals = [
        CaselessLiteral(word) for word in DataHandler().get_prepositions()
    ]
    prepositions = Or(literals)

    registry.set(registry_key, prepositions)

    return prepositions
def test_flush(self):
    """
    Stores a value, confirms the registry storage is non-empty, then
    checks that flush() leaves the storage empty.
    """
    registry.set('test', 'foo')

    stored = registry.get('test')
    self.assertEqual('foo', stored)

    size_before_flush = len(registry.storage)
    self.assertNotEqual(0, size_before_flush)

    registry.flush()

    size_after_flush = len(registry.storage)
    self.assertEqual(0, size_after_flush)
def classify(cls, data_string):
    """
    Scores a string with the classifier stored in the registry under
    'PP_bayes' and returns whatever its score() call produces
    (described by the original docstring as a dict of programming
    language match probabilities)

    :param data_string: the string we want to classify
    :return: the classifier's scores for the string
    """
    return registry.get('PP_bayes').score(data_string)
def get_preposition_literals():
    """
    Generates the prepositions parser and returns it

    The parser is an ``Or`` over one ``CaselessLiteral`` per preposition.
    It is built once and cached in the registry under 'DP_prepositions';
    later calls return the cached object.

    :return: the prepositions parser
    """
    registry_key = 'DP_prepositions'

    if registry.test(registry_key):
        return registry.get(registry_key)

    literals = [
        CaselessLiteral(word) for word in DataHandler().get_prepositions()
    ]
    prepositions = Or(literals)

    registry.set(registry_key, prepositions)

    return prepositions
def test_bootStrap(self, get_prepositions_mock):
    """
    Bootstraps the measurement parser against the test config and checks
    the unit/system lookups and parsers it leaves in the registry.

    :param get_prepositions_mock: stand-in for the prepositions loader
        (presumably injected by a patch decorator not visible here)
    """
    get_prepositions_mock.return_value = ['of']

    MeasurementParser.bootstrap(TestConfig)

    expected_units = {
        'acres': 'imperial_area',
        'yards': 'imperial_length',
        'yard': 'imperial_length',
        'acre': 'imperial_area',
    }
    self.assertEqual(expected_units, registry.get('MP_units'))

    expected_systems = {
        'imperial_length': ('Imperial', 'Length'),
        'imperial_area': ('Imperial', 'Area'),
    }
    self.assertEqual(expected_systems, registry.get('MP_systems'))

    # Both registered parsers are expected to be pyparsing.And instances.
    for parser_key in ('MP_preposition_parser', 'MP_measurement_parser'):
        self.assertIsInstance(registry.get(parser_key), pyparsing.And)

    get_prepositions_mock.assert_called_once_with()
def get_prepositions(self):
    """
    returns the list of prepositions

    The parsed contents of 'prepositions.yaml' are cached in the registry
    under 'DATA_prepositions'; the file is only read when that key is not
    set yet.

    :return: the parsed contents of prepositions.yaml
    """
    if registry.test('DATA_prepositions'):
        return registry.get('DATA_prepositions')

    handle = self.get_file_handle('prepositions.yaml')
    try:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects and is rejected by newer PyYAML releases;
        # consider yaml.safe_load once callers/tests are confirmed.
        prepositions = yaml.load(handle)
    finally:
        # Close the file even when parsing fails so the handle never leaks.
        handle.close()

    registry.set('DATA_prepositions', prepositions)

    return prepositions
def test_basicLanguageHeuristicFindsExpectedKeywords(self):
    """
    Bootstraps the programming parser, then runs the basic language
    heuristic with the registry's 'php' keywords over three tokens and
    expects a result of length 2.
    """
    config = TestConfig()

    ProgrammingParser.bootstrap(config)
    self.bootstrapMock.assert_called_with(config)

    parser = ProgrammingParser(config)

    php_keywords = registry.get('PP_language_keywords')['php']
    tokens = ['for', 'if', 'foobar']

    found = parser.basic_language_heuristic(php_keywords, tokens)

    self.assertEqual(2, len(found))
def parse(self, data_string):
    """
    parses for email addresses

    Only strings of at most 254 characters that contain an '@' are
    checked against the 'EP_valid_regex' registry entry.

    :param data_string: the string we want to parse
    :type data_string: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    # Cheap checks first so the regex only runs on plausible candidates.
    if len(data_string) <= 254 and '@' in data_string:
        if registry.get('EP_valid_regex').match(data_string):
            yield self.result("Email Address", self.confidence)
def parse(self, data):
    """
    parses data to determine if this is a location

    Stripped input of 20 or more characters is ignored. A result is only
    yielded when the input matches the postal code regex, postal code
    data is found for it, and the calculated confidence is positive.

    :param data: the string we want to parse
    :return: yields parse result(s) if there are any
    """
    data = data.strip()

    if len(data) >= 20:
        return

    if not registry.get('ZCP_postal_code_regex').match(data):
        return

    results = self.get_postal_code_data(data)
    if results is None:
        return

    # calculate_confidence is called for its effect on self.confidence.
    self.calculate_confidence(data, results)

    if self.confidence > 0:
        yield self.result("Postal Code", self.confidence, results)
def classify(cls, data_string):
    """
    Scores a string with the classifier stored in the registry under
    'PP_bayes' to get programming language match probabilities

    :param data_string: the string we want to classify
    :type data_string: str
    :return: bayesian probabilities
    :rtype: dict
    """
    return registry.get('PP_bayes').score(data_string)
def get_prepositions(self):
    """
    returns the list of prepositions

    The parsed contents of 'prepositions.yaml' are cached in the registry
    under 'DATA_prepositions'; the file is only read when that key is not
    set yet.

    :return: list of prepositions
    :rtype: list
    """
    if registry.test('DATA_prepositions'):
        return registry.get('DATA_prepositions')

    handle = self.get_file_handle('prepositions.yaml')
    try:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects and is rejected by newer PyYAML releases;
        # consider yaml.safe_load once callers/tests are confirmed.
        prepositions = yaml.load(handle)
    finally:
        # Close the file even when parsing fails so the handle never leaks.
        handle.close()

    registry.set('DATA_prepositions', prepositions)

    return prepositions
def test_bootstrapSetsUpClassifierAsExpected(self):
    """
    Bootstraps the bayesian classifier against the test config and checks
    the zip file calls made, the tokenizer that was set, the classifier
    stored in the registry, and the languages recorded by the stub.
    """
    ProgrammingBayesianClassifier.bootstrap(TestConfig)

    expected_zip_calls = [
        'init-trainers.zip-r',
        'namelist',
        'read-foo.def',
        'read-bar.def',
    ]
    self.assertEqual(ZipFileStub.called, expected_zip_calls)

    # The tokenizer may be either a bound/unbound method or a function.
    tokenizer = SimpleBayesStub.Tokenizer
    self.assertTrue(ismethod(tokenizer) or isfunction(tokenizer))

    self.assertIsInstance(registry.get('PP_bayes'), SimpleBayesStub)

    expected_languages = {
        'foo': 'foo.def-text',
        'bar': 'bar.def-text',
    }
    self.assertEqual(SimpleBayesStub.Languages, expected_languages)
def prepare_landmark_datastring(cls, data):
    """
    Cleans up and validates the datastring

    The text matched by the 'LP_the_regex' registry entry is removed and
    the remainder stripped. The cleaned string is rejected when it is
    longer than 75 characters, fails NameParser.basic_validation, or
    contains a character outside whitespace, ASCII letters, digits and
    '.,-:'.

    :param data: data we want to check for being a location
    :return: the cleaned up datastring, or None if it was rejected
    """
    cleaned = registry.get('LP_the_regex').sub('', data).strip()

    if len(cleaned) > 75:
        return None

    if not NameParser.basic_validation(cleaned.split()):
        return None

    allowed_chars = ''.join((
        string.whitespace,
        string.ascii_letters,
        string.digits,
        '.,-:',
    ))

    if any(char not in allowed_chars for char in cleaned):
        return None

    return cleaned
def prepare_landmark_datastring(self, data):
    """
    Cleans up and validates the datastring

    The text matched by the 'LP_the_regex' registry entry is removed and
    the remainder stripped. The cleaned string is rejected when it is
    longer than 75 characters, fails the name parser's basic validation,
    or contains a character outside whitespace, ASCII letters, digits
    and '.,-:'.

    :param data: data we want to check for being a location
    :return: the cleaned up datastring, or None if it was rejected
    """
    cleaned = registry.get('LP_the_regex').sub('', data).strip()

    if len(cleaned) > 75:
        return None

    if not NameParser(self.config).basic_validation(cleaned.split()):
        return None

    allowed_chars = ''.join((
        string.whitespace,
        string.ascii_letters,
        string.digits,
        '.,-:',
    ))

    if any(char not in allowed_chars for char in cleaned):
        return None

    return cleaned
def test_postal_code_patterns_match(self):
    """
    Checks that the postal code regex stored in the registry matches a
    sample of every supported postal code layout.

    The samples are kept in one table and checked in order; a failure
    message names the sample that did not match.
    """
    postal_regex = registry.get('ZCP_postal_code_regex')

    samples = (
        'A999',
        'AB 12',
        'AD999',
        '999 99',
        'AA9999',
        'VC9999',
        'VG1199',
        '6799 W3',
        '9999 AA',
        '9999 AW',
        '9999 CW',
        'A9A 9A9',
        'AZ 9999',
        'BB99999',
        'GY9 9AA',
        'JE9 9AA',
        'JMAAA99',
        'LV-9999',
        'A9999AAA',
        'AA99 9AA',
        'AAA 9999',
        'AAAA 1ZZ',
        'FIQQ 1ZZ',
        'TKCA 1ZZ',
        'GX99 9AA',
        'IM99 9AA',
        'KY9-9999',
        '999',
        '9999',
        '99-99',
        '99-999',
        '999999',
        '999-999',
        '9999999',
        '999-9999',
        '9999-999',
        '99999-9999',
        '77515 CEDEX',
        '77515 CEDEX 9',
        '77515 CEDEX 99',
    )

    for sample in samples:
        # The message identifies the offending sample, since the line
        # number alone no longer does.
        self.assertTrue(
            postal_regex.match(sample),
            'postal code regex did not match %r' % (sample,)
        )
def test_postal_code_patterns_match(self):
    """
    Checks that the postal code regex stored in the registry matches a
    sample of every supported postal code layout.

    The samples are kept in one table and checked in order; a failure
    message names the sample that did not match.
    """
    postal_regex = registry.get('ZCP_postal_code_regex')

    samples = (
        'A999',
        'AB 12',
        'AD999',
        '999 99',
        'AA9999',
        'VC9999',
        'VG1199',
        '6799 W3',
        '9999 AA',
        '9999 AW',
        '9999 CW',
        'A9A 9A9',
        'AZ 9999',
        'BB99999',
        'GY9 9AA',
        'JE9 9AA',
        'JMAAA99',
        'LV-9999',
        'A9999AAA',
        'AA99 9AA',
        'AAA 9999',
        'AAAA 1ZZ',
        'FIQQ 1ZZ',
        'TKCA 1ZZ',
        'GX99 9AA',
        'IM99 9AA',
        'KY9-9999',
        '999',
        '9999',
        '99-99',
        '99-999',
        '999999',
        '999-999',
        '9999999',
        '999-9999',
        '9999-999',
        '99999-9999',
        '77515 CEDEX',
        '77515 CEDEX 9',
        '77515 CEDEX 99',
    )

    for sample in samples:
        # The message identifies the offending sample, since the line
        # number alone no longer does.
        self.assertTrue(
            postal_regex.match(sample),
            'postal code regex did not match %r' % (sample,)
        )
def parse(self, data):
    """
    parses data to determine if this is a location

    Stripped input of 20 or more characters is ignored. A result is only
    yielded when the input matches the postal code regex, postal code
    data is found for it, and the calculated confidence is positive.

    :param data: the string we want to parse
    :type data: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    data = data.strip()

    if len(data) >= 20:
        return

    if not registry.get('ZCP_postal_code_regex').match(data):
        return

    results = self.get_postal_code_data(data)
    if results is None:
        return

    # calculate_confidence is called for its effect on self.confidence.
    self.calculate_confidence(data, results)

    if self.confidence > 0:
        yield self.result("Postal Code", self.confidence, results)
def parse(self, data):
    """
    parses data to determine if this is a location

    Stripped input of 20 or more characters is ignored. A result is only
    yielded when the input matches the postal code regex, postal code
    data is found for it, and the calculated confidence is positive.

    :param data: the string we want to parse
    :type data: str
    :return: yields parse result(s) if there are any
    :rtype: ParseResult
    """
    data = data.strip()

    if len(data) >= 20:
        return

    if not registry.get('ZCP_postal_code_regex').match(data):
        return

    results = self.get_postal_code_data(data)
    if results is None:
        return

    # calculate_confidence is called for its effect on self.confidence.
    self.calculate_confidence(data, results)

    if self.confidence > 0:
        yield self.result("Postal Code", self.confidence, results)
def test_get(self):
    """
    Checks that get() returns a value previously stored with set(), and
    None for the key 'bar'.
    """
    registry.set('test', 'foo')

    stored = registry.get('test')
    self.assertEqual('foo', stored)

    missing = registry.get('bar')
    self.assertIsNone(missing)
def __init__(self, config):
    """
    Initialises the base parser as "Measurement" and loads the lookup
    data and parsers this parser reads from the registry.

    :param config: cahoots config
    """
    BaseParser.__init__(self, config, "Measurement", 100)

    # Lookup tables stored in the registry under the MP_ keys.
    units = registry.get('MP_units')
    systems = registry.get('MP_systems')
    self.units = units
    self.systems = systems

    # Parser objects stored in the registry under the MP_ keys.
    preposition_parser = registry.get('MP_preposition_parser')
    measurement_parser = registry.get('MP_measurement_parser')
    self.preposition_parser = preposition_parser
    self.measurement_parser = measurement_parser
def __init__(self, config):
    """
    Initialises the base parser as "Programming" and loads the keyword
    data this parser reads from the registry.

    :param config: cahoots config
    """
    BaseParser.__init__(self, config, "Programming", 0)

    # Keyword data is looked up in the registry under the PP_ keys.
    all_keywords = registry.get('PP_all_keywords')
    language_keywords = registry.get('PP_language_keywords')

    self.all_keywords = all_keywords
    self.language_keywords = language_keywords
def test_get(self):
    """
    Checks that get() returns a value previously stored with set(), and
    None for the key 'bar'.
    """
    registry.set('test', 'foo')

    stored = registry.get('test')
    self.assertEqual('foo', stored)

    missing = registry.get('bar')
    self.assertIsNone(missing)