Пример #1
0
 def bootstrap(config):
     """
     Compiles the postal code pattern and stores it in the registry

     The compiled pattern is registered under ``ZCP_postal_code_regex``.

     :param config: configuration object (not referenced in this function)
     """
     # One alternative per accepted postal code layout.
     alternatives = (
         r'(\d{2,7}(-\d{2,4})?)|'
         r'([a-zA-Z]\d{3})|'
         r'([a-zA-Z]{2}\s\d{2})|'
         r'([a-zA-Z]{2}-\d{2})|'
         r'(AD\d{3})|'
         r'(\d{3}\s\d{2})|'
         r'([a-zA-Z]{2}\d{4})|'
         r'(\d{4}\sW3)|'
         r'(\d{4}\s[a-zA-Z]{2})|'
         r'([a-zA-Z]\d[a-zA-Z]\s\d[a-zA-Z]\d)|'
         r'(AZ\s\d{4})|'
         r'(BB\d{1,5})|'
         r'([a-zA-Z]{2}\d{1,2}\s\d[a-zA-Z]{2})|'
         r'(JMA[a-zA-Z]{2}\d{2})|'
         r'(AZ-\d{4})|'
         r'([a-zA-Z]\d{4}[a-zA-Z]{3})|'
         r'([a-zA-Z]{2}\d{2}\s\d[a-zA-Z]{2})|'
         r'([a-zA-Z]{3}\s\d{4})|'
         r'([a-zA-Z]{4}\s1ZZ)|'
         r'([a-zA-Z]{2}\d{1,2}(-\d{4})?)|'
         r'(\d{5}\sCEDEX(\s\d{1,2})?)'
     )

     # Alternation has the lowest precedence, so a bare ``^a|b|c$`` anchors
     # only the first alternative at the start and only the last one at the
     # end. The non-capturing group makes both anchors apply to every
     # alternative while leaving the capture group numbers unchanged.
     postal_regex = re.compile(r'^(?:' + alternatives + r')$')
     registry.set('ZCP_postal_code_regex', postal_regex)
Пример #2
0
    def bootstrap(config):
        """
        Trains the bayes classifier with examples
        from various programming languages

        Every member of ``trainers.zip`` (located in this module's
        directory) is read; the part of the member name before the first
        dot is used as the language label. The trained classifier is
        stored in the registry under ``PP_bayes``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        classifier = simplebayes.SimpleBayes(
            ProgrammingBayesianClassifier.bayes_tokenizer
        )

        directory = os.path.dirname(os.path.abspath(__file__))
        archive_path = os.path.join(directory, 'trainers.zip')

        # Collected first so that members sharing a language label collapse
        # to a single sample (the last one read) before training.
        trainers = {}

        # The context manager closes the archive even if a read fails.
        with zipfile.ZipFile(archive_path, 'r') as trainer_zip:
            for filename in trainer_zip.namelist():
                language = filename.split('.')[0]
                trainers[language] = trainer_zip.read(filename)

        for language, sample in trainers.items():
            classifier.train(language, sample)

        registry.set('PP_bayes', classifier)
Пример #3
0
    def bootstrap(config):
        """
        Compiles the postal code pattern and stores it in the registry

        The compiled pattern is registered under ``ZCP_postal_code_regex``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        # One alternative per accepted postal code layout.
        alternatives = (
            r'(\d{2,7}(-\d{2,4})?)|'
            r'([a-zA-Z]\d{3})|'
            r'([a-zA-Z]{2}\s\d{2})|'
            r'([a-zA-Z]{2}-\d{2})|'
            r'(AD\d{3})|'
            r'(\d{3}\s\d{2})|'
            r'([a-zA-Z]{2}\d{4})|'
            r'(\d{4}\sW3)|'
            r'(\d{4}\s[a-zA-Z]{2})|'
            r'([a-zA-Z]\d[a-zA-Z]\s\d[a-zA-Z]\d)|'
            r'(AZ\s\d{4})|'
            r'(BB\d{1,5})|'
            r'([a-zA-Z]{2}\d{1,2}\s\d[a-zA-Z]{2})|'
            r'(JMA[a-zA-Z]{2}\d{2})|'
            r'(AZ-\d{4})|'
            r'([a-zA-Z]\d{4}[a-zA-Z]{3})|'
            r'([a-zA-Z]{2}\d{2}\s\d[a-zA-Z]{2})|'
            r'([a-zA-Z]{3}\s\d{4})|'
            r'([a-zA-Z]{4}\s1ZZ)|'
            r'([a-zA-Z]{2}\d{1,2}(-\d{4})?)|'
            r'(\d{5}\sCEDEX(\s\d{1,2})?)'
        )

        # Alternation has the lowest precedence, so a bare ``^a|b|c$``
        # anchors only the first alternative at the start and only the last
        # one at the end. The non-capturing group makes both anchors apply
        # to every alternative while leaving the capture group numbers
        # unchanged.
        postal_regex = re.compile(r'^(?:' + alternatives + r')$')
        registry.set('ZCP_postal_code_regex', postal_regex)
Пример #4
0
    def bootstrap(config):
        """
        Trains the bayes classifier with examples
        from various programming languages

        Every member of ``trainers.zip`` (located in this module's
        directory) is read; the part of the member name before the first
        dot is used as the language label. The trained classifier is
        stored in the registry under ``PP_bayes``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        classifier = simplebayes.SimpleBayes(
            ProgrammingBayesianClassifier.bayes_tokenizer
        )

        directory = os.path.dirname(os.path.abspath(__file__))
        archive_path = os.path.join(directory, 'trainers.zip')

        # Collected first so that members sharing a language label collapse
        # to a single sample (the last one read) before training.
        trainers = {}

        # The context manager closes the archive even if a read fails.
        with zipfile.ZipFile(archive_path, 'r') as trainer_zip:
            for filename in trainer_zip.namelist():
                language = filename.split('.')[0]
                trainers[language] = trainer_zip.read(filename)

        for language, sample in trainers.items():
            classifier.train(language, sample)

        registry.set('PP_bayes', classifier)
Пример #5
0
    def bootstrap(config):
        """
        Registers the pattern used to detect a leading "the "

        The pattern is compiled case-insensitively and stored in the
        registry under ``LP_the_regex``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        registry.set(
            'LP_the_regex',
            re.compile('^the ', flags=re.IGNORECASE)
        )
Пример #6
0
    def bootstrap(config):
        """
        Registers the compiled email address pattern

        ``VALID_ADDRESS_REGEXP`` is compiled once and stored in the
        registry under ``EP_valid_regex``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        registry.set('EP_valid_regex', re.compile(VALID_ADDRESS_REGEXP))
Пример #7
0
    def bootstrap(config):
        """
        Registers the pattern matching a single uppercase ASCII letter

        The compiled pattern is stored in the registry under
        ``NP_upper_alpha_regex``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        registry.set('NP_upper_alpha_regex', re.compile('[A-Z]'))
Пример #8
0
    def bootstrap(config):
        """
        Registers the character class used as a split pattern

        The compiled pattern is stored in the registry under
        ``AP_split_regex``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        # Inside the class, ",-/" is a range that covers the four
        # characters , - . and / (so the hyphen is matched via the range).
        registry.set('AP_split_regex', re.compile(r"[\s#`'?.;,-/]"))
Пример #9
0
    def bootstrap(config):
        """
        Registers the pattern used to detect a leading "the "

        The pattern is compiled case-insensitively and stored in the
        registry under ``LP_the_regex``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        registry.set(
            'LP_the_regex',
            re.compile('^the ', flags=re.IGNORECASE)
        )
Пример #10
0
    def bootstrap(config):
        """
        Registers the character class used as a split pattern

        The compiled pattern is stored in the registry under
        ``AP_split_regex``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        # Inside the class, ",-/" is a range that covers the four
        # characters , - . and / (so the hyphen is matched via the range).
        registry.set('AP_split_regex', re.compile(r"[\s#`'?.;,-/]"))
Пример #11
0
    def bootstrap(config):
        """
        Registers the pattern matching a single uppercase ASCII letter

        The compiled pattern is stored in the registry under
        ``NP_upper_alpha_regex``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        registry.set('NP_upper_alpha_regex', re.compile('[A-Z]'))
Пример #12
0
    def test_flush(self):
        """flush() leaves the registry storage empty"""
        registry.set('test', 'foo')

        # The value must be present before flushing.
        stored_value = registry.get('test')
        self.assertEqual('foo', stored_value)
        size_before = len(registry.storage)
        self.assertNotEqual(0, size_before)

        registry.flush()

        size_after = len(registry.storage)
        self.assertEqual(0, size_after)
Пример #13
0
    def get_preposition_literals():
        """
        Returns the prepositions parser, building it on first use

        The parser is an ``Or`` of one ``CaselessLiteral`` per entry
        returned by ``DataHandler().get_prepositions()`` and is kept in the
        registry under ``DP_prepositions``.
        """
        cache_key = 'DP_prepositions'

        if not registry.test(cache_key):
            words = DataHandler().get_prepositions()
            literals = Or([CaselessLiteral(word) for word in words])
            registry.set(cache_key, literals)
            return literals

        return registry.get(cache_key)
Пример #14
0
    def test_flush(self):
        """flush() leaves the registry storage empty"""
        registry.set('test', 'foo')

        # The value must be present before flushing.
        stored_value = registry.get('test')
        self.assertEqual('foo', stored_value)
        size_before = len(registry.storage)
        self.assertNotEqual(0, size_before)

        registry.flush()

        size_after = len(registry.storage)
        self.assertEqual(0, size_after)
Пример #15
0
    def get_preposition_literals():
        """
        Returns the prepositions parser, building it on first use

        The parser is an ``Or`` of one ``CaselessLiteral`` per entry
        returned by ``DataHandler().get_prepositions()`` and is kept in the
        registry under ``DP_prepositions``.
        """
        cache_key = 'DP_prepositions'

        if not registry.test(cache_key):
            words = DataHandler().get_prepositions()
            literals = Or([CaselessLiteral(word) for word in words])
            registry.set(cache_key, literals)
            return literals

        return registry.get(cache_key)
Пример #16
0
    def get_prepositions(self):
        """
        Returns the prepositions data, loading it from disk on first use

        The parsed content of ``prepositions.yaml`` is cached in the
        registry under ``DATA_prepositions``; later calls return the
        cached value without touching the file.

        :return: the parsed content of ``prepositions.yaml``
        """
        if registry.test('DATA_prepositions'):
            return registry.get('DATA_prepositions')

        handle = self.get_file_handle('prepositions.yaml')
        try:
            # NOTE(review): yaml.load is called without an explicit Loader.
            # PyYAML 5.1+ warns about this and PyYAML 6 requires the
            # argument; consider yaml.safe_load if the file only holds
            # plain data - confirm against the pinned PyYAML version.
            prepositions = yaml.load(handle)
        finally:
            # Release the handle even when parsing raises.
            handle.close()

        registry.set('DATA_prepositions', prepositions)
        return prepositions
Пример #17
0
    def bootstrap(config):
        """
        This method is statically called to bootstrap a parser

        Compiles four coordinate patterns and stores them in the registry
        under ``CP_coord_regex``, ``CP_deg_regex``, ``CP_deg_min_regex``
        and ``CP_deg_min_sec_regex``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        # Will test if something matches regular coordinates
        # 34.56,23.65 or 34.56 23.65 or 34.56 , 23.65
        # NOTE(review): in the second capture group the fractional part is
        # mandatory while the group as a whole is optional, unlike the first
        # group where only the fraction is optional. A second number without
        # a fraction therefore does not match, and the second number may be
        # missing entirely. Confirm whether the trailing "?" was meant to
        # sit inside the group.
        coord_regex = re.compile(r'^(-?\d{1,3}(?:\.\d+)?)' +
                                 r'(?:(?:(?:\s+)?,(?:\s+)?)|(?:\s+))' +
                                 r'(-?\d{1,3}(?:\.\d+))?$')
        registry.set('CP_coord_regex', coord_regex)

        # NOTE(review): the patterns below are passed to u() as non-raw
        # string literals, so the regex escapes rely on Python leaving
        # unrecognised backslash sequences untouched; newer Python versions
        # warn about such literals.

        # Will test if something matches degree coordinates
        # 40.244° N 79.123° W
        # Group 1 is the N/S part, group 2 the W/E part; the degree sign is
        # optional and the fractional digits are required.
        deg_regex = re.compile(
            u('^(\d{1,3}\.\d+°?\s+[nNsS])') + u('\s+') +
            u('(\d{1,3}\.\d+°?\s+[wWeE])$'))
        registry.set('CP_deg_regex', deg_regex)

        # Will test if something matches deg/min coordinates
        # 13° 34.425' N 45° 37.983' W
        # Degree sign and minute mark are optional; minutes need a fraction.
        deg_min_regex = re.compile(
            u('^(\d{1,3}°?\s+\d{1,3}\.\d+\'?\s+[nNsS])') + u('\s+') +
            u('(\d{1,3}°?\s+\d{1,3}\.\d+\'?\s+[wWeE])$'))
        registry.set('CP_deg_min_regex', deg_min_regex)

        # Will test if something matches deg/min/sec coordinates
        # 40° 26' 46.56" N 79° 58' 56.88" W
        # Degree, minute and second marks are optional; the fraction on the
        # seconds is optional as well.
        deg_min_sec_regex = re.compile(
            u('^(\d{1,3}°?\s+\d{1,3}\'?\s+\d{1,3}(?:\.\d+)?"?\s+[nNsS])') +
            u('\s+') +
            u('(\d{1,3}°?\s+\d{1,3}\'?\s+\d{1,3}(?:\.\d+)?"?\s+[wWeE])$'))
        registry.set('CP_deg_min_sec_regex', deg_min_sec_regex)
Пример #18
0
    def get_prepositions(self):
        """
        Returns the prepositions data, loading it from disk on first use

        The parsed content of ``prepositions.yaml`` is cached in the
        registry under ``DATA_prepositions``; later calls return the
        cached value without touching the file.

        :return: list of prepositions
        :rtype: list (presumably; the actual type depends on the YAML file)
        """
        if registry.test('DATA_prepositions'):
            return registry.get('DATA_prepositions')

        handle = self.get_file_handle('prepositions.yaml')
        try:
            # NOTE(review): yaml.load is called without an explicit Loader.
            # PyYAML 5.1+ warns about this and PyYAML 6 requires the
            # argument; consider yaml.safe_load if the file only holds
            # plain data - confirm against the pinned PyYAML version.
            prepositions = yaml.load(handle)
        finally:
            # Release the handle even when parsing raises.
            handle.close()

        registry.set('DATA_prepositions', prepositions)
        return prepositions
Пример #19
0
    def bootstrap(config):
        """
        Loads keyword data from the language yaml files on disk

        Every ``languages/*.yaml`` file in this module's directory is
        parsed. The set of all keywords is stored in the registry under
        ``PP_all_keywords`` and the parsed documents, keyed by their
        ``id``, under ``PP_language_keywords``. Finally the bayesian
        classifier is bootstrapped with the same config.

        :param config: passed on to ProgrammingBayesianClassifier.bootstrap
        """
        here = os.path.dirname(os.path.abspath(__file__))
        yaml_pattern = os.path.join(here, "languages/*.yaml")

        keyword_pool = []
        languages_by_id = {}

        for yaml_path in glob.glob(yaml_pattern):
            with open(yaml_path, 'r') as yaml_file:
                # NOTE(review): yaml.load without an explicit Loader warns
                # on PyYAML 5.1+ and fails on PyYAML 6 - confirm the pinned
                # version.
                definition = yaml.load(yaml_file)
            keyword_pool += definition['keywords']
            languages_by_id[definition['id']] = definition

        registry.set('PP_all_keywords', set(keyword_pool))
        registry.set('PP_language_keywords', languages_by_id)

        ProgrammingBayesianClassifier.bootstrap(config)
Пример #20
0
    def bootstrap(config):
        """
        Builds the timedelta phrase parsers and registers them

        Two parsers are stored in the registry:
        ``DP_pre_timedelta_phrases`` and ``DP_post_timedelta_phrases``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        # Plural forms first, followed by their singular counterparts.
        time_scales = (
            'microseconds', 'milliseconds', 'seconds', 'minutes',
            'hours', 'days', 'weeks', 'years',
            'microsecond', 'millisecond', 'second', 'minute',
            'hour', 'day', 'week', 'year',
        )

        # <number> <timescale> <preposition>
        # e.g. "3 seconds until" / "50 seconds since", followed by a word
        # made of letters, digits and a few punctuation characters.
        pre_literals = [
            DateParser.create_pre_timedelta_literal(scale)
            for scale in time_scales
        ]
        trailing_text = Word(alphas + nums + " .,;-/'")
        registry.set(
            'DP_pre_timedelta_phrases',
            Or(pre_literals) + trailing_text
        )

        # <operator> <number> <timescale>
        # e.g. "plus 5 hours" / "- 17 days"
        post_literals = [
            DateParser.create_post_timedelta_literal(scale)
            for scale in time_scales
        ]
        registry.set('DP_post_timedelta_phrases', Or(post_literals))
Пример #21
0
    def bootstrap(config):
        """
        Compiles the postal code pattern and stores it in the registry

        The compiled pattern is registered under ``ZCP_postal_code_regex``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        # One alternative per accepted postal code layout.
        alternatives = (
            r'(\d{2,7}(-\d{2,4})?)|'
            r'([a-zA-Z]\d{3})|'
            r'([a-zA-Z]{2}\s\d{2})|'
            r'([a-zA-Z]{2}-\d{2})|'
            r'(AD\d{3})|'
            r'(\d{3}\s\d{2})|'
            r'([a-zA-Z]{2}\d{4})|'
            r'(\d{4}\sW3)|'
            r'(\d{4}\s[a-zA-Z]{2})|'
            r'([a-zA-Z]\d[a-zA-Z]\s\d[a-zA-Z]\d)|'
            r'(AZ\s\d{4})|'
            r'(BB\d{1,5})|'
            r'([a-zA-Z]{2}\d{1,2}\s\d[a-zA-Z]{2})|'
            r'(JMA[a-zA-Z]{2}\d{2})|'
            r'(AZ-\d{4})|'
            r'([a-zA-Z]\d{4}[a-zA-Z]{3})|'
            r'([a-zA-Z]{2}\d{2}\s\d[a-zA-Z]{2})|'
            r'([a-zA-Z]{3}\s\d{4})|'
            r'([a-zA-Z]{4}\s1ZZ)|'
            r'([a-zA-Z]{2}\d{1,2}(-\d{4})?)|'
            r'(\d{5}\sCEDEX(\s\d{1,2})?)'
        )

        # Alternation has the lowest precedence, so a bare ``^a|b|c$``
        # anchors only the first alternative at the start and only the last
        # one at the end. The non-capturing group makes both anchors apply
        # to every alternative while leaving the capture group numbers
        # unchanged.
        postal_regex = re.compile(r'^(?:' + alternatives + r')$')
        registry.set('ZCP_postal_code_regex', postal_regex)
Пример #22
0
    def bootstrap(config):
        """
        Builds the timedelta phrase parsers and registers them

        Two parsers are stored in the registry:
        ``DP_pre_timedelta_phrases`` and ``DP_post_timedelta_phrases``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        # Plural forms first, followed by their singular counterparts.
        time_scales = (
            'microseconds', 'milliseconds', 'seconds', 'minutes',
            'hours', 'days', 'weeks', 'years',
            'microsecond', 'millisecond', 'second', 'minute',
            'hour', 'day', 'week', 'year',
        )

        # <number> <timescale> <preposition>
        # e.g. "3 seconds until" / "50 seconds since", followed by a word
        # made of letters, digits and a few punctuation characters.
        pre_literals = [
            DateParser.create_pre_timedelta_literal(scale)
            for scale in time_scales
        ]
        trailing_text = Word(alphas + nums + " .,;-/'")
        registry.set(
            'DP_pre_timedelta_phrases',
            Or(pre_literals) + trailing_text
        )

        # <operator> <number> <timescale>
        # e.g. "plus 5 hours" / "- 17 days"
        post_literals = [
            DateParser.create_post_timedelta_literal(scale)
            for scale in time_scales
        ]
        registry.set('DP_post_timedelta_phrases', Or(post_literals))
Пример #23
0
    def bootstrap(config):
        """
        Loads keyword data from the language yaml files on disk

        Every ``languages/*.yaml`` file in this module's directory is
        parsed. The set of all keywords is stored in the registry under
        ``PP_all_keywords`` and the parsed documents, keyed by their
        ``id``, under ``PP_language_keywords``. Finally the bayesian
        classifier is bootstrapped with the same config.

        :param config: cahoots config, passed on to
            ProgrammingBayesianClassifier.bootstrap
        :type config: cahoots.config.BaseConfig
        """
        here = os.path.dirname(os.path.abspath(__file__))
        yaml_pattern = os.path.join(here, "languages/*.yaml")

        keyword_pool = []
        languages_by_id = {}

        for yaml_path in glob.glob(yaml_pattern):
            with open(yaml_path, "r") as yaml_file:
                # NOTE(review): yaml.load without an explicit Loader warns
                # on PyYAML 5.1+ and fails on PyYAML 6 - confirm the pinned
                # version.
                definition = yaml.load(yaml_file)
            keyword_pool += definition["keywords"]
            languages_by_id[definition["id"]] = definition

        registry.set("PP_all_keywords", set(keyword_pool))
        registry.set("PP_language_keywords", languages_by_id)

        ProgrammingBayesianClassifier.bootstrap(config)
Пример #24
0
    def bootstrap(config):
        """
        Loads keyword data from the language yaml files on disk

        Every ``languages/*.yaml`` file in this module's directory is
        parsed. The set of all keywords is stored in the registry under
        ``PP_all_keywords`` and the parsed documents, keyed by their
        ``id``, under ``PP_language_keywords``. Finally the bayesian
        classifier is bootstrapped with the same config.

        :param config: cahoots config, passed on to
            ProgrammingBayesianClassifier.bootstrap
        :type config: cahoots.config.BaseConfig
        """
        here = os.path.dirname(os.path.abspath(__file__))
        yaml_pattern = os.path.join(here, "languages/*.yaml")

        keyword_pool = []
        languages_by_id = {}

        for yaml_path in glob.glob(yaml_pattern):
            with open(yaml_path, 'r') as yaml_file:
                # NOTE(review): yaml.load without an explicit Loader warns
                # on PyYAML 5.1+ and fails on PyYAML 6 - confirm the pinned
                # version.
                definition = yaml.load(yaml_file)
            keyword_pool += definition['keywords']
            languages_by_id[definition['id']] = definition

        registry.set('PP_all_keywords', set(keyword_pool))
        registry.set('PP_language_keywords', languages_by_id)

        ProgrammingBayesianClassifier.bootstrap(config)
Пример #25
0
    def bootstrap(config):
        """
        This method is statically called to bootstrap a parser

        Compiles four coordinate patterns and stores them in the registry
        under ``CP_coord_regex``, ``CP_deg_regex``, ``CP_deg_min_regex``
        and ``CP_deg_min_sec_regex``.

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        # Will test if something matches regular coordinates
        # 34.56,23.65 or 34.56 23.65 or 34.56 , 23.65
        # NOTE(review): in the second capture group the fractional part is
        # mandatory while the group as a whole is optional, unlike the first
        # group where only the fraction is optional. A second number without
        # a fraction therefore does not match, and the second number may be
        # missing entirely. Confirm whether the trailing "?" was meant to
        # sit inside the group.
        coord_regex = re.compile(
            r'^(-?\d{1,3}(?:\.\d+)?)' +
            r'(?:(?:(?:\s+)?,(?:\s+)?)|(?:\s+))' +
            r'(-?\d{1,3}(?:\.\d+))?$'
        )
        registry.set('CP_coord_regex', coord_regex)

        # NOTE(review): the patterns below are passed to u() as non-raw
        # string literals, so the regex escapes rely on Python leaving
        # unrecognised backslash sequences untouched; newer Python versions
        # warn about such literals.

        # Will test if something matches degree coordinates
        # 40.244° N 79.123° W
        # Group 1 is the N/S part, group 2 the W/E part; the degree sign is
        # optional and the fractional digits are required.
        deg_regex = re.compile(
            u('^(\d{1,3}\.\d+°?\s+[nNsS])') +
            u('\s+') +
            u('(\d{1,3}\.\d+°?\s+[wWeE])$')
        )
        registry.set('CP_deg_regex', deg_regex)

        # Will test if something matches deg/min coordinates
        # 13° 34.425' N 45° 37.983' W
        # Degree sign and minute mark are optional; minutes need a fraction.
        deg_min_regex = re.compile(
            u('^(\d{1,3}°?\s+\d{1,3}\.\d+\'?\s+[nNsS])') +
            u('\s+') +
            u('(\d{1,3}°?\s+\d{1,3}\.\d+\'?\s+[wWeE])$')
        )
        registry.set('CP_deg_min_regex', deg_min_regex)

        # Will test if something matches deg/min/sec coordinates
        # 40° 26' 46.56" N 79° 58' 56.88" W
        # Degree, minute and second marks are optional; the fraction on the
        # seconds is optional as well.
        deg_min_sec_regex = re.compile(
            u('^(\d{1,3}°?\s+\d{1,3}\'?\s+\d{1,3}(?:\.\d+)?"?\s+[nNsS])') +
            u('\s+') +
            u('(\d{1,3}°?\s+\d{1,3}\'?\s+\d{1,3}(?:\.\d+)?"?\s+[wWeE])$')
        )
        registry.set('CP_deg_min_sec_regex', deg_min_sec_regex)
Пример #26
0
    def bootstrap(config):
        """
        Loads unit lists for use in this instance of the measurement parser

        Every ``units/*.yaml`` file in this module's directory is parsed.
        The following entries are stored in the registry:

        * ``MP_units`` - maps each keyword to the id of its unit file
        * ``MP_systems`` - maps each unit id to a (system, type) tuple
        * ``MP_preposition_parser`` - a preposition followed by a word
        * ``MP_measurement_parser`` - a number, a unit keyword and an
          optional preposition phrase

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        units = {}
        systems = {}
        prepositions = DataHandler().get_prepositions()

        directory = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(directory, "units/*.yaml")

        for file_path in glob.glob(path):
            # The context manager closes each unit file once it is parsed.
            with open(file_path, 'r') as unit_file:
                # NOTE(review): yaml.load without an explicit Loader warns
                # on PyYAML 5.1+ and fails on PyYAML 6 - confirm the pinned
                # version.
                unit_type = yaml.load(unit_file)
            for unit in unit_type['keywords']:
                units[unit] = unit_type['id']
            systems[unit_type['id']] = \
                (unit_type['system'], unit_type['type'])

        preposition_parser = \
            Or([CaselessLiteral(s) for s in prepositions]) + Word(alphas)

        # Number part: digits, optional comma-separated groups, optional
        # decimal part and optional fraction, captured as original text.
        measurement_parser = \
            originalTextFor(
                Word(nums) +
                ZeroOrMore(',' + Word(nums+',')) +
                ZeroOrMore('.' + Word(nums)) +
                ZeroOrMore(Word(nums) + '/' + Word(nums))
            ) + \
            Or([CaselessLiteral(s) for s in units]) + \
            Optional(originalTextFor(preposition_parser))

        registry.set('MP_units', units)
        registry.set('MP_systems', systems)
        registry.set('MP_preposition_parser', preposition_parser)
        registry.set('MP_measurement_parser', measurement_parser)
Пример #27
0
    def bootstrap(config):
        """
        Loads unit lists for use in this instance of the measurement parser

        Every ``units/*.yaml`` file in this module's directory is parsed.
        The following entries are stored in the registry:

        * ``MP_units`` - maps each keyword to the id of its unit file
        * ``MP_systems`` - maps each unit id to a (system, type) tuple
        * ``MP_preposition_parser`` - a preposition followed by a word
        * ``MP_measurement_parser`` - a number, a unit keyword and an
          optional preposition phrase

        :param config: cahoots config (not referenced in this method)
        :type config: cahoots.config.BaseConfig
        """
        units = {}
        systems = {}
        prepositions = DataHandler().get_prepositions()

        directory = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(directory, "units/*.yaml")

        for file_path in glob.glob(path):
            # The context manager closes each unit file once it is parsed.
            with open(file_path, 'r') as unit_file:
                # NOTE(review): yaml.load without an explicit Loader warns
                # on PyYAML 5.1+ and fails on PyYAML 6 - confirm the pinned
                # version.
                unit_type = yaml.load(unit_file)
            for unit in unit_type['keywords']:
                units[unit] = unit_type['id']
            systems[unit_type['id']] = \
                (unit_type['system'], unit_type['type'])

        preposition_parser = \
            Or([CaselessLiteral(s) for s in prepositions]) + Word(alphas)

        # Number part: up to three leading digits, optional comma-separated
        # groups of exactly three digits, optional decimal part and optional
        # fraction, captured as original text.
        measurement_parser = \
            originalTextFor(
                Word(nums, max=3) +
                ZeroOrMore(',' + Word(nums, exact=3)) +
                ZeroOrMore('.' + Word(nums)) +
                ZeroOrMore(Word(nums) + '/' + Word(nums))
            ) + \
            Or([CaselessLiteral(s) for s in units]) + \
            Optional(originalTextFor(preposition_parser))

        registry.set('MP_units', units)
        registry.set('MP_systems', systems)
        registry.set('MP_preposition_parser', preposition_parser)
        registry.set('MP_measurement_parser', measurement_parser)
Пример #28
0
 def setUp(self):
     """Registers a SimpleBayesStub instance under 'PP_bayes'"""
     bayes_stub = SimpleBayesStub()
     registry.set('PP_bayes', bayes_stub)
Пример #29
0
    def test_set(self):
        """set() places the value into registry.storage under its key"""
        registry.set('test', 'foo')

        stored_value = registry.storage['test']
        self.assertEqual('foo', stored_value)
Пример #30
0
    def test_test(self):
        """test() is truthy for a stored key and falsy for an unknown one"""
        registry.set('test', 'foo')

        known_key_present = registry.test('test')
        self.assertTrue(known_key_present)

        unknown_key_present = registry.test('bar')
        self.assertFalse(unknown_key_present)
Пример #31
0
    def test_get(self):
        """get() returns the stored value, or None for an unknown key"""
        registry.set('test', 'foo')

        stored_value = registry.get('test')
        self.assertEqual('foo', stored_value)

        missing_value = registry.get('bar')
        self.assertIsNone(missing_value)
Пример #32
0
 def setUp(self):
     """Registers a SimpleBayesStub instance under 'PP_bayes'"""
     bayes_stub = SimpleBayesStub()
     registry.set('PP_bayes', bayes_stub)
Пример #33
0
 def bootstrap(config):
     """
     Registers the pattern used to detect a leading "the "

     The pattern is compiled case-insensitively and stored in the
     registry under ``LP_the_regex``.

     :param config: configuration object (not referenced in this function)
     """
     registry.set(
         'LP_the_regex',
         re.compile('^the ', flags=re.IGNORECASE)
     )
Пример #34
0
    def test_set(self):
        """set() places the value into registry.storage under its key"""
        registry.set('test', 'foo')

        stored_value = registry.storage['test']
        self.assertEqual('foo', stored_value)
Пример #35
0
    def test_get(self):
        """get() returns the stored value, or None for an unknown key"""
        registry.set('test', 'foo')

        stored_value = registry.get('test')
        self.assertEqual('foo', stored_value)

        missing_value = registry.get('bar')
        self.assertIsNone(missing_value)
Пример #36
0
    def test_test(self):
        """test() is truthy for a stored key and falsy for an unknown one"""
        registry.set('test', 'foo')

        known_key_present = registry.test('test')
        self.assertTrue(known_key_present)

        unknown_key_present = registry.test('bar')
        self.assertFalse(unknown_key_present)