Пример #1
0
    def test_empty_keyword(self):
        """Searching empty text in a tree holding only '' yields no match."""
        tree = KeywordTree()
        tree.add('')
        tree.finalize()

        self.assertIsNone(tree.search(''))
Пример #2
0
    def test_text_end_situation_2(self):
        """A short keyword ending exactly at the end of the text is found."""
        tree = KeywordTree()
        for keyword in ('blaaaaaf', 'la'):
            tree.add(keyword)
        tree.finalize()

        match = tree.search('bla')
        self.assertEqual(('la', 1), match)
Пример #3
0
 def test_state_to_string(self):
     """The root state of a finalized tree can be converted with str()."""
     tree = KeywordTree(case_insensitive=True)
     for name in ('peter', 'horst', 'gandalf', 'frodo'):
         tree.add(name)
     tree.finalize()
     self.assertIsNotNone(str(tree._zero_state))
Пример #4
0
def create_keywordtree(lst, s):
    """Search ``s`` for every keyword in ``lst``, ignoring case.

    :param lst: keywords to add to the tree
    :param s: text to search
    :return: the result of ``KeywordTree.search_all``, i.e. an iterable of
        (keyword, position) tuples
    """
    tree = KeywordTree(case_insensitive=True)
    for keyword in lst:
        tree.add(keyword)
    tree.finalize()
    # Hand back the (keyword, position) matches for the whole text.
    return tree.search_all(s)
Пример #5
0
    def test_simple_back_to_zero_state_example(self):
        """Searching 'blbabca' for 'ab' and 'bca' reports ('ab', 3) first."""
        tree = KeywordTree()
        for keyword in ('ab', 'bca'):
            tree.add(keyword)
        tree.finalize()

        self.assertEqual(('ab', 3), tree.search('blbabca'))
Пример #6
0
 def test_utility_calls(self):
     """repr() and str() of a finalized tree give non-empty output."""
     tree = KeywordTree(case_insensitive=True)
     for keyword in ('bla', 'blue'):
         tree.add(keyword)
     tree.finalize()
     # Smoke test only: both conversions must run without raising.
     self.assertGreater(len(repr(tree)), 0)
     self.assertGreater(len(str(tree)), 0)
Пример #7
0
def search(patterns, content):
    """Return the keywords from ``patterns`` found in ``content``.

    Matching is case-insensitive. The returned list holds the first element
    of every result yielded by ``KeywordTree.search_all``, in the order the
    results are produced; repeated matches are kept.
    """
    tree = KeywordTree(case_insensitive=True)
    for pattern in patterns:
        tree.add(pattern)
    tree.finalize()

    return [hit[0] for hit in tree.search_all(content)]
Пример #8
0
    def test_search_all_issue_1_similar(self):
        """search_all finds 'bar' in '/foo/bar' despite longer near-misses."""
        tree = KeywordTree(case_insensitive=True)
        for keyword in ('/bara', '/foo/barb', 'bar'):
            tree.add(keyword)
        tree.finalize()

        matches = tree.search_all('/foo/bar')

        self.assertEqual(('bar', 5), next(matches))
Пример #9
0
def aho_corasick_search(peptides: List[str], proteins: Dict[str, str]) -> List[Tuple[str, str]]:
    """Find, for each protein, the first peptide occurring in its sequence.

    :param peptides: keywords to look for (matched case-insensitively)
    :param proteins: mapping of protein key to protein sequence
    :return: one ``(protein_key, matched_peptide)`` tuple per protein in
        which ``KeywordTree.search`` found a match; proteins without a match
        are omitted
    """
    kwtree = KeywordTree(case_insensitive=True)
    for peptide in peptides:
        kwtree.add(peptide)
    kwtree.finalize()

    matches: List[Tuple[str, str]] = []
    for key, protein in tqdm(proteins.items()):
        match = kwtree.search(protein)
        # Identity comparison is the correct test for the "no match" None.
        if match is not None:
            matches.append((key, match[0]))
    return matches
Пример #10
0
    def test_readme_example(self):
        '''
        Mirrors the example shown in the project's README. If this test case
        has to change, update the README accordingly.
        '''
        tree = KeywordTree(case_insensitive=True)
        for keyword in ('malaga', 'lacrosse', 'mallorca', 'mallorca bella',
                        'orca'):
            tree.add(keyword)
        tree.finalize()

        first = tree.search('My favorite islands are malaga and sylt.')
        self.assertEqual(('malaga', 24), first)

        first = tree.search(
            'idontlikewhitespaceswhereismalacrossequestionmark')
        self.assertEqual(('lacrosse', 29), first)

        matches = tree.search_all('malheur on mallorca bellacrosse')
        self.assertIsNotNone(matches)
        expected_in_order = (
            ('mallorca', 11),
            ('orca', 15),
            ('mallorca bella', 11),
            ('lacrosse', 23),
        )
        for expected in expected_in_order:
            self.assertEqual(expected, next(matches))
        # The generator must be exhausted after the four matches above.
        with self.assertRaises(StopIteration):
            next(matches)
Пример #11
0
 def init_context(self):
     """Build one case-sensitive keyword tree per 'IN_' atomic operator.

     For every name in ``self.atomic_op`` that starts with 'IN_', the
     concept name is taken from the underscore-separated parts between the
     first and the last one, and a tree is built from
     ``self.concept_words[concept]``. The resulting mapping of operator name
     to tree is stored in ``self.concept_context``.
     """
     trees = {}
     for op_name in self.atomic_op:
         if not op_name.startswith('IN_'):
             # Other operators (marked "BEFORE" in the original) get no tree.
             continue
         concept = '_'.join(op_name.split('_')[1:-1])
         tree = KeywordTree(case_insensitive=False)
         for keyword in self.concept_words[concept]:
             tree.add(keyword)
         tree.finalize()
         trees[op_name] = tree
     self.concept_context = trees
def ahocorasick_all_match(text, keywords):
    """Count how many distinct keywords occur in ``text`` (case-insensitive).

    :param text: text to search
    :param keywords: keywords to add to the tree
    :return: number of different keywords reported by ``search_all``
    """
    tree = KeywordTree(case_insensitive=True)
    for keyword in keywords:
        tree.add(keyword)
    tree.finalize()

    # A list (not a set) is kept so that unhashable keywords still work.
    distinct = []
    for hit in tree.search_all(text):
        if hit[0] not in distinct:
            distinct.append(hit[0])

    return len(distinct)
Пример #13
0
 def searcher(self, filepath):
     """Build a case-insensitive keyword tree from a "word weight" file.

     Each line must consist of exactly two fields separated by a single
     space; otherwise the tuple unpacking raises ``ValueError``. Underscores
     in the word are replaced by spaces, and the weight is kept as the text
     before the first newline.

     :param filepath: path of the file to read
     :return: tuple ``(kwtree, kwtree_word, kwtree_weight)`` holding the
         finalized tree, the list of words and the list of weight strings
     """
     kwtree_word = []
     kwtree_weight = []
     # The context manager closes the file even when a malformed line raises.
     with open(filepath) as f:
         for line in f:
             word, weight = line.split(' ')
             kwtree_word.append(word.replace('_', ' '))
             kwtree_weight.append(weight.split('\n')[0])
     kwtree = KeywordTree(case_insensitive=True)
     for word in kwtree_word:
         kwtree.add(word)
     kwtree.finalize()
     return kwtree, kwtree_word, kwtree_weight
Пример #14
0
    def test_visualizer(self):
        """Draw the README example tree to 'readme_example.png'."""
        # Needs a working pygraphviz installation on the system.
        tree = KeywordTree(case_insensitive=True)
        for keyword in ('malaga', 'lacrosse', 'mallorca', 'mallorca bella',
                        'orca'):
            tree.add(keyword)
        tree.finalize()

        Visualizer().draw('readme_example.png', tree)
Пример #15
0
    def test_simple(self):
        """Basic hits and misses for a tree holding 'bla' and 'blue'."""
        tree = KeywordTree()
        for keyword in ('bla', 'blue'):
            tree.add(keyword)
        tree.finalize()

        # A bare prefix, empty text and unrelated text give no match.
        self.assertIsNone(tree.search('bl'))
        self.assertIsNone(tree.search(''))
        self.assertIsNone(tree.search('zef'))

        # Matches are reported as (keyword, start offset).
        self.assertEqual(('bla', 0), tree.search('blaaaa'))
        self.assertEqual(('blue', 10), tree.search('red green blue grey'))
Пример #16
0
    def test_suffix_stuff(self):
        """'aaaamen' is found although the text starts like other keywords."""
        tree = KeywordTree()
        for keyword in ('blaaaaaf', 'bluez', 'aaaamen', 'uebergaaat'):
            tree.add(keyword)
        tree.finalize()

        match = tree.search('blaaaaamentada')
        self.assertEqual(('aaaamen', 3), match)

        match = tree.search('clueuebergaaameblaaaamenbluez')
        self.assertEqual(('aaaamen', 17), match)
Пример #17
0
    def test_pickling_simple(self):
        """A tree passed through dumps()/loads() still finds its keywords."""
        original = KeywordTree(case_insensitive=True)
        for name in ('peter', 'horst', 'gandalf', 'frodo'):
            original.add(name)
        original.finalize()

        serialized = dumps(original)
        self.assertIsNotNone(serialized)

        restored = loads(serialized)
        self.assertIsNotNone(restored)

        matches = restored.search_all(
            'Gollum did not like frodo. But gandalf did.')

        self.assertEqual(('frodo', 20), next(matches))
        self.assertEqual(('gandalf', 31), next(matches))
Пример #18
0
    def test_many_keywords(self):
        """Search a text blob with a keyword list read from the test data."""
        tree = KeywordTree(case_insensitive=True)
        with open('tests/data/names.txt') as names_file:
            keywords = [line.strip() for line in names_file.readlines()]

        for keyword in keywords:
            tree.add(keyword)

        tree.finalize()
        with open('tests/data/textblob.txt') as text_file:
            blob = text_file.read()

        expected = ('Dawn Higgins', 34153)
        self.assertEqual(expected, tree.search(blob))

        # search_all must yield the same single match and then stop.
        matches = tree.search_all(blob)
        self.assertIsNotNone(matches)
        self.assertEqual(expected, next(matches))
        with self.assertRaises(StopIteration):
            next(matches)
Пример #19
0
class Minus_words:
    """Drop entries that contain any of a set of "minus" words.

    The minus words are stored in a case-insensitive keyword tree; an entry
    is rejected when the tree finds any of them inside it.
    """

    def __init__(self, filename, minus_words):
        """Store the entries to filter and build the keyword tree.

        :param filename: iterable of entries to check; it is mapped over
            element by element in :meth:`minus`
        :param minus_words: keywords whose presence rejects an entry
        """
        self.filename = filename
        self.tree = KeywordTree(case_insensitive=True)
        for word in minus_words:
            self.tree.add(word)
        self.tree.finalize()

    def minus(self):
        """Check all entries on a thread pool and return the survivors.

        :return: ``filter`` result over the per-entry outcomes, i.e. the
            entries without any minus word; falsy entries such as empty
            strings are dropped as well
        """
        self.pool = ThreadPool()
        try:
            self.pre_result = self.pool.map(self._minus_function, self.filename)
        finally:
            # Always release the worker threads, even when a lookup raised.
            self.pool.close()
            self.pool.join()
        self.result = filter(None, self.pre_result)
        return self.result

    def _minus_function(self, word):
        """Return ``word`` unless it contains a minus word, else ``None``."""
        return None if self.tree.search(word) else word
Пример #20
0
class Blacklist:
    """Decide whether a directory name or directory path is blacklisted."""

    def __init__(self, names: typing.List[str], path_parts: typing.List[str]):
        """Store both blacklists and pick the path lookup implementation.

        :param names: exact directory names that are blacklisted
        :param path_parts: path fragments that blacklist any path containing
            them
        """
        self.names = names
        self.path_parts = path_parts

        # Default lookup: plain substring scan. Replaced below when the
        # optional ahocorapy package can be imported.
        self.tree = None
        self.lookup = self.is_blacklisted_part

        try:
            # Prefer the Aho-Corasick implementation when it is installed.
            # NOTE(review): this tree is case-insensitive while the fallback
            # scan is case-sensitive - confirm which behaviour is intended.
            from ahocorapy.keywordtree import KeywordTree  # type: ignore

            self.tree = KeywordTree(case_insensitive=True)
            for fragment in self.path_parts:
                self.tree.add(fragment)
            self.tree.finalize()

            self.lookup = self.is_blacklisted_part_aho
        except ImportError:
            pass

    def is_blacklisted(self, dir_name: str, dir_path: str) -> bool:
        """Return True for a blacklisted name or a path with a listed part."""
        # The exact name check comes first; the path check (e.g. for
        # "util/cmake") only runs when the name itself is not listed.
        return dir_name in self.names or self.lookup(dir_path)

    def is_blacklisted_part(self, dir_path: str) -> bool:
        """Substring-scan fallback: is any path part contained in the path?"""
        return any(fragment in dir_path for fragment in self.path_parts)

    def is_blacklisted_part_aho(self, dir_path: str) -> bool:
        """Keyword-tree lookup: does the tree find any part in the path?"""
        found = self.tree.search(dir_path)  # type: ignore
        return found is not None
Пример #21
0
    def _load_kw_trees(self) -> List[KeywordTree]:
        """Load keyword trees for the dictionary terms.

        The name of each dictionary file corresponds to the number of tokens
        in the terms it contains. The files are processed in reverse order of
        the list below and one tree is built per file. Every non-empty line
        is split on whitespace and added to the tree as a list of tokens.

        :return: list of finalized keyword trees
        """
        fnames = [
            '1.txt', '2.txt', '3.txt', '4.txt', '5.txt', '6.txt', '7.txt',
            '8.txt', '9.txt', '10.txt', '11.txt', '12.txt', '13.txt', '14.txt',
            '20.txt'
        ]
        terms_dir = os.path.join(DICT_EXTRACTOR_PATH, TERMS_DIR_NAME)
        trees = []
        for fname in reversed(fnames):
            tree = KeywordTree()
            with open(os.path.join(terms_dir, fname), 'r') as f:
                for line in f.read().split('\n'):
                    if line == '':
                        continue
                    tree.add(line.split())
                tree.finalize()
                trees.append(tree)
        return trees
Пример #22
0
    def test_domains(self):
        """Domain-like keywords are found inside a longer text."""
        tree = KeywordTree()
        for domain in ('searchenginemarketingfordummies.com', 'linkpt.com',
                       'fnbpeterstown.com'):
            tree.add(domain)
        tree.finalize()

        # NOTE(review): the searched text looks redacted and does not contain
        # 'linkpt.com' as written - confirm against the upstream test.
        match = tree.search('*****@*****.**')
        self.assertEqual(('linkpt.com', 10), match)
Пример #23
0
    def test_finalize_errors(self):
        """Misusing finalize() raises ValueError in all three directions."""
        # Searching before the tree has been finalized.
        unfinalized = KeywordTree(case_insensitive=True)
        unfinalized.add('bla')
        unfinalized.add('blue')
        with self.assertRaises(ValueError):
            unfinalized.search('blueb')

        # Adding a keyword after finalization.
        finalized = KeywordTree(case_insensitive=True)
        finalized.add('bla')
        finalized.finalize()
        with self.assertRaises(ValueError):
            finalized.add('blueb')

        # Finalizing a second time.
        finalized = KeywordTree(case_insensitive=True)
        finalized.add('bla')
        finalized.finalize()
        with self.assertRaises(ValueError):
            finalized.finalize()
Пример #24
0
    def test_case_insensitivity_mode(self):
        """Case-insensitive trees match any casing, return the keyword as added."""
        tree = KeywordTree(case_insensitive=True)
        for keyword in ('bla', 'blue', 'blISs'):
            tree.add(keyword)
        tree.finalize()

        self.assertEqual(('bla', 0), tree.search('bLa'))
        self.assertEqual(('blISs', 0), tree.search('BLISS'))
Пример #25
0
    def test_unicode(self):
        """Non-ASCII keywords are matched with character-based offsets."""
        tree = KeywordTree()
        for keyword in ('bla', 'blue', u'颜到'):
            tree.add(keyword)
        tree.finalize()

        match = tree.search(u'春华变苍颜到处群魔乱')
        self.assertEqual((u'颜到', 4), match)

        self.assertIsNone(tree.search(u'三年过'))
Пример #26
0
    def test_case_sensitivity(self):
        """A default tree only matches text with exactly the same casing."""
        tree = KeywordTree()
        for keyword in ('bla', 'blue', 'blISs'):
            tree.add(keyword)
        tree.finalize()

        # Differently cased variants must not match.
        self.assertIsNone(tree.search('bLa'))
        self.assertIsNone(tree.search('BLISS'))
        self.assertIsNone(tree.search('bliss'))

        # The exact casing does.
        self.assertEqual(('blISs', 0), tree.search('blISs'))
Пример #27
0
def init_ahocorapy():
    """Build and finalize a KeywordTree from the module-level ``keyword_list``.

    :return: the finalized tree
    """
    tree = KeywordTree()
    for kw in keyword_list:
        tree.add(kw)
    tree.finalize()
    return tree
Пример #28
0
class SymbolExtractor:
    """
    Finds the ticker symbols mentioned in a reddit submission.
    Matching uses an Aho-Corasick keyword tree built from a ticker file.
    """

    __tickers: DataFrame
    __searchTree: KeywordTree

    def __init__(self, ticker_file: str):
        """
        Load the tickers and build the search tree

        :param ticker_file: Path to a tab-separated file with a Ticker column listing all relevant tickers
        """
        self.__tickers = pd.read_csv(ticker_file, sep="\t")
        self.__create_search_tree()

    def extract_symbols(self, submission: Submission) -> List[str]:
        """
        Collects the tickers found in the title and the text of a submission

        :param submission: to be searched
        :return: list of found tickers without duplicates (order is not guaranteed)
        """
        found: List[str] = self.__extract_symbols_from_title(submission)
        found += self.__extract_symbols_from_self_text(submission)
        return self.__remove_duplicates(found)

    def __extract_symbols_from_title(self, submission: Submission) -> List[str]:
        """
        Searches the title of a submission

        :param submission: to be searched
        :return: list of all found tickers
        """
        return self.find_symbols_in_text(submission.title)

    def __extract_symbols_from_self_text(self, submission: Submission) -> List[str]:
        """
        Searches the text of a submission, if it has one

        :param submission: to be searched
        :return: list of all found tickers, empty when there is no self_text attribute
        """
        # NOTE(review): only an attribute named "self_text" is consulted -
        # confirm this matches the attribute name of the Submission type.
        if not hasattr(submission, "self_text"):
            return []
        return self.find_symbols_in_text(submission.self_text)

    def find_symbols_in_text(self, text: str) -> List[str]:
        """
        Searches an arbitrary text for tickers

        :param text: to be searched
        :return: List of all found tickers, one entry per match
        """
        return [ticker for ticker, _position in self.__searchTree.search_all(text)]

    def __create_search_tree(self):
        """
        Fills and finalizes the search tree with the Ticker column of __tickers
        """
        self.__searchTree = KeywordTree()
        for ticker in self.__tickers.Ticker:
            self.__searchTree.add(ticker)
        self.__searchTree.finalize()

    @staticmethod
    def __remove_duplicates(symbols: List[str]):
        """
        Drops repeated entries; the order of the result is not guaranteed
        """
        unique = set(symbols)
        return list(unique)
Пример #29
0
class Monitor(threading.Thread):
    """Continuously scan for BLE advertisements.

    A thread that opens a bluetooth device, enables LE scanning and passes
    every received advertisement that matches the configured filters to a
    callback.
    """

    def __init__(self, callback, bt_device_id, device_filter, packet_filter, scan_parameters):
        """Construct interface object.

        Args:
            callback: called as callback(bt_addr, rssi, packet, properties)
                for every advertisement that passes the filters
            bt_device_id: number of the bluetooth device (hciX)
            device_filter: filters describing the beacons to monitor, or None
            packet_filter: packet types to monitor, or None
            scan_parameters: keyword arguments forwarded to
                set_scan_parameters() when the thread starts running
        """
        # do import here so that the package can be used in parsing-only mode (no bluez required)
        self.backend = import_module('beacontools.backend')

        threading.Thread.__init__(self)
        self.daemon = False
        self.keep_going = True
        self.callback = callback

        # number of the bt device (hciX)
        self.bt_device_id = bt_device_id
        # list of beacons to monitor
        self.device_filter = device_filter
        self.mode = get_mode(device_filter)
        # list of packet types to monitor
        self.packet_filter = packet_filter
        # bluetooth socket
        self.socket = None
        # keep track of Eddystone Beacon <-> bt addr mapping
        self.eddystone_mappings = []
        # parameters to pass to bt device
        self.scan_parameters = scan_parameters
        # hci version
        self.hci_version = HCIVersion.BT_CORE_SPEC_1_0

        # construct an aho-corasick search tree for efficient prefiltering
        # (one byte pattern is added per beacon type enabled in self.mode)
        service_uuid_prefix = b"\x03\x03"
        self.kwtree = KeywordTree()
        if self.mode & ScannerMode.MODE_IBEACON:
            self.kwtree.add(bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) + IBEACON_MANUFACTURER_ID + IBEACON_PROXIMITY_TYPE)
        if self.mode & ScannerMode.MODE_EDDYSTONE:
            self.kwtree.add(service_uuid_prefix + EDDYSTONE_UUID)
        if self.mode & ScannerMode.MODE_ESTIMOTE:
            self.kwtree.add(service_uuid_prefix + ESTIMOTE_UUID)
            self.kwtree.add(bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) + ESTIMOTE_MANUFACTURER_ID)
        if self.mode & ScannerMode.MODE_CJMONITOR:
            self.kwtree.add(bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) + CJ_MANUFACTURER_ID)
        if self.mode & ScannerMode.MODE_EXPOSURE_NOTIFICATION:
            self.kwtree.add(service_uuid_prefix + EXPOSURE_NOTIFICATION_UUID)
        self.kwtree.finalize()

    def run(self):
        """Continuously scan for BLE advertisements.

        Opens the device, determines the HCI version, applies the scan
        parameters, enables scanning and then processes received packets
        until keep_going is cleared. The socket is closed afterwards.
        """
        self.socket = self.backend.open_dev(self.bt_device_id)

        self.hci_version = self.get_hci_version()
        self.set_scan_parameters(**self.scan_parameters)
        self.toggle_scan(True)

        while self.keep_going:
            pkt = self.socket.recv(255)
            # byte 1 is read as the event code, byte 3 as the subevent code
            event = to_int(pkt[1])
            subevent = to_int(pkt[3])
            if event == LE_META_EVENT and subevent in [EVT_LE_ADVERTISING_REPORT, EVT_LE_EXT_ADVERTISING_REPORT]:
                # we have an BLE advertisement
                self.process_packet(pkt)
        self.socket.close()

    def get_hci_version(self):
        """Get the HCI version of the device.

        Returns:
            The HCIVersion parsed from the response to the read-local-version
            request, or HCIVersion.BT_CORE_SPEC_1_0 when the request is not
            implemented by the backend or the response cannot be parsed.
        """
        # layout used to parse the response to the request below
        local_version = Struct(
            "status" / Byte,
            "hci_version" / Byte,
            "hci_revision" / Bytes(2),
            "lmp_version" / Byte,
            "manufacturer_name" / Bytes(2),
            "lmp_subversion" / Bytes(2),
        )

        try:
            resp = self.backend.send_req(self.socket, OGF_INFO_PARAM, OCF_READ_LOCAL_VERSION,
                                         EVT_CMD_COMPLETE, local_version.sizeof(), bytes(), 0)
            return HCIVersion(GreedyRange(local_version).parse(resp)[0]["hci_version"])
        except (ConstructError, NotImplementedError):
            return HCIVersion.BT_CORE_SPEC_1_0

    def set_scan_parameters(self, scan_type=ScanType.ACTIVE, interval_ms=10, window_ms=10,
                            address_type=BluetoothAddressType.RANDOM, filter_type=ScanFilter.ALL):
        """Set the LE scan parameters.

        For extended set scan parameters command additional parameter scanning PHYs has to be provided.
        The parameter indicates the PHY(s) on which the advertising packets should be received on the
        primary advertising physical channel. For further information have a look on BT Core 5.1 Specification,
        page 1439 ( LE Set Extended Scan Parameters command).

        Args:
            scan_type: ScanType.(PASSIVE|ACTIVE)
            interval_ms: ms (as float) between scans (valid range 2.5ms - 10240ms or 40.95s for extended version)
                ..note:: when interval and window are equal, the scan
                    runs continuously
            window_ms: ms (as float) scan duration (valid range 2.5ms - 10240ms or 40.95s for extended version)
            address_type: Bluetooth address type BluetoothAddressType.(PUBLIC|RANDOM)
                * PUBLIC = use device MAC address
                * RANDOM = generate a random MAC address and use that
            filter_type: ScanFilter.(ALL|WHITELIST_ONLY) only ALL is supported, which will
                return all fetched bluetooth packets (WHITELIST_ONLY is not supported,
                because OCF_LE_ADD_DEVICE_TO_WHITE_LIST command is not implemented)

        Raises:
            ValueError: A value had an unexpected format or was not in range
        """
        # the upper bound depends on whether the extended (5.0+) command is used
        max_interval = (0x4000 if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0 else 0xFFFF)
        # convert milliseconds into the fraction units validated and sent below
        interval_fractions = interval_ms / MS_FRACTION_DIVIDER
        if interval_fractions < 0x0004 or interval_fractions > max_interval:
            raise ValueError(
                "Invalid interval given {}, must be in range of 2.5ms to {}ms!".format(
                    interval_fractions, max_interval * MS_FRACTION_DIVIDER))
        window_fractions = window_ms / MS_FRACTION_DIVIDER
        if window_fractions < 0x0004 or window_fractions > max_interval:
            raise ValueError(
                "Invalid window given {}, must be in range of 2.5ms to {}ms!".format(
                    window_fractions, max_interval * MS_FRACTION_DIVIDER))

        # struct.pack below needs integers for the "H" fields
        interval_fractions, window_fractions = int(interval_fractions), int(window_fractions)

        if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0:
            command_field = OCF_LE_SET_SCAN_PARAMETERS
            scan_parameter_pkg = struct.pack(
                "<BHHBB",
                scan_type,
                interval_fractions,
                window_fractions,
                address_type,
                filter_type)
        else:
            # the extended command takes the fields in a different order
            command_field = OCF_LE_SET_EXT_SCAN_PARAMETERS
            scan_parameter_pkg = struct.pack(
                "<BBBBHH",
                address_type,
                filter_type,
                1,  # scan advertisements on the LE 1M PHY
                scan_type,
                interval_fractions,
                window_fractions)

        self.backend.send_cmd(self.socket, OGF_LE_CTL, command_field, scan_parameter_pkg)

    def toggle_scan(self, enable, filter_duplicates=False):
        """Enable or disable BLE scanning.

        For extended set scan enable command additional parameters duration and period have
        to be provided. When both are zero, the controller shall continue scanning until
        scanning is disabled. For non-zero values have a look on BT Core 5.1 Specification,
        page 1442 (LE Set Extended Scan Enable command).

        Args:
            enable: boolean value to enable (True) or disable (False) scanner
            filter_duplicates: boolean value to enable/disable filter, that
                omits duplicated packets"""
        if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0:
            command_field = OCF_LE_SET_SCAN_ENABLE
            command = struct.pack("BB", enable, filter_duplicates)
        else:
            command_field = OCF_LE_SET_EXT_SCAN_ENABLE
            command = struct.pack("<BBHH", enable, filter_duplicates,
                                  0,  # duration
                                  0   # period
                                  )

        self.backend.send_cmd(self.socket, OGF_LE_CTL, command_field, command)

    def process_packet(self, pkt):
        """Parse the packet and call callback if one of the filters matches.

        Args:
            pkt: raw packet as received from the socket
        """
        # the payload offset depends on the HCI version; before 5.0 the last
        # byte is excluded because it is read as the RSSI further down
        payload = pkt[14:-1] if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0 else pkt[29:]

        # check if this could be a valid packet before parsing
        # this reduces the CPU load significantly
        if not self.kwtree.search(payload):
            return

        bt_addr = bt_addr_to_string(pkt[7:13])
        rssi = bin_to_int(pkt[-1] if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0 else pkt[18])
        # strip bluetooth address and parse packet
        packet = parse_packet(payload)

        # return if packet was not an beacon advertisement
        if not packet:
            return

        # we need to remember which eddystone beacon has which bt address
        # because the TLM and URL frames do not contain the namespace and instance
        self.save_bt_addr(packet, bt_addr)
        # properties holds the identifying information for a beacon
        # e.g. instance and namespace for eddystone; uuid, major, minor for iBeacon
        properties = self.get_properties(packet, bt_addr)

        if self.device_filter is None and self.packet_filter is None:
            # no filters selected
            self.callback(bt_addr, rssi, packet, properties)

        elif self.device_filter is None:
            # filter by packet type
            if is_one_of(packet, self.packet_filter):
                self.callback(bt_addr, rssi, packet, properties)
        else:
            # filter by device and packet type
            if self.packet_filter and not is_one_of(packet, self.packet_filter):
                # return if packet filter does not match
                return

            # iterate over filters and call .matches() on each
            # (the callback fires at most once, for the first matching filter)
            for filtr in self.device_filter:
                if isinstance(filtr, BtAddrFilter):
                    if filtr.matches({'bt_addr':bt_addr}):
                        self.callback(bt_addr, rssi, packet, properties)
                        return

                elif filtr.matches(properties):
                    self.callback(bt_addr, rssi, packet, properties)
                    return

    def save_bt_addr(self, packet, bt_addr):
        """Add to the list of mappings.

        Only EddystoneUIDFrame packets are recorded; any existing entry for
        the same bluetooth address is replaced.
        """
        if isinstance(packet, EddystoneUIDFrame):
            # remove out old mapping
            new_mappings = [m for m in self.eddystone_mappings if m[0] != bt_addr]
            new_mappings.append((bt_addr, packet.properties))
            self.eddystone_mappings = new_mappings

    def get_properties(self, packet, bt_addr):
        """Get properties of beacon depending on type.

        Returns:
            For the Eddystone frame types listed below, the properties stored
            for bt_addr (None when no mapping exists); otherwise the packet's
            own properties.
        """
        if is_one_of(packet, [EddystoneTLMFrame, EddystoneURLFrame, \
                              EddystoneEncryptedTLMFrame, EddystoneEIDFrame]):
            # here we retrieve the namespace and instance which corresponds to the
            # eddystone beacon with this bt address
            return self.properties_from_mapping(bt_addr)
        else:
            return packet.properties

    def properties_from_mapping(self, bt_addr):
        """Retrieve properties (namespace, instance) for the specified bt address.

        Returns:
            The stored properties, or None when the address is unknown.
        """
        for addr, properties in self.eddystone_mappings:
            if addr == bt_addr:
                return properties
        return None

    def terminate(self):
        """Signal runner to stop and join thread."""
        # disable scanning first, then let the run() loop end and wait for it
        self.toggle_scan(False)
        self.keep_going = False
        self.join()
Пример #30
0
class OWMCitySlot:
    """Find OpenWeatherMap city names in a user utterance."""

    def __init__(
        self,
        path_to_geo_entities: str = "data/openweathermap_city_list.json"
    ) -> None:
        """Initialize a trie for finding city names.

        :param path_to_geo_entities: filepath to a JSON file containing a list of cities
            file format: ["Ḩeşār-e Sefīd", "‘Ayn Ḩalāqīm", "Taglag", ..... , "Gerton"]
            this list was created using the source file: https://bulk.openweathermap.org/sample/city.list.json.gz
        :type path_to_geo_entities: str
        """
        self.geonames = self._load_from_json(path_to_geo_entities)
        self.kwtree = KeywordTree(case_insensitive=True)
        for geo in self.geonames:
            # Surrounding spaces restrict matches to whole words.
            self.kwtree.add(f" {geo} ")
        self.kwtree.finalize()

    def _load_from_json(self, path_to_geo_entities: str) -> List[str]:
        """Load a list with city names from a JSON file.

        :param path_to_geo_entities: filepath to a JSON file
        :type path_to_geo_entities: str
        :return: a list containing the distinct city names (in no particular order)
        :rtype: List[str]
        """
        with open(path_to_geo_entities, "r", encoding="utf-8") as f:
            json_data = json.load(f)
        # Going through a set removes duplicate names.
        return list(set(json_data))

    def find_geo_names_in_utterance(self, utterance: str) -> str:
        """Search the first occurrence of the location name in utterance.

        :param utterance: human utterance
        :type utterance: str
        :return: a location name or an empty string if nothing found
        :rtype: str
        """
        # replace punctuation with spaces in a single pass
        punctuation_to_space = str.maketrans(
            string.punctuation, " " * len(string.punctuation))
        utterance = utterance.translate(punctuation_to_space)
        # delete excessive spaces
        utterance = re.sub(r"\s{2,}", " ", utterance.lower()).strip()
        # pad with spaces so names at either end of the utterance still match
        # the space-delimited keywords
        results = list(self.kwtree.search_all(" %s " % utterance))
        # TODO the method could be improved if we search all geo names and then filter
        # the most precise geo entity.
        # User may write: "Massachusetts Boston" -> It has 2 entities, and Boston is preferred
        # because it is more precise location.
        return self.get_best_match(results)

    def get_best_match(self, results: Iterable[Tuple[str, int]]) -> str:
        """Select from the objects with the lowest index the object with the longest length.

         Usually the earliest entity is the most precise.
         For example for the utterance: "west valley city utah", we receive:
         [(' West ', 0), (' West Valley ', 0), (' Valley ', 5), (' West Valley City ', 0),
         (' Valley City ', 5), (' Utah ', 17)], we should select "West Valley City".

        :param results: pairs of (<location_name>, <index>); any iterable is
            accepted, including one-shot iterators and generators
        :type results: Iterable[Tuple[str, int]]
        :return: the best match or an empty string if the results are empty
        :rtype: str
        """
        if not results:
            # None or an empty container
            return ""
        # min() keeps the first of equally ranked entries, exactly like taking
        # element 0 of a stable sort, and ``default`` covers iterators that
        # turn out to be empty (they are always truthy).
        best = min(
            results,
            key=lambda entity: (entity[1], -len(entity[0].strip())),
            default=None,
        )
        return "" if best is None else best[0].strip()

    def __call__(self, *args, **kwargs) -> str:
        """Find the best match in the trie.

        :return: a location name or an empty string if nothing found
        :rtype: str
        """
        return self.find_geo_names_in_utterance(*args, **kwargs)