예제 #1
0
def _indexObject(objectToIndex: ObjectToIndexTuple) -> List[SearchIndex]:
    """ Index Object

    Build the "SearchIndex" objects for one object, to insert into the DB.

    Every value in ``objectToIndex.fullKwProps`` is tokenised with
    ``splitFullKeywords`` and every value in ``objectToIndex.partialKwProps``
    is tokenised with ``splitPartialKeywords``. One ``SearchIndex`` is created
    for each (property, token) pair.

    Because the data is not news articles, advanced natural language
    processing (NLP) is skipped. The things being indexed are unique IDs,
    job titles, equipment names and the like. Exclusions for nuisance words
    may be added later on.

    :param objectToIndex: The object whose keyword properties are indexed.
    :return: The full keyword indexes followed by the partial keyword indexes.

    """

    def buildIndexes(kwProps, splitKeywords):
        # One SearchIndex per token of every property value, in dict order
        return [
            SearchIndex(chunkKey=makeSearchIndexChunkKey(token),
                        keyword=token,
                        propertyName=propKey,
                        objectId=objectToIndex.id)
            for propKey, text in kwProps.items()
            for token in splitKeywords(text)
        ]

    fullIndexes = buildIndexes(objectToIndex.fullKwProps, splitFullKeywords)
    partialIndexes = buildIndexes(objectToIndex.partialKwProps,
                                  splitPartialKeywords)
    return fullIndexes + partialIndexes
예제 #2
0
    def _filterObjectsForSearchString(self, results: List[SearchResultObjectTuple],
                                      searchString: str,
                                      propertyName: Optional[str]) -> Deferred:
        """ Filter Objects For Search String

        STAGE 2 of the search.

        Keep only the loaded objects whose property text shares at least one
        partial keyword token with the search string. Tokens ending in '$'
        are ignored on both sides of the comparison.

        :param results: The loaded objects to filter.
        :param searchString: The search text to match against.
        :param propertyName: If set, only this property of each object is
            compared. An object without that property contributes no text.
        :return: A list of the objects that passed the filter.

        NOTE(review): the return annotation is Deferred but the body returns
        a list; presumably a decorator outside this view wraps the call -
        confirm.
        """

        def isNotFullToken(token):
            return not token.endswith('$')

        def partialTokensOf(text):
            # Tokenise the text and throw away the tokens ending in '$'
            return set(filter(isNotFullToken, splitPartialKeywords(text)))

        searchTokens = partialTokensOf(searchString)

        def isMatch(result: SearchResultObjectTuple) -> bool:
            props = result.properties

            # Narrow the comparison to the requested property, if there is one
            if propertyName:
                if propertyName in props:
                    props = {propertyName: props[propertyName]}
                else:
                    props = {}

            objectTokens = partialTokensOf(' '.join(props.values()))
            return not searchTokens.isdisjoint(objectTokens)

        return [result for result in results if isMatch(result)]
    def test_twoCharTokens(self):
        # Check twoCharTokens against a raw string and against the token
        # sets produced by both keyword splitters.

        # NOTE(review): a plain string is passed here rather than the output
        # of a splitter as in the cases below - confirm this is intended.
        fromRawString = twoCharTokens("smith smith")
        self.assertEqual(set(), fromRawString)

        fullTokens = splitFullKeywords("two to")
        self.assertEqual({'^to$'}, twoCharTokens(fullTokens))

        partialTokens = splitPartialKeywords("two to")
        self.assertEqual({'^to'}, twoCharTokens(partialTokens))
예제 #4
0
    def _getObjectIdsForSearchString(self, searchString: str,
                                     propertyName: Optional[str]) -> Deferred:
        """ Get ObjectIds For Search String

        STAGE 1 of the search.

        Look up the object ids for the search string, tokenised both as full
        keywords and as partial keywords, because it is not known in advance
        which kind of tokenizing the search needs.

        The two sets of results are merged per keyword, reduced with
        ``_setIntersectFilterIndexResults`` and capped at 50 ids.

        This will load in some false matches, they are filtered out in
        _filterObjectsForSearchString

        :param searchString: The search text to tokenise and look up.
        :param propertyName: Passed on to the token lookup.
        :rtype List[int]

        NOTE(review): the return annotation is Deferred but the body returns
        a list; presumably a decorator outside this view wraps the call -
        confirm.

        """

        def lookupMatches(tokens):
            # Look the tokens up, then drop the keywords that found nothing
            found = self._getObjectIdsForTokensBlocking(tokens, propertyName)
            return {kw: objectIds for kw, objectIds in found.items() if objectIds}

        logger.debug("Started search with string |%s|", searchString)

        # ---------------
        # Full keyword pass
        fullTokens = splitFullKeywords(searchString)
        logger.debug("Searching for full tokens |%s|", fullTokens)

        fullMatches = lookupMatches(fullTokens)
        logger.debug("Found results for full tokens |%s|", set(fullMatches))

        # ---------------
        # Partial keyword pass
        partialTokens = splitPartialKeywords(searchString)
        logger.debug("Searching for partial tokens |%s|", partialTokens)

        partialMatches = lookupMatches(partialTokens)
        logger.debug("Found results for partial tokens |%s|", set(partialMatches))

        # ---------------
        # Combine the two passes into one result list per keyword
        mergedMatches = self._mergePartialAndFullMatches(searchString,
                                                         fullMatches,
                                                         partialMatches)
        logger.debug("Merged tokens |%s|", set(mergedMatches))

        # Reduce the per keyword results to the final object ids
        matchedObjectIds = self._setIntersectFilterIndexResults(mergedMatches)

        # Cap the result at 50 ids
        return list(matchedObjectIds)[:50]
예제 #5
0
        def filterResult(result: SearchResultObjectTuple) -> bool:
            """ Return True if the result shares a token with ``tokens``.

            ``propertyName``, ``tokens`` and ``noFulls`` are taken from the
            enclosing scope.
            """
            allProps = result.properties

            # Choose which property values take part in the comparison
            if not propertyName:
                searchable = allProps
            elif propertyName in allProps:
                searchable = {propertyName: allProps[propertyName]}
            else:
                searchable = {}

            joinedValues = ' '.join(searchable.values())
            resultTokens = {token for token in splitPartialKeywords(joinedValues)
                            if noFulls(token)}
            return bool(tokens & resultTokens)
예제 #6
0
    def test_mergePartialAndFullMatches_3(self):
        # Partial matches only, no full matches: every word of the search
        # string is expected back as a merged keyword with the same ids.
        searchString = 'tatu west fus'

        partialByKw = {}
        for partialKw in splitPartialKeywords(searchString):
            partialByKw[partialKw] = [7, 6]

        controller = FastKeywordController(None, None)
        merged = controller._mergePartialAndFullMatches(searchString, {},
                                                        partialByKw)

        self.assertEqual(set(merged), {'west', 'fus', 'tatu'})
        for keyword in ('west', 'fus', 'tatu'):
            self.assertEqual(set(merged[keyword]), {6, 7})
예제 #7
0
    def _mergePartialAndFullMatches(self, searchString: str,
                                    resultsByFullKw: Dict[str, List[int]],
                                    resultsByPartialKw: Dict[str, List[int]]
                                    ) -> Dict[str, List[int]]:
        """ Merge Partial And Full Matches

        Combine the full keyword results and the partial keyword results into
        one dict, keyed by keyword with the '^' and '$' markers stripped.

        Full keywords: the object ids of each full keyword are added under
        its stripped keyword.

        Partial keywords: each token of ``searchString`` (as produced by
        ``_splitFullTokens``) is split into partial keywords. When every one
        of those partial keywords has results, the object ids common to all
        of them are appended under the stripped token. A token is skipped if
        any of its partial keywords has no results, or if it produces no
        partial keywords at all.

        Neither input dict, nor any list inside them, is modified. An object
        id found by both a full and a partial match appears more than once in
        the merged list.

        :param searchString: The search text the results were looked up for.
        :param resultsByFullKw: Object ids keyed by full keyword.
        :param resultsByPartialKw: Object ids keyed by partial keyword.
        :return: Object ids keyed by stripped keyword.
        """
        partialKwsWithResults = set(resultsByPartialKw)

        mergedResultsByKw = {}

        # Include the fulls
        for fullKw, fullObjectIds in resultsByFullKw.items():
            mergedResultsByKw.setdefault(fullKw.strip('^$'), []) \
                .extend(fullObjectIds)

        for token in _splitFullTokens(searchString):
            token = token.strip('^$')
            partialKws = splitPartialKeywords(token)

            # An empty set is a subset of everything, so it must be excluded
            # explicitly; there is nothing to intersect for such a token.
            if not partialKws or not partialKws <= partialKwsWithResults:
                continue

            # Keep only the object ids matched by ALL partials of this token
            objectIdSets = [set(resultsByPartialKw[partialKw])
                            for partialKw in partialKws]
            objectIdsForToken = set.intersection(*objectIdSets)

            mergedResultsByKw.setdefault(token, []).extend(objectIdsForToken)

        return mergedResultsByKw
    def testPartialKeywordSplit(self):
        # Each entry is (text to split, expected partial keyword tokens).
        # The entries are checked in order.
        expectedTokensByText = [
            ("smith", {'^smi', 'mit', 'ith'}),
            ("ZORRO-REYNER",
             {'^zor', 'orr', 'rro', 'ror', 'ore', 'rey', 'eyn', 'yne', 'ner'}),
            ("34534535", {'^345', '535', '453', '534', '345'}),
            ("and", {'^and'}),
            ("to", {"^to"}),
            ("ha55", {'a55', '^ha5'}),
            ("Milton Unit 22",
             {'^mil', 'ilt', 'lto', 'ton', '^uni', 'nit', '^22'}),
            ("mills un no", {'^mil', 'ill', 'lls', '^un', '^no'}),
            ("Unit 22", {'^uni', 'nit', "^22"}),
            ("Unit 1", {'^uni', 'nit'}),
            ("A Unit", {'^uni', 'nit'}),
            ("2 Unit", {'^uni', 'nit'}),
            ("ATS B3 TRANS 66KV CB",
             {'^ats', "^cb", "^b3", '^66k', '6kv', '^tra', 'ran', 'ans'}),
            ("COL LINS", {'^col', '^lin', 'ins'}),
        ]

        for text, expectedTokens in expectedTokensByText:
            self.assertEqual(expectedTokens, splitPartialKeywords(text))

        # The same letters with and without a space must tokenise differently
        joinedTokens = splitPartialKeywords("COLLINS")
        spacedTokens = splitPartialKeywords("COL LINS")
        self.assertNotEqual(joinedTokens, spacedTokens)