def test_safe_get_nested_value(self):
        """Exercise ``safe_get_nested_value`` on paths the fixture is expected not to resolve.

        The assertions expect the supplied ``default`` (or ``None`` when no
        default is given) to be returned for such paths, ``EmptyPath`` for an
        empty path and ``BadPath`` for a path given as a set.
        """
        pelican = PelicanJson(self.monterrey)

        # No default supplied: the lookup is expected to yield None.
        assert pelican.safe_get_nested_value(["STRING"]) is None

        # (path, default) pairs; each lookup is expected to hand back its
        # own default, whatever the default's type.
        fallback_cases = [
            (('results', 'value', '9'), "la"),
            ([8], 0),
            (['results', 1000], (1, 2)),
            (["results", 10000], "test"),
            (["results", "key"], "test"),
        ]
        for path, fallback in fallback_cases:
            looked_up = pelican.safe_get_nested_value(path, default=fallback)
            assert looked_up == fallback

        # Invalid path arguments are expected to raise instead of falling back.
        with self.assertRaises(EmptyPath):
            pelican.safe_get_nested_value([])
        with self.assertRaises(BadPath):
            pelican.safe_get_nested_value({'results', 8})
示例#2
0
    def run_on_index(self, docs: List[dict], doc_paths: List[str], ratio,
                     algorithm: List[str]):
        """Generate summaries for the text found at each field path of each document.

            Parameters:
            docs (list): list of documents (dicts)
            doc_paths (list): list of fields; dot-separated paths are first
                resolved as nested keys, then as a literal top-level key
            ratio (float): value convertible to float, handed to
                SumyTokenizer.sentences_ratio
            algorithm (list or str): summarizer names, either as a list or as
                the string representation of one (e.g. "['lexrank']")

            Returns:
            list: one dict per (document, field) pair, in iteration order,
                mapping "<field>_<summarizer name>" to the summary sentences
                joined by newlines. The dict is empty when the field is
                absent from the document or every summarizer failed.

        """
        stack = []
        # ast.literal_eval only accepts strings (or AST nodes) and raises
        # ValueError on an actual list, so parse only the serialized form.
        if isinstance(algorithm, str):
            algorithm = ast.literal_eval(algorithm)
        summarizers = self.get_summarizers(algorithm)
        for document in docs:
            wrapper = PelicanJson(document)
            for doc_path in doc_paths:
                text = wrapper.safe_get_nested_value(doc_path.split("."),
                                                     default=[])
                if not (text and isinstance(text, str)):
                    # Nested lookup gave no usable string: fall back to the
                    # literal (possibly dotted) key of the document.
                    if doc_path not in document:
                        # Field is missing here; keep one entry per
                        # (document, field) pair instead of aborting the
                        # whole batch with a KeyError.
                        stack.append({})
                        continue
                    text = document[doc_path]

                tokenizer = SumyTokenizer()
                ratio_count = tokenizer.sentences_ratio(text, float(ratio))
                parser = PlaintextParser.from_string(text, tokenizer)

                summaries = {}
                for name, summarizer in summarizers.items():
                    try:
                        summarization = summarizer(parser.document,
                                                   float(ratio_count))
                    except Exception as e:
                        # A failing summarizer must not prevent the others
                        # from running; log it and move on.
                        logging.getLogger(ERROR_LOGGER).exception(e)
                        continue

                    summaries[doc_path + "_" + name] = "\n".join(
                        sent._text for sent in summarization)

                stack.append(summaries)

        return stack
示例#3
0
def parse_doc_texts(doc_path: str, document: dict) -> list:
    """
    Collect the text values stored under a dot-separated field path of a nested document.
    :param doc_path: Dot-separated field names leading to the wanted value.
    :param document: Document to read the value from.
    :return: ``[value]`` when the value is a non-empty string, the value itself when it is
             a non-empty list containing only strings, and an empty list in every other case.
    """
    wrapper = PelicanJson(document)
    path_parts = doc_path.split(".")
    value = wrapper.safe_get_nested_value(path_parts, default=[])

    # Nothing found, or an empty string/list/dict: no texts to return.
    if not value:
        return []
    # A single text is wrapped so callers always receive a list.
    if isinstance(value, str):
        return [value]
    # A list qualifies only when every element is a string.
    if isinstance(value, list) and all(isinstance(item, str) for item in value):
        return value
    # Anything else (e.g. a dict produced by a path that stops too early) yields no texts.
    return []