예제 #1
0
 def wiki_summary_by_name(self):
     """Fetch the plain-text introductory extract of the English Wikipedia
     page whose title is ``str(self)``.

     Returns:
         The extract text found in the API response, or ``None`` when the
         response contains no extractable value (previously this case
         raised ``UnboundLocalError``).
     """
     # Build the MediaWiki API query: JSON output, follow redirects,
     # intro section only, plain text (no HTML).
     link_asked_wiki = 'https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&redirects=1' \
                       '&exintro=&explaintext=&titles=' + str(self)
     wiki_response = PelicanJson(requests.get(link_asked_wiki).json())
     # Walk the JSON tree and keep only the LAST (path, value) pair —
     # for this API shape that is the extract text.
     tree_path = None
     for item in wiki_response.enumerate():
         tree_path = item
     if tree_path is None:
         # Empty/unexpected API response: nothing to look up.
         return None
     # tree_path[0] is the key path; resolve it back to the value.
     return wiki_response.get_nested_value(tree_path[0])
예제 #2
0
    def run_on_index(self, docs: List[dict], doc_paths: List[str], ratio,
                     algorithm: List[str]):
        """Generate summaries for tokenized text retrieved from ES fields.

        Parameters:
            docs (list): list of documents (dicts) to summarize.
            doc_paths (list): dot-separated field paths to read from each doc.
            ratio (float): ratio to use for summarization.
            algorithm (list | str): list of sumy algorithm names, or the
                string repr of such a list (e.g. from a task payload).

        Returns:
            list: one dict per document, mapping "<path>_<algorithm>" to the
            generated summary text.
        """
        stack = []
        # Fix: the annotation promises a list, but callers may pass the
        # string form "['lsa', ...]". Parse only when it actually is a
        # string; a real list previously crashed in ast.literal_eval.
        if isinstance(algorithm, str):
            algorithm = ast.literal_eval(algorithm)
        summarizers = self.get_summarizers(algorithm)
        for document in docs:
            wrapper = PelicanJson(document)
            for doc_path in doc_paths:
                doc_path_as_list = doc_path.split(".")
                content = wrapper.safe_get_nested_value(doc_path_as_list,
                                                        default=[])
                if content and isinstance(content, str):
                    ratio_count = SumyTokenizer().sentences_ratio(
                        content, float(ratio))
                    parser = PlaintextParser.from_string(
                        content, SumyTokenizer())
                else:
                    # NOTE(review): falls back to the dotted path as a FLAT
                    # key; raises KeyError for genuinely nested paths —
                    # confirm callers only hit this branch with flat fields.
                    ratio_count = SumyTokenizer().sentences_ratio(
                        document[doc_path], float(ratio))
                    parser = PlaintextParser.from_string(
                        document[doc_path], SumyTokenizer())

                summaries = {}
                for name, summarizer in summarizers.items():
                    try:
                        summarization = summarizer(parser.document,
                                                   float(ratio_count))
                    except Exception as e:
                        # Best-effort: log and skip this algorithm, keep
                        # the others for the same field.
                        logging.getLogger(ERROR_LOGGER).exception(e)
                        continue

                    summary = [sent._text for sent in summarization]
                    summary = "\n".join(summary)
                    summaries[doc_path + "_" + name] = summary

                stack.append(summaries)

        return stack
예제 #3
0
def parse_doc_texts(doc_path: str, document: dict) -> list:
    """
    Function for parsing text values from a nested dictionary given a field path.
    :param doc_path: Dot separated path of fields to the value we wish to parse.
    :param document: Document to be worked on.
    :return: List of text fields that will be processed by MLP.
    """
    wrapper = PelicanJson(document)
    doc_path_as_list = doc_path.split(".")
    content = wrapper.safe_get_nested_value(doc_path_as_list, default=[])
    if content and isinstance(content, str):
        return [content]
    # Check that content is a non-empty list containing only strings.
    if content and isinstance(content, list) and all(
            isinstance(list_content, str) for list_content in content):
        return content
    # Everything else — faulty paths yielding dicts, empty values, or
    # mixed-type lists — has no text to process. (The original had a
    # separate dict branch returning the same [] as the else.)
    return []