def wiki_summary_by_name(self):
    """Fetch the plain-text introduction of the Wikipedia article named ``str(self)``.

    Queries the English Wikipedia API for the intro extract (plain text,
    following redirects) and returns the value found at the last leaf path
    that PelicanJson enumerates in the JSON response.

    Returns:
        The extract text of the page, or ``None`` when the response
        contains no enumerable leaf values.
    """
    # Pass the query as `params` so requests percent-encodes the title;
    # the previous raw string concatenation broke for titles containing
    # spaces or other special characters. A timeout keeps a stalled
    # Wikipedia request from hanging the caller forever.
    params = {
        "format": "json",
        "action": "query",
        "prop": "extracts",
        "redirects": "1",
        "exintro": "",
        "explaintext": "",
        "titles": str(self),
    }
    response = requests.get(
        "https://en.wikipedia.org/w/api.php", params=params, timeout=10
    )
    wiki_response = PelicanJson(response.json())  # JSON tree wrapper

    # Keep the path of the LAST enumerated (path, value) pair — in this
    # API response layout that leaf holds the page extract.
    tree_path = None
    for item in wiki_response.enumerate():
        tree_path = item

    # Previously an empty enumeration left `tree_path` unbound and the
    # return line raised NameError; return None explicitly instead.
    if tree_path is None:
        return None
    return wiki_response.get_nested_value(tree_path[0])
def run_on_index(self, docs: List[dict], doc_paths: List[str], ratio, algorithm: List[str]):
    """Generate summaries based on tokenized text retrieved from es fields.

    Parameters:
        docs (list): list of documents
        doc_paths (list): list of fields (dot-separated paths into each doc)
        ratio (float): ratio to use for summarization
        algorithm (list): list of algorithms for sumy, arriving as a
            string-serialized Python list (parsed with ast.literal_eval)

    Returns:
        list: one dict per (document, field) pair, mapping
        "<doc_path>_<algorithm>" to the generated summary text
    """
    stack = []
    # `algorithm` is received as a serialized list literal, not a real list.
    algorithm = ast.literal_eval(algorithm)
    summarizers = self.get_summarizers(algorithm)

    for document in docs:
        wrapper = PelicanJson(document)
        for doc_path in doc_paths:
            path_parts = doc_path.split(".")
            content = wrapper.safe_get_nested_value(path_parts, default=[])
            # Prefer the nested lookup; when it does not yield a non-empty
            # string, fall back to flat key access on the raw document
            # (the original computed ratio/parser identically in both
            # branches — only the text source differed).
            if not (content and isinstance(content, str)):
                content = document[doc_path]
            ratio_count = SumyTokenizer().sentences_ratio(content, float(ratio))
            parser = PlaintextParser.from_string(content, SumyTokenizer())

            summaries = {}
            for name, summarizer in summarizers.items():
                try:
                    summarization = summarizer(parser.document, float(ratio_count))
                except Exception as e:
                    # Best-effort: log the failing algorithm and move on so
                    # the remaining summarizers still run.
                    logging.getLogger(ERROR_LOGGER).exception(e)
                    continue
                summaries[doc_path + "_" + name] = "\n".join(
                    sent._text for sent in summarization
                )
            stack.append(summaries)
    return stack
def parse_doc_texts(doc_path: str, document: dict) -> list:
    """
    Function for parsing text values from a nested dictionary given a field path.

    :param doc_path: Dot separated path of fields to the value we wish to parse.
    :param document: Document to be worked on.
    :return: List of text fields that will be processed by MLP.
    """
    wrapper = PelicanJson(document)
    content = wrapper.safe_get_nested_value(doc_path.split("."), default=[])

    # Single string value -> wrap it in a list.
    if content and isinstance(content, str):
        return [content]
    # Non-empty list containing only strings -> use it as-is.
    if (
        content
        and isinstance(content, list)
        and all(isinstance(item, str) for item in content)
    ):
        return content
    # Anything else — a dict from a faulty field path, an empty value, or a
    # list with non-string members — yields nothing to process. (The original
    # spelled the dict case out as a separate branch returning the same [].)
    return []