def _make_sequence(
        processor: Processor,
        corpus: ProviderBase,
        set_type: SetType = SetType.DEV) -> Sequence[DocumentForEval]:
    """Lazily yield one ``DocumentForEval`` per processed corpus document.

    NOTE: despite the ``Sequence`` annotation this is a generator function;
    callers that need indexing or ``len()`` must materialize the result
    themselves (e.g. with ``list(...)``).

    For each document of ``corpus.subset(set_type)``:

    * the summary size is the length of the reference summary when the
      corpus purpose includes ``CorpusPurpose.SUMMARY``, otherwise a
      relative size of ``.1``;
    * the keyword count is the number of lemmatized reference keyword
      tokens when the purpose includes ``CorpusPurpose.KEYWORDS``,
      otherwise ``0``.

    Documents whose processing reports errors are reported on stdout and
    skipped; the remaining documents are still processed. Warnings are
    reported on stdout but do not prevent the document from being yielded.

    Args:
        processor: object whose ``process_text(text, params)`` produces the
            summary/keywords for a document.
        corpus: corpus provider supplying documents, purpose and language.
        set_type: which subset of the corpus to iterate.

    Yields:
        A ``DocumentForEval`` pairing reference keywords/summary with the
        lemmas of the generated keywords and summary sentences.
    """
    documents = tqdm(corpus.subset(set_type),
                     total=corpus.subset_size(set_type))
    for d in documents:
        if corpus.purpose() & CorpusPurpose.SUMMARY:
            summary_size = SummarySize.new_absolute(len(d.ref_summary))
        else:
            summary_size = SummarySize.new_relative(.1)

        if corpus.purpose() & CorpusPurpose.KEYWORDS:
            kw_num = len(
                retrieve_lemmatized_tokens(corpus.language(), d.ref_keywords))
        else:
            kw_num = 0

        text_process_params = TextProcessParams(summary_size, kw_num)
        summary = processor.process_text(d.text, text_process_params)

        if summary.errors:
            print(
                F"Found errors during processing document '{d.id_}'. Skipped.")
            print("First:", summary.errors[0])
            # Skip only this document, as the message says; a `break` here
            # would silently drop every remaining document of the subset.
            continue

        if summary.warnings:
            print(
                f"Found in document {d.id_} warnings during processing. First: {summary.warnings[0]}"
            )

        yield DocumentForEval(d.ref_keywords,
                              [kw.lemma for kw in summary.keywords],
                              d.ref_summary,
                              [s.lemma for s in summary.summary],
                              corpus.language())
def test_digest_processor_fr():
    """Process the French UDHR text and check a ``Document`` comes back.

    The text is requested with a relative summary size of 0.1 and 10
    keywords; the resulting document must contain at least 5 sentences.
    """
    french_text = udhr.raw('French_Francais-Latin1')
    params = TextProcessParams(SummarySize.new_relative(0.1),
                               keywords_number=10)

    result = Processor().process_text(french_text, params)

    assert isinstance(result, Document)
    assert len(result.sentences) >= 5
def test_TextProcessParams_str():
    """``str()`` of ``TextProcessParams`` is non-empty for both size kinds."""
    # (factory, argument) pairs: relative size first, then absolute size.
    size_specs = (
        (SummarySize.new_relative, 0.1),
        (SummarySize.new_absolute, 10),
    )
    for make_size, amount in size_specs:
        params = TextProcessParams(summary_size=make_size(amount),
                                   keywords_number=0)
        rendered = str(params)
        assert rendered != ''
def test_TextProcessParams_creation():
    """Constructing ``TextProcessParams`` with 0 or 10 keywords must not raise."""
    for keywords_number in (0, 10):
        TextProcessParams(summary_size=SummarySize.new_relative(0.1),
                          keywords_number=keywords_number)
def test_TextProcessParams_keywords_negative():
    """A negative ``keywords_number`` is rejected with ``ValueError``."""
    negative_keywords_number = -1
    with pytest.raises(ValueError):
        TextProcessParams(
            summary_size=SummarySize.new_relative(0.1),
            keywords_number=negative_keywords_number,
        )