Example #1

import itertools
import re
from os import path

import pandas as pd

# DataSet, SumeWrap, SimulatedFeedback, resolve_against_iobase,
# load_w2v_by_name and the module-level "args" are provided by the
# surrounding ukpsummarizer code base.


def analyse_vocab(rouge, datasets=None, topics=None):
    if datasets is None:
        return

    concept_types = ("parse", "ngrams")
    embedding_variants = ("google.neg.300d", "glove.6B.300d", "tudarmstadt_german")
    embeddings_path = path.normpath(path.join(args.iobasedir, "embeddings"))

    topic_details = []
    concept_details = []
    token_details = []

    # One analysis pass per (dataset, concept extraction mode, embedding variant) combination.
    for dataset, concept_type, embedding_variant in itertools.product(datasets, concept_types, embedding_variants):
        print("running analysis for", dataset, concept_type, embedding_variant,
              "--------------------------------------")
        i = 0
        ds = resolve_against_iobase(dataset, args.iobasedir)
        d = DataSet(ds)
        language = d.get_language()
        embeddings = load_w2v_by_name(embeddings_path, variant=embedding_variant)
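        # The embeddings are loaded once per combination and reused for every topic of the dataset.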
        for topic in d.get_topics():
            # if i > 2:
            #     continue
            sumewrap = SumeWrap(language=language)
            i += 1
            docs = topic.get_docs()
            summaries = topic.get_models()

            parse_info = topic.get_parse_info(0)

            sf = SimulatedFeedback(language, rouge, embeddings=embeddings, docs=docs, models=summaries,
                                   summary_length=100, oracle_type="active_learning", ub_score=(1, 1, 1),
                                   ub_summary=" ", parser_type=concept_type)
            # The simulation itself is not run; sf is only used to obtain the parsed document sentences.
            # sf.run_full_simulation(max_iteration_count=0)

            doc_sentences = sf.summarizer.sentences

            # Load the model (reference) summaries into sume, using the same concept type as for the documents.
            summaries_parse_info = [list(topic.get_models(parsed=True)), list(topic.get_models(parsed=True))]
            if concept_type == "parse":
                sumewrap.s.sentences = sumewrap.load_sume_sentences(summaries, parse_type=concept_type,
                                                                    parse_info=list(summaries_parse_info))
                sumewrap.s.extract_ngrams2(concept_type="phrase")
            else:
                sumewrap.s.sentences = sumewrap.load_sume_sentences(summaries)
                sumewrap.s.extract_ngrams2()
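            # Compute document frequencies over the concepts extracted from the model summaries.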
            sumewrap.s.compute_document_frequency()
            model_sentences = sumewrap.s.sentences

            #
            #  token_details
            #
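            # For every token of every concept, record its POS tag, its provenance
            # (document vs. model summary) and whether the current embedding variant knows it.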
            for s in doc_sentences:
                sentence_pos = s.position
                doc_id = s.doc_id
                token_from_summary = False
                token_from_document = True
                for concept in s.concepts:
                    ngrams = concept.split(' ')
                    for token in ngrams:
                        pos = "UNK"
                        try:
                            word, pos = s.tokens_pos[token].split('::')
                        except (KeyError, ValueError):
                            # POS lookup failed for the token: strip a trailing '-'/'.', try the full
                            # concept instead, and finally fall back to tagging the token as a noun.
                            token = re.sub(r'[-\.](\s|$)', r'\1', token)
                            try:
                                word, pos = s.tokens_pos[concept].split('::')
                            except (KeyError, ValueError):
                                word, pos = token, 'NN'
                        token_details.append({
                            "sentence_pos": sentence_pos,
                            "doc_id": doc_id,
                            "topic": topic.get_name(),
                            "dataset": d.get_name(),
                            "language": d.get_language(),
                            "token": token,
                            "word": word,
                            "pos_tag": pos,
                            "from_summary": token_from_summary,
                            "from_document": token_from_document,
                            "concept_type": concept_type,
                            "embedding_variant": embedding_variant,
                            "token_has_embedding": embeddings.isKnown(token),
                            "word_has_embedding": embeddings.isKnown(word)
                        })
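            # Repeat the same bookkeeping for the sentences of the model (reference) summaries.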
            for s in model_sentences:
                sentence_pos = s.position
                doc_id = s.doc_id
                token_from_summary = True
                token_from_document = False
                for concept in s.concepts:
                    ngrams = concept.split(' ')
                    for token in ngrams:
                        pos = "UNK"
                        try:
                            word, pos = s.tokens_pos[token].split('::')
                        except (KeyError, ValueError):
                            # Same fallback chain as for the document sentences above.
                            token = re.sub(r'[-\.](\s|$)', r'\1', token)
                            try:
                                word, pos = s.tokens_pos[concept].split('::')
                            except (KeyError, ValueError):
                                word, pos = token, 'NN'

                        token_details.append({
                            "sentence_pos": sentence_pos,
                            "doc_id": doc_id,
                            "topic": topic.get_name(),
                            "dataset": d.get_name(),
                            "language": d.get_language(),
                            "token": token,
                            "word": word,
                            "pos_tag": pos,
                            "from_summary": token_from_summary,
                            "from_document": token_from_document,
                            "concept_type": concept_type,
                            "embedding_variant": embedding_variant,
                            "token_has_embedding": embeddings.isKnown(token),
                            "word_has_embedding": embeddings.isKnown(word)
                        })

    # post-process token details
    token_df = pd.DataFrame(token_details)
    # token_df.groupby("dataset")
    # print(token_df.head())
    filename = "C:\\Users\\hatieke\\.ukpsummarizer\\tmp\\tokens_new.csv"
    print("saving token_df to ", filename)
    token_df.to_csv(filename, encoding="UTF-8")
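
A hypothetical call sketch (not part of the original module): the scorer variable and the dataset path below are placeholders; the real values depend on how the surrounding ukpsummarizer setup constructs its ROUGE wrapper and lays out args.iobasedir.

# analyse_vocab takes the rouge object that is passed through to SimulatedFeedback
# plus a list of dataset paths resolvable against args.iobasedir, e.g.:
#
#     analyse_vocab(rouge_scorer, datasets=["datasets/processed/some_dataset"])
#
# where "rouge_scorer" and the dataset directory are placeholder names.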
Example #2
        else:
            pickleout = resolve_filename(args.pickleout.replace("\"",""), base=iobasedir)

        runner.single_iteration(picklein=picklein, pickleout=pickleout,
                                feedbacks=js)

    elif args.command == 'summarize':

        # check if the path refers to a dataset, a topic or a sole model:
        queue = []
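        # queue collects the resolved model files discovered below.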
        f = utils.reader.resolve_against_iobase(args.file, iobasedir)
        if path.exists(path.join(f, "index.json")):
            # is_dataset
            d = DataSet(f)
            # unroll to get topics
            for t in d.get_topics():
                for (mf, mt) in t.get_models():
                    mf = path.normpath(mf)
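                    # Strip the iobasedir prefix (plus the path separator) to log a path relative to the IO base.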
                    pref = path.commonprefix([mf, iobasedir])
                    tn = mf[len(pref) + 1:]
                    print("shortened:", tn)
                    queue.append(mf)

                    # topics.append([t.get_name for t in d.get_topics()])

        elif path.exists(path.join(f, "task.json")):
            # is topic
            t = Topic(f)
            for (mf, mt) in t.get_models():
                mf = path.normpath(mf)
                pref = path.commonprefix([mf, iobasedir])