Example #1
    def get_scores(self, summary, models):
        # Write the candidate summary to a temp file alongside the model
        # summaries, run ROUGE over them, and parse the recall scores.
        write_to_file(
            summary,
            path.join(self.temp_dir, self.reference_summary_temp_filename),
            True)

        models_dir = path.dirname(models[0][0])
        config = self.create_config([self.reference_summary_temp_filename], models, models_dir)

        write_to_file(config, self.temp_config_file, True)

        result = self.execute_rouge()

        result_dict = self.extract_results(result.decode('utf-8'))

        R1score = float(result_dict["1"]['ROUGE-1 R'])
        R2score = float(result_dict["1"]['ROUGE-2 R'])
        if self.verbose:
            R3score = float(result_dict["1"]['ROUGE-3 R'])
            R4score = float(result_dict["1"]['ROUGE-4 R'])
        if self.rouge_l:
            RLscore = float(result_dict["1"]['ROUGE-L R'])
        RSU4score = float(result_dict["1"]['ROUGE-SU* R'])
        if self.verbose and self.rouge_l:
            return R1score, R2score, R3score, R4score, RLscore, RSU4score
        elif self.verbose and not self.rouge_l:
            return R1score, R2score, R3score, R4score, RSU4score
        elif not self.verbose and self.rouge_l:
            return R1score, R2score, RLscore, RSU4score
        else:
            return R1score, R2score, RSU4score
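The shape of the returned tuple depends on the wrapper's verbose and rouge_l flags, so callers must unpack it to match their configuration. A hedged usage sketch (the rouge instance name is an assumption, not taken from the source):

# With verbose=False and rouge_l=False, three recall scores come back:
r1, r2, rsu4 = rouge.get_scores(summary, models)

# With verbose=True and rouge_l=True, the same call yields six:
r1, r2, r3, r4, rl, rsu4 = rouge.get_scores(summary, models)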
Example #2
    def write_summarize_output_json(self,
                                    sf,
                                    confirmatory_summary,
                                    derived_records,
                                    log,
                                    recom_sentences,
                                    result,
                                    run_id,
                                    summarizer,
                                    summary,
                                    pickle_store=None):
        # convert the sentences into a jsonizable structure:
        sents = convert_to_json(sf.summarizer.sentences)
        outputfilecontents = {
            "picklein": None,
            "pickleout": pickle_store,
            "summary": summary,
            "confirmatory_summary": list(confirmatory_summary),
            "exploratory_summary": list(recom_sentences),
            "type": summarizer,
            "run_id": run_id,
            "weights": sf.summarizer.weights,
            "fbs_weights": dict(sf.feedbackstore.get_weights()),
            "details": derived_records,
            "sentences": list(sents),
            "full": result,
            "score": sf.log_sir_info_data
        }
        json_content = json.dumps(outputfilecontents)
        if self.out is not None:
            log.info("writing output to %s" % (self.out))
            write_to_file(json_content, self.out)
        return json_content
Example #3
    def create_processed(self, topic_description):
        filename = path.join(
            create_dir(
                path.join(self.output_directory, topic_description["id"])),
            "task.json")
        retval = json.dumps(topic_description)
        write_to_file(retval, filename)
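All of the examples on this page delegate the actual disk I/O to a project-local write_to_file helper whose definition is not shown. A minimal sketch of what such a helper might look like, assuming a (content, filename, create_dirs) signature; the meaning of the third positional flag seen in Example #1 is a guess, not taken from the source:

import codecs
import os


def write_to_file(content, filename, create_dirs=False):
    # Hypothetical reconstruction: the real helper lives in the examples'
    # codebase and may differ. The flag is assumed to request creation of
    # missing parent directories before writing.
    if create_dirs:
        parent = os.path.dirname(filename)
        if parent and not os.path.isdir(parent):
            os.makedirs(parent)
    # The examples write JSON and plain text; UTF-8 is a sane default.
    with codecs.open(filename, "w", encoding="utf-8") as fh:
        fh.write(content)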
Example #4
def load_ub_summary(language,
                    docs,
                    models,
                    size,
                    ngram_type=2,
                    base_dir=path.normpath(
                        path.expanduser("~/.ukpsummarizer/cache/"))):
    import hashlib

    # Cache key: hash the sorted doc/model basenames plus the summary
    # parameters, so identical inputs map to the same cache file.
    m = hashlib.sha256()
    shortened_docs = [path.split(f)[1] for (f, _) in docs]
    for doc in sorted(shortened_docs):
        m.update(doc.encode("utf-8"))  # encode for Python 3 compatibility
    shortened_models = [path.split(f)[1] for (f, _) in models]
    for model in sorted(shortened_models):
        m.update(model.encode("utf-8"))
    m.update(str(size).encode("utf-8"))
    m.update(language.encode("utf-8"))
    m.update(str(ngram_type).encode("utf-8"))
    h = m.hexdigest()
    jsonloc = path.normpath(path.join(base_dir, h + ".json"))
    if path.isfile(jsonloc):
        try:
            with open(jsonloc) as fh:
                ubs = json.load(fh)
            return ubs["summary"]
        except (IOError, ValueError, KeyError):
            # Corrupt or unreadable cache entry: fall through and recompute.
            pass
    upsum = ExtractiveUpperbound(language)
    ub_summary = upsum(docs, models, size, ngram_type)
    jdict = {
        "docs": sorted(shortened_docs),
        "summary": ub_summary,
        "models": sorted(shortened_models),
        "size": size,
        "language": language,
        "ngram_type": ngram_type
    }
    j = json.dumps(jdict)
    write_to_file(j, jsonloc)
    return ub_summary
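Because the cache key covers every input (documents, models, size, language, n-gram order), a repeated call with identical arguments is served from the JSON cache instead of re-solving the upper bound. A hedged usage sketch; the paths and texts are invented:

# docs and models are (filepath, text) pairs, as the basename extraction implies.
docs = [("/data/d30001t/doc1.txt", "First document text ..."),
        ("/data/d30001t/doc2.txt", "Second document text ...")]
models = [("/data/d30001t/model_a.txt", "Reference summary text ...")]

summary = load_ub_summary("english", docs, models, size=100)
# An identical second call finds the cached JSON and skips ExtractiveUpperbound.
summary_again = load_ub_summary("english", docs, models, size=100)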
Example #5
    def write_continue_output_result(self,
                                     sf,
                                     unlabeled_data=None,
                                     picklein=None,
                                     pickleout=None,
                                     summary=None,
                                     summary_sentences=None,
                                     exploratory_sentences=None):
        log = logging.getLogger("SingleTopicRunner")
        if self.out is not None:

            derived_records = []
            # construct table-like array of feedbacks per iteration.
            for i, record in enumerate(sf.flight_recorder.records):
                for accept in record.accept:
                    derived_records.append({
                        "iteration": i,
                        "concept": accept,
                        "value": "accept"
                    })
                for reject in record.reject:
                    derived_records.append({
                        "iteration": i,
                        "concept": reject,
                        "value": "reject"
                    })
                for implicit_reject in record.implicit_reject:
                    derived_records.append({
                        "iteration": i,
                        "concept": implicit_reject,
                        "value": "implicit_reject"
                    })

            already_labeled = set(
                r.get("concept", "") for r in derived_records)
            for item in unlabeled_data or []:  # guard: unlabeled_data defaults to None
                if item not in already_labeled:
                    already_labeled.add(item)
                    derived_records.append({
                        "iteration": -1,
                        "concept": item,
                        "value": "recommendation",
                        "weight": sf.summarizer.weights.get(item, 0.0),
                        "uncertainity": sf.svm_uncertainity.get(item, -1.0)
                    })
                else:
                    log.info(
                        "recommendation included an already labeled instance, '%s'"
                        % (item))

            outputfilecontents = {
                "picklein": picklein,
                "pickleout": pickleout,
                "summary": summary,
                "confirmatory_summary": list(summary_sentences),
                "exploratory_summary": list(exploratory_sentences),
                "weights": sf.summarizer.weights,
                "fbs_weights": dict(sf.feedbackstore.get_weights()),
                "sentence_ids": list(summary_sentences),
                "details": derived_records,
                "score": sf.log_sir_info_data
            }

            write_to_file(json.dumps(outputfilecontents), self.out)
            log.info("writing output to %s" % (self.out))
        log.info("done writing output")
Example #6
    def run(self,
            topic_path,
            size=None,
            summarizer="SUME",
            summary_idx=None,
            parser=None,
            oracle="accept",
            feedback_log=None,
            propagation=False,
            max_iteration_count=10,
            preload_embeddings=None,
            feedbackstore=None,
            override_results_files=False,
            num_clusters=8):
        log = logging.getLogger("SingleTopicRunner")

        sf = None  # just for the sake of being able to run without simulated feedback...
        self.tlog.debug("SingleTopicRunner started")
        # relativize the topic path!
        if isinstance(topic_path, Topic):
            topic = topic_path
        else:
            if topic_path.startswith("/"):
                relative_path = re.search('^(/)(.*)$', topic_path).group(2)
            else:
                relative_path = topic_path

            topic = Topic(
                path.join(self.iobasedir, path.normpath(relative_path)))
        language = topic.get_language()
        docs = topic.get_docs()
        summaries = topic.get_models()

        flightrecorder = get_flightrecorder_from_file(feedback_log)
        # the number of iterations that already happened, per the provided feedback_log
        preceding_size = len(flightrecorder.records)

        embeddings = None
        """
        if preload_embeddings:
            embeddings_path = path.normpath(path.join(self.iobasedir, "embeddings"))
            embeddings = load_w2v_embeddings(embeddings_path, language, 'active_learning')
        else:
            embeddings = preload_embeddings
        """

        if summary_idx is not None:
            summaries = [summaries[summary_idx]]

        if size is None:
            use_size = topic.get_summary_size()
        else:
            use_size = size

        clusters_path = path.join(self.iobasedir, 'clustering',
                                  '{}'.format(num_clusters))
        #print(clusters_path)
        #clusters = get_clusters(clusters_path, topic.docs_dir)

        if summarizer == "SUME":
            sw = SumeWrap(language)
            summary = sw(docs, use_size)
            outputfilecontents = {
                "summary": summary,
                "type": summarizer,
                "info_data": []
            }

            json_content = json.dumps(outputfilecontents)
            if self.out is not None:
                log.info("writing output to %s" % (self.out))
                write_to_file(json_content, self.out)
            write_to_file(
                json_content,
                path.normpath(
                    path.expanduser(
                        path.join(self.iobasedir, "tmp", "tmp.json"))))
        elif summarizer == "UPPER_BOUND":
            ub_summary = load_ub_summary(language,
                                         docs,
                                         summaries,
                                         use_size,
                                         base_dir=self.iobasedir)
            summary = '\n'.join(ub_summary)

            outputfilecontents = {
                "summary": summary,
                "type": summarizer,
                "info_data": []
            }

            json_content = json.dumps(outputfilecontents)
            if self.out is not None:
                log.info("writing output to %s" % (self.out))
                write_to_file(json_content, self.out)
            write_to_file(
                json_content,
                path.normpath(
                    path.expanduser(
                        path.join(self.iobasedir, "tmp", "tmp.json"))))
        elif summarizer == "PROPAGATION":
            #UB considering all the summaries
            ub_summary = load_ub_summary(language,
                                         docs,
                                         summaries,
                                         use_size,
                                         base_dir=self.iobasedir)
            summary = '\n'.join(ub_summary)
            ub_scores = self.rouge(summary, summaries, use_size)

            log.debug(
                "UB scores: R1:%s R2:%s SU4:%s" %
                (str(ub_scores[0]), str(ub_scores[1]), str(ub_scores[2])))

            ref_summ = random.choice(summaries)

            parse_info = []
            #parse_info = topic.get_parse_info(summaries.index(ref_summ))

            # initialize the Algorithm.
            run_config = dict()
            run_config['rank_subset'] = True
            run_config['relative_k'] = True
            run_config['dynamic_k'] = False
            for flag in ['adaptive_sampling', 'strategy']:
                run_config[flag] = False

            r = 0
            clusters = None
            log.info("recording k_size in summarize %f", self.k)
            #TODO: Added summaries instead of one single summary
            sf = SimulatedFeedback(
                language,
                self.rouge,
                embeddings=None,  #TODO: embeddings
                docs=docs,
                models=summaries,
                summary_length=use_size,
                oracle_type=oracle,
                ub_score=ub_scores,
                ub_summary=ub_summary,
                parser_type=parser,
                flightrecorder=flightrecorder,
                feedbackstore=feedbackstore,
                parse_info=parse_info,
                run_config=run_config,
                k=self.k,
                adaptive_window_size=r,
                clusters=clusters)

            if sf.embeddings is None or sf.embeddings == {}:
                embe_var = "none"  # bug fix: a trailing comma here used to make this a tuple
            elif sf.embeddings.embedding_variant is None:
                embe_var = "none"
            else:
                embe_var = sf.embeddings.embedding_variant
            if feedbackstore is None:
                cfg = {"type": "Unconfigured default"}
            else:
                cfg = feedbackstore.get_config()

            rs = []
            for p, t in [ref_summ]:
                rs.append({"name": os.path.split(p)[1], "text": t})

            run_id_string = "%s-%s-%s-%s-%s-%s-%s-%s" % (
                oracle, summarizer, parser, embe_var, topic.get_dataset(),
                topic.get_name(), [item["name"]
                                   for item in rs], json.dumps(cfg))

            run_id = hashlib.sha224(
                run_id_string.encode("utf-8")).hexdigest()  # encode for Python 3
            filename = path.join(self.scores_storage_path,
                                 "result-%s.json" % (run_id))

            if (os.path.exists(filename) and self.out is None
                    and self.override_results_switch is False):
                log.info(
                    "Skipping run_id '%s' because the result file already exists. config: %s"
                    % (run_id, run_id_string))
                return
            else:
                log.info("Doing %s iterations for run_id '%s'\n %s" %
                         (max_iteration_count, run_id, run_id_string))
                write_to_file("", filename)

            summary, confirmatory_summary, exploratory_summary = sf.run_full_simulation(
                max_iteration_count=max_iteration_count)

            recommendations, recom_sentences = sf.get_recommendations()

            derived_records = []
            # construct table-like array of feedbacks per iteration.
            for i, record in enumerate(sf.flight_recorder.records):
                for accept in record.accept:
                    derived_records.append({
                        "iteration": i,
                        "concept": accept,
                        "value": "accept"
                    })
                for reject in record.reject:
                    derived_records.append({
                        "iteration": i,
                        "concept": reject,
                        "value": "reject"
                    })
                for implicit_reject in record.implicit_reject:
                    derived_records.append({
                        "iteration": i,
                        "concept": implicit_reject,
                        "value": "implicit_reject"
                    })

            for item in recommendations:
                derived_records.append({
                    "iteration": -1,
                    "concept": item,
                    "value": "recommendation",
                    "weight": sf.summarizer.weights.get(item, 0.0),
                    "uncertainity": sf.svm_uncertainity.get(item, -1.0)
                })

            result = {
                "config_run_id": run_id,
                "config_oracle_type": oracle,
                "config_summarizer_type": summarizer,
                "config_parse_type": str(parser),
                #"config_wordembeddings": emb_var,
                "config_feedbackstore": sf.feedbackstore.get_config(),
                "config_feedback_interpretation": {},
                "config_concept_recommendation": {},
                "dataset": topic.get_dataset(),
                "topic": topic.get_name(),
                "models": rs,
                "model_rougescores": {
                    "iteration": -1,
                    "ROUGE-1 R score": ub_scores[0],
                    "ROUGE-2 R score": ub_scores[1],
                    "ROUGE-SU* R score": ub_scores[2],
                    "accepted": [],
                    "accept_count": 0,
                    "rejected": [],
                    "reject_count": 0,
                    "summary": ub_summary
                },
                "result_summary": summary,
                "result_rougescores": sf.log_sir_info_data,
                "log_feedbacks": derived_records
            }

            r2 = [{
                "iteration": i,
                "summary": sf.log_info_data[i]
            } for i in range(len(sf.flight_recorder.records))]
            log.debug(
                "records: %s, infos %s, diff: %s" %
                (len(sf.flight_recorder.records), len(sf.log_info_data),
                 len(sf.flight_recorder.records) - len(sf.log_info_data)))

            write_to_file(json.dumps(result), filename)
            log.info("Writing results to %s" % (filename))

            df = pd.DataFrame(derived_records)
            filename = path.join(self.scores_storage_path,
                                 "flightrecorder-%s.csv" % (run_id))
            log.info("saving flightrecorder to %s with run_id %s" %
                     (filename, run_id))
            df.to_csv(filename, encoding="UTF-8")

            weights_history_path = path.join(
                self.scores_storage_path,
                "weightshistory-%s-%s-%s-%s.json" %
                (topic.get_dataset(), topic.get_name(), summarizer, run_id))
            write_to_file(
                json.dumps(sf.new_debug_weights_history), weights_history_path)
            log.info("Writing weights history to %s" % (weights_history_path))
            weights_hist = pd.DataFrame(sf.new_debug_weights_history)

            filename = path.join(self.scores_storage_path,
                                 "weightshistory-%s.csv" % (run_id))
            weights_hist.to_csv(filename, encoding="UTF-8")

            log.debug("----------------------------------------------")
            log.debug(summary)
            log.debug(sf.log_info_data[-1])
            log.debug("----------------------------------------------")
            if self.pickle_store is not None:
                # Pickle dictionary using protocol 0.
                print('Pickle in file %s' % self.pickle_store)
                self.pickle_write(sf, self.pickle_store, log)

            json_content = self.write_summarize_output_json(
                sf, confirmatory_summary, derived_records, log,
                recom_sentences, result, run_id, summarizer, summary,
                self.pickle_store)
            # write_to_file(json_content, path.normpath(path.expanduser(path.join(self.iobasedir, "tmp", "tmp.json"))))
        else:
            raise BaseException("You should tell which summarizer to use")

        if sf is not None:
            write_details_file([sf.log_info_data],
                               path.join(self.iobasedir, "tmp", "tmp.csv"))
        self.tlog.debug("SingleTopicRunner finished")
Example #7
def print_graph_stats(G, class_type=None):
    stats = {}
    g = stats["graph"] = {}
    if class_type is not None:
        g["_type"] = class_type
    g["name"] = G.name
    type_name = [type(G).__name__]
    g["type_name"] = ",".join(type_name)
    g["node_count"] = G.number_of_nodes()
    g["edge_count"] = G.number_of_edges()
    if len(G) > 0:
        if G.is_directed():
            g["type"] = "directed"
            g["indegree_average"] = (sum(G.in_degree().values()) /
                                     float(g["node_count"]))
            g["outdegree_average"] = (sum(G.out_degree().values()) /
                                      float(g["node_count"]))
        else:
            g["type"] = "undirected"
            g["degree_average"] = (sum(G.degree().values())) / float(
                g["node_count"])
            g["degree_min"] = min(G.degree().values())
            g["degree_max"] = max(G.degree().values())

    g["density"] = nx.density(G)
    if not nx.is_directed(G):
        g["components"] = []

        cc_nodes = []
        cc_edges = []
        for CC in nx.connected_component_subgraphs(G):
            cc = {"nodes": len(CC.nodes()), "edges": len(CC.edges())}

            cc_nodes.append(len(CC.nodes()))
            cc_edges.append(len(CC.edges()))

            g["components"].append(cc)
        (ccn_arr, ccn_bins) = np.histogram(cc_nodes, bins="sturges")
        (cce_arr, cce_bins) = np.histogram(cc_edges, bins="sturges")
        g["components_overview"] = {
            "count": nx.number_connected_components(G),
            "nodes_hist": {
                "bins": ccn_bins.tolist(),
                "data": ccn_arr.tolist()
            },
            "edges_hist": {
                "bins": cce_bins.tolist(),
                "data": cce_arr.tolist()
            }
        }

    (dha, dhb) = np.histogram(G.degree().values(), bins="sturges")
    g["degree_hist"] = {"bins": dhb.tolist(), "data": dha.tolist()}

    feedbacks = []
    dfs = []
    for (n, d) in G.nodes(data=True):
        dfs.append(d["df"])
        feedbacks.append(d["feedback"][-1])  # most recent feedback value

    (fha, fhb) = np.histogram(feedbacks, bins="sturges")
    g["feedbacks_histogram"] = {"bins": fhb.tolist(), "data": fha.tolist()}
    (cha, chb) = np.histogram(dfs, bins="sturges")
    g["concepts_histogram"] = {"bins": chb.tolist(), "data": cha.tolist()}

    t = datetime.datetime.now()
    ts = int(time.mktime(t.timetuple()))
    # mktemp only generates a name; race-prone, but acceptable for a debug dump
    temp = tempfile.mktemp(prefix=str(ts), suffix=".json")
    logging.getLogger("io").info("feedback-graph stats dumped to %s", temp)
    write_to_file(json.dumps(stats), temp)
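The stats above rely on numpy's automatic "sturges" binning and on .tolist() to keep the histograms JSON-serializable. A standalone sketch of that pattern; the sample values are made up:

import json

import numpy as np

# np.histogram returns (counts, bin_edges); bins="sturges" chooses the bin
# count automatically. tolist() turns numpy arrays into plain Python lists
# so json.dumps can serialize them.
values = [1, 1, 2, 3, 5, 8, 13, 21]
counts, bin_edges = np.histogram(values, bins="sturges")
print(json.dumps({"bins": bin_edges.tolist(), "data": counts.tolist()}))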
Example #8
        input_file = path.normpath(args.input)
        with codecs.open(input_file, 'r', 'utf-8') as fp:
            text = fp.read().splitlines()
        summary = text or ""

        f = utils.reader.resolve_against_iobase(args.reference, iobasedir)
        if path.exists(path.join(f, "task.json")):
            # is topic
            t = Topic(f)

            # run rouge on topic
            lang = t.get_language()
            max_size = t.get_summary_size()

            # resolved_rouge_dir = path.normpath(path.expanduser(rouge_dir))
            rouge = Rouge(rouge_dir)
            # reference_summaries = [mt for _, mt in t.get_models()]

            r1, r2, r4 = rouge(summary, t.get_models(), max_size)
            outputfilecontents = {
                "R1": r1,
                "R2": r2,
                "R4": r4
            }
            write_to_file(json.dumps(outputfilecontents), outfile)
        else:
            raise BaseException("Invalid file given.", f, " is neither a topic nor a model.")

        log("Done with rouge")
    log("Done")