def get_scores(self, summary, models):
    write_to_file(summary,
                  path.join(self.temp_dir, self.reference_summary_temp_filename),
                  True)
    models_dir = path.dirname(models[0][0])
    config = self.create_config([self.reference_summary_temp_filename], models,
                                models_dir)
    write_to_file(config, self.temp_config_file, True)

    result = self.execute_rouge()
    result_dict = self.extract_results(result.decode('utf-8'))

    R1score = float(result_dict["1"]['ROUGE-1 R'])
    R2score = float(result_dict["1"]['ROUGE-2 R'])
    if self.verbose:
        R3score = float(result_dict["1"]['ROUGE-3 R'])
        R4score = float(result_dict["1"]['ROUGE-4 R'])
    if self.rouge_l:
        RLscore = float(result_dict["1"]['ROUGE-L R'])
    RSU4score = float(result_dict["1"]['ROUGE-SU* R'])

    if self.verbose and self.rouge_l:
        return R1score, R2score, R3score, R4score, RLscore, RSU4score
    elif self.verbose and not self.rouge_l:
        return R1score, R2score, R3score, R4score, RSU4score
    elif not self.verbose and self.rouge_l:
        return R1score, R2score, RLscore, RSU4score
    else:
        return R1score, R2score, RSU4score
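# A minimal usage sketch (not part of the original module), assuming `rouge` is an already
# configured instance of this class: the shape of the returned tuple depends on the
# `verbose` and `rouge_l` flags set on the instance.
#
#   rouge.verbose, rouge.rouge_l = False, False
#   r1, r2, su4 = rouge.get_scores(summary, models)
#
#   rouge.verbose, rouge.rouge_l = True, True
#   r1, r2, r3, r4, rl, su4 = rouge.get_scores(summary, models)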
def write_summarize_output_json(self, sf, confirmatory_summary, derived_records, log,
                                recom_sentences, result, run_id, summarizer, summary,
                                pickle_store=None):
    # convert the sentences into a jsonizable structure:
    sents = convert_to_json(sf.summarizer.sentences)

    outputfilecontents = {
        "picklein": None,
        "pickleout": pickle_store,
        "summary": summary,
        "confirmatory_summary": list(confirmatory_summary),
        "exploratory_summary": list(recom_sentences),
        "type": summarizer,
        "run_id": run_id,
        "weights": sf.summarizer.weights,
        "fbs_weights": dict(sf.feedbackstore.get_weights()),
        "details": derived_records,
        "sentences": list(sents),
        "full": result,
        "score": sf.log_sir_info_data
    }

    json_content = json.dumps(outputfilecontents)
    if self.out is not None:
        log.info("writing output to %s" % (self.out))
        write_to_file(json_content, self.out)
    return json_content
def create_processed(self, topic_description):
    filename = path.join(
        create_dir(path.join(self.output_directory, topic_description["id"])),
        "task.json")
    retval = json.dumps(topic_description)
    write_to_file(retval, filename)
def load_ub_summary(language,
                    docs,
                    models,
                    size,
                    ngram_type=2,
                    base_dir=path.normpath(path.expanduser("~/.ukpsummarizer/cache/"))):
    import hashlib

    # The cache key is a sha256 over the sorted doc/model basenames plus the
    # summary size, the language and the ngram type.
    m = hashlib.sha256()
    shortened_docs = [path.split(f)[1] for (f, _) in docs]
    for doc in sorted(shortened_docs):
        m.update(doc)
    shortened_models = [path.split(f)[1] for (f, _) in models]
    for model in sorted(shortened_models):
        m.update(model)
    m.update(str(size))
    m.update(language)
    m.update(str(ngram_type))
    h = m.hexdigest()

    jsonloc = path.normpath(path.join(base_dir, h + ".json"))
    if path.isfile(jsonloc):
        try:
            ubs = json.load(open(jsonloc))
            upsum = ubs["summary"]
            return upsum
        except:
            # any failure to read the cached file falls through to recomputation
            pass

    upsum = ExtractiveUpperbound(language)
    ub_summary = upsum(docs, models, size, ngram_type)
    jdict = {
        "docs": sorted(shortened_docs),
        "summary": ub_summary,
        "models": sorted(shortened_models),
        "size": size,
        "language": language,
        "ngram_type": ngram_type
    }
    j = json.dumps(jdict)
    write_to_file(j, jsonloc)
    return ub_summary
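# Illustrative sketch (not in the original source): load_ub_summary's cache key is a sha256
# over the sorted doc/model basenames plus size, language and ngram_type, so reordering the
# inputs hits the same "<hexdigest>.json" cache file. The paths and values below are made up;
# hashing raw str objects assumes Python 2, as in the function above.
def _example_ub_cache_key():
    import hashlib
    from os import path as osp
    docs = [("/corpus/topicA/d02.txt", ""), ("/corpus/topicA/d01.txt", "")]
    models = [("/corpus/topicA/m01.txt", "")]
    m = hashlib.sha256()
    for name in sorted(osp.split(f)[1] for f, _ in docs):
        m.update(name)      # document basenames, order-independent
    for name in sorted(osp.split(f)[1] for f, _ in models):
        m.update(name)      # model basenames, order-independent
    m.update(str(100))      # summary size
    m.update("english")     # language
    m.update(str(2))        # ngram_type
    return m.hexdigest() + ".json"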
def write_continue_output_result(self, sf, unlabeled_data=None, picklein=None,
                                 pickleout=None, summary=None, summary_sentences=None,
                                 exploratory_sentences=None):
    log = logging.getLogger("SingleTopicRunner")
    if self.out is not None:
        derived_records = []
        # construct a table-like array of feedbacks per iteration.
        for i, record in enumerate(sf.flight_recorder.records):
            for accept in record.accept:
                derived_records.append({
                    "iteration": i,
                    "concept": accept,
                    "value": "accept"
                })
            for reject in record.reject:
                derived_records.append({
                    "iteration": i,
                    "concept": reject,
                    "value": "reject"
                })
            for implicit_reject in record.implicit_reject:
                derived_records.append({
                    "iteration": i,
                    "concept": implicit_reject,
                    "value": "implicit_reject"
                })

        for item in unlabeled_data:
            if item not in [i.get("concept", "") for i in derived_records]:
                derived_records.append({
                    "iteration": -1,
                    "concept": item,
                    "value": "recommendation",
                    "weight": sf.summarizer.weights.get(item, 0.0),
                    "uncertainity": sf.svm_uncertainity.get(item, -1.0)
                })
            else:
                log.info("recommendation included an already labeled instance, '%s'" % (item))

        outputfilecontents = {
            "picklein": picklein,
            "pickleout": pickleout,
            "summary": summary,
            "confirmatory_summary": list(summary_sentences),
            "exploratory_summary": list(exploratory_sentences),
            "weights": sf.summarizer.weights,
            "fbs_weights": dict(sf.feedbackstore.get_weights()),
            "sentence_ids": list(summary_sentences),
            "details": derived_records,
            "score": sf.log_sir_info_data
        }

        write_to_file(json.dumps(outputfilecontents), self.out)
        log.info("writing output to %s" % (self.out))
        log.info("done writing output")
def run(self, topic_path, size=None, summarizer="SUME", summary_idx=None, parser=None,
        oracle="accept", feedback_log=None, propagation=False, max_iteration_count=10,
        preload_embeddings=None, feedbackstore=None, override_results_files=False,
        num_clusters=8):
    log = logging.getLogger("SingleTopicRunner")
    sf = None  # just for the sake of being able to run without simulated feedback...
    self.tlog.debug("SingleTopicRunner started")

    # relativize the topic path!
    if type(topic_path) is Topic:
        topic = topic_path
    else:
        if topic_path.startswith("/"):
            relative_path = re.search('^(/)(.*)$', topic_path).group(2)
        else:
            relative_path = topic_path
        topic = Topic(path.join(self.iobasedir, path.normpath(relative_path)))

    language = topic.get_language()
    docs = topic.get_docs()
    summaries = topic.get_models()

    flightrecorder = get_flightrecorder_from_file(feedback_log)
    # the number of iterations that happened due to the provided feedback_log
    preceding_size = len(flightrecorder.records)

    embeddings = None
    """
    if preload_embeddings:
        embeddings_path = path.normpath(path.join(self.iobasedir, "embeddings"))
        embeddings = load_w2v_embeddings(embeddings_path, language, 'active_learning')
    else:
        embeddings = preload_embeddings
    """

    if summary_idx is not None:
        summaries = [summaries[summary_idx]]

    if size is None:
        use_size = topic.get_summary_size()
    else:
        use_size = size

    clusters_path = path.join(self.iobasedir, 'clustering', '{}'.format(num_clusters))
    # print(clusters_path)
    # clusters = get_clusters(clusters_path, topic.docs_dir)

    if summarizer == "SUME":
        sw = SumeWrap(language)
        summary = sw(docs, use_size)
        outputfilecontents = {
            "summary": summary,
            "type": summarizer,
            "info_data": []
        }
        json_content = json.dumps(outputfilecontents)
        if self.out is not None:
            log.info("writing output to %s" % (self.out))
            write_to_file(json_content, self.out)
        write_to_file(
            json_content,
            path.normpath(path.expanduser(path.join(self.iobasedir, "tmp", "tmp.json"))))
    elif summarizer == "UPPER_BOUND":
        ub_summary = load_ub_summary(language, docs, summaries, use_size,
                                     base_dir=self.iobasedir)
        summary = '\n'.join(ub_summary)
        outputfilecontents = {
            "summary": summary,
            "type": summarizer,
            "info_data": []
        }
        json_content = json.dumps(outputfilecontents)
        if self.out is not None:
            log.info("writing output to %s" % (self.out))
            write_to_file(json_content, self.out)
        write_to_file(
            json_content,
            path.normpath(path.expanduser(path.join(self.iobasedir, "tmp", "tmp.json"))))
    elif summarizer == "PROPAGATION":
        # UB considering all the summaries
        ub_summary = load_ub_summary(language, docs, summaries, use_size,
                                     base_dir=self.iobasedir)
        summary = '\n'.join(ub_summary)
        ub_scores = self.rouge(summary, summaries, use_size)
        log.debug("UB scores: R1:%s R2:%s SU4:%s" %
                  (str(ub_scores[0]), str(ub_scores[1]), str(ub_scores[2])))

        ref_summ = random.choice(summaries)
        parse_info = []
        # parse_info = topic.get_parse_info(summaries.index(ref_summ))

        # initialize the algorithm.
        run_config = dict()
        run_config['rank_subset'] = True
        run_config['relative_k'] = True
        run_config['dynamic_k'] = False
        for flag in ['adaptive_sampling', 'strategy']:
            run_config[flag] = False

        r = 0
        clusters = None
        log.info("recording k_size in summarize %f", self.k)

        # TODO: Added summaries instead of one single summary
        sf = SimulatedFeedback(
            language,
            self.rouge,
            embeddings=None,  # TODO: embeddings
            docs=docs,
            models=summaries,
            summary_length=use_size,
            oracle_type=oracle,
            ub_score=ub_scores,
            ub_summary=ub_summary,
            parser_type=parser,
            flightrecorder=flightrecorder,
            feedbackstore=feedbackstore,
            parse_info=parse_info,
            run_config=run_config,
            k=self.k,
            adaptive_window_size=r,
            clusters=clusters)

        if sf.embeddings is None or sf.embeddings == {}:
            embe_var = "none"
        else:
            if sf.embeddings.embedding_variant is None:
                embe_var = "none"
            else:
                embe_var = sf.embeddings.embedding_variant

        if feedbackstore is None:
            cfg = {"type": "Unconfigured default"}
        else:
            cfg = feedbackstore.get_config()

        rs = []
        for p, t in [ref_summ]:
            rs.append({"name": os.path.split(p)[1], "text": t})

        run_id_string = "%s-%s-%s-%s-%s-%s-%s-%s" % (
            oracle, summarizer, parser, embe_var, topic.get_dataset(),
            topic.get_name(), [item["name"] for item in rs], json.dumps(cfg))
        run_id = hashlib.sha224(run_id_string).hexdigest()

        filename = path.join(self.scores_storage_path, "result-%s.json" % (run_id))
        if (os.path.exists(filename) and self.out is None
                and self.override_results_switch is False):
            log.info("Skipping run_id '%s' because the result file already exists. "
                     "config: %s" % (run_id, run_id_string))
            return
        else:
            log.info("Doing %s iterations for run_id '%s'\n %s" %
                     (max_iteration_count, run_id, run_id_string))
            write_to_file("", filename)

        summary, confirmatory_summary, exploratory_summary = sf.run_full_simulation(
            max_iteration_count=max_iteration_count)
        recommendations, recom_sentences = sf.get_recommendations()

        derived_records = []
        # construct a table-like array of feedbacks per iteration.
        for i, record in enumerate(sf.flight_recorder.records):
            for accept in record.accept:
                derived_records.append({
                    "iteration": i,
                    "concept": accept,
                    "value": "accept"
                })
            for reject in record.reject:
                derived_records.append({
                    "iteration": i,
                    "concept": reject,
                    "value": "reject"
                })
            for implicit_reject in record.implicit_reject:
                derived_records.append({
                    "iteration": i,
                    "concept": implicit_reject,
                    "value": "implicit_reject"
                })
        for item in recommendations:
            derived_records.append({
                "iteration": -1,
                "concept": item,
                "value": "recommendation",
                "weight": sf.summarizer.weights.get(item, 0.0),
                "uncertainity": sf.svm_uncertainity.get(item, -1.0)
            })

        result = {
            "config_run_id": run_id,
            "config_oracle_type": oracle,
            "config_summarizer_type": summarizer,
            "config_parse_type": str(parser),
            # "config_wordembeddings": embe_var,
            "config_feedbackstore": sf.feedbackstore.get_config(),
            "config_feedback_interpretation": {},
            "config_concept_recommendation": {},
            "dataset": topic.get_dataset(),
            "topic": topic.get_name(),
            "models": rs,
            "model_rougescores": {
                "iteration": -1,
                "ROUGE-1 R score": ub_scores[0],
                "ROUGE-2 R score": ub_scores[1],
                "ROUGE-SU* R score": ub_scores[2],
                "accepted": [],
                "accept_count": 0,
                "rejected": [],
                "reject_count": 0,
                "summary": ub_summary
            },
            "result_summary": summary,
            "result_rougescores": sf.log_sir_info_data,
            "log_feedbacks": derived_records
        }

        r2 = [{
            "iteration": i,
            "summary": sf.log_info_data[i]
        } for i in range(len(sf.flight_recorder.records))]
        log.debug("records: %s, infos %s, diff: %s" %
                  (len(sf.flight_recorder.records), len(sf.log_info_data),
                   len(sf.flight_recorder.records) - len(sf.log_info_data)))

        write_to_file(json.dumps(result), filename)
        log.info("Writing results to %s" % (filename))

        df = pd.DataFrame(derived_records)
        filename = path.join(self.scores_storage_path, "flightrecorder-%s.csv" % (run_id))
        log.info("saving flightrecorder to %s with run_id %s" % (filename, run_id))
        df.to_csv(filename, encoding="UTF-8")

        weights_json = path.join(
            self.scores_storage_path,
            "weightshistory-%s-%s-%s-%s.json" %
            (topic.get_dataset(), topic.get_name(), summarizer, run_id))
        write_to_file(json.dumps(sf.new_debug_weights_history), weights_json)
        log.info("Writing weights history to %s" % (weights_json))

        weights_hist = pd.DataFrame(sf.new_debug_weights_history)
        filename = path.join(self.scores_storage_path, "weightshistory-%s.csv" % (run_id))
        weights_hist.to_csv(filename, encoding="UTF-8")

        log.debug("----------------------------------------------")
        log.debug(summary)
        log.debug(sf.log_info_data[-1])
        log.debug("----------------------------------------------")

        if self.pickle_store is not None:
            # Pickle dictionary using protocol 0.
            print('Pickle in file %s' % self.pickle_store)
            self.pickle_write(sf, self.pickle_store, log)

        json_content = self.write_summarize_output_json(
            sf, confirmatory_summary, derived_records, log, recom_sentences, result,
            run_id, summarizer, summary, self.pickle_store)
        # write_to_file(json_content, path.normpath(path.expanduser(path.join(self.iobasedir, "tmp", "tmp.json"))))
    else:
        raise BaseException("You should tell which summarizer to use")

    if sf is not None:
        write_details_file([sf.log_info_data], path.join(self.iobasedir, "tmp", "tmp.csv"))
    self.tlog.debug("SingleTopicRunner finished")
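# A hedged sketch (not in the original) of how the artifacts written by the PROPAGATION
# branch above could be inspected afterwards; `scores_storage_path` and `run_id` are
# assumed to be known.
#
#   import json
#   import pandas as pd
#   result = json.load(open(path.join(scores_storage_path, "result-%s.json" % run_id)))
#   feedbacks = pd.read_csv(path.join(scores_storage_path, "flightrecorder-%s.csv" % run_id))
#   weights = pd.read_csv(path.join(scores_storage_path, "weightshistory-%s.csv" % run_id))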
def print_graph_stats(G, class_type=None):
    stats = {}
    g = stats["graph"] = {}
    if class_type is not None:
        g["_type"] = class_type
    g["name"] = G.name
    type_name = [type(G).__name__]
    g["type_name"] = ",".join(type_name)
    g["node_count"] = G.number_of_nodes()
    g["edge_count"] = G.number_of_edges()

    if len(G) > 0:
        if G.is_directed():
            g["type"] = "directed"
            g["indegree_average"] = (sum(G.in_degree().values()) / float(g["node_count"]))
            g["outdegree_average"] = (sum(G.out_degree().values()) / float(g["node_count"]))
        else:
            g["type"] = "undirected"
            g["degree_average"] = sum(G.degree().values()) / float(g["node_count"])
            g["degree_min"] = min(G.degree().values())
            g["degree_max"] = max(G.degree().values())

    g["density"] = nx.density(G)

    if not nx.is_directed(G):
        g["components"] = []
        cc_nodes = []
        cc_edges = []
        for CC in nx.connected_component_subgraphs(G):
            cc = {"nodes": len(CC.nodes()), "edges": len(CC.edges())}
            cc_nodes.append(len(CC.nodes()))
            cc_edges.append(len(CC.edges()))
            g["components"].append(cc)
        (ccn_arr, ccn_bins) = np.histogram(cc_nodes, bins="sturges")
        (cce_arr, cce_bins) = np.histogram(cc_edges, bins="sturges")
        g["components_overview"] = {
            "count": nx.number_connected_components(G),
            "nodes_hist": {
                "bins": ccn_bins.tolist(),
                "data": ccn_arr.tolist()
            },
            "edges_hist": {
                "bins": cce_bins.tolist(),
                "data": cce_arr.tolist()
            }
        }

    (dha, dhb) = np.histogram(G.degree().values(), bins="sturges")
    g["degree_hist"] = {"bins": dhb.tolist(), "data": dha.tolist()}

    feedbacks = []
    dfs = []
    for (n, d) in G.nodes(data=True):
        dfs.append(d["df"])
        feedbacks.append(d["feedback"][-1:][0])
    (fha, fhb) = np.histogram(feedbacks, bins="sturges")
    g["feedbacks_histogram"] = {"bins": fhb.tolist(), "data": fha.tolist()}
    (cha, chb) = np.histogram(dfs, bins="sturges")
    g["concepts_histogram"] = {"bins": chb.tolist(), "data": cha.tolist()}

    t = datetime.datetime.now()
    ts = int(time.mktime(t.timetuple()))
    temp = tempfile.mktemp(prefix=str(ts), suffix=".json")
    logging.getLogger("io").info("feedback-graph stats dumped to %s", temp)
    write_to_file(json.dumps(stats), temp)
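# A minimal, hypothetical example of the node-attribute schema print_graph_stats expects:
# each node carries a concept frequency under "df" and a feedback history list under
# "feedback" (only the last entry is histogrammed). Assumes the networkx 1.x API used
# above (dict-returning degree(), connected_component_subgraphs()).
def _example_print_graph_stats():
    demo = nx.Graph(name="demo-feedback-graph")
    demo.add_node("concept_a", df=3, feedback=[0.0, 1.0])
    demo.add_node("concept_b", df=1, feedback=[-1.0])
    demo.add_node("concept_c", df=2, feedback=[0.5])
    demo.add_edge("concept_a", "concept_b")
    print_graph_stats(demo, class_type="demo")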
input_file = path.normpath(args.input)
with codecs.open(input_file, 'r', 'utf-8') as fp:
    text = fp.read().splitlines()
summary = text or ""

f = utils.reader.resolve_against_iobase(args.reference, iobasedir)
if path.exists(path.join(f, "task.json")):
    # is a topic
    t = Topic(f)

    # run rouge on the topic
    lang = t.get_language()
    max_size = t.get_summary_size()
    # resolved_rouge_dir = path.normpath(path.expanduser(rouge_dir))
    rouge = Rouge(rouge_dir)
    # reference_summaries = [mt for _, mt in t.get_models()]
    r1, r2, r4 = rouge(summary, t.get_models(), max_size)  # third value is the ROUGE-SU* recall (see get_scores)
    outputfilecontents = {
        "R1": r1,
        "R2": r2,
        "R4": r4
    }
    write_to_file(json.dumps(outputfilecontents), outfile)
else:
    raise BaseException("Invalid file given: %s is neither a topic nor a model." % (f))
log("Done with rouge")
log("Done")