def summerize(file):
    """
    Run the full pipeline on one input file.

    The file is preprocessed into paragraphs, features are extracted from
    them, a 'present' column is attached to the feature table, a short
    preview is printed, and the table is handed to the training/plotting
    step.

    Parameters
        - file : passed unchanged to ``preprocess_file``

    Returns nothing; results are printed and forwarded to
    ``train_and_plot_results``.
    """
    raw_paragraphs = preprocess_file(file)
    cleaned_paragraphs, inverse_sentence_structure = preprocess_paragraphs(
        raw_paragraphs)

    data = feature_extraction(cleaned_paragraphs)

    # NOTE(review): `outputfile` is neither a parameter nor a local of this
    # function, so it has to exist at module level when this runs -- confirm.
    data['present'] = create_yes_no_column(inverse_sentence_structure,
                                           outputfile)

    # quick visual sanity check of the assembled table
    print(data.head())
    print(data.columns)

    train_and_plot_results(data)
def plot_most_quoted_countries(data, nb_country):
    '''
    This function draws a bar plot representing the number of occurrences
    of the most-quoted countries and displays it.

    Parameters
        - data : DataFrame sorted by the number of occurrences; it must
          provide an 'Occurrences' column, and its index is used for the
          x-axis labels
        - nb_country : number of leading rows of `data` to plot (selection
          on the most representative countries)
    '''
    # `sns.plt` was only an alias of matplotlib.pyplot and is not exposed by
    # newer seaborn releases (AttributeError), so pyplot is used directly.
    # matplotlib is the library seaborn itself draws with.
    import matplotlib.pyplot as plt

    top_countries = data.head(nb_country)
    countries_plot = sns.barplot(x=top_countries.index, y='Occurrences',
                                 data=top_countries, color='hotpink')

    # country names are long: rotate the tick labels so they do not overlap
    for label in countries_plot.get_xticklabels():
        label.set_rotation(90)

    countries_plot.set(ylabel='Occurrences')
    countries_plot.set_title('Number of occurrences of ' + str(nb_country)
                             + ' most-quoted countries')
    plt.show()
def predict(self, input_dir, output_dir, rw_type, input_format, chunk_len=100,
            test_scores=False, output_confidence=False, special_model_path=None):
    """
    Tags each file in the input directory (txt or tsv files) and writes the results to output_dir.
    Also adds a folder "result_stats" with runtime information to the output_dir.
    tsv files must have at least the column 'tok'; a missing 'sentstart' column is
    added with the value 'no' in every row.

    If no model file is found, a warning is logged and nothing is predicted.
    If the number of predicted tokens differs from the number of rows of the input,
    only the predictions are saved, to a file with the prefix 'warn_'.

    :param input_dir: string value: path to input directory
    :param output_dir: string value: path to output directory
    :param rw_type: string value: direct, indirect, freeIndirect or reported;
                    also used as prefix of the '<rw_type>_pred' / '<rw_type>_conf' columns
    :param input_format: string value: "txt"; any other value makes the file be read as tsv
    :param chunk_len: passed as ``maxlen`` to ``create_sentlist_from_file_batchmax``
    :param test_scores: if True, scores are calculated with ``calculate_scores`` for each file that
                        has both the columns '<rw_type>' and '<rw_type>_pred', and for all of those
                        files together; they are written to result_stats/<rw_type>_test_scores.tsv
    :param output_confidence: if True, the column '<rw_type>_conf' is added to the result files
    :param special_model_path: if given, used instead of rw_type as the name of the model
                               directory below "models" (next to this script)
    :return: None
    """
    # time the prediction
    start_time = datetime.datetime.now().replace(microsecond=0)

    # create a subdir for testing and overview information in the outputdir
    result_subdir = "result_stats"
    os.makedirs(os.path.join(output_dir, result_subdir), exist_ok=True)

    # the model is looked up relative to the current script path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    model_subdir = rw_type if special_model_path is None else special_model_path
    model_path = os.path.join(curr_path, "models", model_subdir, "final-model.pt")

    if not os.path.exists(model_path):
        # same logger as every other message of this method
        self.logger.warning(
            "Predicting {} aborted. Model not found at path '{}'. Please download a model and put it into "
            "the appropriate directory. The model file must be named final-model.pt."
            .format(rw_type, model_path))
        return

    self.logger.info("loading model {}".format(model_path))
    model = SequenceTagger.load(model_path)
    self.logger.info("model loaded")

    pred_col = rw_type + "_pred"
    conf_col = rw_type + "_conf"

    # if test mode, collect score data (initialize in any case)
    score_dict = {"file": [], "f1": [], "precision": [], "recall": []}
    # scored frames are collected and concatenated once at the end
    # (DataFrame.append no longer exists in pandas >= 2.0)
    scored_frames = []

    for file in os.listdir(input_dir):
        # everything from the first dot onwards is replaced by ".tsv"
        resfile_name = re.sub(r"\..+$", ".tsv", file)
        self.logger.info("predicting {}".format(file))

        # read the file and convert to dataframe
        file_path = os.path.join(input_dir, file)
        if input_format == "txt":
            data = self.convert_txtfile_to_dateframe(file_path)
        else:
            data = pd.read_csv(file_path, sep="\t", quoting=3, encoding="utf-8", na_values=[])

        # check for tok column:
        if "tok" not in data.columns:
            self.logger.warning(
                "Column 'tok' is missing in file {}. File will be skipped."
                .format(file))
            continue

        if "sentstart" not in data.columns:
            self.logger.warning(
                "Column 'sentstart' is missing in file {}. Will be added with default values (all 'no')."
                .format(file))
            data["sentstart"] = ["no"] * len(data)
            self.logger.debug("TEST: data head:\n {}".format(data.head(10)))

        # create sentlist (based on max chunk length)
        sent_list = self.create_sentlist_from_file_batchmax(
            data, maxlen=chunk_len, compare_column="NaN")

        # predict
        res_dict = {"tok": [], pred_col: [], conf_col: []}
        for sent in sent_list:
            model.predict(sent)
            entities = sent.to_dict(tag_type="cat")["entities"]
            # only the first label of each entity is used
            labels = [entity["labels"][0].to_dict() for entity in entities]
            res_dict["tok"].extend(entity["text"] for entity in entities)
            res_dict[conf_col].extend(label["confidence"] for label in labels)
            res_dict[pred_col].extend(label["value"] for label in labels)
        pred_df = pd.DataFrame(res_dict)

        # create output
        if len(data) != len(pred_df):
            # if there is a mismatch in file length after prediction, still save the results
            self.logger.warning(
                "File length changed when predicting for file {} (before: {}, after: {})\n"
                "Result file will be saved with prefix 'warn_'; additional columns are lost."
                .format(file, len(data), len(pred_df)))
            pred_df.to_csv(os.path.join(output_dir, "warn_" + resfile_name),
                           index=False, sep="\t", encoding="utf-8")
        else:
            # if everything is okay, add the new column(s) to the original data and save
            if output_confidence:
                data[conf_col] = pred_df[conf_col]
            data[pred_col] = pred_df[pred_col]
            data.to_csv(os.path.join(output_dir, resfile_name),
                        index=False, sep="\t", encoding="utf-8")

        # calculate the testscores:
        if test_scores:
            self.logger.info("Calculate scores for {}".format(file))
            # the prediction column is missing whenever the file length changed above
            if rw_type in data.columns and pred_col in data.columns:
                data, f1, prec, rec = self.calculate_scores(data, rw_type)
                score_dict["file"].append(file)
                score_dict["f1"].append(f1)
                score_dict["precision"].append(prec)
                score_dict["recall"].append(rec)
                scored_frames.append(data)
            else:
                self.logger.warning(
                    "Skipping test scores for file {}: Missing column {} and/or {}"
                    .format(file, rw_type, pred_col))

    end_time = datetime.datetime.now().replace(microsecond=0)

    # write an overview file when the process is finished
    # (the line break in front of "end" was missing its backslash)
    res_text = "RW Tagger (predict): Model {}\n" \
               "Predict time:\nstart: {}\nend:{}\ntotal: {}" \
        .format(model_path, start_time, end_time, end_time - start_time)

    # if in test mode, calculate the final scores (for all the data) and save the test score df
    if test_scores:
        self.logger.info("Calculate total scores")
        # pd.concat raises on an empty list, so fall back to an empty frame
        all_predictions_df = pd.concat(scored_frames) if scored_frames else pd.DataFrame()
        if len(all_predictions_df) > 0:
            self.logger.debug("all_predictions_len: {}".format(len(all_predictions_df)))
            all_predictions_df, f1, prec, rec = self.calculate_scores(all_predictions_df, rw_type)
            score_dict["file"].append("total")
            score_dict["f1"].append(f1)
            score_dict["precision"].append(prec)
            score_dict["recall"].append(rec)
            score_df = pd.DataFrame(score_dict)
            score_df.to_csv(os.path.join(output_dir, result_subdir, rw_type + "_test_scores.tsv"),
                            index=False, sep="\t", encoding="utf-8")
            res_text += "\nTotal test scores (for detailed scores see {}_test_scores.tsv):\n" \
                        "f1: {}, precision: {}, recall: {}".format(rw_type, f1, prec, rec)
            self.logger.info(
                "Total scores for {}: f1: {}, precision: {}, recall: {}"
                .format(rw_type, f1, prec, rec))

    with open(os.path.join(output_dir, result_subdir, rw_type + "_overview.txt"),
              "w", encoding="utf-8") as f:
        f.write(res_text)