def run(args):
    """Executes the main process of the script.

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user.
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))
    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']
    subsets_dida = []
    subsets_notdida = []
    covers = []

    for i in range(1, n + 1):
        print("Starting analysis for {0}-grams".format(i))

        # Process the DIDA class
        subset, set_cover = process_ngrams(i, dida_data, "dida")
        subsets_dida.extend(subset)
        covers.extend(set_cover)

        # Process the Not-DIDA class
        subset, set_cover = process_ngrams(i, notdida_data, "notdida")
        subsets_notdida.extend(subset)
        covers.extend(set_cover)

        display.display_ok("Analysis for {0}-grams done".format(i))

    print("Searching set cover with all grams for DIDA")
    set_cover = get_set_cover(subsets_dida)
    scores = check_score(set_cover, subsets_dida, subsets_dida, dida_data)
    exh.write_text(scores, SCORE_FILENAME.format("dida", "all"))
    display.display_ok("Done")

    print("Searching set cover with all grams for NotDIDA")
    set_cover = get_set_cover(subsets_notdida)
    scores = check_score(set_cover, subsets_notdida, subsets_notdida, notdida_data)
    exh.write_text(scores, SCORE_FILENAME.format("notdida", "all"))
    display.display_ok("Done")

    save_topwords(covers)
    display.display_info("All results were saved in {0} directory".format(DIRECTORY))
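# A minimal sketch of what a greedy set-cover helper such as get_set_cover
# could look like. The real implementation is not shown in this file; the
# subset representation (n-gram -> set of publication IDs it appears in)
# is an assumption made for illustration only.
def greedy_set_cover(subsets):
    """Greedily picks the n-gram covering the most uncovered documents."""
    universe = set().union(*subsets.values())
    covered = set()
    cover = []
    while covered != universe:
        # Choose the n-gram whose publications add the most new coverage
        best = max(subsets, key=lambda gram: len(subsets[gram] - covered))
        if not subsets[best] - covered:
            break  # remaining subsets add nothing new
        cover.append(best)
        covered |= subsets[best]
    return cover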
def run(args):
    """Executes the main process of the script.

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user.
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))
    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']
    for i in range(1, n + 1):
        print("Starting analysis for {0}-grams".format(i))

        print("Counting occurrences for DIDA")
        dida_occurrences = ngh.count_occurrences(i, dida_data)
        dida_normalized = ngh.normalize_occurrences(dida_occurrences, len(dida_data))
        display.display_ok("Counting occurrences for DIDA done")

        print("Counting occurrences for NotDIDA")
        notdida_occurrences = ngh.count_occurrences(i, notdida_data)
        notdida_normalized = ngh.normalize_occurrences(notdida_occurrences, len(notdida_data))
        display.display_ok("Counting occurrences for NotDIDA done")

        # Merge the n-grams of both classes into a single list
        merged = merge_ngrams(dida_normalized, notdida_normalized)
        # Order the n-grams by score difference
        merged = ordered(merged, score)
        # Save the results
        save_to_file(merged, i)
        display.display_ok("Analysis for {0}-grams done".format(i))
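# For context, a rough sketch of what the counting and normalization helpers
# could look like. The actual ngh module is not part of this file; the
# document format assumed here (doc['ngrams'][n] holding a list of n-grams)
# is an illustration, not the project's real schema.
from collections import Counter

def count_ngram_occurrences(n, documents):
    """Counts in how many documents each n-gram of size n appears."""
    counts = Counter()
    for doc in documents:
        # set(): count each document at most once per n-gram
        counts.update(set(doc['ngrams'][n]))
    return counts

def normalize_by_corpus_size(counts, n_documents):
    """Turns raw document counts into frequencies in [0, 1]."""
    return {gram: c / n_documents for gram, c in counts.items()}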
def __init__(self, threshold, filenames, c1, c2, foldername):
    # The threshold is given as a percentage
    self.threshold = threshold / 100
    self.ngrams_files = filenames
    self.c1 = c1
    self.c2 = c2
    self.foldername = "wordsdistribution/" + foldername
    exh.create_directory(self.foldername)

    # N-grams and their weights, per class
    self.c1_grams = []
    self.c1_weight_grams = dict()
    self.c2_grams = []
    self.c2_weight_grams = dict()

    # Global weights of the n-grams, indexed by class
    self.global_weights = dict()
    self.global_weights[self.c1] = dict()
    self.global_weights[self.c2] = dict()

    self._prepare_ngrams()
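# Hypothetical usage of the constructor above. The enclosing class name
# (WordsDistribution here) and the argument values are assumptions for
# illustration; only the constructor signature comes from the code above.
dist = WordsDistribution(
    threshold=5,                       # interpreted as 5 %, stored as 0.05
    filenames=["results/1grams.csv"],  # one CSV of n-grams per size
    c1="dida",
    c2="notdida",
    foldername="run1",                 # output goes to wordsdistribution/run1
)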
def run(args):
    """Executes the main process of the script.

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user.
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    # Extension of the input file
    extension = args.FILE.split('.')[-1]

    if extension in LEGAL_EXTENSIONS:
        exh.create_directory(DIRECTORY)

        # Get publications
        print("Getting publications")
        documents_l = read_file(args.FILE, extension)
        display.display_ok("Getting publications done")

        # Save publications
        filename = BACK_FILENAME.format(args.OUTPUT)
        exh.write_json(documents_l, filename)
        display.display_info("Publications saved in {0}".format(filename))

        # Insert PubTator annotations into the abstracts
        print("Inserting PubTator annotations in abstracts")
        docs = pbmdh.extract_features(documents_l)
        display.display_ok("Inserting PubTator annotations in abstracts done")

        # Extract n-grams
        print("Extracting n-grams")
        ngh.extract_ngrams(docs, CONFIG['NGRAMS'])
        display.display_ok("Extracting n-grams done")

        # Save publications and their n-grams
        filename = NGRAMS_FILENAME.format(args.OUTPUT)
        exh.write_json(docs, filename)
        display.display_info("Publications and n-grams saved in {0}".format(filename))
    else:
        # The input file does not have a valid extension
        display.display_fail("Extension of input file not supported. Required: txt or json. Received: {0}".format(extension))
        sys.exit(0)
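# One plausible shape for the read_file helper used above: it is not defined
# in this file, so the txt format (one publication identifier per line) is an
# assumption made for illustration.
import json

def read_publications(path, extension):
    """Reads publications from a .json dump or a .txt list of identifiers."""
    if extension == "json":
        with open(path) as f:
            return json.load(f)
    # txt: assume one publication identifier per line
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]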
def run(args):
    """Executes the main process of the script.

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user.
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))
    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']
    for i in range(1, n + 1):
        # Work on copies so each n-gram size starts from the raw documents
        extract_ngrams(i, deepcopy(dida_data), deepcopy(notdida_data))
def save_clusters(clusters):
    """Saves the clusters and the word statistics to disk."""
    directory = DIRECTORY + '/' + CONFIG['ALL_CLUSTERS_DIRECTORY']
    exh.create_directory(directory)

    filename = directory + "/ndw.json"
    exh.write_json(Ndw, filename)

    filename = directory + "/W.json"
    exh.write_json(W, filename)

    # One file per cluster
    cluster_directory = directory + "/clusters"
    exh.create_directory(cluster_directory)
    for i, c in clusters.items():
        filename = cluster_directory + "/{0}.json".format(i)
        exh.write_json(c, filename)

    display.display_info("Data clusters saved into " + directory)
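# Reading the saved clusters back is symmetric. A small sketch, assuming the
# directory layout produced by save_clusters above; load_clusters itself is
# not part of the project's code.
import json
import os

def load_clusters(directory):
    """Loads every cluster file written by save_clusters."""
    cluster_directory = os.path.join(directory, "clusters")
    clusters = {}
    for name in os.listdir(cluster_directory):
        key = os.path.splitext(name)[0]
        with open(os.path.join(cluster_directory, name)) as f:
            clusters[key] = json.load(f)
    return clusters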
def run(args):
    """Executes the main process of the script.

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user.
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))
    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    # Not-DIDA publications first, then DIDA
    docs = [deepcopy(notdida_data), deepcopy(dida_data)]
    display.display_ok("Loading publications done")

    print("Starting extraction of words information")
    extract_words_information(docs)
    display.display_ok("Extraction of words information done")

    print("Computing joint probability distribution")
    joint_probability_distribution()
    display.display_ok("Computing joint probability distribution done")

    print("Starting IB method")
    all_clusters = ib.cluster(deepcopy(Pcw), deepcopy(Pw))
    display.display_ok("IB method finished")

    save_clusters(all_clusters)
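# The joint distribution fed to the IB step can be built from word counts.
# A minimal sketch, assuming counts maps (class index, word) -> count and
# vocabulary maps word -> column index; the real joint_probability_distribution
# operates on module-level state (Pcw, Pw) not shown here.
import numpy as np

def joint_distribution(counts, n_classes, vocabulary):
    """Builds P(c, w) by normalizing co-occurrence counts to sum to 1."""
    pcw = np.zeros((n_classes, len(vocabulary)))
    for (c, w), n in counts.items():
        pcw[c, vocabulary[w]] = n
    pcw /= pcw.sum()         # joint distribution sums to 1
    pw = pcw.sum(axis=0)     # marginal P(w)
    return pcw, pw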
def run(args):
    """Executes the main process of the script.

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user.
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))
    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']
    csv_files = csv_filenames(n)

    # Real labels of each publication (1 = DIDA, 0 = Not-DIDA)
    y_true = np.append(np.ones(len(dida_data)), np.zeros(len(notdida_data)))
    data = deepcopy(dida_data)
    data.extend(deepcopy(notdida_data))

    scores = []
    classifiers_names = []

    print("Strict Classifier training")
    results = train(StrictClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'strict_', "threshold", "Threshold", DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "strict", "threshold", LOG_FILENAME.format("strict"))
    classifiers_names.append("Strict Classifier")
    display.display_ok("Strict Classifier training done")

    print("Split Weighted Classifier training")
    results = train(SplitWeightedClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'splitweighted_', "threshold", "Threshold", DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "splitweighted", "threshold",
                    LOG_FILENAME.format("splitweighted"))
    classifiers_names.append("Split Weighted Classifier")
    display.display_ok("Split Weighted Classifier training done")

    print("Weighted Classifier training")
    results = train(WeightedClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'weighted_', "threshold", "Threshold", DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "weighted", "threshold", LOG_FILENAME.format("weighted"))
    classifiers_names.append("Weighted Classifier")
    display.display_ok("Weighted Classifier training done")

    plt.plot_lines(results['threshold'], scores, classifiers_names,
                   FSCORE_FILENAME, "Threshold", "F1-score")
    display.display_info("Results saved in {0}".format(DIRECTORY))
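# The train helper is not shown in this file. One plausible shape, assumed
# purely for illustration: sweep a grid of thresholds, classify every
# publication, and record the F1-score per threshold. The classifier
# constructor signature below is an assumption.
import numpy as np
from sklearn.metrics import f1_score

def sweep_thresholds(classifier_cls, data, csv_files, y_true):
    """Evaluates a classifier over a grid of thresholds (sketch)."""
    thresholds = np.linspace(0, 1, 101)
    scores = []
    for t in thresholds:
        clf = classifier_cls(t, csv_files)          # assumed constructor
        y_pred = [clf.predict(doc) for doc in data]  # assumed predict()
        scores.append(f1_score(y_true, y_pred))
    return {'threshold': thresholds, 'score': scores}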