def run(args): """Executes the main process of the script Parameters ---------- args : ArgumentParser The arguments of the command typed by the user """ global CONFIG CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG)) exh.create_directory(DIRECTORY) print("Loading publications") # Load DIDA publications dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS'])) # Load Not-DIDA publications notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS'])) display.display_ok("Loading publications done") n = CONFIG['NGRAMS'] subsets_dida = [] subsets_notdida = [] covers = [] for i in range(1, n+1): print("Starting analysis for {0}-grams".format(i)) # Process on DIDA class subset, set_cover = process_ngrams(i, dida_data, "dida") subsets_dida.extend(subset) covers.extend(set_cover) # Process on Not-DIDA class subset, set_cover = process_ngrams(i, notdida_data, "notdida") subsets_notdida.extend(subset) covers.extend(set_cover) display.display_ok("Analysis for {0}-grams done".format(i)) print("Searching set cover with all grams for DIDA") set_cover = get_set_cover(subsets_dida) scores = check_score(set_cover, subsets_dida, subsets_dida, dida_data) exh.write_text(scores, SCORE_FILENAME.format("dida", "all")) display.display_ok("Done") print("Searching set cover with all grams for NotDIDA") set_cover = get_set_cover(subsets_notdida) scores = check_score(set_cover, subsets_notdida, subsets_notdida, notdida_data) exh.write_text(scores, SCORE_FILENAME.format("notdida", "all")) display.display_ok("Done") save_topwords(covers) display.display_info("All results were saved in {0} directory".format(DIRECTORY))
def run(args): """Executes the main process of the script Parameters ---------- args : ArgumentParser The arguments of the command typed by the user """ global CONFIG CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG)) exh.create_directory(DIRECTORY) print("Loading publications") # Load DIDA publications dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS'])) # Load Not-DIDA publications notdida_data = exh.load_json( FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS'])) display.display_ok("Loading publications done") n = CONFIG['NGRAMS'] for i in range(1, n + 1): print("Starting analysis for {0}-grams".format(i)) print("Couting occurrences for DIDA") dida_occurrences = ngh.count_occurrences(i, dida_data) dida_normalized = ngh.normalize_occurrences(dida_occurrences, len(dida_data)) display.display_ok("Counting occurrences for DIDA done") print("Couting occurrences for NotDIDA") notdida_occurrences = ngh.count_occurrences(i, notdida_data) notdida_normalized = ngh.normalize_occurrences(notdida_occurrences, len(notdida_data)) display.display_ok("Counting occurrences for NotDIDA done") # Merge n-grams in the same list merged = merge_ngrams(dida_normalized, notdida_normalized) # Order n-grams by difference merged = ordered(merged, score) # Save results save_to_file(merged, i) display.display_ok("Analysis for {0}-grams done".format(i))
def read_file(filename, extension):
    """Returns publications based on a text file containing PMIDs or a JSON
    file containing publications

    Parameters
    ----------
    filename : str
        The name of the file to read
    extension : str
        The extension of the file

    Returns
    -------
    list
        a list of publications in JSON format
    """
    if extension == "txt":
        print("Received a text file - Reading PMIDs list")
        # Read each PMID in the file
        with open(filename) as f:
            pmids = [line.strip() for line in f]
        # Download and return the publications
        print("Downloading publications")
        return pbmdh.download_publications(pmids)
    elif extension == "json":
        print("Received a JSON file - Getting publications")
        return exh.load_json(filename)
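# Example usage (hypothetical file names): both branches return the same
# list-of-publications structure, so callers can treat the result uniformly.
# docs = read_file("pmids.txt", "txt")        # downloads the publications
# docs = read_file("documents.json", "json")  # loads previously saved ones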
def run(args, dida_pmids):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed command-line arguments
    dida_pmids : str
        The name of the file containing the PMIDs in DIDA
    """
    global CONFIG
    # Load configuration
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    # Get DIDA PMIDs
    known_pmids = get_dida_pmids(dida_pmids)
    # Get Not-DIDA PMIDs
    pmids = get_pmids_by_dates()
    notdida_pmids = filter(pmids, known_pmids)

    display.display_info("Total PMIDs between {0} and {1} : {2}".format(
        CONFIG['START_YEAR'], CONFIG['SPLIT_YEAR'], len(pmids)))
    display.display_info("Total PMIDs in DIDA : {0}".format(len(known_pmids)))
    display.display_info("Total PMIDs in Not-DIDA : {0}".format(len(notdida_pmids)))

    # Download Not-DIDA publications
    download_doc(notdida_pmids)
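# NOTE: filter above shadows Python's builtin and is defined elsewhere in this
# script. Given the counts displayed, it presumably keeps the PMIDs that are
# not already in DIDA; a minimal sketch:
def filter_sketch(pmids, known_pmids):
    """Returns the PMIDs of pmids that do not appear in known_pmids."""
    known = set(known_pmids)
    return [pmid for pmid in pmids if pmid not in known]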
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed command-line arguments
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))
    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']
    for i in range(1, n + 1):
        # Deep copies keep the loaded publications intact between iterations,
        # in case the extraction mutates the documents it receives
        extract_ngrams(i, deepcopy(dida_data), deepcopy(notdida_data))
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed command-line arguments
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))
    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    # docs = [deepcopy(dida_data), deepcopy(notdida_data)]
    docs = [deepcopy(notdida_data), deepcopy(dida_data)]
    display.display_ok("Loading publications done")

    print("Starting extraction of words information")
    extract_words_information(docs)
    display.display_ok("Extraction of words information done")

    print("Computing joint probability distribution")
    joint_probability_distribution()
    display.display_ok("Computing joint probability distribution done")

    print("Starting IB method")
    all_clusters = ib.cluster(deepcopy(Pcw), deepcopy(Pw))
    display.display_ok("IB method finished")

    save_clusters(all_clusters)
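# NOTE: a sketch of what joint_probability_distribution might compute, based
# on the globals Pcw and Pw consumed by ib.cluster above: P(c,w), the joint
# class-word distribution, and P(w), the word marginal. The count-matrix input
# and the names are assumptions; the real implementation may differ.
import numpy as np

def joint_probability_distribution_sketch(counts):
    """Builds P(c,w) and P(w) from an (n_classes, n_words) count matrix."""
    counts = np.asarray(counts, dtype=float)
    Pcw = counts / counts.sum()  # joint distribution over (class, word) pairs
    Pw = Pcw.sum(axis=0)         # marginal distribution over words
    return Pcw, Pw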
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed command-line arguments
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    # Extension of the input file
    extension = args.FILE.split('.')[-1]

    if extension in LEGAL_EXTENSIONS:
        exh.create_directory(DIRECTORY)

        # Get publications
        print("Getting publications")
        documents_l = read_file(args.FILE, extension)
        display.display_ok("Getting publications done")

        # Save publications
        filename = BACK_FILENAME.format(args.OUTPUT)
        exh.write_json(documents_l, filename)
        display.display_info("Publications saved in {0}".format(filename))

        # Insert PubTator annotations in the abstracts
        print("Inserting PubTator annotations in abstracts")
        docs = pbmdh.extract_features(documents_l)
        display.display_ok("Inserting PubTator annotations in abstracts done")

        # Extract n-grams
        print("Extracting n-grams")
        ngh.extract_ngrams(docs, CONFIG['NGRAMS'])
        display.display_ok("Extracting n-grams done")

        # Save publications and their n-grams
        filename = NGRAMS_FILENAME.format(args.OUTPUT)
        exh.write_json(docs, filename)
        display.display_info("Publications and n-grams saved in {0}".format(filename))
    else:
        # The input file does not have a valid extension
        display.display_fail("Extension of input file not supported. "
                             "Required : txt or json. Received : {0}".format(extension))
        sys.exit(1)
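# A hypothetical invocation, assuming the script's CLI exposes the FILE,
# OUTPUT and CONFIG arguments used above (the parser definition is not shown):
#   python <script>.py publications.txt training myconfig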
* download_publications - downloads publications based on a PMIDs list
* extract_features - inserts PubTator annotations inside the publications abstracts
* get_pmids - gets a PMIDs list based on a particular query
"""

import json
import urllib.request as req
import xml.etree.ElementTree as ET
from copy import deepcopy
from string import punctuation

from nltk.stem import PorterStemmer

import explorer_helper as exh

STOPWORDS = exh.load_json("config/stopwords.json")['stopwords']
URL_DOWNLOAD = "https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/BioConcept/{0}/JSON/"
URL_PMIDS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={0}&retmax={1}"


def clean_text(text, stopwords=STOPWORDS):
    """Lowercases and stems publications abstracts

    Parameters
    ----------
    text : str
        The abstract to lowercase and stem
    stopwords : list
        A list of stopwords

    Returns
    -------
    str
        the cleaned abstract
    """
    # Assumed implementation: lowercase, strip punctuation, drop stopwords,
    # then stem the remaining tokens with PorterStemmer
    stemmer = PorterStemmer()
    text = ''.join(c for c in text.lower() if c not in punctuation)
    return ' '.join(stemmer.stem(w) for w in text.split() if w not in stopwords)
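# NOTE: get_pmids is listed in the module docstring but not shown here. A
# minimal sketch of how it might use URL_PMIDS, assuming the standard esearch
# XML response layout:
def get_pmids_sketch(query, retmax=100000):
    """Queries PubMed esearch and returns the list of matching PMIDs."""
    url = URL_PMIDS.format(req.quote(query), retmax)
    with req.urlopen(url) as response:
        tree = ET.fromstring(response.read())
    return [elem.text for elem in tree.findall(".//IdList/Id")]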
def classification(docs, Ndw, W, directory, true_predictions):
    """Evaluates the cluster-based classifiers for a range of cluster counts

    Parameters
    ----------
    docs : list
        The publications to classify
    Ndw : dict
        The word-occurrence counts per document (loaded from ndw.json)
    W : list
        The vocabulary (loaded from W.json)
    directory : str
        The directory containing the saved clusters
    true_predictions : list
        The real label of each publication

    Returns
    -------
    tuple
        the results of the strict converter and of the doublon converter
    """
    strict_result = {
        "n_clusters": [],
        "tn": [], "fp": [], "fn": [], "tp": [],
        "score": []
    }
    doublon_result = {
        "n_clusters": [],
        "tn": [], "fp": [], "fn": [], "tp": [],
        "score": []
    }

    print("Documents replacement")
    converted_docs = converter.init(deepcopy(docs), deepcopy(W))
    display.display_ok("Documents replacement done")

    clusters_directory = directory + "/clusters"
    max_clusters = len(W)

    print("Evaluating classifier")
    # Evaluate for 1 to 10 clusters, then every 100 clusters up to 8400,
    # and finally for 8417 clusters
    cluster_counts = list(range(1, 11))
    cluster_counts.extend(range(100, 8500, 100))
    cluster_counts.append(8417)

    for n_clusters in cluster_counts:
        print("Processing for {0} clusters (Total : {1})".format(n_clusters, max_clusters))

        # Load clusters
        clusters = exh.load_json(clusters_directory + "/{0}.json".format(n_clusters))

        # Prepare classifier
        classifier = NaiveBayesCluster(deepcopy(clusters), deepcopy(Ndw), deepcopy(W))
        print("Classifier ready")

        print("Converting documents")
        strict_converted_docs = converter.convert_all(deepcopy(converted_docs),
                                                      deepcopy(clusters))
        doublon_converted_docs = converter.convert_all(deepcopy(converted_docs),
                                                       deepcopy(clusters), method='d')
        print("Converting documents done")

        print("Evaluate Strict Predictions")
        strict_predictions = classifier.evaluate(strict_converted_docs)
        print("Evaluate Doublon Predictions")
        doublon_predictions = classifier.evaluate(doublon_converted_docs)
        print("Predictions done")

        print("Computing scores")
        strict_score = classifier.score(true_predictions, strict_predictions)
        doublon_score = classifier.score(true_predictions, doublon_predictions)
        print("Scores computed : ({0}, {1})".format(strict_score, doublon_score))

        add_result(n_clusters, strict_score, strict_result)
        add_result(n_clusters, doublon_score, doublon_result)

    display.display_ok("Evaluating classifier done")
    return strict_result, doublon_result
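# NOTE: add_result is defined elsewhere; as called above it must append one
# evaluation point to a result dict. A minimal sketch, assuming (this is a
# guess) that a score is a (tn, fp, fn, tp, f1) tuple:
def add_result_sketch(n_clusters, score, result):
    """Appends one (n_clusters, confusion matrix, F1-score) row."""
    tn, fp, fn, tp, f1 = score  # assumed score shape
    result["n_clusters"].append(n_clusters)
    result["tn"].append(tn)
    result["fp"].append(fp)
    result["fn"].append(fn)
    result["tp"].append(tp)
    result["score"].append(f1)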
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed command-line arguments
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    # The class order in docs must match the label order in y_true below
    # docs = [deepcopy(notdida_data), deepcopy(dida_data)]
    docs = [deepcopy(dida_data), deepcopy(notdida_data)]
    display.display_ok("Loading publications done")

    data_directory = DIRECTORY + '/' + CONFIG['ALL_CLUSTERS_DIRECTORY']
    Ndw = exh.load_json(data_directory + "/ndw.json")
    W = exh.load_json(data_directory + "/W.json")

    # Real labels of each publication (1 for DIDA, 0 for Not-DIDA)
    # y_true = np.append(np.zeros(len(notdida_data)), np.ones(len(dida_data)))
    y_true = np.append(np.ones(len(dida_data)), np.zeros(len(notdida_data)))

    strict_result, doublon_result = classification(docs, Ndw, W, data_directory, y_true)

    plt.plot_confusion_matrix(strict_result, len(dida_data), len(notdida_data),
                              "strict_", "n_clusters", "Number of clusters",
                              DIRECTORY, step=1000)
    exh.save_to_log(strict_result, "strict", "n_clusters", LOG_FILENAME.format("strict"))

    plt.plot_confusion_matrix(doublon_result, len(dida_data), len(notdida_data),
                              "doublon_", "n_clusters", "Number of clusters",
                              DIRECTORY, step=1000)
    exh.save_to_log(doublon_result, "doublon", "n_clusters", LOG_FILENAME.format("doublon"))

    scores = [strict_result['score'], doublon_result['score']]
    classifiers_names = ["Strict converter", "Doublon converter"]
    plt.plot_lines(strict_result['n_clusters'], scores, classifiers_names,
                   FSCORE_FILENAME, "Number of clusters", "F1-score", step=1000)
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed command-line arguments
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))
    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']
    csv_files = csv_filenames(n)

    # Real labels of each publication (1 for DIDA, 0 for Not-DIDA)
    y_true = np.append(np.ones(len(dida_data)), np.zeros(len(notdida_data)))

    data = deepcopy(dida_data)
    data.extend(deepcopy(notdida_data))

    scores = []
    classifiers_names = []

    print("Strict Classifier training")
    results = train(StrictClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'strict_', "threshold", "Threshold", DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "strict", "threshold", LOG_FILENAME.format("strict"))
    classifiers_names.append("Strict Classifier")
    display.display_ok("Strict Classifier training done")

    print("Split Weighted Classifier training")
    results = train(SplitWeightedClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'splitweighted_', "threshold", "Threshold", DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "splitweighted", "threshold",
                    LOG_FILENAME.format("splitweighted"))
    classifiers_names.append("Split Weighted Classifier")
    display.display_ok("Split Weighted Classifier training done")

    print("Weighted Classifier training")
    results = train(WeightedClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'weighted_', "threshold", "Threshold", DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "weighted", "threshold", LOG_FILENAME.format("weighted"))
    classifiers_names.append("Weighted Classifier")
    display.display_ok("Weighted Classifier training done")

    plt.plot_lines(results['threshold'], scores, classifiers_names,
                   FSCORE_FILENAME, "Threshold", "F1-score")
    display.display_info("Results saved in {0}".format(DIRECTORY))
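# NOTE: csv_filenames is defined elsewhere in this script. A plausible sketch,
# where CSV_TEMPLATE is a hypothetical name for the per-size ranking file
# written by the n-gram analysis step:
def csv_filenames_sketch(n):
    """Returns the CSV file names of the ranked 1-grams up to n-grams."""
    return [CSV_TEMPLATE.format(i) for i in range(1, n + 1)]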