Exemplo n.º 1
0
def run(args):
    """Executes the main process of the script

    Runs the n-gram set-cover analysis on both the DIDA and Not-DIDA
    corpora for every configured n-gram size, then computes a global set
    cover per class and saves the scores and top words.

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load the publications of both classes
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    max_n = CONFIG['NGRAMS']

    subsets_dida = []
    subsets_notdida = []
    covers = []

    for gram_size in range(1, max_n + 1):
        print("Starting analysis for {0}-grams".format(gram_size))

        # Run the per-size analysis on each class in turn
        for corpus, subsets, label in (
                (dida_data, subsets_dida, "dida"),
                (notdida_data, subsets_notdida, "notdida")):
            subset, set_cover = process_ngrams(gram_size, corpus, label)
            subsets.extend(subset)
            covers.extend(set_cover)

        display.display_ok("Analysis for {0}-grams done".format(gram_size))

    # Global set cover over all gram sizes, once per class
    for label, pretty, subsets, corpus in (
            ("dida", "DIDA", subsets_dida, dida_data),
            ("notdida", "NotDIDA", subsets_notdida, notdida_data)):
        print("Searching set cover with all grams for {0}".format(pretty))
        set_cover = get_set_cover(subsets)
        scores = check_score(set_cover, subsets, subsets, corpus)
        exh.write_text(scores, SCORE_FILENAME.format(label, "all"))
        display.display_ok("Done")

    save_topwords(covers)
    display.display_info("All results were saved in {0} directory".format(DIRECTORY))
Exemplo n.º 2
0
def run(args):
    """Executes the main process of the script

    For every n-gram size up to CONFIG['NGRAMS'], counts n-gram
    occurrences in the DIDA and Not-DIDA corpora, normalizes the counts
    by corpus size, merges and orders the two distributions, and saves
    the result for each size.

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']

    for i in range(1, n + 1):
        print("Starting analysis for {0}-grams".format(i))

        # Bug fix: user-facing messages said "Couting" — corrected to
        # "Counting" in all four progress strings below.
        print("Counting occurrences for DIDA")
        dida_occurrences = ngh.count_occurrences(i, dida_data)
        # Normalize by the number of DIDA publications
        dida_normalized = ngh.normalize_occurrences(dida_occurrences,
                                                    len(dida_data))
        display.display_ok("Counting occurrences for DIDA done")

        print("Counting occurrences for NotDIDA")
        notdida_occurrences = ngh.count_occurrences(i, notdida_data)
        notdida_normalized = ngh.normalize_occurrences(notdida_occurrences,
                                                       len(notdida_data))
        display.display_ok("Counting occurrences for NotDIDA done")

        # Merge n-grams in the same list
        merged = merge_ngrams(dida_normalized, notdida_normalized)

        # Order n-grams by difference
        merged = ordered(merged, score)

        # Save results
        save_to_file(merged, i)

        display.display_ok("Analysis for {0}-grams done".format(i))
Exemplo n.º 3
0
    def __init__(self, threshold, filenames, c1, c2, foldername):
        """Initializes the word-distribution state for two classes.

        Parameters
        ----------
        threshold : number
            Threshold given as a percentage; stored as a fraction.
        filenames : list
            Filenames of the n-gram files to process.
        c1, c2 : str
            Identifiers of the two classes being compared.
        foldername : str
            Output folder name, created under ``wordsdistribution/``.
        """
        # Threshold arrives as a percentage; keep it as a fraction.
        self.threshold = threshold / 100
        self.ngrams_files = filenames
        self.c1 = c1
        self.c2 = c2
        # All output files live under the wordsdistribution/ directory.
        self.foldername = "wordsdistribution/" + foldername
        exh.create_directory(self.foldername)

        # Per-class n-gram lists and their weight mappings.
        self.c1_grams, self.c1_weight_grams = [], dict()
        self.c2_grams, self.c2_weight_grams = [], dict()
        # One global weight mapping per class identifier.
        self.global_weights = {self.c1: dict(), self.c2: dict()}

        self._prepare_ngrams()
def run(args):
    """Executes the main process of the script

    Reads the publications from the input file, saves them, inserts
    PubTator annotations into the abstracts, extracts n-grams, and saves
    the annotated publications with their n-grams. Exits with a failure
    status when the input file extension is not supported.

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    # Extension of the input file
    extension = args.FILE.split('.')[-1]

    # Guard clause: reject unsupported extensions up front instead of
    # indenting the whole happy path under an if/else.
    if extension not in LEGAL_EXTENSIONS:
        # The input file has not a valid extension
        display.display_fail("Extension of input file not supported. Required : txt or json. Received : {0}".format(extension))
        # Bug fix: was sys.exit(0), which reported success to the shell
        # even though the script failed; exit non-zero on error.
        sys.exit(1)

    exh.create_directory(DIRECTORY)

    # Get publications
    print("Getting publications")
    documents_l = read_file(args.FILE, extension)
    display.display_ok("Getting publications done")

    # Save publications
    filename = BACK_FILENAME.format(args.OUTPUT)
    exh.write_json(documents_l, filename)
    display.display_info("Publications saved in {0}".format(filename))

    # Insert PubTator annotations in the abstracts
    print("Inserting PubTator annotations in abstracts")
    docs = pbmdh.extract_features(documents_l)
    display.display_ok("Inserting PubTator annotations in abstracts done")

    # Extract n-grams
    print("Extracting n-grams")
    ngh.extract_ngrams(docs, CONFIG['NGRAMS'])
    display.display_ok("Extracting n-grams done")

    # Save publications and their n-grams
    filename = NGRAMS_FILENAME.format(args.OUTPUT)
    exh.write_json(docs, filename)
    display.display_info("Publications and n-grams saved in {0}".format(filename))
def run(args):
    """Executes the main process of the script

    Loads the DIDA and Not-DIDA publications and runs the n-gram
    extraction once per configured n-gram size.

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load the publications of both classes
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    # Deep copies keep the loaded corpora untouched between iterations.
    for size in range(1, CONFIG['NGRAMS'] + 1):
        extract_ngrams(size, deepcopy(dida_data), deepcopy(notdida_data))
def save_clusters(clusters):
    """Saves the clustering results to JSON files.

    Writes the word/document data (Ndw), the word list (W), and one JSON
    file per cluster under the configured clusters directory.

    Parameters
    ----------
    clusters : dict
        Mapping from cluster identifier to cluster content.
    """
    # Cleanup: the original built a `data` dict (clusters/Ndw/W) that was
    # never written anywhere, and kept a commented-out filename line;
    # both removed as dead code.
    directory = DIRECTORY + '/' + CONFIG['ALL_CLUSTERS_DIRECTORY']
    exh.create_directory(directory)

    # Global data shared by all clusters
    exh.write_json(Ndw, directory + "/ndw.json")
    exh.write_json(W, directory + "/W.json")

    # One file per cluster
    cluster_directory = directory + "/clusters"
    exh.create_directory(cluster_directory)

    for i, c in clusters.items():
        exh.write_json(c, cluster_directory + "/{0}.json".format(i))

    display.display_info("Data clusters saved into " + directory)
def run(args):
    """Executes the main process of the script

    Loads both corpora, extracts word information, builds the joint
    probability distribution, and runs the Information Bottleneck (IB)
    clustering, saving the resulting clusters.

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))

    # NOTE(review): Not-DIDA is deliberately listed FIRST here — the
    # DIDA-first ordering below was commented out. The class order
    # presumably matters to extract_words_information; confirm intent
    # before changing.
    # docs = [deepcopy(dida_data), deepcopy(notdida_data)]
    docs = [deepcopy(notdida_data), deepcopy(dida_data)]
    display.display_ok("Loading publications done")

    print("Starting extraction of words information")
    extract_words_information(docs)
    display.display_ok("Extraction of words information done")

    print("Computing joint probability distribution")
    joint_probability_distribution()
    display.display_ok("Computing joint probability distribution done")

    print("Starting IB method")
    # Pcw and Pw are module-level globals, presumably populated by
    # joint_probability_distribution() — deep-copied so ib.cluster cannot
    # mutate them.
    all_clusters = ib.cluster(deepcopy(Pcw), deepcopy(Pw))
    display.display_ok("IB method finished")

    save_clusters(all_clusters)
Exemplo n.º 8
0
def run(args):
    """Executes the main process of the script

    Trains three threshold-based classifiers (strict, split weighted,
    weighted) on the combined DIDA / Not-DIDA data, plots a confusion
    matrix and logs results for each, then plots the F1-score of all
    classifiers against the threshold.

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']

    csv_files = csv_filenames(n)

    # Real labels of each publication: 1 for DIDA, 0 for Not-DIDA
    y_true = np.append(np.ones(len(dida_data)), np.zeros(len(notdida_data)))

    data = deepcopy(dida_data)
    data.extend(deepcopy(notdida_data))

    scores = []
    classifiers_names = []

    # (classifier class, file/log key, human-readable name).
    # Deduplication: the original repeated the same train/plot/log/append
    # section three times; a data-driven loop emits identical messages
    # and files.
    classifiers = (
        (StrictClassifier, "strict", "Strict Classifier"),
        (SplitWeightedClassifier, "splitweighted", "Split Weighted Classifier"),
        (WeightedClassifier, "weighted", "Weighted Classifier"),
    )

    results = None
    for classifier, key, name in classifiers:
        print("{0} training".format(name))
        results = train(classifier, deepcopy(data), csv_files, y_true)
        plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                                  key + '_', "threshold", "Threshold",
                                  DIRECTORY)
        scores.append(results['score'])
        exh.save_to_log(results, key, "threshold", LOG_FILENAME.format(key))
        classifiers_names.append(name)
        display.display_ok("{0} training done".format(name))

    # As in the original, the thresholds of the LAST trained classifier
    # provide the x-axis — presumably all classifiers share the same
    # threshold grid.
    plt.plot_lines(results['threshold'], scores, classifiers_names,
                   FSCORE_FILENAME, "Threshold", "F1-score")
    display.display_info("Results saved in {0}".format(DIRECTORY))