Example #1
File: main.py Project: dinhinfotech/DiGI
def main(args):
    cur_dir = os.getcwd()
    adjacency_folder = os.path.join(cur_dir, args.adj_folder)
    training_genes_file = os.path.join(cur_dir, args.train_genes_file)
    training_labels_file = os.path.join(cur_dir, args.train_labels_file)
    all_genes_file = os.path.join(cur_dir, args.all_genes_file)
    list_D = args.list_D
    list_C = args.list_C
    list_d = args.list_d
    list_r = args.list_r

    training_genes = util.load_list_from_file(training_genes_file)
    training_labels = [int(l) for l in util.load_list_from_file(training_labels_file)]
    all_genes = util.load_list_from_file(all_genes_file)

    # Creating list of graphs
    print("Unifying graphs...")
    if args.use_vec:
        graphs = gu.create_graphs(adjacency_folder_path=adjacency_folder, list_attr_path=args.node_vecs_file)
    else:
        graphs = gu.create_graphs(adjacency_folder_path=adjacency_folder)

    # Computing kernel matrices
    print("Computing graph kernels...")
    kernel_matrices = []
    for D in list_D:
        for C in list_C:
            g_union = gu.union_graphs(graphs=graphs, deg_threshold=D, cli_threshold=C)
            for d in list_d:
                for r in list_r:
                    vec = graph.CDNK_Vectorizer(d=d, r=r, L=len(graphs), n_nodes=len(graphs[0].nodes()),
                                                discrete=not args.use_vec)
                    kernel_matrices.append(vec.cdnk(g=g_union))

    print("Evaluating model...")
    if args.use_lou:
        val = Validation(kernels=kernel_matrices, all_genes=all_genes, training_genes=training_genes,
                         training_labels=training_labels)
        print('============')
        print('Performances')
        auc = val.validate_leave_one_out()
        print(auc)
    else:
        val = Validation(kernels=kernel_matrices, all_genes=all_genes, training_genes=training_genes, training_labels=training_labels, n_folds=5)
        print('============')
        print('Performances')
        aucs = val.validate_kfolds()
        for auc in aucs:
            print(auc)
        print('-----------')
        print(np.mean(aucs))
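Every example on this page relies on util.load_list_from_file, whose source is not shown. As a reference, here is a minimal sketch of what such a helper could look like, assuming it returns one list entry per line of the file; the exact newline handling is an assumption (Example #2 below compares elements against both '\n' and '', so the real helper may keep trailing newlines):

# Hypothetical sketch only -- the real util.load_list_from_file is not
# shown on this page. Assumes one entry per line, trailing newline stripped.
def load_list_from_file(file_path):
    with open(file_path) as f:
        return [line.rstrip('\n') for line in f]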
Example #2
    def __init__(self, gfile, tagdict):
        self.dict = defaultdict(list)
        self.mapping = {}
        self.tagdict = tagdict
        self.feedback = defaultdict(list)
        self.processed = {}
        c = 1
        n = 0
        t = 0
        goldlist = util.load_list_from_file(gfile)
        self.feedback[c] = []

        for g in goldlist:
            if g == '\n' or g == '':
                c += 1
                t = 0
                self.feedback[c] = []
            else:
                wrd, tag = g.split("\t")
                self.feedback[c].append(None)
                self.dict[c].append({'w': wrd, 't': tag, 'a': []})
                self.mapping[n] = {}
                self.mapping[n]['sid'] = c
                self.mapping[n]['tid'] = t
                n += 1
                t += 1
Example #3
    def __init__(self):
        self.sorted_conns_list = self.get_sorted_conns_list()
        self.cpos_dict = self.get_CPOS_dict()
        self.prev_C_dict = self.get_prev_C_dict()
        self.prevPOS_dict = self.get_prevPOS_dict()
        self.prevPOS_CPOS_dict = self.get_prevPOS_CPOS_dict()
        self.C_next_dict = self.get_C_next_dict()
        self.nextPOS_dict = self.get_nextPOS_dict()
        self.CPOS_nextPOS_dict = self.get_CPOS_nextPOS()
        self.CParent_to_root_path_dict = self.get_CParent_to_root_path_dict()
        self.compressed_CParent_to_root_path_dict = self.get_compressed_CParent_to_root_path_dict()

        self.self_category_dict = self.get_self_category_dict()
        self.parent_category_dict = self.get_parent_category_dict()
        self.left_sibling_category_dict = self.get_left_sibling_category_dict()
        self.right_sibling_category_dict = self.get_right_sibling_category_dict()

        self.conn_self_category_dict = self.get_conn_self_category_dict()
        self.conn_parent_category_dict = self.get_conn_parent_category_dict()
        self.conn_left_sibling_category_dict = self.get_conn_left_sibling_category_dict()
        self.conn_right_sibling_category_dict = self.get_conn_right_sibling_category_dict()

        self.self_parent_dict = self.get_self_parent_dict()
        self.self_right_dict = self.get_self_right_dict()
        self.self_left_dict = self.get_self_left_dict()
        self.parent_left_dict = self.get_parent_left_dict()
        self.parent_right_dict = self.get_parent_right_dict()
        self.left_right_dict = self.get_left_right_dict()

        self.conn_category = self.get_conn_category_dict()
        ''' mine '''
        self.dict_conn_lower_case = util.load_dict_from_file(
            config.CONNECTIVE_DICT_CONN_LOWER_CASE)
        self.dict_conn = util.load_dict_from_file(config.CONNECTIVE_DICT_CONN)
        self.dict_prevPOS_C = util.load_dict_from_file(
            config.CONNECTIVE_DICT_PREVPOS_C)
        self.dict_self_category_to_root_path = util.load_dict_from_file(
            config.CONNECTIVE_DICT_SELF_CATEGORY_TO_ROOT_PATH)
        self.dict_CParent_to_root_path_node_names = util.load_dict_from_file(
            config.CONNECTIVE_DICT_CPARENT_TO_ROOT_PATH_NODE_NAMES)
        self.dict_conn_connCtx = util.load_dict_from_file(
            config.CONNECTIVE_DICT_CONN_CONNCTX)
        self.dict_conn_rightSiblingCtx = util.load_dict_from_file(
            config.CONNECTIVE_DICT_CONN_RIGHTSIBLINGCTX)
        self.dict_conn_leftSiblingCtx = util.load_dict_from_file(
            config.CONNECTIVE_DICT_CONN_LEFTSIBLINGCTX)
        self.dict_conn_left_right_SiblingCtx = util.load_dict_from_file(
            config.CONNECTIVE_DICT_CONN_LEFT_RIGHT_SIBLINGCTX)
        self.dict_conn_parent_category_Ctx = util.load_dict_from_file(
            config.CONNECTIVE_DICT_CONN_PARENT_CATEGORY_CTX)
        self.dict_rightSibling_production_rules = util.load_dict_from_file(
            config.CONNECTIVE_DICT_CONN_RIGHTSIBLING_PRODUCTION_RULES)

        self.word2vec_conns_list = util.load_list_from_file(
            config.WORD2VEC_CONNS_PATH)
Example #4
def getLowestCompetence(comp_file_path):
    # Return the index of the lowest competence value on the first
    # (tab-separated) line of the file, or -1 if every value exceeds 1.
    values = util.load_list_from_file(comp_file_path)
    lowest = 1
    index = -1
    for i, val in enumerate(values[0].split("\t")):
        if float(val) <= lowest:
            lowest = float(val)
            index = i
    return index
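For illustration, Example #11 below calls this helper as getLowestCompetence('./competence'). A hypothetical run with invented scores, assuming util.load_list_from_file returns the file's lines as strings:

# Illustration only: a made-up one-line competence file with one
# tab-separated score per annotator.
with open('./competence', 'w') as f:
    f.write("0.91\t0.47\t0.83\n")

print(getLowestCompetence('./competence'))  # -> 1, the index of 0.47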
Example #5
def main(args):

    adjacency_folder = args.adj_folder
    training_genes_file = args.train_genes_file
    training_labels_file = args.train_labels_file
    all_genes_file = args.all_genes_file
    list_D = args.list_D
    list_C = args.list_C
    list_d = args.list_d
    list_r = args.list_r

    training_genes = util.load_list_from_file(training_genes_file)
    training_labels = [
        int(l) for l in util.load_list_from_file(training_labels_file)
    ]
    all_genes = util.load_list_from_file(all_genes_file)

    # Creating list of graphs
    graphs = gu.create_graphs(adjacency_folder_path=adjacency_folder)

    # Computing kernel matrices
    kernel_matrices = []
    for D in list_D:
        for C in list_C:
            g_union = gu.union_graphs(graphs=graphs,
                                      deg_threshold=D,
                                      cli_threshold=C)

            for d in list_d:
                for r in list_r:
                    vec = graph.CDNK_Vectorizer(d=d,
                                                r=r,
                                                L=len(graphs),
                                                n_nodes=len(graphs[0].nodes()))
                    kernel_matrices.append(vec.cdnk(g=g_union))

    val = Validation(kernels=kernel_matrices,
                     all_genes=all_genes,
                     training_genes=training_genes,
                     training_labels=training_labels)

    auc = val.validation()

    print(auc)
Example #6
def readGold(gold, sentences):

    goldlist = []

    # read gold file
    gold = util.load_list_from_file(gold)
    for line in gold:
        if line != '\n' and line != '':
            word, tag = line.split('\t')
            # new, unknown tag? add to dictionary
            if tag not in sentences.tagdict:
                newTag = len(sentences.tagdict) + 1
                sentences.tagdict[tag] = newTag
                util.write_dict_keys_to_file(sentences.tagdict,
                                             './dict/tags.txt')
            goldlist.append(sentences.tagdict[tag])
    return goldlist
Example #7
    def get_sorted_conns_list(self):
        # print "loading sorted_conns_list ..."
        return util.load_list_from_file(config.SORTED_ExpConn_PATH)
Example #8
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn import cross_validation
import random
from sklearn import svm
import util
seed = 7
np.random.seed(seed)

#label_path = "/media/dinh/DATA/Test/graph_labels/CPDB"
label_path = "/home/dinh/fast-disk/Weisfeiler_GK/scikit-learn-graph/scripts/graph_labels/CPDB"
#kernel_folder_path = "/media/dinh/DATA/Test/kernels/cpdb_reduce/"
kernel_folder_path = "/home/dinh/fast-disk/Weisfeiler_GK/scikit-learn-graph/scripts/kernels/cpdb/"
#save_file = "/media/dinh/DATA/Test/results/cpdb_nested_3"
save_file = "/home/dinh/fast-disk/Weisfeiler_GK/scikit-learn-graph/scripts/results_cv/cpdb"

pre_labels = [int(label) for label in util.load_list_from_file(label_path)]
"""Loading kernels"""
pre_list_kernels = []
kernel_names = util.list_files_in_folder(kernel_folder_path)
for kernel_name in kernel_names:
    pre_list_kernels.append(
        np.fromfile(kernel_folder_path + kernel_name).reshape(
            len(pre_labels), len(pre_labels)))
svm_paras = [1e-0, 1e+1, 1e+2, 1e+3, 1e+4]

n_folds = 10
"""Model selection"""


def model_selection(list_kernels=None,
                    svm_paras=None,
Example #9
def evaluate(adjacency_path=None,
             node_label_folder=None,
             all_gene_path=None,
             train_gene_folder=None,
             train_label_folder=None,
             n_iters=None,
             n_hops=None,
             n_clusters=None,
             svm_paras=None,
             save_folder=None):

    all_genes = util.load_list_from_file(all_gene_path)
    number_svm_parameters = len(svm_paras)

    dict_gene_idx = {}
    for idx, gene in enumerate(all_genes):
        dict_gene_idx[gene] = idx

    graph = util.create_graph(adjacency_path=adjacency_path)

    for n_cluster in n_clusters:
        util.node_labeling(g=graph,
                           label_path=node_label_folder + str(n_cluster))
        for n_iter in n_iters:

            WLvect = WLVectorizer(r=n_iter)
            iters_features = WLvect.transform([graph])
            M = iters_features[0][0]
            for iter_id in range(1, n_iter + 1):
                M = M + iters_features[iter_id][0]
            print('Done WL computation')
            sys.stdout.flush()

            for n_hop in n_hops:
                print('Beginning DWL computation')
                sys.stdout.flush()
                G = util.deepwl(graph=graph, feature_matrix=M, n_hop=n_hop)
                print "Size of G", G.shape

                print 'Done DWL compuation'
                sys.stdout.flush()

                for disease_idx in range(12):
                    list_training_genes = util.load_list_from_file(
                        train_gene_folder + str(disease_idx))
                    list_training_labels = util.load_list_from_file(
                        train_label_folder + str(disease_idx))
                    list_training_labels = [
                        int(e) for e in list_training_labels
                    ]
                    list_qscores = [[] for i in range(number_svm_parameters)]

                    for gene_idx, gene in enumerate(list_training_genes):
                        list_training_genes_del = list_training_genes[:]
                        del list_training_genes_del[gene_idx]
                        training_genes_idx = [
                            dict_gene_idx[g] for g in list_training_genes_del
                        ]

                        list_training_labels_del = list_training_labels[:]
                        del list_training_labels_del[gene_idx]

                        unknown_genes_idx = [dict_gene_idx[gene]]
                        for idx in range(len(all_genes)):
                            if (idx not in training_genes_idx) and (
                                    idx != dict_gene_idx[gene]):
                                unknown_genes_idx.append(idx)

                        Mtr = util.extract_submatrix(training_genes_idx,
                                                     training_genes_idx, G)
                        M_unknown = util.extract_submatrix(
                            unknown_genes_idx, training_genes_idx, G)

                        for idx_svm, svm_para in enumerate(svm_paras):
                            clf = svm.SVC(C=svm_para, kernel='precomputed')
                            clf.fit(Mtr, list_training_labels_del)
                            scores = clf.decision_function(M_unknown)
                            len_scores = len(scores)
                            qscore = float(
                                sum([int(scores[0] > val)
                                     for val in scores])) / len_scores
                            list_qscores[idx_svm].append(qscore)
                    # computing auc
                    save_lines = []
                    for qscores_idx, qscores in enumerate(list_qscores):
                        fpr, tpr, thresholds = metrics.roc_curve(
                            list_training_labels, qscores, pos_label=1)
                        auc = metrics.auc(fpr, tpr)

                        line = str(n_cluster) + "_" + str(n_iter) + "_" + str(
                            n_hop) + "_" + str(qscores_idx) + ":\t" + str(
                                auc) + "\n"
                        save_lines.append(line)

                    f = open(save_folder + str(disease_idx), 'w')
                    f.writelines(save_lines)
                    f.close()
Example #10
# Loading dictionaries from files
with open('../common/data/annotated/hashtags.json', 'r') as file:
    dict_hashtags = json.load(file)

with open('../common/data/annotated/words_lemmatized.json', 'r', encoding="utf-8") as file:
    dict_words = json.load(file)

# "Like it"
# dict_words = util.load_word_classification_ilikeit('../common/data/external/06032019-POLARITY-JEUXDEMOTS-FR.csv')

with open('../common/data/annotated/emojis.json', 'r') as file:
    dict_emojis = json.load(file)


heavy_negatives_list = util.load_list_from_file('../common/data/annotated/heavy_negatives_filtered.txt')
mildly_heavy_negatives_list = util.load_list_from_file('../common/data/annotated/midly_heavy_negatives_filtered.txt')
dict_date = {}

dict_correspondances = {
    "positif": 1,
    "negatif": -1,
    "neutre": 0
}

# data = util.get_all_tweets()
data = util.get_all_unique_tweets()

tweets_polarity = {}

cpt_neg_words = 0
Example #11
def run_iteration(options, sentences, table, c):
    #####
    # Create a new annotator from the best predictions: get N tags for the
    # predictions with the highest entropy and update the new annotator
    # (feedback from the oracle).
    # either replace a random annotator in every round
    bad_guy = random.choice(range(len(table)))

    # or replace the one with the lowest competence (don't! doesn't work well...)
    if options.lowest:
        bad_guy = getLowestCompetence('./competence')
        # NOTE: predlist is only assigned at the end of this function (from
        # the 'prediction' file), so this call would fail here unless
        # predlist is supplied elsewhere (e.g. as a module-level global).
        replaceLowestCompetenceByMACEpred(bad_guy, table, predlist, c)
        # print "Replace annotator ", bad_guy

    # select table entry with highest entropy
    # returns a list with row indices for the rows
    # in the table that have the highest entropy (from file "entropy" created by MACE)
    N = 1
    indices = sentences.entropy(N)
    #### don't use: indices = sentences.entropyMajority(predlist, N)
    #indi = sentences.get_entropy_for_row(N)
    print "ENTROPY:  ", indices
    #print "MAJORITY: ", indi
    print
    for ind in indices:
        print "Option: ", options.simulation
        if options.simulation:
            print "run simulation........................."
            # either from the gold standard (AL simulation)
            tag = sentences.getOracleTag(ind)
        else:
            print "ask oracle............................."
            # or from the oracle (real AL)
            tag = sentences.getAnnotation(ind, c)

        util.print_log("TAG: " + tag.upper() + "\n")

    # now use oracle tags as feedback, combined with the predlist features from MACE
    # create control file and rerun EM
    if options.feedback:
        sentences.write_feedback_to_file(options.feedback, bad_guy)
    sentences.update_predictions(bad_guy)
    sentences.backupFiles(c)

    # read updated predictions from file (with feedback from oracle/AL simulation)
    #table = util.readPredictions(options.path)

    # convert tags to numerical representation
    #table = convert2num(table, tagdict)

    # run MACE
    if options.entropies:
        if options.feedback:
            command = "java -jar ./MACE.jar --controls feedback --entropies preds.csv"
        else:
            command = "java -jar ./MACE.jar --entropies preds.csv"
    else:
        if options.feedback:
            command = "java -jar ./MACE.jar --controls feedback preds.csv"
        else:
            command = "java -jar ./MACE.jar preds.csv"
    if options.restarts:
        newarg = "MACE.jar --restarts " + options.restarts
        command = command.replace("MACE.jar", newarg)
    if options.vanilla:
        newarg = "MACE.jar --em "
        command = command.replace("MACE.jar", newarg)

    os.system(command)

    # read MACE predictions
    predlist = util.load_list_from_file('prediction')

    # if we have a gold file, evaluate best predictions against gold
    if options.goldstandard != False:
        evalTagger(options.goldstandard, sentences)
        evalPredictions(options.goldstandard, predlist, sentences)
Example #12
def run_cmd_interface():
    # parse command line options
    optparser = OptionParser()
    optparser.add_option("-p",
                         "--predfile",
                         dest="path",
                         default="../data/output",
                         help="path to dir with predictions",
                         metavar="DIR")
    optparser.add_option("-g",
                         "--gold",
                         dest="goldstandard",
                         default=False,
                         help="read gold standard annotations (optional)")
    optparser.add_option(
        "-f",
        "--feedback",
        dest="feedback",
        default=False,
        help="read feedback from oracle annotations (optional)")
    optparser.add_option(
        "-e",
        "--entropies",
        dest="entropies",
        default=False,
        help="print entropies for each instance to file (MACE)")
    optparser.add_option(
        "-m",
        "--majority",
        dest="majority",
        default=False,
        help="instead of posterior entropies, use majority vote")
    optparser.add_option("-r",
                         "--restarts",
                         dest="restarts",
                         default=False,
                         help="number of random model initialisations")
    optparser.add_option("-s",
                         "--simulation",
                         dest="simulation",
                         default=False,
                         help="run program as AL simulation")
    optparser.add_option("-v",
                         "--vanilla",
                         dest="vanilla",
                         default=False,
                         help="run program with vanilla EM")
    optparser.add_option("-l",
                         "--lowest",
                         dest="lowest",
                         default=False,
                         help="replace annotator with the lowest competence")

    (options, args) = optparser.parse_args()

    tagdict = util.load_dict_from_file('./dict/tags.txt')
    mapping = util.load_mapping_from_file('./dict/mapping.txt')

    # read gold standard (or, if no gold annotations exist, list with word tokens)
    sentences = Sentences(options.goldstandard, tagdict)

    # read predictions from file
    table = util.readPredictions(options.path)

    # add predictions
    sentences.addPredictions(table)

    # convert tags to numerical representation
    table = convert2num(table, sentences)

    # print table content to csv file
    sentences.printCSV()

    # and update table with existing predictions from experiment (preds.csv)

    # run MACE
    if options.entropies:
        command = "java -jar ./MACE.jar --entropies preds.csv"
    else:
        command = "java -jar ./MACE.jar preds.csv"
    if options.restarts:
        newarg = "MACE.jar --restarts " + options.restarts
        command = command.replace("MACE.jar", newarg)
    if options.vanilla:
        newarg = "MACE.jar --em "
        command = command.replace("MACE.jar", newarg)

    os.system(command)

    # read MACE predictions
    predlist = util.load_list_from_file('prediction')

    # if we have a gold file, evaluate best predictions against gold
    if options.goldstandard != False:
        evalTagger(options.goldstandard, sentences)
        evalPredictions(options.goldstandard, predlist, sentences)

    # FIXME: the max range should also be a parameter (right now the number of iterations is hard-coded)
    # Also, it should be possible to end/continue the annotation at any point in time...
    for c in range(0, 500):
        run_iteration(options, sentences, table, c)