def main(args):
    cur_dir = os.getcwd()
    adjacency_folder = os.path.join(cur_dir, args.adj_folder)
    training_genes_file = os.path.join(cur_dir, args.train_genes_file)
    training_labels_file = os.path.join(cur_dir, args.train_labels_file)
    all_genes_file = os.path.join(cur_dir, args.all_genes_file)
    list_D = args.list_D
    list_C = args.list_C
    list_d = args.list_d
    list_r = args.list_r

    training_genes = util.load_list_from_file(training_genes_file)
    training_labels = [int(l) for l in util.load_list_from_file(training_labels_file)]
    all_genes = util.load_list_from_file(all_genes_file)

    # Creating list of graphs
    print("Unifying graphs...")
    if args.use_vec:
        graphs = gu.create_graphs(adjacency_folder_path=adjacency_folder,
                                  list_attr_path=args.node_vecs_file)
    else:
        graphs = gu.create_graphs(adjacency_folder_path=adjacency_folder)

    # Computing kernel matrices
    print("Computing graph kernels...")
    kernel_matrices = []
    for D in list_D:
        for C in list_C:
            g_union = gu.union_graphs(graphs=graphs, deg_threshold=D,
                                      cli_threshold=C)
            for d in list_d:
                for r in list_r:
                    vec = graph.CDNK_Vectorizer(d=d, r=r, L=len(graphs),
                                                n_nodes=len(graphs[0].nodes()),
                                                discrete=not args.use_vec)
                    kernel_matrices.append(vec.cdnk(g=g_union))

    print("Evaluating model...")
    if args.use_lou:
        val = Validation(kernels=kernel_matrices, all_genes=all_genes,
                         training_genes=training_genes,
                         training_labels=training_labels)
        print('============')
        print('Performances')
        auc = val.validate_leave_one_out()
        print(auc)
    else:
        val = Validation(kernels=kernel_matrices, all_genes=all_genes,
                         training_genes=training_genes,
                         training_labels=training_labels, n_folds=5)
        print('============')
        print('Performances')
        aucs = val.validate_kfolds()
        for auc in aucs:
            print(auc)
        print('-----------')
        print(np.mean(aucs))
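# A minimal, hypothetical argparse driver for main() above -- a sketch only.
# The attribute names are inferred from what main() reads off `args`; the
# flag spellings, types, and nargs='+' choices are assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--adj_folder')
    parser.add_argument('--train_genes_file')
    parser.add_argument('--train_labels_file')
    parser.add_argument('--all_genes_file')
    parser.add_argument('--node_vecs_file')
    parser.add_argument('--list_D', nargs='+', type=int)
    parser.add_argument('--list_C', nargs='+', type=int)
    parser.add_argument('--list_d', nargs='+', type=int)
    parser.add_argument('--list_r', nargs='+', type=int)
    parser.add_argument('--use_vec', action='store_true')
    parser.add_argument('--use_lou', action='store_true')
    main(parser.parse_args())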
def __init__(self, gfile, tagdict):
    self.dict = defaultdict(list)
    self.mapping = {}
    self.tagdict = tagdict
    self.feedback = defaultdict(list)
    self.processed = {}
    c = 1  # sentence id
    n = 0  # running token id over the whole file
    t = 0  # token id within the current sentence
    goldlist = util.load_list_from_file(gfile)
    self.feedback[c] = []
    for g in goldlist:
        if g == '\n' or g == '':
            # blank line: sentence boundary
            c += 1
            t = 0
            self.feedback[c] = []
        else:
            wrd, tag = g.split("\t")
            self.feedback[c].append(None)
            self.dict[c].append({'w': wrd, 't': tag, 'a': []})
            self.mapping[n] = {}
            self.mapping[n]['sid'] = c
            self.mapping[n]['tid'] = t
            n += 1
            t += 1
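# The constructor above (of the Sentences class used in run_cmd_interface
# below) expects a CoNLL-style, tab-separated gold file: one "word<TAB>tag"
# pair per line, with a blank line between sentences. A hypothetical
# three-token example; the file name and tags are made up:
#
#   The     DT
#   dog     NN
#   barks   VBZ
#   <blank line>
#
#   sentences = Sentences('gold.txt', tagdict)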
def __init__(self):
    self.sorted_conns_list = self.get_sorted_conns_list()
    self.cpos_dict = self.get_CPOS_dict()
    self.prev_C_dict = self.get_prev_C_dict()
    self.prevPOS_dict = self.get_prevPOS_dict()
    self.prevPOS_CPOS_dict = self.get_prevPOS_CPOS_dict()
    self.C_next_dict = self.get_C_next_dict()
    self.nextPOS_dict = self.get_nextPOS_dict()
    self.CPOS_nextPOS_dict = self.get_CPOS_nextPOS()
    self.CParent_to_root_path_dict = self.get_CParent_to_root_path_dict()
    self.compressed_CParent_to_root_path_dict = \
        self.get_compressed_CParent_to_root_path_dict()
    self.self_category_dict = self.get_self_category_dict()
    self.parent_category_dict = self.get_parent_category_dict()
    self.left_sibling_category_dict = self.get_left_sibling_category_dict()
    self.right_sibling_category_dict = self.get_right_sibling_category_dict()
    self.conn_self_category_dict = self.get_conn_self_category_dict()
    self.conn_parent_category_dict = self.get_conn_parent_category_dict()
    self.conn_left_sibling_category_dict = \
        self.get_conn_left_sibling_category_dict()
    self.conn_right_sibling_category_dict = \
        self.get_conn_right_sibling_category_dict()
    self.self_parent_dict = self.get_self_parent_dict()
    self.self_right_dict = self.get_self_right_dict()
    self.self_left_dict = self.get_self_left_dict()
    self.parent_left_dict = self.get_parent_left_dict()
    self.parent_right_dict = self.get_parent_right_dict()
    self.left_right_dict = self.get_left_right_dict()
    self.conn_category = self.get_conn_category_dict()

    ''' mine '''
    self.dict_conn_lower_case = util.load_dict_from_file(
        config.CONNECTIVE_DICT_CONN_LOWER_CASE)
    self.dict_conn = util.load_dict_from_file(config.CONNECTIVE_DICT_CONN)
    self.dict_prevPOS_C = util.load_dict_from_file(
        config.CONNECTIVE_DICT_PREVPOS_C)
    self.dict_self_category_to_root_path = util.load_dict_from_file(
        config.CONNECTIVE_DICT_SELF_CATEGORY_TO_ROOT_PATH)
    self.dict_CParent_to_root_path_node_names = util.load_dict_from_file(
        config.CONNECTIVE_DICT_CPARENT_TO_ROOT_PATH_NODE_NAMES)
    self.dict_conn_connCtx = util.load_dict_from_file(
        config.CONNECTIVE_DICT_CONN_CONNCTX)
    self.dict_conn_rightSiblingCtx = util.load_dict_from_file(
        config.CONNECTIVE_DICT_CONN_RIGHTSIBLINGCTX)
    self.dict_conn_leftSiblingCtx = util.load_dict_from_file(
        config.CONNECTIVE_DICT_CONN_LEFTSIBLINGCTX)
    self.dict_conn_left_right_SiblingCtx = util.load_dict_from_file(
        config.CONNECTIVE_DICT_CONN_LEFT_RIGHT_SIBLINGCTX)
    self.dict_conn_parent_category_Ctx = util.load_dict_from_file(
        config.CONNECTIVE_DICT_CONN_PARENT_CATEGORY_CTX)
    self.dict_rightSibling_production_rules = util.load_dict_from_file(
        config.CONNECTIVE_DICT_CONN_RIGHTSIBLING_PRODUCTION_RULES)
    self.word2vec_conns_list = util.load_list_from_file(
        config.WORD2VEC_CONNS_PATH)
def getLowestCompetence(comp_file_path):
    """Return the index of the annotator with the lowest competence score.

    The competence file is expected to hold one tab-separated line of
    per-annotator float scores.
    """
    values = util.load_list_from_file(comp_file_path)  # renamed from `list` to avoid shadowing the built-in
    lowest = 1
    index = -1
    for i, val in enumerate(values[0].split("\t")):
        if float(val) <= lowest:  # equivalent to the original max(float(val), lowest) == lowest
            lowest = float(val)
            index = i
    return index
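# Usage sketch for getLowestCompetence(). The './competence' file is the
# per-annotator competence output of a MACE run (see run_iteration below);
# the example scores here are made up.
#
#   $ cat ./competence
#   0.91	0.45	0.78
#
#   bad_guy = getLowestCompetence('./competence')   # -> 1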
def main(args):
    adjacency_folder = args.adj_folder
    training_genes_file = args.train_genes_file
    training_labels_file = args.train_labels_file
    all_genes_file = args.all_genes_file
    list_D = args.list_D
    list_C = args.list_C
    list_d = args.list_d
    list_r = args.list_r

    training_genes = util.load_list_from_file(training_genes_file)
    training_labels = [
        int(l) for l in util.load_list_from_file(training_labels_file)
    ]
    all_genes = util.load_list_from_file(all_genes_file)

    # Creating list of graphs
    graphs = gu.create_graphs(adjacency_folder_path=adjacency_folder)

    # Computing kernel matrices
    kernel_matrices = []
    for D in list_D:
        for C in list_C:
            g_union = gu.union_graphs(graphs=graphs, deg_threshold=D,
                                      cli_threshold=C)
            for d in list_d:
                for r in list_r:
                    vec = graph.CDNK_Vectorizer(d=d, r=r, L=len(graphs),
                                                n_nodes=len(graphs[0].nodes()))
                    kernel_matrices.append(vec.cdnk(g=g_union))

    val = Validation(kernels=kernel_matrices, all_genes=all_genes,
                     training_genes=training_genes,
                     training_labels=training_labels)
    auc = val.validation()
    print(auc)
def readGold(gold, sentences):
    goldlist = []
    # read gold file
    gold = util.load_list_from_file(gold)
    for line in gold:
        if line != '\n' and line != '':
            word, tag = line.split('\t')
            # new, unknown tag? add to dictionary and persist it
            if tag not in sentences.tagdict:
                newTag = len(sentences.tagdict) + 1
                sentences.tagdict[tag] = newTag
                util.write_dict_keys_to_file(sentences.tagdict,
                                             './dict/tags.txt')
            goldlist.append(sentences.tagdict[tag])
    return goldlist
def get_sorted_conns_list(self):
    # print "loading sorted_conns_list ..."
    return util.load_list_from_file(config.SORTED_ExpConn_PATH)
from sklearn.metrics import accuracy_score
from sklearn import cross_validation
import random
from sklearn import svm
import numpy as np  # used below; missing from the original snippet
import util

seed = 7
np.random.seed(seed)

#label_path = "/media/dinh/DATA/Test/graph_labels/CPDB"
label_path = "/home/dinh/fast-disk/Weisfeiler_GK/scikit-learn-graph/scripts/graph_labels/CPDB"
#kernel_folder_path = "/media/dinh/DATA/Test/kernels/cpdb_reduce/"
kernel_folder_path = "/home/dinh/fast-disk/Weisfeiler_GK/scikit-learn-graph/scripts/kernels/cpdb/"
#save_file = "/media/dinh/DATA/Test/results/cpdb_nested_3"
save_file = "/home/dinh/fast-disk/Weisfeiler_GK/scikit-learn-graph/scripts/results_cv/cpdb"

pre_labels = [int(label) for label in util.load_list_from_file(label_path)]

"""Loading kernels"""
pre_list_kernels = []
kernel_names = util.list_files_in_folder(kernel_folder_path)
for kernel_name in kernel_names:
    pre_list_kernels.append(
        np.fromfile(kernel_folder_path + kernel_name).reshape(
            len(pre_labels), len(pre_labels)))

svm_paras = [1e-0, 1e+1, 1e+2, 1e+3, 1e+4]
n_folds = 10

"""Model selection"""


def model_selection(list_kernels=None, svm_paras=None,
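# The model_selection definition above is cut off in this snippet. Below is a
# minimal sketch, under assumptions, of one way to select over precomputed
# kernels: for each (kernel, C) pair, average the k-fold accuracy and keep the
# best pair. The function name and body are hypothetical, not the original
# implementation, and it uses the newer sklearn.model_selection API rather
# than the deprecated cross_validation module imported above.
def model_selection_sketch(list_kernels, svm_paras, labels, n_folds=10):
    from sklearn.model_selection import StratifiedKFold
    labels = np.asarray(labels)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    best_score, best_pair = -1.0, None
    for k_idx, K in enumerate(list_kernels):
        for C in svm_paras:
            fold_scores = []
            for tr, te in skf.split(np.zeros((len(labels), 1)), labels):
                clf = svm.SVC(C=C, kernel='precomputed')
                clf.fit(K[np.ix_(tr, tr)], labels[tr])  # train-vs-train gram matrix
                fold_scores.append(clf.score(K[np.ix_(te, tr)], labels[te]))  # test-vs-train
            if np.mean(fold_scores) > best_score:
                best_score, best_pair = np.mean(fold_scores), (k_idx, C)
    return best_pair, best_score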
def evaluate(adjacency_path=None, node_label_folder=None, all_gene_path=None,
             train_gene_folder=None, train_label_folder=None, n_iters=None,
             n_hops=None, n_clusters=None, svm_paras=None, save_folder=None):
    all_genes = util.load_list_from_file(all_gene_path)
    number_svm_parameters = len(svm_paras)
    dict_gene_idx = {}
    for idx, gene in enumerate(all_genes):
        dict_gene_idx[gene] = idx
    graph = util.create_graph(adjacency_path=adjacency_path)
    for n_cluster in n_clusters:
        util.node_labeling(g=graph,
                           label_path=node_label_folder + str(n_cluster))
        for n_iter in n_iters:
            WLvect = WLVectorizer(r=n_iter)
            iters_features = WLvect.transform([graph])
            M = iters_features[0][0]
            for iter_id in range(1, n_iter + 1):
                M = M + iters_features[iter_id][0]
            print 'Done WL computation'
            sys.stdout.flush()
            for n_hop in n_hops:
                print 'Beginning DWL computation'
                sys.stdout.flush()
                G = util.deepwl(graph=graph, feature_matrix=M, n_hop=n_hop)
                print "Size of G", G.shape
                print 'Done DWL computation'
                sys.stdout.flush()
                for disease_idx in range(12):
                    list_training_genes = util.load_list_from_file(
                        train_gene_folder + str(disease_idx))
                    list_training_labels = util.load_list_from_file(
                        train_label_folder + str(disease_idx))
                    list_training_labels = [int(e) for e in list_training_labels]
                    list_qscores = [[] for i in range(number_svm_parameters)]
                    # leave-one-out over the training genes
                    for gene_idx, gene in enumerate(list_training_genes):
                        list_training_genes_del = list_training_genes[:]
                        del list_training_genes_del[gene_idx]
                        training_genes_idx = [
                            dict_gene_idx[g] for g in list_training_genes_del
                        ]
                        list_training_labels_del = list_training_labels[:]
                        del list_training_labels_del[gene_idx]
                        unknown_genes_idx = [dict_gene_idx[gene]]
                        for idx in range(len(all_genes)):
                            if (idx not in training_genes_idx) and (
                                    idx != dict_gene_idx[gene]):
                                unknown_genes_idx.append(idx)
                        Mtr = util.extract_submatrix(training_genes_idx,
                                                     training_genes_idx, G)
                        M_unknown = util.extract_submatrix(
                            unknown_genes_idx, training_genes_idx, G)
                        for idx_svm, svm_para in enumerate(svm_paras):
                            clf = svm.SVC(C=svm_para, kernel='precomputed')
                            clf.fit(Mtr, list_training_labels_del)
                            scores = clf.decision_function(M_unknown)
                            len_scores = len(scores)
                            # q-score: fraction of candidate genes ranked below
                            # the held-out gene (scores[0] is the held-out gene)
                            qscore = float(
                                sum([int(scores[0] > val)
                                     for val in scores])) / len_scores
                            list_qscores[idx_svm].append(qscore)
                    # computing auc
                    save_lines = []
                    for qscores_idx, qscores in enumerate(list_qscores):
                        fpr, tpr, thresholds = metrics.roc_curve(
                            list_training_labels, qscores, pos_label=1)
                        auc = metrics.auc(fpr, tpr)
                        line = (str(n_cluster) + "_" + str(n_iter) + "_" +
                                str(n_hop) + "_" + str(qscores_idx) + ":\t" +
                                str(auc) + "\n")
                        save_lines.append(line)
                    f = open(save_folder + str(disease_idx), 'w')
                    f.writelines(save_lines)
                    f.close()
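# A tiny worked example of the q-score used above, with made-up numbers.
# scores[0] is the decision value of the held-out gene; the q-score is the
# fraction of all candidates it outranks, so a perfectly ranked gene gets a
# value close to 1.
#
#   scores = [0.8, 0.3, -0.1, 0.9]                 # held-out gene scored 0.8
#   qscore = sum(0.8 > s for s in scores) / 4.0    # -> 2/4 = 0.5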
# Loading dictionaries from files
with open('../common/data/annotated/hashtags.json', 'r') as file:
    dict_hashtags = json.load(file)
with open('../common/data/annotated/words_lemmatized.json', 'r',
          encoding="utf-8") as file:
    dict_words = json.load(file)
# "Like it"
# dict_words = util.load_word_classification_ilikeit('../common/data/external/06032019-POLARITY-JEUXDEMOTS-FR.csv')
with open('../common/data/annotated/emojis.json', 'r') as file:
    dict_emojis = json.load(file)

heavy_negatives_list = util.load_list_from_file(
    '../common/data/annotated/heavy_negatives_filtered.txt')
mildly_heavy_negatives_list = util.load_list_from_file(
    '../common/data/annotated/midly_heavy_negatives_filtered.txt')

dict_date = {}
# polarity labels (French) mapped to numeric scores
dict_correspondances = {"positif": 1, "negatif": -1, "neutre": 0}

# data = util.get_all_tweets()
data = util.get_all_unique_tweets()
tweets_polarity = {}
cpt_neg_words = 0
def run_iteration(options, sentences, table, c):
    #####
    # create new annotator from best predictions and get N tags for the
    # predictions with the highest entropy and update the new annotator
    # (feedback from oracle)

    # read the MACE predictions from the previous run ('prediction' is
    # written by the initial MACE run in run_cmd_interface); in the original
    # code predlist was only assigned at the end of this function, which
    # would raise a NameError on first use below
    predlist = util.load_list_from_file('prediction')

    # either replace a random annotator in every round
    bad_guy = random.choice(range(len(table)))
    # or replace the one with the lowest competence (don't! doesn't work well...)
    if options.lowest:
        bad_guy = getLowestCompetence('./competence')
    replaceLowestCompetenceByMACEpred(bad_guy, table, predlist, c)
    # print "Replace annotator ", bad_guy

    # select table entry with highest entropy
    # returns a list with row indices for the rows in the table that have
    # the highest entropy (from file "entropy" created by MACE)
    N = 1
    indices = sentences.entropy(N)
    #### don't use: indices = sentences.entropyMajority(predlist, N)
    #indi = sentences.get_entropy_for_row(N)
    print "ENTROPY: ", indices
    #print "MAJORITY: ", indi
    print

    for ind in indices:
        print "Option: ", options.simulation
        if options.simulation:
            print "run simulation........................."
            # either from the gold standard (AL simulation)
            tag = sentences.getOracleTag(ind)
        else:
            print "ask oracle............................."
            # or from the oracle (real AL)
            tag = sentences.getAnnotation(ind, c)
        util.print_log("TAG: " + tag.upper() + "\n")

    # now use oracle tags as feedback, combined with the predlist features
    # from MACE; create control file and rerun EM
    if options.feedback:
        sentences.write_feedback_to_file(options.feedback, bad_guy)
        sentences.update_predictions(bad_guy)
    sentences.backupFiles(c)

    # read updated predictions from file (with feedback from oracle/AL simulation)
    #table = util.readPredictions(options.path)
    # convert tags to numerical representation
    #table = convert2num(table, tagdict)

    # run MACE
    if options.entropies:
        if options.feedback:
            command = "java -jar ./MACE.jar --controls feedback --entropies preds.csv"
        else:
            command = "java -jar ./MACE.jar --entropies preds.csv"
    else:
        if options.feedback:
            command = "java -jar ./MACE.jar --controls feedback preds.csv"
        else:
            command = "java -jar ./MACE.jar preds.csv"
    if options.restarts:
        newarg = "MACE.jar --restarts " + options.restarts
        command = command.replace("MACE.jar", newarg)
    if options.vanilla:
        newarg = "MACE.jar --em "
        command = command.replace("MACE.jar", newarg)
    os.system(command)

    # read MACE predictions
    predlist = util.load_list_from_file('prediction')

    # if we have a gold file, evaluate best predictions against gold
    if options.goldstandard != False:
        evalTagger(options.goldstandard, sentences)
        evalPredictions(options.goldstandard, predlist, sentences)
def run_cmd_interface():
    # parse command line options
    optparser = OptionParser()
    optparser.add_option("-p", "--predfile", dest="path",
                         default="../data/output",
                         help="path to dir with predictions", metavar="DIR")
    optparser.add_option("-g", "--gold", dest="goldstandard", default=False,
                         help="read gold standard annotations (optional)")
    optparser.add_option("-f", "--feedback", dest="feedback", default=False,
                         help="read feedback from oracle annotations (optional)")
    optparser.add_option("-e", "--entropies", dest="entropies", default=False,
                         help="print entropies for each instance to file (MACE)")
    optparser.add_option("-m", "--majority", dest="majority", default=False,
                         help="instead of posterior entropies, use majority vote")
    optparser.add_option("-r", "--restarts", dest="restarts", default=False,
                         help="number of random model initialisations")
    optparser.add_option("-s", "--simulation", dest="simulation", default=False,
                         help="run program as AL simulation")
    optparser.add_option("-v", "--vanilla", dest="vanilla", default=False,
                         help="run program with vanilla EM")
    optparser.add_option("-l", "--lowest", dest="lowest", default=False,
                         help="replace annotator with the lowest competence")
    (options, args) = optparser.parse_args()

    tagdict = util.load_dict_from_file('./dict/tags.txt')
    mapping = util.load_mapping_from_file('./dict/mapping.txt')

    # read gold standard (or, if no gold annotations exist, list with word tokens)
    sentences = Sentences(options.goldstandard, tagdict)
    # read predictions from file
    table = util.readPredictions(options.path)
    # add predictions
    sentences.addPredictions(table)
    # convert tags to numerical representation
    table = convert2num(table, sentences)
    # print table content to csv file
    sentences.printCSV()
    # and update table with existing predictions from experiment (preds.csv)

    # run MACE
    if options.entropies:
        command = "java -jar ./MACE.jar --entropies preds.csv"
    else:
        command = "java -jar ./MACE.jar preds.csv"
    if options.restarts:
        newarg = "MACE.jar --restarts " + options.restarts
        command = command.replace("MACE.jar", newarg)
    if options.vanilla:
        newarg = "MACE.jar --em "
        command = command.replace("MACE.jar", newarg)
    os.system(command)

    # read MACE predictions
    predlist = util.load_list_from_file('prediction')

    # if we have a gold file, evaluate best predictions against gold
    if options.goldstandard != False:
        evalTagger(options.goldstandard, sentences)
        evalPredictions(options.goldstandard, predlist, sentences)

    # FIXME: the max range should also be a parameter (right now the number
    # of iterations is hard-coded); also, it should be able to end/continue
    # with the annotation at every point in time...
    for c in range(0, 500):
        run_iteration(options, sentences, table, c)
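# Example invocation of the active-learning loop above. The script name
# 'active_learning.py' is a placeholder; the flags are the ones defined in
# run_cmd_interface (note the options take a value, so truthy strings like
# "1" enable them):
#
#   python active_learning.py -p ../data/output -g gold.txt -s 1 -r 10
#
# i.e. read predictions from ../data/output, evaluate against gold.txt,
# run as an AL simulation, and use 10 random restarts for MACE.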