def rate_solution(solution, problem):
    # Cost of an assignment: sum of distance * flow over every location pair.
    distances = problem[1]
    flows = problem[3]
    result = 0
    for pair in utils.generate_pairs(len(solution)):
        distance = distances[pair]
        flow = flows[utils.correct_pair((solution[pair[0]], solution[pair[1]]))]
        result += distance * flow
    return result
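# Hypothetical sketch, not part of the original module: judging from how
# rate_solution and validate_problem use them, utils.generate_pairs(n) is
# assumed to yield every index pair (i, j) with i < j in a fixed order, and
# utils.correct_pair to reorder a pair so the smaller element comes first,
# matching the keys of the distances/flows dictionaries.
def generate_pairs(size):
    return [(i, j) for i in range(size) for j in range(i + 1, size)]

def correct_pair(pair):
    return pair if pair[0] < pair[1] else (pair[1], pair[0])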
def validate_problem(problem):
    locations = problem[0]
    distances = problem[1]
    facilities = problem[2]
    flows = problem[3]
    size = len(locations)
    if len(facilities) != size:
        return False
    pairs = utils.generate_pairs(size)
    if len(pairs) != len(distances):
        return False
    if len(distances) != len(flows):
        return False
    distances_keys = list(distances.keys())
    flows_keys = list(flows.keys())
    for i, pair in enumerate(utils.generate_pairs(size)):
        if pair != distances_keys[i] or pair != flows_keys[i]:
            return False
    return True
def generate_random_problem(size):
    locations = []
    distances = {}
    facilities = []
    flows = {}
    location_names = ["Warsaw", "Cracow", "Lodz", "Wroclaw", "Poznan", "Gdansk",
                      "Szczecin", "Bydgoszcz", "Lublin", "Bialystok", "Katowice", "Gdynia"]
    facility_names = ["Phone factory", "Refinery", "Coal mine", "Hospital", "Car factory",
                      "Missile silo", "Centrifuge", "Shipyard", "Port", "Power plant",
                      "5G tower", "Airport"]
    rand.shuffle(location_names)
    rand.shuffle(facility_names)
    #size = rand.randint(3, 10)
    for i in range(size):
        locations.append(location_names[i])
        facilities.append(facility_names[i])
    for pair in utils.generate_pairs(size):
        distances[pair] = rand.randint(50, 1000)
        flows[pair] = rand.randint(0, 100)
    return [locations, distances, facilities, flows]
def get_output_for_writing_problem_to_file(problem):
    locations = problem[0]
    distances = problem[1]
    facilities = problem[2]
    flows = problem[3]
    size = len(problem[0])
    output = []
    cnt = 1
    output.append(str(size) + '\n')
    pairs = utils.generate_pairs(size)
    for i in range(size):
        output.append(locations[i] + '\n')
        cnt += 1
    for pair in pairs:
        output.append(str(distances[pair]) + '\n')
        cnt += 1
    for i in range(size):
        output.append(facilities[i] + '\n')
        cnt += 1
    for pair in pairs:
        output.append(str(flows[pair]) + '\n')
        cnt += 1
    return output
def read_problem_from_file(filename):
    locations = []
    distances = {}
    facilities = []
    flows = {}
    with open(filename, "r") as file:
        lines = file.read().splitlines()
    size = int(lines[0])
    cnt = 1
    pairs = utils.generate_pairs(size)
    for i in range(size):
        locations.append(lines[cnt])
        cnt += 1
    for pair in pairs:
        distances[pair] = int(lines[cnt])
        cnt += 1
    for i in range(size):
        facilities.append(lines[cnt])
        cnt += 1
    for pair in pairs:
        flows[pair] = int(lines[cnt])
        cnt += 1
    return [locations, distances, facilities, flows]
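# Usage sketch, not from the original module: build a random instance, check it,
# round-trip it through a file, and rate the identity assignment. The file name
# "qap_instance.txt" is made up for illustration.
if __name__ == "__main__":
    problem = generate_random_problem(5)
    assert validate_problem(problem)
    with open("qap_instance.txt", "w") as f:
        f.writelines(get_output_for_writing_problem_to_file(problem))
    reloaded = read_problem_from_file("qap_instance.txt")
    identity_solution = list(range(len(reloaded[0])))
    print(rate_solution(identity_solution, reloaded))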
################ Get FACE projectors ################
svd_dims = [3, 10, 50]
proj_face = []
for subspace_d in svd_dims:
    proj = compl_svd_projector(all_names_embed, svd=subspace_d)
    proj_face.append(proj)

################ Get EXPLORE metric ################
np.random.seed(1)

# Comparable
n_pairs_comp = 50000
unique_names_idx = np.unique(names_from_df, return_index=True)[1]
pairs_idx = generate_pairs(len(unique_names_idx), len(unique_names_idx), n_pairs=n_pairs_comp)
comparable_pairs = all_names_embed[unique_names_idx[pairs_idx[0]]] - all_names_embed[unique_names_idx[pairs_idx[1]]]

# In-comparable
n_pairs_incomp = 50000
pos_idx = np.where(y == 1)[0]
neg_idx = np.where(y == -1)[0]
pairs_idx = generate_pairs(len(pos_idx), len(neg_idx), n_pairs=n_pairs_incomp)
incomp_pairs = X[pos_idx[pairs_idx[0]]] - X[neg_idx[pairs_idx[1]]]

# Pairs data
X_pairs = np.vstack((comparable_pairs, incomp_pairs))
Y_pairs = np.zeros(n_pairs_comp + n_pairs_incomp)
Y_pairs[:n_pairs_comp] = 1
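# Hypothetical sketch, not from this script: generate_pairs here is assumed to
# sample n_pairs random index pairs, returning two arrays that can be used as
# pairs_idx[0] / pairs_idx[1] above; whether the real helper excludes i == j or
# duplicate pairs is an assumption not reproduced here.
def generate_pairs(n_first, n_second, n_pairs):
    first = np.random.randint(0, n_first, size=n_pairs)
    second = np.random.randint(0, n_second, size=n_pairs)
    return first, second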
This is a temporary script.
"""
from sklearn.preprocessing import normalize
from utils import readfile, roc_report, generate_pairs
from metric_learn import distances_pairs
from scipy.spatial import distance
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

X_train_facile, Y_train_facile = readfile('data_train_facile', test=False)
X_train_facile = normalize(X_train_facile)
pairs_idx, pairs_label = generate_pairs(Y_train_facile, 1000, 0.1)
#X_train_facile = normalize(X_train_facile[:10000])
#pairs_idx, pairs_label = generate_pairs(Y_train_facile[:10000], 1000, 0.1)

scores = []
possible_distances = [("Cosine", distance.cosine), ("BrayCurtis", distance.braycurtis),
                      ("Euclidean", distance.euclidean), ("Manhattan", distance.cityblock),
                      ("Chebyshev", distance.chebyshev), ("Hamming", distance.hamming),
                      ("Correlation", distance.correlation)]
for (name, func) in possible_distances:
    print(name)
    dist = distances_pairs(X_train_facile, pairs_idx, func)
    #print(dist)
    score_facile, score_difficile = roc_report(pairs_label, dist, name)
    scores.append((name, score_facile, score_difficile))
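# Hypothetical sketch, not the project's actual metric_learn module:
# distances_pairs is assumed to apply the given scipy distance function to each
# (i, j) row of pairs_idx and return one distance per pair.
def distances_pairs(X, pairs_idx, dist_func):
    return np.array([dist_func(X[i], X[j]) for i, j in pairs_idx])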
def process_doc_chains(M, doc_chains):
    #Cx, Cxy are total unigram and bigram counts over gigaword (subsection), respectively
    #DOCx, DOCxy are total document frequencies over gigaword for unigrams and bigrams, respectively
    DOCx_found = False
    DOCxy_found = False
    Cx_found = False
    Cxy_found = False
    total_key = M.total_key
    args = M.args
    Cx = M.Cx
    Cxy = M.Cxy
    DOCx = M.DOCx
    DOCxy = M.DOCxy
    Cx_baseline = M.Cx_baseline
    docset_x = set()
    docset_xy = set()

    #if only looking at nsubj and dobj dependencies (as in skipgram paper)
    if args.subjobj:
        #wrap in list() so doc_chains can be iterated more than once under Python 3
        doc_chains = list(map(lambda z: [x for x in z if x.endswith('->nsubj') or x.endswith('->dobj')],
                              doc_chains))

    for seq in doc_chains:
        for vdep in seq:
            Cx_baseline[vdep] += 1

    #filter for long or longest chains if option is enabled
    if args.coref == 'longest':
        #only retain the longest coref chain(s)
        doc_chains = [chain for chain in doc_chains
                      if len(chain) == len(max(doc_chains, key=lambda x: len(x)))]
    elif args.coref == 'long':
        #select all chains with five or more events
        doc_chains = [chain for chain in doc_chains if len(chain) >= 5]
    else:
        assert args.coref == 'all'  #all coref chains are included in counting

    for seq in doc_chains:
        #update DOCx (i.e. unigram document frequencies)
        if not DOCx_found:
            for vdep in seq:
                if not vdep in docset_x:
                    DOCx[vdep] += 1
                    docset_x.add(vdep)

        #make unigram updates to Cx, DOCx separately if using naive counts
        #naive: a a b a c => Cx[a] = 3
        if args.naive_uni:
            if not Cx_found:
                for vdep in seq:
                    #increment total count for vdep
                    if not Cx_found:
                        Cx[vdep] += 1
                        Cx[total_key] += 1

        #make bigram updates to Cxy, DOCxy
        #also make updates to Cx, DOCx if marginalizing/non-naive counts
        #non-naive: a a b a c => Cx[a] = 12 = Cxy[a,*]+Cxy[*,a]
        if (not Cxy_found) or (not DOCxy_found) or (not args.naive_uni and not Cx_found):
            vdep_pairs = utils.generate_pairs(seq, M.args)
            for vdep_pair in vdep_pairs:
                if not args.naive_uni:
                    if not Cx_found:
                        Cx[vdep_pair[0]] += 1
                        Cx[vdep_pair[1]] += 1
                        Cx[total_key] += 2
                #increment total count for vdep_pair
                if not Cxy_found:
                    Cxy[vdep_pair] += 1
                    Cxy[total_key] += 1
                #increment doc count for vdep_pair
                if (not vdep_pair in docset_xy) and (not DOCxy_found):
                    DOCxy[vdep_pair] += 1
                    docset_xy.add(vdep_pair)
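# Hypothetical sketch, not the project's utils module: from the
# "a a b a c => Cx[a] = 12" comment above, utils.generate_pairs(seq, args) is
# assumed to yield (seq[i], seq[j]) for every position pair i < j in the chain
# (10 pairs for a 5-event chain); any windowing or ordering options carried in
# args are not reproduced here.
def generate_pairs(seq, args):
    return [(seq[i], seq[j]) for i in range(len(seq)) for j in range(i + 1, len(seq))]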
def get_connectivity_metrics(gt_graph_file, osm_diff_graph_file, post_graph_file,
                             verbose=False, num_pairs=1000):
    s_points, e_points = generate_pairs(n=num_pairs)

    gt_graph = GraphSuper(gt_graph_file)
    post_graph = GraphSuper(post_graph_file)
    osm_diff_graph = GraphSuper(osm_diff_graph_file)

    osm_diff_metric = Metric(num_pairs)
    post_metric = Metric(num_pairs)
    gt_metric = Metric(num_pairs)

    for start_points, end_points in zip(s_points, e_points):
        gt_val = find_path(gt_graph, start_points, end_points, gt_metric, length_key='weight')
        if gt_val == -1:
            # osm_diff_metric.reduce_total_paths()
            # post_metric.reduce_total_paths()
            if verbose:
                print('couldnt find path in gt', start_points, end_points)
            # continue

        osm_val = find_path(osm_diff_graph, start_points, end_points, osm_diff_metric,
                            length_key='weight')
        if osm_val == -1:
            if verbose:
                print('couldnt find path in osm', start_points, end_points)
            osm_diff_metric.update_fn(gt_val)
        else:
            osm_diff_metric.update_correct(gt_val, osm_val)

        post_val = find_path(post_graph, start_points, end_points, post_metric, length_key='weight')
        if post_val == -1:
            if verbose:
                print('couldnt find path in post', start_points, end_points)
            post_metric.update_fn(gt_val)
        else:
            post_metric.update_correct(gt_val, post_val)

    if verbose:
        print('\n osm diff')
        osm_diff_metric.print_all()
        print('\n post')
        post_metric.print_all()
        print('\n gt')
        gt_metric.print_all()

    return osm_diff_metric.get_all(), post_metric.get_all()
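# Usage sketch: GraphSuper, Metric, find_path, and generate_pairs are assumed to
# come from the surrounding project; the graph file names below are made up for
# illustration.
if __name__ == "__main__":
    osm_scores, post_scores = get_connectivity_metrics(
        "gt_graph.gpickle", "osm_diff_graph.gpickle", "post_graph.gpickle",
        verbose=True, num_pairs=100)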