def estimate_pattern_by_misamp(pattern: Pattern, mallows: Mallows, threshold=0.01, single_core_workload=50, num_cores=None, verbose=False):
    """Estimate the marginal probability of `pattern` under `mallows` by MIS sampling.

    The pattern is first decomposed into seed preferences, then the actual
    estimation is delegated to estimate_union_of_prefs with the same
    convergence threshold and parallelism settings.
    """
    pattern.calculate_tc()
    seeds = decompose_pattern(pattern)
    if verbose:
        print(f'\npattern: {pattern}\n{mallows}\n#seeds={len(seeds)}\n')
    return estimate_union_of_prefs(seeds, mallows, threshold,
                                   single_core_workload, num_cores, verbose)
def test_a_single_case():
    """Compare the exact LTM probability against the MIS-sampling estimate
    on one hand-built 3-label pattern over a 10-item Mallows model."""
    from inference.ltm.ltm_wrapper import calculate_marginal_prob_over_mallows_by_ltm

    verbose = True
    threshold = 0
    # Loaded for parity with other tests; the model and pattern below override it.
    patterns, mallows, p_exact = get_test_case_of_patterns_from_synthetic_4_labels(
        2)
    mallows = Mallows(list(range(10)), 0.03)
    pattern = Pattern(
        label_to_children={'a': {'b'}, 'b': {'c'}},
        label_to_items={'a': {4, 8}, 'b': {1, 3}, 'c': {4}},
    )
    res_exact = calculate_marginal_prob_over_mallows_by_ltm(mallows=mallows,
                                                           pattern=pattern)
    print(res_exact)
    res_samp = estimate_pattern_by_misamp1(mallows=mallows,
                                           pattern=pattern,
                                           threshold=threshold,
                                           verbose=verbose)
    print(res_samp)
def test_2_label():
    """Check isramp_over_patterns against a precomputed exact probability
    for the first row of the 2-label movielens benchmark."""
    import pandas as pd
    from core.patterns import PATTERN_SEP

    df_in = pd.read_csv('../../data/input_movielens_ramp-vs-amp_2labels.csv')
    df_exact = pd.read_csv(
        '../../data/output_movielens_ramp-vs-amp_2labels_exact.csv')
    for rid in df_exact['rid']:
        p_exact = df_exact.loc[rid, 'p_exact']
        row = df_in.loc[rid]
        # NOTE(review): eval on CSV content — assumes the data files are trusted.
        model = Mallows(center=eval(row['ranking']), phi=row['phi'])
        pattern_list = [
            Pattern.from_string(s)
            for s in row['patterns'].split(PATTERN_SEP)
        ]
        res = isramp_over_patterns(pattern_list, model)
        print(res, f'p_exact = {p_exact}')
        break  # only the first case is exercised
def calculate_marginal_prob_over_mallows_by_ltm(pattern: Pattern, mallows: Mallows, num_cores=None, timeout=None):
    """Compute the marginal probability of `pattern` under `mallows` by
    delegating to the LTM Java implementation (ltm.jar).

    The pattern and model are serialized to a temp JSON file next to this
    module; the jar is invoked on that file and writes its results back into
    the same file, which is then read here.

    Args:
        pattern: the label pattern to evaluate.
        mallows: the Mallows model (center ranking + phi).
        num_cores: threads for ltm.jar; defaults to os.cpu_count().
        timeout: seconds to wait for the jar before giving up.

    Returns:
        (success, probability, runtime_ms); on timeout returns (False, 0, 0).
    """
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    jar_file = cur_dir + '/ltm.jar'
    json_file = cur_dir + '/temp.json'
    ltm_verbose_file = cur_dir + '/ltm_verbose.txt'
    # num_cores is for ltm.jar multi-threading
    num_cores = num_cores or os.cpu_count()

    # Represent the pattern as a list of nodes, each {name, items, children}.
    nodes = []
    for node_name, items in pattern.label_to_items.items():
        items_in_node = [int(item) for item in items]
        children_names = [
            f'L-{child}'
            for child in pattern.iter_direct_children_of_label(node_name)
        ]
        nodes.append({
            'name': f'L-{node_name}',
            'items': items_in_node,
            'children': children_names
        })

    # Save pattern and Mallows model info in a local JSON file for the jar.
    # (Removed a leftover debug print of `nodes` that polluted stdout.)
    with open(json_file, 'w') as outfile:
        json.dump(
            {
                'pattern': nodes,
                'center': mallows.center,
                'phi_list': [mallows.phi]
            },
            outfile,
            indent=4)

    # Original terminal cmd: java -Xmx500g -Xms4g -jar ltm.jar temp.json 48 >> out.txt 2>&1
    execute_jar = f'java -Xmx{calculate_jvm_xmx()}g -Xms4g -jar {jar_file} {json_file} {num_cores}'
    with open(ltm_verbose_file, 'a') as outfile:
        try:
            subprocess.run(execute_jar.split(),
                           stdout=outfile,
                           stderr=outfile,
                           timeout=timeout)
            outfile.write('\n')
        except subprocess.TimeoutExpired:
            # Jar did not finish within `timeout`; report failure to the caller.
            return False, 0, 0

    # The jar rewrites json_file in place, adding its result fields.
    with open(json_file, 'r') as file:
        res = json.load(file)
    return True, res['prob_list'][0], res['runtime(ms)']
def get_test_case_of_patterns_from_movielens_linear(rid=0):
    """Load row `rid` of the linear movielens benchmark.

    Returns (patterns, mallows).
    NOTE(review): eval on CSV content — assumes the data file is trusted.
    """
    row = pd.read_csv('data/input_movielens_ramp-vs-amp.csv').loc[rid]
    center = eval(row['ranking'])
    mallows = Mallows(center=center, phi=row['phi'])
    patterns = [
        Pattern.from_string(s)
        for s in row['patterns'].split(PATTERN_SEP)
    ]
    return patterns, mallows
def __init__(self, sb):
    """Initialization.

    Wires up the "push_reg" deobfuscation: loads its pattern config,
    builds the Pattern matcher against the current binary, then runs
    the deobfuscation pass and applies the patch.
    """
    Deobfuscation.__init__(self, sb)
    # config entry describing the "push_reg" obfuscation pattern
    self.cfg = cfg.PATTERNS["push_reg"]
    # matcher bound to the file being analyzed (self.sb.fname)
    self.pattern = Pattern(self.cfg, fpath=self.sb.fname)
    # deobfuscation
    self.deobfuscation()
    # patch
    self.patch()
def calculate_upper_bound_bipartite_pattern(pattern: Pattern):
    """Build the bipartite relaxation of `pattern` from its transitive closure.

    Each TC edge (l, r) becomes an edge 'L-l' -> 'R-r'; every left/right
    endpoint keeps the item set of its original label. The distinct 'L-'/'R-'
    prefixes guarantee the two sides never collide.
    """
    pattern.calculate_tc()
    label_to_children = {}
    label_to_items = {}
    for left, right in pattern.tc.edges:
        left_name = f'L-{left}'
        right_name = f'R-{right}'
        # First time we see an endpoint, record its items.
        if left_name not in label_to_items:
            label_to_items[left_name] = pattern.get_items_in_label(left)
        if right_name not in label_to_items:
            label_to_items[right_name] = pattern.get_items_in_label(right)
        label_to_children.setdefault(left_name, set()).add(right_name)
    return BipartitePattern(label_to_children, label_to_items)
def get_test_case_of_patterns_from_movielens_5_labels(rid=0):
    """Load row `rid` of the 5-label movielens benchmark.

    Hard cases for rAMP are 36, 52, 68, 84, 100, 116, 132, 148.

    NOTE(review): splits on the literal ' <> ' rather than PATTERN_SEP used
    by the sibling loaders — confirm the two agree.
    """
    row = pd.read_csv('data/input_movielens_ramp-vs-amp_5_labels.csv').loc[rid]
    mallows = Mallows(center=eval(row['ranking']), phi=row['phi'])
    patterns = [
        Pattern.from_string(s)
        for s in row['patterns'].split(' <> ')
    ]
    return patterns, mallows
def estimate_pattern_by_misamp1(pattern: Pattern, mallows: Mallows, k=100, threshold=0.01, single_core_workload=50, num_cores=None, verbose=False):
    """Estimate the marginal probability of `pattern` under `mallows` by
    repeated rounds of parallel importance sampling until convergence.

    Each round fans `worker` out over `num_cores` processes; sampling stops
    once the largest single-sample contribution is below `threshold` times
    the running sum (i.e. no single sample dominates the estimate).

    Returns:
        (probability, num_samples, runtime_ms)
    """
    num_cores = num_cores or cpu_count()
    round_size = single_core_workload * num_cores
    # Work on a copy so the caller's pattern is not mutated by calculate_tc().
    pattern = deepcopy(pattern)
    pattern.calculate_tc()
    if verbose:
        print(f'\nPattern: {pattern}\n{mallows}\n')
    # One identical task descriptor per worker process.
    para_tuple = (pattern, mallows, single_core_workload, k)
    prob_max, prob_sum, round_i, start_time = 0, 0, 0, time()
    while True:
        round_i += 1
        # Assumes each worker contributes single_core_workload samples per
        # round — TODO confirm against worker's implementation.
        num_samples = round_i * round_size
        with Pool(processes=num_cores) as pool:
            res_list = pool.map(worker, [para_tuple for _ in range(num_cores)])
        for (prob_sum_i, prob_max_i) in res_list:
            prob_sum += prob_sum_i
            prob_max = max(prob_max, prob_max_i)
        prob_now = prob_sum / num_samples
        if verbose:
            print(
                f"prob={prob_now}, #samples={num_samples}, convergence={prob_max / prob_sum}"
            )
        # Converged: no single sample accounts for more than `threshold`
        # of the accumulated mass.
        if prob_max < threshold * prob_sum:
            runtime = int((time() - start_time) * 1000)
            return prob_now, num_samples, runtime
def get_test_case_of_patterns_from_movielens_2_labels(rid=0):
    """Load row `rid` of the 2-label movielens benchmark together with its
    precomputed exact probability.

    Returns (patterns, mallows, p_exact).
    """
    p_exact = pd.read_csv(
        'data/output_movielens_ramp-vs-amp_2labels_exact.csv').loc[rid,
                                                                   'p_exact']
    row = pd.read_csv('data/input_movielens_ramp-vs-amp_2labels.csv').loc[rid]
    mallows = Mallows(center=eval(row['ranking']), phi=row['phi'])
    patterns = [
        Pattern.from_string(s)
        for s in row['patterns'].split(PATTERN_SEP)
    ]
    return patterns, mallows, p_exact
def get_test_case_of_patterns_from_synthetic_4_labels(pid=0):
    """Load synthetic 4-label test case `pid`.

    The exact probability comes from the convergence-results CSV (first entry
    per rid); the pattern strings and model parameters from the cases CSV.
    Returns (patterns, mallows, p_exact).
    """
    df_ans = pd.read_csv(
        'data/test_cases_4_labels_sharing_BD_3_subs_convergence_by_ramp_3.csv')
    p_exact = df_ans.groupby('rid').first().loc[pid, 'p_exact']
    row = pd.read_csv(
        'data/test_cases_4_labels_sharing_BD_3_subs.csv').loc[pid]
    patterns = [
        Pattern.from_string(s)
        for s in row['pref(A>C|A>D|B>D)'].split('\n')
    ]
    mallows = Mallows(list(range(row['m'])), row['phi'])
    return patterns, mallows, p_exact
def generate_a_large_pattern_from_sub_patterns(patterns: Iterable[Pattern]):
    """Union several sub-patterns into one Pattern.

    Labels are disambiguated by prefixing each with its sub-pattern index
    ('{idx}-{label}'), so edges and item sets from different sub-patterns
    never collide.
    """
    merged_children = {}
    merged_items = {}
    for idx, sub in enumerate(patterns):
        for parent, children in sub.label_to_children.items():
            merged_children[f'{idx}-{parent}'] = {
                f'{idx}-{child}' for child in children
            }
        for label, items in sub.label_to_items.items():
            merged_items[f'{idx}-{label}'] = items
    return Pattern(merged_children, merged_items)
def test_linear():
    """Smoke-test isramp_over_patterns on the tail of the linear benchmark
    (only the first of those rows is actually run)."""
    import pandas as pd
    from core.patterns import PATTERN_SEP

    df_in = pd.read_csv('../../data/input_movielens_ramp-vs-amp.csv').tail()
    for rid, row in df_in.iterrows():
        # NOTE(review): eval on CSV content — assumes the data file is trusted.
        center_ranking = eval(row['ranking'])
        model = Mallows(center=center_ranking, phi=row['phi'])
        print(f'center ranking = {center_ranking}')
        pattern_list = [
            Pattern.from_string(s)
            for s in row['patterns'].split(PATTERN_SEP)
        ]
        res = isramp_over_patterns(pattern_list, model)
        print(res, f'p_exact = unknown')
        break  # only the first case is exercised
def get_test_case_of_pattern(pid=0):
    """Load single-pattern test case `pid`.

    The CSV stores log-probability; it is exponentiated back here.
    Returns (pattern, mallows, p_exact).
    """
    row = pd.read_csv('data/test_cases_label_patterns.csv').iloc[pid]
    mallows = Mallows(list(range(row['m'])), row['phi'])
    pattern = Pattern.from_string(row['pattern'])
    p_exact = e ** row['log_p']
    return pattern, mallows, p_exact