def match_one_set1_individual_pool(idx, sorted_idx, pool_size):
    """Pair one set1 individual with its best candidate from a random pool.

    At most ``pool_size`` candidates are drawn at random (without
    replacement) from the set2 individuals still held in the global
    ``local_ctx``; when no more than ``pool_size`` remain, all of them are
    considered. The candidate with the highest ``score_expr`` value is
    selected, the pair is recorded in ``result`` in both directions and the
    chosen candidate is removed from ``local_ctx`` via ``context_delete``.

    :param idx: not used by this function.
    :param sorted_idx: position in ``set1`` of the individual to match.
    :param pool_size: maximum number of set2 candidates to score.
    :raises StopIteration: when no set2 candidate remains.
    """
    global local_ctx

    set2_size = context_length(local_ctx)
    if not set2_size:
        raise StopIteration

    # indices (into local_ctx) of the candidates which will be scored
    pool = (random.sample(xrange(set2_size), pool_size)
            if set2_size > pool_size
            else range(set2_size))

    # candidate-only context, completed with the set1 individual's values
    candidates_ctx = context_subset(local_ctx, pool, None)
    set1_values = ((name, set1[name][sorted_idx])
                   for name in ['id'] + used_variables1)
    candidates_ctx.update(set1_values)

    scores = expr_eval(score_expr, candidates_ctx)
    best_in_pool = np.argmax(scores)
    # translate the pool-local position back to a position in local_ctx
    best_in_set2 = pool[best_in_pool]

    id1 = candidates_ctx['id']
    id2 = local_ctx['__other_id'][best_in_set2]

    # the chosen candidate is no longer available for later matches
    local_ctx = context_delete(local_ctx, best_in_set2)

    # record the match symmetrically
    row1 = id_to_rownum[id1]
    result[row1] = id2
    row2 = id_to_rownum[id2]
    result[row2] = id1
def match_one_set1_individual(idx, sorted_idx):
    """Pair one set1 individual with its best-scoring set2 candidate.

    The values of the set1 individual (its ``id`` and every variable in
    ``used_variables1``) are copied into the global ``local_ctx``,
    ``score_expr`` is evaluated on it and the highest-scoring candidate is
    selected. The pair is recorded in ``result`` in both directions and the
    candidate is removed from ``local_ctx`` via ``context_delete``.

    :param idx: not used by this function.
    :param sorted_idx: position in ``set1`` of the individual to match.
    :raises StopIteration: when no set2 candidate remains.
    """
    global local_ctx

    remaining = context_length(local_ctx)
    if not remaining:
        raise StopIteration

    set1_fields = ['id'] + used_variables1
    local_ctx.update((field, set1[field][sorted_idx])
                     for field in set1_fields)

    # NOTE: an earlier, now disabled, experiment built a simplified score
    # expression per distinct combination of set1 key values (pk_names) and
    # cached it in optimized_exprs. The generic score_expr is evaluated
    # instead.
    scores = expr_eval(score_expr, local_ctx)
    best_idx = np.argmax(scores)

    id1 = local_ctx['id']
    id2 = local_ctx['__other_id'][best_idx]

    # the chosen candidate is no longer available for later matches
    local_ctx = context_delete(local_ctx, best_idx)

    # record the match symmetrically
    row1 = id_to_rownum[id1]
    result[row1] = id2
    row2 = id_to_rownum[id2]
    result[row2] = id1
def match_one_set1_individual(idx, sorted_idx):
    """Pair one set1 individual with its best-scoring set2 candidate.

    The set1 individual's values for every variable in ``used_variables1``
    are copied into the global ``local_ctx``, ``score_expr`` is evaluated
    on it and the highest-scoring candidate is selected. The pair is
    recorded in ``result`` in both directions and the candidate is removed
    from ``local_ctx`` via ``context_delete``.

    :param idx: not used by this function.
    :param sorted_idx: position in ``set1`` of the individual to match.
    :raises StopIteration: when no set2 candidate remains.
    """
    global local_ctx

    remaining = context_length(local_ctx)
    if not remaining:
        raise StopIteration

    set1_values = ((name, set1[name][sorted_idx])
                   for name in used_variables1)
    local_ctx.update(set1_values)

    scores = expr_eval(score_expr, local_ctx)
    best_idx = np.argmax(scores)

    # 'id' is not copied explicitly above, so it has to be in the context
    # already -- presumably because used_variables1 contains it (TODO
    # confirm against the code which builds used_variables1).
    id1 = local_ctx['id']
    id2 = local_ctx['__other_id'][best_idx]

    # the chosen candidate is no longer available for later matches
    local_ctx = context_delete(local_ctx, best_idx)

    # record the match symmetrically
    row1 = id_to_rownum[id1]
    result[row1] = id2
    row2 = id_to_rownum[id2]
    result[row2] = id1
def match_cell(idx, sorted_idx, pool_size):
    """Match the set1 cell at ``sorted_idx`` against the remaining set2 cells.

    A cell is handled here as a sequence of ids (``__ids__`` for set1,
    ``__other___ids__`` for set2). For the set1 cell, the best-scoring set2
    cell is looked up (optionally among a random pool of ``pool_size``
    cells), and as many ids as both cells can provide are matched pairwise
    and recorded in ``result`` in both directions. A set2 cell which is
    entirely consumed is removed from the global ``matching_ctx``;
    otherwise only its remaining ids are kept. When the set1 cell still has
    unmatched ids, they are stored back in ``set1['__ids__'][sorted_idx]``
    and the process is repeated for them.

    The repetition is done with a loop rather than by calling this function
    recursively: a large set1 cell matched against many small set2 cells
    needs one step per set2 cell consumed, which could otherwise exceed the
    interpreter's recursion limit.

    :param idx: not used by this function.
    :param sorted_idx: position in ``set1`` of the cell to match.
    :param pool_size: maximum number of set2 cells to score at each step,
        or None to always score all remaining cells.
    :raises StopIteration: when no set2 cell remains (including when set2
        is exhausted before the set1 cell is entirely matched).
    """
    global matching_ctx

    while True:
        set2_size = context_length(matching_ctx)
        if not set2_size:
            raise StopIteration

        # only draw a pool when it is actually smaller than what remains
        use_pool = pool_size is not None and set2_size > pool_size
        if use_pool:
            pool = random.sample(xrange(set2_size), pool_size)
            local_ctx = context_subset(matching_ctx, pool)
        else:
            local_ctx = matching_ctx.copy()

        # complete the candidates with the values of the set1 cell
        local_ctx.update((k, set1[k][sorted_idx])
                         for k in {'__ids__'} | used_variables1)

        eval_ctx = context.clone(entity_data=local_ctx)
        set2_scores = expr_eval(score, eval_ctx)
        cell2_idx = set2_scores.argmax()

        cell1ids = local_ctx['__ids__']
        # at this point cell2_idx is still relative to local_ctx
        cell2ids = local_ctx['__other___ids__'][cell2_idx]

        if use_pool:
            # transform pool-local index to set/matching_ctx index
            cell2_idx = pool[cell2_idx]

        cell1size = len(cell1ids)
        cell2size = len(cell2ids)
        nb_match = min(cell1size, cell2size)

        # we could introduce a random choice here but it is not
        # much necessary. In that case, it should be done in group_context
        ids1 = cell1ids[:nb_match]
        ids2 = cell2ids[:nb_match]

        # record the matches symmetrically
        result[id_to_rownum[ids1]] = ids2
        result[id_to_rownum[ids2]] = ids1

        if nb_match == cell2size:
            # the set2 cell is entirely consumed
            matching_ctx = context_delete(matching_ctx, cell2_idx)
        else:
            # other variables do not need to be modified since the cell
            # only got smaller and was not deleted
            matching_ctx['__other___ids__'][cell2_idx] = cell2ids[nb_match:]

        # FIXME: the expr gets cached for the full matching_ctx at the
        # beginning and then when another woman with the same values is
        # found, it thinks it can reuse the expr but it breaks because it
        # has not the correct length.
        # The current workaround is to invalidate the whole cache for the
        # current entity but this is not the right way to go.
        # * disable the cache for matching?
        # * use a local cache so that methods after matching() can use
        #   what was in the cache before matching(). Shouldn't the cache be
        #   stored inside the context anyway?
        expr_cache.invalidate(context.period, context.entity_name)

        if nb_match >= cell1size:
            # the whole set1 cell has been matched
            return

        # keep only the unmatched ids of the set1 cell and match them
        # against what remains of set2
        set1['__ids__'][sorted_idx] = cell1ids[nb_match:]