def _get_validation_losses(sess, data, losses, batch_generator, feed_builder, verbose):
    """Calculate losses for a given dataset.

    Args:
        sess: tensorflow.Session
        data: Dataset instance
        losses: list of Loss tuples
        batch_generator: callable
        feed_builder: callable
        verbose: int

    Returns:
        dict of str -> float
    """
    # Evaluate each distinct loss tensor only once, even if several Loss
    # entries reference the same tensor.
    tensor_names = funcy.ldistinct([loss.tensor for loss in losses])
    predictions = _run_predictions(
        sess, {name: name for name in tensor_names}, data,
        batch_generator, feed_builder)

    scores = {}
    for loss in losses:
        scores[loss.name] = loss.function(
            data.outputs[loss.field], predictions[loss.tensor])

    if verbose > 0:
        print('Validation Scores:')
        for name, score in scores.items():
            print('{}: {:0.5}'.format(name, score))
    return scores
def workspace_status(self):
    """Return the per-param workspace state as ``{str(self): {param: state}}``.

    States are "deleted" / "new" / "modified"; unchanged params are omitted.
    A missing file or a missing recorded hash short-circuits to a whole-file
    "deleted" / "new" answer.
    """
    if not self.exists:
        return {str(self): "deleted"}
    if self.hash_info.value is None:
        return {str(self): "new"}

    from funcy import ldistinct

    info = self.hash_info.value if self.hash_info else {}
    actual = self.read_params()

    # NOTE: we want to preserve the order of params as specified in the
    # status. In case of tracking the whole file, the order is top-level
    # keys in the file and then the keys in the `info` from `dvc.lock`
    # (which are alphabetically sorted).
    tracked = self.params or ldistinct([*actual.keys(), *info.keys()])

    status = defaultdict(dict)
    for name in tracked:
        if name not in actual:
            state = "deleted"
        elif name not in info:
            state = "new"
        elif actual[name] != info[name]:
            state = "modified"
        else:
            assert actual[name] == info[name]
            continue  # unchanged params are not reported
        status[str(self)][name] = state
    return status
def chain_nodes(nodes, chain):
    """Returns nodes matched by chain.

    Recursively walks `chain` (a sequence of links, each with `.func` and
    `.args`) against `nodes`, dispatching on the link's operation.
    """
    # Empty chain: everything matched so far is the result.
    if not chain:
        return nodes
    link, *rest = chain
    if link.func is Ops.const:
        # Constant links match no nodes.
        return []
    elif link.func is Ops.multi:
        # Fan out: each subchain in the collection is matched independently,
        # producing a dict keyed like the collection (lists become index keys).
        coll, = link.args
        if isinstance(coll, list):
            coll = dict(enumerate(coll))
        return {
            k: chain_nodes(nodes, subchain + rest)
            for k, subchain in coll.items()
        }
    elif link.func is notnone_fn:
        # First-non-none style alternatives: union of all subchain matches,
        # deduplicated while preserving order.
        return ldistinct(
            lcat(
                chain_nodes(nodes, subchain + rest)
                for subchain in link.args))
    else:
        # Doing this manually in case the link encapsulates a chain we can unpack
        func = link.func if link.args is None else link.func(*link.args)
        if isinstance(func, Chain):
            return chain_nodes(nodes, func + rest)
        else:
            next_value = func(nodes)
            # NOTE(review): when func's result is not an elements collection we
            # fall back to the *input* nodes rather than next_value — looks
            # intentional (keep last known nodes), but confirm against callers.
            return chain_nodes(next_value, rest) if is_elements(next_value) else nodes
def parse_name(text):
    """Guess a person's name from raw (possibly OCR'd) text.

    Returns a ``(guess, warning)`` pair, preferring a warning-free guess,
    then any guess at all, and finally ``(None, None)``.
    """
    # Names to try:
    # - full with no trailing nor dup spaces
    # - one with single junk cleared from both ends
    # - any part longer than 1 char
    parts = text.strip().split()
    full_name = ' '.join(parts)
    clean_name = re.sub(r'^\w\s|\s\w$', '', full_name)

    candidates = [full_name, clean_name]
    candidates.extend(part for part in parts if len(part) > 1)
    names = ldistinct(candidates)
    # Use "ocr normalized" version if raw one fails
    names = ldistinct(interleave(names, map(ocr_normalize, names)))

    attempts = lmap(guess_name, names)
    no_warning = first(pair for pair in attempts if not pair[1])
    if no_warning:
        return no_warning
    return first(pair for pair in attempts if pair[0]) or (None, None)
def test_delete_elements_but_property_always_hold(xs):
    """Deleting every element one by one keeps the dynamic-array invariants."""
    dyn_arr = dynamic_array(xs)
    for x in xs:
        size_before = len(F.lflatten(dyn_arr))
        delete(dyn_arr, x)
        # Exactly one element must disappear per delete.
        assert len(F.lflatten(dyn_arr)) == size_before - 1
        # dynamic array properties
        for arr in dyn_arr:
            assert is_sorted(arr)
            assert is_power_of_two(len(arr))
        assert len(dyn_arr) == len(F.ldistinct(dyn_arr, key=len))
def test_delete_elem_not_in_arr_then_nothing_happen(xs_z):
    """Deleting a value absent from the structure must be a no-op."""
    xs, z = xs_z
    dyn_arr = dynamic_array(xs)
    size_before = len(F.lflatten(dyn_arr))
    delete(dyn_arr, z)
    assert len(F.lflatten(dyn_arr)) == size_before
    # dynamic array properties
    for arr in dyn_arr:
        assert is_sorted(arr)
        assert is_power_of_two(len(arr))
    assert len(dyn_arr) == len(F.ldistinct(dyn_arr, key=len))
def save(self, **kwargs):
    """Derive specie/platforms/samples_count from attrs, then persist."""
    # Only set specie when it's non-controversial
    taxids = ldistinct(keep(self.attrs.get, ['platform_taxid', 'sample_taxid']))
    self.specie = SPECIES.get(taxids[0]) if len(taxids) == 1 else ''

    self.platforms = re_all(r'GPL\d+', self.attrs['platform_id'])
    self.samples_count = len(self.attrs['sample_id'].split())
    super(Series, self).save(**kwargs)
def mygene_fetch(platform, probes, scopes):
    """Queries mygene.info for current entrezid and sym, given an identifier.

    Args:
        platform: platform model instance (``.specie`` is read)
        probes: mapping of probe -> identifier value
        scopes: mygene scopes string; "dna" triggers probe sequence lookup

    Returns:
        list of {'probe', 'mygene_sym', 'mygene_entrez'} rows; probes whose
        queries resolve to more than one distinct gene are skipped as dups.
    """
    if scopes == "dna":
        probes = get_dna_probes(platform, probes)
        scopes = "accession"

    def extract_queries(lines):
        # Drop known junk identifiers, split out candidate tokens, then
        # filter out tokens that cannot be real queries.
        lines = remove(r'^(IMAGE:\d+|--[\w>-]+)$', lines)
        queries = cat(re_iter(r'[\w+.-]+', l) for l in lines)
        queries = remove(r'_at$|^\d+-\d+$', queries)  # No such thing
        return queries
        # NOTE(review): an unreachable py2-era block followed this return,
        # re-encoding queries via q.decode('unicode_escape'); it was dead
        # code (and would raise on py3 str), so it has been removed.

    _by_probe = group_values(probes.items())
    queries_by_probe = walk_values(extract_queries, _by_probe)

    # Collect all possible queries to make a single request to mygene
    all_queries = set(cat(queries_by_probe.values()))
    if not all_queries:
        return []
    mygenes = _mygene_fetch(all_queries, scopes, platform.specie)

    # Form results into rows
    results = []
    dups = 0
    # NOTE: loop variable renamed from `queries` to avoid shadowing the
    # collected query set above.
    for probe, probe_queries in queries_by_probe.items():
        matches = ldistinct(keep(mygenes.get, probe_queries))
        # Skip dups
        if len(matches) > 1:
            dups += 1
        elif matches:
            entrez, sym = matches[0]
            results.append({
                'probe': probe,
                'mygene_sym': sym,
                'mygene_entrez': entrez
            })
    if dups:
        cprint('-> Produced %d dups' % dups, 'red')
    return results
def _fleiss_kappa(sample_sets):
    """Compute Fleiss' kappa over several annotation sets; NaN when undefined."""
    # If there is only one set then it can't be measured
    if len(sample_sets) == 1:
        return float('nan')

    annos = lcat(sample_sets)
    categories = ldistinct(sv.annotation for sv in annos)
    # If there is only one label then it can't be measured
    if len(categories) == 1:
        return float('nan')
    index_of = {c: i for i, c in enumerate(categories)}

    # Per-sample vote counts per category, in category order.
    counts = defaultdict(lambda: [0] * len(categories))
    for sv in annos:
        counts[sv.sample_id][index_of[sv.annotation]] += 1
    return fleiss_kappa(list(counts.values()))
def _cohens_kappa(annos1, annos2):
    """Compute Cohen's kappa between two annotators over the same samples."""
    assert set(s.sample_id for s in annos1) == set(s.sample_id for s in annos2)

    categories = ldistinct(sv.annotation for sv in chain(annos1, annos2))
    # If there is only one label then it can't be measured
    if len(categories) == 1:
        return float('nan')
    index_of = {c: i for i, c in enumerate(categories)}

    # Confusion table indexed (annotator1 label, annotator2 label),
    # aligned by sample_id.
    table = np.zeros((len(categories), len(categories)))
    by_id = attrgetter('sample_id')
    for sv1, sv2 in zip(sorted(annos1, key=by_id), sorted(annos2, key=by_id)):
        table[index_of[sv1.annotation], index_of[sv2.annotation]] += 1
    return cohens_kappa(table, return_results=False)
def test_length_of_arrays_of_dyn_arr_are_all_different(xs):
    """After inserting everything, no two sub-arrays share a length."""
    dyn_arr = dynamic_array()
    for value in xs:
        insert(dyn_arr, value)
    assert len(dyn_arr) == len(F.ldistinct(dyn_arr, key=len))
def add_metrics(self, values: List[Metric]):
    """Append metric names, de-duplicating while keeping first-seen order."""
    existing = self._params.get('metric', [])
    self._params['metric'] = funcy.ldistinct(
        existing + [v.value for v in values])
    return self
def add_metric(self, value: Metric):
    """Append a single metric name, de-duplicating like ``add_metrics``.

    Bug fix: the previous code passed ``list.append``'s return value
    (``None``) to ``funcy.ldistinct``, raising TypeError and losing the
    metric. Concatenate instead, mirroring ``add_metrics``.
    """
    self._params['metric'] = funcy.ldistinct(
        self._params.get('metric', []) + [value.value])
    return self
def search(request):
    """Series search view: returns context with matched series, tags and stats.

    Reads GET params `specie`, `q`, `exclude_tags`; persists the last specie
    in the session. Returns {'series': None} for empty queries and
    {'series': []} when tag filters match nothing.
    """
    # Save last specie in session
    specie = request.GET.get('specie')
    if specie != request.session.get('specie'):
        request.session['specie'] = specie

    q = request.GET.get('q')
    if not q:
        return {'series': None}

    # silent(int) drops non-numeric ids instead of raising.
    exclude_tags = lkeep(silent(int), request.GET.getlist('exclude_tags'))
    series_tags, tag_series, tag_ids = series_tags_data()

    # Parse query
    q_string, q_tags = _parse_query(q)
    # Split query tags into known and unknown; warn about the unknown ones.
    q_tags, wrong_tags = lsplit(lambda t: t.lower() in tag_ids, q_tags)
    if wrong_tags:
        message = 'Unknown tag%s %s.' % ('s' if len(wrong_tags) > 1 else '',
                                         ', '.join(wrong_tags))
        messages.warning(request, message)

    if not q_string and not q_tags:
        return {'series': None}

    # Build qs
    qs = search_series_qs(q_string)
    if specie:
        qs = qs.filter(specie=specie)

    if q_tags:
        q_tag_ids = lkeep(tag_ids.get(t.lower()) for t in q_tags)
        # Series must carry ALL requested tags (set intersection).
        include_series = reduce(set.intersection,
                                (tag_series[t] for t in q_tag_ids))
        if include_series:
            qs = qs.filter(id__in=include_series)
        else:
            message = 'No series annotated with %s.' \
                % (q_tags[0] if len(q_tags) == 1 else 'all these tags simultaneously')
            messages.warning(request, message)
            return {'series': []}

    # NOTE(review): `tags` is computed BEFORE the exclude_tags filter below —
    # presumably intentional (show the full tag cloud for the query); confirm.
    series_ids = qs.values_list('id', flat=True).order_by()
    tags = ldistinct(mapcat(series_tags, series_ids), key=itemgetter('id'))

    if exclude_tags:
        exclude_series = join(tag_series[t] for t in exclude_tags)
        qs = qs.exclude(id__in=exclude_series)

    series = paginate(request, qs, 10)

    # Get annotations statuses
    annos_qs = SeriesAnnotation.objects.filter(series__in=series) \
        .values_list('series_id', 'tag_id', 'best_cohens_kappa')
    tags_validated = {(s, t): k == 1 for s, t, k in annos_qs}

    return dict({
        'series': series,
        'page': series,
        'tags_validated': tags_validated,
        'tags': tags,
        'series_tags': series_tags,
    }, **_search_stats(qs))