def unfold_tuple_to_columns(series_or_df_with_tuple_column, new_column_names=None, column_name=None):
    """Unfolds a column `column_name` of tuples (or dicts) and adds the unfolded
    series as new columns. The original column is left unchanged."""
    to_unfold = series_or_df_with_tuple_column
    if isinstance(series_or_df_with_tuple_column, pd.DataFrame):
        assert column_name is not None
        to_unfold = series_or_df_with_tuple_column[column_name]

    # was too slow
    # unfolded_cols = to_unfold.apply(pd.Series)

    if isinstance(to_unfold.iloc[0], dict):
        new_column_names = new_column_names if new_column_names is not None else list(
            to_unfold.iloc[0].keys())
        data = list(
            map(list, more_itertools.unzip((tuple(d.values()) for d in to_unfold))))
    else:  # tuple or list
        new_column_names = new_column_names if new_column_names is not None else to_unfold.name
        data = list(map(list, more_itertools.unzip(to_unfold)))

    data = dict(zip(new_column_names, data))
    return pd.DataFrame(series_or_df_with_tuple_column).assign(**data)
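

# Usage sketch (not part of the original code): assumes pandas and more_itertools are
# imported in the enclosing module and that unfold_tuple_to_columns above is in scope;
# the sample frame and the "x"/"y" column names are made up for illustration.
def _example_unfold_tuple_to_columns():
    import pandas as pd

    df = pd.DataFrame({"point": [(1, 2), (3, 4), (5, 6)]})
    wide = unfold_tuple_to_columns(df, new_column_names=["x", "y"], column_name="point")
    # `wide` keeps the original "point" column and gains "x" == [1, 3, 5], "y" == [2, 4, 6]
    return wide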
def main():
    args = parse_arguments()
    content = [x.strip() for x in args.query if x.strip()]
    if args.sep != "none":
        sep = None if args.sep == "space" else args.sep
        qnos, queries = unzip(line.split(sep, maxsplit=1) for line in content)
    else:
        queries = content
        qnos = list(map(str, range(len(queries))))

    trans = str.maketrans("", "", string.punctuation)
    queries = [s.translate(trans) for s in queries]
    qnos = list(qnos)
    queries = list(queries)

    indri = IndriRunQuery(None, str(args.index.resolve()), args.scheduler)
    if args.scheduler:
        output = indri.run_distributed(qnos, queries, extra={"count": args.count})
    else:
        output = indri.run_batch(
            qnos,
            queries,
            working_set=[],
            extra={"count": args.count},
            workers=args.workers,
        )

    args.output.writelines(output)
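

# Illustrative sketch (not from the original module) of the qno/query splitting step
# in main(): the sample lines are made up; only more_itertools.unzip is assumed.
def _example_split_qnos_and_queries():
    from more_itertools import unzip

    content = ["q1 hello world", "q2 more text"]
    qnos, queries = unzip(line.split(None, maxsplit=1) for line in content)
    return list(qnos), list(queries)  # (["q1", "q2"], ["hello world", "more text"])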
def run_model(model_prefix: str, model_epoch: int, config: BasicConfig, data_df: pd.DataFrame,
              save_path: Path):
    use_gpu = len(config.gpus) > 0
    if use_gpu:
        ctx = [mx.gpu(cur_idx) for cur_idx in config.gpus]
    else:
        ctx = [mx.cpu()]
    sym, args, auxs = mx.model.load_checkpoint(model_prefix, model_epoch)
    model = mx.mod.Module(symbol=sym, context=ctx, label_names=None)
    data_shape = (1, 3, 112, 112)
    model.bind(data_shapes=[('data', data_shape)], for_training=False)
    model.set_params(args, auxs)
    dataset = InfoDataset(data_df, filter_fn=config.filter_fn, augs=config.test_augmentations)
    data = DataLoader(
        dataset,
        batch_size=config.batch_size,
        shuffle=False,
        sampler=None,
        num_workers=config.num_workers,
        pin_memory=use_gpu
    )
    predictions = []
    all_paths, labels = unzip(dataset.data)
    for i, batch in tqdm(enumerate(data), total=len(data)):
        # Keep the per-batch tensors in their own name to avoid shadowing the DataLoader.
        batch_data = mx.gluon.utils.split_and_load(batch[0], ctx_list=ctx, even_split=False)
        batch = mx.io.DataBatch(batch_data)
        model.forward(batch, is_train=False)
        predictions.append(model.get_outputs()[0].asnumpy())
    predictions = np.concatenate(predictions, axis=0)
    labels = np.array(list(labels))
    all_paths = list(all_paths)
    np.savez(str(save_path), paths=all_paths, labels=labels, preds=predictions)
    return all_paths, labels, predictions
def collate_fn(inputs):
    (input_ids, token_type_ids, attention_mask, positions, widths, boundary_pairs,
     options, targets) = map(list, unzip(inputs))

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0)
    attn_masks = pad_sequence(attention_mask, batch_first=True, padding_value=0)

    width_max = max(widths)
    context_width_max = max(
        [left_end - left_start + right_end - right_start
         for ((left_start, left_end), (right_start, right_end)) in boundary_pairs])

    gather_index = torch.arange(0, width_max, dtype=torch.long).unsqueeze(0).repeat(
        len(inputs), 1).clone()
    context_gather_index = torch.arange(0, context_width_max, dtype=torch.long).unsqueeze(0).repeat(
        len(inputs), 1).clone()

    for i, (p, w, ((left_start, left_end), (right_start, right_end))) in enumerate(
            zip(positions, widths, boundary_pairs)):
        gather_index.data[i, :w] = torch.arange(p, p + w, dtype=torch.long).data
        cw = left_end - left_start + right_end - right_start
        context_gather_index.data[i, :cw] = torch.cat(
            [torch.arange(left_start, left_end, dtype=torch.long),
             torch.arange(right_start, right_end, dtype=torch.long)]).data

    batch = {'input_ids': input_ids,
             'token_type_ids': token_type_ids,
             'attention_mask': attn_masks,
             'gather_index': gather_index,
             'context_gather_index': context_gather_index,
             'positions': torch.tensor(positions).long(),
             'option_ids': torch.tensor(options).long(),
             'targets': torch.tensor(targets).long()}
    return batch
def _transform_unidify(
    self,
    results_dir: Path,
    twitter_api_settings: TwitterApiSettings,
) -> Counter[_ExecuteResult]:
    result_counter = Counter[_ExecuteResult]()

    head, entries_tweet_ids = spy(
        self._iter_entries_tweet_ids(results_dir, result_counter)
    )
    if not head:  # Check if any entries with Tweet-IDs exist (else unzip fails).
        return result_counter
    entries, tweet_ids = cast(
        Tuple[Iterator[BatchEntry], Iterator[TweetId]], unzip(entries_tweet_ids)
    )

    for entry, tweets in groupby_transform(
        zip(entries, statuses_lookup(tweet_ids, twitter_api_settings)),
        keyfunc=itemgetter(0),
        valuefunc=itemgetter(1),
    ):
        write_jsonl_lines(
            results_dir / entry.data_file_name,
            (tweet for tweet in tweets if tweet is not None),
            use_lzma=True,
        )
        write_json(
            results_dir / entry.meta_file_name, entry, overwrite_existing=True
        )
        result_counter[_ExecuteResult.SUCCESS] += 1

    return result_counter
def run(self, ds: stream.DataStream) -> stream.DataStream:
    raw_topics_scores_ds = super().run(ds)
    topics_with_ctx = self._get_topic_per_item(raw_topics_scores_ds)
    topics, ctxs = more_itertools.unzip(topics_with_ctx)
    return stream.DataStream(items=topics, applied_ops=ds.applied_ops + [self], context=ctxs)
def collate_fn(inputs):
    (input_ids, token_type_ids, attention_mask, positions, widths, options,
     targets) = map(list, unzip(inputs))

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0)
    attn_masks = pad_sequence(attention_mask, batch_first=True, padding_value=0)

    width_max = max(widths)
    gather_index = torch.arange(0, width_max, dtype=torch.long).unsqueeze(0).repeat(
        len(inputs), 1).clone()
    for i, (p, w) in enumerate(zip(positions, widths)):
        gather_index.data[i, :w] = torch.arange(p, p + w, dtype=torch.long).data

    batch = {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attn_masks,
        'gather_index': gather_index,
        'positions': torch.tensor(positions).long(),
        'option_ids': torch.tensor(options).long(),
        'targets': torch.tensor(targets).long()
    }
    return batch
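

# Dummy-tensor sketch (not part of the original collate_fn) of how gather_index is
# built above; the positions/widths below are made up and only torch is assumed.
def _example_gather_index():
    import torch

    positions, widths = [1, 2], [2, 3]
    width_max = max(widths)
    gather_index = torch.arange(0, width_max, dtype=torch.long).unsqueeze(0).repeat(
        len(positions), 1).clone()
    for i, (p, w) in enumerate(zip(positions, widths)):
        gather_index[i, :w] = torch.arange(p, p + w, dtype=torch.long)
    return gather_index  # tensor([[1, 2, 2], [2, 3, 4]])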
def get_docs_stream(self, ds: DataStream) -> DataStream:
    """Returns a DataStream of spacy Docs.

    If the data stream already contains spacy Docs then they are returned
    as-is; otherwise the nlp object is used to create spacy Docs.

    Parameters
    ----------
    ds : DataStream
        input data stream

    Returns
    -------
    out : DataStream
        A datastream containing an iterable of spacy's `Doc` objects
    """
    if ds.item_type != Doc:
        docs_with_context = self.nlp.pipe(
            zip(ds, ds.context),
            as_tuples=True,
            n_process=config.ALLOCATED_PROCESSOR_FOR_SPACY,
        )
        new_docs, context = more_itertools.unzip(docs_with_context)
        return DataStream(items=new_docs, applied_ops=ds.applied_ops, context=context)
    else:
        return ds
def initialize_data_from_leavesdb(dataset_name='PNAS',
                                  splits={'train': 0.7, 'validation': 0.3},
                                  threshold=50,
                                  exclude_classes=[],
                                  include_classes=[]):
    datasets = {
        'PNAS': pnas_dataset.PNASDataset(),
        'Leaves': leaves_dataset.LeavesDataset(),
        'Fossil': fossil_dataset.FossilDataset()
    }
    data_files = datasets[dataset_name]
    data_files.exclude_rare_classes(threshold=threshold)
    encoder = base_dataset.LabelEncoder(data_files.classes)
    classes = list((set(encoder.classes) - set(exclude_classes)).union(set(include_classes)))
    data_files, excluded_data_files = data_files.enforce_class_whitelist(class_names=classes)

    x = list(data_files.data['path'].values)
    y = np.array(encoder.encode(data_files.data['family']))

    shuffled_data = list(zip(x, y))
    random.shuffle(shuffled_data)

    partitioned_data = partition_data(data=shuffled_data, partitions=OrderedDict(splits))
    split_data = {k: v for k, v in partitioned_data.items() if len(v) > 0}
    for subset, subset_data in split_data.items():
        split_data[subset] = [list(i) for i in unzip(subset_data)]

    return split_data, data_files, excluded_data_files
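

# Small sketch (not from the original module) of the final unzip step above: splitting
# a list of (path, label) pairs back into parallel lists. The sample pairs are made up;
# only more_itertools.unzip is assumed.
def _example_unzip_path_label_pairs():
    from more_itertools import unzip

    pairs = [("img_0.jpg", 3), ("img_1.jpg", 1), ("img_2.jpg", 3)]
    paths, labels = (list(part) for part in unzip(pairs))
    return paths, labels  # (["img_0.jpg", "img_1.jpg", "img_2.jpg"], [3, 1, 3])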
def evolve_local(self, num_gen: int, overwrite: bool = True,
                 callbacks: t.Optional[t.List[Callback]] = None):
    """
    Evolve populations sequentially, using a single processor.
    No early stopping applied. Exists mainly for testing purposes.
    """
    data_to_evolve = zip(self.populations, self.records or [None] * len(self.populations))
    operators = self.ops
    results = []
    for individuals, records in tqdm(data_to_evolve, total=len(self.populations),
                                     desc='Evolving population'):
        individuals, records = self._evolver.evolve(
            num_gen, self.ops, self.genetic_params.Population_size, individuals, records)
        if callbacks:
            individuals, records, operators = self._evolver.call_callbacks(
                callbacks, individuals, records, operators)
        results.append((individuals, records, callbacks))
    populations, records, callbacks = map(list, unzip(results))
    if overwrite:
        self.populations, self.records = populations, records
    return populations, records, callbacks
def generate_image():
    # import plotly
    # plotly.io.orca.config.executable = '/home/miguel/anaconda3/bin/orca'
    layout = go.Layout(autosize=True, margin={'l': 0, 'r': 0, 't': 0, 'b': 0})
    players, score, ping, _ = unzip(get_players())
    players = list(players)
    score = list(score)
    ping = list(ping)
    height = len(players) * 25 + 40
    fig = go.Figure(
        # columnwidth=[1,0.5,0.5],
        layout=layout,
        data=[
            go.Table(
                columnwidth=[70, 15, 15],
                header=dict(
                    values=['<b>Player</b>', '<b>Score</b>', '<b>Ping</b>'],
                    line_color='darkslategray',
                    fill_color='lightskyblue',
                    font_size=18,
                    height=30,
                    align=['left', 'center', 'center']),
                cells=dict(values=[players, score, ping],
                           height=25,
                           font_size=16,
                           line_color='darkslategray',
                           fill_color='lightcyan',
                           align=['left', 'center', 'center']))
        ])
    fig.update_layout(width=400, height=height)
    loc = '/tmp/players.jpg'
    fig.write_image(loc, engine='kaleido')
    return loc
def detection_func(lines):
    try:
        _, line = lines.peek()
    except StopIteration:
        line = ""

    match = prompt_re.match(line)
    if not match:
        return None

    groups = match.groupdict()
    indent = len(groups["indent"])
    prompt_length = len(groups["prompt"])

    detected_lines = list(
        itertools.chain(
            [more_itertools.first(lines)],
            continuation_lines(lines, indent, prompt_length),
        ))
    line_numbers, lines = map(tuple, more_itertools.unzip(detected_lines))

    line_range = min(line_numbers), max(line_numbers) + 1
    if line_numbers != tuple(range(line_range[0], line_range[1])):
        raise RuntimeError("line numbers are not contiguous")

    # `prompt_re`, `continuation_lines`, and `name` are presumably defined in the enclosing module.
    return line_range, name, "\n".join(lines)
def test_pagination(self):
    index_to_page_size = {
        (page_size * num_pages + num_excess_doc, page_size)
        for page_size in (1, 2, 5)
        for num_pages in (0, 1, 2, 3)
        for num_excess_doc in (-1, 0, 1)
        if page_size * num_pages + num_excess_doc > 0
    }
    page_sizes_by_index_size = {
        i: list(unzip(page_sizes)[1])
        for i, page_sizes in groupby(sorted(index_to_page_size), key=itemgetter(0))
    }
    index_size_ = 0
    for index_size, page_sizes in page_sizes_by_index_size.items():
        self._add_docs(index_size - index_size_)
        for page_size in page_sizes:
            for sort_field, sort_path, sort_unique in [
                ('entryId', ['entryId'], True),
                ('fileId', ['files', 0, 'uuid'], True),
                ('fileName', ['files', 0, 'name'], False)
            ]:
                for reverse in False, True:
                    kwargs = dict(index_size=index_size,
                                  page_size=page_size,
                                  sort_field=sort_field,
                                  reverse=reverse)
                    with self.subTest(**kwargs):
                        self._test_pagination(**kwargs,
                                              sort_path=sort_path,
                                              sort_unique=sort_unique)
        index_size_ = index_size
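

# Sketch (not part of the test) of the groupby + unzip grouping used above, on
# made-up (index_size, page_size) pairs; only the stdlib and more_itertools are assumed.
def _example_group_page_sizes():
    from itertools import groupby
    from operator import itemgetter
    from more_itertools import unzip

    pairs = [(2, 1), (2, 2), (4, 1), (4, 2), (4, 4)]
    grouped = {k: list(unzip(g)[1]) for k, g in groupby(sorted(pairs), key=itemgetter(0))}
    return grouped  # {2: [1, 2], 4: [1, 2, 4]}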
def fuse_optimal(self, cutoff=None, scores=None, fuse_func=None, qrels=None,
                 ignore_zero=True, return_vnos=False, show_progress=True):
    qnos, runqnos = unzip(self.qno_map.items())
    qnos, runqnos = list(qnos), list(runqnos)
    subqrels = [qrels.select_by_qno(x) for x in qnos]
    subscore = [{v: scores[v] for v in x.vnos()} for x in runqnos]
    with Pool(os.cpu_count() // 2) as pool:
        result = pool.imap_unordered(
            fuse_optimal_sp,
            zip(
                runqnos,
                repeat(cutoff),
                subscore,
                repeat(fuse_func),
                subqrels,
                repeat(ignore_zero),
                repeat(return_vnos),
            ),
            chunksize=32)
        if show_progress:
            result = tqdm(result, desc='Opt', total=len(qnos))

        if return_vnos:
            result = {x[0].qno: x for x in result}
            qno_map = {x: result[x][0] for x in qnos}
            qno_vnos = {x: result[x][1] for x in qnos}
            return TrecRun(qno_map), qno_vnos
        else:
            result = {x.qno: x for x in result}
            return TrecRun(result)
async def dar_fetch_chunked(uuids, addrtype, chunk_size, client=None):
    """Lookup uuids in DAR (chunked).

    Args:
        uuids: List of DAR UUIDs.
        addrtype: The address type to lookup.
        chunk_size: Number of UUIDs per block, sent to DAR.
        client (optional): aiohttp.ClientSession to use for connecting.

    Returns:
        (dict, set):
            dict: Map from UUID to DAR reply.
            set: Set of UUIDs of entries which were not found.
    """

    def create_task(uuid_chunk):
        return asyncio.ensure_future(
            dar_fetch_non_chunked(uuid_chunk, addrtype=addrtype, client=client))

    # Chunk our UUIDs into blocks of chunk_size
    uuid_chunks = chunked(uuids, chunk_size)
    # Convert chunks into a list of asyncio.tasks
    tasks = list(map(create_task, uuid_chunks))
    # Here 'result' is a list of tuples (dict, set) => (result, missing)
    result = await asyncio.gather(*tasks)
    # First we unzip 'result' to get a list of results and a list of missing
    result_dicts, missing_sets = unzip(result)
    # Then we union the dicts and sets before returning
    combined_result = dict(ChainMap(*result_dicts))
    combined_missing = set.union(*missing_sets)
    return combined_result, combined_missing
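

# Illustrative sketch (not from the original module) of the combine step above, on
# made-up (result, missing) tuples from two chunks; only the stdlib and
# more_itertools are assumed.
def _example_combine_chunked_replies():
    from collections import ChainMap
    from more_itertools import unzip

    chunk_results = [({"uuid-a": {"vejnavn": "A"}}, {"uuid-x"}),
                     ({"uuid-b": {"vejnavn": "B"}}, {"uuid-y"})]
    result_dicts, missing_sets = unzip(chunk_results)
    combined_result = dict(ChainMap(*result_dicts))  # keys "uuid-a" and "uuid-b"
    combined_missing = set.union(*missing_sets)      # {"uuid-x", "uuid-y"}
    return combined_result, combined_missing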
def validate(comparator: Comparator,
             data_dir: Path,
             validation_csv: Path,
             num_sample: int = 10 ** 3,
             pairs: Optional[Iterable[Tuple[int, int]]] = None,
             labels: Optional[Sequence[int]] = None
             ) -> Tuple[np.ndarray, np.ndarray]:
    if num_sample > 0:
        df = pd.read_csv(validation_csv)
        subject_dict = aggregate_subjects(df['TEMPLATE_ID'], df['SUBJECT_ID'])
        sampled_pairs, sampled_labels = unzip(sample_pairs(subject_dict, num_sample))
        sampled_labels = np.array(list(sampled_labels))
    else:
        sampled_pairs = pairs
        sampled_labels = np.array(labels)
    predictions = np.array(list(unzip(compare_all(data_dir, sampled_pairs, comparator))[2]))
    return sampled_labels, predictions
def run(self, ds: DataStream) -> DataStream:
    docs_ds = self.get_docs_stream(ds)
    processed_docs = map(self.process_doc, docs_ds, docs_ds.context)
    processed_docs = (x for x in processed_docs if x is not None)
    items, context = more_itertools.unzip(processed_docs)
    return DataStream(items=items, applied_ops=ds.applied_ops + [self], context=context)
def run(self, ds: DataStream) -> DataStream:
    flat = itertools.chain.from_iterable(map(self._flatten, ds, ds.context))
    items, context = more_itertools.unzip(flat)
    return DataStream(items=items, applied_ops=ds.applied_ops + [self], context=context)
def make_pairs_with_lcs(structures_metadata, workers):
    # groupby isoform
    # product apo holo
    # do following in multiple processes:
    #     find LCS
    #     write LCS to output (if no mismatches?), or all?
    groups = structures_metadata.groupby('uniprotkb_id')  # or isoform (maybe specify in args)

    # structures as files or codes, so that should be handled in `chains_for_uniprot_ids`?
    # Or somehow with a join in filter_structures? that could be done..

    def get_pairs():
        for uniprot_id, group_indices in groups.indices.items():
            for pair in get_pairs_in_group(structures_metadata, group_indices, uniprot_id):
                yield uniprot_id, pair

    uniprot_ids, pairs = more_itertools.unzip(get_pairs())
    pair_ids, lcs__args = more_itertools.unzip(pairs)

    i = 0
    print(datetime.now())
    # for uniprot_id, (apo_chain, holo_chain), lcs_future in zip(uniprot_ids, pair_ids, lcs_futures):
    for uniprot_id, (apo_chain, holo_chain), args in zip(uniprot_ids, pair_ids, lcs__args):
        lcs_future = FutureLike(process_execute(get_longest_common_polypeptide, *args))
        i += 1
        if i % 100 == 0:
            maybe_print(False, f'\r{i}', end='')

        try:
            logger.info(f'getting result of {apo_chain} {holo_chain}, from {uniprot_id}')
            yield {
                'pdb_code_apo': apo_chain[0],  # todo could rename the variables to more general chain1/c1, c2...
                'chain_id_apo': apo_chain[1],
                'pdb_code_holo': holo_chain[0],
                'chain_id_holo': holo_chain[1],
                'lcs_result': lcs_future.result(),
            }
        except Exception as e:
            logger.exception('compute_lcs failed with: ')

    print(datetime.now())
def compare(data_path: Path, experiment: str, num_sample: int, use_flip: bool = False) -> None:
    val_csv = Path('data') / 'wide_val.csv'
    model_path = Path('experiments') / experiment / 'snapshots'
    num_weights = len(list(model_path.iterdir())) - 1
    results = []
    df = load_info(data_path, val_csv)
    exists = [
        idx for idx, cur_path in enumerate(df['img_path']) if cur_path.exists()
    ]
    val_data = df.iloc[np.array(exists)]
    subject_dict = aggregate_subjects(df['TEMPLATE_ID'], df['SUBJECT_ID'])
    sampled_pairs, sampled_labels = unzip(
        sample_pairs(subject_dict, num_sample))
    sampled_labels = np.array(list(sampled_labels))
    sampled_pairs = list(sampled_pairs)
    names = []
    rank_results = []
    for cur_epoch in range(7, num_weights):
        comparator = CompareModel(str(model_path / experiment),
                                  cur_epoch + 1,
                                  use_flip=use_flip,
                                  ctx=mx.gpu(0))
        comparator.metric = cosine
        rank_comparator = config_rank_comparator(comparator, val_data['img_path'])
        cosine_res = validate(comparator,
                              data_path,
                              val_csv,
                              num_sample=0,
                              pairs=sampled_pairs,
                              labels=sampled_labels)
        rank_results.append(
            validate(rank_comparator,
                     data_path,
                     val_csv,
                     num_sample=0,
                     pairs=sampled_pairs,
                     labels=sampled_labels)[1])
        results.append(cosine_res)
        names.append(f'epoch {cur_epoch + 1:04d}')
        results.append((sampled_labels, rank_results[-1]))
        names.append(f'epoch {cur_epoch + 1:04d} rank')
    rank_merge_results = np.mean(rank_results, axis=0)
    results.append((sampled_labels, rank_merge_results))
    names.append('merged')
    plot_roc(
        results,
        experiment_names=names,
        save_name=f'{experiment}_{"flip" if use_flip else "no_flip"}_roc.png')
def check_value(variable, new_value, o):
    """Recurse through object to ensure correct value "new_value" in "variable"."""
    seeded_check_value = partial(check_value, variable, new_value)
    if isinstance(o, dict):
        if variable in o:
            if o[variable] == new_value:
                return o, False
            o[variable] = new_value
            # `virkning` is presumably a validity period defined in the enclosing scope.
            o["virkning"] = virkning
            return o, True
        keys, values = unzip(o.items())
        values, changed = unzip(map(seeded_check_value, values))
        return dict(zip(keys, values)), any(changed)
    elif isinstance(o, list):
        values, changed = unzip(map(seeded_check_value, o))
        return list(values), any(changed)
    elif isinstance(o, tuple):
        values, changed = unzip(map(seeded_check_value, o))
        return tuple(values), any(changed)
    else:
        return o, False
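

# Hypothetical usage sketch of check_value (not from the original module): it assumes
# check_value above is in scope together with a module-level `virkning` dict; the
# nested payload and field names are made up.
def _example_check_value():
    payload = {"attributes": [{"brugervendtnoegle": "old", "virkning": {}}]}
    updated, changed = check_value("brugervendtnoegle", "new", payload)
    return updated, changed  # changed is True; the nested value is now "new"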
def collate_fn(inputs):
    (input_ids_tuple, token_type_ids, attention_mask, positions, widths, options,
     targets) = map(list, unzip(inputs))

    input_ids = pad_sequence([item[0] for item in input_ids_tuple],
                             batch_first=True, padding_value=0)
    input_masked_ids = pad_sequence([item[1] for item in input_ids_tuple],
                                    batch_first=True, padding_value=0)
    token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0)
    attn_masks_literal = pad_sequence([item[0] for item in attention_mask],
                                      batch_first=True, padding_value=0)
    attn_masks_idiomatic = pad_sequence(
        [item[1] for item in attention_mask], batch_first=True, padding_value=0)

    width_max = max(widths)
    gather_index = torch.arange(0, width_max, dtype=torch.long).unsqueeze(0).repeat(
        len(inputs), 1).clone()
    for i, (p, w) in enumerate(zip(positions, widths)):
        gather_index.data[i, :w] = torch.arange(p[0], p[0] + w, dtype=torch.long).data

    gather_index_masked = torch.arange(
        0, width_max, dtype=torch.long).unsqueeze(0).repeat(len(inputs), 1).clone()
    for i, (p, w) in enumerate(zip(positions, widths)):
        gather_index_masked.data[i, :w] = torch.arange(
            p[1], p[1] + w, dtype=torch.long).data

    batch = {
        'input_ids': torch.stack([input_ids, input_masked_ids]),
        'token_type_ids': torch.stack([token_type_ids, token_type_ids]),
        'attention_mask': torch.stack([attn_masks_literal, attn_masks_idiomatic]),
        'gather_index': (gather_index, gather_index_masked),
        'positions': torch.tensor(positions).long(),
        'option_ids': torch.tensor(options).long(),
        'targets': torch.tensor(targets).long()
    }
    return batch
def current(self):
    """
    Obtain the reward function which currently maximizes the
    objective = Value Function objective + Sparsity objective.
    """
    pairs = list(zip(self.coeffs, self.rewardBases))
    fn = lambda s: sum([c * rfn(s) for c, rfn in pairs])
    ranges = [rfn.reward_range for rfn in self.rewardBases]
    mins, maxs = list(map(list, unzip(ranges)))
    rMin = min(c * m for c, m in zip(self.coeffs, mins))
    rMax = max(c * M for c, M in zip(self.coeffs, maxs))
    return Reward(fn, (rMin, rMax))
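

# Numeric sketch (not from the original class) of the reward-range arithmetic above;
# the coefficients and per-basis ranges are made up.
def _example_reward_range():
    from more_itertools import unzip

    coeffs = [2.0, -1.0]
    ranges = [(0.0, 1.0), (-1.0, 1.0)]
    mins, maxs = list(map(list, unzip(ranges)))
    r_min = min(c * m for c, m in zip(coeffs, mins))  # min(2*0, -1*-1) == 0.0
    r_max = max(c * M for c, M in zip(coeffs, maxs))  # max(2*1, -1*1) == 2.0
    return r_min, r_max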
def take_unchanged(mating_group: t.List[t.Tuple[GraphIndividual, Record]],
                   brood_size: int) -> t.List[GraphIndividual]:
    """
    Randomly takes `brood_size` individuals from the mating group.

    :param mating_group: A group of individuals selected to give progeny.
    :param brood_size: The number of offspring.
    :return: A list of offspring -- copies of the parents.
    """
    individuals, _ = unzip(mating_group)
    return list(
        take(brood_size, (ind.copy() for ind in random_permutation(individuals))))
def main(input, part):
    # Iterator of lines
    lines = map(lambda x: x.strip(), input.readlines())
    # List of integers
    integers = list(map(int, lines))
    # integers = [28, 33, 18, 42, 31, 14, 46, 20, 48, 47, 24, 23, 49, 45, 19, 38, 39, 11, 1, 32, 25, 35, 8, 17, 7, 9, 4, 2, 34, 10, 3]
    # integers = [16, 10, 15, 5, 1, 11, 7, 19, 6, 12, 4]
    integers = sorted(integers)

    min_jolts = 0
    max_jolts = integers[-1] + 3

    diff_integers = chain([min_jolts], integers, [max_jolts])
    differences = list(map(lambda a, b: b - a, *unzip(pairwise(diff_integers))))

    if part == "1":
        differences = Counter(differences)
        print(differences[1] * differences[3])

    if part == "2":
        # We do not have to consider all possible combinations, as all the
        # combinations will contain certain sequences, namely all sequences
        # will pass through 3-difference numbers, and thus the sequences on
        # either side of these 3-difference numbers are independent.
        #
        # Thus we can split the problem into several subproblems, one on either
        # side of the 3-difference numbers.
        @apply
        def difference_is_3(index, value):
            """Check if the current index is a 3-difference number."""
            return differences[index] == 3

        # Iterator of subproblems (lists separated by 3-difference numbers).
        # Each element in the lists is (index, value).
        subproblems = split_after(enumerate(integers), difference_is_3)
        # Map each element in the lists to just the value (drop the index)
        subproblems = map(lambda element: list(unzip(element)[1]), subproblems)
        # Find the number of possible combinations for each block
        sub_counts = map(find_arrangements, subproblems)
        # Multiply the values for all the blocks to get a total
        print(prod(sub_counts))
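

# Small sketch (not from the original solution) of the pairwise-difference step above;
# the sample joltages are made up and pairwise/unzip come from more_itertools.
def _example_joltage_differences():
    from itertools import chain
    from more_itertools import pairwise, unzip

    jolts = sorted([1, 4, 5, 7])
    seq = chain([0], jolts, [jolts[-1] + 3])
    return list(map(lambda a, b: b - a, *unzip(pairwise(seq))))  # [1, 3, 1, 2, 3]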
def get_predicts(data_path, cache_dir, img_paths, pairs):
    img_matcher = config_resnet_matcher(img_paths, cache_dir)
    detector = pipeline_detector(img_paths, cache_dir / 'detector', small_face=16)
    experiment_names = ['ultimate5', 'test_center_vgg']
    epochs = [20, 10]
    experiment_path = Path('experiments')
    feature_extractors = [
        mxnet_feature_extractor(
            cache_dir / f'extractor_{cur_exp}_{cur_epoch:04d}',
            str(experiment_path / cur_exp / 'snapshots' / cur_exp),
            cur_epoch,
            use_flip=True,
            ctx=mx.gpu(0))
        for cur_exp, cur_epoch in zip(experiment_names, epochs)
    ]
    comparator = PipeMatcher(img_paths, cache_dir, img_matcher, detector, feature_extractors)
    return np.array(list(unzip(compare_all(data_path, pairs, comparator))[2]))
def validate_pipe():
    cache_dir = Path('/run/media/andrey/Data/pipe_cache')
    data_path = Path('/run/media/andrey/Fast/FairFace/data/train/data')
    val_csv = Path('data') / 'val_df.csv'
    val_data = load_info(data_path, val_csv)
    num_sample = 1 * 10 ** 5
    subject_dict = aggregate_subjects(val_data['TEMPLATE_ID'], val_data['SUBJECT_ID'])
    sampled_pairs, sampled_labels = unzip(sample_pairs(subject_dict, num_sample))
    sampled_labels = np.array(list(sampled_labels))
    sampled_pairs = list(sampled_pairs)
    predictions = get_predicts(data_path, cache_dir, val_data['img_path'], sampled_pairs)
    plot_roc([(sampled_labels, predictions)], ['composite'], save_name='test_pipe.png')
def run(self, ds: stream.DataStream) -> stream.DataStream:
    docs_ds = self.get_docs_stream(ds)
    docs = zip(docs_ds, docs_ds.context)
    # each match result is a tuple ((doc, matches), context)
    match_results = self.matcher.pipe(docs, return_matches=True, as_tuples=True)
    new_docs_with_context = more_itertools.map_except(
        self._filter_tokens, match_results, EmptyTextError)
    new_docs, context = more_itertools.unzip(new_docs_with_context)
    return stream.DataStream(new_docs, applied_ops=ds.applied_ops + [self], context=context)
def _get_batch(self, bs: int, x, y=None):
    y_x_pairs = zip(y, x) if y is not None else enumerate(x)
    for batch in cytoolz.partition_all(bs, y_x_pairs):
        batch_y, batch_x = more_itertools.unzip(batch)
        X, Y = list(batch_x), list(batch_y)
        if sparse.issparse(Y[0]):
            Y = sparse.vstack(Y)
        elif isinstance(Y[0], np.ndarray):
            Y = np.vstack(Y)
        if sparse.issparse(X[0]):
            X = sparse.vstack(X)
        elif isinstance(X[0], np.ndarray):
            X = np.vstack(X)
        yield X, Y
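

# Standalone sketch (not part of the original class) of the partition-then-unzip
# batching idea above, on made-up dense numpy data; cytoolz, more_itertools and
# numpy are assumed importable.
def _example_batches(bs=2):
    import cytoolz
    import more_itertools
    import numpy as np

    x = [np.array([i, i + 1]) for i in range(5)]
    y = list(range(5))
    for batch in cytoolz.partition_all(bs, zip(y, x)):
        batch_y, batch_x = more_itertools.unzip(batch)
        yield np.vstack(list(batch_x)), list(batch_y)  # X of shape (<=bs, 2), Y as a list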
def splot_multiple(*exprs: CurveOrRange, plotf: splot, random_colors=False, **options):
    "plots multiple curves with extra kwargs for each one"
    show = options.pop("show", True)
    curves, curves_args = unzip(curves_iter(exprs))
    # pprint(curves)
    plot = plotf(*curves, show=False, **options)
    for i, curve_args in enumerate(curves_args):
        if random_colors:
            plot[i].line_color = [random_bright_rgb_color()]
        for key, value in curve_args.items():
            setattr(plot[i], key, value)
    if show:
        plot.show()
    return plot
def scrape_candidates(self, product_name, archive_directory, major_version, stdout):
    """Scrape the candidates/ directory for beta, release candidate, and final releases."""
    url_path = '/pub/%s/candidates/' % archive_directory
    stdout.write('scrape_candidates working on %s' % url_path)

    # First, let's look at /pub/PRODUCT/releases/ so we know what final
    # builds have been released
    release_path = '/pub/%s/releases/' % archive_directory
    release_path_content = self.download(release_path)

    # Get the final release version numbers, so something like "64.0b8/" -> "64.0b8"
    final_releases = [
        link['text'].rstrip('/')
        for link in self.get_links(release_path_content)
        if link['text'][0].isdigit()
    ]

    content = self.download(url_path)
    version_links = [
        link for link in self.get_links(content) if link['text'][0].isdigit()
    ]

    # If we've got a major_version, then we only want to scrape data for versions
    # greater than (major_version - 4) and esr builds
    if major_version:
        major_version_minus_4 = major_version - 4
        stdout.write(
            'skipping anything before %s and not esr (%s)'
            % (product_name, major_version_minus_4)
        )
        version_links = [
            link for link in version_links
            if (
                # "63.0b7-candidates/" -> 63
                int(link['text'].split('.')[0]) >= major_version_minus_4
                or 'esr' in link['text']
            )
        ]

    scrape = partial(
        self.scrape_candidate_version,
        product_name=product_name,
        final_releases=final_releases
    )

    if self.num_workers == 1:
        results = map(scrape, version_links)
    else:
        with concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) as executor:
            results = executor.map(scrape, version_links, timeout=300)
            results = list(results)

    # Convert [(build_data, msgs), (build_data, msgs), ...] into
    # build_data and msgs
    if results:
        build_data, msgs = more_itertools.unzip(results)
    else:
        build_data, msgs = [], []

    # Print all the msgs to stdout
    for msg_group in msgs:
        for msg in msg_group:
            stdout.write('worker: %s' % msg)

    # build_data is a list of lists so we flatten that
    return list(more_itertools.flatten(build_data))
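

# Minimal sketch (not from the original scraper) of the results-splitting step above;
# the worker results are made up and only more_itertools is assumed.
def _example_split_worker_results():
    import more_itertools

    results = [([{"build": 1}], ["msg a"]), ([{"build": 2}], ["msg b", "msg c"])]
    build_data, msgs = more_itertools.unzip(results) if results else ([], [])
    flat_builds = list(more_itertools.flatten(build_data))  # [{"build": 1}, {"build": 2}]
    flat_msgs = list(more_itertools.flatten(msgs))          # ["msg a", "msg b", "msg c"]
    return flat_builds, flat_msgs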