def request_status(scan_id): request = scan_db.get(scan_id) def _dirty(system_name): return market_db.get(("dirty", system_name), default=None) dirty = groupby(_dirty, request["system_names"]) system_completion = { ("Partial" if k is True else "Complete" if k is False else "Pending"): len(v) for (k, v) in dirty.items() } num_systems = len(request["system_names"]) partial_or_pending = (system_completion.get("Partial", 0) + system_completion.get("Pending", 0)) system_completion_percent = 100 * ( (num_systems - partial_or_pending) / num_systems) def _shell_status(item): return AsyncResult(item["task_id"]).state shell_completion = groupby(_shell_status, request["tasks"].values()) return { "scan_id": request["scan_id"], "location": request["location"], "radius": request["radius"], "system_completion_percent": system_completion_percent, "system_completion": system_completion, "system_names": request["system_names"], "tasks": shell_completion, "unfinished_shells": dissoc(shell_completion, "SUCCESS"), }
def _fuzzy_filter(self, text, candidates, metric=fuzz.ratio):
    """
    :param text: str
    :param candidates: List[TrieEntry]
    :param metric: (str, str) -> Numeric
    :return: List[TrieEntry]
    """
    # similar = groupby(lambda entry: metric(entry.sf, text), candidates)  # group by val of metric
    # Calculate the metric for every candidate
    measured = [(metric(entry.sf, text), entry) for entry in candidates]
    # Group candidates that share the same uri
    similar = groupby(lambda pair: pair[1].uri, measured)  # uri -> [(metric, entry), ...]
    # In each group of same matches leave only the one with the highest match-metric
    similar = [max(sames, key=first) for sames in similar.values()]
    # Sort by the metric
    best_matches = sorted(similar, key=first, reverse=True)
    # Filter bad matches
    best_matches = [
        entry for m, entry in best_matches
        if m >= self._metric_threshold * 100
    ]
    # Some more checks on the best matches if there are several matches
    if len(best_matches) > 1:
        # best_matches = [max(best_matches, key=lambda entry: metric(raw_d(raw(entry.uri)), text))]
        best_matches = groupby(
            lambda entry: metric(raw_d(raw(entry.uri)), text), best_matches)
        best_matches = best_matches[max(best_matches)]
    return best_matches
def parse_instance(content, outdir):
    categories = {d['id']: d['name'] for d in content['categories']}
    # merge images and annotations: id in images vs image_id in annotations
    merged_info_list = list(
        map(
            cytoolz.merge,
            cytoolz.join('id', content['images'], 'image_id',
                         content['annotations'])))
    # convert category id to name
    for instance in merged_info_list:
        instance['category_id'] = categories[instance['category_id']]
    # group by filename to pool all bbox in same file
    for name, groups in cytoolz.groupby('file_name', merged_info_list).items():
        multiple = groups[0]['url'].split('/')[-4]
        if multiple != 'multiple':
            continue
        subfolder = groups[0]['url'].split('/')[-2]
        folder = groups[0]['url'].split('/')[-3]
        if not os.path.exists(os.path.join(outdir, folder)):
            os.mkdir(os.path.join(outdir, folder))
        if not os.path.exists(os.path.join(outdir, folder, subfolder)):
            os.mkdir(os.path.join(outdir, folder, subfolder))
        anno_tree = instance2xml_base(groups[0])
        # if one file has multiple different objects, save it in each category sub-directory
        filenames = []
        for group in groups:
            filenames.append(
                os.path.join(outdir, folder, subfolder,
                             os.path.splitext(name)[0] + ".xml"))
            anno_tree.append(instance2xml_bbox(group, bbox_type='xyxy'))
        for filename in filenames:
            etree.ElementTree(anno_tree).write(filename, pretty_print=True)
        print("Formatting instance xml file {} done!".format(name))
def compute(t, lhs, rhs): """ Join Operation for Python Streaming Backend Note that a pure streaming Join is challenging/impossible because any row in one seq might connect to any row in the other, requiring simultaneous complete access. As a result this approach compromises and fully realizes the LEFT sequence while allowing the RIGHT sequence to stream. As a result Always put your bigger table on the RIGHT side of the Join. """ lhs = compute(t.lhs, lhs) rhs = compute(t.rhs, rhs) on_left = rowfunc(t.lhs[t.on_left]) on_right = rowfunc(t.rhs[t.on_right]) right_columns = list(range(len(t.rhs.columns))) for col in listpack(t.on_right): right_columns.remove(t.rhs.columns.index(col)) get_right = lambda x: type(x)(get(right_columns, x)) lhs_dict = groupby(on_left, lhs) for row in rhs: try: key = on_right(row) matches = lhs_dict[key] for match in matches: yield match + get_right(row) except KeyError: pass
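# A minimal, self-contained sketch (not blaze's implementation) of the hash-join
# pattern described in the docstring above: fully realize the LEFT side with
# toolz.groupby, then stream the RIGHT side and look up matches by key. The
# hash_join name and tuple rows are illustrative assumptions.
from toolz import groupby

def hash_join(left_rows, right_rows, left_key, right_key):
    """Yield joined tuples; left_rows is materialized, right_rows may be a stream."""
    lhs_index = groupby(left_key, left_rows)              # key -> list of left rows
    for row in right_rows:
        for match in lhs_index.get(right_key(row), ()):   # unmatched keys are skipped
            yield match + row

# Example: join on the first element of each tuple.
list(hash_join([(1, 'a'), (2, 'b')], [(1, 10), (1, 11), (3, 30)],
               left_key=lambda r: r[0], right_key=lambda r: r[0]))
# -> [(1, 'a', 1, 10), (1, 'a', 1, 11)]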
def prepare_audio_grouped(audio_paths: List[Pathlike], ) -> RecordingSet: import soundfile as sf # Group together multiple channels from the same session. # We will use that to create a Recording with multiple sources (channels). from cytoolz import groupby channel_wavs = groupby(lambda p: p.parts[-3], audio_paths) recordings = [] for session_name, channel_paths in channel_wavs.items(): audio_sf = sf.SoundFile(str(channel_paths[0])) recordings.append( Recording( id=session_name, sources=[ AudioSource(type="file", channels=[idx], source=str(audio_path)) for idx, audio_path in enumerate(sorted(channel_paths)) ], sampling_rate=audio_sf.samplerate, num_samples=audio_sf.frames, duration=audio_sf.frames / audio_sf.samplerate, )) return RecordingSet.from_recordings(recordings)
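# Hedged illustration of the grouping key used above: p.parts[-3] selects the
# session directory two levels above each wav file. Standalone example with
# made-up paths (PurePosixPath avoids touching the filesystem):
from pathlib import PurePosixPath
from cytoolz import groupby

paths = [
    PurePosixPath('/data/session1/ch0/audio.wav'),
    PurePosixPath('/data/session1/ch1/audio.wav'),
    PurePosixPath('/data/session2/ch0/audio.wav'),
]
by_session = groupby(lambda p: p.parts[-3], paths)
# {'session1': [.../ch0/audio.wav, .../ch1/audio.wav], 'session2': [.../ch0/audio.wav]}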
def _index_by_recording_id_and_cache(self): if self._features_by_recording_id is None: from cytoolz import groupby self._features_by_recording_id = groupby( lambda feat: feat.recording_id, self) return self._features_by_recording_id
def create_annotations(dbpath, subset, dst):
    '''
    :param dbpath: root path of coco dataset
    :param subset: 'train' or 'val'
    :param dst: where to save the transferred result
    :return:
    '''
    annotations_path = dbpath + '/annotations_trainval2014/annotations/instances_{}2014.json'.format(
        subset)
    images_path = dbpath + '/images/{}2014'.format(subset)
    categories, instances = get_instances(annotations_path)
    if not os.path.exists(dst):
        os.makedirs(dst)
    for i, instance in enumerate(instances):
        instances[i]['category_id'] = categories[instance['category_id']]
    for name, group in iteritems(groupby('file_name', instances)):
        print("image_path is %s, name is %s" % (images_path, name))
        img = imread(images_path + "/" + name)
        if img.ndim == 3:
            annotation = root(images_path, name, group[0]['height'],
                              group[0]['width'])
            for instance in group:
                annotation.append(instance_to_xml(instance))
            etree.ElementTree(annotation).write(
                dst + '/{}.xml'.format(name.split(".")[0]))
            print(name)
        else:
            print(instance['file_name'])
def create_annotations(dbpath, subset, dst):
    annotations_path = path(
        dbpath).expand() / 'annotations/instances_{}2014.json'.format(subset)
    #images_path = path(dbpath).expand() / 'images/{}2014'.format(subset)
    images_path = path(dbpath).expand() / '{}2014'.format(
        subset)  # clw note: the images sit directly under {subset}2014 here, with no 'images' parent folder
    # also, subset is usually 'val' or 'train'; see the annotation file name
    categories, instances = get_instances(annotations_path)
    dst = path(dst).expand()
    for i, instance in enumerate(instances):
        instances[i]['category_id'] = categories[instance['category_id']]
    for name, group in iteritems(groupby('file_name', instances)):
        img = imread(images_path / name)
        if img.ndim == 3:
            out_name = rename(name)
            annotation = root('VOC2014', '{}.jpg'.format(out_name),
                              group[0]['height'], group[0]['width'])
            for instance in group:
                annotation.append(instance_to_xml(instance))
            etree.ElementTree(annotation).write(dst / '{}.xml'.format(out_name))
            #print out_name
            print(out_name)  # clw modify: this was apparently Python 2.7 before
        else:
            #print instance['file_name']
            print(instance['file_name'])  # clw modify: this was apparently Python 2.7 before
def highest_td_peer(self) -> BasePeer: peers = tuple(self.connected_nodes.values()) if not peers: raise NoConnectedPeers() peers_by_td = groupby(operator.attrgetter('head_td'), peers) max_td = max(peers_by_td.keys()) return random.choice(peers_by_td[max_td])
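# Sketch of the tie-breaking idiom used above, outside the peer-pool context:
# group objects by an attribute, take the maximum key, then pick randomly among
# the items that share it. The Peer namedtuple and values are illustrative.
import operator
import random
from collections import namedtuple
from toolz import groupby

Peer = namedtuple('Peer', 'name head_td')
peers = [Peer('a', 10), Peer('b', 12), Peer('c', 12)]

peers_by_td = groupby(operator.attrgetter('head_td'), peers)
best = random.choice(peers_by_td[max(peers_by_td)])   # 'b' or 'c', both at head_td=12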
def change_annotations(data_path, subset, destination_path):
    if not os.path.exists(data_path):
        raise FileNotFoundError("{} path does not exist".format(data_path))
    os.makedirs(destination_path, exist_ok=True)
    annotation_path = os.path.join(
        data_path, "annotations/instances_{}2014.json".format(subset))
    image_path = os.path.join(data_path, "images/{}2014".format(subset))
    if not os.path.exists(annotation_path):
        raise FileNotFoundError(
            "{} annotation does not exist".format(annotation_path))
    if not os.path.exists(image_path):
        raise FileNotFoundError("{} image does not exist".format(image_path))
    categories, instances = create_annotation_instance(annotation_path)
    for i, instance in enumerate(instances):
        instances[i]["category_id"] = categories[instance["category_id"]]
    for name, group in tqdm(iteritems(groupby("file_name", instances)),
                            desc="Create annotation xml files"):
        out_name = name.split(".")[-2]
        img = imread(os.path.join(image_path, name))
        if img.ndim == 3:
            annotation = xml_root("{}.jpg".format(out_name), group[0]["height"],
                                  group[0]["width"])
            for instance in group:
                annotation.append(instance_to_xml(instance))
            etree.ElementTree(annotation).write(
                os.path.join(destination_path, "{}.xml".format(out_name)))
def _get_sub_overrides_by_prop( overrides: Dict[str, Any]) -> Iterator[Tuple[str, Dict[str, List[str]]]]: # we only want the overrides that are not top level. sub_overrides = _get_sub_overrides(overrides) key_groups = groupby(_extract_top_level_key, sub_overrides.keys()) for top_level_key, props in key_groups.items(): yield top_level_key, {_extract_tail_key(prop): overrides[prop] for prop in props}
def X_feature_onehot(self, dataframe: DataFrame) -> DataFrame:
    # fieldgroups[basename] = [ fieldname ]
    # noinspection PyArgumentList
    fieldgroups = groupby(
        curry(re.sub)(r'\d+(st|nd|rd)?$')(''),   # basename: strip trailing numeric/ordinal suffix
        self.params['X_feature_onehot']          # fieldnames
    )
    encodings = {}
    for basename, fieldnames in fieldgroups.items():
        # NOTE: in theory, unique_values should be hardcoded based on data_description.txt
        #       for Kaggle, we can cheat and just take unique_values from self.data['combined']
        # BUGFIX: running to_X() separately on test/train/validate datasets results in column name mismatches
        unique_values = np.unique(
            self.data['combined'][fieldnames].dropna().values)
        category_dtype = CategoricalDtype(categories=unique_values)

        for fieldname in fieldnames:
            dataframe[fieldname] = dataframe[fieldname].astype(category_dtype)
            onehot = pd.get_dummies(dataframe[fieldname],
                                    prefix=basename,
                                    prefix_sep='_')
            if basename not in encodings:
                encodings[basename] = onehot
            else:
                encodings[basename] = onehot & encodings[basename]  # combine repeated fields with bitwise AND

    # Add additional onehot columns to dataframe
    for basename, onehot in encodings.items():
        dataframe = dataframe.join(onehot)

    # Mark original categorical columns for exclusion
    self.params['X_feature_exclude'] += self.params['X_feature_onehot']
    return dataframe
def parse_keypoints(content, outdir):
    keypoints = dict(
        zip(range(1, len(content['categories'][0]['keypoints']) + 1),
            content['categories'][0]['keypoints']))
    # merge images and annotations: id in images vs image_id in annotations
    # materialize the map() so the list can be iterated more than once in Python 3
    merged_info_list = list(
        map(
            cytoolz.merge,
            cytoolz.join('id', content['images'], 'image_id',
                         content['annotations'])))
    # convert category name to person
    for keypoint in merged_info_list:
        keypoint['category_id'] = "person"
    # group by filename to pool all bbox and keypoint in same file
    for name, groups in cytoolz.groupby('file_name', merged_info_list).items():
        filename = os.path.join(outdir, os.path.splitext(name)[0] + ".xml")
        anno_tree = keypoints2xml_base(groups[0])
        for group in groups:
            anno_tree = keypoints2xml_object(group, anno_tree, keypoints,
                                             bbox_type="xyxy")
        doc = etree.ElementTree(anno_tree)
        doc.write(open(filename, "wb"), pretty_print=True)  # lxml writes bytes, so open in binary mode
        print("Formatting keypoints xml file {} done!".format(name))
def batch_stitch_stack(file_dict, output, stitch_order=None, channel_order=[0, 1, 2], target_bit_depth=8, compress=1, **kwargs): """Run snail stitch and concatenate the channels across a set of images. This function takes the (plate, well) dictionary built using the ``make_key2file`` function. Images are grouped according to their channel, stitched together and stacked into a single 3-channel image. Images are re-scaled and saved to a user specified output directory. Images are saved to directories according to their plate number. Parameters ---------- file_dict : dict { tuple (plate, well) : list of strings } The dictionary mapping the (plate, well) tuple to a list of image files. This dictionary is built using the ``make_key2file`` function. output : string The directory to output the stitched and concatenated images to. stitch_order : array of int, shape (M, N) The order of the stitching. Passed to "stitch_order" argument of `snail_stitch`. channel_order : list of int The order the channels should be in in the final image. Passed to "channel_order" argument of `stack_channels`. target_bit_depth : int in {8, 16}, optional If None, perform no rescaling. Otherwise, rescale to occupy the dynamic range of the target bit depth. compress : int in [0, 9], optional Compression level for saved images. 0 = no compression, 1 = fast compression, 9 = maximum compression, slowest. **kwargs : dict Keyword arguments to be passed to `microscopium.preprocess.stretchlim` """ for fns in list(file_dict.values()): sem = cellomics_semantic_filename(fns[0]) plate = str(sem['plate']) new_fn = '-'.join([sem['prefix'], plate, sem['well']]) new_fn = '.'.join([new_fn, sem['suffix']]) channels = groupby(get_channel, fns) while len(channels) < 3: channels[np.max(list(channels.keys())) + 1] = None images = [] for channel, fns in sorted(channels.items()): if fns is None: images.append(None) else: image = snail_stitch(fns, stitch_order) image = rescale_from_12bit(image, target_bit_depth, **kwargs) images.append(image) stack_image = stack_channels(images, channel_order) out_dir = os.path.join(output, plate) if not os.path.exists(out_dir): os.makedirs(out_dir) mio.imsave(os.path.join(out_dir, new_fn), stack_image, compress=compress)
def create_annotations(dbpath, subset, dst): first_part = path(dbpath).expand() last_part = 'annotations/instances_{}2014.json'.format(subset) annotations_path = first_part / last_part images_path = first_part / 'images/{}2014'.format(subset) categories, instances = get_instances(annotations_path) dst = path(dst).expand() for i, instance in enumerate(instances): instances[i]['category_id'] = categories[instance['category_id']] for name, group in iteritems(groupby('file_name', instances)): img = imread(images_path / name) if img.ndim == 3: out_name = rename(name) annotation = root('VOC2014', '{}.jpg'.format(out_name), group[0]['height'], group[0]['width']) for instance in group: annotation.append(instance_to_xml(instance)) etree.ElementTree(annotation).write( dst / '{}.xml'.format(out_name) ) print(out_name) else: print(instance['file_name'])
def parse_instance(content, outdir):
    categories = {d['id']: d['name'] for d in content['categories']}
    # merge images and annotations: id in images vs image_id in annotations
    # materialize the map() so the list can be iterated more than once in Python 3
    merged_info_list = list(
        map(cytoolz.merge,
            cytoolz.join('id', content['images'], 'image_id',
                         content['annotations'])))
    # convert category id to name
    for instance in merged_info_list:
        instance['category_id'] = categories[instance['category_id']]
    # group by filename to pool all bbox in same file
    for name, groups in cytoolz.groupby('file_name', merged_info_list).items():
        anno_tree = instance2xml_base(groups[0])
        # if one file has multiple different objects, save it in each category sub-directory
        filenames = []
        for group in groups:
            if group[u'iscrowd'] == 0:
                filenames.append(
                    os.path.join(outdir, re.sub(" ", "_", group['category_id']),
                                 os.path.splitext(name)[0] + ".xml"))
                anno_tree.append(instance2xml_bbox(group, bbox_type='xyxy'))
        for filename in filenames:
            etree.ElementTree(anno_tree).write(filename, pretty_print=True)
        print("Formatting instance xml file {} done!".format(name))
def validate_uncles(self, block: BaseBlock) -> None: """ Validate the uncles for the given block. """ # Check for duplicates uncle_groups = groupby(operator.attrgetter('hash'), block.uncles) duplicate_uncles = tuple(sorted( hash for hash, twins in uncle_groups.items() if len(twins) > 1 )) if duplicate_uncles: raise ValidationError( "Block contains duplicate uncles:\n" " - {0}".format(' - '.join(duplicate_uncles)) ) recent_ancestors = tuple( ancestor for ancestor in self.get_ancestors(MAX_UNCLE_DEPTH + 1, header=block.header) ) recent_ancestor_hashes = {ancestor.hash for ancestor in recent_ancestors} recent_uncle_hashes = _extract_uncle_hashes(recent_ancestors) for uncle in block.uncles: if uncle.hash == block.hash: raise ValidationError("Uncle has same hash as block") # ensure the uncle has not already been included. if uncle.hash in recent_uncle_hashes: raise ValidationError( "Duplicate uncle: {0}".format(encode_hex(uncle.hash)) ) # ensure that the uncle is not one of the canonical chain blocks. if uncle.hash in recent_ancestor_hashes: raise ValidationError( "Uncle {0} cannot be an ancestor of {1}".format( encode_hex(uncle.hash), encode_hex(block.hash))) # ensure that the uncle was built off of one of the canonical chain # blocks. if uncle.parent_hash not in recent_ancestor_hashes or ( uncle.parent_hash == block.header.parent_hash): raise ValidationError( "Uncle's parent {0} is not an ancestor of {1}".format( encode_hex(uncle.parent_hash), encode_hex(block.hash))) # Now perform VM level validation of the uncle self.validate_seal(uncle) try: uncle_parent = self.get_block_header_by_hash(uncle.parent_hash) except HeaderNotFound: raise ValidationError( "Uncle ancestor not found: {0}".format(uncle.parent_hash) ) uncle_vm_class = self.get_vm_class_for_block_number(uncle.block_number) uncle_vm_class.validate_uncle(block, uncle, uncle_parent)
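# The duplicate-uncle check above uses a general idiom: group by a key and keep
# the keys whose group has more than one member. A minimal standalone sketch;
# find_duplicates and the sample data are illustrative, not part of the chain code.
from toolz import groupby

def find_duplicates(items, key):
    groups = groupby(key, items)
    return tuple(k for k, members in groups.items() if len(members) > 1)

find_duplicates(['aa', 'ab', 'ba', 'ac'], key=lambda s: s[0])  # -> ('a',)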
def parse_instance(content, outdir):
    categories = {d['id']: d['name'] for d in content['categories']}
    # merge images and annotations: id in images vs image_id in annotations
    merged_info_list = list(map(cytoolz.merge, cytoolz.join('id', content['images'], 'image_id', content['annotations'])))
    filtered_info_list = []
    # convert category id to name && get target object info
    for instance in merged_info_list:
        cat_name = categories[instance['category_id']]
        filepath = os.path.join(dataDir, instance['file_name'])
        if cat_name in target_classes:
            # filter out images that are not valid for VOC (must be 3-channel)
            origimg = Image.open(filepath)
            if len(np.asarray(origimg).shape) != 3:
                continue
            instance['category_id'] = cat_name
            filtered_info_list.append(instance)

    # ## limit the number of images per category
    # target_image_list = []
    # for img_info in filtered_info_list:
    #     if img_info['category_id'] == 'bicycle':
    #         target_image_list.append(img_info)
    #     elif len(target_image_list) < total_num:
    #         target_image_list.append(img_info)

    # group by filename to pool all bbox in same file
    target_images = []
    for name, groups in cytoolz.groupby('file_name', filtered_info_list).items():
        anno_tree = instance2xml_base(groups[0])
        # if one file has multiple different objects, save it in each category sub-directory
        filenames = []
        for group in groups:
            # filenames.append(os.path.join(outdir, re.sub(" ", "_", group['category_id']),
            #                               'annotations', os.path.splitext(name)[0] + ".xml"))
            filenames.append(os.path.join(outdir, 'annotations', os.path.splitext(name)[0] + ".xml"))
            anno_tree.append(instance2xml_bbox(group, bbox_type='xyxy'))
        for filename in filenames:
            etree.ElementTree(anno_tree).write(filename, pretty_print=True)
        print("Formatting instance xml file {} done!".format(name))

        # copy target image file to outdir
        if name not in target_images:
            img_path = os.path.join(dataDir, name)
            # target_dir = os.path.join(output_dir, re.sub(" ", "_", group['category_id']), 'images', name)
            target_dir = os.path.join(output_dir, 'images', name)
            shutil.copyfile(img_path, target_dir)
            target_images.append(name)
        # if len(target_images) > total_num:
        #     break
    print(len(target_images))
def replace_key(self, key: KeyId, replace_with: Signature) -> 'Signatures': """Return a new object with the matching keys replaced.""" matches: Dict[bool, List[Signature]] = groupby(lambda sig: sig.keyid == key, self.sigs) return Signatures( list(concat([[replace_with], matches.get(False, [])])))
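# replace_key above relies on groupby with a boolean predicate, which yields at
# most two buckets: {True: matches, False: non-matches}. A minimal sketch of
# that partitioning idiom with plain integers (values are illustrative):
from cytoolz import groupby

matches = groupby(lambda n: n % 2 == 0, [1, 2, 3, 4, 5])
evens = matches.get(True, [])    # [2, 4]
odds = matches.get(False, [])    # [1, 3, 5]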
def process(workbook: Any, content: str) -> None: """Process Hosts (3Par) worksheet :param workbook: :param content: """ worksheet = workbook.get_sheet_by_name('Hosts') headers = list(concat([ get_parser_header(SHOWHOST_TMPL), get_parser_header(SHOWHOST_LINES_TMPL)[4:], ])) RowTuple = namedtuple('RowTuple', headers) # pylint: disable=invalid-name build_header(worksheet, headers) show_hosts_out = groupby( itemgetter(0, 1, 2, 3, 4), run_parser_over(content, SHOWHOST_TMPL)) show_hosts_lines_out = groupby( itemgetter(0, 1, 2, 3), run_parser_over(content, SHOWHOST_LINES_TMPL)) rows = [] for idfier in show_hosts_out: with suppress(KeyError): for host_line, details_line in \ zip(show_hosts_out[idfier], show_hosts_lines_out[idfier[:-1]]): rows.append(host_line + details_line[4:]) final_col, final_row = 0, 0 for row_n, row_tuple in enumerate(map(RowTuple._make, rows), 2): for col_n, col_value in \ enumerate(row_tuple._asdict().values(), ord('A')): cell = worksheet['{}{}'.format(column_format(col_n), row_n)] cell.value = str.strip(col_value) style_value_cell(cell) set_cell_to_number(cell) final_col = col_n final_row = row_n sheet_process_output( worksheet, 'HostsTable', 'Hosts', final_col, final_row)
def group_data(ctx):
    def grouper(x):
        if x['sday'] == '0001-01-01' and x['eday'] == '0001-01-01':
            return 'defaults'
        return 'data'

    groups = groupby(grouper, ctx['data'])
    return merge(ctx, {
        'data': get('data', groups, []),
        'defaults': get('defaults', groups, [])
    })
def is_valid_connection_candidate(self, candidate: Node) -> bool:
    # connect to no more than 2 nodes with the same IP
    nodes_by_ip = groupby(
        operator.attrgetter('remote.address.ip'),
        self.connected_nodes.values(),
    )
    matching_ip_nodes = nodes_by_ip.get(candidate.address.ip, [])
    return len(matching_ip_nodes) <= 2
def broadcast_dimensions(argpairs, numblocks, sentinels=(1, (1, )), consolidate=None): """ Find block dimensions from arguments Parameters ---------- argpairs: iterable name, ijk index pairs numblocks: dict maps {name: number of blocks} sentinels: iterable (optional) values for singleton dimensions consolidate: func (optional) use this to reduce each set of common blocks into a smaller set Examples -------- >>> argpairs = [('x', 'ij'), ('y', 'ji')] >>> numblocks = {'x': (2, 3), 'y': (3, 2)} >>> broadcast_dimensions(argpairs, numblocks) {'i': 2, 'j': 3} Supports numpy broadcasting rules >>> argpairs = [('x', 'ij'), ('y', 'ij')] >>> numblocks = {'x': (2, 1), 'y': (1, 3)} >>> broadcast_dimensions(argpairs, numblocks) {'i': 2, 'j': 3} Works in other contexts too >>> argpairs = [('x', 'ij'), ('y', 'ij')] >>> d = {'x': ('Hello', 1), 'y': (1, (2, 3))} >>> broadcast_dimensions(argpairs, d) {'i': 'Hello', 'j': (2, 3)} """ # List like [('i', 2), ('j', 1), ('i', 1), ('j', 2)] argpairs2 = [(a, ind) for a, ind in argpairs if ind is not None] L = toolz.concat([ zip(inds, dims) for (x, inds), (x, dims) in toolz.join( toolz.first, argpairs2, toolz.first, numblocks.items()) ]) g = toolz.groupby(0, L) g = dict((k, set([d for i, d in v])) for k, v in g.items()) g2 = dict( (k, v - set(sentinels) if len(v) > 1 else v) for k, v in g.items()) if consolidate: return toolz.valmap(consolidate, g2) if g2 and not set(map(len, g2.values())) == set([1]): raise ValueError("Shapes do not align %s" % g) return toolz.valmap(toolz.first, g2)
def partition(grouper, sequence, npartitions, p, nelements=2**20): """ Partition a bag along a grouper, store partitions on disk """ for block in partition_all(nelements, sequence): d = groupby(grouper, block) d2 = defaultdict(list) for k, v in d.items(): d2[abs(hash(k)) % npartitions].extend(v) p.append(d2) return p
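# In-memory sketch of the partitioning idea above, without the on-disk store p:
# group each block with the grouper, then re-bucket the groups by
# hash(key) % npartitions so every partition can later be reduced independently.
# bucket_blocks and its arguments are illustrative names, not dask's API.
from collections import defaultdict
from toolz import groupby, partition_all

def bucket_blocks(grouper, sequence, npartitions, nelements=4):
    buckets = defaultdict(list)
    for block in partition_all(nelements, sequence):
        for k, v in groupby(grouper, block).items():
            buckets[abs(hash(k)) % npartitions].extend(v)
    return dict(buckets)

bucket_blocks(lambda x: x % 3, range(10), npartitions=2)
# each value list holds the elements whose group key hashed to that partition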
def find(self, key: KeyId) -> Optional[Signature]: """Return the first matching key if available.""" matches: Dict[bool, List[Signature]] = groupby(lambda sig: sig.keyid == key, self.sigs) try: return matches[True][0] except KeyError: return None
def getrecursive(dict_, keys): if not any(keys): return dict_ head_to_tails = valmap( lambda l: [t[1:] for t in l], groupby(itemgetter(0), filter(len, keys)) ) return { head: getrecursive(dict_[head], tails) for head, tails in head_to_tails.items() }
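# Possible usage of getrecursive above (assuming valmap, groupby, and itemgetter
# are imported as the body suggests): each key is a tuple path, and paths that
# share a head are grouped and resolved recursively. The sample dict is made up.
getrecursive({'a': {'b': 1, 'c': 2}, 'd': 3}, [('a', 'b'), ('d',)])
# -> {'a': {'b': 1}, 'd': 3}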
def _get_sub_overrides_by_prop(overrides): # we only want the overrides that are not top level. sub_overrides = _get_sub_overrides(overrides) key_groups = groupby(_extract_top_level_key, sub_overrides.keys()) for top_level_key, props in key_groups.items(): yield top_level_key, { _extract_tail_key(prop): overrides[prop] for prop in props }
def left_join1(lseq, rseq, key): key_fn = operator.itemgetter(*key) lr = [ cytoolz.merge(_) for _ in cytoolz.groupby(key_fn, lseq + rseq).values() ] return (sorted(list( filter(lambda d: key_fn(d) in [key_fn(l) for l in lseq], lr)), key=key_fn))
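# Hedged usage example for left_join1 above: rows are dicts and key names the
# join columns. Rows from rseq without a counterpart in lseq are dropped, while
# rows from lseq always survive (merged with their match when one exists).
# The sample rows are illustrative only.
lseq = [{'id': 1, 'x': 'a'}, {'id': 2, 'x': 'b'}]
rseq = [{'id': 1, 'y': 10}, {'id': 3, 'y': 30}]
left_join1(lseq, rseq, ['id'])
# -> [{'id': 1, 'x': 'a', 'y': 10}, {'id': 2, 'x': 'b'}]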
async def multi_set(self, triplets): # pylint: disable=no-member grouped_by_ttl = cytoolz.groupby(lambda t: t[2], triplets) futures = [] for cache in self._caches: for ttl, ttl_group in grouped_by_ttl.items(): pairs = [t[:2] for t in ttl_group] futures.append( asyncio.ensure_future(cache.multi_set(pairs, ttl=ttl))) return await asyncio.gather(*futures)
def create_annotations(coco_annotation, dst='annotations_voc'):
    os.makedirs(dst, exist_ok=True)
    categories, instances = get_instances(coco_annotation)
    '''
    About categories:
        Dictionary where the keys are the category IDs and the values are the category names.
    About instances:
        Tuple of dictionaries containing information on the annotations and their respective images.
        NOTE: There is one instance for every annotation, not for every image.
    '''
    dst = os.path.abspath(dst)
    '''
    Modifying the category ID to show a string instead of a number.
    The string corresponds to the name of the category.
    '''
    for i, instance in tqdm(enumerate(instances), desc="rewriting categories"):
        instances[i]['category_id'] = categories[instance['category_id']]

    grouped = groupby('file_name', instances)
    for name, group in tqdm(iteritems(grouped), total=len(grouped),
                            desc="processing annotations"):
        '''
        About name: the image path
        About group: the image information
        '''
        img = imread(os.path.abspath(name))
        if img.ndim == 3:
            out_name = rename(name)
            image_folder, image_name = os.path.split(out_name)
            annotation = root(image_folder, '{}.jpg'.format(image_name),
                              group[0]['height'], group[0]['width'])
            for instance in group:
                annotation.append(instance_to_xml(instance))

            # Exporting XML to destination folder
            destination_file = "{}.xml".format(out_name)
            _, destination_file = os.path.split(destination_file)
            xml_file = etree.ElementTree(annotation)
            xml_file.write(os.path.join(dst, destination_file))
def increment_rt_counts(tweet_pks): """ :param tweet_pks: dictionary {tweet_pk: rt_count} :return: """ items = sorted(tweet_pks.items(), key=lambda x: x[1], reverse=True) grouped = groupby(lambda x: x[1], items) for incr, pairs in grouped.items(): if incr > 0: pks = pluck(0, pairs) TweetFeatures.objects.filter(tweet_id__in=pks).update(count_rts=F('count_rts') + incr)
def broadcast_dimensions(argpairs, numblocks, sentinels=(1, (1,)), consolidate=None): """ Find block dimensions from arguments Parameters ---------- argpairs: iterable name, ijk index pairs numblocks: dict maps {name: number of blocks} sentinels: iterable (optional) values for singleton dimensions consolidate: func (optional) use this to reduce each set of common blocks into a smaller set Examples -------- >>> argpairs = [('x', 'ij'), ('y', 'ji')] >>> numblocks = {'x': (2, 3), 'y': (3, 2)} >>> broadcast_dimensions(argpairs, numblocks) {'i': 2, 'j': 3} Supports numpy broadcasting rules >>> argpairs = [('x', 'ij'), ('y', 'ij')] >>> numblocks = {'x': (2, 1), 'y': (1, 3)} >>> broadcast_dimensions(argpairs, numblocks) {'i': 2, 'j': 3} Works in other contexts too >>> argpairs = [('x', 'ij'), ('y', 'ij')] >>> d = {'x': ('Hello', 1), 'y': (1, (2, 3))} >>> broadcast_dimensions(argpairs, d) {'i': 'Hello', 'j': (2, 3)} """ # List like [('i', 2), ('j', 1), ('i', 1), ('j', 2)] argpairs2 = [(a, ind) for a, ind in argpairs if ind is not None] L = toolz.concat([zip(inds, dims) for (x, inds), (x, dims) in toolz.join(toolz.first, argpairs2, toolz.first, numblocks.items())]) g = toolz.groupby(0, L) g = dict((k, set([d for i, d in v])) for k, v in g.items()) g2 = dict((k, v - set(sentinels) if len(v) > 1 else v) for k, v in g.items()) if consolidate: return toolz.valmap(consolidate, g2) if g2 and not set(map(len, g2.values())) == set([1]): raise ValueError("Shapes do not align %s" % g) return toolz.valmap(toolz.first, g2)
def collate_discovery_messages(encoded_blobs): all_messages = tuple(map(decode_discovery_message, encoded_blobs)) messages_by_type = groupby(type, all_messages) ping_blobs = tuple(rlp.encode(msg) for msg in messages_by_type[RLPPing]) pong_blobs = tuple(rlp.encode(msg) for msg in messages_by_type[RLPPong]) find_node_blobs = tuple( rlp.encode(msg) for msg in messages_by_type[RLPFindNode]) neighbours_blobs = tuple( rlp.encode(msg) for msg in messages_by_type[RLPNeighbours]) return ping_blobs, pong_blobs, find_node_blobs, neighbours_blobs
def unique_mentions_per_word(mentions, field):
    """Count of unique mentions per previous/next-word

    Parameters:
        mentions, list: a list of Mention objects
        field, string : can be one of `('previous_word', 'next_word')`

    Returns:
        a dictionary with words as keys and counts as values
    """
    d = defaultdict(int)
    groups = cytoolz.groupby(lambda x: x[field], mentions)
    for k, g in groups.items():
        d[k] = count(unique(g, lambda x: x.text))
    return d
def parse_keypoints(content, outdir):
    keypoints = dict(zip(range(1, len(content['categories'][0]['keypoints']) + 1),
                         content['categories'][0]['keypoints']))
    # merge images and annotations: id in images vs image_id in annotations
    # materialize the map() so the list can be iterated more than once in Python 3
    merged_info_list = list(map(cytoolz.merge,
                                cytoolz.join('id', content['images'],
                                             'image_id', content['annotations'])))
    # convert category name to person
    for keypoint in merged_info_list:
        keypoint['category_id'] = "person"
    # group by filename to pool all bbox and keypoint in same file
    for name, groups in cytoolz.groupby('file_name', merged_info_list).items():
        filename = os.path.join(outdir, os.path.splitext(name)[0] + ".xml")
        anno_tree = keypoints2xml_base(groups[0])
        for group in groups:
            anno_tree = keypoints2xml_object(group, anno_tree, keypoints, bbox_type="xyxy")
        doc = etree.ElementTree(anno_tree)
        doc.write(open(filename, "wb"), pretty_print=True)  # lxml writes bytes, so open in binary mode
        print("Formatting keypoints xml file {} done!".format(name))
def extend_chunk(self, seq): self._open_files() grouper = self.grouper npart = self.npartitions groups = groupby(grouper, seq) # Unify groups that hash the same groups2 = dict() for k, v in groups.items(): key = hash(k) % self.npartitions if key not in groups2: groups2[key] = [] groups2[key].extend(v) # Store to disk for k, group in groups2.items(): if group: self.dump(group, self.files[k])
def parse_instance(content, outdir):
    categories = {d['id']: d['name'] for d in content['categories']}
    # merge images and annotations: id in images vs image_id in annotations
    # materialize the map() so the list can be iterated more than once in Python 3
    merged_info_list = list(map(cytoolz.merge,
                                cytoolz.join('id', content['images'],
                                             'image_id', content['annotations'])))
    # convert category id to name
    for instance in merged_info_list:
        instance['category_id'] = categories[instance['category_id']]
    # group by filename to pool all bbox in same file
    for name, groups in cytoolz.groupby('file_name', merged_info_list).items():
        anno_tree = instance2xml_base(groups[0])
        # if one file has multiple different objects, save it in each category sub-directory
        filenames = []
        for group in groups:
            filenames.append(os.path.join(outdir, re.sub(" ", "_", group['category_id']),
                                          os.path.splitext(name)[0] + ".xml"))
            anno_tree.append(instance2xml_bbox(group, bbox_type='xyxy'))
        for filename in filenames:
            etree.ElementTree(anno_tree).write(filename, pretty_print=True)
        print("Formatting instance xml file {} done!".format(name))
def make_key2file(fns): """Return a dictionary mapping well co-ordinates to filenames. Returns a dictionary where key are (plate, well) co-ordinates and values are lists of images corresponding to that plate and well. Parameters ---------- fns : list of string A list of Cellomics TIF files. Returns ------- wellchannel2file : dict {tuple : list of string} The dictionary mapping the (plate, well) co-ordinate to a list of files corresponding to that well. """ wellchannel2file = groupby(filename2coord, fns) return wellchannel2file
def get_interactions():
    dates = sorted(set(map(_g('date'), data['interactions'])))
    d = t.pipe(
        data['interactions'],
        tc.groupby(lambda i: i.student),
        tc.valmap(lambda x: t.pipe(
            t.groupby(lambda i: i.date, x),
            tc.valmap(lambda v: [v[0].time_in, v[0].time_out]))))
    mat = [['student'] + dates]
    for student, attendance in d.items():
        record = [student]
        for dt in dates:
            if dt in attendance:
                record.append(attendance[dt])
            elif dt in data['students'][student].absences:
                record.append(('', ''))
            else:
                record.append((None, None))
        mat.append(record)
    return {'interactions': mat}
def compute_up(t, seq, **kwargs): if ((isinstance(t.apply, Reduction) and type(t.apply) in binops) or (isinstance(t.apply, Summary) and builtins.all(type(val) in binops for val in t.apply.values))): grouper, binop, combiner, initial = reduce_by_funcs(t) d = reduceby(grouper, binop, seq, initial) else: grouper = rrowfunc(t.grouper, t._child) groups = groupby(grouper, seq) d = dict((k, compute(t.apply, {t._child: v})) for k, v in groups.items()) if isscalar(t.grouper.dshape.measure): keyfunc = lambda x: (x,) else: keyfunc = identity if isscalar(t.apply.dshape.measure): valfunc = lambda x: (x,) else: valfunc = identity return tuple(keyfunc(k) + valfunc(v) for k, v in d.items())
def compute_one(t, seq, **kwargs): grouper = rrowfunc(t.grouper, t.child) if (isinstance(t.apply, Reduction) and type(t.apply) in binops): binop, initial = binops[type(t.apply)] applier = rrowfunc(t.apply.child, t.child) def binop2(acc, x): return binop(acc, applier(x)) d = reduceby(grouper, binop2, seq, initial) else: groups = groupby(grouper, seq) d = dict((k, compute(t.apply, {t.child: v})) for k, v in groups.items()) if t.grouper.iscolumn: return d.items() else: return tuple(k + (v,) for k, v in d.items())
def compute(t, seq): parent = compute(t.parent, seq) if (isinstance(t.apply, Reduction) and type(t.apply) in binops): binop, initial = binops[type(t.apply)] applier = rowfunc(t.apply.parent) grouper = rowfunc(t.grouper) def binop2(acc, x): return binop(acc, applier(x)) d = reduceby(grouper, binop2, parent, initial) else: grouper = rowfunc(t.grouper) groups = groupby(grouper, parent) d = dict((k, compute(t.apply, v)) for k, v in groups.items()) if t.grouper.iscolumn: return d.items() else: return tuple(k + (v,) for k, v in d.items())
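# The three compute variants above switch between two split-apply-combine
# strategies: reduceby streams and folds each group with a binary op, while
# groupby materializes the groups and applies an arbitrary reduction afterwards.
# A minimal sketch (sample rows are illustrative) showing that both produce the
# same per-key sums:
from operator import add
from toolz import groupby, reduceby

rows = [('a', 1), ('a', 2), ('b', 5)]
key = lambda r: r[0]
val = lambda r: r[1]

via_groupby = {k: sum(map(val, grp)) for k, grp in groupby(key, rows).items()}
via_reduceby = reduceby(key, lambda acc, r: add(acc, val(r)), rows, 0)
assert via_groupby == via_reduceby == {'a': 3, 'b': 5}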
def collect(grouper, group, p, barrier_token): """ Collect partitions from disk and yield k,v group pairs """ d = groupby(grouper, p.get(group, lock=False)) return list(d.items())
def _group_clusters(docs, labels): """Group docs by their cluster labels.""" return [zip(*cluster)[1] for cluster in itervalues(toolz.groupby(operator.itemgetter(0), zip(labels, docs)))]
ct = bquery.ctable(z, rootdir=rootdir, ) print(ct) # -- pandas -- df = pd.DataFrame(z) with ctime(message='pandas'): result = df.groupby(['f0'])['f2'].sum() print(result) t_pandas = t_elapsed # -- cytoolz -- with ctime(message='cytoolz over bcolz'): # In Memory Split-Apply-Combine # http://toolz.readthedocs.org/en/latest/streaming-analytics.html?highlight=reduce#split-apply-combine-with-groupby-and-reduceby r = cytoolz.groupby(lambda row: row.f0, ct) result = valmap(compose(sum, pluck(2)), r) print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2))) print(result) # -- blaze + bcolz -- blaze_data = blz.Data(ct.rootdir) expr = blz.by(blaze_data.f0, sum_f2=blaze_data.f2.sum()) with ctime(message='blaze over bcolz'): result = blz.compute(expr) print('x{0} slower than pandas'.format(round(t_elapsed / t_pandas, 2))) print(result) # -- bquery -- with ctime(message='bquery over bcolz'): result = ct.groupby(['f0'], ['f2'])
def _aggregate_miner_data(raw_data): data_by_miner = groupby(0, raw_data) for miner, miner_data in data_by_miner.items(): _, block_hashes, gas_prices = map(set, zip(*miner_data)) yield MinerData(miner, len(set(block_hashes)), min(gas_prices))
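# groupby(0, raw_data) above relies on toolz/cytoolz accepting a non-callable
# key: an index groups tuples by that element. Sketch with made-up
# (miner, block_hash, gas_price) rows:
from cytoolz import groupby

raw_data = [('m1', 'h1', 5), ('m1', 'h2', 7), ('m2', 'h3', 3)]
groupby(0, raw_data)
# -> {'m1': [('m1', 'h1', 5), ('m1', 'h2', 7)], 'm2': [('m2', 'h3', 3)]}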
def groupby(self, key): return fdict(cytoolz.groupby(key, self)).valmap(flist)