def load_from_vars(
    self,
    context: "Context",
    vars_: List,
    wdir: PathInfo,
    skip_imports: Dict[str, Optional[List[str]]],
    stage_name: str = None,
):
    stage_name = stage_name or ""
    for index, item in enumerate(vars_):
        assert isinstance(item, (str, dict))
        if isinstance(item, str):
            path, _, keys_str = item.partition(":")
            keys = lfilter(bool, keys_str.split(","))

            path_info = wdir / path
            path = os.path.abspath(path_info)

            if path in skip_imports:
                if not keys and skip_imports[path] is None:
                    # allow specifying complete filepath multiple times
                    continue
                self.check_loaded(path, item, keys, skip_imports)

            context.merge_from(self.tree, path_info, select_keys=keys)
            skip_imports[path] = keys if keys else None
        else:
            joiner = "." if stage_name else ""
            meta = Meta(source=f"{stage_name}{joiner}vars[{index}]")
            context.merge_update(Context(item, meta=meta))
def filter_annotations(annotations, images):
    """Keep only the annotations that refer to images present in ``images``."""
    image_ids = funcy.lmap(lambda i: int(i["id"]), images)
    return funcy.lfilter(lambda a: int(a["image_id"]) in image_ids, annotations)
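# Usage sketch (hypothetical minimal COCO-style dicts): only annotations whose
# image_id matches an available image are kept.
#
#   images = [{"id": 1}, {"id": 2}]
#   annotations = [{"image_id": 1}, {"image_id": 3}]
#   filter_annotations(annotations, images)   # -> [{"image_id": 1}]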
def merge_from(
    self,
    tree,
    item: str,
    wdir: PathInfo,
    overwrite=False,
):
    path, _, keys_str = item.partition(":")
    select_keys = lfilter(bool, keys_str.split(",")) if keys_str else None
    path_info = wdir / path

    abspath = os.path.abspath(path_info)
    if abspath in self.imports:
        if not select_keys and self.imports[abspath] is None:
            return  # allow specifying complete filepath multiple times
        self.check_loaded(abspath, item, select_keys)

    ctx = Context.load_from(tree, path_info, select_keys)

    try:
        self.merge_update(ctx, overwrite=overwrite)
    except ReservedKeyError as exc:
        raise ReservedKeyError(exc.keys, item) from exc

    cp = ctx.imports[abspath]
    if abspath not in self.imports:
        self.imports[abspath] = cp
    elif cp:
        self.imports[abspath].extend(cp)
def _manual_ordering_strategy(
    layers_and_groups: list[LayerDirectoryElement],
    settings: AnyGroupSettings,
) -> list[LayerDirectoryElement]:
    """Sort `layers_and_groups` using `settings.order` as a guide."""
    ordered_directory_elements: list[LayerDirectoryElement] = []

    if not settings.order:
        raise RuntimeError('Order must be specified in settings.')

    for s in settings.order:
        try:
            if s.startswith(':'):
                matcher = lambda x: isinstance(x, Layer) and x.id == s[1:]
                thing_desc = f'layer id "{s[1:]}"'
            else:
                matcher = lambda x: isinstance(x, Path) and x.name == s
                thing_desc = f'group/directory "{s}"'

            matches = funcy.lfilter(matcher, layers_and_groups)
            if len(matches) != 1:
                raise RuntimeError(
                    f'Expected to find {thing_desc}. Found: {matches}',
                )
            thing = matches[0]
        except Exception as e:
            raise RuntimeError(
                f'Unexpected error processing `settings.order` element "{s}".'
                f' {e}',
            )

        ordered_directory_elements.append(thing)

    return ordered_directory_elements
def get_accepted_features(
    features: Collection[Feature], proposed_feature: Feature
) -> List[Feature]:
    """Deselect candidate features from list of all features

    Args:
        features: collection of all features in the ballet project: both
            accepted features and candidate ones that have not been accepted
        proposed_feature: candidate feature that has not been accepted

    Returns:
        list of features with the proposed feature not in it.

    Raises:
        ballet.exc.BalletError: Could not deselect exactly the proposed
            feature.
    """
    def eq(feature):
        """Features are equal if they have the same source

        At least in this implementation...
        """
        return feature.source == proposed_feature.source

    # deselect features that match the proposed feature
    result = lfilter(complement(eq), features)

    if len(features) - len(result) == 1:
        return result
    elif len(result) == len(features):
        raise BalletError(
            'Did not find match for proposed feature within \'contrib\'')
    else:
        raise BalletError(
            f'Unexpected condition (n_features={len(features)}, '
            f'n_result={len(result)})')
def remove_indentation(string: str):
    lines = string.split('\n')
    lines = lfilter(bool, lines)
    indents = [len(line) - len(line.lstrip()) for line in lines]
    base_indent = min(indents) if indents else 0
    lines = [x[base_indent:] for x in lines]
    # lines = lfilter(lambda s: len(s.replace(' ', '')), lines)
    # print(lines)
    return '\n'.join(lines)
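# Usage sketch (assumes `lfilter` is funcy.lfilter): blank lines are dropped
# and the smallest common indent is stripped from the remaining lines.
#
#   remove_indentation("    first\n\n        second\n")
#   # -> "first\n    second"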
def dfilter(call, pred):
    """Decorate a callable with a filter that accepts a predicate

    Example::

        >>> @dfilter(lambda x: x >= 0)
        ... def numbers():
        ...     return [-1, 2, 0, -2]
        >>> numbers()
        [2, 0]
    """
    return lfilter(pred, call())
def part1(data):
    bids = lfilter(lambda e: isinstance(e, int), data.bids)
    # the id of the next bus we can catch
    bid = bids[np.argmax([data.earliest % i for i in bids])]
    # the timestamp at which we can board this bus
    board = data.earliest - (data.earliest % bid) + bid
    # the time we have to wait
    wait = board - data.earliest
    return bid * wait
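# Worked check, assuming this is Advent of Code 2020 day 13 part 1 and that
# `data` exposes `earliest` (int) and `bids` (ints plus non-int placeholders
# such as "x" for out-of-service buses). Uses the published sample input.
from types import SimpleNamespace

_sample = SimpleNamespace(earliest=939, bids=[7, 13, "x", "x", 59, "x", 31, 19])
# remainders 939 % bus -> [1, 3, 54, 9, 8]; the largest remainder picks bus 59,
# board = 939 - 54 + 59 = 944, wait = 5, so the answer is 59 * 5 = 295
assert part1(_sample) == 295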
def parse_params(path_params):
    ret = []
    for path_param in path_params:
        path, _, params_str = path_param.rpartition(":")
        # remove empty strings from params, on condition such as `-p "file1:"`
        params = lfilter(bool, params_str.split(","))
        if not path:
            ret.extend(params)
        else:
            ret.append({path: params})
    return ret
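# Behaviour sketch derived from the function above (parameter strings in the
# style of a `-p path:param1,param2` CLI option; that exact context is an
# assumption):
#
#   parse_params(["lr,epochs"])               # -> ["lr", "epochs"]
#   parse_params(["params.yaml:lr,epochs"])   # -> [{"params.yaml": ["lr", "epochs"]}]
#   parse_params(["params.yaml:"])            # -> [{"params.yaml": []}]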
def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
    from dvc.dvcfile import is_valid_filename
    from dvc.ignore import DvcIgnore

    assert repo_walk
    try:
        _, dvc_dirs, dvc_fnames = (
            next(dvc_walk) if dvc_walk else (None, [], [])
        )
        repo_root, repo_dirs, repo_fnames = next(repo_walk)
    except StopIteration:
        return

    # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
    dvc_set = set(dvc_dirs)
    repo_set = set(repo_dirs)
    dvc_only = list(dvc_set - repo_set)
    repo_only = list(repo_set - dvc_set)
    shared = list(dvc_set & repo_set)
    dirs = shared + dvc_only + repo_only

    def _func(fname):
        if dvcfiles:
            return True
        return not (
            is_valid_filename(fname) or fname == DvcIgnore.DVCIGNORE_FILE
        )

    # merge file lists
    files = set(filter(_func, dvc_fnames + repo_fnames))

    yield repo_root, dirs, list(files)

    def is_dvc_repo(d):
        return self._is_dvc_repo(os.path.join(repo_root, d))

    # remove subrepos to prevent them from being traversed
    subrepos = set(filter(is_dvc_repo, repo_only))
    # set dir order for next recursion level - shared dirs first so that
    # next() for both generators recurses into the same shared directory
    dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
    repo_dirs[:] = lfilter(lambda d: d in (repo_set - subrepos), dirs)

    for dirname in dirs:
        if dirname in subrepos:
            dir_path = os.path.join(repo_root, dirname)
            yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
        elif dirname in shared:
            yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
        elif dirname in dvc_set:
            yield from self._dvc_walk(dvc_walk)
        elif dirname in repo_set:
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
def filter_annotations(annotations, images):
    """
    Downloaded from github.com/akarazniewicz/cocosplit.git@master and modified
    to just handle annotations.

    Function helping to create a new data split from an original COCO Dataset.
    :param annotations:
    :param images:
    :return:
    """
    image_ids = funcy.lmap(lambda i: int(i['id']), images)
    return funcy.lfilter(lambda a: int(a['image_id']) in image_ids, annotations)
def test_pqtree_after_reduce_chars_rand_examples():
    ITERATIONS = 1000
    merged = 0
    rts = []

    for i in range(ITERATIONS):
        iter_stats = {}

        # ------------------------- Generation of permutations -------------------------
        id_perm = list(range(1, 10))
        duplication_mutations(id_perm, 2)

        other_perms = [list(id_perm), list(id_perm)]
        for p in other_perms:
            mutate_collection(p, 2)

        ps = tmap(tuple, (id_perm, *other_perms))

        # ------------------------- Try to merge same adjacent chars -------------------------
        start_time = time.time()
        pq = PQTreeDup.from_perms_wth_multi(ps)
        iter_stats["merged"] = time.time() - start_time

        if not pq:
            continue
        else:
            merged += 1

        # ------------------------- find all the trees with minimal size -------------------------
        start_time = time.time()
        all_possibilities = list(PQTreeDup.from_perms(ps))
        iter_stats["no_merge"] = time.time() - start_time
        iter_stats["perms"] = ps

        best_size = all_possibilities[0].approx_frontier_size()
        only_best_sized = lfilter(lambda t: t.approx_frontier_size() == best_size, all_possibilities)

        # verify tree with multi chars contains in its frontier one of the best trees
        try:
            front = set(pq.frontier())
            assert any(front.issuperset(t.frontier()) for t in only_best_sized)
        except:
            print(ps)
            # PQTreeVisualizer.show_all(pq, *only_best_sized)
            raise
        else:
            rts.append(iter_stats)

    print(f"multi merged: {merged} / {ITERATIONS}")
    lmap(print, rts)
def check_tarantool_logs(logs: str):
    lines = logs.splitlines()

    is_ok = [
        # errors that are not actually errors and are OK to occur
        lambda l: "E> ER_LOADING: Instance bootstrap hasn't finished yet" in l,
        lambda l: "E> Cleanup" in l and "reload.lua" in l,
        lambda l: "SystemError unexpected EOF when reading from socket" in l,
        lambda l: "SystemError getaddrinfo: Name does not resolve" in l,
        lambda l: "ER_NO_SUCH_USER: User 'storage' is not found" in l,
        lambda l: "ER_ACCESS_DENIED: Session access to universe '' is denied for user 'storage'" in l,
        lambda l: "ER_ACCESS_DENIED: Read access to universe '' is denied for user 'storage'" in l,
        lambda l: "Exception during calling 'vshard.storage.buckets_count' on " in l
        and ": Invalid argument" in l,
        lambda l: '"code":77,"message":"Connection refused"' in l,
    ]

    all_errors = lfilter(lambda l: "E>" in l, lines)
    is_real_error = none_fn(*is_ok)
    real_errors = lfilter(is_real_error, all_errors)
    assert not real_errors, "found errors in Tarantool logs"
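# Usage sketch (hypothetical log excerpt; assumes `none_fn` is funcy.none_fn,
# i.e. the combined predicate is true only when no allow-listed check matches):
#
#   check_tarantool_logs(
#       "2023-01-01 10:00:00 I> ready\n"
#       "2023-01-01 10:00:01 E> ER_LOADING: Instance bootstrap hasn't finished yet\n"
#   )   # passes: the only "E>" line matches an allow-listed pattern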
def test_pqtree_with_merges_rand():
    ITERATIONS = 100
    merged = 0
    for i in range(ITERATIONS):
        id_perm = list(range(1, 100))
        duplication_mutations(id_perm, 5)

        other_perms = [list(id_perm), list(id_perm)]
        for p in other_perms:
            mutate_collection(p, 2)

        ps = tmap(tuple, (id_perm, *other_perms))

        pqs = list(PQTreeDup.from_perms_with_merge(ps))
        if not pqs:
            continue
        else:
            merged += 1

        best_size_found = min(pq.approx_frontier_size() for pq in pqs)
        only_best_sized_found = tfilter_fx_eq(PQTree.approx_frontier_size, best_size_found, pqs)

        all_possibilities = list(PQTreeDup.from_perms(ps))
        best_size = min(t.approx_frontier_size() for t in all_possibilities)
        only_best_sized = lfilter(lambda t: t.approx_frontier_size() == best_size, all_possibilities)
        only_best_sized_parens = smap(PQTree.to_parens, only_best_sized)

        try:
            assert best_size_found == best_size
            assert all(len(list(pq.frontier())) == best_size for pq in only_best_sized_found)
            assert any(pq.to_parens() in only_best_sized_parens for pq in only_best_sized_found)
        except:
            print("same chars together:", any(any(o[1] != 1 for o in iter_char_occurrence(p)) for p in ps))
            print(f"best no opt: {best_size}, best with merge: {best_size_found}")
            print(f"Merged best: {[pq.to_parens() for pq in only_best_sized_found]}")
            print(f"Merged all: {[pq.to_parens() for pq in pqs]}")
            print(ps)
            print(f"actual front sizes = {[len(list(t.frontier())) for t in only_best_sized]}")
            print([t.to_parens() for t in all_possibilities])
            print([t.to_parens() for t in only_best_sized])
            # PQTreeVisualizer.show_all(pq, *only_best_sized)
            continue

    print(f"merged {merged}")
def _find_redundant_intervals(cls, intervals: CommonIntervals) -> IntervalSet:
    no_singleton_intervals = lfilter(
        lambda inter: inter.first_end != inter.first_start, intervals)

    intersection_dict = {
        ci: cls._intersects_with(no_singleton_intervals, ci)
        for ci in no_singleton_intervals
    }

    start_end_index = {(ci.first_start, ci.first_end): ci
                       for ci in no_singleton_intervals}

    redundant = {
        start_end_index.get((src.first_start, dest.first_end))
        for src in no_singleton_intervals
        for dest in intersection_dict[src]
    }

    return redundant - {None}
def from_irreducible_intervals(
        cls, intervals: Set[CommonInterval]) -> 'IntervalHierarchy':
    ih = IntervalHierarchy()

    for interval in intervals:
        include_interval = lfilter(interval.included_in_other, intervals - {interval})
        nest_level = len(include_interval)
        ih.nesting_levels[nest_level].append(interval)

    lmap(lambda l: l.sort(key=lambda ci: ci.first_start), ih.nesting_levels.values())

    ih.reverse_index = {
        ci: lvl
        for lvl, ci_lst in ih.nesting_levels.items()
        for ci in ci_lst
    }

    return ih
def push_branches_to_remote(repo: git.Repo, remote_name: str, branches: Iterable[str]):
    """Push selected branches to origin

    Similar to::

        $ git push origin branch1:branch1 branch2:branch2

    Raises:
        ballet.exc.BalletError: Push failed in some way
    """
    remote = repo.remote(remote_name)
    result = remote.push([f'{b}:{b}' for b in branches])
    failures = lfilter(complement(did_git_push_succeed), result)
    if failures:
        for push_info in failures:
            logger.error(f'Failed to push ref {push_info.local_ref.name} to '
                         f'{push_info.remote_ref.name}')
        raise BalletError('Push failed')
def _push(project):
    """Push default branch and project template branch to remote

    With default config (i.e. remote and branch names), equivalent to::

        $ git push origin master:master project-template:project-template

    Raises:
        ballet.exc.BalletError: Push failed in some way
    """
    repo = project.repo
    remote_name = project.config.get('github.remote')
    remote = repo.remote(remote_name)
    result = _call_remote_push(remote)
    failures = lfilter(complement(did_git_push_succeed), result)
    if failures:
        for push_info in failures:
            logger.error('Failed to push ref {from_ref} to {to_ref}'.format(
                from_ref=push_info.local_ref.name,
                to_ref=push_info.remote_ref.name))
        raise BalletError('Push failed')
def serializer_for(self, type: Type) -> Type[Serializer]:
    if self._serializer_map.get(type):
        return self._serializer_map[type]
    else:
        # Check if the type is a subclass of any of the defined serializers
        possible_bindings = funcy.lfilter(
            lambda t: issubclass(type, t), self._bindings
        )
        if len(possible_bindings) == 0:
            # No serializer found. Return the default serializer
            if self._default_serializer:
                return self._default_serializer
            else:
                raise Exception(f"No serializer could be found for the type {type}")
        elif len(possible_bindings) == 1:
            return possible_bindings[0][1]
        else:
            logger.warning(
                f"More than one serializer found for type {type}. "
                "Choosing the first one."
            )
            return itertoolz.first(possible_bindings)[1]
def merge_from(self, fs, item: str, wdir: str, overwrite=False):
    path, _, keys_str = item.partition(":")
    path = os.path.normpath(fs.path.join(wdir, path))

    select_keys = lfilter(bool, keys_str.split(",")) if keys_str else None
    if path in self.imports:
        if not select_keys and self.imports[path] is None:
            return  # allow specifying complete filepath multiple times
        self.check_loaded(path, item, select_keys)

    ctx = Context.load_from(fs, path, select_keys)

    try:
        self.merge_update(ctx, overwrite=overwrite)
    except ReservedKeyError as exc:
        raise ReservedKeyError(exc.keys, item) from exc

    cp = ctx.imports[path]
    if path not in self.imports:
        self.imports[path] = cp
    elif cp:
        self.imports[path].extend(cp)
def can_merge_multi_chars(cls, context_perms):
    """
    Will return a dictionary of the structure:
    {
        char1: {
            perm1: [ContextChar1, ..., ContextCharK],
            perm2: [...]
        },
        char2: {...}
    }

    Each context char has a common neighbour with at LEAST ONE context char
    for EACH other perm.
    """
    more_than_once = cls.find_multi_chars(context_perms[0])

    def common_neighbour_set(char):
        perm_neighbours = lambda cperm: set(flatten(
            [cc.left_char, cc.right_char] for cc in cperm if cc.char == char
        ))
        common_neigbours = reduce(operator.__and__, map(perm_neighbours, context_perms))
        return common_neigbours - {None}

    def neighbours_of(char_col, char):
        return [cc for cc in char_col if cc.left_char == char or cc.right_char == char]

    mergable_chars = {}
    for char in more_than_once:
        neighbours = common_neighbour_set(char)
        if neighbours:
            cc_anywhere = lfilter(lambda cc: cc.char == char, chain(*context_perms))
            cc_with_common_neighbours = sflatmap1(neighbours_of, cc_anywhere, neighbours)
            mergable_chars_per_perm = group_by_attr('perm', cc_with_common_neighbours)
            mergable_chars[char] = mergable_chars_per_perm
            # mergable_chars[char] = neighbours_of(filter(lambda cc: cc.char == char, chain(*context_perms)), char)

    return mergable_chars
def assert_valid_bst(mode, ixy_map, ixy_arr, tree, n_inserted, n_node):
    ''' tree is bst '''
    key = prop(mode)

    # Num of leaves ixy ref = num of inserted ixys
    # Parent must be positive value except root.
    for i, node in enumerate(tree[1:n_inserted+1]):
        assert node.parent >= 0, (n_inserted, i, pyobj(node))

    # Get ixy idxes from tree structure
    ixy_idxes = all_ixy_idxes(  #tup_tree(tree[:n_inserted+50]))
        tup_tree(tree[:n_node+100]))

    if DBG: print(f' after[{n_node}]',  #tup_tree(tree[:n_node+10]))
                  [f'{p} {l} {r}' for _, p, l, r in tup_tree(tree[:n_node+10])])  ###########
    if DBG: print('iidxes', ixy_idxes)
    if DBG: print('n_node =', n_node)

    # Inserted number of ixys preserved?
    no0idxes = F.compact([abs(i) for i in ixy_idxes])
    assert n_inserted == len(no0idxes), \
        'ixy_idxes = {}, tup_tree = {}'.format(
            ixy_idxes, tup_tree(tree[:n_inserted+4]))

    # All ixy have unique index.
    assert len(set(no0idxes)) == n_inserted, \
        f'{len(set(no0idxes))} == {n_inserted}'

    # All leaves point ixy(neg idx), not inode.
    assert all(idx <= 0 for idx in ixy_idxes), \
        'ixy_idxes = {}, tree = {}'.format(
            ixy_idxes, tup_tree(tree[:n_inserted+4]))

    # Inserted ixys are sorted in ascending order.
    inserted_ixys = F.lmap(lambda i: ixy_arr[abs(i)], ixy_idxes)
    for ixy1, ixy2 in F.pairwise(inserted_ixys):
        assert key(ixy1) <= key(ixy2), 'tree = {}' \
            .format(tup_tree(tree[:n_inserted+4]))

    # All leaves: l <= r
    leaves = F.lfilter(is_leaf, tree[:n_inserted+4])
    for leaf in leaves:
        l = leaf.left; r = leaf.right
        if l and r:
            l_val = key(ixy_map[abs(l)])
            r_val = key(ixy_map[abs(r)])
            assert l_val <= r_val

    # All inodes must be sorted in ascending order.
    inodes = all_inodes(tup_tree(tree[:n_node+100]))
    for n1, n2 in F.pairwise(inodes):
        k1 = n1[0]; k2 = n2[0]
        assert k1 <= k2

    # Inserted ixys are sorted in ascending order.
    neg_idxeseq = F.mapcat(tup(
        lambda k, p, l, r: ((l,) if l < 0 else ()) + ((r,) if r < 0 else ())),
        inodes)
    ixy_idxes = F.map(abs, neg_idxeseq)
    saved_ixys = F.map(lambda i: pyobj(ixy_arr[i]), ixy_idxes)
    keys = F.lmap(key, saved_ixys)
    for k1, k2 in F.pairwise(keys):
        assert k1 <= k2
def lfilter(f, *seq):
    return F.lfilter(f, *seq) if seq \
        else lambda *xs: F.lfilter(f, *xs)
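# Usage sketch (assumes `F` is the funcy module): with a sequence the wrapper
# behaves like funcy.lfilter directly; without one it returns a partially
# applied filter.
#
#   lfilter(lambda x: x % 2 == 0, [1, 2, 3, 4])   # -> [2, 4]
#   evens = lfilter(lambda x: x % 2 == 0)
#   evens([1, 2, 3, 4])                           # -> [2, 4]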
def register_handler(self, handler: CommandHandler):
    handled_commands = lfilter(lambda t: t is not object, handler.execute.registry)
    for handled_command in handled_commands:
        self.handlers[handled_command] = handler
class Ops:
    const = has_args(lambda x: lambda _: x)

    @has_args
    def multi(coll):
        def make_apply(el):
            return lambda f: f(el) if callable(f) else f

        if is_mapping(coll):
            return lambda el: walk_values(make_apply(el), coll)
        else:
            return lambda el: lmap(make_apply(el), coll)

    # Traverse
    css = has_args(
        lambda selector: _list_mapcat(lambda el: el.cssselect(selector)))
    xpath = has_args(lambda query, **params: _list_mapcat(lambda el: el.xpath(
        query, **params)))
    parent = _list_map(lambda el: el.getparent())
    prev = _list_map(lambda el: el.getprevious())
    next = _list_map(lambda el: el.getnext())

    # Microdata
    @has_args
    def itemscope(name):
        return C.css(f'[itemscope][itemprop*={name}]')

    @has_args
    def itemprop(name):
        return C.css(f'[itemprop*={name}]')

    @has_args
    def microdata(name):
        return C.css(f'[itemprop*={name}]').map(
            C.attr('content') | C.inner_text)

    def ld(node):
        text = C.css('script[type="application/ld+json"]').inner_text(node)
        try:
            return json.loads(text)
        except ValueError as e:
            try:
                # Try parsing non-strict
                import demjson
                return demjson.decode(text)
            except:
                raise e  # reraise first one

    # Select
    def get(els):
        if len(els) == 0:
            raise ValueError("Trying to get value from empty list: %r..." % els[:3])
        if len(els) > 1:
            raise ValueError(
                "Trying to get single value from multivalue list: %r..." % els[:3])
        return first(els)

    first = first
    second = second
    last = last
    slice = has_args(lambda start, stop=None, step=None: lambda val: val[slice(
        start, stop, step)])

    # Access
    text = _list_first(lambda el: el.text)
    texts = lambda els: [el.text for el in els]
    tail = _list_first(lambda el: el.tail)
    attr = has_args(lambda name: _list_first(lambda el: el.attrib.get(name)))
    attrs = has_args(
        lambda name: lambda els: [el.attrib.get(name) for el in els])

    @_list_first
    def head(el):
        prev = el.getprevious()
        return prev.tail if prev is not None else el.getparent().text

    inner_text = _list_first(
        lambda el: lxml.html.tostring(el, encoding='unicode', method='text'))
    inner_html = _list_first(lambda el: (el.text or '') + ''.join(
        lxml.html.tostring(sub, encoding='unicode') for sub in el))
    outer_html = _list_first(
        lambda el: lxml.html.tostring(el, encoding='unicode'))

    @_list_first
    def html_to_text(html):
        """Cleans html preserving newlines"""
        if isinstance(html, lxml.html.HtmlElement):
            html = Ops.inner_html(html)

        html = re.sub(r'\s+', ' ', html).strip()
        html = re.sub(r'<br[^>]*>|</li>', '\n', html, flags=re.I)
        html = re.sub(r'</p>', '\n\n', html, flags=re.I)
        if not html or html.isspace():
            return ''
        return lxml.html.tostring(lxml.html.fromstring(html),
                                  encoding='unicode', method='text')

    # Text utils
    # TODO: make these two work with bytes?
    trim = lambda text: str.strip(text)
    strip = has_args(lambda dirt=None: lambda text: str.strip(text, dirt))
    normspace = normalize_whitespace = lambda text: re.sub(r'\s+', ' ', text).strip()
    split = has_args(lambda by: lambda text: text.split(by))
    re = has_args(re_finder)

    @has_args
    def re_sub(pattern, repl, count=0, flags=0):
        return lambda text: re.sub(
            pattern, repl, text, count=count, flags=flags)

    # Data utils
    len = len

    @has_args
    def map(f):
        if not callable(f) and isinstance(f, (Mapping, Sequence)):
            f = C.multi(f)
        return lambda els: lmap(f, els)

    filter = has_args(lambda pred: lambda seq: lfilter(pred, seq))

    # Data cleaning
    float = float
    int = int
    clean_float = lambda text: float(
        re.sub(r'[^\d,.]', '', text).replace(',', '.'))
    clean_int = lambda text: int(re.sub(r'\D', '', text))
    date = dateparser.parse

    def duration(text):
        regexes = [
            r'()(?:(\d\d):)?(\d\d):(\d\d)(?:\s|$)',
            re.compile(
                r'''\s*
                    (?:(\d+)\s*д[еньяй.]*)?
                    \s*
                    (?:(\d+)\s*ч[ас.]*)?
                    \s*
                    (?:(\d+)\s*м[инуты.]*)?
                    ()''', re.I | re.X)
        ]
        for regex in regexes:
            m = re_find(regex, text)
            if m:
                days, hours, minutes, seconds = [
                    silent(int)(p) or 0 for p in m
                ]
                if days == hours == minutes == 0:
                    return None
                # convert to total seconds: days -> hours -> minutes -> seconds
                return ((days * 24 + hours) * 60 + minutes) * 60 + seconds
def listens_to(self):
    return lfilter(lambda t: t is not object, self._process.registry)
def main(args):
    with open(args.annotations, 'rt', encoding='UTF-8') as annotations:
        coco = json.load(annotations)

    info = coco['info']
    licenses = coco['licenses']
    images = coco['images']
    annotations = coco['annotations']
    for item in coco["categories"]:
        item['name'] = CLASSES[int(item['id']) - 1]
    categories = coco['categories']
    annotations = [item for item in annotations if item['area'] > 0]

    number_of_images = len(images)

    images_with_annotations = funcy.lmap(lambda a: int(a['image_id']), annotations)

    if args.having_annotations:
        images = funcy.lremove(
            lambda i: i['id'] not in images_with_annotations, images)

    x = [
        item for item in images
        if item['file_name'].split('/')[1] not in (test_set + exclude_set)
    ]
    y = [
        item for item in images
        if item['file_name'].split('/')[1] in test_set
    ]

    if args.split > 0 and args.split < 1:
        x_train, x_val = train_test_split(x, train_size=args.split)
    else:
        x_val = y.copy()
        x_train = x.copy()

    if args.sample:
        random.shuffle(x_train)
        random.shuffle(x_val)
        random.shuffle(y)
        x_train = x_train[:int(len(x_train) * args.sample)]
        x_val = x_val[:int(len(x_val) * args.sample)]
        y = y[:int(len(y) * args.sample)]

    if not args.coco_category:
        category_map = {
            item['id']: select_classes.index(item['name'])
            for item in categories if item['name'] in select_classes
        }
    else:
        category_map = {
            item['id']: int(item['id'] - 1)
            for item in categories if item['name'] in select_classes
        }

    root = args.root
    label_folder = os.path.join(root, 'labels')
    if not os.path.exists(label_folder):
        os.makedirs(label_folder)
    # else:
    #     print('delete {} ...'.format(label_folder))
    #     shutil.rmtree(label_folder)
    #     os.makedirs(label_folder)

    image_folder = os.path.join(root, 'images')
    if not os.path.exists(image_folder):
        os.makedirs(image_folder)

    lists = [x_train, x_val, y]
    lists_mode = ['train', 'val', 'test']
    if 'thermal' in args.annotations:
        suffix = '_thermal'
    else:
        suffix = '_rgb'
    lists_mode = [item + suffix for item in lists_mode]

    for items, mode in zip(lists, lists_mode):
        if not os.path.exists(os.path.join(label_folder, mode)):
            os.makedirs(os.path.join(label_folder, mode))
        if not os.path.exists(os.path.join(image_folder, mode)):
            os.makedirs(os.path.join(image_folder, mode))
        for item in items:
            txt_name = os.path.join(
                label_folder, mode,
                item['file_name'].replace('png', 'txt').replace('/', '_'))
            if not args.label_only:
                image_name = os.path.join(
                    image_folder, mode, item['file_name'].replace('/', '_'))
                shutil.copyfile(os.path.join(root, item['file_name']), image_name)
            # item['file_name'] = item['file_name'].replace('/','_')
            anns = funcy.lfilter(
                lambda a: int(a['image_id']) in [item['id']], annotations)
            fid = open(txt_name, 'w')
            for ann in anns:
                if ann['category_id'] in category_map:
                    bbox = ann['bbox']
                    bbox[0] = np.max([0., bbox[0]])
                    bbox[1] = np.max([0., bbox[1]])
                    bbox[2] = np.min(
                        [bbox[0] + bbox[2], item['width'] - 1]) - bbox[0]
                    bbox[3] = np.min(
                        [bbox[1] + bbox[3], item['height'] - 1]) - bbox[1]
                    if bbox[2] * bbox[3] > 0:
                        fid.write(
                            '%d %f %f %f %f\n' %
                            (category_map[ann['category_id']],
                             (bbox[0] + bbox[2] / 2.0) / item['width'],
                             (bbox[1] + bbox[3] / 2.0) / item['height'],
                             bbox[2] / item['width'],
                             bbox[3] / item['height']))
            fid.close()

    print("Saved {} entries in train, {} in val, and {} in test".format(
        len(x_train), len(x_val), len(y)))
def scrape_comments(mongo, batch_size=250, max_workers=50):
    """ Parse operations and post-process for comment/post extraction. """
    indexer = Indexer(mongo)
    start_block = indexer.get_checkpoint('comments')

    query = {
        "type": "comment",
        "block_num": {
            "$gt": start_block,
            "$lte": start_block + batch_size,
        }
    }
    projection = {
        '_id': 0,
        'block_num': 1,
        'author': 1,
        'permlink': 1,
    }
    results = list(mongo.Operations.find(query, projection=projection))
    identifiers = set(f"{x['author']}/{x['permlink']}" for x in results)

    # handle an edge case when we are too close to the head,
    # and the batch contains no work to do
    if not results and is_recent(start_block, days=1):
        return

    # get Post.export() results in parallel
    raw_comments = thread_multi(
        fn=get_comment,
        fn_args=[None],
        dep_args=list(identifiers),
        max_workers=max_workers,
        yield_results=True)
    raw_comments = lkeep(raw_comments)

    # split into root posts and comments
    posts = lfilter(lambda x: x['depth'] == 0, raw_comments)
    comments = lfilter(lambda x: x['depth'] > 0, raw_comments)

    # Mongo upsert many
    log_output = ''
    if posts:
        r = mongo.Posts.bulk_write(
            [UpdateOne({'identifier': x['identifier']},
                       {'$set': {**x, 'updatedAt': dt.datetime.utcnow()}},
                       upsert=True)
             for x in posts],
            ordered=False,
        )
        log_output += \
            f'(Posts: {r.upserted_count} upserted, {r.modified_count} modified) '
    if comments:
        r = mongo.Comments.bulk_write(
            [UpdateOne({'identifier': x['identifier']},
                       {'$set': {**x, 'updatedAt': dt.datetime.utcnow()}},
                       upsert=True)
             for x in comments],
            ordered=False,
        )
        log_output += \
            f'(Comments: {r.upserted_count} upserted, {r.modified_count} modified) '

    # We are only querying {type: 'comment'} blocks and sometimes
    # the gaps are larger than the batch_size.
    index = silent(max)(lpluck('block_num', results)) or (start_block + batch_size)
    indexer.set_checkpoint('comments', index)

    log.info(f'Checkpoint: {index} {log_output}')
def filter_annotations(annotations, images):
    image_ids = funcy.lmap(lambda i: int(i['id']), images)
    return funcy.lfilter(lambda a: int(a['image_id']) in image_ids, annotations)
def main(args):
    with open(args.annotations, "rt", encoding="UTF-8") as annotations:
        coco = json.load(annotations)

    info = coco["info"]
    licenses = coco["licenses"]
    images = coco["images"]
    annotations = coco["annotations"]
    categories = coco["categories"]
    print(coco.keys())

    print("Original", len(images))

    def nothing():
        pass

    funcy.lmap(
        lambda a: print(
            a, next(i for i in images if i["id"] == a["image_id"]))
        if a["segmentation"] == [] else nothing(),
        annotations,
    )

    print("Annotations", len(annotations))

    a2 = []
    for i in range(len(annotations)):
        if max(annotations[i]["bbox"][2], annotations[i]["bbox"][3]) < 50 or min(
                annotations[i]["bbox"][2], annotations[i]["bbox"][3]) < 30:
            pass
        else:
            a2.append(annotations[i])
    annotations = a2
    print("Annotations filtered by size", len(annotations))

    c2 = []
    ch = []
    for c in categories:
        if c["name"] == "human" or c["name"] == "car":
            ch.append(c["id"])
        else:
            c2.append(c)
    print(len(c2), len(categories))
    categories = c2

    a2 = []
    for i in range(len(annotations)):
        if annotations[i]["category_id"] in ch:
            pass
        else:
            a2.append(annotations[i])
    annotations = a2
    print("Annotations filtered cars and humans", len(annotations))

    images_with_annotations = funcy.lmap(lambda a: int(a["image_id"]), annotations)
    images = funcy.lremove(
        lambda i: i["id"] not in images_with_annotations, images)
    print("Removed empty images", len(images))

    images = funcy.lremove(lambda i: "copy" in i["file_name"].lower(), images)
    print("Removed copy", len(images))

    def f(e):
        return e["file_name"]

    images.sort(key=f)
    images = images[-300:]
    # funcy.lmap(lambda i : print(i['file_name'][9:12], end="\t"), images)
    print(len(images))

    no_segm = funcy.lfilter(lambda a: len(a["segmentation"]) == 0, annotations)
    print(len(no_segm), len(annotations))
    image_ids = funcy.lmap(lambda i: i["image_id"], no_segm)
    funcy.lmap(
        lambda i: print("! no segm annot in #" + i["file_name"])
        if i["id"] in image_ids else nothing(),
        images,
    )

    save_coco(
        args.annotations,
        info,
        licenses,
        images,
        filter_annotations(annotations, images),
        categories,
    )