Example #1
    def load_from_vars(
        self,
        context: "Context",
        vars_: List,
        wdir: PathInfo,
        skip_imports: Dict[str, Optional[List[str]]],
        stage_name: str = None,
    ):
        stage_name = stage_name or ""
        for index, item in enumerate(vars_):
            assert isinstance(item, (str, dict))
            if isinstance(item, str):
                path, _, keys_str = item.partition(":")
                keys = lfilter(bool, keys_str.split(","))

                path_info = wdir / path
                path = os.path.abspath(path_info)

                if path in skip_imports:
                    if not keys and skip_imports[path] is None:
                        # allow specifying complete filepath multiple times
                        continue
                    self.check_loaded(path, item, keys, skip_imports)

                context.merge_from(self.tree, path_info, select_keys=keys)
                skip_imports[path] = keys if keys else None
            else:
                joiner = "." if stage_name else ""
                meta = Meta(source=f"{stage_name}{joiner}vars[{index}]")
                context.merge_update(Context(item, meta=meta))
Example #2
def filter_annotations(annotations, images):
    """
    Filter out annotations for available images.
    """
    image_ids = funcy.lmap(lambda i: int(i["id"]), images)
    return funcy.lfilter(lambda a: int(a["image_id"]) in image_ids,
                         annotations)
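A quick sanity check of the helper above on a couple of made-up COCO-style records (the ids are invented; only the id-matching logic matters):

images = [{"id": 1, "file_name": "a.png"}, {"id": 2, "file_name": "b.png"}]
annotations = [
    {"id": 10, "image_id": 1},   # kept: image 1 is in `images`
    {"id": 11, "image_id": 3},   # dropped: image 3 is not
]

# Uses filter_annotations() as defined in the example above (which needs `import funcy`).
print(filter_annotations(annotations, images))
# -> [{'id': 10, 'image_id': 1}]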
Example #3
    def merge_from(
        self, tree, item: str, wdir: PathInfo, overwrite=False,
    ):
        path, _, keys_str = item.partition(":")
        select_keys = lfilter(bool, keys_str.split(",")) if keys_str else None
        path_info = wdir / path

        abspath = os.path.abspath(path_info)
        if abspath in self.imports:
            if not select_keys and self.imports[abspath] is None:
                return  # allow specifying complete filepath multiple times
            self.check_loaded(abspath, item, select_keys)

        ctx = Context.load_from(tree, path_info, select_keys)

        try:
            self.merge_update(ctx, overwrite=overwrite)
        except ReservedKeyError as exc:
            raise ReservedKeyError(exc.keys, item) from exc

        cp = ctx.imports[abspath]
        if abspath not in self.imports:
            self.imports[abspath] = cp
        elif cp:
            self.imports[abspath].extend(cp)
Example #4
def _manual_ordering_strategy(
    layers_and_groups: list[LayerDirectoryElement],
    settings: AnyGroupSettings,
) -> list[LayerDirectoryElement]:
    """Sort `layers_and_groups` using `settings.order` as a guide."""
    ordered_directory_elements: list[LayerDirectoryElement] = []

    if not settings.order:
        raise RuntimeError('Order must be specified in settings.')

    for s in settings.order:
        try:
            if s.startswith(':'):
                matcher = lambda x: isinstance(x, Layer) and x.id == s[1:]
                thing_desc = f'layer id "{s[1:]}"'
            else:
                matcher = lambda x: isinstance(x, Path) and x.name == s
                thing_desc = f'group/directory "{s}"'

            matches = funcy.lfilter(matcher, layers_and_groups)
            if len(matches) != 1:
                raise RuntimeError(
                    f'Expected to find {thing_desc}. Found: {matches}')

            thing = matches[0]
        except Exception as e:
            raise RuntimeError(
                f'Unexpected error processing `settings.order` element "{s}".'
                f' {e}'
            ) from e

        ordered_directory_elements.append(thing)

    return ordered_directory_elements
Example #5
def get_accepted_features(features: Collection[Feature],
                          proposed_feature: Feature) -> List[Feature]:
    """Deselect candidate features from list of all features

    Args:
        features: collection of all features in the ballet project: both
            accepted features and candidate ones that have not been accepted
        proposed_feature: candidate feature that has not been accepted

    Returns:
        list of features with the proposed feature not in it.

    Raises:
        ballet.exc.BalletError: Could not deselect exactly the proposed
            feature.
    """
    def eq(feature):
        """Features are equal if they have the same source

        At least in this implementation...
        """
        return feature.source == proposed_feature.source

    # deselect features that match the proposed feature
    result = lfilter(complement(eq), features)

    if len(features) - len(result) == 1:
        return result
    elif len(result) == len(features):
        raise BalletError(
            'Did not find match for proposed feature within \'contrib\'')
    else:
        raise BalletError(f'Unexpected condition (n_features={len(features)}, '
                          f'n_result={len(result)})')
Example #6
def remove_indentation(string: str):
    lines = string.split('\n')
    lines = lfilter(bool, lines)
    indents = [len(line) - len(line.lstrip()) for line in lines]
    base_indent = min(indents) if indents else 0
    lines = [x[base_indent:] for x in lines]
    # lines = lfilter(lambda s: len(s.replace(' ', '')), lines)
    # print(lines)
    return '\n'.join(lines)
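For illustration, running it on an indented triple-quoted string; note that blank lines are dropped entirely by the lfilter(bool) pass (remove_indentation and its funcy import come from the example above):

text = '''
    first line
        nested line
    last line
'''
print(remove_indentation(text))
# first line
#     nested line
# last line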
Example #7
from functools import wraps

def dfilter(pred):
    """Decorate a callable with a filter that accepts a predicate

    Example::

        >>> @dfilter(lambda x: x >= 0)
        ... def numbers():
        ...     return [-1, 2, 0, -2]
        >>> numbers()
        [2, 0]
    """
    def decorate(call):
        @wraps(call)
        def wrapped(*args, **kwargs):
            return lfilter(pred, call(*args, **kwargs))
        return wrapped
    return decorate
Example #8
def part1(data):
    bids = lfilter(lambda e: isinstance(e, int), data.bids)

    # the id of the next bus we can catch
    bid = bids[np.argmax([data.earliest % i for i in bids])]
    # the timestamp at which we can board this bus
    board = data.earliest - (data.earliest % bid) + bid
    # the time we have to wait
    wait = board - data.earliest

    return bid * wait
Example #9
File: run.py Project: dapivei/dvc
def parse_params(path_params):
    ret = []
    for path_param in path_params:
        path, _, params_str = path_param.rpartition(":")
        # remove empty strings from params, on condition such as `-p "file1:"`
        params = lfilter(bool, params_str.split(","))
        if not path:
            ret.extend(params)
        else:
            ret.append({path: params})
    return ret
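For reference, the helper's behaviour on a few -p style values (the file and parameter names here are invented; requires `from funcy import lfilter`):

print(parse_params(["params.yaml:lr,epochs", "params.yaml:", "seed"]))
# -> [{'params.yaml': ['lr', 'epochs']}, {'params.yaml': []}, 'seed']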
Example #10
    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        from dvc.dvcfile import is_valid_filename
        from dvc.ignore import DvcIgnore

        assert repo_walk
        try:
            _, dvc_dirs, dvc_fnames = (
                next(dvc_walk) if dvc_walk else (None, [], [])
            )
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        def _func(fname):
            if dvcfiles:
                return True

            return not (
                is_valid_filename(fname) or fname == DvcIgnore.DVCIGNORE_FILE
            )

        # merge file lists
        files = set(filter(_func, dvc_fnames + repo_fnames))

        yield repo_root, dirs, list(files)

        def is_dvc_repo(d):
            return self._is_dvc_repo(os.path.join(repo_root, d))

        # remove subrepos to prevent them from being traversed
        subrepos = set(filter(is_dvc_repo, repo_only))
        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = lfilter(lambda d: d in (repo_set - subrepos), dirs)

        for dirname in dirs:
            if dirname in subrepos:
                dir_path = os.path.join(repo_root, dirname)
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
Example #11
def filter_annotations(annotations, images):
    """
    Downloaded from github.com/akarazniewicz/cocosplit.git@master and modified to handle only annotations.
    Helper for creating a new data split from an original COCO dataset.

    :param annotations:
    :param images:
    :return:
    """
    image_ids = funcy.lmap(lambda i: int(i['id']), images)
    return funcy.lfilter(lambda a: int(a['image_id']) in image_ids,
                         annotations)
Example #12
def test_pqtree_after_reduce_chars_rand_examples():
    ITERATIONS = 1000

    merged = 0
    rts = []

    for i in range(ITERATIONS):
        iter_stats = {}

        # ------------------------- Generation of permutations -------------------------
        id_perm = list(range(1, 10))
        duplication_mutations(id_perm, 2)

        other_perms = [list(id_perm), list(id_perm)]
        for p in other_perms:
            mutate_collection(p, 2)

        ps = tmap(tuple, (id_perm, *other_perms))

        # ------------------------- Try to merge same adjacent chars -------------------------
        start_time = time.time()
        pq = PQTreeDup.from_perms_wth_multi(ps)
        iter_stats["merged"] = time.time() - start_time

        if not pq:
            continue
        else:
            merged += 1

        # ------------------------- find all the trees with minimal size -------------------------
        start_time = time.time()
        all_possibilities = list(PQTreeDup.from_perms(ps))
        iter_stats["no_merge"] = time.time() - start_time
        iter_stats["perms"] = ps

        best_size = all_possibilities[0].approx_frontier_size()
        only_best_sized = lfilter(lambda t: t.approx_frontier_size() == best_size, all_possibilities)

        # verify that the tree with multi chars contains one of the best trees in its frontier
        try:
            front = set(pq.frontier())
            assert any(front.issuperset(t.frontier()) for t in only_best_sized)
        except:
            print(ps)
            # PQTreeVisualizer.show_all(pq, *only_best_sized)
            raise
        else:
            rts.append(iter_stats)

    print(f"multi merged: {merged} / {ITERATIONS}")
    lmap(print, rts)
Example #13
def check_tarantool_logs(logs: str):
    lines = logs.splitlines()
    is_ok = [
        # errors that are not actually errors and are OK to occur
        lambda l: "E> ER_LOADING: Instance bootstrap hasn't finished yet" in l,
        lambda l: "E> Cleanup" in l and "reload.lua" in l,
        lambda l: "SystemError unexpected EOF when reading from socket" in l,
        lambda l: "SystemError getaddrinfo: Name does not resolve" in l,
        lambda l: "ER_NO_SUCH_USER: User 'storage' is not found" in l,
        lambda l:
        "ER_ACCESS_DENIED: Session access to universe '' is denied for user 'storage'"
        in l,
        lambda l:
        "ER_ACCESS_DENIED: Read access to universe '' is denied for user 'storage'"
        in l,
        lambda l: "Exception during calling 'vshard.storage.buckets_count' on "
        in l and ": Invalid argument" in l,
        lambda l: '"code":77,"message":"Connection refused"' in l,
    ]
    all_errors = lfilter(lambda l: "E>" in l, lines)
    is_real_error = none_fn(*is_ok)
    real_errors = lfilter(is_real_error, all_errors)
    assert not real_errors, "found errors in Tarantool logs"
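The allow-list pattern above (funcy.none_fn over a list of "this error is fine" predicates, then lfilter) in isolation, with made-up log lines:

from funcy import lfilter, none_fn

is_ok = [
    lambda l: "harmless warning" in l,
    lambda l: "known flaky error" in l,
]
lines = ["E> harmless warning", "E> real failure", "I> just info"]
all_errors = lfilter(lambda l: "E>" in l, lines)
real_errors = lfilter(none_fn(*is_ok), all_errors)
print(real_errors)   # -> ['E> real failure']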
Example #14
def test_pqtree_with_merges_rand():
    ITERATIONS = 100

    merged = 0

    for i in range(ITERATIONS):
        id_perm = list(range(1, 100))
        duplication_mutations(id_perm, 5)

        other_perms = [list(id_perm), list(id_perm)]
        for p in other_perms:
            mutate_collection(p, 2)

        ps = tmap(tuple, (id_perm, *other_perms))


        pqs = list(PQTreeDup.from_perms_with_merge(ps))

        if not pqs:
            continue
        else:
            merged += 1

        best_size_found = min(pq.approx_frontier_size() for pq in pqs)
        only_best_sized_found = tfilter_fx_eq(PQTree.approx_frontier_size, best_size_found, pqs)

        all_possibilities = list(PQTreeDup.from_perms(ps))
        best_size = min(t.approx_frontier_size() for t in all_possibilities)
        only_best_sized = lfilter(lambda t: t.approx_frontier_size() == best_size, all_possibilities)
        only_best_sized_parens = smap(PQTree.to_parens, only_best_sized)

        try:
            assert best_size_found == best_size
            assert all(len(list(pq.frontier())) == best_size for pq in only_best_sized_found)
            assert any(pq.to_parens() in only_best_sized_parens for pq in only_best_sized_found)
        except:
            print("same cahrs together:", any(any(o[1] != 1 for o in iter_char_occurrence(p)) for p in ps))

            print(f"best no opt: {best_size}, best with merge: {best_size_found}")
            print(f"Merged best: {[pq.to_parens() for pq in only_best_sized_found]}")
            print(f"Merged all: {[pq.to_parens() for pq in pqs]}")
            print(ps)
            print(f"actual front sizes = {[len(list(t.frontier())) for t in only_best_sized]}")
            print([t.to_parens() for t in all_possibilities])
            print([t.to_parens() for t in only_best_sized])
            # PQTreeVisualizer.show_all(pq, *only_best_sized)
            continue

    print(f"merged {merged}")
Example #15
    def _find_redundant_intervals(cls,
                                  intervals: CommonIntervals) -> IntervalSet:
        no_singleton_intervals = lfilter(
            lambda inter: inter.first_end != inter.first_start, intervals)
        intersection_dict = {
            ci: cls._intersects_with(no_singleton_intervals, ci)
            for ci in no_singleton_intervals
        }
        start_end_index = {(ci.first_start, ci.first_end): ci
                           for ci in no_singleton_intervals}

        redundant = {
            start_end_index.get((src.first_start, dest.first_end))
            for src in no_singleton_intervals
            for dest in intersection_dict[src]
        }

        return redundant - {None}
Example #16
    def from_irreducible_intervals(
            cls, intervals: Set[CommonInterval]) -> 'IntervalHierarchy':
        ih = IntervalHierarchy()

        for interval in intervals:
            include_interval = lfilter(interval.included_in_other,
                                       intervals - {interval})
            nest_level = len(include_interval)
            ih.nesting_levels[nest_level].append(interval)

        lmap(lambda l: l.sort(key=lambda ci: ci.first_start),
             ih.nesting_levels.values())

        ih.reverse_index = {
            ci: lvl
            for lvl, ci_lst in ih.nesting_levels.items() for ci in ci_lst
        }

        return ih
Example #17
def push_branches_to_remote(repo: git.Repo, remote_name: str,
                            branches: Iterable[str]):
    """Push selected branches to origin

    Similar to::

        $ git push origin branch1:branch1 branch2:branch2

    Raises:
        ballet.exc.BalletError: Push failed in some way
    """
    remote = repo.remote(remote_name)
    result = remote.push([f'{b}:{b}' for b in branches])
    failures = lfilter(complement(did_git_push_succeed), result)
    if failures:
        for push_info in failures:
            logger.error(f'Failed to push ref {push_info.local_ref.name} to '
                         f'{push_info.remote_ref.name}')
        raise BalletError('Push failed')
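The failure check above is just funcy.complement plus lfilter; the same pattern stand-alone, with a placeholder predicate instead of did_git_push_succeed and plain dicts instead of GitPython PushInfo objects:

from funcy import complement, lfilter

def push_succeeded(info):
    return info["flags"] == 0   # placeholder success criterion

results = [{"ref": "a", "flags": 0}, {"ref": "b", "flags": 1024}]
failures = lfilter(complement(push_succeeded), results)
print(failures)   # -> [{'ref': 'b', 'flags': 1024}]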
Example #18
def _push(project):
    """Push default branch and project template branch to remote

    With default config (i.e. remote and branch names), equivalent to::

        $ git push origin master:master project-template:project-template

    Raises:
        ballet.exc.BalletError: Push failed in some way
    """
    repo = project.repo
    remote_name = project.config.get('github.remote')
    remote = repo.remote(remote_name)
    result = _call_remote_push(remote)
    failures = lfilter(complement(did_git_push_succeed), result)
    if failures:
        for push_info in failures:
            logger.error('Failed to push ref {from_ref} to {to_ref}'.format(
                from_ref=push_info.local_ref.name,
                to_ref=push_info.remote_ref.name))
        raise BalletError('Push failed')
Example #19
    def serializer_for(self, type: Type) -> Type[Serializer]:
        if self._serializer_map.get(type):
            return self._serializer_map[type]
        else:
            # Check if the type is a subclass of any of the defined serializers
            possible_bindings = funcy.lfilter(
                lambda t: issubclass(type, t), self._bindings
            )
            if len(possible_bindings) == 0:
                # No serializer found. Return the default serializer
                if self._default_serializer:
                    return self._default_serializer
                else:
                    raise Exception(f"No serializer could be found for the type {type}")
            elif len(possible_bindings) == 1:
                return possible_bindings[0][1]
            else:
                logger.warning(
                    f"More than one serializer found for type {type}. "
                    "Choosing the first one."
                )
                return itertoolz.first(possible_bindings)[1]
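The subclass lookup above works because issubclass accepts a tuple as its second argument, so each (handled type, serializer class) binding can be passed to it whole; a minimal illustration with placeholder classes:

import funcy

class Serializer: ...
class JsonSerializer(Serializer): ...
class Animal: ...
class Dog(Animal): ...

bindings = [(Animal, JsonSerializer), (str, Serializer)]   # mirrors self._bindings

# issubclass(Dog, (Animal, JsonSerializer)) is True because Dog subclasses Animal.
matches = funcy.lfilter(lambda t: issubclass(Dog, t), bindings)
print(matches[0][1])   # -> <class '__main__.JsonSerializer'>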
Example #20
    def merge_from(self, fs, item: str, wdir: str, overwrite=False):
        path, _, keys_str = item.partition(":")
        path = os.path.normpath(fs.path.join(wdir, path))

        select_keys = lfilter(bool, keys_str.split(",")) if keys_str else None
        if path in self.imports:
            if not select_keys and self.imports[path] is None:
                return  # allow specifying complete filepath multiple times
            self.check_loaded(path, item, select_keys)

        ctx = Context.load_from(fs, path, select_keys)

        try:
            self.merge_update(ctx, overwrite=overwrite)
        except ReservedKeyError as exc:
            raise ReservedKeyError(exc.keys, item) from exc

        cp = ctx.imports[path]
        if path not in self.imports:
            self.imports[path] = cp
        elif cp:
            self.imports[path].extend(cp)
Example #21
    def can_merge_multi_chars(cls, context_perms):
        """
        Will return a dictionary of the structure:
        {
            char1: {
                        perm1: [ContextChar1, ..., ContextCharK]
                        perm2: [...]
                    }

            char2: {...}
        }

        Each context char has a common neighbour with at LEAST ONE context char from EACH other perm.
        """
        more_than_once = cls.find_multi_chars(context_perms[0])

        def common_neighbour_set(char):
            perm_neighbours = lambda cperm: set(flatten(
                [cc.left_char, cc.right_char] for cc in cperm if cc.char == char
            ))

            common_neigbours = reduce(operator.__and__, map(perm_neighbours, context_perms))
            return common_neigbours - {None}

        def neighbours_of(char_col, char):
            return [cc for cc in char_col if cc.left_char == char or cc.right_char == char]

        mergable_chars = {}
        for char in more_than_once:
            neighbours = common_neighbour_set(char)
            if neighbours:
                cc_anywhere = lfilter(lambda cc: cc.char == char, chain(*context_perms))
                cc_with_common_neighbours = sflatmap1(neighbours_of, cc_anywhere, neighbours)
                mergable_chars_per_perm = group_by_attr('perm', cc_with_common_neighbours)
                mergable_chars[char] = mergable_chars_per_perm
                # mergable_chars[char] = neighbours_of(filter(lambda cc: cc.char == char, chain(*context_perms)), char)

        return mergable_chars
Example #22
def assert_valid_bst(mode, ixy_map,
                     ixy_arr, tree, n_inserted, n_node):
    ''' tree is bst '''
    key = prop(mode)
    # Number of leaf ixy refs must equal the number of inserted ixys.
    # Parent must be a positive value, except for the root.
    for i, node in enumerate(tree[1:n_inserted+1]):
        assert node.parent >= 0, (n_inserted, i, pyobj(node))
    #   Get ixy idxes from tree structure
    ixy_idxes = all_ixy_idxes(
        #tup_tree(tree[:n_inserted+50]))
        tup_tree(tree[:n_node+100]))
    if DBG: print(f' after[{n_node}]',#tup_tree(tree[:n_node+10]))
                [f'{p} {l} {r}' for _,p,l,r in 
                tup_tree(tree[:n_node+10])])###########
    if DBG: print('iidxes', ixy_idxes)
    if DBG: print('n_node =',n_node)
    # Inserted number of ixys preserved?
    no0idxes = F.compact([abs(i) for i in ixy_idxes])
    assert n_inserted == len(no0idxes), \
        'ixy_idxes = {}, tup_tree = {}'.format(
            ixy_idxes, tup_tree(tree[:n_inserted+4]))
    # All ixy have unique index.
    assert len(set(no0idxes)) == n_inserted,\
        f'{len(set(no0idxes))} == {n_inserted}'
    # All leaves point to an ixy (negative idx), not to an inode.
    assert all(idx <= 0 for idx in ixy_idxes), \
        'ixy_idxes = {}, tree = {}'.format(
            ixy_idxes, tup_tree(tree[:n_inserted+4]))

    # Inserted ixys are sorted in ascending order.
    inserted_ixys = F.lmap(
        lambda i: ixy_arr[abs(i)], ixy_idxes)
    for ixy1, ixy2 in F.pairwise(inserted_ixys): 
        assert key(ixy1) <= key(ixy2), 'tree = {}' \
            .format(tup_tree(tree[:n_inserted+4]))

    # All leaves: l <= r
    leaves = F.lfilter(is_leaf, tree[:n_inserted+4])
    for leaf in leaves:
        l = leaf.left; r = leaf.right
        if l and r:
            l_val = key(ixy_map[abs(l)])
            r_val = key(ixy_map[abs(r)])
            assert l_val <= r_val  

    # All inodes must be sorted in ascending order.
    inodes = all_inodes(tup_tree(tree[:n_node+100]))
    for n1, n2 in F.pairwise(inodes):
        k1 = n1[0]; k2 = n2[0]
        assert k1 <= k2

    # Inserted ixys are sorted in ascending order.
    neg_idxeseq = F.mapcat(tup(
        lambda k,p,l,r: 
        ((l,) if l < 0 else ()) + ((r,) if r < 0 else ())),
        inodes)
    ixy_idxes = F.map(abs, neg_idxeseq)
    saved_ixys = F.map(lambda i: pyobj(ixy_arr[i]), ixy_idxes)
    keys = F.lmap(key, saved_ixys)
    for k1,k2 in F.pairwise(keys):
        assert k1 <= k2
Example #23
def lfilter(f, *seq):
    return F.lfilter(f,*seq) if seq \
    else lambda *xs: F.lfilter(f,*xs)
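Usage of this curried wrapper, both partially applied and called directly:

evens = lfilter(lambda x: x % 2 == 0)   # no sequence given: returns a partial
print(evens([1, 2, 3, 4]))              # -> [2, 4]
print(lfilter(bool, [0, 1, '', 'a']))   # direct call -> [1, 'a']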
Example #24
    def register_handler(self, handler: CommandHandler):
        handled_commands = lfilter(lambda t: t is not object,
                                   handler.execute.registry)
        for handled_command in handled_commands:
            self.handlers[handled_command] = handler
Example #25
class Ops:
    const = has_args(lambda x: lambda _: x)

    @has_args
    def multi(coll):
        def make_apply(el):
            return lambda f: f(el) if callable(f) else f

        if is_mapping(coll):
            return lambda el: walk_values(make_apply(el), coll)
        else:
            return lambda el: lmap(make_apply(el), coll)

    # Traverse
    css = has_args(
        lambda selector: _list_mapcat(lambda el: el.cssselect(selector)))
    xpath = has_args(lambda query, **params: _list_mapcat(lambda el: el.xpath(
        query, **params)))
    parent = _list_map(lambda el: el.getparent())
    prev = _list_map(lambda el: el.getprevious())
    next = _list_map(lambda el: el.getnext())

    # Microdata
    @has_args
    def itemscope(name):
        return C.css(f'[itemscope][itemprop*={name}]')

    @has_args
    def itemprop(name):
        return C.css(f'[itemprop*={name}]')

    @has_args
    def microdata(name):
        return C.css(f'[itemprop*={name}]').map(
            C.attr('content') | C.inner_text)

    def ld(node):
        text = C.css('script[type="application/ld+json"]').inner_text(node)
        try:
            return json.loads(text)
        except ValueError as e:
            try:
                # Try parsing non-strict
                import demjson
                return demjson.decode(text)
            except:
                raise e  # reraise first one

    # Select
    def get(els):
        if len(els) == 0:
            raise ValueError("Trying to get value from empty list: %r..." %
                             els[:3])
        if len(els) > 1:
            raise ValueError(
                "Trying to get single value from multivalue list: %r..." %
                els[:3])
        return first(els)

    first = first
    second = second
    last = last
    slice = has_args(lambda start, stop=None, step=None: lambda val: val[slice(
        start, stop, step)])

    # Access
    text = _list_first(lambda el: el.text)
    texts = lambda els: [el.text for el in els]
    tail = _list_first(lambda el: el.tail)
    attr = has_args(lambda name: _list_first(lambda el: el.attrib.get(name)))
    attrs = has_args(
        lambda name: lambda els: [el.attrib.get(name) for el in els])

    @_list_first
    def head(el):
        prev = el.getprevious()
        return prev.tail if prev is not None else el.getparent().text

    inner_text = _list_first(
        lambda el: lxml.html.tostring(el, encoding='unicode', method='text'))
    inner_html = _list_first(lambda el: (el.text or '') + ''.join(
        lxml.html.tostring(sub, encoding='unicode') for sub in el))
    outer_html = _list_first(
        lambda el: lxml.html.tostring(el, encoding='unicode'))

    @_list_first
    def html_to_text(html):
        """Cleans html preserving newlines"""
        if isinstance(html, lxml.html.HtmlElement):
            html = Ops.inner_html(html)

        html = re.sub(r'\s+', ' ', html).strip()
        html = re.sub(r'<br[^>]*>|</li>', '\n', html, flags=re.I)
        html = re.sub(r'</p>', '\n\n', html, flags=re.I)
        if not html or html.isspace():
            return ''
        return lxml.html.tostring(lxml.html.fromstring(html),
                                  encoding='unicode',
                                  method='text')

    # Text utils
    # TODO: make these two work with bytes?
    trim = lambda text: str.strip(text)
    strip = has_args(lambda dirt=None: lambda text: str.strip(text, dirt))
    normspace = normalize_whitespace = lambda text: re.sub(r'\s+', ' ', text
                                                           ).strip()
    split = has_args(lambda by: lambda text: text.split(by))
    re = has_args(re_finder)

    @has_args
    def re_sub(pattern, repl, count=0, flags=0):
        return lambda text: re.sub(
            pattern, repl, text, count=count, flags=flags)

    # Data utils
    len = len

    @has_args
    def map(f):
        if not callable(f) and isinstance(f, (Mapping, Sequence)):
            f = C.multi(f)
        return lambda els: lmap(f, els)

    filter = has_args(lambda pred: lambda seq: lfilter(pred, seq))

    # Data cleaning
    float = float
    int = int
    clean_float = lambda text: float(
        re.sub(r'[^\d,.]', '', text).replace(',', '.'))
    clean_int = lambda text: int(re.sub(r'\D', '', text))

    date = dateparser.parse

    def duration(text):
        regexes = [
            r'()(?:(\d\d):)?(\d\d):(\d\d)(?:\s|$)',
            re.compile(
                r'''\s* (?:(\d+)\s*д[еньяй.]*)?
                           \s* (?:(\d+)\s*ч[ас.]*)?
                           \s* (?:(\d+)\s*м[инуты.]*)?
                           ()''', re.I | re.X)
        ]
        for regex in regexes:
            m = re_find(regex, text)
            if m:
                days, hours, minutes, seconds = [
                    silent(int)(p) or 0 for p in m
                ]
                if days == hours == minutes == 0:
                    return None
                return (days * 24 * 60 + hours * 60 + minutes) * 60 + seconds
Example #26
    def listens_to(self):
        return lfilter(lambda t: t is not object, self._process.registry)
Example #27
def main(args):
    with open(args.annotations, 'rt', encoding='UTF-8') as annotations:
        coco = json.load(annotations)
        info = coco['info']
        licenses = coco['licenses']
        images = coco['images']
        annotations = coco['annotations']
        for item in coco["categories"]:
            item['name'] = CLASSES[int(item['id']) - 1]
        categories = coco['categories']

        annotations = [item for item in annotations if item['area'] > 0]

        number_of_images = len(images)

        images_with_annotations = funcy.lmap(lambda a: int(a['image_id']),
                                             annotations)

        if args.having_annotations:
            images = funcy.lremove(
                lambda i: i['id'] not in images_with_annotations, images)

        x = [
            item for item in images
            if item['file_name'].split('/')[1] not in (test_set + exclude_set)
        ]
        y = [
            item for item in images
            if item['file_name'].split('/')[1] in test_set
        ]
        if args.split > 0 and args.split < 1:
            x_train, x_val = train_test_split(x, train_size=args.split)
        else:
            x_val = y.copy()
            x_train = x.copy()

        if args.sample:
            random.shuffle(x_train)
            random.shuffle(x_val)
            random.shuffle(y)
            x_train = x_train[:int(len(x_train) * args.sample)]
            x_val = x_val[:int(len(x_val) * args.sample)]
            y = y[:int(len(y) * args.sample)]

        if not args.coco_category:
            category_map = {
                item['id']: select_classes.index(item['name'])
                for item in categories if item['name'] in select_classes
            }
        else:
            category_map = {
                item['id']: int(item['id'] - 1)
                for item in categories if item['name'] in select_classes
            }

        root = args.root
        label_folder = os.path.join(root, 'labels')
        if not os.path.exists(label_folder):
            os.makedirs(label_folder)
        # else:
        #     print('delete {} ...'.format(label_folder))
        #     shutil.rmtree(label_folder)
        #     os.makedirs(label_folder)

        image_folder = os.path.join(root, 'images')
        if not os.path.exists(image_folder):
            os.makedirs(image_folder)

        lists = [x_train, x_val, y]
        lists_mode = ['train', 'val', 'test']
        if 'thermal' in args.annotations:
            suffix = '_thermal'
        else:
            suffix = '_rgb'
        lists_mode = [item + suffix for item in lists_mode]

        for items, mode in zip(lists, lists_mode):

            if not os.path.exists(os.path.join(label_folder, mode)):
                os.makedirs(os.path.join(label_folder, mode))

            if not os.path.exists(os.path.join(image_folder, mode)):
                os.makedirs(os.path.join(image_folder, mode))

            for item in items:
                txt_name = os.path.join(
                    label_folder, mode,
                    item['file_name'].replace('png', 'txt').replace('/', '_'))

                if not args.label_only:
                    image_name = os.path.join(
                        image_folder, mode,
                        item['file_name'].replace('/', '_'))
                    shutil.copyfile(os.path.join(root, item['file_name']),
                                    image_name)

                # item['file_name'] = item['file_name'].replace('/','_')
                anns = funcy.lfilter(
                    lambda a: int(a['image_id']) in [item['id']], annotations)
                fid = open(txt_name, 'w')
                for ann in anns:
                    if ann['category_id'] in category_map:
                        bbox = ann['bbox']
                        bbox[0] = np.max([0., bbox[0]])
                        bbox[1] = np.max([0., bbox[1]])
                        bbox[2] = np.min(
                            [bbox[0] + bbox[2], item['width'] - 1]) - bbox[0]
                        bbox[3] = np.min(
                            [bbox[1] + bbox[3], item['height'] - 1]) - bbox[1]
                        if bbox[2] * bbox[3] > 0:
                            fid.write(
                                '%d %f %f %f %f\n' %
                                (category_map[ann['category_id']],
                                 (bbox[0] + bbox[2] / 2.0) / item['width'],
                                 (bbox[1] + bbox[3] / 2.0) / item['height'],
                                 bbox[2] / item['width'],
                                 bbox[3] / item['height']))
                fid.close()

        print("Saved {} entries in train {} in val, and {} in test".format(
            len(x_train), len(x_val), len(y)))
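A quick numeric check of the YOLO-style normalisation written out above (the values are invented, and clamping is skipped):

width, height = 100, 200
bbox = [10, 20, 30, 40]                    # COCO [x, y, w, h]
cx = (bbox[0] + bbox[2] / 2.0) / width     # 0.25
cy = (bbox[1] + bbox[3] / 2.0) / height    # 0.2
w, h = bbox[2] / width, bbox[3] / height   # 0.3, 0.2
print(cx, cy, w, h)                        # matches the '%d %f %f %f %f' line format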
Example #28
def scrape_comments(mongo, batch_size=250, max_workers=50):
    """ Parse operations and post-process for comment/post extraction. """
    indexer = Indexer(mongo)
    start_block = indexer.get_checkpoint('comments')

    query = {
        "type": "comment",
        "block_num": {
            "$gt": start_block,
            "$lte": start_block + batch_size,
        }
    }
    projection = {
        '_id': 0,
        'block_num': 1,
        'author': 1,
        'permlink': 1,
    }
    results = list(mongo.Operations.find(query, projection=projection))
    identifiers = set(f"{x['author']}/{x['permlink']}" for x in results)

    # handle an edge case when we are too close to the head,
    # and the batch contains no work to do
    if not results and is_recent(start_block, days=1):
        return

    # get Post.export() results in parallel
    raw_comments = thread_multi(fn=get_comment,
                                fn_args=[None],
                                dep_args=list(identifiers),
                                max_workers=max_workers,
                                yield_results=True)
    raw_comments = lkeep(raw_comments)

    # split into root posts and comments
    posts = lfilter(lambda x: x['depth'] == 0, raw_comments)
    comments = lfilter(lambda x: x['depth'] > 0, raw_comments)

    # Mongo upsert many
    log_output = ''
    if posts:
        r = mongo.Posts.bulk_write(
            [
                UpdateOne({'identifier': x['identifier']},
                          {'$set': {
                              **x, 'updatedAt': dt.datetime.utcnow()
                          }},
                          upsert=True) for x in posts
            ],
            ordered=False,
        )
        log_output += \
            f'(Posts: {r.upserted_count} upserted, {r.modified_count} modified) '
    if comments:
        r = mongo.Comments.bulk_write(
            [
                UpdateOne({'identifier': x['identifier']},
                          {'$set': {
                              **x, 'updatedAt': dt.datetime.utcnow()
                          }},
                          upsert=True) for x in comments
            ],
            ordered=False,
        )
        log_output += \
            f'(Comments: {r.upserted_count} upserted, {r.modified_count} modified) '

    # We are only querying {type: 'comment'} blocks and sometimes
    # the gaps are larger than the batch_size.
    index = silent(max)(lpluck('block_num',
                               results)) or (start_block + batch_size)
    indexer.set_checkpoint('comments', index)

    log.info(f'Checkpoint: {index} {log_output}')
Example #29
def filter_annotations(annotations, images):
    image_ids = funcy.lmap(lambda i: int(i['id']), images)
    return funcy.lfilter(lambda a: int(a['image_id']) in image_ids,
                         annotations)
Example #30
def main(args):
    with open(args.annotations, "rt", encoding="UTF-8") as annotations:
        coco = json.load(annotations)
        info = coco["info"]
        licenses = coco["licenses"]
        images = coco["images"]
        annotations = coco["annotations"]
        categories = coco["categories"]

        print(coco.keys())

        print("Original", len(images))

        def nothing():
            pass

        funcy.lmap(
            lambda a: print(
                a, next(i for i in images if i["id"] == a["image_id"]))
            if a["segmentation"] == [] else nothing(),
            annotations,
        )

        print("Annotations", len(annotations))
        a2 = []
        for i in range(len(annotations)):
            if max(annotations[i]["bbox"][2],
                   annotations[i]["bbox"][3]) < 50 or min(
                       annotations[i]["bbox"][2],
                       annotations[i]["bbox"][3]) < 30:
                pass
            else:
                a2.append(annotations[i])
        annotations = a2
        print("Annotations filtered by size", len(annotations))

        c2 = []
        ch = []
        for c in categories:
            if c["name"] == "human" or c["name"] == "car":
                ch.append(c["id"])
            else:
                c2.append(c)
        print(len(c2), len(categories))
        categories = c2

        a2 = []
        for i in range(len(annotations)):
            if annotations[i]["category_id"] in ch:
                pass
            else:
                a2.append(annotations[i])
        annotations = a2
        print("Annotations filtered cars and humans", len(annotations))

        images_with_annotations = funcy.lmap(lambda a: int(a["image_id"]),
                                             annotations)

        images = funcy.lremove(
            lambda i: i["id"] not in images_with_annotations, images)

        print("Removed empty images", len(images))

        images = funcy.lremove(lambda i: "copy" in i["file_name"].lower(),
                               images)

        print("Removed copy", len(images))

        def f(e):
            return e["file_name"]

        images.sort(key=f)
        images = images[-300:]
        # funcy.lmap(lambda i : print(i['file_name'][9:12], end="\t"), images)

        print(len(images))

        no_segm = funcy.lfilter(lambda a: len(a["segmentation"]) == 0,
                                annotations)
        print(len(no_segm), len(annotations))
        image_ids = funcy.lmap(lambda i: i["image_id"], no_segm)
        funcy.lmap(
            lambda i: print("! no segm annot in #" + i["file_name"])
            if i["id"] in image_ids else nothing(),
            images,
        )

        save_coco(
            args.annotations,
            info,
            licenses,
            images,
            filter_annotations(annotations, images),
            categories,
        )