def merge_meta(meta, target=None, commit=True, local_repo=None):
    """Merge the samples of one EMPD meta data file into another one

    Samples of `meta` that are missing in the target are appended, and for
    all samples of `meta` the columns that exist in the target are
    overwritten with the values of `meta`.

    Parameters
    ----------
    meta: str
        The file to merge.
    target: str
        The file to merge `meta` into, relative to `local_repo`. If not
        given, the meta file of the `local_repo` is used, and, if this is
        `meta` itself, we use ``meta.tsv``.
    commit: bool
        If True, commit the changed target to the git repository
    local_repo: str
        The path to the EMPD-data local repository. If None, the directory
        of `meta` is used

    Returns
    -------
    str
        The `target`"""
    repo_dir = osp.dirname(meta) if local_repo is None else local_repo

    if not target:
        target = osp.basename(get_meta_file(repo_dir))
        # never merge a file into itself, fall back to the base meta data
        if osp.samefile(meta, osp.join(repo_dir, target)):
            target = 'meta.tsv'

    incoming = read_empd_meta(meta)
    target_path = osp.join(repo_dir, target)
    merged = read_empd_meta(target_path)

    # an outer join with the column-less frame only adds the new samples
    merged = merged.join(incoming[[]], how='outer')
    shared = [name for name in incoming.columns if name in merged.columns]
    merged.loc[incoming.index, shared] = incoming

    dump_empd_meta(merged, target_path)

    if commit:
        git_repo = Repo(repo_dir)
        git_repo.index.add([target])
        git_repo.index.commit("Merged {} into {} [skip ci]".format(
            osp.basename(meta), target))
    return target
def query_meta(meta, query, columns='notnull', count=False, output=None,
               commit=False, local_repo=None, distinct=False):
    """Query the meta data of a data contribution

    This function uses the :func:`query_samples` function to return a subset
    of the EMPD metadata. The performed query is such as::

        SELECT columns FROM meta WHERE query

    Parameters
    ----------
    meta: str
        The path to the metadata that shall be queried (see
        :func:`~empd_admin.common.read_empd_meta`). If `local_repo` is
        given, `meta` is interpreted relative to it.
    query: str
        The WHERE clause of the SQL query
    columns: list of str
        The columns that shall be returned. It can either be a list of
        columns, ``'all'`` to return all columns, or ``'notnull'`` (default)
        to return the non-empty columns
    count: bool
        If True, do not return the values per column but the number of valid
        entries per column (i.e. ``SELECT COUNT(*) FROM meta WHERE query``)
    output: str
        The path where to save the tab-delimited result of the query,
        relative to the ``queries`` directory of the `local_repo`. If None
        and `commit` is ``True``, it will be saved to ``queries/query.tsv``
    commit: bool
        If True, commit the changes in the repository `local_repo`
    local_repo: str
        The path of the local EMPD-data repository. If None, it will be
        assumed to be the directory of the given `meta`.
    distinct: list of str
        If not null, return a distinct query based on the columns listed in
        this parameter. For example ``distinct=['Country', 'SampleContext']``
        will result in ``SELECT DISTINCT ON ('Country', 'SampleContext')
        ...``. ``'all'`` uses all returned columns. Note that this only
        affects the returned markdown table, not the file saved to `output`

    Returns
    -------
    str
        The path where the query has been saved (see `output` and `commit`)
        or None
    str
        The result of the query as a markdown table, at maximum 200 rows
    """
    max_rows = 200

    if local_repo is None:
        local_repo = osp.dirname(meta)
    else:
        meta = osp.join(local_repo, meta)
    # empty strings count as missing values for the 'notnull' selection
    meta_df = read_empd_meta(meta).replace('', np.nan)
    samples = query_samples(meta_df, query)
    sub = meta_df.loc[samples].reset_index()

    if isinstance(columns, str):
        columns = [columns]
    if 'notnull' in columns:
        missing = []
        notnull = sub.notnull().any(axis=0)
        columns = notnull[notnull].index
    elif 'all' in columns:
        missing = []
        columns = sub.columns
    else:
        columns = np.array(columns)
        mask = np.isin(columns, sub.columns)
        missing = columns[~mask]
        columns = columns[mask]

    if count:
        sub = sub[columns].count().to_frame().reset_index().fillna('')
        sub.columns = ['Column', 'Count']
    else:
        sub = sub[columns].fillna('')

    if commit:
        output = output or 'query.tsv'
    if output:
        ofile = osp.join(local_repo, 'queries', output)
        os.makedirs(osp.dirname(ofile), exist_ok=True)
        dump_empd_meta(sub, ofile)
        if commit:
            repo = Repo(local_repo)
            repo.index.add([osp.join('queries', output)])
            repo.index.commit(f'Added {output} [skip ci]\n\n{query}')

    # the first row becomes the markdown separator between header and body
    sub = pd.concat([
        pd.DataFrame([('---', ) * len(sub.columns)], columns=sub.columns),
        sub], ignore_index=True)
    if distinct:
        if 'all' in distinct:
            distinct = sub.columns
        sub.drop_duplicates(distinct, inplace=True)

    nrows = len(sub) - 1  # do not count the separator row
    # max_rows + 1 because the separator row is part of the frame; otherwise
    # we would display one row less than reported below
    ret = f'<details><summary>{query}</summary>\n\n' + textwrap.indent(
        dump_empd_meta(sub.head(max_rows + 1), sep='|'), '| ')
    ret += '\n\nDisplaying %i of %i rows' % (min(nrows, max_rows), nrows)
    if len(missing):
        ret += '\n\nMissing columns ' + ', '.join(missing)
    return output, ret + '\n</details>'
def diff(meta, left=None, right=None, output=None, commit=False,
         maxdiff=200, *args, **kwargs):
    """Compute the diff between two EMPD metadata files

    This function computes the difference between two EMPD-data files using
    the :func:`compute_diff` function. It takes the meta data of an EMPD-data
    repository and compares it to another

    Parameters
    ----------
    meta: str
        The path to the tab-delimited meta data of a cloned EMPD-data
        repository
    left: str
        The path to the first meta data file, relative to the directory of
        `meta`. Alternatively it can also be a url. If `left` is None, the
        `meta` will be used
    right: str
        The path to the second meta data file, relative to the directory of
        `meta`. Alternatively it can also be a url. If `right` is None, the
        `meta` will be used, or (if `left` is the same as `meta` or None),
        the ``meta.tsv`` in the directory of `meta`. If this would compare
        ``meta.tsv`` with itself, the meta data of the EMPD2/EMPD-data
        repository at
        https://raw.githubusercontent.com/EMPD2/EMPD-data/master/meta.tsv
        is used.
    output: str
        The filename to use for saving the diff. If set, it will be saved in
        the ``'queries'`` directory, relative to `meta`. If not set but
        `commit` is True, it will be saved to ``'queries/diff.tsv'``.
    commit: bool
        If True, commit the added `output` to the git repository of `meta`
    maxdiff: int
        The maximum number of lines for the diff
    ``*args,**kwargs``
        Any other parameter for the :func:`compute_diff` function

    Returns
    -------
    str
        The path where the data has been saved (if `output` is set or
        `commit` is True)
    str
        The computed difference as markdown table

    Examples
    --------
    For a data contribution, e.g. the test-data branch, you can compute the
    difference to the EMPD meta.tsv via::

        import git
        git.Repo.clone_from('https://github.com/EMPD2/EMPD-data',
                            branch='test-data')
        diff('EMPD-data/test.tsv')

    which is essentially the same as::

        diff('EMPD-data/test.tsv', 'test.tsv', 'meta.tsv')

    You will receive nothing, however, because `how` is set to ``'inner'``
    and ``'test.tsv'`` contains new samples. Instead, you can set `how` to
    ``'left'`` to include the samples of ``'test.tsv'`` that are not in
    ``'meta.tsv'``::

        diff('EMPD-data/test.tsv', how='left')
    """
    local_repo = osp.dirname(meta)
    meta = osp.basename(meta)
    master_url = ('https://raw.githubusercontent.com/EMPD2/EMPD-data/'
                  'master/meta.tsv')

    if left is None:
        left = meta
    if right is None:
        if left == meta:
            base_meta = osp.join(local_repo, 'meta.tsv')
            # `meta` is only the basename here, so it has to be resolved
            # against the repository and not against the working directory
            if osp.samefile(osp.join(local_repo, meta), base_meta):
                right = master_url
            else:
                right = 'meta.tsv'
        elif left == 'meta.tsv':
            right = master_url
        else:
            right = meta

    left_df = _read_diff_source(local_repo, left)
    right_df = _read_diff_source(local_repo, right)

    diff_df = compute_diff(left_df, right_df, *args, **kwargs)

    if commit and not output:
        output = 'diff.tsv'
    if output:
        target = osp.join(local_repo, 'queries', output)
        os.makedirs(osp.dirname(target), exist_ok=True)
        dump_empd_meta(diff_df, target)
        if commit:
            # the git repository is only needed when we really commit
            repo = Repo(local_repo)
            repo.index.add([osp.join('queries', output)])
            repo.index.commit(f"Added diff between {left} and {right}")

    diff_df.reset_index(inplace=True)
    # the first row becomes the markdown separator between header and body
    diff_df = pd.concat([
        pd.DataFrame([('---', ) * len(diff_df.columns)],
                     columns=diff_df.columns),
        diff_df], ignore_index=True)

    nrows = len(diff_df) - 1  # do not count the separator row
    # maxdiff + 1 because the separator row is part of the frame; otherwise
    # we would display one row less than reported below
    ret = f'<details><summary>{left}..{right}</summary>\n\n' + textwrap.indent(
        dump_empd_meta(diff_df.head(maxdiff + 1), sep='|'), '| ')
    ret += '\n\nDisplaying %i of %i rows' % (min(nrows, maxdiff), nrows)
    return output, ret


def _read_diff_source(local_repo, source):
    """Read meta data from `source`, a url or a path relative to `local_repo`
    """
    if not url_regex.match(source):
        return read_empd_meta(osp.join(local_repo, source))
    with tempfile.TemporaryDirectory() as tmpdir:
        download_target = osp.join(tmpdir, 'meta.tsv')
        request.urlretrieve(source, download_target)
        return read_empd_meta(download_target)
def handle_viewer_request(metadata, submitter, repo='EMPD2/EMPD-data',
                          branch='master', meta='meta.tsv', submitter_gh=None,
                          commit_msg=''):
    """Handle data contribution through the viewer

    Parameters
    ----------
    metadata: list of dict
        The meta data as JSON from the viewer, one mapping per sample. Each
        mapping needs a ``'SampleName'`` item. The mappings are not modified.
    submitter: str
        The name of the submitter
    repo: str
        The name of the repository ('EMPD2/EMPD-data')
    branch: str
        The branch of the repo
    meta: str
        The name of the meta file for the contribution
    submitter_gh: str
        The github username of the `submitter`
    commit_msg: str
        The message that shall be used for the commit

    Returns
    -------
    bool
        True, if everything went fine
    str
        a html-formatted report whether everything worked as expected
    """
    # read the meta data json. We work on copies of the records to not remove
    # the SampleName from the objects of the caller
    records = {}
    for record in metadata:
        fields = dict(record)
        records[fields.pop('SampleName')] = fields
    metadata = pd.DataFrame.from_dict(records, 'index')

    if 'Temperature' in metadata.columns:
        metadata['Temperature'] = metadata.Temperature.apply(transform_list)
    if 'Precipitation' in metadata.columns:
        metadata['Precipitation'] = metadata.Precipitation.apply(
            transform_list)

    metadata.index.name = 'SampleName'

    # write the data frame and load it again to have a consistent dump
    with tempfile.TemporaryDirectory() as d2:
        dump_empd_meta(metadata, osp.join(d2, 'tmp.tsv'))
        metadata = read_empd_meta(osp.join(d2, 'tmp.tsv'))

    # contributions to the main repository always go into a new pull request
    if repo == 'EMPD2/EMPD-data' and branch == 'master':
        return create_new_pull_request(metadata, submitter, submitter_gh,
                                       commit_msg)

    # check if we can find an existing pull request for the given repository
    pulls = github.Github(
        os.environ['GH_TOKEN']).get_repo('EMPD2/EMPD-data').get_pulls()
    for pull in pulls:
        if (pull.state == 'open' and
                pull.head.repo.full_name == repo and
                pull.head.label.split(':')[1] == branch):
            return edit_pull_request(pull, meta, metadata, submitter,
                                     submitter_gh, commit_msg)
    return False, f"Could not find an open pull request for {repo}:{branch}"
def edit_pull_request(pull, meta, metadata, submitter, submitter_gh=None,
                      commit_msg='', commit=True):
    """Edit the meta data of an existing pull request

    The branch of the pull request is cloned into a temporary directory,
    the rows of `metadata` are written into `meta` and the change is
    committed there.

    Parameters
    ----------
    pull: github.PullRequest
        The pull request on github. It must carry the ``viewer-editable``
        label, otherwise nothing is changed
    meta: str
        The name of the meta file for the contribution
    metadata: pandas.DataFrame
        The edited meta data from the viewer, indexed by the sample name.
        Only columns that already exist in `meta` are used
    submitter: str
        The name of the submitter
    submitter_gh: str
        The github username of the `submitter`
    commit_msg: str
        The message that shall be used for the commit
    commit: bool
        If True, push the commit to the branch of the pull request and post
        a comment in the pull request. If False, the commit is only made in
        the temporary clone and is discarded afterwards

    Returns
    -------
    bool
        True, if the meta data has been changed
    str
        a html-formatted report whether everything worked as expected"""
    full_repo = pull.head.repo.full_name
    remote_url = f'https://github.com/{full_repo}.git'
    branch = pull.head.label.split(':')[1]

    if not any(label.name == 'viewer-editable'
               for label in pull.labels or ()):
        return False, (
            f"Pull request {pull.number} for {full_repo}:{branch} is not "
            "marked as editable. To change this, post a new comment in the "
            f"<a href='{pull.html_url}' target='_blank'>PR</a> with "
            "<code>@EMPD-admin allow-edits</code>")

    with tempfile.TemporaryDirectory('_empd') as tmpdir:
        repo = Repo.clone_from(remote_url, tmpdir, branch=branch)
        old_meta = read_empd_meta(osp.join(tmpdir, meta))
        save_meta = old_meta.copy(True)

        cols = [col for col in metadata.columns if col in old_meta.columns]
        old_meta.loc[metadata.index, cols] = metadata

        n = len(metadata)
        nsamples = '%i sample%s' % (n, 's' if n > 1 else '')

        if old_meta.shape == save_meta.shape and old_meta.equals(save_meta):
            return False, "No data has been edited."

        dump_empd_meta(old_meta, osp.join(tmpdir, meta))
        repo.index.add([meta])
        commit_msg += '\n\n' if commit_msg else ''
        repo.index.commit(commit_msg +
                          f"Updated {nsamples} in {meta} as requested by "
                          f"{submitter}")
        if commit:
            # the token is only necessary when we really push
            # NOTE(review): the user name of the push url is assumed to be
            # the EMPD-admin bot account -- confirm
            token = os.environ['GH_TOKEN']
            push_url = f'https://EMPD-admin:{token}@github.com/{full_repo}.git'
            remote = repo.create_remote('push_remote', push_url)
            remote.push(branch)

    pr_owner = '@' + pull.user.login
    uri = pull.html_url
    if submitter_gh and '@' + submitter_gh != pr_owner:
        pr_owner += ' and @' + submitter_gh
    pr_msg = (
        f"Dear {pr_owner}, I just updated {nsamples} in your {meta} file "
        f"as requested via [EMPD2.github.io](https://empd2.github.io/) by "
        f"{submitter}.\n"
        f"If you believe that this is a bug or has been a wrong edit: "
        f"Please ping `@Chilipp`.")
    if commit:
        comment = comment_on_pr('EMPD2', 'EMPD-data', pull.number, pr_msg,
                                force=True)
        uri = comment.html_url
    return True, (
        f'Successfully pushed {nsamples} into {full_repo}/{meta} '
        f'and PR <a href="{uri}" title="PR #{pull.number}: {pull.title}">'
        f'#{pull.number}</a>.')
def fill_repo(meta, db_url, root_db=None, dry_run=False, meta_data=True,
              count_data=True, keep=None, how='left', on=None, exclude=(),
              columns='left', atol=1e-3):
    """Fill the EMPD-data repo with the database in the given URL

    Parameters
    ----------
    meta: str
        The path where to save the data
    db_url: str
        The url where the postgres database can be accessed. Note that we
        expect this database to have a ``'metaViewer'`` table
    root_db: str
        The url where the EMPD2 postgres database can be accessed. This
        parameter is only necessary where ``how != 'left-only'`` and
        `count_data` is True
    dry_run: bool
        If True, do not create any file but only report what would have been
        saved
    meta_data: bool
        If True (default), dump the meta data into `meta`
    count_data: bool
        If True (default), dump the pollen counts in the corresponding file
        of the sample
    keep: list
        Columns to keep from the `root_df`, i.e. the ``meta.tsv`` in the
        directory of `meta`
    how: str
        How to merge the `root` meta data into the new one. Possibilities
        are

        inner
            use intersection of samples from both frames, similar to a SQL
            inner join; preserve the order of the left keys.
        outer
            use union of samples from both frames, similar to a SQL full
            outer join; sort keys lexicographically.
        left (default)
            use only samples from the new frame, similar to a SQL left outer
            join; preserve key order.
        right
            use only samples from right frame, similar to a SQL right outer
            join; preserve key order.
        left-only
            do not compare against the `root` data at all but dump
            everything from `db_url`
    on: list of str
        The names of the columns to compute the diff on. If None, we use the
        intersection of columns between `left` and `right.`
    exclude: list of str
        Columns names that should be excluded in the diff. ``'var_'`` and
        ``'acc_var_'`` are always excluded
    columns: str or list of str
        The columns of the returned dataframe. It can either be a list of
        column names to use or one of

        leftdiff
            To use the columns from `left` that differ from `right`
        left (default)
            To use all columns from `left`
        rightdiff
            To use the columns from `right` that differ from `left`
        right
            To use all columns from `right`
        inner
            To use the intersection of `left` and `right`
        bothdiff
            To use the differing columns from `right` and `left` (columns
            from `right` are suffixed with an ``'_r'``)
        both
            To use all columns from `left` and `right` (columns from `right`
            are suffixed with an ``'_r'``)

        In any of these cases (except if you specify the column names
        explicitly), the columns the data frame will include a ``diff``
        column that contains for each sample the columns names of the
        differing cells.
    atol: float
        Absolute tolerance to use for numeric columns (see the
        :attr:`empd_admin.common.NUMERIC_COLS`).

    Returns
    -------
    str
        The markdown formatted report
    list
        The filenames that have changed (or would have been changed, if
        `dry_run` is True)

    Raises
    ------
    ValueError
        If the counts shall be compared (``how != 'left-only'`` and
        `count_data`) but no `root_db` is given"""
    compare = how != 'left-only'
    if count_data and compare and root_db is None:
        # fail before anything has been written to disk
        raise ValueError(
            "root_db is required to compare the count data if how is %r. "
            "Specify root_db or use how='left-only'." % (how, ))

    engine = sqlalchemy.create_engine(
        db_url, poolclass=sqlalchemy.pool.NullPool)
    outdir = osp.dirname(meta)
    exclude = list(exclude) + ['var_', 'acc_var_']

    meta_df = pd.read_sql('metaViewer', engine)

    # combine the climate columns into comma-separated strings. The
    # Precipitation slice ends at -1 because the last column is then the
    # Temperature that has just been added
    climate = pd.read_sql('climate', engine)
    climate['Temperature'] = list(map(
        ','.join, climate.iloc[:, 1:18].values.astype(str)))
    climate['Precipitation'] = list(map(
        ','.join, climate.iloc[:, 18:-1].values.astype(str)))
    meta_df = meta_df.merge(
        climate[['samplename', 'Temperature', 'Precipitation']].rename(
            columns={'samplename': 'SampleName'}),
        on='SampleName', how='left')
    meta_df.set_index('SampleName', inplace=True)

    # save meta data and load it again to make sure we have a consistent table
    with tempfile.NamedTemporaryFile(suffix='_empd.tsv') as f:
        dump_empd_meta(meta_df, f.name)
        meta_df = read_empd_meta(f.name)

    if 'okexcept' not in meta_df:
        meta_df['okexcept'] = ''

    files = []

    if compare:
        diff_kws = dict(how=how, on=on, exclude=exclude, columns=columns,
                        atol=atol)
        root_df = read_empd_meta(osp.join(outdir, 'meta.tsv'))
        meta_df = compute_diff(meta_df, root_df, **diff_kws)
        if keep:
            meta_df.loc[:, keep] = meta_df[[]].join(root_df[keep],
                                                    how='left')

    if meta_data and len(meta_df):
        files += [meta]
        if not dry_run:
            dump_empd_meta(meta_df, meta)
        message = f"Dumped {meta_df.shape[0]} lines to {osp.basename(meta)}."
    else:
        message = "No meta data has changed."

    if count_data:
        counts = pd.read_sql_query(
            'SELECT * FROM p_counts LEFT JOIN p_vars USING (var_)',
            engine, index_col=['samplename', 'original_varname'])
        if compare:
            root_engine = sqlalchemy.create_engine(
                root_db, poolclass=sqlalchemy.pool.NullPool)
            root_counts = pd.read_sql_query(
                'SELECT * FROM p_counts LEFT JOIN p_vars USING (var_)',
                root_engine, index_col=['samplename', 'original_varname'])
            diff = compute_diff(counts, root_counts, **diff_kws)
            changed = np.unique(diff.index.get_level_values(0))
            files.extend(map('samples/{}.tsv'.format, changed))
            if not dry_run:
                for key, group in counts.reset_index(-1).loc[changed].groupby(
                        level=0):
                    target = osp.join(outdir, 'samples', f'{key}.tsv')
                    dump_empd_meta(group, target)
        else:
            changed = np.unique(counts.index.get_level_values(0))
            files.extend(map('samples/{}.tsv'.format, changed))
            if not dry_run:
                for key, group in counts.groupby(level=0):
                    target = osp.join(outdir, 'samples', f'{key}.tsv')
                    dump_empd_meta(group, target)
        message += f" Changed {len(changed)} count files."

    if dry_run:
        message += '\n\nNo action has been performed because it was a dry run.'
    return message, files
def unaccept_query(meta, query, columns, commit=True, skip_ci=False,
                   raise_error=False, local_repo=None):
    """Reverse acceptance for failed meta data based on a SQL query

    This function reverses the acceptance made by the :func:`accept` or
    :func:`accept_query` function, based on a SQL query. The arguments are
    the same as for the :func:`accept_query` function.

    Parameters
    ----------
    meta: str
        The path to the metadata that shall be queried
    query: str
        The ``WHERE`` part of the query (see
        :func:`empd_admin.query.query_samples`).
    columns: list of str
        The columns that shall not be accepted any more. ``'all'`` removes
        every accepted column for the selected samples
    commit: bool
        If True, commit the changes in the repository of `meta` (one commit
        per column). If False, `meta` is updated on disk without a commit
    skip_ci: bool
        If True and `commit`, then ``[skip ci]`` will be added to the commit
        message
    raise_error: bool
        If True, raise an error on Failure, otherwise return the error msg
    local_repo: str
        The path of the local EMPD-data repository. If None, it will be
        assumed to be the directory of the given `meta`.

    Returns
    -------
    str
        The status message. None if everything is allright.

    Raises
    ------
    ValueError
        If `raise_error` is True and the `query` selects no sample

    See Also
    --------
    unaccept

    Examples
    --------
    Do not accept any failure for samples where the Country equals
    "Germany"::

        unaccept_query(meta, "Country = 'Germany'", ['Country'])
    """
    def drop_column(accepted, column):
        """Remove `column` from a comma-separated list of accepted columns"""
        names = accepted.split(',')
        if column not in names:
            return accepted
        kept = [name for name in names if name and name != column]
        return ','.join(kept) + ',' if kept else ''

    if local_repo is None:
        local_repo = osp.dirname(meta)
        base_meta = osp.basename(meta)
    else:
        base_meta = meta
        meta = osp.join(local_repo, meta)
    repo = Repo(local_repo)
    meta_df = read_empd_meta(meta)
    samples = query_samples(meta_df, query)

    if not len(samples):
        msg = "No samples selected with %r" % (query, )
        if raise_error:
            raise ValueError(msg)
        else:
            return msg

    if 'okexcept' not in meta_df.columns:
        meta_df['okexcept'] = ''
    else:
        meta_df['okexcept'] = meta_df.okexcept.fillna('')

    nsamples = len(samples)
    for column in columns:
        if column == 'all':
            meta_df.loc[samples, 'okexcept'] = ''
            message = (f"Do not accept any failure for {nsamples} samples\n\n"
                       f"based on '{query}'")
        else:
            # Series.replace would only match cells that equal `column + ','`
            # entirely, so we edit the comma-separated entries per cell
            meta_df.loc[samples, 'okexcept'] = meta_df.loc[
                samples, 'okexcept'].apply(drop_column, args=(column, ))
            message = (
                f"Do not accept wrong {column} for {nsamples} samples\n\n"
                f"based on '{query}'")
        if commit:
            dump_empd_meta(meta_df, meta)
            repo.index.add([base_meta])
            repo.index.commit(message + ('\n\n[skip ci]' if skip_ci else ''))
    if not commit:
        dump_empd_meta(meta_df, meta)
        return ("Reverted the acceptance of the fields but without having it "
                "commited. %i sample%s would have been affected.") % (
                    nsamples, 's' if nsamples > 1 else '')
def unaccept(meta, what, commit=True, skip_ci=False, raise_error=False,
             exact=False, local_repo=None):
    """Reverse acceptance for failed meta data

    This function reverses the acceptance made by the :func:`accept` or
    :func:`accept_query` function. Arguments are the same as for the
    :func:`accept` function, despite the fact that the `column` part in
    `what` can also be `all`.

    Parameters
    ----------
    meta: str
        The path to the metadata
    what: list of (sample, column) pairs
        The two parts of specifications like `sample:column`, where `sample`
        is a regular expression (or the name of the sample if `exact`) and
        the `column` is the column for the corresponding sample that shall
        not be accepted any more. Both, `sample` and `column`, can be
        ``'all'``
    commit: bool
        If True, commit the changes in the repository of `meta` (one commit
        per item in `what` that changed something). If False, `meta` is
        updated on disk without a commit
    skip_ci: bool
        If True and `commit`, then ``[skip ci]`` will be added to the commit
        message
    raise_error: bool
        If True, raise an error on Failure, otherwise return the error msg
    exact: bool
        If True, samples must be equal to the `sample` part in `what`.
        Otherwise we use regular expressions
    local_repo: str
        The path of the local EMPD-data repository. If None, it will be
        assumed to be the directory of the given `meta`.

    Returns
    -------
    str
        The status message. None if everything is allright.

    Raises
    ------
    ValueError
        If `raise_error` is True and a sample in `what` could not be found

    Examples
    --------
    Do not accept any failure for any column::

        unaccept(meta, [('all', 'all')])

    Do not accept any failure for latitudes or longitudes with samples that
    contain ``'Barboni'``::

        unaccept(meta, [('Barboni', 'Latitude'), ('Barboni', 'Longitude')])

    Do not accept wrong Temperature for the sample ``'Beaudouin_a1'``::

        unaccept(meta, [('Beaudouin_a1', 'Temperature')], exact=True)

    .. note::

        If you skip the `exact` parameter above, wrong temperatures would
        also be not accepted anymore for the sample ``Beaudouin_a10``!
    """
    def drop_column(accepted, column):
        """Remove `column` from a comma-separated list of accepted columns"""
        names = accepted.split(',')
        if column not in names:
            return accepted
        kept = [name for name in names if name and name != column]
        return ','.join(kept) + ',' if kept else ''

    if local_repo is None:
        local_repo = osp.dirname(meta)
        base_meta = osp.basename(meta)
    else:
        base_meta = meta
        meta = osp.join(local_repo, meta)
    repo = Repo(local_repo)
    meta_df = read_empd_meta(meta).reset_index()

    samples = np.unique([t[0] for t in what])
    valid = (samples == 'all')
    if exact:
        valid |= np.isin(samples, meta_df.SampleName.values)
    else:
        valid |= np.array(
            [meta_df.SampleName.str.contains(s).any() for s in samples])
    if not valid.all():
        msg = "Missing samples %s in %s" % (samples[~valid],
                                            osp.basename(meta))
        if raise_error:
            raise ValueError(msg)
        else:
            return msg

    if 'okexcept' not in meta_df.columns or not meta_df.okexcept.any():
        return  # no failures are accepted already

    # missing values would always compare as changed (nan != nan) and can
    # not be split, so we treat them as empty strings
    meta_df['okexcept'] = meta_df.okexcept.fillna('')
    old_okexcept = meta_df.okexcept.copy(True)
    names = meta_df.SampleName
    messages = []
    for sample, column in what:
        if sample == 'all':
            rows = slice(None)
            target = 'all samples'
        else:
            rows = names == sample if exact else names.str.contains(sample)
            target = f'sample {sample}'

        if column == 'all':
            meta_df.loc[rows, 'okexcept'] = ''
            if sample == 'all':
                message = 'Do not accept any failure'
            else:
                message = f"Do not accept any failure for {target}"
        else:
            # Series.replace would only match cells that equal `column + ','`
            # entirely, so we edit the comma-separated entries per cell
            meta_df.loc[rows, 'okexcept'] = meta_df.loc[
                rows, 'okexcept'].apply(drop_column, args=(column, ))
            message = f"Do not accept wrong {column} for {target}"
        messages.append(message)

        # only commit if this item really changed something
        if commit and (old_okexcept != meta_df['okexcept']).any():
            dump_empd_meta(meta_df, meta)
            repo.index.add([base_meta])
            repo.index.commit(message + ('\n\n[skip ci]' if skip_ci else ''))
            old_okexcept = meta_df['okexcept'].copy(True)
    if not commit:
        dump_empd_meta(meta_df, meta)
        return ("Reverted the acceptance of mentioned erroneous fields but "
                "did not commit.\n\n- " + "\n- ".join(messages))
def accept_query(meta, query, columns, commit=True, skip_ci=False,
                 raise_error=False, local_repo=None):
    """Accept failed metadata for the samples selected by a query

    This function marks the given `columns` as accepted (``okexcept``) for
    all samples that are selected by the `query`. The sql expression would
    be something like::

        UPDATE meta SET okexcept = ','.join(columns) WHERE query

    Parameters
    ----------
    meta: str
        The path to the metadata that shall be queried
    query: str
        The ``WHERE`` part of the query (see
        :func:`empd_admin.query.query_samples`).
    columns: list of str
        The columns that shall be marked as accepted (they will be appended
        to the existing columns)
    commit: bool
        If True, commit the changes in the repository `local_repo` (one
        commit per column). If False, `meta` is updated on disk without a
        commit
    skip_ci: bool
        If True and `commit`, then ``[skip ci]`` will be added to the commit
        message
    raise_error: bool
        If True, raise an error on Failure, otherwise return the error msg
    local_repo: str
        The path of the local EMPD-data repository. If None, it will be
        assumed to be the directory of the given `meta`.

    Returns
    -------
    str
        The status message. None if everything is allright.

    See Also
    --------
    accept

    Examples
    --------
    Accept missing Latitudes and Longitudes::

        accept_query(
            meta, "Latitude is NULL or Longitude is NULL",
            ['Latitude', 'Longitude'])
    """
    def normalize(accepted):
        # sorted, unique column names, terminated by a comma
        return ','.join(sorted(set(accepted[:-1].split(',')))) + ','

    if local_repo is None:
        local_repo, base_meta = osp.split(meta)
    else:
        base_meta, meta = meta, osp.join(local_repo, meta)

    repo = Repo(local_repo)
    meta_df = read_empd_meta(meta)
    samples = query_samples(meta_df, query)

    if len(samples) == 0:
        msg = "No samples selected with %r" % (query, )
        if raise_error:
            raise ValueError(msg)
        return msg

    if 'okexcept' in meta_df.columns:
        meta_df['okexcept'] = meta_df.okexcept.fillna('')
    else:
        meta_df['okexcept'] = ''

    nsamples = len(samples)
    ci_note = '\n\n[skip ci]' if skip_ci else ''

    for column in columns:
        extended = meta_df.loc[samples, 'okexcept'] + (column + ',')
        meta_df.loc[samples, 'okexcept'] = extended.apply(normalize)
        message = (f"Accept wrong {column} for {nsamples} samples\n\n"
                   f"based on '{query}'")
        if commit:
            dump_empd_meta(meta_df, meta)
            repo.index.add([base_meta])
            repo.index.commit(message + ci_note)

    if commit:
        return None

    dump_empd_meta(meta_df, meta)
    plural = 's' if nsamples > 1 else ''
    return ("Marked the fields as accepted but without having it "
            "commited. %i sample%s would have been affected.") % (
                nsamples, plural)
def accept(meta, what, commit=True, skip_ci=False, raise_error=False,
           exact=False, local_repo=None):
    """Accept failed metadata

    This function marks columns for specific cells as `okexcept`, such that
    it passes the EMPD-data tests

    Parameters
    ----------
    meta: str
        The path to the metadata
    what: list of (sample, column) pairs
        The two parts of specifications like `sample:column`, where `sample`
        is a regular expression (or the name of the sample if `exact`) and
        the `column` is the column for the corresponding sample that shall
        be accepted. The `sample` can also be ``'all'`` to match all samples
        in the metadata
    commit: bool
        If True, commit the changes in the repository of `meta` (one commit
        per item in `what`). If False, `meta` is updated on disk without a
        commit
    skip_ci: bool
        If True and `commit`, then ``[skip ci]`` will be added to the commit
        message
    raise_error: bool
        If True, raise an error on Failure, otherwise return the error msg
    exact: bool
        If True, samples must be equal to the `sample` part in `what`.
        Otherwise we use regular expressions
    local_repo: str
        The path of the local EMPD-data repository. If None, it will be
        assumed to be the directory of the given `meta`.

    Returns
    -------
    str
        The status message. None if everything is allright.

    Examples
    --------
    Accept wrong countries for all samples::

        accept(meta, [('all', 'Country')])

    Accept wrong latitudes and longitudes for all samples that contain
    ``'Barboni'``::

        accept(meta, [('Barboni', 'Latitude'), ('Barboni', 'Longitude')])

    Accept wrong Temperature for the sample ``'Beaudouin_a1'`` and nothing
    else::

        accept(meta, [('Beaudouin_a1', 'Temperature')], exact=True)

    .. note::

        If you skip the `exact` parameter above, wrong temperatures would
        also be accepted for the sample ``Beaudouin_a10``!"""
    def normalize(accepted):
        # sorted, unique column names, terminated by a comma
        return ','.join(sorted(set(accepted[:-1].split(',')))) + ','

    if local_repo is None:
        local_repo, base_meta = osp.split(meta)
    else:
        base_meta, meta = meta, osp.join(local_repo, meta)

    repo = Repo(local_repo)
    meta_df = read_empd_meta(meta).reset_index()

    # make sure that every requested sample can be found in the meta data
    samples = np.unique([pair[0] for pair in what])
    found = (samples == 'all')
    if exact:
        found |= np.isin(samples, meta_df.SampleName.values)
    else:
        found |= np.array(
            [meta_df.SampleName.str.contains(pattern).any()
             for pattern in samples])
    if not found.all():
        msg = "Missing samples %s in %s" % (samples[~found],
                                            osp.basename(meta))
        if raise_error:
            raise ValueError(msg)
        return msg

    if 'okexcept' in meta_df.columns:
        meta_df['okexcept'] = meta_df.okexcept.fillna('')
    else:
        meta_df['okexcept'] = ''

    names = meta_df.SampleName
    ci_note = '\n\n[skip ci]' if skip_ci else ''
    messages = []
    for sample, column in what:
        if sample == 'all':
            rows = slice(None)
            message = f"Accept wrong {column} for all samples"
        else:
            rows = names == sample if exact else names.str.contains(sample)
            message = f"Accept wrong {column} for sample {sample}"

        extended = meta_df.loc[rows, 'okexcept'] + (column + ',')
        meta_df.loc[rows, 'okexcept'] = extended.apply(normalize)
        messages.append(message)

        if commit:
            dump_empd_meta(meta_df, meta)
            repo.index.add([base_meta])
            repo.index.commit(message + ci_note)

    if commit:
        return None

    dump_empd_meta(meta_df, meta)
    return ("Marked the fields as accepted but without having it "
            "commited\n\n- " + "\n- ".join(messages))
def full_repo_test(local_repo, pr_id):
    """Run all tests and test the postgres import of a pull request

    This function is called by the webapp for new pull requests or after a
    PR has been edited. The critical tests are run first; the formatting
    and metadata tests are only run if the critical tests passed. The
    postgres import is only tested if no test failed.

    Parameters
    ----------
    local_repo: str
        The path to the local directory where the repository has been cloned
        into
    pr_id: int
        The number of the pull request

    Returns
    -------
    dict
        A mapping with status information of the PR. The keys are:

        message
            A markdown formatted message that can be posted on github
        status
            'failure', 'mixed' or 'good': Whether the tests passed or not
        sha
            The hexsha of the PR"""
    # make sure the path ends with a separator, such that replacing it by
    # 'data/' in the reports below results in clean relative paths
    local_repo = osp.join(local_repo, '')
    repo = Repo(local_repo)
    sha = repo.refs['pull/{pr}/head'.format(pr=pr_id)].commit.hexsha
    meta = get_meta_file(local_repo)
    # maps the title of the test suite to the 3-tuple returned by `run_test`,
    # unpacked below as (success, log, md)
    results = OrderedDict()

    # run cricital tests
    results['Critical tests'] = crit_success, crit_log, crit_md = run_test(
        meta, '-m critical --tb=line --maxfail=20'.split())
    if crit_success:
        results['Formatting tests'] = run_test(meta, ['--maxfail=20',
                                                      '--tb=line'],
                                               tests=['test_formatting.py'])
        results['Metadata tests'] = run_test(meta, ['--maxfail=20',
                                                    '--tb=line'],
                                             tests=['test_meta.py'])

    # NOTE: in the loop below, the names `md` and `log` are swapped compared
    # to the unpacking above (``success, log, md``). Because they are also
    # passed in swapped order to `format`, the third item of the result is
    # shown directly and the second one in the collapsed full test report
    test_summary = '\n\n'.join(
        textwrap.dedent("""
            ## {}..{}

            {}
            <details><summary>Full test report</summary>

            ```
            {}
            ```
            </details>""").format(
                key, "PASSED" if success else "FAILED",
                log.replace(local_repo, 'data/'),
                md.replace(local_repo, 'data/'))
        for key, (success, md, log) in results.items())

    good = textwrap.dedent("""
        Hi! I'm your friendly automated EMPD-admin bot!

        This is just to inform you that I tested your data submission in your
        PR (``%s``) and found it in an excellent condition!
        """ % osp.basename(meta))

    mixed = good + textwrap.dedent("""
        I just have some more information for you:
        """) + test_summary

    failed = textwrap.dedent("""
        Hi! I'm your friendly automated EMPD-admin bot!

        I found some errors in your data submission. You may fix some of
        them using the `@EMPD-admin fix` command.
        Please ping `@Chilipp` if you have difficulties with your submission.
        """) + test_summary

    if not all(t[0] for t in results.values()):
        # at least one test suite failed
        status = 'failure'
        message = failed
    elif any(t[2] for t in results.values()):
        # everything passed, but the third item of a result is not empty
        status = 'mixed'
        message = mixed
    else:
        status = 'good'
        message = good

    if status in ['mixed', 'good']:
        # test the import into postgres
        if ONHEROKU and len(read_empd_meta(meta)) > 700:
            message += "\n\nSkipping postgres import because of too many rows"
            success = True
        else:
            success, log, sql_dump = import_database(meta)
        # NOTE(review): a failed import is only reported in the message, the
        # `status` is not changed -- confirm that this is intended
        if not success:
            message += '\n\n' + textwrap.dedent("""
                ## Postgres import

                I tried to import your data into the postgres database, but
                did not success!

                <details>

                ```
                {}
                ```
                </details>
                """).format(log.replace(local_repo, 'data/'))

    return {'message': message, 'status': status, 'sha': sha}