Example #1
def generate_controlled_metadata(row, groups):
    """
    row  - row from input pandas.DataFrame object to convert to metadata
    cols - columns of input pandas.DataFrame to convert to metadata
    """
    metadata = {}
    # use the shared fields
    for col, val in row.items():  # Series.iteritems was removed in pandas 2.0
        ss_validator = SAMP_SERV_CONFIG['validators'].get(col, None)
        if ss_validator:
            if not pd.isnull(row[col]):
                idx = check_value_in_list(col, [g['value'] for g in groups],
                                          return_idx=True)
                try:
                    val = float(row[col])
                except (ValueError, TypeError):
                    val = row[col]
                mtd = {"value": val}
                if idx is not None:
                    mtd, _ = parse_grouped_data(row, groups[idx])
                # verify against validator
                missing_fields = _find_missing_fields(mtd, ss_validator)
                for field, default in missing_fields.items():
                    mtd[field] = default
                metadata[col] = mtd

    return metadata
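
A minimal usage sketch, assuming the module-level names used above (SAMP_SERV_CONFIG, check_value_in_list, parse_grouped_data, _find_missing_fields) are importable; the row and groups below are hypothetical:

import pandas as pd

# hypothetical input: one spreadsheet row and one column grouping
row = pd.Series({"latitude": "45.2", "depth": "10", "depth_unit": "m"})
groups = [{"value": "depth", "units": "depth_unit"}]

metadata = generate_controlled_metadata(row, groups)
# roughly {"depth": {"value": 10.0, "units": "m"}, "latitude": {"value": 45.2}};
# the exact result depends on the validators configured in SAMP_SERV_CONFIG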
Example #2
def generate_controlled_metadata(row, groups):
    """
    row - row from input pandas.DataFrame object to convert to metadata
    groups - list of dicts - each dict is a grouping where key = "metadata field name" (i.e. "value
    ", or "units") and value = input file column name
    """
    metadata = {}
    used_cols = set([])
    # use the shared fields
    for col, val in row.items():  # Series.iteritems was removed in pandas 2.0
        ss_validator = SAMP_SERV_CONFIG['validators'].get(col, None)
        # check if `col` is controlled metadata
        if ss_validator:
            # check if value associated with `col`, we don't store empty values
            if not pd.isnull(row[col]):
                try:
                    val = float(row[col])
                except (ValueError, TypeError):
                    val = row[col]
                mtd = {"value": val}
                # checking if there is a "grouping" for the metadata field `col`
                # "grouping" = two or more columns compose into one metadata field
                idx = check_value_in_list(col, [g['value'] for g in groups],
                                          return_idx=True)
                if idx is not None:
                    mtd, grouped_used_cols = parse_grouped_data(
                        row, groups[idx])
                    used_cols.update(grouped_used_cols)

                # verify against validator
                missing_fields = _find_missing_fields(mtd, ss_validator)
                for field, default in missing_fields.items():
                    mtd[field] = default

                col_type = ss_validator.get('key_metadata', {}).get('type')
                if col_type == 'string':
                    mtd['value'] = str(mtd['value'])

                metadata[col] = mtd
                used_cols.add(col)

    return metadata, used_cols
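
Unlike Example #1, this variant also returns the set of input columns it consumed, so a caller can route the remaining columns into user (uncontrolled) metadata. A hedged sketch, reusing the hypothetical row and groups from the sketch above:

metadata, used_cols = generate_controlled_metadata(row, groups)
leftover = [c for c in row.index if c not in used_cols]
# leftover columns could then be converted to user metadata instead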
Example #3
def sample_set_to_output(sample_set, sample_url, token, output_file,
                         output_file_format):
    """"""
    def add_to_output(o, key_metadata, val):
        if key_metadata in o:
            o[key_metadata] += [
                "" for _ in range(
                    len(o['kbase_sample_id']) - 1 - len(o[key_metadata]))
            ] + [val]
        else:
            o[key_metadata] = [
                "" for _ in range(len(o['kbase_sample_id']) - 1)
            ] + [val]
        return o

    if output_file_format == "SESAR":
        groups = SESAR_mappings['groups']

    output = {"kbase_sample_id": [], "sample name": []}
    for samp_id in sample_set['samples']:
        sample = get_sample(samp_id, sample_url, token)
        output['kbase_sample_id'].append(sample['id'])
        output['sample name'].append(sample['name'])
        used_headers = set(['kbase_sample_id', 'name', 'sample name'])
        for node in sample['node_tree']:
            # get 'source_meta' information
            source_meta = node.get('source_meta', [])
            source_meta_key = {m['key']: m['skey'] for m in source_meta}
            for key_metadata in node['meta_controlled']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_controlled'][key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(
                                key_metadata,
                                [upload_key_format(g['value']) for g in groups],
                                return_idx=True)
                            if (idx is not None and
                                    not groups[idx]['units'].startswith('str:')):
                                output = add_to_output(
                                    output, groups[idx]['units'], val)
                                used_headers.add(groups[idx]['units'])

            for key_metadata in node['meta_user']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_user'][key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(
                                key_metadata,
                                [upload_key_format(g['value']) for g in groups],
                                return_idx=True)
                            if (idx is not None and
                                    not groups[idx]['units'].startswith('str:')):
                                output = add_to_output(
                                    output, groups[idx]['units'], val)

    # pad shorter columns with blanks so every column has one entry per sample
    for key in output:
        output[key] += [
            ""
            for _ in range(len(output['kbase_sample_id']) - len(output[key]))
        ]

    df = pd.DataFrame.from_dict(output)

    def line_prepender(filename, line):
        with open(filename, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(line.rstrip('\r\n') + '\n' + content)

    df.to_csv(output_file, index=False)

    if output_file_format == "SESAR":
        line_prepender(output_file,
                       "Object Type:,Individual Sample,User Code:,")
Example #4
def sample_set_to_output(sample_set, sample_url, token, output_file,
                         output_file_format):
    """"""
    def add_to_output(o, key_metadata, val):
        if key_metadata in o:
            o[key_metadata] += [
                "" for _ in range(
                    len(o['kbase_sample_id']) - 1 - len(o[key_metadata]))
            ] + [val]
        else:
            o[key_metadata] = [
                "" for _ in range(len(o['kbase_sample_id']) - 1)
            ] + [val]
        return o

    if output_file_format.lower() == "sesar":
        groups = SESAR_mappings['groups']
    else:
        raise ValueError(f"SESAR only file format supported for export")

    output = {"kbase_sample_id": [], "name": []}
    for samp_id in sample_set['samples']:
        sample = get_sample(samp_id, sample_url, token)
        output['kbase_sample_id'].append(sample['id'])
        # keep the sample name so node ids that differ from it can be
        # exported as alternate ids below
        sample_name = sample['name']

        output['name'].append(sample_name)
        used_headers = set(['kbase_sample_id', 'name'])
        for node_idx, node in enumerate(sample['node_tree']):
            # check if node 'id' and sample 'name' are not the same
            if node['id'] != sample_name:
                output = add_to_output(output, f"alt_id_{node_idx}",
                                       node['id'])
            # get 'source_meta' information
            source_meta = node.get('source_meta', [])
            source_meta_key = {m['key']: m['skey'] for m in source_meta}
            for key_metadata in node['meta_controlled']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_controlled'][key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(
                                key_metadata,
                                [upload_key_format(g['value']) for g in groups],
                                return_idx=True)
                            if (idx is not None and
                                    not groups[idx]['units'].startswith('str:')):
                                output = add_to_output(
                                    output, groups[idx]['units'], val)
                                used_headers.add(groups[idx]['units'])

            for key_metadata in node['meta_user']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_user'][key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(
                                key_metadata,
                                [upload_key_format(g['value']) for g in groups],
                                return_idx=True)
                            if (idx is not None and
                                    not groups[idx]['units'].startswith('str:')):
                                output = add_to_output(
                                    output, groups[idx]['units'], val)

    # pad shorter columns with blanks so every column has one entry per sample
    for key in output:
        output[key] += [
            ""
            for _ in range(len(output['kbase_sample_id']) - len(output[key]))
        ]

    df = pd.DataFrame.from_dict(output)

    def line_prepender(filename, line):
        with open(filename, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(line.rstrip('\r\n') + '\n' + content)

    df.to_csv(output_file, index=False)

    if output_file_format.lower() == "sesar":
        line_prepender(output_file,
                       "Object Type:,Individual Sample,User Code:,")