Example #1
def map_iris(iri_dict, target_ontologies, distance, use_paxo, oxo_inner_url,
             query_size, verbose):
    quantified_url = "%s?size=%d" % (oxo_inner_url, query_size)
    data = {'ids': list(iri_dict.keys()), 'mappingTarget': target_ontologies}
    json_strings = []
    # data['distance'] = '1' if threshold >= 100 else '2' if threshold >= 75 else '3' if threshold >= 50 else ''
    data['distance'] = distance
    """ If boundary less than 50%, throw 'confidence too low' error: need to code! """
    oxo_hit_counter = 0
    while quantified_url is not None:
        reply = requests.post(quantified_url, data)
        oxo_hit_counter += 1
        json_content = reply.content
        json_string = json.loads(json_content)
        newsflash(json_string, verbose)
        json_strings.append(json_string)
        try:
            these_results = json_string["_embedded"]["searchResults"]
        except KeyError:
            newsflash(
                "IRI map load aborted: OxO hit %d times, with %d query terms" %
                (oxo_hit_counter, oxo_hit_counter * query_size))
            raise
        for this_result in these_results:
            source_label = this_result["label"]
            hits = this_result["mappingResponseList"]
            ontology_dict = {}
            for hit in hits:
                target_ontology = hit['targetPrefix']
                """ Create one key per target ontology, then append individual hits to associated array """
                ontology_dict.setdefault(target_ontology, []).append({
                    'curie': hit['curie'],
                    'target_label': hit['label'],
                    'distance': hit['distance']
                })
            # for okey in ontology_dict:
            # ontology_dict[okey] = ', '.join(ontology_dict[okey])
            iri_dict[this_result["queryId"]] = {
                'source_label': source_label,
                'ontodict': ontology_dict
            }
            # newsflash(ontology_dict)
        try:
            quantified_url = json_string["_links"]["next"]["href"]
        except KeyError:
            newsflash("Stopped: all good!")
            quantified_url = None
    newsflash("No. of iterative calls to OxO web API was %d" %
              len(json_strings))
    """ Passed-in dictionary object is mutated in situ: no need to return it """
    return None
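
A minimal usage sketch (the OxO endpoint, source CURIEs and target prefixes below are illustrative assumptions, not values taken from the project); because map_iris mutates the passed-in dictionary in place, the mappings are read back out of iri_dict rather than from a return value:

# Hypothetical call: endpoint, source CURIEs and target prefixes are examples only
iri_dict = {'EFO:0000400': None, 'EFO:0003767': None}
map_iris(iri_dict, ['MeSH', 'DOID'], distance=2, use_paxo=False,
         oxo_inner_url='https://www.ebi.ac.uk/spot/oxo/api/search',
         query_size=2, verbose=False)
for source_curie, mapping in iri_dict.items():
    if mapping:  # terms with no hits keep their original None value
        print(source_curie, mapping['source_label'], list(mapping['ontodict'].keys()))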
Example #2
def main():

    """ First of all, check configuration file """
    parser1 = argparse.ArgumentParser(description='Config filepath', add_help=False)
    parser1.add_argument('-g', '--config', help='filepath of config file (ini format)')
    namespace, extra = parser1.parse_known_args()
    config_file = namespace.config

    """ Second of all, get configuration info from configuration file (if specified) """
    ontoconfig = configparser.ConfigParser(os.environ)
    ontoconfig.optionxform = str
    column_list = None
    if config_file is not None:
        ontoconfig.read(config_file)
        columns_plus = ontoconfig.items('Columns')
        spurious_columns = ontoconfig.items('DEFAULT')
        real_columns = dict(set(columns_plus) - set(spurious_columns))
        column_list = list(real_columns.values())

    """ Third of all, parse the rest of the switches, possibly using defaults from configuration file """
    parser2 = argparse.ArgumentParser(prog='Ontomapper',
                                      description="%s%s" %
                                                   ('Allows the generation of a spreadsheet subsample. Specifically, ',
                                                    'samples rows at random and columns by parameterised request.'),
                                      parents=[parser1])
    cfg_sect_lookup = config_or_bust(ontoconfig, 'Params')
    parser2.add_argument('-i', '--input-file', default=cfg_sect_lookup('input_file', 'string'),
                         help='location of input spreadsheet: accepts filepath or URL')
    parser2.add_argument('-o', '--output', default=cfg_sect_lookup('output', 'string'),
                         help='output spreadsheet filepath **NO CURRENT EFFECT**')
    parser2.add_argument('-f', '--file-format', choices=['csv', 'tsv'],
                         default=cfg_sect_lookup('file_format', 'string'),
                         help='file format (both input and output)')
    parser2.add_argument('-s', '--sample-size', type=int, default=cfg_sect_lookup('sample_size', 'int'),
                         help='number of records to return in randomly sampled spreadsheet')
    parser2.add_argument('-c', '--column-name', nargs='+', default=column_list,
                         help='space-separated list of column names required in sampled output')

    if len(sys.argv) < 2:
        parser2.print_help(sys.stderr)
        newsflash()
        sys.exit(0)

    args = parser2.parse_args()

    """ vars returns a dictionary from the Namespace object; """
    arg_dict = vars(args)
    newsflash()
    newsflash("These are your opening parameters:")
    newsflash()
    # newsflash("Length of args dictionary is %d" % len(arg_dict))
    newsflash(pd.Series(arg_dict))
    newsflash()

    """ config is still in arg_dict at this point """
    arg_dict.pop('config')

    """ Don't check values of reserved options, which have no effect at the moment """
    active_arg_dict = arg_dict.copy()
    for inactive_arg in ['output']:
        active_arg_dict.pop(inactive_arg)
    if None in active_arg_dict.values():
        newsflash()
        newsflash("Please set values for the following parameters---on command line or in config file!")
        newsflash()
        for cfg_key in active_arg_dict:
            if active_arg_dict[cfg_key] is None:
                newsflash("\t%s" % cfg_key)
        newsflash()
        sys.exit(1)

    """ '**' unpacks a dictionary """
    sample_ss(**arg_dict)
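
A small self-contained sketch (section name and keys are hypothetical) of why the 'DEFAULT' items are subtracted above: configparser folds every DEFAULT entry, and here every environment variable passed to the constructor, into items() of each named section, so the 'Columns' section has to be stripped back to its own keys.

import configparser

cfg = configparser.ConfigParser()
cfg.optionxform = str  # preserve key case, as in main()
cfg.read_string("""
[DEFAULT]
shared = present-in-every-section
[Columns]
col1 = STUDY ACCESSION
col2 = MAPPED_TRAIT_URI
""")
columns_plus = cfg.items('Columns')  # includes the DEFAULT entry too
real_columns = dict(set(columns_plus) - set(cfg.items('DEFAULT')))
print(sorted(real_columns.values()))  # ['MAPPED_TRAIT_URI', 'STUDY ACCESSION']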
Example #3
def sample_ss(input_file, output, file_format, sample_size, column_name):
    ss_dict = {}
    iri_map = {}
    try:
        # newsflash("Spreadsheet location is %s" % spreadsheet)
        # newsflash("Column index is %d" % colno)
        separator = "\t" if file_format == 'tsv' else ","

        url_bool = re.compile(r'[a-zA-Z](\w|[-+.])*://.*')
        # filestuff = None
        if url_bool.match(input_file):
            t0 = time.time()
            newsflash("Getting spreadsheet from URL ...")
            r = requests.get(input_file, allow_redirects=True)
            t1 = time.time()
            newsflash("It took %.2f seconds to retrieve the spreadsheet" % float(t1 - t0))
            filestuff = io.StringIO(r.content.decode('utf-8'))
        else:
            filestuff = input_file

        newsflash("Pandafying spreadsheet ...")
        source_df = pd.read_csv(filestuff, sep=separator, low_memory=False, keep_default_na=False)

        newsflash("Generating random sample of records ...")
        output_df = source_df.sample(n=sample_size).loc[:, column_name]
        print(output_df.to_csv(index=False, sep=separator))

    except requests.exceptions.InvalidSchema as is_error:
        """ pandas should have coped with distinguishing text file from URL already """
        newsflash('Error retrieving spreadsheet?')
        newsflash(is_error)
    return None
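
A minimal invocation sketch (the file name and column headings are hypothetical); the sampled spreadsheet goes to standard output, and the output parameter is accepted but currently unused:

sample_ss(input_file='gwas_associations.tsv', output=None, file_format='tsv',
          sample_size=100, column_name=['STUDY ACCESSION', 'MAPPED_TRAIT_URI'])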
Example #4
def parse_ss(spreadsheet, separator, column_dict):
    ss_dict = {}
    iri_map = {}
    try:
        # newsflash("Spreadsheet location is %s" % spreadsheet)
        # newsflash("Column index is %d" % column_dict['index'])

        url_bool = re.compile(r'[a-zA-Z](\w|[-+.])*://.*')
        # filestuff = None
        if url_bool.match(spreadsheet):
            t0 = time.time()
            newsflash("Getting spreadsheet from URL ...")
            r = requests.get(spreadsheet, allow_redirects=True)
            t1 = time.time()
            newsflash("It took %.2f seconds to retrieve the spreadsheet" %
                      float(t1 - t0))
            filestuff = io.StringIO(r.content.decode('utf-8'))
        else:
            filestuff = spreadsheet

        newsflash("Pandafying spreadsheet ...")
        source_df = pd.read_csv(filestuff,
                                sep=separator,
                                low_memory=False,
                                keep_default_na=False)
        if column_dict['index'] is None:
            column_dict['index'] = source_df.columns.get_loc(
                column_dict['name'])
        else:
            column_dict['name'] = source_df.columns[column_dict['index']]
        colno = column_dict['index']

        newsflash("Getting source terms ...")
        source_iris = source_df.iloc[:, colno]
        # iri_lists = source_iris.apply(lambda x: x.split(", "))
        """
        1. Return empty array if empty string.
        2. Split by comma only, then remove spaces afterwards: more robust!
        """
        newsflash("Breaking source term strings into lists ...")
        # iri_lists = source_iris.apply(lambda x: [] if x == '' else x.split(", "))
        iri_lists = source_iris.apply(
            lambda x: [] if x == '' else [w.strip() for w in x.split(",")])
        """ Use nested lambdas to collect source IRIs from Series of lists """
        newsflash("Generate dictionary keyed on unique source terms ...")
        iri_lists.apply(
            lambda x, y: list(map(lambda z: y.update({z: None}), x)),
            args=[iri_map])
        newsflash("Dictionary generated!")
        ss_dict.update({'pandafued': source_df})
        ss_dict.update({'unique_iris': iri_map})
        newsflash("Returning from function 'parse_ss' ...")
    except requests.exceptions.InvalidSchema as is_error:
        """ pandas should have coped with distinguishing text file from URL already """
        # if is_error.
        #     raise
        newsflash('Error retrieving spreadsheet?')
        newsflash(is_error)
        # raise
    return ss_dict
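
The nested lambdas above are a terse way of harvesting every unique source IRI into iri_map; a plain-loop sketch of the same idea (the helper name is illustrative, not part of the project):

def collect_unique_iris(iri_lists, iri_map):
    """ Readable equivalent of the nested-lambda apply in parse_ss """
    for iri_list in iri_lists:  # iri_lists is a pandas Series of lists
        for iri in iri_list:
            iri_map.setdefault(iri, None)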
Example #5
def main():
    """ First of all, check configuration file """
    parser1 = argparse.ArgumentParser(description='Config filepath',
                                      add_help=False)
    parser1.add_argument('-g',
                         '--config',
                         help='filepath of config file (ini format)')
    namespace, extra = parser1.parse_known_args()
    config_file = namespace.config
    """ Second of all, get configuration info from configuration file (if specified) """
    ontoconfig = configparser.ConfigParser(os.environ)
    ontoconfig.optionxform = str
    target_list = None
    if config_file is not None:
        ontoconfig.read(config_file)
        targets_plus = ontoconfig.items('Targets')
        spurious_targets = ontoconfig.items('DEFAULT')
        real_targets = dict(set(targets_plus) - set(spurious_targets))
        # newsflash(pd.Series(real_targets))
        target_list = list(real_targets.values())
    """ Third of all, parse the rest of the switches, possibly using defaults from configuration file """
    parser2 = argparse.ArgumentParser(
        prog='Ontomapper',
        description="%s%s%s" %
        ('Takes source ontology (e.g. EFO) terms from an input spreadsheet, ',
         'and generates an output spreadsheet with equivalent terms from ',
         'another ontology or ontologies.'),
        parents=[parser1])
    cfg_sect_lookup = config_or_bust(ontoconfig, 'Params')
    parser2.add_argument(
        '-i',
        '--input-file',
        default=cfg_sect_lookup('input_file', 'string'),
        help='location of input spreadsheet: accepts filepath or URL')
    parser2.add_argument(
        '-o',
        '--output',
        default=cfg_sect_lookup('output', 'string'),
        help='output spreadsheet filepath **NO CURRENT EFFECT**')
    parser2.add_argument('-f',
                         '--file-format',
                         choices=['csv', 'tsv'],
                         default=cfg_sect_lookup('file_format', 'string'),
                         help='file format (both input and output)')
    parser2.add_argument(
        '-l',
        '--layout',
        choices=[
            'in-situ', 'uni-column', 'multi-column', 'uni-row', 'multi-row'
        ],
        default=cfg_sect_lookup('layout', 'string'),
        help="%s%s" %
        ('whether new ontology terms are required in multiple rows, ',
         'multiple columns, a single row, a single column, or the originating cell'
         ))
    cmeg = parser2.add_mutually_exclusive_group(required=False)
    cmeg.add_argument(
        '-x',
        '--column-index',
        type=int,  # default=cfg_sect_lookup('column_index', 'int'),
        help='zero-based index of column containing source ontology terms')
    cmeg.add_argument(
        '-c',
        '--column-name',
        default=cfg_sect_lookup('column_name', 'string'),
        help='name or heading of column containing source ontology terms')
    kmeg = parser2.add_mutually_exclusive_group(required=False)
    kmeg.add_argument('-k',
                      '--keep',
                      dest='keep',
                      action='store_true',
                      help='retain source ontology terms')
    kmeg.add_argument('-e',
                      '--no-keep',
                      dest='keep',
                      action='store_false',
                      help='ditch source ontology terms')
    parser2.add_argument(
        '-t',
        '--target',
        nargs='+',
        default=target_list,
        help='space-separated list of target ontology prefixes')
    parser2.add_argument(
        '-u',
        '--uri-format',
        choices=['long', 'short', 'curie'],
        default=cfg_sect_lookup('uri_format', 'string'),
        help='format of target ontology term identifiers **NO CURRENT EFFECT**'
    )
    parser2.add_argument('-d',
                         '--distance',
                         type=int,
                         default=cfg_sect_lookup('distance', 'int'),
                         choices=[1, 2, 3],
                         help='stepwise OxO distance (ontology to ontology)')
    parser2.add_argument('-r',
                         '--oxo-url',
                         default=cfg_sect_lookup('oxo_url', 'string'),
                         help='OxO (or Paxo) web service URL')
    pmeg = parser2.add_mutually_exclusive_group(required=False)
    pmeg.add_argument('-p',
                      '--paxo',
                      dest='paxo',
                      action='store_true',
                      help='use Paxo rather than OxO **NO CURRENT EFFECT**')
    pmeg.add_argument('-z',
                      '--no-paxo',
                      dest='paxo',
                      action='store_false',
                      help='do not use Paxo: use OxO')
    parser2.add_argument(
        '-n',
        '--number',
        type=int,
        default=cfg_sect_lookup('query_term_number', 'int'),
        help=
        'number of query terms to chunk, per HTTP request on the OxO web service'
    )
    vmeg = parser2.add_mutually_exclusive_group(required=False)
    vmeg.add_argument('-v',
                      '--verbose',
                      dest='verbose',
                      action='store_true',
                      help="%s%s" %
                       ('send verbose progress reports to standard error: ',
                       'not recommended for regular use'))
    vmeg.add_argument('-q',
                      '--quiet',
                      dest='verbose',
                      action='store_false',
                      help='suppress verbose output')
    parser2.add_argument(
        '-m',
        '--mapping-file',  # default=cfg_sect_lookup('mapping_file', 'string'),
        help=
        'optional extra output file with tab-separated list of source to target term mappings'
    )
    # parser2.add_argument('-b', '--boundary', type=int, default=cfg_sect_lookup('boundary', 'int'),
    #                      help="%s%s" % ('minimum percentage confidence threshold of target ontology term matches ',
    #                                     '**NO CURRENT EFFECT: ENFORCE 100%% CONFIDENCE (OxO distance=1)**'))
    parser2.set_defaults(keep=cfg_sect_lookup('keep', 'boolean'),
                         paxo=cfg_sect_lookup('paxo', 'boolean'),
                         verbose=cfg_sect_lookup('verbose', 'boolean'))
    parser2.add_argument('--version', action='version', version='%(prog)s 1.0')

    if len(sys.argv) < 2:
        parser2.print_help(sys.stderr)
        newsflash()
        sys.exit(0)

    args = parser2.parse_args()
    """ vars returns a dictionary from the Namespace object; """
    arg_dict = vars(args)
    newsflash()
    newsflash("These are your opening parameters:")
    newsflash()
    # newsflash("Length of args dictionary is %d" % len(arg_dict))
    newsflash(pd.Series(arg_dict))
    newsflash()
    """ config is still in arg_dict at this point """
    arg_dict.pop('config')
    """ Don't check values of reserved options, which have no effect at the moment; also, column_index may be null """
    active_arg_dict = arg_dict.copy()
    for inactive_arg in [
            'output', 'paxo', 'uri_format', 'column_index', 'mapping_file'
    ]:
        active_arg_dict.pop(inactive_arg)
    if None in active_arg_dict.values():
        newsflash()
        newsflash(
            "Please set values for the following parameters, on the command line or in the config file!"
        )
        newsflash()
        for cfg_key in active_arg_dict:
            if active_arg_dict[cfg_key] is None:
                newsflash("\t%s" % cfg_key)
        newsflash()
        sys.exit(1)

    newsflash(arg_dict, arg_dict['verbose'])
    """ '**' unpacks a dictionary """
    re_ontologise(**arg_dict)
Example #6
def re_ontologise(input_file, output, layout, file_format, column_index,
                  column_name, keep, target, uri_format, distance, paxo,
                  oxo_url, number, verbose, mapping_file):

    target = sorted(target)
    # newsflash("Length of target ontology array is %d" % len(target))
    # for t in target:
    #     newsflash("Target is %s" % t)
    field_separator = ',' if file_format == 'csv' else '\t'
    ss_column = {'index': column_index, 'name': column_name}
    ss_dict = parse_ss(input_file, field_separator, ss_column)
    column_index = ss_column['index']
    column_name = ss_column['name']
    iri_map = ss_dict['unique_iris']
    """ Print out list of source iris in iri_map """
    # iri_counter = 0
    # for src_iri in iri_map:
    ### print("%d\t%s\t%s" % (iri_counter, src_iri, iri_map[src_iri]))  # Print values _and_ keys
    # newsflash("%d\t%s" % (iri_counter, src_iri))
    # iri_counter += 1
    panda_original = ss_dict['pandafued']
    newsflash("Calling map_iris with url = '%s' ..." % oxo_url)
    map_iris(iri_map, target, distance, paxo, oxo_url, number, verbose)
    """ Print a tab-separated list of source and target terms, if --mapping-file switch specified """
    if mapping_file is not None:
        with open(mapping_file, 'w') as emf:
            for efo_iri in iri_map.keys():
                # for efo_map in iri_map[efo_iri].values():
                #     for efo_single in efo_map.split(', '):
                #         print("%s\t%s" % (efo_iri, efo_single), file=emf)
                ## for efo_map in iri_map[efo_iri]:
                for efo_map in iri_map[efo_iri]['ontodict'].values():
                    for efo_single in efo_map:
                        print("%s\t%s\t%s\t%s\t%d" %
                              (efo_iri, iri_map[efo_iri]['source_label'],
                               efo_single['curie'], efo_single['target_label'],
                               efo_single['distance']),
                              file=emf)

    newsflash("Calling augment ...")
    ontologically_enriched = augment(panda_original, iri_map, layout,
                                     column_index, keep, uri_format)
    """ Print out augmented_panda here ... """
    # newsflash("No. of dictionary elements: %d" % len(ss_dict))
    # newsflash("No. of rows in spreadsheet: %d" % len(panda_original))
    newsflash("No. of unique IRIs: %d" % len(iri_map))
    newsflash('', verbose)
    newsflash(ss_dict['unique_iris'], verbose)
    # newsflash(ss_dict.keys())
    # for spot_key in ss_dict:
    #     newsflash(spot_key)
    """ Enable print to check for uniqueness of IRIs """
    # for iri_key in ss_dict['unique_iris']:
    #     newsflash(iri_key)
    # newsflash(ss_dict['unique_iris'])
    newsflash("Outputting ontologically enriched spreadsheet ...")
    # print(ontologically_enriched.head(30).to_csv(index=False, sep='\t'))
    print(ontologically_enriched.to_csv(index=False, sep=field_separator))
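
With --mapping-file set, each source-to-target hit found above is written as one tab-separated line; a sketch of the row layout, with illustrative values only:

# source IRI/CURIE    source label         target CURIE    target label         distance
# EFO:0000400         diabetes mellitus    MeSH:D003920    Diabetes Mellitus    1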
Example #7
def augment(panda_input, iri_map, table_layout, colno, keep_original,
            iri_format):
    colname = panda_input.columns[colno]
    """ Guard against the source column being first or last: the unused neighbour is simply None """
    prev_colname = panda_input.columns[colno - 1] if colno > 0 else None
    next_colname = (panda_input.columns[colno + 1]
                    if colno < len(panda_input.columns) - 1 else None)
    out_columns = panda_input.columns
    newsflash()
    newsflash('These are the column headers of your pandas DataFrame:')
    newsflash()
    newsflash(out_columns)
    newsflash()
    in_tuple_counter = 0
    # out_tuple_counter = 0
    out_dict_list = []
    if table_layout in {'uni-column', 'multi-column'}:
        extra_col_dict_list = []
    tt0 = time.time()
    tt1 = tt0
    newsflash('Processing input records ...')
    for in_tuple in panda_input.itertuples():
        """ Need to convert back to regular tuple, from pandafied named tuple with extra leading index number """
        in_supple = tuple(in_tuple[1:])
        # source_string = in_tuple[colname]
        source_string = in_supple[colno]
        source_terms = list(map(lambda x: x.strip(), source_string.split(",")))
        target_groups = {}
        if table_layout == 'in-situ' and keep_original:
            """ Key '00source00' is lazy, collational way of placing source terms at top of list, where we want them """
            target_groups.setdefault('00source00', source_terms)
        for source_term in source_terms:
            # newsflash("Source term is %s" % source_term)
            # map_dict = iri_map[source_term]
            map_dict = iri_map.get(source_term)
            # print('map_dict below ...', file=sys.stderr)
            # print(map_dict, file=sys.stderr)
            if map_dict:
                for m in map_dict['ontodict']:
                    # print(m, file=sys.stderr)
                    """ target_groups assigned one key per target ontology --- NOT per source term in source cell! """
                    # target_groups.setdefault(m, []).append(map_dict['ontodict'][m])
                    for n in map_dict['ontodict'][m]:
                        target_groups.setdefault(m, []).append(n['curie'])

        for target_group in target_groups:
            # newsflash("Data type of target_groups: %s" % type(target_groups))
            # newsflash("Data type of target_group: %s" % type(target_group))
            # newsflash("target_group: %s" % target_group)
            # newsflash("Data type of target_groups[target_group]: %s" % type(target_groups[target_group]))
            # print(target_groups[target_group], file=sys.stderr)
            target_groups[target_group] = ', '.join(
                target_groups[target_group])
        tg_series = pd.Series(target_groups)
        # tg_series = tg_series.reindex(sorted(tg_series.index))
        # newsflash(tg_series)

        # out_dict_list = []

        if table_layout in {'in-situ', 'uni-row', 'uni-column'}:
            target_string = ', '.join(tg_series.values)
            # newsflash("Additions to %s from MeSH: %s" % (source_string, target_string))
            # newsflash()

        if table_layout in {'in-situ', 'uni-row', 'multi-row'}:
            out_dict = dict(zip(out_columns, in_supple))
            # newsflash(out_dict)

            if table_layout == 'in-situ' or keep_original:
                if len(tg_series) > 0 or keep_original:
                    if table_layout == 'in-situ':
                        out_dict[colname] = target_string
                    out_dict_list.append(out_dict)

            if table_layout == 'uni-row' and len(tg_series) > 0:
                out_dict_extra = dict(out_dict)
                out_dict_extra[colname] = target_string
                out_dict_list.append(out_dict_extra)

            elif table_layout == 'multi-row':
                for hit in tg_series.values:
                    out_dict_iter = dict(out_dict)
                    out_dict_iter[colname] = hit
                    out_dict_list.append(out_dict_iter)

        elif (table_layout in {'uni-column', 'multi-column'}
              and (len(tg_series) > 0 or keep_original)):
            """ Now need to handle uni- and multi-column outputs """
            out_dict = dict(zip(out_columns, in_supple))
            out_dict_list.append(out_dict)
            extra_col_dict = {}
            if table_layout == 'multi-column':
                extra_col_dict = dict(tg_series)
            elif table_layout == 'uni-column':
                # extra_col_dict = dict({'all_ontologies', ', '.join(tg_series.values)})
                extra_col_dict = dict({'EQUIVALENT_TRAIT_URIS': target_string})
            extra_col_dict_list.append(extra_col_dict)

        in_tuple_counter += 1
        if in_tuple_counter % 4000 == 0:
            tt2 = time.time()
            newsflash(
                "Processed %d thousand input records: took %.2f s (increment of %.2f s) ..."
                % (int(in_tuple_counter / 1000), float(tt2 - tt0),
                   float(tt2 - tt1)))
            tt1 = tt2

    tt2 = time.time()
    newsflash("Processed a total of %d input records, in %.2f seconds" %
              (in_tuple_counter, float(tt2 - tt0)))
    newsflash("No. of records in output spreadsheet is %d" %
              len(out_dict_list))
    newsflash()
    panda_output = pd.DataFrame(out_dict_list, columns=out_columns)
    if table_layout in {'uni-column', 'multi-column'}:
        newsflash("Adding new columns ...")
        # tg_series out of scope here!
        # extra_columns_df = pd.DataFrame(extra_col_dict_list,
        #                                 columns=[colname] if table_layout == 'uni-column' else tg_series.keys())
        extra_columns_df = pd.DataFrame(extra_col_dict_list)
        # pd.concat silently ignores None entries, so missing edge blocks just drop out
        left_block = (None if colno == 0 and not keep_original else
                      panda_output.loc[:, :(colname if keep_original else prev_colname)])
        right_block = (None if colno == len(panda_input.columns) - 1 else
                       panda_output.loc[:, next_colname:])
        panda_output = pd.concat([left_block, extra_columns_df, right_block],
                                 axis=1)

    return panda_output
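
The column splice in the uni-/multi-column branch leans on two pandas behaviours: label-based .loc slicing is inclusive at both ends, and pd.concat silently drops None entries from its input list. A toy sketch with made-up column names and values:

import pandas as pd

df = pd.DataFrame({'study': ['S1', 'S2'],
                   'trait_uri': ['EFO:1', 'EFO:2'],
                   'pvalue': [0.01, 0.03]})
extra = pd.DataFrame({'MeSH': ['MeSH:D1', 'MeSH:D2']})

# Keep everything up to and including the source column, splice in the new
# ontology columns, then append the remaining original columns.
spliced = pd.concat([df.loc[:, :'trait_uri'], extra, df.loc[:, 'pvalue':]],
                    axis=1)
print(list(spliced.columns))  # ['study', 'trait_uri', 'MeSH', 'pvalue']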