Example #1
    def build_json(self, filename1, filename2):
        # *** Work through manually curated chemical probes from the different portals ***
        # chemicalprobes column names are Probe, Target, SGClink, CPPlink, OSPlink, Note
        with URLZSource(filename1).open() as r_file:
            for i, row in enumerate(csv.DictReader(r_file,
                                                   dialect='excel-tab'),
                                    start=1):
                # Generate 'line' for current target
                probelinks = []
                if row["SGClink"] != "":
                    probelinks.append({
                        'source': "Structural Genomics Consortium",
                        'link': row["SGClink"]
                    })
                if row["CPPlink"] != "":
                    probelinks.append({
                        'source': "Chemical Probes Portal",
                        'link': row["CPPlink"]
                    })
                if row["OSPlink"] != "":
                    probelinks.append({
                        'source': "Open Science Probes",
                        'link': row["OSPlink"]
                    })

                line = {
                    "gene": row["Target"],
                    "chemicalprobe": row["Probe"],
                    "sourcelinks": probelinks,
                    "note": row["Note"]
                }
                # Add data for current chemical probe to self.chemicalprobes[Target]['portalprobes']
                # If gene has not appeared in chemical probe list yet,
                # initialise self.chemicalprobes with an empty list
                if row["Target"] not in self.chemicalprobes:
                    self.chemicalprobes[row["Target"]] = {}
                    self.chemicalprobes[row["Target"]]['portalprobes'] = []
                self.chemicalprobes[row["Target"]]['portalprobes'].append(line)

        # *** Work through Probe Miner targets ***
        # probeminer column names are hgnc_symbol, uniprot_symbol, nr_of_probes
        with URLZSource(filename2).open() as r_file:
            for i, row in enumerate(csv.DictReader(r_file,
                                                   dialect='excel-tab'),
                                    start=1):
                PMdata = {
                    "probenumber": row["nr_of_probes"],
                    "link": "https://probeminer.icr.ac.uk/#/" + row["uniprot_symbol"]
                }
                if row["hgnc_symbol"] not in self.chemicalprobes:
                    self.chemicalprobes[row["hgnc_symbol"]] = {}
                self.chemicalprobes[row["hgnc_symbol"]]['probeminer'] = PMdata
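
The same URLZSource + csv.DictReader pattern recurs throughout these examples; a minimal standalone sketch ('probes.tsv' is a hypothetical tab-separated file with a 'Target' column):

    import csv
    # Hypothetical input file; URLZSource also handles gzipped and remote paths.
    with URLZSource('probes.tsv').open() as r_file:
        for i, row in enumerate(csv.DictReader(r_file, dialect='excel-tab'), start=1):
            print(i, row['Target'])
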
Example #2
    def get_pathway_relations(self):
        added_relations = []
        with URLZSource(self.pathway_relation_url).open() as source:
            for i, row in enumerate(csv.DictReader(
                    source,
                    fieldnames=self.headers_pathway_rel,
                    dialect='excel-tab'),
                                    start=1):
                if len(row) != 2:
                    raise ValueError(
                        'Reactome.py: Pathway Relation file format unexpected at line %d.'
                        % i)

                parent_id = row["id"]
                child_id = row["related_id"]

                relation = (parent_id, child_id)
                if relation not in added_relations:
                    if parent_id in self.valid_pathway_ids:
                        yield dict(
                            id=parent_id,
                            child=child_id,
                        )
                        added_relations.append(relation)
                        if len(added_relations) % 1000 == 0:
                            self.logger.debug(
                                "%i rows parsed from reactome_pathway_relation"
                                % len(added_relations))
                else:
                    self.logger.warning(
                        "Pathway relation %s is already loaded, skipping duplicate data"
                        % str(relation))
        self.logger.info('parsed %i rows from reactome_pathway_relation' %
                         len(added_relations))
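
Membership tests against the growing added_relations list are O(n) per row; if insertion order does not matter, a set gives the same deduplication in O(1). A sketch under that assumption ('pairs' is hypothetical input):

    # Sketch: set-based dedup; assumes relation order need not be preserved.
    seen = set()
    for parent_id, child_id in pairs:
        relation = (parent_id, child_id)
        if relation in seen:
            continue        # duplicate, skip it
        seen.add(relation)  # first sighting: record it, then yield as above
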
Example #3
    def get_pathway_data(self):
        self.valid_pathway_ids = []
        with URLZSource(self.pathway_data_url).open() as source:
            for i, row in enumerate(csv.DictReader(source,
                                                   fieldnames=self.headers,
                                                   dialect='excel-tab'),
                                    start=1):
                if len(row) != 3:
                    raise ValueError(
                        'Reactome.py: Pathway file format unexpected at line %d.'
                        % i)

                pathway_id = row["id"]
                pathway_name = row["description"]
                species = row["species"]

                if pathway_id not in self.valid_pathway_ids:
                    if species in self.allowed_species:
                        self.valid_pathway_ids.append(pathway_id)
                        yield dict(
                            id=pathway_id,
                            name=pathway_name,
                            species=species,
                        )
                        if len(self.valid_pathway_ids) % 1000 == 0:
                            self.logger.debug(
                                "%i rows parsed for reactome_pathway_data" %
                                len(self.valid_pathway_ids))
                else:
                    self.logger.warning(
                        "Pathway id %s is already loaded, skipping duplicate data"
                        % pathway_id)

        self.logger.info('parsed %i rows for reactome_pathway_data' %
                         len(self.valid_pathway_ids))
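
Note that get_pathway_data is a generator, so self.valid_pathway_ids fills up only while it is consumed; get_pathway_relations (Example #2) therefore depends on the generator having been exhausted first. A hypothetical call sequence ('reactome' is an assumed instance):

    # Exhaust the data generator before requesting relations, because
    # valid_pathway_ids is populated lazily during iteration.
    pathways = list(reactome.get_pathway_data())
    relations = list(reactome.get_pathway_relations())
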
Example #4
    def retrieve_normal_tissue_data(self):
        """Parse 'normal_tissue' csv file,
        the expression profiles for proteins in human tissues from HPA

        :return: dict
        """
        self.logger.info('get normal tissue rows into dicts')
        table = (
            petl.fromcsv(URLZSource(self.normal_tissue_url), delimiter='\t')
            .rename({
                'Tissue': 'tissue',
                'Cell type': 'cell_type',
                'Level': 'level',
                'Reliability': 'reliability',
                'Gene': 'gene'
            })
            .cut('tissue', 'cell_type', 'level', 'reliability', 'gene')
            .addfield('tissue_label',
                      lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
            .addfield('tissue_code',
                      lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
            .addfield('tissue_level', lambda rec: level_from_text(rec['level']))
            .addfield('anatomical_systems',
                      lambda rec: asys_from_tissue(rec['tissue_label'], self.t2m))
            .addfield('organs',
                      lambda rec: organs_from_tissue(rec['tissue_label'], self.t2m))
            .addfield('tissue_reliability',
                      lambda rec: reliability_from_text(rec['reliability']))
            .cut('gene', 'tissue_code', 'tissue_label', 'tissue_level',
                 'tissue_reliability', 'cell_type', 'anatomical_systems', 'organs')
            .aggregate(('gene', 'tissue_code'),
                       aggregation={
                           'cell_types': (('cell_type', 'tissue_level',
                                           'tissue_reliability'), list),
                           'tissue_label': ('tissue_label', set),
                           'anatomical_systems': ('anatomical_systems', list),
                           'organs': ('organs', list)
                       },
                       presorted=True)
            .aggregate('gene',
                       aggregation={
                           'data': (('tissue_code', 'tissue_label', 'cell_types',
                                     'anatomical_systems', 'organs'), list)
                       },
                       presorted=True)
            .addfield('result', lambda rec: format_expression(rec))
            .cut('gene', 'result'))

        return table
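
petl tables are lazy, so the method returns a pipeline description rather than materialized rows; nothing is downloaded until the table is iterated. A hypothetical consumer ('hpa' and 'process' are assumed names):

    table = hpa.retrieve_normal_tissue_data()
    for row in table.dicts():            # iteration triggers the actual fetch/parse
        process(row['gene'], row['result'])
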
Example #5
    def test_urlzsource(self):
        lines4 = []
        with URLZSource('http://www.google.com/robots.txt').open() as f:
            take_and_rstrip = compose(curry(map, lambda l: rstrip(l, '\n')),
                                      curry(take, 4))
            lines4 = list(take_and_rstrip(f))

        print(str(lines4))
        self.assertGreaterEqual(len(lines4), 1,
                                "Failed to get more than 0 lines")
Example #6
    def populate_molecules_dict(self):
        self._logger.info('ChEMBL getting Molecule from ' + self.molecule_set_uri_pattern)
        # shelve normally creates its own database file, which does not play well
        # with a tempfile. Workaround: let dumbdbm create an empty database file
        # first, then hand it to shelve.Shelf so it opens properly.
        t_filename = tempfile.NamedTemporaryFile(delete=False).name
        dumb_dict = dumbdbm.open(t_filename)
        shelve_out = shelve.Shelf(dict=dumb_dict)
        with URLZSource(self.molecule_set_uri_pattern).open() as f_obj:
            for line in f_obj:
                mol = json.loads(line)
                shelve_out[str(mol["molecule_chembl_id"])] = mol

        self._logger.info('ChEMBL Molecule loading done. ')
        return shelve_out
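
dumbdbm is the Python 2 module name; on Python 3 the same workaround uses dbm.dumb. A sketch:

    import dbm.dumb
    import shelve
    import tempfile
    # dbm.dumb creates an empty database file that shelve can then open.
    t_filename = tempfile.NamedTemporaryFile(delete=False).name
    shelf = shelve.Shelf(dict=dbm.dumb.open(t_filename))
    shelf['example'] = {'works': True}
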
Example #7
    def __init__(self, tissue_translation_map, tissue_curation_map,
                 normal_tissue_url, rna_level_url, rna_value_url,
                 rna_zscore_url):
        self.logger = logging.getLogger(__name__)
        self.tissue_translation_map = tissue_translation_map
        self.tissue_curation_map = tissue_curation_map
        self.normal_tissue_url = normal_tissue_url
        self.rna_level_url = rna_level_url
        self.rna_value_url = rna_value_url
        self.rna_zscore_url = rna_zscore_url

        #load t2m
        t2m = {'tissues': {}, 'curations': {}}

        with URLZSource(self.tissue_translation_map).open() as r_file:
            t2m['tissues'] = json.load(r_file)['tissues']

        with URLZSource(self.tissue_curation_map).open() as r_file:
            t2m['curations'] = {
                el['name']: el['canonical']
                for el in csv.DictReader(
                    r_file, fieldnames=['name', 'canonical'], delimiter='\t')
            }
        self.t2m = t2m
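
The resulting t2m has two top-level keys; an illustrative shape (values are made up, not taken from the real maps):

    t2m = {
        'tissues': {},                              # parsed from the JSON translation map
        'curations': {'kidney cortex': 'kidney'},   # free-text name -> canonical name
    }
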
Example #8
def get_data_config(data_url):
    with URLZSource(data_url).open() as r_file:
        #note: use safe loading, as described at https://pyyaml.org/wiki/PyYAMLDocumentation
        #TL;DR - only dicts, lists and primitives
        data_config = yaml.safe_load(r_file)

        #replace hyphens with underscores in the keys, because we want to use
        #addict to access config as config.foo_bar instead of config["foo-bar"]
        data_config_underscores = {}
        for key in data_config:
            key_underscore = key.replace("-", "_")
            data_config_underscores[key_underscore] = data_config[key]

        return addict.Dict(data_config_underscores)
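
The hyphen-to-underscore rewrite exists purely so addict attribute access works; a sketch ('data.yml' is hypothetical and assumed to contain a top-level 'foo-bar' key):

    config = get_data_config('data.yml')
    print(config.foo_bar)   # 'config.foo-bar' would not be valid Python
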
Example #9
    def merge_data(self, genes, loader, r_server, data_config):

        self._logger.info("HGNC parsing - requesting from URL %s",
                          data_config.hgnc_complete_set)

        with URLZSource(data_config.hgnc_complete_set).open() as source:

            data = json.load(source)

            for row in data['response']['docs']:
                gene = Gene()
                gene.load_hgnc_data_from_json(row)
                genes.add_gene(gene)

            self._logger.info("STATS AFTER HGNC PARSING:\n" +
                              genes.get_stats())
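
hgnc_complete_set.json wraps its records in a Solr-style envelope, which is why the code indexes ['response']['docs']; an illustrative, abbreviated shape:

    data = {
        'response': {
            'docs': [
                {'hgnc_id': 'HGNC:5', 'symbol': 'A1BG'},   # one dict per gene
            ]
        }
    }
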
Example #10
    def process(self, ensembl_filename, dry_run):
        def _put_line(line):
            return 1

        self.logger.info('Reading Ensembl gene info from %s' %
                         ensembl_filename)

        #setup elasticsearch
        if not dry_run:
            self.loader.create_new_index(
                Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME)
            #need to directly get the versioned index name for this function
            self.loader.prepare_for_bulk_indexing(
                self.loader.get_versioned_index(
                    Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME))

        inserted_lines = 0
        for line in more_itertools.with_iter(
                URLZSource(ensembl_filename).open()):
            entry = json.loads(line)
            #store in elasticsearch if not dry running
            if not dry_run:
                self.loader.put(Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME,
                                Const.ELASTICSEARCH_ENSEMBL_DOC_NAME,
                                entry['id'], line)
            inserted_lines += 1

        self.logger.info("Read %d lines from %s", inserted_lines,
                         ensembl_filename)

        self.logger.info("flush index")

        #cleanup elasticsearch
        if not dry_run:
            self.loader.flush_all_and_wait(
                Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME)
            #restore old pre-load settings
            #note this automatically does all prepared indexes
            self.loader.restore_after_bulk_indexing()
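
more_itertools.with_iter wraps the handle in a with block internally and closes it when iteration ends, which is why no explicit with statement is needed around the loop. A minimal sketch ('genes.jsonl' is hypothetical):

    import more_itertools
    for line in more_itertools.with_iter(URLZSource('genes.jsonl').open()):
        pass   # the handle is closed automatically after the last line
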
Example #11
    def merge_data(self, genes, loader, r_server, data_config):

        #turn the species id/label mappings into a dict from the argument list
        self.orthologs_species = dict()
        if data_config.hgnc_orthologs_species:
            for value in data_config.hgnc_orthologs_species:
                code, label = value.split("-")
                label = label.strip()
                code = code.strip()
                self.orthologs_species[code] = label

        self._logger.info("Ortholog parsing - requesting from URL %s",
                          data_config.hgnc_orthologs)

        with URLZSource(data_config.hgnc_orthologs).open() as source:
            reader = csv.DictReader(source, delimiter="\t")
            for row in reader:
                if row['human_ensembl_gene'] in genes:
                    self.add_ortholog_data_to_gene(
                        gene=genes[row['human_ensembl_gene']], data=row)

        self._logger.info("STATS AFTER HGNC ortholog PARSING:\n" +
                          genes.get_stats())
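
value.split("-") assumes exactly one hyphen per mapping, so a hyphenated species label would raise a ValueError; a sketch that splits only on the first hyphen (inputs are illustrative):

    for value in ['10090-Mus musculus', '7955-Danio rerio']:
        code, label = (part.strip() for part in value.split('-', 1))
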
Example #12
    def build_json(self, filename):
        # Just for reference: column names are: "ID_CENSUS_ANNOT", "ID_CENSUS", "ID_GENE", "GENE_NAME", "CELL_TYPE",
        # "PUBMED_PMID", "ID_DATA_CATEGORY", "DESCRIPTION", "DISPLAY", "SHORT", "CELL_LINE", "DESCRIPTION_1")
        with URLZSource(filename).open() as r_file:
            for i, row in enumerate(csv.DictReader(r_file,
                                                   dialect='excel-tab'),
                                    start=1):

                PMID = re.sub(r'^"|"$', '', row["PUBMED_PMID"])
                Short = re.sub(r'^"|"$', '', row["SHORT"])
                GeneSymbol = re.sub(r'^"|"$', '', row["GENE_NAME"])
                Description_1 = re.sub(r'^"|"$', '', row["DESCRIPTION_1"])
                Description_1 = Description_1.rstrip()  # rstrip returns a new string, so keep the result
                Description = re.sub(r'^"|"$', '', row["DESCRIPTION"])

                if GeneSymbol not in self.hallmarks:
                    self.hallmarks[GeneSymbol] = dict()

                if Description_1 in self.hallmarks_labels:
                    promote = False
                    suppress = False

                    if Short == 'a': promote = True
                    if Short == 's': suppress = True

                    line = {
                        "label": Description_1,
                        "description": Description,
                        "promote": promote,
                        "suppress": suppress,
                        "pmid": PMID
                    }

                    try:
                        self.hallmarks[GeneSymbol]["cancer_hallmarks"].append(
                            line)
                    except KeyError:
                        self.hallmarks[GeneSymbol]["cancer_hallmarks"] = list()
                        self.hallmarks[GeneSymbol]["cancer_hallmarks"].append(
                            line)

                elif Description_1 == 'function summary':
                    line = {"pmid": PMID, "description": Description}

                    try:
                        self.hallmarks[GeneSymbol]["function_summary"].append(
                            line)
                    except KeyError:
                        self.hallmarks[GeneSymbol]["function_summary"] = list()
                        self.hallmarks[GeneSymbol]["function_summary"].append(
                            line)

                else:
                    line = {
                        "attribute_name": Description_1,
                        "description": Description,
                        "pmid": PMID
                    }

                    try:
                        self.hallmarks[GeneSymbol]["attributes"].append(line)
                    except KeyError:
                        self.hallmarks[GeneSymbol]["attributes"] = list()
                        self.hallmarks[GeneSymbol]["attributes"].append(line)
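
Each try/except KeyError block above can be collapsed with dict.setdefault; a behavior-preserving sketch ('hallmarks' stands in for self.hallmarks[GeneSymbol]):

    hallmarks = {}
    line = {'label': 'example'}
    hallmarks.setdefault('cancer_hallmarks', []).append(line)
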
Example #13
    def retrieve_rna_data(self):
        """
        Parse 'rna_tissue' csv file,
        RNA levels in 56 cell lines and 37 tissues based on RNA-seq from HPA.

        :return: dict
        """
        self.logger.info('get rna tissue rows into dicts')
        self.logger.debug('melting rna level table into geneid tissue level')

        t_level = (
            petl.fromcsv(URLZSource(self.rna_level_url), delimiter='\t')
            .melt(key='ID', variablefield='tissue', valuefield='rna_level')
            .rename({'ID': 'gene'})
            .addfield('tissue_label',
                      lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
            .addfield('tissue_code',
                      lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
            .addfield('anatomical_systems',
                      lambda rec: asys_from_tissue(rec['tissue_label'], self.t2m))
            .addfield('organs',
                      lambda rec: organs_from_tissue(rec['tissue_label'], self.t2m))
            .cutout('tissue'))

        t_value = (
            petl.fromcsv(URLZSource(self.rna_value_url), delimiter='\t')
            .melt(key='ID', variablefield='tissue', valuefield='rna_value')
            .rename({'ID': 'gene'})
            .addfield('tissue_label',
                      lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
            .addfield('tissue_code',
                      lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
            .addfield('rna_unit', 'TPM')
            .cutout('tissue'))

        t_zscore = (
            petl.fromcsv(URLZSource(self.rna_zscore_url), delimiter='\t')
            .melt(key='ID', variablefield='tissue', valuefield='zscore_level')
            .rename({'ID': 'gene'})
            .addfield('tissue_label',
                      lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
            .addfield('tissue_code',
                      lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
            .cutout('tissue'))

        t_vl = petl.join(t_level,
                         t_value,
                         key=('gene', 'tissue_code', 'tissue_label'),
                         presorted=True)

        t_join = (petl.join(t_vl,
                            t_zscore,
                            key=('gene', 'tissue_code', 'tissue_label'),
                            presorted=True).aggregate(
                                'gene',
                                aggregation={
                                    'data': (('tissue_code', 'tissue_label',
                                              'rna_level', 'rna_value',
                                              'rna_unit', 'anatomical_systems',
                                              'organs', 'zscore_level'), list)
                                },
                                presorted=True))

        return t_join
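
presorted=True tells petl to skip sorting before the joins and aggregation, which assumes the HPA files are already ordered by the key columns; if that assumption fails, rows can be silently mis-joined. A sketch of the safer (but slower) default:

    # Omitting presorted (default False) makes petl sort the inputs first.
    t_vl = petl.join(t_level, t_value, key=('gene', 'tissue_code', 'tissue_label'))
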
Example #14
    def get_genotype_phenotype(self):
        self._logger.debug("get_genotype_phenotype")
        with URLZSource(
                self.data_config.mouse_phenotypes_orthology).open() as fi:
            self._logger.debug("get %s",
                               self.data_config.mouse_phenotypes_orthology)

            for li, line in enumerate(fi):
                # strip stray whitespace from every field; materialize as a
                # list so len() works on Python 3
                array = [field.strip() for field in line.strip().split("\t")]
                if len(array) == 7:
                    (human_gene_symbol, a, b, c, mouse_gene_symbol,
                     mouse_gene_id, phenotypes_raw) = array

                    # at least 1 phenotype in phenotypes_raw
                    if len(phenotypes_raw) > 0:
                        try:
                            mouse_gene_id = mouse_gene_id.strip()
                            mouse_gene_symbol = mouse_gene_symbol.strip()
                            if mouse_gene_id not in self.mouse_genes:
                                self.mouse_genes[mouse_gene_id] = {
                                    "gene_id": mouse_gene_id,
                                    "gene_symbol": mouse_gene_symbol,
                                    "phenotypes": {},
                                    "human_orthologs": [],
                                    # str.split treats "\s+" literally, so a real
                                    # regex split is needed (requires the re module)
                                    "phenotypes_summary":
                                        re.split(r"\s+", phenotypes_raw.strip())
                                }
                            self.mouse_genes[mouse_gene_id][
                                "human_orthologs"].append({
                                    "gene_symbol": human_gene_symbol,
                                    "gene_id": None
                                })

                            if human_gene_symbol not in self.human_genes:
                                self.human_genes[human_gene_symbol] = {
                                    "gene_symbol": human_gene_symbol,
                                    "ensembl_gene_id": None,
                                    "gene_id": None,
                                    "mouse_orthologs": []
                                }
                        except Exception as e:
                            self._logger.debug(
                                "exception processing a line %d: %s", li,
                                str(e))

        self._logger.info("Retrieved %i mouse genes", len(self.mouse_genes))

        count_symbols = set()
        count_accepted_symbols = set()

        with URLZSource(self.data_config.mouse_phenotypes_report).open() as fi:
            self._logger.debug("get lines from mgi report phenotypes file %s",
                               self.data_config.mouse_phenotypes_report)

            # Allelic Composition	Allele Symbol(s)	Genetic Background	Mammalian Phenotype ID	PubMed ID	MGI Marker Accession ID
            for li, line in enumerate(fi):
                # strip stray whitespace from every field; materialize as a
                # list so len() works on Python 3
                array = [field.strip() for field in line.strip().split("\t")]

                self._logger.debug('mouse KO array %s in line %d', str(array),
                                   li)

                if len(array) == 6:
                    (allelic_composition, allele_symbol, genetic_background,
                     mp_id, pmid, mouse_gene_ids) = array
                    # check for double-mutant but exclude duplicates
                    for mouse_gene_id in set(mouse_gene_ids.split(",")):
                        # exclude heritable phenotypic markers like http://www.informatics.jax.org/marker/MGI:97446
                        count_symbols.add(mouse_gene_id)

                        mp_id_key = mp_id.split("/")[-1].replace(":", "_")
                        self._logger.debug("Looking for mouse_gene_id " +
                                           mouse_gene_id)
                        self._logger.debug("Looking for mp_id_key " +
                                           mp_id_key)

                        if mouse_gene_id in self.mouse_genes and mp_id_key in self.mps:
                            self._logger.debug('process mouse KO gene %s',
                                               mouse_gene_id)
                            count_accepted_symbols.add(mouse_gene_id)
                            self._logger.debug('get class for %s' % mp_id)
                            mp_class = self.mps[mp_id_key]
                            mp_label = mp_class["label"]

                            for k, v in PHENOTYPE_CATEGORIES.items():
                                if k not in self.mouse_genes[mouse_gene_id][
                                        "phenotypes"]:
                                    self.mouse_genes[mouse_gene_id]["phenotypes"][k] = \
                                        {
                                            "category_mp_identifier": k,
                                            "category_mp_label": v,
                                            "genotype_phenotype": []
                                        }

                            # it's possible that there are multiple paths to the same root.
                            mp_category_ids = set(
                                map(lambda x: x[0], mp_class["path_codes"]))
                            for category_id in mp_category_ids:
                                mp_category_id = category_id.replace("_", ":")
                                self.mouse_genes[mouse_gene_id]["phenotypes"][
                                    mp_category_id][
                                        "genotype_phenotype"].append({
                                            "subject_allelic_composition":
                                            allelic_composition,
                                            "subject_background":
                                            genetic_background,
                                            "pmid":
                                            pmid,
                                            "mp_identifier":
                                            mp_id,
                                            "mp_label":
                                            mp_label
                                        })
                        else:
                            self._logger.warning(
                                'skipping mouse KO gene %s at line %d: gene not in self.mouse_genes or MP term not in self.mps',
                                mouse_gene_id, li)
                else:
                    self._logger.warning("could not process %i %s", len(array),
                                         line)

        self._logger.info("Count symbols %i / %i with phenotypes",
                          len(count_accepted_symbols), len(count_symbols))
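
PHENOTYPE_CATEGORIES is consumed as a mapping from MP category identifiers to labels; an illustrative entry (the real constant lives elsewhere in the module):

    PHENOTYPE_CATEGORIES = {
        'MP:0005386': 'behavior/neurological phenotype',   # example entry only
    }
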
Example #15
def open_to_read(filename):
    """Return an iterator of (filename, (line_number, line)) pairs,
    with line numbers starting at 1."""
    _l.debug('generate an iterator of (filename, enumerate) for filename %s',
             filename)
    it = more_itertools.with_iter(URLZSource(filename).open())
    # zip on Python 3 (itertools.izip on Python 2)
    return zip(itertools.cycle([filename]), enumerate(it, start=1))
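
Each yielded item is (filename, (line_number, line)); a hypothetical consumer ('data.txt.gz' is an assumed input):

    for filename, (lineno, line) in open_to_read('data.txt.gz'):
        print(filename, lineno, line.rstrip('\n'))
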
Example #16
    def build_json(self, filename):

        with URLZSource(filename).open() as r_file:
            # fieldnames=cancerbiomarker_columns not used at the moment
            for i, row in enumerate(csv.DictReader(r_file,
                                                   dialect='excel-tab'),
                                    start=1):

                Source = row["Source"]
                Gene = row["Gene"]
                IndividualMutation = row["IndividualMutation"]
                PrimaryTumorTypeFullName = row["PrimaryTumorTypeFullName"]

                # Split Source and Gene to separate out multiple entries.
                # Materialize mSource as a list: a map iterator would be
                # exhausted after the first gene in geneList below.
                mSource = list(map(str.strip, Source.split(";")))
                geneList = list(map(str.strip, Gene.split(";")))
                # If the two genes are identical, only keep one copy to prevent duplication of current biomarker
                if len(geneList) > 1:
                    if geneList[0] == geneList[1]:
                        geneList = [geneList[0]]

                # Edit IndividualMutation from eg. FGFR3:V555M to FGFR3 (V555M)
                # Replace ':' with ' (' and add ')' at the end
                if ":" in IndividualMutation:
                    IndividualMutation = IndividualMutation.replace(':',
                                                                    ' (') + ')'

                # Get Tumor type names and EFO IDs/links
                PrimaryTumorTypeFullName = PrimaryTumorTypeFullName.replace(
                    ' ', '_')
                PrimaryTumorTypeFullName = PrimaryTumorTypeFullName.replace(
                    '-', '_')
                TumorNames = ""
                TumorIDs = ""
                if ";" in PrimaryTumorTypeFullName:
                    TumorTypes = PrimaryTumorTypeFullName.split(";")
                    diseases = []
                    for TumorType in TumorTypes:
                        diseases.append({
                            'label':
                            BIOMARKER_DISEASE_MAPPINGS[TumorType]['label'],
                            'id': (BIOMARKER_DISEASE_MAPPINGS[TumorType]['url']
                                   ).split('/')[-1]
                        })
                else:
                    diseases = [{
                        'label':
                        BIOMARKER_DISEASE_MAPPINGS[PrimaryTumorTypeFullName]
                        ['label'],
                        'id':
                        (BIOMARKER_DISEASE_MAPPINGS[PrimaryTumorTypeFullName]
                         ['url']).split('/')[-1]
                    }]

                # Iterate through genes and sources
                for singleGene in geneList:
                    # Replace 3 gene symbols with their approved_symbol (C15orf55=NUTM1, MLL=KMT2A, MLL2=KMT2D)
                    if singleGene == 'C15orf55':
                        singleGene = 'NUTM1'
                    elif singleGene == 'MLL':
                        singleGene = 'KMT2A'
                    elif singleGene == 'MLL2':
                        singleGene = 'KMT2D'
                    # If gene has not appeared in biomarker list yet,
                    # initialise self.cancerbiomarkers with an empty list
                    if singleGene not in self.cancerbiomarkers:
                        self.cancerbiomarkers[singleGene] = []

                    # Create empty lists for PMIDs and other references
                    pubmed = []
                    other = []

                    # Go through the references/sources
                    for singleSource in mSource:
                        if "PMID" in singleSource:  # If the source is a PMID
                            currPMID = singleSource[
                                5:]  # Remove 'PMID:' if necessary
                            pubmed.append({'pmid': currPMID})
                        else:  # Else: the source is either a clinical trial or a conference abstract
                            if 'NCT' in singleSource:
                                other.append({
                                    'name': singleSource,
                                    'link': 'https://clinicaltrials.gov/ct2/show/' + singleSource,
                                    'description': 'Clinical Trial'
                                })
                            elif singleSource.split(" (")[0] in BIOMARKER_SOURCE_MAPPINGS:
                                source_key = singleSource.split(" (")[0]
                                other.append({
                                    'name': singleSource,
                                    'link': BIOMARKER_SOURCE_MAPPINGS[source_key]['url'],
                                    'description': BIOMARKER_SOURCE_MAPPINGS[source_key]['label']
                                })

                    # Put the reference info together for each biomarker
                    myReferences = {"pubmed": pubmed, "other": other}

                    line = {
                        "gene": singleGene,
                        "biomarker": row["Biomarker"],
                        "individualbiomarker": row["IndividualMutation"],
                        "association": row["Association"],
                        "drug": row["Drug"],
                        "drugfamily": row["DrugFamily"],
                        "drugfullname": row["DrugFullName"],
                        "diseases": diseases,
                        "evidencelevel": row["EvidenceLevel"],
                        "references": myReferences
                    }
                    # Add data for current biomarker to self.cancerbiomarkers
                    self.cancerbiomarkers[singleGene].append(line)
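
The C15orf55/MLL/MLL2 if/elif chain could also be table-driven; a behavior-preserving sketch:

    APPROVED_SYMBOLS = {'C15orf55': 'NUTM1', 'MLL': 'KMT2A', 'MLL2': 'KMT2D'}
    singleGene = APPROVED_SYMBOLS.get(singleGene, singleGene)
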
Example #17
def get_chembl_info_by_file(uri):
    with URLZSource(uri).open() as f_obj:
        for i, line in enumerate(f_obj, start=1):
            chembl_dict = json.loads(line)
            yield chembl_dict
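
The generator streams one ChEMBL record per JSON line, so the whole file is never held in memory; hypothetical usage ('molecules.jsonl.gz' is an assumed URI):

    for mol in get_chembl_info_by_file('molecules.jsonl.gz'):
        print(mol.get('molecule_chembl_id'))
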
Example #18
    def build_json(self, filename):
        self._logger.info("Data in TSV file comes in non standard ways, eg. bool comes as categ. data Y/N"
                          "so casting to bool, int and float with default fallback values instead of "
                          "throwing exceptions as we are parsing a TSV file where types do not exist")
        to_bool = SaferBool(with_fallback=False)
        to_int = SaferInt(with_fallback=0)
        to_float = SaferFloat(with_fallback=0.)

        sm_bucket_list = [1, 2, 3, 4, 5, 6, 7, 8]
        ab_bucket_list = [1, 2, 3, 4, 5, 6, 7, 8, 9]

        with URLZSource(filename).open() as r_file:
            for i, row in enumerate(csv.DictReader(r_file, dialect='excel-tab'), start=1):
                try:
                    # Get lists of small molecule and antibody buckets
                    buckets = list(row[k] for k in
                                   ("Bucket_1", "Bucket_2", "Bucket_3", "Bucket_4", "Bucket_5", "Bucket_6", "Bucket_7",
                                    "Bucket_8"))
                    buckets_ab = list(row[k] for k in
                                      ("Bucket_1_ab", "Bucket_2_ab", "Bucket_3_ab", "Bucket_4_ab", "Bucket_5_ab",
                                       "Bucket_6_ab", "Bucket_7_ab", "Bucket_8_ab", "Bucket_9_ab"))
                    sm_buckets = list(compress(sm_bucket_list, [x == '1' for x in buckets]))
                    ab_buckets = list(compress(ab_bucket_list, [x == '1' for x in buckets_ab]))

                    # The struct is built inline; this is the most pythonic
                    # and most explicit approach.
                    line = {
                        'smallmolecule': {
                            'buckets': sm_buckets,  # list of buckets
                            'categories': {
                                'clinical_precedence': to_float(row["Clinical_Precedence"]),
                                'discovery_precedence': to_float(row["Discovery_Precedence"]),
                                'predicted_tractable': to_float(row["Predicted_Tractable"])
                            },
                            'top_category': row["Category"],
                        # TODO: druggability score is not used at the moment, but may be in the future
                            'ensemble': to_float(row["ensemble"]),
                            'high_quality_compounds':
                                to_int(row["High_Quality_ChEMBL_compounds"]),
                            'small_molecule_genome_member':
                                to_bool(row["Small_Molecule_Druggable_Genome_Member"])
                        },
                        'antibody': {
                            'buckets': ab_buckets,
                            'categories': {
                                'clinical_precedence':
                                    to_float(row["Clinical_Precedence_ab"]),
                                'predicted_tractable_high_confidence':
                                    to_float(row["Predicted_Tractable__High_confidence"]),
                                'predicted_tractable_med_low_confidence':
                                    to_float(row["Predicted_Tractable__Medium_to_low_confidence"])
                            },
                            'top_category': row["Category_ab"]
                        }
                    }

                    # Add data for current gene to self.tractability
                    self.tractability[row["ensembl_gene_id"]] = line

                except Exception as k_ex:
                    self._logger.exception("this line %d won't be inserted %s with ex: %s",
                                           i, str(row), str(k_ex))
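
itertools.compress is what selects the bucket numbers whose flag column equals '1'; a self-contained sketch of that selection (flag values are illustrative):

    from itertools import compress
    flags = ['1', '0', '0', '1', '0', '0', '0', '1']
    buckets = list(compress([1, 2, 3, 4, 5, 6, 7, 8], (f == '1' for f in flags)))
    assert buckets == [1, 4, 8]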