예제 #1
0
def test_parse_tsv_file(tmpdir):
    """Check header validation and row dispatching of `parsers.parse_tsv_file`."""
    rows = [
        ['gene', 'id', 'some column with spaces'],
        ['XYZ', '1', 'some description'],
        ['WQT', '2', 'some description'],
        ['BCZ', '3', 'some description'],
    ]
    tsv_file = tmpdir.join('some_tsv_file.tsv')
    tsv_file.write('\n'.join('\t'.join(row) for row in rows))
    tsv_path = str(tsv_file)

    # a header which differs from the first line of the file has to be rejected
    for bad_header in (['gene', 'id', 'some'], ['gene', 'id']):
        with pytest.raises(parsers.ParsingError):
            parsers.parse_tsv_file(
                tsv_path, lambda x: x, file_header=bad_header
            )

    # with the matching header the callback has to receive the data rows
    # one by one, in the same order as they appear in the file
    received = []

    def parse(data):
        received.append(data)
        assert len(received) == int(data[1])
        assert data[2] == 'some description'

    parsers.parse_tsv_file(
        tsv_path,
        parse,
        file_header=['gene', 'id', 'some column with spaces']
    )
예제 #2
0
    def load_sites(self, path='data/site_table.tsv'):
        """Load sites from a tab-separated file, one site per modification type.

        Every row is passed to `self.add_site` once for each comma-separated
        modification type found in its `type` column; handling of the kinases
        (reusing or creating them) is presumably done by `add_site` itself.

        Args:
            path: to tab-separated-values file with sites to load

        Returns:
            list of the sites which `self.add_site` reported as new
        """
        header = ['gene', 'position', 'residue', 'enzymes', 'pmid', 'type']

        sites = []

        def parser(line):

            refseq, position, residue, kinases_str, pmids, mod_types = line

            # The names have to be materialized: a lazy `filter` object would
            # be exhausted by the first `add_site` call, so every subsequent
            # modification type of the same row would get no kinases at all.
            site_kinase_names = [
                name for name in kinases_str.split(',') if name
            ]
            pmids = pmids.split(',')
            position = int(position)

            for mod_type in mod_types.split(','):
                site, is_new = self.add_site(
                    refseq, position, residue, mod_type,
                    pmids, site_kinase_names
                )

                if is_new:
                    sites.append(site)

        parse_tsv_file(path, parser, header)

        return sites
예제 #3
0
def pathways(path='data/hsapiens.pathways.NAME.gmt'):
    """Loads pathways from given '.gmt' file.

    New genes may be created and should automatically be added
    to the session with pathways as those have a relationship.

    Only gene sets with identifiers starting with 'GO' or 'REAC'
    are supported; any other identifier aborts the import.

    Args:
        path: to the GMT file (tab-separated, without a header)

    Returns:
        list of created pathways

    Raises:
        Exception: if a gene set identifier has an unknown prefix
    """
    # genes are looked up by lower-cased name (case-insensitive matching)
    known_genes = create_key_model_dict(Gene, 'name', lowercase=True)

    pathways = []
    new_genes = []

    def parser(data):
        """Parse a single line of GMT file with pathway description.

        Args:
            data: a list of subsequent columns from a single line of GMT file

                For example::

                    ['GO:0006281', 'DNA repair', 'BRCA1', 'BRCA2']

        """
        gene_set_name = data[0]
        # Entry description can be empty
        entry_description = data[1].strip()

        entry_gene_names = [name.strip() for name in data[2:]]

        pathway_genes = []

        for gene_name in entry_gene_names:
            name_lower = gene_name.lower()
            if name_lower in known_genes:
                gene = known_genes[name_lower]
            else:
                gene = Gene(name=gene_name)
                # remember the gene, so it is reused by following pathways
                known_genes[name_lower] = gene
                new_genes.append(gene)

            pathway_genes.append(gene)

        pathway = Pathway(description=entry_description, genes=pathway_genes)

        # the numeric part follows the 'GO:' or 'REAC:' prefix
        if gene_set_name.startswith('GO'):
            pathway.gene_ontology = int(gene_set_name[3:])
        elif gene_set_name.startswith('REAC'):
            pathway.reactome = int(gene_set_name[5:])
        else:
            raise Exception('Unknown gene set name: "%s"' % gene_set_name)

        # without this the function would always return an empty list
        pathways.append(pathway)

    parse_tsv_file(path, parser)

    print(len(new_genes), 'new genes created')

    return pathways
예제 #4
0
def active_driver_gene_lists(
    lists=(
        ListData(
            name='Cancer (TCGA PanCancerAtlas)',
            path='data/mc3.activedriver.2017-11-28.txt',
            mutations_source=MC3Mutation
        ),
        ListData(
            name='Clinical (ClinVar)',
            path='data/ActiveDriver1_result_pvalue_less_0.01_InheritedMutation-2017-02-16.txt',
            mutations_source=InheritedMutation
        )
    ),
    fdr_cutoff=0.01
):
    """Create gene lists from tab-separated result files with (gene, p, fdr) rows.

    Lists whose names are already present in the database are skipped.
    Rows with FDR equal to or higher than `fdr_cutoff` are not imported.

    Args:
        lists: descriptions of the lists to load; each needs to have
            `name`, `path` and `mutations_source` attributes
        fdr_cutoff: only rows with FDR strictly below this value are kept

    Returns:
        list of newly created gene lists
    """
    # a set gives constant-time membership checks
    current_gene_lists = {
        existing_list.name for existing_list in GeneList.query.all()
    }
    gene_lists = []

    header = ['gene', 'p', 'fdr']

    for list_data in lists:
        if list_data.name in current_gene_lists:
            print('Skipping gene list %s: already present in database' %
                  list_data.name)
            continue

        gene_list = GeneList(
            name=list_data.name,
            mutation_source_name=(list_data.mutations_source.name
                                  if list_data.mutations_source else None))

        # NOTE: the number of rejected rows is counted but not reported
        to_high_fdr_count = 0
        list_entries = []

        def parser(line):
            nonlocal to_high_fdr_count

            gene_name, p_value, fdr = line
            p_value = float(p_value)
            fdr = float(fdr)

            if fdr >= fdr_cutoff:
                to_high_fdr_count += 1
                return

            gene, _ = get_or_create(Gene, name=gene_name)

            list_entries.append(GeneListEntry(gene=gene, p=p_value, fdr=fdr))

        parse_tsv_file(list_data.path, parser, header)

        # assign the collection once, after all rows were processed, instead
        # of re-assigning the (growing) list after every accepted row
        gene_list.entries = list_entries

        gene_lists.append(gene_list)

    return gene_lists
예제 #5
0
def cancers(path='data/cancer_types.txt'):
    """Load cancer types from a tab-separated file with (code, name, color) rows.

    The code of each cancer is (re)assigned from the file, no matter
    whether the cancer was just created or had existed before.

    Args:
        path: to the file with cancer types

    Returns:
        list of the cancers which `get_or_create` reported as newly created
    """
    print('Loading cancer data:')

    created_cancers = []

    def parser(line):
        # the color column has to be present, but it is not stored
        code, name, _color = line

        cancer, is_new = get_or_create(Cancer, name=name)
        cancer.code = code

        if is_new:
            created_cancers.append(cancer)

    parse_tsv_file(path, parser)

    return created_cancers
예제 #6
0
def kinase_classification(path='data/regphos_kinome_scraped_clean.txt'):
    """Assign kinases to kinase groups, creating the missing kinases and groups.

    Kinases and groups are matched by lower-cased names.

    Args:
        path: to the tab-separated file with classification of kinases

    Returns:
        list of newly created kinase groups
    """
    known_kinases = create_key_model_dict(Kinase, 'name', True)
    known_groups = create_key_model_dict(KinaseGroup, 'name', True)

    new_groups = []

    print('Loading protein kinase groups:')

    header = [
        'No.', 'Kinase', 'Group', 'Family', 'Subfamily', 'Gene.Symbol',
        'gene.clean', 'Description', 'group.clean'
    ]

    def parser(line):

        # the subfamily is often absent; the 'Group' column is not used
        _group, family, subfamily = line[2:5]

        # names from the 'gene.clean' column are closer to the names
        # of kinases which are used in all the other data files
        kinase_name = line[6]

        # 'group.clean' only duplicates the information from family and
        # subfamily columns; the check is here to make any inconsistency
        # (e.g. after a change of the file) easy to spot for the maintainer
        expected_clean = family + '_' + subfamily if subfamily else family
        assert line[8] == expected_clean

        kinase_key = kinase_name.lower()
        family_key = family.lower()

        if kinase_key not in known_kinases:
            isoform = get_preferred_gene_isoform(kinase_name)
            known_kinases[kinase_key] = Kinase(
                name=kinase_name,
                protein=isoform
            )

        # what is called 'family' here is called 'group' in all other files
        if family_key not in known_groups:
            created_group = KinaseGroup(name=family)
            known_groups[family_key] = created_group
            new_groups.append(created_group)

        known_groups[family_key].kinases.append(known_kinases[kinase_key])

    parse_tsv_file(path, parser, header)

    return new_groups
예제 #7
0
def sites(path='data/site_table.tsv'):
    """Create a site for each row of given tab-separated file.

    Kinases and kinase groups of each site are resolved with
    `get_or_create_kinases`, which is given the dictionaries
    of kinases and groups known so far.

    Args:
        path: to tab-separated-values file with sites to load

    Returns:
        list of created sites
    """
    proteins = get_proteins()

    print('Loading protein sites:')

    header = ['gene', 'position', 'residue', 'enzymes', 'pmid', 'type']

    loaded_sites = []

    known_kinases = create_key_model_dict(Kinase, 'name')
    known_groups = create_key_model_dict(KinaseGroup, 'name')

    def parser(line):

        refseq, position, residue, kinases_str, pmid, mod_type = line

        # skip empty names (e.g. when there are no enzymes at all)
        kinase_names = filter(None, kinases_str.split(','))

        kinases, groups = get_or_create_kinases(
            kinase_names, known_kinases, known_groups
        )

        loaded_sites.append(
            Site(
                position=int(position),
                residue=residue,
                pmid=pmid,
                protein=proteins[refseq],
                kinases=list(kinases),
                kinase_groups=list(groups),
                type=mod_type
            )
        )

    parse_tsv_file(path, parser, header)

    return loaded_sites
예제 #8
0
    def parse(self, path):
        """Collect mutations with their minor allele frequencies (MAF).

        The frequencies are read from the fifth `;`-separated entry of
        the metadata column (index 20). Rows with overall frequency
        equal to zero are skipped.

        Args:
            path: to the file, which is opened with `gzip_open_text`

        Returns:
            list of (mutation_id, maf_ea, maf_aa, maf_all) tuples
        """
        esp_mutations = []
        duplicates = 0
        skipped = 0

        def esp_parser(line):
            nonlocal duplicates, skipped

            # a fixed position of MAF entry is not flexible, but it is quick
            maf_entry = line[20].split(';')[4]
            assert maf_entry.startswith('MAF=')

            frequencies = maf_entry[len('MAF='):].split(',')
            maf_ea, maf_aa, maf_all = map(float, frequencies)

            if maf_all == 0:
                skipped += 1
                return

            for mutation_id in self.preparse_mutations(line):

                record = (mutation_id, maf_ea, maf_aa, maf_all)

                if self.look_after_duplicates(
                    mutation_id, esp_mutations, record
                ):
                    duplicates += 1
                else:
                    self.protect_from_duplicates(mutation_id, esp_mutations)
                    esp_mutations.append(record)

        parse_tsv_file(
            path,
            esp_parser,
            self.header,
            file_opener=gzip_open_text
        )

        print('%s duplicates found' % duplicates)
        print('%s zero-frequency mutations skipped' % skipped)

        return esp_mutations
예제 #9
0
def kinase_mappings(path='data/curated_kinase_IDs.txt'):
    """Create kinases from `kinase_name gene_name` mappings.

    For each kinase the isoform returned by `get_preferred_gene_isoform`
    for given gene is used; rows without such an isoform are reported
    and skipped.

    If a kinase is already known, its protein is replaced with the isoform
    from the mapping (a message is printed if a different protein was
    associated before).

    Args:
        path: to the tab-separated file with two columns

    Returns:
        list of newly created kinases
    """
    known_kinases = create_key_model_dict(Kinase, 'name')

    new_kinases = []

    def parser(line):
        kinase_name, gene_name = line
        protein = get_preferred_gene_isoform(gene_name)

        if not protein:
            print(
                'No isoform for %s kinase mapped to %s gene!'
                % (kinase_name, gene_name)
            )
            return

        if kinase_name not in known_kinases:
            new_kinases.append(Kinase(name=kinase_name, protein=protein))
            return

        kinase = known_kinases[kinase_name]
        previous = kinase.protein

        if previous and previous != protein:
            print(
                'Overriding kinase-protein association for '
                '%s kinase. Old isoform: %s; new isoform: %s.'
                % (kinase_name, previous.refseq, protein.refseq)
            )

        kinase.protein = protein

    parse_tsv_file(path, parser)

    return new_kinases
예제 #10
0
def drugbank(path='data/drugbank/drugbank.tsv'):
    """Load drugs targeting the genes which are present in the database.

    Rows referring to genes which cannot be found by name are ignored.
    'NA' is treated as a missing value for both drug groups and drug type.

    Args:
        path: to the tab-separated file with drug-gene pairs

    Returns:
        set of newly created drugs
    """
    drugs = set()

    # the names are kept compatible with those used by DrugBank,
    # which may be helpful if we ever need to query it
    drug_type_map = {
        'BiotechDrug': 'biotech',
        'SmallMoleculeDrug': 'small molecule'
    }

    def parser(data):
        drug_id, gene_name, drug_name, drug_groups, drug_type_name = data
        target_gene = Gene.query.filter_by(name=gene_name).first()

        if not target_gene:
            return

        drug, is_new = get_or_create(Drug, name=drug_name)
        if is_new:
            drugs.add(drug)

        drug.target_genes.append(target_gene)
        drug.drug_bank_id = drug_id

        for group_name in drug_groups.split(';'):
            if group_name == 'NA':
                continue
            group, _ = get_or_create(DrugGroup, name=group_name)
            drug.groups.add(group)

        if drug_type_name != 'NA':
            drug_type, _ = get_or_create(
                DrugType,
                name=drug_type_map[drug_type_name]
            )
            drug.type = drug_type

    # TODO: the header has type and group swapped
    header = ['DRUG_id', 'GENE_symbol', 'DRUG_name', 'DRUG_type', 'DRUG_group']

    parse_tsv_file(path, parser, header)

    return drugs
예제 #11
0
    def parse(self, path):
        """Collect MIMP predictions for mutations of known proteins.

        A row is skipped when:
            - its protein (column 0) is not in `self.proteins`,
            - the reference residue of the mutation (column 1) does not
              match the protein sequence (the row is then recorded in
              `self.broken_seq`),
            - the site position (column 2) does not match exactly
              one site of the protein (a warning is issued).

        Args:
            path: to the tab-separated file with header `self.header`

        Returns:
            list of tuples: (mutation_id, int of column 3, 1 for 'gain'
            or 0 for 'loss' from column 13, column 9, column 10,
            float of column 12, site_id)

        Raises:
            AssertionError: if column 13 is neither 'gain' nor 'loss'
        """
        mimps = []

        def parser(line):
            refseq = line[0]
            mut = line[1]
            psite_pos = line[2]

            try:
                protein = self.proteins[refseq]
            except KeyError:
                return

            ref, pos, alt = decode_raw_mutation(mut)

            # An explicit comparison is used (rather than `assert` caught
            # with `except AssertionError`), because assertions are removed
            # when Python runs with -O and the check would silently vanish.
            try:
                reference_matches = (ref == protein.sequence[pos - 1])
            except IndexError:
                reference_matches = False

            if not reference_matches:
                self.broken_seq[refseq].append((protein.id, alt))
                return

            effect = line[13]

            # raised explicitly, so the validation works with -O too;
            # the exception type is the same as the one of `assert`
            if effect not in ('gain', 'loss'):
                raise AssertionError(
                    'Unexpected MIMP effect: %r' % (effect,)
                )

            # MIMP mutations are always hardcoded PTM mutations
            mutation_id = self.get_or_make_mutation(pos, protein.id, alt, True)

            psite_pos = int(psite_pos)

            affected_sites = [
                site
                for site in protein.sites
                if site.position == psite_pos
            ]

            if len(affected_sites) != 1:
                warning = UserWarning(
                    'Skipping %s: %s%s%s (for site at position %s): ' % (
                       refseq, ref, pos, alt, psite_pos
                    ) +
                    'MIMP site does not match to the database - ' +
                    (
                        'too many (%s) sites found.' % len(affected_sites)
                        if affected_sites else
                        'given site not found.'
                    )
                )
                print(warning)
                warn(warning)
                return

            site_id = affected_sites[0].id

            mimps.append(
                (
                    mutation_id,
                    int(line[3]),
                    1 if effect == 'gain' else 0,
                    line[9],
                    line[10],
                    float(line[12]),
                    site_id
                )
            )

        parse_tsv_file(path, parser, self.header)

        return mimps
예제 #12
0
def proteins_and_genes(path='data/protein_data.tsv'):
    """Create proteins and genes based on data in a given file.

    Genes are matched by lower-cased name: a new gene is created if there
    is no match, otherwise the attributes of the existing gene which differ
    from those in the file are replaced (and a message is printed).

    Proteins which already exist (as returned by `get_proteins`) are
    skipped, as well as repeated rows with the same refseq id.

    Args:
        path: to the tab-separated file with transcripts data

    Returns:
        view of created (new) proteins
    """
    # TODO where does the tsv file come from?
    print('Creating proteins and genes:')

    genes = create_key_model_dict(Gene, 'name', lowercase=True)
    known_proteins = get_proteins()

    proteins = {}

    header = [
        'bin', 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart',
        'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds', 'score', 'name2',
        'cdsStartStat', 'cdsEndStat', 'exonFrames'
    ]

    # pairs of: (name of column in the file, name of protein's attribute)
    coordinates_to_save = [('txStart', 'tx_start'), ('txEnd', 'tx_end'),
                           ('cdsStart', 'cds_start'), ('cdsEnd', 'cds_end')]

    columns = tuple(
        header.index(column_name) for column_name, _ in coordinates_to_save
    )
    coordinates_names = [attribute for _, attribute in coordinates_to_save]

    allowed_strands = ['+', '-']

    # refseq ids which were seen again after the protein had been created
    with_duplicates = []
    potentially_empty_genes = set()

    def parser(line):

        # name2 is the fourth column counting from the end
        name = line[-4]

        strand = line[3]
        assert strand in allowed_strands

        gene_data = {
            'name': name,
            'chrom': line[2][3:],  # remove chr prefix
            'strand': strand == '+'
        }

        gene_key = name.lower()

        if gene_key in genes:
            gene = genes[gene_key]
            for key, value in gene_data.items():
                previous = getattr(gene, key)
                if previous != value:
                    print(
                        'Replacing %s %s with %s (previously: %s)'
                        % (gene, key, value, previous)
                    )
                    setattr(gene, key, value)
        else:
            gene = Gene(**gene_data)
            genes[gene_key] = gene

        refseq = line[1]

        # nothing to do if the protein is already in the database
        if refseq in known_proteins:
            return

        # duplicated rows are only registered, never processed again
        if refseq in proteins:
            with_duplicates.append(refseq)
            potentially_empty_genes.add(gene)
            return

        # values are taken in the order of columns in the file, which
        # is the same as the order of entries in coordinates_to_save
        coordinate_values = [
            int(value)
            for index, value in enumerate(line)
            if index in columns
        ]

        protein_data = {'refseq': refseq, 'gene': gene}
        protein_data.update(zip(coordinates_names, coordinate_values))

        proteins[refseq] = Protein(**protein_data)

    parse_tsv_file(path, parser, header)

    cnt = sum(
        1 for gene in potentially_empty_genes if len(gene.isoforms) == 1
    )
    print('Duplicated that are only isoforms for gene:', cnt)
    print('Duplicated rows detected:', len(with_duplicates))
    return proteins.values()
예제 #13
0
def domains(path='data/biomart_protein_domains_20072016.txt'):
    """Load occurrences of InterPro domains in known proteins.

    Rows are skipped if the protein (matched by refseq id) is unknown,
    if there is no domain data in the row, or if the chromosome from
    the file differs from the chromosome of the protein's gene.

    An occurrence which overlaps sufficiently with an already existing
    occurrence of the same InterPro domain in the same protein is merged
    into the existing one instead of being added as a new domain.

    Args:
        path: to the tab-separated file exported from BioMart

    Returns:
        list of newly created domains
    """
    proteins = get_proteins()

    print('Loading domains:')

    interpro_domains = create_key_model_dict(InterproDomain, 'accession')
    new_domains = []

    skipped = 0
    wrong_length = 0
    not_matching_chrom = []

    header = [
        'Ensembl Gene ID', 'Ensembl Transcript ID', 'Ensembl Protein ID',
        'Chromosome Name', 'Gene Start (bp)', 'Gene End (bp)',
        'RefSeq mRNA [e.g. NM_001195597]', 'Interpro ID',
        'Interpro Short Description', 'Interpro Description', 'Interpro end',
        'Interpro start'
    ]

    def parser(line):

        nonlocal skipped, wrong_length

        try:
            protein = proteins[line[6]]  # by refseq
        except KeyError:
            skipped += 1
            return

        # rows ending on refseq column have no data about domains
        if len(line) == 7:
            return

        # an unexpected number of columns is reported, but not fatal
        try:
            assert len(line) == 12
        except AssertionError:
            print(line, len(line))

        # note: the column with the end precedes the column with the start
        start, end = int(line[11]), int(line[10])
        assert start < end

        accession = line[7]

        # according to:
        # http://www.ncbi.nlm.nih.gov/pmc/articles/PMC29841/#__sec2title
        assert accession.startswith('IPR')

        # TODO: the assertion fails for some domains: what to do?
        # assert end <= protein.length
        if end > protein.length:
            wrong_length += 1

        if line[3] != protein.gene.chrom:
            skipped += 1
            not_matching_chrom.append(line)
            return

        if accession not in interpro_domains:
            interpro_domains[accession] = InterproDomain(
                accession=line[7],  # Interpro Accession
                short_description=line[8],  # Interpro Short Description
                description=line[9],  # Interpro Description
            )

        interpro = interpro_domains[accession]

        # similar occurrences are those of the same interpro domain, with
        # common part covering over 75% of the shorter of two occurrences
        similar_domains = []
        for existing in protein.domains:
            if existing.interpro == interpro:
                common_part = min(existing.end, end) - max(existing.start, start)
                shorter_length = min(len(existing), end - start)
                if common_part / shorter_length > 0.75:
                    similar_domains.append(existing)

        if not similar_domains:
            new_domains.append(
                Domain(
                    interpro=interpro,
                    protein=protein,
                    start=start,
                    end=end
                )
            )
            return

        # more than one similar occurrence is unexpected (reported only)
        try:
            assert len(similar_domains) == 1
        except AssertionError:
            print(similar_domains)

        # extend the first similar occurrence to span both regions
        domain = similar_domains[0]
        domain.start = min(domain.start, start)
        domain.end = max(domain.end, end)

    parse_tsv_file(path, parser, header)

    print('Domains loaded,', skipped, 'proteins skipped.',
          'Domains exceeding proteins length:', wrong_length,
          'Domains skipped due to not matching chromosomes:',
          len(not_matching_chrom))
    return new_domains
예제 #14
0
def external_references(path='data/HUMAN_9606_idmapping.dat.gz',
                        refseq_lrg='data/LRG_RefSeqGene',
                        refseq_link='data/refseq_link.tsv.gz'):
    """Import identifiers linking proteins and genes to external databases.

    The import is performed in four consecutive steps:
        1. `refseq_lrg` is parsed with `add_ncbi_mappings`
           (RefSeq peptide/gene ids and Entrez gene ids),
        2. `refseq_link` is iterated to complement the data
           (protein full names, Entrez ids, RefSeq peptide ids),
        3. `path` is parsed with `add_uniprot_accession`
           (UniProt entries are tied to protein references),
        4. `path` is parsed again, with `add_references_by_uniprot`
           (reviewed status and Ensembl peptides); this step relies on
           the `references` mapping which is filled in the step 3.

    Only proteins with refseq ids starting with 'NM', which can be
    found in the database, are taken into account.

    Args:
        path: to gzipped, tab-separated file with rows of
            (uniprot accession, reference type, value)
        refseq_lrg: to tab-separated file with ten columns
            (see the header given to `parse_tsv_file` below)
        refseq_link: to gzipped, tab-separated file with eight columns
            (see the `header` below)

    Returns:
        flat list of protein references collected in the step 3;
        a reference is included once per each row which matched it
    """
    from models import Protein
    from models import ProteinReferences
    from models import EnsemblPeptide
    from sqlalchemy.orm.exc import NoResultFound

    # uniprot accession (without isoform suffix) -> list of references
    references = defaultdict(list)

    def add_uniprot_accession(data):
        """Tie an UniProt entry to the protein from a 'RefSeq_NT' row."""

        # full uniprot includes isoform (if relevant)
        full_uniprot, ref_type, value = data

        if ref_type == 'RefSeq_NT':
            # get protein
            # (the part after the dot, presumably a version, is discarded)
            refseq_nm = value.split('.')[0]

            if not refseq_nm or not refseq_nm.startswith(
                    'NM') or not full_uniprot:
                return

            try:
                protein = Protein.query.filter_by(refseq=refseq_nm).one()
            except NoResultFound:
                return

            try:
                uniprot, isoform = full_uniprot.split('-')
                isoform = int(isoform)
            except ValueError:
                # only one isoform ?
                # print('No isoform specified for', full_uniprot, refseq_nm)
                # (ValueError is raised if there is no dash to split on,
                # so the unpacking fails, or if the suffix is not an integer)
                uniprot = full_uniprot
                isoform = 1

            reference, new = get_or_create(ProteinReferences, protein=protein)
            uniprot_entry, _ = get_or_create(UniprotEntry,
                                             accession=uniprot,
                                             isoform=isoform)
            reference.uniprot_entries.append(uniprot_entry)
            references[uniprot].append(reference)

            if new:
                db.session.add(reference)

    # reference type in the file -> name of EnsemblPeptide's attribute
    ensembl_references_to_collect = {'Ensembl_PRO': 'peptide_id'}

    def add_references_by_uniprot(data):
        """Add data from 'UniProtKB-ID' and Ensembl rows to known references.

        Rows with accessions which were not collected
        in the `references` mapping are ignored.
        """

        full_uniprot, ref_type, value = data

        if '-' in full_uniprot:
            uniprot, isoform = full_uniprot.split('-')
            uniprot_tied_references = references.get(uniprot, None)
            if not uniprot_tied_references:
                return

            relevant_references = []
            # select relevant references:
            # those having an entry for the isoform specified in the row
            for reference in uniprot_tied_references:
                if any(entry.isoform == int(isoform)
                       for entry in reference.uniprot_entries):
                    relevant_references.append(reference)

        else:
            uniprot_tied_references = references.get(full_uniprot, None)
            if not uniprot_tied_references:
                return
            relevant_references = uniprot_tied_references

        if ref_type == 'UniProtKB-ID':
            # http://www.uniprot.org/help/entry_name
            # "Each >reviewed< entry is assigned a unique entry name upon integration into UniProtKB/Swiss-Prot"
            # Entry names comes in format: X_Y;
            # - for Swiss-Prot entry X is a mnemonic protein identification code (at most 5 characters)
            # - for TrEMBL entry X is the same as accession code (6 to 10 characters)
            x, y = value.split('_')

            # a short X part indicates a Swiss-Prot (reviewed) entry
            if len(x) <= 5:
                for reference in relevant_references:
                    # NOTE(review): the entry names are expected only for
                    # accessions without isoform suffix - confirm
                    assert '-' not in full_uniprot
                    entry = UniprotEntry.query.filter_by(
                        accession=full_uniprot, reference=reference).one()
                    entry.reviewed = True

            return

        if ref_type in ensembl_references_to_collect:

            attr = ensembl_references_to_collect[ref_type]

            for relevant_reference in relevant_references:
                attrs = {'reference': relevant_reference, attr: value}

                peptide, new = get_or_create(EnsemblPeptide, **attrs)

                if new:
                    db.session.add(peptide)

    def add_ncbi_mappings(data):
        """Save RefSeq peptide/gene ids and Entrez id from a single row."""
        # 9606    3329    HSPD1   NG_008915.1     NM_199440.1     NP_955472.1     reference standard
        taxonomy, entrez_id, gene_name, refseq_gene, lrg, refseq_nucleotide, t, refseq_peptide, p, category = data

        refseq_nm = refseq_nucleotide.split('.')[0]

        if not refseq_nm or not refseq_nm.startswith('NM'):
            return

        try:
            protein = Protein.query.filter_by(refseq=refseq_nm).one()
        except NoResultFound:
            return

        reference, new = get_or_create(ProteinReferences, protein=protein)

        if new:
            db.session.add(reference)

        reference.refseq_np = refseq_peptide.split('.')[0]
        reference.refseq_ng = refseq_gene.split('.')[0]
        gene = protein.gene

        # a mismatch of names is only reported
        if gene.name != gene_name:
            print('Gene name mismatch for RefSeq mappings: %s vs %s' %
                  (gene.name, gene_name))

        entrez_id = int(entrez_id)

        # an existing, different Entrez id is overwritten
        # only if the name of the gene matches the name from the file
        if gene.entrez_id:
            if gene.entrez_id != entrez_id:
                print('Entrez ID mismatch for isoforms of %s gene: %s, %s' %
                      (gene.name, gene.entrez_id, entrez_id))
                if gene.name == gene_name:
                    print(
                        'Overwriting %s entrez id with %s for %s gene, because record with %s has matching gene name'
                        % (gene.entrez_id, entrez_id, gene.name, entrez_id))
                    gene.entrez_id = entrez_id
        else:
            gene.entrez_id = entrez_id

    parse_tsv_file(refseq_lrg,
                   add_ncbi_mappings,
                   file_header=[
                       '#tax_id', 'GeneID', 'Symbol', 'RSG', 'LRG', 'RNA', 't',
                       'Protein', 'p', 'Category'
                   ])

    # add mappings retrieved from UCSC tables for completeness
    header = [
        '#name', 'product', 'mrnaAcc', 'protAcc', 'geneName', 'prodName',
        'locusLinkId', 'omimId'
    ]
    for line in iterate_tsv_gz_file(refseq_link, header):
        gene_name, protein_full_name, refseq_nm, refseq_peptide, _, _, entrez_id, omim_id = line

        if not refseq_nm or not refseq_nm.startswith('NM'):
            continue

        try:
            protein = Protein.query.filter_by(refseq=refseq_nm).one()
        except NoResultFound:
            continue

        gene = protein.gene

        if gene.name != gene_name:
            print('Gene name mismatch for RefSeq mappings: %s vs %s' %
                  (gene.name, gene_name))

        entrez_id = int(entrez_id)

        if protein_full_name:
            if protein.full_name:
                if protein.full_name != protein_full_name:
                    print(
                        'Protein full name mismatch: %s vs %s for %s' %
                        (protein.full_name, protein_full_name, protein.refseq))
                # NOTE(review): this skips also the updates of Entrez id and
                # of RefSeq peptide below, for every protein which already
                # has a full name - confirm that this is intended
                continue
            protein.full_name = protein_full_name

        # the same policy of overwriting as in `add_ncbi_mappings`
        if gene.entrez_id:
            if gene.entrez_id != entrez_id:
                print('Entrez ID mismatch for isoforms of %s gene: %s, %s' %
                      (gene.name, gene.entrez_id, entrez_id))
                if gene.name == gene_name:
                    print(
                        'Overwriting %s entrez id with %s for %s gene, because record with %s has matching gene name'
                        % (gene.entrez_id, entrez_id, gene.name, entrez_id))
                    gene.entrez_id = entrez_id
        else:
            gene.entrez_id = entrez_id

        if refseq_peptide:
            reference, new = get_or_create(ProteinReferences, protein=protein)

            if new:
                db.session.add(reference)

            # a mismatch is reported; the value from UCSC is used anyway
            if reference.refseq_np and reference.refseq_np != refseq_peptide:
                print(
                    'Refseq peptide mismatch between LRG and UCSC retrieved data: %s vs %s for %s'
                    % (reference.refseq_np, refseq_peptide, protein.refseq))

            reference.refseq_np = refseq_peptide

    # the first pass over the file fills the `references` mapping,
    # which is required by the callback used in the second pass
    parse_tsv_file(path,
                   add_uniprot_accession,
                   file_opener=gzip.open,
                   mode='rt')
    parse_tsv_file(path,
                   add_references_by_uniprot,
                   file_opener=gzip.open,
                   mode='rt')

    return [
        reference for reference_group in references.values()
        for reference in reference_group
    ]
예제 #15
0
    def parse(self, path):
        """Collect ClinVar mutations together with their disease annotations.

        Rows without at least one specified disease name are skipped.
        Identifiers of diseases reported as newly created by `get_or_create`
        are assigned consecutively, starting after the highest id returned
        by `get_highest_id(Disease)`.

        Args:
            path: to the file, which is opened with `gzip_open_text`

        Returns:
            tuple of:
                - list of mutations, each being a list of: mutation_id,
                  list of first values of metadata and the three
                  following values of metadata,
                - list of (position of mutation in the list above counted
                  from one, significance, disease id, review status)
                  tuples, one per each specified disease of a mutation,
                - names of newly created diseases
        """
        clinvar_mutations = []
        clinvar_data = []
        duplicates = 0
        # name of a newly created disease -> identifier assigned to it
        new_diseases = OrderedDict()

        # keys of metadata to be extracted; presumably the values of
        # created dict follow this order (relied on below) - TODO confirm
        clinvar_keys = (
            'RS',
            'MUT',
            'VLD',
            'PMC',
            'CLNSIG',
            'CLNDBN',
            'CLNREVSTAT',
        )

        highest_disease_id = get_highest_id(Disease)

        def clinvar_parser(line):
            nonlocal highest_disease_id, duplicates

            metadata = line[20].split(';')

            clinvar_entry = make_metadata_ordered_dict(clinvar_keys, metadata)

            # sub-entries are separated with either '|' or ',';
            # an empty entry results in None instead of a list
            names, statuses, significances = (
                (entry.replace('|', ',').split(',') if entry else None)
                for entry in
                (
                    clinvar_entry[key]
                    for key in ('CLNDBN', 'CLNREVSTAT', 'CLNSIG')
                )
            )

            # those lengths should always be equal if they exist
            sub_entries_cnt = max(
                [
                    len(x)
                    for x in (names, statuses, significances)
                    if x
                ] or [0]
            )

            at_least_one_significant_sub_entry = False

            for i in range(sub_entries_cnt):

                try:
                    if names:
                        if names[i] not in ('not_specified', 'not provided'):
                            names[i] = self._beautify_disease_name(names[i])
                            at_least_one_significant_sub_entry = True
                    if statuses and statuses[i] == 'no_criteria':
                        statuses[i] = None
                except IndexError:
                    # names or statuses are shorter than the longest list;
                    # NOTE(review): length of significances is not verified
                    # here, while it is indexed further below - confirm
                    print('Malformed row (wrong count of subentries) on %s-th entry:' % i)
                    print(line)
                    return False

            values = list(clinvar_entry.values())

            # following 2 lines are result of issue #47 - we don't import those
            # clinvar mutations that do not have any diseases specified:
            if not at_least_one_significant_sub_entry:
                return

            for mutation_id in self.preparse_mutations(line):

                # take care of duplicates
                duplicated = self.look_after_duplicates(mutation_id, clinvar_mutations, values[:4])
                if duplicated:
                    duplicates += 1
                    continue

                # take care of nearly-duplicates
                # (the same mutation, but with different details)
                same_mutation_pointers = self.mutations_details_pointers_grouped_by_unique_mutations[mutation_id]
                assert len(same_mutation_pointers) <= 1
                if same_mutation_pointers:
                    # pointer is an index of the mutation in clinvar_mutations
                    pointer = same_mutation_pointers[0]
                    old = self.data_as_dict(clinvar_mutations[pointer])
                    new = self.data_as_dict(values, mutation_id=mutation_id)

                    # second element of a stored mutation is a list of
                    # first metadata values (see the append below)
                    if old['db_snp_ids'] != [new['db_snp_ids']]:
                        clinvar_mutations[pointer][1].append(new['db_snp_ids'])

                    # if either of the dbSNP entries is validated, the mutation is validated
                    # (the same with presence in PubMed)
                    for key in ['is_validated', 'is_in_pubmed_central']:
                        if old[key] != new[key] and new[key]:
                            index = self.insert_keys.index(key)
                            clinvar_mutations[pointer][index] = True

                    print(
                        'Merged details referring to the same mutation (%s): %s into %s'
                        %
                        (mutation_id, values, clinvar_mutations[pointer])
                    )
                    continue

                self.protect_from_duplicates(mutation_id, clinvar_mutations)

                # Python 3.5 makes it easy: **values (but is not available)
                clinvar_mutations.append(
                    [
                        mutation_id,
                        [values[0]],
                        values[1],
                        values[2],
                        values[3],
                    ]
                )

                for i in range(sub_entries_cnt):
                    name = names[i]

                    # we don't want _uninteresting_ data
                    if name in ('not_specified', 'not provided'):
                        continue

                    if name in new_diseases:
                        disease_id = new_diseases[name]
                    else:
                        disease, created = get_or_create(Disease, name=name)
                        if created:
                            # the identifier is assigned manually, presumably
                            # as the disease is not in the database yet
                            highest_disease_id += 1
                            new_diseases[name] = highest_disease_id
                            disease_id = highest_disease_id
                        else:
                            disease_id = disease.id

                    clinvar_data.append(
                        (
                            # position (counted from one) of the mutation
                            # which was just appended to clinvar_mutations
                            len(clinvar_mutations),
                            int(significances[i]) if significances is not None else None,
                            disease_id,
                            statuses[i] if statuses else None,
                        )
                    )

        parse_tsv_file(
            path,
            clinvar_parser,
            self.header,
            file_opener=gzip_open_text
        )

        print('%s duplicates found' % duplicates)

        return clinvar_mutations, clinvar_data, new_diseases.keys()