예제 #1
0
def test_make_snv_key():
    """Keys must be identical when only the case of ref/alt differs."""
    # each entry pairs a (chrom, pos, ref, alt) tuple with a twin that
    # differs only in the letter case of the nucleotides
    equivalent_pairs = (
        (('1', '211', 'A', 'C'), ('1', '211', 'A', 'c')),
        (('X', '2012', 'T', 'G'), ('X', '2012', 't', 'g')),
    )
    for pair in equivalent_pairs:
        first_key, second_key = (
            genomic_mappings.make_snv_key(*attributes)
            for attributes in pair
        )
        assert first_key == second_key
예제 #2
0
def test_mappings():
    """Simple inclusion test for genome -> proteome mutation mappings.

    Knowing the data, we demand the items from the right side (of test data)
    to be in the results of queries specified on the left side.

    Raises:
        Exception: when none of the entries stored under a genomic key
            equals the expected protein mutation. The arguments are the
            list of all entries that were inspected (empty when the key
            had no entries at all) and the expected tuple.
    """

    test_data = (
        # (chrom, dna_pos, dna_ref, dna_alt), (name, pos, ref, alt)
        (('17', '7572934', 'G', 'A'), ('TP53', 353, 'S', 'L')),
        (('17', '19282215', 't', 'a'), ('MAPK7', 1, 'M', 'K')),
        (('21', '40547520', 'g', 'a'), ('PSMG1', 283, 'T', 'I')),
        (('9', '125616157', 't', 'a'), ('RC3H2', 1064, 'Y', 'F')),
        (('11', '120198175', 'g', 'a'), ('TMEM136', 31, 'V', 'M')),
        (('10', '81838457', 't', 'a'), ('TMEM254', 1, 'M', 'K')),
        (('13', '111267940', 't', 'a'), ('CARKD', 1, 'M', 'K')),
        (('6', '30539266', 't', 'a'), ('ABCF1', 1, 'M', 'K')),
        (('6', '36765430', 'g', 'a'), ('CPNE5', 140, 'L', 'F')),
        (('12', '123464753', 't', 'a'), ('ARL6IP4', 1, 'M', 'K')),
    )

    for genomic_data, protein_data in test_data:

        snv = make_snv_key(*genomic_data)

        items = [decode_csv(item) for item in bdb[snv]]

        # Collect every candidate inspected, so that a failure reports all
        # of them and an empty result cannot leave the reported name unbound
        # (or holding a value from the previous genomic key).
        retrieved = []

        for item in items:
            retrieved_data = (Protein.query.get(item['protein_id']).gene.name,
                              item['pos'], item['ref'], item['alt'])
            retrieved.append(retrieved_data)
            if retrieved_data == protein_data:
                break
        else:
            # no break: nothing stored under this key matched
            raise Exception(retrieved, protein_data)
    def test_genome_proteome_mappings(self):
        """Import the generated mappings file and inspect the outcome.

        Checks that the imported key is present only for chromosome 17 and
        that exactly one refseq is reported with a broken sequence.
        """
        mappings_filename, gene, proteins = create_test_data()

        # the importer takes the directory and the file pattern separately
        mappings_dir = path.dirname(mappings_filename)
        mappings_name = path.basename(mappings_filename)
        broken_sequences = import_genome_proteome_mappings(
            proteins, mappings_dir, mappings_name
        )

        # in some cases it is needed to reload bdb after import
        bdb.reload()

        position = 19282216
        assert not bdb[make_snv_key('1', position, 'G', 'A')]
        assert bdb[make_snv_key('17', position, 'G', 'A')]

        broken_refseqs = set(broken_sequences.keys())
        assert broken_refseqs == {'NM_002749'}

        expected_report = [('NM_002749', 'L', 'A', '5', 'Q')]
        assert expected_report in list(broken_sequences.values())
예제 #4
0
def import_genome_proteome_mappings(
        proteins,
        mappings_dir='data/200616/all_variants/playground',
        mappings_file_pattern='annot_*.txt.gz',
        bdb_dir=''):
    """Rebuild the genome -> proteome mappings database from annotation files.

    Each line read is expected to hold five tab-separated fields:
    ``chrom, pos, ref, alt, prot``. ``prot`` is a comma-separated list of
    ``name:refseq:exon:cdna_mut:prot_mut`` entries. Lines and entries that
    cannot be split or decoded are printed and skipped.

    The module-level `bdb` is reset, closed and re-opened before the import,
    and one encoded item is added to it for every accepted entry.

    Args:
        proteins: mapping of refseq identifier -> protein object. Entries
            whose refseq is not in the mapping are skipped silently.
        mappings_dir: directory handed to `read_from_gz_files`.
        mappings_file_pattern: file name pattern handed to
            `read_from_gz_files`.
        bdb_dir: when non-empty, the database is opened in this directory
            (keeping the configured file name) instead of at the path from
            the `BDB_DNA_TO_PROTEIN_PATH` config entry.

    Returns:
        A `defaultdict(list)` mapping refseq -> list of the truthy values
        returned by `is_sequence_broken` for entries of that refseq.

    Raises:
        AssertionError: when a line or entry violates the expected format
            (missing `chr`/`NM_`/`exon`/`c`/`p` prefix, unknown chromosome,
            or amino acid position inconsistent with the cDNA position).
    """
    print('Importing mappings:')

    chromosomes = get_human_chromosomes()
    broken_seq = defaultdict(list)

    bdb.reset()
    bdb.close()

    path = current_app.config['BDB_DNA_TO_PROTEIN_PATH']
    if bdb_dir:
        path = bdb_dir + '/' + basename(path)

    bdb.open(path, cache_size=20480 * 8 * 8 * 8 * 8)

    for line in read_from_gz_files(mappings_dir, mappings_file_pattern):
        try:
            chrom, pos, ref, alt, prot = line.rstrip().split('\t')
        except ValueError as e:
            print(e, line)
            continue

        assert chrom.startswith('chr'), line
        chrom = chrom[3:]

        assert chrom in chromosomes, line
        ref = ref.rstrip()

        # new Coding Sequence Variants to be added to those already
        # mapped from given `snv` (Single Nucleotide Variation)

        for dest in filter(bool, prot.split(',')):
            try:
                name, refseq, exon, cdna_mut, prot_mut = dest.split(':')
            except ValueError as e:
                print(e, line)
                continue
            assert refseq.startswith('NM_'), line
            # name and refseq are redundant with respect one to another

            assert exon.startswith('exon'), line
            exon = exon[4:]

            assert cdna_mut.startswith('c'), line
            try:
                cdna_ref, cdna_pos, cdna_alt = decode_mutation(cdna_mut)
            except ValueError as e:
                print(e, line)
                continue

            try:
                strand = determine_strand(ref, cdna_ref, alt, cdna_alt)
            except DataInconsistencyError as e:
                print(e, line)
                continue

            assert prot_mut.startswith('p'), line
            # we can check here if a given reference nuc is consistent
            # with the reference amino acid. For example cytosine in
            # reference implies that there shouldn't be a methionine,
            # glutamic acid, lysine nor arginine. The same applies to
            # alternative nuc/aa and their combinations (having
            # references (nuc, aa): (G, K) and alt nuc C defines that
            # the alt aa has to be Asparagine (N) - no other is valid).
            # Note: it could be used to compress the data in memory too
            try:
                # same policy as for the cDNA mutation above: report an
                # undecodable entry and move on, do not abort the import
                aa_ref, aa_pos, aa_alt = decode_mutation(prot_mut)
            except ValueError as e:
                print(e, line)
                continue

            try:
                # try to get it from cache (`proteins` dictionary)
                protein = proteins[refseq]
            except KeyError:
                continue

            # three cDNA positions per amino acid, both counted from 1
            assert aa_pos == (int(cdna_pos) - 1) // 3 + 1, line

            broken_sequence_tuple = is_sequence_broken(protein, aa_pos, aa_ref,
                                                       aa_alt)

            if broken_sequence_tuple:
                broken_seq[refseq].append(broken_sequence_tuple)
                continue

            is_ptm_related = protein.has_sites_in_range(aa_pos - 7, aa_pos + 7)

            # NOTE(review): the key is built from the cDNA alleles, not from
            # the genomic `ref`/`alt` parsed from the line - confirm this is
            # intended for transcripts where the two differ.
            snv = make_snv_key(chrom, pos, cdna_ref, cdna_alt)

            # add new item, emulating set update
            item = encode_csv(strand, aa_ref, aa_alt, cdna_pos, exon,
                              protein.id, is_ptm_related)

            bdb.add(snv, item)

    return broken_seq