示例#1
0
def get_nr_sequences(fasta_file, genomes_list):
    """Write a non-redundant FASTA file plus a locus/checksum/genome table.

    Each record of ``fasta_file`` is hashed with CRC64. The first record
    seen for a given checksum is kept and renamed to that checksum. A later
    record with the same checksum is dropped if its sequence equals one
    already kept; otherwise (a genuine hash collision) it is kept and
    renamed ``<checksum>-<n>``, where ``n`` is the number of distinct
    sequences now stored under that checksum.

    Side effects: creates/overwrites ``nr.faa`` and ``nr_mapping.tab`` in
    the current working directory.

    Args:
        fasta_file: FASTA input holding all sequences to deduplicate.
        genomes_list: iterable of per-genome FASTA paths; the genome label
            is the file's basename up to the first ``.``.

    Raises:
        KeyError: if a record id of ``fasta_file`` does not match any
            sequence name read from ``genomes_list``.
    """
    locus2genome = {}
    for fasta in genomes_list:
        genome = os.path.basename(fasta).split('.')[0]
        for seq in SeqIO.parse(fasta, "fasta"):
            locus2genome[seq.name] = genome

    # checksum -> list of kept records with that checksum (distinct sequences)
    hsh_checksum_list = {}
    updated_records = []

    # Context managers guarantee both outputs are flushed and closed, even
    # when a lookup or parse error interrupts the loop.
    with open('nr.faa', 'w') as nr_fasta, \
            open('nr_mapping.tab', 'w') as nr_mapping:
        for record in SeqIO.parse(fasta_file, "fasta"):

            # NOTE: the case is important for crc64, need to check whether it
            # is necessary to make all entries lower/upper case to ensure
            # consistency.
            checksum = CheckSum.crc64(record.seq)

            # The mapping row must be written before record.id is replaced
            # below, since it reports the original locus identifier.
            # NOTE(review): the bare checksum is written even for collision
            # records renamed "<checksum>-<n>" -- confirm downstream users
            # do not need the final nr identifier here.
            nr_mapping.write("%s\t%s\t%s\n" % (record.id,
                                              checksum,
                                              locus2genome[record.id]))

            if checksum not in hsh_checksum_list:
                hsh_checksum_list[checksum] = [record]
                record.id = checksum
                record.name = ""
                updated_records.append(record)
                continue

            # NOTE: having the same hash does not mean that the sequences are
            # identical: as the hash space is smaller than the sequence
            # space, collisions are unavoidable (but not probable) and
            # records with the same hash should be compared
            # https://www.uniprot.org/help/uniparc (sequence comparison)
            lst_records = hsh_checksum_list[checksum]
            if any(prev_record.seq == record.seq
                   for prev_record in lst_records):
                # True duplicate of a sequence already kept.
                continue

            lst_records.append(record)
            # len() is an int: convert explicitly, otherwise the
            # concatenation raises TypeError on the first collision.
            record.id = checksum + "-" + str(len(lst_records))
            record.name = ""
            updated_records.append(record)

        SeqIO.write(updated_records, nr_fasta, "fasta")
示例#2
0
# NOTE: print is called in its single-argument parenthesized form so the
# script runs on both Python 2 and Python 3.
# NOTE(review): some helpers used below may be deprecated or removed in
# recent Biopython releases -- confirm against the installed version.
print(IUPACData.ambiguous_dna_complement)  # dictionary of complements
# and a lot more
from Bio.Data import CodonTable
print(CodonTable.generic_by_id[2])

# SeqUtils. Several functions to deal with DNA and protein sequences.
# DNA utils
import Bio.SeqUtils as SeqUtils
print(SeqUtils.GC('gacgatcggtattcgtag'))  # GC content
from Bio.SeqUtils import MeltingTemp
print(MeltingTemp.Tm_staluc('tgcagtacgtatcgt'))  # DNA/RNA melting temperature
# Checksum functions: a short alphanumeric signature of a file or sequence,
# usually written in the description of the sequence.
# gcg is an easy, weak, widely used checksum (crc32 and crc64 are better).
from Bio.SeqUtils import CheckSum
myseq = 'acaagatgccattgtcccccggcctcctgctgctgct'
print(CheckSum.gcg(myseq))
print(CheckSum.crc32(myseq))
print(CheckSum.crc64(myseq))
print(CheckSum.seguid(myseq))
# Protein utils
from Bio.SeqUtils import ProtParam
myprot = ProtParam.ProteinAnalysis('MLTNK')
print(myprot.count_amino_acids())
print(myprot.get_amino_acids_percent())
print(myprot.molecular_weight())
print(myprot.aromaticity())
print(myprot.instability_index())
print(myprot.flexibility())
print(myprot.isoelectric_point())
print(myprot.secondary_structure_fraction())