def get_nr_sequences(fasta_file, genomes_list):
    """Write a non-redundant FASTA file and a locus-to-checksum mapping table.

    Every record of ``fasta_file`` is hashed with CRC64. The first record seen
    for a given sequence is kept, renamed after its checksum and written to
    ``nr.faa``; later records with an identical sequence are dropped from the
    FASTA output. One line per input record is written to ``nr_mapping.tab``
    as ``<original id>\\t<non-redundant id>\\t<genome>``.

    Args:
        fasta_file: path (or handle) of the FASTA file to de-duplicate.
        genomes_list: paths of per-genome FASTA files. The genome name is the
            part of the file's basename before the first ``.``; it is
            associated with the name of each sequence found in that file.

    Raises:
        KeyError: if a record of ``fasta_file`` has an id that was not found
            in any of the files of ``genomes_list``.

    Side effects:
        Creates/overwrites ``nr.faa`` and ``nr_mapping.tab`` in the current
        working directory. The ``id`` and ``name`` of kept records are
        modified in place.
    """
    # Map every locus name to the genome (file) it comes from.
    locus2genome = {}
    for fasta in genomes_list:
        genome = os.path.basename(fasta).split('.')[0]
        for seq in SeqIO.parse(fasta, "fasta"):
            locus2genome[seq.name] = genome

    # checksum -> list of kept records sharing that checksum (distinct sequences)
    hsh_checksum_list = {}
    updated_records = []

    # Context managers guarantee both output files are flushed and closed,
    # even if parsing fails half-way through.
    with open('nr.faa', 'w') as nr_fasta, \
            open('nr_mapping.tab', 'w') as nr_mapping:
        for record in SeqIO.parse(fasta_file, "fasta"):
            original_id = record.id
            # NOTE: the case is important for crc64, need to check whether it
            # is necessary to make all entries lower/upper case to ensure
            # consistency.
            checksum = CheckSum.crc64(record.seq)
            genome = locus2genome[original_id]

            # NOTE: having the same hash does not mean that the sequences are
            # identical: as the hash space is smaller than the sequence space,
            # collisions are unavoidable (but not probable), so records with
            # the same hash are compared sequence against sequence.
            # https://www.uniprot.org/help/uniparc (sequence comparison)
            lst_records = hsh_checksum_list.setdefault(checksum, [])
            nr_id = None
            for prev_record in lst_records:
                if prev_record.seq == record.seq:
                    # Truly redundant: reuse the id of the record already kept.
                    nr_id = prev_record.id
                    break

            if nr_id is None:
                lst_records.append(record)
                if len(lst_records) == 1:
                    nr_id = checksum
                else:
                    # Real collision (same checksum, different sequence):
                    # disambiguate with the rank of the record in the list.
                    # %d formatting avoids the str + int TypeError.
                    nr_id = "%s-%d" % (checksum, len(lst_records))
                record.id = nr_id
                record.name = ""
                updated_records.append(record)

            # The mapping points at the id actually used in nr.faa, which
            # equals the bare checksum whenever there is no collision.
            nr_mapping.write("%s\t%s\t%s\n" % (original_id, nr_id, genome))

        SeqIO.write(updated_records, nr_fasta, "fasta")
# Demonstration of Biopython data tables and sequence utilities.
# print() is called with a single argument everywhere so the script runs
# unchanged on both Python 2 and Python 3.
print(IUPACData.ambiguous_dna_complement)  # dictionary of complements
# and a lot more
from Bio.Data import CodonTable
print(CodonTable.generic_by_id[2])

# SeqUtils: several functions to deal with DNA and protein sequences.
# DNA utils
# NOTE(review): GC and Tm_staluc may not exist in recent Biopython releases
# (gc_fraction / Tm_NN appear to be the replacements) - confirm against the
# installed version.
import Bio.SeqUtils as SeqUtils
print(SeqUtils.GC('gacgatcggtattcgtag'))  # GC content
from Bio.SeqUtils import MeltingTemp
print(MeltingTemp.Tm_staluc('tgcagtacgtatcgt'))  # DNA/RNA melting temperature

# Checksum functions: short alphanumeric string signature of a file or
# sequence, usually written in the description of the sequence.
# gcg is an easy, weak, widely used checksum (crc32 and crc64 are better).
from Bio.SeqUtils import CheckSum
myseq = 'acaagatgccattgtcccccggcctcctgctgctgct'
print(CheckSum.gcg(myseq))
print(CheckSum.crc32(myseq))
print(CheckSum.crc64(myseq))
print(CheckSum.seguid(myseq))

# Protein utils
from Bio.SeqUtils import ProtParam
myprot = ProtParam.ProteinAnalysis('MLTNK')
print(myprot.count_amino_acids())
print(myprot.get_amino_acids_percent())
print(myprot.molecular_weight())
print(myprot.aromaticity())
print(myprot.instability_index())
print(myprot.flexibility())
print(myprot.isoelectric_point())
print(myprot.secondary_structure_fraction())