def test_allowed_chars_can_be_specified(self):
    """Characters outside a caller-supplied ``allowed_chars`` are errors.

    In both inputs 'A' is not part of the supplied alphabet, so it is
    reported at index 0 and no sequence is returned.
    """
    expected = ('', '', [('A', 0)])
    cases = (
        ('ABCD', "BCD"),
        ('ABCD-DC', "-BCD"),
    )
    for fasta_text, alphabet in cases:
        actual = extract_sequence_from_fasta(fasta_text, allowed_chars=alphabet)
        self.assertEqual(expected, actual)
def test_a_real_sequence_should_parse_correctly(self):
    """A multi-line FASTA record parses into (defline, sequence, [])."""
    # The input is the defline followed by five sequence lines joined by
    # newlines; the literals below are concatenated at compile time.
    fasta_text = (
        '>sp|P30559|OXYR_HUMAN Oxytocin receptor OS=H**o sapiens GN=OXTR PE=1 SV=2\n'
        'MEGALAANWSAEAANASAAPPGAEGNRTAGPPRRNEALARVEVAVLCLILLLALSGNACVLLALRTTRQKHSRLFFFMKH\n'
        'LSIADLVVAVFQVLPQLLWDITFRFYGPDLLCRLVKYLQVVGMFASTYLLLLMSLDRCLAICQPLRSLRRRTDRLAVLAT\n'
        'WLGCLVASAPQVHIFSLREVADGVFDCWAVFIQPWGPKAYITWITLAVYIVPVIVLAACYGLISFKIWQNLRLKTAAAAA\n'
        'AEAPEGAAAGDGGRVALARVSSVKLISKAKIRTVKMTFIIVLAFIVCWTPFFFVQMWSVWDANAPKEASAFIIVMLLASL\n'
        'NSCCNPWIYMLFTGHLFHELVQRFLCCSASYLKGRRLGETSASKKSNSSSFVLSHRSSSQRSCSQPSTA'
    )
    expected = (
        '>sp|P30559|OXYR_HUMAN Oxytocin receptor OS=H**o sapiens GN=OXTR PE=1 SV=2',
        'MEGALAANWSAEAANASAAPPGAEGNRTAGPPRRNEALARVEVAVLCLILLLALSGNACVLLALRTTRQKHSRLFFFMKHLSIADLVVAVFQVLPQLLWDITFRFYGPDLLCRLVKYLQVVGMFASTYLLLLMSLDRCLAICQPLRSLRRRTDRLAVLATWLGCLVASAPQVHIFSLREVADGVFDCWAVFIQPWGPKAYITWITLAVYIVPVIVLAACYGLISFKIWQNLRLKTAAAAAAEAPEGAAAGDGGRVALARVSSVKLISKAKIRTVKMTFIIVLAFIVCWTPFFFVQMWSVWDANAPKEASAFIIVMLLASLNSCCNPWIYMLFTGHLFHELVQRFLCCSASYLKGRRLGETSASKKSNSSSFVLSHRSSSQRSCSQPSTA',
        [],
    )
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def clean_input(self):
    """Validate the ``input`` form field as a single amino acid sequence.

    The submitted text is passed through ``extract_sequence_from_fasta``,
    which yields a defline, the extracted sequence and a list of
    ``(character, index)`` errors.

    Returns -- the submitted text, unchanged, when a sequence was
    extracted and it is not made up solely of A, C, G and T.

    Raises forms.ValidationError when:
      * the sequence consists only of A/C/G/T (case-insensitive), i.e. it
        looks like a nucleotide sequence;
      * no sequence was extracted and no errors were reported;
      * illegal characters were reported (an extra hint is added when one
        of them is '>').
    """
    submitted = self.cleaned_data['input']
    _defline, sequence, errors = extract_sequence_from_fasta(submitted)

    if sequence:
        residues = set(char.upper() for char in sequence)
        if residues.issubset(set('ACTG')):
            raise forms.ValidationError('''Error: You appear to have submitted a nucleotide sequence. Only amino acid sequences are accepted.''')
        return submitted

    if not errors:
        raise forms.ValidationError("You did not submit a sequence")

    # Positions are reported 1-based to the user; the errors are 0-based.
    error_list = ', '.join(
        """position %d,('%s')""" % (index + 1, char)
        for char, index in errors
    )
    if ">" in error_list:
        raise forms.ValidationError('''Error: illegal characters were found at: %s . \nYou may have submitted more than one sequence.''' % error_list)
    raise forms.ValidationError("Error: illegal characters were found at: %s" % error_list)
def get_uniprot_id_from_fasta(fasta_seq):
    """Retrieve UniProt identifier from a FASTA sequence

    The input is cleaned with ``extract_sequence_from_fasta``, parsed into
    a record with ``SeqIO`` and looked up among the ``UniProt`` rows whose
    ``seguid`` equals ``seguid(record.seq)``. When several rows match, the
    order of preference is:

      1. a row whose identifier occurs in the record id;
      2. a row whose taxon scientific name occurs in the record
         description;
      3. the first matching row.

    Returns -- tuple (uniprot identifier, record)
      * (None, None) when the input has errors or no record can be parsed
        from it;
      * (None, record) when no UniProt row matches the sequence.
    """
    # TODO: assume properly formatted sequence; don't do validation here
    (defline, sequence, errors) = extract_sequence_from_fasta(fasta_seq)
    if errors:
        return (None, None)

    proper_seq = '\n'.join([defline, sequence.encode('ascii')])
    handle = StringIO.StringIO(proper_seq)
    # Use the next() builtin with a default rather than the Python 2-only
    # .next() method, so an input that yields no record (e.g. an empty
    # submission) is reported as invalid instead of leaking StopIteration.
    record = next(SeqIO.parse(handle, "fasta"), None)
    if record is None:
        return (None, None)

    record_id = record.id
    header = record.description

    aa_seguid = seguid(record.seq)
    matches = UniProt.objects.filter(seguid=aa_seguid).all()
    if not matches:
        return (None, record)

    for entry in matches:
        uniprot_identifier = entry.uniprot_identifier
        # Is this a holdover from the chained headers used previously?
        if uniprot_identifier and uniprot_identifier in record_id:
            return (uniprot_identifier, record)

    for entry in matches:
        taxon = entry.taxon
        if not taxon:
            continue
        scientific_name = getattr(taxon, 'scientific_name', False)
        if scientific_name and scientific_name in header:
            return (entry.uniprot_identifier, record)

    # Nothing in the header disambiguates the matches: fall back to the first.
    return (matches[0].uniprot_identifier, record)
def test_a_sequence_may_have_at_most_one_defline(self):
    """A second '>' line after the defline is reported as an error.

    The other illegal characters in the body are reported too, and no
    sequence is returned.
    """
    fasta_text = ">\n>\n\nfoo\nA"
    expected = ('>', '', [('>', 0), ('o', 2), ('o', 3)])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def test_a_sequence_may_contain_numbers(self):
    """A digit and spaces in the input are dropped without any error."""
    fasta_text = "ACCANM 7 A "
    expected = ('', 'ACCANMA', [])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def test_a_defline_can_begin_with_multiple_gt_signs(self):
    """A defline starting with '>>' is returned verbatim, with no errors."""
    fasta_text = ">>foo\nACCMNT"
    expected = ('>>foo', 'ACCMNT', [])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def test_a_defline_can_contain_multiple_gt_signs(self):
    """A '>' inside the defline text is kept and is not an error."""
    fasta_text = ">foo>bar\nACCMNT"
    expected = ('>foo>bar', 'ACCMNT', [])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def test_a_sequence_cannot_just_have_mutliple_deflines(self):
    """Two deflines and no sequence: the second '>' is reported as an error."""
    fasta_text = ">defline1\n>defline2\n"
    expected = ('>defline1', '', [('>', 0)])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def test_a_sequence_cannot_begin_with_a_gt(self):
    """A '>' at the start of the sequence body is reported as an error."""
    fasta_text = ">\n>A"
    expected = ('>', '', [('>', 0)])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def test_a_mostly_empty_defline_is_OK(self):
    """A defline consisting only of '>' is accepted with its sequence."""
    fasta_text = ">\nACLACTR"
    expected = ('>', "ACLACTR", [])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def test_a_sequence_with_blacklist_chars(self):
    """'B' and 'J' are reported with their indexes; no sequence is returned."""
    fasta_text = "ABJA"
    expected = ('', '', [("B", 1), ("J", 2)])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def test_a_sequence_may_have_an_asterisk_only_at_the_end(self):
    """An asterisk followed by more residues is reported as an error."""
    fasta_text = "ACLACTR*AC"
    expected = ('', '', [('*', 7)])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def test_an_input_is_required(self):
    """Empty input yields an empty defline, empty sequence and no errors."""
    expected = ('', '', [])
    actual = extract_sequence_from_fasta('')
    self.assertEqual(expected, actual)
def test_a_sequence_may_end_with_an_asterisk(self):
    """A trailing asterisk is dropped from the sequence without an error."""
    fasta_text = "ACLACTR*"
    expected = ('', "ACLACTR", [])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def test_a_sequence_may_not_contain_non_alphanumeric_chars(self):
    """A '?' inside the sequence is reported with its index."""
    fasta_text = "ACL?CA"
    expected = ('', '', [('?', 3)])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def test_a_sequence_may_contain_newlines_tabs_and_spaces(self):
    """Newlines, spaces and tabs are removed from the returned sequence."""
    fasta_text = "ACL\nA CT\tR"
    expected = ('', "ACLACTR", [])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)
def test_an_input_cannot_simply_be_whitespace(self):
    """A single space yields an empty defline and sequence, with no errors."""
    expected = ('', '', [])
    actual = extract_sequence_from_fasta(' ')
    self.assertEqual(expected, actual)
def test_a_sequence_cannot_just_have_a_defline(self):
    """A defline with nothing after it yields an empty sequence, no errors."""
    cases = (
        (">\n", ('>', '', [])),
        (">some text here\n", ('>some text here', '', [])),
    )
    for fasta_text, expected in cases:
        actual = extract_sequence_from_fasta(fasta_text)
        self.assertEqual(expected, actual)
def test_a_sequence_does_not_require_a_defline(self):
    """The same sequence parses with or without a leading defline."""
    cases = (
        ("ACLACTR", ('', "ACLACTR", [])),
        (">foo\nACLACTR", ('>foo', "ACLACTR", [])),
    )
    for fasta_text, expected in cases:
        actual = extract_sequence_from_fasta(fasta_text)
        self.assertEqual(expected, actual)
def test_the_case_of_a_sequence_does_not_matter(self):
    """A lowercase sequence is accepted and returned in its original case."""
    fasta_text = "acca"
    expected = ('', "acca", [])
    actual = extract_sequence_from_fasta(fasta_text)
    self.assertEqual(expected, actual)