Пример #1
0
def read_embl(path_to_embls: list, num_of_entries: int, exclude_csv: str, queue):
    """Parse Swiss-Prot/EMBL flat files and put every kept entry on ``queue``.

    Args:
        path_to_embls: Inputs handed one by one to ``SwissProt.parse``.
        num_of_entries: Accepted for interface compatibility; not used here.
        exclude_csv: Optional path to a CSV file whose first column lists the
            primary accessions to skip. ``None`` disables filtering.
        queue: Object with a ``put`` method that receives each kept entry.

    An input that fails to parse is reported on stdout and skipped; entries
    already queued from that input stay queued.
    """
    exclude_set = None
    if exclude_csv is not None:
        with open(exclude_csv) as in_f:
            # csv.reader yields an empty list for blank lines; skipping them
            # avoids an IndexError on row[0].
            exclude_set = {row[0] for row in csv.reader(in_f) if row}

    for input_f in path_to_embls:
        try:
            entries = SwissProt.parse(input_f)
            if exclude_set is None:
                # No filter requested: keep the loop free of per-entry checks.
                for entry in entries:
                    queue.put(entry)
            else:
                for entry in entries:
                    if entry.accessions[0] not in exclude_set:
                        queue.put(entry)
        except Exception as e:
            print("File '{}' could not be parsed and was excluded. Reason: {}".format(input_f, e))
Пример #2
0
def access_sequence(accession):
    """Return the sequence of the Swiss-Prot entry ``accession``.

    The raw entry is fetched through ``ExPASy.get_sprot_raw`` and parsed with
    ``SwissProt.read``. If a ``ValueError`` is raised while fetching or
    parsing, a warning is printed and ``None`` is returned.
    """
    try:
        handle = ExPASy.get_sprot_raw(accession)
        try:
            record = SwissProt.read(handle)
        finally:
            handle.close()
    except ValueError:
        print("WARNING: Accession %s not found" % accession)
        return None
    return record.sequence
def write_swissprot_annotations(outf, indentation_level, uniprot, uniprot_f):
    """Write selected Swiss-Prot annotations for ``uniprot`` to ``outf``.

    The record is located through the ``UniProtDatIndex`` rows for ``uniprot``:
    the row whose ``uniprot_accession`` equals ``uniprot.accession`` is used if
    there is one, otherwise the last row iterated. ``uniprot_f`` is positioned
    at that row's ``file_char`` offset and a single record is parsed from it.

    Written through ``indented_write``: sequence length, gene-name fields,
    keywords, comments (a type line followed by indented text lines) and
    cross references.
    """
    uniprot_dat_indices = UniProtDatIndex.objects.filter(uniprot=uniprot)
    for uniprot_dat_index in uniprot_dat_indices:
        if uniprot_dat_index.uniprot_accession == uniprot.accession:
            break
    # NOTE(review): if the query yields no rows, uniprot_dat_index is unbound
    # on the next line - confirm callers guarantee at least one row.
    uniprot_f.seek(uniprot_dat_index.file_char)
    # The builtin next() works on Python 2.6+ and 3; the .next() method
    # exists only on Python 2 iterators.
    record = next(SwissProt.parse(uniprot_f))
    indented_write(outf, indentation_level + 1,
                   "Length: %d\n" % record.sequence_length)
    if record.gene_name:
        for name_spec in record.gene_name.replace('\n', ' ').split('; '):
            # Split on the first '=' only, so a value that itself contains
            # '=' no longer breaks the two-way unpacking.
            name_type, names = name_spec.split('=', 1)
            indented_write(outf, indentation_level + 1,
                           "%s: %s\n" % (name_type, names))
    for keyword in record.keywords:
        indented_write(outf, indentation_level + 1, 'Keyword: %s\n' % keyword)
    for comment in record.comments:
        # Comments starting with a dashed rule are skipped.
        if comment.startswith('-----'):
            continue
        components = comment.replace(':\n', ': ').split(': ')
        comment_type = components[0]
        # Re-join the remainder so ': ' inside the comment text is preserved.
        comment_lines = ': '.join(components[1:]).split('\n')
        indented_write(outf, indentation_level + 1, "%s:\n" % comment_type)
        for line in comment_lines:
            indented_write(outf, indentation_level + 2, "%s\n" % line)
    for cross_reference in record.cross_references:
        indented_write(
            outf, indentation_level + 1,
            "%s: %s\n" % (cross_reference[0], '; '.join(cross_reference[1:])))
Пример #4
0
def pull_uniprot(repull=False):
    """Map PRO_ feature sequences to the UniProt identifiers that carry them.

    With ``repull`` set, ``uniprot_sprot_human.dat.gz`` is first fetched via
    ``pull_and_decompress`` and written next to this module. The local
    ``uniprot_sprot_human.dat`` is then parsed, and for every feature whose
    id (index 4) starts with ``PRO_`` and whose start/end (indexes 1 and 2)
    are ints, the matching slice of the record sequence is mapped to
    ``UniProtKB:<accession>#<feature id>``.

    Returns a ``defaultdict(set)`` keyed by sequence.
    """
    dat_path = os.path.join(os.path.dirname(__file__),
                            'uniprot_sprot_human.dat')
    if repull:
        payload = pull_and_decompress(
            'ftp.uniprot.org',
            '/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/',
            'uniprot_sprot_human.dat.gz')
        with open(dat_path, 'w') as out_handle:
            out_handle.write(payload)

    seq_to_idlist = defaultdict(set)
    # Only the PRO_ features are collected; whole-record sequences and cross
    # references are intentionally not added.
    with open(dat_path, 'r') as in_handle:
        for record in SwissProt.parse(in_handle):
            uniprotid = f'UniProtKB:{record.accessions[0]}'
            for feature in record.features:
                feature_id = feature[4]
                if not feature_id.startswith('PRO_'):
                    continue
                start, end = feature[1], feature[2]
                if not (isinstance(start, int) and isinstance(end, int)):
                    continue
                # Feature start is used as 1-based, hence the -1 on the slice.
                fragment = record.sequence[start - 1:end]
                seq_to_idlist[fragment].add(f'{uniprotid}#{feature_id}')
    return seq_to_idlist
Пример #5
0
    def get_genes (self,gene_name=""):
        """Write records whose primary gene name equals ``gene_name`` as FASTA.

        Records are parsed from ``self.fd``; matches are written to
        ``self.fasta_file`` (opened for writing, so existing content is
        replaced). The comparison is case-insensitive. Nothing happens when
        ``gene_name`` is empty.
        """
        if gene_name == "":
            return
        print("Finding \"{}\" gene in Uniprot database...".format(gene_name))
        upper_name = gene_name.upper() # Rho --> RHO

        # The gene_name field looks like "Name=Rho;" or "Name=rho {ECO...}",
        # so the character right after the name is either ';' or ' '.
        # Gene-name fields that do not start with "Name=...;" are not handled.
        wanted = (upper_name + " ", upper_name + ";")

        # The context manager closes the file even if parsing fails midway.
        with open(self.fasta_file, "w") as output_handle:
            for record in SwissProt.parse (self.fd):
                # Skip the 5-character "Name=" prefix and take the name plus
                # one trailing character.
                match = record.gene_name[5:5+len(upper_name)+1].upper()
                if match in wanted:
                    print("Add protein to fasta file: " + record.entry_name + ", ...." + record.gene_name)
                    # NOTE(review): if sequence is a plain str, .format("fasta")
                    # returns it unchanged - confirm and drop the call.
                    output = ">"+record.entry_name+"\n"+record.sequence.format("fasta")+"\n"
                    output_handle.write(output)
Пример #6
0
def get_records(ids):
    """Return the sequences of the Swiss-Prot entries named in ``ids``.

    Each identifier is fetched with ``ExPASy.get_sprot_raw`` and parsed with
    ``SwissProt.read``; the result list follows the order of ``ids``.
    """
    return [
        SwissProt.read(ExPASy.get_sprot_raw(identifier)).sequence
        for identifier in ids
    ]
Пример #7
0
def download_sequences(accessions):
    """Return a dict mapping each accession to its Swiss-Prot sequence.

    Every accession is fetched with ``ExPASy.get_sprot_raw`` and parsed with
    ``SwissProt.read``.
    """
    return {
        acc: SwissProt.read(ExPASy.get_sprot_raw(acc)).sequence
        for acc in accessions
    }
Пример #8
0
def sync_query_list_with_response(response_fn, query_list):
    """Collect RefSeq/EMBL cross references per query id.

    ``response_fn`` is parsed with ``SwissProt.parse``. For each record the
    primary accession is mapped to ``{'RefSeq': [...], 'EMBL': [...]}`` where
    every list item is the cross reference without its database name.

    Isoform ids in ``query_list`` (ids containing '-') that are missing from
    the response inherit the data of their base accession. A base accession
    that was not itself queried is removed from the result once all of its
    isoforms have been filled in. Ids that cannot be resolved are reported
    on stdout.

    Returns the resulting dict.
    """
    db_data = {}
    with open(response_fn, 'r') as fh:
        for record in SwissProt.parse(fh):
            acc = record.accessions[0]
            # Select only EMBL and RefSeq crossrefs
            refseq_refs, embl_refs = [], []
            for db_ref in record.cross_references:
                if db_ref[0] == 'RefSeq':
                    refseq_refs.append(db_ref[1:])
                elif db_ref[0] == 'EMBL':
                    embl_refs.append(db_ref[1:])
            db_data[acc] = {'RefSeq': refseq_refs,
                            'EMBL': embl_refs}

    # Isoform handling, e.g.:
    #   P03692 and P03692-1 can both be in the query list;
    #   P03705-2 can be in the query list without P03705.
    query_set = set(query_list)  # O(1) membership instead of scanning the list
    unqueried_bases = set()
    for prot in query_list:
        if prot in db_data:
            continue
        base_name = prot.split('-')[0]
        if '-' in prot and base_name in db_data:
            db_data[prot] = db_data[base_name]
            if base_name not in query_set:
                # Removal is deferred: deleting the base right away would
                # leave later isoforms of the same base with no data.
                unqueried_bases.add(base_name)
        else:
            print("I don't know what to do with this id: {}".format(prot))

    for base_name in unqueried_bases:
        del db_data[base_name]

    return db_data
Пример #9
0
def fetch_swp_expasy(uniprot_acc):
    """
    Fetch selected fields of a SwissProt entry through ExPASy.

    Record attributes are described at
    http://biopython.org/DIST/docs/api/Bio.SwissProt.Record-class.html

    Parameters
    ----------
    uniprot_acc : str
        SwissProt accession or identifier.

    Returns
    -------
    tuple
        ``(attrib_names, swp_info_list)``: two parallel lists holding the
        attribute labels and the values read from the record.
    """
    record = SwissProt.read(ExPASy.get_sprot_raw(uniprot_acc))

    # Label/value pairs are kept side by side so the two returned lists
    # cannot drift out of step.
    labelled_values = [
        ('accessions', record.accessions),
        ('data created', record.created[0]),
        ('date created (ISO)', dating(record.created[0])),
        ('organism', record.organism),
        ('gene names', record.gene_name),
        ('description', record.description),
        ('comments', record.comments),
        ('keywords', record.keywords),
    ]
    attrib_names = [label for label, _ in labelled_values]
    swp_info_list = [value for _, value in labelled_values]
    return (attrib_names, swp_info_list)
def sequence_file(*args):
    '''Save the protein sequence for the entered UniProt code as a FASTA file.

    The code is read from ``code``; the entry is fetched with
    ``ExPASy.get_sprot_raw`` and parsed with ``SwissProt.read``. If that
    fails, an error window is shown (a different message for an empty code)
    and nothing is saved. Otherwise a header of the form
    ``>sp|<code>|<entry name> <full name> OS=<organism>`` is built and the
    user is asked where to save the file; cancelling the dialog saves
    nothing.'''

    a = code.get()
    try:
        from Bio import ExPASy
        from Bio import SwissProt
        with ExPASy.get_sprot_raw(a) as handle:
            record = SwissProt.read(handle)
    except Exception:
        if a == "":
            open_window("No Code", "Please Insert an Uniprot Code", "#FFC3C3",
                        '200x30')
        else:
            open_window("No Valid Code", "Please Insert a valid Uniprot Code",
                        "#FFC3C3", '200x30')
        # No record was obtained, so there is nothing to write.
        return

    # Keep the first ';'-separated part of the description and drop
    # everything up to and including "Full=".
    descrip = record.description.split(";")[0]
    descrip = descrip[descrip.find("Full=") + 5:]
    fasta_header = (">sp|" + a + "|" + record.entry_name + " " + descrip +
                    " OS=" + record.organism)

    filename = filedialog.asksaveasfilename(defaultextension='.fasta',
                                            filetypes=[("fasta", "*.fasta")])
    if not filename:
        # The dialog returns an empty value when it is cancelled.
        return
    with open(filename, "w") as text_file:
        text_file.write(fasta_header + '\n')
        text_file.write(record.sequence)
Пример #11
0
def parseBlast():
    """Collect keywords and accessions for the top BLAST hit of each query.

    Reads ``./output/blastOut.xml`` with ``NCBIXML.parse``. For every query
    only the first alignment is inspected; for each of its HSPs with an
    e-value below ``E_VALUE_THRESH`` the hit's title words and the keywords
    of its Swiss-Prot record (fetched through ExPASy) are appended to the
    query's keyword list.

    Returns:
        ``(blastHits, accessions)`` - ``blastHits`` maps query id to a
        '; '-joined keyword string, ``accessions`` maps query id to the list
        of hit accessions.
    """
    E_VALUE_THRESH = 1
    blastHits = {}
    accessions = {}
    # NCBIXML.parse is consumed inside the with-block so the handle is still
    # open while records are read and is closed afterwards.
    with open("./output/blastOut.xml") as result_handle:
        # Loop through each protein query's results
        for blast_record in NCBIXML.parse(result_handle):
            keyword_list = []  # running keyword list for this query
            # The query id is the part after ':' in the first word.
            queryID = blast_record.query.split()[0].split(':')[1]
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    # Hit must have e-value < threshold to be considered
                    if hsp.expect < E_VALUE_THRESH:
                        title = alignment.title
                        splittitle = title.split()
                        # Text before 'OS', minus its first two words
                        raw_protein_title = title.split('OS')[0]
                        protein_title = " ".join(raw_protein_title.split()[2:])
                        keyword_list.append(protein_title)
                        # Accession is the second '|'-separated field of the
                        # second word of the title.
                        accession = splittitle[1].split('|')[1]
                        accessions.setdefault(queryID, []).append(accession)
                        handle = ExPASy.get_sprot_raw(accession)
                        record = SwissProt.read(handle)
                        keyword_list += record.keywords
                        blastHits[queryID] = '; '.join(keyword_list)
                break  # only take top hit for now
    return (blastHits, accessions)
Пример #12
0
def features(files):
	"""Print selected features of every record in the Swiss-Prot file ``files``.

	For each feature whose key (index 0) is one of the listed types, one line
	``<key>,<start>-<end>,<description>`` is printed, built from feature
	indexes 0 to 3.
	"""
	ft = ('ZN_FING', 'REGION', 'METAL', 'SITE', 'SIGNAL', 'REPEAT', 'NP_REGION',
		'BINDING', 'MOTIF', 'MOD_RES', 'LIPID', 'DOMAIN', 'DNA_BIND', 'DISULFID',
		'CROSSLNK', 'CARBOHYD', 'CA_BIND', 'ACT_SITE')
	# The context manager closes the input file once parsing is done.
	with open(files) as handle:
		for record in SwissProt.parse(handle):
			for l in record.features:
				if l[0] in ft:
					# Parenthesised single-argument print is valid on
					# Python 2 and Python 3 alike.
					print(l[0]+','+str(l[1])+'-'+str(l[2])+','+l[3])
def parse_uniprot(input_file):
    """Summarise the Pfam cross references of every record in ``input_file``.

    Returns three dicts keyed by record entry name:
        dic_pfam: the Pfam ids concatenated, each followed by ';'.
        dic_dom: the number of Pfam cross references.
        dic_king: 'None' when there are no Pfam references, 'Shared' when at
            least one Pfam id is in ``shared_domains``, otherwise 'Unique'.
    """
    dic_pfam = {}
    dic_dom = {}
    dic_king = {}
    # probably faster/easier to use the XML parser directly
    with open(input_file) as handle:
        for record in SwissProt.parse(handle):
            entry = record.entry_name
            pfam_ids = [db[1] for db in record.cross_references
                        if db[0] == "Pfam"]
            dic_pfam[entry] = ''.join(pfam_id + ";" for pfam_id in pfam_ids)
            dic_dom[entry] = len(pfam_ids)
            if not pfam_ids:
                dic_king[entry] = 'None'
            elif any(pfam_id in shared_domains for pfam_id in pfam_ids):
                dic_king[entry] = "Shared"
            else:
                dic_king[entry] = 'Unique'
    return dic_pfam, dic_dom, dic_king
Пример #14
0
def main(filename):
    """Print the GO biological-process names of the entry named in ``filename``.

    The file content (stripped) is used as the identifier passed to
    ``ExPASy.get_sprot_raw``. A cross reference is printed when its database
    is 'GO' and its third field starts with 'P'; the text between the first
    and second ':' of that field is what gets printed.
    """
    with open(filename) as fin:
        accession = fin.read().strip()
    handle = ExPASy.get_sprot_raw(accession)
    record = SwissProt.read(handle)
    processes = [f[2].split(':')[1] for f in record.cross_references
                 if f[0] == 'GO' and f[2][0] == 'P']
    for s in processes:
        # Parenthesised single-argument print runs on Python 2 and 3.
        print(s)
Пример #15
0
	def find_COG2(self):
		"""Find records from uniprotIDs without use of keggIDs.

		Fetches the Swiss-Prot record for ``self.uprotID``, removes the
		leading ``Name=`` and a trailing ``;`` from its gene-name field and
		queries ``http://rest.genome.jp/oc/`` with the result. Returns the
		response body.
		"""
		handle = ExPASy.get_sprot_raw(self.uprotID)
		record = SwissProt.read(handle)
		# str.strip("Name=;") would remove any of those *characters* from
		# both ends (turning e.g. "Name=eno;" into "no"), so the prefix and
		# suffix are removed explicitly instead.
		query = record.gene_name
		if query.startswith("Name="):
			query = query[len("Name="):]
		if query.endswith(";"):
			query = query[:-1]
		# NOTE: urllib.urlopen exists on Python 2 only.
		url_open = urllib.urlopen("http://rest.genome.jp/oc/?"+query)
		return url_open.read()
Пример #16
0
def get_SwissProt(dict, accession):
    """Fetch the Swiss-Prot record for ``accession`` and store it in ``dict``.

    On ``urllib2.HTTPError`` a message is printed and ``dict`` is left
    unchanged.
    """
    try:
        handle = ExPASy.get_sprot_raw(accession)
        record = SwissProt.read(handle)
        dict[accession] = record
    # "except X as e" / parenthesised print are valid from Python 2.6 on,
    # unlike the older "except X, e" and print-statement forms.
    except urllib2.HTTPError:
        print(accession + ": protein not found on UniProt . ")
Пример #17
0
def main():
    """Write and print the GO biological processes of one UniProt entry.

    The identifier is read from ``problem_datasets/rosalind_dbpr.txt``. The
    process names (GO cross references whose third field starts with 'P:')
    are written one per line to ``output/rosalind_dbpr_out.txt`` and printed
    together with the gene name.
    """
    # Read the UniProt ID from a txt file.
    with open('problem_datasets/rosalind_dbpr.txt', 'r') as infile:
        uni_id = infile.read().strip()

    # Retrieve the data from UniProt.
    raw_data = ExPASy.get_sprot_raw(uni_id)
    record = SwissProt.read(
        raw_data)  # use SwissProt.parse for multiple proteins

    # Only GO cross references are considered; checking the database name
    # keeps other databases' third fields out of the result.
    go = [
        ref[2][2:]
        for ref in record.cross_references
        if ref[0] == 'GO' and ref[2].startswith('P:')
    ]

    # Output answer.
    with open('output/rosalind_dbpr_out.txt', 'w') as outfile:
        outfile.write('\n'.join(go))

    # Optional: print answer and gene name. The first word of the gene-name
    # field is "Name=<gene>;", so drop the 5-character prefix and the ';'.
    name = record.gene_name.split(' ')[0][5:].rstrip(';')
    print('Gene:\n',
          name,
          ' (UniProt ID = ',
          uni_id,
          ')\n\nBiological Processes:\n',
          '\n'.join(go),
          sep='')
Пример #18
0
def go_in_papers(sp_path):
    """Index GO annotations and proteins by the papers of each SP record.

    To be used with SP data, not GOA. Records without GO evidence codes
    (per ``get_go_evidence_codes``) are skipped.

    Returns:
        papers: maps each paper (as returned by ``get_papers``) to a list of
            dicts with keys 'sp_id' (record entry name), 'go_id' and 'go_ec'
            (first and second item of a GO evidence record).
        papers_prots: maps each paper to a dict counting how often each
            entry name was seen for it.
    """
    papers = {}
    papers_prots = {}
    # The context manager closes the input once all records are consumed.
    with open(sp_path) as sph:
        for sp_rec in SP.parse(sph):
            cur_go_recs = get_go_evidence_codes(sp_rec)
            if not cur_go_recs:
                continue
            for paper in get_papers(sp_rec):
                prot_counts = papers_prots.setdefault(paper, {})
                prot_counts[sp_rec.entry_name] = \
                    prot_counts.get(sp_rec.entry_name, 0) + 1
                for cur_go_rec in cur_go_recs:
                    d1 = dict(sp_id=sp_rec.entry_name,
                              go_id=cur_go_rec[0],
                              go_ec=cur_go_rec[1])
                    papers.setdefault(paper, []).append(d1)
    return papers, papers_prots
Пример #19
0
 def test_can_parse_record_into_protein_objects(self):
     """The first parsed record is converted into the expected protein."""
     # Only the first record of the fixture is needed.
     for first_record in SwissProt.parse(self.records):
         protein = parse_record_into_protein(first_record)
         break
     self.assertEqual(protein.uniprot_id, "P31946")
     self.assertEqual(protein.gene_id, "YWHAB")
     self.assertEqual(protein.reviewed, True)
Пример #20
0
def file_parse():
    """Print the distinct taxonomy ids, organisms and lineages in uniprot.gz.

    Three lists are collected in first-seen order: taxonomy ids, organism
    names (with surrounding '.' stripped) and organism classifications. Each
    list is de-duplicated independently, so the printed tuples pair the n-th
    distinct value of each list; they are not guaranteed to come from the
    same record.
    """
    # First-seen-order lists; list membership is used on purpose because the
    # collected values may themselves be lists (unhashable).
    non_rep_id = []
    non_rep_org = []
    non_rep_tax = []

    # NOTE(review): gzip.open defaults to binary mode - confirm that
    # SwissProt.parse accepts such a handle on the Python version in use.
    with gzip.open("uniprot.gz") as handle:
        for swiss_record in SwissProt.parse(handle):

            # NCBI ID
            tax_id = swiss_record.taxonomy_id
            if tax_id not in non_rep_id:
                non_rep_id.append(tax_id)

            # ORGANISM NAME
            organism = swiss_record.organism.strip('.')
            if organism not in non_rep_org:
                non_rep_org.append(organism)

            # TAXONOMY
            taxonomy = swiss_record.organism_classification
            if taxonomy not in non_rep_tax:
                non_rep_tax.append(taxonomy)

    # Each zipped tuple is printed in its default string form (the original
    # split that string into characters and joined them back unchanged).
    for row in zip(non_rep_id, non_rep_org, non_rep_tax):
        print(format(row))
Пример #21
0
def gen_uniprot_features_for_pdb(infile):
    """Print UniProt annotations for the PDB entries listed in ``infile``.

    Each input line holds three tab-separated fields: a PDB id, a count
    (unused) and '|'-separated UniProt ids. Every UniProt id is fetched
    through ExPASy; it is reported only if one of its PDB cross references
    equals the PDB id (case-insensitively). A reported line contains, tab
    separated: PDB id, UniProt id, '|'-joined GO ids, the last InterPro id,
    the last EvolutionaryTrace id, the organism and the last comment that
    starts with 'SUBCELLULAR LOCATION'.
    """
    with open(infile, 'r') as handle:
        for line in handle:
            (pdb_dom, count, uniprot_ids) = line.replace('\n', '').split('\t')
            for uniprot_id in uniprot_ids.split('|'):
                record = SwissProt.read(ExPASy.get_sprot_raw(uniprot_id))
                keep = False
                go = []
                interpro = ''
                evo_trace = ''
                for xref in record.cross_references:
                    if xref[0] == 'GO':
                        go.append(xref[1])
                    if xref[0] == 'InterPro':
                        interpro = xref[1]
                    if xref[0] == 'EvolutionaryTrace':
                        evo_trace = xref[1]
                    if xref[0] == 'PDB' and xref[1].lower() == pdb_dom.lower():
                        keep = True
                if not keep:
                    continue
                loc = ''
                for comment in record.comments:
                    if comment.startswith('SUBCELLULAR LOCATION'):
                        loc = comment
                # Parenthesised single-argument print runs on Python 2 and 3.
                print('%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                    pdb_dom, uniprot_id, '|'.join(go), interpro, evo_trace,
                    record.organism, loc))
Пример #22
0
 def test_compute_features_returns_None_if_target_is_None(self):
     """compute_interaction_features gives None when the target is None."""
     # Only the first record of the fixture is needed.
     for first_record in SwissProt.parse(self.records):
         parsed = parse_record_into_protein(first_record)
         break
     parsed.save(self.session, commit=True)
     # Fetch the saved row again by id.
     source = Protein.query.get(parsed.id)
     self.assertIsNone(compute_interaction_features(source, None))
Пример #23
0
    def test_compute_features_return_empty_list_if_features_are_empty(self):
        """Every feature list is empty when the protein has no annotations."""
        # Only the first record of the fixture is needed.
        for first_record in SwissProt.parse(self.records):
            protein = parse_record_into_protein(first_record)
            break

        # Blank out every annotation attribute before saving.
        for attribute in ('go_mf', 'go_bp', 'go_cc',
                          'interpro', 'pfam', 'keywords'):
            setattr(protein, attribute, None)

        protein.save(self.session, commit=True)
        protein = Protein.query.get(protein.id)  # fetch the saved row again

        expected = {
            'go_mf': [],
            'go_bp': [],
            'go_cc': [],
            'ulca_go_mf': [],
            'ulca_go_bp': [],
            'ulca_go_cc': [],
            'interpro': [],
            'pfam': [],
            'keywords': [],
        }
        features = compute_interaction_features(protein, protein)
        self.assertEqual(expected, features)
Пример #24
0
def main(input_string):
    """Print the GO biological-process names of entry ``input_string``.

    A cross reference qualifies when its database is 'GO' and its third
    field starts with 'P:'; the text after that prefix is printed.
    """
    handle = ExPASy.get_sprot_raw(input_string)
    record = SwissProt.read(handle)
    for xref in record.cross_references:
        if xref[0] != 'GO':
            continue
        term = xref[2]
        if term.startswith('P:'):
            print(term[2:])
Пример #25
0
def get_SwissProt(dict,accession):
    """Fetch the Swiss-Prot record for ``accession`` and store it in ``dict``.

    On ``urllib2.HTTPError`` a message is printed and ``dict`` is left
    unchanged.
    """
    try:
        handle = ExPASy.get_sprot_raw(accession)
        record = SwissProt.read(handle)
        dict[accession] = record
    # "except X as e" / parenthesised print are valid from Python 2.6 on,
    # unlike the older "except X, e" and print-statement forms.
    except urllib2.HTTPError:
        print(accession + ": protein not found on UniProt . ")
Пример #26
0
def go_in_papers(sp_path):
    """Index GO annotations and proteins by the papers of each SP record.

    To be used with SP data, not GOA. Records without GO evidence codes
    (per ``get_go_evidence_codes``) are skipped.

    Returns:
        papers: maps each paper (as returned by ``get_papers``) to a list of
            dicts with keys 'sp_id' (record entry name), 'go_id' and 'go_ec'
            (first and second item of a GO evidence record).
        papers_prots: maps each paper to a dict counting how often each
            entry name was seen for it.
    """
    papers = {}
    papers_prots = {}
    # The context manager closes the input once all records are consumed.
    with open(sp_path) as sph:
        for sp_rec in SP.parse(sph):
            cur_go_recs = get_go_evidence_codes(sp_rec)
            if not cur_go_recs:
                continue
            for paper in get_papers(sp_rec):
                prot_counts = papers_prots.setdefault(paper, {})
                prot_counts[sp_rec.entry_name] = \
                    prot_counts.get(sp_rec.entry_name, 0) + 1
                for cur_go_rec in cur_go_recs:
                    d1 = dict(sp_id=sp_rec.entry_name,
                              go_id=cur_go_rec[0],
                              go_ec=cur_go_rec[1])
                    papers.setdefault(paper, []).append(d1)
    return papers, papers_prots
Пример #27
0
    def _parse_features( self ):
        """Write the features of selected UniProt records as tab-separated lines.

        Reads the flat files ``files[11]`` to ``files[14]`` under ``path`` and
        writes to ``files[16]``. Only records whose first taxonomy id is
        9606, 10090 or 10116 are used. Each output line is the record's first
        accession followed by the feature fields, with one extra field
        inserted at index 3 (see below).
        """
    
        print( 'uniprot flat files, to get features...' )
        with open( path + files[16], 'wt' ) as outf:

            for j in [11,12,13,14]:
                print( files[j] + '...' )
                with open(path + files[j], 'rt') as handle:
                    for record in SwissProt.parse(handle):
                        if record.taxonomy_id[0] in ['9606', '10090', '10116']:
                            accs  = record.accessions
                            # pop(0) removes the first accession from the
                            # record's own list as a side effect.
                            acc   = accs.pop(0)
                            feats = record.features
                            for f in feats:
                                f = list(f)
                                # Insert an empty slot at index 3; the field
                                # that was at index 3 is now at index 4 and is
                                # split below into f[3] and f[4].
                                f.insert(3, '')
                                # Case 1: text whose only '.' is at the end ->
                                # text without the '.' goes to f[3], f[4] is
                                # emptied.
                                if re.search(r'^[^\.]+\.\s*$', f[4]):
                                    m = re.match(r'^(.+)\.\s*$', f[4])
                                    if m:
                                        f[3] = m.group(1)
                                        f[4] = ''
                                # Case 2: "<text>. {<braced part>}." -> text in
                                # f[3], braced content in f[4].
                                elif re.search(r'.+\.\s+\{', f[4]):
                                    m = re.match(r'^(.+)\.\s*\{(.+)\}\.$', f[4])
                                    if m:
                                        f[3] = m.group(1)
                                        f[4] = m.group(2)
                                # Case 3: "<text>. /<rest>." -> text in f[3],
                                # the part after '/' in f[4].
                                elif re.search(r'.+\.\s+\/', f[4]):
                                    m = re.match(r'^(.+)\.\s*\/(.+)\.$', f[4])
                                    if m:
                                        f[3] = m.group(1)
                                        f[4] = m.group(2)                                
                                # Otherwise: only remove braces, dots and
                                # slashes from f[4]; f[3] stays empty.
                                else :
                                    f[4] = re.sub(r'[\{\}\.\/]', '', f[4]) 
                                #print(f)
                                outf.write( acc + "\t" + '\t'.join(map(str, f)) + '\n')
Пример #28
0
    def MouseHomolog(self, dfs):
        """Flag peptides that also occur in the mouse homolog's sequence.

        ``dfs`` is indexed in parallel with ``self.accs``. For every accession
        the Swiss-Prot entry is fetched, the mouse entry name is derived by
        replacing the species suffix with ``_MOUSE``, and a 'Mouse' column
        holding the strings 'True'/'False' is added to the matching data
        frame, telling whether each value of its ``Sequence`` column occurs
        in the mouse sequence. Accessions whose own or mouse entry cannot be
        fetched are skipped.

        The processed frames are concatenated and written to
        ``MouseHomologPeptides.xlsx`` in ``self.out_folder``; when no frame
        was processed nothing is written.
        """
        print('\nFinding mouse homologs')
        new_dfs = []

        # enumerate keeps the position in step with dfs even for accessions
        # that are skipped.
        for ind, acc in enumerate(self.accs):

            try:
                handle = ExPASy.get_sprot_raw(acc)
                record = SwissProt.read(handle)
                name = record.entry_name
            except Exception:
                # Best-effort lookup; Exception (not a bare except) so that
                # Ctrl-C still interrupts the run.
                print('\nNo entry for', acc, ',continuing')
                continue

            try:
                mname = name.split('_')[0] + '_MOUSE'
                mhandle = ExPASy.get_sprot_raw(mname)
                mrecord = SwissProt.read(mhandle)
                mseq = mrecord.sequence
                print(f'\nFound mouse homolog for {name}: {mname}')
            except Exception:
                print(f'\nNo mouse gene entry for {acc}-{name}, continuing')
                continue

            df = dfs[ind]
            mcol = []

            for label in df.index:
                pepseq = df.Sequence[label]
                print(pepseq)
                mcol.append('True' if str(pepseq) in mseq else 'False')

            df['Mouse'] = mcol
            new_dfs.append(df)

        if not new_dfs:
            # pd.concat raises on an empty list, so stop here instead.
            print('\nNo mouse homologs found, nothing written')
            return

        df_final = pd.concat(new_dfs, sort=True)
        df_final.to_excel(self.out_folder + '/' + 'MouseHomologPeptides.xlsx',
                          index=True)
Пример #29
0
 def get(self,id):
     """Fetch one Swiss-Prot entry from ExPASy and return the parsed record.

     ``id`` is passed unchanged to ``ExPASy.get_sprot_raw``; the returned
     handle is parsed with ``SwissProt.read``.
     """
     return SwissProt.read(ExPASy.get_sprot_raw(id))
Пример #30
0
 def test_parses_function_as_None_for_entry_with_no_comment(self):
     """function() gives None once all FUNCTION comments are removed."""
     # Stop after the first record; the loop variable keeps it.
     for first in SwissProt.parse(self.records):
         break
     first.comments = [
         text for text in first.comments if "FUNCTION: " not in text
     ]
     self.assertEqual(function(first), None)
Пример #31
0
def swissprot_search():
    """Print the Swiss-Prot record of every accession in the accession file.

    ``output/seq_accession.txt`` is read line by line. The first line is read
    and discarded (the original stored it in an unused variable). Every other
    non-blank line is used as an accession, with surrounding whitespace
    removed.
    """
    with open('output/seq_accession.txt') as f:
        f.readline()
        for line in f:
            # Lines keep their trailing newline when iterating a file; it
            # must not be sent as part of the accession.
            accession = line.strip()
            if not accession:
                continue
            handle = ExPASy.get_sprot_raw(accession)
            record = SwissProt.read(handle)
            print(record)
Пример #32
0
 def load_uniprot(self):
     """Load every record of 'uniprot.txt' into ``self.uniprot``.

     ``self.uniprot`` is None when ``self.exists('uniprot.txt')`` is false,
     otherwise the list of records parsed from ``self.open('uniprot.txt')``.
     """
     self.uniprot = None
     if self.exists('uniprot.txt'):
         with self.open('uniprot.txt') as fp:
             self.uniprot = []
             self.uniprot.extend(SwissProt.parse(fp))
Пример #33
0
def main(argv):
    """Print the GO biological-process names of the entry read from stdin.

    One line is read with ``input()`` and passed (stripped) to
    ``ExPASy.get_sprot_raw``. A cross reference qualifies when its database
    is 'GO' and its third field starts with 'P:'; the text after that prefix
    is printed.
    """
    accession = input().strip()
    handle = ExPASy.get_sprot_raw(accession)
    record = SwissProt.read(handle)  # use SwissProt.parse for multiple proteins

    for xref in record.cross_references:
        if xref[0] == 'GO' and xref[2].startswith('P:'):
            print(xref[2][2:])
Пример #34
0
def getgo(id):
    """Print the GO process names of Swiss-Prot entry ``id``, one per line.

    A cross reference qualifies when its database is 'GO' and its third field
    starts with 'P'; the text between the first and second ':' of that field
    is collected.
    """
    record = SwissProt.read(ExPASy.get_sprot_raw(id))
    names = []
    for xref in record.cross_references:
        if xref[0] != "GO":
            continue
        term = xref[2]
        if term.startswith("P"):
            names.append(term.split(":")[1])
    print("\n".join(names))
Пример #35
0
 def generate_uniprot_record(self):
     """Yield a parsed dict for every accepted record of every UniProt file.

     File handles and their numbers come from ``self._uniprot_file_handle()``.
     A record is accepted when ``self._check_id_to_use`` is true for its first
     accession; accepted records are converted by ``self._parse_record``
     together with the data source of their file.
     """
     for file_handle, file_number in self._uniprot_file_handle():
         data_source = self._file_number_to_source(file_number)
         for record in SwissProt.parse(file_handle):
             if not self._check_id_to_use(record.accessions[0]):
                 continue
             yield self._parse_record(record, data_source)
Пример #36
0
def main():
    """Write the GO biological-process names of one entry to ``dbpr.out``.

    The identifier is the first line of the file ``dbpr``. From the GO cross
    references, third fields starting with 'P' are kept and written without
    their first two characters, one per line.
    """
    with open("dbpr") as f:
        accession = f.readline().strip()
    record = SwissProt.read(ExPASy.get_sprot_raw(accession))
    go_terms = [x[2] for x in record.cross_references if x[0] == 'GO']
    processes = [term[2:] for term in go_terms if term[0] == 'P']
    # Write to the file directly rather than rebinding sys.stdout, which
    # would redirect all later output of the process and never close the file.
    with open("dbpr.out", "w") as out:
        out.write("\n".join(processes) + "\n")
Пример #37
0
def main(id):
    """Print the GO biological-process names of Swiss-Prot entry ``id``.

    For every GO cross reference the third field is split on ':'; when the
    first part is 'P', the second part is printed.
    """
    handle = ExPASy.get_sprot_raw(id)
    record = SwissProt.read(handle)
    for cr in record.cross_references:
        if cr[0] == "GO":
            bits = cr[2].split(":")
            if bits[0] == "P":
                # Parenthesised single-argument print runs on Python 2 and 3.
                print(bits[1])
Пример #38
0
def main(argv):
    """Print the GO biological-process names of the entry named in ``argv[0]``.

    The identifier is obtained with ``files.read_line(argv[0])``. A cross
    reference qualifies when its database is 'GO' and its third field starts
    with 'P:'; the text after that prefix is printed, one name per line.
    """
    line = files.read_line(argv[0])
    handle = ExPASy.get_sprot_raw(line)
    record = SwissProt.read(handle)

    # startswith instead of a substring test: 'P:' also occurs inside other
    # terms (e.g. "F:ATP:ADP ..."), which must not be reported.
    go = [x[2] for x in record.cross_references
          if x[0] == 'GO' and x[2].startswith('P:')]

    # Slice off the prefix so names that contain ':' are kept whole.
    print('\n'.join(term[2:] for term in go))
Пример #39
0
def dbpr():
    """Print the GO biological-process names of the entry in rosalind_dbpr.txt.

    A cross reference qualifies when its database is 'GO' and its third field
    starts with 'P:'; the text after that prefix is printed.
    """
    with open("rosalind_dbpr.txt") as infile:
        uniprot_id = infile.read().strip()
    handle = ExPASy.get_sprot_raw(uniprot_id)
    record = SwissProt.read(handle)

    for ref in record.cross_references:
        if ref[0] == 'GO' and ref[2].startswith('P:'):
            # Parenthesised single-argument print runs on Python 2 and 3.
            print(ref[2][2:])
Пример #40
0
    def _parse_flat_files( self ):
        """Flatten selected UniProt records into one tab-separated file.

        Reads the flat files ``files[11]`` to ``files[14]`` under ``path`` and
        writes to ``files[15]``. Only records whose first taxonomy id is
        9606, 10090 or 10116 are used.

        Output columns, in order: primary accession, entry name, source
        database ('tr' when the file name contains 'trembl', else 'sp'),
        taxonomy id, data class, gene name, recommended/submitted name,
        alternative full name, alternative short name, flags, secondary
        accessions, GeneID ids, RefSeq ids (version suffix removed), HGNC
        ids, MGI ids, then database names, ids and third fields of the cross
        references whose database is in ``xdoms``, sequence info and
        sequence. Multi-valued columns are '|'-joined.
        """
        print( 'uniprot flat files...' )
        with open( path + files[15], 'wt' ) as outf:

            for j in [11,12,13,14]:
                print( files[j] + '...' )
                # The source database depends on the file name only, so it is
                # decided once per file.
                srcdb = 'tr' if re.search(r'trembl', files[j]) else 'sp'
                with open(path + files[j], 'rt') as handle:
                    for record in SwissProt.parse(handle):
                        if record.taxonomy_id[0] not in ['9606', '10090', '10116']:
                            continue
                        accs  = record.accessions
                        # pop(0) leaves only the secondary accessions in accs.
                        acc   = accs.pop(0)
                        rev   = record.data_class
                        gname = re.sub(r'.*Name=([^;{]+)[{;].*', r'\1', record.gene_name).strip()
                        uid   = record.entry_name
                        taxid = record.taxonomy_id[0]
                        seq   = record.sequence
                        sinfo = str(record.seqinfo[0])
                        desc  = record.description

                        # re.IGNORECASE is passed as flags=...; as the fourth
                        # positional argument of re.sub it would be taken as
                        # the replacement count.
                        rname = ''
                        fname = ''
                        sname = ''
                        flag_text = ''
                        if 'RecName' in desc:
                            rname = re.sub(r'.*RecName: *Full=([^;{]+)[{;].*', r'\1', desc, flags=re.IGNORECASE).strip()
                        elif 'SubName' in desc:
                            rname = re.sub(r'.*SubName: *Full=([^;{]+) *[;{].*', r'\1', desc, flags=re.IGNORECASE).strip()
                        if 'AltName' in desc:
                            if re.search(r'AltName:[^:]*Full=', desc, re.IGNORECASE):
                                fname = re.sub(r'.*AltName:[^:]*Full=([^;{]+)[{;].*', r'\1', desc, flags=re.IGNORECASE).strip()
                            if re.search(r'AltName:[^:]*Short=', desc, re.IGNORECASE):
                                sname = re.sub(r'.*AltName:[^:]*Short=([^;{]+)[{;].*', r'\1', desc, flags=re.IGNORECASE).strip()
                        if 'Flags:' in desc:
                            flag_text = re.sub(r'.*Flags: *([^;]+);.*', r'\1', desc, flags=re.IGNORECASE).strip()

                        refs  = list()
                        eids  = list()
                        mgis  = list()
                        hgnc  = list()
                        dids  = list()
                        dnms  = list()
                        ddbs  = list()
                        for xref in record.cross_references:
                            db = xref[0]
                            if db == 'GeneID':
                                eids.append(xref[1])
                            elif db == 'RefSeq':
                                # Drop the ".<version>" suffix.
                                refs.append(re.sub(r'\.\d+$', r'', xref[1]))
                            elif db == 'MGI':
                                mgis.append(xref[1])
                            elif db == 'HGNC':
                                hgnc.append(xref[1])
                            # Checked independently: a database may be listed
                            # in xdoms as well as handled above.
                            if db in xdoms:
                                dids.append(xref[1])
                                ddbs.append(db)
                                dnms.append(xref[2])
                        outf.write( '\t'.join([ acc, uid, srcdb, taxid, rev, gname, rname, fname, sname, flag_text, '|'.join(accs), '|'.join(eids), '|'.join(refs), '|'.join(hgnc), '|'.join(mgis), '|'.join(ddbs), '|'.join(dids), '|'.join(dnms), sinfo, seq ]) + '\n' )
def get_ancestors_list():
    """Append the sequence of the first record of ``uniprot_sprot.dat`` to ``descriptions``.

    The file ``uniprot_sprot.dat`` is opened relative to the current working
    directory and parsed with ``SwissProt.parse``.  Only the first record is
    consumed: its ``sequence`` is appended to ``descriptions`` and the whole
    ``descriptions`` object is printed.  Nothing is returned.

    ``descriptions`` is not defined in this function.
    # NOTE(review): presumably a module-level list; confirm against the rest of the file.
    """
    # The context manager closes the data file even if parsing fails.
    with open("uniprot_sprot.dat") as handle:
        for record in SwissProt.parse(handle):
            descriptions.append(record.sequence)
            print(descriptions)
            # Stop after the first record.
            break
Пример #42
0
 def acession(self):
     """Build ``self.rec`` from ``self.ids`` and return it.

     Every identifier equal to ``'ND'`` is copied through as the string
     ``'ND'``.  Any other identifier is fetched with
     ``ExPASy.get_sprot_raw`` and parsed with ``SwissProt.read``; the parsed
     record takes its place.  The result keeps the order of ``self.ids``.
     """
     self.rec = []
     for identifier in self.ids:
         if identifier == 'ND':
             # Placeholder entries are kept as-is, no lookup is attempted.
             self.rec.append('ND')
             continue
         raw_entry = ExPASy.get_sprot_raw(identifier)
         self.rec.append(SwissProt.read(raw_entry))
     return self.rec
Пример #43
0
def get_keywords(lookup):
    """Fetch and parse the Swiss-Prot entry for ``lookup``.

    The entry is requested with ``ExPASy.get_sprot_raw`` and parsed with
    ``SwissProt.read``.  If the request fails, ``"Error in ExPASy"`` is
    printed; if parsing raises ``ValueError``, the error is printed.  In
    both cases the process exits with status 1.

    As written here the function returns ``None``: the parsed record is
    not used any further within this block.
    """
    try:
        handle = ExPASy.get_sprot_raw(lookup)
    except Exception:
        # ``Exception`` (not a bare ``except``) so that Ctrl-C and
        # ``SystemExit`` are not misreported as an ExPASy failure.
        print("Error in ExPASy")
        sys.exit(1)
    try:
        record = SwissProt.read(handle)
    except ValueError as error:
        # ``as`` form is valid on both Python 2.6+ and Python 3.
        print(error)
        sys.exit(1)
Пример #44
0
def BiologicalProcesses(UniProtID):
    """Return the biological-process names of a Swiss-Prot entry, one per line.

    The entry for ``UniProtID`` is fetched with ``ExPASy.get_sprot_raw`` and
    parsed with ``SwissProt.read``.  Every cross-reference that contains the
    element ``"GO"`` is scanned; each of its fields that starts with ``P:``
    contributes the text following its *last* colon.  The collected names
    are joined with newlines.
    """
    record = SwissProt.read(ExPASy.get_sprot_raw(UniProtID))

    process_names = [
        field[field.rfind(':') + 1:]
        for xref in record.cross_references
        if "GO" in xref
        for field in xref
        if re.match("P:.*", field)
    ]
    return "\n".join(process_names)
Пример #45
0
def fetch(acc):
    '''Download one entry from UniProt and parse it.

    Input:
    acc: accession code of the record
    Return: the record object produced by SwissProt.read
    '''
    import io

    base_url = 'http://www.uniprot.org/uniprot/'
    response = urllib.request.urlopen(base_url + acc + '.txt')
    # urlopen yields a binary stream; the parser is given a text-mode handle.
    # Leaving the ``with`` block closes the wrapper and, with it, the
    # underlying HTTP response.
    with io.TextIOWrapper(response, encoding='utf-8') as handle:
        return SwissProt.read(handle)
Пример #46
0
 def obtain_taxons(self, protein_dict, fh_sprot):
     """Fill ``protein_dict`` with taxonomy ids read from ``fh_sprot``.

     Every record yielded by ``sp.parse(fh_sprot)`` is inspected.  For the
     first of its accessions that is a key of ``protein_dict``, the value
     stored under that key is replaced by the record's ``taxonomy_id``;
     the remaining accessions of that record are not examined.  The whole
     input is always read.  The same (mutated) dictionary is returned.
     """
     for record in sp.parse(fh_sprot):
         for accession in record.accessions:
             if accession not in protein_dict:
                 continue
             # assign the record's taxonomy_id list to the protein
             protein_dict[accession] = record.taxonomy_id
             break
     return protein_dict
Пример #47
0
def main(protein_id):
    """Return the biological-process GO term names of a Swiss-Prot entry.

    The entry for ``protein_id`` is fetched with ``ExPASy.get_sprot_raw``
    and parsed with ``SwissProt.read`` (use ``SwissProt.parse`` for several
    proteins).  Every cross-reference is printed.  For cross-references
    whose first field is ``"GO"``, the third field is split at its first
    colon; when the part before the colon is ``'P'`` the part after it is
    collected.  The collected names are returned newline-separated, with
    surrounding whitespace stripped.
    """
    handle = ExPASy.get_sprot_raw(protein_id) #you can give several IDs separated by commas
    record = SwissProt.read(handle) # use SwissProt.parse for multiple proteins

    names = []
    for r in record.cross_references:
        # Function-call form prints the same text on Python 2 and 3.
        print(r)
        if r[0] == "GO":
            # Split only at the first colon so that term names which
            # themselves contain ':' are kept whole.
            aspect, _, name = r[2].partition(":")
            if aspect == 'P':
                names.append(name)

    return "\n".join(names).strip()
Пример #48
0
 def __init__(self, sprot_cache='', trembl_cache='', organism='h**o sapien'):
     """Initialise the record store, optionally pre-loading a Swiss-Prot file.

     sprot_cache: path of a Swiss-Prot flat file; when non-empty, every
         record in it is stored in ``self.records`` under each of its
         accessions.  If the file cannot be opened or read (``IOError``)
         the error is printed and loading is abandoned.
     trembl_cache: accepted but not used by this method.
     organism: stored stripped and lower-cased in ``self.organism``.
     """
     # NOTE(review): the default organism text looks like a masked spelling
     # of "homo sapien"; confirm the intended value against the upstream code.
     self.records = {}
     self.organism = organism.strip().lower()
     if sprot_cache:
         # Load the swissprot records if file can be found
         try:
             with open(sprot_cache) as fp:
                 for record in SwissProt.parse(fp):
                     for accession in record.accessions:
                         self.records[accession] = record
         except IOError as e:
             # ``as`` form is valid on both Python 2.6+ and Python 3.
             print(e)
             print("SwissProt cache not loaded")
Пример #49
0
    def download_entry(self, accession):
        """Fetch, validate, cache and return the Swiss-Prot record for ``accession``.

        The record is requested with ``ExPASy.get_sprot_raw`` and parsed with
        ``SwissProt.read``.  It is accepted only when ``self.organism`` is a
        substring of the record's organism (stripped, lower-cased); accepted
        records are stored in ``self.records[accession]``.

        Raises:
            KeyError: carrying the accession when fetching or parsing fails,
                or carrying an "ortholog ... not found" message (which is
                also printed) when the organism does not match.
        """
        try:
            handle = ExPASy.get_sprot_raw(accession)
            try:
                record = SwissProt.read(handle)
            finally:
                # Release the network handle whether or not parsing worked.
                handle.close()
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # ``SystemExit`` are not converted into a KeyError.
            raise KeyError('{}'.format(accession))

        record_org = record.organism.strip().lower()
        if self.organism not in record_org:
            message = '{} ortholog of {} not found.'.format(self.organism, accession)
            print(message)
            raise KeyError(message)

        self.records[accession] = record
        return record
Пример #50
0
def main():
    """Print the biological-process GO term names of the configured protein.

    The UniProt id is obtained from ``get_uniprot_id_from_file`` applied to
    ``arguments['<input>']``; the entry is fetched with
    ``ExPASy.get_sprot_raw`` and parsed with ``SwissProt.read``.  For every
    cross-reference whose first field is ``'GO'``, the third field is split
    at its first colon and, when the prefix is ``'P'``, the remainder is
    printed.
    """
    #Grab our input id value
    uniprot_id = get_uniprot_id_from_file(arguments['<input>'])
    #Get a handle on the data for the uniprot id
    handle = ExPASy.get_sprot_raw(uniprot_id)
    #Parse our data; the handle is closed even when parsing fails
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    #Process out the stuff of interest, GO values in this case
    for ref in record.cross_references:
        if ref[0] != 'GO':
            continue
        # Split at the first colon only: term names may contain ':' and a
        # plain two-name unpacking of split(':') would raise ValueError.
        pre, _, val = ref[2].partition(':')
        if pre == 'P':
            print(val)
Пример #51
0
def main(fichier):
    """
    Print the biological-process GO terms of the protein named in a file.

    The first line of ``fichier`` (stripped) is used as the identifier
    passed to ``ExPASy.get_sprot_raw``; the entry is parsed with
    ``SwissProt.read``.  For every cross-reference whose first field is
    ``'GO'`` and whose third field starts with ``'P:'``, the text after
    that prefix is collected.  The collected terms are printed one per
    line.
    """
    from Bio import ExPASy
    from Bio import SwissProt

    # Only the first line is needed; close the file straight away.
    with open(fichier, 'r') as f:
        fline = f.readline().strip()

    handle = ExPASy.get_sprot_raw(fline)
    record = SwissProt.read(handle)
    go = []
    for i in record.cross_references:
        if i[0] == 'GO' and i[2].startswith('P:'):
            # Slice off exactly the two-character prefix.  lstrip('P:')
            # would strip a *set* of characters and also remove leading
            # 'P' letters from the term name itself.
            go.append(i[2][2:])
    print('\n'.join(go))
Пример #52
0
def __build_NEXP_accession_singleSpecies(fh_sprot, taxon_id, ontType, EXP_default=set([])):
    '''
    Collect the accession lists of proteins annotated in ontology
    ``ontType`` without any experimental evidence.

    Records are read from ``fh_sprot`` with ``sp.parse`` and only those
    whose ``taxonomy_id`` contains ``taxon_id`` are considered.  Within a
    record, a GO cross-reference belongs to ``ontType`` when the first
    character of its third field equals ``ontType``; its evidence code is
    the part of the fourth field before the first colon.  A record
    qualifies when it has at least one GO cross-reference in ``ontType``
    and none of them carries an evidence code contained in
    ``EXP_default``.

    Returns a list with one element per qualifying record, each element
    being that record's ``accessions`` list.
    '''
    nexp_accessions = []
    print('      Building the accession list with the proteins '
          'that have only non-EXP evidence codes at time t1 ...')
    for rec in sp.parse(fh_sprot):
        # Skip records of other species.
        if taxon_id not in rec.taxonomy_id:
            continue

        annotated_in_ontology = False
        has_exp_evidence = False
        for xref in rec.cross_references:
            # Only GO cross-references are relevant.
            if xref[0] != 'GO':
                continue
            evidence_code = xref[3].split(':')[0]
            aspect = xref[2][0]
            if aspect != ontType:
                continue
            annotated_in_ontology = True
            if evidence_code in EXP_default:
                # One experimental annotation is enough to disqualify
                # the record; no need to look further.
                has_exp_evidence = True
                break

        if annotated_in_ontology and not has_exp_evidence:
            nexp_accessions.append(rec.accessions)
    return nexp_accessions
Пример #53
0
 def obtain_goterms(self, goterm_dict, fh_sprot):
     """Add the GO annotations found in ``fh_sprot`` to ``goterm_dict``.

     Every record yielded by ``sp.parse(fh_sprot)`` is inspected.  For the
     first of its accessions that is a key of ``goterm_dict``, one tuple
     per GO cross-reference is added (via ``.add``) to the value stored
     under that key.  Each tuple is ``(second field, part of the fourth
     field before its first colon, first character of the third field)``.
     Remaining accessions of that record are not examined.  The same
     (mutated) dictionary is returned.
     """
     for record in sp.parse(fh_sprot):
         for accession in record.accessions:
             if accession not in goterm_dict:
                 continue
             for xref in record.cross_references:
                 if xref[0] != 'GO':
                     continue
                 go_entry = (xref[1], xref[3].split(':')[0], xref[2][0])
                 goterm_dict[accession].add(go_entry)
             # Only the first matching accession of a record is updated.
             break
     return goterm_dict
Пример #54
0
def UNIPROT_GENE_PLUS(UNIPROT):
    """Return a LIST of gene names for a UniProt entry, synonyms included.

    Unlike UNIPROT_GENE, this also returns synonym genes, if any; the gene
    name is the first entry.

    The entry text is downloaded from uniprot.org and parsed with
    ``SwissProt.parse``.  For each record, ``gene_name`` is split on ``;``.
    The value after ``=`` in the first field is always appended.  When the
    split yields more than two fields, the value after ``=`` in the second
    field is split on commas and each item is appended with all whitespace
    removed.
    """
    import urllib
    import urllib2
    from Bio import SwissProt

    page = urllib2.urlopen("http://www.uniprot.org/uniprot/%s.txt" % UNIPROT)
    gene_names = []
    for record in SwissProt.parse(page):
        fields = record.gene_name.split(";")
        gene_names.append(fields[0].split("=")[1])
        if len(fields) > 2:
            synonyms = fields[1].split("=")[1].split(",")
            gene_names.extend("".join(synonym.split()) for synonym in synonyms)
    return gene_names
Пример #55
0
def count_genes_with_EXP_old(fh_sprot, taxon_id, EXP_default=set([])):
    """Count records with experimental GO evidence, per ontology.

    Records are read from ``fh_sprot`` with ``sp.parse``; only those whose
    ``taxonomy_id`` contains ``taxon_id`` (e.g. 559292 for yeast) are
    counted.  A GO cross-reference counts as experimental when the part of
    its fourth field before the first colon is in ``EXP_default``; its
    ontology letter is the first character of the third field, compared
    case-insensitively ('P', 'C' or 'F').

    Returns the tuple ``(exp_bpo_ct, exp_cco_ct, exp_mfo_ct)``: the number
    of records having at least one experimental annotation in the 'P',
    'C' and 'F' category respectively.
    """
    all_aspects = {'P', 'C', 'F'}
    exp_bpo_ct = 0
    exp_cco_ct = 0
    exp_mfo_ct = 0

    for rec in sp.parse(fh_sprot):
        # Skip records of other species.
        if taxon_id not in rec.taxonomy_id:
            continue

        # Upper-cased ontology letters seen with experimental evidence.
        seen_aspects = set()
        for xref in rec.cross_references:
            if xref[0] == 'GO':
                evidence_code = xref[3].split(':')[0]
                aspect = xref[2][0]
                if evidence_code in EXP_default:
                    seen_aspects.add(aspect.upper())
            # Nothing more to learn once all three categories are covered.
            if seen_aspects >= all_aspects:
                break

        if 'P' in seen_aspects:
            exp_bpo_ct += 1
        if 'C' in seen_aspects:
            exp_cco_ct += 1
        if 'F' in seen_aspects:
            exp_mfo_ct += 1
    return (exp_bpo_ct, exp_cco_ct, exp_mfo_ct)
def count_GOterms_with_EXP(fh_sprot, taxon_id, EXP_default=set([])):
    '''
    Collect, per protein, the distinct GO terms backed by experimental
    evidence, separately for each ontological category.

    Records are read from ``fh_sprot`` with ``sp.parse``; only those whose
    ``taxonomy_id`` contains ``taxon_id`` (e.g. 559292 for yeast) are used.
    A GO cross-reference is experimental when the part of its fourth field
    before the first colon is in ``EXP_default``.  Its second field is then
    added to the set of the category given by the first character of its
    third field ('F', 'P' or 'C', case-insensitive).

    Returns ``(mfo_terms, bpo_terms, cco_terms)``: three ``OrderedDict``
    objects mapping the first accession of each processed record to its
    (possibly empty) set of terms.
    '''
    # NOTE(review): processing stops after 21 matching records; this looks
    # like a debugging limit -- confirm before relying on complete output.
    mfo_terms = OrderedDict()
    bpo_terms = OrderedDict()
    cco_terms = OrderedDict()
    processed = 0
    for rec in sp.parse(fh_sprot):
        # Skip records of other species.
        if taxon_id not in rec.taxonomy_id:
            continue

        protName = rec.accessions[0]
        # One term set per ontology letter for the current record.
        terms_by_aspect = {'F': set(), 'P': set(), 'C': set()}
        for xref in rec.cross_references:
            if xref[0] != 'GO':
                continue
            go_id = xref[1]
            evidence_code = xref[3].split(':')[0]
            aspect = xref[2][0]
            if evidence_code not in EXP_default:
                continue
            bucket = terms_by_aspect.get(aspect.upper())
            if bucket is not None:
                bucket.add(go_id)

        mfo_terms[protName] = terms_by_aspect['F']
        bpo_terms[protName] = terms_by_aspect['P']
        cco_terms[protName] = terms_by_aspect['C']
        processed += 1
        if processed > 20:
            break
    return (mfo_terms, bpo_terms, cco_terms)
Пример #57
0
def check_sprot_format(fh_sprot):
    """
    Check whether the file behind ``fh_sprot`` parses as
    UniProtKB/Swissprot.

    The check consists of asking ``sp.parse`` for the first record.
    Returns True when that attempt raises no error (this includes input
    that yields no record at all) and False when it raises any
    ``Exception``.
    """
    iter_handle = sp.parse(fh_sprot) # sp.parse method returns a generator
    try:
        # Pulling the first record is enough to trigger the parser.
        for rec in iter_handle:
            break
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # ``SystemExit`` propagate instead of being reported as "bad format".
        return False
    return True
Пример #58
0
def count_genes_with_EXP(fh_sprot, taxon_id, EXP_default=set([])):
    """Count records with experimental GO evidence, per ontology.

    Records are read from ``fh_sprot`` with ``sp.parse``; only those whose
    ``taxonomy_id`` contains ``taxon_id`` (e.g. 559292 for yeast) are
    counted.  A GO cross-reference is experimental when the part of its
    fourth field before the first colon is in ``EXP_default``; the first
    character of its third field (case-insensitive) selects the ontology:
    'F' -> 'MFO', 'P' -> 'BPO', 'C' -> 'CCO'.

    Returns a dict with keys 'MFO', 'BPO' and 'CCO' giving the number of
    records that have at least one experimental annotation in that
    ontology.
    """
    ontology_of_aspect = {'F': 'MFO', 'P': 'BPO', 'C': 'CCO'}
    gene_count = {'MFO': 0, 'BPO': 0, 'CCO': 0}

    for rec in sp.parse(fh_sprot):
        # Skip records of other species.
        if taxon_id not in rec.taxonomy_id:
            continue

        # Whether experimental evidence was seen in each ontology.
        exp_flag = {'MFO': False, 'BPO': False, 'CCO': False}
        for xref in rec.cross_references:
            if xref[0] == 'GO':
                evidence_code = xref[3].split(':')[0]
                aspect = xref[2][0]
                if evidence_code in EXP_default:
                    ontology = ontology_of_aspect.get(aspect.upper())
                    if ontology is not None:
                        exp_flag[ontology] = True
            # Stop scanning once all three ontologies are covered.
            if all(exp_flag.values()):
                break

        for ontology in ('MFO', 'BPO', 'CCO'):
            if exp_flag[ontology]:
                gene_count[ontology] += 1
    return gene_count
Пример #59
0
def read_sprot_dat(sprot_dat_file, seq_dict):
    """Annotate the accessions in ``seq_dict`` from a Swiss-Prot flat file.

    ``sprot_dat_file`` is parsed with ``SwissProt.parse``.  For every
    accession of every record that is already a key of ``seq_dict``, the
    value is replaced by a dict with the keys 'organism', 'EC',
    'gene_name', 'OLN', 'ORF', 'GO', 'KW', 'full_name', 'tax_id' and
    'lineage'.  Progress is reported on stderr after every 10000 matched
    accessions, and the size of ``seq_dict`` is reported at the end.

    Returns the same (mutated) ``seq_dict``.
    """
    num_record = 0
    # The context manager guarantees the (large) data file is closed.
    with open(sprot_dat_file) as handle:
        for record in SwissProt.parse(handle): # Use Bio.SwissProt to parse the uniprot_sprot.dat file
            for seqID in record.accessions:
                if seqID not in seq_dict:
                    continue
                num_record += 1
                if num_record % 10000 == 0:
                    sys.stderr.write("{} records read so far\n".format(num_record))
                # Second field of each GO cross-reference with its first three
                # characters removed (e.g. 'GO:0031012' -> '0031012').
                go_terms = [i[1][3:] for i in record.cross_references if i[0] == 'GO']
                organism = record.organism # organism name
                lineage = record.organism_classification # taxonomic classification
                tax_id = record.taxonomy_id[0] # first taxonomy id of the record
                gene_name, OLN, ORF = parse_GN(record.gene_name) # GN line
                full_name, EC = parse_DE(record.description) # DE line
                # map the accession to its annotation dictionary
                seq_dict[seqID] = {
                    'organism': organism,
                    'EC': EC,
                    'gene_name': gene_name,
                    'OLN': OLN,
                    'ORF': ORF,
                    'GO': go_terms,
                    'KW': record.keywords,
                    'full_name': full_name,
                    'tax_id': tax_id,
                    'lineage': lineage,
                }
    sys.stderr.write("\nnumber of sequences is {}\n".format(len(seq_dict)))
    return seq_dict
def UNIPROT_CHAIN_LIMITS(UNIPROT_ID):
    """Given a uniprot, return the limits of the mature protein numbering.

    The entry text is downloaded from uniprot.org and parsed with
    ``SwissProt.parse``.  For each record, the second and third fields of
    every feature whose first field is ``"CHAIN"`` are collected as
    strings.  When at least one value was collected and all of them are
    purely numeric, the limits are their minimum and maximum; otherwise
    the limits are 1 and the record's ``sequence_length``.

    Returns ``[start, end]`` computed from the last record parsed.
    """
    import urllib2
    from Bio import SwissProt

    page = urllib2.urlopen("http://www.uniprot.org/uniprot/%s.txt" % UNIPROT_ID)

    for record in SwissProt.parse(page):
        chain_values = []
        for feature in record.features:
            if feature[0] == "CHAIN":
                chain_values.extend((str(feature[1]), str(feature[2])))

        if chain_values and all(value.isdigit() for value in chain_values):
            positions = [int(value) for value in chain_values]
            chain_start = min(positions)
            chain_end = max(positions)
        else:
            # No CHAIN feature, or a non-numeric boundary: fall back to
            # the full sequence.
            chain_start = 1
            chain_end = record.sequence_length

    return [chain_start, chain_end]