def random_human_genes(num):
    """Return `num` distinct gene names drawn at random from the human proteome.

    Every record of the hard-coded UniProt FASTA file is read, the gene name
    is taken from the record header with ``utils.parse_UniProtKB_header`` and
    the names are de-duplicated before sampling.

    Args:
        num: how many gene names to draw.

    Returns:
        A list of `num` unique gene names, as produced by ``random.sample``.

    Raises:
        ValueError: raised by ``random.sample`` when `num` is negative or
            larger than the number of distinct gene names in the file.
    """
    # Raw string: the Windows-style path contains "\U" and "\u", which must
    # stay literal backslashes instead of being read as escape sequences.
    # NOTE(review): the "H**o" in the file name looks like a masked "Homo" --
    # confirm the real file name on disk.
    proteome_path = (
        r"data\UniProt full Proteomes\uniprot-proteome-full H**o sapiens (71599 proteins).fasta"
    )

    gene_names = set()
    # The context manager closes the FASTA handle once parsing is done.
    with open(proteome_path) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            # Only the fifth header field (the gene name) is needed here.
            # NOTE(review): records without a gene name appear to yield ''
            # (other code in this file tests geneName == ''), so '' can be
            # sampled as a "gene" -- confirm whether that is intended.
            _, _, _, _, gene_name = utils.parse_UniProtKB_header(
                rec.description)
            gene_names.add(gene_name)

    # random.sample needs a sequence; newer Python versions reject a set.
    return random.sample(list(gene_names), num)
def get_human_proteins_that_contain_saar(SAAR):
    """Return the gene names of human proteins whose sequence contains `SAAR`.

    Every record of the hard-coded UniProt FASTA file is scanned; a record
    matches when `SAAR` occurs anywhere in its sequence (plain substring
    test).

    Args:
        SAAR: the sequence fragment to search for.

    Returns:
        A set with the gene name of every matching record. Gene names come
        from ``utils.parse_UniProtKB_header``.
    """
    # Raw string: the Windows-style path contains "\U" and "\u", which must
    # stay literal backslashes instead of being read as escape sequences.
    # NOTE(review): the "H**o" in the file name looks like a masked "Homo" --
    # confirm the real file name on disk.
    proteome_path = (
        r"data\UniProt full Proteomes\uniprot-proteome-full H**o sapiens (71599 proteins).fasta"
    )

    matching_genes = set()
    # The context manager closes the FASTA handle once parsing is done.
    with open(proteome_path) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            if SAAR not in str(rec.seq):
                continue
            # The header is parsed only for matching records; only the fifth
            # field (the gene name) is needed.
            _, _, _, _, gene_name = utils.parse_UniProtKB_header(
                rec.description)
            matching_genes.add(gene_name)
    return matching_genes
kmer for kmer in self.repeating_non_overlapping_kmers if self.seq.find(kmer) + params.MIN_DIST_BETWEEN_REPETITIONS < self.seq.rfind(kmer) ] all_proteins = list() # all_proteins[i] = PROTEIN()_OBJECT protein_seq = dict() # protein_seq[GENE_NAME] = AMINO_ACID_SEQUENCE print "Reading Uniprot file and generating k-mers list for each protein..." created_protein_names = set( ) # prevent creation of two similar protein objects for rec in SeqIO.parse(open(params.HUMAN_PROTEOME), 'fasta'): seq = str(rec.seq) uniqueIdentifier, entryName, proteinName, organismName, geneName = \ utils.parse_UniProtKB_header(rec.description) if geneName == '': print 'Ignoring unknown gene: %s' % rec.description continue if geneName in created_protein_names: print 'Ignoring duplicate gene: %s' % rec.description continue # create a new Protein object created_protein_names.add(geneName) protein_seq[geneName] = seq all_proteins.append(Protein(geneName, seq, params.K)) print print "Counted k-mer (k=%d) for %d different genes (proteins)." % ( params.K, len(all_proteins))
def main(proteome_file, similar_diluted):
    """Count in how many proteins each k-mer occurs and write a CSV report.

    Steps:
      1. Read `proteome_file` (FASTA) and build one ``Protein`` per distinct
         gene name; records without a gene name are keyed by their unique
         identifier, and later records with an already-seen name are counted
         and skipped.
      2. For every k-mer, collect the set of gene names whose protein
         contains it. When `similar_diluted` is true, a protein is left out
         of a k-mer's set if ``utils.proteins_are_dissimilar`` returns false
         against any protein already in that set.
      3. Write the k-mers, most widespread first, to a timestamped CSV file
         placed next to `proteome_file`; k-mers found in fewer than 5
         proteins are omitted.

    Args:
        proteome_file: path of the UniProt proteome FASTA file.
        similar_diluted: truthy to apply the similarity dilution of step 2.

    Returns:
        None. The result is the CSV file plus progress text on stdout.
    """
    import csv
    import datetime

    # Rows are written only for k-mers present in at least this many proteins.
    min_proteins_per_kmer = 5

    # Progress messages use single-argument print(...) calls, which produce
    # the same output as a Python 2 print statement and as the Python 3
    # function.
    all_proteins = []          # Protein objects, one per distinct gene name
    protein_seq = {}           # gene name -> amino-acid sequence
    created_protein_names = set()  # prevents two Protein objects per gene
    duplicate_genes_ignored = 0

    print("Reading Uniprot file and generating k-mers list for each protein...")
    # The context manager closes the FASTA handle once parsing is done.
    with open(proteome_file) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            seq = str(rec.seq)
            uniqueIdentifier, _, _, _, geneName = \
                utils.parse_UniProtKB_header(rec.description)
            if geneName == '':
                # Uncharacterized gene: fall back to the unique identifier.
                geneName = uniqueIdentifier
            if geneName in created_protein_names:
                duplicate_genes_ignored += 1
                continue
            created_protein_names.add(geneName)
            protein_seq[geneName] = seq
            all_proteins.append(Protein(geneName, seq, params.K))

    print("")
    print("Ignored %d duplicate genes." % duplicate_genes_ignored)
    print("Counted k-mer (k=%d) for %d different genes (proteins)." % (
        params.K, len(all_proteins)))

    pb = Progressbar('Generating frequency dictionary for k-mers')
    kmers_frequency = {}       # k-mer -> set of gene names containing it
    protein_count = len(all_proteins)
    for position, prot in enumerate(all_proteins, 1):
        pb.update_progress(position, protein_count)
        for kmer in prot.kmers:
            carriers = kmers_frequency.setdefault(kmer, set())
            # With dilution on, skip this protein when it is not dissimilar
            # enough from some protein already recorded for this k-mer.
            # any() stops at the first such protein.
            if similar_diluted and any(
                    not utils.proteins_are_dissimilar(
                        other_name, prot.geneName,
                        protein_seq[other_name], prot.seq)
                    for other_name in carriers):
                continue
            carriers.add(prot.geneName)

    print("Sorting frequent k-mers by frequency...")
    most_frequent_kmers = sorted(
        kmers_frequency,
        key=lambda k: len(kmers_frequency[k]),
        reverse=True)

    print("Writing results to file...")
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')
    filename_without_extension = os.path.splitext(
        os.path.basename(proteome_file))[0]
    dilution_status = (
        'with dilution' if similar_diluted else 'without dilution')
    outfile = "{0} - frequent k{1}-mers - {2} - {3}.csv".format(
        filename_without_extension, params.K, dilution_status, timestamp)
    outfile = os.path.join(os.path.dirname(proteome_file), outfile)

    # Binary mode is what the Python 2 csv module expects for its output file.
    with open(outfile, "wb") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow([
            'k-mer', 'number of proteins', 'percentage', 'all',
            '(Out of %d proteins in total)' % protein_count
        ])
        for kmer in most_frequent_kmers:
            total_proteins = len(kmers_frequency[kmer])
            if total_proteins < min_proteins_per_kmer:
                # Sorted by descending count, so no later k-mer qualifies.
                break
            # Stored as a fraction of all proteins, rounded to 6 decimals.
            percentage = round(float(total_proteins) / protein_count, 6)
            # The gene list goes into a single cell as the string form of a
            # Python list.
            geneList = list(kmers_frequency[kmer])
            writer.writerow([kmer, total_proteins, percentage, geneList])
def main(proteome_file, output_dir):
    """Count in how many proteins each k-mer occurs and write a CSV report.

    Steps:
      1. Read `proteome_file` (FASTA) and build one ``Protein`` per distinct
         gene name; records without a gene name are keyed by their unique
         identifier, and later records with an already-seen name are reported
         on stdout and skipped.
      2. For every k-mer, collect the set of gene names whose protein
         contains it.
      3. Write the k-mers, most widespread first, to a timestamped CSV file
         inside `output_dir`; k-mers found in fewer than 10 proteins are
         omitted.

    Args:
        proteome_file: path of the UniProt proteome FASTA file.
        output_dir: directory that receives the CSV file.

    Returns:
        None. The result is the CSV file plus progress text on stdout.
    """
    import csv
    import datetime

    # Rows are written only for k-mers present in at least this many proteins.
    min_proteins_per_kmer = 10

    # Progress messages use single-argument print(...) calls, which produce
    # the same output as a Python 2 print statement and as the Python 3
    # function.
    all_proteins = []          # Protein objects, one per distinct gene name
    protein_seq = {}           # gene name -> amino-acid sequence
    created_protein_names = set()  # prevents two Protein objects per gene

    print("Reading Uniprot file and generating k-mers list for each protein...")
    # The context manager closes the FASTA handle once parsing is done.
    with open(proteome_file) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            seq = str(rec.seq)
            uniqueIdentifier, _, _, _, geneName = \
                utils.parse_UniProtKB_header(rec.description)
            if geneName == '':
                # Uncharacterized gene: fall back to the unique identifier.
                geneName = uniqueIdentifier
            if geneName in created_protein_names:
                print('Ignoring duplicate gene: %s' % rec.description)
                continue
            created_protein_names.add(geneName)
            protein_seq[geneName] = seq
            all_proteins.append(Protein(geneName, seq, params.K))

    print("")
    print("Counted k-mer (k=%d) for %d different genes (proteins)." % (
        params.K, len(all_proteins)))

    pb = Progressbar('Generating frequency dictionary for k-mers')
    # Optional filter, currently disabled: skip proteins whose gene name
    # starts with 'ZNF', 'ZF', 'OR', 'HOX' or 'IGKV', adding one to
    # skipped_prots for each. With the filter off, skipped_prots stays 0.
    skipped_prots = 0
    kmers_frequency = {}       # k-mer -> set of gene names containing it
    protein_count = len(all_proteins)
    for position, prot in enumerate(all_proteins, 1):
        pb.update_progress(position, protein_count)
        for kmer in prot.kmers:
            kmers_frequency.setdefault(kmer, set()).add(prot.geneName)

    print("Sorting frequent k-mers by frequency...")
    most_frequent_kmers = sorted(
        kmers_frequency,
        key=lambda k: len(kmers_frequency[k]),
        reverse=True)

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')
    outfile = '{}/frequent k{}-mers - {}.csv'.format(
        output_dir, params.K, timestamp)
    # Proteins that took part in the count; denominator of the fraction.
    counted_proteins = protein_count - skipped_prots

    # Binary mode is what the Python 2 csv module expects for its output file.
    with open(outfile, "wb") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow([
            'k-mer', 'number of proteins', 'percentage of total', 'all',
            '(Out of %d proteins in total)' % counted_proteins
        ])
        for kmer in most_frequent_kmers:
            total_proteins = len(kmers_frequency[kmer])
            if total_proteins < min_proteins_per_kmer:
                # Sorted by descending count, so no later k-mer qualifies.
                break
            # Stored as a fraction of the counted proteins, 4 decimals.
            percentage = "{0:.4f}".format(
                float(total_proteins) / counted_proteins)
            # The gene list goes into a single cell as the string form of a
            # Python list.
            geneList = list(kmers_frequency[kmer])
            writer.writerow([kmer, total_proteins, percentage, geneList])