# Read source data into a list
bioMartList = srcFile.readlines()

# Remove the trailing '\n' from every record
bioMartList = [rec.rstrip('\n') for rec in bioMartList]

# Group records by gene name and split each record's fields into a list
groupedList = JamesDefs.groupByField(bioMartList, 0)

# Combine exon records into a single gene-line record with start and stop coords for the coding region.
# The TranscriptID field will be removed, and fields representing the number of exons encountered and
# the chromosomal coverage will be appended, respectively, to the end of each record.
oneLineRecordList = combineExons(groupedList, bdryLen)

# Write oneLineRecordList out to outFile
boundaryFile.writelines(oneLineRecordList)
boundaryFile.close()

print 'Tada!'
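# JamesDefs.groupByField is not shown in these scripts. The following is a minimal,
# illustrative sketch of what it appears to do, inferred from how it is called here
# (split each tab-delimited record into its fields and collect records sharing the
# same value at a given field index into one group); it is NOT the actual JamesDefs code.
def groupByField_sketch(records, fieldIndex):
    groups = {}
    order = []
    for rec in records:
        fields = rec.split('\t')
        key = fields[fieldIndex]
        if key not in groups:
            groups[key] = []
            order.append(key)
        groups[key].append(fields)
    # Return groups in first-seen order; each group is a list of field lists,
    # which matches later indexing such as cluster[0][0] and rec[1].
    return [groups[key] for key in order]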
t1 = time()

# Populate a dict with Seq objects for the Anopheles boundary seqs.
# What follows directly is a kludge to get the seqDict values to carry the IUPAC ambiguous alphabet.
boundarySeqs = list(SeqIO.parse(open(boundarySeqs, "rU"), "fasta"))
for record in boundarySeqs:
    record.seq.alphabet = IUPACAmbiguousDNA()
boundarySeqs = SeqIO.to_dict(boundarySeqs, key_function=lambda rec: rec.description.split()[0])

# Convert IUPAC motifs to regexes, creating a list of lists with each motif represented as ['IUPAC', 'REGEX']
convertMotifList(motifList)

# Group ClusterDefs by ClusterName
clusterDefinitionList = JamesDefs.groupByField(clusterDefinitionList, 0)

# This will become a list of tab-delimited params for the hyperGeo func:
# 'Motif:ClusterID'; 'motifCountInAll'; 'len(all)'; 'motifCountInCluster'; 'numOfSeqsInCluster'
hyperGeoParams_4_motifClusterPairs = []

m = 0
for motif in motifList:
    m += 1
    print 'Motif ' + str(m)

    # Count how many seqs in the total list have the motif in either orientation
    motifCountInAll = countMotifInAll(motif[1], boundarySeqs)

    for cluster in clusterDefinitionList:
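# convertMotifList is defined elsewhere. As an illustrative sketch (not the actual
# implementation), converting an IUPAC nucleotide motif to a regex typically means
# mapping each ambiguity code to the character class of bases it stands for:
IUPAC_TO_REGEX = {
    'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T',
    'R': '[AG]', 'Y': '[CT]', 'S': '[GC]', 'W': '[AT]',
    'K': '[GT]', 'M': '[AC]', 'B': '[CGT]', 'D': '[AGT]',
    'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]',
}

def iupacToRegex_sketch(motif):
    # e.g. 'TGASTCA' -> 'TGA[GC]TCA'
    return ''.join(IUPAC_TO_REGEX[base] for base in motif.upper())

# Under that assumption, convertMotifList(motifList) would rewrite each entry in place
# as ['IUPAC', 'REGEX'], e.g. motifList[i] = [motif, iupacToRegex_sketch(motif)].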
# Open a write handle for outFile
resFile = open(outFile, 'w')

tick = time.clock()

# Read the conflict file into a list
conflictList = conflictFile.readlines()

# Remove the trailing '\n' from every record
conflictList = [rec.rstrip('\n') for rec in conflictList]

# Group the file by target gene ID using groupByField
fjoinOutByGeneIDList = JamesDefs.groupByField(conflictList, 1)

# Field indices and options passed to the resolver
resolverArgs = {'strandField'        : 4,
                'lowerBoundProximal' : 10,
                'higherBoundProximal': 11,
                'conflictRegionStrt' : 18,
                'conflictRegionEnd'  : 19,
                'whichBoundary'      : 'upStream'}

resolvedBoundariesList = resolver(fjoinOutByGeneIDList, resolverArgs)

resFile.writelines(resolvedBoundariesList)

tock = time.clock()
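# resolver is defined elsewhere in this script. The skeleton below only illustrates how
# a function with this signature might walk the grouped fjoin output and pull fields via
# the indices in resolverArgs; the actual conflict-resolution rules are not reproduced here.
def resolver_sketch(geneGroups, args):
    resolved = []
    for group in geneGroups:
        for rec in group:
            # Field lookups driven by the resolverArgs index map
            strand    = rec[args['strandField']]
            confStart = int(rec[args['conflictRegionStrt']])
            confEnd   = int(rec[args['conflictRegionEnd']])
            # ... the real upStream/downStream boundary-adjustment logic goes here ...
        resolved.append('\t'.join(group[0]) + '\n')
    return resolved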
#========================= User Defined Variables =========================
# Path to the original fasta file
originalFastaDict = open('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas', 'rU')

desiredFastaList = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Clusters.txt'

outDir = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Fastas/'
#==========================================================================

desiredFastaList = [line.strip() for line in open(desiredFastaList, 'rU').readlines()]

# Parse the clusterDefs into a list of clusters
listOfClusterDefs = JamesDefs.groupByField(desiredFastaList, 0)

# Load the fasta records into a BioPython dict, using the geneID field of the descriptor as the key to the seq objects
originalFastaDict = SeqIO.to_dict(SeqIO.parse(originalFastaDict, 'fasta'), key_function=lambda rec: rec.description.split()[0])

for cluster in listOfClusterDefs:
    print "Working on Cluster: %s" % (cluster[0][0])

    # New list to collect the copied seq objects
    desiredFastaObjList = []

    for rec in cluster:
        if rec[1] in originalFastaDict: