genes = {} for h in headers: (seq,strand,start,end) = re.search(loc, h).groups() strand = '+' if strand is None else '-' (geneID,) = re.search(ID, h).groups() # Format the line for output, update dict 'genes'. genes[(seq, int(start))] = '\t'.join(( geneID, seq, start, end, strand )) table_header = '\t'.join(( 'geneID', 'seqname', 'start', 'end', 'strand' )) + '\n' # Print the vheader. sys.stdout.write(vheader(*sys.argv)) # Print a header. sys.stdout.write(table_header) # Sort lines by key, ie seqname, start. for key in sorted(genes): sys.stdout.write('%s\n' % genes[key])
def JSONtargets(mappingfile, bindingfile):
    """Create a gene target set in JSON format from a gene mapping
    file and a discrete binding profile.

    Writes a version-tracking header followed by a JSON dictionary to
    stdout. The dictionary maps each feature name to the list of its
    target gene IDs; the special keys 'total' and 'NA' hold,
    respectively, genes whose closest binding element lies within
    MAXDIST of their TSS, and genes whose closest element is farther
    away (the two lists are mutually exclusive).
    """
    # Read in gene mapping. Skip comment lines and remove stray
    # 'chr' sometimes present in chromosome names. Use a context
    # manager so the file handle is closed deterministically.
    with open(mappingfile, 'r') as f:
        mapping = [
            line.rstrip().replace('chr', '').split('\t')
            for line in vskip(f) if line[0] != '#'
        ]

    # Remove the header if present (recognized by 'start' and
    # 'end' in third and fourth columns).
    if mapping[0][2:4] == ['start', 'end']:
        mapping.pop(0)

    # Collect TSS: if the gene is on '+' the TSS is the start
    # coordinate, otherwise it is the end coordinate.
    TSS = {}
    for row in mapping:
        thisTSS = {
            '+': lambda x: (x[1], int(x[2])),  # 2nd and 3rd column.
            '-': lambda x: (x[1], int(x[3])),  # 2nd and 4th column.
        }.get(row[4])(row)
        # Arrange geneIDs by TSS in a dictionary.
        # Example: TSS['FBgn0031208'] = ('2L', 7529)
        TSS[row[0]] = thisTSS

    # Read in binding data. Skip comment lines and remove
    # 'chr' on chromosome names.
    with open(bindingfile, 'r') as f:
        binding = [
            line.rstrip().replace('chr', '').split('\t')
            for line in vskip(f) if line[0] != '#'
        ]

    # Get feature names and remove (pop) the header.
    # Example: features = ['D005', 'D007', ...]
    features = binding.pop(0)[4:]

    # 'total' and 'NA' are mutually exclusive lists of genes.
    targets = {'total': [], 'NA': []}
    for feature in features:
        targets[feature] = []

    # Collect mapping information (seqname, start, end) and
    # binding info (0/1) for every element.
    mapinfo = {}
    bindinfo = {}
    for row in binding:
        # Example: mapinfo['r5GATC2L00037'] = ('2L', 5301, 6026)
        mapinfo[row[0]] = (row[1], int(row[2]), int(row[3]))
        # Example: bindinfo['r5GATC2L00037'] = [0,0,1,...]
        bindinfo[row[0]] = row[4:]

    # Get the closest binding element to each TSS.
    close_elt = get_closest(TSS, mapinfo, dist=dist)
    for geneID in close_elt:
        if dist(TSS[geneID], mapinfo[close_elt[geneID]]) > MAXDIST:
            # The gene is too far. Push it to NA.
            targets['NA'].append(geneID)
        else:
            targets['total'].append(geneID)
            # The gene gets the status of the binding element closest
            # to its TSS.
            # Example pairs: [('D005', '0'), ('D007', '0'), ...]
            for feat, yes in zip(features, bindinfo[close_elt[geneID]]):
                if yes == '1':
                    targets[feat].append(geneID)

    # Print the version tracking header and the JSON data.
    sys.stdout.write(vheader(*sys.argv))
    json.dump(targets, sys.stdout, indent=4)
def collect_integrations(fname_starcode_out, fname_mapped, *args):
    """Read the starcode output and replace every barcode by its
    canonical representative, computing the mapped distance and
    rejecting barcodes with multiple mapping integrations or unmapped
    ones. Also count how often each barcode is found in the mapped
    data, even for non-mapping barcodes.

    'args' is a sequence of (barcode_file, spike_file) pairs; the
    first member of each pair contributes per-file read counts to the
    output table, the second contributes spike rows.

    Writes '<fname_mapped minus .map>_insertions.txt' (skipped if it
    already exists) and appends a mapped/unmapped summary to LOGFNAME.
    """
    # Chromosomes retained in the output table; integrations anywhere
    # else are counted as unmapped.
    KEEP = frozenset([
        '2L', '2LHet', '2R', '2RHet', '3L', '3LHet', '3R', '3RHet',
        '4', 'X', 'XHet', 'U', 'Uextra', 'dmel_mitochondrion_genome',
        'pT2',
    ])

    fname_insertions_table = re.sub(r'\.map', '_insertions.txt',
                                    fname_mapped)
    # Substitution failed, append '_insertions.txt' to avoid name conflict.
    if fname_insertions_table == fname_mapped:
        fname_insertions_table = fname_mapped + '_insertions.txt'
    # Skip if file exists.
    if os.path.exists(fname_insertions_table):
        return

    def dist(intlist):
        # Spread of a list of (chrom, pos, ...) tuples: infinite when
        # the list is empty or spans more than one chromosome.
        # NOTE: sorts 'intlist' in place.
        intlist.sort()
        try:
            if intlist[0][0] != intlist[-1][0]:
                return float('inf')
            return intlist[-1][1] - intlist[0][1]
        except IndexError:
            return float('inf')

    # Map every barcode to the canonical representative of its
    # starcode cluster.
    canonical = dict()
    with open(fname_starcode_out) as f:
        for line in f:
            items = line.split()
            for brcd in items[2].split(','):
                canonical[brcd] = items[0]

    # counts[barcode][position] = number of reads supporting it.
    counts = defaultdict(lambda: defaultdict(int))
    with open(fname_mapped) as f:
        for line in f:
            items = line.split()
            try:
                barcode = canonical[items[0]]
            except KeyError:
                continue
            if items[3] == '-':
                # Unmapped read: placeholder 2-tuple.
                position = ('', 0)
            else:
                pos = items[3].split(':')
                # On the '-' strand the reported coordinate is the
                # start of the read, so shift by the read length.
                loc = int(pos[2]) if pos[1] == '+' else \
                      int(pos[2]) + len(items[1])
                position = (pos[0], loc, pos[1])
            counts[barcode][position] += 1

    integrations = dict()
    for brcd, hist in counts.items():
        total = sum(hist.values())
        # Candidate positions supported by more than 10% of the reads
        # (and by more than one read).
        top = [pos for pos, count in hist.items()
               if count > max(1, 0.1 * total)]
        # Skip barcode in case of disagreement between top votes.
        if dist(top) > 10:
            continue
        ins = max(hist, key=hist.get)
        integrations[brcd] = (ins, total)

    # Count reads from other files.
    reads = dict()
    # First item of each tuple is the barcode file, second the spike's one.
    for (fname, ignore) in args:
        reads[fname] = defaultdict(int)
        with open(fname) as f:
            for line in f:
                items = line.split('\t')
                try:
                    reads[fname][items[0]] = int(items[1])
                except (IndexError, ValueError) as ex:
                    # Chain the root cause for easier debugging.
                    raise FormatException(
                        "Input file with wrong format") from ex

    with open(fname_insertions_table, 'w') as outf:
        outf.write(vheader(*sys.argv))
        unmapped = 0
        mapped = 0
        for brcd in sorted(integrations,
                           key=lambda x: (integrations.get(x), x)):
            try:
                (chrom, pos, strand), total = integrations[brcd]
                if chrom not in KEEP:
                    raise ValueError
            except ValueError:
                # Either the placeholder ('', 0) position (cannot be
                # unpacked into three values) or a discarded chromosome.
                unmapped += 1
                continue
            mapped += 1
            outf.write('%s\t%s\t%s\t%d\t%d' %
                       (brcd, chrom, strand, pos, total))
            for fname, ignore in args:
                outf.write('\t' + str(reads[fname][brcd]))
            outf.write('\n')

        # Now add the spikes if the experiment was spiked, otherwise continue.
        N = len(args)
        for i in range(N):
            (ignore, fname) = args[i]
            with open(fname) as f:
                for line in f:
                    try:
                        items = line.rstrip().split('\t')
                        array = ['0'] * N
                        array[i] = items[1]
                        outf.write('%s\tspike\t*\t0\t0\t' % items[0])
                        outf.write('\t'.join(array) + '\n')
                    except IndexError:
                        # Malformed spike line: skip it (best effort).
                        continue

    with open(LOGFNAME, 'a') as f:
        f.write('%s: mapped:%d, unmapped:%d\n'
                % (fname_mapped, mapped, unmapped))
    return
# NOTE(review): this redefines collect_integrations and shadows the
# .map-based version earlier in the file — confirm the earlier one is
# dead code or rename one of them.
def collect_integrations(fname_starcode_out, fname_mapped, *args):
    """Read the starcode output and replace every barcode by its
    canonical representative, computing the mapped distance and
    rejecting barcodes with multiple mapping integrations or unmapped
    ones. Also count how often each barcode is found in the mapped
    data, even for non-mapping barcodes.

    This variant parses SAM input ('.sam' suffix). 'args' is a
    sequence of (barcode_file, spike_file) pairs; the first member of
    each pair contributes per-file read counts, the second spike rows.

    Writes '<fname_mapped minus .sam>_insertions.txt' (skipped if it
    already exists) and appends a mapped/unmapped summary to LOGFNAME.
    """
    fname_insertions_table = re.sub(r'\.sam', '_insertions.txt',
                                    fname_mapped)
    # Substitution failed, append '_insertions.txt' to avoid name conflict.
    if fname_insertions_table == fname_mapped:
        fname_insertions_table = fname_mapped + '_insertions.txt'
    # Skip if file exists.
    if os.path.exists(fname_insertions_table):
        return

    def dist(intlist):
        # Spread of a list of (chrom, pos, ...) tuples: infinite when
        # the list is empty or spans more than one chromosome.
        # NOTE: sorts 'intlist' in place.
        intlist.sort()
        try:
            if intlist[0][0] != intlist[-1][0]:
                return float('inf')
            return intlist[-1][1] - intlist[0][1]
        except IndexError:
            return float('inf')

    # Map every barcode to the canonical representative of its
    # starcode cluster.
    canonical = dict()
    with open(fname_starcode_out) as f:
        for line in f:
            items = line.split()
            for brcd in items[2].split(','):
                canonical[brcd] = items[0]

    # counts[barcode][position] = number of reads supporting it.
    counts = defaultdict(lambda: defaultdict(int))
    # SAM FLAG bit 0x10: read is reverse-complemented.
    ISREV = 0b10000
    with open(fname_mapped) as f:
        for line in f:
            # Skip SAM header lines.
            if line[0] == '@':
                continue
            items = line.split()
            try:
                barcode = canonical[items[0]]
            except KeyError:
                continue
            if items[2] == '*':
                # Unmapped read: placeholder 2-tuple.
                position = ('', 0)
            else:
                # GTTACATCGGTTAATAGATA 16 2L 9743332 60 9S32M [...]
                strand = '-' if int(items[1]) & ISREV else '+'
                chrom = items[2]
                pos = int(items[3])
                position = (chrom, pos, strand)
            counts[barcode][position] += 1

    integrations = dict()
    for brcd, hist in counts.items():
        total = sum(hist.values())
        # Candidate positions supported by more than 10% of the reads
        # (and by more than one read).
        top = [loc for loc, count in hist.items()
               if count > max(1, 0.1 * total)]
        # Skip barcode in case of disagreement between top votes.
        if dist(top) > 10:
            continue
        ins = max(hist, key=hist.get)
        integrations[brcd] = (ins, total)

    # Count reads from other files.
    reads = dict()
    # First item of each tuple is the barcode file, second the spike's one.
    for (fname, ignore) in args:
        reads[fname] = defaultdict(int)
        with open(fname) as f:
            for line in f:
                items = line.split('\t')
                try:
                    reads[fname][items[0]] = int(items[1])
                except (IndexError, ValueError) as ex:
                    # Chain the root cause for easier debugging.
                    raise FormatException(
                        "Input file with wrong format") from ex

    with open(fname_insertions_table, 'w') as outf:
        outf.write(vheader(*sys.argv))
        unmapped = 0
        mapped = 0
        for brcd in sorted(integrations,
                           key=lambda x: (integrations.get(x), x)):
            try:
                (chrom, pos, strand), total = integrations[brcd]
            except ValueError:
                # BUG FIX: unmapped barcodes carry the placeholder
                # ('', 0) position, which cannot be unpacked into
                # three values. Previously this ValueError escaped and
                # crashed the function (and 'unmapped' was never
                # incremented); count it as unmapped instead, matching
                # the .map-based variant of this function.
                unmapped += 1
                continue
            mapped += 1
            outf.write('%s\t%s\t%s\t%d\t%d' %
                       (brcd, chrom, strand, pos, total))
            for fname, ignore in args:
                outf.write('\t' + str(reads[fname][brcd]))
            outf.write('\n')

        # Now add the spikes if the experiment was spiked, otherwise continue.
        N = len(args)
        for i in range(N):
            (ignore, fname) = args[i]
            with open(fname) as f:
                for line in f:
                    try:
                        items = line.rstrip().split('\t')
                        array = ['0'] * N
                        array[i] = items[1]
                        outf.write('%s\tspike\t*\t0\t0\t' % items[0])
                        outf.write('\t'.join(array) + '\n')
                    except IndexError:
                        # Malformed spike line: skip it (best effort).
                        continue

    with open(LOGFNAME, 'a') as f:
        f.write('%s: mapped:%d, unmapped:%d\n'
                % (fname_mapped, mapped, unmapped))
    return