def fragendPairs(pairIn, fasta, resite, maxDistance, fragendOut):
    ''' Identify and report upstream fragends for HiC read pairs.

    Function takes 5 arguments:
    1)  pairIn - Name of input read-pair file (optionally gzipped).
    2)  fasta - Genome fasta file.
    3)  resite - Restriction enzyme recognition sequence.
    4)  maxDistance - Maximum acceptable distance between start of
        read and RE site.
    5)  fragendOut - Name of output gzipped file containing fragend
        ligations.

    Returns a defaultdict of metrics: 'total', 'none', 'distant',
    'interchromosomal', 'intrachromosomal' counts, plus 'fragDist'
    (fragend distances) and 'ligDist' (intrachromosomal ligation
    distances) lists.
    '''
    # Create fragend dictionary and metrics dictionary
    fragDict = findFragendSites(fasta, resite)
    fragendCounts = collections.defaultdict(int)
    fragendCounts['fragDist'] = []
    fragendCounts['ligDist'] = []
    # Open input and output file
    if pairIn.endswith('.gz'):
        inFile = gzip.open(pairIn, 'r')
    else:
        inFile = open(pairIn, 'r')
    outFile = writeFile.writeFileProcess(fragendOut)
    # try/finally ensures files/processes are closed even if a pair
    # is malformed and an exception escapes the loop.
    try:
        for pair in inFile:
            pair = pair.strip().split('\t')
            # Count entries
            fragendCounts['total'] += 1
            # Create output containing fragend data for both reads
            output = downstream([pair[0:4], pair[4:8]], fragDict)
            # Skip reads without identified fragends.
            # 'is None' (identity) is the correct singleton comparison.
            if output[0] is None or output[1] is None:
                fragendCounts['none'] += 1
                continue
            # Add fragend distance data for pairs with fragends
            fragendCounts['fragDist'].extend([output[0][3], output[1][3]])
            # Count and skip reads too distant from the fragend
            if output[0][3] > maxDistance or output[1][3] > maxDistance:
                fragendCounts['distant'] += 1
                continue
            # Save to file accepted ligation pairs
            outData = '\t'.join(map(str, output[0][0:3] + output[1][0:3]))
            outFile.add(outData + '\n')
            # Count interchromosomal ligations
            if output[0][0] != output[1][0]:
                fragendCounts['interchromosomal'] += 1
            # Count intrachromosomal ligations and store distance
            else:
                fragendCounts['intrachromosomal'] += 1
                fragendCounts['ligDist'].append(
                    abs(output[0][1] - output[1][1]))
    finally:
        # Close files and return data
        inFile.close()
        outFile.close()
    return fragendCounts
# NOTE(review): this re-defines fragendPairs, which also appears earlier in
# this file; the later definition wins at import time. Confirm whether the
# duplicate is intentional and remove one copy if not.
def fragendPairs(pairIn, fasta, resite, maxDistance, fragendOut):
    ''' Identify and report upstream fragends for HiC read pairs.

    Function takes 5 arguments:
    1)  pairIn - Name of input read-pair file (optionally gzipped).
    2)  fasta - Genome fasta file.
    3)  resite - Restriction enzyme recognition sequence.
    4)  maxDistance - Maximum acceptable distance between start of
        read and RE site.
    5)  fragendOut - Name of output gzipped file containing fragend
        ligations.

    Returns a defaultdict of metrics: 'total', 'none', 'distant',
    'interchromosomal', 'intrachromosomal' counts, plus 'fragDist'
    (fragend distances) and 'ligDist' (intrachromosomal ligation
    distances) lists.
    '''
    # Create fragend dictionary and metrics dictionary
    fragDict = findFragendSites(fasta, resite)
    fragendCounts = collections.defaultdict(int)
    fragendCounts['fragDist'] = []
    fragendCounts['ligDist'] = []
    # Open input and output file
    if pairIn.endswith('.gz'):
        inFile = gzip.open(pairIn, 'r')
    else:
        inFile = open(pairIn, 'r')
    outFile = writeFile.writeFileProcess(fragendOut)
    # try/finally ensures files/processes are closed even on error.
    try:
        for pair in inFile:
            pair = pair.strip().split('\t')
            # Count entries
            fragendCounts['total'] += 1
            # Create output containing fragend data for both reads
            output = downstream([pair[0:4], pair[4:8]], fragDict)
            # Skip reads without identified fragends; compare to the
            # None singleton with 'is', not '=='.
            if output[0] is None or output[1] is None:
                fragendCounts['none'] += 1
                continue
            # Add fragend distance data for pairs with fragends
            fragendCounts['fragDist'].extend([output[0][3], output[1][3]])
            # Count and skip reads too distant from the fragend
            if output[0][3] > maxDistance or output[1][3] > maxDistance:
                fragendCounts['distant'] += 1
                continue
            # Save to file accepted ligation pairs
            outData = '\t'.join(map(str, output[0][0:3] + output[1][0:3]))
            outFile.add(outData + '\n')
            # Count interchromosomal ligations
            if output[0][0] != output[1][0]:
                fragendCounts['interchromosomal'] += 1
            # Count intrachromosomal ligations and store distance
            else:
                fragendCounts['intrachromosomal'] += 1
                fragendCounts['ligDist'].append(
                    abs(output[0][1] - output[1][1]))
    finally:
        # Close files and return data
        inFile.close()
        outFile.close()
    return fragendCounts
def mergeLabelPair(fastqIn1, fastqIn2, fastqOut, label1=':1', label2=':2'):
    ''' Interleave two paired FASTQ inputs into a single FASTQ output.

    The supplied labels are appended to the first whitespace-delimited
    token of each read header. Function takes 5 arguments:
    1)  fastqIn1 - Read one FASTQ file(s); a string or list of strings.
    2)  fastqIn2 - Read two FASTQ file(s); a string or list of strings.
    3)  fastqOut - Output FASTQ file.
    4)  label1 - Label appended to read1 headers.
    5)  label2 - Label appended to read2 headers.

    Raises IOError when a pair of reads has mismatched identifiers.
    '''
    # Open output process then the two input processes
    outProc = writeFile.writeFileProcess(fastqOut)
    inProc1 = fastqExtract.readFastqProcess(fastqIn1)
    inProc2 = fastqExtract.readFastqProcess(fastqIn2)
    # Walk both inputs in lockstep, labelling and writing each pair
    for rd1, rd2 in itertools.izip(inProc1, inProc2):
        # Split off header line and isolate the read identifier token
        head1, body1 = rd1.split('\n', 1)
        head1 = head1.split(' ', 1)
        head2, body2 = rd2.split('\n', 1)
        head2 = head2.split(' ', 1)
        # Guard clause: identifiers must match between the two files
        if head1[0] != head2[0]:
            raise IOError('Input FASTQ files contain unmatched reads')
        head1[0] += label1
        head2[0] += label2
        outProc.add('%s\n%s\n%s\n%s\n' % (
            ' '.join(head1), body1, ' '.join(head2), body2))
    # Close pipes and processes
    inProc1.close()
    inProc2.close()
    outProc.close()
def processPairs(pipe, pairOut, rmDup, rmConcord, maxSize):
    ''' Receive read pairs from a pipe, filter them, and write to file.

    Function takes five arguments:
    1)  pipe - Connection object; read pairs are received from it until
        a None sentinel arrives.
    2)  pairOut - Name of the output file.
    3)  rmDup - Boolean indicating whether to remove duplicates from
        the output.
    4)  rmConcord - Boolean indicating whether to remove concordant
        pairs from the output.
    5)  maxSize - Maximum insert size used by concordant() to classify
        pairs.

    Sends a defaultdict of counts ('total', 'duplicate', 'unique',
    'concord', 'concorduni', 'discord', 'discorduni') back down the
    pipe before closing it.
    '''
    # Create counter and pair set
    pairCount = collections.defaultdict(int)
    pairSet = set()
    # Open output file process
    outObject = writeFile.writeFileProcess(fileName=pairOut)
    # Loop through pairs
    while True:
        # Get pair from pipe; None is the end-of-stream sentinel.
        # Compare with 'is' since None is a singleton.
        pair = pipe.recv()
        if pair is None:
            break
        # Count and check for duplicate pairs
        pairCount['total'] += 1
        if pair in pairSet:
            dup = True
            pairCount['duplicate'] += 1
        else:
            dup = False
            pairCount['unique'] += 1
            pairSet.add(pair)
        # Count and check for concordant pairs
        concord = concordant(pair, maxSize)
        if concord:
            pairCount['concord'] += 1
            if not dup:
                pairCount['concorduni'] += 1
        else:
            pairCount['discord'] += 1
            if not dup:
                pairCount['discorduni'] += 1
        # Process output, skipping filtered pairs
        if dup and rmDup:
            continue
        elif concord and rmConcord:
            continue
        else:
            outData = '\t'.join(map(str, pair)) + '\n'
            outObject.add(outData)
    # Close file, return data and close pipe
    outObject.close()
    pipe.send(pairCount)
    pipe.close()
def mergeLabelTrimPair(fastqIn1, fastqIn2, trimSeq, fastqOut, minLength=20,
                       label1=':1', label2=':2'):
    ''' Merge two paired FASTQ files into one, trimming at a sequence.

    Each read is truncated just after the first occurrence of trimSeq;
    any pair in which a trimmed read would be shorter than minLength is
    discarded. The supplied labels are appended to the read identifiers.
    Function takes seven arguments:
    1)  fastqIn1 - Read one FASTQ file(s); a string or list of strings.
    2)  fastqIn2 - Read two FASTQ file(s); a string or list of strings.
    3)  trimSeq - Sequence at which to terminate reads.
    4)  fastqOut - Output FASTQ file.
    5)  minLength - Minimum length of reads to be output.
    6)  label1 - Label to add to read1 headers.
    7)  label2 - Label to add to read2 headers.

    Returns a dictionary with the elements:
    1)  total - Total number of read pairs.
    2)  short - Number of pairs with at least one read too short.
    3)  trim1 - Number of acceptable pairs with read1 trimmed.
    4)  trim2 - Number of acceptable pairs with read2 trimmed.
    '''
    # Create output dictionary and key variables
    metrics = {'total': 0, 'short': 0, 'trim1': 0, 'trim2': 0}
    trimLen = len(trimSeq)

    def _cutPoint(seq):
        # End position of the first trim-sequence occurrence, or None
        # when the trim sequence is absent from the read.
        loc = seq.find(trimSeq)
        return None if loc == -1 else loc + trimLen

    # Open input and output processes
    inProc1 = fastqExtract.readFastqProcess(fastqIn1)
    inProc2 = fastqExtract.readFastqProcess(fastqIn2)
    outProc = writeFile.writeFileProcess(fastqOut)
    # Process reads in lockstep
    for raw1, raw2 in itertools.izip(inProc1, inProc2):
        metrics['total'] += 1
        # Split each record into its four FASTQ lines
        rec1 = raw1.split('\n')
        rec2 = raw2.split('\n')
        cut1 = _cutPoint(rec1[1])
        cut2 = _cutPoint(rec2[1])
        # Discard the pair when either trimmed read is too short
        if cut1 and cut1 < minLength:
            metrics['short'] += 1
            continue
        if cut2 and cut2 < minLength:
            metrics['short'] += 1
            continue
        # Trim sequence and quality strings where required
        if cut1 and cut1 < len(rec1[1]):
            rec1[1] = rec1[1][:cut1]
            rec1[3] = rec1[3][:cut1]
            metrics['trim1'] += 1
        if cut2 and cut2 < len(rec2[1]):
            rec2[1] = rec2[1][:cut2]
            rec2[3] = rec2[3][:cut2]
            metrics['trim2'] += 1
        # Append labels to the read identifier token of each header
        if label1:
            head = rec1[0].split(' ', 1)
            head[0] += label1
            rec1[0] = ' '.join(head)
        if label2:
            head = rec2[0].split(' ', 1)
            head[0] += label2
            rec2[0] = ' '.join(head)
        # Save the labelled, trimmed pair to file
        outProc.add('%s\n%s\n' % ('\n'.join(rec1), '\n'.join(rec2)))
    # Close input and output objects
    inProc1.close()
    inProc2.close()
    outProc.close()
    # Return metrics
    return metrics
# NOTE(review): this re-defines processPairs, which also appears earlier in
# this file; the later definition wins at import time. Confirm whether the
# duplicate is intentional and remove one copy if not.
def processPairs(pipe, pairOut, rmDup, rmConcord, maxSize):
    ''' Receive read pairs from a pipe, filter them, and write to file.

    Function takes five arguments:
    1)  pipe - Connection object; read pairs are received from it until
        a None sentinel arrives.
    2)  pairOut - Name of the output file.
    3)  rmDup - Boolean indicating whether to remove duplicates from
        the output.
    4)  rmConcord - Boolean indicating whether to remove concordant
        pairs from the output.
    5)  maxSize - Maximum insert size used by concordant() to classify
        pairs.

    Sends a defaultdict of counts ('total', 'duplicate', 'unique',
    'concord', 'concorduni', 'discord', 'discorduni') back down the
    pipe before closing it.
    '''
    # Create counter and pair set
    pairCount = collections.defaultdict(int)
    pairSet = set()
    # Open output file process
    outObject = writeFile.writeFileProcess(fileName=pairOut)
    # Loop through pairs
    while True:
        # Get pair from pipe; None is the end-of-stream sentinel.
        # Compare with 'is' since None is a singleton.
        pair = pipe.recv()
        if pair is None:
            break
        # Count and check for duplicate pairs
        pairCount['total'] += 1
        if pair in pairSet:
            dup = True
            pairCount['duplicate'] += 1
        else:
            dup = False
            pairCount['unique'] += 1
            pairSet.add(pair)
        # Count and check for concordant pairs
        concord = concordant(pair, maxSize)
        if concord:
            pairCount['concord'] += 1
            if not dup:
                pairCount['concorduni'] += 1
        else:
            pairCount['discord'] += 1
            if not dup:
                pairCount['discorduni'] += 1
        # Process output, skipping filtered pairs
        if dup and rmDup:
            continue
        elif concord and rmConcord:
            continue
        else:
            outData = '\t'.join(map(str, pair)) + '\n'
            outObject.add(outData)
    # Close file, return data and close pipe
    outObject.close()
    pipe.send(pairCount)
    pipe.close()