def fragendPairs(pairIn, fasta, resite, maxDistance, fragendOut):
    ''' Identify and report upstream fragends for HiC read pairs.

    Function takes 5 arguments:
    1)  pairIn - Name of input read-pair file (optionally gzipped).
    2)  fasta - Genome fasta file.
    3)  resite - Restriction enzyme recognition sequence.
    4)  maxDistance - Maximum acceptable distance between start of
        read and RE site.
    5)  fragendOut - Name of output gzipped file containing fragend
        ligations.

    Returns a defaultdict of metrics: 'total', 'none', 'distant',
    'interchromosomal', 'intrachromosomal' counts, plus 'fragDist'
    (fragend distances) and 'ligDist' (intrachromosomal ligation
    distances) lists.
    '''
    # Create fragend dictionary and metrics dictionary
    fragDict = findFragendSites(fasta, resite)
    fragendCounts = collections.defaultdict(int)
    fragendCounts['fragDist'] = []
    fragendCounts['ligDist'] = []
    # Open input and output file
    if pairIn.endswith('.gz'):
        inFile = gzip.open(pairIn, 'r')
    else:
        inFile = open(pairIn, 'r')
    outFile = writeFile.writeFileProcess(fragendOut)
    # try/finally ensures files/processes are closed even if a pair
    # is malformed and an exception escapes the loop.
    try:
        for pair in inFile:
            pair = pair.strip().split('\t')
            # Count entries
            fragendCounts['total'] += 1
            # Create output containing fragend data for both reads
            output = downstream([pair[0:4], pair[4:8]], fragDict)
            # Skip reads without identified fragends.
            # 'is None' (identity) is the correct singleton comparison.
            if output[0] is None or output[1] is None:
                fragendCounts['none'] += 1
                continue
            # Add fragend distance data for pairs with fragends
            fragendCounts['fragDist'].extend([output[0][3], output[1][3]])
            # Count and skip reads too distant from the fragend
            if output[0][3] > maxDistance or output[1][3] > maxDistance:
                fragendCounts['distant'] += 1
                continue
            # Save to file accepted ligation pairs
            outData = '\t'.join(map(str, output[0][0:3] + output[1][0:3]))
            outFile.add(outData + '\n')
            # Count interchromosomal ligations
            if output[0][0] != output[1][0]:
                fragendCounts['interchromosomal'] += 1
            # Count intrachromosomal ligations and store distance
            else:
                fragendCounts['intrachromosomal'] += 1
                fragendCounts['ligDist'].append(
                    abs(output[0][1] - output[1][1]))
    finally:
        # Close files and return data
        inFile.close()
        outFile.close()
    return fragendCounts
# NOTE(review): this re-defines fragendPairs, which also appears earlier in
# this file; the later definition wins at import time. Confirm whether the
# duplicate is intentional and remove one copy if not.
def fragendPairs(pairIn, fasta, resite, maxDistance, fragendOut):
    ''' Identify and report upstream fragends for HiC read pairs.

    Function takes 5 arguments:
    1)  pairIn - Name of input read-pair file (optionally gzipped).
    2)  fasta - Genome fasta file.
    3)  resite - Restriction enzyme recognition sequence.
    4)  maxDistance - Maximum acceptable distance between start of
        read and RE site.
    5)  fragendOut - Name of output gzipped file containing fragend
        ligations.

    Returns a defaultdict of metrics: 'total', 'none', 'distant',
    'interchromosomal', 'intrachromosomal' counts, plus 'fragDist'
    (fragend distances) and 'ligDist' (intrachromosomal ligation
    distances) lists.
    '''
    # Create fragend dictionary and metrics dictionary
    fragDict = findFragendSites(fasta, resite)
    fragendCounts = collections.defaultdict(int)
    fragendCounts['fragDist'] = []
    fragendCounts['ligDist'] = []
    # Open input and output file
    if pairIn.endswith('.gz'):
        inFile = gzip.open(pairIn, 'r')
    else:
        inFile = open(pairIn, 'r')
    outFile = writeFile.writeFileProcess(fragendOut)
    # try/finally ensures files/processes are closed even on error.
    try:
        for pair in inFile:
            pair = pair.strip().split('\t')
            # Count entries
            fragendCounts['total'] += 1
            # Create output containing fragend data for both reads
            output = downstream([pair[0:4], pair[4:8]], fragDict)
            # Skip reads without identified fragends; compare to the
            # None singleton with 'is', not '=='.
            if output[0] is None or output[1] is None:
                fragendCounts['none'] += 1
                continue
            # Add fragend distance data for pairs with fragends
            fragendCounts['fragDist'].extend([output[0][3], output[1][3]])
            # Count and skip reads too distant from the fragend
            if output[0][3] > maxDistance or output[1][3] > maxDistance:
                fragendCounts['distant'] += 1
                continue
            # Save to file accepted ligation pairs
            outData = '\t'.join(map(str, output[0][0:3] + output[1][0:3]))
            outFile.add(outData + '\n')
            # Count interchromosomal ligations
            if output[0][0] != output[1][0]:
                fragendCounts['interchromosomal'] += 1
            # Count intrachromosomal ligations and store distance
            else:
                fragendCounts['intrachromosomal'] += 1
                fragendCounts['ligDist'].append(
                    abs(output[0][1] - output[1][1]))
    finally:
        # Close files and return data
        inFile.close()
        outFile.close()
    return fragendCounts
def mergeLabelPair(fastqIn1, fastqIn2, fastqOut, label1=':1', label2=':2'):
    ''' Interleave two paired FASTQ inputs into a single FASTQ output.

    The supplied labels are appended to the first whitespace-delimited
    token of each read header. Function takes 5 arguments:
    1)  fastqIn1 - Read one FASTQ file(s); a string or list of strings.
    2)  fastqIn2 - Read two FASTQ file(s); a string or list of strings.
    3)  fastqOut - Output FASTQ file.
    4)  label1 - Label appended to read1 headers.
    5)  label2 - Label appended to read2 headers.

    Raises IOError when a pair of reads has mismatched identifiers.
    '''
    # Open output process then the two input processes
    outProc = writeFile.writeFileProcess(fastqOut)
    inProc1 = fastqExtract.readFastqProcess(fastqIn1)
    inProc2 = fastqExtract.readFastqProcess(fastqIn2)
    # Walk both inputs in lockstep, labelling and writing each pair
    for rd1, rd2 in itertools.izip(inProc1, inProc2):
        # Split off header line and isolate the read identifier token
        head1, body1 = rd1.split('\n', 1)
        head1 = head1.split(' ', 1)
        head2, body2 = rd2.split('\n', 1)
        head2 = head2.split(' ', 1)
        # Guard clause: identifiers must match between the two files
        if head1[0] != head2[0]:
            raise IOError('Input FASTQ files contain unmatched reads')
        head1[0] += label1
        head2[0] += label2
        outProc.add('%s\n%s\n%s\n%s\n' % (
            ' '.join(head1), body1, ' '.join(head2), body2))
    # Close pipes and processes
    inProc1.close()
    inProc2.close()
    outProc.close()
def processPairs(pipe, pairOut, rmDup, rmConcord, maxSize):
    ''' Receive read pairs from a pipe, filter them, and write to file.

    Function takes five arguments:
    1)  pipe - Connection object; read pairs are received from it until
        a None sentinel arrives.
    2)  pairOut - Name of the output file.
    3)  rmDup - Boolean indicating whether to remove duplicates from
        the output.
    4)  rmConcord - Boolean indicating whether to remove concordant
        pairs from the output.
    5)  maxSize - Maximum insert size used by concordant() to classify
        pairs.

    Sends a defaultdict of counts ('total', 'duplicate', 'unique',
    'concord', 'concorduni', 'discord', 'discorduni') back down the
    pipe before closing it.
    '''
    # Create counter and pair set
    pairCount = collections.defaultdict(int)
    pairSet = set()
    # Open output file process
    outObject = writeFile.writeFileProcess(fileName=pairOut)
    # Loop through pairs
    while True:
        # Get pair from pipe; None is the end-of-stream sentinel.
        # Compare with 'is' since None is a singleton.
        pair = pipe.recv()
        if pair is None:
            break
        # Count and check for duplicate pairs
        pairCount['total'] += 1
        if pair in pairSet:
            dup = True
            pairCount['duplicate'] += 1
        else:
            dup = False
            pairCount['unique'] += 1
            pairSet.add(pair)
        # Count and check for concordant pairs
        concord = concordant(pair, maxSize)
        if concord:
            pairCount['concord'] += 1
            if not dup:
                pairCount['concorduni'] += 1
        else:
            pairCount['discord'] += 1
            if not dup:
                pairCount['discorduni'] += 1
        # Process output, skipping filtered pairs
        if dup and rmDup:
            continue
        elif concord and rmConcord:
            continue
        else:
            outData = '\t'.join(map(str, pair)) + '\n'
            outObject.add(outData)
    # Close file, return data and close pipe
    outObject.close()
    pipe.send(pairCount)
    pipe.close()
def mergeLabelTrimPair(fastqIn1, fastqIn2, trimSeq, fastqOut, minLength=20,
                       label1=':1', label2=':2'):
    ''' Merge two paired FASTQ files into one, trimming at a sequence.

    Each read is truncated just after the first occurrence of trimSeq;
    any pair in which a trimmed read would be shorter than minLength is
    discarded. The supplied labels are appended to the read identifiers.
    Function takes seven arguments:
    1)  fastqIn1 - Read one FASTQ file(s); a string or list of strings.
    2)  fastqIn2 - Read two FASTQ file(s); a string or list of strings.
    3)  trimSeq - Sequence at which to terminate reads.
    4)  fastqOut - Output FASTQ file.
    5)  minLength - Minimum length of reads to be output.
    6)  label1 - Label to add to read1 headers.
    7)  label2 - Label to add to read2 headers.

    Returns a dictionary with the elements:
    1)  total - Total number of read pairs.
    2)  short - Number of pairs with at least one read too short.
    3)  trim1 - Number of acceptable pairs with read1 trimmed.
    4)  trim2 - Number of acceptable pairs with read2 trimmed.
    '''
    # Create output dictionary and key variables
    metrics = {'total': 0, 'short': 0, 'trim1': 0, 'trim2': 0}
    trimLen = len(trimSeq)

    def _cutPoint(seq):
        # End position of the first trim-sequence occurrence, or None
        # when the trim sequence is absent from the read.
        loc = seq.find(trimSeq)
        return None if loc == -1 else loc + trimLen

    # Open input and output processes
    inProc1 = fastqExtract.readFastqProcess(fastqIn1)
    inProc2 = fastqExtract.readFastqProcess(fastqIn2)
    outProc = writeFile.writeFileProcess(fastqOut)
    # Process reads in lockstep
    for raw1, raw2 in itertools.izip(inProc1, inProc2):
        metrics['total'] += 1
        # Split each record into its four FASTQ lines
        rec1 = raw1.split('\n')
        rec2 = raw2.split('\n')
        cut1 = _cutPoint(rec1[1])
        cut2 = _cutPoint(rec2[1])
        # Discard the pair when either trimmed read is too short
        if cut1 and cut1 < minLength:
            metrics['short'] += 1
            continue
        if cut2 and cut2 < minLength:
            metrics['short'] += 1
            continue
        # Trim sequence and quality strings where required
        if cut1 and cut1 < len(rec1[1]):
            rec1[1] = rec1[1][:cut1]
            rec1[3] = rec1[3][:cut1]
            metrics['trim1'] += 1
        if cut2 and cut2 < len(rec2[1]):
            rec2[1] = rec2[1][:cut2]
            rec2[3] = rec2[3][:cut2]
            metrics['trim2'] += 1
        # Append labels to the read identifier token of each header
        if label1:
            head = rec1[0].split(' ', 1)
            head[0] += label1
            rec1[0] = ' '.join(head)
        if label2:
            head = rec2[0].split(' ', 1)
            head[0] += label2
            rec2[0] = ' '.join(head)
        # Save the labelled, trimmed pair to file
        outProc.add('%s\n%s\n' % ('\n'.join(rec1), '\n'.join(rec2)))
    # Close input and output objects
    inProc1.close()
    inProc2.close()
    outProc.close()
    # Return metrics
    return metrics
# NOTE(review): this re-defines processPairs, which also appears earlier in
# this file; the later definition wins at import time. Confirm whether the
# duplicate is intentional and remove one copy if not.
def processPairs(pipe, pairOut, rmDup, rmConcord, maxSize):
    ''' Receive read pairs from a pipe, filter them, and write to file.

    Function takes five arguments:
    1)  pipe - Connection object; read pairs are received from it until
        a None sentinel arrives.
    2)  pairOut - Name of the output file.
    3)  rmDup - Boolean indicating whether to remove duplicates from
        the output.
    4)  rmConcord - Boolean indicating whether to remove concordant
        pairs from the output.
    5)  maxSize - Maximum insert size used by concordant() to classify
        pairs.

    Sends a defaultdict of counts ('total', 'duplicate', 'unique',
    'concord', 'concorduni', 'discord', 'discorduni') back down the
    pipe before closing it.
    '''
    # Create counter and pair set
    pairCount = collections.defaultdict(int)
    pairSet = set()
    # Open output file process
    outObject = writeFile.writeFileProcess(fileName=pairOut)
    # Loop through pairs
    while True:
        # Get pair from pipe; None is the end-of-stream sentinel.
        # Compare with 'is' since None is a singleton.
        pair = pipe.recv()
        if pair is None:
            break
        # Count and check for duplicate pairs
        pairCount['total'] += 1
        if pair in pairSet:
            dup = True
            pairCount['duplicate'] += 1
        else:
            dup = False
            pairCount['unique'] += 1
            pairSet.add(pair)
        # Count and check for concordant pairs
        concord = concordant(pair, maxSize)
        if concord:
            pairCount['concord'] += 1
            if not dup:
                pairCount['concorduni'] += 1
        else:
            pairCount['discord'] += 1
            if not dup:
                pairCount['discorduni'] += 1
        # Process output, skipping filtered pairs
        if dup and rmDup:
            continue
        elif concord and rmConcord:
            continue
        else:
            outData = '\t'.join(map(str, pair)) + '\n'
            outObject.add(outData)
    # Close file, return data and close pipe
    outObject.close()
    pipe.send(pairCount)
    pipe.close()