Example #1
def get_readgroup_and_seq_dict_from_bam(bam_files, allow_collision=False):
    print "Gather seq dict and read groups from %s bam files" % len(bam_files)
    all_read_groups = {}
    all_seq_dict = OrderedDict()
    for bam_file in bam_files:
        command = "samtools view -H %s | egrep '@RG|@SQ' " % bam_file
        stdout, process = utils_commands.get_output_stream_from_command(
            command)
        for line in stdout:
            if line.startswith('@RG'):
                read_group_dict = {}
                for element in line.strip().split('\t'):
                    if element != '@RG':
                        key, value = element.split(':')
                        read_group_dict[key] = value
                if read_group_dict.has_key('ID') and read_group_dict.get(
                        'ID') not in all_read_groups:
                    all_read_groups[read_group_dict.get(
                        'ID')] = read_group_dict
            if line.startswith('@SQ'):
                seq_dict = {}
                for element in line.strip().split('\t'):
                    if element != '@SQ':
                        key, value = element.split(':')
                        if key == 'LN': value = int(value)
                        seq_dict[key] = value
                if seq_dict.has_key('SN'):
                    name = seq_dict.get('SN')
                    if all_seq_dict.has_key(name) and not allow_collision:
                        raise StandardError(
                            "Sequence dictionary name %s in %s collides with a previous BAM entry and collisions are not allowed"
                            % (name, bam_file))
                    all_seq_dict[name] = seq_dict

    return all_read_groups.values(), all_seq_dict.values()
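
Every example on this page revolves around utils_commands.get_output_stream_from_command, whose real implementation is not shown here. A minimal sketch of what it is assumed to do, inferred only from how the examples call it (a shell pipeline in, a (stream, process) pair out, an optional logger_name keyword):

import subprocess

def get_output_stream_from_command(command, logger_name=None):
    # shell=True because callers pass full pipelines such as
    # "samtools view -H file.bam | egrep '@RG|@SQ'"
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
    # callers iterate over the returned stream and may call process.wait() afterwards
    return process.stdout, process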
Example #2
def process_alleles(vcf_record_in_one_contig, sample_names, curr_reference, bam_file):
    sample_to_allele = generate_empty_hash_with_sample(sample_names)
    # -F 1028 skips unmapped reads (flag 4) and PCR/optical duplicates (flag 1024)
    command = "samtools view -F 1028 %s %s"%(bam_file,curr_reference)
    stream,process=utils_commands.get_output_stream_from_command(command)
    for line in stream:
        sam_record=Sam_record(line)
        allele_array=[]
        sequence = sam_record.get_query_sequence()
        sample = sam_record.get_tag("RG")
        for position in vcf_record_in_one_contig.keys():
            #if vcf_record_in_one_contig.get(position).get_genotype_quality(sample)>20:
            allele_array.append(sequence[position-1])
            #else:
            #    allele_array.append('.')
        count_with_hash(sample_to_allele[sample], ''.join(allele_array))
        count_with_hash(sample_to_allele['all'], ''.join(allele_array))
    process.wait()
    pprint.pprint(sample_to_allele)
    filter_alleles(sample_to_allele)
    pprint.pprint(sample_to_allele)
    all_alleles=set()
    valid=True
    for sample in sample_to_allele.keys():
        alleles = sample_to_allele.get(sample)
        all_alleles.update(set(alleles.keys()))
        if len(alleles)>2:
            valid=False
    if len(all_alleles)>4:
        valid=False
    if not valid:
        print curr_reference
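
Several examples wrap each SAM line in a Sam_record object whose class is not part of this page. A minimal sketch covering only the methods used in these examples (field offsets follow the SAM specification; the class and method names are taken from the calls above, the rest is an assumption):

class Sam_record(object):
    def __init__(self, line):
        self.fields = line.rstrip('\n').split('\t')

    def get_query_name(self):
        return self.fields[0]

    def get_reference_name(self):
        return self.fields[2]

    def get_query_sequence(self):
        return self.fields[9]

    def is_unmapped(self):
        # bit 4 of the FLAG field marks an unmapped read
        return int(self.fields[1]) & 4 == 4

    def is_duplicate_read(self):
        # bit 1024 of the FLAG field marks a PCR or optical duplicate
        return int(self.fields[1]) & 1024 == 1024

    def get_tag(self, tag):
        # optional SAM tags look like "RG:Z:sample_A"
        for field in self.fields[11:]:
            name, type_code, value = field.split(':', 2)
            if name == tag:
                return value
        return None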
Example #3
def get_readgroup_and_seq_dict_from_bam(bam_files, allow_collision=False):
    print "Gather seq dict and read groups from %s bam files" % len(bam_files)
    all_read_groups = {}
    all_seq_dict = OrderedDict()
    for bam_file in bam_files:
        command = "samtools view -H %s | egrep '@RG|@SQ' " % bam_file
        stdout, process = utils_commands.get_output_stream_from_command(command)
        for line in stdout:
            if line.startswith('@RG'):
                read_group_dict = {}
                for element in line.strip().split('\t'):
                    if element != '@RG':
                        key, value = element.split(':')
                        read_group_dict[key] = value
                if read_group_dict.has_key('ID') and read_group_dict.get('ID') not in all_read_groups:
                    all_read_groups[read_group_dict.get('ID')] = read_group_dict
            if line.startswith('@SQ'):
                seq_dict = {}
                for element in line.strip().split('\t'):
                    if element != '@SQ':
                        key, value = element.split(':')
                        if key == 'LN': value = int(value)
                        seq_dict[key] = value
                if seq_dict.has_key('SN'):
                    name = seq_dict.get('SN')
                    if all_seq_dict.has_key(name) and not allow_collision:
                        raise StandardError(
                            "Sequence dictionary name %s in %s collides with a previous BAM entry and collisions are not allowed" % (
                                name, bam_file))
                    all_seq_dict[name] = seq_dict

    return all_read_groups.values(), all_seq_dict.values()
Example #4
def createDirectories(baseDir, directories, server=None):
    """
    This function creates the directories listed in the directories list.
    @param baseDir: the parent directory.
    @param directories: the list of directories to create.
    @param server: the server on which the directories will be created.
    """
    for (directory) in directories:
        dir = os.path.join(baseDir, directory)
        if server is not None:
            command = 'ssh %s "ls -d1 %s"' % (server, dir)
            stream, process = utils_commands.get_output_stream_from_command(
                command, logger_name=None)
            line = None
            for line in stream:
                line = line.strip()
                if line == dir:
                    break
            if line != dir:
                logging.info('%s does not exist on %s: creating it' %
                             (dir, server))
                command = 'ssh %s "mkdir %s"' % (server, dir)
                utils_commands.launchCommandLocally(command)
        elif not os.path.exists(dir):
            logging.info('%s does not exist: creating it' % dir)
            os.mkdir(dir, 0775)
Example #5
def check_file_or_dir(filePath, server=None):
    """ Check if the given file is a file and if its size is greater than 0."""
    if server:
        returnValue = False
        command = 'ssh %s "ls -ld %s"' % (server, filePath)
        stream, process = utils_commands.get_output_stream_from_command(
            command, logger_name=None)
        line = None
        for line in stream:
            line = line.strip()
            if line:
                if line.startswith('d'):
                    #It's a directory
                    returnValue = 'dir'
                else:
                    returnValue = 'file'
                break
            else:
                returnValue = False
    else:
        if os.path.isfile(filePath):
            returnValue = 'file'
        elif os.path.isdir(filePath):
            returnValue = 'dir'
        else:
            returnValue = False

    return returnValue
Example #6
def check_file_or_dir(filePath, server=None):
    """ Check if the given file is a file and if its size is greater than 0."""
    if server:
        returnValue = False
        command = 'ssh %s "ls -ld %s"'%(server, filePath)
        stream,process = utils_commands.get_output_stream_from_command(command,logger_name=None)
        line=None
        for line in stream:
            line=line.strip()
            if line:
                if line.startswith('d'):
                    #It's a directory
                    returnValue = 'dir'
                else:
                    returnValue = 'file'
                break
            else:
                returnValue = False
    else:
        if os.path.isfile(filePath):
            returnValue = 'file'
        elif os.path.isdir(filePath):
            returnValue = 'dir'
        else:
            returnValue = False
        
    return returnValue
Example #7
def process_alleles(vcf_record_in_one_contig, sample_names, curr_reference, bam_file):
    sample_to_allele = generate_empty_hash_with_sample(sample_names)
    # -F 1028 skips unmapped reads (flag 4) and PCR/optical duplicates (flag 1024)
    command = "samtools view -F 1028 %s %s" % (bam_file, curr_reference)
    stream, process = utils_commands.get_output_stream_from_command(command)
    for line in stream:
        sam_record = Sam_record(line)
        allele_array = []
        sequence = sam_record.get_query_sequence()
        sample = sam_record.get_tag("RG")
        for position in vcf_record_in_one_contig.keys():
            #if vcf_record_in_one_contig.get(position).get_genotype_quality(sample)>20:
            allele_array.append(sequence[position - 1])
            #else:
            #    allele_array.append('.')
        count_with_hash(sample_to_allele[sample], ''.join(allele_array))
        count_with_hash(sample_to_allele['all'], ''.join(allele_array))
    process.wait()
    pprint.pprint(sample_to_allele)
    filter_alleles(sample_to_allele)
    pprint.pprint(sample_to_allele)
    all_alleles = set()
    valid = True
    for sample in sample_to_allele.keys():
        alleles = sample_to_allele.get(sample)
        all_alleles.update(set(alleles.keys()))
        if len(alleles) > 2:
            valid = False
    if len(all_alleles) > 4:
        valid = False
    if not valid:
        print curr_reference
Example #8
def get_readgroup_from_bam(bam_files):
    all_read_groups=[]
    for bam_file in bam_files:
        command = "samtools view -H %s | grep '^@RG' " % bam_file
        stdout, process = utils_commands.get_output_stream_from_command(command)
        for line in stdout:
            all_read_groups.append(line.strip())
    return all_read_groups
Example #9
def run_velvetk(fastq_file_name, estimated_size=600, **kwarg):
    command = "%s --size %s --best %s 2> /dev/null"%(velvetk_bin,estimated_size, fastq_file_name)
    logging.info(command)
    stream,process = utils_commands.get_output_stream_from_command(command)
    kmer_length=29
    for line in stream:
        if line.strip().isdigit():
            kmer_length = int(line.strip())
    if kmer_length<19: kmer_length=19
    elif kmer_length>99: kmer_length=99
    logging.info("velvetk kmer: %s"%kmer_length)
    return kmer_length
Example #10
def run_velvetk(fastq_file_name, estimated_size=600, **kwarg):
    command = "%s --size %s --best %s 2> /dev/null" % (
        velvetk_bin, estimated_size, fastq_file_name)
    logging.info(command)
    stream, process = utils_commands.get_output_stream_from_command(command)
    kmer_length = 29
    for line in stream:
        if line.strip().isdigit():
            kmer_length = int(line.strip())
    if kmer_length < 19: kmer_length = 19
    elif kmer_length > 99: kmer_length = 99
    logging.info("velvetk kmer: %s" % kmer_length)
    return kmer_length
Example #11
def count_reads_in_fastq(fastq_file):
    command = '''awk '{if (NR%%4==1){split($1,array,"RGID:"); print array[2]}}' %s| uniq -c'''%(fastq_file)
    logging.info(command)
    stream, process = get_output_stream_from_command(command)
    total=0
    all_read_groups=Counter()
    for line in stream:
        if len(line.strip().split())==2:
            count, rgid = line.strip().split()
            count=int(count)
            total+=count
            all_read_groups[rgid]+=count
    return total, all_read_groups
Example #12
def count_reads_in_fastq(fastq_file):
    command = '''awk '{if (NR%%4==1){split($1,array,"RGID:"); print array[2]}}' %s| uniq -c''' % (
        fastq_file)
    logging.info(command)
    stream, process = get_output_stream_from_command(command)
    total = 0
    all_read_groups = Counter()
    for line in stream:
        if len(line.strip().split()) == 2:
            count, rgid = line.strip().split()
            count = int(count)
            total += count
            all_read_groups[rgid] += count
    return total, all_read_groups
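
count_reads_in_fastq only works if the FASTQ headers embed a read group after an "RGID:" token: the awk script prints the text following that token on every header line (NR%4==1) and "uniq -c" counts consecutive runs. A hypothetical input line and call, under that assumption:

# @HWI-ST1234:42:1101:1224:2105 RGID:sample_A   <- assumed header layout
total, per_read_group = count_reads_in_fastq('reads_with_rgid.fastq')
# per_read_group is a Counter mapping each RGID to its read count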
Example #13
def process_double_digest_rad_run(bam_file,all_sites_info,samtools_bin):
    command="%s view -h %s"%(samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    sample_name, ext = os.path.splitext(bam_file)
    read_groups={}
    try:
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id=rg_sample=rg_library=None
                for value in sp_line:
                    if value.startswith("ID"):
                        rg_id=value[3:]
                    elif value.startswith("SM"):
                        rg_sample=value[3:]
                    elif value.startswith("LB"):
                        rg_library=value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id]=rg_sample
                    elif rg_library:
                        read_groups[rg_id]=rg_library
                    else:
                        read_groups[rg_id]=rg_id

        all_sample_coverage={}
        for sample in read_groups.values():
            all_sample_coverage[sample]=Counter()

        i=0
        for sam_record_r1,sam_record_r2 in load_from_sites_generator(open_stream):
            duplicate=0
            i+=1
            if i%1000000==0:
                print i
            if not sam_record_r1.is_unmapped() and not sam_record_r2.is_unmapped():
                loci = get_dd_RAD_loci_from_read_pair(sam_record_r1,sam_record_r2)
                if sam_record_r1.is_duplicate_read():
                    duplicate=1
                all_sites_info.add_values(loci, coverage=1, duplicate=duplicate,
                                          sample=read_groups.get(sam_record_r1.get_tag("RG")))
    finally:
        open_stream.close()
Example #14
def process_single_samtools_run(bam_file, all_contigs_info, samtools_bin):
    command="%s view -F 132 %s"%(samtools_bin, bam_file)
    open_stream, process=get_output_stream_from_command(command)
    current_contig=None
    coverage=0
    duplicate=0
    sample_name, ext = os.path.splitext(bam_file)
    for line in open_stream:
        sp_line=line.strip().split()
        if current_contig!=sp_line[2] and current_contig != None:
            all_contigs_info.add_values(current_contig, coverage, duplicate, sample=sample_name)
            coverage=0
            duplicate=0
        current_contig=sp_line[2]
        if int(sp_line[3])==1:
            if int(sp_line[1]) & 1024 == 1024:
                duplicate+=1
            coverage+=1
    #record the counts for the last contig
    if current_contig is not None:
        all_contigs_info.add_values(current_contig, coverage, duplicate, sample=sample_name)
    open_stream.close()
Example #15
def process_single_samtools_run(bam_file, all_contigs_info, samtools_bin):
    command = "%s view -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    coverage = 0
    duplicate = 0
    sample_name, ext = os.path.splitext(bam_file)
    for line in open_stream:
        sp_line = line.strip().split()
        if current_contig != sp_line[2] and current_contig != None:
            all_contigs_info.add_values(current_contig, coverage, duplicate, sample=sample_name)
            coverage = 0
            duplicate = 0
        current_contig = sp_line[2]
        if int(sp_line[3]) == 1:
            if int(sp_line[1]) & 1024 == 1024:
                duplicate += 1
            coverage += 1

    # record the counts for the last contig
    if current_contig is not None:
        all_contigs_info.add_values(current_contig, coverage, duplicate, sample=sample_name)
    open_stream.close()
Example #16
def generate_readgroup_exclusion_file_per_samples(bam_file):
    directory = os.path.dirname(os.path.abspath(bam_file))
    command = 'samtools view -H %s | grep @RG'%(bam_file)
    stream, process = get_output_stream_from_command(command)
    all_samples=set()
    all_samples2id=defaultdict(list)
    for line in stream:
        RG_dict = parse_RG_line(line)
        all_samples.add(RG_dict.get('SM'))
        all_samples2id[RG_dict.get('SM')].append(RG_dict.get('ID'))
    
    all_samples2exclusion_id_file={}
    for sample in all_samples:
        exclusion_id = []
        exclusion_samples = all_samples.difference(set([sample]))
        for exclusion_sample in exclusion_samples:
            exclusion_id.extend(all_samples2id.get(exclusion_sample))
        sample_exclusion_file=os.path.join(directory,'exclusion_id_for_%s.txt'%sample)
        
        with open(sample_exclusion_file,'w') as open_file: open_file.write('\n'.join(exclusion_id))
        all_samples2exclusion_id_file[sample]= sample_exclusion_file  
    return all_samples2exclusion_id_file
Example #17
def generate_readgroup_exclusion_file_per_samples(bam_file):
    directory = os.path.dirname(os.path.abspath(bam_file))
    command = 'samtools view -H %s | grep @RG' % (bam_file)
    stream, process = get_output_stream_from_command(command)
    all_samples = set()
    all_samples2id = defaultdict(list)
    for line in stream:
        RG_dict = parse_RG_line(line)
        all_samples.add(RG_dict.get('SM'))
        all_samples2id[RG_dict.get('SM')].append(RG_dict.get('ID'))

    all_samples2exclusion_id_file = {}
    for sample in all_samples:
        exclusion_id = []
        exclusion_samples = all_samples.difference(set([sample]))
        for exclusion_sample in exclusion_samples:
            exclusion_id.extend(all_samples2id.get(exclusion_sample))
        sample_exclusion_file = os.path.join(
            directory, 'exclusion_id_for_%s.txt' % sample)

        with open(sample_exclusion_file, 'w') as open_file:
            open_file.write('\n'.join(exclusion_id))
        all_samples2exclusion_id_file[sample] = sample_exclusion_file
    return all_samples2exclusion_id_file
Example #18
def createDirectories(baseDir,directories, server=None):
    """
    This function creates the directories listed in the directories list.
    @param baseDir: the parent directory.
    @param directories: the list of directories to create.
    @param server: the server on which the directories will be created.
    """
    for (directory) in directories:
        dir=os.path.join(baseDir,directory)
        if server is not None:
            command = 'ssh %s "ls -d1 %s"'%(server, dir)
            stream, process = utils_commands.get_output_stream_from_command(command, logger_name=None)
            line=None
            for line in stream:
                line=line.strip()
                if line == dir:
                    break
            if line != dir:
                logging.info('%s does not exist on %s: creating it'%(dir, server))
                command = 'ssh %s "mkdir %s"'%(server, dir)
                utils_commands.launchCommandLocally(command)
        elif not os.path.exists(dir):
            logging.info('%s does not exist: creating it'%dir)
            os.mkdir(dir,0775)
Example #19
def get_mpileup_from_bam(bam_file, options=''):
    try:
        pipeline_parm = utils_param.get_pipeline_parameters()
        samtools_bin = os.path.join(pipeline_parm.get_samtools_dir(),
                                    'samtools')
    except Config_file_error, e:
        logging.warning(
            "Can't find the configuration file; you'll need to have samtools in your path."
        )
        samtools_bin = 'samtools'
    if bam_file == 'PIPE':
        bam_file = '-'
    command = '%s mpileup -A %s %s' % (samtools_bin, bam_file, options)
    stream, process = utils_commands.get_output_stream_from_command(
        command, logger_name=None)
    return stream


def allele_freq_from_bam_and_list_pos(output_file,
                                      input_file,
                                      list_position_file,
                                      all_positions_loaded,
                                      exclusion_id_file,
                                      bas_qual_threshold=20,
                                      map_qual_threshold=10,
                                      coverage_threshold=6):
    input_stream = get_mpileup_from_bam(
        input_file,
        options='-s -l %s -G %s' % (list_position_file, exclusion_id_file))
    all_positions_loaded = copy.copy(all_positions_loaded)
Example #20
def get_mapview_stream(maq_bin, map_file):
    """This method opens a .map file with Maq and returns an open file. The std error will be output in the console through another thread."""
    command = '%s mapview %s' % (maq_bin, map_file)
    stdout, process = utils_commands.get_output_stream_from_command(command)
    return stdout
Example #21
def get_sam_stream(bam_file,
                   samtools_bin=None,
                   options='',
                   chomosome_and_position=''):
    """This method opens a .bam file with samtools and returns an open file. The standard error is echoed to the console by another thread."""
    if samtools_bin == None:
        try:
            pipeline_parm = utils_param.get_pipeline_parameters()
            samtools_bin = os.path.join(pipeline_parm.get_samtools_dir(),
                                        'samtools')
        except Config_file_error, e:
            logging.warning(
                "Can't find the configuration file; you'll need to have samtools in your path."
            )
            samtools_bin = 'samtools'

    command = '%s view %s %s %s' % (samtools_bin, options, bam_file,
                                    chomosome_and_position)
    stdout, process = utils_commands.get_output_stream_from_command(command)
    return stdout, process


def get_pileup_from_bam(bam_file,
                        genome_file=None,
                        samtools_bin=None,
                        options=''):
    if samtools_bin == None:
        try:
            pipeline_parm = utils_param.get_pipeline_parameters()
            samtools_bin = os.path.join(pipeline_parm.get_samtools_dir(),
                                        'samtools')
        except Config_file_error, e:
            logging.warning(
                "Can't find the configuration file; you'll need to have samtools in your path."
            )
            samtools_bin = 'samtools'
Example #22
    try:
        pipeline_param=utils_param.get_pipeline_parameters()
        samtools_dir=pipeline_param.get_samtools_dir()
    except Config_file_error, e:
        #logging.exception('Config_file_error:')
        logging.critical("You need to have the environment variable properly set to use that script")
        return False

    samtools_bin=os.path.join(samtools_dir,'samtools')
    name, ext = os.path.splitext(output_bam_file)
    if ext=='.bam':
        output_bam_file=name
    #change_consensus_on_read2
    command ="%s view -h %s "%(samtools_bin,input_bam_file)
    logging.info(command)
    input_stream,process_input = utils_commands.get_output_stream_from_command(command)
    command ="%s view -bS - | %s sort - %s"%(samtools_bin,  samtools_bin, output_bam_file)
    logging.info(command)
    output_stream,process_output= utils_commands.get_input_stream_from_command(command)

    #get the header
    line = input_stream.readline()
    while line.startswith("@"):
        output_stream.write(line)
        line = input_stream.readline()

    while line:
        read1=Sam_record(line)
        line = input_stream.readline()
        read2=Sam_record(line)
        if read1.get_query_name() == read2.get_query_name():
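
Example #22 also feeds data into a command via utils_commands.get_input_stream_from_command, which this page never shows either. A minimal sketch of the assumed write-side counterpart of get_output_stream_from_command:

import subprocess

def get_input_stream_from_command(command, logger_name=None):
    # returns a writable stream connected to the command's stdin, plus the process
    process = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE)
    return process.stdin, process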
Example #23
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    sample_name, ext = os.path.splitext(bam_file)
    read_groups = {}
    try:
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id = rg_sample = rg_library = None
                for value in sp_line:
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id] = rg_sample
                    elif rg_library:
                        read_groups[rg_id] = rg_library
                    else:
                        read_groups[rg_id] = rg_id
        all_sample_coverage = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = 0
            all_sample_duplicate[sample] = 0
        # process the first read
        # if line.startswith("@"):
        #     # still in the header: there is no read, exit
        #     return
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        if not sam_record.is_unmapped():
            rg_id = sam_record.get_tag("RG")
            if sam_record.is_duplicate_read():
                all_sample_duplicate[read_groups.get(rg_id)] += 1
            all_sample_coverage[read_groups.get(rg_id)] += 1
        i = 1
        # process all the others
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                print i
            sam_record = Sam_record(line.strip())
            if current_contig != sam_record.get_reference_name() and current_contig != None:
                for sample in read_groups.values():
                    all_contigs_info.add_values(current_contig, all_sample_coverage.get(sample),
                                                all_sample_duplicate.get(sample), sample=sample)
                    all_sample_coverage[sample] = 0
                    all_sample_duplicate[sample] = 0
            current_contig = sam_record.get_reference_name()

            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)] += 1
                all_sample_coverage[read_groups.get(rg_id)] += 1
        if current_contig != None:
            for sample in read_groups.values():
                all_contigs_info.add_values(current_contig, all_sample_coverage.get(sample),
                                            all_sample_duplicate.get(sample), sample=sample)
                all_sample_coverage[sample] = 0
                all_sample_duplicate[sample] = 0
    finally:
        open_stream.close()
Example #24
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info,
                                                samtools_bin):
    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    sample_name, ext = os.path.splitext(bam_file)
    read_groups = {}
    try:
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id = rg_sample = rg_library = None
                for value in sp_line:
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id] = rg_sample
                    elif rg_library:
                        read_groups[rg_id] = rg_library
                    else:
                        read_groups[rg_id] = rg_id
        all_sample_coverage = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = 0
            all_sample_duplicate[sample] = 0
        # process the first read
        # if line.startswith("@"):
        #     # still in the header: there is no read, exit
        #     return
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        if not sam_record.is_unmapped():
            rg_id = sam_record.get_tag("RG")
            if sam_record.is_duplicate_read():
                all_sample_duplicate[read_groups.get(rg_id)] += 1
            all_sample_coverage[read_groups.get(rg_id)] += 1
        i = 1
        # process all the others
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                print i
            sam_record = Sam_record(line.strip())
            if (current_contig != sam_record.get_reference_name()
                    and current_contig is not None):
                for sample in read_groups.values():
                    all_contigs_info.add_values(
                        current_contig,
                        all_sample_coverage.get(sample),
                        all_sample_duplicate.get(sample),
                        sample=sample)
                    all_sample_coverage[sample] = 0
                    all_sample_duplicate[sample] = 0
            current_contig = sam_record.get_reference_name()

            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)] += 1
                all_sample_coverage[read_groups.get(rg_id)] += 1
        if current_contig != None:
            for sample in read_groups.values():
                all_contigs_info.add_values(current_contig,
                                            all_sample_coverage.get(sample),
                                            all_sample_duplicate.get(sample),
                                            sample=sample)
                all_sample_coverage[sample] = 0
                all_sample_duplicate[sample] = 0
    finally:
        open_stream.close()
Example #25
def get_mapview_stream(maq_bin, map_file):
    """This method opens a .map file with Maq and returns an open file. The standard error is echoed to the console by another thread."""
    command='%s mapview %s'%(maq_bin, map_file)
    stdout, process = utils_commands.get_output_stream_from_command(command)
    return stdout

def get_sam_stream(bam_file, samtools_bin=None, options='', chomosome_and_position=''):
    """This method opens a .bam file with samtools and returns an open file. The std error will be output in the console through another thread."""
    if samtools_bin==None:
        try:
            pipeline_parm=utils_param.get_pipeline_parameters()
            samtools_bin=os.path.join(pipeline_parm.get_samtools_dir(),'samtools')
        except Config_file_error, e:
            logging.warning("Can't find the configuration file you'll need to have samtools in you path.")
            samtools_bin='samtools'
    
    command='%s view %s %s %s'%(samtools_bin, options, bam_file, chomosome_and_position)
    stdout, process = utils_commands.get_output_stream_from_command(command)
    return stdout, process


def get_pileup_from_bam(bam_file, genome_file=None, samtools_bin=None, options=''):
    if samtools_bin==None:
        try:
            pipeline_parm=utils_param.get_pipeline_parameters()
            samtools_bin=os.path.join(pipeline_parm.get_samtools_dir(),'samtools')
        except Config_file_error, e:
            logging.warning("Can't find the configuration file you'll need to have samtools in you path.")
            samtools_bin='samtools'
    if bam_file=='PIPE':
        bam_file='-'
    if genome_file:
        command = '%s pileup -f %s %s %s'%(samtools_bin, genome_file, bam_file, options)
Example #26
    out.append('C:%s:%s'%(ATCG['C'],ATCG_filtered['C']))
    out.append('G:%s:%s'%(ATCG['G'],ATCG_filtered['G']))
    return '\t'.join(out)

def get_mpileup_from_bam(bam_file, options=''):
    try:
        pipeline_parm=utils_param.get_pipeline_parameters()
        samtools_bin=os.path.join(pipeline_parm.get_samtools_dir(),'samtools')
    except Config_file_error, e:
        logging.warning("Can't find the configuration file you'll need to have samtools in you path.")
        samtools_bin='samtools'
    if bam_file=='PIPE':
        bam_file='-'
    command = '%s mpileup -A %s %s'%(samtools_bin, bam_file, options)
    stream, process = utils_commands.get_output_stream_from_command(command, logger_name=None)
    return stream


def allele_freq_from_bam_and_list_pos(output_file, input_file, list_position_file, all_positions_loaded, exclusion_id_file, bas_qual_threshold=20,
                                   map_qual_threshold=10, coverage_threshold=6):
    input_stream = get_mpileup_from_bam(input_file, options='-s -l %s -G %s'%(list_position_file,exclusion_id_file))
    all_positions_loaded=copy.copy(all_positions_loaded)
    if input_stream is not None:
        open_output=open(output_file,'w')
        for line in input_stream:
            sp_line = line.strip().split()
            position = '%s\t%s'%(sp_line[0],sp_line[1])
            
            if position in all_positions_loaded :
                all_positions_loaded.remove(position)
Example #27
def get_mapview_stream(maq_bin, map_file):
    """This method opens a .map file with Maq and returns an open file. The std error will be output in the console through another thread."""
    command='%s mapview %s'%(maq_bin, map_file)
    stdout, process = utils_commands.get_output_stream_from_command(command)
    return stdout
Example #28
def process_single_samtools_run_with_read_group(bam_file,all_contigs_info,samtools_bin):
    command="%s view -h -F 132 %s"%(samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig=None
    sample_name, ext = os.path.splitext(bam_file)
    read_groups={}
    try:
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id=rg_sample=rg_library=None
                for value in sp_line:
                    if value.startswith("ID"):
                        rg_id=value[3:]
                    elif value.startswith("SM"):
                        rg_sample=value[3:]
                    elif value.startswith("LB"):
                        rg_library=value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id]=rg_sample
                    elif rg_library:
                        read_groups[rg_id]=rg_library
                    else:
                        read_groups[rg_id]=rg_id
        all_sample_coverage={}
        all_sample_coverage_reads = {}
        all_sample_duplicate={}
        for sample in read_groups.values():
            all_sample_coverage[sample]=Counter()
            all_sample_duplicate[sample]=Counter()
            all_sample_coverage_reads[sample] = defaultdict(Counter)
        #process the first read
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        if not sam_record.is_unmapped():
            rg_id = sam_record.get_tag("RG")
            read_sequence = sam_record.get_query_sequence()
            loci = get_loci_from_read(sam_record)
            if sam_record.is_duplicate_read():
                all_sample_duplicate[read_groups.get(rg_id)][str(loci)]+=1
            all_sample_coverage[read_groups.get(rg_id)][str(loci)]+=1
            all_sample_coverage_reads[read_groups.get(rg_id)][str(loci)][read_sequence] +=1
        i=1
        #process all the others
        for line in open_stream:
            i+=1
            if i%1000000==0:
                print i
            sam_record = Sam_record(line.strip())
            if current_contig != sam_record.get_reference_name() and current_contig != None:
                for sample in read_groups.values():
                    for loci in all_sample_coverage.get(sample):
                        alleles = all_sample_coverage_reads[sample].get(loci)
                        all_contigs_info.add_values(current_contig, loci, all_sample_coverage.get(sample).get(loci, 0),
                                                    all_sample_duplicate.get(sample).get(loci, 0), alleles=alleles,
                                                    sample=sample)

                    all_sample_coverage[sample]=Counter()
                    all_sample_duplicate[sample]=Counter()
                    all_sample_coverage_reads[sample] = defaultdict(Counter)
            current_contig = sam_record.get_reference_name()
            
            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                loci = get_loci_from_read(sam_record)
                read_sequence = sam_record.get_query_sequence()
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)][str(loci)]+=1
                all_sample_coverage[read_groups.get(rg_id)][str(loci)]+=1
                all_sample_coverage_reads[read_groups.get(rg_id)][str(loci)][read_sequence] +=1
        if current_contig != None:
            for sample in read_groups.values():
                for loci in all_sample_coverage.get(sample):
                    alleles = all_sample_coverage_reads[sample].get(loci)
                    all_contigs_info.add_values(current_contig, loci, all_sample_coverage.get(sample).get(loci, 0),
                                                all_sample_duplicate.get(sample).get(loci, 0), alleles=alleles,
                                                sample=sample)
                all_sample_coverage[sample]=Counter()
                all_sample_duplicate[sample]=Counter()
                all_sample_coverage_reads[sample] = defaultdict(Counter)
    finally:
        open_stream.close()