def main():
    """Entry point: compare the distributions of two sequence files.

    The kind of distribution, the file handles and the format of each input
    come from set_parameters(). Both inputs are read with seqs_in_file() and
    handed to seq_distrib_diff() together with the 'distrib' and 'plot'
    output handles.
    """
    kind, fhands, first_format, second_format = set_parameters()

    # Each input is a sequence handle plus its quality handle and its format.
    first_seqs = seqs_in_file(fhands['seqfile1'], fhands['qualfile1'],
                              first_format)
    second_seqs = seqs_in_file(fhands['seqfile2'], fhands['qualfile2'],
                               second_format)

    seq_distrib_diff(
        first_seqs,
        second_seqs,
        kind,
        distrib_fhand=fhands['distrib'],
        plot_fhand=fhands['plot'],
    )
def main():
    """Entry point: compute the distribution of one sequence file.

    set_parameters() provides the kind of distribution, the file handles and
    the input format. The quality handle is only passed to seqs_in_file()
    when one was given. The result goes to the 'distrib' and 'plot' handles
    through seq_distrib(), which is run with low_memory=True.
    """
    kind, fhands, file_format = set_parameters()

    qual_fhand = fhands['qualfile1']
    if not qual_fhand:
        # No quality file: let seqs_in_file use its own default for it.
        sequences = seqs_in_file(fhands['seqfile1'], format=file_format)
    else:
        sequences = seqs_in_file(fhands['seqfile1'], qual_fhand,
                                 format=file_format)

    seq_distrib(
        sequences=sequences,
        kind=kind,
        distrib_fhand=fhands['distrib'],
        plot_fhand=fhands['plot'],
        low_memory=True,
    )
def test_pipeline_run():
    """It tests that the pipeline runs ok.

    The 'sanger_with_qual' pipeline is built over the seq.fasta/qual.fasta
    test files, using a temporary adaptors file and the arabidopsis blast
    database as the vector database. The first cleaned sequence must contain
    'CGAtcgggggg' and six sequences must come out of the pipeline.

    Every file is opened through a context manager so that the temporary
    adaptor file and the input handles are released even when an assertion
    fails.
    """
    pipeline = 'sanger_with_qual'
    arabidopsis_genes = 'arabidopsis_genes+'
    univec = os.path.join(TEST_DATA_DIR, 'blast', arabidopsis_genes)
    seq_fpath = os.path.join(TEST_DATA_DIR, 'seq.fasta')
    qual_fpath = os.path.join(TEST_DATA_DIR, 'qual.fasta')

    with NamedTemporaryFile() as fhand_adaptors:
        fhand_adaptors.write(ADAPTORS)
        # The pipeline reads the adaptors by file name, so the content has
        # to be on disk before it runs.
        fhand_adaptors.flush()
        configuration = {'remove_vectors_blastdb': {'vectors': univec},
                         'remove_adaptors': {'adaptors': fhand_adaptors.name}}

        with open(seq_fpath, 'r') as seq_fhand:
            with open(qual_fpath, 'r') as qual_fhand:
                seq_iter = seqs_in_file(seq_fhand, qual_fhand)
                filtered_seq_iter = _pipeline_builder(pipeline, seq_iter,
                                                      configuration)
                # Consume the iterator and check the result while the
                # input files are still open.
                seq_list = list(filtered_seq_iter)
                assert 'CGAtcgggggg' in str(seq_list[0].seq)
                assert len(seq_list) == 6
def _get_lengths_quals_from_file(seq_fpath):
    """Given a sequence file it returns the lengths and quals.

    seq_fpath is the path of the sequence file; it is read with
    seqs_in_file().

    Returns a (lengths, quals) tuple of IntsStats: the length of every
    sequence and the quality values of every sequence that has a non-empty
    qual.
    """
    lengths = IntsStats(init_len=1000)
    quals = IntsStats(init_len=100)
    # The file is closed as soon as all the sequences have been read.
    with open(seq_fpath) as seq_fhand:
        for seq in seqs_in_file(seq_fhand):
            lengths.append(len(seq))
            qual = seq.qual
            if qual:
                quals.extend(qual)
    return lengths, quals
def _change_names_in_files_by_seq(fhand_in, fhand_out, naming, file_format):
    """Rename every sequence of fhand_in and write it to fhand_out.

    Sequences are handled one at a time: the current name is looked up with
    get_seq_name(), naming.get_uniquename() provides the replacement, and the
    sequence is written with both its name and id set to the new value.
    Input and output use the same file_format.
    """
    for record in seqs_in_file(fhand_in, format=file_format):
        unique_name = naming.get_uniquename(get_seq_name(record))
        # name and id are kept in sync with the new unique name
        record.name = record.id = unique_name
        write_seqs_in_file([record], fhand_out, format=file_format)
def main():
    """Entry point: split sequences by their masked regions.

    The handles and the minimum length come from set_parameters(). The input
    sequences (with their qualities) are passed through
    split_seq_by_masked_regions() and the result is written with
    write_fasta_file() to the output sequence and quality handles.
    """
    fhands, min_length = set_parameters()

    # read the input sequences
    original_seqs = seqs_in_file(fhands['in_seq'], fhands['in_qual'])

    # cut them wherever split_seq_by_masked_regions decides to
    pieces = split_seq_by_masked_regions(original_seqs, min_length)

    # store the resulting pieces
    write_fasta_file(pieces, fhands['out_seq'], fhands['out_qual'])
def _get_quals_by_length_from_file(fpath):
    """It returns the qualities along the sequence as a list of arrays.

    fpath is the path of the sequence file; it is read with seqs_in_file().

    The item at index i of the returned list is an array.array('B') with the
    quality found at position i of every sequence that has a non-empty qual
    and is long enough to reach that position. The list is empty when no
    sequence has qualities.
    """
    quals_by_position = []
    # The file is closed as soon as all the sequences have been read.
    with open(fpath) as seq_fhand:
        for seq in seqs_in_file(seq_fhand):
            quals = seq.qual
            if not quals:
                continue
            for base_number, qual in enumerate(quals):
                # Positions are visited in order, so the only index that can
                # be missing is the one right after the current end.
                if base_number == len(quals_by_position):
                    quals_by_position.append(array.array('B'))
                quals_by_position[base_number].append(qual)
    return quals_by_position
def main():
    """Entry point: split a sequence file into one file per tag value.

    set_parameters() provides the input handle, the output directory, the
    tag and the file format. For every sequence the tag value is taken from
    its description with get_item_from_tag(), and the sequence is written to
    '<input name without extension>.<tag value>.<format>' inside work_dir.

    Only the last extension of the input name is removed. The previous
    implementation joined the dot-separated pieces with an empty string, so
    every other dot was lost ('./reads.fasta' became '/reads').
    """
    infhand, work_dir, tag, file_format = set_parameters()

    # tag value -> open output handle
    tags = {}
    try:
        for seq in seqs_in_file(infhand, format=file_format):
            item = get_item_from_tag(seq.description, tag)
            if item not in tags:
                base_name = os.path.splitext(infhand.name)[0]
                name = base_name + '.' + item + '.' + file_format
                tags[item] = open(os.path.join(work_dir, name), 'w')
            write_seqs_in_file([seq], tags[item], format=file_format)
    finally:
        # Close the outputs even if reading or writing failed half way.
        for out_fhand in tags.values():
            out_fhand.close()
def main():
    """Entry point: print the general statistics of a sequence file.

    The sequences read from the handles given by set_parameters() are passed
    to general_seq_statistics(). Every statistic whose value is not None is
    written as a 'name : value' line to the result file, or to the standard
    output when no result file was given.
    """
    fhand_seq, fhand_qual, result_file, file_format = set_parameters()
    stats = general_seq_statistics(
        seqs_in_file(fhand_seq, fhand_qual, file_format))

    output = result_file or sys.stdout

    for key, value in stats.items():
        if value is None:
            # statistics without a value are not reported
            continue
        output.write('%-19s : %d\n' % (key, value))
def merge_sam(infiles, outfile, reference):
    """It merges a list of sam files.

    infiles is a sequence of seekable, line-iterable sam file handles; it is
    traversed twice. outfile is the handle the merged sam is written to and
    reference is the reference sequence file, read with seqs_in_file().

    The header of the result is built from scratch:
      - @SQ and @PG lines of the inputs are dropped, and so is any header
        line that contains 'SO:'.
      - The remaining header lines are written once each, in the order in
        which they are first seen, with any @HD line moved to the front.
      - One @SQ line per reference sequence is written after them.
    After the header come the non-header lines of every input, in input
    order.

    Two defects of the previous implementation are fixed: the header lines
    were kept in a set, so their order was arbitrary, and an input whose
    last line had no trailing newline got that line glued to the first
    record of the next input.
    """
    #first the reference part of the header
    ref_header = [['@SQ', 'SN:%s' % seq.name, 'LN:%d' % len(seq)]
                  for seq in seqs_in_file(reference)]

    #now the read groups
    headers = []
    seen = set()
    for input_ in infiles:
        input_.seek(0)
        for line in input_:
            line = line.strip()
            if not line:
                continue
            if not line.startswith('@'):
                # the header is over
                break
            if line.startswith(('@SQ', '@PG')) or 'SO:' in line:
                continue
            # NOTE(review): split() also breaks on spaces, so a header field
            # with embedded spaces is re-joined with tabs below. Kept as it
            # was; confirm whether such headers are expected.
            header = tuple(line.split())
            if header not in seen:
                seen.add(header)
                headers.append(header)
    # list.sort is stable: @HD goes first, the rest keep first-seen order
    headers.sort(key=lambda header: header[0] != '@HD')

    #join and write both header parts
    headers.extend(ref_header)
    for header in headers:
        outfile.write('\t'.join(header))
        outfile.write('\n')

    #the non header parts
    for input_ in infiles:
        input_.seek(0)
        for line in input_:
            if line.startswith('@'):
                continue
            if not line.endswith('\n'):
                # unterminated last line of a file
                line += '\n'
            outfile.write(line)
    outfile.flush()
def _do_seq_distrib_for_pair(self, pair):
    """It does the distribution for a pair of cleaned and raw seqs.

    pair is indexed by 'raw' and/or 'cleaned'. Every entry present must have
    a last_version attribute (the path that is opened for reading) and a
    basename attribute (the prefix of the output file names).

    The work is done in three passes:
      1. For each sequence type present: the nucleotide frequency histogram,
         the length and quality distributions, and the statistics written
         through self._write_statistics.
      2. When both types are present: the length and quality difference
         distributions, written in the 'cleaned' statistics directory.
      3. For each sequence type present: the quality boxplot.
    Most steps are skipped when their plot file already exists.

    NOTE(review): the nesting below was reconstructed from a source that had
    lost its indentation. In particular, check that the quality difference
    block really belongs inside the length difference condition.
    """
    # statistics directory of the project for the given sequence type
    get_stats_dir = lambda seq_type: os.path.join(self._get_project_path(),
                             BACKBONE_DIRECTORIES['%s_reads_stats' % seq_type])

    #the statistics for both clean and raw sequences
    # Both dicts are filled per sequence type and reused by the second pass.
    lengths = {}
    quals = {}
    for seq_type in ('raw', 'cleaned'):
        stats_dir = get_stats_dir(seq_type)
        stats_fpath = os.path.join(stats_dir,
                                   BACKBONE_BASENAMES['statistics_file'])
        # Opened for appending even when seq_type is not in pair; it is not
        # closed anywhere in this method.
        stats_fhand = open(stats_fpath, 'a')
        if seq_type in pair:
            fpath = pair[seq_type].last_version
            basename = pair[seq_type].basename

            # nucleotide freq per position
            out_fpath = os.path.join(stats_dir, basename + '.freq_position')
            # NOTE(review): the check is done on out_fpath, but the file
            # written here is plot_fpath -- confirm that this is intended.
            if not os.path.exists(out_fpath):
                plot_fpath = out_fpath + '.' + PLOT_FILE_FORMAT
                seqs = seqs_in_file(open(fpath))
                create_nucleotide_freq_histogram(seqs,
                                                 fhand=open(plot_fpath, 'w'))

            #the names for the output files
            out_fpath = os.path.join(stats_dir, basename + '.length')
            plot_fpath = out_fpath + '.' + PLOT_FILE_FORMAT
            distrib_fpath = out_fpath + '.dat'
            # When the length plot exists the rest of this pass is skipped
            # for this type: nothing is stored in lengths/quals and no
            # statistics are written.
            if os.path.exists(plot_fpath):
                continue
            lengths_, quals_ = self._get_lengths_quals_from_file(fpath)
            lengths[seq_type] = lengths_
            quals[seq_type] = quals_

            #the distributions for the lengths
            distrib = lengths_.calculate_distribution()
            lengths_.draw_distribution(distrib,
                                       labels=PLOT_LABELS['seq_length'],
                                       distrib_fhand=open(distrib_fpath, 'w'),
                                       plot_fhand=open(plot_fpath, 'w'))

            #the distributions for the quals
            out_fpath = os.path.join(stats_dir, basename + '.qual')
            plot_fpath = out_fpath + '.' + PLOT_FILE_FORMAT
            distrib_fpath = out_fpath + '.dat'
            # only drawn when at least one quality value was collected
            if quals_.count != 0:
                distrib = quals_.calculate_distribution()
                quals_.draw_distribution(distrib,
                                         labels=PLOT_LABELS['seq_qual'],
                                         plot_fhand=open(plot_fpath, 'w'),
                                         distrib_fhand=open(distrib_fpath, 'w'))

            #the statistics for the statistics file
            self._write_statistics(stats_fhand, fpath, lengths_, quals_)

    #the statistics for the differences
    if 'raw' in pair and 'cleaned' in pair:
        fpath = pair['cleaned'].last_version
        basename = pair['cleaned'].basename

        #the names for the output files
        stats_dir = get_stats_dir('cleaned')
        out_fpath = os.path.join(stats_dir, basename + '.length.diff')
        plot_fpath = out_fpath + '.' + PLOT_FILE_FORMAT
        distrib_fpath = out_fpath + '.dat'

        # NOTE(review): lengths is truthy as soon as one of the two types was
        # stored in the first pass; if the other one was skipped there, the
        # lookups below raise KeyError -- confirm that this cannot happen.
        if not os.path.exists(plot_fpath) and lengths:
            #the distributions for the lengths
            lengths = lengths['raw'], lengths['cleaned']
            self._do_diff_distrib_for_numbers(lengths,
                                              plot_fhand= open(plot_fpath, 'w'),
                                              distrib_fhand= open(distrib_fpath, 'w'),
                                              dist_type='seq_length')
            del lengths

            #the distributions for the quals
            out_fpath = os.path.join(stats_dir, basename + '.qual.diff')
            plot_fpath = out_fpath + '.' + PLOT_FILE_FORMAT
            distrib_fpath = out_fpath + '.dat'
            quals = quals['raw'], quals['cleaned']
            # both sequence types need quality values to be compared
            if quals[0].count != 0 and quals[1].count != 0:
                self._do_diff_distrib_for_numbers(quals,
                                                  plot_fhand= open(plot_fpath, 'w'),
                                                  distrib_fhand= open(distrib_fpath, 'w'),
                                                  dist_type='seq_qual')
            del quals

    # quality boxplots, one per sequence type
    for seq_type in ('raw', 'cleaned'):
        if seq_type in pair:
            stats_dir = get_stats_dir(seq_type)
            fpath = pair[seq_type].last_version
            basename = pair[seq_type].basename

            #the names for the output files
            out_fpath = os.path.join(stats_dir, basename + '.qual.boxplot')
            plot_fpath = out_fpath + '.' + PLOT_FILE_FORMAT
            distrib_fpath = out_fpath + '.dat'

            if os.path.exists(plot_fpath):
                continue

            quals_ = self._get_quals_by_length_from_file(fpath)
            # nothing to plot unless the first position has some quality
            if quals_ and quals_[0]:
                #boxplot
                draw_boxplot(quals_, fhand=open(plot_fpath, 'w'),
                             title=PLOT_LABELS['qual_boxplot']['title'],
                             xlabel=PLOT_LABELS['qual_boxplot']['xlabel'],
                             ylabel=PLOT_LABELS['qual_boxplot']['ylabel'],
                             stats_fhand=open(distrib_fpath, 'w'),
                             max_plotted_boxes=30)
            del quals_
def _do_stats_for_file(self, path, stats_dir):
    """It calculates the stats for a file of seqs.

    path must have a last_version attribute (the path that is opened for
    reading) and a basename attribute (the prefix of the output file names).
    stats_dir is the directory where every output file is written.

    The steps are: the nucleotide frequency histogram, the length
    distribution, the quality distribution, the quality boxplot and the
    statistics written through self._write_statistics.

    NOTE(review): the nesting below was reconstructed from a source that had
    lost its indentation; check the position of the final
    _write_statistics call.
    """
    stats_fpath = os.path.join(stats_dir,
                               BACKBONE_BASENAMES['statistics_file'])
    # opened for appending; it is not closed anywhere in this method
    stats_fhand = open(stats_fpath, 'a')
    fpath = path.last_version
    basename = path.basename

    #output_files
    # NOTE(review): the existence checks below use these extension-less
    # paths, while the files written in this method are the '.dat' and
    # '.' + PLOT_FILE_FORMAT variants -- confirm which file the checks are
    # meant to look for.
    freq_pos_out_fpath = os.path.join(stats_dir, basename + '.freq_position')
    length_out_fpath = os.path.join(stats_dir, basename + '.length')
    qual_out_fpath = os.path.join(stats_dir, basename + '.qual')
    qual_boxplot_out_fpath = os.path.join(stats_dir,
                                          basename + '.qual.boxplot')

    # nucleotide freq per position
    if not os.path.exists(freq_pos_out_fpath):
        plot_fpath = freq_pos_out_fpath + '.' + PLOT_FILE_FORMAT
        seqs = seqs_in_file(open(fpath))
        create_nucleotide_freq_histogram(seqs, fhand=open(plot_fpath, 'w'))

    #Extract lengths and quals
    # NOTE(review): lengths_ and quals_ are only bound when at least one of
    # the two paths is missing, but they are used again further down -- when
    # both paths exist those uses would raise a NameError.
    if (not os.path.exists(length_out_fpath) or
        not os.path.exists(qual_out_fpath)):
        lengths_, quals_ = self._get_lengths_quals_from_file(fpath)

    #the distributions for the lengths
    if not os.path.exists(length_out_fpath):
        plot_fpath = length_out_fpath + '.' + PLOT_FILE_FORMAT
        distrib_fpath = length_out_fpath + '.dat'
        distrib = lengths_.calculate_distribution()
        lengths_.draw_distribution(distrib,
                                   labels=PLOT_LABELS['seq_length'],
                                   distrib_fhand=open(distrib_fpath, 'w'),
                                   plot_fhand=open(plot_fpath, 'w'))

    #the distributions for the quals
    # NOTE(review): with 'or' this branch also runs when no quality value was
    # collected, as long as qual_out_fpath is missing -- possibly 'and' was
    # intended; confirm.
    if not os.path.exists(qual_out_fpath) or quals_.count != 0:
        plot_fpath = qual_out_fpath + '.' + PLOT_FILE_FORMAT
        distrib_fpath = qual_out_fpath + '.dat'
        distrib = quals_.calculate_distribution()
        quals_.draw_distribution(distrib,
                                 labels=PLOT_LABELS['seq_qual'],
                                 plot_fhand=open(plot_fpath, 'w'),
                                 distrib_fhand=open(distrib_fpath, 'w'))

    # qual boxplot
    if not os.path.exists(qual_boxplot_out_fpath):
        quals = self._get_quals_by_length_from_file(fpath)
        # nothing to plot unless the first position has some quality
        if (quals and quals[0]):
            plot_fpath = qual_boxplot_out_fpath + '.' + PLOT_FILE_FORMAT
            distrib_fpath = qual_boxplot_out_fpath + '.dat'
            #boxplot
            draw_boxplot(quals, fhand=open(plot_fpath, 'w'),
                         title=PLOT_LABELS['qual_boxplot']['title'],
                         xlabel=PLOT_LABELS['qual_boxplot']['xlabel'],
                         ylabel=PLOT_LABELS['qual_boxplot']['ylabel'],
                         stats_fhand=open(distrib_fpath, 'w'),
                         max_plotted_boxes=30)

    #the statistics for the statistics file
    self._write_statistics(stats_fhand, fpath, lengths_, quals_)