def _run_and_save_state(action, timestamp):
    #filename = os.path.join('.state', grace.filesystem_friendly_name(action.ident()))
    #temp_filename = os.path.join('.state', 'temp-' + grace.filesystem_friendly_name(action.ident()))
    filename = action.state_filename()
    temp_filename = filename + '.temp'
    if os.path.exists(filename):
        os.unlink(filename)

    if selection.matches(LOCAL.done_selection, [action.shell_name()]):
        result = None
    else:
        result = action.run()

    LOCAL.time = max(LOCAL.time, timestamp)

    action.timestamp = timestamp
    action.timestamp_for = filename
    action.timestamp_cwd = os.getcwd()
    #timestamp_for is used to ensure the action is being
    #run from the same (relative) current directory as previously

    dirname = os.path.dirname(filename)
    if dirname and not os.path.exists(dirname):
        os.mkdir(dirname)
    with open(temp_filename, 'wb') as f:
        pickle.dump(action, f)
    os.rename(temp_filename, filename)

    return result

def run(self):
    if self.output is not None:
        out_file = open(self.output, 'wb')
    else:
        out_file = sys.stdout

    annotation.write_gff3_header(out_file)

    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if 'ID' not in item.attr and 'locus_tag' in item.attr:
                item.attr['ID'] = item.attr['locus_tag']
            if 'color' not in item.attr:
                if item.type == 'CDS':
                    item.attr['color'] = '#008800'
                if item.type == 'rRNA':
                    item.attr['color'] = '#bb0000'
                if item.type == 'tRNA':
                    item.attr['color'] = '#bb00bb'
                if item.type == 'misc_feature':
                    item.attr['color'] = '#8888ff'
            print >> out_file, item.as_gff()

    if self.output is not None:
        out_file.close()

def _get_timestamp(action):
    """ Look for ident() in the .state subdirectory of the current directory.
        If the pickled value matches, return the timestamp. """
    if selection.matches(LOCAL.do_selection, [action.shell_name()]):
        return None

    try:
        for filename in [
                action.state_filename(),
                os.path.join('.state', grace.filesystem_friendly_name(action.ident())), #Old location of state files
                ]:
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old = pickle.load(f)
                if action != old:
                    return None
                if not hasattr(old, 'timestamp'):
                    return None
                if hasattr(old, 'timestamp_for') and old.timestamp_for != filename:
                    return None
                return old.timestamp

                #for parameter in self.parameters:
                #    if parameter.get(self) != parameter.get(old):
                #        print >> sys.stderr, parameter.name, parameter.get(old), '->', parameter.get(self)
    except Exception, error:
        import traceback
        traceback.print_exc()
        print >> sys.stderr, 'Error making %s, re-running: %s' % (action.ident(), error)

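# The two functions above implement a pickle-based "make": _get_timestamp
# decides whether an action can be skipped, and _run_and_save_state runs it
# and records its state atomically. Below is a minimal self-contained sketch
# of the same contract; Step, make, and their parameters are illustrative
# stand-ins, not names from the module above.
import os, pickle

class Step(object):
    def __init__(self, **params):
        self.params = params
    def __eq__(self, other):
        return isinstance(other, Step) and self.params == other.params
    def __ne__(self, other):
        return not self == other
    def run(self):
        print('running with %r' % (self.params,))

def make(step, state_filename):
    if os.path.exists(state_filename):
        with open(state_filename, 'rb') as f:
            if pickle.load(f) == step:
                return                     #Unchanged since last run, skip it
    step.run()
    temp_filename = state_filename + '.temp'
    with open(temp_filename, 'wb') as f:   #Write then rename, so a crash cannot
        pickle.dump(step, f)               #leave a half-written state file behind
    os.rename(temp_filename, state_filename)
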
def run(self):
    features_parent = [
        _Related_feature(item, item.start, item.end, [])
        for item in annotation.read_annotations(self.parent)
        if selection.matches(self.select_parent, [item.type])
        ]
    features_child = [
        _Related_feature(item, item.start, item.end, [])
        for item in annotation.read_annotations(self.child)
        if selection.matches(self.select_child, [item.type])
        ]

    index = { }
    for item in features_child:
        if item.feature.seqid not in index:
            index[item.feature.seqid] = span_index.Span_index()
        index[item.feature.seqid].insert(item)
    for value in index.values():
        value.prepare()

    for item_1 in features_parent:
        if item_1.feature.strand == 1:
            start = item_1.start - self.upstrand
            end = item_1.end + self.downstrand
        elif item_1.feature.strand == -1:
            start = item_1.start - self.downstrand
            end = item_1.end + self.upstrand
        else:
            start = item_1.start - max(self.upstrand, self.downstrand)
            end = item_1.end + max(self.upstrand, self.downstrand)
        if item_1.feature.seqid in index:
            for item_2 in index[item_1.feature.seqid].get(start, end):
                item_1.relations.append(item_2)
                item_2.relations.append(item_1)

    for item in features_parent:
        item.modify_with_relations(self.use, self.to_child, self.to_parent)

    with open(self.prefix + '-parent.gff', 'wb') as f:
        annotation.write_gff3_header(f)
        for item in features_parent:
            print >> f, item.feature.as_gff()

    with open(self.prefix + '-child.gff', 'wb') as f:
        annotation.write_gff3_header(f)
        for item in features_child:
            print >> f, item.feature.as_gff()

def convert(filename):
    info = io.get_file_info(filename)
    ok = selection.matches('type-fastq:[compression-none/compression-gzip/compression-bzip2]', info)
    if ok:
        return filename
    result_name = tempname()
    with open(result_name, 'wb') as f:
        for name, seq, qual in io.read_sequences(filename, qualities='required'):
            io.write_fastq(f, name, seq, qual)
    return result_name

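# The expression in convert() suggests the shape of the selection language:
# ':' joins terms that must all hold, and '/' separates alternatives inside
# '[...]'. The real selection module's grammar is richer than this;
# toy_matches below is a hypothetical, simplified matcher included only to
# make the expression above concrete.
def toy_matches(expression, tags):
    tags = set(tags)
    for part in expression.split(':'):          #Every ':'-joined part must hold
        alternatives = part.strip('[]').split('/')
        if not any(alternative in tags for alternative in alternatives):
            return False
    return True

assert toy_matches('type-fastq:[compression-none/compression-gzip]',
                   ['type-fastq', 'compression-none'])
assert not toy_matches('type-fastq:[compression-none/compression-gzip]',
                       ['type-bam', 'compression-none'])
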
def run(self):
    assert self.change_strand in STRAND_CHANGE, 'Unknown way to change strand.'
    strand_changer = STRAND_CHANGE[self.change_strand]

    shift_start_absolute, shift_start_proportion = decode_shift(self.shift_start)
    shift_end_absolute, shift_end_proportion = decode_shift(self.shift_end)

    renames = [ ]
    if self.rename:
        for item in self.rename.split(','):
            new, old = item.split('=')
            if new != old:
                renames.append((new, old))

    out_file = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(out_file)

    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue

            if self.type:
                item.type = self.type

            length = item.end - item.start
            shift_start = int(math.floor(0.5 + shift_start_absolute + shift_start_proportion*length))
            shift_end = int(math.floor(0.5 + shift_end_absolute + shift_end_proportion*length))

            if item.strand == 1:
                item.start += shift_start
                item.end += shift_end
            elif item.strand == -1:
                item.end -= shift_start
                item.start -= shift_end
            item.start = max(0, item.start) #IGV complains

            item.strand = strand_changer[item.strand]

            old_attr = item.attr.copy()
            for new, old in renames:
                if old in item.attr:
                    del item.attr[old]
            for new, old in renames:
                if old in old_attr:
                    item.attr[new] = old_attr[old]

            print >> out_file, item.as_gff()

    out_file.close()

def modify_with_relations(self, use, to_child, to_parent):
    buckets = collections.defaultdict(list)

    my_strand = self.feature.strand or 0
    for item in self.relations:
        their_strand = item.feature.strand or 0
        overlaps = self.feature.overlaps(item.feature, check_strand=False)

        if my_strand * their_strand == -1:
            if overlaps:
                relation = 'opposite'
            elif item.feature.start*my_strand < self.feature.start*my_strand:
                relation = 'upstrand_opposite'
            else:
                relation = 'downstrand_opposite'
        elif overlaps:
            relation = 'in'
        else:
            strand = my_strand or their_strand
            if not strand:
                relation = 'near'
            elif item.feature.start*strand < self.feature.start*strand:
                relation = 'upstrand'
            else:
                relation = 'downstrand'

        buckets[relation].append(item)

    for name, relatives in buckets.items():
        if selection.matches(use, [name]):
            for relative in relatives:
                self.add_to_attr('has_'+name, relative.feature.get_id())
                relative.add_to_attr('is_'+name, self.feature.get_id())
                relative.add_to_attr('Parent', self.feature.get_id())
                for key in self.feature.attr:
                    if selection.matches(to_child, [key]):
                        relative.add_to_attr(key, self.feature.attr[key])
                for key in relative.feature.attr:
                    if selection.matches(to_parent, [key]):
                        self.add_to_attr(key, relative.feature.attr[key])

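# A worked check of the strand arithmetic in modify_with_relations:
# multiplying start coordinates by the strand (+1 or -1) lets a single
# comparison classify up/downstream relatives in either orientation. On the
# reverse strand a relative with the *larger* start is upstream, and
# negating both sides flips the comparison accordingly.
for strand, self_start, other_start, expected in [
        ( 1, 100,  50, 'upstrand'),     #Forward: smaller start is upstream
        ( 1, 100, 150, 'downstrand'),
        (-1, 100, 150, 'upstrand'),     #Reverse: larger start is upstream
        (-1, 100,  50, 'downstrand'),
        ]:
    relation = 'upstrand' if other_start*strand < self_start*strand else 'downstrand'
    assert relation == expected
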
def run(self):
    annotations = [ ]
    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if self.type:
                item.type = self.type
            annotations.append(item)

    annotations.sort(key=lambda item: (item.seqid, item.strand, item.start))

    group = [ ]
    groups = [ ]
    def emit():
        if not group: return
        groups.append(group[:])
        del group[:]

    seqid = None
    strand = None
    end = 0
    for item in annotations:
        if item.seqid != seqid or item.strand != strand or item.start >= end:
            emit()
            seqid = item.seqid
            strand = item.strand
            end = item.end - self.overlap
        group.append(item)
        end = max(item.end - self.overlap, end)
    emit()

    out_file = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(out_file)

    for group in groups:
        item = annotation.Annotation()
        item.source = group[0].source
        item.type = join_descriptions( item2.type for item2 in group )
        item.seqid = group[0].seqid
        item.strand = group[0].strand
        item.start = min( item2.start for item2 in group )
        item.end = max( item2.end for item2 in group )
        item.score = None
        item.phase = None
        item.attr = { }
        for item2 in group:
            for key in item2.attr:
                if key in item.attr: continue
                item.attr[key] = join_descriptions( item3.attr[key] for item3 in group if key in item3.attr )
        print >> out_file, item.as_gff()

    out_file.close()

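# A self-contained sketch of the grouping sweep used above, on plain
# (start, end) tuples. A new group starts whenever the next interval begins
# at or beyond the running end minus `overlap`, so a larger `overlap`
# demands deeper overlap before merging and a negative value tolerates
# gaps. group_spans is an illustrative name, not part of the module above.
def group_spans(spans, overlap=0):
    groups = [ ]
    group = [ ]
    end = 0
    for start, stop in sorted(spans):
        if not group or start >= end:
            if group:
                groups.append(group)
            group = [ ]
            end = stop - overlap
        group.append((start, stop))
        end = max(stop - overlap, end)
    if group:
        groups.append(group)
    return groups

assert group_spans([(0,10), (8,20), (30,40)]) == [[(0,10), (8,20)], [(30,40)]]
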
def classify_files(filenames, selectors):
    """ Put each of a set of files into one or more categories. """
    results = [ [ ] for selector in selectors ]
    for filename in filenames:
        info = get_file_info(filename)
        any = False
        for i, selector in enumerate(selectors):
            if selection.matches(selector, info):
                results[i].append(filename)
                any = True
        if not any:
            raise grace.Error('Don\'t know what to do with ' + filename)
    return results

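# Hypothetical usage of classify_files, assuming it runs in the module that
# provides get_file_info, and that file info carries tags in the style seen
# in convert() above. The filenames and selector expressions are made up;
# note that a file may land in more than one bucket.
#
#     fastqs, gzipped = classify_files(
#         ['reads1.fastq', 'reads2.fastq.gz'],
#         ['type-fastq', 'compression-gzip'])
#     # fastqs  == ['reads1.fastq', 'reads2.fastq.gz']
#     # gzipped == ['reads2.fastq.gz']
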
def _write_table(self, samples, items):
    names = [ '%s:%d' % (item.record.CHROM, item.record.POS) for item in items ]
    sample_list = io.named_list_type(samples)
    groups = [ ]

    locations_list = io.named_list_type(['CHROM', 'POS'])
    locations = io.named_list_type(names, locations_list)([
        locations_list([ item.record.CHROM, item.record.POS ])
        for item in items
        ])
    groups.append(('Location', locations))

    genotypes = io.named_list_type(names, sample_list)([
        sample_list([ describe_genotype(item2, item.variants) for item2 in item.genotypes ])
        for item in items
        ])
    groups.append(('Genotype', genotypes))

    if self.qualities:
        qualities = io.named_list_type(names, sample_list)([
            sample_list(item.qualities)
            for item in items
            ])
        groups.append(('Quality', qualities))

    if self.counts:
        counts = io.named_list_type(names, sample_list)([
            sample_list([ describe_counts(item2, item.variants) for item2 in item.counts ])
            for item in items
            ])
        groups.append(('Count', counts))

    annotation_list = io.named_list_type(['snpeff'])
    annotations = io.named_list_type(names, annotation_list)([
        annotation_list([
            ' /// '.join(
                item2[0]
                for item2 in item.snpeff
                if selection.matches(self.snpeff_show, item2[1])
                )
            ])
        for item in items
        ])
    groups.append(('Annotation', annotations))

    io.write_grouped_csv(self.prefix + '.csv', groups)

def _make_inner(action):
    timestamp = coordinator().time()
    assert timestamp > LOCAL.time, 'Time running in reverse.'

    cores = action.cores_required()
    if cores > 1:
        coordinator().trade_cores(1, cores)
    try:
        config.write_colored_text(sys.stderr, '\n' + action.describe() + '\n')

        if LOCAL.abort_make and not selection.matches(LOCAL.do_selection, [action.shell_name()]):
            raise grace.Error('%s would be run. Stopping here.' % action.ident())

        old_status = grace.status(action.shell_name())
        try:
            _run_and_save_state(action, timestamp)
        finally:
            grace.status(old_status)
    finally:
        if cores > 1:
            coordinator().trade_cores(cores, 1)

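# _make_inner's core accounting can be pictured with a semaphore standing in
# for the coordinator: a worker that already holds one core slot gives it
# back and takes `cores` slots around a multi-core action, then trades back
# down. Cores and pool below are illustrative, not the coordinator's actual
# API.
import threading

class Cores(object):
    def __init__(self, total):
        self.available = threading.Semaphore(total)
    def trade(self, giving, taking):
        for _ in xrange(giving):
            self.available.release()
        for _ in xrange(taking):
            self.available.acquire()

pool = Cores(8)
pool.trade(0, 1)        #Each worker starts out holding one core
pool.trade(1, 4)        #Trade it for four before a multi-core action
try:
    pass                #... run the action ...
finally:
    pool.trade(4, 1)    #... and trade back down afterwards
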
def run(self):
    assert self.change_strand in STRAND_CHANGE, 'Unknown way to change strand.'
    strand_changer = STRAND_CHANGE[self.change_strand]

    out_file = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(out_file)

    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if item.strand == 1:
                item.start += self.shift_start
                item.end += self.shift_end
            elif item.strand == -1:
                item.end -= self.shift_start
                item.start -= self.shift_end
            item.strand = strand_changer[item.strand]
            print >> out_file, item.as_gff()

    out_file.close()

def _create_json(self):
    workspace = io.Workspace(self.output_dir, must_exist=False)

    samples = [ ]
    groups = [ ]
    for sample in self.samples:
        this_groups = [ ]
        for item in self.groups:
            if selection.matches(
                    selection.term_specification(item),
                    sample.tags + [ sample.output_dir ]):
                this_groups.append(selection.term_name(item))
        group = ','.join(this_groups) if this_groups else 'ungrouped'
        if group not in groups:
            groups.append(group)

        item = {
            'name' : sample.output_dir,
            'bam' : os.path.abspath(
                workspace/('samples', sample.output_dir, 'alignments_filtered_sorted.bam')),
            'group' : group,
            'tags' : sample.tags,
            }
        samples.append(item)

    obj = collections.OrderedDict()
    obj['reference'] = os.path.abspath(self.reference)
    obj['extension'] = self.extension
    obj['genes'] = os.path.abspath(workspace/('peaks', 'relation-parent.gff'))
    obj['peaks'] = os.path.abspath(workspace/('peaks', 'relation-child.gff'))
    obj['groups'] = groups
    obj['samples'] = samples

    with open(workspace/"plotter-config.json", "wb") as f:
        json.dump(obj, f, indent=4)

def matches(self, expression):
    return selection.matches(expression, self.get_tags())

def run(self):
    data = io.read_grouped_table(
        self.counts,
        [('Count',str), ('Annotation',str), ('Tail_count',str), ('Tail',str), ('Proportion',str)],
        'Count',
        )

    features = data['Count'].keys()
    samples = data['Count'].value_type().keys()

    tags = { }
    for sample in samples:
        tags[sample] = [sample]
    for line in data.comments:
        if line.startswith('#sampleTags='):
            parts = line[len('#sampleTags='):].split(',')
            tags[parts[0]] = parts

    group_names = [ ]
    groups = [ ]
    group_tags = [ ]

    for item in self.groups:
        select = selection.term_specification(item)
        name = selection.term_name(item)
        group = [ item for item in samples if selection.matches(select, tags[item]) ]
        assert group, 'Empty group: ' + name

        this_group_tags = [ name ]
        for tag in tags[group[0]]:
            if tag == name: continue
            for item in group[1:]:
                for item2 in tags[item]:
                    if tag not in item2: break
                else:
                    this_group_tags.append(tag)

        group_names.append(name)
        groups.append(group)
        group_tags.append(this_group_tags)

    result = io.Grouped_table()
    result.comments = [ '#Counts' ]
    for item in group_tags:
        result.comments.append('#sampleTags=' + ','.join(item))

    count = [ ]
    tail_count = [ ]
    tail = [ ]
    proportion = [ ]
    for feature in features:
        this_count = [ ]
        this_tail_count = [ ]
        this_tail = [ ]
        this_proportion = [ ]
        for group in groups:
            this_this_count = [ ]
            this_this_tail_count = [ ]
            this_this_tail = [ ]
            this_this_proportion = [ ]
            for sample in group:
                this_this_count.append(int(data['Count'][feature][sample]))
                this_this_tail_count.append(int(data['Tail_count'][feature][sample]))
                item = data['Tail'][feature][sample]
                if item != 'NA':
                    this_this_tail.append(float(item))
                item = data['Proportion'][feature][sample]
                if item != 'NA':
                    this_this_proportion.append(float(item))

            this_count.append(str(sum(this_this_count)))
            this_tail_count.append(str(sum(this_this_tail_count)))
            this_tail.append(str(sum(this_this_tail)/len(this_this_tail)) if this_this_tail else 'NA')
            this_proportion.append(str(sum(this_this_proportion)/len(this_this_proportion)) if this_this_proportion else 'NA')

        count.append(this_count)
        tail_count.append(this_tail_count)
        tail.append(this_tail)
        proportion.append(this_proportion)

    matrix = io.named_matrix_type(features, group_names)
    result['Count'] = matrix(count)
    result['Annotation'] = data['Annotation']
    result['Tail_count'] = matrix(tail_count)
    result['Tail'] = matrix(tail)
    result['Proportion'] = matrix(proportion)
    result.write_csv(self.prefix + '.csv')

def run(self):
    reader_f = io.open_possibly_compressed_file(self.vcf)
    reader = vcf.Reader(reader_f)

    tags = { }
    for item in reader.metadata.get('sampleTags', []):
        parts = item.split(',')
        tags[parts[0]] = parts

    assert 'reference' not in reader.samples, 'Can\'t have a sample called reference, sorry.'
    samples = ['reference'] + reader.samples

    for sample in samples:
        if sample not in tags:
            tags[sample] = [sample, 'all']

    samples = selection.select_and_sort(
        self.select, self.sort, samples, lambda sample: tags[sample])

    required = [
        i for i, sample in enumerate(samples)
        if selection.matches(self.require, tags[sample])
        ]

    sample_number = dict((b, a) for a, b in enumerate(reader.samples))

    items = [ ]
    for record in reader:
        variants = get_variants(record)
        genotypes = [ ]
        counts = [ ]
        qualities = [ ]
        for sample in samples:
            if sample == 'reference':
                genotypes.append([0])
                counts.append([1])
                qualities.append(float('inf'))
            else:
                genotypes.append(get_genotype(record.samples[sample_number[sample]]))
                counts.append(get_variant_counts(record.samples[sample_number[sample]]))
                qualities.append(record.samples[sample_number[sample]].data.GQ)

        # Only output when there are at least two genotypes
        any_interesting = False
        for i in xrange(len(genotypes)):
            for j in xrange(i):
                if (genotypes[i] is not None and genotypes[j] is not None and
                        not genotypes_equal(genotypes[i], genotypes[j])):
                    any_interesting = True
                    break
            if any_interesting:
                break
        if not any_interesting:
            continue

        if any(genotypes[i] is None for i in required):
            continue

        if self.only_snps and any(
                genotype is not None and any(len(variants[i]) != 1 for i in genotype)
                for genotype in genotypes):
            continue

        snpeff = snpeff_describe(record.INFO.get('EFF', ''))
        if not any(
                selection.matches(self.snpeff_filter, item[1])
                for item in (snpeff or [('', [])])):
            continue

        items.append(_Nway_record(
            variants=variants, genotypes=genotypes, counts=counts,
            qualities=qualities, snpeff=snpeff, record=record))

    self.log.log('%d variants\n\n' % len(items))

    if self.as_ == 'table':
        self._write_table(samples, items)
    elif self.as_ == 'nexus':
        self._write_nexus(samples, items)
    elif self.as_ == 'splitstree':
        self._write_nexus(samples, items)
        io.execute(
            'SplitsTree +g -i INPUT -x COMMAND',
            no_display=True,
            INPUT=self.prefix + '.nex',
            COMMAND='UPDATE; '
                'SAVE FILE=\'%s.nex\' REPLACE=yes; '
                'EXPORTGRAPHICS format=svg file=\'%s.svg\' REPLACE=yes TITLE=\'NeighborNet from %d variants\'; '
                'QUIT'
                % (self.prefix, self.prefix, len(items)),
            )
    elif self.as_ == 'vcf':
        self._write_vcf(samples, items, reader)
    else:
        raise grace.Error('Unknown output format: ' + self.as_)

def run(self):
    annotations = [ ]
    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if self.type:
                item.type = self.type
            annotations.append(item)

    annotations.sort(key=lambda item: (item.type, item.seqid, item.strand, item.start))

    group = [ ]
    groups = [ ]
    def emit():
        if not group: return
        groups.append(group[:])
        del group[:]

    type = None
    seqid = None
    strand = None
    end = 0
    for item in annotations:
        if item.type != type or item.seqid != seqid or item.strand != strand or item.start >= end:
            emit()
            type = item.type
            seqid = item.seqid
            strand = item.strand
            end = item.end - self.overlap
        group.append(item)
        end = max(item.end - self.overlap, end)
    emit()

    items = [ ]
    id_map = { }
    for group in groups:
        item = annotation.Annotation()
        item.source = group[0].source
        item.type = group[0].type
        item.seqid = group[0].seqid
        item.strand = group[0].strand
        item.start = min(item2.start for item2 in group)
        item.end = max(item2.end for item2 in group)
        item.score = None
        item.phase = None
        item.attr = { }
        for item2 in group:
            for key in item2.attr:
                if key in item.attr: continue
                item.attr[key] = join_descriptions(
                    [ item3.attr[key] for item3 in group if key in item3.attr ],
                    self.joiner)
        item.parents = [ ]
        for item2 in group:
            if 'ID' in item2.attr:
                assert item2.attr['ID'] not in id_map, 'Duplicate ID: ' + item2.attr['ID']
                id_map[item2.attr['ID']] = item.attr['ID']
            if 'Parent' in item2.attr:
                item.parents.append(item2.attr['Parent'])
        items.append(item)

    for item in items:
        if item.parents:
            item.attr['Parent'] = join_descriptions(
                [ id_map.get(parent, parent) for parent in item.parents ], ',')

    with open(self.prefix + '.gff', 'wb') as out_file:
        annotation.write_gff3_header(out_file)
        for item in items:
            print >> out_file, item.as_gff()

def run(self):
    title = self.get_title()

    n_alt = len(self.alt)
    n_null = len(self.null)

    suffix = '-dedup' if self.dedup else ''

    genewise_filename = join(self.analysis, 'expression', 'genewise'+suffix, 'counts.csv')
    genewise_norm_filename = join(self.analysis, 'expression', 'genewise'+suffix, 'norm.csv')

    peakwise_filename = join(self.analysis, 'expression', 'peakwise'+suffix, 'counts.csv')
    peakwise_norm_filename = join(self.analysis, 'expression', 'peakwise'+suffix, 'norm.csv')

    pairwise_filename = join(self.analysis, 'peak-shift'+suffix, 'individual-pairs.csv')
    pairwise_norm_filename = join(self.analysis, 'peak-shift'+suffix, 'individual-pairs-norm.csv')

    reader = io.Table_reader(genewise_filename, 'Count')
    reader.close()

    samples = [ item for i, item in enumerate(reader.headings) if reader.groups[i] == 'Count' ]

    tags = { }
    for item in samples:
        tags[item] = [item]
    for line in reader.comments:
        if line.startswith('#sampleTags='):
            parts = line[len('#sampleTags='):].split(',')
            tags[parts[0]] = parts

    model = [ ]
    for term in self.alt + self.null:
        spec = term_specification(term)
        model.append([ 1 if selection.matches(spec, tags[item]) else 0 for item in samples ])
    model = zip(*model) #Transpose

    select = [ any(row) for row in model ]
    model = [ row for row, selected in zip(model, select) if selected ]
    model_columns = [ term_name(item) for item in self.alt + self.null ]

    pairs_n_alt = n_alt
    pairs_select = select + select
    pairs_model = (
        [ (0,)*n_alt + row + (0,) for row in model ] +
        [ row[:n_alt] + row + (1,) for row in model ]
        )
    pairs_model_columns = (
        [ item + '-interaction' for item in model_columns[:n_alt] ] +
        model_columns +
        [ 'pair2' ]
        )

    workspace = self.get_workspace()

    runr.run_script(TEST_R, self.tell,
        SOURCE = os.path.join(os.path.dirname(__file__), 'tail_tools.R'),
        DIR = workspace.working_dir,
        MIN_READS = self.min_reads,
        GENEWISE_FILENAME = genewise_filename,
        GENEWISE_NORM_FILENAME = genewise_norm_filename,
        PEAKWISE_FILENAME = peakwise_filename,
        PEAKWISE_NORM_FILENAME = peakwise_norm_filename,
        PAIRWISE_FILENAME = pairwise_filename,
        PAIRWISE_NORM_FILENAME = pairwise_norm_filename,
        N_ALT = n_alt,
        SELECT = select,
        MODEL = model,
        MODEL_COLUMNS = model_columns,
        PAIRS_N_ALT = pairs_n_alt,
        PAIRS_SELECT = pairs_select,
        PAIRS_MODEL = pairs_model,
        PAIRS_MODEL_COLUMNS = pairs_model_columns,
        )
    if self.tell: return

    reporter = reporting.Reporter(workspace.working_dir, title)

    if self.dedup:
        reporter.p('Read deduplication was used.')

    for entities, result, aveexpr, subtitle, terms in [
            ('genes', 'genewise-voom', 'avg.expression', 'Genewise expression level', model_columns[:n_alt]),
            ('genes', 'genewise-tail', 'avg.tail', 'Genewise tail length', model_columns[:n_alt]),
            ('peaks', 'peakwise-voom', 'avg.expression', 'Peakwise expression level', model_columns[:n_alt]),
            ('peaks', 'peakwise-tail', 'avg.tail', 'Peakwise tail length', model_columns[:n_alt]),
            ('peak pairs', 'pairwise-voom', 'avg.expression', 'Peak-pair expression shift', pairs_model_columns[:n_alt]),
            ('peak pairs', 'pairwise-tail', 'avg.tail', 'Peak-pair tail length shift', pairs_model_columns[:n_alt]),
            ]:
        #data = io.read_grouped_table(workspace/(result+'-toptable.csv'))['All']
        #n = 0
        #n_01 = 0
        #n_05 = 0
        #for row in data.values():
        #    fdr = float(row['adj.P.Val'])
        #    if fdr <= 0.01: n_01 += 1
        #    if fdr <= 0.05: n_05 += 1
        #    n += 1

        io.execute([
            'degust.py',
            '--name', title + ' : ' + subtitle,
            '--avg', aveexpr,
            '--primary', 'baseline',
            '--logFC', ','.join(terms),
            '--fdr', 'adj.P.Val',
            '--info', 'gene,locus_tag,product,reads,polya.reads,tail.lengths,' + aveexpr,
            '--notour', '1',
            '--out', workspace/(result+'.html'),
            workspace/(result+'-toptable.csv'),
            ])

        reporter.subheading( reporter.href(workspace/(result+'.html'), subtitle) )
        #reporter.p( '%d %s, %d with fdr<=0.01, %d with fdr<=0.05' % (n,entities,n_01,n_05) )
        with open(workspace/(result+'.txt'), 'rU') as f:
            for line in f:
                reporter.write(line.strip() + '<br/>\n')

    reporter.close()