def main():
    """Concatenate two interval datasets into a single output dataset.

    Command-line driver: parses column specifications for both inputs,
    wraps each input in a NiceReaderWrapper, streams the result of
    ``concat`` to the output file, and reports skipped lines per input.
    """
    sameformat = False
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        if options.sameformat:
            sameformat = True
        in_file_1, in_file_2, out_fname = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        # (Narrowed from a bare ``except:`` so Ctrl-C/SystemExit propagate.)
        doc_optparse.exception()
    g1 = NiceReaderWrapper( fileinput.FileInput( in_file_1 ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in_file_2 ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )
    # ``with`` guarantees the output file is closed on every exit path,
    # including unexpected exceptions (the original closed it manually).
    with open( out_fname, "w" ) as out_file:
        try:
            for line in concat( [g1, g2], sameformat=sameformat ):
                if type( line ) is GenomicInterval:
                    out_file.write( "%s\n" % "\t".join( line.fields ) )
                else:
                    out_file.write( "%s\n" % line )
        except ParseError as exc:
            fail( "Invalid file format: %s" % str( exc ) )
    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
    if g2.skipped > 0:
        print(skipped( g2, filedesc=" of 2nd dataset" ))
def main():
    """Compute coverage of intervals in the 1st dataset by the 2nd dataset.

    Command-line driver: parses column specifications for both inputs and
    streams the result of ``coverage`` to the output file.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        in_fname, in2_fname, out_fname = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )
    # ``with`` closes the output on every exit path (the original relied on
    # explicit close() calls and leaked the handle on unexpected errors).
    with open( out_fname, "w" ) as out_file:
        try:
            for line in coverage( [g1, g2] ):
                if type( line ) is GenomicInterval:
                    out_file.write( "%s\n" % "\t".join( line.fields ) )
                else:
                    out_file.write( "%s\n" % line )
        except ParseError as exc:
            fail( "Invalid file format: %s" % str( exc ) )
    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
    if g2.skipped > 0:
        print(skipped( g2, filedesc=" of 2nd dataset" ))
def main():
    """Merge overlapping intervals within a dataset.

    With ``--threecol`` only chrom/start/end are emitted; otherwise all
    original columns are kept.  ``--mincols`` is the minimum overlap (in
    columns/bases, per bx ``merge``) required to merge; defaults to 1.
    """
    mincols = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.mincols:
            mincols = int(options.mincols)
        in_fname, out_fname = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    # ``with`` guarantees the output file is closed on every exit path.
    with open(out_fname, "w") as out_file:
        try:
            for line in merge(g1, mincols=mincols):
                if options.threecol:
                    # Three-column output: chrom, start, end only.
                    if type(line) is GenomicInterval:
                        # NOTE(review): attribute names startCol/endCol —
                        # confirm against the GenomicInterval implementation.
                        out_file.write("%s\t%s\t%s\n" % (
                            line.chrom, str(line.startCol),
                            str(line.endCol)))
                    elif type(line) is list:
                        out_file.write("%s\t%s\t%s\n" % (
                            line[chr_col_1], str(line[start_col_1]),
                            str(line[end_col_1])))
                    else:
                        out_file.write("%s\n" % line)
                else:
                    # Full-width output: keep every original column.
                    if type(line) is GenomicInterval:
                        out_file.write("%s\n" % "\t".join(line.fields))
                    elif type(line) is list:
                        out_file.write("%s\n" % "\t".join(line))
                    else:
                        out_file.write("%s\n" % line)
        except ParseError as exc:
            fail("Invalid file format: %s" % str(exc))
    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
def main():
    """Merge overlapping intervals within a dataset (spaced-style variant).

    With ``--threecol`` only chrom/start/end are emitted; otherwise all
    original columns are kept.  ``--mincols`` defaults to 1.
    """
    mincols = 1
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.mincols:
            mincols = int( options.mincols )
        in_fname, out_fname = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    # ``with`` guarantees the output file is closed on every exit path.
    with open( out_fname, "w" ) as out_file:
        try:
            for line in merge(g1, mincols=mincols):
                if options.threecol:
                    # Three-column output: chrom, start, end only.
                    if type( line ) is GenomicInterval:
                        # NOTE(review): attribute names startCol/endCol —
                        # confirm against the GenomicInterval implementation.
                        out_file.write( "%s\t%s\t%s\n" % ( line.chrom, str( line.startCol ), str( line.endCol ) ) )
                    elif type( line ) is list:
                        out_file.write( "%s\t%s\t%s\n" % ( line[chr_col_1], str( line[start_col_1] ), str( line[end_col_1] ) ) )
                    else:
                        out_file.write( "%s\n" % line )
                else:
                    # Full-width output: keep every original column.
                    if type( line ) is GenomicInterval:
                        out_file.write( "%s\n" % "\t".join( line.fields ) )
                    elif type( line ) is list:
                        out_file.write( "%s\n" % "\t".join( line ) )
                    else:
                        out_file.write( "%s\n" % line )
        except ParseError as exc:
            fail( "Invalid file format: %s" % str( exc ) )
    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
def main():
    """Count the total number of bases covered by intervals in a dataset.

    Writes a single integer (result of bx ``base_coverage``) to the
    output file.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        in_fname, out_fname = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    try:
        bases = base_coverage(g1)
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )
    # ``with`` guarantees the output file is closed even if write fails.
    with open( out_fname, "w" ) as out_file:
        out_file.write( "%s\n" % str( bases ) )
    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))
# NOTE(review): the following single line is a *fragment* — the tail of a
# cluster-tool main() (smallest/largest-interval selection, file cleanup and
# the __main__ guard) that begins mid-loop; its enclosing definition is not
# visible here, so it is kept byte-identical.  It uses Python 2 syntax
# ("except Exception, exc", "print >> sys.stderr", print statements) and must
# be ported before running under Python 3 — compare the Python 3 variants of
# the same logic elsewhere in this file.
# should only execute this code once per line fileline = fileLines[line].rstrip("\n\r") try: cluster_interval = GenomicInterval( g1, fileline.split("\t"), g1.chrom_col, g1.start_col, g1.end_col, g1.strand_col, g1.default_strand, g1.fix_strand ) except Exception, exc: print >> sys.stderr, str( exc ) f1.close() sys.exit() interval_size = cluster_interval.end - cluster_interval.start if outsize == -1 or \ ( outsize > interval_size and output == 4 ) or \ ( outsize < interval_size and output == 5 ): outinterval = cluster_interval outsize = interval_size out_file.write( "%s\n" % outinterval ) f1.close() out_file.close() if g1.skipped > 0: print skipped( g1, filedesc="" ) if __name__ == "__main__": main()
def main():
    """Find the closest feature in the 2nd dataset for intervals in the 1st.

    Inputs may be standard interval format or GFF (``--gff1``/``--gff2``).
    ``direction`` (positional arg) selects up/down-stream search.  Output is
    either GFF with an added ``closest_feature`` attribute or BED columns
    followed by the closest feature's columns.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        in1_gff_format = bool( options.gff1 )
        in2_gff_format = bool( options.gff2 )
        in_fname, in2_fname, out_fname, direction = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper
    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
                             chrom_col=chr_col_1,
                             start_col=start_col_1,
                             end_col=end_col_1,
                             strand_col=strand_col_1,
                             fix_strand=True )
    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
                             chrom_col=chr_col_2,
                             start_col=start_col_2,
                             end_col=end_col_2,
                             strand_col=strand_col_2,
                             fix_strand=True )
    # Find flanking features.  ``with`` ensures the output file is closed on
    # every exit path — the original never closed it at all.
    with open( out_fname, "w" ) as out_file:
        try:
            for result in proximal_region_finder([g1, g2], direction):
                if type( result ) is list:
                    line, closest_feature = result
                    # Need to join outputs differently depending on file types.
                    if in1_gff_format:
                        # Output is GFF with added attribute 'closest feature.'
                        # Intervals are in BED coordinates; need to convert to GFF.
                        line = convert_bed_coords_to_gff( line )
                        closest_feature = convert_bed_coords_to_gff( closest_feature )
                        # Escape double quotes in the closest feature's attributes.
                        out_file.write( "%s closest_feature \"%s\" \n" %
                                        ( "\t".join( line.fields ),
                                          "\t".join( closest_feature.fields ).replace( "\"", "\\\"" ) ) )
                    else:
                        # Output is BED + closest feature fields.
                        output_line_fields = []
                        output_line_fields.extend( line.fields )
                        output_line_fields.extend( closest_feature.fields )
                        out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) )
                else:
                    out_file.write( "%s\n" % result )
        except ParseError as exc:
            fail( "Invalid file format: %s" % str( exc ) )
    print("Direction: %s" % (direction))
    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
    if g2.skipped > 0:
        print(skipped( g2, filedesc=" of 2nd dataset" ))
def main():
    """Subtract intervals of the 2nd dataset from the 1st dataset.

    Inputs may be standard interval format or GFF (``--gff1``/``--gff2``).
    ``--pieces`` emits the remaining pieces of split intervals; ``--mincols``
    is the minimum overlap required (default 1).
    """
    mincols = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        pieces = bool(options.pieces)
        in1_gff_format = bool(options.gff1)
        in2_gff_format = bool(options.gff2)
        in_fname, in2_fname, out_fname = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper
    g1 = in1_reader_wrapper(fileinput.FileInput(in_fname),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True)
    if in1_gff_format:
        # Subtract requires coordinates in BED format.
        g1.convert_to_bed_coord = True
    g2 = in2_reader_wrapper(fileinput.FileInput(in2_fname),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True)
    if in2_gff_format:
        # Subtract requires coordinates in BED format.
        g2.convert_to_bed_coord = True
    # ``with`` guarantees the output file is closed on every exit path.
    with open(out_fname, "w") as out_file:
        try:
            for feature in subtract([g1, g2], pieces=pieces, mincols=mincols):
                if isinstance(feature, GFFFeature):
                    # Convert back to GFF coordinates since reader converted
                    # automatically.
                    convert_bed_coords_to_gff(feature)
                    for interval in feature.intervals:
                        out_file.write("%s\n" % "\t".join(interval.fields))
                elif isinstance(feature, GenomicInterval):
                    out_file.write("%s\n" % "\t".join(feature.fields))
                else:
                    out_file.write("%s\n" % feature)
        except ParseError as exc:
            fail("Invalid file format: %s" % str(exc))
    # NOTE(review): g1 is labelled "2nd dataset" and g2 "1st dataset" —
    # this mirrors the tool's UI argument order; confirm before "fixing".
    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 2nd dataset"))
    if g2.skipped > 0:
        print(skipped(g2, filedesc=" of 1st dataset"))
def main():
    """Find the closest feature in the 2nd dataset for intervals in the 1st.

    Compact-style variant of the flanking-features driver: inputs may be
    standard interval format or GFF; output is GFF plus a
    ``closest_feature`` attribute, or BED columns plus the closest
    feature's columns.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        in1_gff_format = bool(options.gff1)
        in2_gff_format = bool(options.gff2)
        in_fname, in2_fname, out_fname, direction = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper
    g1 = in1_reader_wrapper(fileinput.FileInput(in_fname),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True)
    g2 = in2_reader_wrapper(fileinput.FileInput(in2_fname),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True)
    # Find flanking features.  ``with`` ensures the output file is closed on
    # every exit path — the original never closed it at all.
    with open(out_fname, "w") as out_file:
        try:
            for result in proximal_region_finder([g1, g2], direction):
                if type(result) is list:
                    line, closest_feature = result
                    # Need to join outputs differently depending on file types.
                    if in1_gff_format:
                        # Output is GFF with added attribute 'closest feature.'
                        # Intervals are in BED coordinates; need to convert to GFF.
                        line = convert_bed_coords_to_gff(line)
                        closest_feature = convert_bed_coords_to_gff(
                            closest_feature)
                        # Escape double quotes in the closest feature's
                        # attributes.
                        out_file.write(
                            "%s closest_feature \"%s\" \n" %
                            ("\t".join(line.fields), "\t".join(
                                closest_feature.fields).replace("\"", "\\\"")))
                    else:
                        # Output is BED + closest feature fields.
                        output_line_fields = []
                        output_line_fields.extend(line.fields)
                        output_line_fields.extend(closest_feature.fields)
                        out_file.write("%s\n" % ("\t".join(output_line_fields)))
                else:
                    out_file.write("%s\n" % result)
        except ParseError as exc:
            fail("Invalid file format: %s" % str(exc))
    print("Direction: %s" % (direction))
    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
    if g2.skipped > 0:
        print(skipped(g2, filedesc=" of 2nd dataset"))
def main():
    """Join two interval datasets on overlapping intervals.

    ``--fill`` ("left", "right" or "both") emits unmatched rows from the
    corresponding side(s); ``--mincols`` is the minimum overlap required
    (default 1).
    """
    mincols = 1
    leftfill = False
    rightfill = False
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        if options.fill:
            # "both" fills unmatched rows on either side; otherwise fill
            # only the requested side.
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        in_fname, in2_fname, out_fname = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)
    # ``with`` guarantees the output file is closed on every exit path.
    with open(out_fname, "w") as out_file:
        try:
            for outfields in join(g1, g2, mincols=mincols,
                                  rightfill=rightfill, leftfill=leftfill):
                if type(outfields) is list:
                    out_file.write("%s\n" % "\t".join(outfields))
                else:
                    out_file.write("%s\n" % outfields)
        except ParseError as exc:
            fail("Invalid file format: %s" % str(exc))
        except MemoryError:
            # join() buffers one side in memory; report a clear error
            # instead of a raw traceback when it does not fit.
            fail("Input datasets were too large to complete the join operation.")
    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
    if g2.skipped > 0:
        print(skipped(g2, filedesc=" of 2nd dataset"))
def main():
    """Complement a dataset: emit regions NOT covered by its intervals.

    ``--lengths`` names a chromosome-length (LEN) file; with ``--all`` whole
    chromosomes are built from it and the input is subtracted from them,
    otherwise bx ``complement`` is used with the per-chromosome lengths.
    """
    allchroms = False
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome.  The lengths
    # are added to the lens dict and passed to the complement operation in bx.
    dbfile = fileinput.FileInput( lengths )
    if dbfile:
        if not allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except Exception:
                # Best effort: assume LEN doesn't exist or is corrupt somehow.
                pass
        elif allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0], "0", str(end)]))
            except Exception:
                # Same best-effort policy as above.
                pass
    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement
    if allchroms and len(chroms) == 0:
        allchroms = False
    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)
    # ``with`` guarantees the output file is closed on every exit path.
    with open( out_fname, "w" ) as out_file:
        try:
            for interval in generator:
                if type( interval ) is GenomicInterval:
                    # NOTE(review): joins the interval object directly rather
                    # than interval.fields — relies on GenomicInterval being
                    # iterable over its fields; confirm against bx-python.
                    out_file.write( "%s\n" % "\t".join( interval ) )
                else:
                    out_file.write( "%s\n" % interval )
        except ParseError as exc:
            fail( "Invalid file format: %s" % str( exc ) )
    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))
def main():
    """Cluster nearby intervals and emit clusters in one of five modes.

    ``--output`` selects the mode: 1 = merge each cluster into a single
    interval, 2 = original lines filtered to cluster members (file order
    preserved), 3 = original lines grouped by cluster, 4 = smallest
    interval per cluster, 5 = largest interval per cluster.
    ``--distance`` (or ``--overlap``, its negation) and ``--minregions``
    control clustering.
    """
    distance = 0
    minregions = 2
    output = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            # Required overlap is expressed as a negative distance.
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1, mincols=distance,
                                        minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
    # ``with`` closes both files on every exit path; the original leaked
    # out_file when sys.exit() was reached in the output-4/5 branch.
    with open(in_fname, "r") as f1, open(out_fname, "w") as out_file:
        # If "merge"
        if output == 1:
            fields = ["." for x in range(max(g1.chrom_col, g1.start_col,
                                             g1.end_col) + 1)]
            for chrom, tree in clusters.items():
                for start, end, lines in tree.getregions():
                    fields[g1.chrom_col] = chrom
                    fields[g1.start_col] = str(start)
                    fields[g1.end_col] = str(end)
                    out_file.write("%s\n" % "\t".join(fields))
        # If "filtered" we preserve order of file and comments, etc.
        if output == 2:
            linenums = dict()
            for chrom, tree in clusters.items():
                for linenum in tree.getlines():
                    linenums[linenum] = 0
            linenum = -1
            f1.seek(0)
            for line in f1.readlines():
                linenum += 1
                if linenum in linenums or linenum in extra:
                    out_file.write("%s\n" % line.rstrip("\n\r"))
        # If "clustered" we output original intervals, but near each other
        # (i.e. clustered)
        if output == 3:
            linenums = list()
            f1.seek(0)
            fileLines = f1.readlines()
            for chrom, tree in clusters.items():
                for linenum in tree.getlines():
                    out_file.write("%s\n" % fileLines[linenum].rstrip("\n\r"))
        # If "minimum" (4) or "maximum" (5) we output one original interval
        # per cluster
        if output == 4 or output == 5:
            linenums = list()
            f1.seek(0)
            fileLines = f1.readlines()
            for chrom, tree in clusters.items():
                for start, end, lines in tree.getregions():
                    outsize = -1
                    outinterval = None
                    for line in lines:
                        # should only execute this code once per line
                        fileline = fileLines[line].rstrip("\n\r")
                        try:
                            cluster_interval = GenomicInterval(
                                g1, fileline.split("\t"), g1.chrom_col,
                                g1.start_col, g1.end_col, g1.strand_col,
                                g1.default_strand, g1.fix_strand)
                        except Exception as exc:
                            print(str(exc), file=sys.stderr)
                            # Context managers close both files on exit.
                            sys.exit()
                        interval_size = (cluster_interval.end -
                                         cluster_interval.start)
                        # Keep the smallest (4) or largest (5) seen so far.
                        if (outsize == -1 or
                                (outsize > interval_size and output == 4) or
                                (outsize < interval_size and output == 5)):
                            outinterval = cluster_interval
                            outsize = interval_size
                    out_file.write("%s\n" % outinterval)
    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))
class CreateCaseControlTrack(GeneralGuiTool):
    """GUI tool that combines two BED files into a single case-control
    track, with a selectable policy for regions shared between the two
    inputs."""

    @staticmethod
    def getToolName():
        '''
        Specifies a header of the tool, which is displayed at the top of the
        page.
        '''
        return "Combine two BED files into single case-control track"

    @staticmethod
    def getInputBoxNames():
        '''
        Specifies a list of headers for the input boxes, and implicitly also
        the number of input boxes to display on the page. The returned list
        can have two syntaxes:

        1) A list of strings denoting the headers for the input boxes in
           numerical order.
        2) A list of tuples of strings, where each tuple has two items:
           a header and a key.

        The contents of each input box must be defined by the function
        getOptionsBoxK, where K is either a number in the range of 1 to the
        number of boxes (case 1), or the specified key (case 2).
        '''
        return [('Select genome build: ', 'genome'),
                ('Select track to be used as case: ', 'case'),
                ('Select track to be used as control: ', 'control'),
                ('Shared regions should be: ', 'shared')]

    #@staticmethod
    #def getInputBoxOrder():
    #    '''
    #    Specifies the order in which the input boxes should be displayed,
    #    as a list. The input boxes are specified by index (starting with 1)
    #    or by key. If None, the order of the input boxes is in the order
    #    specified by getInputBoxNames.
    #    '''
    #    return None

    @staticmethod
    def getOptionsBoxGenome():  # Alternatively: getOptionsBoxKey1()
        '''
        Defines the type and contents of the input box. User selections are
        returned to the tools in the prevChoices and choices attributes to
        other methods. These are lists of results, one for each input box
        (in the order specified by getInputBoxOrder()).

        The input box is defined according to the following syntax:

        Selection box:          ['choice1','choice2'] -> string
        Text area:              'textbox' | ('textbox',1) |
                                ('textbox',1,False)
                                (contents, height (#lines) = 1,
                                read only flag = False) -> string
        Password field:         '__password__' -> string
        Genome selection box:   '__genome__' -> string
        Track selection box:    '__track__' (requires genome selection box)
                                -> colon-separated string denoting track name
        History selection box:  ('__history__',) |
                                ('__history__', 'bed', 'wig')
                                (only history items of specified types shown)
                                -> colon-separated galaxy track name, as
                                specified in ExternalTrackManager.py
        History check box list: ('__multihistory__', ) |
                                ('__multihistory__', 'bed', 'wig')
                                -> OrderedDict with galaxy id as key and
                                galaxy track name as value if checked,
                                else None
        Hidden field:           ('__hidden__', 'Hidden value') -> string
        Table:                  [['header1','header2'],
                                 ['cell1_1','cell1_2'],
                                 ['cell2_1','cell2_2']] -> None
        Check box list:         OrderedDict([('key1', True), ('key2', False)])
                                -> OrderedDict from key to selection
                                status (bool)
        '''
        return '__genome__'

    @staticmethod
    def getOptionsBoxCase(prevChoices):  # Alternatively: getOptionsBoxKey2()
        '''
        See getOptionsBox1().

        prevChoices is a namedtuple of selections made by the user in the
        previous input boxes (that is, a namedtuple containing only one
        element in this case). The elements can accessed either by index,
        e.g. prevChoices[0] for the result of input box 1, or by key,
        e.g. prevChoices.key (case 2).
        '''
        return '__history__', 'bed', 'point.bed', 'category.bed', 'valued.bed'

    @staticmethod
    def getOptionsBoxControl(prevChoices):
        '''
        See getOptionsBoxCase(); same history-selection box for the
        control track.
        '''
        return '__history__', 'bed', 'point.bed', 'category.bed', 'valued.bed'

    @staticmethod
    def getOptionsBoxShared(prevChoices):
        # Policy for regions present in both the case and control tracks.
        return ['removed', 'returned as case regions',
                'returned as control regions',
                'returned as they are (possibly overlapping)']

    #@staticmethod
    #def getOptionsBox4(prevChoices):
    #    return ['']

    #@staticmethod
    #def getDemoSelections():
    #    return ['testChoice1','..']

    @classmethod
    def subtract_files(cls, fn1, fn2, out_fn):
        """Write fn1 minus fn2 (interval subtraction) to out_fn.

        Ported from Python 2 syntax ("except X, exc", print statements) for
        consistency with the rest of the file, and switched to ``with`` so
        the output file is closed on every exit path.
        """
        g1 = NiceReaderWrapper(fileinput.FileInput(fn1), fix_strand=True)
        g2 = NiceReaderWrapper(fileinput.FileInput(fn2), fix_strand=True)
        with open(out_fn, "w") as out_file:
            try:
                for feature in subtract([g1, g2], pieces=True, mincols=1):
                    out_file.write("%s\n" % feature)
            except ParseError as exc:
                fail("Invalid file format: %s" % str(exc))
        if g1.skipped > 0:
            print(skipped(g1, filedesc=" of 2nd dataset"))
        if g2.skipped > 0:
            print(skipped(g2, filedesc=" of 1st dataset"))
def main():
    """Subtract intervals of the 2nd dataset from the 1st (spaced variant).

    Inputs may be standard interval format or GFF (``--gff1``/``--gff2``).
    ``--pieces`` emits the remaining pieces of split intervals; ``--mincols``
    is the minimum overlap required (default 1).
    """
    mincols = 1
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        if options.mincols:
            mincols = int( options.mincols )
        pieces = bool( options.pieces )
        in1_gff_format = bool( options.gff1 )
        in2_gff_format = bool( options.gff2 )
        in_fname, in2_fname, out_fname = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper
    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
                             chrom_col=chr_col_1,
                             start_col=start_col_1,
                             end_col=end_col_1,
                             strand_col=strand_col_1,
                             fix_strand=True )
    if in1_gff_format:
        # Subtract requires coordinates in BED format.
        g1.convert_to_bed_coord = True
    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
                             chrom_col=chr_col_2,
                             start_col=start_col_2,
                             end_col=end_col_2,
                             strand_col=strand_col_2,
                             fix_strand=True )
    if in2_gff_format:
        # Subtract requires coordinates in BED format.
        g2.convert_to_bed_coord = True
    # ``with`` guarantees the output file is closed on every exit path.
    with open( out_fname, "w" ) as out_file:
        try:
            for feature in subtract( [g1, g2], pieces=pieces, mincols=mincols ):
                if isinstance( feature, GFFFeature ):
                    # Convert back to GFF coordinates since reader converted
                    # automatically.
                    convert_bed_coords_to_gff( feature )
                    for interval in feature.intervals:
                        out_file.write( "%s\n" % "\t".join( interval.fields ) )
                elif isinstance( feature, GenomicInterval ):
                    out_file.write( "%s\n" % "\t".join( feature.fields ) )
                else:
                    out_file.write( "%s\n" % feature )
        except ParseError as exc:
            fail( "Invalid file format: %s" % str( exc ) )
    # NOTE(review): g1 is labelled "2nd dataset" and g2 "1st dataset" —
    # this mirrors the tool's UI argument order; confirm before "fixing".
    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 2nd dataset" ))
    if g2.skipped > 0:
        print(skipped( g2, filedesc=" of 1st dataset" ))
# NOTE(review): the following single line is a *fragment* — the tail of a
# join-tool main() (it begins inside a NiceReaderWrapper(...) call whose
# start is not visible here), so it is kept byte-identical.  It uses
# Python 2 syntax ("except ParseError, exc", print statements) and must be
# ported before running under Python 3 — compare the complete Python 3 join
# driver elsewhere in this file.
chrom_col=chr_col_2, start_col=start_col_2, end_col=end_col_2, strand_col=strand_col_2, fix_strand=True ) out_file = open( out_fname, "w" ) try: for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill): if type( outfields ) is list: out_file.write( "%s\n" % "\t".join( outfields ) ) else: out_file.write( "%s\n" % outfields ) except ParseError, exc: out_file.close() fail( "Invalid file format: %s" % str( exc ) ) except MemoryError: out_file.close() fail( "Input datasets were too large to complete the join operation." ) out_file.close() if g1.skipped > 0: print skipped( g1, filedesc=" of 1st dataset" ) if g2.skipped > 0: print skipped( g2, filedesc=" of 2nd dataset" ) if __name__ == "__main__": main()
# NOTE(review): the following single line is a *fragment* — the tail of a
# cluster-tool main() (it begins at the innermost "for line in lines:" loop;
# the enclosing function is not visible here), so it is kept byte-identical.
# It uses Python 2 syntax ("except Exception, exc", "print >> sys.stderr",
# print statements) and must be ported before running under Python 3 —
# compare the Python 3 cluster drivers elsewhere in this file.
for line in lines: # three nested for loops? # should only execute this code once per line fileline = fileLines[line].rstrip("\n\r") try: cluster_interval = GenomicInterval( g1, fileline.split("\t"), g1.chrom_col, g1.start_col, g1.end_col, g1.strand_col, g1.default_strand, g1.fix_strand) except Exception, exc: print >> sys.stderr, str(exc) f1.close() sys.exit() interval_size = cluster_interval.end - cluster_interval.start if outsize == -1 or \ ( outsize > interval_size and output == 4 ) or \ ( outsize < interval_size and output == 5 ): outinterval = cluster_interval outsize = interval_size out_file.write("%s\n" % outinterval) f1.close() out_file.close() if g1.skipped > 0: print skipped(g1, filedesc="") if __name__ == "__main__": main()
def main():
    """Cluster nearby intervals and emit clusters (spaced-style variant).

    ``--output`` selects the mode: 1 = merge each cluster into a single
    interval, 2 = original lines filtered to cluster members (file order
    preserved), 3 = original lines grouped by cluster, 4 = smallest
    interval per cluster, 5 = largest interval per cluster.
    ``--distance`` (or ``--overlap``, its negation) and ``--minregions``
    control clustering.
    """
    distance = 0
    minregions = 2
    output = 1
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.distance:
            distance = int( options.distance )
        if options.overlap:
            # Required overlap is expressed as a negative distance.
            distance = -1 * int( options.overlap )
        if options.output:
            output = int( options.output )
        if options.minregions:
            minregions = int( options.minregions )
        in_fname, out_fname = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    # Get the cluster tree
    try:
        clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )
    # ``with`` closes both files on every exit path; the original leaked
    # out_file when sys.exit() was reached in the output-4/5 branch.
    with open( in_fname, "r" ) as f1, open( out_fname, "w" ) as out_file:
        # If "merge"
        if output == 1:
            fields = ["." for x in range(max(g1.chrom_col, g1.start_col, g1.end_col) + 1)]
            for chrom, tree in clusters.items():
                for start, end, lines in tree.getregions():
                    fields[g1.chrom_col] = chrom
                    fields[g1.start_col] = str(start)
                    fields[g1.end_col] = str(end)
                    out_file.write( "%s\n" % "\t".join( fields ) )
        # If "filtered" we preserve order of file and comments, etc.
        if output == 2:
            linenums = dict()
            for chrom, tree in clusters.items():
                for linenum in tree.getlines():
                    linenums[linenum] = 0
            linenum = -1
            f1.seek(0)
            for line in f1.readlines():
                linenum += 1
                if linenum in linenums or linenum in extra:
                    out_file.write( "%s\n" % line.rstrip( "\n\r" ) )
        # If "clustered" we output original intervals, but near each other
        # (i.e. clustered)
        if output == 3:
            linenums = list()
            f1.seek(0)
            fileLines = f1.readlines()
            for chrom, tree in clusters.items():
                for linenum in tree.getlines():
                    out_file.write( "%s\n" % fileLines[linenum].rstrip( "\n\r" ) )
        # If "minimum" (4) or "maximum" (5) we output one original interval
        # per cluster
        if output == 4 or output == 5:
            linenums = list()
            f1.seek(0)
            fileLines = f1.readlines()
            for chrom, tree in clusters.items():
                for start, end, lines in tree.getregions():
                    outsize = -1
                    outinterval = None
                    for line in lines:
                        # should only execute this code once per line
                        fileline = fileLines[line].rstrip("\n\r")
                        try:
                            cluster_interval = GenomicInterval( g1,
                                                                fileline.split("\t"),
                                                                g1.chrom_col,
                                                                g1.start_col,
                                                                g1.end_col,
                                                                g1.strand_col,
                                                                g1.default_strand,
                                                                g1.fix_strand )
                        except Exception as exc:
                            print(str( exc ), file=sys.stderr)
                            # Context managers close both files on exit.
                            sys.exit()
                        interval_size = cluster_interval.end - cluster_interval.start
                        # Keep the smallest (4) or largest (5) seen so far.
                        if outsize == -1 or \
                           ( outsize > interval_size and output == 4 ) or \
                           ( outsize < interval_size and output == 5 ):
                            outinterval = cluster_interval
                            outsize = interval_size
                    out_file.write( "%s\n" % outinterval )
    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))
# NOTE(review): the following single line is a *fragment* — the tail of a
# merge-tool main() (it begins inside the output loop and includes an
# "except" whose matching "try" is not visible here), so it is kept
# byte-identical.  It uses Python 2 syntax ("except ParseError, exc", print
# statements) and must be ported before running under Python 3 — compare the
# complete Python 3 merge drivers elsewhere in this file.
if options.threecol: if type(line) is GenomicInterval: out_file.write( "%s\t%s\t%s\n" % (line.chrom, str(line.startCol), str(line.endCol))) elif type(line) is list: out_file.write("%s\t%s\t%s\n" % (line[chr_col_1], str(line[start_col_1]), str(line[end_col_1]))) else: out_file.write("%s\n" % line) else: if type(line) is GenomicInterval: out_file.write("%s\n" % "\t".join(line.fields)) elif type(line) is list: out_file.write("%s\n" % "\t".join(line)) else: out_file.write("%s\n" % line) except ParseError, exc: out_file.close() fail("Invalid file format: %s" % str(exc)) out_file.close() if g1.skipped > 0: print skipped(g1, filedesc=" of 1st dataset") if __name__ == "__main__": main()
def main():
    """Complement a dataset: emit regions NOT covered by its intervals.

    Compact-style variant.  ``--lengths`` names a chromosome-length (LEN)
    file; with ``--all`` whole chromosomes are built from it and the input
    is subtracted from them, otherwise bx ``complement`` is used with the
    per-chromosome lengths.
    """
    allchroms = False
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except Exception:
        # Bad options or wrong argument count: report usage and exit.
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome.  The lengths
    # are added to the lens dict and passed to the complement operation in bx.
    dbfile = fileinput.FileInput(lengths)
    if dbfile:
        if not allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except Exception:
                # Best effort: assume LEN doesn't exist or is corrupt somehow.
                pass
        elif allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0], "0", str(end)]))
            except Exception:
                # Same best-effort policy as above.
                pass
    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement
    if allchroms and len(chroms) == 0:
        allchroms = False
    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)
    # ``with`` guarantees the output file is closed on every exit path.
    with open(out_fname, "w") as out_file:
        try:
            for interval in generator:
                if type(interval) is GenomicInterval:
                    # NOTE(review): joins the interval object directly rather
                    # than interval.fields — relies on GenomicInterval being
                    # iterable over its fields; confirm against bx-python.
                    out_file.write("%s\n" % "\t".join(interval))
                else:
                    out_file.write("%s\n" % interval)
        except ParseError as exc:
            fail("Invalid file format: %s" % str(exc))
    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))