def parse(self):
    """Parse the nine-column, GFF-style file found at ``self.path``.

    Each data line is split on tabs (falling back to any whitespace when no
    tab is present) into a chromosome followed by eight columns: source,
    name, start, end, score, strand, frame and group/attribute. The columns
    are normalised in place and passed to ``self.handler.newFeature``.

    Lines starting with ``browser `` are skipped. A line starting with
    ``track `` is read as ``key=value`` pairs; the next data line then opens
    a new track on the handler and declares its fields.

    Problems are reported through ``self.handler.error(message, path,
    line_number)`` -- presumably that call raises; TODO confirm.
    """
    # State carried from one line to the next #
    track_info = {}
    track_fields = []
    for number, line in iterate_lines(self.path):
        # Browser lines carry nothing we use #
        if line.startswith("browser "):
            continue
        # Track header: remember its attributes, reset the declared fields #
        if line.startswith("track "):
            try:
                pairs = [p.split('=', 1) for p in shlex.split(line[6:])]
                track_info = dict(pairs)
            except ValueError:
                self.handler.error("The track%s seems to have an invalid <track> header line", self.path, number)
            track_fields = []
            continue
        # Tab separated by preference, any whitespace otherwise #
        columns = line.split('\t')
        if len(columns) < 2:
            columns = line.split()
        # First column is the chromosome, eight must remain #
        chrom = columns.pop(0)
        if len(columns) != 8:
            self.handler.error("The track%s doesn't have nine columns", self.path, number)
        # Open the track lazily, on its first data line #
        if not track_fields:
            self.handler.newTrack(track_info, self.name)
            track_fields = all_fields[0:len(columns)]
            self.handler.defineFields(track_fields)
        # Source and name: a lone dot means "empty" #
        for index in (0, 1):
            if columns[index] == '.':
                columns[index] = ''
        # Interval bounds #
        try:
            columns[2] = int(columns[2])
            columns[3] = int(columns[3])
        except ValueError:
            self.handler.error("The track%s has non integers as interval bounds", self.path, number)
        if columns[3] <= columns[2]:
            self.handler.error("The track%s has negative or null intervals", self.path, number)
        # Score: missing values become 0.0 #
        if columns[4] in ('.', ''):
            columns[4] = 0.0
        try:
            columns[4] = float(columns[4])
        except ValueError:
            self.handler.error("The track%s has non floats as score values", self.path, number)
        # Strand #
        columns[5] = strand_to_int(columns[5])
        # Frame: a lone dot means "no frame" #
        frame = columns[6]
        if frame == '.':
            columns[6] = None
        else:
            try:
                columns[6] = int(frame)
            except ValueError:
                self.handler.error("The track%s has non integers as frame value", self.path, number)
        # Group or attribute #
        if columns[7] == '.':
            columns[7] = ''
        # Hand the feature over #
        self.handler.newFeature(chrom, columns)
def parse_chr_file(self, path):
    """Read a chromosome file and return a dictionary describing it.

    Every line yielded by ``iterate_lines(path)`` must hold exactly two
    columns, a chromosome name and an integer length, separated by a tab
    (or by any whitespace when the line contains no tab).

    :param path: location of the chromosome file.
    :returns: a dict mapping each chromosome name to ``{'length': <int>}``.
    :raises Exception: when a line does not have two columns, when a length
        is not an integer, or when no line was read at all.
    """
    chrmeta = {}
    for number, line in iterate_lines(path):
        # Tab separated by preference, any whitespace otherwise #
        items = line.split('\t')
        if len(items) == 1:
            items = line.split()
        if len(items) != 2:
            raise Exception("The file '%s:%s' does not seem to be a valid chromosome file." % (path, number))
        name, raw_length = items
        try:
            length = int(raw_length)
        except ValueError:
            raise Exception("The file '%s:%s' has non-integer as chromosome lengths." % (path, number))
        chrmeta[name] = {'length': length}
    if not chrmeta:
        raise Exception("The file '%s' does not seem to contain any information." % (path,))
    return chrmeta
def parse(self):
    """Parse the four-column (chromosome, start, end, score) file at ``self.path``.

    Each data line is split on tabs (falling back to any whitespace when no
    tab is present). Start and end are converted to ``int``, the score to
    ``float`` (``.`` or an empty string count as ``0.0``), and the three
    values are passed with the chromosome to ``self.handler.newFeature``.

    Lines starting with ``browser `` are skipped. A line starting with
    ``track `` is read as ``key=value`` pairs; the next data line then opens
    a new track on the handler.

    Problems are reported through ``self.handler.error(message, path,
    line_number)`` -- presumably that call raises; TODO confirm.
    """
    # Initial variables #
    info = {}
    declare_track = True
    # Main loop #
    for number, line in iterate_lines(self.path):
        # Ignored lines #
        if line.startswith("browser "):
            continue
        # Track headers #
        if line.startswith("track "):
            try:
                info = dict([p.split('=', 1) for p in shlex.split(line[6:])])
            except ValueError:
                self.handler.error("The track%s seems to have an invalid <track> header line", self.path, number)
            declare_track = True
            continue
        # Split the lines #
        items = line.split('\t')
        if len(items) == 1:
            items = line.split()
        # Chromosome #
        chrom = items.pop(0)
        # Length is three #
        if len(items) != 3:
            self.handler.error("The track%s doesn't have four columns", self.path, number)
        # Have we started a track already ? #
        if declare_track:
            declare_track = False
            # Open the track first and only then declare its fields, so the
            # declaration applies to the track that was just created.
            self.handler.newTrack(info, self.name)
            self.handler.defineFields(all_fields)
        # Start and end fields #
        try:
            items[0] = int(items[0])
            items[1] = int(items[1])
        except ValueError:
            self.handler.error("The track%s has non integers as interval bounds", self.path, number)
        # Score field #
        if items[2] == '.' or items[2] == '':
            items[2] = 0.0
        try:
            items[2] = float(items[2])
        except ValueError:
            self.handler.error("The track%s has non floats as score values", self.path, number)
        # Yield it #
        self.handler.newFeature(chrom, items)
def parse(self):
    """Parse the per-position tag-count file at ``self.path``.

    A single track is opened up front with ``{'int_to_float': 'score'}`` as
    its attributes. Each data line needs at least five columns: chromosome,
    name, position, strand and an integer score. A position ``p`` becomes
    the interval ``(p - 1, p)``. Lines with a zero score are dropped.
    Consecutive lines sharing chromosome, name, strand and score, where the
    new start equals the pending end, are merged into one interval.

    Each feature is sent to ``self.handler.newFeature`` as the tuple
    ``(name, start, end, strand, score)``. Lines starting with ``browser ``
    or ``track `` are skipped.

    Problems are reported through ``self.handler.error(message, path,
    line_number)`` -- presumably that call raises; TODO confirm.
    """
    # The feature being accumulated, not yet handed to the handler #
    prev_chrom = prev_name = prev_start = prev_end = prev_strand = prev_score = None
    # One track for the whole file #
    self.handler.newTrack({'int_to_float':'score'}, self.name)
    self.handler.defineFields(all_fields)
    for number, line in iterate_lines(self.path):
        # Header lines are of no interest here #
        if line.startswith("browser "):
            continue
        if line.startswith("track "):
            continue
        # Tab separated by preference, any whitespace otherwise #
        columns = line.split('\t')
        if len(columns) < 2:
            columns = line.split()
        if len(columns) < 5:
            self.handler.error("The track%s has less than five columns", self.path, number)
        chrom = columns.pop(0)
        name = columns[0]
        # A position p covers the interval (p - 1, p) #
        try:
            position = int(columns[1])
        except ValueError:
            self.handler.error("The track%s has non integers as position", self.path, number)
        start, end = position - 1, position
        strand = strand_to_int(columns[2])
        try:
            score = int(columns[3])
        except ValueError:
            self.handler.error("The track%s has non integers as tag count values", self.path, number)
        # Null scores are not reported #
        if score == 0:
            continue
        # Extend the pending feature when this one continues it exactly #
        same_signature = (prev_chrom, prev_name, prev_strand, prev_score) == (chrom, name, strand, score)
        if same_signature and start == prev_end:
            prev_end = end
            continue
        # Otherwise flush the pending feature and start a new one #
        if prev_chrom:
            self.handler.newFeature(prev_chrom, (prev_name, prev_start, prev_end, prev_strand, prev_score))
        prev_chrom, prev_name, prev_start = chrom, name, start
        prev_end, prev_strand, prev_score = end, strand, score
    # Whatever is still pending at the end of the file #
    if prev_chrom:
        self.handler.newFeature(prev_chrom, (prev_name, prev_start, prev_end, prev_strand, prev_score))
def parse(self):
    """Parse the nine-column, GTF-style file found at ``self.path``.

    Each data line is split on tabs (falling back to any whitespace when no
    tab is present); everything beyond the eighth column is re-joined with
    single spaces into one attribute column. After the chromosome is
    removed, the remaining columns are source, name, start, end, score,
    strand, frame and the attribute column.

    The attribute column is tokenised with ``shlex.split`` and read as
    alternating keys and values (a trailing ``;`` is stripped from values).
    Keys are appended to ``all_fields`` and declared with
    ``self.handler.defineFields`` for every line; the values are appended to
    the feature, which is then sent to ``self.handler.newFeature``.

    Lines starting with ``browser `` are skipped. A line starting with
    ``track `` is read as ``key=value`` pairs; the next data line then opens
    a new track on the handler.

    Problems are reported through ``self.handler.error(message, path,
    line_number)`` -- presumably that call raises; TODO confirm.

    :raises AssertionError: when the attribute column does not start with
        ``gene_id`` followed by ``transcript_id``.
    """
    # Initial variables #
    info = {}
    declare_track = True
    # Main loop #
    for number, line in iterate_lines(self.path):
        # Ignored lines #
        if line.startswith("browser "):
            continue
        # Track headers #
        if line.startswith("track "):
            try:
                info = dict([p.split('=', 1) for p in shlex.split(line[6:])])
            except ValueError:
                self.handler.error("The track%s seems to have an invalid <track> header line", self.path, number)
            declare_track = True
            continue
        # Split the lines #
        items = line.split('\t')
        if len(items) == 1:
            items = line.split()
        # Whitespace splitting may have cut the attribute column apart #
        if len(items) > 8:
            items = items[0:8] + [' '.join(items[8:])]
        # Chromosome #
        chrom = items.pop(0)
        # Length is nine #
        if len(items) != 8:
            self.handler.error("The track%s doesn't have nine columns", self.path, number)
        # Have we started a track already ? #
        if declare_track:
            declare_track = False
            self.handler.newTrack(info, self.name)
        # Source field #
        if items[0] == '.':
            items[0] = ''
        # Name field #
        if items[1] == '.':
            items[1] = ''
        # Start and end fields #
        try:
            items[2] = int(items[2])
            items[3] = int(items[3])
        except ValueError:
            self.handler.error("The track%s has non integers as interval bounds", self.path, number)
        # NOTE(review): the score column (items[4]) is passed through as
        # text -- confirm that this is intended.
        # Strand field #
        items[5] = strand_to_int(items[5])
        # Frame field #
        if items[6] == '.':
            items[6] = None
        else:
            try:
                items[6] = int(items[6])
            except ValueError:
                self.handler.error("The track%s has non integers as frame value", self.path, number)
        # The last special column #
        raw_attributes = items.pop()
        tokens = shlex.split(raw_attributes)
        attr = [(tokens[i], tokens[i + 1].strip(';')) for i in xrange(0, len(tokens), 2)]
        # Not using dict to preserve annotation order #
        keys = [pair[0] for pair in attr]
        values = [pair[1] for pair in attr]
        # The attribute column must start with "gene_id" and "transcript_id".
        # Raised explicitly (rather than with ``assert``) so the check still
        # runs under ``python -O``, and with the offending column actually
        # interpolated into the message.
        if keys[:2] != ["gene_id", "transcript_id"]:
            raise AssertionError(
                "Invalid attribute column: %r. Valid attributes begin with "
                "\"gene_id\" and \"transcript_id\"" % (raw_attributes,))
        self.handler.defineFields(all_fields + keys)
        items += values
        # Yield it #
        self.handler.newFeature(chrom, items)
def parse(self):
    """Read the nine-column, GFF-style file at ``self.path`` line by line.

    Data lines are split on tabs, or on any whitespace when they contain no
    tab. The first column is the chromosome; the remaining eight (source,
    name, start, end, score, strand, frame, group/attribute) are cleaned up
    in place and sent to ``self.handler.newFeature``.

    ``browser `` lines are ignored. A ``track `` line supplies ``key=value``
    attributes and causes the following data line to call
    ``self.handler.newTrack`` and ``self.handler.defineFields``.

    Errors go through ``self.handler.error(message, path, line_number)`` --
    presumably that call raises; TODO confirm.
    """
    declared = []      # fields declared for the current track
    header = {}        # attributes of the latest track line
    for number, line in iterate_lines(self.path):
        if line.startswith("browser "):
            continue
        if line.startswith("track "):
            # Everything after the "track " prefix is key=value pairs #
            try:
                header = dict([p.split('=', 1) for p in shlex.split(line[6:])])
            except ValueError:
                self.handler.error(
                    "The track%s seems to have an invalid <track> header line",
                    self.path, number)
            declared = []
            continue
        # Tabs first, whitespace as a fallback #
        row = line.split('\t')
        if len(row) == 1:
            row = line.split()
        chrom = row.pop(0)
        if len(row) != 8:
            self.handler.error("The track%s doesn't have nine columns",
                               self.path, number)
        # The first data line after a header opens the track #
        if not declared:
            self.handler.newTrack(header, self.name)
            declared = all_fields[0:len(row)]
            self.handler.defineFields(declared)
        # Source, then name: a dot stands for an empty value #
        if row[0] == '.':
            row[0] = ''
        if row[1] == '.':
            row[1] = ''
        # Start and end must be integers, with end after start #
        try:
            row[2] = int(row[2])
            row[3] = int(row[3])
        except ValueError:
            self.handler.error(
                "The track%s has non integers as interval bounds",
                self.path, number)
        if row[3] <= row[2]:
            self.handler.error(
                "The track%s has negative or null intervals",
                self.path, number)
        # Score defaults to 0.0 when absent #
        if row[4] in ('.', ''):
            row[4] = 0.0
        try:
            row[4] = float(row[4])
        except ValueError:
            self.handler.error(
                "The track%s has non floats as score values",
                self.path, number)
        row[5] = strand_to_int(row[5])
        # Frame is optional #
        if row[6] == '.':
            row[6] = None
        else:
            try:
                row[6] = int(row[6])
            except ValueError:
                self.handler.error(
                    "The track%s has non integers as frame value",
                    self.path, number)
        # Group or attribute #
        if row[7] == '.':
            row[7] = ''
        self.handler.newFeature(chrom, row)
def parse(self):
    """Parse the wiggle-style file at ``self.path``.

    A ``variableStep`` or ``fixedStep`` directive line is read as
    ``key=value`` parameters (``chrom``, ``span``, and for fixedStep
    ``start`` and ``step``) governing the data lines that follow:

    * fixedStep data lines hold one score; the feature is
      ``[start, start + span, score]`` and ``start`` then advances by
      ``step``.
    * variableStep data lines hold a position and a score; the feature is
      ``[position, position + span, score]``.

    Features whose score equals ``0.0`` are dropped. A feature is merged
    into the pending one when both are on the same chromosome and the
    ``floats_eq`` and ``overlapping`` helpers both accept the pair;
    otherwise the pending feature is sent to ``self.handler.newFeature``.

    ``browser `` lines are skipped; a ``track `` line supplies the attributes
    of the track opened on the next non-header line.

    Problems are reported through ``self.handler.error(message, path,
    line_number)`` -- presumably that call raises; TODO confirm.
    """
    # State carried across lines #
    info = {}
    params = {}
    declare_track = True
    pending = None
    pending_chrom = None
    for number, line in iterate_lines(self.path):
        # Ignored lines #
        if line.startswith("browser "):
            continue
        # Track headers #
        if line.startswith("track "):
            try:
                info = dict([p.split('=', 1) for p in shlex.split(line[6:])])
            except ValueError:
                self.handler.error("The track%s seems to have an invalid <track> header line", self.path, number)
            declare_track = True
            continue
        # Open the track on its first non-header line, flushing the old one #
        if declare_track:
            declare_track = False
            if pending:
                self.handler.newFeature(pending_chrom, pending)
                pending = None
                pending_chrom = None
            self.handler.newTrack(info, self.name)
            self.handler.defineFields(all_fields)
        # Directive line: validate and store its parameters #
        if line.startswith(("variableStep", "fixedStep")):
            # Prefixing "mode=" turns the directive keyword into a parameter #
            params = dict([p.split('=', 1) for p in shlex.split('mode=' + line)])
            if not params.get('chrom', False):
                self.handler.error("The track%s doesn't specify a chromosome.", self.path, number)
            try:
                params['span'] = int(params.get('span', 1))
            except ValueError:
                self.handler.error("The track%s has a non integer as span value.", self.path, number)
            if params['span'] < 1:
                self.handler.error("The track%s has a negative or null span value.", self.path, number)
            if line.startswith("fixedStep "):
                if 'start' not in params:
                    self.handler.error("The track%s has a fixedStep directive without a start.", self.path, number)
                try:
                    params['start'] = int(params['start'])
                except ValueError:
                    self.handler.error("The track%s has a non integer as start value.", self.path, number)
                try:
                    params['step'] = int(params.get('step', 1))
                except ValueError:
                    self.handler.error("The track%s has a non integer as step value.", self.path, number)
                if params['step'] < 1:
                    self.handler.error("The track%s has a negative or null step value.", self.path, number)
            continue
        # Data line: a directive must have been seen #
        if not params:
            self.handler.error("The track%s is missing a fixedStep or variableStep directive.", self.path, number)
        mode = params['mode']
        if mode == 'fixedStep':
            # The whole line is the score #
            score = line
            try:
                score = float(line)
            except ValueError:
                self.handler.error("The track%s has non floats as score values.", self.path, number)
            chrom = params['chrom']
            feature = [params['start'], params['start'] + params['span'], score]
            params['start'] += params['step']
        elif mode == 'variableStep':
            # The line holds a position and a score #
            cols = line.split('\t')
            if len(cols) == 1:
                cols = cols[0].split()
            try:
                cols[0] = int(cols[0])
                cols[1] = float(cols[1])
            except ValueError:
                self.handler.error("The track%s has invalid values.", self.path, number)
            except IndexError:
                self.handler.error("The track%s has missing values.", self.path, number)
            chrom = params['chrom']
            feature = [cols[0], cols[0] + params['span'], cols[1]]
        # Ignore null scores #
        if feature[2] == 0.0:
            continue
        # Merge adjacent features with same scores #
        # For instance ['chr1', 10, 11, 9.8] and ['chr1', 11, 12, 9.8] should merge.
        if pending:
            if pending_chrom == chrom:
                if pending[1] > feature[0]:
                    self.handler.error("The track%s has a start or span larger than its end or step.", self.path, number)
                mergeable = floats_eq(pending[2], feature[2]) and overlapping(pending[0], pending[1], feature[0], feature[1])
                if mergeable:
                    pending[0] = min(pending[0], feature[0])
                    pending[1] = max(pending[1], feature[1])
                    continue
            self.handler.newFeature(pending_chrom, pending)
        pending = feature
        pending_chrom = chrom
    # Whatever is still pending at the end of the file #
    if pending:
        self.handler.newFeature(pending_chrom, pending)
def parse(self):
    """Parse the nine-column, GTF-style file found at ``self.path``.

    Data lines are split on tabs, or on any whitespace when they contain no
    tab; anything past the eighth column is re-joined with single spaces
    into one attribute column. Once the chromosome is removed the columns
    are source, name, start, end, score, strand, frame and attributes.

    The attribute column is tokenised with ``shlex.split`` into alternating
    keys and values (a trailing ``;`` is stripped from each value). For
    every line the keys are declared after ``all_fields`` through
    ``self.handler.defineFields`` and the values are appended to the
    feature passed to ``self.handler.newFeature``.

    ``browser `` lines are skipped; a ``track `` line supplies ``key=value``
    attributes for the track opened on the next data line.

    Problems are reported through ``self.handler.error(message, path,
    line_number)`` -- presumably that call raises; TODO confirm.

    :raises AssertionError: when the attribute keys do not begin with
        ``gene_id`` then ``transcript_id``.
    """
    # Initial variables #
    info = {}
    declare_track = True
    # Main loop #
    for number, line in iterate_lines(self.path):
        # Ignored lines #
        if line.startswith("browser "):
            continue
        # Track headers #
        if line.startswith("track "):
            try:
                info = dict([p.split('=', 1) for p in shlex.split(line[6:])])
            except ValueError:
                self.handler.error("The track%s seems to have an invalid <track> header line", self.path, number)
            declare_track = True
            continue
        # Split the lines #
        items = line.split('\t')
        if len(items) == 1:
            items = line.split()
        # Put back together an attribute column cut apart by the split #
        if len(items) > 8:
            items = items[0:8] + [' '.join(items[8:])]
        # Chromosome #
        chrom = items.pop(0)
        # Length is nine #
        if len(items) != 8:
            self.handler.error("The track%s doesn't have nine columns", self.path, number)
        # Have we started a track already ? #
        if declare_track:
            declare_track = False
            self.handler.newTrack(info, self.name)
        # Source field #
        if items[0] == '.':
            items[0] = ''
        # Name field #
        if items[1] == '.':
            items[1] = ''
        # Start and end fields #
        try:
            items[2] = int(items[2])
            items[3] = int(items[3])
        except ValueError:
            self.handler.error("The track%s has non integers as interval bounds", self.path, number)
        # NOTE(review): the score column (items[4]) is passed through as
        # text -- confirm that this is intended.
        # Strand field #
        items[5] = strand_to_int(items[5])
        # Frame field #
        if items[6] == '.':
            items[6] = None
        else:
            try:
                items[6] = int(items[6])
            except ValueError:
                self.handler.error("The track%s has non integers as frame value", self.path, number)
        # The last special column #
        attribute_column = items.pop()
        tokens = shlex.split(attribute_column)
        attr = [(tokens[i], tokens[i+1].strip(';')) for i in xrange(0, len(tokens), 2)]
        # Not using dict to preserve annotation order #
        keys, values = [x[0] for x in attr], [x[1] for x in attr]
        # The attribute column must start with "gene_id" and "transcript_id".
        # An explicit raise keeps the check alive under ``python -O`` (where
        # ``assert`` statements are removed), and the message now really
        # contains the rejected column instead of a literal "%r".
        if ["gene_id", "transcript_id"] != keys[:2]:
            raise AssertionError(
                "Invalid attribute column: %r. Valid attributes begin with "
                "\"gene_id\" and \"transcript_id\"" % (attribute_column,))
        self.handler.defineFields(all_fields + keys)
        items += values
        # Yield it #
        self.handler.newFeature(chrom, items)
def parse(self):
    """Parse the BED-style file found at ``self.path``.

    Data lines are split on tabs, or on any whitespace when they contain no
    tab. The first column is the chromosome; start and end are mandatory
    and converted to ``int``. The following columns are optional and
    processed only as far as the line goes: name (``.`` becomes ``''``),
    score (``float``; ``.`` or empty become ``0.0``), strand (through
    ``strand_to_int``), thick start and thick end (``int``). Further columns
    are passed through untouched. The resulting list goes to
    ``self.handler.newFeature``.

    ``browser `` lines are skipped; a ``track `` line supplies ``key=value``
    attributes, and the next data line opens a new track whose fields are
    the first ``len(columns)`` entries of ``all_fields``.

    Problems are reported through ``self.handler.error(message, path,
    line_number)`` -- presumably that call raises; TODO confirm.
    """
    # Initial variables #
    fields = []
    info = {}
    # Main loop #
    for number, line in iterate_lines(self.path):
        # Ignored lines #
        if line.startswith("browser "):
            continue
        # Track headers #
        if line.startswith("track "):
            try:
                info = dict([p.split('=', 1) for p in shlex.split(line[6:])])
            except ValueError:
                self.handler.error("The track%s seems to have an invalid <track> header line", self.path, number)
            fields = []
            continue
        # Split the lines #
        items = line.split('\t')
        if len(items) == 1:
            items = line.split()
        # Chromosome #
        chrom = items.pop(0)
        # Have we started a track already ? #
        if not fields:
            self.handler.newTrack(info, self.name)
            fields = all_fields[0:len(items)]
            self.handler.defineFields(fields)
        # Start and end fields #
        try:
            items[0] = int(items[0])
            items[1] = int(items[1])
        except ValueError:
            self.handler.error("The track%s has non integers as interval bounds", self.path, number)
        except IndexError:
            self.handler.error("The track%s has less than two columns", self.path, number)
        # All following fields are optional #
        try:
            # Name field #
            if items[2] == '.':
                items[2] = ''
            # Score field #
            if items[3] == '.' or items[3] == '':
                items[3] = 0.0
            try:
                items[3] = float(items[3])
            except ValueError:
                self.handler.error("The track%s has non floats as score values", self.path, number)
            # Strand field #
            items[4] = strand_to_int(items[4])
            # Thick starts: positions, hence integers as the message says #
            try:
                items[5] = int(items[5])
            except ValueError:
                self.handler.error("The track%s has non integers as thick starts", self.path, number)
            # Thick ends #
            try:
                items[6] = int(items[6])
            except ValueError:
                self.handler.error("The track%s has non integers as thick ends", self.path, number)
            # Too many fields #
            if len(items) > 11:
                self.handler.error("The track%s has more than twelve columns", self.path, number)
        # All index errors are ignored since the fields above three are optional #
        except IndexError:
            pass
        # Emit the feature only once the line went through cleanly. This used
        # to sit in a ``finally`` clause, which also ran while an exception
        # raised above was propagating and so emitted the invalid feature.
        self.handler.newFeature(chrom, items)
def parse(self):
    """Parse the nine-column, GTF-style file found at ``self.path``.

    Data lines are split on tabs, or on any whitespace when they contain no
    tab; anything past the eighth column is re-joined with single spaces
    into one attribute column. Once the chromosome is removed the columns
    are source, name, start, end, score, strand, frame and attributes, and
    the first seven are normalised in place.

    The attribute column is tokenised with ``shlex.split`` into alternating
    keys and values (a trailing ``;`` is stripped from each value) that are
    stored in a dict. For every line the dict's keys are declared after
    ``all_fields`` through ``self.handler.defineFields`` and its values are
    appended to the feature passed to ``self.handler.newFeature``.

    ``browser `` lines are skipped; a ``track `` line supplies ``key=value``
    attributes for the track opened on the next data line.

    Problems are reported through ``self.handler.error(message, path,
    line_number)`` -- presumably that call raises; TODO confirm.
    """
    track_info = {}
    need_new_track = True
    for number, line in iterate_lines(self.path):
        # Browser lines carry nothing we use #
        if line.startswith("browser "):
            continue
        # Track header: remember its attributes for the next data line #
        if line.startswith("track "):
            try:
                pairs = [p.split('=', 1) for p in shlex.split(line[6:])]
                track_info = dict(pairs)
            except ValueError:
                self.handler.error(
                    "The track%s seems to have an invalid <track> header line",
                    self.path, number)
            need_new_track = True
            continue
        # Tab separated by preference, any whitespace otherwise #
        row = line.split('\t')
        if len(row) < 2:
            row = line.split()
        # Glue back an attribute column cut apart by the split #
        if len(row) > 8:
            row = row[0:8] + [' '.join(row[8:])]
        chrom = row.pop(0)
        if len(row) != 8:
            self.handler.error("The track%s doesn't have nine columns",
                               self.path, number)
        # Open the track on its first data line #
        if need_new_track:
            need_new_track = False
            self.handler.newTrack(track_info, self.name)
        # Source and name: a lone dot means "empty" #
        for index in (0, 1):
            if row[index] == '.':
                row[index] = ''
        # Interval bounds #
        try:
            row[2] = int(row[2])
            row[3] = int(row[3])
        except ValueError:
            self.handler.error(
                "The track%s has non integers as interval bounds",
                self.path, number)
        # Score: missing values become 0.0 #
        if row[4] in ('.', ''):
            row[4] = 0.0
        try:
            row[4] = float(row[4])
        except ValueError:
            self.handler.error(
                "The track%s has non floats as score values",
                self.path, number)
        # Strand #
        row[5] = strand_to_int(row[5])
        # Frame: a lone dot means "no frame" #
        frame = row[6]
        if frame == '.':
            row[6] = None
        else:
            try:
                row[6] = int(frame)
            except ValueError:
                self.handler.error(
                    "The track%s has non integers as frame value",
                    self.path, number)
        # Attribute column: alternating keys and values #
        tokens = shlex.split(row.pop())
        attr = dict([(tokens[i], tokens[i + 1].strip(';'))
                     for i in xrange(0, len(tokens), 2)])
        # keys() and values() of one dict come out in matching order #
        self.handler.defineFields(all_fields + attr.keys())
        row += attr.values()
        # Hand the feature over #
        self.handler.newFeature(chrom, row)
def parse(self):
    """Parse the wiggle-style file at ``self.path``.

    A ``variableStep`` or ``fixedStep`` directive line is read as
    ``key=value`` parameters (``chrom``, ``span``, and for fixedStep
    ``start`` and ``step``) governing the data lines that follow:

    * fixedStep data lines hold one score; the feature is
      ``[start, start + span, score]`` and ``start`` then advances by
      ``step``.
    * variableStep data lines hold a position and a score; the feature is
      ``[position, position + span, score]``.

    Features whose score equals ``0.0`` are dropped. A feature is merged
    into the pending one when both are on the same chromosome and the
    ``floats_eq`` and ``overlapping`` helpers both accept the pair;
    otherwise the pending feature is sent to ``self.handler.newFeature``.

    ``browser `` lines are skipped; a ``track `` line supplies the attributes
    of the track opened on the next non-header line.

    Problems are reported through ``self.handler.error(message, path,
    line_number)`` -- presumably that call raises; TODO confirm.
    """
    # Initial variables #
    info = {}
    params = {}
    declare_track = True
    last_feature = None
    last_chrom = None
    # Line loop #
    for number, line in iterate_lines(self.path):
        # Ignored lines #
        if line.startswith("browser "):
            continue
        # Track headers #
        if line.startswith("track "):
            try:
                info = dict(
                    [p.split('=', 1) for p in shlex.split(line[6:])])
            except ValueError:
                self.handler.error(
                    "The track%s seems to have an invalid <track> header line",
                    self.path, number)
            declare_track = True
            continue
        # Have we started a track already ? #
        if declare_track:
            declare_track = False
            # Flush what is left of the previous track first #
            if last_feature:
                self.handler.newFeature(last_chrom, last_feature)
                last_feature = None
                last_chrom = None
            self.handler.newTrack(info, self.name)
            self.handler.defineFields(all_fields)
        # Directive line #
        if line.startswith("variableStep") or line.startswith("fixedStep"):
            # Prefixing "mode=" turns the directive keyword into a parameter #
            params = dict(
                [p.split('=', 1) for p in shlex.split('mode=' + line)])
            if not params.get('chrom', False):
                self.handler.error(
                    "The track%s doesn't specify a chromosome.",
                    self.path, number)
            try:
                params['span'] = int(params.get('span', 1))
            except ValueError:
                self.handler.error(
                    "The track%s has a non integer as span value.",
                    self.path, number)
            if params['span'] < 1:
                self.handler.error(
                    "The track%s has a negative or null span value.",
                    self.path, number)
            if line.startswith("fixedStep "):
                if 'start' not in params:
                    self.handler.error(
                        "The track%s has a fixedStep directive without a start.",
                        self.path, number)
                try:
                    params['start'] = int(params['start'])
                except ValueError:
                    self.handler.error(
                        "The track%s has a non integer as start value.",
                        self.path, number)
                try:
                    params['step'] = int(params.get('step', 1))
                except ValueError:
                    self.handler.error(
                        "The track%s has a non integer as step value.",
                        self.path, number)
                if params['step'] < 1:
                    self.handler.error(
                        "The track%s has a negative or null step value.",
                        self.path, number)
            continue
        # Not a directive line #
        if not params:
            self.handler.error(
                "The track%s is missing a fixedStep or variableStep directive.",
                self.path, number)
        # Fixed #
        if params['mode'] == 'fixedStep':
            try:
                line = float(line)
            except ValueError:
                self.handler.error(
                    "The track%s has non floats as score values.",
                    self.path, number)
            chrom = params['chrom']
            feature = [
                params['start'], params['start'] + params['span'], line
            ]
            # ``step`` is the distance between the starts of consecutive
            # values; ``span`` only sets how many bases each value covers.
            # Adding the span here as well shifted every later position
            # whenever span was greater than one.
            params['start'] += params['step']
        # Variable #
        elif params['mode'] == 'variableStep':
            line = line.split('\t')
            if len(line) == 1:
                line = line[0].split()
            try:
                line[0] = int(line[0])
                line[1] = float(line[1])
            except ValueError:
                self.handler.error("The track%s has invalid values.",
                                   self.path, number)
            except IndexError:
                self.handler.error("The track%s has missing values.",
                                   self.path, number)
            chrom = params['chrom']
            feature = [line[0], line[0] + params['span'], line[1]]
        # Ignore null scores #
        if feature[2] == 0.0:
            continue
        # Merge adjacent features with same scores #
        # For instance ['chr1', 10, 11, 9.8] and ['chr1', 11, 12, 9.8] should merge.
        if last_feature:
            if last_chrom == chrom:
                if last_feature[1] > feature[0]:
                    self.handler.error(
                        "The track%s has a start or span larger than its end or step.",
                        self.path, number)
                if floats_eq(last_feature[2], feature[2]) and overlapping(
                        last_feature[0], last_feature[1], feature[0],
                        feature[1]):
                    last_feature[0] = min(last_feature[0], feature[0])
                    last_feature[1] = max(last_feature[1], feature[1])
                    continue
            self.handler.newFeature(last_chrom, last_feature)
        last_feature = feature
        last_chrom = chrom
    # Whatever is still pending at the end of the file #
    if last_feature:
        self.handler.newFeature(last_chrom, last_feature)
def parse(self):
    """Read the nine-column, GTF-style file at ``self.path`` line by line.

    Data lines are split on tabs, or on any whitespace when they contain no
    tab; columns beyond the eighth are joined back with single spaces into
    one attribute column. The first column is the chromosome, followed by
    source, name, start, end, score, strand, frame and attributes.

    The attribute column is tokenised with ``shlex.split`` into alternating
    keys and values (a trailing ``;`` is stripped from each value) kept in a
    dict. On every line the dict's keys are declared after ``all_fields``
    through ``self.handler.defineFields`` and its values are appended to the
    feature given to ``self.handler.newFeature``.

    ``browser `` lines are ignored; a ``track `` line supplies ``key=value``
    attributes for the track opened on the next data line.

    Errors go through ``self.handler.error(message, path, line_number)`` --
    presumably that call raises; TODO confirm.
    """
    header = {}            # attributes of the latest track line
    track_pending = True   # a track must be opened before the next feature
    for number, line in iterate_lines(self.path):
        if line.startswith("browser "):
            continue
        if line.startswith("track "):
            # Everything after the "track " prefix is key=value pairs #
            try:
                header = dict([p.split('=', 1) for p in shlex.split(line[6:])])
            except ValueError:
                self.handler.error("The track%s seems to have an invalid <track> header line", self.path, number)
            track_pending = True
            continue
        # Tabs first, whitespace as a fallback #
        cols = line.split('\t')
        if len(cols) == 1:
            cols = line.split()
        # Re-assemble an attribute column broken up by the split #
        if len(cols) > 8:
            cols = cols[0:8] + [' '.join(cols[8:])]
        chrom = cols.pop(0)
        if len(cols) != 8:
            self.handler.error("The track%s doesn't have nine columns", self.path, number)
        if track_pending:
            track_pending = False
            self.handler.newTrack(header, self.name)
        # Source, then name: a dot stands for an empty value #
        if cols[0] == '.':
            cols[0] = ''
        if cols[1] == '.':
            cols[1] = ''
        # Start and end must be integers #
        try:
            cols[2] = int(cols[2])
            cols[3] = int(cols[3])
        except ValueError:
            self.handler.error("The track%s has non integers as interval bounds", self.path, number)
        # Score defaults to 0.0 when absent #
        if cols[4] in ('.', ''):
            cols[4] = 0.0
        try:
            cols[4] = float(cols[4])
        except ValueError:
            self.handler.error("The track%s has non floats as score values", self.path, number)
        cols[5] = strand_to_int(cols[5])
        # Frame is optional #
        if cols[6] == '.':
            cols[6] = None
        else:
            try:
                cols[6] = int(cols[6])
            except ValueError:
                self.handler.error("The track%s has non integers as frame value", self.path, number)
        # Attribute column: alternating keys and values #
        words = shlex.split(cols.pop())
        attr = dict([(words[i], words[i+1].strip(';')) for i in xrange(0, len(words), 2)])
        # keys() and values() of one dict come out in matching order #
        self.handler.defineFields(all_fields + attr.keys())
        cols += attr.values()
        self.handler.newFeature(chrom, cols)
def parse(self):
    """Parse the BED-style file found at ``self.path``.

    Data lines are split on tabs, or on any whitespace when they contain no
    tab. The first column is the chromosome; start and end are mandatory
    and converted to ``int``. The columns after them are optional and
    handled only as far as the line goes: name (``.`` becomes ``''``),
    score (``float``; ``.`` or empty become ``0.0``), strand (through
    ``strand_to_int``), thick start and thick end (``int``). Any further
    columns are left as they are. The resulting list is given to
    ``self.handler.newFeature``.

    ``browser `` lines are skipped; a ``track `` line supplies ``key=value``
    attributes, and the next data line opens a new track whose fields are
    the first ``len(columns)`` entries of ``all_fields``.

    Problems are reported through ``self.handler.error(message, path,
    line_number)`` -- presumably that call raises; TODO confirm.
    """
    # Initial variables #
    fields = []
    info = {}
    # Main loop #
    for number, line in iterate_lines(self.path):
        # Ignored lines #
        if line.startswith("browser "):
            continue
        # Track headers #
        if line.startswith("track "):
            try:
                info = dict(
                    [p.split('=', 1) for p in shlex.split(line[6:])])
            except ValueError:
                self.handler.error(
                    "The track%s seems to have an invalid <track> header line",
                    self.path, number)
            fields = []
            continue
        # Split the lines #
        items = line.split('\t')
        if len(items) == 1:
            items = line.split()
        # Chromosome #
        chrom = items.pop(0)
        # Have we started a track already ? #
        if not fields:
            self.handler.newTrack(info, self.name)
            fields = all_fields[0:len(items)]
            self.handler.defineFields(fields)
        # Start and end fields #
        try:
            items[0] = int(items[0])
            items[1] = int(items[1])
        except ValueError:
            self.handler.error(
                "The track%s has non integers as interval bounds",
                self.path, number)
        except IndexError:
            self.handler.error("The track%s has less than two columns",
                               self.path, number)
        # All following fields are optional #
        try:
            # Name field #
            if items[2] == '.':
                items[2] = ''
            # Score field #
            if items[3] == '.' or items[3] == '':
                items[3] = 0.0
            try:
                items[3] = float(items[3])
            except ValueError:
                self.handler.error(
                    "The track%s has non floats as score values",
                    self.path, number)
            # Strand field #
            items[4] = strand_to_int(items[4])
            # Thick starts: positions, hence integers as the message says #
            try:
                items[5] = int(items[5])
            except ValueError:
                self.handler.error(
                    "The track%s has non integers as thick starts",
                    self.path, number)
            # Thick ends #
            try:
                items[6] = int(items[6])
            except ValueError:
                self.handler.error(
                    "The track%s has non integers as thick ends",
                    self.path, number)
            # Too many fields #
            if len(items) > 11:
                self.handler.error(
                    "The track%s has more than twelve columns",
                    self.path, number)
        # All index errors are ignored since the fields above three are optional #
        except IndexError:
            pass
        # Emit the feature only once the line went through cleanly. This used
        # to sit in a ``finally`` clause, which also ran while an exception
        # raised above was propagating and so emitted the invalid feature.
        self.handler.newFeature(chrom, items)