def __init__(self, autosql, parent=None, delim=""):
    """Create an |AutoSqlField|

    Parameters
    ----------
    autosql : str
        Block of autoSql text specifying format of element

    parent : instance of subclass of |AbstractAutoSqlObject| or None, optional
        Parent / enclosing element. Default: `None`

    delim : str, optional
        Field delimiter (default: tab)
    """
    AbstractAutoSqlElement.__init__(self, autosql, parent=parent, delim=delim)
    type_ = self.attr["type"]
    try:
        self.formatter = self.field_types[type_][0]
    except KeyError:
        # fall back to the parent's type registry; if the type is unknown
        # there too (or there is no parent), treat the field as a string
        try:
            self.formatter = self.parent.field_types[type_][0]
        except (AttributeError, KeyError):
            self.formatter = str
            warn(
                "Could not find formatter for field '%s' of type '%s'. Casting to 'string' instead."
                % (self.attr["name"], type_),
                DataWarning,
            )
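# Illustrative sketch (not part of the library): the lookup-with-fallback
# pattern used in __init__ above, where an unknown autoSql type degrades
# gracefully to ``str``. The registry contents below are hypothetical.
_DEMO_FIELD_TYPES = {"int": (int, "integer"), "float": (float, "floating point")}

def _demo_formatter_for(type_, parent_types=None):
    """Return a formatter for `type_`, falling back to `str` when unknown."""
    try:
        return _DEMO_FIELD_TYPES[type_][0]
    except KeyError:
        try:
            return (parent_types or {})[type_][0]
        except KeyError:
            return str  # same graceful degradation as AutoSqlField above

# _demo_formatter_for("int")     -> <class 'int'>
# _demo_formatter_for("lstring") -> <class 'str'>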
def _assemble(self, line):
    """Read `BED`_ files line-by-line into types specified by ``self.return_type``"""
    self.counter += 1
    if line.strip() == "":
        return self.__next__()
    elif line.startswith("browser"):
        return self.__next__()
    elif line.startswith("track"):
        # reset metadata
        self._parse_track_line(line[5:])
        return self.__next__()
    elif line.startswith("#"):
        return self.__next__()
    else:
        try:
            return self.return_type.from_bed(line, extra_columns=self.extra_columns)
        except Exception:
            self.rejected.append(line)
            msg = "Cannot parse BED line number %s. " % self.counter
            if self.metadata.get("type", None) is not None:
                msg += "Are you sure this is a %s BED file with extra columns (%s)?" % (
                    self.metadata.get("type"),
                    self._get_extra_column_names(),
                )
            elif self.extra_columns != 0:
                msg += "Are you sure this BED file has extra columns (%s)?" % self._get_extra_column_names()
            else:
                msg += "Maybe this BED file has extra columns (i.e. is an extended BED file)?"
            msg += "\n    %s" % line
            warn(msg, FileFormatWarning)
            return self.__next__()
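# Self-contained sketch of the line-skipping logic above, handy for
# eyeballing which lines a BED reader would treat as data. The name below
# is illustrative, not part of the library.
def _demo_is_bed_data_line(line):
    """Return `True` if `line` would reach ``return_type.from_bed`` above."""
    return not (
        line.strip() == ""
        or line.startswith("browser")
        or line.startswith("track")
        or line.startswith("#")
    )

# _demo_is_bed_data_line("chr1\t100\t200")  -> True
# _demo_is_bed_data_line("track name=foo")  -> False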
def parse_GFF3_tokens(inp, list_types=None):
    """Helper function to parse tokens in the final column of a `GFF3`_ file
    into a dictionary of attributes. Because the following attributes are
    permitted to have multiple values under the `GFF3`_ spec, their values,
    if present, are returned as lists in the dictionary rather than strings:

        - `Parent`
        - `Alias`
        - `Note`
        - `Dbxref`
        - `Ontology_term`

    All values are unescaped following the `GFF3`_ specification.

    Examples
    --------
        >>> tokens = 'a=1;c=3;b=2;e=5;d=4;z=26;Parent=gene01'
        >>> parse_GFF3_tokens(tokens)
        {'a': '1', 'c': '3', 'b': '2', 'e': '5', 'd': '4', 'z': '26', 'Parent': ['gene01']}

        >>> tokens = 'a=1;c=3,7;b=2;e=5;d=4;z=26;Parent=gene01,gene02'
        >>> parse_GFF3_tokens(tokens)
        {'a': '1', 'c': '3,7', 'b': '2', 'e': '5', 'd': '4', 'z': '26', 'Parent': ['gene01', 'gene02']}

    Parameters
    ----------
    inp : str
        Ninth column of `GFF3`_ entry

    list_types : list, optional
        Names of attributes that should be returned as lists
        (Default: the attributes listed above, i.e. ``_GFF3_DEFAULT_LISTS``)

    Returns
    -------
    dict : key-value pairs
    """
    if list_types is None:
        list_types = _GFF3_DEFAULT_LISTS

    d = {}
    items = inp.strip("\n").strip(";").split(";")
    for item in items:
        if len(item) > 0:
            # split on the first '=' only, in case an escaped value still
            # contains an equals sign
            key, val = item.split("=", 1)
            key = unescape_GFF3(key.strip(" "))
            if key in list_types:
                val = [unescape_GFF3(X) for X in val.strip(" ").split(",")]
            else:
                val = unescape_GFF3(val.strip(" "))

            if key in d:
                warn(
                    "Found duplicate attribute key '%s' in GFF3 line. Catenating value with previous value for key in attr dict:\n    %s"
                    % (key, inp),
                    FileFormatWarning,
                )
                val = "%s,%s" % (d[key], val)
            d[key] = val
    return d
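# Standalone sketch of the column-9 tokenization above, using only the
# standard library. `unquote` stands in for `unescape_GFF3`, which may handle
# additional cases; the sample attribute string is invented.
from urllib.parse import unquote

def _demo_gff3_attributes(col9='ID=gene01;Note=alpha%2Cbeta;Parent=mrna01,mrna02'):
    d = {}
    for item in col9.strip("\n").strip(";").split(";"):
        key, val = item.split("=", 1)
        if key in ("Parent", "Alias", "Note", "Dbxref", "Ontology_term"):
            # list-valued attributes: split on ',' before unescaping, so
            # escaped commas (%2C) inside a value survive
            d[key] = [unquote(v) for v in val.split(",")]
        else:
            d[key] = unquote(val)
    return d

# _demo_gff3_attributes()
# -> {'ID': 'gene01', 'Note': ['alpha,beta'], 'Parent': ['mrna01', 'mrna02']}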
def _parse_fields(self):
    """Parse fields of an autoSql declaration, and populate
    ``self.field_formatters`` and ``self.field_comments``.
    """
    # order in which we try to match autoSql fields
    match_order = [AutoSqlField, SizedAutoSqlField, ValuesAutoSqlField]

    # fields are the area of the string from the last starting point to the
    # end of a comment. The first starting point is 0; all subsequent
    # starting points are the end of the previous comment
    _, comment_locs = self.mask_comments(self._field_text)
    last_index = 0
    for (_, next_index) in comment_locs:
        field_str = self._field_text[last_index:next_index + 1]
        for field_class in match_order:
            if field_class.matches(field_str):
                my_parser = field_class(field_str)
                name = my_parser.attr["name"]
                if name in self.field_formatters:
                    oldname = name
                    i = 1
                    current_formatter = self.field_formatters[name]
                    current_type = current_formatter.attr.get("type", current_formatter.__class__.__name__)
                    new_type = my_parser.attr.get("type", my_parser.__class__.__name__)
                    # append an integer suffix until the name is unique
                    while name in self.field_formatters:
                        i += 1
                        name = "%s%s" % (oldname, i)
                    warn(
                        "Element named '%s' of type '%s' already found in autoSql declaration '%s'. Renaming current element of type '%s' to '%s'"
                        % (oldname, current_type, self.attr.get("name", "unnamed declaration"), new_type, name),
                        DataWarning,
                    )
                    my_parser.attr["name"] = name

                self.field_formatters[name] = my_parser
                self.field_comments[name] = my_parser.attr["comment"]
        last_index = next_index + 1
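# Tiny illustration (hypothetical helper, not part of the library) of the
# renaming scheme above: a clashing field name gets an integer suffix,
# starting at 2.
def _demo_unique_name(name, taken):
    base, i = name, 1
    while name in taken:
        i += 1
        name = "%s%s" % (base, i)
    return name

# _demo_unique_name("score", {"score"})           -> 'score2'
# _demo_unique_name("score", {"score", "score2"}) -> 'score3'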
def __call__(self, text, rec=None):
    """Parse a value matching the field described by ``self.autosql``
    from a block of delimited text

    Parameters
    ----------
    text : str
        Multiline text block, formatted in autoSql

    rec : OrderedDict or None, optional
        Record whose attributes are being populated by recursive processing
        of ``text``. Unused by this field type; accepted for interface
        compatibility with sized fields

    Returns
    -------
    Value or object of appropriate type
    """
    try:
        return self.formatter(text)
    except ValueError:
        message = "Could not convert autoSql value '%s' for field '%s' to type '%s'. Casting to 'string' instead." % (
            text, self.attr["name"], self.formatter.__name__
        )
        warn(message, DataWarning)
        return text
def _assemble(self, line):
    """Read `PSL`_ files line-by-line into types specified by ``self.return_type``"""
    self.counter += 1
    if line.strip() == "":
        return self.__next__()
    elif line.startswith("psLayout"):
        return self.__next__()
    elif line.lstrip().startswith("match"):
        return self.__next__()
    elif line.startswith("--"):
        return self.__next__()
    elif line.startswith("#"):
        return self.__next__()
    else:
        try:
            return self.return_type.from_psl(line)
        except Exception as e:
            self.rejected.append(line)
            # `e.message` was removed in Python 3; use str(e) instead
            warn(
                "Rejecting line %s because of %s: %s" % (self.counter, str(e), line),
                FileFormatWarning,
            )
            return self.__next__()
def __call__(self, text, rec=None):
    """Parse a value matching the field described by ``self.autosql``
    from a block of delimited text

    Parameters
    ----------
    text : str
        Multiline text block, formatted in autoSql

    rec : OrderedDict or None, optional
        Record whose attributes are being populated by recursive processing
        of ``text``. Passed in cases where fields sized by variables need
        to look up instance values of earlier fields to evaluate those
        variables

    Returns
    -------
    tuple
        Tuple of appropriate type
    """
    if self.formatter != str:
        try:
            retval = tuple(self.formatter(X) for X in text.strip().strip(self.delim).split(self.delim))
        except ValueError:
            message = "Could not convert autoSql value '%s' in field '%s' to tuple of type '%s'. Leaving as str." % (
                text, self.attr["name"], self.formatter.__name__
            )
            warn(message, DataWarning)
            return text
    else:
        retval = text

    # fields may be sized either by an integer literal or by the value of a
    # previously parsed field, looked up in `rec`
    if self.attr["size_is_int"]:
        assert len(retval) == self.attr["size"]
    else:
        assert len(retval) == rec[self.attr["size"]]

    return retval
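# Minimal sketch of the delimited-text-to-tuple conversion above, with a
# hypothetical comma delimiter and int formatter (the real delimiter and
# formatter come from the autoSql declaration):
def _demo_parse_tuple(text, formatter=int, delim=","):
    try:
        return tuple(formatter(X) for X in text.strip().strip(delim).split(delim))
    except ValueError:
        return text  # same fallback as above: leave unconvertible text as str

# _demo_parse_tuple("1,2,3,") -> (1, 2, 3)
# _demo_parse_tuple("a,b")    -> 'a,b'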
def _parse_track_line(self, inp):
    """Parse a track definition line from a `BED`_ / extended BED file,
    and store its key-value pairs in ``self.metadata``

    Parameters
    ----------
    inp : str
        track definition line from `BED`_ / extended BED file
    """
    self.metadata = {}
    ltmp = shlex.split(inp.strip("\n"))
    for item in ltmp:
        # split on the first '=' only, in case a value contains '='
        k, v = item.split("=", 1)
        self.metadata[k] = v

    track_type = self.metadata.get("type", None)
    if track_type is not None:
        if track_type in bed_x_formats:
            self.printer.write(
                "Found track type '%s' in track definition line. Assuming extra columns follow UCSC definitions."
                % track_type
            )
            if self.extra_columns == 0:
                self.extra_columns = bed_x_formats[track_type]
            elif self.extra_columns != bed_x_formats[track_type]:
                my_columns = self._get_extra_column_names()
                track_format_columns = ",".join([X[0] for X in bed_x_formats[track_type]])
                warn(
                    "Extra columns specified by %s track type declaration (%s) don't match those specified by user (%s). Using those specified by user."
                    % (track_type, track_format_columns, my_columns),
                    FileFormatWarning,
                )
                self.metadata["type"] = "custom"
        else:
            self.printer.write("Found track type '%s' in track definition line." % track_type)
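# Standalone illustration of the track-line tokenization above, using only
# the standard library; the sample line is invented.
import shlex

def _demo_track_metadata(line='type=bedDetail name="my track" useScore=1'):
    # shlex.split honors shell-style quoting, so name="my track" survives
    # as the single token 'name=my track'
    return dict(item.split("=", 1) for item in shlex.split(line))

# _demo_track_metadata()
# -> {'type': 'bedDetail', 'name': 'my track', 'useScore': '1'}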
def parse_GTF2_tokens(inp):
    """Helper function to parse tokens in the final column of a `GTF2`_ file
    into a dictionary of attributes. All attributes are returned as strings,
    and are unescaped if GFF escape sequences (e.g. *'%2B'*) are present.

    If duplicate keys are present (e.g. as in GENCODE `GTF2`_ files),
    their values are catenated, separated by a comma.

    Examples
    --------
        >>> tokens = 'gene_id "mygene"; transcript_id "mytranscript";'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id': 'mygene', 'transcript_id': 'mytranscript'}

        >>> tokens = 'gene_id "mygene"; transcript_id "mytranscript"'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id': 'mygene', 'transcript_id': 'mytranscript'}

        >>> tokens = 'gene_id "mygene;"; transcript_id "myt;ranscript"'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id': 'mygene;', 'transcript_id': 'myt;ranscript'}

        >>> tokens = 'gene_id "mygene"; transcript_id "mytranscript"; tag "tag value";'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id': 'mygene', 'tag': 'tag value', 'transcript_id': 'mytranscript'}

        >>> tokens = 'gene_id "mygene"; transcript_id "mytranscript"; tag "tag value"; tag "tag value 2";'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id': 'mygene', 'tag': 'tag value,tag value 2', 'transcript_id': 'mytranscript'}

    Parameters
    ----------
    inp : str
        Ninth column of `GTF2`_ entry

    Returns
    -------
    dict : key-value pairs
    """
    d = {}
    items = shlex.split(inp.strip("\n"))
    assert len(items) % 2 == 0
    for i in range(0, len(items), 2):
        key = unescape_GTF2(items[i])
        val = items[i + 1]
        # require separation by semicolons for all but the final token
        if i + 1 < len(items) - 2:
            assert val.endswith(";")
        if val.endswith(";"):
            val = val[:-1]

        if key in d:
            warn(
                "Found duplicate attribute key '%s' in GTF2 line. Catenating value with previous value for key in attr dict:\n    %s"
                % (key, inp),
                FileFormatWarning,
            )
            d[key] = "%s,%s" % (d[key], unescape_GTF2(val))
        else:
            d[key] = unescape_GTF2(val)
    return d
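# Standalone sketch of the token pairing above: shlex yields alternating
# key / value tokens, and trailing semicolons stick to the values. The
# sample input is invented, and rstrip(";") is a simplification of the
# single-character strip used above.
import shlex

def _demo_gtf2_pairs(col9='gene_id "mygene"; transcript_id "mytranscript";'):
    items = shlex.split(col9.strip("\n"))
    # items == ['gene_id', 'mygene;', 'transcript_id', 'mytranscript;']
    return {items[i]: items[i + 1].rstrip(";") for i in range(0, len(items), 2)}

# _demo_gtf2_pairs() -> {'gene_id': 'mygene', 'transcript_id': 'mytranscript'}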
def main(argv=sys.argv[1:]):
    """Command-line program

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]` (the actual command-line arguments)
    """
    ap = AnnotationParser()
    bp = BaseParser()
    annotation_parser = ap.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        parents=[base_parser, annotation_parser],
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--no_escape",
        default=True,
        action="store_false",
        help="If specified and output format is GTF2, special characters in column 9 will NOT be escaped (default: escape)",
    )
    parser.add_argument(
        "--output_format",
        choices=["BED", "GTF2"],
        default="GTF2",
        help="Format of output file (default: GTF2)",
    )
    parser.add_argument(
        "--extra_columns",
        nargs="+",
        default=[],
        type=str,
        help="Attributes (e.g. 'gene_id') to output as extra columns in extended BED format (BED output only).",
    )
    parser.add_argument(
        "--empty_value",
        default="na",
        type=str,
        help="Value to use if an attribute in `extra_columns` is not defined for a particular record (Default: 'na')",
    )
    parser.add_argument("outfile", metavar="outfile.[ bed | gtf ]", type=str, help="Output file")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    end_message = ""
    extra_cols = args.extra_columns
    # `extra_cols` defaults to [], so test truthiness rather than `is not None`;
    # otherwise the warning below would fire even when no extra columns were requested
    if len(extra_cols) > 0:
        if args.output_format == "BED":
            # avoid name clashes with reserved BED12 column names
            names_used = copy.copy(BED12_RESERVED_NAMES)
            asql_names = [fix_name(X, names_used) for X in extra_cols]
            autosql_str = "\n".join(
                AUTOSQL_ROW_FMT_STR % (X, " " * max(15 - len(X), 2)) for X in asql_names
            )

            file_info = {
                "outbase": args.outfile.replace(".bed", "").replace(".gtf", ""),
                "numcols": len(extra_cols),
                "autosql": DEFAULT_AUTOSQL_STR % (os.path.basename(args.outfile[:-4]), autosql_str),
            }
            end_message = MAKE_BIGBED_MESSAGE % file_info
        else:
            warn("`--extra_columns` is ignored for %s-formatted output." % args.output_format, ArgumentWarning)

    with argsopener(args.outfile, args, "w") as fout:
        c = 0
        transcripts = ap.get_transcripts_from_args(args, printer=printer)

        for transcript in transcripts:
            if args.output_format == "GTF2":
                fout.write(transcript.as_gtf(escape=args.no_escape))
            elif args.output_format == "BED":
                fout.write(transcript.as_bed(extra_columns=extra_cols, empty_value=args.empty_value))

            if c % 1000 == 1:
                printer.write("Processed %s transcripts ..." % c)
            c += 1

    printer.write("Processed %s transcripts total." % c)
    printer.write("Done.")
    print(end_message)
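# Hypothetical invocation (file names invented; annotation-input flags come
# from AnnotationParser and are omitted here), using only options defined above:
#
#     $ python this_script.py --output_format BED --extra_columns gene_id \
#           --empty_value na transcripts.bed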