Пример #1
0
class MacsXLS:
    """Class for reading and manipulating XLS output from MACS

    Reads the XLS output file from the MACS peak caller and
    processes and stores the information for subsequent manipulation
    and output.

    To read in data from a MACS output file:

    >>> macs = MacsXLS("macs.xls")

    This reads in the data and prepends an additional 'order'
    column (a list of numbers from one to the number of data
    lines).

    To get the MACS version:

    >>> macs.macs_version
    '2.0.10'

    To access the 'header' information (as a Python list):

    >>> macs.header

    To see the column names (as a Python list):

    >>> macs.columns

    The data is stored as a TabFile object; to access the data
    use the 'data' property, e.g.

    >>> for line in macs.data:
    ...    print("Chr %s Start %s End %s" % (line['chr'],line['start'],line['end']))

    To sort the data on a particular column use the 'sort_on'
    method, e.g.

    >>> macs.sort_on('chr')

    (Note that the order column is always recalculated after
    sorting.)

    """

    def __init__(self,filen=None,fp=None,name=None):
        """Create a new MacsXLS instance

        Arguments:
          filen: name of the file to read the MACS output from.
            If None then fp argument must be supplied instead.
          fp: file-like object opened for reading (any iterable
            yielding lines is accepted). If None then filen
            argument must be supplied instead. If both filen
            and fp are supplied then fp will be used preferentially.
          name: optional name to associate with the data. If None
            then the value from a "# name = ..." header line is
            used, if one is present.

        Raises:
          Exception: if no MACS version line is found in the input.

        """
        # Store data
        self.__filen = filen
        self.__name = name
        self.__macs_version = None
        self.__command_line = None
        self.__header = []
        self.__data = None
        # Prefixes of the header lines that carry metadata
        version_prefix = "# This file is generated by MACS version "
        name_prefix = "# name = "
        command_prefix = "# Command line: "
        # Open file, if necessary; only a handle opened here is
        # closed here, a caller-supplied 'fp' is left open
        opened_here = fp is None
        if opened_here:
            fp = open(filen,'r')
        try:
            for line in fp:
                line = line.strip()
                if line.startswith('#') or line == '':
                    # Header line (comment or blank)
                    self.__header.append(line)
                    # Detect/extract data from header
                    if line.startswith(version_prefix):
                        # Version is the 9th whitespace-separated token
                        self.__macs_version = line.split()[8]
                    elif self.__name is None and \
                         line.startswith(name_prefix):
                        # Only take the name from the file if none
                        # was supplied explicitly
                        self.__name = line[len(name_prefix):]
                    elif line.startswith(command_prefix):
                        self.__command_line = line[len(command_prefix):]
                elif self.__data is None:
                    # First non-header line should be the column names
                    columns = line.split('\t')
                    # Insert an additional column called 'order'
                    columns.insert(0,"order")
                    # Set up TabFile to handle actual data
                    self.__data = TabFile(column_names=columns)
                else:
                    # Assume it's actual data and store it; the leading
                    # tab leaves the 'order' column empty for now
                    self.__data.append(tabdata="\t%s" % line)
        finally:
            # Close the file handle if we opened it, even if the
            # parsing above failed part way through
            if opened_here:
                fp.close()
        # Check that we actually got a version line
        if self.macs_version is None:
            raise Exception("Failed to extract MACS version, not a MACS output file?")
        # Populate the 'order' column
        self.update_order()

    @property
    def filen(self):
        """Return the source file name

        This is the 'filen' value supplied when the instance was
        created (which may be None).

        """
        return self.__filen

    @property
    def name(self):
        """Return the name property

        Either the name supplied on creation, or else the value
        from the "# name = ..." header line (None if neither was
        available).

        """
        return self.__name

    @property
    def macs_version(self):
        """Return the MACS version extracted from the file

        """
        return self.__macs_version

    @property
    def command_line(self):
        """Return the command line string extracted from the header

        This is the value associated with the "# Command line: ..."
        header line.

        Will be 'None' if no matching header line is found, else is
        the string following the ':'.

        """
        return self.__command_line

    @property
    def columns(self):
        """Return the column names for the MACS data

        Returns a list of the column names from the data
        extracted from the file.

        """
        return self.__data.header()

    @property
    def columns_as_xls_header(self):
        """Returns the column name list, with hash prepended

        Only the first column name has the '#' prepended.

        """
        return ['#'+self.columns[0]] + self.columns[1:]

    @property
    def header(self):
        """Return the header data from the file

        Returns a list of lines comprising the header
        extracted from the file.

        """
        return self.__header

    @property
    def data(self):
        """Return the data from the file

        Returns a TabFile object comprising the data
        extracted from the file, or None if the input had no
        column name line.

        """
        return self.__data

    @property
    def with_broad_option(self):
        """Returns True if MACS was run with --broad option

        If --broad wasn't detected then returns False.

        """
        if self.macs_version.startswith('1.'):
            # Not an option in MACS 1.*
            return False
        if self.command_line is not None:
            # Was --broad specified in the command line?
            return '--broad' in self.command_line.split()
        # No command line? Check for 'abs_summit' column
        return 'abs_summit' not in self.columns

    def sort_on(self,column,reverse=True):
        """Sort data on specified column

        Sorts the data in-place, by the specified column.

        By default data is sorted in descending order; set
        'reverse' argument to False to sort values in ascending
        order instead

        Note that the 'order' column is automatically updated
        after each sorting operation.

        Arguments:
          column: name of the column to sort on
          reverse: if True (default) then sort in descending
            order (i.e. largest to smallest). Otherwise sort in
            ascending order.

        """
        # Sort the data
        self.__data.sort(lambda line: line[column],reverse=reverse)
        # Update the 'order' column
        self.update_order()

    def update_order(self):
        """Set/update the values in the 'order' column

        Numbers the data lines from one upwards according to
        their current position. Does nothing if no data has
        been loaded (i.e. the input had no column name line).

        """
        if self.__data is None:
            return
        for i in range(len(self.__data)):
            self.__data[i]['order'] = i+1
Пример #2
0
            break
    if macs_version is None:
        logging.error("couldn't detect MACS version")
        sys.exit(1)
    else:
        print "Input file is from MACS %s" % macs_version

    # Don't try to convert output from MACS2
    if macs_version.startswith("2."):
        logging.error(
            "input XLS comes from MACS %s, this version only handles 1.4" %
            macs_version)
        sys.exit(1)

    # Sort into order by fold_enrichment and then by -10*log10(pvalue) column
    data.sort(lambda line: line['fold_enrichment'], reverse=True)
    data.sort(lambda line: line['-10*log10(pvalue)'], reverse=True)

    # Restore first line
    data.insert(0, tabdata=header_line)

    # Insert "order" column
    data.appendColumn("order")
    # Perhaps confusingly must also insert initial value "#order"
    data[0]['order'] = "#order"
    for i in range(1, len(data)):
        data[i]['order'] = i
    # Reorder columns to put it at the start
    data = data.reorderColumns([
        'order', 'chr', 'start', 'end', 'length', 'summit', 'tags',
        '-10*log10(pvalue)', 'fold_enrichment', 'FDR(%)'
            macs_version = line.split()[8]
            break
    if macs_version is None:
        logging.error("couldn't detect MACS version")
        sys.exit(1)
    else:
        print "Input file is from MACS %s" % macs_version

    # Don't try to convert output from MACS2
    if macs_version.startswith("2."):
        logging.error("input XLS comes from MACS %s, this version only handles 1.4" %
                      macs_version)
        sys.exit(1)

    # Sort into order by fold_enrichment and then by -10*log10(pvalue) column
    data.sort(lambda line: line['fold_enrichment'],reverse=True)
    data.sort(lambda line: line['-10*log10(pvalue)'],reverse=True)

    # Restore first line
    data.insert(0,tabdata=header_line)

    # Insert "order" column
    data.appendColumn("order")
    # Perhaps confusingly must also insert initial value "#order"
    data[0]['order'] = "#order"
    for i in range(1,len(data)):
        data[i]['order'] = i
    # Reorder columns to put it at the start
    data = data.reorderColumns(['order','chr','start','end','length','summit','tags',
                                '-10*log10(pvalue)','fold_enrichment','FDR(%)'])
Пример #4
0
class MacsXLS:
    """Class for reading and manipulating XLS output from MACS

    Reads the XLS output file from the MACS peak caller and
    processes and stores the information for subsequent manipulation
    and output.

    To read in data from a MACS output file:

    >>> macs = MacsXLS("macs.xls")

    This reads in the data and prepends an additional 'order'
    column (a list of numbers from one to the number of data
    lines).

    To get the MACS version:

    >>> macs.macs_version
    '2.0.10'

    To access the 'header' information (as a Python list):

    >>> macs.header

    To see the column names (as a Python list):

    >>> macs.columns

    The data is stored as a TabFile object; to access the data
    use the 'data' property, e.g.

    >>> for line in macs.data:
    ...    print("Chr %s Start %s End %s" % (line['chr'],line['start'],line['end']))

    To sort the data on a particular column use the 'sort_on'
    method, e.g.

    >>> macs.sort_on('chr')

    (Note that the order column is always recalculated after
    sorting.)

    """

    def __init__(self,filen=None,fp=None,name=None):
        """Create a new MacsXLS instance

        Arguments:
          filen: name of the file to read the MACS output from.
            If None then fp argument must be supplied instead.
          fp: file-like object opened for reading (any iterable
            yielding lines is accepted). If None then filen
            argument must be supplied instead. If both filen
            and fp are supplied then fp will be used preferentially.
          name: optional name to associate with the data. If None
            then the value from a "# name = ..." header line is
            used, if one is present.

        Raises:
          Exception: if no MACS version line is found in the input.

        """
        # Store data
        self.__filen = filen
        self.__name = name
        self.__macs_version = None
        self.__command_line = None
        self.__header = []
        self.__data = None
        # Prefixes of the header lines that carry metadata
        version_prefix = "# This file is generated by MACS version "
        name_prefix = "# name = "
        command_prefix = "# Command line: "
        # Open file, if necessary; only a handle opened here is
        # closed here, a caller-supplied 'fp' is left open
        opened_here = fp is None
        if opened_here:
            fp = open(filen,'r')
        try:
            for line in fp:
                line = line.strip()
                if line.startswith('#') or line == '':
                    # Header line (comment or blank)
                    self.__header.append(line)
                    # Detect/extract data from header
                    if line.startswith(version_prefix):
                        # Version is the 9th whitespace-separated token
                        self.__macs_version = line.split()[8]
                    elif self.__name is None and \
                         line.startswith(name_prefix):
                        # Only take the name from the file if none
                        # was supplied explicitly
                        self.__name = line[len(name_prefix):]
                    elif line.startswith(command_prefix):
                        self.__command_line = line[len(command_prefix):]
                elif self.__data is None:
                    # First non-header line should be the column names
                    columns = line.split('\t')
                    # Insert an additional column called 'order'
                    columns.insert(0,"order")
                    # Set up TabFile to handle actual data
                    self.__data = TabFile(column_names=columns)
                else:
                    # Assume it's actual data and store it; the leading
                    # tab leaves the 'order' column empty for now
                    self.__data.append(tabdata="\t%s" % line)
        finally:
            # Close the file handle if we opened it, even if the
            # parsing above failed part way through
            if opened_here:
                fp.close()
        # Check that we actually got a version line
        if self.macs_version is None:
            raise Exception("Failed to extract MACS version, not a MACS output file?")
        # Populate the 'order' column
        self.update_order()

    @property
    def filen(self):
        """Return the source file name

        This is the 'filen' value supplied when the instance was
        created (which may be None).

        """
        return self.__filen

    @property
    def name(self):
        """Return the name property

        Either the name supplied on creation, or else the value
        from the "# name = ..." header line (None if neither was
        available).

        """
        return self.__name

    @property
    def macs_version(self):
        """Return the MACS version extracted from the file

        """
        return self.__macs_version

    @property
    def command_line(self):
        """Return the command line string extracted from the header

        This is the value associated with the "# Command line: ..."
        header line.

        Will be 'None' if no matching header line is found, else is
        the string following the ':'.

        """
        return self.__command_line

    @property
    def columns(self):
        """Return the column names for the MACS data

        Returns a list of the column names from the data
        extracted from the file.

        """
        return self.__data.header()

    @property
    def columns_as_xls_header(self):
        """Returns the column name list, with hash prepended

        Only the first column name has the '#' prepended.

        """
        return ['#'+self.columns[0]] + self.columns[1:]

    @property
    def header(self):
        """Return the header data from the file

        Returns a list of lines comprising the header
        extracted from the file.

        """
        return self.__header

    @property
    def data(self):
        """Return the data from the file

        Returns a TabFile object comprising the data
        extracted from the file, or None if the input had no
        column name line.

        """
        return self.__data

    @property
    def with_broad_option(self):
        """Returns True if MACS was run with --broad option

        If --broad wasn't detected then returns False.

        """
        if self.macs_version.startswith('1.'):
            # Not an option in MACS 1.*
            return False
        if self.command_line is not None:
            # Was --broad specified in the command line?
            return '--broad' in self.command_line.split()
        # No command line? Check for 'abs_summit' column
        return 'abs_summit' not in self.columns

    def sort_on(self,column,reverse=True):
        """Sort data on specified column

        Sorts the data in-place, by the specified column.

        By default data is sorted in descending order; set
        'reverse' argument to False to sort values in ascending
        order instead

        Note that the 'order' column is automatically updated
        after each sorting operation.

        Arguments:
          column: name of the column to sort on
          reverse: if True (default) then sort in descending
            order (i.e. largest to smallest). Otherwise sort in
            ascending order.

        """
        # Sort the data
        self.__data.sort(lambda line: line[column],reverse=reverse)
        # Update the 'order' column
        self.update_order()

    def update_order(self):
        """Set/update the values in the 'order' column

        Numbers the data lines from one upwards according to
        their current position. Does nothing if no data has
        been loaded (i.e. the input had no column name line).

        """
        if self.__data is None:
            return
        for i in range(len(self.__data)):
            self.__data[i]['order'] = i+1