示例#1
0
    def getSalltitles(self, hit_def_array):
        """Return the title part of every entry of a split hit definition.

        Each entry of hit_def_array is split once on whitespace: the first
        word is treated as the sequence id and the remainder as its title.
        An entry consisting of an id only gets a placeholder title.

        Example: joining the returned titles with "<>" yields a salltitles
        value such as "Mus musculus ribosomal protein S8 (Rps8), mRNA
        <>Mus musculus ES cells cDNA, RIKEN full-length enriched library,
        clone:2410041L12 product:ribosomal protein S8, full insert sequence"

        hit_def_array: iterable of "<id> <title>" strings.
        Returns: list of title strings, one per entry, in input order.
        An empty / whitespace-only entry is reported via common.stop_err().
        """
        salltitles = []
        try:
            for name in hit_def_array:
                id_desc = name.split(None, 1)
                if len(id_desc) == 1:
                    salltitles.append('missing title - database issue')
                else:
                    # An empty entry splits to [] and raises IndexError here.
                    salltitles.append(id_desc[1])
        except IndexError as e:
            # Report the array we were actually given; the name previously
            # used here (hit_def) does not exist in this scope.
            common.stop_err("Problem splitting multiple hits?\n%r\n--> %s" %
                            (hit_def_array, e))

        return salltitles
示例#2
0
    def build_bins(self, bins, columns):
        """Parse a bin specification string and register the requested bins.

        bins: None, or a semicolon-delimited list of entries of the form
            "name[:grouping[:filter[:description]]]" where
              grouping    is 'table', 'column' (default) or 'hidden',
              filter      is '', 'include' or 'exclude' (default ''),
              description is 'true' to also add a "<name>_desc" text column.
            Empty entries are ignored.
        columns: list of column dictionaries; one dictionary per bin (plus
            one per requested description column) is appended in place.

        For every entry a bin is created with self.buildBin() and appended to
        self.reference_bins.  Invalid names, groupings or filters are
        reported through common.stop_err().

        Returns False when bins is None (self.reference_bins is then reset
        to an empty list); otherwise returns None.
        """
        if bins is None:
            self.reference_bins = []
            return False

        for myfield in bins.strip().strip(';').split(';'):
            field_spec = myfield.strip().split(':')
            field_name = field_spec[0].strip()
            if field_name == '':
                continue  # nothing between two separators

            if not field_name.replace('_', '').isalnum():
                common.stop_err("Invalid bin name: " + field_name + ':' +
                                myfield)

            # Pad missing trailing components with their defaults:
            # grouping = 'column', filtering = none, no description column.
            field_spec.extend(['column', '', ''][len(field_spec) - 1:])

            grouping = field_spec[1].strip()
            if grouping not in ('table', 'column', 'hidden'):
                common.stop_err("Invalid bin layout: " + grouping)

            bin_filter = field_spec[2].strip()
            if bin_filter not in ('', 'include', 'exclude'):
                common.stop_err("Invalid bin sort: " + bin_filter)

            newbin = self.buildBin(field_name, bin_filter)
            self.reference_bins.append(newbin)

            # Any time we have a bin we want it sorted descending.
            columns.append({
                'field': field_name,
                'group': grouping,
                'sort': 'desc',
                'label': newbin.name,
                'type': 'bin'
            })

            # Strip the flag like every other component so that
            # "name:column:include: true" is honoured as well.
            if field_spec[3].strip() == 'true':  # description field requested
                columns.append({
                    'field': field_name + '_desc',
                    'group': 'column',
                    'sort': '',  # Allow other sorts????
                    'label': newbin.name + ' Description',
                    'type': 'text'
                })
示例#3
0
    def __main__(self):

        ## *************************** Parse Command Line *****************************
        parser = common.MyParser(
            description=
            'Generates tab-delimited table report based on BLAST XML results.',
            usage=
            'python blast_reporting.py [blastxml_input_file] [out_format] [tabular_output_file] [option: html_output_file] [option: selection_output_file:id_1:id_2:id_3] [options]',
            epilog="""Details:

			This tool can be used both via command line and via a local Galaxy install.  
			Galaxy uses .loc files (blast_reporting_fields.loc, fasta_reference_dbs.loc)
			as indicated by the tool's tool_data_table_conf.xml.sample.  The command line script 
			uses .tab versions (located in the script's folder) which need to reflect any changes
			made in the .loc versions.
			
			Note: the selection file option is used mainly by the galaxy blast reporting tool.
			
		   [out_format] is one of:
			 "std" : standard 12 column
			 "std+seqs" : standard 12 column plus search (qseq) and matched (sseq) sequences
			 "ext" : extended 25 column
			 "ext+": 26+ column
			 "custom": Use only given field selections.
	
		   Use -i to see possible field (column) selections as defined by blast_reporting_fields.tab.

		   REFERENCE_BINS: Selected bins have their columns shown in output table for clarity, even when custom fields are selected, unless selecting the bin "exclude" option.

		   FILTERS: 
			Format: ([field_name]:[comparator] [value];)*
			e.g. "pident: gt 97; salltitles: excludes bovine|clone|environmental|swine|uncultivated|uncultured|unidentified"
			[comparator] =
				==	numeric equal
				!=	numeric not equal
				gt	numeric greater than 
				gte	numeric greater than or equal to 
				lt	numeric less than
				lte	numeric less than or equal to
				includes (search text fields for included words/phrases)
				excludes (same as above but exclude result if text found)
	
			Textual comparisons may have a value consisting of phrases to search for separated by "|" (disjunction).
		
	
		""")

        parser.set_defaults(row_limit=0)
        # Don't use "-h" , it is reserved for --help!

        parser.add_option(
            '-b',
            '--bins',
            type='string',
            dest='reference_bins',
            help=
            'Provide a comma-delimited list of reference databases to check, along with their sort order, and a flag to exclude them if desired, e.g. "16Sncbi desc,euzby desc,16Srdp exclude".  See -i option for a list of available databases.'
        )

        parser.add_option(
            '-c',
            '--columns',
            type='string',
            dest='custom_fields',
            help=
            'To modify sorting and formatting, specify a comma-delimited list of field specifications of the form: "[field_name]:[column|table|section]:[asc|desc|none]:[new label text];..." .'
        )

        parser.add_option(
            '-f',
            '--filter',
            type='string',
            dest='filters',
            help=
            'Provide a semicolon-delimited list of fields and their criteria to filter by.'
        )

        parser.add_option(
            '-i',
            '--info',
            dest='info',
            default=False,
            action='store_true',
            help=
            'Provides list of columns and their descriptions, for use in filter, sort and custom column lists. Shows a list of available sequence type reference bins as well'
        )

        parser.add_option(
            '-l',
            '--label',
            type='string',
            dest='column_labels',
            help=
            'Include field labels in first row of tab-delimited result table as short names or data field names (or none)'
        )

        parser.add_option(
            '-n',
            '--number',
            type='int',
            dest='row_limit',
            help=
            'Provide a limit to the number of rows of returned data. The default 0=unlimited.'
        )

        #TESTING Galaxy library dataset files for reference bins.
        parser.add_option('-B',
                          '--refbins',
                          type='string',
                          dest='refbins',
                          help='Testing library_data form input.')

        parser.add_option(
            '-r',
            '--redundant',
            dest='drop_redundant_hits',
            default=False,
            action='store_true',
            help='Return only first match to a gene bank id result.')

        options, args = parser.parse_args()

        import time
        time_start = time.time()

        # "info" command provides a dump of all the fields that can be displayed from the Blast search.
        if options.info:
            # Future: Can stand-alone command line program access Galaxy's version of the field spec file?  Right now it is a separate copy.
            print 'FIELDS:\n'
            field_spec_path = os.path.join(os.path.dirname(__file__),
                                           'blast_reporting_fields.tab')
            fields = common.FieldSpec(field_spec_path)
            for field in sorted(fields.dict.keys()):
                print field + "\t" + fields.getAttribute(
                    field, 'type') + "\t" + fields.getAttribute(field, 'name')

            print '\nREFERENCE BINS:\n'
            field_spec_path = os.path.join(os.path.dirname(__file__),
                                           'fasta_reference_dbs.tab')
            fields = common.FieldSpec(field_spec_path)
            for field in sorted(fields.dict.keys()):
                print field + "\t" + fields.getAttribute(
                    field, 'path'
                ) + field + '/accession_ids.tab' + '\t' + fields.getAttribute(
                    field, 'name')

            sys.exit(1)

        try:
            in_file, output_format, out_tabular_file = args[0:3]

        except:
            common.stop_err(
                "Expecting 3 arguments: input BLAST XML file, out format (std | std+seqs | ext | ext+ | custom), and output tabular file"
            )

        try:
            # Get an iterable, see http://effbot.org/zone/element-iterparse.htm
            context = ElementTree.iterparse(
                in_file,
                events=("start", "end"))  # By default only does end events.
            context = iter(context)
            event, root = context.next(
            )  # Creates reference to root element on 'start' event, for housecleaning below.
        except:
            common.stop_err("Invalid data format. !!")

        tagGroup = XMLRecordScan(options, output_format)
        fieldFilter = common.FieldFilter(
            tagGroup, options)  # .filter list field names are changed above.

        if options.reference_bins:
            print 'Database bins: %s' % str([
                bin.name
                for (ptr, bin) in enumerate(tagGroup.binManager.reference_bins)
            ]).translate(None, "[']")
        if options.custom_fields:
            print 'Customized Fields: %s' % options.custom_fields
        if options.filters: print 'Filters: ' + options.filters
        if options.drop_redundant_hits: print 'Throwing out redundant hits...'

        # ************************ FILE OUTPUT *****************************
        # IT IS CRITICAL THAT EVERY <HIT>/<HSP> RETURN A COMPLETE XML SET OF TAGS OTHERWISE PREV. RECORD VALUES PERSIST
        # NOTE: GALAXY 2012 has bug in html data display - it will show duplicate records OCCASIONALLY (at least on some browsers).  You have to download data file to verify there are no duplicates

        row_count = 0
        row_count_filtered = 0
        outfile = open(out_tabular_file, 'w')
        query_stats = []

        for event, elem in context:

            # Alternative is to wipe Hit/Hsp fields on event == "start".
            tag = elem.tag
            if event == 'end':
                if tag in tagGroup.tags:  #Content of these tags fills a tabular line with column info.
                    tagGroup.setRecordAttr(tag, elem.text)
                    if tag == 'Iteration_query-def':
                        row_count = 0
                        row_count_filtered = 0
                        query_stats.append({
                            'id': elem.text,
                            'rows': 0,
                            'filtered_rows': 0
                        })

                # Process each </hsp> record
                elif tag == 'Hsp':
                    row_count += 1
                    query_stats[-1][
                        'rows'] = row_count  # real rows, not clipped
                    if options.row_limit == 0 or row_count_filtered < options.row_limit:

                        # Transform <Hsp> record & add field info.
                        if tagGroup.processRecord():

                            #if tagGroup.processFilters():
                            if fieldFilter.process(tagGroup.record):
                                row_count_filtered += 1
                                query_stats[-1][
                                    'filtered_rows'] = row_count_filtered
                                outfile.write(tagGroup.outputTabDelimited())

                        root.clear(
                        )  # Clears references from root to (now unused) children to keep iterated datastructure small ???

                elem.clear()  # I think root.clear() cover this case.

        root.clear()
        outfile.close()

        # Use fast Linux "sort" after filtering & file write
        common.fileSort(out_tabular_file, tagGroup.columns)
        """
		The "Selection file" option is meant for galaxy UI use in conjunction 
		with the "Select Subsets on data" tool.  If a selection_file is called 
		for, then we need to extract its id as well.  For that we have to test 
		for somewhat odd expression from xml-generated command line, the 
		[$selection_file:$selection_file.hid:$selection_file.dataset_id:$selection_file.id]
		Selection list doesn't necessarily need the HTML selectable report template, 
		but that template was designed to feed the galaxy "Select subsets" tool with its data.
		
		From galaxy, incoming format is $selection_file:$selection_file.hid:$selection_file.dataset_id:$selection_file.id
		"""

        if len(args) > 4 and args[4] != 'None:None:None:None':

            sel_file_fields = args[4].split(':')
            selection_file = sel_file_fields[0]

            # From command line, user won't have specified any of this, so ignore.
            options.dataset_selection_id = None
            if len(sel_file_fields) > 3 and selection_file != 'None':
                # Unfortunately we can't tell galaxy not to set up selection_file handle on xml form if input fields haven't been selected.
                # Have to test for needed input fields here
                sel_requisites = 0
                for (idx, field) in enumerate(tagGroup.columns):
                    if field['field'] in 'qseqid _qseq sseqid _sseq':
                        sel_requisites += 1

                if sel_requisites == 4:
                    options.dataset_selection_id = sel_file_fields[3]
                    common.fileSelections(out_tabular_file, selection_file,
                                          tagGroup, options)
        """
		We must have a template in order to write anything to above html output file.
		All report templates need to be listed in the module's tabular data "blast_reporting_templates" folder.
		# There are two possible HTML Report template locations: 
		# 1) The stock reports included in the module in the "templates/" subfolder, e.g. html_report.py
		# 2) User customized templates.  To set this up:
			- add a custom template folder in a location of your choice.
			- Copy this module's templates folder into it.  
			- The new folder must be in python's sys.path, which is achieved by adding a .pth file to python's site-packages folder..  E.g. set up /usr/lib/python2.6/site-packages/galaxy-custom-modules.pth to contain "/usr/local/galaxy/shared/python2.6_galaxy_custom_modules" 
		, and place 'templates_custom/html_report.py' in there.
		"""
        if len(args) > 3:
            out_html_file = args[3]  #Galaxy-generated
            # args[5] = html_template, default from galaxy xml is 'templates.html_report', but testing can receive 'None' value
            if len(args) > 5 and len(
                    args[5].strip()) > 0 and not args[5].strip() == 'None':

                html_template = args[5]  #User-selected
                if not html_template.translate(None, "._-").isalnum():
                    common.stop_err(
                        "The HTML Report template name is not correct.  It should be a python class path like templates.html_report)! : "
                        + html_template)

            else:
                html_template = 'templates.html_report'

            try:
                # See http://stackoverflow.com/questions/769534/dynamic-loading-of-python-modules
                HTMLReportModule = __import__(
                    html_template,
                    fromlist=['does not in fact matter what goes here!'])
                # Now create final tabular, html (or future: xml) data
                htmlManager = HTMLReportModule.HTMLReport(
                    tagGroup, options, query_stats)
                # htmlManager might not be initialized if the caller couldn't provide all the data the particular template needed.
                htmlManager.render(out_tabular_file, out_html_file)

            except ImportError:
                common.stop_err("Unable to locate HTML Report template! : " +
                                html_template)

        common.fileTabular(out_tabular_file, tagGroup, options)

        print('Execution time (seconds): ' +
              str(int(time.time() - time_start)))
示例#4
0
    def processRecord(self):
        """Derive the output fields of self.record from the raw XML values.

        Reads the underscore-prefixed attributes previously stored on
        self.record (e.g. _hit_id, _hit_def, _evalue, _qseq) and sets the
        derived attributes sseqid, accessionid, qseqid, evalue, bitscore,
        pident, gapopen, mismatch, sallseqid, ppos, qframe, sframe, slen,
        qlen, pcov, salltitles and stitle.  Also asks self.binManager to
        update the record's bin status.

        Returns True; returning False anywhere would signal the caller to
        drop the current <Hsp> record.  A hit definition that cannot be
        split into ids is reported via common.stop_err().
        """
        bline = self.record

        # NCBI notes: Expecting either this,
        # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
        # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
        # <Hit_accession>P56514</Hit_accession>
        #or,
        # <Hit_id>Subject_1</Hit_id>
        # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
        # <Hit_accession>Subject_1</Hit_accession>
        #apparently depending on the parse_deflines switch

        sseqid = bline._hit_id.split(None, 1)[0]

        # If Hit_id == Hit_accession AND it is a default "Subject_1" ...
        # OR Hit_accession IN Hit_id and BL_ORD_ID|XXXX contains hit_accession
        # Containment must be tested with "in": str.find() returns 0 (falsy)
        # for a match at the start and -1 (truthy) for no match at all.
        if common.re_default_subject_id.match(
                sseqid) and bline._hit_acc in sseqid:
            #Place holder ID, take the first word of the subject definition
            hit_def = bline._hit_def
            sseqid = hit_def.split(None, 1)[0]
        else:
            hit_def = sseqid + " " + bline._hit_def

        self.record.sseqid = sseqid

        # Pick the accession out of a "|"-delimited id when it matches one of
        # the known layouts.
        if common.re_default_ncbi_id.match(sseqid):
            self.record.accessionid = sseqid.split('|')[3]
        elif common.re_default_ref_id.match(sseqid):
            self.record.accessionid = sseqid.split('|')[1]
        else:
            # Have to use the whole string.
            self.record.accessionid = sseqid

        # NCBI notes: Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
        # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
        # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
        # <Iteration_query-len>406</Iteration_query-len>
        # <Iteration_hits></Iteration_hits>
        #
        #Or, from BLAST 2.2.24+ run online
        # <Iteration_query-ID>Query_1</Iteration_query-ID>
        # <Iteration_query-def>Sample</Iteration_query-def>
        # <Iteration_query-len>516</Iteration_query-len>
        # <Iteration_hits>...

        # Note BioPython's approach http://biopython.org/DIST/docs/api/Bio.SearchIO.BlastIO.blast_xml-pysrc.html
        # ... if hit_id.startswith('gnl|BL_ORD_ID|'): ...

        if common.re_default_query_id.match(bline._qseqid):
            #Place holder ID, take the first word of the query definition
            qseqid = bline._qdef.split(None, 1)[0]
        else:
            qseqid = bline._qseqid

        self.record.qseqid = qseqid

        if bline._evalue == "0":
            self.record.evalue = "0.0"
        else:
            self.record.evalue = "%0.0e" % float(bline._evalue)

        # NCBI notes:
        #   if bline._bitscore < 100:
        #       #Seems to show one decimal place for lower scores
        #       bitscore = "%0.1f" % bline._bitscore
        #   else:
        #       #Note BLAST does not round to nearest int, it truncates
        #       bitscore = "%i" % bline._bitscore
        bitscore = float(bline._bitscore)
        if bitscore < 100:
            self.record.bitscore = "%0.1f" % bitscore
        else:
            self.record.bitscore = "%i" % bitscore

        self.record.pident = "%0.2f" % (100 * float(bline._nident) /
                                        float(bline._length))

        # Splitting each aligned sequence on its "-" runs leaves the ungapped
        # segments; segments minus one is counted as that sequence's gap
        # openings.
        query_gap_runs = len(bline._qseq.replace('-', ' ').split()) - 1
        subject_gap_runs = len(bline._sseq.replace('-', ' ').split()) - 1
        self.record.gapopen = str(query_gap_runs + subject_gap_runs)

        # Midline blanks and "+" marks, less the gap characters of both
        # sequences.
        mismatch = (bline._mseq.count(' ') + bline._mseq.count('+') -
                    bline._qseq.count('-') - bline._sseq.count('-'))
        #assert len(bline._qseq) == len(bline._sseq) == len(bline._mseq) == int(bline._length)
        self.record.mismatch = str(mismatch)

        # Extended fields
        #sallseqid gets ";" delimited list of first words in each hit_def "x>y>z" expression.
        #Nov 7 2013 fix: https://github.com/peterjc/galaxy_blast/blob/master/tools/ncbi_blast_plus/blastxml_to_tabular.py
        #Note: elem.text converts escaped "&gt;" back to ">"
        hit_def_array = hit_def.split(" >")
        try:
            self.record.sallseqid = ";".join(
                name.split(None, 1)[0] for name in hit_def_array)
        except IndexError as e:
            common.stop_err("Problem splitting multiple hit ids?\n%r\n--> %s" %
                            (hit_def, e))

        # Calculate accession ids, and check bin(s) for them, update record accordingly.
        self.binManager.setStatus(self.record)

        self.record.ppos = "%0.2f" % (100 * float(bline._positive) /
                                      float(bline._length))
        qframe = bline._qframe
        sframe = bline._sframe
        if bline._blast_program == "blastp":
            #Probably a bug in BLASTP that they use 0 or 1 depending on format
            if qframe == "0":
                qframe = "1"
            if sframe == "0":
                sframe = "1"

        self.record.qframe = qframe
        self.record.sframe = sframe
        self.record.slen = str(int(bline._hit_len))
        self.record.qlen = str(int(bline._qlen))

        #NCBI DOCUMENTATION ON qcovs == pcov == pct_coverage == http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/include/objects/seqalign/Seq_align.hpp#L54
        #extended+
        self.record.pcov = "%0.2f" % (
            float(int(bline._qend) - int(bline._qstart) + 1) /
            int(bline._qlen) * 100)

        titlesArray = self.getSalltitles(hit_def_array)
        self.record.salltitles = "<>".join(titlesArray)
        self.record.stitle = titlesArray[0]

        return True  # One may return false anywhere above to filter out current <Hsp> record.
示例#5
0
    def __init__(self, options, output_format):
        """Set up the record holder, tag map, output columns and reference bins.

        options: parsed command line options; custom_fields and
            reference_bins are read here.
        output_format: one of the keys of self.column_format
            ('std', 'std+seqs', 'ext', 'ext+', 'custom'); anything else is
            reported through common.stop_err().

        Attributes created:
          .record         object holding values read from each <hit> of the XML
          .tags           XML tag name -> record attribute set from that tag
          .column_format  format name -> number of leading columns to output
          .columns_in     every known field name, in default output order
          .field_spec     field specification loaded from blast_reporting_fields.tab
          .columns        column definitions chosen for output
          .binManager     reference bin manager (may append bin columns)
        """
        # Plain attribute container for the current record.
        self.record = GenericRecord()

        # All incoming BLAST-generated XML fields we want to capture.
        # self.record receives every underscored value listed here, plus the
        # derived ones computed later when a record is processed.
        self.tags = {
            "BlastOutput_program": '_blast_program',
            "Iteration_query-ID": '_qseqid',
            "Iteration_query-def": '_qdef',
            "Iteration_query-len": '_qlen',  # extended+ calc
            "Hit_id": '_hit_id',
            "Hit_def": '_hit_def',  # extended+ calc
            "Hit_accession": '_hit_acc',
            "Hit_len": '_hit_len',
            "Hsp_bit-score": '_bitscore',  # basic
            "Hsp_score": '_score',  # extended
            "Hsp_evalue": '_evalue',  # basic
            "Hsp_query-from": '_qstart',  # basic, extended+ calc
            "Hsp_query-to": '_qend',  # basic, extended+ calc
            "Hsp_hit-from": '_sstart',  # basic
            "Hsp_hit-to": '_send',  # basic
            "Hsp_query-frame": '_qframe',  # extended only
            "Hsp_hit-frame": '_sframe',  # extended only
            "Hsp_identity": '_nident',  # extended
            "Hsp_positive": '_positive',  # extended
            "Hsp_gaps": '_gaps',  # extended
            "Hsp_align-len": '_length',  # basic
            "Hsp_qseq": '_qseq',  # extended
            "Hsp_hseq": '_sseq',  # extended
            "Hsp_midline": '_mseq'  # basic
        }

        self.column_format = {
            'std': 12,
            'std+seqs': 12,
            'ext': 25,
            'ext+': 26,
            'custom': 1
        }

        if output_format not in self.column_format:
            common.stop_err(
                "Format argument should be std (12 column) or ext (extended 25 columns) or ext+ (extended 26+ columns) or custom (you choose fields). Format argument x22 has been replaced with ext (extended 25 columns)"
            )

        # Columns destined for tab-delimited output; this also fixes the
        # default ORDER of the fields.
        # Raw data fields that never get output: _bitscore, _evalue, _qframe,
        # _sframe, and _mseq which has no m_frame equivalent.
        self.columns_in = (
            'qseqid sseqid pident _length mismatch gapopen _qstart _qend _sstart _send evalue bitscore '
            'sallseqid _score _nident _positive _gaps ppos qframe sframe _qseq _sseq qlen slen '
            'salltitles pcov accessionid stitle _mseq').split()

        spec_dir = os.path.dirname(__file__)
        fieldSpecFile = os.path.join(spec_dir, 'blast_reporting_fields.tab')
        self.field_spec = common.FieldSpec(fieldSpecFile, self.columns_in)

        # Take the first N fields according to the format; qseqid is always
        # among them.  Everything defaults to "column".
        field_count = self.column_format[output_format]
        columns_out = self.columns_in[:field_count]

        # 'std+seqs' is designed for creating phylogeny reports.
        with_sequences = output_format == 'std+seqs'
        if with_sequences:
            columns_out += ['_qseq', '_sseq']

        self.columns = self.field_spec.initColumns(columns_out,
                                                   options.custom_fields)

        # The sequence columns are hidden for this particular HTML report
        # format UNLESS they are mentioned in options.custom_fields.
        # NOTE(review): when no custom fields are given at all the columns are
        # left visible, which seems at odds with the comment above - confirm.
        if with_sequences:
            for target in self.columns:
                if target['field'] not in ('_qseq', '_sseq'):
                    continue
                if options.custom_fields and target[
                        'field'] not in options.custom_fields:
                    target['group'] = 'hidden'

        # ADD SELECTED BINS TO COLUMN LIST;
        self.binManager = reference_bins.ReferenceBins()
        self.binManager.build_bins(options.reference_bins, self.columns)