Exemplo n.º 1
0
 def setup(self):
     """
     Run the parent class setup and attach a parser to the instance.

     After this call, `self.gb_parser` holds a fresh `genbank.GBparser`
     instance.
     """
     super(TestMutator, self).setup()
     parser = genbank.GBparser()
     self.gb_parser = parser
Exemplo n.º 2
0
    def loadrecord(self, accession):
        """
        Load the RefSeq record for an accession number.

        The file backing the record is obtained as follows:

        1. If the accession is not in the database, it is fetched from the
           NCBI.
        2. If it is in the database and its file is still in the cache, the
           cached file is used.
        3. Otherwise the file is rebuilt according to the stored source of
           the reference: re-sliced (`ncbi_slice`), re-downloaded (`url`) or
           re-fetched (`ncbi`). Any other source yields an `ERETR` message
           asking for a new upload.

        When no file can be obtained, a `BatchFlags` output of
        `('S1', accession)` is added before returning.

        :arg unicode accession: A RefSeq accession number.

        :returns: A parsed RefSeq record, or `None` if no file could be
          obtained for the given accession or if the record is a protein
          sequence (an `ENOTIMPLEMENTED` message is emitted in that case).
        :rtype: object
        """
        reference = Reference.query.filter_by(accession=accession).first()

        if reference is None:
            # Unknown accession: fetching from NCBI is the only option.
            path = self.fetch(accession)
        else:
            # Known accession: start from the expected cache location.
            path = self._name_to_file(reference.accession)

            if not os.path.isfile(path):
                # Not cached anymore, rebuild it from its recorded source.
                origin = reference.source

                if origin == 'ncbi_slice':
                    # Source data has the form
                    # accession:start:stop:orientation.
                    (slice_accession, raw_start, raw_stop,
                     raw_orientation) = reference.source_data.split(':')
                    start = int(raw_start)
                    stop = int(raw_stop)
                    orientation = {None: None,
                                   'forward': 1,
                                   'reverse': 2}[raw_orientation]
                    sliced = self.retrieveslice(slice_accession, start, stop,
                                                orientation)
                    if not sliced:
                        path = None

                elif origin == 'url':
                    # Source data is the URL it was downloaded from.
                    if not self.downloadrecord(reference.source_data):
                        path = None

                elif origin == 'ncbi':
                    path = self.fetch(reference.accession)

                else:
                    # Uploaded records cannot be restored automatically.
                    self._output.addMessage(
                        __file__, 4, 'ERETR',
                        'Please upload this sequence again.')
                    path = None

        if path is None:
            # Retrieval failed; tell the batch job to skip every instance of
            # this accession.
            self._output.addOutput('BatchFlags', ('S1', accession))
            return None

        # A file is available at this point, parse it.
        record = genbank.GBparser().create_record(path)
        record.id = reference.accession if reference else record.source_id

        # Todo: This will change once we support protein references.
        if not isinstance(record.seq.alphabet, ProteinAlphabet):
            return record

        self._output.addMessage(
            __file__, 4, 'ENOTIMPLEMENTED',
            'Protein reference sequences are not supported.')
        return None
Exemplo n.º 3
0
    def loadrecord(self, identifier):
        """
        Load a RefSeq record and return it.

        The record is found by trying the following options in order:

        1. Returned from the cache if it is there.
        2. Re-created (if it was created by slicing) or re-downloaded (if it
           was created by URL) if we have information on its source in the
           database.
        3. Fetched from the NCBI.

        An identifier starting with a digit is looked up as a geninfo
        identifier (GI); anything else, including an empty string, is looked
        up as an accession number.

        When no file can be obtained, a `BatchFlags` output of
        `('S1', identifier)` is added before returning.

        :arg identifier: A RefSeq accession number or geninfo identifier (GI).
        :type identifier: unicode

        :return: A parsed RefSeq record, or `None` if no record could be
            found for the given identifier or if the record is a protein
            sequence (an `ENOTIMPLEMENTED` message is emitted in that case).
        :rtype: mutalyzer.GenRecord.Record
        """
        # Slice instead of indexing: `identifier[0]` raises IndexError on an
        # empty identifier, whereas `''.isdigit()` is simply False, so an
        # empty identifier ends up in the regular failure handling below.
        if identifier[:1].isdigit():
            # This is a GI number (geninfo identifier).
            reference = Reference.query \
                .filter_by(geninfo_identifier=identifier) \
                .first()
        else:
            # This is a RefSeq accession number.
            reference = Reference.query \
                .filter_by(accession=identifier) \
                .first()

        if reference is None:
            # We don't know it, fetch it from NCBI.
            filename = self.fetch(identifier)

        else:
            # We have seen it before.
            filename = self._nametofile(reference.accession)

            if os.path.isfile(filename):
                # It is still in the cache, so filename is valid.
                pass

            elif reference.slice_accession:
                # It was previously created by slicing.
                cast_orientation = {None: None,
                                    'forward': 1,
                                    'reverse': 2}
                orientation = cast_orientation[reference.slice_orientation]
                if not self.retrieveslice(reference.slice_accession,
                                          reference.slice_start,
                                          reference.slice_stop,
                                          orientation):
                    filename = None

            elif reference.download_url:
                # It was previously created by URL.
                if not self.downloadrecord(reference.download_url):
                    filename = None

            elif reference.geninfo_identifier:
                # It was previously fetched from NCBI.
                filename = self.fetch(reference.accession)

            else:
                # It was previously created by uploading.
                self._output.addMessage(__file__, 4, 'ERETR',
                                        'Please upload this sequence again.')
                filename = None

        # If filename is None, we could not retrieve the record.
        if filename is None:
            # Notify batch job to skip all instance of identifier.
            self._output.addOutput('BatchFlags', ('S1', identifier))
            return None

        # Now we have the file, so we can parse it.
        record = genbank.GBparser().create_record(filename)

        if reference:
            record.id = reference.accession
        else:
            record.id = record.source_id

        # Todo: This will change once we support protein references.
        if isinstance(record.seq.alphabet, ProteinAlphabet):
            self._output.addMessage(
                __file__, 4, 'ENOTIMPLEMENTED',
                'Protein reference sequences are not supported.')
            return None

        return record