예제 #1
0
파일: writer.py 프로젝트: animesh/topmsv
 def __init__(self,
              outfile,
              close=False,
              vocabularies=None,
              missing_reference_is_error=False,
              vocabulary_resolver=None,
              version='1.2.0',
              **kwargs):
     if vocabularies is None:
         vocabularies = []
     vocabularies = list(default_cv_list) + list(vocabularies)
     ComponentDispatcher.__init__(
         self,
         vocabularies=vocabularies,
         missing_reference_is_error=missing_reference_is_error,
         vocabulary_resolver=vocabulary_resolver)
     XMLDocumentWriter.__init__(self, outfile, close, **kwargs)
     self.version = version
     self.xmlns = MzIdentML.attr_version_map[version]['xmlns']
     self.state_machine = TableStateMachine([
         ("start", ['controlled_vocabularies']),
         ("controlled_vocabularies", [
             'analysis_software_list', 'provider', 'audit_collection',
             'analysis_sample_collection', 'sequence_collection',
             'analysis_collection'
         ]),
         ('analysis_software_list', [
             'provider', 'audit_collection', 'analysis_sample_collection',
             'sequence_collection', 'analysis_collection'
         ]),
         ('provider', [
             'audit_collection', 'analysis_sample_collection',
             'sequence_collection', 'analysis_collection'
         ]),
         ('audit_collection', [
             'analysis_sample_collection', 'sequence_collection',
             'analysis_collection'
         ]),
         ('analysis_sample_collection',
          ['sequence_collection', 'analysis_collection']),
         ('sequence_collection', ['analysis_collection']),
         ('analysis_collection', ['analysis_protocol_collection']),
         ('analysis_protocol_collection', ['data_collection']),
         ('data_collection', ['inputs']),
         ('inputs', ['analysis_data']),
         ('analysis_data', ['spectrum_identification_list']),
         ('spectrum_identification_list',
          ['spectrum_identification_list', 'protein_detection_list']),
         ('protein_detection_list', ['bibliography']),
         ('bibliography', []),
     ], 'start')
예제 #2
0
파일: writer.py 프로젝트: animesh/topmsv
 def __init__(self,
              outfile,
              close=False,
              vocabularies=None,
              missing_reference_is_error=False,
              vocabulary_resolver=None,
              id=None,
              accession=None,
              **kwargs):
     if vocabularies is None:
         vocabularies = []
     vocabularies = list(default_cv_list) + list(vocabularies)
     ComponentDispatcher.__init__(
         self,
         vocabularies=vocabularies,
         vocabulary_resolver=vocabulary_resolver,
         missing_reference_is_error=missing_reference_is_error)
     XMLDocumentWriter.__init__(self, outfile, close, **kwargs)
     self.id = id
     self.accession = accession
     self.spectrum_count = 0
     self.chromatogram_count = 0
     self.default_instrument_configuration = None
     self.state_machine = TableStateMachine([
         ("start", [
             'controlled_vocabularies',
         ]), ("controlled_vocabularies", [
             'file_description',
         ]),
         ("file_description",
          ['reference_param_group_list', 'sample_list', 'software_list']),
         ("reference_param_group_list", ['sample_list', 'software_list']),
         ("sample_list", [
             'software_list',
         ]),
         ("software_list",
          ["scan_settings_list", 'instrument_configuration_list']),
         ("scan_settings_list", [
             'instrument_configuration_list',
         ]), ("instrument_configuration_list", ['data_processing_list']),
         ("data_processing_list", ['run']),
         ("run", ['spectrum_list', 'chromatogram_list']),
         ('spectrum_list', ['chromatogram_list']), ('chromatogram_list', [])
     ])
예제 #3
0
class MzIdentMLWriter(ComponentDispatcher, XMLDocumentWriter):
    """
    A high level API for generating MzIdentML XML files from simple Python objects.

    This class depends heavily on lxml's incremental file writing API which in turn
    depends heavily on context managers. Almost all logic is handled inside a context
    manager and in the context of a particular document. Since all operations assume
    that they have access to a universal identity map for each element in the document,
    that map is centralized in this instance.

    MzIdentMLWriter inherits from :class:`.ComponentDispatcher`, giving it a :attr:`context`
    attribute and access to all `Component` objects pre-bound to that context with attribute-access
    notation.

    Attributes
    ----------
    outfile : file
        The open, writable file descriptor which XML will be written to.
    xmlfile : lxml.etree.xmlfile
        The incremental XML file wrapper which organizes file writes onto :attr:`outfile`.
        Kept to control context.
    writer : lxml.etree._IncrementalFileWriter
        The incremental XML writer produced by :attr:`xmlfile`. Kept to control context.
    toplevel : lxml.etree._FileWriterElement
        The top level incremental xml writer element which will be closed at the end
        of file generation. Kept to control context
    context : :class:`.DocumentContext`
    """
    def __init__(self,
                 outfile,
                 close=False,
                 vocabularies=None,
                 missing_reference_is_error=False,
                 vocabulary_resolver=None,
                 version='1.2.0',
                 **kwargs):
        if vocabularies is None:
            vocabularies = list(default_cv_list)
        ComponentDispatcher.__init__(
            self,
            vocabularies=vocabularies,
            missing_reference_is_error=missing_reference_is_error,
            vocabulary_resolver=vocabulary_resolver)
        XMLDocumentWriter.__init__(self, outfile, close, **kwargs)
        self.version = version
        self.xmlns = MzIdentML.attr_version_map[version]['xmlns']
        self.state_machine = TableStateMachine([
            ("start", ['controlled_vocabularies']),
            ("controlled_vocabularies", [
                'analysis_software_list', 'provider', 'audit_collection',
                'analysis_sample_collection', 'sequence_collection',
                'analysis_collection'
            ]),
            ('analysis_software_list', [
                'provider', 'audit_collection', 'analysis_sample_collection',
                'sequence_collection', 'analysis_collection'
            ]),
            ('provider', [
                'audit_collection', 'analysis_sample_collection',
                'sequence_collection', 'analysis_collection'
            ]),
            ('audit_collection', [
                'analysis_sample_collection', 'sequence_collection',
                'analysis_collection'
            ]),
            ('analysis_sample_collection',
             ['sequence_collection', 'analysis_collection']),
            ('sequence_collection', ['analysis_collection']),
            ('analysis_collection', ['analysis_protocol_collection']),
            ('analysis_protocol_collection', ['data_collection']),
            ('data_collection', ['inputs']),
            ('inputs', ['analysis_data']),
            ('analysis_data', ['spectrum_identification_list']),
            ('spectrum_identification_list',
             ['spectrum_identification_list', 'protein_detection_list']),
            ('protein_detection_list', ['bibliography']),
            ('bibliography', []),
        ], 'start')

    def toplevel_tag(self):
        return MzIdentML(version=self.version)

    def controlled_vocabularies(self):
        """Write out the `<cvList>` element and all its children,
        including both this format's default controlled vocabularies
        and those passed as arguments to this method.this

        This method requires writing to have begun.
        """
        self.state_machine.transition("controlled_vocabularies")
        super(MzIdentMLWriter, self).controlled_vocabularies()

    def providence(self, *args, **kwargs):
        warnings.warn("Method renamed to `provenance`")
        self.provenance(*args, **kwargs)

    def provenance(self,
                   software=tuple(),
                   owner=tuple(),
                   organization=tuple(),
                   provider=None):
        """
        Write the analysis provenance section, a top-level segment of the MzIdentML document

        This section should be written early on to register the list of software used in this
        analysis

        Parameters
        ----------
        software : dict or list of dict, optional
            A single dictionary or list of dictionaries specifying an :class:`AnalysisSoftware` instance
        owner : dict, optional
            A dictionary specifying a :class:`Person` instance. If missing, a default person will be created
        organization : dict, optional
            A dictionary specifying a :class:`Organization` instance. If missing, a default organization will
            be created
        """
        self.state_machine.transition('audit_collection')
        organization = self.Organization.ensure_all(organization)
        owner = self.Person.ensure_all(owner)
        software = self.AnalysisSoftware.ensure_all(software)

        if not owner and not organization:
            affiliation = DEFAULT_ORGANIZATION_ID
            self.register("Organization", affiliation)
            owner = [self.Person(affiliations=[affiliation])]
            organization = [self.Organization(id=affiliation)]

        asl = self.AnalysisSoftwareList(software)
        asl.write(self.writer)
        if owner:
            owner_id = owner[0].id
        else:
            owner_id = None
        if not provider:
            self.Provider(contact=owner_id).write(self.writer)
        else:
            self.Provider.ensure(provider or {}).write(self.writer)
        self.AuditCollection(owner, organization).write(self.writer)

    def inputs(self,
               source_files=tuple(),
               search_databases=tuple(),
               spectra_data=tuple()):
        self.state_machine.transition('inputs')
        source_files = [
            self.SourceFile.ensure(s or {})
            for s in ensure_iterable(source_files)
        ]
        search_databases = [
            self.SearchDatabase.ensure(s or {})
            for s in ensure_iterable(search_databases)
        ]
        spectra_data = [
            self.SpectraData.ensure(s or {})
            for s in ensure_iterable(spectra_data)
        ]

        self.Inputs(source_files, search_databases,
                    spectra_data).write(self.writer)

    def analysis_protocol_collection(self):
        self.state_machine.transition('analysis_protocol_collection')
        return AnalysisProtocolCollectionSection(self.writer,
                                                 self.context,
                                                 xmlns=self.xmlns)

    def sequence_collection(self):
        self.state_machine.transition('sequence_collection')
        return SequenceCollectionSection(self.writer,
                                         self.context,
                                         xmlns=self.xmlns)

    def analysis_collection(self):
        self.state_machine.transition('analysis_collection')
        return AnalysisCollectionSection(self.writer,
                                         self.context,
                                         xmlns=self.xmlns)

    def data_collection(self):
        self.state_machine.transition('data_collection')
        return DataCollectionSection(self.writer,
                                     self.context,
                                     xmlns=self.xmlns)

    def analysis_sample_collection(self):
        self.state_machine.transition('analysis_sample_collection')
        return AnalysisSampleCollectionSection(self.writer,
                                               self.context,
                                               xmlns=self.xmlns)

    def sample(self,
               id,
               name=None,
               contacts=None,
               sub_samples=None,
               params=None,
               **kwargs):
        sample = self.Sample(id=id,
                             name=name,
                             contacts=contacts,
                             sub_samples=sub_samples,
                             params=params,
                             **kwargs)
        return sample

    def write_sample(self,
                     id,
                     name=None,
                     contacts=None,
                     sub_samples=None,
                     params=None,
                     **kwargs):
        sample = self.sample(id=id,
                             name=name,
                             contacts=contacts,
                             sub_samples=sub_samples,
                             params=params,
                             **kwargs)
        sample.write(self)

    def write_db_sequence(self,
                          accession,
                          sequence=None,
                          id=None,
                          search_database_id=1,
                          params=None,
                          **kwargs):
        el = self.DBSequence(accession=accession,
                             sequence=sequence,
                             id=id,
                             search_database_id=search_database_id,
                             params=params,
                             **kwargs)
        el.write(self.writer)

    def write_peptide(self,
                      peptide_sequence,
                      id,
                      modifications=None,
                      params=None,
                      **kwargs):
        el = self.Peptide(peptide_sequence=peptide_sequence,
                          id=id,
                          modifications=modifications,
                          params=params,
                          **kwargs)
        el.write(self.writer)

    def write_peptide_evidence(self,
                               peptide_id,
                               db_sequence_id,
                               id,
                               start_position,
                               end_position,
                               is_decoy=False,
                               pre=None,
                               post=None,
                               params=None,
                               frame=None,
                               translation_table_id=None,
                               **kwargs):
        el = self.PeptideEvidence(peptide_id=peptide_id,
                                  db_sequence_id=db_sequence_id,
                                  id=id,
                                  start_position=start_position,
                                  end_position=end_position,
                                  is_decoy=is_decoy,
                                  pre=pre,
                                  post=post,
                                  frame=frame,
                                  translation_table_id=translation_table_id,
                                  params=params,
                                  **kwargs)
        el.write(self.writer)

    def spectrum_identification_protocol(self,
                                         search_type='ms-ms search',
                                         analysis_software_id=1,
                                         id=1,
                                         additional_search_params=None,
                                         enzymes=None,
                                         modification_params=None,
                                         fragment_tolerance=None,
                                         parent_tolerance=None,
                                         threshold=None,
                                         mass_table=None):
        enzymes = [
            self.Enzyme.ensure((s or {})) for s in ensure_iterable(enzymes)
        ]
        modification_params = [
            self.SearchModification.ensure((s or {}))
            for s in ensure_iterable(modification_params)
        ]
        if isinstance(fragment_tolerance, (list, tuple)):
            fragment_tolerance = self.FragmentTolerance(*fragment_tolerance)
        elif isinstance(fragment_tolerance, Number):
            if fragment_tolerance < 1e-4:
                fragment_tolerance = self.FragmentTolerance(
                    fragment_tolerance * 1e6, None, "parts per million")
            else:
                fragment_tolerance = self.FragmentTolerance(
                    fragment_tolerance, None, "dalton")

        if isinstance(parent_tolerance, (list, tuple)):
            parent_tolerance = self.ParentTolerance(*parent_tolerance)
        elif isinstance(parent_tolerance, Number):
            if parent_tolerance < 1e-4:
                parent_tolerance = self.ParentTolerance(
                    parent_tolerance * 1e6, None, "parts per million")
            else:
                parent_tolerance = self.ParentTolerance(
                    parent_tolerance, None, "dalton")
        threshold = self.Threshold(threshold)
        protocol = self.SpectrumIdentificationProtocol(
            search_type, analysis_software_id, id, additional_search_params,
            modification_params, enzymes, fragment_tolerance, parent_tolerance,
            threshold)
        protocol.write(self.writer)

    def protein_detection_protocol(self,
                                   threshold=None,
                                   analysis_software_id=1,
                                   id=1,
                                   params=None,
                                   **kwargs):
        protocol = self.ProteinDetectionProtocol(
            id=id,
            threshold=threshold,
            params=params,
            analysis_software_id=analysis_software_id,
            **kwargs)
        protocol.write(self.writer)

    def analysis_data(self):
        self.state_machine.transition('analysis_data')
        return AnalysisDataSection(self.writer, self.context, xmlns=self.xmlns)

    def spectrum_identification_list(self,
                                     id,
                                     measures=None,
                                     num_sequences_searched=0,
                                     **kwargs):
        self.state_machine.transition('spectrum_identification_list')
        if measures is None:
            measures = self.FragmentationTable()
        return SpectrumIdentficationListSection(
            self.writer,
            self.context,
            id=id,
            fragmentation_table=measures,
            num_sequences_searched=num_sequences_searched,
            xmlns=self.xmlns,
            **kwargs)

    def write_spectrum_identification_result(self,
                                             spectrum_id,
                                             id,
                                             spectra_data_id=1,
                                             identifications=None,
                                             params=None,
                                             **kwargs):
        el = self.spectrum_identification_result(
            spectrum_id=spectrum_id,
            id=id,
            spectra_data_id=spectra_data_id,
            identifications=identifications,
            params=params,
            **kwargs)
        el.write(self.writer)

    def spectrum_identification_result(self,
                                       spectrum_id,
                                       id,
                                       spectra_data_id=1,
                                       identifications=None,
                                       params=None,
                                       **kwargs):
        return self.SpectrumIdentificationResult(
            spectra_data_id=spectra_data_id,
            spectrum_id=spectrum_id,
            id=id,
            params=params,
            identifications=(self.spectrum_identification_item(
                **(s or {})) if isinstance(s, Mapping) else
                             self.SpectrumIdentificationItem.ensure(s)
                             for s in ensure_iterable(identifications)),
            **kwargs)

    def spectrum_identification_item(self,
                                     experimental_mass_to_charge,
                                     charge_state,
                                     peptide_id,
                                     peptide_evidence_id,
                                     score,
                                     id,
                                     calculated_mass_to_charge=None,
                                     calculated_pi=None,
                                     ion_types=None,
                                     params=None,
                                     pass_threshold=True,
                                     rank=1,
                                     **kwargs):
        return self.SpectrumIdentificationItem(
            experimental_mass_to_charge=experimental_mass_to_charge,
            charge_state=charge_state,
            peptide_id=peptide_id,
            peptide_evidence_ids=peptide_evidence_id,
            score=score,
            id=id,
            ion_types=ion_types,
            calculated_mass_to_charge=calculated_mass_to_charge,
            params=ensure_iterable(params),
            pass_threshold=pass_threshold,
            rank=rank,
            **kwargs)

    def write_spectrum_identification_item(self,
                                           experimental_mass_to_charge,
                                           charge_state,
                                           peptide_id,
                                           peptide_evidence_id,
                                           score,
                                           id,
                                           calculated_mass_to_charge=None,
                                           calculated_pi=None,
                                           ion_types=None,
                                           params=None,
                                           pass_threshold=True,
                                           rank=1,
                                           **kwargs):
        item = self.SpectrumIdentificationItem(
            experimental_mass_to_charge=experimental_mass_to_charge,
            charge_state=charge_state,
            peptide_id=peptide_id,
            peptide_evidence_ids=peptide_evidence_id,
            score=score,
            id=id,
            ion_types=ion_types,
            calculated_mass_to_charge=calculated_mass_to_charge,
            params=ensure_iterable(params),
            pass_threshold=pass_threshold,
            rank=rank,
            **kwargs)
        item.write(self.writer)

    def protein_detection_list(self, id, count=None, params=None, **kwargs):
        self.state_machine.transition('protein_detection_list')
        return ProteinDetectionListSection(self.writer,
                                           self.context,
                                           id=id,
                                           count=count,
                                           params=params,
                                           xmlns=self.xmlns,
                                           **kwargs)

    def write_protein_ambiguity_group(self,
                                      protein_detection_hypotheses,
                                      id,
                                      pass_threshold=True,
                                      params=None,
                                      **kwargs):
        group = self.protein_ambiguity_group(
            protein_detection_hypotheses=protein_detection_hypotheses,
            id=id,
            pass_threshold=pass_threshold,
            params=params,
            **kwargs)
        group.write(self.writer)

    def protein_ambiguity_group(self,
                                protein_detection_hypotheses,
                                id,
                                pass_threshold=True,
                                params=None,
                                **kwargs):
        converting = (self.protein_detection_hypothesis(
            **(s or {})) if isinstance(s, Mapping) else
                      self.ProteinDetectionHypothesis.ensure(s)
                      for s in ensure_iterable(protein_detection_hypotheses))
        el = self.ProteinAmbiguityGroup(
            id=id,
            protein_detection_hypotheses=converting,
            pass_threshold=pass_threshold,
            params=params,
            **kwargs)
        return el

    def protein_detection_hypothesis(self,
                                     db_sequence_id,
                                     id,
                                     peptide_hypotheses,
                                     pass_threshold=True,
                                     name=None,
                                     params=None,
                                     **kwargs):
        converting = (self.peptide_hypothesis(**(s or {})) if isinstance(
            s, Mapping) else self.PeptideHypothesis.ensure(s)
                      for s in ensure_iterable(peptide_hypotheses))
        el = self.ProteinDetectionHypothesis(id=id,
                                             db_sequence_id=db_sequence_id,
                                             peptide_hypotheses=converting,
                                             pass_threshold=pass_threshold,
                                             name=name,
                                             params=params,
                                             **kwargs)
        return el

    def write_protein_detection_hypothesis(self,
                                           db_sequence_id,
                                           id,
                                           peptide_hypotheses,
                                           pass_threshold=True,
                                           name=None,
                                           params=None,
                                           **kwargs):
        el = self.protein_detection_hypothesis(db_sequence_id, id,
                                               peptide_hypotheses,
                                               pass_threshold, name, params,
                                               **kwargs)
        el.write(self.writer)

    def peptide_hypothesis(self,
                           peptide_evidence_id,
                           spectrum_identification_ids,
                           params=None,
                           **kwargs):
        el = self.PeptideHypothesis(peptide_evidence_id,
                                    spectrum_identification_ids,
                                    params=params,
                                    **kwargs)
        return el

    def write_peptide_hypothesis(self,
                                 peptide_evidence_id,
                                 spectrum_identification_ids,
                                 params=None,
                                 **kwargs):
        el = self.peptide_hypothesis(peptide_evidence_id,
                                     spectrum_identification_ids, params,
                                     **kwargs)
        el.write(self.writer)
예제 #4
0
파일: writer.py 프로젝트: animesh/topmsv
class PlainMzMLWriter(ComponentDispatcher, XMLDocumentWriter):
    """A high level API for generating mzML XML files from simple Python objects.

    This class depends heavily on lxml's incremental file writing API which in turn
    depends heavily on context managers. Almost all logic is handled inside a context
    manager and in the context of a particular document. Since all operations assume
    that they have access to a universal identity map for each element in the document,
    that map is centralized in this class.

    MzMLWriter inherits from :class:`.ComponentDispatcher`, giving it a :attr:`context`
    attribute and access to all `Component` objects pre-bound to that context with attribute-access
    notation.

    Attributes
    ----------
    chromatogram_count : int
        A count of the number of chromatograms written
    spectrum_count : int
        A count of the number of spectra written
    """

    DEFAULT_TIME_UNIT = DEFAULT_TIME_UNIT
    DEFAULT_INTENSITY_UNIT = DEFAULT_INTENSITY_UNIT

    def __init__(self,
                 outfile,
                 close=False,
                 vocabularies=None,
                 missing_reference_is_error=False,
                 vocabulary_resolver=None,
                 id=None,
                 accession=None,
                 **kwargs):
        if vocabularies is None:
            vocabularies = []
        vocabularies = list(default_cv_list) + list(vocabularies)
        ComponentDispatcher.__init__(
            self,
            vocabularies=vocabularies,
            vocabulary_resolver=vocabulary_resolver,
            missing_reference_is_error=missing_reference_is_error)
        XMLDocumentWriter.__init__(self, outfile, close, **kwargs)
        self.id = id
        self.accession = accession
        self.spectrum_count = 0
        self.chromatogram_count = 0
        self.default_instrument_configuration = None
        self.state_machine = TableStateMachine([
            ("start", [
                'controlled_vocabularies',
            ]), ("controlled_vocabularies", [
                'file_description',
            ]),
            ("file_description",
             ['reference_param_group_list', 'sample_list', 'software_list']),
            ("reference_param_group_list", ['sample_list', 'software_list']),
            ("sample_list", [
                'software_list',
            ]),
            ("software_list",
             ["scan_settings_list", 'instrument_configuration_list']),
            ("scan_settings_list", [
                'instrument_configuration_list',
            ]), ("instrument_configuration_list", ['data_processing_list']),
            ("data_processing_list", ['run']),
            ("run", ['spectrum_list', 'chromatogram_list']),
            ('spectrum_list', ['chromatogram_list']), ('chromatogram_list', [])
        ])

    def toplevel_tag(self):
        return MzML(id=self.id, accession=self.accession)

    def controlled_vocabularies(self):
        """Write out the `<cvList>` element and all its children,
        including both this format's default controlled vocabularies
        and those passed as arguments to this method.this

        This method requires writing to have begun.
        """
        self.state_machine.transition("controlled_vocabularies")
        super(PlainMzMLWriter, self).controlled_vocabularies()

    def software_list(self, software_list):
        """Writes the ``<softwareList>`` section of the document.

        .. note::
            List and descriptions of software used to acquire and/or process the
            data in this mzML file

        Parameters
        ----------
        software_list : list
            A list or other iterable of :class:`dict` or :class:`~.Software`-like objects
        """
        self.state_machine.transition("software_list")
        n = len(software_list)
        if n:
            software_list = [
                self.Software.ensure(sw)
                for sw in ensure_iterable(software_list)
            ]
        self.SoftwareList(software_list).write(self)

    def file_description(self,
                         file_contents=None,
                         source_files=None,
                         contacts=None):
        r"""Writes the ``<fileDescription>`` section of the document.

        .. note::
            Information pertaining to the entire mzML file (i.e. not specific
            to any part of the data set) is stored here.

        Parameters
        ----------
        file_contents : list, optional
            A list or other iterable of :class:`str`, :class:`dict`, or \*Param-types which will
            be placed in the ``<fileContent>`` element.
        source_files : list
            A list or other iterable of dict or :class:`~.SourceFile`-like objects
            to be placed in the ``<sourceFileList>`` element
        """
        self.state_machine.transition("file_description")
        fd = self.FileDescription(file_contents, [
            self.SourceFile.ensure(sf) for sf in ensure_iterable(source_files)
        ],
                                  contacts=[
                                      self.Contact.ensure(c)
                                      for c in ensure_iterable(contacts)
                                  ])
        fd.write(self.writer)

    def instrument_configuration_list(self, instrument_configurations):
        """Writes the ``<instrumentConfigurationList>`` section of the document.

        .. note::
            List and descriptions of instrument configurations. At least one instrument configuration MUST
            be specified, even if it is only to specify that the instrument is unknown. In that case, the
            "instrument model" term is used to indicate the unknown instrument in the instrumentConfiguration

        Parameters
        ----------
        instrument_configurations : list
            A list or other iterable of :class:`dict` or :class:`~.InstrumentConfiguration`-like
            objects
        """
        self.state_machine.transition("instrument_configuration_list")
        configs = [
            self.InstrumentConfiguration.ensure(ic)
            if not isinstance(ic, InstrumentConfiguration) else ic
            for ic in ensure_iterable(instrument_configurations)
        ]
        self.InstrumentConfigurationList(configs).write(self)

    def data_processing_list(self, data_processing):
        """Writes the ``<dataProcessingList>`` section of the document.

        .. note::
            List and descriptions of data processing applied to this data

        Parameters
        ----------
        data_processing : list
            A list or other iterable of :class:`dict` or :class:`~.DataProcessing`-like
            objects

        """
        self.state_machine.transition("data_processing_list")
        methods = [
            self.DataProcessing.ensure(dp)
            for dp in ensure_iterable(data_processing)
        ]
        self.DataProcessingList(methods).write(self)

    def reference_param_group_list(self, groups):
        """Writes the ``<referenceableParamGroupList>`` section of the document.

        Parameters
        ----------
        groups : list
            A list or other iterable of :class:`dict` or :class:`~.ReferenceableParamGroup`-like
            objects

        """
        self.state_machine.transition("reference_param_group_list")
        groups = [
            self.ReferenceableParamGroup.ensure(g)
            for g in ensure_iterable(groups)
        ]
        self.ReferenceableParamGroupList(groups).write(self)

    def sample_list(self, samples):
        """Writes the ``<sampleList>`` section of the document

        Parameters
        ----------
        samples : list
            A list or other iterable of :class:`dict` or :class:`~.mzml.components.Sample`-like
            objects

        """
        self.state_machine.transition("sample_list")
        for i, sample in enumerate(ensure_iterable(samples)):
            if isinstance(sample, Mapping):
                sample_id = sample.get('id')
                sample_name = sample.get("name")

                if sample_id is None and sample_name is not None:
                    sample_id = "%s_%d_id" % (sample_name, i)
                elif sample_id is not None and sample_name is None:
                    sample_name = str(sample_id)
                elif sample_id is sample_name is None:
                    sample_id = "sample_%d_id" % (i, )
                    sample_name = "sample_%d" % (i, )
                sample['id'] = sample_id
                sample['name'] = sample_name

        sample_entries = self.Sample.ensure_all(samples)

        self.SampleList(sample_entries).write(self)

    def scan_settings_list(self, scan_settings):
        self.state_machine.transition("scan_settings_list")
        scan_settings = self.ScanSettings.ensure_all(scan_settings)
        self.ScanSettingsList(scan_settings).write(self)

    def run(self,
            id=None,
            instrument_configuration=None,
            source_file=None,
            start_time=None,
            sample=None):
        """Begins the `<run>` section of the document, describing a single
        sample run.

        Parameters
        ----------
        id : str, optional
            The unique identifier for this element
        instrument_configuration : str, optional
            The id string for the default `InstrumentConfiguration` for this
            sample
        source_file : str, optional
            The id string for the source file used to produce this data
        start_time : str, optional
            A string encoding the date and time the sample was acquired
        sample: str, optional
            The id string for the sample used to produce this data

        Returns
        -------
        RunSection
        """
        self.state_machine.transition("run")
        kwargs = {}
        if start_time is not None:
            kwargs['startTimeStamp'] = start_time
        if instrument_configuration is None:
            keys = list(self.context['InstrumentConfiguration'].keys())
            if keys:
                instrument_configuration = keys[0]
            else:
                instrument_configuration = None
        self.default_instrument_configuration = instrument_configuration
        return RunSection(self.writer,
                          self.context,
                          id=id,
                          instrument_configuration=instrument_configuration,
                          source_file=source_file,
                          sample=sample,
                          **kwargs)

    def spectrum_list(self, count, data_processing_method=None):
        self.state_machine.transition('spectrum_list')
        if data_processing_method is None:
            dp_map = self.context['DataProcessing']
            try:
                data_processing_method = list(dp_map.keys())[0]
            except IndexError:
                warnings.warn(
                    "No Data Processing method found. mzML file may not be fully standard-compliant",
                    stacklevel=2)
        return SpectrumListSection(
            self.writer,
            self.context,
            count=count,
            data_processing_method=data_processing_method)

    def chromatogram_list(self, count, data_processing_method=None):
        self.state_machine.transition('chromatogram_list')
        if data_processing_method is None:
            dp_map = self.context['DataProcessing']
            try:
                data_processing_method = list(dp_map.keys())[0]
            except IndexError:
                warnings.warn(
                    "No Data Processing method found. mzML file may not be fully standard-compliant",
                    stacklevel=2)
        return ChromatogramListSection(
            self.writer,
            self.context,
            count=count,
            data_processing_method=data_processing_method)

    def spectrum(self,
                 mz_array=None,
                 intensity_array=None,
                 charge_array=None,
                 id=None,
                 polarity='positive scan',
                 centroided=True,
                 precursor_information=None,
                 scan_start_time=None,
                 params=None,
                 compression=COMPRESSION_ZLIB,
                 encoding=None,
                 other_arrays=None,
                 scan_params=None,
                 scan_window_list=None,
                 instrument_configuration_id=None,
                 intensity_unit=DEFAULT_INTENSITY_UNIT):
        '''Create a new :class:`~.Spectrum` instance to be written.

        Parameters
        ----------
        mz_array: :class:`np.ndarray` of floats
            The m/z array of the spectrum
        intensity_array: :class:`np.ndarray` of floats
            The intensity array of the spectrum
        charge_array: :class:`np.ndarray`, optional
            The charge state array of the spectrum, optional.
        id: str
            The native ID of the spectrum.
        polarity: str or int, optional
            The polarity of the spectrum. If an integer, the sign of
            the integer is used, otherwise it is interpreted as a cvParam
        centroided: bool, optional
            Whether the spectrum is continuous or discretized by peak picking.
            Defaults to :const:`True`.
        precursor_information: dict or :class:`PrecursorBuilder`, optional
            The precursor ion description. Will be passed to :meth:`_prepare_precursor_list`.
            The structure of this object should either be formatted as arguments to
            :meth:`precursor_builder`, or a :class:`PrecursorBuilder` instance populated
            with information.
        scan_start_time: float, optional
            The scan start time, in minutes
        params: list, optional
            The parameters of the `spectrum`
        compression: str, optional
            The compression type name to use. Defaults to `COMPRESSION_ZLIB`.
        encoding: dict, optional
            A mapping from array name to NumPy data types.
        other_arrays: dict, optional
            A mapping of array names to additional data arrays
        scan_params: list, optional
            A list of cvParams for the `scan` of this `spectrum`
        scan_window_list: list, optional
            A list of scan windows specified as pairs of m/z intervals
        instrument_configuration_id: str, optional
            The `id` of the `instrumentConfiguration` to associate with this spectrum
            if not the default one.

        Returns
        -------
        :class:`~.Spectrum`
        '''
        self.state_machine.expects_state("spectrum_list")
        if encoding is None:
            encoding = {MZ_ARRAY: np.float64}
        if params is None:
            params = []
        else:
            params = list(params)
        if scan_params is None:
            scan_params = []
        else:
            scan_params = list(scan_params)
        if other_arrays is None:
            other_arrays = []
        if scan_window_list is None:
            scan_window_list = []
        else:
            scan_window_list = list(scan_window_list)

        if isinstance(encoding, Mapping):
            encoding = defaultdict(lambda: np.float32, encoding)
        else:
            # create new variable to capture in closure
            _encoding = encoding
            encoding = defaultdict(lambda: _encoding)
        if polarity is not None:
            if isinstance(polarity, int):
                if polarity > 0:
                    polarity = 'positive scan'
                elif polarity < 0:
                    polarity = 'negative scan'
                else:
                    polarity = None
            elif 'positive' in polarity:
                polarity = 'positive scan'
            elif 'negative' in polarity:
                polarity = 'negative scan'
            else:
                polarity = None

            if polarity not in params and polarity is not None:
                params.append(polarity)

        if centroided:
            peak_mode = "centroid spectrum"
        else:
            peak_mode = 'profile spectrum'
        params.append(peak_mode)

        array_list = []
        default_array_length = len(mz_array) if mz_array is not None else 0
        if mz_array is not None:
            mz_array_tag = self._prepare_array(mz_array,
                                               encoding=encoding[MZ_ARRAY],
                                               compression=compression,
                                               array_type=MZ_ARRAY)
            array_list.append(mz_array_tag)

        if intensity_array is not None:
            intensity_array_tag = self._prepare_array(
                intensity_array,
                encoding=encoding[INTENSITY_ARRAY],
                compression=compression,
                array_type={
                    "name": INTENSITY_ARRAY,
                    "unit_name": intensity_unit
                })
            array_list.append(intensity_array_tag)

        if charge_array is not None:
            charge_array_tag = self._prepare_array(
                charge_array,
                encoding=encoding[CHARGE_ARRAY],
                compression=compression,
                array_type=CHARGE_ARRAY)
            array_list.append(charge_array_tag)
        for array_type, array in other_arrays:
            if array_type is None:
                raise ValueError("array type can't be None")
            array_tag = self._prepare_array(
                array,
                encoding=encoding[array_type],
                compression=compression,
                array_type=array_type,
                default_array_length=default_array_length)
            array_list.append(array_tag)
        array_list_tag = self.BinaryDataArrayList(array_list)

        if precursor_information is not None:
            precursor_list = self._prepare_precursor_list(
                precursor_information, intensity_unit=intensity_unit)
        else:
            precursor_list = None

        if scan_start_time is not None:
            if isinstance(scan_start_time, numbers.Number):
                scan_params.append({
                    "name": "scan start time",
                    "value": scan_start_time,
                    "unitName": DEFAULT_TIME_UNIT
                })
            else:
                scan_params.append(scan_start_time)
        # The spec says this is optional, but the validator calls this a must
        # if self.default_instrument_configuration == instrument_configuration_id:
        #     instrument_configuration_id = None
        scan = self.Scan(
            scan_window_list=scan_window_list,
            params=scan_params,
            instrument_configuration_ref=instrument_configuration_id)
        scan_list = self.ScanList([scan], params=["no combination"])

        index = self.spectrum_count
        self.spectrum_count += 1
        spectrum = self.Spectrum(index,
                                 array_list_tag,
                                 scan_list=scan_list,
                                 params=params,
                                 id=id,
                                 default_array_length=default_array_length,
                                 precursor_list=precursor_list)
        return spectrum

    def write_spectrum(self,
                       mz_array=None,
                       intensity_array=None,
                       charge_array=None,
                       id=None,
                       polarity='positive scan',
                       centroided=True,
                       precursor_information=None,
                       scan_start_time=None,
                       params=None,
                       compression=COMPRESSION_ZLIB,
                       encoding=None,
                       other_arrays=None,
                       scan_params=None,
                       scan_window_list=None,
                       instrument_configuration_id=None,
                       intensity_unit=DEFAULT_INTENSITY_UNIT):
        '''Write a :class:`~.Spectrum` with the provided data.

        Parameters
        ----------
        mz_array: :class:`np.ndarray` of floats
            The m/z array of the spectrum
        intensity_array: :class:`np.ndarray` of floats
            The intensity array of the spectrum
        charge_array: :class:`np.ndarray`, optional
            The charge state array of the spectrum, optional.
        id: str
            The native ID of the spectrum.
        polarity: str or int, optional
            The polarity of the spectrum. If an integer, the sign of
            the integer is used, otherwise it is interpreted as a cvParam
        centroided: bool, optional
            Whether the spectrum is continuous or discretized by peak picking.
            Defaults to :const:`True`.
        precursor_information: dict or :class:`PrecursorBuilder`, optional
            The precursor ion description. Will be passed to :meth:`_prepare_precursor_list`.
            The structure of this object should either be formatted as arguments to
            :meth:`precursor_builder`, or a :class:`PrecursorBuilder` instance populated
            with information.
        scan_start_time: float, optional
            The scan start time, in minutes
        params: list, optional
            The parameters of the `spectrum`
        compression: str, optional
            The compression type name to use. Defaults to `COMPRESSION_ZLIB`.
        encoding: dict, optional
            A mapping from array name to NumPy data types.
        other_arrays: dict, optional
            A mapping of array names to additional data arrays
        scan_params: list, optional
            A list of cvParams for the `scan` of this `spectrum`
        scan_window_list: list, optional
            A list of scan windows specified as pairs of m/z intervals
        instrument_configuration_id: str, optional
            The `id` of the `instrumentConfiguration` to associate with this spectrum
            if not the default one.

        See Also
        --------
        :meth:`spectrum`
        '''
        spectrum = self.spectrum(
            mz_array=mz_array,
            intensity_array=intensity_array,
            charge_array=charge_array,
            id=id,
            polarity=polarity,
            centroided=centroided,
            precursor_information=precursor_information,
            scan_start_time=scan_start_time,
            params=params,
            compression=compression,
            encoding=encoding,
            other_arrays=other_arrays,
            scan_params=scan_params,
            scan_window_list=scan_window_list,
            instrument_configuration_id=instrument_configuration_id,
            intensity_unit=intensity_unit)
        spectrum.write(self.writer)

    def chromatogram(self,
                     time_array,
                     intensity_array,
                     id=None,
                     chromatogram_type="selected ion current",
                     precursor_information=None,
                     params=None,
                     compression=COMPRESSION_ZLIB,
                     encoding=32,
                     other_arrays=None,
                     intensity_unit=DEFAULT_INTENSITY_UNIT,
                     time_unit=DEFAULT_TIME_UNIT):
        self.state_machine.expects_state("chromatogram_list")
        if params is None:
            params = []
        else:
            params = list(params)

        if isinstance(encoding, Mapping):
            encoding = defaultdict(lambda: np.float32, encoding)
        else:
            # create new variable to capture in closure
            _encoding = encoding
            encoding = defaultdict(lambda: _encoding)

        if other_arrays is None:
            other_arrays = []
        array_list = []

        if precursor_information is not None:
            precursor = self._prepare_precursor_list(
                precursor_information, intensity_unit=intensity_unit)[0]
        else:
            precursor = None

        default_array_length = len(time_array) if time_array is not None else 0
        if time_array is not None:
            time_array_tag = self._prepare_array(time_array,
                                                 encoding=encoding[TIME_ARRAY],
                                                 compression=compression,
                                                 array_type={
                                                     "name": TIME_ARRAY,
                                                     "unit_name": time_unit
                                                 })
            array_list.append(time_array_tag)

        if intensity_array is not None:
            intensity_array_tag = self._prepare_array(
                intensity_array,
                encoding=encoding[INTENSITY_ARRAY],
                compression=compression,
                array_type={
                    "name": INTENSITY_ARRAY,
                    "unit_name": intensity_unit
                })
            array_list.append(intensity_array_tag)

        for array_type, array in other_arrays:
            array_tag = self._prepare_array(
                array,
                encoding=encoding[array_type],
                compression=compression,
                array_type=array_type,
                default_array_length=default_array_length)
            array_list.append(array_tag)
        params.append(chromatogram_type)
        array_list_tag = self.BinaryDataArrayList(array_list)
        index = self.chromatogram_count
        self.chromatogram_count += 1
        chromatogram = self.Chromatogram(
            index=index,
            binary_data_list=array_list_tag,
            precursor=precursor,
            default_array_length=default_array_length,
            id=id,
            params=params)
        return chromatogram

    def write_chromatogram(self,
                           time_array,
                           intensity_array,
                           id=None,
                           chromatogram_type="selected ion current",
                           precursor_information=None,
                           params=None,
                           compression=COMPRESSION_ZLIB,
                           encoding=32,
                           other_arrays=None,
                           intensity_unit=DEFAULT_INTENSITY_UNIT,
                           time_unit=DEFAULT_TIME_UNIT):
        chromatogram = self.chromatogram(
            time_array=time_array,
            intensity_array=intensity_array,
            id=id,
            chromatogram_type=chromatogram_type,
            precursor_information=precursor_information,
            params=params,
            compression=compression,
            encoding=encoding,
            other_arrays=other_arrays,
            intensity_unit=intensity_unit,
            time_unit=time_unit)
        chromatogram.write(self.writer)

    def _prepare_array(self,
                       array,
                       encoding=32,
                       compression=COMPRESSION_ZLIB,
                       array_type=None,
                       default_array_length=None):
        if isinstance(encoding, numbers.Number):
            _encoding = int(encoding)
        else:
            _encoding = encoding
        dtype = encoding_map[_encoding]
        array = np.array(array, dtype=dtype)
        encoded_binary = encode_array(array,
                                      compression=compression,
                                      dtype=dtype)
        binary = self.Binary(encoded_binary)
        if default_array_length is not None and len(
                array) != default_array_length:
            override_length = True
        else:
            override_length = False
        params = []
        if array_type is not None:
            params.append(array_type)
            if isinstance(array_type, Mapping):
                array_type_ = array_type['name']
            else:
                array_type_ = array_type
            if array_type_ not in ARRAY_TYPES:
                params.append(NON_STANDARD_ARRAY)
        params.append(compression_map[compression])
        params.append(dtype_to_encoding[dtype])
        encoded_length = len(encoded_binary)
        return self.BinaryDataArray(
            binary,
            encoded_length,
            array_length=(len(array) if override_length else None),
            params=params)

    def _prepare_precursor_list(self,
                                precursors,
                                intensity_unit=DEFAULT_INTENSITY_UNIT):
        if isinstance(precursors, self.PrecursorList.type):
            return precursors
        elif isinstance(precursors, (dict)):
            precursors = self.PrecursorList([
                self._prepare_precursor_information(
                    intensity_unit=intensity_unit, **precursors)
            ])
        elif isinstance(precursors, PrecursorBuilder):
            precursors = self.PrecursorList([
                self._prepare_precursor_information(
                    precursors, intensity_unit=intensity_unit)
            ])
        else:
            packaged = []
            for p in ensure_iterable(precursors):
                if isinstance(p, self.Precursor.type):
                    packaged.append(p)
                elif isinstance(p, dict):
                    packaged.append(
                        self._prepare_precursor_information(
                            intensity_unit=intensity_unit, **p))
                elif isinstance(p, PrecursorBuilder):
                    packaged.append(
                        self._prepare_precursor_information(
                            p, intensity_unit=intensity_unit))
            precursors = self.PrecursorList(packaged)
        return precursors

    def _prepare_precursor_information(self,
                                       mz=None,
                                       intensity=None,
                                       charge=None,
                                       spectrum_reference=None,
                                       activation=None,
                                       isolation_window_args=None,
                                       params=None,
                                       intensity_unit=DEFAULT_INTENSITY_UNIT,
                                       scan_id=None,
                                       external_spectrum_id=None,
                                       source_file_reference=None):
        '''Prepare a :class:`Precursor` element from disparate data structures.

        Parameters
        ----------
        mz: float, optional
            The m/z of the first selected ion
        intensity: float, optional
            The intensity of the first selected ion
        charge: int, optional
            The charge state of the first seelcted ion
        spectrum_reference: str, optional
            The `id` of the prescursor `<spectrum>` for this precursor
        activation: dict, optional
            Parameters forwarded to :meth:`PrecursorBuilder.activation`
        isolation_window_args: tuple, list, or dict, optional
            Parameters forwarded to :meth:PrecursorBuilder.isolation_window`,
            tuple or list values are converted into :class:`dict` of the correct
            structure.
        params: list, optional
            The cv-params of the first selected ion
        intensity_unit: str
            The intensity unit of the first selected ion
        scan_id: str, optional
            An alias for `spectrum_reference`
        external_spectrum_id: str, optional
            The `externalSpectrumID` attribute of the precursor
        source_file_reference: str, optional
            The `sourceFileRef` attribute of the precursor

        Returns
        -------
        :class:`~.Precursor`
        '''
        if isinstance(mz, PrecursorBuilder):
            return self.Precursor(**mz.pack())
        if scan_id is not None:
            spectrum_reference = scan_id
        if params is None:
            params = []
        if activation:
            activation = self.Activation(activation)
        if any((mz, intensity, charge)):
            ion = self.SelectedIon(mz, intensity, charge, params=params)
            ion_list = self.SelectedIonList([ion])
        else:
            ion_list = None
        if isolation_window_args:
            isolation_window_tag = self.IsolationWindow(
                **isolation_window_args)
        else:
            isolation_window_tag = None
        precursor = self.Precursor(ion_list,
                                   activation=activation,
                                   isolation_window=isolation_window_tag,
                                   spectrum_reference=spectrum_reference)
        return precursor

    def precursor_builder(self,
                          mz=None,
                          intensity=None,
                          charge=None,
                          spectrum_reference=None,
                          activation=None,
                          isolation_window_args=None,
                          params=None,
                          intensity_unit=DEFAULT_INTENSITY_UNIT,
                          scan_id=None,
                          external_spectrum_id=None,
                          source_file_reference=None):
        '''Create a :class:`PrecursorBuilder`, an object to help populate the precursor information
        data structure.

        The helper object should be used to incrementally populate the precursor information passed
        to :meth:`spectrum` or :meth:`write_spectrum`'s `precursor_information` argument.

        Parameters
        ----------
        mz: float, optional
            The m/z of the first selected ion
        intensity: float, optional
            The intensity of the first selected ion
        charge: int, optional
            The charge state of the first selected ion
        spectrum_reference: str, optional
            The `id` of the prescursor `<spectrum>` for this precursor, mapped through the
            document context.
        activation: dict or list, optional
            Parameters forwarded to :meth:`PrecursorBuilder.activation`. This should be a dictionary
            with a key "params" and a list of :class:`~.CVParam` coerce-able values, with additional
            optional keys naming other :class:`~.CVParam` coerce-able values.
        isolation_window_args: tuple, list, or dict, optional
            Parameters forwarded to :meth:PrecursorBuilder.isolation_window`,
            tuple or list of three values are converted into :class:`dict` of the correct
            structure. The expected keys are "lower", the lower m/z offset, "target", the center m/z,
            and "upper", the upper m/z offset.
        params: list, optional
            The cv- and user-params of the first selected ion, in addition to `mz`, `intensity`,
            `charge`.
        intensity_unit: str
            The intensity unit of the first selected ion, to be specified with `intensity`
        scan_id: str, optional
            An alias for `spectrum_reference`
        external_spectrum_id: str, optional
            The `externalSpectrumID` attribute of the precursor
        source_file_reference: str, optional
            The `sourceFileRef` attribute of the precursor

        Returns
        -------
        :class:`PrecursorBuilder`
        '''
        if scan_id is None:
            spectrum_reference = scan_id
        inst = PrecursorBuilder(self,
                                spectrum_reference=spectrum_reference,
                                external_spectrum_id=external_spectrum_id)
        if mz is not None or intensity is not None or charge is not None or params is not None:
            inst.selected_ion(mz=mz,
                              intensity=intensity,
                              charge=charge,
                              intensity_unit=intensity_unit,
                              params=params)
        if isolation_window_args is None:
            if isinstance(isolation_window_args, (tuple, list)):
                isolation_window_args = {
                    "lower": isolation_window_args[0],
                    "target": isolation_window_args[1],
                    "upper": isolation_window_args[2]
                }
            inst.isolation_window(isolation_window_args)
        if activation is not None:
            inst.activation(activation)
        return inst