def index_database(database_connection, channel):
    """Create any missing indices on *database_connection* and report
    query plans, sending progress messages over *channel*.

    Each ``(table, index_name)`` pair in the module-level ``index_updates``
    list is looked up and created if found; afterwards the database is
    re-analyzed and ``EXPLAIN QUERY PLAN`` output for ``target_queries``
    is logged.
    """
    try:
        channel.send(Message("Analyzing Database", 'update'))
        handle = DatabaseBoundOperation(database_connection)
        for (table, ix_name) in index_updates:
            session = handle.session()
            connection = session.connection()
            index = find_index_by_name(table, ix_name)
            if index is None:
                continue
            try:
                index.create(connection)
            except Exception:
                # Index creation may fail (e.g. it already exists); undo
                # the partial transaction and continue with the next index.
                # Was a bare ``except:`` which also swallowed
                # KeyboardInterrupt/SystemExit.
                session.rollback()
            session.commit()
        handle._analyze_database()
        session = handle.session()
        for query in target_queries:
            result = session.execute("EXPLAIN QUERY PLAN " + query)
            channel.log("%s:\n\t%r" % (query, ' '.join(map(str, result))))
        channel.send(Message("Indexing Complete", 'update'))
    except Exception:
        # Report the failure to the caller instead of crashing the worker.
        channel.send(Message.traceback())
def glycopeptide_mzidentml(database_connection, analysis_identifier, output_path=None,
                           mzml_path=None, embed_protein_sequences=True):
    '''Write identified glycopeptides as mzIdentML file, and associated MSn spectra
    to a paired mzML file if the matched data are available. If an mzML file is written
    it will also contain the extracted ion chromatograms for each glycopeptide with an
    extracted elution profile.
    '''
    database_connection = DatabaseBoundOperation(database_connection)
    session = database_connection.session()  # pylint: disable=not-callable
    analysis = get_by_name_or_id(session, Analysis, analysis_identifier)
    if not analysis.analysis_type == AnalysisTypeEnum.glycopeptide_lc_msms:
        click.secho("Analysis %r is of type %r." % (str(analysis.name),
                                                    str(analysis.analysis_type)), fg='red', err=True)
        raise click.Abort()
    loader = AnalysisDeserializer(database_connection._original_connection,
                                  analysis_id=analysis.id)
    click.echo("Loading Identifications")
    # Filter on the resolved analysis's primary key. The previous code
    # compared against ``analysis_identifier``, which may be a *name*
    # (it is resolved via get_by_name_or_id above) and would then match
    # nothing.
    glycopeptides = loader.query(IdentifiedGlycopeptide).filter(
        IdentifiedGlycopeptide.analysis_id == analysis.id).all()
    with open(output_path, 'wb') as outfile:
        writer = MzIdentMLSerializer(
            outfile, glycopeptides, analysis, loader,
            source_mzml_path=mzml_path,
            embed_protein_sequences=embed_protein_sequences)
        writer.run()
def glycopeptide_hypothesis(database_connection, hypothesis_identifier, output_path, multifasta=False):
    '''Write each theoretical glycopeptide in CSV format
    '''
    handle = DatabaseBoundOperation(database_connection)
    session = handle.session()
    hypothesis = get_by_name_or_id(session, GlycopeptideHypothesis, hypothesis_identifier)

    def generate():
        # Page through the hypothesis's glycopeptides in fixed-size slices
        # so the full table is never resident in memory at once.
        chunk_size = 100000
        offset = 0
        while True:
            session.expire_all()
            batch = hypothesis.glycopeptides.slice(offset, offset + chunk_size).all()
            if not batch:
                break
            for glycopeptide in batch:
                yield glycopeptide
            offset += chunk_size

    if output_path is not None:
        output_stream = open(output_path, 'wb')
    else:
        output_stream = ctxstream(sys.stdout)
    with output_stream:
        job = GlycopeptideHypothesisCSVSerializer(output_stream, generate())
        job.run()
def export_identified_glycans_from_glycopeptides(database_connection, analysis_identifier, output_path):
    """Write the distinct glycan compositions attached to identified
    glycopeptides of a glycopeptide LC-MS/MS analysis as an importable
    glycan hypothesis CSV."""
    handle = DatabaseBoundOperation(database_connection)
    session = handle.session()  # pylint: disable=not-callable
    analysis = get_by_name_or_id(session, Analysis, analysis_identifier)
    if not analysis.analysis_type == AnalysisTypeEnum.glycopeptide_lc_msms:
        click.secho("Analysis %r is of type %r." % (str(analysis.name),
                                                    str(analysis.analysis_type)), fg='red', err=True)
        raise click.Abort()
    # Walk from glycan composition through glycan combination to the
    # identified glycopeptides of this analysis.
    glycans = session.query(GlycanComposition).join(
        GlycanCombinationGlycanComposition).join(GlycanCombination).join(
        Glycopeptide,
        Glycopeptide.glycan_combination_id == GlycanCombination.id).join(
        IdentifiedGlycopeptide,
        IdentifiedGlycopeptide.structure_id == Glycopeptide.id).filter(
        IdentifiedGlycopeptide.analysis_id == analysis.id).all()
    if output_path is not None:
        output_stream = open(output_path, 'wb')
    else:
        output_stream = ctxstream(click.get_binary_stream('stdout'))
    with output_stream:
        ImportableGlycanHypothesisCSVSerializer(output_stream, glycans).run()
def sql_shell(database_connection, script=None):
    """Open an interactive SQL prompt against *database_connection*, or, if
    *script* is given, execute it once and dump the rows as CSV to stdout."""
    session = DatabaseBoundOperation(database_connection).session()
    interpreter = SQLShellInterpreter(session)
    if script is not None:
        rows = session.execute(script)
        interpreter._to_csv(list(rows), sys.stdout)
    else:
        interpreter.cmdloop()
def glycan_composition_identification(database_connection, analysis_identifier, output_path=None,
                                      threshold=0, report=False):
    '''Write each glycan chromatogram in CSV format
    '''
    database_connection = DatabaseBoundOperation(database_connection)
    session = database_connection.session()
    analysis = get_by_name_or_id(session, Analysis, analysis_identifier)
    if not analysis.analysis_type == AnalysisTypeEnum.glycan_lc_ms:
        click.secho("Analysis %r is of type %r." % (
            str(analysis.name), str(analysis.analysis_type)), fg='red', err=True)
        raise click.Abort()
    analysis_id = analysis.id
    if output_path is None:
        output_stream = ctxstream(sys.stdout)
    else:
        output_stream = open(output_path, 'wb')
    if report:
        with output_stream:
            job = GlycanChromatogramReportCreator(
                database_connection._original_connection,
                analysis_id, output_stream, threshold=threshold)
            job.run()
    else:
        def stream_query(query, interval=100):
            # Page through *query* in fixed-size slices, expiring the session
            # between pages so processed rows can be released. Previously this
            # loop was copy-pasted twice, once per chromatogram type.
            i = 0
            while True:
                session.expire_all()
                chunk = query.slice(i, i + interval).all()
                if len(chunk) == 0:
                    break
                for gcs in chunk:
                    yield gcs.convert()
                i += interval

        def generate():
            # Identified glycan composition chromatograms first, then
            # unidentified chromatograms, both above the score threshold.
            for item in stream_query(session.query(GlycanCompositionChromatogram).filter(
                    GlycanCompositionChromatogram.analysis_id == analysis_id,
                    GlycanCompositionChromatogram.score > threshold)):
                yield item
            for item in stream_query(session.query(UnidentifiedChromatogram).filter(
                    UnidentifiedChromatogram.analysis_id == analysis_id,
                    UnidentifiedChromatogram.score > threshold)):
                yield item

        with output_stream:
            job = GlycanLCMSAnalysisCSVSerializer(output_stream, generate())
            job.run()
def sql_shell(database_connection, script=None):
    """Run an interactive SQL session over *database_connection*; when
    *script* is supplied, execute it and write the result to stdout as CSV."""
    db = DatabaseBoundOperation(database_connection)
    session = db.session()  # pylint: disable=not-callable
    shell = SQLShellInterpreter(session)
    if script is None:
        shell.cmdloop()
        return
    result_rows = list(session.execute(script))
    shell._to_csv(result_rows, sys.stdout)
def stream_from_hypotheses(self, connection, hypothesis_id):
    """Yield ``(db_composition, structure_class_names)`` pairs for every
    glycan composition of the given hypothesis.

    ``structure_class_names`` is a list of class names, or ``[None]`` when
    the composition has no structure-class annotations.
    """
    self.log("Streaming from %s for hypothesis %d" % (connection, hypothesis_id))
    connection = DatabaseBoundOperation(connection)
    session = connection.session()
    for db_composition in session.query(DBGlycanComposition).filter(
            DBGlycanComposition.hypothesis_id == hypothesis_id):
        structure_classes = list(db_composition.structure_classes)
        if len(structure_classes) > 0:
            # Reuse the list materialized above instead of traversing the
            # (potentially lazy) relationship a second time.
            yield db_composition, [sc.name for sc in structure_classes]
        else:
            yield db_composition, [None]
def glycopeptide_identification(database_connection, analysis_identifier, output_path=None,
                                report=False, mzml_path=None, threshold=0):
    '''Write each distinct identified glycopeptide in CSV format
    '''
    database_connection = DatabaseBoundOperation(database_connection)
    session = database_connection.session()
    analysis = get_by_name_or_id(session, Analysis, analysis_identifier)
    if not analysis.analysis_type == AnalysisTypeEnum.glycopeptide_lc_msms:
        click.secho("Analysis %r is of type %r." % (
            str(analysis.name), str(analysis.analysis_type)), fg='red', err=True)
        raise click.Abort()
    analysis_id = analysis.id
    if output_path is not None:
        output_stream = open(output_path, 'wb')
    else:
        output_stream = ctxstream(sys.stdout)
    if report:
        with output_stream:
            if mzml_path is None:
                # Fall back to the sample file recorded with the analysis.
                mzml_path = analysis.parameters['sample_path']
                if not os.path.exists(mzml_path):
                    raise click.ClickException(
                        ("Sample path {} not found. Pass the path to"
                         " this file as `-m/--mzml-path` for this command.").format(
                            mzml_path))
            GlycopeptideDatabaseSearchReportCreator(
                database_connection._original_connection, analysis_id,
                stream=output_stream, threshold=threshold,
                mzml_path=mzml_path).run()
    else:
        # Map protein id -> name for the CSV serializer.
        protein_index = dict(
            session.query(Protein.id, Protein.name).join(Protein.glycopeptides).join(
                IdentifiedGlycopeptide).filter(
                IdentifiedGlycopeptide.analysis_id == analysis.id))

        def generate():
            # Stream identifications in fixed-size pages, expiring the
            # session between pages to bound memory use.
            page = 100
            offset = 0
            while True:
                session.expire_all()
                batch = session.query(IdentifiedGlycopeptide).filter(
                    IdentifiedGlycopeptide.analysis_id == analysis_id).slice(
                    offset, offset + page).all()
                if not batch:
                    break
                for glycopeptide in batch:
                    yield glycopeptide.convert()
                offset += page

        with output_stream:
            GlycopeptideLCMSMSAnalysisCSVSerializer(
                output_stream, generate(), protein_index).run()
def glycopeptide_training_mgf(database_connection, analysis_identifier, output_path=None,
                              mzml_path=None, threshold=None):
    """Export matched glycopeptide spectra from an LC-MS/MS analysis as a
    training MGF file, to *output_path* or stdout."""
    database_connection = DatabaseBoundOperation(database_connection)
    session = database_connection.session()
    analysis = get_by_name_or_id(session, Analysis, analysis_identifier)
    if not analysis.analysis_type == AnalysisTypeEnum.glycopeptide_lc_msms:
        click.secho("Analysis %r is of type %r." % (
            str(analysis.name), str(analysis.analysis_type)), fg='red', err=True)
        raise click.Abort()
    output_stream = ctxstream(sys.stdout) if output_path is None else open(output_path, 'wb')
    with output_stream:
        exporter = TrainingMGFExporter.from_analysis(
            database_connection, analysis.id, output_stream, mzml_path, threshold)
        exporter.run()
def annotate_matched_spectra(database_connection, analysis_identifier, output_path, mzml_path=None):
    """Render annotated spectrum figures for each matched glycopeptide
    spectrum of the analysis, writing them under *output_path*."""
    database_connection = DatabaseBoundOperation(database_connection)
    session = database_connection.session()
    analysis = get_by_name_or_id(session, Analysis, analysis_identifier)
    if not analysis.analysis_type == AnalysisTypeEnum.glycopeptide_lc_msms:
        click.secho("Analysis %r is of type %r." % (
            str(analysis.name), str(analysis.analysis_type)), fg='red', err=True)
        raise click.Abort()
    if output_path is None:
        # Default to the directory containing the results database file.
        output_path = os.path.dirname(database_connection._original_connection)
    job = SpectrumAnnotatorExport(
        database_connection._original_connection, analysis.id, output_path,
        mzml_path)
    job.display_header()
    job.start()
def glycopeptide_chromatogram_records(database_connection, analysis_identifier, output_path,
                                      apex_time_range=None):
    """Dump identified glycopeptide chromatograms as elution-time training
    records in CSV form, keeping only those whose apex time falls inside
    *apex_time_range* (default: all)."""
    if apex_time_range is None:
        apex_time_range = (0, float('inf'))
    database_connection = DatabaseBoundOperation(database_connection)
    session = database_connection.session()  # pylint: disable=not-callable
    analysis = get_by_name_or_id(session, Analysis, analysis_identifier)
    if not analysis.analysis_type == AnalysisTypeEnum.glycopeptide_lc_msms:
        click.secho("Analysis %r is of type %r." % (str(analysis.name),
                                                    str(analysis.analysis_type)), fg='red', err=True)
        raise click.Abort()
    fh = click.get_binary_stream('stdout') if output_path is None else open(output_path, 'wb')
    idgps = session.query(IdentifiedGlycopeptide).filter(
        IdentifiedGlycopeptide.analysis_id == analysis.id).all()
    n = len(idgps)
    from glycan_profiling.scoring.elution_time_grouping import GlycopeptideChromatogramProxy
    analysis_name = analysis.name
    start_time, stop_time = apex_time_range
    cases = []
    for i, idgp in enumerate(idgps):
        if i % 50 == 0:
            click.echo("%d/%d Records Processed" % (i, n), err=True)
        # Skip records without a chromatogram or with a negative MS1 score.
        if idgp.chromatogram is None or idgp.ms1_score < 0:
            continue
        proxy = GlycopeptideChromatogramProxy.from_obj(
            idgp, ms1_score=idgp.ms1_score, ms2_score=idgp.ms2_score,
            q_value=idgp.q_value, analysis_name=analysis_name,
            mass_shifts=';'.join([m.name for m in idgp.chromatogram.mass_shifts]))
        if start_time <= proxy.apex_time <= stop_time:
            cases.append(proxy)
    click.echo("Writing %d Records" % (len(cases), ), err=True)
    with fh:
        GlycopeptideChromatogramProxy.to_csv(cases, csv_stream(fh))
def glycopeptide_spectrum_matches(database_connection, analysis_identifier, output_path=None):
    '''Write each matched glycopeptide spectrum in CSV format
    '''
    database_connection = DatabaseBoundOperation(database_connection)
    session = database_connection.session()  # pylint: disable=not-callable
    analysis = get_by_name_or_id(session, Analysis, analysis_identifier)
    if not analysis.analysis_type == AnalysisTypeEnum.glycopeptide_lc_msms:
        click.secho("Analysis %r is of type %r." % (str(analysis.name),
                                                    str(analysis.analysis_type)), fg='red', err=True)
        raise click.Abort()
    analysis_id = analysis.id
    # Map protein id -> name for the CSV serializer.
    protein_index = dict(session.query(Protein.id, Protein.name).join(
        Protein.glycopeptides).join(GlycopeptideSpectrumMatch).filter(
        GlycopeptideSpectrumMatch.analysis_id == analysis.id))

    def generate():
        # Stream spectrum matches ordered by scan id in large pages,
        # expiring the session between pages to bound memory use.
        step = 100000
        lo = 0
        base = session.query(GlycopeptideSpectrumMatch).filter(
            GlycopeptideSpectrumMatch.analysis_id == analysis_id).order_by(
            GlycopeptideSpectrumMatch.scan_id)
        while True:
            session.expire_all()
            page = base.slice(lo, lo + step).all()
            if not page:
                break
            for gsm in page:
                yield gsm.convert()
            lo += step

    if output_path is not None:
        output_stream = open(output_path, 'wb')
    else:
        output_stream = ctxstream(click.get_binary_stream('stdout'))
    with output_stream:
        GlycopeptideSpectrumMatchAnalysisCSVSerializer(
            output_stream, generate(), protein_index).run()
def annotate_matched_spectra(database_connection, analysis_identifier, output_path, mzml_path=None):
    """Generate annotated spectrum drawings for every matched glycopeptide
    spectrum in the analysis."""
    handle = DatabaseBoundOperation(database_connection)
    session = handle.session()  # pylint: disable=not-callable
    analysis = get_by_name_or_id(session, Analysis, analysis_identifier)
    if not analysis.analysis_type == AnalysisTypeEnum.glycopeptide_lc_msms:
        click.secho("Analysis %r is of type %r." % (str(analysis.name),
                                                    str(analysis.analysis_type)), fg='red', err=True)
        raise click.Abort()
    if output_path is None:
        # Write next to the results database when no destination is given.
        output_path = os.path.dirname(handle._original_connection)
    exporter = SpectrumAnnotatorExport(handle._original_connection, analysis.id,
                                       output_path, mzml_path)
    exporter.display_header()
    exporter.start()
def glycopeptide_mzidentml(database_connection, analysis_identifier, output_path=None,
                           mzml_path=None):
    '''Write identified glycopeptides as mzIdentML file, and associated MSn spectra
    to a paired mzML file if the matched data are available. If an mzML file is written
    it will also contain the extracted ion chromatograms for each glycopeptide with an
    extracted elution profile.
    '''
    handle = DatabaseBoundOperation(database_connection)
    session = handle.session()
    analysis = get_by_name_or_id(session, Analysis, analysis_identifier)
    if not analysis.analysis_type == AnalysisTypeEnum.glycopeptide_lc_msms:
        click.secho("Analysis %r is of type %r." % (
            str(analysis.name), str(analysis.analysis_type)), fg='red', err=True)
        raise click.Abort()
    loader = AnalysisDeserializer(handle._original_connection,
                                  analysis_id=analysis.id)
    glycopeptides = loader.load_identified_glycopeptides()
    with open(output_path, 'wb') as outfile:
        MzIdentMLSerializer(
            outfile, glycopeptides, analysis, loader,
            source_mzml_path=mzml_path).run()
def export_identified_glycans_from_glycopeptides(database_connection, analysis_identifier, output_path):
    """Write the distinct glycan compositions attached to identified
    glycopeptides of a glycopeptide LC-MS/MS analysis as an importable
    glycan hypothesis CSV, to *output_path* or stdout."""
    database_connection = DatabaseBoundOperation(database_connection)
    session = database_connection.session()
    analysis = get_by_name_or_id(session, Analysis, analysis_identifier)
    if not analysis.analysis_type == AnalysisTypeEnum.glycopeptide_lc_msms:
        click.secho("Analysis %r is of type %r." % (
            str(analysis.name), str(analysis.analysis_type)), fg='red', err=True)
        raise click.Abort()
    glycans = session.query(GlycanComposition).join(
        GlycanCombinationGlycanComposition).join(GlycanCombination).join(
        Glycopeptide,
        Glycopeptide.glycan_combination_id == GlycanCombination.id).join(
        IdentifiedGlycopeptide,
        IdentifiedGlycopeptide.structure_id == Glycopeptide.id).filter(
        IdentifiedGlycopeptide.analysis_id == analysis.id).all()
    if output_path is None:
        # Use stdout's binary buffer: the serializer writes bytes (the
        # file branch below opens in 'wb'), and the sibling implementation
        # of this command already does the same. Wrapping the text-mode
        # ``sys.stdout`` here would fail on byte output under Python 3.
        output_stream = ctxstream(click.get_binary_stream('stdout'))
    else:
        output_stream = open(output_path, 'wb')
    with output_stream:
        job = ImportableGlycanHypothesisCSVSerializer(output_stream, glycans)
        job.run()
def glycopeptide_spectrum_matches(database_connection, analysis_identifier, output_path=None):
    '''Write each matched glycopeptide spectrum in CSV format
    '''
    database_connection = DatabaseBoundOperation(database_connection)
    session = database_connection.session()
    analysis = get_by_name_or_id(session, Analysis, analysis_identifier)
    if not analysis.analysis_type == AnalysisTypeEnum.glycopeptide_lc_msms:
        click.secho("Analysis %r is of type %r." % (
            str(analysis.name), str(analysis.analysis_type)), fg='red', err=True)
        raise click.Abort()
    analysis_id = analysis.id
    # Map protein id -> name for the CSV serializer.
    query = session.query(Protein.id, Protein.name).join(Protein.glycopeptides).join(
        GlycopeptideSpectrumMatch).filter(
        GlycopeptideSpectrumMatch.analysis_id == analysis.id)
    protein_index = dict(query)

    def generate():
        # Stream spectrum matches ordered by scan id in large pages,
        # expiring the session between pages to bound memory use.
        i = 0
        interval = 100000
        query = session.query(GlycopeptideSpectrumMatch).filter(
            GlycopeptideSpectrumMatch.analysis_id == analysis_id).order_by(
            GlycopeptideSpectrumMatch.scan_id)
        while True:
            session.expire_all()
            chunk = query.slice(i, i + interval).all()
            if len(chunk) == 0:
                break
            for glycopeptide in chunk:
                yield glycopeptide.convert()
            i += interval

    if output_path is None:
        # Use stdout's binary buffer: the serializer writes bytes (the file
        # branch below opens in 'wb'), and the sibling implementation of
        # this command already does the same. Wrapping the text-mode
        # ``sys.stdout`` here would fail on byte output under Python 3.
        output_stream = ctxstream(click.get_binary_stream('stdout'))
    else:
        output_stream = open(output_path, 'wb')
    with output_stream:
        job = GlycopeptideSpectrumMatchAnalysisCSVSerializer(output_stream, generate(),
                                                            protein_index)
        job.run()