def system_call(cmd):
    """Call cmd and return (stdout, stderr, return_value).

    Parameters
    ----------
    cmd : str or sequence of str
        Either a string containing the command to be run, or a list/tuple of
        strings that are the tokens of the command.

    Returns
    -------
    tuple of (str, str, int)
        The captured standard output, the captured standard error and the
        return value of the process. Because non-zero exits raise, the
        returned value is always 0.

    Raises
    ------
    ComputeError
        If the command finishes with a non-zero return value.

    Notes
    -----
    This function is ported from QIIME (http://www.qiime.org), previously
    named qiime_system_call. QIIME is a GPL project, but we obtained
    permission from the authors of this function to port it to pyqi (and keep
    it under pyqi's BSD license).
    """
    # A command string is handed to the shell, exactly as before. A token
    # sequence must bypass the shell: with shell=True, POSIX Popen would only
    # execute the first token and pass the rest to the shell itself.
    use_shell = not isinstance(cmd, (list, tuple))
    proc = Popen(cmd, universal_newlines=True, shell=use_shell, stdout=PIPE,
                 stderr=PIPE)
    # communicate pulls all stdout/stderr from the PIPEs to
    # avoid blocking -- don't remove this line!
    stdout, stderr = proc.communicate()
    return_value = proc.returncode

    if return_value != 0:
        raise ComputeError("Failed to execute: %s\nstdout: %s\nstderr: %s" %
                           (cmd, stdout, stderr))

    return stdout, stderr, return_value
def submit_EBI(preprocessed_data_id, action, send, fastq_dir_fp=None):
    """Submit a preprocessed data to EBI

    Parameters
    ----------
    preprocessed_data_id : int
        The preprocesssed data id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files
    fastq_dir_fp : str, optional
        The fastq filepath. If not provided, a temporary directory is created
        and the per-sample FASTQ files are written there from the demux file.

    Returns
    -------
    tuple
        The study accession and the submission accession returned when
        sending the XML files; (None, None) if `send` is False.

    Raises
    ------
    ValueError
        If the current submission status is 'submitting' or 'success', or if
        the investigation type of the prep template is neither an official
        nor a user-defined term of the ENA ontology.
    IOError
        If the output folder already exists.
    ComputeError
        If either accession is missing after sending the XML files.
    """
    # NOTE(review): the type of "action" in the docstring is a format
    # placeholder; presumably it is filled in elsewhere -- confirm.
    preprocessed_data = PreprocessedData(preprocessed_data_id)
    preprocessed_data_id_str = str(preprocessed_data_id)
    study = Study(preprocessed_data.study)
    sample_template = SampleTemplate(study.sample_template)
    prep_template = PrepTemplate(preprocessed_data.prep_template)

    investigation_type = None
    new_investigation_type = None

    status = preprocessed_data.submitted_to_insdc_status()
    if status in ('submitting', 'success'):
        raise ValueError("Cannot resubmit! Current status is: %s" % status)

    if send:
        # If we intend actually to send the files, then change the status in
        # the database
        preprocessed_data.update_insdc_status('submitting')

    # we need to figure out whether the investigation type is a known one
    # or if we have to submit a "new_investigation_type" to EBI
    current_type = prep_template.investigation_type
    ena_ontology = Ontology(convert_to_id('ENA', 'ontology'))
    if current_type in ena_ontology.terms:
        investigation_type = current_type
    elif current_type in ena_ontology.user_defined_terms:
        investigation_type = 'Other'
        new_investigation_type = current_type
    else:
        # This should never happen. The offending value is interpolated into
        # the message (the placeholder used to be emitted verbatim).
        raise ValueError("Unrecognized investigation type: '%s'. This term "
                         "is neither one of the official terms nor one of the "
                         "user-defined terms in the ENA ontology"
                         % current_type)

    if fastq_dir_fp is not None:
        # If the user specifies a FASTQ directory, use it

        # Set demux_samples to None so that MetadataTemplate.to_file will put
        # all samples in the template files
        demux_samples = None
    else:
        # If the user does not specify a FASTQ directory, create one and
        # re-serialize the per-sample FASTQs from the demux file
        fastq_dir_fp = mkdtemp(prefix=qiita_config.working_dir)
        demux = [path for _, path, ftype in preprocessed_data.get_filepaths()
                 if ftype == 'preprocessed_demux'][0]

        # Keep track of which files were actually in the demux file so that we
        # can write those rows to the prep and samples templates
        demux_samples = set()

        with open_file(demux) as demux_fh:
            for samp, iterator in to_per_sample_ascii(demux_fh,
                                                      list(sample_template)):
                demux_samples.add(samp)
                sample_fp = join(fastq_dir_fp, "%s.fastq.gz" % samp)
                with gzopen(sample_fp, 'w') as fh:
                    for record in iterator:
                        fh.write(record)

    output_dir = fastq_dir_fp + '_submission'

    samp_fp = join(fastq_dir_fp, 'sample_metadata.txt')
    prep_fp = join(fastq_dir_fp, 'prep_metadata.txt')

    sample_template.to_file(samp_fp, demux_samples)
    prep_template.to_file(prep_fp, demux_samples)

    # Get specific output directory and set filepaths
    get_output_fp = partial(join, output_dir)
    study_fp = get_output_fp('study.xml')
    sample_fp = get_output_fp('sample.xml')
    experiment_fp = get_output_fp('experiment.xml')
    run_fp = get_output_fp('run.xml')
    submission_fp = get_output_fp('submission.xml')

    if not isdir(output_dir):
        makedirs(output_dir)
    else:
        raise IOError('The output folder already exists: %s' % output_dir)

    with open(samp_fp, 'U') as st, open(prep_fp, 'U') as pt:
        submission = EBISubmission.from_templates_and_per_sample_fastqs(
            preprocessed_data_id_str, study.title,
            study.info['study_abstract'], investigation_type, st, pt,
            fastq_dir_fp, new_investigation_type=new_investigation_type,
            pmids=study.pmids)

    submission.write_all_xml_files(study_fp, sample_fp, experiment_fp, run_fp,
                                   submission_fp, action)

    if send:
        submission.send_sequences()
        study_accession, submission_accession = submission.send_xml()

        if study_accession is None or submission_accession is None:
            preprocessed_data.update_insdc_status('failed')

            raise ComputeError("EBI Submission failed!")
        else:
            preprocessed_data.update_insdc_status('success', study_accession,
                                                  submission_accession)
    else:
        study_accession, submission_accession = None, None

    return study_accession, submission_accession
def submit_EBI(artifact_id, action, send, test=False):
    """Submit an artifact to EBI

    Parameters
    ----------
    artifact_id : int
        The artifact id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files
    test : bool
        If True some restrictions will be ignored, only used in
        parse_EBI_reply

    Returns
    -------
    tuple
        The study, sample, biosample, experiment and run accessions parsed
        from the EBI reply, in that order; five None values if `send` is
        False.

    Raises
    ------
    ComputeError
        If sending the sequences or the XML files fails, or if the EBI reply
        cannot be parsed.
    """
    # NOTE(review): the type of "action" in the docstring is a format
    # placeholder; presumably it is filled in elsewhere -- confirm.

    # step 1: init and validate
    ebi_submission = EBISubmission(artifact_id, action)

    # step 2: generate demux fastq files
    try:
        ebi_submission.generate_demultiplexed_fastq()
    except Exception:
        # remove the partially generated submission folder, log and re-raise
        error_msg = format_exc()
        if isdir(ebi_submission.full_ebi_dir):
            rmtree(ebi_submission.full_ebi_dir)
        LogEntry.create('Runtime', error_msg,
                        info={'ebi_submission': artifact_id})
        raise

    # step 3: generate and write xml files
    ebi_submission.generate_xml_files()

    if not send:
        return None, None, None, None, None

    # getting aspera's password
    old_ascp_pass = environ.get('ASPERA_SCP_PASS', '')
    if old_ascp_pass == '':
        environ['ASPERA_SCP_PASS'] = qiita_config.ebi_seq_xfer_pass
    ascp_passwd = environ['ASPERA_SCP_PASS']

    # step 4: sending sequences
    try:
        if action != 'MODIFY':
            LogEntry.create('Runtime',
                            ("Submitting sequences for pre_processed_id: "
                             "%d" % artifact_id))
            for cmd in ebi_submission.generate_send_sequences_cmd():
                stdout, stderr, rv = system_call(cmd)
                if rv != 0:
                    error_msg = ("ASCP Error:\nStd output:%s\nStd error:%s" %
                                 (stdout, stderr))
                    raise ComputeError(error_msg)
                with open(ebi_submission.ascp_reply, 'a') as reply_fh:
                    reply_fh.write('stdout:\n%s\n\nstderr: %s'
                                   % (stdout, stderr))
            # only report success once the sequences have really been sent
            LogEntry.create('Runtime',
                            ('Submission of sequences of pre_processed_id: '
                             '%d completed successfully' % artifact_id))
    finally:
        # always put back the previous value, also when a command fails, so
        # the transfer password does not linger in the environment
        environ['ASPERA_SCP_PASS'] = old_ascp_pass

    # step 5: sending xml and parsing answer
    xmls_cmds = ebi_submission.generate_curl_command(
        ebi_seq_xfer_pass=ascp_passwd)
    LogEntry.create('Runtime',
                    ("Submitting XMLs for pre_processed_id: "
                     "%d" % artifact_id))
    xml_content, stderr, rv = system_call(xmls_cmds)
    if rv != 0:
        error_msg = ("Error:\nStd output:%s\nStd error:%s" %
                     (xml_content, stderr))
        raise ComputeError(error_msg)
    else:
        LogEntry.create('Runtime',
                        ('Submission of sequences of pre_processed_id: '
                         '%d completed successfully' % artifact_id))
    with open(ebi_submission.curl_reply, 'w') as reply_fh:
        reply_fh.write('stdout:\n%s\n\nstderr: %s' % (xml_content, stderr))

    try:
        st_acc, sa_acc, bio_acc, ex_acc, run_acc = \
            ebi_submission.parse_EBI_reply(xml_content, test=test)
    except EBISubmissionError as e:
        error = str(e)
        le = LogEntry.create('Fatal', "Command: %s\nError: %s\n" %
                             (xml_content, error),
                             info={'ebi_submission': artifact_id})
        raise ComputeError("EBI Submission failed! Log id: %d\n%s" %
                           (le.id, error))

    # the accessions are only stored when adding (or testing)
    if action == 'ADD' or test:
        if st_acc:
            ebi_submission.study.ebi_study_accession = st_acc
        if sa_acc:
            ebi_submission.sample_template.ebi_sample_accessions = sa_acc
        if bio_acc:
            ebi_submission.sample_template.biosample_accessions = bio_acc
        if ex_acc:
            ebi_submission.prep_template.ebi_experiment_accessions = ex_acc
        ebi_submission.artifact.ebi_run_accessions = run_acc

    return st_acc, sa_acc, bio_acc, ex_acc, run_acc
def submit_VAMPS(artifact_id):
    """Submit artifact to VAMPS

    Parameters
    ----------
    artifact_id : int
        The artifact id

    Returns
    -------
    bool
        True if the reply of the upload matches the expected page, in which
        case the artifact is flagged as submitted to VAMPS; False otherwise.

    Raises
    ------
    ComputeError
        - If the artifact cannot be submitted to VAMPS
        - If the artifact is associated with more than one prep template
        - If the upload command fails
    ValueError
        If the artifact has already been submitted to VAMPS
    """
    artifact = Artifact(artifact_id)
    if not artifact.can_be_submitted_to_vamps:
        raise ComputeError("Artifact %d cannot be submitted to VAMPS"
                           % artifact_id)
    study = artifact.study
    sample_template = study.sample_template
    prep_templates = artifact.prep_templates
    if len(prep_templates) > 1:
        raise ComputeError(
            "Multiple prep templates associated with the artifact: %s"
            % artifact_id)
    prep_template = prep_templates[0]

    # Also need to check that is not submitting (see item in #1523)
    if artifact.is_submitted_to_vamps:
        raise ValueError("Cannot resubmit artifact %s to VAMPS!" % artifact_id)

    # Generating a tgz
    targz_folder = mkdtemp(prefix=qiita_config.working_dir)
    targz_fp = join(targz_folder, '%d_%d_%d.tgz' % (study.id,
                                                    prep_template.id,
                                                    artifact_id))
    targz = taropen(targz_fp, mode='w:gz')
    try:
        # adding sample/prep
        samp_fp = join(targz_folder, 'sample_metadata.txt')
        sample_template.to_file(samp_fp)
        targz.add(samp_fp, arcname='sample_metadata.txt')
        prep_fp = join(targz_folder, 'prep_metadata.txt')
        prep_template.to_file(prep_fp)
        targz.add(prep_fp, arcname='prep_metadata.txt')

        # adding preprocessed data
        for _, fp, fp_type in artifact.filepaths:
            if fp_type == 'preprocessed_fasta':
                targz.add(fp, arcname='preprocessed_fasta.fna')
    finally:
        # close the archive even if building it fails
        targz.close()

    # submitting
    # The format string needs one placeholder per argument (user, password,
    # file and url); with the password placeholder missing the formatting
    # itself raised a TypeError.
    cmd = ("curl -F user=%s -F pass='%s' -F uploadFile=@%s -F "
           "press=UploadFile %s" % (qiita_config.vamps_user,
                                    qiita_config.vamps_pass,
                                    targz_fp,
                                    qiita_config.vamps_url))
    obs, stderr, rv = system_call(cmd)
    if rv != 0:
        error_msg = ("Error:\nStd output:%s\nStd error:%s" % (obs, stderr))
        raise ComputeError(error_msg)

    # the upload is considered successful only on this exact reply
    exp = ("<html>\n<head>\n<title>Process Uploaded File</title>\n</head>\n"
           "<body>\n</body>\n</html>")

    if obs != exp:
        return False
    else:
        artifact.is_submitted_to_vamps = True
        return True
def submit_EBI(artifact_id, action, send, test=False, test_size=False):
    """Submit an artifact to EBI

    Parameters
    ----------
    artifact_id : int
        The artifact id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files
    test : bool
        If True some restrictions will be ignored, only used in
        parse_EBI_reply
    test_size : bool
        If True the EBI-ENA restriction size will be changed to 5000

    Returns
    -------
    tuple
        The study, sample, biosample, experiment and run accessions, in that
        order. They are all None unless the files are sent and the reply is
        parsed (action 'ADD' or `test`).

    Raises
    ------
    ComputeError
        If the submission is still too large after cleaning the sample
        metadata, if sending the sequences or the XML files fails, or if the
        EBI reply cannot be parsed.
    """
    # NOTE(review): the type of "action" in the docstring is a format
    # placeholder; presumably it is filled in elsewhere -- confirm.

    # step 1: init and validate
    ebi_submission = EBISubmission(artifact_id, action)

    # step 2: generate demux fastq files
    try:
        ebi_submission.generate_demultiplexed_fastq()
    except Exception:
        # remove the partially generated submission folder, log and re-raise
        error_msg = format_exc()
        if isdir(ebi_submission.full_ebi_dir):
            rmtree(ebi_submission.full_ebi_dir)
        LogEntry.create('Runtime', error_msg,
                        info={'ebi_submission': artifact_id})
        raise

    # step 3: generate and write xml files
    ebi_submission.generate_xml_files()

    # before we continue let's check the size of the submission
    to_review = [ebi_submission.study_xml_fp,
                 ebi_submission.sample_xml_fp,
                 ebi_submission.experiment_xml_fp,
                 ebi_submission.run_xml_fp,
                 ebi_submission.submission_xml_fp]
    total_size = sum(stat(tr).st_size for tr in to_review if tr is not None)
    # note that the max for EBI is 10M but let's play it safe
    max_size = 10e+6 if not test_size else 5000
    if total_size > max_size:
        LogEntry.create(
            'Runtime', 'The submission: %d is larger than allowed (%d), will '
            'try to fix: %d' % (artifact_id, max_size, total_size))
        # transform current metadata to dataframe for easier curation
        rows = {k: dict(v) for k, v in ebi_submission.samples.items()}
        df = pd.DataFrame.from_dict(rows, orient='index')
        # remove unique columns and same value in all columns
        nunique = df.apply(pd.Series.nunique)
        nsamples = len(df.index)
        cols_to_drop = set(
            nunique[(nunique == 1) | (nunique == nsamples)].index)
        # maximize deletion by removing also columns that are almost all the
        # same or almost all unique. This has to be a union: with fewer than
        # 100 samples int(nsamples * .01) is 0, so the thresholds alone would
        # no longer select the single-value columns picked above.
        cols_to_drop |= set(
            nunique[(nunique <= int(nsamples * .01)) |
                    (nunique >= int(nsamples * .5))].index)
        # these columns are never removed
        cols_to_drop = cols_to_drop - {'taxon_id', 'scientific_name',
                                       'description'}
        # only the samples without an EBI sample accession are rewritten
        all_samples = ebi_submission.sample_template.ebi_sample_accessions
        samples = [k for k in ebi_submission.samples
                   if all_samples[k] is None]
        if samples:
            ebi_submission.write_xml_file(
                ebi_submission.generate_sample_xml(samples, cols_to_drop),
                ebi_submission.sample_xml_fp)

        # now let's recalculate the size to make sure it's fine
        new_total_size = sum(
            stat(tr).st_size for tr in to_review if tr is not None)
        # "after cleaning is" reports the new size and "was" the old one
        LogEntry.create(
            'Runtime', 'The submission: %d after cleaning is %d and was %d' % (
                artifact_id, new_total_size, total_size))
        if new_total_size > max_size:
            raise ComputeError(
                'Even after cleaning the submission: %d is too large. Before '
                'cleaning: %d, after: %d' % (
                    artifact_id, total_size, new_total_size))

    st_acc, sa_acc, bio_acc, ex_acc, run_acc = None, None, None, None, None
    if send:
        # getting aspera's password
        old_ascp_pass = environ.get('ASPERA_SCP_PASS', '')
        if old_ascp_pass == '':
            environ['ASPERA_SCP_PASS'] = qiita_config.ebi_seq_xfer_pass
        ascp_passwd = environ['ASPERA_SCP_PASS']

        # step 4: sending sequences
        try:
            if action != 'MODIFY':
                LogEntry.create('Runtime',
                                ("Submitting sequences for pre_processed_id: "
                                 "%d" % artifact_id))
                for cmd in ebi_submission.generate_send_sequences_cmd():
                    stdout, stderr, rv = system_call(cmd)
                    if rv != 0:
                        error_msg = ("ASCP Error:\nStd output:%s\nStd error:%s"
                                     % (stdout, stderr))
                        raise ComputeError(error_msg)
                    with open(ebi_submission.ascp_reply, 'a') as reply_fh:
                        reply_fh.write('stdout:\n%s\n\nstderr: %s'
                                       % (stdout, stderr))
                # only report success once the sequences have really been sent
                LogEntry.create(
                    'Runtime',
                    ('Submission of sequences of pre_processed_id: '
                     '%d completed successfully' % artifact_id))
        finally:
            # always put back the previous value, also when a command fails,
            # so the transfer password does not linger in the environment
            environ['ASPERA_SCP_PASS'] = old_ascp_pass

        # step 5: sending xml
        xmls_cmds = ebi_submission.generate_curl_command(
            ebi_seq_xfer_pass=ascp_passwd)
        LogEntry.create('Runtime',
                        ("Submitting XMLs for pre_processed_id: "
                         "%d" % artifact_id))
        xml_content, stderr, rv = system_call(xmls_cmds)
        if rv != 0:
            error_msg = ("Error:\nStd output:%s\nStd error:%s" %
                         (xml_content, stderr))
            raise ComputeError(error_msg)
        else:
            LogEntry.create('Runtime',
                            ('Submission of sequences of pre_processed_id: '
                             '%d completed successfully' % artifact_id))
        with open(ebi_submission.curl_reply, 'w') as reply_fh:
            reply_fh.write('stdout:\n%s\n\nstderr: %s'
                           % (xml_content, stderr))

        # parsing answer / only if adding
        if action == 'ADD' or test:
            try:
                st_acc, sa_acc, bio_acc, ex_acc, run_acc = \
                    ebi_submission.parse_EBI_reply(xml_content, test=test)
            except EBISubmissionError as e:
                error = str(e)
                le = LogEntry.create('Fatal', "Command: %s\nError: %s\n" %
                                     (xml_content, error),
                                     info={'ebi_submission': artifact_id})
                raise ComputeError("EBI Submission failed! Log id: %d\n%s" %
                                   (le.id, error))

            if st_acc:
                ebi_submission.study.ebi_study_accession = st_acc
            if sa_acc:
                ebi_submission.sample_template.ebi_sample_accessions = sa_acc
            if bio_acc:
                ebi_submission.sample_template.biosample_accessions = bio_acc
            if ex_acc:
                ebi_submission.prep_template.ebi_experiment_accessions = ex_acc
            ebi_submission.artifact.ebi_run_accessions = run_acc

    return st_acc, sa_acc, bio_acc, ex_acc, run_acc
def submit_EBI(preprocessed_data_id, action, send):
    """Submit a preprocessed data to EBI

    Parameters
    ----------
    preprocessed_data_id : int
        The preprocesssed data id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files

    Returns
    -------
    tuple
        The study, sample, biosample, experiment and run accessions parsed
        from the EBI reply, in that order; five None values if `send` is
        False.

    Raises
    ------
    ComputeError
        If sending the sequences or the XML files fails, or if the EBI reply
        cannot be parsed.
    """
    # NOTE(review): the type of "action" in the docstring is a format
    # placeholder; presumably it is filled in elsewhere -- confirm.

    # step 1: init and validate
    ebi_submission = EBISubmission(preprocessed_data_id, action)

    # step 2: generate demux fastq files
    ebi_submission.study.ebi_submission_status = 'submitting'
    try:
        ebi_submission.generate_demultiplexed_fastq()
    except Exception:
        # remove the partially generated submission folder, record the
        # failure in the study status and in the log, then re-raise
        error_msg = format_exc()
        if isdir(ebi_submission.full_ebi_dir):
            rmtree(ebi_submission.full_ebi_dir)
        ebi_submission.study.ebi_submission_status = 'failed: %s' % error_msg
        LogEntry.create('Runtime', error_msg,
                        info={'ebi_submission': preprocessed_data_id})
        raise

    # step 3: generate and write xml files
    ebi_submission.generate_xml_files()

    # NOTE(review): failures in steps 4 and 5 (other than reply parsing)
    # leave the study status as 'submitting' -- confirm whether it should be
    # switched to a failed status there too.
    if send:
        # step 4: sending sequences
        if action != 'MODIFY':
            old_ascp_pass = environ.get('ASPERA_SCP_PASS', '')
            environ['ASPERA_SCP_PASS'] = qiita_config.ebi_seq_xfer_pass
            try:
                LogEntry.create('Runtime',
                                ("Submitting sequences for pre_processed_id: "
                                 "%d" % preprocessed_data_id))
                for cmd in ebi_submission.generate_send_sequences_cmd():
                    stdout, stderr, rv = system_call(cmd)
                    if rv != 0:
                        error_msg = ("Error:\nStd output:%s\nStd error:%s"
                                     % (stdout, stderr))
                        raise ComputeError(error_msg)
                    with open(ebi_submission.ascp_reply, 'a') as reply_fh:
                        reply_fh.write('stdout:\n%s\n\nstderr: %s'
                                       % (stdout, stderr))
            finally:
                # always put back the previous value, also when a command
                # fails, so the transfer password does not linger in the
                # environment
                environ['ASPERA_SCP_PASS'] = old_ascp_pass
            LogEntry.create(
                'Runtime',
                ('Submission of sequences of pre_processed_id: '
                 '%d completed successfully' % preprocessed_data_id))

        # step 5: sending xml and parsing answer
        xmls_cmds = ebi_submission.generate_curl_command()
        LogEntry.create('Runtime',
                        ("Submitting XMLs for pre_processed_id: "
                         "%d" % preprocessed_data_id))
        xml_content, stderr, rv = system_call(xmls_cmds)
        if rv != 0:
            error_msg = ("Error:\nStd output:%s\nStd error:%s"
                         % (xml_content, stderr))
            raise ComputeError(error_msg)
        else:
            LogEntry.create(
                'Runtime',
                ('Submission of sequences of pre_processed_id: '
                 '%d completed successfully' % preprocessed_data_id))
        with open(ebi_submission.curl_reply, 'w') as reply_fh:
            reply_fh.write('stdout:\n%s\n\nstderr: %s'
                           % (xml_content, stderr))

        try:
            st_acc, sa_acc, bio_acc, ex_acc, run_acc = \
                ebi_submission.parse_EBI_reply(xml_content)
        except EBISubmissionError as e:
            le = LogEntry.create(
                'Fatal', "Command: %s\nError: %s\n" % (xml_content, str(e)),
                info={'ebi_submission': preprocessed_data_id})
            ebi_submission.study.ebi_submission_status = (
                "failed: XML parsing, log id: %d" % le.id)
            raise ComputeError("EBI Submission failed! Log id: %d" % le.id)

        ebi_submission.study.ebi_submission_status = 'submitted'
        # the accessions are only stored when adding
        if action == 'ADD':
            if st_acc:
                ebi_submission.study.ebi_study_accession = st_acc
            if sa_acc:
                ebi_submission.sample_template.ebi_sample_accessions = sa_acc
            if bio_acc:
                ebi_submission.sample_template.biosample_accessions = bio_acc
            if ex_acc:
                ebi_submission.prep_template.ebi_experiment_accessions = ex_acc
            ebi_submission.artifact.ebi_run_accessions = run_acc
    else:
        st_acc, sa_acc, bio_acc, ex_acc, run_acc = None, None, None, None, None

    return st_acc, sa_acc, bio_acc, ex_acc, run_acc