def process_folder(result_folder, qai_server, qai_user, qai_password, pipeline_version):
    """Upload the collated results in one result folder to QAI.

    Reads SampleSheet.csv from two levels above ``result_folder``, logs in
    to QAI, looks up the run by the sample sheet's "Experiment Name", builds
    the consensus sequences from conseq.csv, and then uploads the review
    data from coverage_scores.csv, remap_counts.csv and cascade.csv.

    :param result_folder: folder holding the collated CSV files
    :param qai_server: passed to the QAI session login
    :param qai_user: passed to the QAI session login
    :param qai_password: passed to the QAI session login
    :param pipeline_version: passed through to the review upload
    """
    logger.info('Uploading data to Oracle from {}'.format(result_folder))

    def result_path(file_name):
        # All collated files sit directly in the result folder.
        return os.path.join(result_folder, file_name)

    # The sample sheet is in the grandparent of the (normalized) result folder.
    all_results_folder = os.path.dirname(os.path.normpath(result_folder))
    run_folder = os.path.dirname(all_results_folder)
    with open(os.path.join(run_folder, "SampleSheet.csv"), "rU") as sheet_file:
        sample_sheet = sample_sheet_parser.sample_sheet_parser(sheet_file)

    ok_sample_regions = load_ok_sample_regions(result_folder)

    with qai_helper.Session() as session:
        session.login(qai_server, qai_user, qai_password)
        run = find_run(session, sample_sheet["Experiment Name"])

        with open(result_path('conseq.csv'), "rU") as conseq_file:
            conseqs = build_conseqs(conseq_file,
                                    run,
                                    sample_sheet,
                                    ok_sample_regions)

        # Opened in the same order as they are passed to the upload.
        with open(result_path('coverage_scores.csv'), "rU") as coverage_file:
            with open(result_path('remap_counts.csv'), "rU") as counts_file:
                with open(result_path('cascade.csv'), "rU") as cascade_file:
                    upload_review_to_qai(coverage_file,
                                         counts_file,
                                         cascade_file,
                                         run,
                                         sample_sheet,
                                         conseqs,
                                         session,
                                         pipeline_version)
def download_quality(self, folder):
    """ Fetch the quality control metrics for a run and save them as CSV.

    The file is written to <settings.home>/<base name of folder>/ and is
    named after the trimmed folder name. The output folder is created when
    it does not exist yet.

    @param folder: the run folder, which must contain RunInfo.xml
    @return path for the quality CSV file
    @raise RuntimeError: when QAI returns no metrics for the run
    """
    trimmed_name = self.trim_folder(folder)
    output_folder = os.path.join(settings.home, os.path.basename(folder))
    quality_path = os.path.join(output_folder,
                                '{}_quality.csv'.format(trimmed_name))
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    run_info = self.parse_run_info(os.path.join(folder, 'RunInfo.xml'))

    with qai_helper.Session() as session:
        session.login(settings.qai_path,
                      settings.qai_user,
                      settings.qai_password)
        metrics = session.get_json(
            '/miseqqc_errormetrics?runid=' + run_info.miseq_run_id)
        if not metrics:
            raise RuntimeError(
                'No quality control metrics found for run ' +
                run_info.miseq_run_id)

    with open(quality_path, 'w') as quality_file:
        self.write_quality(quality_file, metrics, run_info)
    return quality_path
def main():
    """Dump the QAI project and region definitions to two JSON files.

    Logs in to QAI with the command-line credentials, downloads the region
    and project dumps for the requested pipeline version, and writes:

    * ``../projects.json`` - the projects and the regions they use, with the
      scoring fields (key positions and minimum coverage levels) removed.
    * ``../project_scoring.json`` - the projects with the scoring fields
      kept and ``coordinate_region_length`` added to every project region;
      the top-level region definitions are dropped.

    Projects without any regions are left out of both files, and only the
    regions named as a coordinate or seed region by some project are kept.

    Raises:
        RuntimeError: if the projects dump holds a non-empty ``errors``
            entry; the message is its items joined with newlines.
    """
    args = parse_args()
    dump = {}
    used_regions = set()
    with qai_helper.Session() as session:
        session.login(args.qai_server, args.qai_user, args.qai_password)

        dump['regions'] = session.get_json("/lab_miseq_regions?mode=dump",
                                           retries=0)
        dump['projects'] = session.get_json(
            "/lab_miseq_projects?mode=dump&pipeline=" + args.pipeline_version,
            retries=0)

        # Handle reported errors before walking the projects. The 'errors'
        # entry is not a project, so the loop below would otherwise fail on
        # project['regions'] and the real messages would never be shown.
        # Popping it also keeps an empty entry out of the loop and the dump.
        errors = dump['projects'].pop('errors', None)
        if errors:
            raise RuntimeError('\n'.join(errors))

        empty_projects = []
        for name, project in dump['projects'].items():
            project_regions = project['regions']
            project_regions.sort(key=itemgetter('coordinate_region'))
            for region in project_regions:
                used_regions.add(region['coordinate_region'])
                used_regions.update(region['seed_region_names'])
            if not project_regions:
                # Can't delete while iterating, so remember it for later.
                empty_projects.append(name)
        for name in empty_projects:
            del dump['projects'][name]
        check_key_positions(dump['projects'], sys.stdout)

    dump['regions'] = {key: value
                       for key, value in dump['regions'].items()
                       if key in used_regions}

    # Copy before stripping the scoring fields, so the scoring dump keeps them.
    dump_scoring = deepcopy(dump)
    for project in dump['projects'].values():
        for region in project['regions']:
            del region['key_positions']
            del region['min_coverage1']
            del region['min_coverage2']
            del region['min_coverage3']
    dump_json(dump, "../projects.json")

    for project in dump_scoring['projects'].values():
        for region in project['regions']:
            name = region['coordinate_region']
            # The reference is stored in pieces; join them to get its length.
            seq = ''.join(dump_scoring['regions'][name]['reference'])
            region['coordinate_region_length'] = len(seq)
    del dump_scoring['regions']
    dump_json(dump_scoring, "../project_scoring.json")

    print("Done.")
def upload_loop(qai_server, qai_user, qai_password, pipeline_version, upload_queue):
    """Process folders from a queue until a None entry is received.

    A trial login is made first; when it fails, the error is logged and the
    loop still starts. Every entry taken from ``upload_queue`` is handed to
    ``process_folder`` together with the QAI credentials and the pipeline
    version. A ``None`` entry ends the loop.
    """
    # noinspection PyBroadException
    try:
        # The session is discarded right away: this login only exists to
        # surface connection or credential problems at launch.
        with qai_helper.Session() as trial_session:
            trial_session.login(qai_server, qai_user, qai_password)
    except Exception:
        logger.error('Unable to log in to QAI.', exc_info=True)

    folder = upload_queue.get()
    while folder is not None:
        process_folder(folder,
                       qai_server,
                       qai_user,
                       qai_password,
                       pipeline_version)
        folder = upload_queue.get()
def main():
    """Dump the QAI project and region definitions to two JSON files.

    This block is Python 2 code (``itervalues``, ``StandardError``).

    Logs in to QAI with the project credentials from ``settings``,
    downloads the region and project dumps for ``settings.pipeline_version``
    and writes:

    * ``../projects.json`` - regions and projects, with the scoring fields
      (key positions and minimum coverage levels) removed from the projects.
    * ``../project_scoring.json`` - the projects with the scoring fields
      kept and ``coordinate_region_length`` added to every project region;
      the top-level region definitions are dropped.

    Raises StandardError if the projects dump holds a non-empty ``errors``
    entry; the message is its items joined with newlines.
    """
    dump = {}
    with qai_helper.Session() as session:
        session.login(settings.qai_project_path,
                      settings.qai_project_user,
                      settings.qai_project_password)

        dump['regions'] = session.get_json("/lab_miseq_regions?mode=dump",
                                           retries=0)
        dump['projects'] = session.get_json(
            "/lab_miseq_projects?mode=dump&pipeline=" + settings.pipeline_version,
            retries=0)

        # Handle reported errors before walking the projects. The 'errors'
        # entry is not a project, so the loop below would otherwise fail on
        # project['regions'] and the real messages would never be shown.
        # Popping it also keeps an empty entry out of the loop and the dump.
        errors = dump['projects'].pop('errors', None)
        if errors:
            raise StandardError('\n'.join(errors))

        for project in dump['projects'].itervalues():
            project['regions'].sort()
        check_key_positions(dump['projects'], sys.stdout)

    # Copy before stripping the scoring fields, so the scoring dump keeps them.
    dump_scoring = deepcopy(dump)
    for project in dump['projects'].itervalues():
        for region in project['regions']:
            del region['key_positions']
            del region['min_coverage1']
            del region['min_coverage2']
            del region['min_coverage3']
    dump_json(dump, "../projects.json")

    for project in dump_scoring['projects'].itervalues():
        for region in project['regions']:
            name = region['coordinate_region']
            # The reference is stored in pieces; join them to get its length.
            seq = ''.join(dump_scoring['regions'][name]['reference'])
            region['coordinate_region_length'] = len(seq)
    del dump_scoring['regions']
    dump_json(dump_scoring, "../project_scoring.json")

    # Parenthesized form prints the same text under Python 2.
    print("Done.")
def main():
    """Upload the default project configuration to QAI as a new pipeline.

    This block is Python 2 code (``iteritems``).

    Steps, in request order:

    1. Refuse to continue if a pipeline with ``settings.pipeline_version``
       already exists (RuntimeError).
    2. Create every configured region that QAI does not have yet, creating
       its seed group first when that is also missing.
    3. Create the pipeline record.
    4. For every configured project: create the project if it is missing,
       create a project version for the new pipeline, then create one
       project region per configured region and its key positions. The
       scoring values come from ``../project_scoring.json``, matched to the
       project's regions by position.
    """
    project_config = ProjectConfig.loadDefault()
    with open('../project_scoring.json', 'rU') as scoring_file:
        scoring_config = json.load(scoring_file)

    with qai_helper.Session() as session:
        session.login(settings.qai_path,
                      settings.qai_user,
                      settings.qai_password)

        version = settings.pipeline_version
        existing_pipelines = session.get_json(
            "/lab_miseq_pipelines?version=" + version,
            retries=0)
        if existing_pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(version))

        # Lookup tables for what QAI already knows about.
        seed_group_ids = {
            group['name']: group['id']
            for group in session.get_json("/lab_miseq_seed_groups")}
        regions = {}
        for known_region in session.get_json("/lab_miseq_regions", retries=0):
            regions[known_region['name']] = known_region

        configured_regions = project_config.config['regions']
        for region_name, region_data in configured_regions.iteritems():
            if regions.get(region_name) is not None:
                # Already in QAI, nothing to create.
                continue
            group_name = region_data['seed_group']
            group_id = seed_group_ids.get(group_name)
            if group_name and group_id is None:
                new_group = session.post_json("/lab_miseq_seed_groups",
                                              {'name': group_name})
                group_id = new_group['id']
                seed_group_ids[group_name] = group_id
            region_fields = {
                'name': region_name,
                'is_nucleotide': region_data['is_nucleotide'],
                'reference': ''.join(region_data['reference']),
                'seed_group_id': group_id
            }
            regions[region_name] = session.post_json("/lab_miseq_regions",
                                                     region_fields)

        new_pipeline = session.post_json("/lab_miseq_pipelines",
                                         {'version': version})
        pipeline_id = new_pipeline['id']

        projects = {}
        for known_project in session.get_json("/lab_miseq_projects",
                                              retries=0):
            projects[known_project['name']] = known_project

        configured_projects = project_config.config['projects']
        for project_name, project_data in configured_projects.iteritems():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects",
                    {'name': project_name,
                     'max_variants': project_data['max_variants']})
            project_version = session.post_json(
                "/lab_miseq_project_versions",
                {'pipeline_id': pipeline_id, 'project_id': project['id']})

            for position, region_config in enumerate(project_data['regions']):
                # Scoring entries are matched to configured regions by index.
                scoring = scoring_config['projects'][project_name][
                    'regions'][position]
                coordinate_region = regions[region_config['coordinate_region']]
                # The seed group is taken from the first seed region listed.
                first_seed = regions[region_config['seed_region_names'][0]]
                seed_group_id = first_seed['seed_group_id']
                project_region_fields = {
                    'project_version_id': project_version['id'],
                    'coordinate_region_id': coordinate_region['id'],
                    'min_coverage1': scoring['min_coverage1'],
                    'min_coverage2': scoring['min_coverage2'],
                    'min_coverage3': scoring['min_coverage3'],
                    'seed_group_id': seed_group_id
                }
                project_region = session.post_json(
                    "/lab_miseq_project_regions",
                    project_region_fields)

                for key_position in scoring['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions",
                        {'project_region_id': project_region['id'],
                         'start_pos': key_position['start_pos'],
                         'end_pos': key_position['end_pos']})

    # Parenthesized form prints the same text under Python 2.
    print("Done.")
def main():
    """Upload the default project configuration to QAI as a new pipeline.

    Steps, in request order:

    1. Refuse to continue if a pipeline with ``args.pipeline_version``
       already exists (RuntimeError).
    2. For every configured region: create it when QAI does not have it
       (creating its seed group first when that is also missing). When it
       exists with a different reference, report the mismatch and, if
       ``args.update_sequences`` is set, post the new reference.
    3. Create the pipeline record.
    4. For every configured project: create the project if it is missing,
       create a project version for the new pipeline, then create one
       project region per configured region and its key positions. The
       scoring values come from ``project_scoring.json`` two folders above
       this file, matched to the project's regions by position.
    """
    args = parse_args()
    project_config = ProjectConfig.loadDefault()
    parent_folder = Path(__file__).parent.parent
    scoring_path = parent_folder / 'project_scoring.json'
    with scoring_path.open() as scoring_file:
        scoring_config = json.load(scoring_file)

    with qai_helper.Session() as session:
        session.login(args.qai_server, args.qai_user, args.qai_password)

        existing_pipelines = session.get_json(
            "/lab_miseq_pipelines?version=" + args.pipeline_version,
            retries=0)
        if existing_pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                args.pipeline_version))

        # Lookup tables for what QAI already knows about.
        seed_group_ids = {
            group['name']: group['id']
            for group in session.get_json("/lab_miseq_seed_groups")}
        regions = {}
        for known_region in session.get_json("/lab_miseq_regions", retries=0):
            regions[known_region['name']] = known_region

        configured_regions = project_config.config['regions']
        for region_name, region_data in configured_regions.items():
            ref_seq = ''.join(region_data['reference'])
            known_region = regions.get(region_name)

            if known_region is None:
                group_name = region_data['seed_group']
                group_id = seed_group_ids.get(group_name)
                if group_name and group_id is None:
                    new_group = session.post_json("/lab_miseq_seed_groups",
                                                  {'name': group_name})
                    group_id = new_group['id']
                    seed_group_ids[group_name] = group_id
                region_fields = {
                    'name': region_name,
                    'is_nucleotide': region_data['is_nucleotide'],
                    'reference': ref_seq,
                    'seed_group_id': group_id
                }
                regions[region_name] = session.post_json("/lab_miseq_regions",
                                                         region_fields)
                continue

            if known_region['reference'] == ref_seq:
                continue

            print("Reference doesn't match:", region_name)
            if args.update_sequences:
                known_region['reference'] = ref_seq
                session.post_json(f"/lab_miseq_regions/{known_region['id']}",
                                  known_region)

        new_pipeline = session.post_json("/lab_miseq_pipelines",
                                         {'version': args.pipeline_version})
        pipeline_id = new_pipeline['id']

        projects = {}
        for known_project in session.get_json("/lab_miseq_projects",
                                              retries=0):
            projects[known_project['name']] = known_project

        configured_projects = project_config.config['projects']
        for project_name, project_data in configured_projects.items():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects",
                    {'name': project_name,
                     'max_variants': project_data['max_variants']})
            project_version = session.post_json(
                "/lab_miseq_project_versions",
                {'pipeline_id': pipeline_id, 'project_id': project['id']})

            for position, region_config in enumerate(project_data['regions']):
                # Scoring entries are matched to configured regions by index.
                scoring = scoring_config['projects'][project_name][
                    'regions'][position]
                coordinate_region = regions[region_config['coordinate_region']]
                # The seed group is taken from the first seed region listed.
                first_seed = regions[region_config['seed_region_names'][0]]
                seed_group_id = first_seed['seed_group_id']
                project_region_fields = {
                    'project_version_id': project_version['id'],
                    'coordinate_region_id': coordinate_region['id'],
                    'min_coverage1': scoring['min_coverage1'],
                    'min_coverage2': scoring['min_coverage2'],
                    'min_coverage3': scoring['min_coverage3'],
                    'seed_group_id': seed_group_id
                }
                project_region = session.post_json(
                    "/lab_miseq_project_regions",
                    project_region_fields)

                for key_position in scoring['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions",
                        {'project_region_id': project_region['id'],
                         'start_pos': key_position['start_pos'],
                         'end_pos': key_position['end_pos']})

    print("Done.")
def main():
    """Synchronize the HIV1 seed regions in QAI with the LANL alignment.

    The alignment is read from ``HIV1_COM_2015_genome_DNA.csv`` in the
    current folder. When that file does not exist, it is first downloaded
    from the LANL alignment form and cached under that name.

    For every row of the alignment (description, sequence):

    * gap characters are stripped from the sequence;
    * rows whose subtype starts with a digit are skipped and reported as
      recombinants at the end;
    * sequences with characters other than A, C, G and T are reported and
      counted as dirty, all others as clean;
    * a seed that already exists in the 'HIV1-seed' group must have the same
      sequence, otherwise RuntimeError is raised;
    * a new seed is posted to QAI, unless its name is longer than 30
      characters, in which case the name is only reported.

    Seeds of the group that are not in the alignment are listed and deleted.

    Raises:
        RuntimeError: if the download holds no ``<pre>`` block, if the
            'HIV1-seed' seed group is not found, or if an existing seed's
            sequence differs from the alignment.
    """
    filename = 'HIV1_COM_2015_genome_DNA.csv'
    args = parse_args()
    if not os.path.exists(filename):
        form = {
            'ORGANISM': 'HIV',
            'ALIGN_TYPE': 'COM',
            'SUBORGANISM': 'HIV1',
            'PRE_USER': '******',
            'REGION': 'GENOME',
            'START': '',
            'END': '',
            'GENO_SUB': 'All',
            'BASETYPE': 'DNA',
            'YEAR': '2015',
            'FORMAT': 'csv',
            'submit': 'Get Alignment'
        }
        response = requests.post(
            "https://www.hiv.lanl.gov/cgi-bin/NEWALIGN/align.cgi",
            data=form)
        response.raise_for_status()
        match = re.search(r'<pre>(.*)</pre>', response.text, re.DOTALL)
        if match is None:
            # Fail before the cache file is created. An empty cache file
            # would be trusted by the next run, which would then see every
            # existing seed as left over and delete it.
            raise RuntimeError(
                'No <pre> block with alignment data found in the response.')
        with open(filename, 'w') as f:
            f.write(match.group(1))

    with qai_helper.Session() as session:
        session.login(args.qai_server, args.qai_user, args.qai_password)

        seed_groups = session.get_json("/lab_miseq_seed_groups")
        seed_group_name = 'HIV1-seed'
        # On a normal exit from the loop, no group matched; after a break,
        # seed_group is the matching group.
        for seed_group in seed_groups:
            if seed_group['name'] == seed_group_name:
                break
        else:
            raise RuntimeError(
                'Seed group {} not found.'.format(seed_group_name))

        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        # Existing seeds of the group by name. Entries are popped as they
        # are matched, so whatever remains at the end is left over.
        hiv_seeds = {region['name']: region
                     for region in old_regions
                     if region['seed_group_id'] == seed_group['id']}
        del old_regions

        clean_count = 0
        dirty_count = 0
        recombinant_names = []
        # NOTE(review): the 'U' mode flag was removed in Python 3.11 -
        # confirm the target interpreter before changing the mode.
        with open(filename, 'rU') as f:
            reader = csv.reader(f)
            for description, seed_seq in reader:
                seed_seq = seed_seq.replace('-', '')
                # Description fields are dot separated: subtype, country,
                # ..., accession.
                name_fields = description.split('.')
                subtype, country = name_fields[:2]
                accession = name_fields[-1]
                if subtype[0].isdigit():
                    recombinant_names.append(description)
                    continue
                seed_name = '-'.join(
                    ('HIV1', subtype, country, accession, 'seed'))

                groups = re.findall(r'([^ACGT]+)', seed_seq)
                if groups:
                    dirty_count += 1
                    print('Unexpected bases found in {}: {}'.format(
                        seed_name,
                        ', '.join(groups)))
                else:
                    clean_count += 1

                old_region = hiv_seeds.pop(seed_name, None)
                if old_region:
                    old_seq = ''.join(old_region['reference'])
                    if old_seq != seed_seq:
                        print('expected: ' + seed_seq)
                        print('found: ' + old_seq)
                        raise RuntimeError(
                            'Seed sequence {} does not match.'.format(
                                seed_name))
                elif len(seed_name) > 30:
                    print('Name too long: {!r}.'.format(seed_name))
                else:
                    session.post_json(
                        "/lab_miseq_regions",
                        {'name': seed_name,
                         'description': description,
                         'is_nucleotide': True,
                         'reference': seed_seq,
                         'seed_group_id': seed_group['id']})

        if recombinant_names:
            print('Skipped recombinants: ' +
                  ', '.join(sorted(recombinant_names)))

        if hiv_seeds:
            seed_names = sorted(hiv_seeds)
            # Hard-coded switch: set to False to only list the left over
            # seeds instead of deleting them.
            should_delete = True
            print('Left over seeds:')
            if not should_delete:
                print(', '.join(seed_names))
            else:
                for seed_name in seed_names:
                    print(seed_name)
                    seed_id = hiv_seeds[seed_name]['id']
                    # Unlike the JSON helpers, delete is given the full URL.
                    session.delete('{}/lab_miseq_regions/{}'.format(
                        args.qai_server,
                        seed_id))

    print('Done with {} clean and {} dirty.'.format(
        clean_count,
        dirty_count))