def __init__(self, file=None, rules_yaml=None, genotype=None, references=None):
    """ Load ASI rules from a file or file object.

    :param file: rules source handed to self.load_xml(); it takes priority
        over rules_yaml when both are given
    :param rules_yaml: rules handed to self.load_yaml() together with
        genotype, only used when file is None
    :param genotype: forwarded to self.load_yaml()
    :param references: {name: reference} mapping. When it is None, the
        default project references are loaded and then updated with the
        wild types read from WILD_TYPES_PATH.
    """
    if references is None:
        references = ProjectConfig.loadDefault().getAllReferences()
        with WILD_TYPES_PATH.open() as wild_types_file:
            references.update(safe_load(wild_types_file))

    # Every reference keeps its name, except that 'INT' is stored as 'IN'.
    self.stds = {}
    for name, ref in references.items():
        self.stds['IN' if name == 'INT' else name] = ref

    # Algorithm info
    self.alg_name = ''
    self.alg_version = ''

    # Definitions, all empty until one of the loaders below fills them in.
    self.gene_def = {}  # {code: [drug_class_code]}
    self.level_def = {}  # {'1': 'Susceptible'}
    self.drug_class = defaultdict(list)  # {code: [drug_code]}
    # [['-INF', '9', '1'], ...] first two are the range, the third one is
    # the res level
    self.global_range = []
    self.comment_def = {}  # {code: comment_text}
    # {code: (name, [condition, [(action_type, action_value)]])}
    self.drugs = {}
    self.mutation_comments = []  # Not really used at the moment.

    if file is not None:
        self.load_xml(file)
        return
    if rules_yaml is not None:
        self.load_yaml(rules_yaml, genotype)
def extract_v3loop_ref():
    """ Get the V3 loop reference, caching it in a file beside this module.

    The value is read from v3loop_ref.txt in this module's folder. When that
    file does not exist, the value is built with extract_target() from the
    default project configuration and written to the file for the next call.

    :return: the text read from the cache file, or the newly extracted value
    """
    cache_path = os.path.join(os.path.dirname(__file__), 'v3loop_ref.txt')
    try:
        with open(cache_path) as cache_file:
            v3loop_ref = cache_file.read()
    except FileNotFoundError:
        # Nothing cached yet: rebuild from the project references and save.
        projects = ProjectConfig.loadDefault()
        seed_seq = projects.getReference(G2P_SEED_NAME)
        coordinate_seq = projects.getReference(COORDINATE_REF_NAME)
        v3loop_ref = extract_target(seed_seq, coordinate_seq)
        with open(cache_path, 'w') as cache_file:
            cache_file.write(v3loop_ref)
    return v3loop_ref
def load_references():
    """ Collect the HCV references from the default project configuration.

    A reference is kept when its name looks like HCV<genotype>-...-<region>
    and the region is one of HCV_REGIONS. Genotype 6 references are also
    registered under genotype '6E'.

    :return: {(genotype, region): Reference}
    """
    name_pattern = re.compile(r'HCV(.*?)-.*-([^-]+)$')
    all_references = ProjectConfig.loadDefault().getAllReferences()
    references = {}  # {(genotype, region): Reference}
    for ref_name, sequence in all_references.items():
        match = name_pattern.match(ref_name)
        if not match:
            continue
        genotype, region = match.group(1), match.group(2)
        if region not in HCV_REGIONS:
            continue
        reference = Reference(ref_name, sequence)
        references[(genotype, region)] = reference
        if genotype == '6':
            # Same Reference object is shared by both genotype keys.
            references[('6E', region)] = reference
    return references
def main():
    """ Write four simulated FASTQ files into the current folder.

    Each file is described by a FastqFile record: its name, extract number,
    whether the reads are reverse complemented, the coordinate sections to
    copy from the reference, and the codon mutations to apply. Every section
    is written section.count times, each read with an all-'A' quality line.
    """
    fastq_files = [
        FastqFile('2130A-HCV_S15_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 1, 60, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 117, 176, 100)),
                  (CodonMutation(159, 'GTC'),)),
        FastqFile('2130A-HCV_S15_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b', 57, 116, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 171, 230, 100)),
                  (CodonMutation(159, 'GTC'),)),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 231, 313, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 396, 478, 100)),
                  (CodonMutation(316, 'AGC'),)),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b', 313, 395, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 479, 561, 100)),
                  (CodonMutation(316, 'AGC'),)),
    ]
    projects = ProjectConfig.loadDefault()
    for fastq_file in fastq_files:
        with open(fastq_file.name, 'w') as f:
            first_cluster = 1  # cluster numbers keep counting across sections
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects,
                    section.coord_name,
                    section.start_pos,
                    section.end_pos)
                full_ref = projects.getReference(ref_name)
                nucs = list(full_ref[ref_start:ref_end])
                for mutation in fastq_file.mutations:
                    is_inside = (section.start_pos <=
                                 mutation.pos <=
                                 section.end_pos)
                    if not is_inside:
                        continue
                    # Codon positions are converted to a nucleotide offset
                    # within this section.
                    codon_start = (mutation.pos - section.start_pos) * 3
                    nucs[codon_start:codon_start + 3] = list(mutation.codon)
                read_seq = ''.join(nucs)
                if fastq_file.is_reversed:
                    read_seq = reverse_and_complement(read_seq)
                phred_scores = 'A' * (ref_end - ref_start)
                file_num = '2' if fastq_file.is_reversed else '1'
                last_cluster = first_cluster + section.count
                for cluster_num in range(first_cluster, last_cluster):
                    header = (
                        '@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'
                        .format(fastq_file.extract_num, cluster_num, file_num))
                    f.write(header)
                    f.write(read_seq + '\n')
                    f.write('+\n')
                    f.write(phred_scores + '\n')
                first_cluster += section.count
def read_contigs(contigs_csv, excluded_seeds=None):
    """ Read contig sequences from a CSV file, merging matched ones by group.

    Rows are visited from the last one to the first, and each row's 1-based
    position in the file is its contig number. A row is stored on its own
    unless ARE_CONTIGS_MERGED is set and its match fraction is at least 0.25;
    in that case it is merged into the sequence for its group_ref.

    :param contigs_csv: open file with the columns contig, match, ref, and
        group_ref. It is used as a context manager here, so it is closed
        when reading finishes.
    :param excluded_seeds: passed through to get_contig_name()
    :return: {contig_name: sequence}, names coming from get_contig_name()
    """
    # Alignment settings passed to align_it().
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    contig_groups = defaultdict(list)  # {group_ref_name: [seq, index, index...]}
    conseqs = {}
    projects = ProjectConfig.loadDefault()
    with contigs_csv:
        contigs_reader = DictReader(contigs_csv)
        # Number the rows from 1, then walk them in reverse order.
        for i, row in reversed(list(enumerate(contigs_reader, 1))):
            contig_seq = row['contig']
            match_fraction = float(row['match'])
            is_match = 0.25 <= match_fraction
            is_reversed = match_fraction < 0
            if not (ARE_CONTIGS_MERGED and is_match):
                # Not merging this one: record it under its own name.
                contig_name = get_contig_name(i,
                                              row['ref'],
                                              is_match,
                                              is_reversed,
                                              excluded_seeds)
                conseqs[contig_name] = contig_seq
                continue
            group_ref_name = row['group_ref']
            contig_group = contig_groups[group_ref_name]
            if not contig_group:
                # First contig of the group: start from the group reference.
                contig_group.append(projects.getReference(group_ref_name))
            contig_group.append(str(i))
            group_seq = contig_group[0]
            agroup, acontig, score = align_it(group_seq,
                                              contig_seq,
                                              gap_open_penalty,
                                              gap_extend_penalty,
                                              use_terminal_gap_penalty)
            # Group 1 spans from the first to the last non-gap character of
            # the aligned contig.
            # NOTE(review): match would be None if acontig had no non-gap
            # characters - confirm align_it() never returns that.
            match = re.match('-*([^-](.*[^-])?)', acontig)
            start = match.start(1)
            end = match.end(1)
            # Replace that span of the aligned group with the raw contig.
            merged_seq = agroup[:start] + contig_seq + agroup[end:]
            # Drop the columns where the aligned group began or ended with
            # gaps.
            left_trim = len(agroup) - len(agroup.lstrip('-'))
            right_trim = len(agroup) - len(agroup.rstrip('-'))
            # "or None" keeps the whole tail when right_trim is zero.
            contig_group[0] = merged_seq[left_trim:-right_trim or None]

    # Merged groups are always named as forward matches.
    is_match = True
    is_reversed = False
    for group_ref_name, contig_group in contig_groups.items():
        (group_seq, *contig_nums) = contig_group
        # Contig numbers were collected in reverse, so flip them back.
        prefix = '_'.join(reversed(contig_nums))
        contig_name = get_contig_name(prefix,
                                      group_ref_name,
                                      is_match,
                                      is_reversed,
                                      excluded_seeds)
        conseqs[contig_name] = group_seq
    return conseqs
def fastq_g2p(pssm,
              fastq1,
              fastq2,
              g2p_csv,
              g2p_summary_csv=None,
              unmapped1=None,
              unmapped2=None,
              aligned_csv=None,
              min_count=1,
              min_valid=1,
              min_valid_percent=0.0,
              merged_contigs_csv=None):
    """ Run a pair of FASTQ files through the G2P read steps.

    The reads go through merge_reads(), ConsensusBuilder.build(),
    trim_reads(), write_unmapped_reads(), count_reads(), and optionally
    write_aligned_reads(), before write_rows() receives them.

    :param pssm: passed to write_rows()
    :param fastq1: first source for FastqReader
    :param fastq2: second source for FastqReader
    :param g2p_csv: passed to write_rows(). When it has a name attribute, the
        folder of that name holds the 'read_counts' prefix for count_reads().
    :param g2p_summary_csv: passed to write_rows()
    :param unmapped1: passed to write_unmapped_reads()
    :param unmapped2: passed to write_unmapped_reads()
    :param aligned_csv: passed to write_aligned_reads(), or None to skip it
    :param min_count: passed to write_rows()
    :param min_valid: passed to write_rows()
    :param min_valid_percent: passed to write_rows()
    :param merged_contigs_csv: open file that receives a 'contig' column with
        the consensus sequences from the ConsensusBuilder, or None to skip it
    """
    count_prefix = None
    if getattr(g2p_csv, 'name', None) is not None:
        count_prefix = os.path.join(os.path.dirname(g2p_csv.name),
                                    'read_counts')

    project_config = ProjectConfig.loadDefault()
    hiv_seed = project_config.getReference(G2P_SEED_NAME)
    coordinate_ref = project_config.getReference(COORDINATE_REF_NAME)
    v3loop_ref = extract_target(hiv_seed, coordinate_ref)

    consensus_builder = ConsensusBuilder()
    reads = merge_reads(FastqReader(fastq1, fastq2))
    reads = consensus_builder.build(reads)
    reads = trim_reads(reads, v3loop_ref)
    reads = write_unmapped_reads(reads, unmapped1, unmapped2)
    read_counts = count_reads(reads, count_prefix)
    if aligned_csv is not None:
        read_counts = write_aligned_reads(read_counts,
                                          aligned_csv,
                                          hiv_seed,
                                          v3loop_ref)
    write_rows(pssm,
               read_counts,
               g2p_csv,
               g2p_summary_csv,
               min_count,
               min_valid=min_valid,
               min_valid_percent=min_valid_percent)

    if merged_contigs_csv is None:
        return
    contig_writer = DictWriter(merged_contigs_csv, ['contig'])
    contig_writer.writeheader()
    for consensus in consensus_builder.get_consensus_by_lengths():
        # Skip a consensus that is nothing but N's and dashes.
        if consensus.replace('N', '').replace('-', ''):
            contig_writer.writerow(dict(contig=consensus))
def main():
    """ Run every reference check and print the total number of errors.

    The checks share one set of reference names that have not been checked
    yet. Whatever is left in that set after all the checks is reported and
    added to the error count.
    """
    project_config = ProjectConfig.loadDefault()
    unchecked_ref_names = set(project_config.getAllReferences().keys())
    error_count = 0
    for check in (check_hcv_seeds,
                  check_hcv_coordinates,
                  check_hiv_seeds,
                  check_hiv_coordinates):
        error_count += check(project_config, unchecked_ref_names)
    # The wild type check is the only one that ignores the unchecked names.
    error_count += check_hiv_wild_types(project_config)
    for check in (check_hla_seeds, check_hla_coordinates):
        error_count += check(project_config, unchecked_ref_names)

    if unchecked_ref_names:
        leftovers = ", ".join(sorted(unchecked_ref_names))
        print(fill_report(f'Unchecked refs: {leftovers}'))
        error_count += len(unchecked_ref_names)
    else:
        print('No unchecked refs.')
    print(f'Total errors: {error_count}.')
def test_duplicated_sars_base_amino(sequence_report):
    """ Special case for duplicated base in SARS orf1ab.

    Expect amino sequence AQSFLNRVCG.
    """
    # refname,qcut,rank,count,offset,seq
    aligned_reads = prepare_reads("""\
SARS-CoV-2-seed,15,0,9,0,GCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACAC
""")
    # The repeated base lies inside the read above. In the expected rows
    # below, the query position steps from 16 to 18 instead of 16 to 19.

    # A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,...,coverage
    expected_text = """\
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,1,4396,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,4,4397,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,7,4398,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,10,4399,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,13,4400,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,16,4401,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,18,4402,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,21,4403,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,24,4404,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,27,4405,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9"""

    # Use the real project references, because the expected report size is
    # computed from their lengths.
    sequence_report.projects = ProjectConfig.loadDefault()
    orf1ab_size = len(
        sequence_report.projects.getReference('SARS-CoV-2-ORF1ab'))
    nsp12_size = len(sequence_report.projects.getReference('SARS-CoV-2-nsp12'))
    report_file = StringIO()
    sequence_report.write_amino_header(report_file)
    sequence_report.read(aligned_reads)
    sequence_report.write_amino_counts()

    report = report_file.getvalue()
    report_lines = report.splitlines()
    # One row per position of both references, plus one header row.
    expected_size = orf1ab_size + nsp12_size + 1
    if len(report_lines) != expected_size:
        # This assertion always fails here; comparing against '' puts the
        # whole report into the failure message.
        assert (len(report_lines), report) == (expected_size, '')

    # Only compare the ten rows listed in expected_text.
    key_lines = report_lines[4396:4406]
    key_report = '\n'.join(key_lines)
    assert key_report == expected_text
def test_duplicated_sars_base_nuc(sequence_report): """ Make sure duplicated base in SARS isn't duplicated in nuc.csv. """ # refname,qcut,rank,count,offset,seq aligned_reads = prepare_reads("""\ SARS-CoV-2-seed,15,0,9,10,ACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCG """) # A,C,G,T,N,...,coverage expected_section = """\ SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,21,13198,0,0,0,9,0,0,0,0,0,9 SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,22,13199,0,0,0,9,0,0,0,0,0,9 SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,23,13200,9,0,0,0,0,0,0,0,0,9 SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,24,13201,9,0,0,0,0,0,0,0,0,9 SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,25,13202,9,0,0,0,0,0,0,0,0,9 SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,26,13203,0,9,0,0,0,0,0,0,0,9 SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,27,13204,0,0,9,0,0,0,0,0,0,9 SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,28,13205,0,0,9,0,0,0,0,0,0,9 SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,29,13206,0,0,9,0,0,0,0,0,0,9 SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,30,13207,0,0,0,9,0,0,0,0,0,9 SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,31,13208,0,0,0,9,0,0,0,0,0,9""" sequence_report.projects = ProjectConfig.loadDefault() orf1ab_size = len( sequence_report.projects.getReference('SARS-CoV-2-ORF1ab')) nsp12_size = len(sequence_report.projects.getReference('SARS-CoV-2-nsp12')) report_file = StringIO() sequence_report.write_nuc_header(report_file) sequence_report.read(aligned_reads) sequence_report.write_nuc_counts() report = report_file.getvalue() report_lines = report.splitlines() header_size = 1 skipped_rows = 2 expected_size = (orf1ab_size + nsp12_size) * 3 + header_size - skipped_rows if len(report_lines) != expected_size: assert (len(report_lines), report) == (expected_size, '') key_lines = report_lines[13198:13209] key_report = '\n'.join(key_lines) assert key_report == expected_section
def main():
    """ Compare the HIV project references with fetched alignments.

    The CON_OF_CONS entry of the 2004 consensus alignment is added to the
    2015 alignment under the name 'Consensus'. Then every line produced by
    compare_config() is printed, followed by the reference names that are
    still left in the set handed to compare_config().
    """
    # Alternative entry point: find_best_match_for_pssm()
    consensus_alignment = fetch_alignment_sequences(
        2004,
        'CON',  # Consensus/Ancestral
        'ENV')
    # Strip the alignment gaps before using it as a reference sequence.
    consensus = consensus_alignment['CON_OF_CONS'].replace('-', '').upper()

    project_config = ProjectConfig.loadDefault()
    ref_names = set(project_config.getAllReferences().keys())

    new_sequences = fetch_alignment_sequences('2015', 'COM')
    consensus_accession = 'Consensus'
    assert consensus_accession not in new_sequences, sorted(
        new_sequences.keys())
    new_sequences[consensus_accession] = consensus

    report_lines = compare_config('HIV',
                                  project_config,
                                  new_sequences,
                                  ref_names)
    for line in report_lines:
        print(line, end='')
    print('Unchecked refs: ' + ', '.join(sorted(ref_names)))
def __init__(self,
             file=None,
             rules_yaml=None,
             genotype=None,
             references=None,
             backup_genotype=None):
    """ Load ASI rules from a file or file object.

    :param file: rules source handed to self.load_xml(); it takes priority
        over rules_yaml when both are given
    :param rules_yaml: rules handed to self.load_yaml() together with
        genotype and backup_genotype, only used when file is None
    :param genotype: forwarded to self.load_yaml()
    :param references: {name: reference} mapping. When it is None, the
        default project references are loaded and then updated with the
        wild types read from WILD_TYPES_PATH.
    :param backup_genotype: forwarded to self.load_yaml()
    """
    if references is None:
        references = ProjectConfig.loadDefault().getAllReferences()
        with WILD_TYPES_PATH.open() as wild_types_file:
            references.update(safe_load(wild_types_file))

    # Every reference keeps its name, except that 'INT' is stored as 'IN'.
    self.stds = {}
    for name, ref in references.items():
        self.stds['IN' if name == 'INT' else name] = ref

    # Algorithm info
    self.alg_name = ''
    self.alg_version = ''

    # Definitions, all empty until one of the loaders below fills them in.
    self.gene_def = {}  # {code: [drug_class_code]}
    self.level_def = {}  # {'1': 'Susceptible'}
    self.drug_class = defaultdict(list)  # {code: [drug_code]}
    # [['-INF', '9', '1'], ...] first two are the range, the third one is
    # the res level
    self.global_range = []
    self.comment_def = {}  # {code: comment_text}
    # {code: (name, [condition, [(action_type, action_value)]])}
    self.drugs = {}
    self.mutation_comments = []  # Not really used at the moment.

    if file is not None:
        self.load_xml(file)
        return
    if rules_yaml is not None:
        self.load_yaml(rules_yaml, genotype, backup_genotype)
def write_nuc_mutations(nuc_csv: typing.TextIO,
                        nuc_mutations_csv: typing.TextIO):
    """ Report the nucleotide variants found in the SARS-CoV-2 seed rows.

    Rows for any other seed are ignored. Rows with zero coverage are skipped.
    For the rest, each of A, C, G, and T that differs from the reference
    nucleotide is written when its share of the coverage is at least 0.05.

    :param nuc_csv: open file of nucleotide counts, with the columns seed,
        region, refseq.nuc.pos, A, C, G, T, and coverage
    :param nuc_mutations_csv: open file to write the variants to
    """
    nuc_rows = DictReader(nuc_csv)
    columns = ['seed', 'region', 'wt', 'refseq_nuc_pos', 'var', 'prevalence']
    mutations_writer = DictWriter(nuc_mutations_csv,
                                  columns,
                                  lineterminator=os.linesep)
    mutations_writer.writeheader()
    for seed, seed_rows in groupby(nuc_rows, itemgetter('seed')):
        if seed != 'SARS-CoV-2-seed':
            continue
        landmark_reader = LandmarkReader.load()
        projects = ProjectConfig.loadDefault()
        rows_by_region = groupby(seed_rows, itemgetter('region'))
        for region_name, region_rows in rows_by_region:
            region = landmark_reader.get_gene(seed, region_name)
            seed_seq = projects.getReference(seed)
            # Region boundaries are one-based and inclusive.
            ref_seq = seed_seq[region['start']-1:region['end']]
            for row in region_rows:
                nuc_pos = int(row['refseq.nuc.pos'])
                wild_type = ref_seq[nuc_pos-1]
                coverage = int(row['coverage'])
                if coverage == 0:
                    continue
                variants = (nuc for nuc in 'ACGT' if nuc != wild_type)
                for nuc in variants:
                    prevalence = int(row[nuc]) / coverage
                    if prevalence < 0.05:
                        continue
                    mutations_writer.writerow({'seed': seed,
                                               'region': region_name,
                                               'wt': wild_type,
                                               'refseq_nuc_pos': nuc_pos,
                                               'var': nuc,
                                               'prevalence': prevalence})
def fastq_g2p(pssm,
              fastq1,
              fastq2,
              g2p_csv,
              g2p_summary_csv=None,
              unmapped1=None,
              unmapped2=None,
              aligned_csv=None,
              min_count=1,
              min_valid=1,
              min_valid_percent=0.0):
    """ Run a pair of FASTQ files through the G2P read steps.

    The reads go through merge_reads(), trim_reads(), write_unmapped_reads(),
    count_reads(), and optionally write_aligned_reads(), before write_rows()
    receives them.

    :param pssm: passed to write_rows()
    :param fastq1: first source for FastqReader
    :param fastq2: second source for FastqReader
    :param g2p_csv: passed to write_rows(). When it has a name attribute, the
        folder of that name holds the 'read_counts' prefix for count_reads().
    :param g2p_summary_csv: passed to write_rows()
    :param unmapped1: passed to write_unmapped_reads()
    :param unmapped2: passed to write_unmapped_reads()
    :param aligned_csv: passed to write_aligned_reads(), or None to skip it
    :param min_count: passed to write_rows()
    :param min_valid: passed to write_rows()
    :param min_valid_percent: passed to write_rows()
    """
    count_prefix = None
    if getattr(g2p_csv, 'name', None) is not None:
        count_prefix = os.path.join(os.path.dirname(g2p_csv.name),
                                    'read_counts')

    project_config = ProjectConfig.loadDefault()
    hiv_seed = project_config.getReference(G2P_SEED_NAME)
    coordinate_ref = project_config.getReference(COORDINATE_REF_NAME)
    v3loop_ref = extract_target(hiv_seed, coordinate_ref)

    reads = merge_reads(FastqReader(fastq1, fastq2))
    reads = trim_reads(reads, v3loop_ref)
    reads = write_unmapped_reads(reads, unmapped1, unmapped2)
    read_counts = count_reads(reads, count_prefix)
    if aligned_csv is not None:
        read_counts = write_aligned_reads(read_counts,
                                          aligned_csv,
                                          hiv_seed,
                                          v3loop_ref)
    write_rows(pssm,
               read_counts,
               g2p_csv,
               g2p_summary_csv,
               min_count,
               min_valid=min_valid,
               min_valid_percent=min_valid_percent)
def main():
    """ Run every reference check and print the total number of errors.

    The checks share one set of reference names that have not been checked
    yet. Whatever is left in that set after all the checks is reported and
    added to the error count.
    """
    project_config = ProjectConfig.loadDefault()
    unchecked_ref_names = set(project_config.getAllReferences().keys())
    error_count = 0
    for check in (check_hcv_seeds,
                  check_hcv_coordinates,
                  check_hiv_seeds,
                  check_hiv_coordinates):
        error_count += check(project_config, unchecked_ref_names)
    # The wild type check is the only one that ignores the unchecked names.
    error_count += check_hiv_wild_types(project_config)
    for check in (check_hla_seeds,
                  check_hla_coordinates,
                  check_sars_seeds,
                  check_sars_coordinates):
        error_count += check(project_config, unchecked_ref_names)

    if unchecked_ref_names:
        leftovers = ", ".join(sorted(unchecked_ref_names))
        print(fill_report(f'Unchecked refs: {leftovers}'))
        error_count += len(unchecked_ref_names)
    else:
        print('No unchecked refs.')
    print(f'Total errors: {error_count}.')
def load_hcv(seqs):
    """ Add the HCV primers to the primer lists, and check them against H77.

    Forward primers are appended to seqs['left'] and all others to
    seqs['right']. Each primer (the reverse complement for the ones on the
    right) is compared with its h77_pos section of the 'HCV-1a' project
    reference, and a diff is printed when they are not equal.

    :param seqs: dictionary with 'left' and 'right' lists of SeqRecord
        objects, both of which get extended
    """
    primer_table = DictReader(
        StringIO("""\
protocol,name,direction,length,h77_pos,sequence
HCV WG,oligo dA20,R,20,9418-9437,AAAAAAAAAAAAAAAAAAAA
,Pr3,R,30,8616-8645,GGCGGAATTCCTGGTCATAGCCTCCGTGAA
,1abGENF1bp,F,28,266-293,GGGTCGCGAAAGGCCTTGTGGTACTGCC
,TIM-Pr3,R,30,8616-8645,CAGGAAACAGCTATGACGGCGGAATTCCTGGTCATAGCCTCCGTGAA
,1abGENF2,F,30,286-315,GTACTGCCTGATAGGGTGCTTGCGAGTGCC
,Pr6,R,30,8611-8640,AATTCCTGGTCATAGCCTCCGTGAAGACTC
HCV miDi,Pr1,F,31,8245-8275,TGGGGTTCGCGTATGATACCCGCTGCTTTGA
,Pr2,F,31,8245-8275,TGGGGTTTTCTTACGACACCAGGTGCTTTGA
,oligo dA20-TIM,R,20,9418-9437,CAGGAAACAGCTATGACAAAAAAAAAAAAAAAAAAAA
,Pr4,F,29,8253-8281,CCGTATGATACCCGCTGCTTTGACTCAAC
,Pr5,F,29,8253-8281,TCCTACGACACCAGGTGCTTTGATTCAAC
,TIM,R,,1-0,CAGGAAACAGCTATGAC
"""))
    h77 = ProjectConfig.loadDefault().getReference('HCV-1a')
    is_comparing = True
    differ = Differ()
    for row in primer_table:
        name = 'HCV ' + row['name']
        start, end = map(int, row['h77_pos'].split('-'))
        primer = SeqRecord(Seq(row['sequence']), name, description='')
        complement = primer.reverse_complement(id=primer.id, description='')
        if row['direction'] == 'F':
            seqs['left'].append(primer)
            h77_strand = primer
        else:
            # The primer itself is stored, but its reverse complement is
            # what gets compared with the reference.
            seqs['right'].append(primer)
            h77_strand = complement
        # h77_pos is one-based and inclusive.
        h77_section = Seq(h77[start - 1:end])
        if not is_comparing or h77_strand.seq == h77_section:
            continue
        print(name, 'does not match.')
        diffs = differ.compare([str(h77_strand.seq) + '\n'],
                               [str(h77_section) + '\n'])
        print(*diffs, sep='')
def main():
    """ Upload the project configuration to QAI as a new pipeline version.

    Reads the default project configuration and project_scoring.json, logs
    in to QAI, and then posts whatever is missing: seed groups, regions, the
    pipeline version itself, projects, project versions, project regions,
    and key positions.

    :raise RuntimeError: when QAI already has the requested pipeline version
    """
    args = parse_args()
    project_config = ProjectConfig.loadDefault()
    # The scoring file lives two folders above this module.
    scoring_path = Path(__file__).parent.parent / 'project_scoring.json'
    with scoring_path.open() as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(args.qai_server, args.qai_user, args.qai_password)

        # Refuse to upload the same pipeline version twice.
        pipelines = session.get_json("/lab_miseq_pipelines?version=" +
                                     args.pipeline_version,
                                     retries=0)
        if pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                args.pipeline_version))

        seed_groups = session.get_json("/lab_miseq_seed_groups")
        # {seed_group_name: seed_group_id}
        # noinspection PyTypeChecker
        seed_group_ids = dict(map(itemgetter('name', 'id'), seed_groups))
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        # {region_name: region}
        regions = dict(((region['name'], region) for region in old_regions))
        for region_name, region_data in project_config.config['regions'].items(
        ):
            # The reference is stored as a list of chunks in the config.
            ref_seq = ''.join(region_data['reference'])
            region = regions.get(region_name)
            if region is None:
                # New region: make sure its seed group exists, then post it.
                seed_group_name = region_data['seed_group']
                seed_group_id = seed_group_ids.get(seed_group_name)
                if seed_group_id is None and seed_group_name:
                    seed_group = session.post_json("/lab_miseq_seed_groups",
                                                   {'name': seed_group_name})
                    seed_group_id = seed_group['id']
                    seed_group_ids[seed_group_name] = seed_group_id
                region = session.post_json(
                    "/lab_miseq_regions", {
                        'name': region_name,
                        'is_nucleotide': region_data['is_nucleotide'],
                        'reference': ref_seq,
                        'seed_group_id': seed_group_id
                    })
                regions[region_name] = region
            elif region['reference'] != ref_seq:
                # Existing region with a different sequence: always report
                # it, but only overwrite it when asked to.
                print("Reference doesn't match:", region_name)
                if args.update_sequences:
                    region['reference'] = ref_seq
                    session.post_json(f"/lab_miseq_regions/{region['id']}",
                                      region)

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': args.pipeline_version})
        pipeline_id = pipeline['id']

        old_projects = session.get_json("/lab_miseq_projects", retries=0)
        # {project_name: project}
        projects = dict(
            ((project['name'], project) for project in old_projects))
        for project_name, project_data in project_config.config[
                'projects'].items():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects", {
                        'name': project_name,
                        'max_variants': project_data['max_variants']
                    })
            # Every project gets a version that belongs to the new pipeline.
            project_version = session.post_json("/lab_miseq_project_versions",
                                                {
                                                    'pipeline_id': pipeline_id,
                                                    'project_id': project['id']
                                                })
            for i, region_data in enumerate(project_data['regions']):
                # Scoring entries are matched to project regions by index.
                scoring_data = scoring_config['projects'][project_name][
                    'regions'][i]
                coordinate_region = regions[region_data['coordinate_region']]
                # The seed group comes from the first seed region listed.
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions", {
                        'project_version_id': project_version['id'],
                        'coordinate_region_id': coordinate_region['id'],
                        'min_coverage1': scoring_data['min_coverage1'],
                        'min_coverage2': scoring_data['min_coverage2'],
                        'min_coverage3': scoring_data['min_coverage3'],
                        'seed_group_id': seed_group_id
                    })
                for key_position in scoring_data['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions", {
                            'project_region_id': project_region['id'],
                            'start_pos': key_position['start_pos'],
                            'end_pos': key_position['end_pos']
                        })
    print("Done.")
def build_conseqs(conseqs_file, run, sample_sheet, ok_sample_regions):
    """
    Parses a Pipeline-produced conseq file and builds JSON objects to send
    to QAI.

    @param conseqs_file: An open file that contains the consensus sequences
        for all samples in the run, with the columns sample, region,
        q-cutoff, consensus-percent-cutoff, and sequence.
    @param run: a hash with the attributes of the run record, including a
        sequencing summary of all the samples and their target projects
    @param sample_sheet: The data parsed from the sample sheet.
    @param ok_sample_regions: A set of (sample_name, region, qcut) tuples
        that were given a good score by the pipeline.
    @return an array of JSON hashes, one for each conseq.
    """
    sequencings = run['sequencing_summary']
    conseqs_csv = csv.DictReader(conseqs_file)

    # A region can only be released for a sample when it is one of the seeds
    # of the project that the sample's tag was sequenced for.
    projects = ProjectConfig.loadDefault()
    target_regions = set()  # {(tag, seed_name)}
    for entry in sequencings:
        seeds = projects.getProjectSeeds(entry['target_project'])
        target_regions.update((entry['tag'], seed) for seed in seeds)

    result = []
    for row in conseqs_csv:
        # The "sample" column holds the FASTQ file name, which is also the
        # key into the sample sheet's data section. That is where the
        # original sample name and the tags come from.
        fastq_filename = row["sample"]
        sample_info = sample_sheet["Data"][fastq_filename]
        orig_sample_name = sample_info["orig_sample_name"]
        sample_tags = sample_info["tags"]

        # FIXME: a blank sequence is sent as a dash, until the receiving
        # side can accept a blank.
        sequence = row["sequence"]
        if not len(sequence) > 0:
            sequence = "-"

        region = row["region"]
        qcutoff = row["q-cutoff"]
        is_ok = (fastq_filename, region, qcutoff) in ok_sample_regions
        is_target = (sample_tags, region) in target_regions
        conseq = {"samplename": orig_sample_name,
                  # No test code can be assigned without a lookup table
                  # that has not been defined.
                  "testcode": None,
                  "conseq_cutoff": row["consensus-percent-cutoff"],
                  "region": region,
                  "qcutoff": float(qcutoff),
                  # Last piece of the file name, like S12.
                  "snum": fastq_filename.split('_')[-1],
                  "seq": sequence,
                  "ok_for_release": is_ok and is_target}
        result.append(conseq)
    return result
def aln2counts(aligned_csv,
               nuc_csv,
               amino_csv,
               coord_ins_csv,
               conseq_csv,
               failed_align_csv,
               callback=None,
               coverage_summary_csv=None,
               clipping_csv=None,
               conseq_ins_csv=None,
               g2p_aligned_csv=None,
               remap_conseq_csv=None):
    """
    Analyze aligned reads for nucleotide and amino acid frequencies.
    Generate consensus sequences.

    @param aligned_csv: Open file handle containing aligned reads (from
        sam2aln)
    @param nuc_csv: Open file handle to write nucleotide frequencies.
    @param amino_csv: Open file handle to write amino acid frequencies.
    @param coord_ins_csv: Open file handle to write insertions relative to
        coordinate reference.
    @param conseq_csv: Open file handle to write consensus sequences.
    @param failed_align_csv: Open file handle to write sample consensus
        sequences that failed to align to the coordinate reference.
    @param callback: a function to report progress with three optional
        parameters - callback(message, progress, max_progress). It is only
        enabled when aligned_csv has a file name.
    @param coverage_summary_csv: Open file handle to write coverage depth.
    @param clipping_csv: Open file handle containing soft clipping counts
    @param conseq_ins_csv: Open file handle containing insertions relative
        to consensus sequence
    @param g2p_aligned_csv: Open file handle containing aligned reads (from
        fastq_g2p)
    @param remap_conseq_csv: Open file handle containing consensus sequences
        from the remap step.
    """
    # Load project information, and set up the report with its headers.
    projects = ProjectConfig.loadDefault()
    report = SequenceReport(InsertionWriter(coord_ins_csv),
                            projects,
                            CONSEQ_MIXTURE_CUTOFFS)
    report.consensus_min_coverage = CONSENSUS_MIN_COVERAGE
    report.write_amino_header(amino_csv)
    report.write_consensus_header(conseq_csv)
    report.write_failure_header(failed_align_csv)
    report.write_nuc_header(nuc_csv)

    coverage_writer = None
    coverage_summary = None
    if coverage_summary_csv is not None:
        coverage_writer = csv.DictWriter(
            coverage_summary_csv,
            ['avg_coverage', 'coverage_region', 'region_width'],
            lineterminator=os.linesep)
        coverage_writer.writeheader()
        coverage_summary = {}

    if callback:
        # Progress is measured against the size of the aligned reads file,
        # so it needs a real file name.
        aligned_filename = getattr(aligned_csv, 'name', None)
        if aligned_filename:
            report.enable_callback(callback,
                                   os.stat(aligned_filename).st_size)

    # Optional inputs are loaded in this order before the reads.
    optional_inputs = ((clipping_csv, report.read_clipping),
                       (conseq_ins_csv, report.read_insertions),
                       (remap_conseq_csv, report.read_remap_conseqs))
    for source, load in optional_inputs:
        if source is not None:
            load(source)

    report.process_reads(g2p_aligned_csv, aligned_csv, coverage_summary)

    # Nothing is written when the summary was left empty.
    if coverage_summary_csv is not None and coverage_summary:
        coverage_writer.writerow(coverage_summary)
def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None):
    """ Use Blastn to search for the genotype of a set of reference sequences.

    :param str fasta: file path of the FASTA file containing the query
        sequences
    :param str db: file path of the database to search for matches
    :param blast_csv: open file to write the blast matches to, or None
    :param dict group_refs: {contig_ref: group_ref} or None. The dictionary
        will get filled in with the mapping from each contig's reference name
        to the best matched reference for the whole seed group.
    :return: {query_name: (ref_name, matched_fraction)} where query_name is a
        sequence header from the query sequences FASTA file, ref_name is the
        name of the best match from the database, and matched_fraction is the
        fraction of the query that aligned against the reference (matches and
        mismatches). The fraction is negative when the match runs backward
        along the reference.
    """
    # Number the contigs from 1, in the order their headers appear.
    contig_nums = {}  # {contig_name: contig_num}
    with open(fasta) as f:
        for line in f:
            if line.startswith('>'):
                # Drop the '>' and the last character of the line.
                # NOTE(review): this assumes every header line ends with a
                # single newline character - confirm for files whose last
                # line is a header, or that use CRLF line endings.
                contig_name = line[1:-1]
                contig_nums[contig_name] = len(contig_nums) + 1
    blast_columns = [
        'qaccver', 'saccver', 'pident', 'score', 'qcovhsp', 'qstart', 'qend',
        'sstart', 'send'
    ]
    # Output format 10 with the column names listed after it.
    cline = NcbiblastnCommandline(query=fasta,
                                  db=db,
                                  outfmt=f'"10 {" ".join(blast_columns)}"',
                                  evalue=0.0001,
                                  gapopen=5,
                                  gapextend=2,
                                  penalty=-3,
                                  reward=1,
                                  max_target_seqs=5000)
    stdout, _ = cline()
    samples = {}  # {query_name: (subject_name, matched_fraction)}
    # Sorted by query name, then by ascending score, so the highest scoring
    # match of each query comes last.
    matches = sorted(DictReader(StringIO(stdout), blast_columns),
                     key=lambda row: (row['qaccver'], float(row['score'])))
    if not blast_csv:
        blast_writer = None
    else:
        blast_writer = DictWriter(blast_csv, [
            'contig_num', 'ref_name', 'score', 'match', 'pident', 'start',
            'end', 'ref_start', 'ref_end'
        ],
                                  lineterminator=os.linesep)
        blast_writer.writeheader()
    # Later entries replace earlier ones, so each contig ends up with the
    # reference of its highest scoring match.
    contig_top_matches = {
        match['qaccver']: match['saccver']
        for match in matches
    }
    top_refs = set(contig_top_matches.values())
    projects = ProjectConfig.loadDefault()
    # Total score for each top reference, summed over the contigs whose own
    # top reference is in the same seed group.
    match_scores = Counter()
    for contig_name, contig_matches in groupby(matches,
                                               itemgetter('qaccver')):
        contig_top_ref = contig_top_matches[contig_name]
        contig_seed_group = projects.getSeedGroup(contig_top_ref)
        for match in contig_matches:
            ref_name = match['saccver']
            if ref_name not in top_refs:
                continue
            match_seed_group = projects.getSeedGroup(ref_name)
            if match_seed_group == contig_seed_group:
                match_scores[ref_name] += float(match['score'])

    if group_refs is not None:
        # Walk from the lowest total score to the highest, so the highest
        # scoring reference of each seed group is the one that is kept.
        group_top_refs = {
            projects.getSeedGroup(ref_name): ref_name
            for ref_name, count in reversed(match_scores.most_common())
        }
        for ref_name in contig_top_matches.values():
            group_refs[ref_name] = group_top_refs[projects.getSeedGroup(
                ref_name)]

    for match in matches:
        # qcovhsp is a percentage.
        matched_fraction = float(match['qcovhsp']) / 100
        if int(match['send']) < int(match['sstart']):
            # Subject coordinates run backward: flag it with a negative
            # fraction.
            matched_fraction *= -1
        pident = round(float(match['pident']))
        contig_name = match['qaccver']
        # Later matches replace earlier ones, so the highest score wins.
        samples[contig_name] = (match['saccver'], matched_fraction)
        if blast_writer:
            blast_writer.writerow(
                dict(contig_num=contig_nums[contig_name],
                     ref_name=match['saccver'],
                     score=match['score'],
                     match=matched_fraction,
                     pident=pident,
                     start=match['qstart'],
                     end=match['qend'],
                     ref_start=match['sstart'],
                     ref_end=match['send']))
    return samples
def build_conseqs(conseqs_file, run, sample_sheet, ok_sample_regions):
    """
    Turn a pipeline-produced conseq file into records to send to QAI.

    @param conseqs_file: An open file that contains the consensus sequences
        from the counts2csf step for all samples in the run.
    @param run: a hash with the attributes of the run record, including a
        sequencing summary of all the samples and their target projects
    @param sample_sheet: The data parsed from the sample sheet.
    @param ok_sample_regions: A set of (sample_name, region, qcut) tuples that
        were given a good score by the pipeline.
    @return a list of hashes, one for each row of the conseq file.
    """
    records = []
    sequencings = run['sequencing_summary']
    conseq_rows = csv.DictReader(conseqs_file)

    # Collect every (tag, seed) combination that some sequencing targets.
    projects = ProjectConfig.loadDefault()
    target_regions = set()  # {(tags, seed_name)}
    for sequencing in sequencings:
        try:
            seeds = projects.getProjectSeeds(sequencing['target_project'])
        except KeyError:
            logger.warning('Failed to load project seeds.', exc_info=True)
            seeds = set()
        target_regions.update((sequencing['tag'], seed) for seed in seeds)

    for row in conseq_rows:
        # Each row of this file looks like:
        # sample,region,q-cutoff,s-number,consensus-percent-cutoff,sequence
        # The "sample" entry is the key into sample_sheet["Data"], and that
        # entry holds the original Sample_Name from the sample sheet.
        # In version 2, the original name looks like
        # [sample name]~[project name]#[...]
        # In version 1, it looked the same, but both ; and _ got garbled by
        # the MiSeq instrument itself, so we look it up instead of parsing.
        fastq_filename = row["sample"]
        sample_info = sample_sheet["Data"][fastq_filename]
        orig_sample_name = sample_info["orig_sample_name"]
        sample_tags = sample_info["tags"]

        # FIXME if row["sequence"] is blank we replace it with a dash.
        # Need Conan to make that row blank-able.
        sequence = row["sequence"]
        curr_seq = sequence if len(sequence) > 0 else "-"

        region = row["region"]
        ok_region = (fastq_filename, region, row["q-cutoff"]) in ok_sample_regions
        is_target_region = (sample_tags, region) in target_regions

        records.append(dict(
            samplename=orig_sample_name,
            # July 9, 2014: we can't do this properly right now
            # without a lookup table that is yet to be fully
            # defined.
            testcode=None,
            conseq_cutoff=row["consensus-percent-cutoff"],
            region=row["region"],
            qcutoff=float(row["q-cutoff"]),
            snum=fastq_filename.split('_')[-1],
            seq=curr_seq,
            ok_for_release=ok_region and is_target_region))
    return records
def main():
    """ Upload the default project configuration to QAI as a new pipeline.

    Reads the scoring configuration from ../project_scoring.json, logs in to
    QAI with the credentials in settings, and then:

    * creates any seed groups and regions that QAI doesn't have yet,
    * creates a pipeline record for settings.pipeline_version,
    * creates any projects that QAI doesn't have yet,
    * posts a project version for every configured project, with its project
      regions and their key positions.

    :raise RuntimeError: if QAI already has a pipeline with this version.
    """
    project_config = ProjectConfig.loadDefault()
    # Plain 'r' mode: the 'U' flag is rejected by open() from Python 3.11 on.
    with open('../project_scoring.json', 'r') as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(settings.qai_path,
                      settings.qai_user,
                      settings.qai_password)

        pipelines = session.get_json(
            "/lab_miseq_pipelines?version=" + settings.pipeline_version,
            retries=0)
        if pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                settings.pipeline_version))

        seed_groups = session.get_json("/lab_miseq_seed_groups")
        seed_group_ids = dict(map(itemgetter('name', 'id'), seed_groups))
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        regions = {region['name']: region for region in old_regions}
        # items() instead of iteritems(), so this runs on Python 3.
        for region_name, region_data in project_config.config[
                'regions'].items():
            region = regions.get(region_name)
            if region is None:
                seed_group_name = region_data['seed_group']
                seed_group_id = seed_group_ids.get(seed_group_name)
                if seed_group_id is None and seed_group_name:
                    # First region in a new seed group, so create the group.
                    seed_group = session.post_json("/lab_miseq_seed_groups",
                                                   {'name': seed_group_name})
                    seed_group_id = seed_group['id']
                    seed_group_ids[seed_group_name] = seed_group_id
                region = session.post_json(
                    "/lab_miseq_regions",
                    {'name': region_name,
                     'is_nucleotide': region_data['is_nucleotide'],
                     'reference': ''.join(region_data['reference']),
                     'seed_group_id': seed_group_id})
                regions[region_name] = region

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': settings.pipeline_version})
        pipeline_id = pipeline['id']

        old_projects = session.get_json("/lab_miseq_projects", retries=0)
        projects = {project['name']: project for project in old_projects}
        for project_name, project_data in project_config.config[
                'projects'].items():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects",
                    {'name': project_name,
                     'max_variants': project_data['max_variants']})
            project_version = session.post_json(
                "/lab_miseq_project_versions",
                {'pipeline_id': pipeline_id,
                 'project_id': project['id']})
            for i, region_data in enumerate(project_data['regions']):
                # Scoring entries are matched to project regions by index.
                scoring_data = scoring_config['projects'][project_name][
                    'regions'][i]
                coordinate_region = regions[region_data['coordinate_region']]
                # The first seed region decides the seed group.
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions",
                    {'project_version_id': project_version['id'],
                     'coordinate_region_id': coordinate_region['id'],
                     'min_coverage1': scoring_data['min_coverage1'],
                     'min_coverage2': scoring_data['min_coverage2'],
                     'min_coverage3': scoring_data['min_coverage3'],
                     'seed_group_id': seed_group_id})
                for key_position in scoring_data['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions",
                        {'project_region_id': project_region['id'],
                         'start_pos': key_position['start_pos'],
                         'end_pos': key_position['end_pos']})
    # print() call form works on both Python 2 and Python 3.
    print("Done.")
def find_probes(contigs_csv, probes_csv):
    """ Search every contig for each target sequence, and report the matches.

    :param contigs_csv: open file to read contigs from, with a sample column,
        a genotype, ref, or region column to name the seed, and a contig or
        sequence column. Rows with a consensus-percent-cutoff other than MAX
        are skipped.
    :param probes_csv: open file to write one row per contig to, with a set
        of columns for each entry in TARGET_SEQUENCES
    """
    reader = DictReader(contigs_csv)
    column_types = ['in_contig_start',
                    'in_contig_size',
                    'in_hxb2_start',
                    'in_hxb2_size',
                    'merged_hxb2_start',
                    'merged_hxb2_size',
                    'dist',
                    'end_dist',
                    'score',
                    'is_reversed',
                    'seq']
    columns = ['sample', 'contig']
    columns.extend(target_name + '_' + column_type
                   for target_name in TARGET_SEQUENCES
                   for column_type in column_types)
    writer = DictWriter(probes_csv, columns)
    writer.writeheader()

    projects = ProjectConfig.loadDefault()
    hxb2 = projects.getReference('HIV1-B-FR-K03455-seed')
    # gap open penalty, gap extend penalty, use terminal gap penalty
    alignment_settings = (15, 3, 1)

    for sample_name, sample_rows in groupby(reader, itemgetter('sample')):
        contig_num = 0
        for row in sample_rows:
            seed_name = row.get('genotype') or row.get('ref') or row['region']
            conseq_cutoff = row.get('consensus-percent-cutoff')
            if conseq_cutoff and conseq_cutoff != 'MAX':
                continue
            contig_num += 1
            contig_seq: str = row.get('contig') or row['sequence']
            aligned_hxb2, aligned_contig_to_hxb2, _ = align_it(
                hxb2,
                contig_seq,
                *alignment_settings)
            new_row = dict(sample=sample_name,
                           contig=f'{contig_num}-{seed_name}')
            for target_name, target_seq in TARGET_SEQUENCES.items():
                finder = ProbeFinder(contig_seq, target_seq)
                size = len(finder.contig_match)
                start_pos = finder.start + 1
                end_pos = finder.start + size

                # Walk along the whole-contig alignment to see where the
                # match lands in HXB2.
                hxb2_pos = contig_pos = 0
                merged_hxb2_start = merged_hxb2_size = None
                for hxb2_nuc, contig_nuc in zip(aligned_hxb2,
                                                aligned_contig_to_hxb2):
                    if hxb2_nuc != '-':
                        hxb2_pos += 1
                    if contig_nuc == '-':
                        continue
                    contig_pos += 1
                    if contig_pos == start_pos:
                        merged_hxb2_start = hxb2_pos
                    if contig_pos == end_pos:
                        merged_hxb2_size = hxb2_pos - merged_hxb2_start + 1
                        break

                # Align the match on its own, then trim the reference to the
                # part between the match's leading and trailing gaps.
                aligned_ref, aligned_match, _ = align_it(
                    hxb2,
                    finder.contig_match,
                    *alignment_settings)
                without_head = aligned_match.lstrip('-')
                in_hxb2_start = len(aligned_match) - len(without_head)
                tail_len = len(without_head) - len(without_head.rstrip('-'))
                ref_match = aligned_ref[in_hxb2_start:-tail_len or None]
                in_hxb2_size = len(ref_match.replace('-', ''))

                prefix = target_name + '_'
                new_row.update({
                    prefix + 'in_contig_start': start_pos,
                    prefix + 'in_contig_size': size,
                    prefix + 'in_hxb2_start': in_hxb2_start,
                    prefix + 'in_hxb2_size': in_hxb2_size,
                    prefix + 'merged_hxb2_start': merged_hxb2_start,
                    prefix + 'merged_hxb2_size': merged_hxb2_size,
                    prefix + 'dist': finder.dist,
                    prefix + 'end_dist': finder.end_dist,
                    prefix + 'score': finder.score,
                    prefix + 'is_reversed': 'Y' if finder.is_reversed else 'N',
                    prefix + 'seq': finder.contig_match})
            writer.writerow(new_row)
from micall.utils.alignment_wrapper import align_nucs

try:
    # mappy is optional: Aligner is None when the package can't be imported.
    # noinspection PyPackageRequirements
    from mappy import Aligner
except ImportError:
    Aligner = None
from micall.utils.fetch_sequences import fetch_by_accession
import sys
from micall.core.project_config import ProjectConfig

# The name is bound twice: first to the default project configuration, then
# to whatever that configuration returns for the SARS-CoV-2 seed reference.
REFERENCE = ProjectConfig.loadDefault()
REFERENCE = REFERENCE.getReference('SARS-CoV-2-seed')


def load_coverage(csv):
    """ Load coverage values from a CSV file.

    :param csv: path of the CSV file to open. It needs query_nuc_pos and
        coverage columns that hold integers. (The parameter name hides the
        standard csv module inside this function.)
    :return: {query_nuc_pos: coverage}, with both converted to int. When a
        position is repeated, the last row wins.
    """
    result = {}
    with open(csv) as csvfile:
        reader = DictReader(csvfile)
        for row in reader:
            result[int(row['query_nuc_pos'])] = int(row['coverage'])
    return result


BATCH = 'batch_01'
ROOT = ( Path('/wow') / BATCH
def build_coverage_figure(genome_coverage_csv, blast_csv=None):
    """ Build a figure of contig coverage, grouped by coordinate reference.

    :param genome_coverage_csv: open, seekable file with columns
        query_nuc_pos, refseq_nuc_pos, coordinates, contig, coverage, and
        dels. It gets rewound and read again for every contig.
    :param blast_csv: open file with columns start, end, ref_start, ref_end,
        and ref_name, or None to draw no reference arrows
    :return: the Figure object that all the tracks were added to
    """
    min_position, max_position = 1, 500
    coordinate_depths = Counter()
    contig_depths = Counter()
    contig_groups = defaultdict(set)  # {coordinates_name: {contig_name}}
    reader = DictReader(genome_coverage_csv)
    # First pass: find the position range, the deepest coverage for each
    # coordinate reference and contig, and which contigs use each reference.
    for row in reader:
        query_nuc_pos = int(row['query_nuc_pos'])
        if row['refseq_nuc_pos']:
            refseq_nuc_pos = int(row['refseq_nuc_pos'])
        else:
            # Blank reference position: use a value that changes neither
            # the minimum nor the maximum.
            refseq_nuc_pos = min_position
        min_position = min(min_position, refseq_nuc_pos, query_nuc_pos)
        max_position = max(max_position, refseq_nuc_pos, query_nuc_pos)
        coordinates_name = row['coordinates']
        contig_name = row['contig']
        if row['coverage'] != '':
            # Deletions don't count toward the depth.
            row_coverage = int(row['coverage']) - int(row['dels'])
            coordinate_depths[coordinates_name] = max(
                coordinate_depths[coordinates_name],
                row_coverage)
            contig_depths[contig_name] = max(contig_depths[contig_name],
                                             row_coverage)
        contig_groups[coordinates_name].add(contig_name)
    if '' in coordinate_depths:
        # Force partial contigs to come last.
        coordinate_depths[''] = -1
    # Shift every position so that the smallest one lands on 1.
    position_offset = -min_position + 1
    max_position += position_offset

    blast_rows = []
    if blast_csv is not None:
        for blast_row in DictReader(blast_csv):
            # Convert the position fields in place, so they sort as numbers.
            for field_name in ('start', 'end', 'ref_start', 'ref_end'):
                # noinspection PyTypeChecker
                blast_row[field_name] = int(blast_row[field_name])
            blast_rows.append(blast_row)
    blast_rows.sort(key=itemgetter('start', 'ref_start'))

    landmarks_path = (Path(__file__).parent.parent / "data" /
                      "landmark_references.yaml")
    landmark_groups = yaml.safe_load(landmarks_path.read_text())
    projects = ProjectConfig.loadDefault()
    f = Figure()
    # Deepest coordinate reference first, with ties broken by name.
    for _, coordinates_name in sorted(
            (-depth, name)
            for name, depth in coordinate_depths.items()):
        for reference_set in landmark_groups:
            if coordinates_name != reference_set['coordinates']:
                continue
            prev_landmark = None
            # Fill in defaults on the landmark dicts themselves: frame 0,
            # and an end just before the start of the next landmark.
            for i, landmark in enumerate(sorted(reference_set['landmarks'],
                                                key=itemgetter('start'))):
                landmark.setdefault('frame', 0)
                if prev_landmark and 'end' not in prev_landmark:
                    prev_landmark['end'] = landmark['start'] - 1
                prev_landmark = landmark
            # One multitrack for each run of landmarks with the same frame.
            # NOTE(review): groupby only merges neighbours, and this iterates
            # the landmarks in their original order - confirm that the YAML
            # lists them grouped by frame.
            for frame, frame_landmarks in groupby(reference_set['landmarks'],
                                                  itemgetter('frame')):
                subtracks = []
                for landmark in frame_landmarks:
                    landmark_colour = landmark.get('colour')
                    if landmark_colour is None:
                        # Landmarks without a colour are not drawn.
                        continue
                    subtracks.append(
                        Track(landmark['start'] + position_offset,
                              landmark['end'] + position_offset,
                              label=landmark['name'],
                              color=landmark_colour))
                    max_position = max(max_position,
                                       landmark['end'] + position_offset)
                f.add(Multitrack(subtracks))
            break
        else:
            # No landmark set matches these coordinates.
            add_partial_banner(f, position_offset, max_position)
        contig_names = contig_groups[coordinates_name]
        sorted_contig_names = sort_contig_names(contig_names, contig_depths)
        ref_arrows = []
        for contig_name in sorted_contig_names:
            if contig_name.startswith('contig-'):
                # No arrows on original contig tracks.
                continue
            contig_matcher = ContigMatcher(contig_name)
            ref_positions = None
            arrow_count = 0
            for blast_row in blast_rows:
                if not contig_matcher.is_match(blast_row):
                    continue
                if (ref_positions is None and
                        coordinates_name != '' and
                        blast_row['ref_name'] != coordinates_name):
                    # The blast reference differs from the coordinate
                    # reference, so map its positions. Only the first such
                    # row for this contig triggers the mapping.
                    ref_positions = map_references(blast_row['ref_name'],
                                                   coordinates_name,
                                                   projects)
                arrow_count += 1
                ref_start = int(blast_row['ref_start'])
                ref_end = int(blast_row['ref_end'])
                if ref_positions is None:
                    coordinate_start = ref_start
                    coordinate_end = ref_end
                else:
                    coordinate_start = ref_positions[ref_start]
                    coordinate_end = ref_positions[ref_end]
                ref_arrows.append(
                    Arrow(coordinate_start + position_offset,
                          coordinate_end + position_offset,
                          elevation=1,
                          label=f'{contig_matcher.num}.{arrow_count}'))
        if ref_arrows:
            f.add(ArrowGroup(ref_arrows))
        for contig_name in sorted_contig_names:
            # Rewind, because each contig reads the whole coverage file.
            genome_coverage_csv.seek(0)
            reader = DictReader(genome_coverage_csv)
            build_contig(reader,
                         f,
                         contig_name,
                         max_position,
                         position_offset,
                         blast_rows)

    if not f.elements:
        f.add(Track(1, max_position, label='No contigs found.', color='none'))
    return f
def main():
    """ Write a set of FASTQ files with reads copied from project references.

    Each FastqFile lists the sections of a coordinate reference to copy, how
    many reads to write for each section, and the codon mutations to apply.
    Mutation positions are multiplied by three, so they are treated as codon
    positions within the section. Reverse reads are reverse complemented,
    and every base gets the quality score 'A'. The files are written to the
    current working directory.
    """
    fastq_files = [FastqFile('2130A-HCV_S15_L001_R1_001.fastq',
                             '2130',
                             False,
                             (FastqSection('HCV2-JFH-1-NS5b', 1, 60, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 117, 176, 100)),
                             (CodonMutation(159, 'GTC'),)),
                   FastqFile('2130A-HCV_S15_L001_R2_001.fastq',
                             '2130',
                             True,
                             (FastqSection('HCV2-JFH-1-NS5b', 57, 116, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 171, 230, 100)),
                             (CodonMutation(159, 'GTC'),)),
                   FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq',
                             '2130',
                             False,
                             (FastqSection('HCV2-JFH-1-NS5b', 231, 313, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 396, 478, 100)),
                             (CodonMutation(316, 'AGC'),)),
                   FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq',
                             '2130',
                             True,
                             (FastqSection('HCV2-JFH-1-NS5b', 313, 395, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 479, 561, 100)),
                             (CodonMutation(316, 'AGC'),)),
                   FastqFile('2140A-HIV_S17_L001_R1_001.fastq',
                             '2140',
                             False,
                             (FastqSection('PR', 1, 80, 100),),
                             (CodonMutation(24, 'ATA'),)),
                   FastqFile('2140A-HIV_S17_L001_R2_001.fastq',
                             '2140',
                             True,
                             (FastqSection('PR', 20, 99, 100),),
                             (CodonMutation(24, 'ATA'),))]
    projects = ProjectConfig.loadDefault()
    for fastq_file in fastq_files:
        with open(fastq_file.name, 'w') as f:
            next_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects,
                    section.coord_name,
                    section.start_pos,
                    section.end_pos)

                ref_nuc_seq = projects.getReference(ref_name)
                ref_nuc_section = list(ref_nuc_seq[ref_start:ref_end])
                for mutation in fastq_file.mutations:
                    if section.start_pos <= mutation.pos <= section.end_pos:
                        section_pos = (mutation.pos - section.start_pos) * 3
                        ref_nuc_section[section_pos:section_pos+3] = list(
                            mutation.codon)
                ref_nuc_section = ''.join(ref_nuc_section)
                if fastq_file.is_reversed:
                    ref_nuc_section = reverse_and_complement(ref_nuc_section)
                # Size the quality string from the read itself, not from the
                # requested range. FASTQ needs one score per base, and the
                # read can be shorter than the range when the slice runs past
                # the end of the reference, or a different length when a
                # mutation's codon isn't exactly three bases.
                phred_scores = 'A' * len(ref_nuc_section)
                file_num = '2' if fastq_file.is_reversed else '1'
                for cluster in range(section.count):
                    f.write('@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'.format(
                        fastq_file.extract_num,
                        cluster + next_cluster,
                        file_num))
                    f.write(ref_nuc_section+'\n')
                    f.write('+\n')
                    f.write(phred_scores+'\n')
                # Keep cluster numbers unique across sections in this file.
                next_cluster += section.count
def aln2counts(aligned_csv,
               nuc_csv,
               amino_csv,
               coord_ins_csv,
               conseq_csv,
               failed_align_csv,
               callback=None,
               coverage_summary_csv=None,
               clipping_csv=None,
               conseq_ins_csv=None,
               g2p_aligned_csv=None,
               remap_conseq_csv=None,
               conseq_region_csv=None):
    """
    Count nucleotides and amino acids in aligned reads, and build consensus.

    @param aligned_csv: Open file handle containing aligned reads (from sam2aln)
    @param nuc_csv: Open file handle to write nucleotide frequencies.
    @param amino_csv: Open file handle to write amino acid frequencies.
    @param coord_ins_csv: Open file handle to write insertions relative to
        coordinate reference.
    @param conseq_csv: Open file handle to write consensus sequences.
    @param failed_align_csv: Open file handle to write sample consensus
        sequences that failed to align to the coordinate reference.
    @param callback: a function to report progress with three optional
        parameters - callback(message, progress, max_progress). It is only
        enabled when aligned_csv has a name attribute to check the size of.
    @param coverage_summary_csv: Open file handle to write coverage depth.
    @param clipping_csv: Open file handle containing soft clipping counts
    @param conseq_ins_csv: Open file handle containing insertions relative to
        consensus sequence
    @param g2p_aligned_csv: Open file handle containing aligned reads (from
        fastq_g2p)
    @param remap_conseq_csv: Open file handle containing consensus sequences
        from the remap step.
    @param conseq_region_csv: Open file handle to write consensus sequences
        split into regions.
    """
    projects = ProjectConfig.loadDefault()

    with InsertionWriter(coord_ins_csv) as insert_writer:
        report = SequenceReport(insert_writer,
                                projects,
                                CONSEQ_MIXTURE_CUTOFFS)
        report.consensus_min_coverage = CONSENSUS_MIN_COVERAGE

        # Headers for all the output files.
        report.write_amino_header(amino_csv)
        report.write_consensus_header(conseq_csv)
        report.write_consensus_regions_header(conseq_region_csv)
        report.write_failure_header(failed_align_csv)
        report.write_nuc_header(nuc_csv)

        # The coverage summary is optional.
        coverage_summary = coverage_writer = None
        if coverage_summary_csv is not None:
            coverage_writer = csv.DictWriter(coverage_summary_csv,
                                             ['avg_coverage',
                                              'coverage_region',
                                              'region_width'],
                                             lineterminator=os.linesep)
            coverage_writer.writeheader()
            coverage_summary = {}

        # Progress is measured against the size of the aligned reads file.
        aligned_filename = (getattr(aligned_csv, 'name', None)
                            if callback
                            else None)
        if aligned_filename:
            report.enable_callback(callback,
                                   os.stat(aligned_filename).st_size)

        # Load whichever optional inputs were provided.
        optional_inputs = ((clipping_csv, report.read_clipping),
                           (conseq_ins_csv, report.read_insertions),
                           (remap_conseq_csv, report.read_remap_conseqs))
        for input_csv, read_input in optional_inputs:
            if input_csv is not None:
                read_input(input_csv)

        report.process_reads(g2p_aligned_csv, aligned_csv, coverage_summary)

        # Only write a summary row if something was collected.
        if coverage_writer is not None and coverage_summary:
            coverage_writer.writerow(coverage_summary)
def main():
    """ Process every sample named on the command line, one after another.

    All of the samples share the same default project configuration and the
    same parsed arguments.
    """
    options = parse_args()
    project_config = ProjectConfig.loadDefault()
    for name in options.sample:
        process_file(name, project_config, options)
    print('Done.')
def main():
    """ Upload the default project configuration to QAI as a new pipeline.

    Reads the scoring configuration from ../project_scoring.json, logs in to
    QAI with the credentials from the command line, and then:

    * creates any seed groups and regions that QAI doesn't have yet,
    * creates a pipeline record for args.pipeline_version,
    * creates any projects that QAI doesn't have yet,
    * posts a project version for every configured project, with its project
      regions and their key positions.

    :raise RuntimeError: if QAI already has a pipeline with this version.
    """
    args = parse_args()
    project_config = ProjectConfig.loadDefault()
    # Plain 'r' mode: the 'U' flag is rejected by open() from Python 3.11 on.
    with open('../project_scoring.json', 'r') as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(args.qai_server,
                      args.qai_user,
                      args.qai_password)

        pipelines = session.get_json(
            "/lab_miseq_pipelines?version=" + args.pipeline_version,
            retries=0)
        if pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                args.pipeline_version))

        seed_groups = session.get_json("/lab_miseq_seed_groups")
        seed_group_ids = dict(map(itemgetter('name', 'id'), seed_groups))
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        regions = {region['name']: region for region in old_regions}
        for region_name, region_data in project_config.config['regions'].items():
            region = regions.get(region_name)
            if region is None:
                seed_group_name = region_data['seed_group']
                seed_group_id = seed_group_ids.get(seed_group_name)
                if seed_group_id is None and seed_group_name:
                    # First region in a new seed group, so create the group.
                    seed_group = session.post_json("/lab_miseq_seed_groups",
                                                   {'name': seed_group_name})
                    seed_group_id = seed_group['id']
                    seed_group_ids[seed_group_name] = seed_group_id
                region = session.post_json(
                    "/lab_miseq_regions",
                    {'name': region_name,
                     'is_nucleotide': region_data['is_nucleotide'],
                     'reference': ''.join(region_data['reference']),
                     'seed_group_id': seed_group_id})
                regions[region_name] = region

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': args.pipeline_version})
        pipeline_id = pipeline['id']

        old_projects = session.get_json("/lab_miseq_projects", retries=0)
        projects = {project['name']: project for project in old_projects}
        for project_name, project_data in project_config.config['projects'].items():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects",
                    {'name': project_name,
                     'max_variants': project_data['max_variants']})
            project_version = session.post_json(
                "/lab_miseq_project_versions",
                {'pipeline_id': pipeline_id,
                 'project_id': project['id']})
            for i, region_data in enumerate(project_data['regions']):
                # Scoring entries are matched to project regions by index.
                scoring_data = scoring_config['projects'][project_name]['regions'][i]
                coordinate_region = regions[region_data['coordinate_region']]
                # The first seed region decides the seed group.
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions",
                    {'project_version_id': project_version['id'],
                     'coordinate_region_id': coordinate_region['id'],
                     'min_coverage1': scoring_data['min_coverage1'],
                     'min_coverage2': scoring_data['min_coverage2'],
                     'min_coverage3': scoring_data['min_coverage3'],
                     'seed_group_id': seed_group_id})
                for key_position in scoring_data['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions",
                        {'project_region_id': project_region['id'],
                         'start_pos': key_position['start_pos'],
                         'end_pos': key_position['end_pos']})
    print("Done.")
def main():
    """ Write the FASTQ files for a set of simulated samples.

    Each FastqFile lists the sections of a coordinate reference to copy, how
    many reads to write for each section, and the codon mutations to apply to
    that section. Reverse reads are reverse complemented, and every base gets
    the quality score 'A'. The files are written to the current working
    directory.
    """
    projects = ProjectConfig.loadDefault()

    # Sections built by make_random_sections. The calls stay in this order.
    sections_2100hcv_1, sections_2100hcv_2 = make_random_sections(
        'HCV1A-H77-NS5a',
        1,
        300,
        projects,
        400)
    sections_2100v3_1 = [
        FastqSection('HIV1-B-FR-K03455-seed', 7056, 7312, 50),
        FastqSection('HIV1-B-FR-K03455-seed', 7062, 7312, 50)]
    sections_2100v3_2 = [
        FastqSection('HIV1-B-FR-K03455-seed', 7123, 7373, 50),
        FastqSection('HIV1-B-FR-K03455-seed', 7123, 7376, 50)]
    sections_2100hiv_1, sections_2100hiv_2 = make_random_sections(
        'RT',
        1,
        300,
        projects,
        400)
    sections_2160_1, sections_2160_2 = make_random_sections(
        'HCV2-JFH-1-NS5b',
        1,
        230,
        projects,
        mutations=(CodonMutation(159, 'GTC'),))
    sections_2160midi_1, sections_2160midi_2 = make_random_sections(
        'HCV2-JFH-1-NS5b',
        231,
        561,
        projects,
        mutations=(CodonMutation(316, 'AGC'),))
    sections_2170_1a_1, sections_2170_1a_2 = make_random_sections(
        'HCV-1a',
        6258,
        9375)
    sections_2170_2_1, sections_2170_2_2 = make_random_sections(
        'HCV-2a',
        6269,
        9440)
    sections_2180_1, sections_2180_2 = make_random_sections(
        'HIV1-B-FR-K03455-seed',
        6225,
        7757)

    # Add a temporary region made from two stretches of HXB2.
    hxb2_ref = projects.getReference('HIV1-B-FR-K03455-seed')
    projects.config['regions']['HXB2-with-deletion'] = dict(
        reference=hxb2_ref[617:928] + hxb2_ref[9358:9652],
        is_nucleotide=True,
        seed_group=None)
    sections_2210_1, sections_2210_2 = make_random_sections(
        'HXB2-with-deletion',
        projects=projects)

    # Sections that are the same in the forward and reverse files.
    sections_2010 = (
        FastqSection('HIV1-CON-XX-Consensus-seed', 855, 906, 10),
        FastqSection('HIV1-CON-XX-Consensus-seed', 912, 960, 10))
    sections_2020 = (
        FastqSection('HIV1-B-FR-KF716496-seed',
                     6957,
                     7065,
                     10,
                     (CodonMutation(6981, 'GGGATA'),)),)
    sections_2040 = (
        FastqSection('HLA-B-seed', 201, 315, 80),
        FastqSection('HLA-B-seed',
                     201,
                     315,
                     20,
                     (CodonMutation(207, 'TCT'),)))
    sections_2070 = (
        FastqSection('PR', 40, 80, 12, (CodonMutation(45, ''),)),
        FastqSection('PR',
                     40,
                     80,
                     3,
                     (CodonMutation(45, ''), CodonMutation(64, ''))))
    sections_2190 = (
        FastqSection('SARS-CoV-2-ORF1ab',
                     4393,
                     4429,
                     50,
                     (CodonMutation(4400, 'TCA'),)),
        FastqSection('SARS-CoV-2-ORF1ab',
                     4393,
                     4430,
                     50,
                     (CodonMutation(4400, 'TCA'),)))

    fastq_files = [
        FastqFile('2010A-V3LOOP_S3_L001_R1_001.fastq',
                  '2010',
                  False,
                  sections_2010),
        FastqFile('2010A-V3LOOP_S3_L001_R2_001.fastq',
                  '2010',
                  True,
                  sections_2010),
        FastqFile('2020A-GP41_S4_L001_R1_001.fastq',
                  '2020',
                  False,
                  sections_2020),
        FastqFile('2020A-GP41_S4_L001_R2_001.fastq',
                  '2020',
                  True,
                  sections_2020),
        FastqFile('2040A-HLA-B_S6_L001_R1_001.fastq',
                  '2040',
                  False,
                  sections_2040),
        FastqFile('2040A-HLA-B_S6_L001_R2_001.fastq',
                  '2040',
                  True,
                  sections_2040),
        FastqFile('2070A-PR_S9_L001_R1_001.fastq',
                  '2070',
                  False,
                  sections_2070),
        FastqFile('2070A-PR_S9_L001_R2_001.fastq',
                  '2070',
                  True,
                  sections_2070),
        FastqFile('2100A-HCV-1337B-V3LOOP-PWND-HIV_S12_L001_R1_001.fastq',
                  '2100',
                  False,
                  sections_2100hcv_1 + sections_2100v3_1 + sections_2100hiv_1),
        FastqFile('2100A-HCV-1337B-V3LOOP-PWND-HIV_S12_L001_R2_001.fastq',
                  '2100',
                  True,
                  sections_2100hcv_2 + sections_2100v3_2 + sections_2100hiv_2),
        FastqFile('2130A-HCV_S15_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 1, 66, 100),
                   FastqSection('HCV2-JFH-1-NS5b',
                                115,
                                181,
                                100,
                                (CodonMutation(159, 'GTC'),)))),
        FastqFile('2130A-HCV_S15_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b', 51, 114, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 165, 230, 100))),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 231, 315, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 398, 485, 100))),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b',
                                305,
                                397,
                                100,
                                (CodonMutation(316, 'AGC'),)),
                   FastqSection('HCV2-JFH-1-NS5b', 470, 561, 100))),
        FastqFile('2140A-HIV_S17_L001_R1_001.fastq',
                  '2140',
                  False,
                  (FastqSection('PR',
                                1,
                                80,
                                100,
                                (CodonMutation(24, 'ATA'),)),)),
        FastqFile('2140A-HIV_S17_L001_R2_001.fastq',
                  '2140',
                  True,
                  (FastqSection('PR',
                                20,
                                99,
                                100,
                                (CodonMutation(24, 'ATA'),)),)),
        # Simplify with one_contig.
        FastqFile('2160A-HCV_S19_L001_R1_001.fastq',
                  '2160',
                  False,
                  sections_2160_1),
        FastqFile('2160A-HCV_S19_L001_R2_001.fastq',
                  '2160',
                  True,
                  sections_2160_2),
        # Simplify with one_contig.
        FastqFile('2160AMIDI-MidHCV_S20_L001_R1_001.fastq',
                  '2160',
                  False,
                  sections_2160midi_1),
        FastqFile('2160AMIDI-MidHCV_S20_L001_R2_001.fastq',
                  '2160',
                  True,
                  sections_2160midi_2),
        # Simplify with two_long_contigs.
        FastqFile('2170A-HCV_S21_L001_R1_001.fastq',
                  '2170',
                  False,
                  sections_2170_1a_1 + sections_2170_2_1),
        FastqFile('2170A-HCV_S21_L001_R2_001.fastq',
                  '2170',
                  True,
                  sections_2170_1a_2 + sections_2170_2_2),
        FastqFile('2180A-HIV_S22_L001_R1_001.fastq',
                  '2180',
                  False,
                  sections_2180_1),
        FastqFile('2180A-HIV_S22_L001_R2_001.fastq',
                  '2180',
                  True,
                  sections_2180_2),
        FastqFile('2190A-SARSCOV2_S23_L001_R1_001.fastq',
                  '2190',
                  False,
                  sections_2190),
        FastqFile('2190A-SARSCOV2_S23_L001_R2_001.fastq',
                  '2190',
                  True,
                  sections_2190),
        FastqFile('2200A-SARSCOV2_S24_L001_R1_001.fastq',
                  '2200',
                  False,
                  (FastqSection('SARS-CoV-2-nsp1', 20, 66, 100),)),
        FastqFile('2200A-SARSCOV2_S24_L001_R2_001.fastq',
                  '2200',
                  True,
                  (FastqSection('SARS-CoV-2-nsp1', 56, 102, 100),)),
        FastqFile('2210A-NFLHIVDNA_S25_L001_R1_001.fastq',
                  '2210',
                  False,
                  sections_2210_1),
        FastqFile('2210A-NFLHIVDNA_S25_L001_R2_001.fastq',
                  '2210',
                  True,
                  sections_2210_2)]

    for fastq_file in fastq_files:
        with open(fastq_file.name, 'w') as fastq:
            first_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects,
                    section.coord_name,
                    section.start_pos,
                    section.end_pos)
                nucs = list(projects.getReference(ref_name)[ref_start:ref_end])

                # When the positions came back unchanged, they were already
                # nucleotide positions. Otherwise, each step is a codon.
                is_nucleotide = ((ref_start, ref_end) ==
                                 (section.start_pos, section.end_pos))
                step_size = 1 if is_nucleotide else 3
                for mutation in section.mutations:
                    if not (section.start_pos <=
                            mutation.pos <=
                            section.end_pos):
                        continue
                    offset = (mutation.pos - section.start_pos) * step_size
                    nucs[offset:offset + 3] = list(mutation.codon)

                read_seq = ''.join(nucs)
                if fastq_file.is_reversed:
                    read_seq = reverse_and_complement(read_seq)
                phred_scores = 'A' * len(read_seq)
                file_num = '2' if fastq_file.is_reversed else '1'
                # noinspection PyTypeChecker
                for cluster in range(first_cluster,
                                     first_cluster + section.count):
                    fastq.write(
                        '@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'
                        .format(fastq_file.extract_num, cluster, file_num))
                    fastq.write(read_seq + '\n')
                    fastq.write('+\n')
                    fastq.write(phred_scores + '\n')
                # Keep cluster numbers unique across sections in this file.
                first_cluster += section.count
def load_projects() -> ProjectConfig:
    """ Load the default project configuration. """
    default_projects = ProjectConfig.loadDefault()
    return default_projects