def export_naiveseqs_stat(gene, sequences):
    """Write one CSV row of summary statistics per sequence.

    Each row copies the accession, PubMed ID, subtype and the per-sequence
    counters from the sequence record, and adds `gene` and the number of
    entries in the record's ``Mutations`` list (``NumAAChanges``).

    The output goes to ``<ROOT>/data/naiveStudies/<gene>StatBySeq.csv``.
    """
    # columns whose value is taken from the sequence record unchanged
    counter_columns = [
        'NumInsertions', 'NumDeletions', 'NumStopCodons',
        'NumApobecs', 'NumUnusuals', 'NumFrameShifts',
    ]

    rows = []
    for record in sequences:
        row = {
            'Accession': record['Accession'],
            'PMID': record['PubMedID'],
            'Gene': gene,
            'Subtype': record['Subtype'],
            'NumAAChanges': len(record['Mutations']),
        }
        for column in counter_columns:
            row[column] = record[column]
        rows.append(row)

    target = os.path.join(
        ROOT, 'data', 'naiveStudies', '{}StatBySeq.csv'.format(gene))
    header = [
        'Accession', 'PMID', 'Gene', 'Subtype', 'NumAAChanges'
    ] + counter_columns
    csv_writer(target, rows, header)
def main():
    """Compute the distance table of each gene and save it as CSV.

    A process pool with ``cpu_count() - 2`` workers (but never fewer than
    one) is shared by both genes.  For every gene the result of
    ``calc_distances`` is written to
    ``<ROOT>/local/naiveStudies/<gene>NaiveDistance.csv``.
    """
    num_workers = max(1, cpu_count() - 2)
    with Pool(num_workers) as pool:
        for gene in ('gag', 'gp41'):
            distances = calc_distances(gene, pool)
            target = os.path.join(
                ROOT, 'local', 'naiveStudies',
                '{}NaiveDistance.csv'.format(gene))
            csv_writer(
                target, distances, ['Sequence1', 'Sequence2', 'Distance'])
def export_unusuals(gene, sequences):
    """Save the ``NumUnusuals`` value of every sequence, keyed by accession.

    The output goes to
    ``<ROOT>/internalFiles/naiveStudies/<gene>Unusuals.csv``.
    """
    rows = [
        {
            'Accession': record['Accession'],
            'NumUnusuals': record['NumUnusuals'],
        }
        for record in sequences
    ]
    target = os.path.join(
        ROOT, 'internalFiles', 'naiveStudies',
        '{}Unusuals.csv'.format(gene))
    csv_writer(target, rows, ['Accession', 'NumUnusuals'])
def export_aa_prevalence(gene, ptseqs):
    """Export amino acid prevalence rows, overall and per major subtype.

    ``aggregate_aa_prevalence`` is run once with subtype ``None``
    (presumably "all subtypes" -- confirm against that helper) and then
    once for each subtype returned by ``get_most_common_subtypes``.  The
    values of every result are written, in that order, to
    ``<ROOT>/data/naiveStudies/<gene>AAPrevalence.csv``.
    """
    subtypes = [None] + get_most_common_subtypes(gene)
    columns = ['Gene', 'Subtype', 'Pos', 'AA', 'Pcnt', 'Count', 'PosTotal']

    # aggregate everything first; only the dict values are exported
    per_subtype = [
        aggregate_aa_prevalence(gene, ptseqs, subtype).values()
        for subtype in subtypes
    ]

    target = os.path.join(
        ROOT, 'data', 'naiveStudies', '{}AAPrevalence.csv'.format(gene))
    csv_writer(target, chain.from_iterable(per_subtype), columns)
def export_papers_table(filename, rows):
    """Write the public papers table to `filename`.

    Rows whose ``NumLANLIsolatesQCPassed`` is falsy (zero, empty, ``None``)
    are dropped.  For the remaining rows that value is exported as
    ``NumPatients`` next to the publication metadata columns.  The column
    order of the file is given by ``CLEAN_TABLE_HEADERS``.
    """
    papers = [
        {
            'PubMedID': row['PubMedID'],
            'PubYear': row['PubYear'],
            'NumPatients': row['NumLANLIsolatesQCPassed'],
            'RxStatus': row['RxStatus'],
            'Title': row['Title'],
            'Subtypes': row['Subtypes'],
            'Authors': row['Authors'],
        }
        for row in rows
        if row['NumLANLIsolatesQCPassed']
    ]
    csv_writer(filename, papers, CLEAN_TABLE_HEADERS)
def export_adindex(gene, sequences):
    """Export the APOBEC-mediated defectives index (ADIndex) per sequence.

    The ADIndex of a sequence is its ``NumApobecs`` value divided by the
    number of distinct positions listed by ``possible_apobecs_reader(gene)``.
    One row per sequence is written to
    ``<ROOT>/internalFiles/naiveStudies/apobec/<gene>NaiveADIndex.csv``.

    When the reader lists no position at all the index is written as 0.0
    instead of failing with ``ZeroDivisionError``.
    """
    # Only the number of distinct positions matters for the index, so the
    # amino acid side of each listed change is not needed here.
    conserved_positions = {
        int(apobec['Position']) for apobec in possible_apobecs_reader(gene)
    }
    conserveds = len(conserved_positions)

    result = []
    for sequence in sequences:
        num_apobecs = sequence['NumApobecs']
        # an empty list of possible APOBEC sites must not abort the export
        index = num_apobecs / conserveds if conserveds else 0.0
        result.append({
            'Accession': sequence['Accession'],
            'NumAPOBECs': num_apobecs,
            'NumConservedAPOBECSites': conserveds,
            'ADIndex': index  # APOBEC-mediated defectives index
        })

    csv_writer(
        os.path.join(ROOT, 'internalFiles', 'naiveStudies', 'apobec',
                     '{}NaiveADIndex.csv'.format(gene)),
        result,
        ['Accession', 'NumAPOBECs', 'NumConservedAPOBECSites', 'ADIndex'])
def main():
    """Export amino acid and codon change tables for each gene.

    For every gene two CSV files are written below ``<ROOT>/internalFiles``:

    * ``aaChangesByPosWPrev/<gene>.csv`` -- the ``PIs`` rows followed by the
      ``NNRTIs`` rows of ``aggregate_aa_changes_by_pos``;
    * ``codonChangesByPt/<gene>.csv`` -- the rows of
      ``codon_changes_per_person`` ordered by numeric PID, then Rx, then
      position.
    """
    def by_patient(change):
        # PID is compared as a number, the remaining fields as they are
        return int(change['PID']), change['Rx'], change['Pos']

    for gene in ('gag', 'gp41'):
        aa_target = os.path.join(
            ROOT, 'internalFiles', 'aaChangesByPosWPrev',
            '{}.csv'.format(gene))
        aa_changes = chain(
            aggregate_aa_changes_by_pos(gene, 'PIs', 'PIs'),
            aggregate_aa_changes_by_pos(gene, 'NNRTIs', 'NNRTIs'),
        )
        csv_writer(
            aa_target, aa_changes,
            ['Group', 'Pos', 'PreAA', 'PostAA', 'NumPts',
             'PrePrev', 'PostPrev', 'Fold', 'LogFold'])

        codon_target = os.path.join(
            ROOT, 'internalFiles', 'codonChangesByPt',
            '{}.csv'.format(gene))
        codon_changes = sorted(
            codon_changes_per_person(gene, ('PIs', 'NNRTIs')),
            key=by_patient)
        csv_writer(
            codon_target, codon_changes,
            ['PID', 'Rx', 'Pos', 'Type', 'Codons', 'NumNAChanges', 'AAs'])
def find_possible_apobecs(gene, ptseqs):
    """Tally candidate APOBEC-mediated amino acid changes and save them.

    Only sequences that have a stop codon, or an M=>I change at position 1,
    contribute candidates.  Within those sequences two kinds of change are
    tallied, both restricted to positions whose consensus amino acid has a
    prevalence of at least 97.5:

    * ``W=>*`` mutations;
    * changes explained by a single G=>A substitution in a codon of the
      consensus amino acid, looked for at every "A" of the aligned
      sequence that is followed (across deletion gaps) by "A" or "G".

    For each candidate the number of sequences in `ptseqs` that carry the
    changed amino acid is counted as well.  Candidates for which the tally
    above is more than 50% of that number are written, ordered by position
    and change, to
    ``<ROOT>/data/naiveStudies/apobec/<gene>PossibleApobecs.csv``.
    """
    out_path = os.path.join(
        ROOT, 'data', 'naiveStudies', 'apobec',
        '{}PossibleApobecs.csv'.format(gene))
    tally = Counter()
    prevalence = aggregate_aa_prevalence(gene, ptseqs)

    for seq in ptseqs:
        aligned_na = seq['AlignedNASequence']
        # every "A" that is followed by A or G is a place where GG=>AG or
        # GG=>AA may have happened; deletion gaps in between are skipped
        hits = re.finditer('A-*(?=[AG])', aligned_na)
        by_pos = {m['Position']: m for m in seq['Mutations']}

        start_codon_lost = (
            1 in by_pos and
            by_pos[1]['ReferenceText'] == 'M' and
            'I' in by_pos[1]['AminoAcidText'])
        num_stops = seq['NumStopCodons']
        if not (start_codon_lost or num_stops):
            # neither M=>I nor W=>* can be present
            continue

        if num_stops:
            for mut in seq['Mutations']:
                if '*' not in mut['AminoAcidText']:
                    continue
                if mut['ReferenceText'] != 'W':
                    continue
                stop_pos = mut['Position']
                if prevalence[(stop_pos, 'W')]['Pcnt'] < 97.5:
                    # skip non-conserved position
                    continue
                tally[(stop_pos, 'W=>*')] += 1

        for hit in hits:
            # translate the nucleotide index into codon number and offset
            codon_index, na_offset = divmod(hit.start(0), 3)
            aa_pos = codon_index + 1
            if aa_pos not in by_pos:
                continue
            mut = by_pos[aa_pos]
            cons = mut['ReferenceText']
            if mut['IsPartial'] or mut['IsInsertion'] or mut['IsDeletion']:
                continue
            if '*' in mut['AminoAcidText']:
                continue
            if prevalence[(aa_pos, cons)]['Pcnt'] < 97.5:
                # skip non-conserved position
                continue
            observed = mut['CodonText']
            for ref_codon in get_codons(cons):
                # find G=>A hypermutation
                if ref_codon[na_offset] != 'G':
                    continue
                mutated = (
                    ref_codon[:na_offset] + 'A' + ref_codon[na_offset + 1:])
                if not compare_codon(mutated, observed):
                    continue
                new_aa = translate_codon(mutated)
                if new_aa != cons:
                    # synonymous results such as "E=>E" are not recorded
                    tally[(aa_pos, '{}=>{}'.format(cons, new_aa))] += 1
                # the first matching codon settles this hit either way
                break

    candidates = [
        {
            'Position': aa_pos,
            'AAChange': change,
            'Consensus %': prevalence[
                aa_pos, change.split('=>', 1)[0]
            ]['Pcnt'].quantize(PREC1),
            '# with Stop': count
        }
        for (aa_pos, change), count in tally.most_common()
    ]

    # count, for every candidate, all sequences carrying the changed AA
    for seq in ptseqs:
        by_pos = {m['Position']: m for m in seq['Mutations']}
        for candidate in candidates:
            aa_pos = candidate['Position']
            new_aa = candidate['AAChange'].split('=>', 1)[1]
            if aa_pos in by_pos and new_aa in by_pos[aa_pos]['AminoAcidText']:
                candidate['# Sequence'] = candidate.get('# Sequence', 0) + 1

    for candidate in candidates:
        percent = candidate['# with Stop'] * 100 / candidate['# Sequence']
        candidate['% with Stop'] = Decimal(percent).quantize(PREC0)

    candidates.sort(key=lambda c: (c['Position'], c['AAChange']))
    selected = [c for c in candidates if c['% with Stop'] > 50]
    csv_writer(
        out_path, selected,
        ['Position', 'AAChange', 'Consensus %', '% with Stop', '# Sequence'],
        writer_options={'extrasaction': 'ignore'})
def create_review_table(gene, ptseqs):
    """Build the per-publication review table and export it.

    `ptseqs` is grouped by ``_PubID``.  Each group is combined with the
    matching entry of ``get_fact_table(gene)``; when that entry carries a
    ``PubIDCorrection`` the group is filed under the corrected ID.  Groups
    that end up under the same ID are merged: subtypes are unioned, the
    isolate counters are added up and missing year / PubMed ID / RxStatus
    values are filled in.

    The table, ordered by descending ``NumLANLIsolates`` and then by
    ``PubID``, is written as ``<gene>ReviewTable.csv`` and
    ``<gene>ReviewTable.xlsx`` under ``<ROOT>/internalFiles/papersReview``
    and passed to ``export_papers_table`` for
    ``<ROOT>/data/naiveStudies/<gene>Studies.csv``.
    """
    facts = get_fact_table(gene)
    merged = {}

    for pubid, members in groupby(ptseqs, lambda s: s['_PubID']):
        members = list(members)
        subtypes = sorted({s['Subtype'] for s in members})
        first = members[0]
        fact = facts.get(pubid, {})
        if fact.get('PubIDCorrection'):
            pubid = fact['PubIDCorrection']

        if pubid in merged:
            # fold this group into the entry created earlier
            entry = merged[pubid]
            known_subtypes = entry['Subtypes'].split('; ')
            entry['Subtypes'] = '; '.join(
                sorted(set(known_subtypes + subtypes)))
            if not entry.get('PubYear'):
                entry['PubYear'] = fact.get('PubYr') or first['PubYear']
            if not entry.get('PubMedID'):
                entry['PubMedID'] = fact.get('PMID') or first['PubMedID']
            # the fact table counts are summed and stored back as text
            for field in ('NumPts', 'NumIsolates'):
                total = int(entry[field] or 0) + int(fact.get(field) or 0)
                if total:
                    entry[field] = str(total)
            entry['NumLANLIsolates'] += len(members)
            entry['NumLANLIsolatesQCPassed'] += sum(
                1 for s in members if s['Included'])
            if not entry.get('RxStatus'):
                entry['RxStatus'] = fact.get('RxStatus')
            continue

        merged[pubid] = {
            'PubID': pubid,
            'PubMedID': fact.get('PMID') or first['PubMedID'],
            'PubYear': fact.get('PubYr') or first['PubYear'],
            'NumPts': fact.get('NumPts'),
            'NumIsolates': fact.get('NumIsolates'),
            'NumLANLIsolates': len(members),
            'NumLANLIsolatesQCPassed': sum(
                1 for s in members if s['Included']),
            'Title': first['Title'],
            'Authors': first['Authors'],
            'Subtypes': '; '.join(subtypes),
            'RxStatus': fact.get('RxStatus'),
            'Notes': fact.get('Notes'),
        }

    table = sorted(
        merged.values(),
        key=lambda entry: (-entry['NumLANLIsolates'], entry['PubID']))

    review_dir = ('internalFiles', 'papersReview')
    csv_writer(
        os.path.join(ROOT, *review_dir,
                     '{}ReviewTable.csv'.format(gene)),
        table, REVIEW_TABLE_HEADERS)
    export_excel_table(
        os.path.join(ROOT, *review_dir,
                     '{}ReviewTable.xlsx'.format(gene)),
        table)
    export_papers_table(
        os.path.join(ROOT, 'data', 'naiveStudies',
                     '{}Studies.csv'.format(gene)),
        table)
def export_naive_sequences(gene, ptseqs):
    """Export the naive sequences of `gene` as a site table, indels and FASTA.

    Sequences are processed in ``Accession`` order.  Four files are written:

    * ``<ROOT>/internalFiles/naiveStudies/<gene>.csv`` -- one row per
      sequence with its counters and one ``P<n>`` column per position;
    * ``<ROOT>/data/naiveStudies/<gene>NaiveIndels.csv`` -- one row per
      non-partial insertion or deletion;
    * ``<ROOT>/data/naiveStudies/<gene>NaiveAligned.fas`` and
      ``<gene>NaiveOriginal.fas`` -- aligned and original NA sequences.

    File names use the lower-cased gene name.  Finally a summary line is
    printed.
    """
    def site_symbol(pos, firstaa, lastaa, muts):
        # '.' = outside the sequenced range, '-' = no mutation recorded,
        # 'i' / 'd' = insertion / deletion, 'X' = partial codon or more
        # than four amino acids, otherwise the amino acid text itself
        if pos < firstaa or pos > lastaa:
            return '.'
        if pos not in muts:
            return '-'
        mut = muts[pos]
        if mut['IsInsertion']:
            return 'i'
        if mut['IsDeletion']:
            return 'd'
        if mut['IsPartial']:
            return 'X'
        aas = mut['AminoAcidText']
        return 'X' if len(aas) > 4 else aas

    lowered = gene.lower()
    data_dir = os.path.join(ROOT, 'data', 'naiveStudies')
    filename = os.path.join(
        ROOT, 'internalFiles', 'naiveStudies', '{}.csv'.format(lowered))
    aligned_fasta = os.path.join(
        data_dir, '{}NaiveAligned.fas'.format(lowered))
    unaligned_fasta = os.path.join(
        data_dir, '{}NaiveOriginal.fas'.format(lowered))
    indels_csv = os.path.join(
        data_dir, '{}NaiveIndels.csv'.format(lowered))

    genesize = int(CONSENSUS[gene]['Size'])
    siteheaders = ['P{}'.format(i) for i in range(1, genesize + 1)]
    rows = []
    indels = []
    ptseqs = sorted(ptseqs, key=lambda s: s['Accession'])

    for seq in ptseqs:
        firstaa = seq['FirstAA']
        lastaa = seq['LastAA']
        muts = {m['Position']: m for m in seq['Mutations']}
        row = {
            'PMID': seq['PubMedID'],
            'Accession': seq['Accession'],
            'RxStatus': 'Naive',
            'lanlSubtype': seq['Subtype'],
            'NumAAChanges': len(muts),
            'NumInsertions': seq['NumInsertions'],
            'NumDeletions': seq['NumDeletions'],
            'NumStopCodons': seq['NumStopCodons'],
            'NumApobecs': seq['NumApobecs'],
            'NumUnusuals': seq['NumUnusuals'],
            'NumFrameShifts': seq['NumFrameShifts']
        }
        for pos, pname in enumerate(siteheaders, 1):
            row[pname] = site_symbol(pos, firstaa, lastaa, muts)
        rows.append(row)

        for mut in seq['Mutations']:
            if mut['IsPartial']:
                continue
            if not (mut['IsInsertion'] or mut['IsDeletion']):
                continue
            # codon details are only exported for insertions
            isins = mut['IsInsertion']
            indels.append({
                'Accession': seq['Accession'],
                'Gene': gene,
                'Position': mut['Position'],
                'IndelType': 'ins' if isins else 'del',
                'Codon': mut['CodonText'] if isins else '',
                'InsertedCodons': mut['InsertedCodonsText'] if isins else ''
            })

    csv_writer(filename, rows, [
        'PMID', 'Accession', 'RxStatus', 'lanlSubtype', 'NumAAChanges',
        'NumInsertions', 'NumDeletions', 'NumStopCodons', 'NumApobecs',
        'NumUnusuals', 'NumFrameShifts'
    ] + siteheaders)
    csv_writer(indels_csv, indels, [
        'Accession', 'Gene', 'Position',
        'IndelType', 'Codon', 'InsertedCodons'
    ])

    aligned_text = '\n'.join(
        '>{Accession}|{Subtype}\n{AlignedNASequence}'.format(**s)
        for s in ptseqs)
    data_writer(aligned_fasta, aligned_text)
    unaligned_text = '\n'.join(
        '>{Accession}|{Subtype}\n{NASequence}'.format(**s)
        for s in ptseqs)
    data_writer(unaligned_fasta, unaligned_text)

    print('- {} naive {} sequences were exported'.format(len(ptseqs), gene))