# Build the genomewide insertion table of every sample by merging the
# insertions found in the single fragments F1-F6.
for samplename, sample in samples.iterrows():
    if VERBOSE >= 1:
        # Single-argument print(...) behaves like the Python 2 print statement
        print(samplename)

    sample = SamplePat(sample)
    pname = sample.patient
    ref = sample.get_reference("genomewide", "gb")

    # Collect the insertions (where possible), keyed by the fragment name and
    # the start of that fragment in the genomewide reference
    ics = {}
    for i in range(1, 7):
        fragment = "F" + str(i)
        try:
            ic = sample.get_insertions(fragment, merge_read_types=False)
        except IOError:
            # The insertions of this fragment could not be read: leave it out
            pass
        else:
            start = find_annotation(ref, fragment).location.nofuzzy_start
            ics[(fragment, start)] = ic

    # Nothing to merge if no fragment had any data
    if len(ics) == 0:
        if VERBOSE >= 1:
            print("No data found: skipping")
        continue

    # Merge insertions
    ic = merge_insertions(ics, VERBOSE=VERBOSE)

    if save_to_file:
        fn_out = sample.get_insertions_filename("genomewide")
        save_insertions(fn_out, ic)
        if VERBOSE >= 1:
            print("%s %s" % ("Genomewide insertions saved to:", fn_out))
# Remaining command line options (the parser itself is created before this point)
parser.add_argument('--reference', default='HXB2',
                    help='Reference to use for alignment')
parser.add_argument('--verbose', type=int, default=0,
                    help='Verbosity level [0-4]')
parser.add_argument('--subtypes', nargs='+', default=['B'],
                    help='Subtypes to keep')

args = parser.parse_args()
regions = args.regions
refname = args.reference
VERBOSE = args.verbose
subtypes = args.subtypes

from hivwholeseq.reference import load_custom_reference
from hivwholeseq.utils.sequence import find_annotation

# Load the reference chosen with --reference (HXB2 by default). The alignment
# files below are selected with the same refname, so the sequence they are
# compared against must come from that reference and not from a fixed one.
ref = load_custom_reference(refname, 'gb')

for region in regions:
    # Reference sequence of the region, as an array of single characters
    regm = np.array(find_annotation(ref, region).extract(ref), 'S1')

    for subtype in subtypes:
        fn = get_subtype_reference_alignment_filename(region,
                                                      subtype=subtype,
                                                      refname=refname,
                                                      VERBOSE=VERBOSE)
        alim = np.array(AlignIO.read(fn, 'fasta'), 'S1')

        # A sequence is "weird" if it differs from the reference at more than
        # 20% of the alignment columns.
        # NOTE(review): the comparison assumes the alignment has as many columns
        # as the reference region - confirm against the alignment builder.
        weird = ((alim != regm).mean(axis=1) > 0.2)

        # Single-argument print(...) behaves like the Python 2 print statement
        print('%s %s %s' % (region, subtype, weird.sum()))
help='Reference to use for alignment') parser.add_argument('--verbose', type=int, default=0, help='Verbosity level [0-4]') parser.add_argument('--subtypes', nargs='+', default=['B'], help='Subtypes to keep') args = parser.parse_args() regions = args.regions refname = args.reference VERBOSE = args.verbose subtypes = args.subtypes from hivwholeseq.reference import load_custom_reference from hivwholeseq.utils.sequence import find_annotation ref = load_custom_reference('HXB2', 'gb') for region in regions: regm = np.array(find_annotation(ref, region).extract(ref), 'S1') for subtype in subtypes: fn = get_subtype_reference_alignment_filename(region, subtype=subtype, refname=refname, VERBOSE=VERBOSE) alim = np.array(AlignIO.read(fn, 'fasta'), 'S1') weird = ((alim != regm).mean(axis=1) > 0.2) print region, subtype, weird.sum()
def _report_epitope_enrichment(table, n_patients, prefix=''):
    '''Print a 2x2 epitope/substitution table together with its enrichment statistics

    Parameters:
       table: counts indexed by 'epitope' (rows) and 'substitution' (columns),
              as produced by groupby(...).size().unstack().
       n_patients: number of patients used to scale the excess "per patient".
       prefix: string printed right before the first statistics line.
    '''
    from scipy.stats import fisher_exact

    counts = np.array(table)

    # Expected number of substitutions inside epitopes if they had the same
    # substitution/no-substitution ratio as the positions outside epitopes.
    # NOTE(review): assumes rows and columns are both ordered False, True and
    # that all four combinations are present - confirm for small datasets.
    expected = 1.0 * counts[1, 0] / counts[0, 0] * counts[0, 1]
    excess = counts[1, 1] - expected

    # Run the test once and reuse both the enrichment and the P value
    fisher = fisher_exact(counts)

    # Single-argument print(...) behaves like the Python 2 print statement
    print(table)
    print('%s %s' % (prefix + 'Fisher\'s exact enrichment:', fisher[0]))
    print('%s %s' % ('Fisher\'s exact P value:', fisher[1]))
    print('%s %s' % ('expected:', expected))
    print('%s %s %s %s' % ('excess:', excess,
                           'per patient:', excess / float(n_patients)))


def correlate_epitope_substitution(ds, dctl, n_patients=9):
    '''Correlate presence of a substitution with epitope

    For every patient in dctl, each position between the outer primers of F1
    and F6 (HXB2 coordinates) is flagged for being inside an epitope and for
    carrying a nonsynonymous substitution. Two 2x2 tables are then printed
    with Fisher's exact test: one over all retained positions, one restricted
    to positions that are inside an epitope in at least one patient.

    Parameters:
       ds: table of substitutions, with columns 'pcode', 'syn' and 'pos_ref'.
       dctl: table of epitopes, with columns 'pcode', 'start_HXB2' and
             'end_HXB2' (start included, end excluded).
       n_patients: number of patients used for the "per patient" excess
                   (default 9, the value previously hard-coded).

    Returns:
       dict with 'dg' (per patient, per position flags for all retained
       positions) and 'dg2' (the subset of dg at epitope positions).
    '''
    from hivwholeseq.data.primers import primers_coordinates_HXB2_outer
    from hivwholeseq.reference import load_custom_reference
    from hivwholeseq.utils.sequence import find_annotation

    start_F1 = primers_coordinates_HXB2_outer['F1'][0][1]
    end_F6 = primers_coordinates_HXB2_outer['F6'][1][0]

    # The coordinates are the same for all patients
    positions = np.arange(start_F1, end_F6)

    dg = []
    for pcode, epitopes in dctl.groupby('pcode'):
        in_epitope = np.zeros(len(positions), bool)
        for _, epi in epitopes.iterrows():
            in_epitope[(positions >= epi['start_HXB2']) &
                       (positions < epi['end_HXB2'])] = True

        # Keep only nonsyn substitutions
        subs = ds.loc[ds['pcode'] == pcode]
        subs = subs.loc[subs['syn'] == False]

        # NOTE(review): assumes every pos_ref lies in [start_F1, end_F6); a
        # smaller value would wrap around as a negative index - confirm.
        has_substitution = np.zeros(len(positions), bool)
        has_substitution[subs['pos_ref'] - positions[0]] = True

        dat = pd.DataFrame({
            'pos': positions,
            'epitope': in_epitope,
            'substitution': has_substitution,
        })
        dat['pcode'] = pcode
        dg.append(dat)

    dg = pd.concat(dg)

    # Exclude env because it has antibody-related substitutions
    # NOTE(review): the window actually removed runs from the start of gp41 to
    # 450 positions before its end - confirm this is the intended env region.
    ref = load_custom_reference('HXB2', 'gb')
    gp41_location = find_annotation(ref, 'gp41').location
    start_env = gp41_location.nofuzzy_start
    end_env = gp41_location.nofuzzy_end - 450
    dg = dg.loc[(dg['pos'] < start_env) | (dg['pos'] >= end_env)]

    # All retained positions
    M = dg.groupby(['epitope', 'substitution']).size().unstack()
    _report_epitope_enrichment(M, n_patients)

    # Only positions that are inside an epitope in at least one patient
    pos_epi = dg.loc[dg['epitope'] == True]['pos'].unique()
    dg2 = dg.loc[dg['pos'].isin(pos_epi)].copy()
    M2 = dg2.groupby(['epitope', 'substitution']).size().unstack()
    _report_epitope_enrichment(M2, n_patients, prefix='\n')

    return {
        'dg': dg,
        'dg2': dg2,
    }
# Build the genomewide insertion table of every sample by merging the
# insertions found in the single fragments F1-F6.
for samplename, sample in samples.iterrows():
    if VERBOSE >= 1:
        # Single-argument print(...) behaves like the Python 2 print statement
        print(samplename)

    sample = SamplePat(sample)
    pname = sample.patient
    ref = sample.get_reference('genomewide', 'gb')

    # Collect the insertions (where possible), keyed by the fragment name and
    # the start of that fragment in the genomewide reference
    ics = {}
    for i in range(1, 7):
        fragment = 'F' + str(i)
        try:
            ic = sample.get_insertions(fragment, merge_read_types=False)
        except IOError:
            # The insertions of this fragment could not be read: leave it out
            pass
        else:
            start = find_annotation(ref, fragment).location.nofuzzy_start
            ics[(fragment, start)] = ic

    # Nothing to merge if no fragment had any data
    if len(ics) == 0:
        if VERBOSE >= 1:
            print('No data found: skipping')
        continue

    # Merge insertions
    ic = merge_insertions(ics, VERBOSE=VERBOSE)

    if save_to_file:
        fn_out = sample.get_insertions_filename('genomewide')
        save_insertions(fn_out, ic)
        if VERBOSE >= 1:
            print('%s %s' % ('Genomewide insertions saved to:', fn_out))