def loadK31(reg, filepath, fromHIV=False):
    '''
    Load allele frequency data for the 31 additional patients (K31).

    Input arguments:
    reg: name of genetic region (gag or pol)
    filepath: path to directory where the frequency data are to be
        stored/downloaded. It is concatenated verbatim with the file names,
        so it must end with a path separator.
    fromHIV: download raw data and store them, if True; use stored data,
        if False

    Returns:
    dict mapping '<patient code>_<sample code>' to a tuple (TI, af), where TI
    is the number of days between the patient's "infect date best" and the
    sample date, and af is the allele frequency array of that sample (a numpy
    masked array when read back from disk).

    Files used (all under filepath):
    K31_info_<reg>.txt: one tab-separated row "pcode, scode, TI" per sample
    <pcode>_<scode>_<reg>_data.npy / _mask.npy: data and mask of af
    '''
    data = {}
    info_path = filepath + 'K31_info_{}.txt'.format(reg)

    if fromHIV:
        # The hivwholeseq package lives outside the normal search path; add it
        # only once so repeated calls do not keep growing sys.path.
        hiv_path = "/scicore/home/neher/neher/HIV/hivwholeseq"
        if hiv_path not in sys.path:
            sys.path.append(hiv_path)
        from hivwholeseq.patients.patients import load_patients, Patient

        pats = load_patients(csv=True)
        fmt = "%d/%m/%Y"

        # The context manager guarantees the index file is flushed and closed
        # even if the loop is interrupted.
        with open(info_path, 'w') as fhandle:
            for pcode, pat in pats.iterrows():
                try:
                    EDI = datetime.strptime(pat["infect date best"], fmt)
                    P = Patient(pat)
                    aft = P.get_allele_frequency_trajectories(
                        reg, cov_min=500)[0]
                    for si, (scode, sample) in enumerate(P.samples.iterrows()):
                        try:
                            date = datetime.strptime(sample["date"], fmt)
                            af = aft[si]
                            TI = date.toordinal() - EDI.toordinal()
                            prefix = filepath + '{}_{}_{}'.format(
                                pcode, scode, reg)
                            np.save(prefix + '_data.npy', af.data)
                            np.save(prefix + '_mask.npy', af.mask)
                            # Register the sample in the index only after both
                            # arrays are on disk, so that every row of the
                            # index can be loaded back later.
                            fhandle.write(
                                '{}\t{}\t{}\n'.format(pcode, scode, TI))
                            data['{}_{}'.format(pcode, scode)] = (TI, af)
                            print(pcode, scode, "WORKED!!!")
                        except Exception as err:
                            # Best effort: a broken sample must not abort the
                            # whole download, but report why it failed.
                            print(scode, "didn't work", repr(err))
                except Exception as err:
                    print("skipping patient ", pcode, repr(err))
    else:
        with open(info_path, 'r') as fhandle:
            for line in fhandle:
                words = line.split()
                if not words:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                pat_name = '_'.join(words[:2])
                af_data = np.load(
                    filepath + '{}_{}_data.npy'.format(pat_name, reg))
                af_mask = np.load(
                    filepath + '{}_{}_mask.npy'.format(pat_name, reg))
                af = np.ma.masked_array(af_data, mask=af_mask)
                data[pat_name] = (int(words[2]), af)

    return data
patients = load_patients() if pnames is not None: patients = patients.loc[pnames] data = [] for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() for ifr, region in enumerate(regions): if VERBOSE >= 1: print pname, region aft, ind = patient.get_allele_frequency_trajectories(region, cov_min=10) times = patient.times[ind] dg = get_divergence(aft) ds = get_diversity(aft) data.append({'pname': pname, 'region': region, 'dg': dg, 'ds': ds, 't': times}) if plot: fig, ax = plt.subplots(1, 1) ax.set_xlabel('Time from transmission [days]') ax.set_ylabel('Divergence [solid]\nDiversity [dashed]') #ax.set_yscale('log') for i, d in enumerate(data): pname = d['pname']
fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments dgs = {} dss = {} for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() for ifr, fragment in enumerate(fragments): if VERBOSE >= 1: print pname, fragment aft, ind = patient.get_allele_frequency_trajectories(fragment, cov_min=100) # NOTE: Ns should be excluded from diversity and divergence aft = aft[:, :5, :] if use_sliding: (x, dg, ds) = get_divergence_diversity_sliding(aft, block_length, VERBOSE=VERBOSE) else: (x, dg, ds) = get_divergence_diversity_blocks(aft, block_length, VERBOSE=VERBOSE) # FIXME: avoid this var to get different conv and aft indices times = patient.times[ind] dgs[(pname, fragment)] = (patient.times[ind], dg) dss[(pname, fragment)] = (patient.times[ind], ds)
for pname, patient in patients.iterrows(): patient = Patient(patient) for fragment in fragments: if VERBOSE >= 1: print patient.name, fragment mapco = patient.get_map_coordinates_reference(fragment, refname=refname) if VERBOSE >= 2: print 'Get initial allele frequencies' af0 = patient.get_initial_allele_frequencies(fragment, cov_min=depth_min) if VERBOSE >= 2: print 'Get allele frequencies' aft, ind = patient.get_allele_frequency_trajectories(fragment, depth_min=depth_min) if VERBOSE >= 2: print 'Filter out masked positions' ind_nonmasked = -aft.mask.any(axis=0).any(axis=0) if VERBOSE >= 2: print 'Remove first time sample' aft_der = aft[int(0 in ind):].copy() if VERBOSE >= 2: print 'Filter out ancestral alleles' for i, ai in enumerate(af0.argmax(axis=0)): aft_der[:, ai, i] = 0 # take out everything at high frequency in first sample to # improve polarization
patient = Patient(patient) patient.discard_nonsequenced_samples() t_bds = [] t_loss = [] t_fixs = [] n_staypolys = [] for fragment in fragments: if VERBOSE >= 1: print fragment # Collect allele counts from patient samples, and return only positive hits # sns contains sample names and PCR types (aft, ind) = patient.get_allele_frequency_trajectories( fragment, cov_min=cov_min, depth_min=depth_min, VERBOSE=VERBOSE) times = patient.times[ind] ntemplates = patient.n_templates[ind] n_staypoly = 0 t_bd = [] t_fix = [] t_los = [] for pos in xrange(aft.shape[2]): for ia, a in enumerate(alpha): aft_pos = aft[:, ia, pos] # Keep only polymorphic ipos0 = (aft_pos > af0[0]) & (aft_pos < af0[1])
args = parser.parse_args() pnames = args.patients roi = args.roi VERBOSE = args.verbose use_plot = args.plot use_interactive = args.interactive patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) (fragment, start, end) = patient.get_fragmented_roi(roi, VERBOSE=VERBOSE) aft, ind = patient.get_allele_frequency_trajectories(fragment) aft = aft[:, :, start:end] # TODO: also calculate the logos ## Get only some time points # i = np.arange(len(ind))[::len(ind) // 2] # aft = aft[i] # ind = ind[i] times = patient.times[ind] if use_plot: fig, axs = plt.subplots(aft.shape[0], 1, figsize=(14, 3 * aft.shape[0])) for i, (ax, af) in enumerate(izip(axs, aft)):
print pname patient = Patient(patient) patient.discard_nonsequenced_samples() t_bds = [] t_loss = [] t_fixs = [] n_staypolys = [] for fragment in fragments: if VERBOSE >= 1: print fragment # Collect allele counts from patient samples, and return only positive hits # sns contains sample names and PCR types (aft, ind) = patient.get_allele_frequency_trajectories(fragment, cov_min=cov_min, depth_min=depth_min, VERBOSE=VERBOSE) times = patient.times[ind] ntemplates = patient.n_templates[ind] n_staypoly = 0 t_bd = [] t_fix = [] t_los = [] for pos in xrange(aft.shape[2]): for ia, a in enumerate(alpha): aft_pos = aft[:, ia, pos] # Keep only polymorphic ipos0 = (aft_pos > af0[0]) & (aft_pos < af0[1]) if not ipos0.any():
pnames = args.patients roi = args.roi VERBOSE = args.verbose use_plot = args.plot use_interactive = args.interactive patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) (fragment, start, end) = patient.get_fragmented_roi(roi, VERBOSE=VERBOSE) aft, ind = patient.get_allele_frequency_trajectories(fragment) aft = aft[:, :, start:end] # TODO: also calculate the logos ## Get only some time points #i = np.arange(len(ind))[::len(ind) // 2] #aft = aft[i] #ind = ind[i] times = patient.times[ind] if use_plot: fig, axs = plt.subplots(aft.shape[0], 1, figsize=(14, 3 * aft.shape[0]))
fragments = ['F' + str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments dgs = {} dss = {} for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() for ifr, fragment in enumerate(fragments): if VERBOSE >= 1: print pname, fragment aft, ind = patient.get_allele_frequency_trajectories(fragment, cov_min=100) # NOTE: Ns should be excluded from diversity and divergence aft = aft[:, :5, :] if use_sliding: (x, dg, ds) = get_divergence_diversity_sliding(aft, block_length, VERBOSE=VERBOSE) else: (x, dg, ds) = get_divergence_diversity_blocks(aft, block_length, VERBOSE=VERBOSE) # FIXME: avoid this var to get different conv and aft indices times = patient.times[ind]