def get_template_numbers(patients, VERBOSE=0):
    '''Collect template numbers from all patient samples

    Parameters:
       patients: table of patients; it is walked with .iterrows() and every
                 row is wrapped into a Patient
       VERBOSE: if truthy, print the row label and the code of each patient

    Returns:
       list with one dict per patient, with the keys
         'n_approx': the 'templates approx' column of the patient samples
         'n_dil': list with twice the Poisson template estimate of each dilution
         'age': time elapsed between each sample date and now (see note below)
         'pname': the patient code
    '''
    # One reference instant for the whole call, so that the ages of all
    # patients are measured against the same "now"
    now = datetime.datetime.now()

    data = []
    for pname, patient in patients.iterrows():
        patient = Patient(patient)
        if VERBOSE:
            # A single parenthesised argument prints the same text under both
            # Python 2 (print statement) and Python 3 (print function)
            print('%s %s' % (pname, patient.code))

        samples = patient.samples
        n_approx = samples['templates approx']

        dilutions = [get_dilution(x) for x in samples['dilutions']]
        # The factor 2 is kept as in the original code; its rationale is not
        # visible from this function
        n_dils = [2 * estimate_ntemplates_Poisson(dil) for dil in dilutions]

        # Attach sample date info
        # NOTE(review): 86400e9 is the number of nanoseconds in a day, so this
        # presumably expresses the age in days, assuming samples.date holds
        # nanosecond-resolution timestamps -- confirm the resulting dtype
        age = np.array((now - samples.date)) / 86400e9

        data.append({
            'n_approx': n_approx,
            'n_dil': n_dils,
            'age': age,
            'pname': patient.code,
        })

    return data
def _store_k31_from_hiv(reg, filepath):
    '''Fetch the frequency data through hivwholeseq, cache them on disk and return them'''
    # Location of the hivwholeseq package, kept exactly as in the original code
    sys.path.append("/scicore/home/neher/neher/HIV/hivwholeseq")
    from hivwholeseq.patients.patients import load_patients, Patient

    data = {}
    pats = load_patients(csv=True)
    fmt = "%d/%m/%Y"

    # The context manager closes the info file even if the loop is interrupted
    with open(filepath + 'K31_info_{}.txt'.format(reg), 'w') as fhandle:
        for pcode, pat in pats.iterrows():
            try:
                EDI = datetime.strptime(pat["infect date best"], fmt)
                P = Patient(pat)
                aft = P.get_allele_frequency_trajectories(reg, cov_min=500)[0]
                for si, (scode, sample) in enumerate(P.samples.iterrows()):
                    try:
                        date = datetime.strptime(sample["date"], fmt)
                        af = aft[si]
                        # Days between the infection date and the sample date
                        TI = date.toordinal() - EDI.toordinal()

                        # Save the arrays BEFORE listing the sample in the info
                        # file, so that the info file never points to arrays
                        # that failed to be written
                        prefix = filepath + '{}_{}_{}'.format(pcode, scode, reg)
                        np.save(prefix + '_data.npy', af.data)
                        np.save(prefix + '_mask.npy', af.mask)
                        fhandle.write('{}\t{}\t{}\n'.format(pcode, scode, TI))

                        data['{}_{}'.format(pcode, scode)] = (TI, af)
                        print(pcode, scode, "WORKED!!!")
                    # Best effort per sample, but let KeyboardInterrupt and
                    # SystemExit through and report why the sample failed
                    except Exception as err:
                        print(scode, "didn't work", err)
            # Best effort per patient, same policy as for the samples
            except Exception as err:
                print("skipping patient ", pcode, err)
    return data


def _read_k31_from_disk(reg, filepath):
    '''Rebuild the data dictionary from the info file and the stored .npy arrays'''
    data = {}
    with open(filepath + 'K31_info_{}.txt'.format(reg), 'r') as fhandle:
        for line in fhandle:
            words = line.split()
            if not words:
                # Blank line: there is no sample to load
                continue
            # The first two columns are patient and sample code, the third
            # one is the number of days since infection
            pat_name = '_'.join(words[:2])
            prefix = filepath + '{}_{}'.format(pat_name, reg)
            af_data = np.load(prefix + '_data.npy')
            af_mask = np.load(prefix + '_mask.npy')
            af = np.ma.masked_array(af_data, mask=af_mask)
            data[pat_name] = (int(words[2]), af)
    return data


def loadK31(reg, filepath, fromHIV=False):
    '''
    Loading data for 31 additional patients

    Input arguments:
    reg: name of genetic region (gag or pol)
    filepath: path to directory where the frequency data are to be
              stored/downloaded; it is used as a plain string prefix of the
              file names, so it has to end with the path separator
    fromHIV: download raw data and store them, if True;
             use stored data, if False

    Returns:
    dictionary mapping '<patient code>_<sample code>' to a tuple
    (days between infection date and sample date, masked array of frequencies)

    Files used (written if fromHIV, read otherwise):
    <filepath>K31_info_<reg>.txt: one tab-separated line per sample with
                                  patient code, sample code and days
    <filepath><patient>_<sample>_<reg>_data.npy and ..._mask.npy: data and
                                  mask of the frequency array of the sample
    '''
    if fromHIV:
        return _store_k31_from_hiv(reg, filepath)
    return _read_k31_from_disk(reg, filepath)
args = parser.parse_args() pnames = args.patients regions = args.regions VERBOSE = args.verbose plot = args.plot patients = load_patients() if pnames is not None: patients = patients.loc[pnames] pnames = patients.index.tolist() data = [] for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() for ifr, region in enumerate(regions): if VERBOSE >= 1: print pname, region try: dg, ind = patient.get_divergence(region, cov_min=10) except ValueError: continue times = patient.times[ind] data.append({'pname': pname, 'region': region, 'dg': dg, 't': times}) if VERBOSE >= 1: