Exemplo n.º 1
0
def loadK31(reg, filepath, fromHIV=False):
    '''
    Load allele-frequency data for the 31 additional patients.

    Input arguments:
    reg: name of genetic region (gag or pol)
    filepath: location where the frequency data are stored/downloaded. It is
        concatenated verbatim with the file names, so a directory has to be
        given including its trailing path separator.
    fromHIV: download raw data and store them, if True; use stored data, if False

    Returns:
    dict mapping '<patient code>_<sample code>' to a tuple (TI, af), where TI
    is the number of days between the patient's "infect date best" and the
    sample date, and af holds the allele frequencies of that sample (a numpy
    masked array when read back from the stored files).

    Files used (all prefixed with filepath):
    K31_info_<reg>.txt                  tab-separated index: pcode, scode, TI
    <pcode>_<scode>_<reg>_data.npy      frequency values
    <pcode>_<scode>_<reg>_mask.npy      mask belonging to the values

    With fromHIV=True, patients or samples that raise an error are reported on
    stdout and skipped (best effort); the index file is rewritten from scratch.
    '''
    data = {}
    info_path = filepath + 'K31_info_{}.txt'.format(reg)
    if fromHIV:
        hiv_src = "/scicore/home/neher/neher/HIV/hivwholeseq"
        # Avoid growing sys.path by one duplicate entry per call.
        if hiv_src not in sys.path:
            sys.path.append(hiv_src)
        from hivwholeseq.patients.patients import load_patients, Patient
        pats = load_patients(csv=True)
        fmt = "%d/%m/%Y"
        # The context manager closes the index file even if the loop is
        # interrupted (e.g. by Ctrl-C, which is no longer swallowed below).
        with open(info_path, 'w') as fhandle:
            for pcode, pat in pats.iterrows():
                try:
                    EDI = datetime.strptime(pat["infect date best"], fmt)
                    P = Patient(pat)
                    # NOTE(review): only the first return value is kept and it
                    # is indexed below by the position of the sample in
                    # P.samples. If the call drops samples (e.g. because of
                    # cov_min), positions may not line up -- TODO confirm
                    # against the second return value of this call.
                    aft = P.get_allele_frequency_trajectories(
                        reg, cov_min=500)[0]
                    for si, (scode, sample) in enumerate(P.samples.iterrows()):
                        try:
                            date = datetime.strptime(sample["date"], fmt)
                            af = aft[si]
                            TI = date.toordinal() - EDI.toordinal()
                            stem = filepath + '{}_{}_{}'.format(
                                pcode, scode, reg)
                            # Save the arrays first and only then list the
                            # sample in the index, so that the index never
                            # points at files that were not written.
                            np.save(stem + '_data.npy', af.data)
                            np.save(stem + '_mask.npy', af.mask)
                            fhandle.write(
                                '{}\t{}\t{}\n'.format(pcode, scode, TI))
                            data['{}_{}'.format(pcode, scode)] = (TI, af)
                            print(pcode, scode, "WORKED!!!")
                        except Exception as err:
                            # Best effort: report the cause, go on with the
                            # next sample.
                            print(scode, "didn't work", err)

                except Exception as err:
                    # Best effort: report the cause, go on with the next
                    # patient.
                    print("skipping patient ", pcode, err)
    else:
        with open(info_path, 'r') as fhandle:
            for line in fhandle:
                words = line.split()
                if not words:
                    # Tolerate blank lines (e.g. a trailing newline added by
                    # an editor) instead of failing on words[2].
                    continue
                pat_name = '_'.join(words[:2])
                stem = filepath + '{}_{}'.format(pat_name, reg)
                af_data = np.load(stem + '_data.npy')
                af_mask = np.load(stem + '_mask.npy')
                af = np.ma.masked_array(af_data, mask=af_mask)
                data[pat_name] = (int(words[2]), af)
    return data
    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    data = []

    for pname, patient in patients.iterrows():
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        for ifr, region in enumerate(regions):
            if VERBOSE >= 1:
                print pname, region

            aft, ind = patient.get_allele_frequency_trajectories(region,
                                                                 cov_min=10)
            times = patient.times[ind]

            dg = get_divergence(aft)
            ds = get_diversity(aft)

            data.append({'pname': pname, 'region': region, 'dg': dg, 'ds': ds, 't': times})

    if plot:
        fig, ax = plt.subplots(1, 1)
        ax.set_xlabel('Time from transmission [days]')
        ax.set_ylabel('Divergence [solid]\nDiversity [dashed]')
        #ax.set_yscale('log')

        for i, d in enumerate(data):
            pname = d['pname']
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 2:
        print 'fragments', fragments

    dgs = {}
    dss = {}

    for pname, patient in patients.iterrows():
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        for ifr, fragment in enumerate(fragments):
            if VERBOSE >= 1:
                print pname, fragment

            aft, ind = patient.get_allele_frequency_trajectories(fragment,
                                                                 cov_min=100)

            # NOTE: Ns should be excluded from diversity and divergence
            aft = aft[:, :5, :]

            if use_sliding:
                (x, dg, ds) = get_divergence_diversity_sliding(aft, block_length,
                                                               VERBOSE=VERBOSE)
            else:
                (x, dg, ds) = get_divergence_diversity_blocks(aft, block_length,
                                                              VERBOSE=VERBOSE)

            # FIXME: avoid this var to get different conv and aft indices
            times = patient.times[ind]
            dgs[(pname, fragment)] = (patient.times[ind], dg)
            dss[(pname, fragment)] = (patient.times[ind], ds)
Exemplo n.º 4
0
    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        for fragment in fragments:
            if VERBOSE >= 1:
                print patient.name, fragment

            mapco = patient.get_map_coordinates_reference(fragment, refname=refname)

            if VERBOSE >= 2:
                print 'Get initial allele frequencies'
            af0 = patient.get_initial_allele_frequencies(fragment, cov_min=depth_min)

            if VERBOSE >= 2:
                print 'Get allele frequencies'
            aft, ind = patient.get_allele_frequency_trajectories(fragment,
                                                                 depth_min=depth_min)

            if VERBOSE >= 2:
                print 'Filter out masked positions'
            ind_nonmasked = -aft.mask.any(axis=0).any(axis=0)

            if VERBOSE >= 2:
                print 'Remove first time sample'
            aft_der = aft[int(0 in ind):].copy()

            if VERBOSE >= 2:
                print 'Filter out ancestral alleles'
            for i, ai in enumerate(af0.argmax(axis=0)):
                aft_der[:, ai, i] = 0
                # take out everything at high frequency in first sample to
                # improve polarization
Exemplo n.º 5
0
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        t_bds = []
        t_loss = []
        t_fixs = []
        n_staypolys = []
        for fragment in fragments:
            if VERBOSE >= 1:
                print fragment

            # Collect allele counts from patient samples, and return only positive hits
            # sns contains sample names and PCR types
            (aft, ind) = patient.get_allele_frequency_trajectories(
                fragment,
                cov_min=cov_min,
                depth_min=depth_min,
                VERBOSE=VERBOSE)
            times = patient.times[ind]
            ntemplates = patient.n_templates[ind]

            n_staypoly = 0
            t_bd = []
            t_fix = []
            t_los = []
            for pos in xrange(aft.shape[2]):
                for ia, a in enumerate(alpha):
                    aft_pos = aft[:, ia, pos]

                    # Keep only polymorphic
                    ipos0 = (aft_pos > af0[0]) & (aft_pos < af0[1])
Exemplo n.º 6
0
    args = parser.parse_args()
    pnames = args.patients
    roi = args.roi
    VERBOSE = args.verbose
    use_plot = args.plot
    use_interactive = args.interactive

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        (fragment, start, end) = patient.get_fragmented_roi(roi, VERBOSE=VERBOSE)
        aft, ind = patient.get_allele_frequency_trajectories(fragment)
        aft = aft[:, :, start:end]

        # TODO: also calculate the logos

        ## Get only some time points
        # i = np.arange(len(ind))[::len(ind) // 2]
        # aft = aft[i]
        # ind = ind[i]

        times = patient.times[ind]

        if use_plot:
            fig, axs = plt.subplots(aft.shape[0], 1, figsize=(14, 3 * aft.shape[0]))

            for i, (ax, af) in enumerate(izip(axs, aft)):
        print pname
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        t_bds = []
        t_loss = []
        t_fixs = []
        n_staypolys = []
        for fragment in fragments:
            if VERBOSE >= 1:
                print fragment

            # Collect allele counts from patient samples, and return only positive hits
            # sns contains sample names and PCR types
            (aft, ind) = patient.get_allele_frequency_trajectories(fragment,
                                                               cov_min=cov_min,
                                                               depth_min=depth_min,
                                                               VERBOSE=VERBOSE)
            times = patient.times[ind]
            ntemplates = patient.n_templates[ind]

            n_staypoly = 0
            t_bd = []
            t_fix = []
            t_los = []
            for pos in xrange(aft.shape[2]):
                for ia, a in enumerate(alpha):
                    aft_pos = aft[:, ia, pos]
                    
                    # Keep only polymorphic
                    ipos0 = (aft_pos > af0[0]) & (aft_pos < af0[1])
                    if not ipos0.any():
Exemplo n.º 8
0
    pnames = args.patients
    roi = args.roi
    VERBOSE = args.verbose
    use_plot = args.plot
    use_interactive = args.interactive

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        (fragment, start, end) = patient.get_fragmented_roi(roi,
                                                            VERBOSE=VERBOSE)
        aft, ind = patient.get_allele_frequency_trajectories(fragment)
        aft = aft[:, :, start:end]

        # TODO: also calculate the logos

        ## Get only some time points
        #i = np.arange(len(ind))[::len(ind) // 2]
        #aft = aft[i]
        #ind = ind[i]

        times = patient.times[ind]

        if use_plot:
            fig, axs = plt.subplots(aft.shape[0],
                                    1,
                                    figsize=(14, 3 * aft.shape[0]))
Exemplo n.º 9
0
        fragments = ['F' + str(i) for i in xrange(1, 7)]
    if VERBOSE >= 2:
        print 'fragments', fragments

    dgs = {}
    dss = {}

    for pname, patient in patients.iterrows():
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        for ifr, fragment in enumerate(fragments):
            if VERBOSE >= 1:
                print pname, fragment

            aft, ind = patient.get_allele_frequency_trajectories(fragment,
                                                                 cov_min=100)

            # NOTE: Ns should be excluded from diversity and divergence
            aft = aft[:, :5, :]

            if use_sliding:
                (x, dg, ds) = get_divergence_diversity_sliding(aft,
                                                               block_length,
                                                               VERBOSE=VERBOSE)
            else:
                (x, dg, ds) = get_divergence_diversity_blocks(aft,
                                                              block_length,
                                                              VERBOSE=VERBOSE)

            # FIXME: avoid this var to get different conv and aft indices
            times = patient.times[ind]