for samplename, sample in samples.iterrows():
        if VERBOSE >= 1:
            print samplename

        sample = SamplePat(sample)
        pname = sample.patient
        ref = sample.get_reference("genomewide", "gb")

        # Collect the insertions (where possible)
        ics = {}
        for fragment in ["F" + str(i) for i in xrange(1, 7)]:
            try:
                ic = sample.get_insertions(fragment, merge_read_types=False)
            except IOError:
                continue
            start = find_annotation(ref, fragment).location.nofuzzy_start
            ics[(fragment, start)] = ic

        if not len(ics):
            if VERBOSE >= 1:
                print "No data found: skipping"
            continue

        # Merge insertions
        ic = merge_insertions(ics, VERBOSE=VERBOSE)
        if save_to_file:
            fn_out = sample.get_insertions_filename("genomewide")
            save_insertions(fn_out, ic)
            if VERBOSE >= 1:
                print "Genomewide insertions saved to:", fn_out
Пример #2
0
    parser.add_argument('--reference', default='HXB2',
                        help='Reference to use for alignment')
    parser.add_argument('--verbose', type=int, default=0,
                        help='Verbosity level [0-4]')
    parser.add_argument('--subtypes', nargs='+', default=['B'],
                        help='Subtypes to keep')

    args = parser.parse_args()
    regions = args.regions
    refname = args.reference
    VERBOSE = args.verbose
    subtypes = args.subtypes


    from hivwholeseq.reference import load_custom_reference
    from hivwholeseq.utils.sequence import find_annotation
    ref = load_custom_reference('HXB2', 'gb')

    for region in regions:
        regm = np.array(find_annotation(ref, region).extract(ref), 'S1')
        for subtype in subtypes:
            fn = get_subtype_reference_alignment_filename(region,
                                                          subtype=subtype,
                                                          refname=refname,
                                                          VERBOSE=VERBOSE)
            alim = np.array(AlignIO.read(fn, 'fasta'), 'S1')
            weird = ((alim != regm).mean(axis=1) > 0.2)
            print region, subtype, weird.sum()
                

Пример #3
0
                        help='Reference to use for alignment')
    parser.add_argument('--verbose',
                        type=int,
                        default=0,
                        help='Verbosity level [0-4]')
    parser.add_argument('--subtypes',
                        nargs='+',
                        default=['B'],
                        help='Subtypes to keep')

    args = parser.parse_args()
    regions = args.regions
    refname = args.reference
    VERBOSE = args.verbose
    subtypes = args.subtypes

    from hivwholeseq.reference import load_custom_reference
    from hivwholeseq.utils.sequence import find_annotation
    ref = load_custom_reference('HXB2', 'gb')

    for region in regions:
        regm = np.array(find_annotation(ref, region).extract(ref), 'S1')
        for subtype in subtypes:
            fn = get_subtype_reference_alignment_filename(region,
                                                          subtype=subtype,
                                                          refname=refname,
                                                          VERBOSE=VERBOSE)
            alim = np.array(AlignIO.read(fn, 'fasta'), 'S1')
            weird = ((alim != regm).mean(axis=1) > 0.2)
            print region, subtype, weird.sum()
Пример #4
0
def correlate_epitope_substitution(ds, dctl):
    '''Correlate presence of a substitution with epitope'''
    from hivwholeseq.data.primers import primers_coordinates_HXB2_outer
    start_F1 = primers_coordinates_HXB2_outer['F1'][0][1]
    end_F6 = primers_coordinates_HXB2_outer['F6'][1][0]

    ds = ds.copy()

    dg = []
    for pcode, datum in dctl.groupby('pcode'):
        a = np.arange(start_F1, end_F6)
        b = np.zeros(len(a), bool)
        for _, epi in datum.iterrows():
            b[(a >= epi['start_HXB2']) & (a < epi['end_HXB2'])] = True
        c = np.zeros(len(a), bool)
        datum = ds.loc[ds['pcode'] == pcode]
        # Keep only nonsyn substitutions
        datum = datum.loc[datum['syn'] == False]
        c[datum['pos_ref'] - a[0]] = True
        dat = {
            'pos': a,
            'epitope': b,
            'substitution': c,
        }
        dat = pd.DataFrame(dat)
        dat['pcode'] = pcode
        dg.append(dat)
    dg = pd.concat(dg)

    # Exclude env because it has antibody-related substitutions
    from hivwholeseq.reference import load_custom_reference
    from hivwholeseq.utils.sequence import find_annotation
    ref = load_custom_reference('HXB2', 'gb')
    start_env = find_annotation(ref, 'gp41').location.nofuzzy_start
    end_env = find_annotation(ref, 'gp41').location.nofuzzy_end - 450
    dg = dg.loc[(dg['pos'] < start_env) | (dg['pos'] >= end_env)]

    M = dg.groupby(['epitope', 'substitution']).size().unstack()
    Ma = np.array(M)
    xp = 1.0 * Ma[1, 0] / Ma[0, 0] * Ma[0, 1]
    xs = Ma[1, 1] - xp
    print M
    from scipy.stats import fisher_exact
    print 'Fisher\'s exact enrichment:', fisher_exact(Ma)[0]
    print 'Fisher\'s exact P value:', fisher_exact(Ma)[1]
    print 'expected:', xp
    print 'excess:', xs, 'per patient:', xs / 9.0

    pos_epi = dg.loc[dg['epitope'] == True]['pos'].unique()
    dg2 = dg.loc[dg['pos'].isin(pos_epi)].copy()
    M2 = dg2.groupby(['epitope', 'substitution']).size().unstack()
    M2a = np.array(M2)
    xp = 1.0 * M2a[1, 0] / M2a[0, 0] * M2a[0, 1]
    xs = M2a[1, 1] - xp
    print M2
    print '\nFisher\'s exact enrichment:', fisher_exact(M2a)[0]
    print 'Fisher\'s exact P value:', fisher_exact(M2a)[1]
    print 'expected:', xp
    print 'excess:', xs, 'per patient:', xs / 9.0
    return {
        'dg': dg,
        'dg2': dg2,
    }
    for samplename, sample in samples.iterrows():
        if VERBOSE >= 1:
            print samplename

        sample = SamplePat(sample)
        pname = sample.patient
        ref = sample.get_reference('genomewide', 'gb')

        # Collect the insertions (where possible)
        ics = {}
        for fragment in ['F' + str(i) for i in xrange(1, 7)]:
            try:
                ic = sample.get_insertions(fragment, merge_read_types=False)
            except IOError:
                continue
            start = find_annotation(ref, fragment).location.nofuzzy_start
            ics[(fragment, start)] = ic

        if not len(ics):
            if VERBOSE >= 1:
                print 'No data found: skipping'
            continue

        # Merge insertions
        ic = merge_insertions(ics, VERBOSE=VERBOSE)
        if save_to_file:
            fn_out = sample.get_insertions_filename('genomewide')
            save_insertions(fn_out, ic)
            if VERBOSE >= 1:
                print 'Genomewide insertions saved to:', fn_out