示例#1
0
 def get_initial_allele_counts(self, fragment):
     '''Get allele counts from the initial time point.

     Go through self.samples in their stored order (presumably sorted by
     time, earliest first -- confirm where self.samples is built) and use
     the first sample whose allele counts file for this fragment exists.

     Parameters:
        fragment: fragment identifier, passed unchanged to
                  SamplePat.get_allele_counts_filename and
                  SamplePat.get_allele_counts.

     Returns:
        the result of SamplePat.get_allele_counts(fragment) for the first
        sample that has a counts file on disk, or None if no sample has one.
     '''
     import os
     from hivwholeseq.patients.samples import SamplePat

     # iterrows() yields the rows in the same positional order as indexing
     # with .iloc, without relying on the Python 2-only xrange.
     for _, row in self.samples.iterrows():
         sample = SamplePat(row)
         if os.path.isfile(sample.get_allele_counts_filename(fragment)):
             return sample.get_allele_counts(fragment)

     # No sample has allele counts for this fragment
     return None
示例#2
0
 def get_initial_allele_counts(self, fragment):
     '''Get allele counts from the initial time point.

     Go through self.samples in their stored order (presumably sorted by
     time, earliest first -- confirm where self.samples is built) and use
     the first sample whose allele counts file for this fragment exists.

     Parameters:
        fragment: fragment identifier, passed unchanged to
                  SamplePat.get_allele_counts_filename and
                  SamplePat.get_allele_counts.

     Returns:
        the result of SamplePat.get_allele_counts(fragment) for the first
        sample that has a counts file on disk, or None if no sample has one.
     '''
     import os
     from hivwholeseq.patients.samples import SamplePat

     # iterrows() yields the rows in the same positional order as indexing
     # with .iloc, without relying on the Python 2-only xrange.
     for _, row in self.samples.iterrows():
         sample = SamplePat(row)
         if os.path.isfile(sample.get_allele_counts_filename(fragment)):
             return sample.get_allele_counts(fragment)

     # No sample has allele counts for this fragment
     return None
示例#3
0
                print protein, samplename

            sample = SamplePat(sample)

            # NOTE: How do we find what fragment covers the protein? Well, a
            # protein can happily cross fragments. Since each
            # codon is independent, we should iterate over codons. We do not
            # do that for efficiency reasons. Instead, we identify all potential
            # fragments and split the protein into full codon chunks covered by
            # a single fragment.
            fragment_rois = sample.get_fragments_covered(
                protein, include_coordinates=True)

            refseq = sample.get_reference(protein)
            fn_out = sample.get_allele_counts_filename(protein,
                                                       PCR=PCR,
                                                       qual_min=qual_min,
                                                       type='aa')

            from hivwholeseq.utils.sequence import alphaa
            count = np.zeros((len(alphaa), len(refseq) // 3), int)
            for frroi in fragment_rois:
                fragment = frroi['name']
                start_fr, end_fr = frroi['fragment']
                start, end = frroi['roi']

                # Check that we align with codons
                rf = start % 3
                if rf:
                    start_fr += 3 - rf
                    start += 3 - rf
                rf = end % 3
        sample = SamplePat(sample)
        pname = sample.patient
        conss_genomewide = SeqIO.read(
            get_initial_reference_filename(pname, 'genomewide'), 'fasta')

        # Collect the allele counts (where possible)
        acs = []
        for fragment in ['F' + str(i) for i in xrange(1, 7)]:
            try:
                ref = ''.join(
                    SeqIO.read(get_initial_reference_filename(pname, fragment),
                               'fasta'))
                ac = sample.get_allele_counts(fragment, merge_read_types=False)
                acs.append((fragment, ref, ac))
            except IOError:
                continue

        if not len(acs):
            if VERBOSE >= 1:
                print 'No data found: skipping'
            continue

        # Merge allele counts
        ac = merge_allele_counts(conss_genomewide, acs, VERBOSE=VERBOSE)
        if save_to_file:
            fn_out = sample.get_allele_counts_filename('genomewide')
            np.save(fn_out, ac)
            if VERBOSE >= 1:
                print 'Genomewide allele counts saved to:', fn_out
    for fragment in fragments:
        counts = []
        for samplename, sample in samples.iterrows():
            if submit:
                fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min)
                continue

            if VERBOSE >= 1:
                print fragment, samplename

            sample = SamplePat(sample)
            pname = sample.patient
            refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta')

            fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR)
            if not os.path.isfile(fn):
                warn('No BAM file found', NoDataWarning)
                continue

            count, _ = gac(fn, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE)
            counts.append(count)

            if save_to_file:
                fn_out = sample.get_allele_counts_filename(fragment, PCR=PCR,
                                                           qual_min=qual_min)
                count.dump(fn_out)

                if VERBOSE >= 2:
                    print 'Allele counts saved:', samplename, fragment
                fork_self(samplename,
                          fragment,
                          VERBOSE=VERBOSE,
                          qual_min=qual_min)
                continue

            if VERBOSE >= 1:
                print fragment, samplename

            sample = SamplePat(sample)
            pname = sample.patient
            refseq = SeqIO.read(
                get_initial_reference_filename(pname, fragment), 'fasta')

            fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR)
            if not os.path.isfile(fn):
                warn('No BAM file found', NoDataWarning)
                continue

            count, _ = gac(fn, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE)
            counts.append(count)

            if save_to_file:
                fn_out = sample.get_allele_counts_filename(fragment,
                                                           PCR=PCR,
                                                           qual_min=qual_min)
                count.dump(fn_out)

                if VERBOSE >= 2:
                    print 'Allele counts saved:', samplename, fragment
                print protein, samplename

            sample = SamplePat(sample)

            # NOTE: How do we find what fragment covers the protein? Well, a
            # protein can happily cross fragments. Since each
            # codon is independent, we should iterate over codons. We do not
            # do that for efficiency reasons. Instead, we identify all potential
            # fragments and split the protein into full codon chunks covered by
            # a single fragment.
            fragment_rois = sample.get_fragments_covered(protein,
                                                         include_coordinates=True)
            
            refseq = sample.get_reference(protein)
            fn_out = sample.get_allele_counts_filename(protein, PCR=PCR,
                                                       qual_min=qual_min,
                                                       type='aa')

            from hivwholeseq.utils.sequence import alphaa
            count = np.zeros((len(alphaa), len(refseq) // 3), int)
            for frroi in fragment_rois:
                fragment = frroi['name']
                start_fr, end_fr = frroi['fragment']
                start, end = frroi['roi']

                # Check that we align with codons
                rf = start % 3
                if rf:
                    start_fr += 3 - rf
                    start += 3 - rf
                rf = end % 3
    # For every sample, merge the per-fragment allele counts (F1-F6) into a
    # single genome-wide array and optionally save it to file.
    for samplename, sample in samples.iterrows():
        if VERBOSE >= 1:
            print samplename

        # Wrap the table row in a SamplePat, which provides the patient name
        # and the allele count loaders/filenames used below
        sample = SamplePat(sample)
        pname = sample.patient
        # Genome-wide initial reference of this patient, read from FASTA; it
        # is handed to merge_allele_counts together with the fragment data
        conss_genomewide = SeqIO.read(get_initial_reference_filename(pname, 'genomewide'), 'fasta')

        # Collect the allele counts (where possible)
        acs = []
        for fragment in ['F'+str(i) for i in xrange(1, 7)]:
            try:
                # Fragment reference flattened to a plain string
                ref = ''.join(SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta'))
                # NOTE(review): merge_read_types=False presumably keeps the
                # counts split by read type -- confirm in SamplePat
                ac = sample.get_allele_counts(fragment, merge_read_types=False)
                acs.append((fragment, ref, ac))
            except IOError:
                # Reference or counts could not be read (presumably a missing
                # file): leave this fragment out of the merge
                continue

        # None of the six fragments could be loaded: go to the next sample
        if not len(acs):
            if VERBOSE >= 1:
                print 'No data found: skipping'
            continue

        # Merge allele counts
        ac = merge_allele_counts(conss_genomewide, acs, VERBOSE=VERBOSE)
        if save_to_file:
            # The merged array is stored under the sample's 'genomewide'
            # allele counts filename
            fn_out = sample.get_allele_counts_filename('genomewide')
            np.save(fn_out, ac)
            if VERBOSE >= 1:
                print 'Genomewide allele counts saved to:', fn_out