def get_initial_allele_counts(self, fragment):
    '''Get allele counts from the earliest sample that has them on disk

    The samples are scanned in the order of self.samples and the first one
    whose allele counts file exists is used, so the result is not
    necessarily taken from the very first sample.

    Parameters:
       fragment: identifier forwarded unchanged to the sample's
                 get_allele_counts_filename / get_allele_counts methods.

    Returns:
       the value of sample.get_allele_counts(fragment) for the first sample
       with an existing allele counts file, or None if no sample has one.
    '''
    import os
    from hivwholeseq.patients.samples import SamplePat

    # Iterate over the rows directly rather than via positional indices
    for _, row in self.samples.iterrows():
        sample = SamplePat(row)
        if os.path.isfile(sample.get_allele_counts_filename(fragment)):
            return sample.get_allele_counts(fragment)

    # No sample has an allele counts file for this fragment
    return None
if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments for fragment in fragments: counts = [] for samplename, sample in samples.iterrows(): if submit: fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min) continue if VERBOSE >= 1: print fragment, samplename sample = SamplePat(sample) pname = sample.patient refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta') fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR) if not os.path.isfile(fn): warn('No BAM file found', NoDataWarning) continue count, _ = gac(fn, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE) counts.append(count) if save_to_file: fn_out = sample.get_allele_counts_filename(fragment, PCR=PCR, qual_min=qual_min) count.dump(fn_out)
VERBOSE = args.verbose qual_min = args.qualmin use_plot = args.plot samples = lssp() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() for region in regions: for samplename, sample in samples.iterrows(): sample = SamplePat(sample) if VERBOSE >= 1: print region, samplename count = sample.get_allele_counts(region, qual_min=qual_min) if use_plot: x = np.tile(np.arange(count.shape[1]), (count.shape[0], 1)) color = np.tile(np.arange(count.shape[0]), (count.shape[1], 1)).T fig, ax = plt.subplots(figsize=(12, 6)) ax.scatter(x, count + 0.1, lw=2, c=color) ax.set_xlabel('Position [bp]') ax.set_ylabel('Coverage')
for fragment in fragments: for samplename, sample in samples.iterrows(): fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min, PCR=PCR, maxreads=maxreads, use_tests=use_tests) sys.exit() counts_all = [] for fragment in fragments: counts = [] for samplename, sample in samples.iterrows(): sample = SamplePat(sample) pname = sample.patient if VERBOSE >= 2: print pname, fragment, samplename refseq = SeqIO.read( get_initial_reference_filename(pname, fragment), 'fasta') fn_out = sample.get_allele_cocounts_filename(fragment, PCR=PCR, qual_min=qual_min, compressed=True) fn = sample.get_mapped_filtered_filename( fragment, PCR=PCR, decontaminated=True) #FIXME if save_to_file:
VERBOSE = args.verbose qual_min = args.qualmin use_plot = args.plot samples = lssp() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() for protein in proteins: for samplename, sample in samples.iterrows(): sample = SamplePat(sample) if VERBOSE >= 1: print protein, samplename count = sample.get_allele_counts_aa(protein, qual_min=qual_min) if use_plot: x = np.tile(np.arange(count.shape[1]), (count.shape[0], 1)) color = np.tile(np.arange(count.shape[0]), (count.shape[1], 1)).T fig, ax = plt.subplots(figsize=(12, 6)) ax.scatter(x, count + 0.1, lw=2, c=color) ax.set_xlabel('Position [aa]') ax.set_ylabel('Coverage')
print 'fragments', fragments for fragment in fragments: inses = [] for samplename, sample in samples.iterrows(): if submit: fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min) continue if VERBOSE >= 1: print fragment, samplename sample = SamplePat(sample) pname = sample.patient refseq = SeqIO.read( get_initial_reference_filename(pname, fragment), 'fasta') fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR) if not os.path.isfile(fn): warn('No BAM file found', NoDataWarning) continue _, inse = gac(fn, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE) inses.append(inse) if save_to_file: fn_out = sample.get_insertions_filename(fragment, PCR=PCR,
def itersamples(self):
    '''Iterate over the samples of this patient

    Yields:
       one SamplePat per row of self.samples, in table order, so that each
       sample comes with the extended attributes of that class.
    '''
    from hivwholeseq.patients.samples import SamplePat

    # The row label is not needed, only the row itself
    for _, row in self.samples.iterrows():
        yield SamplePat(row)
args = parser.parse_args() pnames = args.patients samplenames = args.samples VERBOSE = args.verbose use_save = args.save fragments = ['F' + str(i + 1) for i in xrange(6)] samples = load_samples_sequenced() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] for samplename, sample in samples.iterrows(): sample = SamplePat(sample) if VERBOSE >= 1: print samplename dist_hists = [] samples_seq = sample.get_sequenced_samples() samples_seq = samples_seq.loc[samples_seq.PCR == 1] for samplename_seq, sample_seq in samples_seq.iterrows(): sample_seq = SampleSeq(sample_seq) data_folder = sample_seq.seqrun_folder adaID = sample_seq.adapter for fragment in fragments: try: dist_hist = get_distance_histogram(data_folder, adaID,
VERBOSE = args.verbose use_save = args.save use_plot = args.plot samples = load_samples_sequenced() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() data = defaultdict(dict) for samplename, sample in samples.iterrows(): sample = SamplePat(sample) if VERBOSE >= 1: print samplename for (fr1, fr2) in izip(fragments[:-1], fragments[1:]): try: ac1 = sample.get_allele_counts(fr1) ac2 = sample.get_allele_counts(fr2) except IOError: continue if VERBOSE >= 2: print fr1, fr2 # Filter positions by coverage covmin = 100
if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments if submit: for fragment in fragments: for samplename, sample in samples.iterrows(): fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min, PCR=PCR) sys.exit() for samplename, sample in samples.iterrows(): sample = SamplePat(sample) pname = sample.patient for fragment in fragments: if VERBOSE >= 1: print pname, samplename, fragment fn = sample.get_allele_cocounts_filename(fragment, PCR=PCR, qual_min=qual_min, compressed=False) fn_out = sample.get_allele_cocounts_filename(fragment, PCR=PCR, qual_min=qual_min, compressed=True)
fragments = args.fragments submit = args.submit VERBOSE = args.verbose n_pairs = args.maxreads summary = args.summary PCR = args.PCR # Collect all sequenced samples from patients samples_pat = lssp() if pnames is not None: samples_seq = [] for pname in pnames: patient = load_patient(pname) patient.discard_nonsequenced_samples() for samplename_pat, sample_pat in patient.samples.iterrows(): sample_pat = SamplePat(sample_pat) samples_seq.append(sample_pat.samples_seq) samples_seq = pd.concat(samples_seq) elif samplenames is not None: samples_seq = lss() ind = samples_pat.index.isin(samplenames) samplenames_pat = samples_pat.index[ind] samples_seq = samples_seq.loc[samples_seq['patient sample'].isin(samplenames_pat)] else: samples_seq = lss() samples_seq = samples_seq.loc[samples_seq['patient sample'].isin(samples_pat.index)] if PCR != 'all':
if VERBOSE >= 3: print 'fragments', fragments if submit: for fragment in fragments: for samplename, sample in samples.iterrows(): fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min, PCR=PCR, maxreads=maxreads, use_tests=use_tests) sys.exit() counts_all = [] for fragment in fragments: counts = [] for samplename, sample in samples.iterrows(): sample = SamplePat(sample) pname = sample.patient if VERBOSE >= 2: print pname, fragment, samplename refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta') fn_out = sample.get_allele_cocounts_filename(fragment, PCR=PCR, qual_min=qual_min, compressed=True) fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR, decontaminated=True) #FIXME if save_to_file: cocount = gac(fn, len(refseq), maxreads=maxreads,
print 'fragments', fragments for fragment in fragments: counts = [] for samplename, sample in samples.iterrows(): if submit: fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min) continue if VERBOSE >= 1: print fragment, samplename sample = SamplePat(sample) pname = sample.patient refseq = SeqIO.read( get_initial_reference_filename(pname, fragment), 'fasta') fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR) if not os.path.isfile(fn): warn('No BAM file found', NoDataWarning) continue count, _ = gac(fn, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE) counts.append(count) if save_to_file: fn_out = sample.get_allele_counts_filename(fragment, PCR=PCR,
if VERBOSE >= 2: print 'samples', samples.index.tolist() counts_all = [] for protein in proteins: counts = [] for samplename, sample in samples.iterrows(): if submit: fork_self(samplename, protein, VERBOSE=VERBOSE, qual_min=qual_min) continue if VERBOSE >= 1: print protein, samplename sample = SamplePat(sample) # NOTE: How do we find what fragment covers the protein? Well, a # protein can happily cross fragments. Since each # codon is independent, we should iterate over codons. We do not # do that for efficiency reasons. Instead, we identify all potential # fragments and split the protein into full codon chunks covered by # a single fragment. fragment_rois = sample.get_fragments_covered(protein, include_coordinates=True) refseq = sample.get_reference(protein) fn_out = sample.get_allele_counts_filename(protein, PCR=PCR, qual_min=qual_min, type='aa')
VERBOSE = args.verbose qual_min = args.qualmin use_plot = args.plot samples = lssp() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() for protein in proteins: for samplename, sample in samples.iterrows(): sample = SamplePat(sample) if VERBOSE >= 1: print protein, samplename count = sample.get_allele_counts_aa(protein, qual_min=qual_min) if use_plot: x = np.tile(np.arange(count.shape[1]), (count.shape[0], 1)) color = np.tile(np.arange(count.shape[0]), (count.shape[1], 1)).T fig, ax = plt.subplots(figsize=(12, 6)) ax.scatter(x, count + 0.1, lw=2, c=color) ax.set_xlabel('Position [aa]')
VERBOSE = args.verbose qual_min = args.qualmin use_plot = args.plot samples = lssp() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() for region in regions: for samplename, sample in samples.iterrows(): sample = SamplePat(sample) if VERBOSE >= 1: print region, samplename count = sample.get_allele_counts(region, qual_min=qual_min) if use_plot: x = np.tile(np.arange(count.shape[1]), (count.shape[0], 1)) color = np.tile(np.arange(count.shape[0]), (count.shape[1], 1)).T fig, ax = plt.subplots(figsize=(12, 6)) ax.scatter(x, count + 0.1, lw=2, c=color) ax.set_xlabel('Position [bp]')
PCR = args.PCR samples = lssp() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print "samples", samples.index.tolist() for samplename, sample in samples.iterrows(): if VERBOSE >= 1: print samplename sample = SamplePat(sample) pname = sample.patient ref = sample.get_reference("genomewide", "gb") # Collect the insertions (where possible) ics = {} for fragment in ["F" + str(i) for i in xrange(1, 7)]: try: ic = sample.get_insertions(fragment, merge_read_types=False) except IOError: continue start = find_annotation(ref, fragment).location.nofuzzy_start ics[(fragment, start)] = ic if not len(ics): if VERBOSE >= 1:
PCR = args.PCR samples = lssp() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() for samplename, sample in samples.iterrows(): if VERBOSE >= 1: print samplename sample = SamplePat(sample) pname = sample.patient conss_genomewide = SeqIO.read(get_initial_reference_filename(pname, 'genomewide'), 'fasta') # Collect the allele counts (where possible) acs = [] for fragment in ['F'+str(i) for i in xrange(1, 7)]: try: ref = ''.join(SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta')) ac = sample.get_allele_counts(fragment, merge_read_types=False) acs.append((fragment, ref, ac)) except IOError: continue if not len(acs): if VERBOSE >= 1:
PCR = args.PCR samples = lssp() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() for samplename, sample in samples.iterrows(): if VERBOSE >= 1: print samplename sample = SamplePat(sample) pname = sample.patient conss_genomewide = SeqIO.read( get_initial_reference_filename(pname, 'genomewide'), 'fasta') # Collect the allele counts (where possible) acs = [] for fragment in ['F' + str(i) for i in xrange(1, 7)]: try: ref = ''.join( SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta')) ac = sample.get_allele_counts(fragment, merge_read_types=False) acs.append((fragment, ref, ac)) except IOError: continue
PCR = args.PCR samples = lssp() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples.index.tolist() for samplename, sample in samples.iterrows(): if VERBOSE >= 1: print samplename sample = SamplePat(sample) pname = sample.patient ref = sample.get_reference('genomewide', 'gb') # Collect the insertions (where possible) ics = {} for fragment in ['F' + str(i) for i in xrange(1, 7)]: try: ic = sample.get_insertions(fragment, merge_read_types=False) except IOError: continue start = find_annotation(ref, fragment).location.nofuzzy_start ics[(fragment, start)] = ic if not len(ics): if VERBOSE >= 1:
counts_all = [] for protein in proteins: counts = [] for samplename, sample in samples.iterrows(): if submit: fork_self(samplename, protein, VERBOSE=VERBOSE, qual_min=qual_min) continue if VERBOSE >= 1: print protein, samplename sample = SamplePat(sample) # NOTE: How do we find what fragment covers the protein? Well, a # protein can happily cross fragments. Since each # codon is independent, we should iterate over codons. We do not # do that for efficiency reasons. Instead, we identify all potential # fragments and split the protein into full codon chunks covered by # a single fragment. fragment_rois = sample.get_fragments_covered( protein, include_coordinates=True) refseq = sample.get_reference(protein) fn_out = sample.get_allele_counts_filename(protein, PCR=PCR, qual_min=qual_min, type='aa')
print 'Alignments' copy_folder(patient, pat_fn, 'alignments') print 'Trees' copy_folder(patient, pat_fn, 'trees') print 'Haplotypes' copy_folder(patient, pat_fn, 'haplotypes') print 'Samples' for samplename, sample in patient.samples.iterrows(): print samplename sample = SamplePat(sample) print 'Make folder' sm_fn = pat_fn+samplename+os.sep if not strip_PCR1: sm_fn += 'PCR1'+os.sep mkdirs(sm_fn) print 'Consensus' copy_glob(sample, sm_fn, 'consensus') print 'Allele counts' copy_glob(sample, sm_fn, 'allele_counts')
def initial_sample(self):
    '''First sample of this patient, the one used as a mapping reference

    Returns:
       a SamplePat built from the first row of self.samples.
    '''
    from .samples import SamplePat

    first_row = self.samples.iloc[0]
    return SamplePat(first_row)
n_pairs = args.maxreads skip_hash = args.skiphash summary = args.summary only_chunks = args.chunks filtered = args.filtered use_contaminated = args.include_contaminated # Collect all sequenced samples from patients samples_pat = lssp() if pnames is not None: samples_seq = [] for pname in pnames: patient = load_patient(pname) patient.discard_nonsequenced_samples() for samplename_pat, sample_pat in patient.samples.iterrows(): sample_pat = SamplePat(sample_pat) samples_seq.append(sample_pat.samples_seq) samples_seq = pd.concat(samples_seq) else: samples_seq = lss() ind = samples_pat.index.isin(samplenames) if ind.sum(): samplenames_pat = samples_pat.index[ind] samples_seq = samples_seq.loc[samples_seq['patient sample'].isin(samplenames_pat)] else: samples_seq = samples_seq.loc[samples_seq.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples_seq.index.tolist()
fragments = args.fragments submit = args.submit VERBOSE = args.verbose n_pairs = args.maxreads summary = args.summary PCR = args.PCR # Collect all sequenced samples from patients samples_pat = lssp() if pnames is not None: samples_seq = [] for pname in pnames: patient = load_patient(pname) patient.discard_nonsequenced_samples() for samplename_pat, sample_pat in patient.samples.iterrows(): sample_pat = SamplePat(sample_pat) samples_seq.append(sample_pat.samples_seq) samples_seq = pd.concat(samples_seq) elif samplenames is not None: samples_seq = lss() ind = samples_pat.index.isin(samplenames) samplenames_pat = samples_pat.index[ind] samples_seq = samples_seq.loc[samples_seq['patient sample'].isin( samplenames_pat)] else: samples_seq = lss() samples_seq = samples_seq.loc[samples_seq['patient sample'].isin( samples_pat.index)]
args = parser.parse_args() pnames = args.patients samplenames = args.samples VERBOSE = args.verbose use_save = args.save fragments = ['F'+str(i+1) for i in xrange(6)] samples = load_samples_sequenced() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] for samplename, sample in samples.iterrows(): sample = SamplePat(sample) if VERBOSE >= 1: print samplename dist_hists = [] samples_seq = sample.get_sequenced_samples() samples_seq = samples_seq.loc[samples_seq.PCR == 1] for samplename_seq, sample_seq in samples_seq.iterrows(): sample_seq = SampleSeq(sample_seq) data_folder = sample_seq.seqrun_folder adaID = sample_seq.adapter for fragment in fragments: try: dist_hist = get_distance_histogram(data_folder, adaID, fragment, VERBOSE=VERBOSE)
if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments for fragment in fragments: inses = [] for samplename, sample in samples.iterrows(): if submit: fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min) continue if VERBOSE >= 1: print fragment, samplename sample = SamplePat(sample) pname = sample.patient refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta') fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR) if not os.path.isfile(fn): warn('No BAM file found', NoDataWarning) continue _, inse = gac(fn, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE) inses.append(inse) if save_to_file: fn_out = sample.get_insertions_filename(fragment, PCR=PCR, qual_min=qual_min) save_insertions(fn_out, inse)
VERBOSE = args.verbose repn = args.repnumber samplename = args.sample patient = load_patient(pname) patient.discard_nonsequenced_samples() mkdirs(get_initial_reference_foldername(pname)) if not fragments: fragments = ['F' + str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments if samplename is None: sample = SamplePat(patient.samples.iloc[samplen]) else: sample = load_sample_sequenced(samplename) for fragment in fragments: sample_seq = SampleSeq(sample.samples_seq.iloc[repn]) seq_run = sample_seq['seq run'] adaID = sample_seq['adapter'] dataset = sample_seq.sequencing_run data_folder = dataset.folder if VERBOSE: print 'Initial sample:', sample_seq.name, sample_seq['seq run'], print sample_seq.adapter