def align_consensi_dataset(dataset, adaIDs, fragments, VERBOSE=0): '''Align consensi from different samples in a dataset''' data_folder = dataset['folder'] # Collect consensi if VERBOSE >= 1: print 'Collecting consensi...', consensi = defaultdict(dict) for adaID in adaIDs: samplename = dataset['samples'][dataset['adapters'].index(adaID)] fragments_sample = samples[samplename]['fragments'] for frag in fragments_sample: frag_gen = frag[:2] if frag_gen not in fragments: continue con_fn = get_consensus_filename(data_folder, adaID, frag_gen) if os.path.isfile(con_fn): con = SeqIO.read(con_fn, 'fasta') consensi[frag_gen][adaID] = con if 'genomewide' in fragments: frag_gens = [frag[:2] for frag in fragments_sample] con_gw_fn = get_merged_consensus_filename(data_folder, adaID, frag_gens) if os.path.isfile(con_gw_fn): con = SeqIO.read(con_gw_fn, 'fasta') consensi['genomewide'][adaID] = con if VERBOSE >= 1: print 'done.' print 'Aligning...', # Align alis = {} for (frag, con_dict) in consensi.iteritems(): if VERBOSE >= 2: print frag, ali_frag = align_muscle(*(con_dict.values())) alis[frag] = ali_frag if VERBOSE >= 1: print 'done.' return alis
# Merge the per-fragment consensi (and, optionally, the allele frequencies)
# of every selected sample and write the merged results to file.
# NOTE(review): `dataset`, `adaIDs`, `VERBOSE`, `data_folder` and `do_nus` are
# bound outside this excerpt.

# Restrict the sample table to the requested adapters; None keeps all samples.
# NOTE(review): `.loc`, `.isin` and `.iterrows` suggest a pandas DataFrame -
# confirm.
samples = dataset.samples
if adaIDs is not None:
    samples = samples.loc[samples.adapter.isin(adaIDs)]

if VERBOSE >= 3:
    print 'adaIDs', samples.adapter

for samplename, sample in samples.iterrows():
    # Wrap the table row; `sample` is rebound to the wrapper from here on
    sample = SampleSeq(sample)
    adaID = sample.adapter

    if VERBOSE >= 1:
        print adaID, samplename

    # Keep only the first two characters of each complete region name
    fragments = [fr[:2] for fr in sample.regions_complete]
    # Warn whenever the count differs from 6 (the message says "only", but
    # the check also fires for more than 6).
    # NOTE(review): 6 is presumably the size of the full fragment set - confirm.
    if (len(fragments) != 6) and (VERBOSE >= 1):
        print 'WARNING: only '+str(len(fragments))+' regions found!'

    # Write one or more merged consensi: merge_consensi yields
    # (fragments, consensus) pairs, each written to its own FASTA file
    consensus = merge_consensi(data_folder, adaID, fragments, VERBOSE=VERBOSE)
    for (frags, cons) in consensus:
        output_filename = get_merged_consensus_filename(data_folder, adaID, frags)
        SeqIO.write(cons, output_filename, 'fasta')

    # Write allele frequencies (only if requested via do_nus), one file per
    # (fragments, frequencies) pair.
    # NOTE(review): `nuf.dump` is presumably numpy's ndarray.dump - confirm.
    if do_nus:
        nu = merge_allele_frequencies(data_folder, adaID, fragments, VERBOSE=VERBOSE)
        for (frags, nuf) in nu:
            output_filename = get_merged_allele_frequencies_filename(data_folder, adaID, frags)
            nuf.dump(output_filename)