예제 #1
0
def align_consensi_dataset(dataset, adaIDs, fragments, VERBOSE=0):
    '''Align consensi from different samples in a dataset'''

    data_folder = dataset['folder']

    # Collect consensi
    if VERBOSE >= 1:
        print 'Collecting consensi...',
    consensi = defaultdict(dict)
    for adaID in adaIDs:
        samplename = dataset['samples'][dataset['adapters'].index(adaID)]
        fragments_sample = samples[samplename]['fragments']
        for frag in fragments_sample:
            frag_gen = frag[:2]
            if frag_gen not in fragments:
                continue
            con_fn = get_consensus_filename(data_folder, adaID, frag_gen)
            if os.path.isfile(con_fn):
                con = SeqIO.read(con_fn, 'fasta')
                consensi[frag_gen][adaID] = con

        if 'genomewide' in fragments:
            frag_gens = [frag[:2] for frag in fragments_sample]
            con_gw_fn = get_merged_consensus_filename(data_folder, adaID,
                                                      frag_gens)
            if os.path.isfile(con_gw_fn):
                con = SeqIO.read(con_gw_fn, 'fasta')
                consensi['genomewide'][adaID] = con

    if VERBOSE >= 1:
        print 'done.'
        print 'Aligning...',

    # Align
    alis = {}
    for (frag, con_dict) in consensi.iteritems():
        if VERBOSE >= 2:
            print frag,
        ali_frag = align_muscle(*(con_dict.values()))
        alis[frag] = ali_frag

    if VERBOSE >= 1:
        print 'done.'

    return alis
def align_consensi_dataset(dataset, adaIDs, fragments, VERBOSE=0):
    '''Align consensi from different samples in a dataset'''

    data_folder = dataset['folder']

    # Collect consensi
    if VERBOSE >= 1:
        print 'Collecting consensi...',
    consensi = defaultdict(dict)
    for adaID in adaIDs:
        samplename = dataset['samples'][dataset['adapters'].index(adaID)]
        fragments_sample = samples[samplename]['fragments']
        for frag in fragments_sample:
            frag_gen = frag[:2]
            if frag_gen not in fragments:
                continue
            con_fn = get_consensus_filename(data_folder, adaID, frag_gen)
            if os.path.isfile(con_fn):
                con = SeqIO.read(con_fn, 'fasta')
                consensi[frag_gen][adaID] = con

        if 'genomewide' in fragments:
            frag_gens = [frag[:2] for frag in fragments_sample]
            con_gw_fn = get_merged_consensus_filename(data_folder, adaID, frag_gens)
            if os.path.isfile(con_gw_fn):
                con = SeqIO.read(con_gw_fn, 'fasta')
                consensi['genomewide'][adaID] = con
    
    if VERBOSE >= 1:
        print 'done.'
        print 'Aligning...',

    # Align
    alis = {}
    for (frag, con_dict) in consensi.iteritems():
        if VERBOSE >= 2:
            print frag,
        ali_frag = align_muscle(*(con_dict.values()))
        alis[frag] = ali_frag

    if VERBOSE >= 1:
        print 'done.'

    return alis
    # NOTE(review): everything below follows `return alis` at the same
    # indentation, so as placed it can never run. It reads `dataset.samples`
    # as an attribute (the code above uses dataset['samples']) and uses
    # `do_nus`, which is not defined in the visible code, so it looks like the
    # body of a different function whose `def` line is missing -- TODO restore
    # the header. Because it assigns `samples` inside the same `def`, it also
    # makes `samples` a local name there, so the earlier
    # `samples[samplename]` lookup would raise UnboundLocalError.

    # Restrict the sample table to the requested adapter IDs, if any
    # (presumably a pandas DataFrame, given .loc/.isin/.iterrows -- confirm)
    samples = dataset.samples
    if adaIDs is not None:
        samples = samples.loc[samples.adapter.isin(adaIDs)]
    if VERBOSE >= 3:
        print 'adaIDs', samples.adapter

    # Process the selected samples one row at a time
    for samplename, sample in samples.iterrows():
        sample = SampleSeq(sample)
        adaID = sample.adapter

        if VERBOSE >= 1:
            print adaID, samplename

        # Keep only the first two characters of each complete region name
        fragments = [fr[:2] for fr in sample.regions_complete]

        # Six regions are expected; with fewer (or more) only a warning is
        # printed and the merge goes on with whatever is available
        if (len(fragments) != 6) and (VERBOSE >= 1):
            print 'WARNING: only '+str(len(fragments))+' regions found!'

        # Write one or more merged consensi
        # (merge_consensi is iterated as (fragment group, sequence) pairs;
        # each pair goes to its own FASTA file)
        consensus = merge_consensi(data_folder, adaID, fragments, VERBOSE=VERBOSE)
        for (frags, cons) in consensus:
            output_filename = get_merged_consensus_filename(data_folder, adaID, frags)
            SeqIO.write(cons, output_filename, 'fasta')

        # Write allele frequencies
        # (only if `do_nus` is set; each (fragment group, frequencies) pair is
        # stored via its own .dump() -- presumably a numpy array, confirm)
        if do_nus:
            nu = merge_allele_frequencies(data_folder, adaID, fragments, VERBOSE=VERBOSE)
            for (frags, nuf) in nu:
                output_filename = get_merged_allele_frequencies_filename(data_folder, adaID, frags)
                nuf.dump(output_filename)