Example #1
def trim(OGid):
    # 0 Load MSA
    try:
        msa1 = read_fasta(f'../align_fastas1/out/{OGid}.mfa')
    except FileNotFoundError:
        msa1 = read_fasta(f'../align_fastas2-2/out/{OGid}.mfa')

    # 1 Calculate shared variables
    gaps_array = np.full((len(msa1), len(msa1[0][1])), False)
    for i, (_, seq) in enumerate(msa1):
        for j, sym in enumerate(seq):
            if sym == '-':
                gaps_array[i, j] = True
    scores = gaps_array.sum(axis=0)
    msa1 = skbio.TabularMSA([skbio.Protein(seq, metadata={'description': header}) for header, seq in msa1])

    # 2 Get trims (segments and columns)
    syms_list1 = trim_conserved(msa1, scores, gaps_array,
                                tp['con_frac'], tp['con_window'], tp['con_minlen'], tp['con_rate'], tp['con_minsig'])
    syms_list2, trims = trim_insertions(msa1, scores, gaps_array,
                                        tp['gap_num'], tp['gap_rate'], tp['gap_minsig'],
                                        tp['nongap_frac'], tp['nongap_minlen'],
                                        tp['gp_sigma'], tp['gd_window'], tp['indel1_rate'], tp['indel2_rate'],
                                        tp['weights'], tp['threshold'],
                                        matrix)

    # 3 Combine trims (segments and columns) to yield final alignment
    msa2 = []
    for seq, syms1, syms2 in zip(msa1, syms_list1, syms_list2):
        syms = ['-' if sym1 != sym2 else sym1 for sym1, sym2 in zip(syms1, syms2)]  # Will only differ if one is converted to gap
        msa2.append((seq.metadata['description'], syms))

    # 4 Restore gap only columns
    gaps_array = np.full((len(msa2), len(msa2[0][1])), False)
    for i, (_, seq) in enumerate(msa2):
        for j, sym in enumerate(seq):
            if sym == '-':
                gaps_array[i, j] = True
    scores = gaps_array.sum(axis=0)

    rf = ['x' for _ in range(len(msa2[0][1]))]  # Metadata for marking consensus columns in profile HMM
    for region, in ndimage.find_objects(ndimage.label(scores == len(msa2))[0]):
        rf[region] = (region.stop - region.start) * ['.']
        for i in range(len(msa2)):
            syms = msa2[i][1]
            syms[region] = list(str(msa1[i, region]))

    # 5 Write to file
    msa2 = skbio.TabularMSA([skbio.Protein(''.join(syms), metadata={'description': header}) for header, syms in msa2],
                            positional_metadata={'RF': rf})
    msa2.write(f'out/{OGid}.sto', 'stockholm')
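A minimal sketch (toy data, not part of the pipeline) of the ndimage idiom in step 4: label runs of gap-only columns and recover them as slices. Note the `for region,` unpacking, since find_objects yields 1-tuples of slices for 1-D input.

import numpy as np
from scipy import ndimage

scores = np.array([0, 3, 3, 1, 3, 3, 3, 0])  # per-column gap counts for 3 sequences
mask = scores == 3  # True where a column is all gaps
for region, in ndimage.find_objects(ndimage.label(mask)[0]):
    print(region)  # slice(1, 3, None), then slice(4, 7, None)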
Example #2
def hmm_align(OGid):
    sqidnum, gnidnum = OGid2meta[OGid]
    if sqidnum == gnidnum:
        path = f'../make_fastas1/out/{OGid}.tfa'
    else:
        path = f'../make_fastas2-2/out/{OGid}.tfa'
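    # hmmbuild flags: --hand builds match states from the RF annotation written by
    # the trim step, --eset sets the effective sequence number, --wnone disables
    # relative sequence weighting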
    run(f'../../../bin/hmmbuild --hand --eset {1.5*gnidnum} --wnone out/{OGid}.hmm ../realign_trim/out/{OGid}.sto > out/{OGid}.txt', shell=True, check=True)
    run(f'../../../bin/hmmalign --outformat afa out/{OGid}.hmm {path} > out/{OGid}_temp.mfa', shell=True, check=True)

    # Remove excess gaps
    msa = read_fasta(f'out/{OGid}_temp.mfa')
    slices, idx = [], None
    for j in range(len(msa[0][1])):
        for i in range(len(msa)):
            sym = msa[i][1][j]
            if sym not in ['-', '.']:
                if idx is None:  # Start a new slice only if one is not already open
                    idx = j
                break
        else:
            if idx is not None:
                slices.append(slice(idx, j))
                idx = None
    if idx is not None:  # Add final slice to end
        slices.append(slice(idx, len(msa[0][1])))

    # Write to file and remove temp alignment
    with open(f'out/{OGid}.mfa', 'w') as file:
        for header, seq1 in msa:
            seq2 = ''.join([seq1[s] for s in slices])
            seqstring = '\n'.join([seq2[i:i+80] for i in range(0, len(seq2), 80)]) + '\n'
            file.write(header + '\n' + seqstring)
    os.remove(f'out/{OGid}_temp.mfa')
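A toy illustration (hypothetical 2-sequence MSA) of the for-else column scan above: collect slices of columns that contain at least one residue, so all-gap columns are dropped when the slices are concatenated.

msa = [('>a', 'A--AA-'), ('>b', 'A---A-')]
slices, idx = [], None
for j in range(len(msa[0][1])):
    if any(seq[j] not in ['-', '.'] for _, seq in msa):
        if idx is None:  # First residue column after a gap-only run
            idx = j
    else:
        if idx is not None:
            slices.append(slice(idx, j))
            idx = None
if idx is not None:  # Add final slice to end
    slices.append(slice(idx, len(msa[0][1])))
print(slices)  # [slice(0, 1, None), slice(3, 5, None)]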
Example #3
def decode(OGid, params):
    # Load msa and trim terminal insertions
    msa = read_fasta(f'../../ortho_MSA/realign_hmmer2/out/{OGid}.mfa')

    idx = 0
    for j in range(len(msa[0][1])):
        for i in range(len(msa)):
            sym = msa[i][1][j]
            if sym == '.' or sym.islower():
                break
        else:
            idx = j
            break  # for-else: no inner break, so this column has no insertions; exit outer loop
    msa = [(header, seq[idx:]) for header, seq in msa]

    idx = len(msa[0][1])
    for j in range(len(msa[0][1]), 0, -1):
        for i in range(len(msa)):
            sym = msa[i][1][j - 1]
            if sym == '.' or sym.islower():
                break
        else:
            idx = j
            break  # for-else: no inner break, so this column has no insertions; exit outer loop
    msa = [(header, seq[:idx]) for header, seq in msa]

    # Create emission sequence
    col0 = []
    emits = []
    for j in range(len(msa[0][1])):
        col = [1 if msa[i][1][j] in ['-', '.'] else 0 for i in range(len(msa))]
        emit0 = all([c0 == c for c0, c in zip(col0, col)])
        emit1 = sum(col)
        emits.append((emit0, emit1))
        col0 = col

    # Instantiate model
    e_dists_rv = {state: bernoulli_betabinom_frozen(p, len(msa) - 1, a, b)
                  for state, (p, a, b) in params['e_dists'].items()}
    model = hmm.HMM(params['t_dists'], e_dists_rv, params['start_dist'])

    # Decode states and write
    fbs = model.forward_backward(emits)
    with open(f'out/{OGid}.tsv', 'w') as file:
        file.write('\t'.join(states) + '\n')
        for fb in zip(*[fbs[state] for state in states]):
            file.write('\t'.join([str(v) for v in fb]) + '\n')
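A hedged sketch of the emission construction above on a toy 3-sequence MSA (the model, states, and bernoulli_betabinom_frozen come from surrounding modules and are not reproduced here): emit0 flags a column whose gap pattern repeats the previous column's, and emit1 counts its gaps.

msa = [('h1', 'A-A'), ('h2', 'A-A'), ('h3', '--A')]
col0, emits = [], []
for j in range(len(msa[0][1])):
    col = [1 if seq[j] in ['-', '.'] else 0 for _, seq in msa]
    emit0 = all(c0 == c for c0, c in zip(col0, col))  # vacuously True for the first column
    emits.append((emit0, sum(col)))
    col0 = col
print(emits)  # [(True, 1), (False, 3), (False, 0)]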
Example #4
def decode(OGid, params):
    # Load msa
    msa = read_fasta(f'../insertion_trim/out/{OGid}.mfa')

    # Create Bernoulli sequence
    ps = []
    for j in range(len(msa[0][1])):
        col = [1 if msa[i][1][j] in ['-', '.'] else 0 for i in range(len(msa))]
        p = sum(col) / len(col)
        ps.append(p)

    # Instantiate model
    e_dists_rv = {
        '0': msaBernoulli(ps),
        '1': msaBernoulli([params['e_param'] for _ in range(len(ps))])
    }
    model = hmm.HMM(params['t_dists'], e_dists_rv, params['start_dist'])

    # Decode states
    records = []
    for header, seq in msa:
        # Create emission sequence
        emits = []
        for j, sym in enumerate(seq):
            if sym in ['-', '.']:
                emits.append((j, 1))
            else:
                emits.append((j, 0))

        ppid = re.search(ppid_regex, header).group(1)
        fbs = model.forward_backward(emits)
        records.append((ppid, fbs))

    # Write decoded states
    with open(f'out/{OGid}.tsv', 'w') as file:
        file.write('\t'.join(['ppid'] + states) + '\n')
        for ppid, fbs in records:
            for fb in zip(*[fbs[state] for state in states]):
            file.write(ppid + '\t' + '\t'.join([str(v) for v in fb]) + '\n')
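A quick sketch of the per-column gap fractions that parameterize the '0' (background) emission distribution above, on a hypothetical 2-sequence MSA.

msa = [('h1', 'A-A'), ('h2', '--A')]
ps = [sum(1 if seq[j] in ['-', '.'] else 0 for _, seq in msa) / len(msa)
      for j in range(len(msa[0][1]))]
print(ps)  # [0.5, 1.0, 0.0]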
Example #5
def run_aucpred(OGid):
    msa = read_fasta(f'../insertion_trim/out/{OGid}.mfa')
    prefix = f'out/raw/{OGid}/'

    if not os.path.exists(prefix):
        os.mkdir(prefix)

    for header, seq in msa:
        ppid = re.search(r'ppid=([A-Za-z0-9_]+)', header).group(1)
        seq = seq.translate({ord('-'): None, ord('.'): None})
        if len(seq) < 10000:  # AUCpreD uses PSIPRED, which has a length limit of 10000
            with open(f'{prefix}{ppid}.fasta', 'w') as file:
                seqstring = '\n'.join([seq[i:i + 80] for i in range(0, len(seq), 80)]) + '\n'
                file.write(header + '\n' + seqstring)
            subprocess.run(f'../../../bin/Predict_Property/AUCpreD.sh -i {prefix}{ppid}.fasta -o {prefix}',
                           check=True, shell=True)
            os.remove(f'{prefix}{ppid}.fasta')
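The gap stripping above relies on str.translate deleting characters mapped to None; a one-liner check:

seq = 'AB--C.D'
print(seq.translate({ord('-'): None, ord('.'): None}))  # 'ABCD'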
Example #6
        spid, _, source, _, cds_path = line.split()
        genomes.append((spid, source, cds_path))

# Load translation table
ttable = {}
with open('ttable.txt') as file:
    lines = [line.rstrip().split(' = ')[1] for line in file]
    for i in range(len(lines[0])):
        aa = lines[0][i]
        codon = ''.join([lines[j][i] for j in range(2, 5)])
        ttable[codon] = aa

# Load CDSs
ppid2cds = {}
for spid, source, cds_path in genomes:
    fasta = read_fasta(cds_path)
    for header, seq in fasta:
        ppid = re.search(ppid_regex[source], header).group(1)
        ppid2cds[ppid] = seq

if not os.path.exists('out/'):
    os.mkdir('out/')

sys.stdout = open('out/out.txt', 'w')  # Redirect stdout to file
for file_id in filter(lambda x: x.endswith('.mfa'), os.listdir('../align_fastas/out/')):
    # Translate and write CDS
    nt_aligns = []
    for header, aa_align in read_fasta('../align_fastas/out/' + file_id):
        ppid = re.search(r'ppid=([NXYPFBp0-9_.]+)\|', header)[1]
        aa_seq = aa_align.replace('-', '')
        nt_seq = ppid2cds[ppid]
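The translation-table parser above assumes an NCBI-style ttable.txt where each line is 'name = value' and codons are read down the Base1-3 columns; a toy version with two codons:

lines = ['FF', 'MM', 'TT', 'TT', 'TC']  # AAs, Starts, Base1, Base2, Base3 (toy values)
ttable = {}
for i in range(len(lines[0])):
    codon = ''.join([lines[j][i] for j in range(2, 5)])
    ttable[codon] = lines[0][i]
print(ttable)  # {'TTT': 'F', 'TTC': 'F'}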
Example #7
genes = pd.read_table('genes.tsv')

# Load tree
tree = skbio.read('../../ortho_tree/ctree_WAG/out/100red_ni.txt', 'newick', skbio.TreeNode)
tip_order = {tip.name: i for i, tip in enumerate(tree.tips())}

# Draw alignments
if not os.path.exists('out/'):
    os.mkdir('out/')

df = OGs[['gnid', 'OGid']].drop_duplicates().merge(OG_meta, on='OGid', how='right').merge(genes, on='gnid', how='right')
df.to_csv('out/OGs.tsv', sep='\t', index=False)

for row in df.dropna().itertuples():
    if row.sqidnum == row.gnidnum:
        msa = read_fasta(f'../align_fastas1/out/{row.OGid}.mfa')
    else:
        msa = read_fasta(f'../align_fastas2-2/out/{row.OGid}.mfa')
    msa = [(re.search(r'spid=([a-z]+)', header).group(1), seq) for header, seq in msa]

    msa = [seq for _, seq in sorted(msa, key=lambda x: tip_order[x[0]])]  # Re-order sequences and extract seq only
    im = draw_msa(msa)
    plt.imsave(f'out/{row.OGid}.png', im)
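A small sketch of the reordering idiom above: rows are sorted by each species' position in the tree so the alignment image matches the tree's tip order (species IDs here are hypothetical).

tip_order = {'dmel': 0, 'dsim': 1, 'dyak': 2}
msa = [('dyak', 'AA-'), ('dmel', 'AAA'), ('dsim', 'A-A')]
msa = [seq for _, seq in sorted(msa, key=lambda x: tip_order[x[0]])]
print(msa)  # ['AAA', 'A-A', 'AA-']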

"""
DEPENDENCIES
../../ortho_cluster3/clique4+_pcommunity/clique4+_pcommunity2.py
    ../../ortho_cluster3/clique4+_pcommunity/out/pgraph2/4clique/pclusters.txt
../../ortho_search/seq_meta/seq_meta.py
    ../../ortho_search/seq_meta/out/seq_meta.tsv
../../ortho_tree/ctree_WAG/ctree_WAG.py
Example #8
tree = skbio.read('../../ortho_tree/ctree_WAG/out/100red_ni.txt', 'newick', skbio.TreeNode)
tip_order = {tip.name: i for i, tip in enumerate(tree.tips())}
spids = {tip.name for tip in tree.tips() if tip.name != 'sleb'}

OG_filter = pd.read_table('../OG_filter/out/OG_filter.tsv')
df = pd.read_table('../gap_contrasts/out/total_sums.tsv').merge(OG_filter[['OGid', 'sqidnum']], on='OGid', how='left')  # total_sums.tsv has gnidnum already
df['norm1'] = df['total'] / df['gnidnum']
df['norm2'] = df['total'] / (df['gnidnum'] * df['len2'])

for label in ['norm1', 'norm2']:
    if not os.path.exists(f'out/{label}/'):
        os.makedirs(f'out/{label}/')

    head = df.sort_values(by=label, ascending=False).head(150)
    for i, row in enumerate(head.itertuples()):
        msa = read_fasta(f'../realign_hmmer2/out/{row.OGid}.mfa')
        msa = [(re.search(r'spid=([a-z]+)', header).group(1), seq) for header, seq in msa]

        msa = [seq.upper() for _, seq in sorted(msa, key=lambda x: tip_order[x[0]])]  # Re-order sequences and extract seq only
        im = draw_msa(msa)
        plt.imsave(f'out/{label}/{i}_{row.OGid}.png', im)

"""
DEPENDENCIES
../../ortho_tree/ctree_WAG/ctree_WAG.py
    ../../ortho_tree/ctree_WAG/out/100red_ni.txt
../gap_contrasts/gap_contrasts_calc.py
    ../gap_contrasts/out/total_sums.tsv
../OG_filter/OG_filter.py
    ../OG_filter/out/OG_filter.tsv
../realign_hmmer2/realign_hmmer2.py
Example #9
seed(930715)  # Set seed to make results consistent

# Extract column pools
colpools = [('100red', lambda col: is_redundant(col, 1), []),
            ('100red_ni', lambda col: is_redundant(col, 1) and not is_invariant(col), []),
            ('50red', lambda col: is_redundant(col, 0.5), []),
            ('50red_ni', lambda col: is_redundant(col, 0.5) and not is_invariant(col), []),
            ('0red', lambda col: is_redundant(col, 0), []),
            ('0red_ni', lambda col: is_redundant(col, 0) and not is_invariant(col), [])]
for file_id in filter(lambda x: x.endswith('.mfa'), os.listdir('../align_fastas/out/')):  # Because inputs are not sorted, results are not guaranteed to be consistent
    msa = read_fasta(f'../align_fastas/out/{file_id}')
    for i in range(len(msa[0][1])):
        col = [Column(header[-4:], seq[i]) for header, seq in msa]
        for _, condition, colpool in colpools:
            if condition(col):
                colpool.append(col)

# Make meta alignments
for label, _, colpool in colpools:
    if not os.path.exists(f'out/{label}/'):
        os.makedirs(f'out/{label}/')

    print(f'{label}:', len(colpool))
    for samplenum in range(100):
        sample = [colpool[randrange(len(colpool))] for _ in range(10000)]
        seqs = {}
Example #10
    file.readline()
    for line in file:
        fields = line.rstrip('\n').split('\t')
        OGid, start, stop, state = fields
        if OGid in OGid2labels:
            OGid2labels[OGid][state].append((int(start), int(stop)))
        else:
            labels = {'0': [], '1A': [], '1B': [], '2': [], '3': []}
            labels[state].append((int(start), int(stop)))
            OGid2labels[OGid] = labels

if not os.path.exists('out/'):
    os.mkdir('out/')

for OGid, labels in OGid2labels.items():
    msa = trim_terminals(read_fasta(f'../../ortho_MSA/realign_hmmer1/out/{OGid}.mfa'))

    if labels['0'] and labels['0'][0][0] == 0:
        offset = labels['0'][0][1]
    else:
        offset = 0

    lines = {}
    for state in ['1A', '1B', '2', '3']:
        line = np.zeros(len(msa[0][1]))
        for start, stop in labels[state]:
            line[start-offset:stop-offset] = 1
        lines[state] = line

    plot_msa_lines([seq[1].upper() for seq in msa], [lines['1A'], lines['2'], lines['3'], lines['1B']], figsize=(15, 6))
    plt.savefig(f'out/{OGid}.png', bbox_inches='tight')
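The state tracks above are built by painting labeled (start, stop) intervals onto a zero vector, shifted by the terminal-insertion offset; a toy version without the offset:

import numpy as np

line = np.zeros(10)
for start, stop in [(2, 5), (7, 9)]:  # hypothetical labeled intervals
    line[start:stop] = 1
print(line)  # [0. 0. 1. 1. 1. 0. 0. 1. 1. 0.]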
Example #11
        if state != '0':  # Skip terminal insertions; '0' marks them rather than a true state
            states.add(state)
        try:
            OGid2regions[OGid].append((int(start), int(stop), state))
        except KeyError:
            OGid2regions[OGid] = [(int(start), int(stop), state)]

# Initialize counts with pseudocounts
t_counts = {state: {s: 1 for s in states} for state in states}
e_counts = {state: {} for state in states}
start_count = {state: 1 for state in states}

# Get observed counts
for OGid, regions in OGid2regions.items():
    # Load MSA and trim terminal insertions
    msa = read_fasta(f'../../ortho_MSA/realign_hmmer1/out/{OGid}.mfa')
    if regions[-1][2] == '0':
        start, _, _ = regions[-1]
        regions = regions[:-1]
        trim = []
        for header, seq in msa:
            trim.append((header, seq[:start]))
        msa = trim
    if regions[0][2] == '0':
        _, stop, _ = regions[0]
        trim = []
        for header, seq in msa:
            trim.append((header, seq[stop:]))
        msa = trim

        offset = regions[0][1]
Example #12
regions = []
with open('../aucpred_filter/out/regions_30.tsv') as file:
    file.readline()  # Skip header
    for line in file:
        OGid, start, stop, disorder, ppids = line.split()
        regions.append((OGid, int(start), int(stop), set(ppids.split(','))))

# Calculate contrasts
if not os.path.exists('out/'):
    os.mkdir('out/')

totals = []
rows = []
for OGid, start, stop, ppids in regions:
    msa = {}
    for header, seq in read_fasta(f'../insertion_trim/out/{OGid}.mfa'):
        ppid = re.search(ppid_regex, header).group(1)
        spid = re.search(spid_regex, header).group(1)
        if ppid in ppids:
            msa[spid] = seq[start:stop]

    tree = tree_template.deepcopy().shear(msa.keys())
    for tip in tree.tips():
        gap_vector = np.asarray([1 if sym == '-' else 0 for sym in msa[tip.name]])
        tip.value = gap_vector
    tree.length = 0  # Set root length to 0 for convenience

    contrasts, _, _ = get_contrasts(tree)
    gap_matrix = np.asarray([[0 if sym == '-' else 1 for sym in seq] for seq in msa.values()])
    len1 = len(msa['dmel'])  # Total length of alignment
    len2 = (gap_matrix / len(msa)).sum()  # Adjusted length of alignment
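len2 above is an "adjusted length": each column contributes its non-gap fraction, so gappy columns count less than full ones. A toy check:

import numpy as np

gap_matrix = np.array([[1, 1, 0],
                       [1, 0, 0]])  # 1 = non-gap symbol; 2 sequences x 3 columns
print((gap_matrix / 2).sum())  # 1.5: one full column, one half column, one empty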
Example #13
    'X': 0,
    '-': 0
}
a = 1E-3  # Coefficient of outlier curve
spid_regex = r'spid=([a-z]+)'
tree = skbio.read('../../ortho_tree/ctree_WAG/out/100red_ni.txt', 'newick', skbio.TreeNode)
tip_order = {tip.name: i for i, tip in enumerate(tree.tips())}

records = []
for OGid in [path.split('.mfa')[0] for path in os.listdir('../realign_hmmer1/out/') if path.endswith('.mfa')]:
    msa = [(re.search(spid_regex, header).group(1), seq.upper())
           for header, seq in read_fasta(f'../realign_hmmer1/out/{OGid}.mfa')]

    idx = 0
    for j in range(len(msa[0][1])):
        for i in range(len(msa)):
            sym = msa[i][1][j]
            if sym == '.' or sym.islower():
                break
        else:
            idx = j
            break  # for-else: no inner break, so this column has no insertions; exit outer loop
    msa = [(header, seq[idx:]) for header, seq in msa]

    idx = len(msa[0][1])
    for j in range(len(msa[0][1]), 0, -1):
        for i in range(len(msa)):
Example #14
    with open('segments.tsv') as file:
        file.readline()  # Skip header
        for line in file:
            OGid, ppid, start, stop, state = line.split()
            state_set.add(state)
            try:
                OGid2regions[(OGid, ppid)].append((int(start), int(stop), state))
            except KeyError:
                OGid2regions[(OGid, ppid)] = [(int(start), int(stop), state)]

    # Convert MSAs to records containing state-emissions sequences and other data
    records = []
    for (OGid, ppid), regions in OGid2regions.items():
        # Load MSA and extract seq
        msa = read_fasta(f'../insertion_trim/out/{OGid}.mfa')
        seq = [seq for header, seq in msa if re.search(ppid_regex, header).group(1) == ppid][0]

        # Create Bernoulli sequence
        p_seq = []
        for j in range(len(msa[0][1])):
            col = [1 if msa[i][1][j] in ['-', '.'] else 0 for i in range(len(msa))]
            p = sum(col) / len(col)
            p_seq.append(p)

        # Create emission sequence
Example #15
            if 'charset' in line:
                groupdict = re.search(r'charset (?P<name>[a-zA-Z0-9]+) = (?P<regions>[0-9 -]+);', line)
                regions = []
                for region in groupdict['regions'].split():
                    start, stop = region.split('-')
                    regions.append((int(start)-1, int(stop)))
                transform, start0 = {}, 0
                for start, stop in regions:
                    transform[(start, stop)] = (start0, stop - start + start0)
                    start0 += stop - start
                partition = partitions[partition_id]
                partition.update({'regions': regions, 'transform': transform})
                partition_id += 1

    # Calculate likelihoods
    msa = read_fasta(f'../asr_aa/out/{OGid}.mfa')
    for partition in partitions.values():
        # Unpack partition parameters and partition MSA
        matrix, freqs = models[partition['model']]
        pinv, alpha, num_categories = partition['pinv'], partition['alpha'], partition['num_categories']
        partition_msa = []
        for header, seq in msa:
            partition_seq = ''.join([seq[start:stop] for start, stop in partition['regions']])
            partition_msa.append((header, partition_seq))

        # Convert to vectors at tips of tree
        tips = {tip.name: tip for tip in tree.tips()}
        for header, seq in partition_msa:
            tip = tips[header[1:5]]
            conditional = np.zeros((len(syms), len(seq)))
            for j, sym in enumerate(seq):
Example #16
with open('../../brownian2/aucpred_regions/out/regions.tsv') as file:
    file.readline()  # Skip header
    for line in file:
        OGid, start, stop, disorder = line.split()
        try:
            OGid2regions[OGid].append((int(start), int(stop), disorder == 'True'))
        except KeyError:
            OGid2regions[OGid] = [(int(start), int(stop), disorder == 'True')]

if not os.path.exists('out/'):
    os.mkdir('out/')

for OGid in OGids:
    msa = read_fasta(f'../../brownian2/insertion_trim/out/{OGid}.mfa')
    msa = [(re.search(ppid_regex, header).group(1), re.search(spid_regex, header).group(1), seq)
           for header, seq in msa]

    # Check regions and merge if necessary
    regions = OGid2regions[OGid]
    disorder_length = sum([stop - start for start, stop, disorder in regions if disorder])
    order_length = sum([stop - start for start, stop, disorder in regions if not disorder])
    if disorder_length >= 30 and order_length >= 30:
        disorder_regions = [
            f'{start+1}-{stop}' for start, stop, disorder in regions
            if disorder
Example #17
tip_order = {tip.name: i for i, tip in enumerate(tree.tips())}
spids = {tip.name for tip in tree.tips() if tip.name != 'sleb'}

OG_filter = pd.read_table('../../ortho_MSA/OG_filter/out/OG_filter.tsv')
df = pd.read_table('../../ortho_MSA/gap_contrasts/out/total_sums.tsv').merge(OG_filter[['OGid', 'sqidnum']], on='OGid', how='left')  # total_sums.tsv has gnidnum already
df['norm1'] = df['total'] / df['gnidnum']
df['norm2'] = df['total'] / (df['gnidnum'] * df['len2'])

for label in ['norm1', 'norm2']:
    if not os.path.exists(f'out/{label}/'):
        os.makedirs(f'out/{label}/')

    head = df.sort_values(by=label, ascending=False).head(150)
    for i, row in enumerate(head.itertuples()):
        # Load msa and trim terminal insertions
        msa = [(re.search(r'spid=([a-z]+)', header).group(1), seq) for header, seq in read_fasta(f'../../ortho_MSA/realign_hmmer2/out/{row.OGid}.mfa')]
        msa = trim_terminals(msa)

        # Load decoded states
        posterior = []
        with open(f'../insertion_trim/out/{row.OGid}.tsv') as file:
            header = file.readline().rstrip('\n').split('\t')
            for line in file:
                fields = {key: float(value) for key, value in zip(header, line.rstrip('\n').split('\t'))}
                posterior.append(fields['2'] + fields['3'])
        posterior = np.array(posterior)
        gradient = np.gradient(posterior)

        # Make trim plot
        slices = get_slices(msa, posterior, gradient)
        trims = np.zeros(len(posterior))
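np.gradient above estimates the slope of the posterior trace by central differences; its extrema mark the rising and falling edges used to delimit trims. A toy trace:

import numpy as np

posterior = np.array([0.0, 0.1, 0.9, 1.0, 0.9, 0.1, 0.0])
print(np.gradient(posterior))  # [ 0.1   0.45  0.45  0.   -0.45 -0.45 -0.1 ]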
Example #18
# Parse genomes
genomes = []
with open('../config/genomes.tsv') as file:
    file.readline()  # Skip header
    for line in file:
        spid, _, source, prot_path, tcds_path = line.split()
        genomes.append((spid, source, prot_path, tcds_path))

# Extract and count polypeptide IDs
counts = {}  # Counts for each PPID to find duplicates
ppid2meta = {}  # PPID to gene and species
gnid2seqs = {}  # GNID to PPIDs with unique sequences
for spid, source, prot_path, tcds_path in genomes:
    # Find parent genes in tcds headers
    tcds_fasta = read_fasta(tcds_path)
    for header, _ in tcds_fasta:
        gn_match = re.search(gnid_regex[source], header)
        pp_match = re.search(ppid_regex[source], header)
        try:
            # group(0) is the entire match; group(1) is the first capture group
            gnid = gn_match.group(1)
            ppid = pp_match.group(1)
            ppid2meta[ppid] = (gnid, spid)
        except AttributeError:
            print(header)

    # Find representative sequences in prot files
    prot_fasta = read_fasta(prot_path)
    for header, seq in prot_fasta:
        ppid = re.search(ppid_regex[source], header).group(1)
Example #19
            return False
    return True


Column = namedtuple('Column', ['spid', 'sym'])
seed(930715)  # Set seed to make results consistent

# Extract column pools
colpools = [('100red', lambda col: is_redundant(col, 1), []),
            ('100red_ni', lambda col: is_redundant(col, 1) and not is_invariant(col), []),
            ('50red', lambda col: is_redundant(col, 0.5), []),
            ('50red_ni', lambda col: is_redundant(col, 0.5) and not is_invariant(col), []),
            ('0red', lambda col: is_redundant(col, 0), []),
            ('0red_ni', lambda col: is_redundant(col, 0) and not is_invariant(col), [])]
for file_id in filter(lambda x: x.endswith('.mfa'), os.listdir('../align_aa2nt/out/')):  # Because inputs are not sorted, results are not guaranteed to be consistent
    msa = read_fasta(f'../align_aa2nt/out/{file_id}')
    for i in range(len(msa[0][1])):
        col = [Column(header[-4:], seq[i]) for header, seq in msa]
        for _, condition, colpool in colpools:
            if condition(col):
                colpool.append(col)

# Make meta alignments
for label, _, colpool in colpools:
    if not os.path.exists(f'out/{label}/'):
        os.makedirs(f'out/{label}/')

    print(f'{label}:', len(colpool))
    for samplenum in range(100):
        sample = [colpool[randrange(len(colpool))] for _ in range(10000)]
        seqs = {}
Example #20
    for line in file:
        ppid, gnid, _, sqid = line.split()
        ppid2meta[ppid] = (gnid, sqid)

# Parse genomes
genomes = {}
with open('../config/genomes.tsv') as file:
    file.readline()  # Skip header
    for line in file:
        spid, _, source, prot_path = line.split()
        genomes[spid] = (source, prot_path)

# Parse polypeptides
rows = []
for spid, (source, prot_path) in genomes.items():
    fasta = read_fasta(prot_path)
    for header, seq in fasta:
        ppid = re.search(ppid_regex[source], header).group(1)
        gnid, sqid = ppid2meta[ppid]
        rows.append({
            'ppid': ppid,
            'gnid': gnid,
            'spid': spid,
            'sqid': sqid,
            'seqlen': len(seq),
            'Xnum': seq.upper().count('X'),
            'Xmax': get_Xmax(seq)
        })

# Make plots output directory
if not os.path.exists('out/'):
Example #21
        line = file.readline()
        while not line.startswith('Model of rate heterogeneity:'):
            line = file.readline()
        num_categories = int(line.rstrip().split(' Gamma with ')[1][0])
        alpha = float(file.readline().rstrip().split(': ')[1])
    igfs = []  # Incomplete gamma function evaluations
    for i in range(num_categories + 1):
        x = gamma.ppf(i / num_categories, a=alpha, scale=1 / alpha)
        igfs.append(gammainc(alpha + 1, alpha * x))
    rates = []  # Normalized rates
    for i in range(num_categories):
        rate = num_categories * (igfs[i + 1] - igfs[i])
        rates.append((rate, 1 / num_categories))

    # Load sequence and convert to vectors at tips of tree
    msa = read_fasta(f'../asr_indel/out/{OGid}.mfa')
    tips = {tip.name: tip for tip in tree.tips()}
    for header, seq in msa:
        tip = tips[header[1:5]]
        conditional = np.zeros((2, len(seq)))
        for j, sym in enumerate(seq):
            conditional[int(sym), j] = 1
        tip.conditional = conditional

    # Get likelihoods for rate categories
    likelihoods = []
    for rate, prior in rates:
        s, conditional = get_conditional(tree, rate * matrix)
        l = np.expand_dims(freqs, -1) * conditional
        likelihoods.append(np.exp(s) * l * prior)
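The rates loop above implements the discretized-Gamma model (the mean rate of each equal-probability category, computed from regularized incomplete gamma functions); the category rates should average to 1, which a quick check with toy parameters confirms:

from scipy.stats import gamma
from scipy.special import gammainc

alpha, num_categories = 0.5, 4  # hypothetical shape and category count
igfs = [gammainc(alpha + 1, alpha * gamma.ppf(i / num_categories, a=alpha, scale=1/alpha))
        for i in range(num_categories + 1)]
rates = [num_categories * (igfs[i+1] - igfs[i]) for i in range(num_categories)]
print(sum(rates) / num_categories)  # 1.0 (up to floating point)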