예제 #1
0
def hmmer_to_markers(input, temp_dir):
    global refpkg

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = dict([(name + '_rev', reverse_sequence(seq))
                    for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Now bin the fragments
    frag_scores = dict([(name, [-10000, 'NA', 'NA'])
                        for name in fragments.keys()])

    for gene in refpkg["genes"]:
        # Now run HMMER search
        hmmer_output = temp_dir + '/' + gene + ".out"
        hmmer_search(frag_file, refpkg[gene]["hmm"], hmmer_output)
        results = read_hmmsearch_results(hmmer_output)

        # Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            direction = 'forward'
            true_name = name
            if (name.find('_rev') != -1):
                true_name = true_name.replace('_rev', '')
                direction = 'reverse'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = dict([])
    for name, val in frag_scores.items():
        if (val[1] not in genes):
            genes[val[1]] = {}
        if (val[2] == 'forward'):
            genes[val[1]][name] = fragments[name]
        else:
            genes[val[1]][name] = reverse_sequence(fragments[name])

    genes.pop("NA", None)

    for gene, seq in genes.items():
        gene_file = temp_dir + '/' + gene + ".frags.fas.fixed"
        _write_fasta(seq, gene_file)

    binned_fragments = {}
    for gene, seq in genes.items():
        binned_fragments[gene] = {}
        binned_fragments[gene]["file"] = temp_dir + '/' + gene \
            + ".frags.fas.fixed"
        binned_fragments[gene]["nfrags"] = len(seq.keys())

    return binned_fragments
예제 #2
0
def hmmer_to_markers(input, temp_dir):
    global marker_genes
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = dict([(name+'_rev', reverse_sequence(seq))
                    for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir+"/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Now bin the fragments
    frag_scores = dict([(name, [-10000, 'NA', 'NA'])
                        for name in fragments.keys()])
    gene_set = marker_genes
    align_name = 'sate'
    if (options().genes == 'cogs'):
        gene_set = cog_genes
        align_name = 'pasta'
    for gene in gene_set:
        # Now run HMMER search
        hmmer_search(
            frag_file,
            os.path.join(
                options().__getattribute__('reference').path,
                'refpkg/%s.refpkg/%.profile' % (gene, align_name)),
            temp_dir + "/%s.out" % gene)
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        # Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            direction = 'forward'
            true_name = name
            if (name.find('_rev') != -1):
                true_name = true_name.replace('_rev', '')
                direction = 'reverse'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = dict([])
    for name, val in frag_scores.items():
        if (val[1] not in genes):
            genes[val[1]] = {}
        if (val[2] == 'forward'):
            genes[val[1]][name] = fragments[name]
        else:
            genes[val[1]][name] = reverse_sequence(fragments[name])
    genes.pop("NA", None)
    for gene, seq in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(seq, gene_file + ".fixed")
    return genes
예제 #3
0
def hmmer_to_markers(input, temp_dir):
    global marker_genes
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = dict([(name + '_rev', reverse_sequence(seq))
                    for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    #Now bin the fragments
    frag_scores = dict([(name, [-10000, 'NA', 'NA'])
                        for name in fragments.keys()])
    gene_set = marker_genes
    align_name = 'sate'
    if (options().genes == 'cogs'):
        gene_set = cog_genes
        align_name = 'pasta'
    for gene in gene_set:
        #Now run HMMER search
        hmmer_search(
            frag_file,
            os.path.join(options().__getattribute__('reference').path,
                         'refpkg/%s.refpkg/%.profile' % (gene, align_name)),
            temp_dir + "/%s.out" % gene)
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        #Now select best direction for each frag
        for name in results.keys():
            bitscore = results[name][1]
            direction = 'forward'
            true_name = name
            if (name.find('_rev') != -1):
                true_name = true_name.replace('_rev', '')
                direction = 'reverse'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    #Now bin the fragments
    genes = dict([])
    for name in frag_scores.keys():
        if (frag_scores[name][1] not in genes):
            genes[frag_scores[name][1]] = {}
        if (frag_scores[name][2] == 'forward'):
            genes[frag_scores[name][1]][name] = fragments[name]
        else:
            genes[frag_scores[name][1]][name] = reverse_sequence(
                fragments[name])
    genes.pop("NA", None)
    for gene in genes.keys():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(genes[gene], gene_file + ".fixed")
    return genes
예제 #4
0
def hmmer_to_markers(input, temp_dir):
    global marker_genes
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = dict([(name + "_rev", reverse_sequence(seq)) for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Now bin the fragments
    frag_scores = dict([(name, [-10000, "NA", "NA"]) for name in fragments.keys()])
    gene_set = marker_genes
    align_name = "sate"
    if options().genes == "cogs":
        gene_set = cog_genes
        align_name = "pasta"
    for gene in gene_set:
        # Now run HMMER search
        hmmer_search(
            frag_file,
            os.path.join(
                options().__getattribute__("reference").path, "refpkg/%s.refpkg/%.profile" % (gene, align_name)
            ),
            temp_dir + "/%s.out" % gene,
        )
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        # Now select best direction for each frag
        for name in results.keys():
            bitscore = results[name][1]
            direction = "forward"
            true_name = name
            if name.find("_rev") != -1:
                true_name = true_name.replace("_rev", "")
                direction = "reverse"
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = dict([])
    for name in frag_scores.keys():
        if frag_scores[name][1] not in genes:
            genes[frag_scores[name][1]] = {}
        if frag_scores[name][2] == "forward":
            genes[frag_scores[name][1]][name] = fragments[name]
        else:
            genes[frag_scores[name][1]][name] = reverse_sequence(fragments[name])
    genes.pop("NA", None)
    for gene in genes.keys():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(genes[gene], gene_file + ".fixed")
    return genes
예제 #5
0
def main():
  args = parse_args()
  sequences = MutableAlignment()
  assert os.path.isfile(args.input) and os.access(args.input, os.R_OK), "Input file %s does not exist\n" % args.input
  sequences.read_file_object(args.input)
  frag = MutableAlignment()
  full = MutableAlignment()
  
  for (key,seq) in sequences.items():
    if (len(seq) <= args.threshold):
      frag[key]=seq
    else:
      full[key]=seq
  frag.write_to_path("%s.frag.fas" % args.output)
  full.write_to_path("%s.full.fas" % args.output)
예제 #6
0
def main():
    args = parse_args()
    sequences = MutableAlignment()
    assert os.path.isfile(args.input) and os.access(
        args.input, os.R_OK), "Input file %s does not exist\n" % args.input
    sequences.read_file_object(args.input)
    frag = MutableAlignment()
    full = MutableAlignment()

    for (key, seq) in sequences.items():
        if (len(seq) <= args.threshold):
            frag[key] = seq
        else:
            full[key] = seq
    frag.write_to_path("%s.frag.fas" % args.output)
    full.write_to_path("%s.full.fas" % args.output)
예제 #7
0
    def testExtendedAlignment(self):
        print "======= starting testExtendedAlignment ========="

        subset = [
            "SFIF", "SFII", "SCFC", "SGHD", "SDCC", "SBGE", "SFBB", "SDI",
            "SCGB", "SJGF", "SGBI", "SCJA", "SGAD", "SHEB", "SFHB", "SDJI",
            "SHED", "SJJJ", "SBBE", "SCCH", "SDJB", "SDAC", "SHEH", "SFDC",
            "SFEI", "SHHB", "SC", "SIAB", "SDDI", "SBCB", "SJB", "SEBD",
            "SFGD", "SHA", "SIDA", "SGHI", "SGIB", "SBFJ", "SFIE", "SCJF",
            "SJHJ", "SJBG", "SEJI", "SFFF", "SJ", "SIII", "SJHH", "SEIH",
            "SBDC", "SHDJ", "SJDD", "SGDB", "SIHA", "SIBB", "SECC", "SCAD",
            "SGBB", "SGIF", "SJHC", "SFCD", "SEAA", "SEFF", "SDFG", "SDJE",
            "SCFG", "SFH", "SCJ", "SDDD", "SEGD", "SCIH", "SDAG", "SCJE",
            "SFAJ", "SIDJ", "SE", "SHBC", "SJFF", "SCHD", "SBHA", "SEDF",
            "SFAF", "SEDD", "SDHD", "SGJD", "SIBH", "SGDF", "SIFA", "SJGA",
            "SIJB", "SFI", "SGA", "SBFC", "SBJA", "SFFC", "SFDH", "SFEE",
            "SBDF", "SGBJ", "SDHE", "SJIB", "SHHI", "SIDE", "SJII"
        ]

        alg = MutableAlignment()
        alg.read_filepath("data/simulated/test.fasta")
        alg.delete_all_gap()
        tlen = alg.get_length()

        frg = MutableAlignment()
        frg.read_filepath("data/simulated/test.fas")
        #print frg.get_num_taxa()

        pp = SeppProblem(alg.keys())
        pp.fragments = frg
        pp.subalignment = alg

        cp1 = SeppProblem(subset, pp)
        cp2 = SeppProblem(list(set(alg.keys()) - set(subset)), pp)
        cp1.fragments = ReadonlySubalignment(
            [k for k in frg.keys() if int(k[-1]) >= 9], frg)
        cp2.fragments = ReadonlySubalignment(
            [k for k in frg.keys() if int(k[-1]) <= 1], frg)

        cp1labels = cp1.write_subalignment_without_allgap_columns(
            "data/tmp/cp1.fasta")
        cp2labels = cp2.write_subalignment_without_allgap_columns(
            "data/tmp/cp2.fasta")
        tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta")
        assert all(
            [not tmp.is_all_gap(pos) for pos in xrange(0, tmp.get_length())])
        tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta")
        assert all(
            [not tmp.is_all_gap(pos) for pos in xrange(0, tmp.get_length())])

        cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
        cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")
        '''We have done the hmmalign before. don't worry about that right now'''

        ext1 = ExtendedAlignment(cp1.fragments)
        ext1.build_extended_alignment("data/tmp/cp1.fasta",
                                      "data/tmp/cp1.extended.sto")
        ext1.relabel_original_columns(cp1labels)
        ext2 = ExtendedAlignment(cp2.fragments)
        ext2.build_extended_alignment("data/tmp/cp2.fasta",
                                      "data/tmp/cp2.extended.sto")
        ext2.relabel_original_columns(cp2labels)

        extmerger = ExtendedAlignment([])
        extmerger.merge_in(ext1)
        mixed = extmerger.merge_in(ext2)

        extmerger.write_to_path("data/tmp/extended.merged.fasta")

        assert extmerger.is_aligned(), "Merged alignment is not aligned"
        in1 = len([x for x in ext1._col_labels if x < 0])
        in2 = len([x for x in ext2._col_labels if x < 0])
        print "Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % (
            extmerger.get_length(), in1, in2, tlen)
        assert (in1 + in2 + tlen - mixed) == extmerger.get_length(
        ), "Lengths don't match up after merging. Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d" % (
            extmerger.get_length(), in1, in2, tlen, mixed)
        assert (in1 + in2 - mixed) == len(
            list(extmerger.iter_insertion_columns())
        ), "Columns are not correctly labeled after merging. Merged insertion count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d" % (
            len(list(extmerger.iter_insertion_columns())), in1, in1, mixed)

        tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
        tmp.delete_all_gap()
        assert tmp.is_aligned(), "merged alignment should be aligned!"
        assert tmp.get_length() == tlen, "merged alignment has wrong length"
        assert all([alg[k] == s for (k, s) in tmp.items()
                    ]), "merged alignment should match original alignment"

        print "======= finished testExtendedAlignment ========="
예제 #8
0
    def testExtendedAlignment(self):
        print "======= starting testExtendedAlignment ========="

        subset = ["SFIF","SFII","SCFC","SGHD","SDCC","SBGE","SFBB","SDI","SCGB","SJGF","SGBI","SCJA","SGAD","SHEB","SFHB","SDJI","SHED","SJJJ","SBBE","SCCH","SDJB","SDAC","SHEH","SFDC","SFEI","SHHB","SC","SIAB","SDDI","SBCB","SJB","SEBD","SFGD","SHA","SIDA","SGHI","SGIB","SBFJ","SFIE","SCJF","SJHJ","SJBG","SEJI","SFFF","SJ","SIII","SJHH","SEIH","SBDC","SHDJ","SJDD","SGDB","SIHA","SIBB","SECC","SCAD","SGBB","SGIF","SJHC","SFCD","SEAA","SEFF","SDFG","SDJE","SCFG","SFH","SCJ","SDDD","SEGD","SCIH","SDAG","SCJE","SFAJ","SIDJ","SE","SHBC","SJFF","SCHD","SBHA","SEDF","SFAF","SEDD","SDHD","SGJD","SIBH","SGDF","SIFA","SJGA","SIJB","SFI","SGA","SBFC","SBJA","SFFC","SFDH","SFEE","SBDF","SGBJ","SDHE","SJIB","SHHI","SIDE","SJII"]
         
        alg = MutableAlignment()
        alg.read_filepath("data/simulated/test.fasta")
        alg.delete_all_gap()
        tlen = alg.get_length()                    
        
        frg = MutableAlignment()
        frg.read_filepath("data/simulated/test.fas")
        #print frg.get_num_taxa()
        
        pp = SeppProblem(alg.keys())
        pp.fragments = frg
        pp.subalignment = alg
        
        cp1 = SeppProblem(subset, pp)
        cp2 = SeppProblem(list(set(alg.keys()) -set(subset)), pp)
        cp1.fragments = ReadonlySubalignment([k for k in frg.keys() if int(k[-1]) >= 9], frg)
        cp2.fragments = ReadonlySubalignment([k for k in frg.keys() if int(k[-1]) <= 1], frg)
        
        cp1labels = cp1.write_subalignment_without_allgap_columns("data/tmp/cp1.fasta")
        cp2labels = cp2.write_subalignment_without_allgap_columns("data/tmp/cp2.fasta")
        tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta")
        assert all([not tmp.is_all_gap(pos) for pos in xrange(0,tmp.get_length())])        
        tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta")
        assert all([not tmp.is_all_gap(pos) for pos in xrange(0,tmp.get_length())])
        
        cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
        cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")
        
        '''We have done the hmmalign before. don't worry about that right now'''
        
        ext1 = ExtendedAlignment(cp1.fragments)
        ext1.build_extended_alignment("data/tmp/cp1.fasta", "data/tmp/cp1.extended.sto")
        ext1.relabel_original_columns(cp1labels)
        ext2 = ExtendedAlignment(cp2.fragments)
        ext2.build_extended_alignment("data/tmp/cp2.fasta", "data/tmp/cp2.extended.sto")
        ext2.relabel_original_columns(cp2labels)
        
        extmerger = ExtendedAlignment([])
        extmerger.merge_in(ext1)
        mixed = extmerger.merge_in(ext2)
                        
        extmerger.write_to_path("data/tmp/extended.merged.fasta")        

        assert extmerger.is_aligned(), "Merged alignment is not aligned"
        in1 = len([x for x in ext1._col_labels if x<0])
        in2 = len([x for x in ext2._col_labels if x<0])
        print "Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" %(extmerger.get_length(),in1 , in2 , tlen)
        assert ( in1 + in2 + tlen - mixed) == extmerger.get_length(), "Lengths don't match up after merging. Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d"  %(extmerger.get_length(),in1, in2 , tlen, mixed)
        assert ( in1 + in2 - mixed) == len(list(extmerger.iter_insertion_columns())), "Columns are not correctly labeled after merging. Merged insertion count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d"  %(len(list(extmerger.iter_insertion_columns())),in1 , in1, mixed)
         
        
        tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
        tmp.delete_all_gap()
        assert tmp.is_aligned(), "merged alignment should be aligned!"
        assert tmp.get_length() == tlen, "merged alignment has wrong length"
        assert all([alg[k] == s for (k,s) in tmp.items()]), "merged alignment should match original alignment"

        
        print "======= finished testExtendedAlignment ========="
예제 #9
0
        base_alignment.get_sequence_names())
    print("Orig alignment sequences: ", original_backbone.get_sequence_names())
    print("Base alignment sequences: ", base_alignment.get_sequence_names())

    print("num base alignment seqs: ",
          len(base_alignment.get_sequence_names()))

    frags = MutableAlignment()
    sequence_names = []
    for file in sequence_files:
        seq = MutableAlignment()
        done = seq.read_filepath(file)
        print('query sequence names:', seq.get_sequence_names())
        print("query sequence length:", seq.get_length())
        done = sequence_names.extend(seq.get_sequence_names())
        for name, seq in seq.items():
            frags[name] = seq.upper()

    problem = SeppProblem(sequence_names)
    problem.set_subalignment(subbackbone)

    # constructs sub-base-alignment from the full alignment, delete all gaps
    mut_subalg = problem.subalignment.get_mutable_alignment()
    remaining_cols = mut_subalg.delete_all_gap()
    problem.annotations["ref.alignment.columns"] = remaining_cols
    print("num remaining_cols: ", len(remaining_cols))
    problem.fragments = frags

    ap_alg = problem.read_extendend_alignment_and_relabel_columns(
        base_alignment_file, aligned_files)
    extendedAlignment.merge_in(ap_alg, convert_to_string=True)
예제 #10
0
for dir in dirs:
    print("Working on %s\n" % dir)
    aligned_files = glob.glob('%sFC_*/hmmalign.results.*' % dir)
    sequence_files = glob.glob('%sFC_*/hmmalign.frag.*' % dir)
    base_alignment_file = glob.glob('%s/*.fasta' % dir)
    base_alignment = MutableAlignment()
    done = base_alignment.read_filepath(base_alignment_file[0])
    subbackbone = original_backbone.get_soft_sub_alignment(
        base_alignment.get_sequence_names())
    frags = MutableAlignment()
    sequence_names = []
    for file in sequence_files:
        seq = MutableAlignment()
        done = seq.read_filepath(file)
        done = sequence_names.extend(seq.get_sequence_names())
        for name, seq in seq.items():
            frags[name] = seq.upper()
    problem = SeppProblem(sequence_names)
    problem.set_subalignment(subbackbone)

    mut_subalg = problem.subalignment.get_mutable_alignment()
    remaining_cols = mut_subalg.delete_all_gap()
    problem.annotations["ref.alignment.columns"] = remaining_cols
    problem.fragments = frags
    ap_alg = problem.read_extendend_alignment_and_relabel_columns(
        base_alignment_file, aligned_files)
    extendedAlignment.merge_in(ap_alg, convert_to_string=False)

extendedAlignment.write_to_path("/projects/sate8/namphuon/ultra_large/1000000/"
                                "upp_100_10_new/upp.unmasked.fasta")
extendedAlignment.remove_insertion_columns()