def test_interleaved_phy__different_length_names_2():
    msa = MSA([FASTA("Burchelli_4", None, "ACGTTGATAACCAGG"),
               FASTA("Donkey", None, "TGCAGAGTACGACGT")])
    expected = \
        """2 15

Burchelli_4  ACGTTGATAA CCAGG
Donkey       TGCAGAGTAC GACGT"""
    print interleaved_phy(msa), expected
    assert_equal(interleaved_phy(msa), expected)
def test_interleaved_phy__different_length_names_1():
    msa = MSA([FASTA("A_short_name", None, "ACGTTGATAACCAGG"),
               FASTA("Another_really_long_sequence_name_that_is_too_long",
                     None, "TGCAGAGTACGACGT")])
    expected = \
        """2 15

A_short_name                    ACGTTGATAA CCAGG
Another_really_long_sequence_n  TGCAGAGTAC GACGT"""
    print interleaved_phy(msa), expected
    assert_equal(interleaved_phy(msa), expected)
def filter_singletons(self, to_filter, filter_using):
    """Returns a new MSA in which positions in the 'to_filter' record that are
    not supported by any of the 'filter_using' records are masked: the call is
    replaced by the (lower-cased) genotype allowed by the other records, or by
    'n' when there is no overlap at all."""
    included, excluded, to_filter \
        = self._group(filter_using, to_filter)

    sequence = list(to_filter.sequence)
    sequences = [record.sequence.upper() for record in included]
    for (index, nts) in enumerate(zip(*sequences)):
        current_nt = sequence[index].upper()
        if current_nt in "N-":
            continue

        # Collect the set of nucleotides observed in the other records
        allowed_nts = set()
        for allowed_nt in nts:
            if allowed_nt not in "N-":
                allowed_nts.update(NT_CODES[allowed_nt])
        filtered_nts = frozenset(NT_CODES[current_nt]) & allowed_nts

        if not filtered_nts:
            filtered_nts = "N"

        genotype = encode_genotype(filtered_nts)
        if genotype != current_nt:
            sequence[index] = genotype.lower()
    new_record = FASTA(to_filter.name, to_filter.meta, "".join(sequence))

    return MSA([new_record] + included + excluded)
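# Illustrative usage sketch (not from the original source); it assumes that
# 'to_filter' and 'filter_using' name records already present in the MSA, as
# suggested by the '_group' call above:
#
#   filtered_msa = msa.filter_singletons("sample_1", ["sample_2", "sample_3"])
#   # Calls in "sample_1" not supported by "sample_2" or "sample_3" are
#   # masked (lower-cased, or 'n' when there is no overlap at all).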
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--genotype", help="Tabix indexed pileup file.",
                        required=True)
    parser.add_argument("--intervals", help="BED file.", required=True)
    parser.add_argument("--padding", type=int, default=10,
                        help="Number of bases to expand intervals, when "
                             "filtering based on adjacent indels "
                             "[%(default)s]")
    parser.add_argument("--min-distance-to-indels", type=int, default=5,
                        help="Variants closer than this distance from indels "
                             "are filtered [%(default)s].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    with open(args.intervals) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

    for (_, beds) in sorted(intervals.items()):
        for (name, sequence) in build_genes(args, genotype, beds):
            FASTA(name, None, sequence).write(sys.stdout)

    return 0
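# Example invocation (illustrative only; the script name and input file names
# below are assumptions, not part of the original source):
#
#   python extract_genes.py --genotype genotypes.pileup.bgz \
#       --intervals genes.bed --padding 10 \
#       --min-distance-to-indels 5 > genes.fasta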
@classmethod
def from_lines(cls, lines):
    """Parses a MSA from a file/list of lines and returns a MSA object.
    Meta information included after the first space in the header of each
    sequence is preserved on the individual records:
      >NAME META-INFORMATION
      SEQUENCE
    As suggested above, sequences are expected to be in FASTA format."""
    return MSA(FASTA.from_lines(lines))
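# Illustrative usage sketch (not from the original source):
#
#   lines = [">seq1 some-meta\n", "ACGT\n",
#            ">seq2\n", "TGCA\n"]
#   msa = MSA.from_lines(lines)
#   # "some-meta" is available as the 'meta' field of the first record.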
def test_fasta__from_lines__multiple_records():
    lines = [">first\n",
             "TGTTCTCCACCGTGCACAAC\n",
             "CCTTCATCCA\n",
             ">Second XT:1:0\n",
             "GAGAGCTCAGCTAAC\n",
             ">Third\n",
             "CGCTGACCAAAAACGGACAG\n",
             "GGCATTCGGC\n"]
    expected = [FASTA("first", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA"),
                FASTA("Second", "XT:1:0", "GAGAGCTCAGCTAAC"),
                FASTA("Third", None, "CGCTGACCAAAAACGGACAGGGCATTCGGC")]
    assert_list_equal(FASTA.from_lines(lines), expected)
def __init__(self, options, filename):
    genome = list(FASTA.from_file(filename))
    assert len(genome) == 1, len(genome)
    self._genome = genome[0].sequence.upper()
    self._sequence = None
    self._positions = None
    self._annotations = None

    self._mutate(options)
def _setup(self, _config, temp):
    self._update_ctl_file(source=self._control_file,
                          destination=os.path.join(temp, "template.ctl"))

    os.symlink(os.path.abspath(self._trees_file),
               os.path.join(temp, "template.trees"))

    with open(os.path.join(temp, "template.seqs"), "w") as handle:
        for record in FASTA.from_file(self._sequence_file):
            if record.name not in self._exclude_groups:
                name = record.name
                sequence = record.sequence.upper()
                handle.write("%s\n" % (FASTA(name, None, sequence),))
def _run(self, _config, temp):
    fasta_files = []
    for (name, filename) in sorted(self._infiles.iteritems()):
        fasta_files.append((name, pysam.Fastafile(filename)))

    for sequence_name in sorted(self._sequences):
        filename = os.path.join(temp, sequence_name + ".fasta")
        with open(filename, "w") as out_handle:
            for (sample, fasta_file) in fasta_files:
                sequence = fasta_file.fetch(sequence_name)
                fasta = FASTA(sample, sequence_name, sequence)
                out_handle.write(str(fasta))
def reduce(self):
    columns = []
    uncalled = frozenset("Nn-")
    for column in izip(*(record.sequence for record in self)):
        if frozenset(column) - uncalled:
            columns.append(column)

    if not columns:
        return None

    records = []
    for (record, sequence) in izip(self, izip(*columns)):
        records.append(FASTA(record.name, record.meta, "".join(sequence)))

    return MSA(records)
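# Illustrative sketch (not from the original source): columns consisting only
# of uncalled bases ('N', 'n', or '-') are removed; if every column is
# uncalled, the result is None.
#
#   msa = MSA([FASTA("a", None, "AC-N"),
#              FASTA("b", None, "TC-N")])
#   reduced = msa.reduce()
#   # reduced contains the records "a" -> "AC" and "b" -> "TC".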
@classmethod
def join(cls, *msas):
    """Merge multiple MSAs into a single MSA, by concatenating sequences in
    the order of the passed MSAs. Sequences are joined by name, and all MSAs
    must therefore contain the same set of sequence names. Meta information
    is not preserved."""
    cls.validate(*msas)

    merged = defaultdict(list)
    for msa in msas:
        for record in msa:
            merged[record.name].append(record.sequence)

    sequences = []
    for (name, sequence) in merged.iteritems():
        sequences.append(FASTA(name, None, "".join(sequence)))
    return MSA(sequences)
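# Illustrative sketch (not from the original source): joining MSAs that
# contain the same record names concatenates the sequences per name, in the
# order the MSAs are passed.
#
#   msa_1 = MSA([FASTA("a", None, "ACG"), FASTA("b", None, "TGA")])
#   msa_2 = MSA([FASTA("a", None, "TTT"), FASTA("b", None, "CCC")])
#   joined = MSA.join(msa_1, msa_2)
#   # "a" -> "ACGTTT", "b" -> "TGACCC"; meta information is dropped.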
def split(self, split_by="123"):
    """Splits a MSA and returns a dictionary of keys to MSAs, using the keys
    in the 'split_by' parameter at the top level. See also
    pypeline.common.sequences.split."""
    self.validate(self)
    if not split_by:
        raise TypeError("No partitions to split by specified")

    results = dict((key, set()) for key in split_by)
    for record in self:
        for (key, partition) in split(record.sequence, split_by).iteritems():
            results[key].add(FASTA(record.name, None, partition))

    for (key, value) in results.items():
        results[key] = MSA(value)

    return results
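# Illustrative sketch (not from the original source): with the default
# split_by="123", alignment columns are assigned cyclically to the keys
# "1", "2" and "3" (e.g. codon positions).
#
#   msa = MSA([FASTA("a", None, "ACGTACGTA")])
#   parts = msa.split("123")
#   # parts["1"], parts["2"] and parts["3"] are MSAs containing columns
#   # 1/4/7, 2/5/8 and 3/6/9 of the original alignment, respectively.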
def _run(self, _config, temp):
    def _by_name(bed):
        return bed.name

    fastafile = pysam.Fastafile(self._reference)
    seqs = collections.defaultdict(list)
    with open(self._bedfile) as bedfile:
        bedrecords = text.parse_lines_by_contig(bedfile, BEDRecord)

    for (contig, beds) in sorted(bedrecords.iteritems()):
        beds.sort(key=lambda bed: (bed.contig, bed.name, bed.start))
        for (gene, gene_beds) in itertools.groupby(beds, _by_name):
            gene_beds = tuple(gene_beds)
            sequence = self._collect_sequence(fastafile, gene_beds)
            seqs[(contig, gene)] = sequence

    temp_file = os.path.join(temp, "sequences.fasta")
    with open(temp_file, "w") as out_file:
        for ((_, gene), sequence) in sorted(seqs.items()):
            FASTA(gene, None, sequence).write(out_file)

    fileutils.move_file(temp_file, self._outfile)
def test_fasta__from_file__compressed_bz2():
    expected = [FASTA("This_is_BZ_FASTA!", None, "CGTNA"),
                FASTA("This_is_ALSO_BZ_FASTA!", None, "ACGTN")]
    results = list(FASTA.from_file("tests/data/fasta_file.fasta.bz2"))
    assert_equal(results, expected)
def test_fasta__unimplemented_comparison():
    assert_is(NotImplemented, FASTA("A", None, "C").__eq__(10))
    assert_is(NotImplemented, FASTA("A", None, "C").__lt__(10))
    assert_is(NotImplemented, FASTA("A", None, "C").__le__(10))
    assert_is(NotImplemented, FASTA("A", None, "C").__ge__(10))
    assert_is(NotImplemented, FASTA("A", None, "C").__gt__(10))
def test_fasta__from_lines__empty_record_last():
    lines = [">fasta1\n", "ACGT\n", ">fasta2\n"]
    list(FASTA.from_lines(lines))
def test_fasta__from_lines__empty_name__alone():
    lines = [">\n", "ACGT\n"]
    list(FASTA.from_lines(lines))
def test_fasta__from_lines__empty_name__with_others():
    lines = [">\n", "ACGT\n", ">Foo\n", "ACGGTA\n"]
    list(FASTA.from_lines(lines))
def test_fasta__repr__partial_line_test():
    expected = ">foobar\n%s\n" % (_SEQ_FRAG, )
    result = repr(FASTA("foobar", None, _SEQ_FRAG))
    assert_equal(result, expected)
def test_fasta__sorting_less_equal():
    assert not FASTA("A", "B", "C") < FASTA("A", "B", "C")
    assert_less(FASTA("A", "B", "C"), FASTA("B", "B", "C"))
    assert_less(FASTA("A", "B", "C"), FASTA("A", "C", "C"))
    assert_less(FASTA("A", "B", "C"), FASTA("A", "B", "D"))
    assert_less_equal(FASTA("A", "B", "C"), FASTA("A", "B", "C"))
    assert_less_equal(FASTA("A", "B", "C"), FASTA("B", "B", "C"))
    assert_less_equal(FASTA("A", "B", "C"), FASTA("A", "C", "C"))
    assert_less_equal(FASTA("A", "B", "C"), FASTA("A", "B", "D"))
def test_fasta__sorting_greater_equal():
    assert not FASTA("A", "B", "C") > FASTA("A", "B", "C")
    assert_greater(FASTA("B", "B", "C"), FASTA("A", "B", "C"))
    assert_greater(FASTA("A", "C", "C"), FASTA("A", "B", "C"))
    assert_greater(FASTA("A", "B", "D"), FASTA("A", "B", "C"))
    assert_greater_equal(FASTA("A", "B", "C"), FASTA("A", "B", "C"))
    assert_greater_equal(FASTA("B", "B", "C"), FASTA("A", "B", "C"))
    assert_greater_equal(FASTA("A", "C", "C"), FASTA("A", "B", "C"))
    assert_greater_equal(FASTA("A", "B", "D"), FASTA("A", "B", "C"))
def test_fasta__from_lines_single_record():
    lines = [">single\n", "TGTTCTCCACCGTGCACAAC\n", "CCTTCATCCA\n"]
    expected = [FASTA("single", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA")]
    assert_list_equal(FASTA.from_lines(lines), expected)
def test_fasta__from_lines__no_records():
    assert_list_equal(FASTA.from_lines([]), [])
def test_fasta__repr__multiple_lines():
    expected = ">foobar\n%s\n%s\n" \
        % (_SEQ_FRAG * 10, _SEQ_FRAG * 5)
    result = repr(FASTA("foobar", None, _SEQ_FRAG * 15))
    assert_equal(result, expected)
def test_fasta__repr__complete_line_test():
    expected = ">barfoo\n%s\n" % (_SEQ_FRAG * 10, )
    result = repr(FASTA("barfoo", None, _SEQ_FRAG * 10))
    assert_equal(result, expected)
def test_fasta__inequality():
    assert_not_equal(FASTA("A", "B", "C"), FASTA("A", "B", "D"))
    assert_not_equal(FASTA("A", "B", "C"), FASTA("A", None, "C"))
    assert_not_equal(FASTA("A", "B", "C"), FASTA("D", "B", "C"))
def test_fasta__from_lines__empty_record_name_only__first():
    list(FASTA.from_lines([">fasta1\n", ">fasta2\n", "AGTC\n"]))
def _simple_fasta_record():
    return FASTA("Dummy", "Meta-inf", "ACGT")
def test_fasta__from_lines__empty_record__middle():
    lines = [">fasta0\n", "ACGT\n", ">fasta1\n", ">fasta2\n", "AGTC\n"]
    list(FASTA.from_lines(lines))
def test_fasta__equality():
    assert_equal(FASTA("A", "B", "C"), FASTA("A", "B", "C"))
from nose.tools import assert_equal

from pypeline.common.formats.phylip import \
    sequential_phy, \
    interleaved_phy
from pypeline.common.formats.msa import \
    MSA
from pypeline.common.formats.fasta import \
    FASTA


_MSA_SHORT_SEQUENCES = \
    MSA([FASTA("seq1", None, "ACGTTGATAACCAGG"),
         FASTA("seq2", None, "TGCAGAGTACGACGT")])
_MSA_MEDIUM_SEQUENCES = \
    MSA([FASTA("seq1", None, "ACGTTGATAACCAGGAGGGATTCGCGATTGGTGGTAACGTAGCC"),
         FASTA("seq2", None, "TGCAGAGTACGACGTCTCCTAGATCCTGGACAATTTAAACCGAA")])
_MSA_LONG_SEQUENCES = \
    MSA([FASTA("seq1", None,
               "CGGATCTGCTCCTCCACTGGCCACGTTTACTGTCCCCCAACCGTT"
               "CGTCCCGACCTAGTTATACTTCTTAGCAAGGTGTAAAACCAGAGATTGAGGTTATAACG"
               "TTCCTAATCAGTTATTAAATTACCGCGCCCCGACAG"),
         FASTA("seq2", None,
               "AGTTGAAGAGGCGGAACGTTTGTAAACCGCGCTAACGTAGTTCTA"
               "CAACCAGCCACCCGGTTCGAAGGAACAACTGGTCGCCATAATTAGGCGAAACGATAGTG"
               "CACTAAGGTCAGGTGCGCCCCTGTAAATAATTAGAT")])
_MSA_MEDIUM_NAMES = \
    MSA([FASTA("A_really_long_sequence", None, "ACGTTGATAACCAGG"),
         FASTA("Another_real_long_one!", None, "TGCAGAGTACGACGT")])
def test_fasta__from_lines__missing_name__alone():
    lines = ["ACGT\n"]
    list(FASTA.from_lines(lines))
def test_fasta__from_lines__empty_record_name_only__nothing_else():
    list(FASTA.from_lines([">fasta1\n"]))
def test_fasta__hash():
    assert_equal(hash(FASTA("A", "B", "C")), hash(FASTA("A", "B", "C")))
    assert_not_equal(hash(FASTA("A", "B", "C")), hash(FASTA("B", "B", "C")))
    assert_not_equal(hash(FASTA("A", "B", "C")), hash(FASTA("A", "C", "C")))
    assert_not_equal(hash(FASTA("A", "B", "C")), hash(FASTA("A", "B", "D")))