Пример #1
0
def filterMINCED(loaded_gff, local_location):
    """Collapse each CRISPR locus of a parsed MinCED GFF into one summary row.

    Rows of ``loaded_gff`` are grouped by their 'CRISPR locus number' value.
    Within a group, a row whose 'type' contains 'CRISPR' supplies the locus
    start/end; every other row is treated as one repeat whose sequence is
    sliced out of the FASTA file ``local_location + '.fna'``.

    Arguments:
     - loaded_gff     - DataFrame with at least the columns 'seqid', 'source',
       'type', 'start', 'end', 'DR number' and 'CRISPR locus number'.
     - local_location - path prefix of the genome; '.fna' is appended to find
       the FASTA file and its basename is written to the 'seqid' column.

    Returns a DataFrame with the nine GFF column headers.  For each reported
    locus, 'score' holds the number of rows in the group, 'strand' the
    orientation guessed below, 'phase' the text after the last '=' of the
    group name and 'attributes' the repeat consensus as a string.  Groups
    named '' and groups whose name contains 'ID' are not reported.
    """
    colheaders = [
        'seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase',
        'attributes'
    ]
    fasta_path = local_location + '.fna'
    genome_name = os.path.basename(local_location)
    # One single-row frame per reported locus; joined once at the end because
    # DataFrame.append no longer exists in pandas >= 2.0.
    summary_frames = []
    for name, group in loaded_gff.groupby(by='CRISPR locus number'):
        if name == '':
            continue
        results_list = []
        lengths_list = []
        unique_reps = []
        uniq_rep_nums = []
        repeat_num = 0
        number_reps = len(group)
        for _, row in group.iterrows():
            contig_ID = row['seqid']
            source = row['source']
            type_col = row['type']
            if 'CRISPR' in type_col:
                # Locus-level row: remember the overall coordinates.
                initial_start = row['start']
                final_end = row['end']
                continue
            repeat_num += 1
            seq_start = int(row['start'])
            seq_end = int(row['end'])
            repeat_id = row["DR number"]
            # The context manager guarantees the FASTA handle is closed even
            # if parsing raises part-way through.
            with open(fasta_path) as fasta_handle:
                for record in SeqIO.parse(fasta_handle, 'fasta'):
                    # NOTE(review): substring match, so 'contig1' also
                    # matches 'contig10' - confirm this is intended.
                    if contig_ID not in record.name:
                        continue
                    record.seq = record.seq[seq_start:seq_end]
                    record.id = repeat_id
                    record.name = repeat_id
                    results_list.append(record)
                    lengths_list.append(len(record.seq))
                    if record.seq not in unique_reps:
                        unique_reps.append(record.seq)
                        uniq_rep_nums.append(repeat_num)
                    last_repeat_seq = record.seq
        if not lengths_list:
            # No repeat sequence could be extracted for this locus.
            continue
        # Orientation heuristic: take the mean ordinal of the repeats at
        # which a new distinct sequence first appeared and compare its
        # distance to the first repeat with its distance to the group size.
        avgrep = np.mean(uniq_rep_nums)
        dist1st = avgrep - 1
        distlast = number_reps - avgrep
        strand = '-' if distlast > dist1st else '+'
        if max(lengths_list) == min(lengths_list):
            # Equal-length repeats can be stacked into an alignment.
            alignment = MultipleSeqAlignment(results_list)
            consensus = AlignInfo.SummaryInfo(alignment).gap_consensus()
        else:
            # Lengths differ: fall back to the last repeat extracted.
            consensus = last_repeat_seq
        if strand == '-':
            consensus = consensus.reverse_complement()
        if 'ID' not in name:
            summary_frames.append(
                pd.DataFrame(
                    columns=colheaders,
                    data=[[
                        genome_name, source, type_col, initial_start,
                        final_end, number_reps, strand,
                        name.split('=')[-1],
                        str(consensus)
                    ]]))
    if not summary_frames:
        return pd.DataFrame(columns=colheaders)
    return pd.concat(summary_frames)
Пример #2
0
    args = parser.parse_args()

    # Normalise both options to lists so that a single value is handled by
    # the same loop as several values.
    genes = args.genes if type(args.genes)==list else [args.genes]
    translations = args.translations if type(args.translations)==list else [args.translations]

    T = Phylo.read(args.tree, 'newick')
    # Names of the tree tips; only sequences whose id is in this set are kept.
    leafs = {n.name for n in T.get_terminals()}

    # node name -> {"aa_muts": {gene: [mutation strings]}}
    node_data = {}
    # Genes and translation files are paired by position; zip() stops at the
    # shorter of the two lists.
    for gene, translation in zip(genes, translations):
        seqs = []
        for s in SeqIO.parse(translation, 'fasta'):
            if s.id in leafs:
                seqs.append(s)


        tt = TreeAnc(tree=T, aln=MultipleSeqAlignment(seqs), alphabet='aa')

        tt.infer_ancestral_sequences(reconstruct_tip_states=True)

        # NOTE(review): if `translation` does not contain '.fasta', replace()
        # returns it unchanged and this opens the input path itself for
        # writing - confirm that inputs always carry the '.fasta' suffix.
        with open(translation.replace('.fasta', '_withInternalNodes.fasta'), 'w') as fh:
            for n in tt.tree.find_clades():
                if n.name not in node_data:
                    node_data[n.name] = {"aa_muts":{}}
                # p+1 presumably converts a 0-based position to 1-based -
                # TODO confirm against the TreeAnc mutation format.
                node_data[n.name]["aa_muts"][gene] = [f"{a}{p+1}{d}" for a,p,d in n.mutations]
                fh.write(f">{n.name}\n{tt.sequence(n, as_string=True, reconstructed=True)}\n")


    # Write the per-node mutations of all genes as a single JSON document.
    with open(args.output, 'w') as fh:
        json.dump({"nodes":node_data}, fh)
Пример #3
0
def write(sequences, handle, format):
    """Write a complete set of sequences to a file and return how many.

    Arguments:
     - sequences - A list (or iterator) of SeqRecord objects; a single
       SeqRecord is also accepted and treated as a one-element list.
     - handle    - File handle object to write to, or a filename as string.
     - format    - Lower case string naming the file format to write.

    When a file handle (rather than a filename) is given, the caller is
    responsible for closing it afterwards so the data is flushed to disk.

    Raises TypeError when ``format`` is not a string or when ``handle`` is a
    SeqRecord or a list, ValueError for an empty, non lower case, read-only
    or unknown format, and RuntimeError when an underlying writer reports
    an unexpected count.

    Returns the number of records written (as an integer).
    """
    from Bio import AlignIO

    # Validate the format name up front so mistakes get a clear message.
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format.lower() != format:
        raise ValueError("Format string '%s' should be lower case" % format)

    # Catch the common mistake of passing the records where the handle goes.
    if isinstance(handle, SeqRecord):
        raise TypeError("Check arguments, handle should NOT be a SeqRecord")
    if isinstance(handle, list):
        raise TypeError("Check arguments, handle should NOT be a list")

    # A lone record is wrapped so the writers below always see an iterable.
    if isinstance(sequences, SeqRecord):
        sequences = [sequences]

    mode = "wb" if format in _BinaryFormats else "w"

    with as_handle(handle, mode) as fp:
        if format in _FormatToString:
            # Formats with a per-record string renderer.
            render = _FormatToString[format]
            count = 0
            for count, record in enumerate(sequences, start=1):
                fp.write(render(record))
        elif format in _FormatToWriter:
            # Formats with a dedicated writer class.
            count = _FormatToWriter[format](fp).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            # Alignment-only formats: bundle every record into a single
            # alignment and delegate to Bio.AlignIO.
            alignment = MultipleSeqAlignment(sequences)
            written = AlignIO.write([alignment], fp, format)
            if written != 1:
                raise RuntimeError(
                    "Internal error - the underlying writer "
                    "should have returned 1, not %r" % written
                )
            count = len(alignment)
        elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format
            )
        else:
            raise ValueError("Unknown format '%s'" % format)

        # Every branch above must have produced an integer record count.
        if not isinstance(count, int):
            raise RuntimeError(
                "Internal error - the underlying %s writer "
                "should have returned the record count, not %r" % (format, count)
            )

    return count
Пример #4
0
    def __next__(self):
        """Parse the next alignment from the handle.

        Reads from ``self.handle``, starting with the header line stored in
        ``self._header`` by a previous call if there is one.  When the header
        of a following, concatenated alignment is met, it is stored in
        ``self._header`` for the next call.

        Returns a MultipleSeqAlignment.  If a consensus line was present it
        is stored in ``column_annotations["clustal_consensus"]`` (and in
        ``_star_info``); a version found in the header is stored in
        ``_version``.

        Raises StopIteration when there is no further header line, and
        ValueError for an unknown header, a file that ends right after the
        header, malformed sequence lines, identifiers that change order
        between blocks, or a record count different from
        ``self.records_per_alignment`` (when that is not None).
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration

        # Whitelisted headers we know about
        known_headers = [
            "CLUSTAL",
            "PROBCONS",
            "MUSCLE",
            "MSAPROBS",
            "Kalign",
            "Biopython",
        ]
        if line.strip().split()[0] not in known_headers:
            raise ValueError(
                "%s is not a known CLUSTAL header: %s" %
                (line.strip().split()[0], ", ".join(known_headers)))

        # find the clustal version in the header line
        version = None
        for word in line.split():
            if word[0] == "(" and word[-1] == ")":
                word = word[1:-1]
            if word[0] in "0123456789":
                version = word
                break

        # There should be two blank lines after the header line
        line = handle.readline()
        while line.strip() == "":
            if not line:
                # readline() returns an empty string at end of file, which
                # also looks "blank"; without this check a file that stops
                # after its header would loop here forever.
                raise ValueError("Unexpected end of file after the header line")
            line = handle.readline()

        # If the alignment contains entries with the same sequence
        # identifier (not a good idea - but seems possible), then this
        # dictionary based parser will merge their sequences.  Fix this?
        ids = []
        seqs = []
        consensus = ""
        seq_cols = None  # Used to extract the consensus

        # Use the first block to get the sequence identifiers
        while True:
            if line[0] != " " and line.strip() != "":
                # Sequences identifier...
                fields = line.rstrip().split()

                # We expect there to be two fields, there can be an optional
                # "sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError(f"Could not parse line:\n{line}")

                ids.append(fields[0])
                seqs.append(fields[1])

                # Record the sequence position to get the consensus
                if seq_cols is None:
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end
                assert fields[1] == line[seq_cols]

                if len(fields) == 3:
                    # This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            f"Could not parse line, bad sequence number:\n{line}"
                        ) from None
                    # The count excludes gap characters.
                    if len(fields[1].replace("-", "")) != letters:
                        raise ValueError(
                            f"Could not parse line, invalid sequence number:\n{line}"
                        )
            elif line[0] == " ":
                # Sequence consensus line...
                assert len(ids) == len(seqs)
                assert len(ids) > 0
                assert seq_cols is not None
                consensus = line[seq_cols]
                # Nothing but whitespace may surround the consensus columns.
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                # Check for blank line (or end of file)
                line = handle.readline()
                assert line.strip() == ""
                break
            else:
                # No consensus
                break
            line = handle.readline()
            if not line:
                break  # end of file

        assert line.strip() == ""
        assert seq_cols is not None

        # Confirm all same length
        for s in seqs:
            assert len(s) == len(seqs[0])
        if consensus:
            assert len(consensus) == len(seqs[0])

        # Loop over any remaining blocks...
        done = False
        while not done:
            # There should be a blank line between each block.
            # Also want to ignore any consensus line from the
            # previous block.
            while (not line) or line.strip() == "":
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if line.split(None, 1)[0] in known_headers:
                # Found concatenated alignment.
                self._header = line
                break

            # Each further block must list the same identifiers in the
            # same order as the first block.
            for i in range(len(ids)):
                if line[0] == " ":
                    raise ValueError(f"Unexpected line:\n{line!r}")
                fields = line.rstrip().split()

                # We expect there to be two fields, there can be an optional
                # "sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError(f"Could not parse line:\n{line!r}")

                if fields[0] != ids[i]:
                    raise ValueError(
                        "Identifiers out of order? Got '%s' but expected '%s'"
                        % (fields[0], ids[i]))

                if fields[1] != line[seq_cols]:
                    # The sequence width changed (e.g. a shorter final
                    # block); the start column must still agree.
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    if start != seq_cols.start:
                        raise ValueError("Old location %s -> %i:XX" %
                                         (seq_cols, start))
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end

                # Append the sequence
                seqs[i] += fields[1]
                assert len(seqs[i]) == len(seqs[0])

                if len(fields) == 3:
                    # This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            f"Could not parse line, bad sequence number:\n{line}"
                        ) from None
                    # Here the count is cumulative over all blocks so far.
                    if len(seqs[i].replace("-", "")) != letters:
                        raise ValueError(
                            f"Could not parse line, invalid sequence number:\n{line}"
                        )

                # Read in the next line
                line = handle.readline()
            # There should now be a consensus line
            if consensus:
                assert line[0] == " "
                assert seq_cols is not None
                consensus += line[seq_cols]
                assert len(consensus) == len(seqs[0])
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                # Read in the next line
                line = handle.readline()

        assert len(ids) == len(seqs)
        if len(seqs) == 0 or len(seqs[0]) == 0:
            raise StopIteration

        if (self.records_per_alignment is not None
                and self.records_per_alignment != len(ids)):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (len(ids), self.records_per_alignment))

        records = (SeqRecord(Seq(s), id=i, description=i)
                   for (i, s) in zip(ids, seqs))
        alignment = MultipleSeqAlignment(records)
        # TODO - Handle alignment annotation better, for now
        # mimic the old parser in Bio.Clustalw
        if version:
            alignment._version = version
        if consensus:
            alignment_length = len(seqs[0])
            if len(consensus) != alignment_length:
                raise ValueError(
                    "Alignment length is %i, consensus length is %i, '%s'" %
                    (alignment_length, len(consensus), consensus))
            alignment.column_annotations["clustal_consensus"] = consensus
            # For backward compatibility prior to .column_annotations:
            alignment._star_info = consensus
        return alignment
Пример #5
0
    logger.info('Working on MSA file: ' + file)
    MSA_file = files[file]
    logger.info('Number of sequences in the MSA: ' + str(len(MSA_file)))
    # Records are split by whether spec(record id) is a key of species_temp.
    # NOTE(review): spec() presumably extracts a species key from the id -
    # confirm against its definition.
    assigned = []
    unassigned = []
    logger.info("Assigning traits to sequences for MSA")
    for x in MSA_file:
        if spec(x.id) in species_temp.keys():
            # The looked-up value is appended to the id after a '|', so it
            # can be recovered later by splitting the id on '|'.
            assigned.append(
                SeqRecord(Seq(str(x.seq)),
                          x.id + '|' + str(species_temp[spec(x.id)]), '', ''))
        else:
            unassigned.append(SeqRecord(Seq(str(x.seq)), x.id, '', ''))
    logger.info('Number of sequences with assigned traits: ' +
                str(len(assigned)))
    # Output names use the text before the FIRST '.' of `file`, so a '.'
    # anywhere earlier in the path also cuts the name there.
    MSA_file = MultipleSeqAlignment(unassigned)
    AlignIO.write(MSA_file, file.split('.')[0] + "_unassigned.fa", "fasta")

    # From here on MSA_file refers to the assigned records only.
    MSA_file = MultipleSeqAlignment(assigned)
    AlignIO.write(MSA_file, file.split('.')[0] + "_assigned.fa", "fasta")

    def temp(txt):
        """Return the text after the last '|' of ``txt`` as a float.

        When ``txt`` contains no '|', the whole string is converted.
        """
        _, _, trailing = txt.rpartition('|')
        return float(trailing)

    #retain only those sequences within the desired OGT range
    logger.info("Retaining only those sequences with trait in desired range")
    in_range = []
    for x in MSA_file:
        for ranges in OGT_range:
Пример #6
0
    def setUp(self):
        """Create three fixture sets of nucleotide records and protein alignments.

        Each set stores its protein alignment on ``self.alnN`` and the list
        of matching nucleotide records on ``self.seqlistN``; the third set
        additionally stores a codon table on ``self.codontable3``.
        """

        def record(letters, name):
            # Wrap a plain string as a SeqRecord that carries only an id.
            return SeqRecord(Seq(letters), id=name)

        # Test set 1
        self.aln1 = MultipleSeqAlignment([
            record("SGTARTKLLLLLAALCAAGGALE", "pro1"),
            record("SGTSRTKRLLLLAALGAAGGALE", "pro2"),
        ])
        self.seqlist1 = [
            record(
                "TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG",
                "pro1",
            ),
            record(
                "TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG",
                "pro2",
            ),
        ]

        # Test set 2
        #                      M  K  K  H  E L(F)L  C  Q  G  T  S  N  K  L  T  Q(L)L  G  T  F  E  D  H  F  L  S  L  Q  R  M  F  N  N  C  E  V  V
        nucleotide_pro1 = record(
            "ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC",
            "pro1",
        )
        # Earlier spaced-out draft of the pro2 sequence, kept for reference:
        # 'ATGAAAAAGCACGAGTT CTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAA TGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC'
        nucleotide_pro2 = record(
            "ATGAAAAAGCACGAGTTCTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAATGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC",
            "pro2",
        )
        # Earlier spaced-out draft of the pro3 sequence, kept for reference:
        # 'ATGAAAAAGCACGAGTT CTTTGCCAAGGGACAAGTAACAAGCTCACCC  TTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC'
        nucleotide_pro3 = record(
            "ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC",
            "pro3",
        )
        self.aln2 = MultipleSeqAlignment([
            record(
                "MKKHELLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL",
                "pro1",
            ),
            record(
                "MKKHEFLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL",
                "pro2",
            ),
            record(
                "MKKHELLCQGTSNKLTLLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL",
                "pro3",
            ),
        ])
        self.seqlist2 = [nucleotide_pro1, nucleotide_pro2, nucleotide_pro3]

        # Test set 3
        # use Yeast mitochondrial codon table
        self.aln3 = MultipleSeqAlignment([
            record("MARDHPVGHWYDRVYLQSSNTSFTKTIQ", "pro6"),
            record("MARHHPVEHWYDRVYLQSSNVSTTKTIQ", "pro7"),
            record("MAGDHPVGHWYDRVYTQSSNHSFTMTIQ", "pro8"),
        ])
        self.seqlist3 = [
            record(
                "ATGGCAAGGGACCACCCAGTTGGGCACTGATATGATCGGGTGTATTTGCAGAGTAGTAACCTTTCTTTTCTCAAGACCATCCAG",
                "pro6",
            ),
            record(
                "ATGGCAAGGCACCATCCAGTTGAGCACTGATATGATCGGGTGTATTTGCAGAGTAGTAACGTGTCTCTGCTCAAGACCATCCAG",
                "pro7",
            ),
            record(
                "ATGGCAGGGGACCACCCAGTTGGGCACTGATATGATCGTGTGTATCTGCAGAGTAGTAACCACTCTTTTCTCATGACCATCCAG",
                "pro8",
            ),
        ]
        self.codontable3 = CodonTable.unambiguous_dna_by_id[3]
Пример #7
0
    def __next__(self):
        """Return the next alignment read from the handle.

        The header line (two integers: number of sequences and sequence
        length) is taken from ``self._header`` when a previous call stored
        one there, otherwise it is read from ``self.handle``.  After the
        records are read, the handle is advanced to the next header line,
        which is stored in ``self._header`` for the following call.

        Raises StopIteration when no header line is left, and ValueError
        for a malformed header, a record count that differs from
        ``self.records_per_alignment`` (when that is not None), a record
        longer than the declared length, or a sequence containing '.'.
        """
        stream = self.handle

        # Prefer a header remembered from the previous alignment.
        header = self._header
        if header is not None:
            self._header = None
        else:
            header = stream.readline()

        if not header:
            raise StopIteration
        header = header.strip()
        fields = header.split()
        if len(fields) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs, length_of_seqs = int(fields[0]), int(fields[1])
        except ValueError:
            raise ValueError("First line should have two integers") from None

        assert self._is_header(header)

        expected = self.records_per_alignment
        if expected is not None and expected != number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (number_of_seqs, expected))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for _ in range(number_of_seqs):
            sequence_id, residues = self._split_id(stream.readline().rstrip())
            ids.append(sequence_id)
            while len(residues) < length_of_seqs:
                # The sequence may continue on the following lines; a blank
                # line or the end of the file ends this record.
                continuation = stream.readline().strip()
                if not continuation:
                    break
                residues = residues + continuation.replace(" ", "")
                if len(residues) > length_of_seqs:
                    raise ValueError("Found a record of length %i, "
                                     "should be %i" %
                                     (len(residues), length_of_seqs))
            if "." in residues:
                raise ValueError(_NO_DOTS)
            seqs.append(residues)

        # Skip ahead to the header of the next alignment in the file, if
        # there is one, and keep it for the next call.
        line = stream.readline()
        while line and not self._is_header(line):
            line = stream.readline()
        if line:
            self._header = line

        records = (SeqRecord(Seq(letters), id=name, name=name, description=name)
                   for (name, letters) in zip(ids, seqs))
        return MultipleSeqAlignment(records)
Пример #8
0
        # NOTE(review): the field positions used here (0 = chromosome,
        # 1 = position, 3 = REF, 4 = comma-separated ALTs, 9+ = one genotype
        # per sample) look like a VCF data line - confirm with the caller.
        items = line.strip('\n').split()
        chrom = items[0]
        pos = items[1]
        REF = items[3]
        ALTs = items[4].split(',')
        # Allele index 0 is REF, 1..n are the ALT alleles in order.
        alleles = tuple([REF] + ALTs)
        GTs = items[9:]

        for x, sample in enumerate(samples):
            # print(i)
            # Each genotype is split on '|' and its two integer allele
            # indices fill slot i of the sample's '_1' and '_2' sequences.
            seq_dict[sample + '_1'][i] = alleles[int(GTs[x].split('|')[0])]
            seq_dict[sample + '_2'][i] = alleles[int(GTs[x].split('|')[1])]

        # Kept as the original string; it is joined with '\n' on output.
        positions[i] = pos

        i += 1
        # print(i)

# Build one record per entry of seq_dict (key -> id, joined items ->
# sequence) and write them all out as a FASTA alignment.
alignment = MultipleSeqAlignment([
    SeqRecord(Seq(''.join(y), generic_dna), id=x, description='')
    for x, y in seq_dict.items()
])
AlignIO.write(alignment, args.output_prefix + '.fa', "fasta")

if args.output_positions:
    # One position per line. The with-block closes the file on exit, so no
    # explicit close() call is needed afterwards.
    with open(args.output_prefix + '.pos', 'wt') as f_out:
        f_out.write('\n'.join(positions) + '\n')

#
Пример #9
0
def run(args):
    """Estimate frequencies from a tree or from alignments and write them as JSON.

    When ``args.tree`` is set, the tree is read as newick, its tips are
    annotated with a numerical date and their metadata, and frequencies are
    estimated with either the "diffusion" or the "kde" method
    (``args.method``).  Otherwise, when ``args.alignments`` is set, mutation
    frequencies are estimated for each (gene name, alignment file) pair.
    The result is written to ``args.output`` with ``write_json``.

    Returns 1 after printing an error to stderr when the nextflu output
    format is requested together with the KDE tree method, or when an
    alignment file does not exist.  Otherwise returns None.
    """
    metadata, _columns = read_metadata(args.metadata)
    dates = get_numerical_dates(metadata, fmt='%Y-%m-%d')
    stiffness = args.stiffness
    inertia = args.inertia

    # Always bind the weights so that the KDE code paths below can never hit
    # an unbound name; they are only loaded for the KDE method.
    weights = None
    weights_attribute = None
    if args.method == "kde" and args.weights:
        with open(args.weights, "r") as fh:
            weights = json.load(fh)

        weights_attribute = args.weights_attribute

    if args.tree:
        tree = Phylo.read(args.tree, 'newick')
        tps = []
        for tip in tree.get_terminals():
            tip.attr = {"num_date": np.mean(dates[tip.name])}
            tps.append(tip.attr["num_date"])

            # Annotate tips with metadata to enable filtering and weighting of
            # frequencies by metadata attributes.
            for key, value in metadata[tip.name].items():
                tip.attr[key] = value

        if args.method == "diffusion":
            # estimate tree frequencies
            pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date)
            frequency_dict = {"pivots":format_frequencies(pivots)}
            frequency_dict["counts"] = {}

            for region in args.regions:
                # Omit strains sampled prior to the first pivot from frequency calculations.
                # (these tend to be reference strains included for phylogenetic context)
                if region=='global':
                    node_filter_func = lambda node: node.attr["num_date"] >= pivots[0]
                else:
                    # Bind the current region as a default argument so the
                    # filter does not depend on the loop variable's later value.
                    node_filter_func = lambda node, region=region: (
                        node.attr["region"] == region
                        and node.attr["num_date"] >= pivots[0])

                tree_freqs = tree_frequencies(tree, pivots, method='SLSQP',
                                              node_filter = node_filter_func,
                                              ws = max(2, tree.count_terminals()//10),
                                              stiffness = stiffness, inertia=inertia,
                                              min_clades=args.minimal_clade_size_to_estimate)

                tree_freqs.estimate_clade_frequencies()

                frequency_dict["counts"][region] = [int(x) for x in tree_freqs.counts]
                if args.output_format == "nextflu":
                    # Export frequencies in nextflu-format by region and clade id.
                    for clade_id, clade_frequencies in tree_freqs.frequencies.items():
                        frequency_dict["%s_clade:%d" % (region, clade_id)] = format_frequencies(clade_frequencies)

                else:
                    # Export frequencies in auspice-format by strain name.
                    # Postorder guarantees children are counted before parents.
                    for node in tree.find_clades(order='postorder'):
                        if node.is_terminal():
                            node.tipcount=1
                        else:
                            node.tipcount = np.sum([c.tipcount for c in node])

                        if (node.is_terminal() or args.include_internal_nodes) and node.tipcount>args.minimal_clade_size:
                            if node.name not in frequency_dict:
                                frequency_dict[node.name] = {}
                            frequency_dict[node.name][region] = format_frequencies(tree_freqs.frequencies[node.clade])

        elif args.method == "kde":
            if args.output_format == "nextflu":
                print("ERROR: nextflu format is not supported for KDE frequencies", file=sys.stderr)
                return 1

            # Estimate frequencies.
            kde_frequencies = TreeKdeFrequencies(
                sigma_narrow=args.narrow_bandwidth,
                sigma_wide=args.wide_bandwidth,
                proportion_wide=args.proportion_wide,
                pivot_frequency=args.pivot_interval,
                start_date=args.min_date,
                end_date=args.max_date,
                weights=weights,
                weights_attribute=weights_attribute,
                include_internal_nodes=args.include_internal_nodes,
                censored=args.censored
            )
            frequencies = kde_frequencies.estimate(tree)

            # Export frequencies in auspice-format by strain name.
            frequency_dict = {"pivots": list(kde_frequencies.pivots)}
            for node_name in frequencies:
                frequency_dict[node_name] = {
                    "frequencies": format_frequencies(frequencies[node_name])
                }

        write_json(frequency_dict, args.output)
        print("tree frequencies written to", args.output, file=sys.stdout)
    elif args.alignments:
        frequencies = None
        for gene, fname in zip(args.gene_names, args.alignments):
            if not os.path.isfile(fname):
                print("ERROR: alignment file not found", file=sys.stderr)
                return 1

            # Drop records whose name starts with 'NODE_' before estimating.
            aln = MultipleSeqAlignment([seq for seq in AlignIO.read(fname, 'fasta')
                                        if not seq.name.startswith('NODE_')])
            tps = np.array([np.mean(dates[seq.name]) for seq in aln])

            # Pivots are computed once, from the first alignment, and then
            # shared by all subsequent genes.
            if frequencies is None:
                pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date)
                frequencies = {"pivots":format_frequencies(pivots)}

            if args.method == "kde":
                kde_frequencies = AlignmentKdeFrequencies(
                    sigma_narrow=args.narrow_bandwidth,
                    sigma_wide=args.wide_bandwidth,
                    proportion_wide=args.proportion_wide,
                    pivot_frequency=args.pivot_interval,
                    start_date=args.min_date,
                    end_date=args.max_date,
                    weights=weights,
                    weights_attribute=weights_attribute,
                    include_internal_nodes=args.include_internal_nodes,
                    censored=args.censored
                )
                kde_frequencies.estimate(
                    aln,
                    tps
                )

                for mutation, mutation_frequencies in kde_frequencies.frequencies.items():
                    position, state = mutation.split(":")
                    frequencies["%s:%s%s" % (gene, position, state)] = format_frequencies(mutation_frequencies)
            else:
                freqs = alignment_frequencies(aln, tps, pivots, stiffness=stiffness, inertia=inertia, method='SLSQP', dtps=2.0)
                freqs.mutation_frequencies(min_freq = args.minimal_frequency, ignore_char=args.ignore_char)
                # Positions are reported 1-based in the output keys.
                frequencies.update({"%s:%d%s" % (gene, pos+1, state): format_frequencies(mutation_frequencies)
                                    for (pos, state), mutation_frequencies in freqs.frequencies.items()})
                frequencies["%s:counts" % gene] = [int(observations_per_pivot)
                                                   for observations_per_pivot in freqs.counts]

        write_json(frequencies, args.output)
        print("mutation frequencies written to", args.output, file=sys.stdout)
Пример #10
0
    def __next__(self):
        """Parse the next alignment from the handle.

        Reads one GCG MSF alignment from ``self.handle``: the known header
        line, the ``MSF: ... Type: ... Check: ... ..`` line, the ``Name:``
        lines up to ``//``, and then the sequence blocks.  If another
        alignment header follows, it is stashed in ``self._header`` for the
        next call.

        Returns a MultipleSeqAlignment whose records carry the per-sequence
        weight in ``annotations["weight"]``.

        Raises StopIteration when there is no more input, ValueError when the
        file does not follow the expected layout, and NotImplementedError for
        sequence names containing spaces.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration

        # Whitelisted headers we know about.
        known_headers = [
            "!!NA_MULTIPLE_ALIGNMENT", "!!AA_MULTIPLE_ALIGNMENT", "PileUp"
        ]
        # Examples in "Molecular Biology Software Training Manual GCG version 10"
        # by BBSRC Bioscuences IT Services (BITS), Harpenden, UK, Copyright 1996-2001
        # would often start as follows:
        #
        # !!AA_MUTIPLE_ALIGNMENT 1.0
        # PileUp of: @/usr/users2/culhane/...
        #
        # etc with other seemingly free format text before getting to the
        # MSF/Type/Check line and the following Name: lines block and // line.
        #
        # MUSCLE just has a line "PileUp", while other sources just use the line
        # "!!AA_MULTIPLE_ALIGNMENT" (amino acid) or "!!NA_MULTIPLE_ALIGNMENT"
        # (nucleotide).
        #
        # A whitespace-only first line has no first word; treat it as an
        # unknown header (ValueError) rather than crashing with IndexError.
        header_words = line.split()
        header_word = header_words[0] if header_words else ""
        if header_word not in known_headers:
            raise ValueError(
                "%s is not a known GCG MSF header: %s" %
                (header_word or repr(line), ", ".join(known_headers)))

        while line and " MSF: " not in line:
            line = handle.readline()

        if not line:
            raise ValueError(
                "Reached end of file without MSF/Type/Check header line")

        # Quoting from "Molecular Biology Software Training Manual GCG version 10"
        # by BBSRC Bioscuences IT Services (BITS), Harpenden, UK. Copyright 1996-2001.
        # Page 31:
        #
        # "Header information is before a .. (double dot) in a GCG format file.
        #  The file will also have a checksum specific for that file."
        #
        # This was followed by a single non-aligned sequence, but this convention
        # appears to also be used in the GCG MSF files. Quoting other examples in
        # this reference, page 31:
        #
        # localpileup_17.msf  MSF: 195  Type: P  January 6, 2000 15:41  Check: 4365 ..
        #
        # Except from page 148:
        #
        # localpileup_106.msf  MSF: 457  Type: P  November 28, 2000 16:09  Check: 2396 ..
        #
        # Quoting output from MUSCLE v3.8, have two leading spaces and a zero checksum:
        #
        #   MSF: 689  Type: N  Check: 0000  ..
        #
        # By observation, the MSF value is the column count, type is N (nucleotide)
        # or P (protein / amino acid).
        #
        # In a possible bug, EMBOSS v6.6.0.0 uses CompCheck: rather than Check: as shown,
        #
        # $ seqret -sequence Tests/Fasta/f002 -auto -stdout -osformat msf
        # !!NA_MULTIPLE_ALIGNMENT 1.0
        #
        #   stdout MSF: 633 Type: N 01/08/19 CompCheck: 8543 ..
        #
        #   Name: G26680     Len: 633  Check: 4334 Weight: 1.00
        #   Name: G26685     Len: 633  Check: 3818 Weight: 1.00
        #   Name: G29385     Len: 633  Check:  391 Weight: 1.00
        #
        # //
        #
        parts = line.strip("\n").split()
        offset = parts.index("MSF:")
        if (parts[offset + 2] != "Type:"
                or parts[-3] not in ("Check:", "CompCheck:")
                or parts[-1] != ".."):
            raise ValueError(
                "GCG MSF header line should be "
                "'<optional text> MSF: <int> Type: <letter> <optional date> Check: <int> ..', "
                " not: %r" % line)
        try:
            aln_length = int(parts[offset + 1])
        except ValueError:
            # Non-numeric column count; reported by the check just below.
            aln_length = -1
        if aln_length < 0:
            raise ValueError(
                "GCG MSF header line should have MSF: <int> for column count, not %r"
                % parts[offset + 1])
        seq_type = parts[offset + 3]
        if seq_type not in ["P", "N"]:
            raise ValueError(
                "GCG MSF header line should have 'Type: P' (protein) "
                "or 'Type: N' (nucleotide), not 'Type: %s'" % seq_type)

        # There should be a blank line after that header line, then the Name: lines
        #
        # In a possible bug, T-COFFEE v12.00 adds 'oo' after the names, as shown here,
        #
        # PileUp
        #
        #
        #
        #    MSF:  628  Type: P    Check:   147   ..
        #
        #  Name: AK1H_ECOLI/1-378 oo  Len:  628  Check:  3643  Weight:  1.000
        #  Name: AKH_HAEIN/1-382 oo  Len:  628  Check:  6504  Weight:  1.000
        #
        # //
        ids = []
        lengths = []
        checks = []
        weights = []
        line = handle.readline()
        while line and line.strip() != "//":
            line = handle.readline()
            if line.strip().startswith("Name: "):
                if " Len: " in line and " Check: " in line and " Weight: " in line:
                    rest = line[line.index("Name: ") + 6:].strip()
                    name, rest = rest.split(" Len: ")
                    length, rest = rest.split(" Check: ")
                    check, weight = rest.split(" Weight: ")
                    name = name.strip()
                    if name.endswith(" oo"):
                        # T-COFFEE oddity, ignore this
                        name = name[:-3]
                    if name in ids:
                        raise ValueError("Duplicated ID of %r" % name)
                    if " " in name:
                        raise NotImplementedError("Space in ID %r" % name)
                    ids.append(name)
                    # Expect aln_length <= int(length.strip()), see below
                    lengths.append(int(length.strip()))
                    checks.append(int(check.strip()))
                    weights.append(float(weight.strip()))
                else:
                    raise ValueError("Malformed GCG MSF name line: %r" % line)
        if not line:
            raise ValueError(
                "End of file while looking for end of header // line.")

        if not lengths:
            # Without this, max(lengths) below would fail with an opaque
            # "max() arg is an empty sequence" ValueError.
            raise ValueError(
                "GCG MSF header has no 'Name:' lines before the // line.")

        if aln_length != max(lengths):
            # In broken examples from IMGTHLA was possible to continue
            # https://github.com/ANHIG/IMGTHLA/issues/201
            max_length = max(lengths)
            max_count = sum(1 for _ in lengths if _ == max_length)
            raise ValueError(
                "GCG MSF header said alignment length %i, but %s of %i sequences said Len: %s"
                % (aln_length, max_count, len(ids), max_length))

        line = handle.readline()
        if not line:
            raise ValueError("End of file after // line, expected sequences.")
        if line.strip():
            raise ValueError(
                "After // line, expected blank line before sequences.")

        # Now load the sequences
        seqs = [[] for _ in ids]  # list of empty lists
        completed_length = 0
        while completed_length < aln_length:
            # Note might have a coordinate header line (seems to be optional)
            for idx, name in enumerate(ids):
                line = handle.readline()
                if idx == 0 and not line.strip():
                    # T-COFFEE uses two blank lines between blocks, rather than one
                    while line and not line.strip():
                        line = handle.readline()
                if not line:
                    raise ValueError(
                        "End of file where expecting sequence data.")
                # print("Looking for seq for %s in line: %r" % (name, line))
                words = line.strip().split()
                # Should we use column numbers, rather than assuming no spaces in names?
                if idx == 0 and words and words[0] != name:
                    # print("Actually have a coord line")
                    # Hopefully this is a coordinate header before the first seq
                    try:
                        i = int(words[0])
                    except ValueError:
                        i = -1
                    if i != completed_length + 1:
                        raise ValueError(
                            "Expected GCG MSF coordinate line starting %i, got: %r"
                            % (completed_length + 1, line))
                    if len(words) > 1:
                        # Final block usually not full 50 chars, so expect start only.
                        if len(words) != 2:
                            i = -1
                        else:
                            try:
                                i = int(words[1])
                            except ValueError:
                                i = -1
                        if i != (completed_length + 50 if completed_length +
                                 50 < aln_length else aln_length):
                            raise ValueError(
                                "Expected GCG MSF coordinate line %i to %i, got: %r"
                                % (
                                    completed_length + 1,
                                    completed_length + 50 if completed_length +
                                    50 < aln_length else aln_length,
                                    line,
                                ))
                    line = handle.readline()
                    words = line.strip().split()
                    # print("Still looking for seq for %s in line: %r" % (name, line))
                # Dealt with any coordinate header line, should now be sequence
                if not words:
                    # Should be sequence here, but perhaps its a short one?
                    if (lengths[idx] < aln_length
                            and len("".join(seqs[idx])) == lengths[idx]):
                        # Is this actually allowed in the format? Personally I would
                        # expect a line with name and a block of trailing ~ here.
                        pass
                    else:
                        raise ValueError("Expected sequence for %s, got: %r" %
                                         (name, line))
                elif words[0] == name:
                    assert len(words) > 1, line
                    # print(i, name, repr(words))
                    seqs[idx].extend(words[1:])
                else:
                    raise ValueError("Expected sequence for %r, got: %r" %
                                     (name, line))
            # TODO - check the sequence lengths thus far are consistent
            # with blocks of 50?
            completed_length += 50
            line = handle.readline()
            if line.strip():
                raise ValueError("Expected blank line, got: %r" % line)

        # Skip over any whitespace at the end...
        while True:
            line = handle.readline()
            if not line:
                # End of file, no more alignments
                break
            elif not line.strip():
                # Blank line, ignore
                pass
            elif line.strip().split()[0] in known_headers:
                # Looks like the start of another alignment:
                self._header = line
                break
            else:
                raise ValueError(
                    "Unexpected line after GCG MSF alignment: %r" % line)

        # Combine list of strings into single string, remap gaps
        seqs = ["".join(s).replace("~", "-").replace(".", "-") for s in seqs]

        # Apply any trailing padding for short sequences
        padded = False
        for idx, (length, s) in enumerate(zip(lengths, seqs)):
            if len(s) < aln_length and len(s) == length:
                padded = True
                seqs[idx] = s + "-" * (aln_length - len(s))
        if padded:
            import warnings
            from Bio import BiopythonParserWarning

            warnings.warn(
                "One of more alignment sequences were truncated and have been gap padded",
                BiopythonParserWarning,
            )

        records = (SeqRecord(
            Seq(s),
            id=i,
            name=i,
            description=i,
            annotations={"weight": w},
        ) for (i, s, w) in zip(ids, seqs, weights))

        # This will check alignment lengths are self-consistent:
        align = MultipleSeqAlignment(records)
        # Check matches the header:
        if align.get_alignment_length() != aln_length:
            raise ValueError(
                "GCG MSF headers said alignment length %i, but have %i" %
                (aln_length, align.get_alignment_length()))
        return align
Пример #11
0
    a.close()

    command = '/usr/texbin/pdflatex --file-line-error --synctex=1 -output-directory=%s --save-size=10000  %s/align.tex > /dev/null' % (
        TEMP_DIR, TEMP_DIR)

    print('Launcning command:')
    print(command)
    os.system(command)
    os.system('mv ' + TEMP_DIR + '/align.pdf %s.pdf' % title.replace(' ', '_'))

    #prof=cons_prof(alignment)
    #pylab.plot(prof)


if __name__ == '__main__':
    # Demo: render a two-sequence alignment of the Xenopus H2A core against
    # the human H2A.Z core (gaps are already present in the strings).
    human_h2a_z_core = Seq(
        'SRSQRAGLQFPVGRIHRHLKSRTTSHGRVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRITPRHLQLAIRGDEELDSLI-KATIAGGGVIPHIHKSLIG'
    )
    xenopus_h2a_core = Seq(
        'TRSSRAGLQFPVGRVHRLLRKGNYAE-RVGAGAPVYLAAVLEYLTAEILELAGNAARDNKKTRIIPRHLQLAVRNDEELNKLLGRVTIAQGGVLPNIQSVLLP'
    )

    # Xenopus H2A goes first, the human H2A.Z second.
    demo_records = [
        SeqRecord(xenopus_h2a_core, id='H2A', name='H2A'),
        SeqRecord(human_h2a_z_core, id='H2A.Z', name='H2A.Z'),
    ]
    demo_alignment = MultipleSeqAlignment(demo_records)

    get_pdf('H2A', demo_alignment, 'H2AvsH2A.Z', [0, 5, 1], True, True)
Пример #12
0
    def get_spliced(self, starts, ends, strand=1):
        """Return a multiple alignment of the exact sequence range provided.

        Accepts two lists of start and end positions on target_seqname, representing
        exons to be spliced in silico.  Returns a *MultipleSeqAlignment* of the
        desired sequences spliced together.

        *starts* should be a list of 0-based start coordinates of segments in the reference.
        *ends* should be the list of the corresponding segment ends
        (in the half-open UCSC convention:
        http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/).

        *strand* must be 1 or -1; when it differs from the strand of the
        target sequence in the fetched alignments, every returned sequence
        is reverse complemented.

        To ask for the alignment portion corresponding to the first 100
        nucleotides of the reference sequence, you would use
        ``get_spliced([0], [100])``

        Raises ValueError for an invalid strand, for alignment blocks that
        lack the target sequence or its strand information, for inconsistent
        strands between blocks, and when the spliced result does not have
        the expected number of letters.
        """
        # validate strand
        if strand not in (1, -1):
            raise ValueError("Strand must be 1 or -1, got %s" % str(strand))

        # pull all alignments that span the desired intervals
        fetched = list(self.search(starts, ends))

        # keep track of the expected letter count
        # (sum of lengths of [start, end) segments,
        # where [start, end) half-open)
        expected_letters = sum(
            end - start for start, end in zip(starts, ends))

        # if there's no alignment, return filler for the assembly of the length given
        if not fetched:
            return MultipleSeqAlignment([
                SeqRecord(Seq("N" * expected_letters), id=self._target_seqname)
            ])

        # find the union of all IDs in these alignments
        all_seqnames = {
            sequence.id for multiseq in fetched for sequence in multiseq}

        # split every record by base position
        # key: sequence name
        # value: dictionary
        #        key: position in the reference sequence
        #        value: letter(s) (including letters
        #               aligned to the "-" preceding the letter
        #               at the position in the reference, if any)
        split_by_position = {seq_name: {} for seq_name in all_seqnames}

        # keep track of what the total number of (unspliced) letters should be
        total_rec_length = 0

        # track first strand encountered on the target seqname
        ref_first_strand = None

        for multiseq in fetched:
            # find the target_seqname in this MultipleSeqAlignment and use it to
            # set the parameters for the rest of this iteration
            for seqrec in multiseq:
                if seqrec.id == self._target_seqname:
                    try:
                        if ref_first_strand is None:
                            ref_first_strand = seqrec.annotations["strand"]

                            if ref_first_strand not in (1, -1):
                                raise ValueError("Strand must be 1 or -1")
                        elif ref_first_strand != seqrec.annotations["strand"]:
                            raise ValueError(
                                "Encountered strand='%s' on target seqname, "
                                "expected '%s'" %
                                (seqrec.annotations["strand"],
                                 ref_first_strand))
                    except KeyError:
                        raise ValueError(
                            "No strand information for target seqname (%s)" %
                            self._target_seqname)
                    # length including gaps (i.e. alignment length)
                    rec_length = len(seqrec)
                    rec_start = seqrec.annotations["start"]
                    rec_end = seqrec.annotations["start"] + seqrec.annotations[
                        "size"]

                    total_rec_length += rec_end - rec_start

                    # blank out these positions for every seqname
                    # (separate loop variable: do not clobber the target record)
                    for member in multiseq:
                        for pos in range(rec_start, rec_end):
                            split_by_position[member.id][pos] = ""

                    break
            else:
                raise ValueError("Did not find %s in alignment bundle" %
                                 (self._target_seqname, ))

            # the true, chromosome/contig/etc position in the target seqname
            real_pos = rec_start

            # loop over the alignment to fill split_by_position
            for gapped_pos in range(0, rec_length):
                for seqrec in multiseq:
                    # keep track of this position's value for the target seqname
                    if seqrec.id == self._target_seqname:
                        track_val = seqrec.seq[gapped_pos]

                    # Here, a real_pos that corresponds to just after a series of "-"
                    # in the reference will "accumulate" the letters found in other sequences
                    # in front of the "-"s
                    split_by_position[
                        seqrec.id][real_pos] += seqrec.seq[gapped_pos]

                # increment the real_pos counter only when non-gaps are found in
                # the target_seqname, and we haven't reached the end of the record
                if track_val != "-" and real_pos < rec_end - 1:
                    real_pos += 1

        # make sure the number of bp entries equals the sum of the record lengths
        if len(split_by_position[self._target_seqname]) != total_rec_length:
            raise ValueError(
                "Target seqname (%s) has %s records, expected %s" %
                (self._target_seqname,
                 len(split_by_position[self._target_seqname]),
                 total_rec_length))

        # translates a position in the target_seqname sequence to its gapped length
        # (only positions holding more than one letter are recorded)
        realpos_to_len = {
            x: len(y)
            for x, y in split_by_position[self._target_seqname].items()
            if len(y) > 1
        }

        # splice together the exons
        subseq = {}

        for seqid in all_seqnames:
            seq_split = split_by_position[seqid]
            seq_splice = []

            filler_char = "N" if seqid == self._target_seqname else "-"

            # iterate from start to end, taking bases from split_by_position when
            # they exist, using N or - for gaps when there is no alignment.
            append = seq_splice.append

            for exonstart, exonend in zip(starts, ends):
                for real_pos in range(exonstart, exonend):
                    # if this seqname has this position, add it
                    if real_pos in seq_split:
                        append(seq_split[real_pos])
                    # if not, but it's in the target_seqname, add length-matched filler
                    elif real_pos in realpos_to_len:
                        append(filler_char * realpos_to_len[real_pos])
                    # it's not in either, so add a single filler character
                    else:
                        append(filler_char)

            subseq[seqid] = "".join(seq_splice)

        # make sure we're returning the right number of letters
        target_letters = len(subseq[self._target_seqname].replace("-", ""))
        if target_letters != expected_letters:
            raise ValueError(
                "Returning %s letters for target seqname (%s), expected %s" %
                (target_letters, self._target_seqname, expected_letters))

        # check to make sure all sequences are the same length as the target seqname
        ref_subseq_len = len(subseq[self._target_seqname])

        for seqid, seq in subseq.items():
            if len(seq) != ref_subseq_len:
                raise ValueError("Returning length %s for %s, expected %s" %
                                 (len(seq), seqid, ref_subseq_len))

        # finally, build a MultipleSeqAlignment object for our final sequences
        result_multiseq = []

        for seqid, seq in subseq.items():
            seq = Seq(seq)

            # flip to the requested strand if the alignments are on the other one
            if strand != ref_first_strand:
                seq = seq.reverse_complement()

            result_multiseq.append(
                SeqRecord(seq, id=seqid, name=seqid, description=""))

        return MultipleSeqAlignment(result_multiseq)
Пример #13
0
def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet):
    """Yield MultipleSeqAlignment objects parsed from a MAF file handle.

    Lines are pulled from *handle* one at a time.  An "a" line opens a bundle,
    each following "s" line contributes one SeqRecord (its ID is the source
    name, generally a species name), and a blank line or the end of the input
    closes the bundle and yields it.  If *seq_count* is given, each bundle is
    asserted to contain exactly that many records.
    """
    strand_codes = {"+": 1, "-": -1}

    bundle_open = False
    bundle_annotations = []
    bundle_records = []

    while True:
        # An exhausted handle is seen as an empty line, so the final bundle
        # is closed by the same branch that handles blank lines.
        line = next(handle, "")

        if not bundle_open:
            if line.startswith("a"):
                # start a bundle of records
                bundle_open = True
                pairs = line.strip().split()[1:]
                if len(pairs) != line.count("="):
                    raise ValueError(
                        "Error parsing alignment - invalid key in 'a' line")
                bundle_annotations = dict(pair.split("=") for pair in pairs)
            elif not line:
                # end of input while outside a bundle: we are done
                break
            # comments and any other lines between bundles are skipped
            continue

        if line.startswith("s"):
            # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
            fields = line.strip().split()
            if len(fields) != 7:
                raise ValueError(
                    "Error parsing alignment - 's' line must have 7 fields"
                )
            _, src, start, size, strand_symbol, src_size, text = fields

            # MAF "+"/"-" becomes Biopython 1/-1; anything else falls back to 1
            # TODO: issue warning, set to 0?
            anno = {
                "start": int(start),
                "size": int(size),
                "strand": strand_codes.get(strand_symbol, 1),
                "srcSize": int(src_size)
            }

            # a dot/period means "same letter as the first sequence"
            if "." in text:
                if not bundle_records:
                    raise ValueError(
                        "Found dot/period in first sequence of alignment")
                reference = str(bundle_records[0].seq)
                text = "".join(
                    ref_letter if letter == "." else letter
                    for letter, ref_letter in zip(text, reference))

            bundle_records.append(
                SeqRecord(Seq(text, alphabet),
                          id=src,
                          name=src,
                          description="",
                          annotations=anno))
        elif line.startswith(("i", "e", "q", "#")):
            # "i": TODO context before/after the preceding "s" line
            # "e": TODO size of the gap between alignments spanning this block
            # "q": TODO per-base quality; could go into letter_annotations
            # "#": comment (possibly not part of the MAF specification)
            pass
        elif not line.strip():
            # blank line or end of input: the bundle is complete
            if seq_count is not None:
                assert len(bundle_records) == seq_count

            alignment = MultipleSeqAlignment(bundle_records, alphabet)
            # TODO - Introduce an annotated alignment class?
            # See also Bio/AlignIO/FastaIO.py for same requirement.
            # For now, store the annotation a new private property:
            alignment._annotations = bundle_annotations

            yield alignment

            bundle_open = False
            bundle_annotations = []
            bundle_records = []
        else:
            raise ValueError(
                "Error parsing alignment - unexpected line:\n%s" %
                (line, ))
Пример #14
0
def createAlignment(sequences, alphabet):
    """Build a MultipleSeqAlignment from an iterable of raw sequences.

    Every entry of ``sequences`` is wrapped in a ``Seq`` using ``alphabet``
    and then in a ``SeqRecord`` whose id is ``sequence<N>``, where N counts
    from 1 in input order.  ``alphabet`` is also handed to the alignment.
    """
    numbered_records = (
        SeqRecord(Seq(residues, alphabet), id="sequence%i" % number)
        for number, residues in enumerate(sequences, start=1)
    )
    return MultipleSeqAlignment(numbered_records, alphabet)
Пример #15
0
def mview_linkage(seq_file, plot_title, mafft_exe, mview_exe):
    """Align reads to a reference with MAFFT and render the result with MView.

    ``seq_file`` is aligned with MAFFT.  The first record of the resulting
    alignment is taken as the reference; every other record is a read whose
    id ends in ``.1`` or ``.2`` (records with any other suffix are ignored).
    The reads are regrouped into a new alignment made of four sections, each
    followed by a separator row of ``=`` characters:

    1. the reference (all-gap stretches between ``N`` characters are turned
       into ``N``),
    2. read pairs whose mates do not overlap, merged into one row
       (description ``In_Pair``),
    3. read pairs whose mates overlap, kept as two rows (``Overlapped``),
    4. reads whose mate is absent (``Unpaired``).

    That alignment is written as FASTA, rendered to HTML by MView, and the
    HTML is copied to its final location without the lines containing
    ``Reference sequence`` or ``Colored by``.

    Args:
        seq_file: FASTA file to align; path parts are obtained from
            ``sep_path_basename_ext``.
        plot_title: value passed to MView's ``-title`` option.
        mafft_exe: MAFFT executable.
        mview_exe: MView executable.

    Returns:
        None.  Output goes to ``<path>/<basename>.aln``,
        ``<path>/<basename>_MView.aln``, ``<path>/<basename>_MView_tmp.html``
        and ``<path>/<basename>_MView.html``.
    """
    # define file names
    seq_path, seq_basename, _seq_ext = sep_path_basename_ext(seq_file)
    msa_file = '%s/%s.aln' % (seq_path, seq_basename)
    msa_file_mview = '%s/%s_MView.aln' % (seq_path, seq_basename)
    msa_file_mviewd_html_tmp = '%s/%s_MView_tmp.html' % (seq_path,
                                                         seq_basename)
    msa_file_mviewd_html = '%s/%s_MView.html' % (seq_path, seq_basename)

    gap_char = ' '
    break_line_char = '='

    # align sequences
    # NOTE(review): both external commands are interpolated into a shell
    # string without quoting, so paths and the title must not contain spaces
    # or shell metacharacters.
    mafft_cmd = '%s --quiet --retree 1 %s > %s' % (mafft_exe, seq_file,
                                                   msa_file)
    os.system(mafft_cmd)

    # read base id -> [aligned mate 1, aligned mate 2]; '' marks a missing mate
    mapped_reads_dict = {}
    ref_id = ''
    ref_seq = ''
    for record_index, each_seq in enumerate(AlignIO.read(msa_file, "fasta")):
        seq_id = each_seq.id
        seq_seq = str(each_seq.seq).upper()
        if record_index == 0:
            ref_id = seq_id
            ref_seq = seq_seq
            continue

        # everything before the last '.' is the pair name, the rest the mate
        seq_id_base, _, seq_id_strand = seq_id.rpartition('.')
        if seq_id_strand == '1':
            mapped_reads_dict.setdefault(seq_id_base, ['', ''])[0] = seq_seq
        elif seq_id_strand == '2':
            mapped_reads_dict.setdefault(seq_id_base, ['', ''])[1] = seq_seq

    # in the reference, a stretch between two N's that consists only of gaps
    # is itself rewritten as N's
    ref_seq_split_by_n_updated = []
    for segment in ref_seq.split('N'):
        if segment and segment == '-' * len(segment):
            segment = 'N' * len(segment)
        ref_seq_split_by_n_updated.append(segment)
    ref_seq_updated = 'N'.join(ref_seq_split_by_n_updated)

    def break_line_record():
        """Return a separator row as wide as the reference."""
        return SeqRecord(Seq(break_line_char * len(ref_seq_updated)),
                         id='#',
                         description='')

    # list holding every row of the new msa, reference first
    seq_record_list = [
        SeqRecord(Seq(ref_seq_updated), id=ref_id, description='Reference'),
        break_line_record(),
    ]

    # add paired reads to seq_record_list
    singleton_dict = {}
    overlapping_reads_dict = {}
    for each_read_base, read_pair in mapped_reads_dict.items():
        r1_seq, r2_seq = read_pair

        if (r1_seq == '') or (r2_seq == ''):
            if not ((r1_seq == '') and (r2_seq == '')):
                singleton_dict[each_read_base] = read_pair
            continue

        mates_overlap = any((bp1 != '-') and (bp2 != '-')
                            for bp1, bp2 in zip(r1_seq, r2_seq))
        if mates_overlap:
            overlapping_reads_dict[each_read_base] = read_pair
        else:
            # no column has a base in both mates, so each column is whichever
            # mate is not a gap there (or a gap when both are)
            merge_r1_r2 = ''.join(bp2 if bp1 == '-' else bp1
                                  for bp1, bp2 in zip(r1_seq, r2_seq))
            seq_record_list.append(
                SeqRecord(Seq(merge_r1_r2),
                          id=each_read_base,
                          description='In_Pair'))

    seq_record_list.append(break_line_record())

    # add overlapped reads to seq_record_list
    for each_overlapping_reads, (r1_seq,
                                 r2_seq) in overlapping_reads_dict.items():
        r1_id = '%s.1' % each_overlapping_reads
        r2_id = '%s.2' % each_overlapping_reads
        seq_record_list.append(
            SeqRecord(Seq(r1_seq), id=r1_id, description='Overlapped'))
        seq_record_list.append(
            SeqRecord(Seq(r2_seq), id=r2_id, description='Overlapped'))

    seq_record_list.append(break_line_record())

    # add singleton to seq_record_list
    for each_singleton, (r1_seq, r2_seq) in singleton_dict.items():
        if (r1_seq != '') and (r2_seq == ''):
            seq_record_list.append(
                SeqRecord(Seq(r1_seq),
                          id='%s.1' % each_singleton,
                          description='Unpaired'))
        if (r1_seq == '') and (r2_seq != ''):
            seq_record_list.append(
                SeqRecord(Seq(r2_seq),
                          id='%s.2' % each_singleton,
                          description='Unpaired'))

    seq_record_list.append(break_line_record())

    # get updated msa record
    align_record_mview = MultipleSeqAlignment(seq_record_list)

    # write out updated msa
    with open(msa_file_mview, 'w') as msa_file_mview_handle:
        AlignIO.write(align_record_mview, msa_file_mview_handle, 'fasta')

    # visualize update msa
    # coloring: any,identity,mismatch,consensus,group
    mview_parameter_str = '-in fasta -moltype dna -colormap CLUSTAL_NUC -coloring any -css on -html head -ruler off -label0 -label4 -label5 -gap "%s"' % gap_char
    mview_cmd = '%s %s -title %s %s > %s' % (mview_exe, mview_parameter_str,
                                             plot_title, msa_file_mview,
                                             msa_file_mviewd_html_tmp)
    os.system(mview_cmd)

    # copy the MView page, leaving out its reference/coloring caption lines;
    # the output is opened first so it is created even if the input is missing
    with open(msa_file_mviewd_html, 'w') as msa_file_mviewd_html_handle, \
            open(msa_file_mviewd_html_tmp) as msa_file_mviewd_html_tmp_handle:
        for each_line in msa_file_mviewd_html_tmp_handle:
            if not (('Reference sequence' in each_line) or
                    ('Colored by' in each_line)):
                msa_file_mviewd_html_handle.write(each_line)
Пример #16
0
    # Optional output destination; when -o is not given, standard output is
    # used.
    parser.add_argument("-o",
                        dest='output',
                        type=argparse.FileType('w+'),
                        default=sys.stdout)
    args = parser.parse_args()

    # Index the records of the real and of the simulated alignment by their
    # record id.
    in_sequences = {
        x.id: x
        for x in AlignIO.read(args.input_real, phylo.FASTA)
    }
    sim_in_sequences = {
        x.id: x
        for x in AlignIO.read(args.input_sim, phylo.FASTA)
    }
    # The tree is built from the real sequences only.
    real_tree = phylo.build_phylogenetic_tree(
        MultipleSeqAlignment(in_sequences.values()))

    # Number of distinct record ids in the real alignment.
    N = len(in_sequences)

    # Stop with exit status -1 when there are fewer sequences than the
    # configured minimum.
    if N < args.min_tree_size:
        sys.stderr.write("Tree size is too small!\n")
        sys.exit(-1)

    # NOTE(review): the bare ``except`` below also catches SystemExit and
    # KeyboardInterrupt and discards the original error message; consider
    # narrowing it to the exception root_at_midpoint() actually raises.
    try:
        real_tree.root_at_midpoint()
    except:
        sys.stderr.write("Error processing tree!\n")
        sys.exit(-1)

    p_functions = [
        phylo.get_sackin_index, phylo.get_colless_index, phylo.count_cherries,
Пример #17
0
def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
    """Alignment iterator for the FASTA tool's pairwise alignment output.

    This is for reading the pairwise alignments output by Bill Pearson's
    FASTA program when called with the -m 10 command line option for machine
    readable output.  For more details about the FASTA tools, see the website
    http://fasta.bioch.virginia.edu/ and the paper:

         W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    This class is intended to be used via the Bio.AlignIO.parse() function
    by specifying the format as "fasta-m10" as shown in the following code:

        from Bio import AlignIO
        handle = ...
        for a in AlignIO.parse(handle, "fasta-m10"):
            assert len(a) == 2, "Should be pairwise!"
            print("Alignment length %i" % a.get_alignment_length())
            for record in a:
                print("%s %s %s" % (record.seq, record.name, record.id))

    Note that this is not a full blown parser for all the information
    in the FASTA output - for example, most of the header and all of the
    footer is ignored.  Also, the alignments are not batched according to
    the input queries.

    Also note that there can be up to about 30 letters of flanking region
    included in the raw FASTA output as contextual information.  This is NOT
    part of the alignment itself, and is not included in the resulting
    MultipleSeqAlignment objects returned.

    Arguments:
     - handle   - input handle; its ``name`` attribute is printed in the
                  debug dump produced when an alignment cannot be built.
     - alphabet - alphabet for the sequences; None is treated the same as
                  the default single_letter_alphabet.
    """
    if alphabet is None:
        alphabet = single_letter_alphabet

    # Parser states.
    state_PREAMBLE = -1
    state_NONE = 0
    state_QUERY_HEADER = 1
    state_ALIGN_HEADER = 2
    state_ALIGN_QUERY = 3
    state_ALIGN_MATCH = 4
    state_ALIGN_CONS = 5

    def build_hsp():
        """Build a two-record alignment from the currently collected data.

        Reads query_id/match_id, query_descr/match_descr, query_seq/match_seq,
        query_tags/match_tags, header_tags, align_tags and global_tags from
        the enclosing scope.  Raises ValueError when neither the query nor the
        match has any tags, and re-raises AssertionError (after printing a
        debug dump) when the extracted query and match regions differ in
        length.
        """
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r"
                             % (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        evalue = align_tags.get("fa_expect", None)
        q = "?"  # Just for printing len(q) in debug below
        m = "?"  # Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                # Quick hack until I can work out how -, * and / characters
                # and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError:
            # Dump everything relevant before propagating.  Each print call
            # takes a single argument so the output is the same whether or
            # not print is a function.
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            print(query_seq)
            print(query_tags)
            print("%s %i" % (q, len(q)))
            print(match_seq)
            print(match_tags)
            print("%s %i" % (m, len(m)))
            print(handle.name)
            raise

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags.
        # Alignment tags are applied second, so they win on a key clash.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        # Query
        # =====
        record = SeqRecord(
            Seq(q, alphabet),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        # Match
        # =====
        record = SeqRecord(
            Seq(m, alphabet),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Пример #18
0
def pairwise_sequence_alignment(a_seq, b_seq, engine, a_seq_id=None, b_seq_id=None,
                                gapopen=10, gapextend=0.5,
                                outfile=None, outdir=None, force_rerun=False):
    """Run a global pairwise sequence alignment between two sequence strings.

    Args:
        a_seq (str, Seq, SeqRecord, SeqProp): Reference sequence
        b_seq (str, Seq, SeqRecord, SeqProp): Sequence to be aligned to reference
        engine (str): `biopython` or `needle` (case-insensitive) - which pairwise alignment program to use
        a_seq_id (str): Reference sequence ID. If not set, is "a_seq"
        b_seq_id (str): Sequence to be aligned ID. If not set, is "b_seq"
        gapopen (int): Only for `needle` - Gap open penalty is the score taken away when a gap is created
        gapextend (float): Only for `needle` - Gap extension penalty is added to the standard gap penalty for each
            base or residue in the gap
        outfile (str): Only for `needle` - name of output file. If not set, is {id_a}_{id_b}_align.txt
        outdir (str): Only for `needle` - Path to output directory. Default is the current directory.
        force_rerun (bool): Only for `needle` - Default False, set to True if you want to rerun the alignment
            if outfile exists.

    Returns:
        MultipleSeqAlignment: Biopython object to represent an alignment. Its ``annotations`` hold
            ``score`` and ``percent_identity`` for both engines, plus ``start``/``end`` for `biopython` and
            ``percent_similarity``/``percent_gaps`` for `needle`.

    Raises:
        ValueError: If the engine is unknown, if the chosen engine produced no alignment, if the needle
            output file does not exist, or if the needle statistics do not describe exactly one alignment.

    """
    engine = engine.lower()

    if engine not in ('biopython', 'needle'):
        raise ValueError('{}: invalid engine'.format(engine))

    if not a_seq_id:
        a_seq_id = 'a_seq'
    if not b_seq_id:
        b_seq_id = 'b_seq'

    a_seq = ssbio.protein.sequence.utils.cast_to_str(a_seq)
    b_seq = ssbio.protein.sequence.utils.cast_to_str(b_seq)

    if engine == 'biopython':
        # TODO: allow different matrices? needle uses blosum62 by default, how to change that?
        # TODO: how to define gap open/extend when using matrix in biopython global alignment?
        log.warning('Gap penalties not implemented in Biopython yet')

        blosum62 = matlist.blosum62
        alignments = pairwise2.align.globaldx(a_seq, b_seq, blosum62)  # TODO: add gap penalties
        if not alignments:
            # Fail with a clear message instead of an IndexError on alignments[0]
            raise ValueError('Biopython returned no alignment for {} and {}'.format(a_seq_id, b_seq_id))

        # Each result is (aligned a, aligned b, score, start, end); only the first one is used
        aligned_a, aligned_b, score, start, end = best_alignment = alignments[0][:5]

        a = ssbio.protein.sequence.utils.cast_to_seq_record(aligned_a, id=a_seq_id)
        b = ssbio.protein.sequence.utils.cast_to_seq_record(aligned_b, id=b_seq_id)
        alignment = MultipleSeqAlignment([a, b], annotations={"score": score,
                                                              "start": start,
                                                              "end"  : end})
        alignment.annotations['percent_identity'] = get_percent_identity(aligned_a, aligned_b) * 100

        return alignment

    if engine == 'needle':
        alignment_file = run_needle_alignment(seq_a=a_seq, seq_b=b_seq, gapopen=gapopen, gapextend=gapextend,
                                              outdir=outdir, outfile=outfile, force_rerun=force_rerun)
        log.debug('Needle alignment at {}'.format(alignment_file))

        if not op.exists(alignment_file):
            raise ValueError('{}: needle alignment file does not exist'.format(alignment_file))

        # Use AlignIO to parse the needle alignment, alignments[0] is the first alignment (the only one in pairwise)
        alignments = list(AlignIO.parse(alignment_file, "emboss"))
        if not alignments:
            raise ValueError('{}: needle alignment file contains no alignment'.format(alignment_file))
        alignment = alignments[0]

        # Rename the sequence IDs
        alignment[0].id = a_seq_id
        alignment[1].id = b_seq_id

        # Add needle statistics as annotations in the alignment object
        stats = needle_statistics(alignment_file)
        alignment_ids = list(stats.keys())
        if len(alignment_ids) > 1:
            raise ValueError('Needle alignment file contains more than one pairwise alignment')
        if not alignment_ids:
            raise ValueError('{}: no statistics found in needle alignment file'.format(alignment_file))
        needle_stats = stats[alignment_ids[0]]
        for stat_name in ('percent_identity', 'percent_similarity', 'percent_gaps', 'score'):
            alignment.annotations[stat_name] = needle_stats[stat_name]

        return alignment
Пример #19
0
    def __next__(self):
        """Return the next alignment read from ``self.handle``.

        The first line must hold exactly two integers; the first one is the
        number of sequences.  That many name+sequence lines follow, and then
        any number of continuation blocks of sequence-only lines, separated
        by blank lines.  If a line that looks like another header is met, it
        is stored in ``self._header`` for the next call.

        Raises StopIteration at end of input and ValueError on a malformed
        header, a record count that disagrees with
        ``self.records_per_alignment``, a "." in sequence data, or a block
        that is cut short by the end of the file.
        """
        stream = self.handle

        # Use the header remembered by the previous call, if any; otherwise
        # read a fresh line.
        header = self._header
        if header is None:
            header = stream.readline()
        else:
            self._header = None

        if not header:
            raise StopIteration

        header = header.strip()
        fields = header.split()
        if len(fields) != 2:
            raise ValueError("First line should have two integers")
        try:
            # The second number is only validated, it is not used afterwards.
            number_of_seqs, _length_of_seqs = map(int, fields)
        except ValueError:
            raise ValueError("First line should have two integers") from None

        assert self._is_header(header)

        expected = self.records_per_alignment
        if expected is not None and expected != number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (number_of_seqs, expected))

        # First block: each line carries an identifier followed by sequence.
        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        ids = []
        seqs = []
        for _ in range(number_of_seqs):
            name, chunk = self._split_id(stream.readline().rstrip())
            ids.append(name)
            if "." in chunk:
                raise ValueError(_NO_DOTS)
            seqs.append([chunk])

        # Remaining blocks: sequence only, in the same row order.
        line = ""
        while True:
            # Move past blank lines until real content or end of file.
            while not line.strip():
                line = stream.readline()
                if not line:
                    break
            if not line:
                break  # end of file

            if self._is_header(line):
                # Start of a concatenated alignment; keep it for next time.
                self._header = line
                break

            for row, fragments in enumerate(seqs):
                chunk = line.strip().replace(" ", "")
                if "." in chunk:
                    raise ValueError(_NO_DOTS)
                fragments.append(chunk)
                line = stream.readline()
                if not line and row + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line:
                break  # end of file

        records = (SeqRecord(Seq("".join(fragments)),
                             id=name,
                             name=name,
                             description=name)
                   for name, fragments in zip(ids, seqs))
        return MultipleSeqAlignment(records)
Пример #20
0
    def __next__(self):
        """Return the next alignment read from a Stockholm-format handle.

        Reading starts at a "# STOCKHOLM 1.0" header (either remembered in
        ``self._header`` by the previous call or read from the handle) and
        continues until the next such header or the end of the file.
        Sequence lines are concatenated per identifier, with "." replaced by
        "-"; "#=GF", "#=GC", "#=GS" and "#=GR" lines are collected.

        ``self.ids``, ``self.sequences``, ``self.seq_annotation`` and
        ``self.seq_col_annotation`` are updated before records are built.
        The per-column ("#=GC") data ends up in
        ``alignment.column_annotations``; the "#=GR" data is stored as
        ``alignment._annotations``.  The "#=GF" data is parsed but not used
        further in this method.

        Raises StopIteration when there is no more input or no sequences were
        found, and ValueError for a missing header, an unsplittable sequence
        line, an unexpected record count, or inconsistent lengths.
        """
        stream = self.handle

        # Prefer the header remembered by the previous call, if any.
        header = self._header
        if header is None:
            header = stream.readline()
        else:
            self._header = None

        if not header:
            # Empty file - just give up.
            raise StopIteration
        if header.strip() != "# STOCKHOLM 1.0":
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}  # identifier -> concatenated sequence text
        ids = OrderedDict()  # used as an ordered set of identifiers
        gs = {}  # identifier -> feature -> list of free-text entries
        gr = {}  # identifier -> feature -> per-column markup string
        gf = {}  # feature -> list of free-text entries
        gc = {}  # feature -> per-column markup string
        passed_end_alignment = False

        # readline() returns "" at end of file, which ends the loop.
        for line in iter(stream.readline, ""):
            line = line.strip()

            if line == "# STOCKHOLM 1.0":
                # Header of the following alignment; keep it for next time.
                self._header = line
                break
            if line == "//":
                # End of the alignment; meta-data may still follow.
                passed_end_alignment = True
                continue
            if not line:
                continue  # blank line

            if not line.startswith("#"):
                # Sequence line, format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [piece.strip() for piece in line.split(" ", 1)]
                if len(parts) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError(
                        "Could not split line into identifier and sequence:\n"
                        + line)
                seq_id, seq = parts
                ids.setdefault(seq_id, True)
                seqs[seq_id] = seqs.get(seq_id, "") + seq.replace(".", "-")
                continue

            if len(line) < 5:
                continue  # too short to be a recognised markup line

            tag = line[:5]
            payload = line[5:].strip()
            if tag == "#=GF ":
                # Per-file annotation: "#=GF <feature> <free text>".
                # A feature may occur several times, so keep a list.
                feature, text = payload.split(None, 1)
                gf.setdefault(feature, []).append(text)
            elif tag == "#=GC ":
                # Per-column annotation: "#=GC <feature> <1 char per column>".
                # Blocks may be interleaved, so lengths are checked later.
                feature, text = payload.split(None, 2)
                gc[feature] = gc.get(feature, "") + text.strip()
            elif tag == "#=GS ":
                # Per-sequence annotation:
                # "#=GS <seqname> <feature> <free text>"
                seq_id, feature, text = payload.split(None, 2)
                gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
            elif tag == "#=GR ":
                # Per-sequence, per-column markup:
                # "#=GR <seqname> <feature> <1 char per column>"
                seq_id, feature, text = payload.split(None, 2)
                markup = gr.setdefault(seq_id, {})
                markup[feature] = markup.get(feature, "") + text.strip()

        assert len(seqs) <= len(ids)

        self.ids = ids.keys()
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if not (ids and seqs):
            raise StopIteration

        expected = self.records_per_alignment
        if expected is not None and expected != len(ids):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (len(ids), expected))

        # Every sequence must be as long as the first one stored.
        alignment_length = len(next(iter(seqs.values())))
        records = []
        for seq_id in ids:
            seq = seqs[seq_id]
            if len(seq) != alignment_length:
                raise ValueError(
                    "Sequences have different lengths, or repeated identifier"
                )
            name, start, end = self._identifier_split(seq_id)
            record = SeqRecord(
                Seq(seq, self.alphabet),
                id=seq_id,
                name=name,
                description=seq_id,
                annotations={"accession": name},
            )
            # Accession will be overridden by _populate_meta_data if an explicit
            # accession is provided:
            record.annotations["accession"] = name

            if start is not None:
                record.annotations["start"] = start
            if end is not None:
                record.annotations["end"] = end

            self._populate_meta_data(seq_id, record)
            records.append(record)

        for feature, markup in gc.items():
            if len(markup) != alignment_length:
                raise ValueError("%s length %i, expected %i" %
                                 (feature, len(markup), alignment_length))
        alignment = MultipleSeqAlignment(records, self.alphabet)

        for feature, markup in sorted(gc.items()):
            if feature in self.pfam_gc_mapping:
                column_key = self.pfam_gc_mapping[feature]
            elif (feature.endswith("_cons")
                  and feature[:-5] in self.pfam_gr_mapping):
                column_key = self.pfam_gr_mapping[feature[:-5]]
            else:
                # Unknown feature: keep it under a "GC:" prefixed key.
                column_key = "GC:" + feature
            alignment.column_annotations[column_key] = markup

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = gr

        return alignment
Пример #21
0
def concatenate(alignments):
    """
    Join several multiple sequence alignments end to end, matching rows by id.

    Every entry of ``alignments`` that ``identify_input`` classifies as
    ``FILENAME`` is first read as a FASTA alignment; any other entry is used
    as an alignment object as-is.  Records carrying the same id in different
    alignments are joined, alignment after alignment, into one sequence.

    When an id does not occur in one of the alignments, that stretch is
    filled with the string form of a :py:class:`Bio.Seq.UnknownSeq` of the
    alignment's length.

    :param alignments: the list of alignments objects (or filenames), i.e. list(:py:class:`Bio.Align.MultipleSeqAlignment`)
    :returns: a single :py:class:`Bio.Align.MultipleSeqAlignment`

    Example::

        >>> sequences = {'aln1': {'seq1': 'acgtca',
        ...                       'seq2': 'acgtt-',
        ...                       'seq3': 'ac-ta-'},
        ...              'aln2': {'seq2': 'ttg-cta',
        ...                       'seq3': 'tcgacta',
        ...                       'seq4': 'ttgacta'}}
        >>> alignments = [MultipleSeqAlignment([SeqRecord(Seq(sequence,
        ...                    alphabet=IUPAC.extended_dna), id=key)
        ...      for (key, sequence) in sequences[aln].items()])
        ...               for aln in ('aln1', 'aln2')]
        >>> con_alignment = concatenate(alignments)
        >>> con_alignment.sort()
        >>> print(con_alignment)
        ExtendedIUPACDNA() alignment with 4 rows and 13 columns
        acgtcaNNNNNNN seq1
        acgtt-ttg-cta seq2
        ac-ta-tcgacta seq3
        NNNNNNttgacta seq4

    :note:

       Limitations: any annotations in the sub-alignments are lost in
       the concatenated alignment.

    """

    # Resolve each entry to an alignment object: filenames are parsed as
    # FASTA, everything else is assumed to be an alignment already.
    alignments = [
        AlignIO.read(entry, "fasta")
        if identify_input(entry).name == 'FILENAME' else entry
        for entry in alignments
    ]

    # Union of the record ids found in any alignment.
    all_labels = set(rec.id for aln in alignments for rec in aln)

    # The alphabet of the first alignment is applied to everything.
    alphabet = alignments[0]._alphabet

    # label -> list of per-alignment sequence strings, in alignment order.
    pieces = defaultdict(list)
    for aln in alignments:
        width = aln.get_alignment_length()
        present = set(rec.id for rec in aln)

        # Ids this alignment lacks get a placeholder of the same width.
        for label in all_labels - present:
            pieces[label].append(str(UnknownSeq(width, alphabet=alphabet)))

        # Ids it does have contribute their actual sequence.
        for rec in aln:
            pieces[rec.id].append(str(rec.seq))

    # Glue the per-alignment strings together and wrap them back up in
    # Seq / SeqRecord / MultipleSeqAlignment objects.
    return MultipleSeqAlignment(
        SeqRecord(Seq(''.join(parts), alphabet=alphabet), id=label)
        for label, parts in pieces.items())
Пример #22
0
    def __next__(self):
        """Parse the next alignment from the handle.

        Lines are consumed until a line starting with ``=`` or the end of the
        file.  Lines starting with ``>`` are matched against
        ``XMFA_HEADER_REGEX_BIOPYTHON`` and, failing that,
        ``XMFA_HEADER_REGEX``; every other line is appended to the sequence
        of the most recently seen header.

        Side effects: ids not seen before are appended to ``self._ids``, and
        ``self.ids`` / ``self.sequences`` are overwritten.

        Returns:
            A ``MultipleSeqAlignment`` holding one record for each id in
            ``self._ids`` that had a header in this block.  Such an id with
            no sequence data gets a row made entirely of ``-``.

        Raises:
            StopIteration: the handle is exhausted, or no ids or sequences
                were collected.
            ValueError: a ``>`` line matches neither header pattern, sequence
                data appears before any header, or the rows differ in length.
        """
        handle = self.handle
        line = handle.readline()

        if not line:
            raise StopIteration

        # Strip out header comments
        while line and line.strip().startswith('#'):
            line = handle.readline()

        seqs = {}
        seq_regions = {}

        latest_id = None
        while line:  # an empty string from readline() means end of file
            line = line.strip()

            if line.startswith('='):
                # There may be more data, but we've reached the end of this
                # alignment
                break
            elif line.startswith('>'):
                m = (XMFA_HEADER_REGEX_BIOPYTHON.match(line)
                     or XMFA_HEADER_REGEX.match(line))
                if not m:
                    # Format the message; passing ``line`` as a second
                    # argument left a literal "%s" in the error text.
                    raise ValueError("Malformed header line: %s" % line)

                parsed_id = m.group('id')
                parsed_data = {}
                for key in ('start', 'end', 'id', 'strand', 'name',
                            'realname'):
                    try:
                        value = m.group(key)
                        if key == 'start':
                            value = int(value)
                            # Convert to zero based counting
                            if value > 0:
                                value -= 1

                        if key == 'end':
                            value = int(value)
                        parsed_data[key] = value
                    except IndexError:
                        # This will occur if we're asking for a group that
                        # doesn't exist. It's fine.
                        pass
                seq_regions[parsed_id] = parsed_data

                if parsed_id not in self._ids:
                    self._ids.append(parsed_id)

                seqs.setdefault(parsed_id, '')
                latest_id = parsed_id
            else:
                if latest_id is None:
                    raise ValueError("Saw sequence before definition line")
                seqs[latest_id] += line
            line = handle.readline()

        assert len(seqs) <= len(self._ids)

        self.ids = self._ids
        self.sequences = seqs

        if self._ids and seqs:
            alignment_length = max(len(s) for s in seqs.values())
            records = []
            for seq_id in self._ids:
                # Ids that are absent from this block, or present but empty,
                # are represented by gaps for the length check below.
                seq = seqs.get(seq_id) or '-' * alignment_length

                if alignment_length != len(seq):
                    raise ValueError(
                        "Sequences have different lengths, or repeated identifier"
                    )

                # Sometimes we don't see a particular sequence in the
                # alignment, so we skip that record since it isn't present in
                # that LCB/alignment
                if seq_id not in seq_regions:
                    continue
                region = seq_regions[seq_id]

                # Prefer the "realname" group when the header pattern had one.
                if 'realname' in region:
                    corrected_id = region['realname']
                else:
                    corrected_id = region['name']

                # A non-zero coordinate gets a "/start-end" suffix on the id,
                # unless the id already contains it.
                if region['start'] != 0 or region['end'] != 0:
                    suffix = '/{start}-{end}'.format(**region)
                    if suffix not in corrected_id:
                        corrected_id += suffix

                record = SeqRecord(Seq(seq, self.alphabet),
                                   id=corrected_id,
                                   name=seq_id)

                record.annotations["start"] = region['start']
                record.annotations["end"] = region['end']
                record.annotations[
                    "strand"] = 1 if region['strand'] == '+' else -1

                records.append(record)
            return MultipleSeqAlignment(records, self.alphabet)
        else:
            raise StopIteration
Пример #23
0
    def build_hsp():
        """Build a two-row alignment (query, match) for the current hit.

        Takes no arguments; everything it needs (``query_tags``,
        ``match_tags``, ``query_seq``, ``match_seq``, ``header_tags``,
        ``align_tags``, ``global_tags``, ``alphabet``, the ids and
        descriptions, ``handle``) is read from the surrounding scope, which
        is not visible in this snippet.

        Returns a ``MultipleSeqAlignment`` whose private ``_annotations``
        dict holds the header tags overlaid with the alignment tags.

        Raises ``ValueError`` when both tag dictionaries are empty, and
        ``AssertionError`` (after printing diagnostics) when the extracted
        query and match regions differ in length.
        """
        if not (query_tags or match_tags):
            raise ValueError("No data for query %r, match %r" %
                             (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        evalue = align_tags.get("fa_expect")
        # Placeholders so the diagnostics below can always print len().
        q = "?"
        m = "?"
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool == "TFASTX" and len(match_seq) == len(q):
                # Quick hack until I can work out how -, * and / characters
                # and the apparent mix of aa and bp coordinates works.
                m = match_seq
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError as err:
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            for seq_text, tags, region in ((query_seq, query_tags, q),
                                           (match_seq, match_tags, m)):
                print(seq_text)
                print(tags)
                print("%s %i" % (region, len(region)))
            print(handle.name)
            raise err

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # See also Bio/AlignIO/MafIO.py for same requirement.
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Record the query header tags first, then the alignment tags, so
        # the latter win on a key clash.
        for tag_source in (header_tags, align_tags):
            for key, value in tag_source.items():
                alignment._annotations[key] = value

        def add_record(letters, tags, rec_id, rec_name, rec_descr):
            """Create one row, append it, then refine its alphabet."""
            rec = SeqRecord(
                Seq(letters, alphabet),
                id=rec_id,
                name=rec_name,
                description=rec_descr,
                annotations={"original_length": int(tags["sq_len"])})
            # TODO - handle start/end coordinates properly. Short term hack for now:
            rec._al_start = int(tags["al_start"])
            rec._al_stop = int(tags["al_stop"])
            alignment.append(rec)

            # TODO - What if a specific alphabet has been requested?
            # TODO - Use an IUPAC alphabet?
            # TODO - Can FASTA output RNA?
            # This is still a very crude way of dealing with the alphabet:
            if alphabet == single_letter_alphabet and "sq_type" in tags:
                if tags["sq_type"] == "D":
                    rec.seq.alphabet = generic_dna
                elif tags["sq_type"] == "p":
                    rec.seq.alphabet = generic_protein
            if "-" in letters and not hasattr(rec.seq.alphabet, "gap_char"):
                rec.seq.alphabet = Gapped(rec.seq.alphabet, "-")

        # Query row first, then the match row.
        add_record(q, query_tags, query_id, "query", query_descr)
        add_record(m, match_tags, match_id, "match", match_descr)

        return alignment
Пример #24
0
    def __next__(self):
        """Parse the next alignment from the handle.

        Skips forward to a ``#=======================================`` line,
        reads the ``#`` header lines that follow (record count and ids,
        length, identity, similarity, gaps, score), then reads the sequence
        lines until the next ``#---...`` or ``#===...`` marker.  That marker
        line is stored in ``self._header`` for the following call.

        Returns:
            A ``MultipleSeqAlignment`` of the parsed records, annotated with
            the identity/similarity/gaps/score values found in the header.

        Raises:
            StopIteration: no further alignment header is found.
            ValueError: the header lacks the record count or length, the
                record count disagrees with ``self.records_per_alignment``,
                a sequence line is unrecognised, the running coordinates do
                not match the residues read, or a sequence has the wrong
                final length.
            AssertionError: an internal consistency check on the file fails.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration

        while line.rstrip() != "#=======================================":
            line = handle.readline()
            if not line:
                raise StopIteration

        length_of_seqs = None
        number_of_seqs = None
        ids = []
        header_dict = {}

        # startswith() instead of line[0]: at end of file readline() returns
        # "" and indexing it would raise IndexError rather than letting the
        # explicit checks below report the problem.
        while line.startswith("#"):
            # Read in the rest of this alignment header,
            # try and discover the number of records expected
            # and their length
            parts = line[1:].split(":", 1)
            key = parts[0].lower().strip()
            if key == "aligned_sequences":
                number_of_seqs = int(parts[1].strip())
                assert len(ids) == 0
                # Should now expect the record identifiers...
                for i in range(number_of_seqs):
                    line = handle.readline()
                    parts = line[1:].strip().split(":", 1)
                    assert i + 1 == int(parts[0].strip())
                    ids.append(parts[1].strip())
                assert len(ids) == number_of_seqs
            elif key == "length":
                length_of_seqs = int(parts[1].strip())
            # Parse the rest of the header
            elif key in ("identity", "similarity", "gaps"):
                # Only the count before the "/" is kept.
                header_dict[key] = int(parts[1].strip().split("/")[0])
            elif key == "score":
                header_dict["score"] = float(parts[1].strip())

            # And read in another line...
            line = handle.readline()

        if number_of_seqs is None:
            raise ValueError("Number of sequences missing!")
        if length_of_seqs is None:
            raise ValueError("Length of sequences missing!")

        if (self.records_per_alignment is not None
                and self.records_per_alignment != number_of_seqs):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (number_of_seqs, self.records_per_alignment))

        seqs = [""] * len(ids)
        seq_starts = []
        index = 0  # which record the next sequence line belongs to

        # Parse the seqs
        while line:
            if len(line) > 21:
                id_start = line[:21].strip().split(None, 1)
                seq_end = line[21:].strip().split(None, 1)
                if len(id_start) == 2 and len(seq_end) == 2:
                    # identifier, seq start position, seq, seq end position
                    # (an aligned seq is broken up into multiple lines)
                    seq_id, start = id_start
                    seq, end = seq_end
                    # Convert before comparing: the raw fields are strings,
                    # and comparing them is lexicographic ("9" >= "10").
                    start = int(start)
                    end = int(end)
                    if start >= end:
                        # Special case, either a single letter is present,
                        # or no letters at all.
                        if seq.replace("-", "") != "":
                            start -= 1
                    else:
                        assert seq.replace("-", "") != "", repr(line)
                        start -= 1  # python counting

                    if index < 0 or index >= number_of_seqs:
                        raise ValueError("Expected index %i in range [0,%i)" %
                                         (index, number_of_seqs))
                    # The identifier is truncated...
                    assert (seq_id == ids[index]
                            or seq_id == ids[index][:len(seq_id)])

                    if len(seq_starts) == index:
                        # Record the start
                        seq_starts.append(start)

                    # Check the start...
                    if start >= end:
                        assert seq.replace("-", "") == "", line
                    elif start - seq_starts[index] != len(seqs[index].replace(
                            "-", "")):
                        raise ValueError(
                            "Found %i chars so far for sequence %i (%s, %r), line says start %i:\n%s"
                            % (
                                len(seqs[index].replace("-", "")),
                                index,
                                seq_id,
                                seqs[index],
                                start,
                                line,
                            ))
                    seqs[index] += seq

                    # Check the end ...
                    if end != seq_starts[index] + len(seqs[index].replace(
                            "-", "")):
                        raise ValueError(
                            "Found %i chars so far for sequence %i (%s, %r, start=%i), file says end %i:\n%s"
                            % (
                                len(seqs[index].replace("-", "")),
                                index,
                                seq_id,
                                seqs[index],
                                seq_starts[index],
                                end,
                                line,
                            ))

                    # Sequence lines cycle through the records in order.
                    index += 1
                    if index >= number_of_seqs:
                        index = 0
                else:
                    # just a start value, this is just alignment annotation (?)
                    # print "Skipping: " + line.rstrip()
                    pass
            elif line.strip() == "":
                # Just a spacer?
                pass
            else:
                raise ValueError("Unrecognised EMBOSS pairwise line: %r\n" %
                                 line)

            line = handle.readline()
            if (line.rstrip() == "#---------------------------------------"
                    or line.rstrip()
                    == "#======================================="):
                # End of alignment
                self._header = line
                break

        # Every block of sequence lines must have covered all records.
        assert index == 0

        if (self.records_per_alignment is not None
                and self.records_per_alignment != len(ids)):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (len(ids), self.records_per_alignment))

        records = []
        for seq_id, seq in zip(ids, seqs):
            if len(seq) != length_of_seqs:
                # EMBOSS 2.9.0 is known to use spaces instead of minus signs
                # for leading gaps, and thus fails to parse.  This old version
                # is still used as of Dec 2008 behind the EBI SOAP webservice:
                # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
                raise ValueError("Error parsing alignment - sequences of "
                                 "different length? You could be using an "
                                 "old version of EMBOSS.")
            records.append(
                SeqRecord(Seq(seq, self.alphabet),
                          id=seq_id,
                          description=seq_id))
        return MultipleSeqAlignment(records,
                                    self.alphabet,
                                    annotations=header_dict)
def filter_vcf(args):
    """Filter VCF records on shared alleles; write a VCF or an alignment.

    For each record the genotype bases of every sample are reduced to a set
    of characters.  Records are dropped when any sample has no called bases
    or when no allele is common to all samples.  Surviving records are
    written unchanged when ``args.outtype`` is ``"vcf"``; otherwise one
    common and one variable allele per sample are appended to a growing
    per-sample sequence, which is finally de-duplicated with
    ``remove_duplicate_seqs`` and written with ``AlignIO.write``.

    Args:
        args: object with ``input`` (path or ``sys.stdin``), ``output``
            (path or ``sys.stdout``), ``outtype`` (``"vcf"`` or an
            alignment format name handed to ``AlignIO.write``) and
            ``variable`` (truthy to emit only the variable allele, and to
            drop sites whose variable alleles collapse to one value).
    """
    fd = args.input
    of = args.output
    ot = args.outtype
    vcf_reader = (vcf.Reader(fsock=fd) if fd == sys.stdin else vcf.Reader(
        filename=fd))
    if ot == "vcf":
        vcf_writer = (vcf.Writer(of, vcf_reader) if of == sys.stdout else
                      vcf.Writer(open(of, 'w'), vcf_reader))
    else:
        newalignment = None
        samplenames = None

    for record in vcf_reader:
        gts = []
        for sample in record.samples:
            bases = sample.gt_bases
            if bases is None:
                # One uncalled sample disqualifies the whole record.
                gts.append(None)
                break
            # Dropping the separator and using a set makes A/T equal to T/A.
            sepchar = '|' if sample.phased else '/'
            gts.append(set(bases.replace(sepchar, '')))
        # Also skip records without any sample (gts[-1] would not exist).
        if not gts or gts[-1] is None:
            continue

        # c is the common allele. this will be empty if there were 2
        # mutations (ie A/A -> T/T) so remove those samples.
        # The remaining sets must be unpacked: handing the list itself to
        # intersection() makes it try to hash each set and raise TypeError.
        c = gts[0].intersection(*gts[1:])
        if len(c) == 0:
            continue
        commonalleles = [c] * len(gts)
        # if sample is homozygous for the common allele, its gt-c set will be
        # empty, so its second allele is the same as its first
        variablealleles = [((gt - c) if (gt - c) else gt) for gt in gts]

        if args.variable:
            commonalleles = [set()] * len(gts)
            # Same unpacking requirement as for intersection() above.
            if len(variablealleles[0].union(*variablealleles[1:])) == 1:
                variablealleles = [set()] * len(gts)
            if commonalleles == variablealleles == [set()] * len(gts):
                continue

        combined = list()
        # NOTE(review): neither `skip` nor `baselist` is defined in this
        # function; unless they exist at module level this line raises
        # NameError. Left as found -- confirm the intended behaviour.
        if skip:
            combined = baselist
        else:
            #convert to list of chars first
            commonalleles = [list(c)[0] if c else '' for c in commonalleles]
            variablealleles = [
                list(v)[0] if v else '' for v in variablealleles
            ]
            combined = [c + v for c, v in zip(commonalleles, variablealleles)]

        if ot == "vcf":
            vcf_writer.write_record(record)
        else:
            if newalignment is None:
                newalignment = [''] * len(record.samples)
                samplenames = [x.sample for x in record.samples]
            newalignment = [
                grown + extra for grown, extra in zip(newalignment, combined)
            ]

    if ot == "vcf":
        vcf_writer.close()
    else:
        newseqobjs = [
            SeqRecord(Seq(letters, IUPAC.unambiguous_dna),
                      id=samplename,
                      description='')
            for samplename, letters in zip(samplenames, newalignment)
        ]
        newalnobj = MultipleSeqAlignment(newseqobjs)
        newalnobj = remove_duplicate_seqs(newalnobj)
        AlignIO.write(newalnobj, of, ot)
Пример #26
0
def project_CM(alnfile, contmat):
    """
    Project per-structure contact maps onto the columns of an alignment.

    Reads the Clustal alignment in ``alnfile``, drops records whose id
    contains "dssp" or "space", and for every remaining record marks each
    contact edge (taken from the module-level ``CONTMAT``) at the alignment
    columns of its two residues.

    Side effects: writes the per-column contact counts to
    ``../data/processed/consensus_cm.txt`` and the non-zero entries to
    ``../data/processed/consensus_cm_df.txt``.

    :param alnfile: path of a Clustal-format alignment file
    :param contmat: not used by the body, which reads the global ``CONTMAT``
    :returns: ``(CM, list_id)``: a ``(ncol, ncol, nrow)`` array of 0/1
        contact flags and the id prefixes of the rows, in alignment order
    """

    from Bio.Align import MultipleSeqAlignment

    # Use a context manager so the alignment file is closed after reading;
    # the handle used to be left open.
    with open(alnfile) as handle:
        alignment = AlignIO.read(handle, "clustal")

    # filter dssp entries from alignment file
    alignment_nr = MultipleSeqAlignment([])
    for record in alignment:
        if "dssp" not in record.id and "space" not in record.id:
            alignment_nr.append(record)
    alignment = alignment_nr

    nrow = len(alignment)
    ncol = alignment.get_alignment_length()

    list_id = []

    CM = np.zeros((ncol, ncol, nrow))

    for ix, record in enumerate(alignment):
        seq_aln = np.array(record.seq)
        ident = record.id.split('_')[0]
        list_id.append(ident)
        # The value is not used below. The lookup is kept because .item()
        # presumably raises unless exactly one row of the module-level
        # `structures` matches -- TODO confirm it is wanted as a check.
        current_substr = structures[structures['ORF'] == ident]['substrate'].item()

        # dict of aln position for seq position
        current_mapping = {}
        pos_ref = 1                                 # this is a FIX for python indexing vs. PDB
        for pos_aln in range(ncol):
            if seq_aln[pos_aln] != '-':
                current_mapping[pos_ref] = pos_aln
                pos_ref += 1

        # NOTE(review): the `contmat` argument is ignored in favour of the
        # global CONTMAT -- confirm which one is intended.
        current_cm = CONTMAT[ident]
        for n1, n2 in current_cm.edges():
            aln1 = current_mapping[n1]
            aln2 = current_mapping[n2]

            # Contacts are symmetric, so both orientations are set.
            CM[aln1, aln2, ix] = CM[aln2, aln1, ix] = 1

    # Number of sequences having a contact at each pair of columns.
    cons_cm = np.sum(CM, 2)
    np.savetxt("../data/processed/consensus_cm.txt", cons_cm, fmt='%i' )

    cm_df = pd.DataFrame(columns=['i', 'j', 'n'])
    counter = 0
    for i in range(ncol):
        for j in range(ncol):
            if cons_cm[i,j] > 0:
                cm_df.loc[counter] = [i, j, cons_cm[i,j]]
                counter += 1

    cm_df.to_csv("../data/processed/consensus_cm_df.txt", sep='\t', header=True, index=False)


    return CM, list_id
Пример #27
0
from Bio.Alphabet import generic_protein
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# Two protein sequences of equal length; '-' marks a gap.
seq1 = 'MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW'
seq2 = 'MH--IFIYQIGYALKSGYIQSIRSPEY-NW'

# Wrap each string in a SeqRecord carrying its identifier.
seq_rec_1, seq_rec_2 = (
    SeqRecord(Seq(letters, generic_protein), id=label)
    for letters, label in ((seq1, 'asp'), (seq2, 'unk'))
)

# Combine both rows into one alignment and show it.
align = MultipleSeqAlignment([seq_rec_1, seq_rec_2])
print(align)
Пример #28
0
    def next(self):
        """Parse the next alignment from the handle.

        The first line must hold exactly two integers, the first being the
        number of sequences.  The next ``number_of_seqs`` lines supply an
        identifier in their first ten characters followed by sequence data.
        Any later blocks hold only sequence data, one line per record.  A
        further header line is saved in ``self._header`` for the next call.

        Returns:
            A ``MultipleSeqAlignment`` built with ``self.alphabet``.

        Raises:
            StopIteration: there is no more data to read.
            ValueError: the first line is not two integers, the record count
                differs from ``self.records_per_alignment``, or the file
                ends in the middle of a block.
        """
        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()

        if not line:
            raise StopIteration
        line = line.strip()
        # split() without arguments already discards empty fields.  The
        # former filter(None, ...) wrapper yields an iterator on Python 3,
        # so the len() call below raised TypeError.
        parts = line.split()
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            int(parts[1])  # sequence length: validated, not used further
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(line)

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        #Expects STRICT truncation/padding to 10 characters
        #Does not require any white space between name and seq.
        for i in range(0, number_of_seqs):
            line = handle.readline().rstrip()
            ids.append(line[:10].strip())  #first ten characters
            seqs.append([line[10:].strip().replace(" ", "")])

        #Look for further blocks
        line = ""
        while True:
            #Skip any blank lines between blocks...
            while "" == line.strip():
                line = handle.readline()
                if not line: break  #end of file
            if not line: break  #end of file

            if self._is_header(line):
                #Looks like the start of a concatenated alignment
                self._header = line
                break

            #One line per record, in the same order as the first block.
            for i in range(0, number_of_seqs):
                seqs[i].append(line.strip().replace(" ", ""))
                line = handle.readline()
                if (not line) and i + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line: break  #end of file

        #Join each record's pieces; the identifier doubles as the name
        #and the description.
        records = (SeqRecord(Seq("".join(pieces), self.alphabet), \
                             id=name, name=name, description=name) \
                   for (name, pieces) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
Пример #29
0
            try:
                print(next(SeqIO.parse(h, t_format, given_alpha)))
                h.close()
                assert False, "Forcing wrong alphabet, %s, should fail (%s)" \
                    % (repr(given_alpha), t_filename)
            except ValueError:
                # Good - should fail
                pass
            h.close()
        del good, bad, given_alpha, base_alpha

        if t_alignment:
            print("Testing reading %s format file %s as an alignment" %
                  (t_format, t_filename))

            alignment = MultipleSeqAlignment(
                SeqIO.parse(handle=t_filename, format=t_format))
            assert len(alignment) == t_count

            alignment_len = alignment.get_alignment_length()

            # Check the record order agrees, and double check the
            # sequence lengths all agree too.
            for i in range(t_count):
                assert compare_record(records[i], alignment[i])
                assert len(records[i].seq) == alignment_len

            print(alignment_summary(alignment))

    # Some alignment file formats have magic characters which mean
    # use the letter in this position in the first sequence.
    # They should all have been converted by the parser, but if
Пример #30
0
def aln_maker(vcf,
              reference,
              contig,
              start=None,
              stop=None,
              vcf_modifier=None,
              add_ref=False,
              check=True,
              alphabet=None,
              gatkwc2ref=True,
              wosamples=(),
              indels=False):
    """Build a multiple sequence alignment of samples from a VCF.

    The region ``[start:stop]`` of ``contig`` is cut from the reference
    FASTA.  Each sample receives one mutable copy of it per ploidy, and the
    copies are handed to ``_infer_indels`` or ``_infer_snps`` together with
    the VCF.  Each copy then becomes a record named ``<sample>_<n>``.

    Args:
        vcf: a path string, which is wrapped with ``VI(vcf, vcf_modifier)``,
            or an object exposing ``samples`` and ``ploidies``.
        reference: path of the reference FASTA file.
        contig: id of the FASTA record to use.
        start: slice start within the contig (default 0).
        stop: slice stop within the contig (default: end of the contig).
        vcf_modifier: if truthy, assigned to ``vcf.modifier``.
        add_ref: also include the unmodified reference as "Reference".
            Note that this appends "Reference" to ``vcf.samples``.
        check: forwarded to the inference helper.
        alphabet: alphabet of the output sequences (default
            ``unambiguous_dna``).
        gatkwc2ref: forwarded to ``_infer_snps``.
        wosamples: sample names to leave out of the result.
        indels: use ``_infer_indels`` instead of ``_infer_snps``.

    Returns:
        A ``MultipleSeqAlignment`` with records sorted by id, each described
        as ``"<contig> - <start> - <stop>"``.

    Raises:
        AlnException: ``add_ref`` is set but a sample is already called
            "Reference", or ``contig`` is not in the reference file.
    """

    if isinstance(vcf, str):
        vcf = VI(vcf, vcf_modifier)

    if vcf_modifier:
        vcf.modifier = vcf_modifier

    with open(reference) as f:
        fdata = {res.id: res.seq for res in SeqIO.parse(f, 'fasta')}

    samples = vcf.samples
    if add_ref and "Reference" in samples:
        raise AlnException(
            "Cannot add reference : A sample has already this name")
    if add_ref: samples.append("Reference")

    if contig not in fdata:
        raise AlnException("Contig not found in the reference sequence : %s" %
                           (contig))

    # sequence
    start = start or 0
    sequence = str(fdata[contig][start:stop])
    data = {
        sample: [MutableSeq(sequence) for _ in range(ploidy)]
        for sample, ploidy in vcf.ploidies.items()
    }
    if add_ref: data["Reference"] = [sequence]

    if DEBUG: print(sequence)

    if indels: _infer_indels(data, vcf, sequence, contig, start, stop, check)
    else:
        _infer_snps(data, vcf, sequence, contig, start, stop, gatkwc2ref,
                    check)

    alphabet = alphabet or unambiguous_dna
    # One entry per haplotype; excluded samples are filtered out before
    # their haplotypes are enumerated.
    data = {
        "%s_%i" % (sample, idx): Seq(str(haplotype), alphabet)
        for sample, haplotypes in data.items()
        if sample not in wosamples
        for idx, haplotype in enumerate(haplotypes, start=1)
    }

    # `sequence` starts at `start`, so the end coordinate of an open-ended
    # slice is start + its length, not the length alone.
    stop = stop or start + len(sequence)
    desc = "%s - %i - %i" % (contig, start, stop)

    return MultipleSeqAlignment(
        SeqRecord(data[sample], id=sample, name=sample, description=desc)
        for sample in sorted(data))