Пример #1
0
    # Command-line options controlling temporary-file cleanup, chunking and
    # alignment threading.
    parser.add_argument("--keep-temporary-files",
                        action='store_true',
                        help="don't clean up")
    # The sequences are processed in chunks that are each aligned to the
    # reference; this option sets the chunk size, so the help text must say so
    # rather than repeat the --keep-temporary-files message.
    parser.add_argument("--chunk-size",
                        default=10,
                        type=int,
                        help="size of the sequence chunks that are aligned together")
    parser.add_argument("--nthreads",
                        default=1,
                        type=int,
                        help="Number of threads to use in alignment")
    args = parser.parse_args()

    # The reference annotation is taken from the command line
    # (formerly the hard-coded path config/reference.gb).
    refname = args.gbk
    # The same GenBank file is read twice: once for its features, once (below)
    # as a single sequence record.
    features = load_features(refname)
    # Input sequences to classify, parsed from FASTA.
    seqs = SeqIO.parse(args.sequences, 'fasta')
    ref = SeqIO.read(refname, 'genbank')
    # Clade definitions are also taken from the command line
    # (formerly the hard-coded path config/clades.tsv).
    clade_designations = read_in_clade_definitions(args.clade)

    # Fixed, relative names for the log and the temporary alignment input/output.
    # NOTE(review): because the names are constant, two runs started from the
    # same working directory would presumably share these files -- confirm.
    log_fname = "clade_assignment.log"
    in_fname = "clade_assignment_tmp.fasta"
    out_fname = "clade_assignment_tmp_alignment.fasta"

    # Tab-separated result table; the header row is written immediately.
    # NOTE(review): the handle is opened without a context manager -- verify
    # that it is closed after the chunk loop.
    output = open(args.output, 'w')
    print('name\tclade\tparent clades', file=output)

    # Break the sequences into chunks, align each chunk to the reference, and
    # assign clades one-by-one. `done` is the loop flag for that chunk loop.
    done = False
    while not done:
Пример #2
0
    # Command-line interface: all options are mandatory.
    parser = argparse.ArgumentParser(
        description="Add translations",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument('--tree', type=str, required=True, help="input tree")
    parser.add_argument('--reference', type=str, required=True, help="reference genbank sequence")
    parser.add_argument('--translations', type=str,  nargs='+', required=True, help="amino acid alignment")
    # Genes are paired positionally with --translations further down, so the
    # help text states the ordering requirement instead of repeating the
    # --translations description.
    parser.add_argument('--genes', type=str, nargs='+', required=True, help="gene names, in the same order as --translations")
    parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
    args = parser.parse_args()

    # Normalise both options to lists so a bare single value is handled the
    # same way as a multi-value argument.
    genes = args.genes if isinstance(args.genes, list) else [args.genes]
    translations = args.translations if isinstance(args.translations, list) else [args.translations]
    ref = SeqIO.read(args.reference, format='genbank')
    features = load_features(args.reference)

    # Warn (without aborting) when the supplied gene names and the annotated
    # features differ. The comparison uses the normalised `genes` list so a
    # bare string is never split into characters.
    annotated = set(features.keys())
    supplied = set(genes)
    if annotated != supplied:
        print("WARNING: supplied genes don't match the annotation")
        print("the following features are in the annotation but not supplied as genes:", annotated - supplied)
        print("the following features were supplied as genes but are not in the annotation:", supplied - annotated)

    T = Phylo.read(args.tree, 'newick')
    # Names of the tree tips, used to filter the translated sequences.
    leafs = {n.name for n in T.get_terminals()}

    # Accumulators filled by the per-gene loop that follows.
    node_data = {}
    root_sequence_translations = {}
    for gene, translation in zip(genes, translations):
        seqs = []
        for s in SeqIO.parse(translation, 'fasta'):
            if s.id in leafs:
Пример #3
0
    # Keep only sequences that have metadata, belong to the requested region
    # and fall in the half-open interval [time_interval[0], time_interval[1]).
    sequences = []
    for seq in SeqIO.parse(args.sequences, 'fasta'):
        if seq.name not in metadata:
            continue
        record = metadata[seq.name]
        if record["num_date"] >= time_interval[0] and \
           record["num_date"] < time_interval[1] and \
           record["region"] == region:
            sequences.append(seq)

    # Random 20-letter suffix for the temporary file names below. The alphabet
    # literal is the full A-Z range, with no duplicated or missing letters.
    tmp_str = "".join(sample('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 20))
    # exist_ok avoids the check-then-create race when several runs start at
    # the same time and all find the directory missing.
    os.makedirs('tmp', exist_ok=True)

    print("selected %d for region %s and date interval %f-%f" %
          (len(sequences), region, time_interval[0], time_interval[1]))
    features_to_translate = load_features(args.reference_sequence, args.genes)
    tmp_file = "tmp/sequence_file_%s_%s.fasta" % (region, tmp_str)
    tmp_file_out = "tmp/sequence_file_%s_%s_aln.fasta" % (region, tmp_str)
    SeqIO.write(sequences, tmp_file, 'fasta')
    # Align the selected sequences against the reference; the aligner is
    # driven through an argparse-like namespace built by pseudo_args.
    fail = align.run(
        pseudo_args(sequences=tmp_file,
                    reference_sequence=args.reference_sequence,
                    output=tmp_file_out,
                    reference_name=None,
                    remove_reference=True,
                    method='mafft',
                    nthreads=1,
                    fill_gaps=False))
    # A truthy return value signals failure and is used as the exit status.
    if fail:
        sys.exit(fail)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Translate nucleotide sequences to amino acid sequences for the requested genes",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument('--sequences', required=True, help='FASTA file of nucleotide sequences to translate')
    parser.add_argument('--reference-sequence', required=True, help='GenBank or GFF file containing the annotation')
    # --genes and --output are zipped together by the translation loop, so
    # omitting either one would fail with a TypeError on None. Requiring them
    # makes argparse report the problem with a usage message instead.
    parser.add_argument('--genes', nargs='+', required=True, help="genes to translate (list or file containing list)")
    parser.add_argument('--output', nargs='+', required=True, help="FASTA files of amino acid sequences per gene")

    args = parser.parse_args()

    # Load features for requested genes.
    features = load_features(args.reference_sequence, args.genes)

    # Load sequences indexed by sequence id. Any sequence containing an "N"
    # is dropped entirely.
    sequences = {
        sequence.id: str(sequence.seq)
        for sequence in Bio.SeqIO.parse(args.sequences, "fasta")
        if "N" not in str(sequence.seq)
    }

    # Translate requested genes.
    # NOTE(review): genes and output files are paired positionally with zip,
    # which silently ignores surplus entries when the two lists differ in
    # length -- confirm whether that should be an error.
    translations = {}
    invalid_samples = set()
    for feature_name, output_file in zip(args.genes, args.output):
        translations[feature_name] = translate_feature(sequences, features[feature_name])
        records = [