def internal_homology_blast(record: secmet.Record) -> Dict[int, List[List[str]]]:
    """ Finds internal homologs within each cluster of a record by running
        BLAST with a cluster's proteins as both the queries and the targets.

        Every cluster gets an entry in the result, with its proteins arranged
        into groups of homologs (a protein without homologs forms a group of
        its own).

        Arguments:
            record: the Record to generate groups from

        Returns:
            a dictionary mapping cluster number to a list of distinct groups,
            each group being a list of query ids
    """
    # the same file name is reused (and overwritten) for every cluster
    fasta_name = "internal_input.fasta"
    groups_by_cluster = {}
    # all intermediate files are written while inside the temporary directory
    with TemporaryDirectory(change=True):
        logging.info("Finding internal homologs in each gene cluster...")
        for cluster in record.get_clusters():
            number = cluster.get_cluster_number()
            names, sequences = create_blast_inputs(cluster)
            fasta.write_fasta(names, sequences, fasta_name)
            raw_output = run_internal_blastsearch(fasta_name)
            # only the queries are needed, the second part of the result is discarded
            parsed_queries, _ = blastparse(raw_output, record,
                                           min_seq_coverage=25,
                                           min_perc_identity=30)
            groups_by_cluster[number] = find_internal_orthologous_groups(parsed_queries, names)
    return groups_by_cluster
def run_muscle_single(seq_name: str, seq: str, comparison_file: str) -> Dict[str, str]:
    """ Aligns one sequence against an existing set of comparison sequences
        using muscle's profile mode.

        Arguments:
            seq_name: the name of the query
            seq: the sequence to align
            comparison_file: the path of the file containing comparison sequences

        Returns:
            a dictionary mapping sequence name (query or reference) to alignment

        Raises:
            RuntimeError: if the muscle run is reported as unsuccessful
    """
    with NamedTemporaryFile(mode="w+") as query_file, \
            NamedTemporaryFile(mode="w+") as aligned_file:
        # muscle works on paths, so the query goes to disk first
        write_fasta([seq_name], [seq], query_file.name)

        command = [
            get_config().executables.muscle,
            "-profile",
            "-quiet",
            "-in1", comparison_file,
            "-in2", query_file.name,
            "-out", aligned_file.name,
        ]
        outcome = execute(command)
        if not outcome.successful():
            message = "muscle returned %d: %r while comparing query named %s" % (
                outcome.return_code, outcome.stderr.replace("\n", ""), seq_name)
            raise RuntimeError(message)

        # the output must be read before the temporary files are cleaned up
        return read_fasta(aligned_file.name)
def smcog_tree_analysis(cds: CDSFeature, input_number: int, smcog: str, output_dir: str) -> None:
    """ Runs the tree-building steps for a single CDS feature: writes its
        translation as a query fasta, aligns it via alignsmcogs() for the given
        smCOG, trims the resulting alignment and finally draws the tree.

        Arguments:
            cds: the CDS feature to use as the query
            input_number: the number used in the names of the intermediate
                          files (e.g. "input<number>.fasta")
            smcog: the smCOG identifier passed on to alignsmcogs()
            output_dir: the directory passed on to draw_tree()

        Returns:
            None
    """
    name = cds.get_name()
    translation = cds.translation

    # a single-sequence fasta file serves as the query for the alignment
    query_filename = "input{}.fasta".format(input_number)
    fasta.write_fasta([name], [translation], query_filename)

    alignment = alignsmcogs(smcog, input_number)
    # cut the alignment down before building the tree from it
    trim_alignment(input_number, alignment)
    draw_tree(input_number, output_dir, name)
def trim_alignment(input_number: int, alignment_file: str) -> None:
    """ Trims an alignment so that it starts at the first and ends at the last
        column in which the most common character is a non-gap shared by at
        least a third of all sequences. The trimmed sequences are written to
        "trimmed_alignment<input_number>.fasta".

        Arguments:
            input_number: the number to include in the output file name
            alignment_file: the path of the fasta file containing the alignment

        Returns:
            None
    """

    def find_first_aa_position(columns: List[Dict[str, int]], sequence_count: int) -> int:
        """ Returns the index of the first column whose most frequent character
            is not a gap and occurs in at least a third of the sequences,
            or 0 if no column qualifies
        """
        threshold = sequence_count / 3
        for index, column in enumerate(columns):
            # most frequent character, with ties broken by the character itself
            residue, occurrences = max(column.items(), key=lambda pair: (pair[1], pair[0]))
            # columns dominated by gaps never qualify
            if residue != "-" and occurrences >= threshold:
                return index
        return 0  # can't be earlier than the start

    alignment = fasta.read_fasta(alignment_file)
    sequences = list(alignment.values())

    # every sequence in an alignment has to be the same length as the first
    alignment_length = len(sequences[0])
    for identifier, sequence in alignment.items():
        assert alignment_length == len(sequence), "%s has different sequence length" % identifier

    # parentheses break newick tree parsing, so they are replaced,
    # and only the last two fields (id and description) are kept
    trimmed_names = []
    for identifier in alignment:
        cleaned = identifier.replace("(", "_").replace(")", "_")
        trimmed_names.append("|".join(cleaned.rsplit('|', 2)[-2:]))

    # count how often each character occurs in each column
    column_counts = [defaultdict(int) for _ in range(alignment_length)]  # type: List[Dict[str, int]]
    for sequence in sequences:
        for index, residue in enumerate(sequence):
            column_counts[index][residue] += 1

    total = len(sequences)
    start = find_first_aa_position(column_counts, total)
    # searching the reversed columns gives the distance from the end
    end = alignment_length - find_first_aa_position(column_counts[::-1], total)

    # keep only the region between the detected conserved columns
    trimmed_seqs = [sequence[start:end] for sequence in sequences]

    output_name = "trimmed_alignment{}.fasta".format(input_number)
    fasta.write_fasta(trimmed_names, trimmed_seqs, output_name)
def write_fastas_with_all_genes(regions: Iterable[secmet.Region], filename: str,
                                partitions: int = 1) -> List[str]:
    """ Write fasta files containing all genes in all regions in a
        blast friendly form.

        If partitioning the data into multiple files, the index of the partition
        will be included in the filename before the extension, e.g.
        input.fasta -> input0.fasta, input1.fasta, ...

        Arguments:
            regions: an iterable of regions to find genes in
            filename: the filename to use for the file
            partitions: the number of files to create (approx. equally sized)

        Returns:
            a list containing filenames of the written files

        Raises:
            TypeError: if partitions is not an int
            ValueError: if partitions is less than 1 or if the regions
                        provide no sequences at all
    """
    if not isinstance(partitions, int):
        raise TypeError("Partitions must be an int greater than 0")
    if partitions < 1:
        raise ValueError("Partitions must be greater than 0")

    all_names, all_seqs = [], []
    for region in regions:
        names, seqs = create_blast_inputs(region)
        all_names.extend(names)
        all_seqs.extend(seqs)
    if not (all_names and all_seqs):
        raise ValueError("Diamond search space contains no sequences")

    if partitions == 1:
        fasta.write_fasta(all_names, all_seqs, filename)
        return [filename]

    # The name is built from its parts for each chunk instead of using the
    # filename itself as a format template, since any literal '%' in the
    # filename would then be interpreted as a format directive
    root, extension = os.path.splitext(filename)
    # NOTE: with more partitions than sequences the size is 0, so every chunk
    # except the last is written empty
    size = len(all_names) // partitions
    chunk_filenames = []
    for i in range(partitions):
        start = i * size
        # the final chunk also takes whatever is left over from the division
        end = None if i == partitions - 1 else start + size
        chunk_filename = "%s%d%s" % (root, i, extension)
        fasta.write_fasta(all_names[start:end], all_seqs[start:end], chunk_filename)
        chunk_filenames.append(chunk_filename)
    return chunk_filenames