def write_clusters(clusters, outfile):
    """Save the composition of each cluster to a text file.

    Every cluster is written as a header line with the representative
    identifier prefixed by '>', followed by one line per clustered
    sequence with the tuple fields separated by ', '.

    Parameters
    ----------
    clusters : dict
        Dictionary with the identifiers of the cluster representatives
        as keys and a list of tuples as values. Each tuple describes a
        sequence that was added to the cluster (sequence identifier,
        decimal proportion of shared distinct kmers/minimizers and
        length of the clustered sequence).
    outfile : str
        Path to the file that will be created to store the clusters.
    """
    blocks = [
        ['>{0}'.format(representative)]
        + [', '.join('{}'.format(field) for field in member)
           for member in members]
        for representative, members in clusters.items()
    ]

    # blocks with more lines (larger clusters) are requested first
    blocks = im.sort_data(blocks, sort_key=len, reverse=True)
    lines = im.flatten_list(blocks)
    fo.write_to_file(im.join_list(lines, '\n'), outfile, 'w', '\n')
def determine_distinct(sequences_file, unique_fasta):
    """Find the distinct sequences in a FASTA file.

    Sequences are compared through the hash of their uppercase string
    representation. The first record observed for each distinct
    sequence is appended to the output FASTA file; later records with
    the same hash are only counted.

    Parameters
    ----------
    sequences_file : str
        Path to the input FASTA file.
    unique_fasta : str
        Path to the FASTA file to which distinct sequences are appended.

    Returns
    -------
    List with following elements:
        total : int
            Number of records whose sequence had already been seen.
        unique_seqids : list
            One sequence identifier per distinct sequence (the first
            identifier observed for that sequence).
    """
    # number of records buffered in memory before writing to disk
    out_limit = 10000
    repeated = 0
    first_seen = {}
    buffered = []

    def flush(records):
        # append the buffered records to the output FASTA file
        text = im.join_list(records, '\n')
        fo.write_to_file(text, unique_fasta, 'a', '\n')

    for record in SeqIO.parse(sequences_file, 'fasta'):
        # seq object has to be converted to string
        sequence = str(record.seq.upper())
        seqid = record.id
        seq_hash = im.hash_sequence(sequence)

        if seq_hash in first_seen:
            repeated += 1
            continue

        # only the hash is kept for sequences that were already written
        first_seen[seq_hash] = seqid
        buffered.append(fao.fasta_str_record(seqid, sequence))
        if len(buffered) == out_limit:
            flush(buffered)
            buffered = []

    # write whatever is left once the input is exhausted
    if len(buffered) > 0:
        flush(buffered)

    return [repeated, list(first_seen.values())]
def blast_inputs(clusters, output_directory, ids_dict):
    """Create one file per cluster listing the identifiers it contains.

    The `ids_dict` mapping is inverted and every identifier found in
    `clusters` is translated through the inverted mapping before being
    written, so the identifiers used in `clusters` must be values of
    `ids_dict`.

    Parameters
    ----------
    clusters : dict
        Dictionary with the identifiers of cluster representatives as
        keys and a list of tuples as values (the first element of each
        tuple is the identifier of a sequence in the cluster).
    output_directory : str
        Path to the directory where the identifier files are created.
        Each file is named '<translated representative id>_ids.txt'.
    ids_dict : dict
        Dictionary used to translate identifiers. Its values are the
        identifiers used in `clusters` and its keys are the identifiers
        that are written to the files.

    Returns
    -------
    ids_to_blast : list
        List with one tuple per cluster. Each tuple contains the
        translated identifier of the representative and the number of
        identifiers written for that cluster (representative included).
    """
    inverted = dict(zip(ids_dict.values(), ids_dict.keys()))

    ids_to_blast = []
    for representative, members in clusters.items():
        rep_id = inverted[representative]
        ids_path = os.path.join(output_directory,
                                '{0}_ids.txt'.format(rep_id))

        # representative goes first, followed by the clustered sequences
        member_ids = [rep_id]
        member_ids.extend(inverted[member[0]] for member in members)

        fo.write_to_file(im.join_list(member_ids, '\n'), ids_path, 'w', '')
        ids_to_blast.append((rep_id, len(member_ids)))

    return ids_to_blast
def get_sequences_by_id(sequences, seqids, out_file, limit=5000):
    """Retrieve sequences by identifier and append them to a FASTA file.

    Sequences are fetched one at a time and flushed to disk whenever
    `limit` records have been buffered, so that at most `limit` records
    are held in memory at once.

    Parameters
    ----------
    sequences : dict or Bio.File._IndexedSeqFileDict
        Dictionary with seqids as keys and sequences as values or a
        Fasta file index created with BioPython.
    seqids : list
        List with the identifiers of the sequences that should be
        retrieved.
    out_file : str
        Path to the FASTA file to which selected sequences are appended.
    limit : int
        Maximum number of sequences that will be kept in memory at a
        time (to avoid keeping huge datasets in memory).

    Returns
    -------
    None. Appends the records for the identifiers in the input list to
    `out_file`. Nothing is written if `seqids` is empty.

    Raises
    ------
    KeyError
        If an identifier is not found in `sequences`. Records buffered
        or written before the missing identifier was reached are not
        rolled back.
    """
    # plain dictionaries (and subclasses) store the sequence string
    # directly; any other mapping is treated as an index of records
    # that expose the sequence through the `seq` attribute
    stores_strings = isinstance(sequences, dict)

    def flush(buffered):
        # append the buffered records to the output file
        lines = im.join_list(buffered, '\n')
        fo.write_to_file(lines, out_file, 'a', '\n')

    records = []
    for seqid in seqids:
        if stores_strings:
            sequence = sequences[seqid]
        else:
            sequence = str(sequences[seqid].seq)

        records.append(fasta_str_record(seqid, sequence))
        if len(records) >= limit:
            flush(records)
            records = []

    # flush the remainder based on the buffer state instead of comparing
    # against the last identifier, which breaks with repeated identifiers
    if records:
        flush(records)
def write_protein_table(output_file, genome_id, cds_info):
    """Append the coding sequences of a genome to a tab-delimited file.

    Parameters
    ----------
    output_file : str
        Path to the output file to which the lines are appended.
    genome_id : str
        Identifier of the genome, used as the first field of every
        line that is written.
    cds_info : list
        List with one sublist per coding sequence identified in the
        genome (contig identifier, CDS start position, CDS stop
        position, CDS identifier and CDS coding strand).
    """
    rows = []
    for protein_info in cds_info:
        # genome identifier is prepended to the fields of each CDS
        fields = [genome_id] + protein_info
        rows.append(im.join_list(fields, '\t'))

    fo.write_to_file(im.join_list(rows, '\n'), output_file, 'a', '\n')
def translate_coding_sequences(seqids, sequences_file, translation_table,
                               minimum_length, dna_file, protein_file):
    """Translate CDSs into protein sequences.

    DNA sequences are retrieved from an index of `sequences_file`,
    translated and appended in batches to the DNA and protein output
    files. Sequences that cannot be retrieved or translated are
    reported in the returned list instead of being written.

    Parameters
    ----------
    seqids : list
        List with the sequence identifiers of the sequences to be
        translated.
    sequences_file : str
        Path to the FASTA file that contains the DNA sequences.
    translation_table : int
        Translation table identifier.
    minimum_length : int
        The minimum sequence length value.
    dna_file : str
        Path to a file to save DNA sequences.
    protein_file : str
        Path to a file to save protein sequences.

    Returns
    -------
    A list with following elements:
        invalid_alleles : list
            List with one sublist per invalid allele. Each sublist
            contains a sequence identifier and the message that
            explains why the sequence could not be retrieved or
            translated.
        total_seqs : int
            Total number of DNA sequences that were translated.
    """
    # maximum number of records kept in memory before writing to disk
    line_limit = 5000
    dna_lines = []
    prot_lines = []
    total_seqs = 0
    invalid_alleles = []

    cds_index = SeqIO.index(sequences_file, 'fasta')
    try:
        last_position = len(seqids) - 1
        for i, seqid in enumerate(seqids):
            sequence = None
            try:
                sequence = str(cds_index[seqid].seq)
            except KeyError:
                error = ('Could not find sequence identifier in '
                         '{0}'.format(sequences_file))
            except Exception as e:
                error = str(e)

            if sequence is None:
                # never fall through with an unbound or stale sequence:
                # report the failure and skip translation for this seqid
                print('{0}: {1}'.format(seqid, error))
                invalid_alleles.append([seqid, error])
            else:
                translation = sm.translate_dna(sequence, translation_table,
                                               minimum_length)
                if isinstance(translation, list):
                    dna_lines.append('>{0}'.format(seqid))
                    dna_lines.append(translation[0][1])
                    prot_lines.append('>{0}'.format(seqid))
                    prot_lines.append(str(translation[0][0]))
                    total_seqs += 1
                # if returned value is a string, translation failed and
                # string contains exceptions
                elif isinstance(translation, str):
                    invalid_alleles.append([seqid, translation])

            # each record takes two lines (header and sequence); the last
            # iteration always writes, as before, so that both output
            # files are written to at least once when seqids is not empty
            if len(dna_lines) // 2 == line_limit or i == last_position:
                fo.write_to_file(im.join_list(dna_lines, '\n'),
                                 dna_file, 'a', '\n')
                dna_lines = []
                fo.write_to_file(im.join_list(prot_lines, '\n'),
                                 protein_file, 'a', '\n')
                prot_lines = []
    finally:
        # release the file handle held by the index
        cds_index.close()

    return [invalid_alleles, total_seqs]
def adapt_loci(genes, schema_path, schema_short_path, bsr, min_len,
               table_id, size_threshold, blastp_path, makeblastdb_path):
    """Adapt a set of loci from an external schema.

    Adapts a set of genes/loci from an external schema so that that
    schema can be used with chewBBACA. Removes invalid alleles and
    selects representative alleles to include in the "short" directory.

    Parameters
    ----------
    genes : list
        List with paths to the files to be processed (one file per
        locus).
    schema_path : str
        Path to the schema directory. The adapted locus files and a
        temporary working directory per locus are created inside it.
    schema_short_path : str
        Path to the "short" directory, where the files with the
        representative alleles are written.
    bsr : float
        BLAST Score Ratio value. `bsr` and `bsr+0.1` are the two
        thresholds passed to `bsr_categorizer`.
    min_len : int
        Minimum sequence length value.
    table_id : int
        Genetic code.
    size_threshold : float
        Sequence size variation threshold.
    blastp_path : str
        Value passed to `bw.run_blast` (presumably the path to the
        BLASTp executable; confirm against the caller).
    makeblastdb_path : str
        Value passed to `bw.make_blast_db` (presumably the path to the
        makeblastdb executable; confirm against the caller).

    Returns
    -------
    A list with the following elements:
        invalid_alleles : list
            Invalid alleles reported by `sm.get_seqs_dicts` for all
            processed loci.
        invalid_genes : list
            List with the identifiers of the genes that had no valid
            alleles.
        summary_stats : list of list
            List with one sublist per processed locus. Each sublist
            has four elements (all stored as strings):
            - The identifier of the locus.
            - The number of alleles in the external file.
            - The number of alleles that were a valid CDS (computed as
              the length of the list returned by `fao.fasta_lines`).
            - The number of representatives determined by the process
              (also computed from the length of `fao.fasta_lines`
              output).

    Raises
    ------
    ValueError
        If `bw.run_blast` returns a non-empty stderr.

    The function writes the schema files.
    """
    # accumulators shared by all loci
    summary_stats = []
    invalid_genes = []
    invalid_alleles = []
    for gene in genes:
        representatives = []
        final_representatives = []
        # get gene basename and identifier
        # (identifier is the text before the first '.f' in the basename)
        gene_basename = os.path.basename(gene)
        gene_id = gene_basename.split('.f')[0]
        # create paths to gene files in new schema
        gene_file = fo.join_paths(schema_path,
                                  ['{0}{1}'.format(gene_id, '.fasta')])
        gene_short_file = fo.join_paths(schema_short_path,
                                        ['{0}{1}'.format(gene_id, '_short.fasta')])
        # create path to temp working directory for current gene
        gene_temp_dir = fo.join_paths(schema_path,
                                      ['{0}{1}'.format(gene_id, '_temp')])
        # create temp directory for the current gene
        fo.create_directory(gene_temp_dir)
        # dictionaries mapping gene identifiers to DNA sequences
        # and Protein sequences
        gene_seqs, prot_seqs, gene_invalid, seqids_map, total_sequences = \
            sm.get_seqs_dicts(gene, gene_id, table_id, min_len, size_threshold)
        invalid_alleles.extend(gene_invalid)
        # if locus has no valid CDS sequences,
        # continue to next locus
        if len(prot_seqs) == 0:
            shutil.rmtree(gene_temp_dir)
            invalid_genes.append(gene_id)
            summary_stats.append([gene_id, str(total_sequences), '0', '0'])
            continue
        if len(gene_seqs) > 1:
            # identify DNA sequences that code for same protein
            equal_prots = sm.determine_duplicated_seqs(prot_seqs)
            # get only one identifier per protein
            ids_to_blast = [protids[0] for protein, protids in equal_prots.items()]
            # get longest sequence as first representative
            longest = sm.determine_longest(ids_to_blast, prot_seqs)
            representatives.append(longest)
            final_representatives.append(longest)
            # create FASTA file with distinct protein sequences
            protein_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_protein.fasta'.format(gene_id)])
            protein_lines = fao.fasta_lines(ids_to_blast, prot_seqs)
            fo.write_list(protein_lines, protein_file)
            # create blastdb with all distinct proteins
            blastp_db = os.path.join(gene_temp_dir, gene_id)
            bw.make_blast_db(makeblastdb_path, protein_file, blastp_db, 'prot')
            # determine appropriate blastp task (proteins < 30aa need blastp-short)
            blastp_task = bw.determine_blast_task(equal_prots)
            # cycles to BLAST representatives against non-representatives until
            # all non-representatives have a representative
            while len(set(ids_to_blast) - set(representatives)) != 0:
                # create FASTA file with representative sequences
                rep_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_rep_protein.fasta'.format(gene_id)])
                rep_protein_lines = fao.fasta_lines(representatives, prot_seqs)
                fo.write_list(rep_protein_lines, rep_file)
                # create file with seqids to BLAST against
                ids_str = im.concatenate_list([str(i) for i in ids_to_blast], '\n')
                ids_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_ids.txt'.format(gene_id)])
                fo.write_to_file(ids_str, ids_file, 'w', '')
                # BLAST representatives against non-represented
                blast_output = fo.join_paths(gene_temp_dir,
                                             ['{0}_blast_out.tsv'.format(gene_id)])
                # set max_target_seqs to huge number because BLAST only
                # returns 500 hits by default
                blast_stderr = bw.run_blast(blastp_path, blastp_db, rep_file,
                                            blast_output, 1, 1, ids_file,
                                            blastp_task, 100000,
                                            ignore=ct.IGNORE_RAISED)
                # any stderr content returned by run_blast aborts the process
                if len(blast_stderr) > 0:
                    raise ValueError(blast_stderr)
                # import BLAST results
                blast_results = fo.read_tabular(blast_output)
                # get self-score for representatives
                # (rows where the first and second fields are equal)
                rep_self_scores = {res[1]: res[2] for res in blast_results
                                   if res[0] == res[1]}
                # divide results into high, low and hot BSR values
                hitting_high, hitting_low, hotspots, high_reps, low_reps, hot_reps = \
                    bsr_categorizer(blast_results, representatives,
                                    rep_self_scores, bsr, bsr+0.1)
                excluded_reps = []
                # remove high BSR hits that have representative
                hitting_high = set(hitting_high)
                ids_to_blast = [i for i in ids_to_blast if i not in hitting_high]
                # remove representatives that led to high BSR with subjects that were removed
                prunned_high_reps = {k: [r for r in v if r in ids_to_blast]
                                     for k, v in high_reps.items()}
                reps_to_remove = [k for k, v in prunned_high_reps.items()
                                  if len(v) == 0]
                excluded_reps.extend(reps_to_remove)
                # determine smallest set of representatives that allow to get all cycle candidates
                # (a representative is excluded when none of its hits is
                # still in the set of hits left to cover)
                excluded = []
                hotspot_reps = set(im.flatten_list(list(hot_reps.values())))
                for rep, hits in hot_reps.items():
                    common = hotspot_reps.intersection(set(hits))
                    if len(common) > 0:
                        hotspot_reps = hotspot_reps - common
                    else:
                        excluded.append(rep)
                excluded_reps.extend(excluded)
                # remove representatives that only led to low BSR
                excluded_reps.extend(low_reps)
                representatives = [rep for rep in representatives
                                   if rep not in excluded_reps]
                ids_to_blast = [i for i in ids_to_blast if i not in excluded_reps]
                # determine next representative from candidates
                rep_candidates = list(set(hotspots) - hitting_high)
                # sort to guarantee reproducible results with same datasets
                # (candidates must be convertible to int)
                rep_candidates = sorted(rep_candidates, key=lambda x: int(x))
                representatives, final_representatives = select_candidate(rep_candidates,
                                                                          prot_seqs,
                                                                          ids_to_blast,
                                                                          representatives,
                                                                          final_representatives)
                # remove files created for current gene iteration
                os.remove(rep_file)
                os.remove(blast_output)
                os.remove(ids_file)
        else:
            # locus with a single sequence: every protein is a representative
            final_representatives = list(prot_seqs.keys())
        # write schema file with all alleles
        gene_lines = fao.fasta_lines(list(gene_seqs.keys()), gene_seqs)
        fo.write_list(gene_lines, gene_file)
        # get total number of valid sequences
        valid_sequences = len(gene_lines)
        # write schema file with representatives
        # (representative identifiers are translated through seqids_map
        # before being looked up in gene_seqs)
        final_representatives = [seqids_map[rep] for rep in final_representatives]
        gene_rep_lines = fao.fasta_lines(final_representatives, gene_seqs)
        fo.write_list(gene_rep_lines, gene_short_file)
        # get number of representatives
        representatives_number = len(gene_rep_lines)
        summary_stats.append([gene_id, str(total_sequences),
                              str(valid_sequences),
                              str(representatives_number)])
        # temp directory is no longer needed for this locus
        shutil.rmtree(gene_temp_dir)
    return [invalid_alleles, invalid_genes, summary_stats]