def extract_subnetworks(
    partition_file,
    network_file,
    output_dir,
    max_cores=DEFAULT_MAX_CORES,
    max_size_matrix=DEFAULT_MAX_SIZE_MATRIX,
    saturation_threshold=DEFAULT_SATURATION_THRESHOLD,
):
    """Extract bin subnetworks from the main network

    Identify bins, extract their subnetworks, draw the corresponding
    adjacency matrices, and save it all in a specified output directory.

    Parameters
    ----------
    partition_file : file, str or pathlib.Path
        The file containing, for each chunk, the communities it was
        assigned to at each iteration.
    network_file : file, str or pathlib.Path
        The file containing the network in sparse (edge list) format.
    output_dir : str or pathlib.Path
        The output directory to write the subnetworks into.
    max_cores : int, optional
        The maximum number of bins to extract. Default is 100.
    max_size_matrix : int, optional
        When rendering contact maps for each bin, the maximum size for the
        matrix. Default is 2000.
    saturation_threshold : float, optional
        When rendering contact maps for each bin, the percentile value over
        which the color map should be saturated. Default is 80.
    """

    logger.info("Loading partition...")
    data_chunks = np.loadtxt(partition_file, usecols=(1,), dtype=np.int32)

    logger.info("Loading network...")
    network = np.loadtxt(network_file, dtype=np.int32)

    cores = data_chunks

    # Relabel both ends of every edge with the core (bin) of the
    # corresponding chunk
    core_network = np.copy(network)
    core_network[:, 0] = cores[network[:, 0]]
    core_network[:, 1] = cores[network[:, 1]]

    n = np.amax(cores) + 1

    def extract(network_to_keep, filename):
        subnetwork = np.copy(network[network_to_keep])
        subnetwork[:, 0] -= 1
        subnetwork[:, 1] -= 1
        np.savetxt(filename, subnetwork, fmt="%i")
        return subnetwork

    def draw(subnetwork, filename):
        try:
            # Numpy array format
            row = subnetwork[:, 0]
            col = subnetwork[:, 1]
            data = subnetwork[:, 2]
        except TypeError:
            # Scipy sparse format
            row = subnetwork.row
            col = subnetwork.col
            data = subnetwork.data

        # Symmetrize the subnetwork and compact its node labels into a
        # dense 1..k range so it can be rendered as a small matrix
        row_indices = stats.rankdata(
            np.concatenate((row, col)), method="dense"
        )
        col_indices = stats.rankdata(
            np.concatenate((col, row)), method="dense"
        )
        data = np.concatenate((data, data))

        unique_row = np.unique(row)
        unique_col = np.unique(col)

        size = len(np.unique(np.concatenate((unique_row, unique_col)))) + 1

        try:
            sparse_subnet = sparse.coo_matrix(
                (data, (row_indices, col_indices)), shape=(size, size)
            )
            binning_factor = (size // max_size_matrix) + 1
            binned_subnet = hcs.bin_sparse(
                sparse_subnet, subsampling_factor=binning_factor
            )
            dense_subnet = binned_subnet.todense()

            # Normalize the matrix with its diagonal removed, then saturate
            # the color map at the requested percentile
            diagonal = np.diag(np.diag(dense_subnet))
            normed_subnet = hcs.normalize_dense(dense_subnet - diagonal)
            vmax = np.percentile(normed_subnet, saturation_threshold)

            spaceless_pdf_plot_maker(normed_subnet, filename, vmax=vmax)

        except MemoryError:
            logger.warning("Couldn't save matrix due to memory issues")

    def extract_and_draw(network_to_keep, filename_text, filename_image):
        subnetwork = extract(network_to_keep, filename=filename_text)
        draw(subnetwork, filename=filename_image)

    # Extract and draw subnetworks for chosen cores and draw 2D arrays
    global_network_indices_list = []
    for i in range(1, n):
        if i > max_cores:
            break

        # Keep only the edges whose two ends both belong to core i
        # (elementwise AND on the boolean masks)
        network_to_keep_1 = core_network[:, 0] == i
        network_to_keep_2 = core_network[:, 1] == i
        network_to_keep = network_to_keep_1 * network_to_keep_2

        nonzero_indices, = np.nonzero(network_to_keep)
        global_network_indices_list += nonzero_indices.tolist()

        subnetwork_file = os.path.join(
            output_dir, "subnetwork_core_{}.dat".format(i)
        )
        image_name = os.path.join(output_dir, "core_{}.eps".format(i))

        extract_and_draw(
            network_to_keep=network_to_keep,
            filename_text=subnetwork_file,
            filename_image=image_name,
        )
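
# A minimal usage sketch for extract_subnetworks (illustrative only: the
# file names below are hypothetical placeholders, not files shipped with
# this module):
#
#     extract_subnetworks(
#         partition_file="partition.txt",  # column 1: community per chunk
#         network_file="network.txt",      # edge list: id1  id2  weight
#         output_dir="subnetworks",
#         max_cores=50,                    # stop after bin id 50
#     )
#
# Each bin i then yields an edge list "subnetwork_core_i.dat" and a contact
# map "core_i.eps" inside output_dir.
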
def alignment_to_contacts(
    sam_merged,
    assembly,
    output_dir,
    output_file_network=DEFAULT_NETWORK_FILE_NAME,
    output_file_chunk_data=DEFAULT_CHUNK_DATA_FILE_NAME,
    parameters=DEFAULT_PARAMETERS,
):
    """Generate a network file (in edgelist form) from an alignment in
    SAM or BAM format. Contigs are virtually split into 'chunks' of nearly
    fixed size (by default between 500 and 1000 bp) to reduce size bias.
    The chunks are the network nodes and the edges are the contact counts.

    The network is in a strict barebone form so that it can be reused and
    imported quickly into other applications. Verbose information about
    every single node in the network is written to a 'chunk data' file,
    by default called 'idx_contig_hit_size_cov.txt'.

    Parameters
    ----------
    sam_merged : file, str or pathlib.Path
        The alignment file in SAM/BAM format to be processed.
    assembly : file, str or pathlib.Path
        The initial assembly acting as the alignment file's reference
        genome.
    output_dir : str or pathlib.Path
        The output directory to write the network and chunk data into.
    output_file_network : str or pathlib.Path, optional
        The specific file name for the output network file. Default is
        network.txt
    output_file_chunk_data : str or pathlib.Path, optional
        The specific file name for the output chunk data file. Default is
        idx_contig_hit_size_cov.txt
    parameters : dict, optional
        A dictionary of parameters for converting the alignment file into
        a network. These are:
        -size_chunk_threshold: the size (in bp) under which chunks are
        discarded. Default is 500.
        -mapq_threshold: the mapping quality under which alignments are
        discarded. Default is 10.
        -chunk_size: the default chunk size (in bp) when applicable, save
        for smaller contigs or tail-ends. Default is 1000.
        -read_size: the size of reads used for mapping. Default is 65.
        -self_contacts: whether to count alignments between a chunk and
        itself. Default is False.
        -normalized: whether to normalize contacts by their coverage.
        Default is False.

    Returns
    -------
    chunk_complete_data : dict
        A dictionary where the keys are chunks in (contig, position) form
        and the values are their id, name, total contact count, size and
        coverage.
    all_contacts : dict
        A counter dictionary where the keys are chunk pairs and the values
        are their contact count.
    """

    all_contacts = collections.Counter()
    all_chunks = collections.Counter()

    # Initialize parameters
    chunk_size = int(parameters["chunk_size"])
    mapq_threshold = int(parameters["mapq_threshold"])
    size_chunk_threshold = int(parameters["size_chunk_threshold"])
    read_size = int(parameters["read_size"])
    self_contacts = parameters["self_contacts"]
    normalized = parameters["normalized"]

    logger.info("Establishing chunk list...")

    chunk_complete_data = dict()

    # Get all information about all chunks from all contigs
    # (this gets updated at the end)
    global_id = 1
    for record in SeqIO.parse(assembly, "fasta"):
        length = len(record.seq)

        # One chunk per whole chunk_size stretch, plus one for the tail
        # end if it is big enough to keep
        n_chunks = length // chunk_size
        n_chunks += (length % chunk_size) >= size_chunk_threshold

        for i in range(n_chunks):
            if (i + 1) * chunk_size <= length:
                size = chunk_size
            else:
                size = length % chunk_size

            chunk_name = "{}_{}".format(record.id, i)
            chunk_complete_data[chunk_name] = {
                "id": global_id,
                "hit": 0,
                "size": size,
                "coverage": 0,
            }
            global_id += 1

    logger.info("Opening alignment files...")

    current_read = None

    # Read the BAM file to detect contacts.
    with pysam.AlignmentFile(sam_merged, "rb") as alignment_merged_handle:

        names = alignment_merged_handle.references
        lengths = alignment_merged_handle.lengths
        names_and_lengths = {
            name: length for name, length in zip(names, lengths)
        }

        logger.info("Reading contacts...")

        # Since the BAM file is supposed to be sorted and interleaved,
        # pairs should always be grouped with one below the other (the
        # exact order doesn't matter since the network is symmetric, so we
        # simply treat the first one as 'forward' and the second one as
        # 'reverse'). We keep iterating until two consecutive reads have
        # the same name, discarding ones that don't.
        while "Reading forward and reverse alignments alternatively":
            try:
                my_read = next(alignment_merged_handle)

                if current_read is None:
                    # First read
                    current_read = my_read
                    continue
                elif current_read.query_name != my_read.query_name:
                    current_read = my_read
                    continue

                read_forward, read_reverse = current_read, my_read

            except StopIteration:
                break

            # Get a bunch of info about the alignments to pass the tests below
            read_name_forward = read_forward.query_name
            read_name_reverse = read_reverse.query_name

            flag_forward, flag_reverse = read_forward.flag, read_reverse.flag

            try:
                assert read_name_forward == read_name_reverse
            except AssertionError:
                logger.error(
                    "Reads don't have the same name: %s and %s",
                    read_name_forward,
                    read_name_reverse,
                )
                raise

            # The 0x4 bit of the SAM flag (third digit from the right in
            # base 2) means the segment is unmapped
            def is_unmapped(flag):
                return flag & 0x4 != 0

            if is_unmapped(flag_forward) or is_unmapped(flag_reverse):
                # One end is unmapped: skip the pair
                continue

            contig_name_forward = read_forward.reference_name
            contig_name_reverse = read_reverse.reference_name

            len_contig_for = names_and_lengths[contig_name_forward]
            len_contig_rev = names_and_lengths[contig_name_reverse]

            position_forward = read_forward.reference_start
            position_reverse = read_reverse.reference_start

            mapq_forward = read_forward.mapping_quality
            mapq_reverse = read_reverse.mapping_quality

            # Some more tests: checking for size, map quality, map status etc.
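            # Worked example for the tests below (illustrative numbers, not
            # from any real dataset): with chunk_size = 1000 and
            # size_chunk_threshold = 500, a 3200 bp contig has full chunks
            # at [0, 1000), [1000, 2000), [2000, 3000) and a 200 bp tail at
            # [3000, 3200). A read at position 3100 falls in the tail, whose
            # size (200) is below the threshold, so the pair is discarded;
            # a read at position 2500 falls in a full chunk and passes.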
            mapq_test = min(mapq_forward, mapq_reverse) > mapq_threshold

            min_length = min(len_contig_for, len_contig_rev)
            length_test = min_length > size_chunk_threshold

            # Trickiest test:
            #
            #                      contig
            #       pos1                            pos2
            #        ^                               ^
            #   |-------|-------|-------|-------|---|
            #   <-------><------><------><------><-->  <->
            #     chunk    chunk                 tail  size_chunk_threshold
            #
            # Test is passed if tail >= size_chunk_threshold (pos2)
            # or if the position is a non-tail chunk (pos1)
            if position_forward < chunk_size * (len_contig_for // chunk_size):
                current_chunk_forward_size = chunk_size
            else:
                current_chunk_forward_size = len_contig_for % chunk_size

            if position_reverse < chunk_size * (len_contig_rev // chunk_size):
                current_chunk_reverse_size = chunk_size
            else:
                current_chunk_reverse_size = len_contig_rev % chunk_size

            min_chunk_size = min(
                current_chunk_forward_size, current_chunk_reverse_size
            )
            chunk_test = min_chunk_size >= size_chunk_threshold

            if mapq_test and length_test and chunk_test:
                chunk_forward = position_forward // chunk_size
                chunk_reverse = position_reverse // chunk_size

                chunk_name_forward = "{}_{}".format(
                    contig_name_forward, chunk_forward
                )
                chunk_name_reverse = "{}_{}".format(
                    contig_name_reverse, chunk_reverse
                )

                if self_contacts or chunk_name_forward != chunk_name_reverse:
                    contact = tuple(
                        sorted((chunk_name_forward, chunk_name_reverse))
                    )
                    all_contacts[contact] += 1

                chunk_key_forward = (
                    chunk_name_forward,
                    current_chunk_forward_size,
                )
                all_chunks[chunk_key_forward] += 1

                chunk_key_reverse = (
                    chunk_name_reverse,
                    current_chunk_reverse_size,
                )
                all_chunks[chunk_key_reverse] += 1

    logger.info("Writing chunk data...")

    # Now we can update the chunk dictionary
    # with the info we gathered from the BAM file
    output_chunk_data_path = os.path.join(output_dir, output_file_chunk_data)

    with open(output_chunk_data_path, "w") as chunk_data_file_handle:
        for name in sorted(chunk_complete_data.keys()):
            chunk_data = chunk_complete_data[name]
            size = chunk_data["size"]

            chunk = (name, size)
            hit = all_chunks[chunk]
            coverage = hit * read_size * 1.0 / size

            try:
                chunk_complete_data[name]["hit"] = hit
                chunk_complete_data[name]["coverage"] = coverage
            except KeyError:
                logger.error(
                    "A mismatch was detected between the reference "
                    "genome and the genome used for the alignment "
                    "file, some sequence names were not found"
                )
                raise

            idx = chunk_complete_data[name]["id"]
            line = "{}\t{}\t{}\t{}\t{}\n".format(idx, name, hit, size, coverage)
            chunk_data_file_handle.write(line)

    # Lastly, generate the network proper
    logger.info("Writing network...")

    output_network_path = os.path.join(output_dir, output_file_network)

    with open(output_network_path, "w") as network_file_handle:
        for chunks in sorted(all_contacts.keys()):
            chunk_name1, chunk_name2 = chunks
            contact_count = all_contacts[chunks]

            if normalized:
                # Normalize each contact by the geometric mean of the two
                # chunks' coverages
                coverage1 = chunk_complete_data[chunk_name1]["coverage"]
                coverage2 = chunk_complete_data[chunk_name2]["coverage"]
                mean_coverage = np.sqrt(coverage1 * coverage2)
                effective_count = contact_count * 1.0 / mean_coverage
            else:
                effective_count = contact_count

            try:
                idx1 = chunk_complete_data[chunk_name1]["id"]
                idx2 = chunk_complete_data[chunk_name2]["id"]
                line = "{}\t{}\t{}\n".format(idx1, idx2, effective_count)
                network_file_handle.write(line)
            except KeyError as e:
                logger.warning("Mismatch detected: %s", e)

    return chunk_complete_data, all_contacts
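
# A minimal usage sketch for alignment_to_contacts (illustrative only: paths
# and parameter values below are hypothetical placeholders):
#
#     parameters = {
#         "chunk_size": 1000,
#         "size_chunk_threshold": 500,
#         "mapq_threshold": 10,
#         "read_size": 65,
#         "self_contacts": False,
#         "normalized": False,
#     }
#     chunk_data, contacts = alignment_to_contacts(
#         sam_merged="alignment_merged.bam",  # name-grouped, interleaved pairs
#         assembly="assembly.fa",
#         output_dir="out",
#         parameters=parameters,
#     )
#
# This writes out/network.txt and out/idx_contig_hit_size_cov.txt (default
# names) and returns the chunk dictionary and the contact counter for
# further processing.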