# Module-level imports assumed by this section; hio and logger follow their
# usage below (hicstuff's io helpers and shared logger).
import csv
import os
import sys
import shutil as st
from collections import OrderedDict

import numpy as np
import pandas as pd

import hicstuff.io as hio
from hicstuff.log import logger


def filter_pcr_dup(pairs_idx_file, filtered_file):
    """
    Filter out PCR duplicates from a coordinate-sorted pairs file using
    overrepresented exact coordinates. If multiple pairs have reads with the
    exact same coordinates, only one of those pairs is kept.

    Parameters
    ----------
    pairs_idx_file : str
        Path to an indexed pairs file containing the Hi-C reads.
    filtered_file : str
        Path to the output pairs file after removing duplicates.
    """
    # Keep count of how many reads are filtered
    filter_count = 0
    reads_count = 0
    # Store header lines
    header = hio.get_pairs_header(pairs_idx_file)
    with open(pairs_idx_file, "r") as pairs, open(filtered_file, "w") as filtered:
        # Copy header lines to the filtered file, skipping them in the input
        for head_line in header:
            filtered.write(head_line + "\n")
            next(pairs)
        # Use csv methods to easily access columns
        paircols = [
            "readID",
            "chr1",
            "pos1",
            "chr2",
            "pos2",
            "strand1",
            "strand2",
            "frag1",
            "frag2",
        ]
        # Columns used for comparison of coordinates
        coord_cols = [col for col in paircols if col != "readID"]
        pair_reader = csv.DictReader(pairs, delimiter="\t", fieldnames=paircols)
        filt_writer = csv.DictWriter(filtered, delimiter="\t", fieldnames=paircols)
        # Store the coordinates of the previous pair; duplicates are adjacent
        # because the input is coordinate-sorted
        prev = {k: 0 for k in paircols}
        for pair in pair_reader:
            reads_count += 1
            # If coordinates are the same as before, skip the pair
            if all(pair[pair_var] == prev[pair_var] for pair_var in coord_cols):
                filter_count += 1
                continue
            # Else write the pair and store its coordinates as previous
            else:
                filt_writer.writerow(pair)
                prev = pair
    logger.info(
        "%.1f%% PCR duplicates have been filtered out (%d / %d pairs)",
        100 * filter_count / max(reads_count, 1),  # avoid div by zero on empty input
        filter_count,
        reads_count,
    )
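
# A minimal usage sketch for filter_pcr_dup (file names are hypothetical,
# and this demo helper is not part of the library). The function assumes
# duplicates sit on consecutive lines, so the indexed pairs file is
# coordinate-sorted first with hio.sort_pairs, as elsewhere in this module.
def _demo_filter_pcr_dup():
    # Sort the indexed pairs by coordinates so duplicates become adjacent
    hio.sort_pairs(
        "sample.idx.pairs",
        "sample.idx.sorted.pairs",
        ["chr1", "pos1", "chr2", "pos2"],
    )
    # Keep a single representative of each set of identical pairs
    filter_pcr_dup("sample.idx.sorted.pairs", "sample.idx_filtered.pairs")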
def get_distance_law(
    pairs_reads_file,
    fragments_file,
    centro_file=None,
    base=1.1,
    out_file=None,
    circular=False,
    rm_centro=0,
):
    """Compute the distance law as a function of the genomic coordinate,
    aka P(s). Bin length increases exponentially with distance. Works on the
    pairs file format from the 4D Nucleome Omics Data Standards Working
    Group. If the genome is composed of several chromosomes and you want to
    compute the arms separately, provide a file with the positions of the
    centromeres. Creates a file with three tab-separated columns: the first
    column contains the xs, the second the ps and the third the name of the
    arm or chromosome. The file is created at the path given by out_file; if
    no path is given, the output is returned instead.

    Parameters
    ----------
    pairs_reads_file : string
        Path to a pairs file in the format of the 4D Nucleome Omics Data
        Standards Working Group, whose 8th and 9th columns are the fragment
        IDs of reads 1 and 2.
    fragments_file : path
        Path to a table containing the fragment ID in the first column, the
        chromosome name in the second, and the start and end positions of
        the fragment in the third and fourth. The file has a single header
        line. (A file like the 'fragments_list.txt' from hicstuff.)
    centro_file : None or str
        None, or path to a file with the genomic positions of the
        centromeres, sorted like the chromosomes and separated by spaces.
        The file has only one line.
    base : float
        Base used to construct the logspace of the bins - 1.1 by default.
    out_file : None or str
        Path of the output file. If no path is given, the output is returned.
    circular : bool
        If True, calculate distances as if the chromosomes were circular.
        Default value is False. Cannot be True if centro_file is not None.
    rm_centro : int
        If a value is given, contacts closer to the centromeres than that
        many kb will be removed. Default is 0.

    Returns
    -------
    xs : list of numpy.ndarray
        Basepair coordinates of log bins used to compute the distance law.
    ps : list of numpy.ndarray
        Contact values, in arbitrary units, at increasingly long genomic
        ranges given by xs.
    names : list of strings
        Names of the chromosomes that are plotted.
    """
    # Sanity check: centro_file should be None if chromosomes are circular
    # (circular chromosomes have no centromeres).
    if circular and centro_file is not None:
        logger.error("Chromosomes cannot have a centromere and be circular")
        sys.exit(1)
    # Import the first four columns of the fragments file
    fragments = pd.read_csv(fragments_file, sep="\t", header=0, usecols=[0, 1, 2, 3])
    # Calculate the indices of the bins separating chromosomes/arms
    chr_segment_bins = get_chr_segment_bins_index(fragments, centro_file, rm_centro)
    # Calculate the length of each chromosome/arm
    chr_segment_length = get_chr_segment_length(fragments, chr_segment_bins)
    xs = logbins_xs(fragments, chr_segment_length, base, circular)
    # Create the list of p(s), with one array per chromosome/arm, each
    # containing as many values as there are logbins
    ps = [None] * len(chr_segment_length)
    for i in range(len(xs)):
        ps[i] = [0] * len(xs[i])
    # Read the pairs file
    with open(pairs_reads_file, "r", newline="") as reads:
        # Skip the header lines
        header_length = len(hio.get_pairs_header(pairs_reads_file))
        for i in range(header_length):
            next(reads)
        # Read all other lines and put the values in a dictionary with the
        # keys: 'readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1',
        # 'strand2', 'frag1', 'frag2'
        reader = csv.DictReader(
            reads,
            fieldnames=[
                "readID",
                "chr1",
                "pos1",
                "chr2",
                "pos2",
                "strand1",
                "strand2",
                "frag1",
                "frag2",
            ],
            delimiter="\t",
        )
        # Iterate over each line of the file after the header
        for line in reader:
            get_pairs_distance(
                line, fragments, chr_segment_bins, chr_segment_length, xs, ps, circular
            )
    # Divide the number of contacts by the area of the logbin
    for i in range(len(xs)):
        n = chr_segment_length[i]
        for j in range(len(xs[i]) - 1):
            # Use the area of a trapezium to compute the area of the logbin,
            # with n the size of the matrix.
            ps[i][j] /= ((2 * n - xs[i][j + 1] - xs[i][j]) / 2) * (
                (1 / np.sqrt(2)) * (xs[i][j + 1] - xs[i][j])
            )
        # Case of the last logbin, which is an isosceles right triangle
        ps[i][-1] /= ((n - xs[i][-1]) ** 2) / 2
    names = get_names(fragments, chr_segment_bins)
    if out_file:
        export_distance_law(xs, ps, names, out_file)
    return xs, ps, names
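
# A minimal usage sketch for get_distance_law (paths are hypothetical, and
# this demo helper is not part of the library). With out_file=None the
# xs/ps/names lists are returned instead of written, so the P(s) curve of
# each chromosome or arm can be inspected directly.
def _demo_get_distance_law():
    xs, ps, names = get_distance_law(
        "sample.idx_filtered.pairs",
        "fragments_list.txt",
        base=1.1,
    )
    for x, p, name in zip(xs, ps, names):
        # x: log-bin coordinates in bp; p: normalized contact values
        logger.info("%s: %d log bins", name, len(x))
    return xs, ps, names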
def chrom_contact_to_bedgraph(
    pairs_file, bg_file, focus_chrom, binsize=5000, threads=1, buffer="2G"
):
    """
    Compute a 1D coverage track of contacts with a given chromosome from a
    pairs file and write it to a bedgraph file.

    Parameters
    ----------
    pairs_file : str
        Path to the input pairs file. It is tab separated with header lines
        starting with "#" and contains 7 columns: readID,chr1,pos1,chr2,pos2,
        strand1,strand2. Assumes the pairs are upper triangle (chr1,pos1 is
        always smaller than chr2,pos2).
    bg_file : str
        Path to the output bedgraph file. It is tab separated and contains 4
        columns: chrom,start,end,value.
    focus_chrom : str
        Name of the chromosome with which contacts should be computed.
    binsize : int
        Size of the bins in which to compute contacts, in basepairs.
    threads : int
        Number of threads to use for sorting the pairs file.
    buffer : str
        Memory buffer to use for sorting (e.g. "2G").

    Examples
    --------
    chrom_contact_to_bedgraph('libxyz.pairs', 'libxyz.bedgraph', 'HBV')
    """
    # Sort the pairs file by coordinates and check how many lines the header has
    hio.sort_pairs(
        pairs_file,
        pairs_file + ".sorted",
        ["chr1", "pos1", "chr2", "pos2"],
        threads=threads,
        buffer=buffer,
    )
    header = hio.get_pairs_header(pairs_file + ".sorted")
    # Create bedgraph bins for each chromosome
    bg_bins = OrderedDict()
    for line in header:
        if line.startswith("#chromsize"):
            chrom = line.split()[1:]
            # Initialize empty bins for each chromosome (at least 1)
            bg_bins[chrom[0]] = np.zeros(max(int(chrom[1]) // binsize, 1))
    # bg_bins should look like this: {'chr1': [0, 0, 0, ...], 'chr2': ...}
    # where each array has as many zeros as there are bins in the chromosome.
    with open(pairs_file + ".sorted", "r") as pairs:
        # Ignore header lines
        for _ in range(len(header)):
            next(pairs)
        # Define a reader object for the input pairs file.
        pairs_cols = ["readID", "chr1", "pos1", "chr2", "pos2", "strand1", "strand2"]
        pairs_reader = csv.DictReader(pairs, delimiter="\t", fieldnames=pairs_cols)
        # Read and record pair contacts with the focus chromosome
        for pair in pairs_reader:
            try:
                # If the chromosome of interest is the first mate, increment
                # the second mate's bin coverage by 1
                if pair["chr1"] == focus_chrom:
                    bg_bins[pair["chr2"]][int(pair["pos2"]) // binsize] += 1
                # If it is the second mate, increment the first mate's bin
                elif pair["chr2"] == focus_chrom:
                    bg_bins[pair["chr1"]][int(pair["pos1"]) // binsize] += 1
            except IndexError:
                # Position falls beyond the last bin of its chromosome (e.g.
                # inconsistent header); report the pair and keep going.
                logger.warning("Pair position out of bounds, skipped: %s", pair)
    # Write contact values as a 1D bedgraph file
    with open(bg_file, "w") as bg:
        for chrom in bg_bins:
            for bin_idx, bin_contacts in enumerate(bg_bins[chrom]):
                # Write a single line as "chrom start end contacts"
                bg.write(
                    "{0}\t{1}\t{2}\t{3}\n".format(
                        chrom, bin_idx * binsize, (bin_idx + 1) * binsize, bin_contacts
                    )
                )
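
# An illustrative follow-up to chrom_contact_to_bedgraph (paths are
# hypothetical, and this demo helper is not part of the library): since the
# output is a plain 4-column bedgraph, pandas can load it to locate the host
# bins with the most contacts with the focus chromosome.
def _demo_inspect_bedgraph():
    chrom_contact_to_bedgraph("libxyz.pairs", "libxyz.bedgraph", "HBV")
    bg = pd.read_csv(
        "libxyz.bedgraph",
        sep="\t",
        names=["chrom", "start", "end", "contacts"],
    )
    # The 10 bins with the highest contact counts
    return bg.sort_values("contacts", ascending=False).head(10)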
def attribute_fragments(pairs_file, idx_pairs_file, restriction_table):
    """
    Write an indexed pairs file, which has two more columns than the input
    pairs file, corresponding to the restriction fragment index of each read.
    Note that pairs files use 1-based point positions whereas the restriction
    table uses 0-based positions.

    Parameters
    ----------
    pairs_file: str
        Path to the input pairs file. Consists of 7 whitespace-separated
        columns: readID, chr1, pos1, chr2, pos2, strand1, strand2.
    idx_pairs_file: str
        Path to the output indexed pairs file. Consists of 9 whitespace-
        separated columns: readID, chr1, pos1, chr2, pos2, strand1, strand2,
        frag1, frag2. frag1 and frag2 are 0-based restriction fragment
        indices based on the whole genome.
    restriction_table: dict
        Dictionary with chromosome identifiers (str) as keys and lists of
        restriction site positions (int) as values.
    """
    # NOTE: Bottlenecks here are 1. the binary search in find_frag and
    # 2. writerow. 1. could be reduced by searching groups of N frags in
    # parallel and 2. by writing N frags simultaneously using a single call
    # of writerows.

    # Parse and update the header section
    pairs_header = hio.get_pairs_header(pairs_file)
    header_size = len(pairs_header)
    chrom_order = []
    with open(idx_pairs_file, "w") as idx_pairs:
        for line in pairs_header:
            # Add new column names to the header
            if line.startswith("#columns"):
                line = line.rstrip() + " frag1 frag2"
            if line.startswith("#chromsize"):
                chrom_order.append(line.split()[1])
            idx_pairs.write(line + "\n")

    # Get the number of fragments per chromosome to allow genome-based indices
    shift_frags = {}
    prev_frags = 0
    for rank, chrom in enumerate(chrom_order):
        if rank > 0:
            # Note the "-1": there are nfrags + 1 sites in the restriction table
            prev_frags += len(restriction_table[chrom_order[rank - 1]]) - 1
        # Indices of each chromosome's frags are shifted by the number of
        # frags in the previous chromosomes
        shift_frags[chrom] = prev_frags

    missing_contigs = set()
    # Attribute pairs to fragments and append them to the output file (after
    # the header)
    with open(pairs_file, "r") as pairs, open(idx_pairs_file, "a") as idx_pairs:
        # Skip header lines
        for _ in range(header_size):
            next(pairs)
        # Define input and output fields
        pairs_cols = [
            "readID",
            "chr1",
            "pos1",
            "chr2",
            "pos2",
            "strand1",
            "strand2",
        ]
        idx_cols = pairs_cols + ["frag1", "frag2"]
        # Use csv reader / writer to automatically parse columns into a dict
        pairs_reader = csv.DictReader(pairs, fieldnames=pairs_cols, delimiter="\t")
        pairs_writer = csv.DictWriter(idx_pairs, fieldnames=idx_cols, delimiter="\t")
        for pair in pairs_reader:
            # Discard pairs on contigs absent from the pairs file header, as
            # they cannot be given genome-based fragment indices.
            if pair["chr1"] not in shift_frags or pair["chr2"] not in shift_frags:
                for chrom in (pair["chr1"], pair["chr2"]):
                    if chrom not in shift_frags:
                        missing_contigs.add(chrom)
                continue
            # Get the 0-based indices of the corresponding restriction
            # fragments. Subtract 1 from each pair position to convert it to
            # a 0-based point position.
            pair["frag1"] = find_frag(
                int(pair["pos1"]) - 1, restriction_table[pair["chr1"]]
            )
            pair["frag2"] = find_frag(
                int(pair["pos2"]) - 1, restriction_table[pair["chr2"]]
            )
            # Shift fragment indices to make them genome-based instead of
            # chromosome-based
            pair["frag1"] += shift_frags[pair["chr1"]]
            pair["frag2"] += shift_frags[pair["chr2"]]
            # Write the indexed pair to the new file
            pairs_writer.writerow(pair)
    if missing_contigs:
        logger.warning(
            "Pairs on the following contigs were discarded as those contigs "
            "are not listed in the pairs file header. This is normal if you "
            "filtered out small contigs: %s" % " ".join(list(missing_contigs))
        )
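
# A minimal usage sketch for attribute_fragments with a hypothetical toy
# restriction table (this demo helper is not part of the library). For each
# chromosome the table lists nfrags + 1 sorted 0-based site positions,
# including position 0 and the chromosome end, as assumed by the "-1" in the
# fragment shift computation above.
def _demo_attribute_fragments():
    restriction_table = {
        "chr1": [0, 1520, 3905, 10000],  # 3 fragments on a 10 kb chr1
        "chr2": [0, 2340, 8000],  # 2 fragments on an 8 kb chr2
    }
    attribute_fragments("sample.pairs", "sample.idx.pairs", restriction_table)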
def pairs2matrix(
    pairs_file, mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None
):
    """Generate a matrix by counting the number of occurrences of each
    combination of restriction fragments in a pairs file.

    Parameters
    ----------
    pairs_file : str
        Path to a Hi-C pairs file, with frag1 and frag2 columns added.
    mat_file : str
        Path where the matrix will be written.
    fragments_file : str
        Path to the fragments_list.txt file. Used to know the total matrix
        size, in case some fragments at the end have no observed contacts.
    mat_fmt : str
        The format to use when writing the matrix. Can be "graal" or "bg2".
    threads : int
        Number of threads to use in parallel.
    tmp_dir : str
        Temporary directory for sorting files. If None is given, the system
        default is used.
    """
    # Number of fragments is the number of lines in the frag list minus the
    # header
    with open(fragments_file, "r") as frag_list:
        n_frags = sum(1 for _ in frag_list) - 1
    frags = pd.read_csv(fragments_file, delimiter="\t")

    def write_mat_entry(frag1, frag2, contacts):
        """Write a single sparse matrix entry in either graal or bg2 format"""
        if mat_fmt == "graal":
            mat.write("\t".join(map(str, [frag1, frag2, contacts])) + "\n")
        elif mat_fmt == "bg2":
            frag1, frag2 = int(frag1), int(frag2)
            mat.write(
                "\t".join(
                    map(
                        str,
                        [
                            frags.chrom[frag1],
                            frags.start_pos[frag1],
                            frags.end_pos[frag1],
                            frags.chrom[frag2],
                            frags.start_pos[frag2],
                            frags.end_pos[frag2],
                            contacts,
                        ],
                    )
                )
                + "\n"
            )

    pre_mat_file = mat_file + ".pre.pairs"
    hio.sort_pairs(
        pairs_file,
        pre_mat_file,
        keys=["frag1", "frag2"],
        threads=threads,
        tmp_dir=tmp_dir,
    )
    header_size = len(hio.get_pairs_header(pre_mat_file))
    with open(pre_mat_file, "r") as pairs, open(mat_file, "w") as mat:
        # Skip header lines
        for _ in range(header_size):
            next(pairs)
        prev_pair = ["0", "0"]  # Pairs identified by [frag1, frag2]
        n_occ = 0  # Number of occurrences of each frag combination
        n_nonzero = 0  # Total number of nonzero matrix entries
        n_pairs = 0  # Total number of pairs entered in the matrix
        pairs_reader = csv.reader(pairs, delimiter="\t")
        # The first line contains nrows, ncols and the number of nonzero
        # entries. The number of nonzero entries is unknown for now.
        if mat_fmt == "graal":
            mat.write("\t".join(map(str, [n_frags, n_frags, "-"])) + "\n")
        for pair in pairs_reader:
            # Fragment ids are fields 8 and 9
            curr_pair = [pair[7], pair[8]]
            # Increment the number of occurrences of the fragment pair
            if prev_pair == curr_pair:
                n_occ += 1
            # Write the previous pair and start a new one
            else:
                # n_occ is 0 only before the first pair has been read; count
                # a nonzero entry only when one is actually written
                if n_occ > 0:
                    write_mat_entry(prev_pair[0], prev_pair[1], n_occ)
                    n_nonzero += 1
                prev_pair = curr_pair
                n_pairs += n_occ
                n_occ = 1
        # Write the last fragment pair and its full occurrence count
        write_mat_entry(curr_pair[0], curr_pair[1], n_occ)
        n_nonzero += 1
        n_pairs += n_occ
    # Edit the header line to fill in the number of nonzero entries in place,
    # for the graal format
    if mat_fmt == "graal":
        with open(mat_file) as mat, open(pre_mat_file, "w") as tmp_mat:
            header = mat.readline()
            header = header.replace("-", str(n_nonzero))
            tmp_mat.write(header)
            st.copyfileobj(mat, tmp_mat)
        # Replace the matrix file with the one with the corrected header
        os.rename(pre_mat_file, mat_file)
    else:
        os.remove(pre_mat_file)

    logger.info(
        "%d pairs used to build a contact map of %d bins with %d nonzero entries.",
        n_pairs,
        n_frags,
        n_nonzero,
    )
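
# A minimal end-to-end sketch chaining the functions in this section (file
# names are hypothetical, and this demo helper is not part of the library):
# index the pairs with restriction fragments, sort, remove PCR duplicates,
# then build a graal-format sparse matrix.
def _demo_pairs_pipeline(restriction_table):
    # Add frag1/frag2 columns based on the restriction table
    attribute_fragments("sample.pairs", "sample.idx.pairs", restriction_table)
    # Coordinate-sort so duplicate pairs become adjacent
    hio.sort_pairs(
        "sample.idx.pairs",
        "sample.idx.sorted.pairs",
        ["chr1", "pos1", "chr2", "pos2"],
    )
    # Drop PCR duplicates, then count fragment-pair occurrences into a matrix
    filter_pcr_dup("sample.idx.sorted.pairs", "sample.idx_filtered.pairs")
    pairs2matrix(
        "sample.idx_filtered.pairs",
        "sample.mat.tsv",
        "fragments_list.txt",
        mat_fmt="graal",
    )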