def dendropy_newick_from_dist_matrix(infile, outfile, method):
    """Build a tree from a distance matrix file with dendropy and write it as newick.

    Args:
        infile: distance matrix file. Its first line is skipped and the
            remainder is parsed as tab-delimited rows with no header row.
        outfile: file the newick string is written to.
        method: tree building method, either "upgma" or "nj".

    Raises:
        ValueError: if method is neither "upgma" nor "nj". This is raised
            after the distance matrix has been loaded.
    """
    logging.info("Calculating tree using dendropy")
    logging.info(f"Loading distance matrix file {infile}")

    with utils.open_file(infile) as matrix_handle:
        # The matrix is stored in "phylip" layout: the first line is the
        # number of samples and there is no line of sample names. So the
        # first line is consumed here, and dendropy is told that the first
        # remaining row is data, not column names.
        next(matrix_handle)
        pdm = dendropy.PhylogeneticDistanceMatrix.from_csv(
            src=matrix_handle,
            is_first_row_column_names=False,
            delimiter="\t",
        )

    if method == "upgma":
        logging.info("Calculating upgma tree")
        build_tree = pdm.upgma_tree
    elif method == "nj":
        logging.info("Calculating nj tree")
        build_tree = pdm.nj_tree
    else:
        raise ValueError(
            f"Got method {method}, but must be upgma or nj. Cannot continue"
        )
    tree = build_tree()

    logging.info(f"Writing tree to file {outfile}")
    with utils.open_file(outfile, "w") as tree_handle:
        # Single quotes are stripped from the newick string before writing
        newick = tree.as_string("newick", suppress_rooting=True).replace("'", "")
        print(newick, end="", file=tree_handle)
def load_distance_matrix_file(infile):
    """Load a distance matrix file into sample names and pairwise distances.

    The first line of the file must hold the number of samples. Each later
    line is tab-delimited: a sample name followed by distances. Only the
    lower triangle is read, i.e. for the sample at zero-based index k, the
    first k distance columns are used.

    Args:
        infile: name of the distance matrix file.

    Returns:
        Tuple (sample_names, distances). sample_names is a list in file
        order. distances is a dict mapping (i, j) with i < j (zero-based
        sample indexes) to the distance as a float.

    Raises:
        RuntimeError: if the file is empty, if the first line is not a
            single integer, or if the number of sample lines does not match
            the number given on the first line.
    """
    sample_names = []
    distances = {}
    number_of_samples = None

    with utils.open_file(infile) as f:
        for line_number, line in enumerate(f):
            if line_number == 0:
                try:
                    number_of_samples = int(line.rstrip())
                except ValueError as error:
                    raise RuntimeError(
                        f"Expected first line of distance matrix to contain a number only. Got this: {line}"
                    ) from error
            elif line_number == 1:
                # First sample: there is no earlier sample to compare with,
                # so only the name is needed
                sample_names.append(line.split()[0])
            else:
                fields = line.rstrip().split("\t", maxsplit=line_number)
                sample_names.append(fields[0])
                # This line is sample index line_number - 1. Column i holds
                # the distance to sample index i - 1, which is always the
                # smaller index, so the key is already sorted.
                for i in range(1, line_number):
                    distances[(i - 1, line_number - 1)] = float(fields[i])

    if number_of_samples is None:
        raise RuntimeError(
            f"Distance matrix file {infile} is empty. Expected first line to contain the number of samples"
        )

    if len(sample_names) != number_of_samples:
        raise RuntimeError(
            f"Expected {number_of_samples} samples in distance matrix file, but got {len(sample_names)}"
        )

    return sample_names, distances
def _write_triples_names_file(cls, triples, phenos, outfile):
    """Write a TSV file with one row per triple.

    Each row has the 1-based triple id, the case and two control sample
    names, the genotype and phenotype distance of each control, and then
    for every phenotype name (in sorted order) the value for the case,
    control1 and control2.

    Args:
        triples: iterable of triples. Each must have attributes case,
            control1 and control2, where the controls have attributes
            sample, geno_dist and pheno_dist.
        phenos: object with a pheno_types mapping, and that can be indexed
            as phenos[sample_name][pheno_name].
        outfile: name of the output file.
    """
    pheno_names = sorted(phenos.pheno_types.keys())
    header = [
        "triple_id",
        "case",
        "control1",
        "geno_dist1",
        "pheno_dist1",
        "control2",
        "geno_dist2",
        "pheno_dist2",
    ]
    # Each phenotype contributes three tab-separated columns
    header.extend(f"case_{x}\tcontrol1_{x}\tcontrol2_{x}" for x in pheno_names)

    with utils.open_file(outfile, "w") as out:
        print(*header, sep="\t", file=out)
        for triple_id, triple in enumerate(triples, start=1):
            row = [
                triple_id,
                triple.case,
                triple.control1.sample,
                triple.control1.geno_dist,
                triple.control1.pheno_dist,
                triple.control2.sample,
                triple.control2.geno_dist,
                triple.control2.pheno_dist,
            ]
            row.extend(
                f"{phenos[triple.case][x]}\t{phenos[triple.control1.sample][x]}\t{phenos[triple.control2.sample][x]}"
                for x in pheno_names
            )
            print(*row, sep="\t", file=out)
def vcf_to_variant_positions_to_mask_from_bed_file(vcf_file, bed_file):
    """Find VCF records whose REF allele overlaps a region of a BED mask.

    The mask intervals of each chromosome are walked with a single index
    that only moves forwards, and is reset to zero whenever the chromosome
    of the VCF record differs from that of the previous record seen in
    the mask.

    Args:
        vcf_file: name of the VCF file.
        bed_file: name of the BED file of regions to mask.

    Returns:
        Dict mapping chromosome name to a set of zero-based start positions
        of the VCF records that overlap the mask. Chromosomes with no
        masked records are not in the dict.
    """
    mask = _bed_mask_file_to_dict(bed_file)
    records_to_mask = {}
    active_chrom = None
    interval_index = None

    with utils.open_file(vcf_file) as f:
        for line in f:
            if line.startswith("#"):
                continue

            chrom, pos, _, ref, _ = line.split("\t", maxsplit=4)
            if chrom not in mask:
                continue

            if chrom != active_chrom:
                active_chrom, interval_index = chrom, 0

            intervals = mask[active_chrom]
            # Zero-based, end-inclusive coordinates of the REF allele
            vcf_start = int(pos) - 1
            vcf_end = vcf_start + len(ref) - 1

            # Skip intervals that end before this record starts
            while (
                interval_index < len(intervals)
                and intervals[interval_index][1] < vcf_start
            ):
                interval_index += 1

            if (
                interval_index < len(intervals)
                and intervals[interval_index][0] <= vcf_end
            ):
                records_to_mask.setdefault(chrom, set()).add(vcf_start)

    return records_to_mask
def test_open_file():
    """Check utils.open_file fails on a missing file and round-trips text.

    The round trip is run for a plain file name and for one ending in .gz.
    """
    missing_file = "tmp.open_file"
    # Make sure the file is absent. Done with os instead of running "rm -f"
    # through a shell, so the test does not depend on a shell being available.
    if os.path.exists(missing_file):
        os.unlink(missing_file)

    with pytest.raises(OSError):
        with utils.open_file(missing_file):
            pass

    for tmp_file in ("tmp.open_file", "tmp.open_file.gz"):
        with utils.open_file(tmp_file, "w") as f:
            print("TEST", file=f)
            print("TEST2", file=f)
        assert os.path.exists(tmp_file)

        with utils.open_file(tmp_file) as f:
            lines = [x.rstrip() for x in f]
        assert lines == ["TEST", "TEST2"]
        os.unlink(tmp_file)
def _load_phenotypes_tsv_file(cls, infile):
    """Load a TSV file of phenotypes, which must have a 'sample' column.

    Every other column is treated as a phenotype. Each value is converted
    with Phenotypes.convert_one_variable_string.

    Args:
        infile: name of the phenotypes TSV file.

    Returns:
        Tuple (phenos, pheno_types). phenos maps sample name to a dict of
        phenotype name -> converted value. pheno_types maps phenotype name
        to the set of Python types seen for that phenotype over all samples.

    Raises:
        RuntimeError: if there is no 'sample' column (which includes the
            case of an empty file), or if a sample name is duplicated.
    """
    phenos = {}
    with utils.open_file(infile) as f:
        reader = csv.DictReader(f, delimiter="\t")
        # fieldnames is None when the file is empty. Treat that as having
        # no columns, so that the error below is raised instead of a
        # TypeError from testing membership of None.
        fieldnames = reader.fieldnames or []
        if "sample" not in fieldnames:
            raise RuntimeError(
                f"Must have a 'sample' column in phenotypes file. Not found in file {infile}"
            )

        pheno_types = {x: set() for x in fieldnames if x != "sample"}

        for row in reader:
            sample = row["sample"]
            if sample in phenos:
                raise RuntimeError(
                    f"Duplicate sample name '{sample}' in phenotypes file {infile}"
                )

            phenos[sample] = {
                x: Phenotypes.convert_one_variable_string(row[x])
                for x in row
                if x != "sample"
            }
            # Track every type seen for each phenotype across all samples
            for p in pheno_types:
                pheno_types[p].add(type(phenos[sample][p]))

    return phenos, pheno_types
def load_variant_count_list_from_tsv(infile):
    """Load a TSV file into a list of VariantCounts, one per data row.

    The column names are used as the keyword arguments to VariantCounts,
    and every value is converted to an int.
    """
    with utils.open_file(infile) as f:
        return [
            VariantCounts(**{column: int(value) for column, value in row.items()})
            for row in csv.DictReader(f, delimiter="\t")
        ]
def sample_names_tsv_from_vcf_file_of_filenames(infile, outfile, threads=1):
    """Make a TSV file of sample name and VCF file name, from a file of VCF names.

    Args:
        infile: file of VCF file names, one name per line.
        outfile: TSV file to write, with columns sample, vcf_file.
        threads: number of processes used to get the sample names.
    """
    with utils.open_file(infile) as names_handle:
        vcf_files = [line.rstrip() for line in names_handle]

    logging.debug(
        f"Getting sample names from {len(vcf_files)} VCF files using {threads} thread(s)"
    )
    with multiprocessing.Pool(processes=threads) as pool:
        sample_names = pool.map(sample_name_from_vcf, vcf_files)
    assert len(vcf_files) == len(sample_names)

    logging.debug(f"Writing sample/vcf TSV file {outfile}")
    with utils.open_file(outfile, "w") as out:
        print("sample", "vcf_file", sep="\t", file=out)
        for name_and_file in zip(sample_names, vcf_files):
            print(*name_and_file, sep="\t", file=out)
def _bed_mask_file_to_dict(bed_file):
    """Load a BED file into a dict of chromosome name -> sorted list of intervals.

    Every line must have exactly three tab-delimited columns: chromosome,
    start, end. Each interval is stored as the tuple (start, end - 1).
    """
    mask = {}
    with utils.open_file(bed_file) as f:
        for line in f:
            chrom, start, end = line.rstrip().split("\t")
            # One is subtracted from the end, so the stored end is inclusive
            mask.setdefault(chrom, []).append((int(start), int(end) - 1))

    for intervals in mask.values():
        intervals.sort()

    return mask
def write_distance_matrix_file(sample_names, distance_matrix, outfile):
    """Write a full square distance matrix to a tab-delimited file.

    The first line is the number of samples. Then there is one line per
    sample: the sample name, followed by the distance to every sample, in
    the same order as sample_names. The diagonal is written as 0.

    Args:
        sample_names: list of sample names.
        distance_matrix: dict with keys (i, j), where i < j are indexes
            into sample_names, and values the distance between the samples.
        outfile: name of the output file.
    """
    total_samples = len(sample_names)
    with utils.open_file(outfile, "w") as out:
        print(total_samples, file=out)
        for i, sample in enumerate(sample_names):
            row = [
                0 if i == j else distance_matrix[(i, j) if i < j else (j, i)]
                for j in range(total_samples)
            ]
            print(sample, *row, sep="\t", file=out)
def sample_name_from_vcf(infile):
    """Return the sample name from a VCF file.

    The name is the last tab-delimited field of the first line that starts
    with #CHROM. This assumes that the VCF file only contains one sample.

    Raises:
        RuntimeError: if no line starts with #CHROM.
    """
    logging.debug(f"Getting sample name from VCF file {infile}")
    with utils.open_file(infile) as f:
        chrom_line = next((line for line in f if line.startswith("#CHROM")), None)

    if chrom_line is None:
        raise RuntimeError(f"#CHROM line not found in file {infile}")

    name = chrom_line.rstrip().split("\t")[-1]
    logging.debug(f"Found sample name '{name}' from VCF file {infile}")
    return name
def write_template_constraints_json(self, outfile):
    """Write a JSON file of template constraints, one entry per phenotype.

    Phenotypes of type bool get the method "equal" with no parameters.
    Phenotypes of type float get the method "range", with parameters
    low=0 and high=1. All entries have must_be_same set to True.

    Args:
        outfile: name of the JSON file to write.

    Raises:
        TypeError: if a phenotype has a type other than bool or float. This
            is raised before the output file is opened.
    """
    constraints = {}
    for pheno, pheno_type in self.pheno_types.items():
        if pheno_type == bool:
            constraints[pheno] = {
                "must_be_same": True,
                "method": "equal",
                "params": {},
            }
        elif pheno_type == float:
            constraints[pheno] = {
                "must_be_same": True,
                "method": "range",
                "params": {"low": 0, "high": 1},
            }
        else:
            raise TypeError(
                f"Cannot make template constraints for phenotype '{pheno}' of type {pheno_type}. Type must be bool or float"
            )

    with utils.open_file(outfile, "w") as f:
        json.dump(constraints, f, sort_keys=True, indent=2)
def _load_sample_distances_file_of_filenames(infile):
    """Load a TSV file that lists the distance file of each sample.

    Args:
        infile: TSV file that must include the columns sample and
            distance_file. Any other columns are ignored.

    Returns:
        Tuple (sample_names, distance_files), two lists in file order,
        where distance_files[i] belongs to sample_names[i].

    Raises:
        RuntimeError: if a required column is missing, which includes the
            case of an empty file.
    """
    expect_cols = {"sample", "distance_file"}
    sample_names = []
    distance_files = []

    with utils.open_file(infile) as f:
        reader = csv.DictReader(f, delimiter="\t")
        # fieldnames is None when the file is empty. Treat as no columns so
        # that the error below is raised, instead of a TypeError.
        fieldnames = reader.fieldnames or []
        if not expect_cols.issubset(fieldnames):
            # Expected names are sorted so the message is the same every run
            raise RuntimeError(
                f"Error reading distances file of filenames {infile}. Expected column names: {','.join(sorted(expect_cols))}. Got column names: {','.join(fieldnames)}"
            )

        for row in reader:
            sample_names.append(row["sample"])
            distance_files.append(row["distance_file"])

    return sample_names, distance_files
def _load_one_sample_distances_file(filename):
    """Load a distance file into memory.

    Args:
        filename: TSV file that must include the columns sample and
            distance. Any other columns are ignored.

    Returns:
        List of tuples, in file order, where each tuple is
        (sample_name, distance) and the distance is a float.

    Raises:
        RuntimeError: if a required column is missing, which includes the
            case of an empty file.
    """
    expect_cols = {"sample", "distance"}

    with utils.open_file(filename) as f:
        reader = csv.DictReader(f, delimiter="\t")
        # fieldnames is None when the file is empty. Treat as no columns so
        # that the error below is raised, instead of a TypeError.
        fieldnames = reader.fieldnames or []
        if not expect_cols.issubset(fieldnames):
            # Expected names are sorted so the message is the same every run
            raise RuntimeError(
                f"Error reading distances file {filename}. Expected column names: {','.join(sorted(expect_cols))}. Got column names: {','.join(fieldnames)}"
            )

        return [(row["sample"], float(row["distance"])) for row in reader]
def write_variants_of_interest_file(self, filename, vcf_records_to_mask=None):
    """Write a TSV file of the variants of interest of this triple.

    One row is written for each variant whose index is in
    self.variant_indexes_of_interest. The variant id and position are
    written as one more than the stored index and POS.

    Args:
        filename: name of the output file.
        vcf_records_to_mask: optional dict of chromosome name -> collection
            of positions. The in_mask column is 1 if the CHROM and POS of
            the variant are found in it, otherwise 0.
    """
    header = (
        "variant_id",
        "in_mask",
        "chrom",
        "pos",
        "ref",
        "alt",
        "case",
        "control1",
        "control2",
    )
    with utils.open_file(filename, "w") as out:
        print(*header, sep="\t", file=out)
        for index, variant in enumerate(self.variants):
            if index not in self.variant_indexes_of_interest:
                continue

            in_mask = int(
                vcf_records_to_mask is not None
                and variant.CHROM in vcf_records_to_mask
                and variant.POS in vcf_records_to_mask[variant.CHROM]
            )
            genotypes = [
                StrainTriple.genotype_to_string(self.variant_calls[who][index])
                for who in ("case", "control1", "control2")
            ]
            print(
                index + 1,
                in_mask,
                variant.CHROM,
                variant.POS + 1,
                variant.REF,
                ",".join(variant.ALTS),
                *genotypes,
                sep="\t",
                file=out,
            )
def _write_variants_summary_file(cls, triples, outfile, vcf_records_to_mask=None):
    """Write a TSV file that summarises every variant across all triples.

    The variants are taken from the first triple. For each variant there is
    one 0/1 column per triple, which is 1 if the index of the variant is in
    variant_indexes_of_interest of that triple. The freq column is the mean
    of those columns, rounded to 4 decimal places.

    Args:
        triples: list of triples. Must not be empty, because the variants
            of triples[0] are used.
        outfile: name of the output file.
        vcf_records_to_mask: optional dict of chromosome name -> collection
            of positions. The in_mask column is 1 if the CHROM and POS of
            the variant are found in it, otherwise 0.
    """
    header = ["variant_id", "in_mask", "chrom", "pos", "ref", "alt", "freq"]
    header.extend(f"Triple.{number}" for number in range(1, len(triples) + 1))

    with utils.open_file(outfile, "w") as out:
        print(*header, sep="\t", file=out)

        for variant_index, variant in enumerate(triples[0].variants):
            in_mask = int(
                vcf_records_to_mask is not None
                and variant.CHROM in vcf_records_to_mask
                and variant.POS in vcf_records_to_mask[variant.CHROM]
            )
            in_triples = [
                int(variant_index in t.variant_indexes_of_interest) for t in triples
            ]
            freq = round(sum(in_triples) / len(in_triples), 4)
            print(
                variant_index + 1,
                in_mask,
                variant.CHROM,
                variant.POS + 1,
                variant.REF,
                ",".join(variant.ALTS),
                freq,
                *in_triples,
                sep="\t",
                file=out,
            )
def load_variant_calls_from_vcf_file(infile, expected_variants=None):
    """Load the genotype call of every record in a VCF file.

    Each non-header line is parsed with vcf_line_to_variant_and_gt.

    Args:
        infile: name of the VCF file.
        expected_variants: optional list of variants. If given, then the
            variant of each record must equal the variant at the same index
            of this list. If None, then a new list is made from the
            variants found in the file.

    Returns:
        Tuple (calls, expected_variants). calls is the list of genotype
        calls in file order. expected_variants is the list passed in, or
        the new list of variants from this file.

    Raises:
        RuntimeError: if the file has no #CHROM line, if a variant does not
            match the expected variant, or if the number of records is not
            the same as the number of expected variants.
    """
    checking_variants = expected_variants is not None
    if not checking_variants:
        expected_variants = []

    sample_name = None
    calls = []

    with utils.open_file(infile) as f:
        for line in f:
            # The VCF column header line starts with a single '#'
            if line.startswith("#CHROM"):
                sample_name = line.rstrip().split("\t")[-1]
            elif not line.startswith("#"):
                gt, variant = vcf_line_to_variant_and_gt(line)
                calls.append(gt)

                if not checking_variants:
                    expected_variants.append(variant)
                    continue

                index = len(calls) - 1
                if index >= len(expected_variants):
                    raise RuntimeError(
                        f"Too many variants in VCF file {infile}. Expected {len(expected_variants)} but got at least one more than that, so stopping"
                    )
                if expected_variants[index] != variant:
                    # Report the same expected variant that was compared
                    raise RuntimeError(
                        f"Mismatch in variant calls. Expected to get {expected_variants[index]} but got {variant} in file {infile}. Cannot continue"
                    )

    if sample_name is None:
        raise RuntimeError(
            f"Did not find sample name in VCF file {infile}. Cannot continue"
        )

    if len(expected_variants) != len(calls):
        raise RuntimeError(
            f"Expected {len(expected_variants)} calls in VCF file {infile} but got {len(calls)}"
        )

    return calls, expected_variants
def save_variant_count_list_to_tsv(var_list, outfile):
    """Write variant counts to a TSV file, one row per item of var_list.

    The header line is the field names of VariantCounts. Each item of
    var_list is unpacked into the columns of its row.
    """
    with utils.open_file(outfile, "w") as out:
        print("\t".join(VariantCounts._fields), file=out)
        for counts in var_list:
            print("\t".join(map(str, counts)), file=out)
def load_vcf_file_for_distance_calc(
    infile,
    only_use_pass=True,
    numeric_filters=None,
    het_to_hom_key="COV",
    het_to_hom_min_pc_depth=90.0,
    mask=None,
):
    """Load a VCF file, returning genotypes as a numpy array and variant counts.

    The array has type uint16. 0 means unknown genotype. >0 means the allele
    number (where 1=ref, 2=first alt, etc).

    Args:
        infile: name of the VCF file.
        only_use_pass: if True, records where the FILTER column is not PASS
            get genotype 0.
        numeric_filters: dict with format {"key": (bool, N)}. If the bool is
            True then N is a minimum, otherwise N is a maximum. eg
            "GT_CONF": (True, 10) would require a minimum GT_CONF of 10 to
            use the called genotype. Otherwise the genotype is zero. Keys
            not present in the record are ignored.
        het_to_hom_key: passed to _convert_het_to_hom for het calls.
        het_to_hom_min_pc_depth: passed to _convert_het_to_hom for het calls.
        mask: dict of chromosome name -> collection of zero-based positions.
            Records at these positions are skipped, so they have no entry
            in the returned array and are not counted.

    Returns:
        Tuple (genotypes, var_counts), where genotypes is the numpy array
        and var_counts is a variant_counts.VariantCounts of the number of
        hom, het, null and het_to_hom records.

    Raises:
        RuntimeError: if the final two columns of a record are missing, or
            they have no GT entry.
    """
    if mask is None:
        mask = {}
    if numeric_filters is None:
        numeric_filters = {}

    data = []
    var_counts = {"hom": 0, "het": 0, "null": 0, "het_to_hom": 0}

    with utils.open_file(infile) as f:
        for line in f:
            if line.startswith("#"):
                continue

            fields = line.rstrip().split("\t")
            if fields[0] in mask and int(fields[1]) - 1 in mask[fields[0]]:
                continue

            if only_use_pass and fields[6] != "PASS":
                var_counts["null"] += 1
                data.append(0)
                continue

            try:
                info = dict(zip(fields[8].split(":"), fields[9].split(":")))
                genos = set(info["GT"].split("/"))
            except (IndexError, KeyError) as error:
                # IndexError: fewer than 10 columns. KeyError: no GT entry.
                raise RuntimeError(
                    f"Error parsing final two columns of VCF file {infile} at this line:\n{line}"
                ) from error

            # A record fails if it is below a minimum or above a maximum
            fail_filter = any(
                (is_min and float(info[key]) < cutoff)
                or (not is_min and float(info[key]) > cutoff)
                for key, (is_min, cutoff) in (
                    (key, (filt[0], filt[1]))
                    for key, filt in numeric_filters.items()
                    if key in info
                )
            )

            if fail_filter or "." in genos:
                data.append(0)
                var_counts["null"] += 1
            elif len(genos) > 1:
                hom_allele = _convert_het_to_hom(
                    genos, info, het_to_hom_key, het_to_hom_min_pc_depth
                )
                if hom_allele is None:
                    var_counts["het"] += 1
                    data.append(0)
                else:
                    var_counts["het_to_hom"] += 1
                    data.append(hom_allele + 1)
            else:
                var_counts["hom"] += 1
                # Add one because zero is reserved for unknown genotype
                data.append(int(genos.pop()) + 1)

    logging.debug(f"loaded {infile}")
    var_counts = variant_counts.VariantCounts(**var_counts)
    return np.array(data, dtype=np.uint16), var_counts