def open_variant_file(var_type, var_file, burden_file, burden_regions, uncompressed):
    """Open a variant file and return a handle that can be iterated over

    Args:
        var_type (str)
            Type of variants file (kmers, vcf, Rtab). Any value other
            than "kmers" or "vcf" is handled as Rtab
        var_file (str)
            Location of file
        burden_file (str)
            File containing regions to group burden tests. Only used
            when var_type is "vcf"
        burden_regions (list)
            List of burden regions to be filled in-place. Passed to
            load_burden when var_type is "vcf" and burden_file is set
        uncompressed (bool)
            True if kmer file is not gzipped

    Returns:
        infile
            The opened variant file. For Rtab input the header line has
            already been consumed
        sample_order (list)
            Sample names taken from the Rtab header (every field after
            the first). Empty for kmers and vcf input
    """
    if var_type == "kmers":
        # gzipped k-mer files are opened with mode 'r' (binary for gzip.open)
        handle = open(var_file) if uncompressed else gzip.open(var_file, 'r')
        return handle, []

    if var_type == "vcf":
        handle = VariantFile(var_file)
        if burden_file:
            load_burden(burden_file, burden_regions)
        return handle, []

    # Rtab files have a header, rather than sample names accessible by row
    handle = open(var_file)
    header_fields = handle.readline().rstrip().split()
    return handle, header_fields[1:]
def main():
    """Calculate a sample-by-sample similarity matrix from a variant file

    Reads the sample names listed (one per line) in options.samples,
    loads every block of variants from the k-mer, VCF or Rtab file named
    in the options, and writes K = G * G^T to stdout as a tab-separated
    table with sample names as both row and column labels. Progress
    messages go to stderr.

    Relies on get_options, load_var_block and block_size being available
    at module level.
    """
    options = get_options()

    # Create dummy pheno object from sample list
    with open(options.samples, 'r') as sample_file:
        sample_list = [sample.rstrip() for sample in sample_file]
    p = pd.Series(np.zeros(len(sample_list)), index=sample_list)

    # Open variant file. Mostly copied from __main__
    sample_order = []
    all_strains = set(p.index)
    if options.kmers:
        var_type = "kmers"
        if options.uncompressed:
            infile = open(options.kmers)
        else:
            infile = gzip.open(options.kmers, 'r')
    elif options.vcf:
        var_type = "vcf"
        infile = VariantFile(options.vcf)
    else:
        # Rtab files have a header, rather than sample names accessible by row
        var_type = "Rtab"
        infile = open(options.pres)
        header = infile.readline().rstrip()
        sample_order = header.split()[1:]

    # Collect every block and join them once at the end. Previously each
    # new block overwrote G, so only the final block contributed to K.
    blocks = []
    try:
        sys.stderr.write("Reading in variants\n")
        v_iter = load_var_block(var_type, p, None, None, infile,
                                all_strains, sample_order,
                                options.min_af, options.max_af,
                                options.max_missing, options.uncompressed,
                                block_size)
        eof = 0
        while not eof:
            _, variant_mat, eof = next(v_iter)
            # Copy the block in case load_var_block reuses its buffer
            # between yields (not visible from here).
            blocks.append(np.array(variant_mat))
    finally:
        # Release the variant file handle even if reading fails
        infile.close()

    # Blocks are samples x variants (rows match p), so extend along columns
    G = np.concatenate(blocks, axis=1)

    sys.stderr.write("Calculating sample similarity\n")
    K = np.matmul(G, np.transpose(G))
    K_out = pd.DataFrame(K, index=p.index, columns=p.index)
    K_out.to_csv(sys.stdout, sep='\t')