def process_source(genome_in, metadata=None):
    """Take source, uncompress, sort, and convert to GFF as needed, yield GFF.

    The file is opened with autozip.file_open, its format is guessed with
    detect_format, and non-GFF input is converted to GFF.  Leading '#'
    header lines are kept in their original order; every other line is
    piped through UNIX sort.

    Arguments:
      genome_in -- path of the genome file to read
      metadata -- optional dict; its 'input_type' key is set to the detected
                  format.  A fresh dict is created when omitted.

    Yields the genome build first, then the header lines, then the sorted
    data lines (all without trailing newlines).

    Raises Exception if the input format is not recognized or if a
    ##genome-build header line cannot be interpreted.
    """
    # A dict() default would be shared (and mutated) across calls.
    if metadata is None:
        metadata = {}

    # Handle genome compression, get input, and make best guess of format type.
    source_input = autozip.file_open(genome_in, 'r')
    metadata['input_type'] = detect_format(source_input)

    # Reset input and convert to GFF if necessary.
    source_input.close()
    source_input = autozip.file_open(genome_in, 'r')
    if metadata['input_type'] == "GFF":
        gff_input = source_input
    elif metadata['input_type'] == "CGIVAR":
        gff_input = cgivar_to_gff.convert(source_input)
    elif metadata['input_type'] == "23ANDME":
        gff_input = gff_from_23andme.convert(source_input)
    else:
        # Fail explicitly instead of crashing later on an unbound gff_input.
        source_input.close()
        raise Exception("ERROR: genome file format not recognized")

    # Grab header (don't sort) & genome build. Pipe the rest to UNIX sort.
    header_done = False
    header = []
    sort_cmd = ['sort', '--buffer-size=20%', '--key=1,1', '--key=5n,5',
                '--key=4n,4']
    sort_out = subprocess.Popen(sort_cmd, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE, bufsize=1)
    genome_build = DEFAULT_BUILD
    b36_list = ["hg18", "36", "b36", "build36", "NCBI36"]
    b37_list = ["hg19", "37", "b37", "build37", "GRCh37"]
    for line in gff_input:
        if not header_done:
            if line.startswith('#'):
                header.append(line)
                if line.startswith("##genome-build"):
                    gbdata = line.split()
                    if len(gbdata) < 2:
                        raise Exception("no genome build specified?")
                    elif gbdata[1] in b36_list:
                        genome_build = "b36"
                    elif gbdata[1] in b37_list:
                        genome_build = "b37"
                    else:
                        raise Exception("genome build uninterpretable")
                continue
            header_done = True
        # Reached for every data line, including the first one after the
        # header (which must not be dropped).
        sort_out.stdin.write(str(line.rstrip('\n')) + '\n')
    sort_out.stdin.close()

    # Yield the genome build, followed by the GFF data.
    yield genome_build
    for line in header:
        yield line.rstrip('\n')
    for line in sort_out.stdout:
        yield line.rstrip('\n')

    # Reap the sort child once its output has been fully consumed.
    sort_out.stdout.close()
    sort_out.wait()
def read_metadata(genome_id):
    """Open file containing metadata, return it.

    Arguments:
      genome_id -- identifier used to build the path
                   /home/trait/upload/<genome_id>-out/metadata.json

    Returns the object decoded from the first line of that file; only the
    first line is read.
    """
    metadata_path = '/home/trait/upload/' + genome_id + '-out/metadata.json'
    f_meta = autozip.file_open(metadata_path)
    try:
        return json.loads(next(f_meta))
    finally:
        # Close the handle even if the file is empty or the JSON is invalid.
        f_meta.close()
def read_metadata(genome_id):
    """Open file containing metadata, return it.

    Arguments:
      genome_id -- identifier used to build the path
                   /home/trait/upload/<genome_id>-out/metadata.json

    Returns the object decoded from the first line of that file; only the
    first line is read.
    """
    metadata_path = "/home/trait/upload/" + genome_id + "-out/metadata.json"
    f_meta = autozip.file_open(metadata_path)
    try:
        return json.loads(next(f_meta))
    finally:
        # Close the handle even if the file is empty or the JSON is invalid.
        f_meta.close()
def convert(cgi_input, options=None):
    """Generator turning CGI var data into GFF-formatted strings.

    Arguments:
      cgi_input -- a path (str), opened with autozip.file_open, or an
                   iterable of lines
      options -- optional object; when options.chromosome is set only rows
                 whose fourth column equals it are converted, and reading
                 stops after that chromosome's block ends

    Yields a "##genome-build" line once the '#' header has been read,
    followed by whatever process_full_position / process_split_position
    produce for each data row.
    """
    # Default is to assume a str generator; a str argument is a path.
    if isinstance(cgi_input, str):
        cgi_data = autozip.file_open(cgi_input, 'r')
    else:
        cgi_data = cgi_input

    build = DEFAULT_BUILD
    software_ver = DEFAULT_SOFTWARE_VER
    in_header = True
    chromosome_seen = False

    for line in cgi_data:
        if in_header:
            if re.match("#", line):
                # Header line: pick up genome build and software version.
                if re.match("#GENOME_REFERENCE.*NCBI build 37", line):
                    build = "b37"
                elif re.match("#GENOME_REFERENCE.*NCBI build 36", line):
                    build = "b36"
                version_match = re.match(r"#SOFTWARE_VERSION\W+([0-9.]+)",
                                         line)
                if version_match:
                    software_ver = version_match.groups()[0]
                continue
            # First non-header line: emit the GFF header, then fall through
            # and treat this same line as data.
            yield "##genome-build " + build
            in_header = False

        # Skip blank lines and the '>' column-title line.
        # TODO: use table header instead of assuming which column to use
        if re.search(r"^\W*$", line) or re.search("^>", line):
            continue

        data = line.rstrip('\n').split("\t")

        if options and options.chromosome:
            if data[3] != options.chromosome:
                # Assume all base calls for a single chromosome are in a
                # contiguous block.
                if chromosome_seen:
                    break
                continue
            chromosome_seen = True

        if data[2] == "all" or data[1] == "1":
            # process_full_position returns a str.
            out = process_full_position(data, software_ver)
        else:
            assert data[2] == "1"
            # process_split_position returns a str generator and is handed
            # cgi_data so it can read further lines itself.
            out = process_split_position(data, cgi_data, software_ver)

        if not out:
            continue
        if isinstance(out, str):
            yield out
            continue
        for line in out:
            yield line
def main():
    """Main function.

    Parses the command line, builds a PhaseTrio from the child GFF and one
    or two parent GFFs, and writes each line produced by call_phase() to the
    --output file if given, otherwise to standard output.
    """
    usage = 'usage: %prog [options] gff_child gff_parentA [gff_parentB]'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-o', '--output', help="Specificies an option output " \
            + "file name. Default is standard output.", dest='f_out',
            action='store')
    parser.add_option('-m', '--mend_errs', help="If set, report mendelian " \
            + "inheritance errors as an attribute. Default is to ignore them.",
            dest='mend_errs', action='store_true', default=False)
    (opts, args) = parser.parse_args()

    if len(args) < 2:
        parser.error("Need atleast 2 input file arguments.")
    child = args[0]
    parent_a = args[1]
    parent_b = None
    if len(args) > 2:
        parent_b = args[2]

    trioizer = PhaseTrio(child, parent_a, parent_b, opts.mend_errs)
    if opts.f_out:
        out = autozip.file_open(opts.f_out, 'w')
        try:
            for line in trioizer.call_phase():
                out.write('%s\n' % line)
        finally:
            # Make sure buffered (possibly compressed) output is flushed.
            out.close()
    else:
        for line in trioizer.call_phase():
            # print already appends the newline; adding another one here
            # would put a blank line after every record.
            print('%s' % line)
def match_getev_pph2(getev_file, pph2_file):
    """Print PolyPhen-2 data for variants listed in a GET-Evidence flatfile.

    Arguments:
      getev_file -- path (str) or iterable of JSON lines; closed after reading
      pph2_file -- path of a bzip2-compressed tar archive whose members are
                   named pph2_whpss/<uniprot>.pph2.txt

    For every archive row whose gene/amino-acid-change key is present in the
    GET-Evidence data and whose column 16 is non-empty, prints a
    tab-separated line: key, GET-Evidence variant id, column 16.
    """
    # Use these files to create a link between uniprot ID and gene name.
    kgwname_file = '/home/trait/data/knownGene_hg18_sorted.txt'
    kgxref_file = '/home/trait/data/kgXref_hg18.txt.gz'
    ucsc_to_name = process_kgwname(kgwname_file)
    uniprot_to_genename = process_kgxref(kgxref_file, ucsc_to_name)

    # Read GET-Evidence flatfile
    getev_variants = dict()
    if isinstance(getev_file, str):
        getev_in = autozip.file_open(getev_file)
    else:
        getev_in = getev_file
    for line in getev_in:
        getev_data = json.loads(line)
        if 'gene' in getev_data and 'aa_change_short' in getev_data:
            gene_aachange_key = (getev_data['gene'] + '-' +
                                 getev_data['aa_change_short'])
            getev_variants[gene_aachange_key] = getev_data['variant_id']
    getev_in.close()

    # Read Polyphen 2 data and report rows matching GET-Ev variants
    member_name = re.compile(r'pph2_whpss/(.*)\.pph2\.txt')
    pph2_tar = tarfile.open(name=pph2_file, mode='r:bz2')
    try:
        for taritem in pph2_tar:
            name_match = member_name.match(str(taritem.name))
            if not name_match:
                continue
            uniprot = name_match.group(1)
            if uniprot not in uniprot_to_genename:
                continue
            gene = uniprot_to_genename[uniprot]
            pph2_genedata = pph2_tar.extractfile(taritem)
            for line in pph2_genedata:
                pph2_data = re.split(' *\t *', line.rstrip('\n'))
                # Blank or truncated rows lack the columns used below.
                if len(pph2_data) <= 16:
                    continue
                key = gene + '-' + pph2_data[3] + pph2_data[2] + pph2_data[4]
                if key in getev_variants and pph2_data[16]:
                    print('\t'.join([key, getev_variants[key],
                                     pph2_data[16]]))
    finally:
        pph2_tar.close()
def convert_to_file(genotype_input, output_file):
    """Convert a deCODEme file and output GFF-formatted data to file.

    Arguments:
      genotype_input -- passed unchanged to convert()
      output_file -- path (str), opened for writing with autozip.file_open,
                     or an already writable file object

    The output is closed when conversion finishes or fails, including when
    the caller supplied the file object.
    """
    output = output_file  # default assumes writable file object
    if isinstance(output_file, str):
        output = autozip.file_open(output_file, 'w')
    try:
        for line in convert(genotype_input):
            output.write(line + "\n")
    finally:
        # Do not leave a half-written handle open if convert() raises.
        output.close()
def process_kgwname(kgwname_file):
    """Return dict linking UCSC IDs to gene names from first column.

    Arguments:
      kgwname_file -- path opened with autozip.file_open; each line is split
                      on whitespace, the first field is the gene name and
                      the second is the UCSC ID

    Lines with fewer than two fields (e.g. blank lines) are skipped.
    """
    ucsc_to_name = dict()
    f_in = autozip.file_open(kgwname_file, 'r')
    try:
        for line in f_in:
            data = line.split()
            if len(data) < 2:
                continue
            ucsc_to_name[data[1]] = data[0]
    finally:
        f_in.close()
    return ucsc_to_name
def convert_to_file(cgi_input, output_file):
    """Convert a CGI var file and output GFF-formatted data to file.

    Arguments:
      cgi_input -- passed unchanged to convert()
      output_file -- path (str), opened for writing with autozip.file_open,
                     or an already writable file object

    The output is closed when conversion finishes or fails, including when
    the caller supplied the file object.
    """
    output = output_file  # default assumes writable file object
    if isinstance(output_file, str):
        output = autozip.file_open(output_file, 'w')
    try:
        for line in convert(cgi_input):  # convert() is a generator
            output.write(line + "\n")
    finally:
        # Do not leave a half-written handle open if convert() raises.
        output.close()
def convert_to_file(genotype_input, output_file):
    """Convert a Family Tree DNA file and output GFF-formatted data to file.

    Arguments:
      genotype_input -- passed unchanged to convert()
      output_file -- path (str), opened for writing with autozip.file_open,
                     or an already writable file object

    The output is closed when conversion finishes or fails, including when
    the caller supplied the file object.
    """
    output = output_file  # default assumes writable file object
    if isinstance(output_file, str):
        output = autozip.file_open(output_file, 'w')
    try:
        for line in convert(genotype_input):
            output.write(line + "\n")
    finally:
        # Do not leave a half-written handle open if convert() raises.
        output.close()
def getev_reprocess(genotype_file, server=None, options=None):
    """Redo analysis against GET-Evidence data.

    Re-runs only the GET-Evidence matching step, reading ns.gff (or
    ns.gff.gz when present) from the genome's output directory.  If
    metadata.json cannot be opened or does not name build b36 or b37, the
    log lock is released and genome_analyzer(genotype_file) is run instead.

    Arguments:
      genotype_file -- genotype file whose results are reprocessed
      server -- passed to processing_init
      options -- optional object; when options.chromosome is set, it is the
                 only chromosome the progress tracker expects

    Returns None.
    """
    init_stuff = processing_init(genotype_file, server)
    if not init_stuff:
        return None
    output_dir, log, log_handle, lockfile, logfile = init_stuff
    log.put('#status 0 Reprocessing data against GET-Evidence')

    args = {
        'metadata': os.path.join(output_dir, 'metadata.json'),
        'nonsyn_data': os.path.join(output_dir, 'ns.gff'),
        'getev_out': os.path.join(output_dir, 'get-evidence.json'),
        'getev_genes_out': os.path.join(output_dir, 'get-ev_genes.json'),
        'getev_flat': os.path.join(os.getenv('DATA'), GETEV_FLAT)
    }

    # Read metadata file (need this to get build info for transcripts file)
    try:
        f_metadata = autozip.file_open(args['metadata'])
        try:
            metadata = json.loads(next(f_metadata))
        finally:
            f_metadata.close()
        if metadata['genome_build'] == 'b36':
            args['transcripts'] = os.path.join(os.getenv('DATA'),
                                               KNOWNGENE_HG18_SORTED)
        elif metadata['genome_build'] == 'b37':
            args['transcripts'] = os.path.join(os.getenv('DATA'),
                                               KNOWNGENE_HG19_SORTED)
        else:
            raise KeyError
    except (IOError, KeyError):
        # No usable metadata: release the lock and redo the whole analysis.
        # NOTE(review): server and options are not forwarded here.
        fcntl.flock(log_handle, fcntl.LOCK_UN)
        log_handle.close()
        genome_analyzer(genotype_file)
        return

    if os.path.exists(args['nonsyn_data'] + '.gz'):
        args['nonsyn_data'] = args['nonsyn_data'] + '.gz'

    if options and options.chromosome:
        chrlist = [options.chromosome]
    else:
        # Autosomes 1-22 plus X and Y; range(1, 22) would stop at chr21.
        chrlist = ['chr' + str(x) for x in list(range(1, 23)) + ['X', 'Y']]
    progtrack = ProgressTracker(log_handle, [1, 99], expected=chrlist)

    # Get GET-Evidence hits
    gff_getevidence_map.match_getev_to_file(
        args['nonsyn_data'],
        args['getev_flat'],
        transcripts_file=args['transcripts'],
        output_file=args['getev_out'] + ".tmp",
        gene_out_file=args['getev_genes_out'] + ".tmp",
        progresstracker=progtrack)

    # Move the finished temporary files into place without going through a
    # shell, so unusual characters in the paths cannot break the command.
    os.rename(args['getev_out'] + ".tmp", args['getev_out'])
    os.rename(args['getev_genes_out'] + ".tmp", args['getev_genes_out'])

    os.rename(lockfile, logfile)
    log_handle.close()
    print("Finished reprocessing GET-Evidence hits for " +
          str(genotype_file))
def convert_to_file(vcf_input, output_file):
    """Convert a VCF file and output GFF-formatted data to file.

    Arguments:
      vcf_input -- passed unchanged to convert()
      output_file -- path (str), opened for writing with autozip.file_open,
                     or an already writable file object

    The output is closed when conversion finishes or fails, including when
    the caller supplied the file object.
    """
    output = output_file  # default assumes writable file object
    if isinstance(output_file, str):
        output = autozip.file_open(output_file, 'w')
    try:
        for line in convert(vcf_input):  # convert() is a generator
            output.write(line + "\n")
    finally:
        # Do not leave a half-written handle open if convert() raises.
        output.close()
def getev_reprocess(genotype_file, server=None, options=None):
    """Redo analysis against GET-Evidence data.

    Re-runs only the GET-Evidence matching step, reading ns.gff (or
    ns.gff.gz when present) from the genome's output directory.  If
    metadata.json cannot be opened or does not name build b36 or b37, the
    log lock is released and genome_analyzer(genotype_file) is run instead.

    Arguments:
      genotype_file -- genotype file whose results are reprocessed
      server -- passed to processing_init
      options -- optional object; when options.chromosome is set, it is the
                 only chromosome the progress tracker expects

    Returns None.
    """
    init_stuff = processing_init(genotype_file, server)
    if not init_stuff:
        return None
    output_dir, log, log_handle, lockfile, logfile = init_stuff
    log.put("#status 0 Reprocessing data against GET-Evidence")

    args = {
        "metadata": os.path.join(output_dir, "metadata.json"),
        "nonsyn_data": os.path.join(output_dir, "ns.gff"),
        "getev_out": os.path.join(output_dir, "get-evidence.json"),
        "getev_genes_out": os.path.join(output_dir, "get-ev_genes.json"),
        "getev_flat": os.path.join(os.getenv("DATA"), GETEV_FLAT),
    }

    # Read metadata file (need this to get build info for transcripts file)
    try:
        f_metadata = autozip.file_open(args["metadata"])
        try:
            metadata = json.loads(next(f_metadata))
        finally:
            f_metadata.close()
        if metadata["genome_build"] == "b36":
            args["transcripts"] = os.path.join(os.getenv("DATA"),
                                               KNOWNGENE_HG18_SORTED)
        elif metadata["genome_build"] == "b37":
            args["transcripts"] = os.path.join(os.getenv("DATA"),
                                               KNOWNGENE_HG19_SORTED)
        else:
            raise KeyError
    except (IOError, KeyError):
        # No usable metadata: release the lock and redo the whole analysis.
        # NOTE(review): server and options are not forwarded here.
        fcntl.flock(log_handle, fcntl.LOCK_UN)
        log_handle.close()
        genome_analyzer(genotype_file)
        return

    if os.path.exists(args["nonsyn_data"] + ".gz"):
        args["nonsyn_data"] = args["nonsyn_data"] + ".gz"

    if options and options.chromosome:
        chrlist = [options.chromosome]
    else:
        # Autosomes 1-22 plus X and Y; range(1, 22) would stop at chr21.
        chrlist = ["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]]
    progtrack = ProgressTracker(log_handle, [1, 99], expected=chrlist)

    # Get GET-Evidence hits
    gff_getevidence_map.match_getev_to_file(
        args["nonsyn_data"],
        args["getev_flat"],
        transcripts_file=args["transcripts"],
        output_file=args["getev_out"] + ".tmp",
        gene_out_file=args["getev_genes_out"] + ".tmp",
        progresstracker=progtrack,
    )

    # Move the finished temporary files into place without going through a
    # shell, so unusual characters in the paths cannot break the command.
    os.rename(args["getev_out"] + ".tmp", args["getev_out"])
    os.rename(args["getev_genes_out"] + ".tmp", args["getev_genes_out"])

    os.rename(lockfile, logfile)
    log_handle.close()
    print("Finished reprocessing GET-Evidence hits for " +
          str(genotype_file))
def getev_reprocess(genotype_file, server=None, options=None):
    """Redo analysis against GET-Evidence data.

    Re-runs only the GET-Evidence matching step, reading ns.gff (or
    ns.gff.gz when present) from the genome's output directory.  If
    metadata.json cannot be opened or does not name build b36 or b37, the
    log lock is released and genome_analyzer(genotype_file) is run instead.

    Arguments:
      genotype_file -- genotype file whose results are reprocessed
      server -- passed to processing_init
      options -- optional object; when options.chromosome is set, it is the
                 only chromosome the progress tracker expects

    Returns None.
    """
    init_stuff = processing_init(genotype_file, server)
    if not init_stuff:
        return None
    output_dir, log, log_handle, lockfile, logfile = init_stuff
    log.put('#status 0 Reprocessing data against GET-Evidence')

    data_dir = os.getenv('DATA')
    args = {
        'metadata': os.path.join(output_dir, 'metadata.json'),
        'nonsyn_data': os.path.join(output_dir, 'ns.gff'),
        'getev_out': os.path.join(output_dir, 'get-evidence.json'),
        'getev_genes_out': os.path.join(output_dir, 'get-ev_genes.json'),
        'getev_flat': os.path.join(data_dir, GETEV_FLAT)
    }

    # Read metadata file (need this to get build info for transcripts file)
    try:
        f_metadata = autozip.file_open(args['metadata'])
        try:
            metadata = json.loads(next(f_metadata))
        finally:
            f_metadata.close()
        if metadata['genome_build'] == 'b36':
            args['transcripts'] = os.path.join(os.getenv('DATA'),
                                               KNOWNGENE_HG18_SORTED)
        elif metadata['genome_build'] == 'b37':
            args['transcripts'] = os.path.join(os.getenv('DATA'),
                                               KNOWNGENE_HG19_SORTED)
        else:
            raise KeyError
    except (IOError, KeyError):
        # No usable metadata: release the lock and redo the whole analysis.
        # NOTE(review): server and options are not forwarded here.
        fcntl.flock(log_handle, fcntl.LOCK_UN)
        log_handle.close()
        genome_analyzer(genotype_file)
        return

    if os.path.exists(args['nonsyn_data'] + '.gz'):
        args['nonsyn_data'] = args['nonsyn_data'] + '.gz'

    if options and options.chromosome:
        chrlist = [options.chromosome]
    else:
        # Autosomes 1-22 plus X and Y; range(1, 22) would stop at chr21.
        chrlist = ['chr' + str(x) for x in list(range(1, 23)) + ['X', 'Y']]
    progtrack = ProgressTracker(log_handle, [1, 99], expected=chrlist)

    # Get GET-Evidence hits
    gff_getevidence_map.match_getev_to_file(
        args['nonsyn_data'],
        args['getev_flat'],
        transcripts_file=args['transcripts'],
        output_file=args['getev_out'] + ".tmp",
        gene_out_file=args['getev_genes_out'] + ".tmp",
        progresstracker=progtrack)

    # Move the finished temporary files into place without going through a
    # shell, so unusual characters in the paths cannot break the command.
    os.rename(args['getev_out'] + ".tmp", args['getev_out'])
    os.rename(args['getev_genes_out'] + ".tmp", args['getev_genes_out'])

    os.rename(lockfile, logfile)
    log_handle.close()
    print("Finished reprocessing GET-Evidence hits for " +
          str(genotype_file))
def convert(genotype_input):
    """Generator: read Ancestry genotype data and yield GFF lines.

    genotype_input is either a path (str), opened with autozip.file_open,
    or an iterable of lines.  A "##genome-build" line is yielded once the
    '#' header is over, then one GFF line per row whose two allele columns
    both start with A, C, G or T.
    """
    if isinstance(genotype_input, str):
        genotype_data = autozip.file_open(genotype_input, 'r')
    else:
        genotype_data = genotype_input

    # Checked in this order; the first pattern found in a header line wins.
    build_patterns = (
        ("reference build 37", "b37"),
        ("reference build 38", "b38"),
        ("reference build 36", "b36"),
    )
    # Chromosome codes that do not map to a plain 'chr' + code name.
    special_chromosomes = {
        "MT": 'chrM',
        "25": 'chrM',
        "23": 'chrX',
        "24": 'chrY',
    }

    build = DEFAULT_BUILD
    in_header = True
    for line in genotype_data:
        if in_header:
            if re.match("#", line):
                for pattern, name in build_patterns:
                    if re.search(pattern, line):
                        build = name
                        break
                continue
            # Header is over: emit the build, then handle this line as data.
            yield "##genome-build " + build
            in_header = False

        fields = line.rstrip('\n').split()
        if len(fields) < 5:
            continue
        snp_id, chrom_code, position, allele_a, allele_b = fields[:5]

        chromosome = special_chromosomes.get(chrom_code, 'chr' + chrom_code)

        # Ignore uncalled or indel positions.
        if not re.match(r'[ACGT]', allele_a):
            continue
        if not re.match(r'[ACGT]', allele_b):
            continue

        attributes = 'alleles ' + allele_a
        if allele_a != allele_b:
            attributes = attributes + '/' + allele_b
        if re.match('rs', snp_id):
            attributes = attributes + '; db_xref dbsnp:' + snp_id

        yield "\t".join([chromosome, "CGI", "SNP", position, position,
                         '.', '+', '.', attributes])
def __init__(self, f_child, f_parA, f_parB, mend_errs):
    """Initializes class variables, opens input files.

    Arguments:
      f_child, f_parA -- inputs for the child and first parent (indices 0
                         and 1); each is opened with autozip.file_open
      f_parB -- input for the second parent (index 2), or None to omit it
      mend_errs -- stored unchanged as self.mend_errs
    """
    self.mend_errs = mend_errs
    self.filenames = {0: f_child, 1: f_parA}
    if f_parB is not None:
        self.filenames[2] = f_parB

    # Positions are a tuple of chromosome, start, end, and gff record
    start_position = ('chr1', -1, -1, None)
    self.positions = dict((idx, start_position) for idx in self.filenames)

    # Set up input/output files
    self.gffs = dict((idx, None) for idx in self.filenames)
    for idx, filename in self.filenames.items():
        self.gffs[idx] = gff.input(autozip.file_open(filename, 'r'))
def __init__(self, f_child, f_parA, f_parB, mend_errs):
    """Initializes class variables, opens input files.

    Arguments:
      f_child, f_parA -- inputs for the child and first parent (indices 0
                         and 1); each is opened with autozip.file_open
      f_parB -- input for the second parent (index 2), or None to omit it
      mend_errs -- stored unchanged as self.mend_errs
    """
    self.mend_errs = mend_errs
    self.filenames = {0: f_child, 1: f_parA}
    if f_parB is not None:
        self.filenames[2] = f_parB

    # Positions are a tuple of chromosome, start, end, and gff record
    start_position = ('chr1', -1, -1, None)
    self.positions = dict((idx, start_position) for idx in self.filenames)

    # Set up input/output files
    self.gffs = dict((idx, None) for idx in self.filenames)
    for idx, filename in self.filenames.items():
        self.gffs[idx] = gff.input(autozip.file_open(filename, 'r'))
def convert(genotype_input):
    """Take in deCODEme genotype data, yield GFF formatted lines.

    Arguments:
      genotype_input -- path (str), opened with autozip.file_open, or an
                        iterable of CSV lines.  The first CSV row names the
                        columns; 'Name', 'Chromosome', 'Position', 'Strand'
                        and 'YourCode' are used.

    Yields "##genome-build b36" first, then one GFF line per called
    genotype.  Rows that are blank, have an empty genotype, or whose
    genotype starts with '-' are skipped.  Genotypes on the '-' strand are
    passed through revcomp, so every output line is on the '+' strand.
    """
    if isinstance(genotype_input, str):
        genotype_data = csv.reader(
            autozip.file_open(genotype_input, 'r', 'deCODEme_scan.csv'))
    else:
        genotype_data = csv.reader(genotype_input)

    # We are allowing people to donate only the 'deCODEme_scan.csv' file,
    # which unfortunately lacks build information (it is stored separately
    # in 'deCODEme_info.txt', but this file also contains the deCODEme
    # username). So far deCODEme files have only been build 36, and so
    # this is the current assumption for data processing.
    build = "b36"
    yield "##genome-build " + build

    # An empty input simply ends the generator after the build line.
    header_row = next(genotype_data, None)
    if header_row is None:
        return
    col = dict((name, i) for i, name in enumerate(header_row))

    for row in genotype_data:
        if not row:
            continue
        variants = list(row[col['YourCode']])
        if not variants or variants[0] == '-':
            continue
        chromosome = 'chr' + row[col['Chromosome']]
        if row[col['Strand']] == '-':
            variants = [revcomp(x) for x in variants]
        pos_start = row[col['Position']]
        pos_end = pos_start

        # A single-character genotype is reported as one allele.
        if len(variants) == 1 or variants[0] == variants[1]:
            attributes = 'alleles ' + variants[0]
        else:
            attributes = 'alleles ' + variants[0] + '/' + variants[1]
        if re.match('rs', row[col['Name']]):
            attributes = attributes + '; db_xref dbsnp:' + row[col['Name']]

        output = [chromosome, "deCODEme", "SNP", pos_start, pos_end,
                  '.', '+', '.', attributes]
        yield "\t".join(output)
def convert(genotype_input):
    """Take in 23andme genotype data, yield GFF formatted lines.

    Arguments:
      genotype_input -- path (str), opened with autozip.file_open, or an
                        iterable of lines.  Data lines are split on
                        whitespace into id, chromosome, position, genotype.

    Yields a "##genome-build" line once the '#' header is over, then one
    GFF line per row whose genotype starts with A, C, G or T.  Rows with
    fewer than four fields are skipped.
    """
    genotype_data = genotype_input
    if isinstance(genotype_input, str):
        genotype_data = autozip.file_open(genotype_input, 'r')
    build = DEFAULT_BUILD
    header_done = False
    for line in genotype_data:
        # Handle the header, get the genome build if you can.
        if not header_done:
            if re.match("#", line):
                if re.search("human assembly build 37", line):
                    build = "b37"
                elif re.search("human assembly build 36", line):
                    build = "b36"
                continue
            yield "##genome-build " + build
            header_done = True

        data = line.rstrip('\n').split()
        # The genotype is read from data[3], so four fields are required.
        if len(data) < 4:
            continue

        if data[1] == "MT":
            chromosome = 'chrM'
        else:
            chromosome = 'chr' + data[1]
        pos_start = data[2]
        pos_end = data[2]

        # Ignore uncalled or indel positions.
        genotype = data[3]
        if not re.match(r'[ACGT]{1,2}', genotype):
            continue
        if len(genotype) > 1 and genotype[0] != genotype[1]:
            attributes = 'alleles ' + genotype[0] + '/' + genotype[1]
        elif len(genotype) > 1:
            attributes = 'alleles ' + genotype[0]
        else:
            attributes = 'alleles ' + genotype

        if re.match('rs', data[0]):
            attributes = attributes + '; db_xref dbsnp:' + data[0]
        output = [chromosome, "CGI", "SNP", pos_start, pos_end,
                  '.', '+', '.', attributes]
        yield "\t".join(output)
def convert(input_file, options=None):
    """Detect the format of input_file and yield it as GFF lines.

    Arguments:
      input_file -- handed to detect_format.detect_format and then to the
                    converter chosen for the detected type
      options -- forwarded to the CGIVAR and VCF converters only

    GFF input is opened with autozip.file_open and passed through as is.
    Raises Exception("input format not recognized") for any other type.
    """
    input_type = detect_format.detect_format(input_file)

    # Each entry is a thunk so that only the selected converter is touched.
    openers = {
        'GFF': lambda: autozip.file_open(input_file),
        'CGIVAR': lambda: cgivar_to_gff.convert(input_file, options),
        '23ANDME': lambda: gff_from_23andme.convert(input_file),
        'VCF': lambda: vcf_to_gff.convert(input_file, options),
        'deCODEme': lambda: gff_from_decodeme.convert(input_file),
        'FTDNA': lambda: gff_from_ftdna.convert(input_file),
    }
    if input_type not in openers:
        raise Exception("input format not recognized")

    for line in openers[input_type]():
        yield line
def convert(genotype_input):
    """Take in Family Tree genotype data, yield GFF formatted lines.

    Arguments:
      genotype_input -- path (str), opened with autozip.file_open, or an
                        iterable of CSV lines.  The first CSV row names the
                        columns; 'RSID', 'CHROMOSOME', 'POSITION' and
                        'RESULT' are used.

    Yields "##genome-build b36" first, then one GFF line per called
    genotype.  Rows that are blank, have an empty RESULT, or whose RESULT
    starts with '-', 'I' or 'D' are skipped.
    """
    if isinstance(genotype_input, str):
        genotype_data = csv.reader(autozip.file_open(genotype_input, 'r'))
    else:
        genotype_data = csv.reader(genotype_input)

    # Currently Family Tree DNA appears to only be in build 36 format.
    # There doesn't appear to be any record in the files regarding which
    # build was used.
    build = "b36"
    yield "##genome-build " + build

    # An empty input simply ends the generator after the build line.
    header_row = next(genotype_data, None)
    if header_row is None:
        return
    col = dict((name, i) for i, name in enumerate(header_row))

    for row in genotype_data:
        if not row:
            continue
        variants = list(row[col['RESULT']])
        if not variants or variants[0] in ('-', 'I', 'D'):
            continue
        chromosome = 'chr' + row[col['CHROMOSOME']]
        pos_start = row[col['POSITION']]
        pos_end = pos_start

        # A single-character genotype is reported as one allele.
        if len(variants) == 1 or variants[0] == variants[1]:
            attributes = 'alleles ' + variants[0]
        else:
            attributes = 'alleles ' + variants[0] + '/' + variants[1]
        if re.match('rs', row[col['RSID']]):
            attributes = attributes + '; db_xref dbsnp:' + row[col['RSID']]

        output = [chromosome, "FTDNA", "SNP", pos_start, pos_end,
                  '.', '+', '.', attributes]
        yield "\t".join(output)
def convert(genotype_input):
    """Take in Family Tree genotype data, yield GFF formatted lines.

    Arguments:
      genotype_input -- path (str), opened with autozip.file_open, or an
                        iterable of CSV lines.  The first CSV row names the
                        columns; 'RSID', 'CHROMOSOME', 'POSITION' and
                        'RESULT' are used.

    Yields "##genome-build b36" first, then one GFF line per called
    genotype.  Rows that are blank, have an empty RESULT, or whose RESULT
    starts with '-', 'I' or 'D' are skipped.
    """
    if isinstance(genotype_input, str):
        genotype_data = csv.reader(autozip.file_open(genotype_input, 'r'))
    else:
        genotype_data = csv.reader(genotype_input)

    # Currently Family Tree DNA appears to only be in build 36 format.
    # There doesn't appear to be any record in the files regarding which
    # build was used.
    build = "b36"
    yield "##genome-build " + build

    # An empty input simply ends the generator after the build line.
    header_row = next(genotype_data, None)
    if header_row is None:
        return
    col = dict((name, i) for i, name in enumerate(header_row))

    for row in genotype_data:
        if not row:
            continue
        variants = list(row[col['RESULT']])
        if not variants or variants[0] in ('-', 'I', 'D'):
            continue
        chromosome = 'chr' + row[col['CHROMOSOME']]
        pos_start = row[col['POSITION']]
        pos_end = pos_start

        # A single-character genotype is reported as one allele.
        if len(variants) == 1 or variants[0] == variants[1]:
            attributes = 'alleles ' + variants[0]
        else:
            attributes = 'alleles ' + variants[0] + '/' + variants[1]
        if re.match('rs', row[col['RSID']]):
            attributes = attributes + '; db_xref dbsnp:' + row[col['RSID']]

        output = [chromosome, "FTDNA", "SNP", pos_start, pos_end,
                  '.', '+', '.', attributes]
        yield "\t".join(output)
def convert(vcf_input, options=None):
    """Generator turning VCF data into GFF-formatted strings.

    Arguments:
      vcf_input -- a path (str), opened with autozip.file_open, or an
                   iterable of lines
      options -- optional object; when options.chromosome is set only rows
                 whose first column matches it (with or without a 'chr'
                 prefix) are converted, and reading stops once that
                 chromosome's block has ended

    Yields a "##genome-build" line once the '#' header has been read, then
    every non-empty result of process_line.
    """
    # Default is to assume a str generator; a str argument is a path.
    if isinstance(vcf_input, str):
        vcf_data = autozip.file_open(vcf_input, 'r')
    else:
        vcf_data = vcf_input

    build = DEFAULT_BUILD
    in_header = True
    chromosome_seen = False

    for line in vcf_data:
        if in_header:
            if re.match("#", line):
                # Header line: let process_header update the build.
                build = process_header(line, build)
                continue
            # Output GFF header once we're done reading VCF header, then
            # fall through and treat this same line as data.
            yield "##genome-build " + build
            in_header = False

        if re.search(r"^\W*$", line):
            continue

        if options and options.chromosome:
            chrom = line.rstrip('\n').split("\t")[0]
            if options.chromosome not in (chrom, 'chr' + chrom):
                # Assume all base calls for a single chromosome are in a
                # contiguous block.
                if chromosome_seen:
                    break
                continue
            chromosome_seen = True

        gff_line = process_line(line)
        if gff_line:
            yield gff_line
def process_kgxref(kgxref_file, ucsc_to_name):
    """Find and return one-to-one Uniprot ID / gene name mapping

    Using kgXref and our own gene name mappings, some gene names appear to
    correspond to more than one Uniprot ID (100 in hg18) and some Uniprot
    IDs appear to correspond to more than one gene name (51 in hg18).
    Because these are such a small fraction, we remove them and return only
    the one-to-one pairs.

    Arguments:
      kgxref_file -- path opened with autozip.file_open; tab-separated, the
                     first column is looked up in ucsc_to_name and the third
                     column is the Uniprot ID
      ucsc_to_name -- dict mapping UCSC IDs to gene names

    Returns a dict keyed by Uniprot ID whose values are gene names.  Any
    part of a Uniprot ID from the first '-' onwards is dropped.  Rows with
    fewer than three columns are skipped.
    """
    name_to_uniprot = dict()
    uniprot_to_name = dict()
    name_unique = dict()
    uniprot_unique = dict()

    f_in = autozip.file_open(kgxref_file, 'r')
    try:
        for line in f_in:
            data = line.rstrip('\n').split('\t')
            if len(data) < 3:
                continue
            if not (data[2] and data[0] in ucsc_to_name):
                continue
            genename = ucsc_to_name[data[0]]
            # Keep only the text before the first '-', if there is one.
            uniprotname = data[2].partition('-')[0]

            if genename in name_to_uniprot:
                if name_to_uniprot[genename] != uniprotname:
                    name_unique[genename] = False
            else:
                name_to_uniprot[genename] = uniprotname
                name_unique[genename] = True

            if uniprotname in uniprot_to_name:
                if uniprot_to_name[uniprotname] != genename:
                    uniprot_unique[uniprotname] = False
            else:
                uniprot_to_name[uniprotname] = genename
                uniprot_unique[uniprotname] = True
    finally:
        f_in.close()

    final_dict = dict()
    for key in uniprot_to_name:
        if uniprot_unique[key] and name_unique[uniprot_to_name[key]]:
            final_dict[key] = uniprot_to_name[key]
    return final_dict
def load_getev(getev_file):
    """Load a GET-Evidence flatfile into a lookup dict.

    getev_file is either a path (str), opened with autozip.file_open, or an
    iterable of JSON lines; it is closed once read.

    Returns a dict mapping "<gene>-<aa_change_short>" (preferred) or the
    dbSNP id to the record's variant_id.  Records without a non-empty
    variant_id are ignored.
    """
    def filled(record, field):
        # True-ish only when the field exists and has a non-empty value.
        return field in record and record[field]

    if isinstance(getev_file, str):
        source = autozip.file_open(getev_file)
    else:
        source = getev_file

    variant_ids = dict()
    for raw in source:
        record = json.loads(raw)
        if not filled(record, 'variant_id'):
            continue
        if filled(record, 'gene') and filled(record, 'aa_change_short'):
            key = record['gene'] + '-' + record['aa_change_short']
        elif filled(record, 'dbsnp_id'):
            key = record['dbsnp_id']
        else:
            continue
        variant_ids[key] = record['variant_id']
    source.close()
    return variant_ids
def read_metadata(self, genome_id):
    """Open file containing metadata, initializes self.metadata.

    The path is GENOMEFILE_PRE + genome_id + GENOMEMETA_POST.  Only the
    first line of the file is read and decoded as JSON.
    """
    metadata_path = GENOMEFILE_PRE + genome_id + GENOMEMETA_POST
    f_meta = autozip.file_open(metadata_path)
    try:
        self.metadata = json.loads(next(f_meta))
    finally:
        # Close the handle even if the file is empty or the JSON is invalid.
        f_meta.close()
def _next_fields(handle):
    """Return the next line of handle split on whitespace, or None at EOF."""
    try:
        return next(handle).split()
    except StopIteration:
        return None


def _gff_only_row(gff_fields, end, cov_blank):
    """Build an output row for a span that only the GFF data covers.

    The start written is the GFF start minus one, which is the form in
    which GFF starts are compared against coverage starts throughout
    add_coverage.  Existing coverage columns are filled from cov_blank and
    the new column is '1'.
    """
    start = str(int(gff_fields[3]) - 1)
    return [gff_fields[0], start, end] + cov_blank + ['1']


def add_coverage(shasum, coveragefile):
    """Merge a genome's ns.gff.gz spans into a coverage file as a new column.

    Arguments:
      shasum -- genome identifier; used to read its metadata, to locate
                /home/trait/upload/<shasum>-out/ns.gff.gz, and as the title
                of the added column
      coveragefile -- path of the existing coverage file: a header line
                      followed by whitespace-separated rows of chromosome,
                      start, end and per-genome columns

    Returns coveragefile unchanged when the metadata is missing or the
    genome build is not 'b36'.  Otherwise writes a new file next to
    coveragefile (name suffixed with '_' and the first six characters of
    shasum, plus '.gz') and returns its path.  Each output row carries '1'
    in the added column where the GFF data covers the span and '0' where it
    does not; spans are split wherever the two inputs' boundaries differ.

    Both inputs are walked in step and compared by chromosome name and
    position, so they are presumably expected to be sorted the same way.
    """
    metadata = read_metadata(shasum)
    if (not metadata or 'genome_build' not in metadata or
            metadata['genome_build'] != 'b36'):
        return coveragefile

    coverage_in = autozip.file_open(coveragefile)
    gff_in = autozip.file_open('/home/trait/upload/' + shasum +
                               '-out/ns.gff.gz')
    covdir, covfile = os.path.split(coveragefile)
    gz_match = re.match(r'(.*)\.gz', covfile)
    covfile_pre = gz_match.groups()[0] if gz_match else covfile
    coverage_out_path = os.path.join(covdir, covfile_pre + '_' +
                                     shasum[0:6] + '.gz')
    coverage_out = autozip.file_open(coverage_out_path, 'w')

    def emit(fields):
        coverage_out.write(' '.join(fields) + '\n')

    try:
        cov_header = next(coverage_in).rstrip().split()
        emit(cov_header + [shasum])

        # Skip the GFF comment header.
        gff_lookahead = next(gff_in).split()
        while gff_lookahead and re.match('#', gff_lookahead[0]):
            gff_lookahead = next(gff_in).split()
        gff, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)

        cov = next(coverage_in).split()
        # Placeholder values for the existing columns on GFF-only rows.
        cov_blank = ['0' for x in cov[3:]]

        while cov or gff:
            # Skip data that are zero or negative (??) coverage
            if gff and int(gff[4]) - (int(gff[3]) - 1) <= 0:
                gff, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)
                continue
            if cov and int(cov[2]) - int(cov[1]) <= 0:
                cov = _next_fields(coverage_in)
                continue

            # If coverage file is done, output GFF line
            if not cov:
                emit(_gff_only_row(gff, gff[4], cov_blank))
                gff, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)
                continue

            # If GFF file is done, output coverage file line
            if not gff:
                emit(cov + ['0'])
                cov = _next_fields(coverage_in)
                continue

            # If they aren't on the same chromosome, move one of them forward.
            if cov[0] != gff[0]:
                if cov[0] < gff[0]:
                    emit(cov + ['0'])
                    cov = _next_fields(coverage_in)
                else:
                    emit(_gff_only_row(gff, gff[4], cov_blank))
                    gff, gff_lookahead = move_gff_ahead(gff_in,
                                                        gff_lookahead)
                continue

            # If we get here, we have both files & both are on the same chrom
            cov_start, cov_end = int(cov[1]), int(cov[2])
            gff_start, gff_end = int(gff[3]) - 1, int(gff[4])

            if cov_start < gff_start:
                # Coverage file start is before GFF start.
                if cov_end <= gff_start:
                    # Whole coverage file data is before GFF line.
                    emit(cov + ['0'])
                    cov = _next_fields(coverage_in)
                else:
                    # Print uncovered up to the GFF start, then keep the
                    # remainder of the coverage row for the next pass.
                    emit(cov[0:2] + [str(gff_start)] + cov[3:] + ['0'])
                    cov[1] = str(gff_start)
            elif cov_start > gff_start:
                # GFF start is before coverage file start.
                if cov_start > gff_end:
                    # Whole GFF file data is before coverage file data
                    emit(_gff_only_row(gff, gff[4], cov_blank))
                    gff, gff_lookahead = move_gff_ahead(gff_in,
                                                        gff_lookahead)
                else:
                    # Print uncovered GFF up to coverage file start
                    emit(_gff_only_row(gff, cov[1], cov_blank))
                    gff[3] = str(cov_start + 1)
            else:
                # Coverage file and GFF data have same start.
                if cov_end < gff_end:
                    # Coverage file ends first: output, update GFF,
                    # advance coverage
                    emit(cov + ['1'])
                    gff[3] = str(cov_end + 1)
                    cov = _next_fields(coverage_in)
                elif cov_end > gff_end:
                    # GFF ends first: output, update coverage, advance GFF
                    emit(cov[0:2] + [gff[4]] + cov[3:] + ['1'])
                    cov[1] = gff[4]
                    gff, gff_lookahead = move_gff_ahead(gff_in,
                                                        gff_lookahead)
                else:
                    # Both end at the same point: Output and advance both.
                    emit(cov + ['1'])
                    cov = _next_fields(coverage_in)
                    gff, gff_lookahead = move_gff_ahead(gff_in,
                                                        gff_lookahead)
    finally:
        coverage_out.close()
        gff_in.close()
        coverage_in.close()
    return coverage_out_path
def genome_analyzer(genotype_file, server=None, options=None):
    """Perform analyses on genotype_file.

    Converts and sorts the input genome via process_source(), trio-phases it
    when both parents' genotype files can be found, then pulls the data
    through the annotation generators and writes the results into the output
    directory returned by processing_init().

    Arguments:
    genotype_file -- path of the genome data file to analyze
    server -- passed through to processing_init()
    options -- optional settings object; the attributes read here are
               chromosome, no_metadata and metadata_only (it is also passed
               on to process_source)

    Returns None (also when processing_init() returns a false value).
    Raises Exception if the genome build is neither "b36" nor "b37".
    """
    # Function-scope import: only the final file moves need subprocess.
    import subprocess

    init_stuff = processing_init(genotype_file, server)
    if init_stuff:
        output_dir, log, log_handle, lockfile, logfile = init_stuff
    else:
        return None

    # Set up arguments used by processing commands and scripts.
    data_dir = os.getenv('DATA')
    args = {
        'genotype_input': str(genotype_file),
        'miss_out': os.path.join(output_dir, 'missing_coding.json'),
        'sorted_out': os.path.join(output_dir, 'source_sorted.gff.gz'),
        'nonsyn_out_tmp': os.path.join(output_dir, 'ns_tmp.gff.gz'),
        'nonsyn_out': os.path.join(output_dir, 'ns.gff.gz'),
        'getev_out': os.path.join(output_dir, 'get-evidence.json'),
        'getev_genes_out': os.path.join(output_dir, 'get-ev_genes.json'),
        'metadata_out': os.path.join(output_dir, 'metadata.json'),
        'genome_stats': os.path.join(data_dir, 'genome_stats.txt'),
        'genetests': os.path.join(data_dir, GENETESTS_DATA),
        'getev_flat': os.path.join(data_dir, GETEV_FLAT)
    }

    # Make output directory if needed
    try:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))

    # Read metadata with uploaded file, if available.
    try:
        f_metadata = autozip.file_open(
            os.path.dirname(genotype_file) + '/metadata.json')
        try:
            genome_data = json.loads(next(f_metadata))
        finally:
            # Always release the handle, even if the JSON is malformed.
            f_metadata.close()
    except IOError:
        genome_data = dict()

    # Process and sort input genome data
    log.put('#status 0/100 converting and sorting input file')
    gff_in_gen = None

    # Look for parents and, if possible, use these to phase genome.
    if 'parent A' in genome_data and 'parent B' in genome_data:
        upload_root = os.path.dirname(
            os.path.dirname(args['genotype_input']))
        parA_in_dir = os.path.join(upload_root, genome_data['parent A'])
        parB_in_dir = os.path.join(upload_root, genome_data['parent B'])
        if os.path.exists(parA_in_dir) and os.path.exists(parB_in_dir):
            parA_file_match = [x for x in os.listdir(parA_in_dir)
                               if re.match('genotype', x)]
            parB_file_match = [x for x in os.listdir(parB_in_dir)
                               if re.match('genotype', x)]
            if parA_file_match and parB_file_match:
                parA_input = os.path.join(parA_in_dir, parA_file_match[0])
                parB_input = os.path.join(parB_in_dir, parB_file_match[0])
                gff_parA_gen = process_source(parA_input, dict(),
                                              options=options)
                gff_parB_gen = process_source(parB_input, dict(),
                                              options=options)
                gff_child_gen = process_source(args['genotype_input'],
                                               genome_data, options=options)
                # The first item from each generator is its genome build.
                parA_build = next(gff_parA_gen)
                parB_build = next(gff_parB_gen)
                genome_data['genome_build'] = next(gff_child_gen)
                # Only phase when all three genomes share a build.
                if (parA_build == genome_data['genome_build'] and
                        parB_build == genome_data['genome_build']):
                    trio_phase = gff_trio_phase.PhaseTrio(
                        gff_child_gen, gff_parA_gen, gff_parB_gen, False)
                    gff_in_gen = trio_phase.call_phase()

    # Set up if trio phasing couldn't be done.
    if not gff_in_gen:
        # We pass build as a yield (instead of in metadata) to force the
        # generator to read through the header portion of the input data.
        gff_in_gen = process_source(args['genotype_input'], genome_data,
                                    options=options)
        genome_data['genome_build'] = next(gff_in_gen)

    # Set up build-dependent file locations
    if genome_data['genome_build'] == "b36":
        args['dbsnp'] = os.path.join(data_dir, DBSNP_B36_SORTED)
        args['reference'] = os.path.join(data_dir, REFERENCE_GENOME_HG18)
        args['transcripts'] = os.path.join(data_dir, KNOWNGENE_HG18_SORTED)
    elif genome_data['genome_build'] == "b37":
        args['dbsnp'] = os.path.join(data_dir, DBSNP_B37_SORTED)
        args['reference'] = os.path.join(data_dir, REFERENCE_GENOME_HG19)
        args['transcripts'] = os.path.join(data_dir, KNOWNGENE_HG19_SORTED)
    else:
        raise Exception("genome build data is invalid")

    if options and options.chromosome:
        chrlist = [options.chromosome]
    else:
        # It might be more elegant to extract this from metadata.
        # range() excludes its stop value, so 23 is needed to reach chr22.
        chrlist = ['chr' + str(x) for x in list(range(1, 23)) + ['X', 'Y']]

    # Process genome through a series of GFF-formatted string generators.
    log.put('#status 20 looking up reference alleles and '
            'dbSNP IDs, computing nonsynonymous changes, '
            'cross-referencing GET-Evidence database')
    progtrack = ProgressTracker(sys.stderr, [22, 99], expected=chrlist,
                                metadata=genome_data)

    if not options or not options.no_metadata:
        # Record chromosomes seen and genome coverage.
        gff_in_gen = get_metadata.genome_metadata(gff_in_gen,
                                                  args['genome_stats'],
                                                  progresstracker=progtrack)

    # Report coding regions that lack coverage.
    gff_in_gen = call_missing.report_uncovered(
        gff_in_gen, args['transcripts'], args['genetests'],
        output_file=args['miss_out'], progresstracker=progtrack)

    if options and options.metadata_only:
        # Drain the generator chain without producing annotation output.
        for line in gff_in_gen:
            pass
    else:
        # Find reference allele.
        gff_in_gen = gff_twobit_query.match2ref(gff_in_gen, args['reference'])
        # Look up dbSNP IDs
        gff_in_gen = gff_dbsnp_query.match2dbSNP(gff_in_gen, args['dbsnp'])
        # Check for nonsynonymous SNP
        gff_in_gen = gff_nonsynonymous_filter.predict_nonsynonymous(
            gff_in_gen, args['reference'], args['transcripts'])
        # Pull off GET-Evidence hits
        gff_in_gen = gff_getevidence_map.match_getev(
            gff_in_gen, args['getev_flat'],
            transcripts_file=args['transcripts'],
            gene_out_file=args['getev_genes_out'] + ".tmp",
            output_file=args['getev_out'] + ".tmp",
            progresstracker=progtrack)

        # Printing to output, pulls data through the generator chain.
        ns_out = autozip.file_open(args['nonsyn_out_tmp'], 'w')
        for line in gff_in_gen:
            ns_out.write(line + "\n")
        ns_out.close()

        # Move finished outputs into place. An argument list (no shell)
        # keeps paths containing spaces or shell metacharacters intact;
        # like the previous shell "mv", a failed move is not fatal.
        for tmp_path, final_path in (
                (args['getev_out'] + ".tmp", args['getev_out']),
                (args['nonsyn_out_tmp'], args['nonsyn_out']),
                (args['getev_genes_out'] + ".tmp", args['getev_genes_out'])):
            subprocess.call(['mv', tmp_path, final_path])

    # Print metadata
    metadata_f_out = open(args['metadata_out'], 'w')
    progtrack.write_metadata(metadata_f_out)
    metadata_f_out.close()

    log.put('#status 100 finished')
    os.rename(lockfile, logfile)
    log_handle.close()
    print("Finished processing file " + str(genotype_file))
def add_coverage(shasum, coveragefile):
    """Merge a genome's GFF intervals into a coverage file as a new column.

    Reads the coverage file and the genome's ns.gff.gz in parallel and writes
    a new coverage file whose rows carry one extra trailing field: "1" where
    the interval is covered by the GFF data, "0" where it is not. Intervals
    are split wherever the two inputs' boundaries differ. The header row gets
    shasum appended.

    Arguments:
    shasum -- genome ID; used to locate the genome's metadata and GFF file
              and (first six characters) to name the output file
    coveragefile -- path of the existing coverage file

    Returns the path of the new coverage file, or coveragefile unchanged if
    the genome's metadata does not report build "b36".
    """
    metadata = read_metadata(shasum)
    if (not metadata or "genome_build" not in metadata
            or metadata["genome_build"] != "b36"):
        return coveragefile

    coverage_in = autozip.file_open(coveragefile)
    gff_in = autozip.file_open(
        "/home/trait/upload/" + shasum + "-out/ns.gff.gz")

    covdir, covfile = os.path.split(coveragefile)
    gz_match = re.match(r"(.*)\.gz", covfile)
    covfile_pre = gz_match.groups()[0] if gz_match else covfile
    coverage_out_path = os.path.join(
        covdir, covfile_pre + "_" + shasum[0:6] + ".gz")
    coverage_out = autozip.file_open(coverage_out_path, "w")

    def write_row(fields):
        """Write one space-separated output row."""
        coverage_out.write(" ".join(fields) + "\n")

    def next_coverage():
        """Return the next coverage row as a field list, or None at EOF."""
        try:
            return next(coverage_in).split()
        except StopIteration:
            return None

    try:
        cov_header = next(coverage_in).rstrip().split()
        write_row(cov_header + [shasum])

        # Skip GFF comment lines, then prime the current/lookahead pair.
        gff_lookahead = next(gff_in).split()
        while gff_lookahead and re.match("#", gff_lookahead[0]):
            gff_lookahead = next(gff_in).split()
        gff_currdata, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)

        coverage_currdata = next(coverage_in).split()
        # Placeholder values for the existing columns on rows that exist
        # only in the GFF data.
        cov_blank = ["0" for x in coverage_currdata[3:]]

        while coverage_currdata or gff_currdata:
            # Skip data that are zero or negative (??) coverage
            if (gff_currdata and
                    int(gff_currdata[4]) - (int(gff_currdata[3]) - 1) <= 0):
                gff_currdata, gff_lookahead = move_gff_ahead(
                    gff_in, gff_lookahead)
                continue
            if (coverage_currdata and
                    int(coverage_currdata[2]) - int(coverage_currdata[1])
                    <= 0):
                coverage_currdata = next_coverage()
                continue

            # If coverage file is done, output GFF line
            if not coverage_currdata:
                # Coverage rows use a start one below the GFF start (as in
                # every other GFF-only row below), so convert here as well.
                write_row([gff_currdata[0]]
                          + [str(int(gff_currdata[3]) - 1)]
                          + [gff_currdata[4]] + cov_blank + ["1"])
                gff_currdata, gff_lookahead = move_gff_ahead(
                    gff_in, gff_lookahead)
                continue

            # If GFF file is done, output coverage file line
            if not gff_currdata:
                write_row(coverage_currdata + ["0"])
                coverage_currdata = next_coverage()
                continue

            # If they aren't on the same chromosome, move one of them
            # forward (whichever chromosome name sorts first as a string).
            if coverage_currdata[0] != gff_currdata[0]:
                if coverage_currdata[0] < gff_currdata[0]:
                    write_row(coverage_currdata + ["0"])
                    coverage_currdata = next_coverage()
                else:
                    write_row([gff_currdata[0]]
                              + [str(int(gff_currdata[3]) - 1)]
                              + [gff_currdata[4]] + cov_blank + ["1"])
                    gff_currdata, gff_lookahead = move_gff_ahead(
                        gff_in, gff_lookahead)
                continue

            # If we get here, we have both files & both are on the same chrom
            cov_start = int(coverage_currdata[1])
            cov_end = int(coverage_currdata[2])
            gff_start = int(gff_currdata[3]) - 1
            gff_end = int(gff_currdata[4])

            if cov_start < gff_start:
                # Coverage file start is before GFF start.
                if cov_end <= gff_start:
                    # Whole coverage file data is before GFF line.
                    write_row(coverage_currdata + ["0"])
                    coverage_currdata = next_coverage()
                else:
                    # Print uncovered up to the GFF start.
                    write_row(coverage_currdata[0:2] + [str(gff_start)]
                              + coverage_currdata[3:] + ["0"])
                    coverage_currdata[1] = str(gff_start)
                    if cov_end <= gff_start:
                        coverage_currdata = next_coverage()
            elif cov_start > gff_start:
                # GFF start is before coverage file start.
                if cov_start > gff_end:
                    # Whole GFF file data is before coverage file data
                    write_row([gff_currdata[0]] + [str(gff_start)]
                              + [gff_currdata[4]] + cov_blank + ["1"])
                    gff_currdata, gff_lookahead = move_gff_ahead(
                        gff_in, gff_lookahead)
                else:
                    # Print uncovered GFF up to coverage file start
                    write_row([gff_currdata[0]] + [str(gff_start)]
                              + [coverage_currdata[1]] + cov_blank + ["1"])
                    gff_currdata[3] = str(cov_start + 1)
            else:
                # Coverage file and GFF data have same start.
                if cov_end < gff_end:
                    # Coverage file ends first: output, update GFF,
                    # advance coverage
                    write_row(coverage_currdata + ["1"])
                    gff_currdata[3] = str(cov_end + 1)
                    coverage_currdata = next_coverage()
                elif cov_end > gff_end:
                    # GFF ends first: output, update coverage, advance GFF
                    write_row(coverage_currdata[0:2] + [gff_currdata[4]]
                              + coverage_currdata[3:] + ["1"])
                    coverage_currdata[1] = gff_currdata[4]
                    if cov_end <= gff_end:
                        coverage_currdata = next_coverage()
                    gff_currdata, gff_lookahead = move_gff_ahead(
                        gff_in, gff_lookahead)
                else:
                    # Both end at the same point: Output and advance both.
                    write_row(coverage_currdata + ["1"])
                    coverage_currdata = next_coverage()
                    gff_currdata, gff_lookahead = move_gff_ahead(
                        gff_in, gff_lookahead)
    finally:
        # Release all three handles even if a malformed row raises.
        coverage_out.close()
        gff_in.close()
        coverage_in.close()

    return coverage_out_path
def genome_analyzer(genotype_file, server=None, options=None):
    """Perform analyses on genotype_file.

    Converts and sorts the input genome via process_source(), trio-phases it
    when both parents' genotype files can be found, then pulls the data
    through the annotation generators and writes the results into the output
    directory returned by processing_init().

    Arguments:
    genotype_file -- path of the genome data file to analyze
    server -- passed through to processing_init()
    options -- optional settings object; the attributes read here are
               chromosome and metadata_only (it is also passed on to
               process_source)

    Returns None (also when processing_init() returns a false value).
    Raises Exception if the genome build is neither "b36" nor "b37".
    """
    # Function-scope import: only the final file moves need subprocess.
    import subprocess

    init_stuff = processing_init(genotype_file, server)
    if init_stuff:
        output_dir, log, log_handle, lockfile, logfile = init_stuff
    else:
        return None

    # Set up arguments used by processing commands and scripts.
    data_dir = os.getenv("DATA")
    args = {
        "genotype_input": str(genotype_file),
        "miss_out": os.path.join(output_dir, "missing_coding.json"),
        "sorted_out": os.path.join(output_dir, "source_sorted.gff.gz"),
        "nonsyn_out_tmp": os.path.join(output_dir, "ns_tmp.gff.gz"),
        "nonsyn_out": os.path.join(output_dir, "ns.gff.gz"),
        "getev_out": os.path.join(output_dir, "get-evidence.json"),
        "getev_genes_out": os.path.join(output_dir, "get-ev_genes.json"),
        "metadata_out": os.path.join(output_dir, "metadata.json"),
        "genome_stats": os.path.join(data_dir, "genome_stats.txt"),
        "genetests": os.path.join(data_dir, GENETESTS_DATA),
        "getev_flat": os.path.join(data_dir, GETEV_FLAT),
    }

    # Make output directory if needed
    try:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))

    # Read metadata with uploaded file, if available.
    try:
        f_metadata = autozip.file_open(
            os.path.dirname(genotype_file) + "/metadata.json")
        try:
            genome_data = json.loads(next(f_metadata))
        finally:
            # Always release the handle, even if the JSON is malformed.
            f_metadata.close()
    except IOError:
        genome_data = dict()

    # Process and sort input genome data
    log.put("#status 0/100 converting and sorting input file")
    gff_in_gen = None

    # Look for parents and, if possible, use these to phase genome.
    if "parent A" in genome_data and "parent B" in genome_data:
        upload_root = os.path.dirname(
            os.path.dirname(args["genotype_input"]))
        parA_in_dir = os.path.join(upload_root, genome_data["parent A"])
        parB_in_dir = os.path.join(upload_root, genome_data["parent B"])
        if os.path.exists(parA_in_dir) and os.path.exists(parB_in_dir):
            parA_file_match = [x for x in os.listdir(parA_in_dir)
                               if re.match("genotype", x)]
            parB_file_match = [x for x in os.listdir(parB_in_dir)
                               if re.match("genotype", x)]
            if parA_file_match and parB_file_match:
                parA_input = os.path.join(parA_in_dir, parA_file_match[0])
                parB_input = os.path.join(parB_in_dir, parB_file_match[0])
                gff_parA_gen = process_source(parA_input, dict(),
                                              options=options)
                gff_parB_gen = process_source(parB_input, dict(),
                                              options=options)
                gff_child_gen = process_source(args["genotype_input"],
                                               genome_data, options=options)
                # The first item from each generator is its genome build.
                parA_build = next(gff_parA_gen)
                parB_build = next(gff_parB_gen)
                genome_data["genome_build"] = next(gff_child_gen)
                # Only phase when all three genomes share a build.
                if (parA_build == genome_data["genome_build"]
                        and parB_build == genome_data["genome_build"]):
                    trio_phase = gff_trio_phase.PhaseTrio(
                        gff_child_gen, gff_parA_gen, gff_parB_gen, False)
                    gff_in_gen = trio_phase.call_phase()

    # Set up if trio phasing couldn't be done.
    if not gff_in_gen:
        # We pass build as a yield (instead of in metadata) to force the
        # generator to read through the header portion of the input data.
        gff_in_gen = process_source(args["genotype_input"], genome_data,
                                    options=options)
        genome_data["genome_build"] = next(gff_in_gen)

    # Set up build-dependent file locations
    if genome_data["genome_build"] == "b36":
        args["dbsnp"] = os.path.join(data_dir, DBSNP_B36_SORTED)
        args["reference"] = os.path.join(data_dir, REFERENCE_GENOME_HG18)
        args["transcripts"] = os.path.join(data_dir, KNOWNGENE_HG18_SORTED)
    elif genome_data["genome_build"] == "b37":
        args["dbsnp"] = os.path.join(data_dir, DBSNP_B37_SORTED)
        args["reference"] = os.path.join(data_dir, REFERENCE_GENOME_HG19)
        args["transcripts"] = os.path.join(data_dir, KNOWNGENE_HG19_SORTED)
    else:
        raise Exception("genome build data is invalid")

    if options and options.chromosome:
        chrlist = [options.chromosome]
    else:
        # It might be more elegant to extract this from metadata.
        # range() excludes its stop value, so 23 is needed to reach chr22.
        chrlist = ["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]]

    # Process genome through a series of GFF-formatted string generators.
    log.put(
        "#status 20 looking up reference alleles and "
        "dbSNP IDs, computing nonsynonymous changes, "
        "cross-referencing GET-Evidence database"
    )
    progtrack = ProgressTracker(sys.stderr, [22, 99], expected=chrlist,
                                metadata=genome_data)

    # NOTE(review): metadata recording is skipped whenever a single
    # chromosome is requested -- confirm this is intended rather than a
    # dedicated "no metadata" switch.
    if not options or not options.chromosome:
        # Record chromosomes seen and genome coverage.
        gff_in_gen = get_metadata.genome_metadata(
            gff_in_gen, args["genome_stats"], progresstracker=progtrack)

    # Report coding regions that lack coverage.
    gff_in_gen = call_missing.report_uncovered(
        gff_in_gen, args["transcripts"], args["genetests"],
        output_file=args["miss_out"], progresstracker=progtrack
    )

    if options and options.metadata_only:
        # Drain the generator chain without producing annotation output.
        for line in gff_in_gen:
            pass
    else:
        # Find reference allele.
        gff_in_gen = gff_twobit_query.match2ref(gff_in_gen, args["reference"])
        # Look up dbSNP IDs
        gff_in_gen = gff_dbsnp_query.match2dbSNP(gff_in_gen, args["dbsnp"])
        # Check for nonsynonymous SNP
        gff_in_gen = gff_nonsynonymous_filter.predict_nonsynonymous(
            gff_in_gen, args["reference"], args["transcripts"])
        # Pull off GET-Evidence hits
        gff_in_gen = gff_getevidence_map.match_getev(
            gff_in_gen,
            args["getev_flat"],
            transcripts_file=args["transcripts"],
            gene_out_file=args["getev_genes_out"] + ".tmp",
            output_file=args["getev_out"] + ".tmp",
            progresstracker=progtrack,
        )

        # Printing to output, pulls data through the generator chain.
        ns_out = autozip.file_open(args["nonsyn_out_tmp"], "w")
        for line in gff_in_gen:
            ns_out.write(line + "\n")
        ns_out.close()

        # Move finished outputs into place. An argument list (no shell)
        # keeps paths containing spaces or shell metacharacters intact;
        # like the previous shell "mv", a failed move is not fatal.
        for tmp_path, final_path in (
                (args["getev_out"] + ".tmp", args["getev_out"]),
                (args["nonsyn_out_tmp"], args["nonsyn_out"]),
                (args["getev_genes_out"] + ".tmp", args["getev_genes_out"])):
            subprocess.call(["mv", tmp_path, final_path])

    # Print metadata
    metadata_f_out = open(args["metadata_out"], "w")
    progtrack.write_metadata(metadata_f_out)
    metadata_f_out.close()

    log.put("#status 100 finished")
    os.rename(lockfile, logfile)
    log_handle.close()
    print("Finished processing file " + str(genotype_file))
def get_allele_freqs(password, getev_file, excluded=None, chromfile=None,
                     outputfile=None):
    """Compute allele frequencies for interesting variants across genomes.

    Walks a GenomeSet position by position and, for each group of positions
    that contains a non-reference call annotated with 'amino_acid' or
    'getev_id', writes the string returned by GenomeSet.eval_var_freq().

    Arguments:
    password -- passed to get_genome_list()
    getev_file -- GET-Evidence flat file, loaded with load_getev()
    excluded -- passed to get_genome_list()
    chromfile -- optional file read with read_single_items() to restrict
                 the chromosomes examined
    outputfile -- optional output path; when given, results go to this file
                  and progress messages are printed, otherwise results are
                  printed to stdout and progress messages are suppressed

    Returns None.
    """
    # Set up output, genome inputs, GET-Evidence variants, and twobit
    # reference.
    if outputfile:
        print("Setting up output file")
        f_out = autozip.file_open(outputfile, 'w')
    else:
        f_out = None

    try:
        genome_ids = get_genome_list(password, excluded)
        if chromfile:
            if f_out:
                print("Getting chromosomes...")
            chroms = read_single_items(chromfile)
        else:
            chroms = None
        if f_out:
            print("Reading GET-Ev flat file (takes a couple minutes)...")
        getev_variants = load_getev(getev_file)
        if f_out:
            print("Loading twobit genome...")
        twobit_genome = twobit.input(TWOBIT_PATH)
        if f_out:
            print("Setting up GenomeSet (may be slow if each genome has to "
                  "advance " + "to target chromosomes)...")
            genome_set = GenomeSet(genome_ids, chroms=chroms,
                                   getev_vars=getev_variants, verbose=True)
        else:
            genome_set = GenomeSet(genome_ids, chroms=chroms,
                                   getev_vars=getev_variants)
        if f_out:
            print("Find earliest ends")
        earliest_ends = genome_set.earliest_ends()

        # Move through the genomes to find allele frequencies
        while genome_set.genomes:
            # Move ahead of all "earliest ends" & save new earliest.
            next_earliest = genome_set.advance_all_past_end_pos(
                earliest_ends[0])

            # Check all old "earliest ends" positions for interesting
            # variants.
            has_var = []
            is_interesting = False
            for position in earliest_ends:
                if not position['ref']:
                    has_var.append(position)
                    if 'amino_acid' in position or 'getev_id' in position:
                        is_interesting = True

            # If there are interesting variants, calculate allele frequency.
            if has_var and is_interesting:
                # Check if another genomes has an overlapping variant
                # extending beyond this position, we're not ready to
                # evaluate this yet (it will be caught when the later
                # overlapping one comes up).
                if genome_set.no_later_var(has_var):
                    freqout = genome_set.eval_var_freq(has_var, twobit_genome)
                    if f_out:
                        f_out.write(freqout + '\n')
                    else:
                        print(freqout)

            genome_set.clean_out_prior_pos(earliest_ends)

            # Reset "earliest end" to next earliest positions.
            earliest_ends = next_earliest
    finally:
        # The output handle was previously never closed, which can leave a
        # compressed output file truncated.
        if f_out:
            f_out.close()
def get_allele_freqs(password, getev_file, excluded=None, chromfile=None,
                     outputfile=None):
    """Compute allele frequencies for interesting variants across genomes.

    Walks a GenomeSet position by position and, for each group of positions
    that contains a non-reference call annotated with 'amino_acid' or
    'getev_id', writes the string returned by GenomeSet.eval_var_freq().

    Arguments:
    password -- passed to get_genome_list()
    getev_file -- GET-Evidence flat file, loaded with load_getev()
    excluded -- passed to get_genome_list()
    chromfile -- optional file read with read_single_items() to restrict
                 the chromosomes examined
    outputfile -- optional output path; when given, results go to this file
                  and progress messages are printed, otherwise results are
                  printed to stdout and progress messages are suppressed

    Returns None.
    """
    # Progress messages are only shown when results go to a file, so that
    # stdout stays clean when it carries the results themselves.
    f_out = None
    if outputfile:
        print("Setting up output file")
        f_out = autozip.file_open(outputfile, 'w')

    try:
        # Set up genome inputs, GET-Evidence variants, and twobit reference.
        genome_ids = get_genome_list(password, excluded)
        chroms = None
        if chromfile:
            if f_out:
                print("Getting chromosomes...")
            chroms = read_single_items(chromfile)
        if f_out:
            print("Reading GET-Ev flat file (takes a couple minutes)...")
        getev_variants = load_getev(getev_file)
        if f_out:
            print("Loading twobit genome...")
        twobit_genome = twobit.input(TWOBIT_PATH)
        if f_out:
            print("Setting up GenomeSet (may be slow if each genome has to "
                  "advance " + "to target chromosomes)...")
            genome_set = GenomeSet(genome_ids, chroms=chroms,
                                   getev_vars=getev_variants, verbose=True)
        else:
            genome_set = GenomeSet(genome_ids, chroms=chroms,
                                   getev_vars=getev_variants)
        if f_out:
            print("Find earliest ends")
        earliest_ends = genome_set.earliest_ends()

        # Move through the genomes to find allele frequencies
        while genome_set.genomes:
            # Move ahead of all "earliest ends" & save new earliest.
            next_earliest = genome_set.advance_all_past_end_pos(
                earliest_ends[0])

            # Check all old "earliest ends" positions for interesting
            # variants.
            has_var = []
            is_interesting = False
            for position in earliest_ends:
                if not position['ref']:
                    has_var.append(position)
                    if 'amino_acid' in position or 'getev_id' in position:
                        is_interesting = True

            # If there are interesting variants, calculate allele frequency.
            # If another genome has an overlapping variant extending beyond
            # this position, we're not ready to evaluate this yet (it will
            # be caught when the later overlapping one comes up).
            if (has_var and is_interesting
                    and genome_set.no_later_var(has_var)):
                freqout = genome_set.eval_var_freq(has_var, twobit_genome)
                if f_out:
                    f_out.write(freqout + '\n')
                else:
                    print(freqout)

            genome_set.clean_out_prior_pos(earliest_ends)

            # Reset "earliest end" to next earliest positions.
            earliest_ends = next_earliest
    finally:
        # The output handle was previously never closed, which can leave a
        # compressed output file truncated.
        if f_out:
            f_out.close()
def genome_analyzer(genotype_file, server=None, options=None):
    """Perform analyses on genotype_file.

    Converts and sorts the input genome via process_source(), trio-phases it
    when both parents' genotype files can be found, then pulls the data
    through the annotation generators and writes the results into the output
    directory. Data file locations are read from the module-level config
    mapping.

    Arguments:
    genotype_file -- path of the genome data file to analyze
    server -- passed through to processing_init()
    options -- optional settings object; the attributes read here are
               output_dir, chromosome, no_metadata and metadata_only (it is
               also passed on to process_source)

    Returns None (also when processing_init() returns a false value).
    Raises Exception if the genome build is neither "b36" nor "b37".
    """
    global config

    # Function-scope import: only the final file moves need subprocess.
    import subprocess

    init_stuff = processing_init(genotype_file, server)
    if init_stuff:
        output_dir, log, log_handle, lockfile, logfile = init_stuff
    else:
        return None

    # override default output directory
    if options and options.output_dir:
        output_dir = options.output_dir
        try:
            os.makedirs(output_dir, mode=0o777)
        except OSError:
            # Typically "already exists"; a directory that is still missing
            # is retried and reported by the existence check below.
            pass

    # Set up arguments used by processing commands and scripts.
    args = {
        'genotype_input': str(genotype_file),
        'miss_out': os.path.join(output_dir, 'missing_coding.json'),
        'sorted_out': os.path.join(output_dir, 'source_sorted.gff.gz'),
        'nonsyn_out_tmp': os.path.join(output_dir, 'ns_tmp.gff.gz'),
        'nonsyn_out': os.path.join(output_dir, 'ns.gff.gz'),
        'getev_out': os.path.join(output_dir, 'get-evidence.json'),
        'getev_genes_out': os.path.join(output_dir, 'get-ev_genes.json'),
        'metadata_out': os.path.join(output_dir, 'metadata.json'),
        'genome_stats': config['genome_stats'],
        'genetests': config['GENETESTS_DATA'],
        'getev_flat': config['GETEV_FLAT']
    }

    # Make output directory if needed
    try:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))

    # Read metadata with uploaded file, if available.
    try:
        f_metadata = autozip.file_open(
            os.path.dirname(genotype_file) + '/metadata.json')
        try:
            genome_data = json.loads(next(f_metadata))
        finally:
            # Always release the handle, even if the JSON is malformed.
            f_metadata.close()
    except IOError:
        genome_data = dict()

    # Process and sort input genome data
    log.put('#status 0/100 converting and sorting input file')
    gff_in_gen = None

    # Look for parents and, if possible, use these to phase genome.
    if 'parent A' in genome_data and 'parent B' in genome_data:
        upload_root = os.path.dirname(
            os.path.dirname(args['genotype_input']))
        parA_in_dir = os.path.join(upload_root, genome_data['parent A'])
        parB_in_dir = os.path.join(upload_root, genome_data['parent B'])
        if os.path.exists(parA_in_dir) and os.path.exists(parB_in_dir):
            parA_file_match = [x for x in os.listdir(parA_in_dir)
                               if re.match('genotype', x)]
            parB_file_match = [x for x in os.listdir(parB_in_dir)
                               if re.match('genotype', x)]
            if parA_file_match and parB_file_match:
                parA_input = os.path.join(parA_in_dir, parA_file_match[0])
                parB_input = os.path.join(parB_in_dir, parB_file_match[0])
                gff_parA_gen = process_source(parA_input, dict(),
                                              options=options)
                gff_parB_gen = process_source(parB_input, dict(),
                                              options=options)
                gff_child_gen = process_source(args['genotype_input'],
                                               genome_data, options=options)
                # The first item from each generator is its genome build.
                parA_build = next(gff_parA_gen)
                parB_build = next(gff_parB_gen)
                genome_data['genome_build'] = next(gff_child_gen)
                # Only phase when all three genomes share a build.
                if (parA_build == genome_data['genome_build'] and
                        parB_build == genome_data['genome_build']):
                    trio_phase = gff_trio_phase.PhaseTrio(
                        gff_child_gen, gff_parA_gen, gff_parB_gen, False)
                    gff_in_gen = trio_phase.call_phase()

    # Set up if trio phasing couldn't be done.
    if not gff_in_gen:
        # We pass build as a yield (instead of in metadata) to force the
        # generator to read through the header portion of the input data.
        gff_in_gen = process_source(args['genotype_input'], genome_data,
                                    options=options)
        genome_data['genome_build'] = next(gff_in_gen)

    # Set up build-dependent file locations
    if genome_data['genome_build'] == "b36":
        args['dbsnp'] = config["DBSNP_B36_SORTED"]
        args['reference'] = config["REFERENCE_GENOME_HG18"]
        args['transcripts'] = config["KNOWNGENE_HG18_SORTED"]
    elif genome_data['genome_build'] == "b37":
        args['dbsnp'] = config["DBSNP_B37_SORTED"]
        args['reference'] = config["REFERENCE_GENOME_HG19"]
        args['transcripts'] = config["KNOWNGENE_HG19_SORTED"]
    else:
        raise Exception("genome build data is invalid")

    if options and options.chromosome:
        chrlist = [options.chromosome]
    else:
        # It might be more elegant to extract this from metadata.
        # range() excludes its stop value, so 23 is needed to reach chr22.
        chrlist = ['chr' + str(x) for x in list(range(1, 23)) + ['X', 'Y']]

    # Process genome through a series of GFF-formatted string generators.
    log.put('#status 20 looking up reference alleles and '
            'dbSNP IDs, computing nonsynonymous changes, '
            'cross-referencing GET-Evidence database')
    progtrack = ProgressTracker(sys.stderr, [22, 99], expected=chrlist,
                                metadata=genome_data)

    if not options or not options.no_metadata:
        # Record chromosomes seen and genome coverage.
        gff_in_gen = get_metadata.genome_metadata(gff_in_gen,
                                                  args['genome_stats'],
                                                  progresstracker=progtrack)

    # Report coding regions that lack coverage.
    gff_in_gen = call_missing.report_uncovered(gff_in_gen,
                                               args['transcripts'],
                                               args['genetests'],
                                               output_file=args['miss_out'],
                                               progresstracker=progtrack)

    if options and options.metadata_only:
        # Drain the generator chain without producing annotation output.
        for line in gff_in_gen:
            pass
    else:
        # Find reference allele.
        gff_in_gen = gff_twobit_query.match2ref(gff_in_gen, args['reference'])
        # Look up dbSNP IDs
        gff_in_gen = gff_dbsnp_query.match2dbSNP(gff_in_gen, args['dbsnp'])
        # Check for nonsynonymous SNP
        gff_in_gen = gff_nonsynonymous_filter.predict_nonsynonymous(
            gff_in_gen, args['reference'], args['transcripts'])
        # Pull off GET-Evidence hits
        gff_in_gen = gff_getevidence_map.match_getev(
            gff_in_gen, args['getev_flat'],
            transcripts_file=args['transcripts'],
            gene_out_file=args['getev_genes_out'] + ".tmp",
            output_file=args['getev_out'] + ".tmp",
            progresstracker=progtrack,
            genetests_filepath=config['GENETESTS_DATA'],
            blosum100_file=config['BLOSUM100'])

        # Printing to output, pulls data through the generator chain.
        ns_out = autozip.file_open(args['nonsyn_out_tmp'], 'w')
        for line in gff_in_gen:
            ns_out.write(line + "\n")
        ns_out.close()

        # Move finished outputs into place. An argument list (no shell)
        # keeps paths containing spaces or shell metacharacters intact;
        # like the previous shell "mv", a failed move is not fatal.
        for tmp_path, final_path in (
                (args['getev_out'] + ".tmp", args['getev_out']),
                (args['nonsyn_out_tmp'], args['nonsyn_out']),
                (args['getev_genes_out'] + ".tmp", args['getev_genes_out'])):
            subprocess.call(['mv', tmp_path, final_path])

    # Print metadata
    metadata_f_out = open(args['metadata_out'], 'w')
    progtrack.write_metadata(metadata_f_out)
    metadata_f_out.close()

    log.put('#status 100 finished')
    os.rename(lockfile, logfile)
    log_handle.close()
    print("Finished processing file " + str(genotype_file))
def open_genome_file(self, genome_id):
    """Open the data file for genome_id and store the handle in self.f_in.

    The path is GENOMEFILE_PRE + genome_id + GENOMEDATA_POST.
    """
    self.f_in = autozip.file_open(
        GENOMEFILE_PRE + genome_id + GENOMEDATA_POST)
def detect_format(file_input):
    """Detect the genetic data format of a file.

    Takes a path to a file, or a string generator (e.g. a filehandle).
    Tries to match one of the following:
      23ANDME: 23andme (microarray genotyping)
      CGIVAR: Complete Genomics var file
      deCODEme: deCODEme (microarray genotyping)
      GFF: General Feature Format
      VCF: Variant Call Format (only tested for 23andme exome data)
      ANCESTRY: Ancestry (genotyping data)
      FTDNA: Family Tree DNA (genotyping data)

    Reading stops after the first line that identifies a format, or once
    MAX_LINES_CHECKED lines have been read. Returns the format name, or
    'UNKNOWN' if nothing matched. A file opened here from a path is closed
    before returning; a caller-supplied handle is left open.
    """
    looks_like = dict()
    if isinstance(file_input, str):
        try:
            f_in = autozip.file_open(file_input, 'r')
        except AssertionError:
            # Retry, naming the archive member to read.
            f_in = autozip.file_open(file_input, 'r', 'deCODEme_scan.csv')
            if VERBOSE:
                print("deCODEme archive (deCODEme) detected")
            looks_like['deCODEme'] = True
    else:
        f_in = file_input

    line_count = 0
    for line in f_in:
        line_count += 1
        if any([looks_like[x] for x in looks_like.keys()]):
            break
        if line_count > MAX_LINES_CHECKED:
            break

        # Check comment lines, if they exist, for information on file type.
        if re.match('#', line):
            if re.match(r'#TYPE.*VAR-ANNOTATION', line):
                if VERBOSE:
                    print("Complete Genomics var file format (CGIVAR) "
                          "detected")
                looks_like['CGIVAR'] = True
            if re.match(r'##gff-version', line):
                if VERBOSE:
                    print("General Feature Format (GFF) detected")
                looks_like['GFF'] = True
            if re.match(r'# This data file generated by 23andMe', line):
                if VERBOSE:
                    print("23andme microarray genotyping data (23ANDME) "
                          "detected")
                looks_like['23ANDME'] = True
            if re.match(r'##fileformat=VCFv4', line):
                if VERBOSE:
                    print("Variant Call Format (VCF) detected")
                looks_like['VCF'] = True
            if re.match(r'#\s*AncestryDNA', line):
                if VERBOSE:
                    print("Ancestry genotyping data (ANCESTRY) detected")
                # Record the match like every other header check does;
                # previously this branch only printed.
                looks_like['ANCESTRY'] = True

        # Look at other lines and decide based on their format.
        tsv_data = line.split('\t')
        csv_data = list(csv.reader([line]))[0]
        if (len(csv_data) > 5 and
                re.match(r'rs', csv_data[0]) and
                re.match(r'[ACGT]', csv_data[1]) and
                re.match(r'[0-9]', csv_data[3]) and
                re.match(r'[+-]', csv_data[4]) and
                re.match(r'[ACGT]', csv_data[5])):
            if VERBOSE:
                print("deCODEme microarray genotyping data (deCODEme) "
                      "guessed")
            looks_like['deCODEme'] = True
        if (len(csv_data) > 3 and
                re.match(r'rs', csv_data[0]) and
                re.match(r'[0-9]', csv_data[2]) and
                re.match(r'[ACGT]', csv_data[3])):
            if VERBOSE:
                print("Family Tree DNA genotyping data (FTDNA) guessed")
            looks_like['FTDNA'] = True
        if (len(tsv_data) > 3 and
                re.match(r'rs', tsv_data[0]) and
                re.match(r'[0-9]', tsv_data[2]) and
                re.match(r'[ACGT][ACGT]', tsv_data[3])):
            if VERBOSE:
                print("23andme microarray genotyping data (23ANDME) guessed")
            looks_like['23ANDME'] = True
        if (len(tsv_data) > 4 and
                re.match(r'rs', tsv_data[0]) and
                re.match(r'[0-9]', tsv_data[2]) and
                re.match(r'[ACGT0-9]', tsv_data[3]) and
                re.match(r'[ACGT0-9]', tsv_data[4])):
            if VERBOSE:
                print("Ancestry genotyping data (ANCESTRY) guessed")
            looks_like['ANCESTRY'] = True
        if (len(tsv_data) > 6 and
                re.match(r'chr', tsv_data[3]) and
                re.match(r'[0-9]', tsv_data[4]) and
                re.match(r'[0-9]', tsv_data[5]) and
                (tsv_data[6] == "no-call" or tsv_data[6] == "ref")):
            if VERBOSE:
                print("Complete Genomics var file format (CGIvar) guessed")
            looks_like['CGIVAR'] = True
        if (len(tsv_data) > 6 and
                re.match(r'[0-9]', tsv_data[3]) and
                re.match(r'[0-9]', tsv_data[4]) and
                tsv_data[6] == "+"):
            if VERBOSE:
                print("General Feature Format (GFF) guessed")
            looks_like['GFF'] = True
        if (len(tsv_data) > 7 and
                re.match(r'[0-9]', tsv_data[1]) and
                re.match(r'[ACGT]', tsv_data[3]) and
                re.match(r'[ACGT]', tsv_data[4]) and
                len(tsv_data[7].split(';')) > 2):
            if VERBOSE:
                print("Variant Call Format (VCF) guessed")
            looks_like['VCF'] = True

    if isinstance(file_input, str):
        f_in.close()

    if any([looks_like[x] for x in looks_like.keys()]):
        return [x for x in looks_like.keys() if looks_like[x]][0]
    else:
        return 'UNKNOWN'
def detect_format(file_input):
    """Detect the genetic data format of a file.

    Takes a path to a file, or a string generator (e.g. a filehandle).
    Tries to match one of the following:
        23ANDME: 23andme (microarray genotyping)
        CGIVAR: Complete Genomics var file
        deCODEme: deCODEme (microarray genotyping)
        FTDNA: Family Tree DNA (genotyping)
        GFF: General Feature Format
        VCF: Variant Call Format (only tested for 23andme exome data)

    Header/comment lines are checked for explicit format markers, and every
    line (comment or not) is also run through per-format column heuristics.
    Scanning stops once a format has been flagged or once more than
    MAX_LINES_CHECKED lines have been read.

    Returns the name of the first format flagged, or "UNKNOWN" if none was.
    A file opened here from a path is always closed, even if scanning raises;
    a generator/filehandle passed in by the caller is left open.
    """
    # Ordered record of flagged formats: the first entry is the answer, so the
    # result never depends on dictionary iteration order.
    detected = []

    def flag(format_name, message):
        """Report a detection and remember the format (first hit wins)."""
        print(message)
        if format_name not in detected:
            detected.append(format_name)

    opened_here = isinstance(file_input, str)
    if opened_here:
        try:
            f_in = autozip.file_open(file_input, "r")
        except AssertionError:
            # NOTE(review): presumably autozip asserts when an archive needs an
            # explicit member name; confirm against autozip.file_open.
            f_in = autozip.file_open(file_input, "r", "deCODEme_scan.csv")
            flag("deCODEme", "deCODEme archive (deCODEme) detected")
    else:
        f_in = file_input

    try:
        for line_count, line in enumerate(f_in, 1):
            # The checks run at the top of the loop, so one extra line is read
            # after a detection before stopping (same as the original).
            if detected:
                break
            if line_count > MAX_LINES_CHECKED:
                break

            # Check comment lines, if they exist, for information on file type.
            if line.startswith("#"):
                if re.match(r"#TYPE.*VAR-ANNOTATION", line):
                    flag("CGIVAR",
                         "Complete Genomics var file format (CGIVAR) detected")
                if line.startswith("##gff-version"):
                    flag("GFF", "General Feature Format (GFF) detected")
                if line.startswith("# This data file generated by 23andMe"):
                    flag("23ANDME",
                         "23andme microarray genotyping data (23ANDME) detected")
                if line.startswith("##fileformat=VCFv4"):
                    flag("VCF", "Variant Call Format (VCF) detected")

            # Look at other lines and decide based on their format.
            tsv_data = line.split("\t")
            csv_data = list(csv.reader([line]))[0]

            if (
                len(csv_data) > 5
                and csv_data[0].startswith("rs")
                and re.match(r"[ACGT]", csv_data[1])
                and re.match(r"[0-9]", csv_data[3])
                and re.match(r"[+-]", csv_data[4])
                and re.match(r"[ACGT]", csv_data[5])
            ):
                flag("deCODEme",
                     "deCODEme microarray genotyping data (deCODEme) guessed")

            if (
                len(csv_data) > 3
                and csv_data[0].startswith("rs")
                and re.match(r"[0-9]", csv_data[2])
                and re.match(r"[ACGT]", csv_data[3])
            ):
                flag("FTDNA", "Family Tree DNA genotyping data (FTDNA) guessed")

            if (
                len(tsv_data) > 3
                and tsv_data[0].startswith("rs")
                and re.match(r"[0-9]", tsv_data[2])
                and re.match(r"[ACGT][ACGT]", tsv_data[3])
            ):
                flag("23ANDME",
                     "23andme microarray genotyping data (23ANDME) guessed")

            if (
                len(tsv_data) > 6
                and tsv_data[3].startswith("chr")
                and re.match(r"[0-9]", tsv_data[4])
                and re.match(r"[0-9]", tsv_data[5])
                and tsv_data[6] in ("no-call", "ref")
            ):
                flag("CGIVAR",
                     "Complete Genomics var file format (CGIvar) guessed")

            if (
                len(tsv_data) > 6
                and re.match(r"[0-9]", tsv_data[3])
                and re.match(r"[0-9]", tsv_data[4])
                and tsv_data[6] == "+"
            ):
                flag("GFF", "General Feature Format (GFF) guessed")

            if (
                len(tsv_data) > 7
                and re.match(r"[0-9]", tsv_data[1])
                and re.match(r"[ACGT]", tsv_data[3])
                and re.match(r"[ACGT]", tsv_data[4])
                and len(tsv_data[7].split(";")) > 2
            ):
                flag("VCF", "Variant Call Format (VCF) guessed")
    finally:
        # Only close what this function opened; never the caller's handle.
        if opened_here:
            f_in.close()

    if detected:
        return detected[0]
    return "UNKNOWN"