def main():
    """Print every GFF record with a ``ref_allele`` attribute added.

    Command-line options and positional arguments are parsed from the
    module docstring via ``doc_optparse``.  Two positional arguments are
    required: a twobit reference and a GFF file.  They are tried in the
    order (twobit, gff) first and swapped if opening them that way fails.

    For each record the reference sequence covering ``record.start`` to
    ``record.end`` (1-based, inclusive) is looked up, upper-cased, stored
    in ``record.attributes["ref_allele"]`` and the record is printed.

    When ``option.diff`` is set, records that already carry a
    ``ref_allele`` equal to the looked-up sequence (ignoring surrounding
    double quotes) are skipped instead of printed.
    """
    # parse options
    option, args = doc_optparse.parse(__doc__)

    if len(args) < 2:
        doc_optparse.exit()

    # try opening the file both ways, in case the arguments got confused
    try:
        gff_file = gff.input(args[1])
        twobit_file = twobit.input(args[0])
    except Exception:
        gff_file = gff.input(args[0])
        twobit_file = twobit.input(args[1])

    for record in gff_file:
        # The reference is keyed by "chr"-prefixed names; add the prefix
        # when the GFF omits it.
        if record.seqname.startswith("chr"):
            chrom = record.seqname
        else:
            chrom = "chr" + record.seqname

        # record.start is 1-based and inclusive, the slice is 0-based.
        ref_allele = twobit_file[chrom][(record.start - 1):record.end].upper()

        # In diff mode, suppress records whose stored ref_allele already
        # agrees with the reference.
        if (option.diff
                and "ref_allele" in record.attributes
                and record.attributes["ref_allele"].strip("\"") == ref_allele):
            continue

        record.attributes["ref_allele"] = ref_allele
        print(record)
def match2ref(gff_input, twobit_filename):
    """Yield GFF-formatted lines annotated with the reference allele.

    Parameters:
        gff_input: either a string or another object accepted by
            ``gff.input``.  A string ending in ".gz" is opened with
            ``gzip.open`` first; anything else is handed to ``gff.input``
            unchanged.
        twobit_filename: passed to ``twobit.input`` to open the reference.

    Yields:
        Four header lines (gff-version, genome-build, producer, date) once
        the first record has been read, then ``str(record)`` for every
        record.  Records whose feature is already "REF" pass through
        untouched.  For other records that have attributes, the record is
        either converted to a "REF" record with its attributes dropped
        (when its "alleles" attribute equals the reference sequence) or
        given a "ref_allele" attribute.

    Raises:
        SystemExit: with status 1, after writing a message to stderr, when
            the requested location yields an empty reference sequence.
    """
    # Iff gff_input is a string ending with ".gz", assume gzip compressed
    if isinstance(gff_input, str) and re.match(r".*\.gz$", gff_input):
        gff_file = gff.input(gzip.open(gff_input))
    else:
        # GFF will interpret if gff_input is string containing path
        # to a GFF-formatted text file, or a string generator
        # (e.g. file object) with GFF-formatted strings
        gff_file = gff.input(gff_input)

    twobit_file = twobit.input(twobit_filename)

    header_done = False

    # Process input data to get ref allele
    for record in gff_file:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_file.data[0]
            yield "##genome-build " + gff_file.data[1]
            yield "# Produced by: gff_twobit_query.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            header_done = True

        # Skip REF lines
        if record.feature == "REF":
            yield str(record)
            continue

        # Add "chr" to chromosome ID if missing
        if record.seqname.startswith("chr"):
            chrom = record.seqname
        else:
            chrom = "chr" + record.seqname

        ref_seq = "-"  # represents variant with length zero
        if (record.end - (record.start - 1)) > 0:
            ref_seq = twobit_file[chrom][(record.start - 1):record.end]

        if ref_seq == '':
            sys.stderr.write(
                "ERROR: this location does not exist in the reference "
                "genome. Start: %d, end: %d. Perhaps the input is aligned "
                "against a different reference genome?\n"
                % (record.start, record.end))
            # Non-zero status so callers and shell pipelines see the failure.
            sys.exit(1)

        if record.attributes:
            ref_allele = ref_seq.upper()
            # If reference at this pos, note this and remove attributes data.
            if ("alleles" in record.attributes
                    and record.attributes["alleles"] == ref_allele):
                record.feature = "REF"
                record.attributes = None
            else:
                record.attributes["ref_allele"] = ref_allele

        yield str(record)
def match2ref(gff_input, twobit_filename):
    """Yield GFF-formatted lines annotated with the reference allele.

    Parameters:
        gff_input: either a string or another object accepted by
            ``gff.input``.  A string ending in ".gz" is opened with
            ``gzip.open`` first; anything else is handed to ``gff.input``
            unchanged.
        twobit_filename: passed to ``twobit.input`` to open the reference.

    Yields:
        Four header lines (gff-version, genome-build, producer, date) once
        the first record has been read, then ``str(record)`` for every
        record.  Records whose feature is already "REF" pass through
        untouched.  For other records that have attributes, the record is
        either converted to a "REF" record with its attributes dropped
        (when its "alleles" attribute equals the reference sequence) or
        given a "ref_allele" attribute.

    Raises:
        SystemExit: with status 1, after writing a message to stderr, when
            the requested location yields an empty reference sequence.
    """
    # Iff gff_input is a string ending with ".gz", assume gzip compressed
    if isinstance(gff_input, str) and re.match(r".*\.gz$", gff_input):
        gff_file = gff.input(gzip.open(gff_input))
    else:
        # GFF will interpret if gff_input is string containing path
        # to a GFF-formatted text file, or a string generator
        # (e.g. file object) with GFF-formatted strings
        gff_file = gff.input(gff_input)

    twobit_file = twobit.input(twobit_filename)

    header_done = False

    # Process input data to get ref allele
    for record in gff_file:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_file.data[0]
            yield "##genome-build " + gff_file.data[1]
            yield "# Produced by: gff_twobit_query.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            header_done = True

        # Skip REF lines
        if record.feature == "REF":
            yield str(record)
            continue

        # Add "chr" to chromosome ID if missing
        if record.seqname.startswith("chr"):
            chrom = record.seqname
        else:
            chrom = "chr" + record.seqname

        ref_seq = "-"  # represents variant with length zero
        if (record.end - (record.start - 1)) > 0:
            ref_seq = twobit_file[chrom][(record.start - 1):record.end]

        if ref_seq == '':
            sys.stderr.write(
                "ERROR: this location does not exist in the reference "
                "genome. Start: %d, end: %d. Perhaps the input is aligned "
                "against a different reference genome?\n"
                % (record.start, record.end))
            # Non-zero status so callers and shell pipelines see the failure.
            sys.exit(1)

        if record.attributes:
            ref_allele = ref_seq.upper()
            # If reference at this pos, note this and remove attributes data.
            if ("alleles" in record.attributes
                    and record.attributes["alleles"] == ref_allele):
                record.feature = "REF"
                record.attributes = None
            else:
                record.attributes["ref_allele"] = ref_allele

        yield str(record)
# return if we don't have the correct arguments if len(sys.argv) < 3: raise SystemExit(__doc__.replace("%prog", sys.argv[0])) # first, try to connect to the databases try: connection = MySQLdb.connect(host=DB_HOST, user=DB_READ_USER, passwd=DB_READ_PASSWD, db=DB_READ_DATABASE) cursor = connection.cursor() except MySQLdb.OperationalError, message: print "Error %d while connecting to database: %s" % (message[0], message[1]) sys.exit() # try opening the file both ways, in case the arguments got confused try: gff_file = gff.input(sys.argv[2]) twobit_file = twobit.input(sys.argv[1]) except Exception: gff_file = gff.input(sys.argv[1]) twobit_file = twobit.input(sys.argv[2]) for record in gff_file: if record.seqname.startswith("chr"): chr = record.seqname else: chr = "chr" + record.seqname # recall that record.start is 1-based, but the database is not cursor.execute(query, (chr, record.start - 1, record.end - 1)) data = cursor.fetchall() # go away if we have a non-coding sequence
def get_allele_freqs(password, getev_file, excluded=None, chromfile=None,
                     outputfile=None):
    """Walk a set of genomes in parallel and report allele frequencies.

    Parameters:
        password: forwarded to ``get_genome_list``.
        getev_file: forwarded to ``load_getev`` to load GET-Evidence
            variants.
        excluded: forwarded to ``get_genome_list``.
        chromfile: if given, read with ``read_single_items`` and passed to
            ``GenomeSet`` as ``chroms``; otherwise ``chroms`` is None.
        outputfile: if given, opened for writing with
            ``autozip.file_open``; each frequency line is written there
            and progress messages go to stdout.  If omitted, frequency
            lines are printed to stdout and progress messages are
            suppressed.

    The output file, when one was opened, is closed on exit even if an
    error interrupts processing.
    """
    # Set up output, genome inputs, GET-Evidence variants, and twobit
    # reference.
    f_out = None
    if outputfile:
        print("Setting up output file")
        f_out = autozip.file_open(outputfile, 'w')

    def progress(message):
        # stdout carries the results when there is no output file, so
        # progress chatter is only printed when results go to a file.
        if f_out:
            print(message)

    try:
        genome_ids = get_genome_list(password, excluded)

        if chromfile:
            progress("Getting chromosomes...")
            chroms = read_single_items(chromfile)
        else:
            chroms = None

        progress("Reading GET-Ev flat file (takes a couple minutes)...")
        getev_variants = load_getev(getev_file)

        progress("Loading twobit genome...")
        twobit_genome = twobit.input(TWOBIT_PATH)

        if f_out:
            print("Setting up GenomeSet (may be slow if each genome has to advance " +
                  "to target chromosomes)...")
            genome_set = GenomeSet(genome_ids, chroms=chroms,
                                   getev_vars=getev_variants, verbose=True)
        else:
            genome_set = GenomeSet(genome_ids, chroms=chroms,
                                   getev_vars=getev_variants)

        progress("Find earliest ends")
        earliest_ends = genome_set.earliest_ends()

        # Move through the genomes to find allele frequencies
        while genome_set.genomes:
            # Move ahead of all "earliest ends" & save new earliest.
            next_earliest = genome_set.advance_all_past_end_pos(earliest_ends[0])

            # Check all old "earliest ends" positions for interesting
            # variants: non-reference positions, at least one of which
            # carries an 'amino_acid' or 'getev_id' entry.
            has_var = [pos for pos in earliest_ends if not pos['ref']]
            is_interesting = any('amino_acid' in pos or 'getev_id' in pos
                                 for pos in has_var)

            # If there are interesting variants, calculate allele frequency.
            # If another genome has an overlapping variant extending
            # beyond this position, we're not ready to evaluate this yet
            # (it will be caught when the later overlapping one comes up).
            if has_var and is_interesting and genome_set.no_later_var(has_var):
                freqout = genome_set.eval_var_freq(has_var, twobit_genome)
                if f_out:
                    f_out.write(freqout + '\n')
                else:
                    print(freqout)

            genome_set.clean_out_prior_pos(earliest_ends)

            # Reset "earliest end" to next earliest positions.
            earliest_ends = next_earliest
    finally:
        # Make sure buffered (possibly compressed) output reaches disk.
        if f_out is not None:
            f_out.close()
# parse options option, args = doc_optparse.parse(__doc__) if len(args) < 1: doc_optparse.exit() # first, try to connect to the database try: connection = MySQLdb.connect(host=DB_HOST, user=DB_READ_USER, passwd=DB_READ_PASSWD, db=DB_READ_DATABASE) cursor = connection.cursor() except MySQLdb.OperationalError, message: print "Error %d while connecting to database: %s" % (message[0], message[1]) sys.exit() if option.reference: twobit_file = twobit.input(option.reference) for line in fileinput.input(args[0]): l = line.strip().split('\t') if len(l) < 5: print >> sys.stderr, l # input lines are in the form: # chromosome, position, rs, genotype, phenotype, pubmed (optional) if l[0].startswith("chr") or l[0] == "None": chr = l[0] else: chr = "chr" + l[0] try: pos = int(l[1]) except ValueError:
def predict_nonsynonymous(gff_input, twobit_path, transcript_path,
                          progresstracker=False):
    """Yield GFF lines, tagging records inferred to be nonsynonymous.

    Parameters:
        gff_input: a string or other object accepted by ``gff.input``.  A
            string ending in ".gz" is opened with ``gzip.open`` first.
        twobit_path: passed to ``twobit.input``; the result is handed to
            ``infer_function`` for each candidate transcript.
        transcript_path: passed to ``transcript_file``; the result is
            queried with ``cover_next_position`` for each record.
        progresstracker: optional object; when truthy its
            ``saw(chromosome)`` method is called for every non-REF record.

    Yields:
        Three header lines (genome-build, producer, date) once the first
        record has been read, then ``str(record)`` for every record.
        Records with feature "REF", or with no covering transcripts, pass
        through unchanged.  When at least one transcript yields a
        "nonsynonymous coding" inference, the record gains an
        "amino_acid" attribute ("/"-joined, de-duplicated, sorted
        case-insensitively) and a "ucsc_trans" attribute (","-joined).
    """
    twobit_file = twobit.input(twobit_path)
    transcript_input = transcript_file(transcript_path)

    # Set up gff_data
    if isinstance(gff_input, str) and re.match(r".*\.gz$", gff_input):
        gff_data = gff.input(gzip.open(gff_input))
    else:
        # GFF will interpret if gff_input is string containing path
        # to a GFF-formatted text file, or a string generator
        # (e.g. file object) with GFF-formatted strings
        gff_data = gff.input(gff_input)

    header_done = False

    for record in gff_data:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##genome-build " + gff_data.data[1]
            yield "# Produced by: gff_nonsynonymous_filter.py"
            yield "# Date: " + datetime.datetime.now().isoformat(" ")
            header_done = True

        if record.feature == "REF":
            yield str(record)
            continue

        # Normalise the sequence name to a lower-case "chr" prefix.
        if record.seqname.startswith("chr"):
            chromosome = record.seqname
        elif record.seqname.startswith("Chr"):
            chromosome = "chr" + record.seqname[3:]
        else:
            chromosome = "chr" + record.seqname

        if progresstracker:
            progresstracker.saw(chromosome)

        # record.start is 1-based, but UCSC annotation starts are 0-based,
        # so subtract 1
        record_position = (chromosome, record.start - 1)
        transcripts = transcript_input.cover_next_position(record_position)

        # Skip the rest if no transcripts are returned
        if not transcripts:
            yield str(record)
            continue

        # otherwise, cycle through
        nonsyn_inferences = []
        splice_inferences = []
        ucsc_transcripts = []

        for data in transcripts:
            # need to make "d" match up with transcript file order
            # d : geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds
            #     0,        3,      6,        7,      9,          10
            d = (data[0], data[3], int(data[6]), int(data[7]),
                 data[9], data[10])
            i = infer_function(twobit_file, record, *d)
            if i[0] == "nonsynonymous coding":
                nonsyn_inferences.append("%s %s" % (d[0], i[2]))
                ucsc_transcripts.append(data[1])
            elif i[0] == "splice site":
                # Not going to report splice sites for now, but leaving the
                # collection here because we hope to later.
                # - Madeleine 2010/11/29
                splice_inferences.append("%s %s " % (d[0], i[2]))

        # set the attributes if we can; the record is emitted either way
        if nonsyn_inferences:
            unique_inferences = unique(nonsyn_inferences)
            unique_inferences.sort(key=str.lower)
            record.attributes["amino_acid"] = "/".join(unique_inferences)
            record.attributes["ucsc_trans"] = ",".join(ucsc_transcripts)

        yield str(record)
def get_allele_freqs(password, getev_file, excluded=None, chromfile=None,
                     outputfile=None):
    """Walk a set of genomes in parallel and report allele frequencies.

    Parameters:
        password: forwarded to ``get_genome_list``.
        getev_file: forwarded to ``load_getev`` to load GET-Evidence
            variants.
        excluded: forwarded to ``get_genome_list``.
        chromfile: if given, read with ``read_single_items`` and passed to
            ``GenomeSet`` as ``chroms``; otherwise ``chroms`` is None.
        outputfile: if given, opened for writing with
            ``autozip.file_open``; each frequency line is written there
            and progress messages go to stdout.  If omitted, frequency
            lines are printed to stdout and progress messages are
            suppressed.

    The output file, when one was opened, is closed on exit even if an
    error interrupts processing.
    """
    # Set up output, genome inputs, GET-Evidence variants, and twobit
    # reference.
    f_out = None
    if outputfile:
        print("Setting up output file")
        f_out = autozip.file_open(outputfile, 'w')

    def progress(message):
        # stdout carries the results when there is no output file, so
        # progress chatter is only printed when results go to a file.
        if f_out:
            print(message)

    try:
        genome_ids = get_genome_list(password, excluded)

        if chromfile:
            progress("Getting chromosomes...")
            chroms = read_single_items(chromfile)
        else:
            chroms = None

        progress("Reading GET-Ev flat file (takes a couple minutes)...")
        getev_variants = load_getev(getev_file)

        progress("Loading twobit genome...")
        twobit_genome = twobit.input(TWOBIT_PATH)

        if f_out:
            print("Setting up GenomeSet (may be slow if each genome has to advance " +
                  "to target chromosomes)...")
            genome_set = GenomeSet(genome_ids, chroms=chroms,
                                   getev_vars=getev_variants, verbose=True)
        else:
            genome_set = GenomeSet(genome_ids, chroms=chroms,
                                   getev_vars=getev_variants)

        progress("Find earliest ends")
        earliest_ends = genome_set.earliest_ends()

        # Move through the genomes to find allele frequencies
        while genome_set.genomes:
            # Move ahead of all "earliest ends" & save new earliest.
            next_earliest = genome_set.advance_all_past_end_pos(earliest_ends[0])

            # Check all old "earliest ends" positions for interesting
            # variants: non-reference positions, at least one of which
            # carries an 'amino_acid' or 'getev_id' entry.
            has_var = [pos for pos in earliest_ends if not pos['ref']]
            is_interesting = any('amino_acid' in pos or 'getev_id' in pos
                                 for pos in has_var)

            # If there are interesting variants, calculate allele frequency.
            # If another genome has an overlapping variant extending
            # beyond this position, we're not ready to evaluate this yet
            # (it will be caught when the later overlapping one comes up).
            if has_var and is_interesting and genome_set.no_later_var(has_var):
                freqout = genome_set.eval_var_freq(has_var, twobit_genome)
                if f_out:
                    f_out.write(freqout + '\n')
                else:
                    print(freqout)

            genome_set.clean_out_prior_pos(earliest_ends)

            # Reset "earliest end" to next earliest positions.
            earliest_ends = next_earliest
    finally:
        # Make sure buffered (possibly compressed) output reaches disk.
        if f_out is not None:
            f_out.close()
def predict_nonsynonymous(gff_input, twobit_path, transcript_path,
                          progresstracker=False):
    """Yield GFF lines, tagging records inferred to be nonsynonymous.

    Parameters:
        gff_input: a string or other object accepted by ``gff.input``.  A
            string ending in ".gz" is opened with ``gzip.open`` first.
        twobit_path: passed to ``twobit.input``; the result is handed to
            ``infer_function`` for each candidate transcript.
        transcript_path: passed to ``transcript_file``; the result is
            queried with ``cover_next_position`` for each record.
        progresstracker: optional object; when truthy its
            ``saw(chromosome)`` method is called for every non-REF record.

    Yields:
        Three header lines (genome-build, producer, date) once the first
        record has been read, then ``str(record)`` for every record.
        Records with feature "REF", or with no covering transcripts, pass
        through unchanged.  When at least one transcript yields a
        "nonsynonymous coding" inference, the record gains an
        "amino_acid" attribute ("/"-joined, de-duplicated, sorted
        case-insensitively) and a "ucsc_trans" attribute (","-joined).
    """
    twobit_file = twobit.input(twobit_path)
    transcript_input = transcript_file(transcript_path)

    # Set up gff_data
    if isinstance(gff_input, str) and re.match(r".*\.gz$", gff_input):
        gff_data = gff.input(gzip.open(gff_input))
    else:
        # GFF will interpret if gff_input is string containing path
        # to a GFF-formatted text file, or a string generator
        # (e.g. file object) with GFF-formatted strings
        gff_data = gff.input(gff_input)

    header_done = False

    for record in gff_data:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##genome-build " + gff_data.data[1]
            yield "# Produced by: gff_nonsynonymous_filter.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            header_done = True

        if record.feature == "REF":
            yield str(record)
            continue

        # Normalise the sequence name to a lower-case "chr" prefix.
        if record.seqname.startswith("chr"):
            chromosome = record.seqname
        elif record.seqname.startswith("Chr"):
            chromosome = "chr" + record.seqname[3:]
        else:
            chromosome = "chr" + record.seqname

        if progresstracker:
            progresstracker.saw(chromosome)

        # record.start is 1-based, but UCSC annotation starts are 0-based,
        # so subtract 1
        record_position = (chromosome, record.start - 1)
        transcripts = transcript_input.cover_next_position(record_position)

        # Skip the rest if no transcripts are returned
        if not transcripts:
            yield str(record)
            continue

        # otherwise, cycle through
        nonsyn_inferences = []
        splice_inferences = []
        ucsc_transcripts = []

        for data in transcripts:
            # need to make "d" match up with transcript file order
            # d : geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds
            #     0,        3,      6,        7,      9,          10
            d = (data[0], data[3], int(data[6]), int(data[7]),
                 data[9], data[10])
            i = infer_function(twobit_file, record, *d)
            if i[0] == "nonsynonymous coding":
                nonsyn_inferences.append("%s %s" % (d[0], i[2]))
                ucsc_transcripts.append(data[1])
            elif i[0] == "splice site":
                # Not going to report splice sites for now, but leaving the
                # collection here because we hope to later.
                # - Madeleine 2010/11/29
                splice_inferences.append("%s %s " % (d[0], i[2]))

        # set the attributes if we can; the record is emitted either way
        if nonsyn_inferences:
            unique_inferences = unique(nonsyn_inferences)
            unique_inferences.sort(key=str.lower)
            record.attributes["amino_acid"] = "/".join(unique_inferences)
            record.attributes["ucsc_trans"] = ",".join(ucsc_transcripts)

        yield str(record)
def main():
    """Print a FASTA record with the reference sequence for each GFF record.

    Command-line options and positional arguments are parsed from the
    module docstring via ``doc_optparse``.  Two positional arguments are
    required: a twobit reference and a GFF file.  They are tried in the
    order (twobit, gff) first and swapped if opening them that way fails.

    Options read from the parsed ``option`` object:
        flank: number of flanking bases to fetch on each side (default 0).
            When non-zero, the output sequence is the left flank, the
            reference sequence and the right flank separated by blank
            lines.
        strand: when set, sequences for records on the "-" strand are
            passed through ``reverse_complement``.
        unique: when set, consecutive equal records are collapsed into a
            single output whose "repetition_count" attribute records how
            many were seen.

    Each FASTA header is the GFF record's string form with tabs replaced
    by "|".
    """
    # parse options
    option, args = doc_optparse.parse(__doc__)

    if len(args) < 2:
        doc_optparse.exit()

    flank = int(option.flank or 0)

    # try opening the file both ways, in case the arguments got confused
    try:
        gff_file = gff.input(args[1])
        twobit_file = twobit.input(args[0])
    except Exception:
        gff_file = gff.input(args[0])
        twobit_file = twobit.input(args[1])

    def emit(rec, seq):
        # One FASTA entry; header is the GFF line with "|" for tabs.
        print(FastaRecord(str(rec).replace("\t", "|"), seq))

    def emit_collapsed(rec, seq, count):
        # Output a de-duplicated record together with its repeat count.
        rec.attributes["repetition_count"] = str(count)
        emit(rec, seq)

    # state used to keep track of uniqueness (only consulted with --unique)
    previous_record = None
    previous_ref_seq = None
    repetition_count = 1

    for record in gff_file:
        # if we're using the unique option, output the previous record only
        # when we're sure we've seen all repetitions of it
        if option.unique:
            if record == previous_record:
                repetition_count += 1
                continue
            if previous_record is not None:
                emit_collapsed(previous_record, previous_ref_seq,
                               repetition_count)
            repetition_count = 1
            previous_record = record

        if record.seqname.startswith("chr"):
            chrom = record.seqname
        else:
            chrom = "chr" + record.seqname

        ref_seq = twobit_file[chrom][(record.start - 1):record.end]

        if flank != 0:
            # calculate the flanks (these variables are 0-based)
            left_flank_start = max(record.start - flank - 1, 0)
            left_flank_end = record.start - 1
            right_flank_start = record.end
            right_flank_end = record.end + flank

            # now find them
            left_flank_seq = twobit_file[chrom][left_flank_start:left_flank_end]
            right_flank_seq = twobit_file[chrom][right_flank_start:right_flank_end]
            ref_seq = left_flank_seq + "\n\n" + ref_seq + "\n\n" + right_flank_seq

        if option.strand and record.strand == "-":
            ref_seq = reverse_complement(ref_seq)

        # we don't output the current record if we're using the unique option
        if option.unique:
            previous_ref_seq = ref_seq
        else:
            emit(record, ref_seq)

    # we'll have one last record yet to output if we used the unique option;
    # with empty input there is nothing pending, so skip the flush
    if option.unique and previous_record is not None:
        emit_collapsed(previous_record, previous_ref_seq, repetition_count)