def header_data(gff_in, metadata=None, check_ref=0):
    """Read GFF header data from file, store or return metadata

    Arguments:
    gff_in: GFF input passed to gff.input(); a str ending in ".gz" is opened
        with gzip first
    metadata: dict to store results in (a new dict is created if omitted)
    check_ref: int, maximum number of leading records to check for "REF"

    Optionally also checks the first N lines for records where the type is
    "REF" (third column). (Our genome processing treats these as regions where
    the genotype is "called" as matching the reference genome.)

    Returns the metadata dict, with 'gff-format' and 'build' set from the
    first two header values and, if check_ref > 0, 'has_ref' set to a bool.
    Raises StopIteration if the input contains no records.
    """
    # A dict() default is created once and shared by every call, so results
    # from one file would leak into the next; build a fresh dict instead.
    if metadata is None:
        metadata = {}

    # Set up GFF data
    if isinstance(gff_in, str) and re.search(r'\.gz$', gff_in):
        gff_data = gff.input(gzip.open(gff_in))
    else:
        gff_data = gff.input(gff_in)

    # Pull record to force GFFFile to read through header, then store metadata.
    record = next(gff_data)
    metadata['gff-format'], metadata['build'] = gff_data.data[0:2]

    # Check for REF lines if we asked to do this. False unless we see some.
    if check_ref > 0:
        metadata['has_ref'] = False
        for _ in range(check_ref):
            if record.feature == "REF":
                metadata['has_ref'] = True
                break
            # Only advancing the iterator can run out of records.
            try:
                record = next(gff_data)
            except StopIteration:
                break

    return metadata
Exemplo n.º 2
0
def header_data(gff_in, metadata=None, check_ref=0):
    """Read GFF header data from file, store or return metadata

    Optionally also checks the first N lines for records where the type is
    "REF" (third column). (Our genome processing treats these as regions where
    the genotype is "called" as matching the reference genome.)

    Arguments:
    gff_in: input for gff.input(); if it is a str ending in ".gz" it is
        wrapped with gzip.open() first
    metadata: optional dict that receives the results; when omitted a new
        dict is created for this call
    check_ref: int, how many leading records to inspect for "REF" (0 = none)

    Returns the metadata dict. 'gff-format' and 'build' are always set;
    'has_ref' is set only when check_ref > 0. StopIteration propagates if the
    input has no records at all.
    """
    # Never use a dict as the default value: it would be shared between
    # calls and silently carry metadata from one file to the next.
    if metadata is None:
        metadata = {}

    # Set up GFF data
    is_gzipped = isinstance(gff_in, str) and re.search(r'\.gz$', gff_in)
    if is_gzipped:
        gff_data = gff.input(gzip.open(gff_in))
    else:
        gff_data = gff.input(gff_in)

    # Pull record to force GFFFile to read through header, then store metadata.
    record = next(gff_data)
    metadata['gff-format'], metadata['build'] = gff_data.data[0:2]

    # Check for REF lines if we asked to do this. False unless we see some.
    if check_ref > 0:
        metadata['has_ref'] = False
        for _ in range(check_ref):
            if record.feature == "REF":
                metadata['has_ref'] = True
                break
            # Running out of records just ends the scan early.
            try:
                record = next(gff_data)
            except StopIteration:
                break

    return metadata
Exemplo n.º 3
0
def main():
    """Print GFF records annotated with their reference allele.

    Takes two positional arguments, a 2bit file and a GFF file. For every
    GFF record the reference sequence for its span is looked up in the 2bit
    file and stored, upper-cased, in the record's "ref_allele" attribute
    before the record is printed. With option.diff set, records whose
    existing "ref_allele" already equals the looked-up sequence are skipped.
    """
    # parse options
    option, args = doc_optparse.parse(__doc__)

    if len(args) < 2:
        doc_optparse.exit()

    # try opening the file both ways, in case the arguments got confused
    try:
        gff_file = gff.input(args[1])
        twobit_file = twobit.input(args[0])
    except Exception:
        gff_file = gff.input(args[0])
        twobit_file = twobit.input(args[1])

    for record in gff_file:
        # Add "chr" to the sequence name if it is missing. (Named `chrom`
        # rather than `chr` so the builtin is not shadowed.)
        if record.seqname.startswith("chr"):
            chrom = record.seqname
        else:
            chrom = "chr" + record.seqname

        # GFF start is 1-based inclusive, the slice is 0-based half-open.
        ref_allele = twobit_file[chrom][(record.start - 1):record.end].upper()

        # In diff mode, only report records whose ref_allele is missing
        # or disagrees with the reference.
        if (option.diff and "ref_allele" in record.attributes
                and record.attributes["ref_allele"].strip("\"") == ref_allele):
            continue

        record.attributes["ref_allele"] = ref_allele
        print(record)
Exemplo n.º 4
0
def main():
    """Print each item produced by intersecting two GFF inputs.

    Expects the two GFF inputs as the first and second command line
    arguments; exits with the usage text when either is missing.
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 3:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

    first_gff = gff.input(sys.argv[1])
    second_gff = gff.input(sys.argv[2])
    for overlap in first_gff.intersect(second_gff):
        print(overlap)
Exemplo n.º 5
0
def match2ref(gff_input, twobit_filename):
    """Yield GFF-formatted strings annotated against a 2bit reference genome

    Required arguments:
    gff_input: GFF input passed to gff.input(); a str ending in ".gz" is
        opened with gzip first
    twobit_filename: passed to twobit.input() to open the reference genome

    Yields four header lines once the first record has been read, then one
    string per record. "REF" records pass through unchanged. For other
    records the reference sequence for the record's span is looked up; a
    record whose "alleles" attribute equals it becomes a "REF" record with
    no attributes, otherwise "ref_allele" is added.

    Exits with a non-zero status if the reference lookup returns an empty
    sequence.
    """
    # Iff gff_input is a string ending with ".gz", assume gzip compressed
    if isinstance(gff_input, str) and re.search(r"\.gz$", gff_input):
        gff_file = gff.input(gzip.open(gff_input))
    else:
        # GFF will interpret if gff_input is string containing path
        # to a GFF-formatted text file, or a string generator
        # (e.g. file object) with GFF-formatted strings
        gff_file = gff.input(gff_input)

    twobit_file = twobit.input(twobit_filename)

    header_done = False

    # Process input data to get ref allele
    for record in gff_file:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_file.data[0]
            yield "##genome-build " + gff_file.data[1]
            yield "# Produced by: gff_twobit_query.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            header_done = True

        # Skip REF lines
        if record.feature == "REF":
            yield str(record)
            continue

        # Add "chr" to chromosome ID if missing
        if record.seqname.startswith("chr"):
            chrom = record.seqname
        else:
            chrom = "chr" + record.seqname

        ref_seq = "-"  # represents variant with length zero
        if (record.end - (record.start - 1)) > 0:
            ref_seq = twobit_file[chrom][(record.start - 1):record.end]
        if ref_seq == '':
            sys.stderr.write(
                "ERROR: this location does not exist in the reference genome. Start: %d, end: %d. Perhaps the input is aligned against a different reference genome?\n"
                % (record.start, record.end))
            # This is a failure, so do not exit with the success status.
            sys.exit(1)

        # NOTE(review): records with empty attributes are not yielded at
        # all -- confirm this is intentional.
        if record.attributes:
            ref_allele = ref_seq.upper()
            # If reference at this pos, note this and remove attributes data.
            if ("alleles" in record.attributes
                    and record.attributes["alleles"] == ref_allele):
                record.feature = "REF"
                record.attributes = None
            else:
                record.attributes["ref_allele"] = ref_allele
            yield str(record)
Exemplo n.º 6
0
def match2dbSNP(gff_input_file, dbsnp_file):
    """Yield GFF-formatted strings with dbSNP cross references added

    Required arguments:
    gff_input_file: GFF input passed to gff.input(); a str ending in ".gz"
        is opened with gzip first
    dbsnp_file: passed to dbSNP() to set up the dbSNP input

    Yields four header lines once the first record has been read, then one
    string per record. "REF" records pass through unchanged. When the dbSNP
    input has an entry at a record's position, "dbsnp:rs<ID>" is appended to
    the record's cross reference attribute ("Dbxref" for version >= 3,
    "db_xref" otherwise) unless that ID is already listed.
    """
    # Set up dbSNP input
    dbSNP_input = dbSNP(dbsnp_file)

    # Create genome_file record generator
    if isinstance(gff_input_file, str) and re.search(r"\.gz$", gff_input_file):
        gff_data = gff.input(gzip.open(gff_input_file))
    else:
        gff_data = gff.input(gff_input_file)

    header_done = False

    for record in gff_data:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_data.data[0]
            yield "##genome-build " + gff_data.data[1]
            yield "# Produced by: gff_dbsnp_query.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            header_done = True

        if record.feature == "REF":
            yield str(record)
            continue

        # chromosome prefix not used by dbSNP, so it is removed if present
        if record.seqname.startswith(("chr", "Chr")):
            chromosome = record.seqname[3:]
        else:
            chromosome = record.seqname

        # position is adjusted to match the zero-start used by dbSNP positions
        record_position = (chromosome, record.start - 1)

        dbSNP_position = dbSNP_input.up_to_position(record_position)
        dbSNP_data = dbSNP_input.data

        if (dbSNP_position and
                dbSNP_input.comp_position(dbSNP_position,
                                          record_position) == 0):
            rs_id = str(dbSNP_data[0])
            dbSNP_datum = "dbsnp:rs%s" % rs_id

            # Only the attribute name differs between GFF versions.
            xref_key = "Dbxref" if record.version >= 3 else "db_xref"
            record_dbxref_data = []
            if xref_key in record.attributes:
                record_dbxref_data = record.attributes[xref_key].split(",")

            # Match the ID as a whole number: a plain substring search would
            # treat rs12 as already present when only rs123 is listed.
            id_pattern = re.compile(r"(?<!\d)" + re.escape(rs_id) + r"(?!\d)")
            already_listed = any(id_pattern.search(datum)
                                 for datum in record_dbxref_data)
            if not already_listed:
                record_dbxref_data.append(dbSNP_datum)
                record.attributes[xref_key] = ",".join(record_dbxref_data)
        yield str(record)
def match2ref(gff_input, twobit_filename):
    """Yield GFF-formatted strings annotated against a 2bit reference genome

    Required arguments:
    gff_input: GFF input for gff.input(); if it is a str ending in ".gz" it
        is wrapped with gzip.open() first
    twobit_filename: handed to twobit.input() to open the reference genome

    The generator yields four header lines after the first record has been
    read, followed by one string per record. Records of type "REF" are
    yielded unchanged. Any other record has the reference sequence for its
    span looked up: if its "alleles" attribute equals that sequence it is
    turned into a "REF" record without attributes, otherwise the sequence is
    stored as "ref_allele".

    Exits with a non-zero status when the lookup returns an empty sequence.
    """
    # Iff gff_input is a string ending with ".gz", assume gzip compressed
    is_gzipped = isinstance(gff_input, str) and re.search(r"\.gz$", gff_input)
    if is_gzipped:
        gff_file = gff.input(gzip.open(gff_input))
    else:
        # GFF will interpret if gff_input is string containing path
        # to a GFF-formatted text file, or a string generator
        # (e.g. file object) with GFF-formatted strings
        gff_file = gff.input(gff_input)

    twobit_file = twobit.input(twobit_filename)

    header_done = False

    # Process input data to get ref allele
    for record in gff_file:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_file.data[0]
            yield "##genome-build " + gff_file.data[1]
            yield "# Produced by: gff_twobit_query.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')
            header_done = True

        # Skip REF lines
        if record.feature == "REF":
            yield str(record)
            continue

        # Add "chr" to chromosome ID if missing (local is not named `chr`
        # so the builtin stays usable)
        chrom = record.seqname
        if not chrom.startswith("chr"):
            chrom = "chr" + chrom

        ref_seq = "-"  # represents variant with length zero
        if (record.end - (record.start - 1)) > 0:
            ref_seq = twobit_file[chrom][(record.start - 1):record.end]
        if ref_seq == '':
            sys.stderr.write("ERROR: this location does not exist in the reference genome. Start: %d, end: %d. Perhaps the input is aligned against a different reference genome?\n" % (record.start, record.end))
            # Report the failure through the exit status as well.
            sys.exit(1)

        # NOTE(review): a record without attributes is dropped from the
        # output here -- confirm this is intentional.
        if record.attributes:
            ref_allele = ref_seq.upper()
            # If reference at this pos, note this and remove attributes data.
            if ("alleles" in record.attributes and
                    record.attributes["alleles"] == ref_allele):
                record.feature = "REF"
                record.attributes = None
            else:
                record.attributes["ref_allele"] = ref_allele
            yield str(record)
Exemplo n.º 8
0
def main():
    """Print the SNP records of a version 3 GFF input as version 2 records.

    Reads the GFF input named by the first command line argument. Non-SNP
    records are skipped. For each SNP the "allele", "ref" and
    "support1"/"support2" attributes are replaced by "alleles",
    "ref_allele" and "counts".
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

    yh_gff = gff.input(sys.argv[1], version=3)

    for record in yh_gff:
        # SNPs only, please
        if record.feature != "SNP":
            continue

        # downgrade
        record.version = 2

        # "A/A" collapses to "A"; anything else is kept as listed
        allele_parts = record.attributes["allele"].split("/")
        is_repeated_pair = (len(allele_parts) == 2
                            and allele_parts[0] == allele_parts[1])
        if is_repeated_pair:
            record.attributes["alleles"] = allele_parts[0]
        else:
            record.attributes["alleles"] = "/".join(allele_parts)
        del record.attributes["allele"]

        # rename "ref" to "ref_allele"
        record.attributes["ref_allele"] = record.attributes["ref"]
        del record.attributes["ref"]

        # one count per allele that is still listed
        if "/" in record.attributes["alleles"]:
            record.attributes["counts"] = "%s/%s" % (
                record.attributes["support1"], record.attributes["support2"])
            del record.attributes["support2"]
        else:
            record.attributes["counts"] = record.attributes["support1"]
        del record.attributes["support1"]

        print(record)
def main():
    """Downgrade SNP records from a version 3 GFF input and print them.

    The GFF input is taken from the first command line argument. Records
    whose feature is not "SNP" are ignored. Each remaining record is set to
    version 2 and its "allele", "ref", "support1" and "support2" attributes
    are rewritten as "alleles", "ref_allele" and "counts".
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

    yh_gff = gff.input(sys.argv[1], version=3)

    for record in yh_gff:
        # SNPs only, please
        if record.feature != "SNP":
            continue

        # downgrade
        record.version = 2

        # standardize the allele list: a pair of equal alleles is written once
        listed = record.attributes["allele"].split("/")
        if len(listed) != 2 or listed[0] != listed[1]:
            record.attributes["alleles"] = "/".join(listed)
        else:
            record.attributes["alleles"] = listed[0]
        del record.attributes["allele"]

        record.attributes["ref_allele"] = record.attributes["ref"]
        del record.attributes["ref"]

        # a single allele gets a single count, otherwise both supports
        single_allele = "/" not in record.attributes["alleles"]
        if single_allele:
            record.attributes["counts"] = record.attributes["support1"]
        else:
            record.attributes["counts"] = "%s/%s" % (
                record.attributes["support1"],
                record.attributes["support2"])
            del record.attributes["support2"]
        del record.attributes["support1"]

        print(record)
Exemplo n.º 10
0
    def __init__(self, f_child, f_parA, f_parB, mend_errs):
        """Initializes class variables, opens input files.

        f_child, f_parA: filenames stored under index 0 and 1
        f_parB: filename stored under index 2, or None to leave index 2 out
        mend_errs: stored unchanged as self.mend_errs

        Every stored filename is opened with autozip.file_open() and wrapped
        with gff.input(); the result is kept in self.gffs under the same
        index.
        """
        self.filenames = {0: f_child, 1: f_parA}
        self.mend_errs = mend_errs
        self.gffs = {0: None, 1: None}
        # Positions are a tuple of chromosome, start, end, and gff record
        self.positions = {0: ('chr1', -1, -1, None), 1: ('chr1', -1, -1, None)}
        # Identity test: only a real None means "no third input".
        if f_parB is not None:
            self.filenames[2] = f_parB
            self.gffs[2] = None
            self.positions[2] = ('chr1', -1, -1, None)

        # Set up input/output files
        for idx, filename in self.filenames.items():
            self.gffs[idx] = gff.input(autozip.file_open(filename, 'r'))
Exemplo n.º 11
0
 def __init__(self, f_child, f_parA, f_parB, mend_errs):
     """Initializes class variables, opens input files.

     f_child, f_parA: filenames kept under index 0 and 1
     f_parB: filename kept under index 2; pass None to omit index 2
     mend_errs: kept unchanged as self.mend_errs

     Each filename is opened through autozip.file_open() and gff.input(),
     and the resulting object is stored in self.gffs under the same index.
     """
     self.filenames = {0 : f_child, 1 : f_parA}
     self.mend_errs = mend_errs
     self.gffs = {0 : None, 1 : None}
     # Positions are a tuple of chromosome, start, end, and gff record
     self.positions = {0 : ('chr1', -1, -1, None),
                       1 : ('chr1', -1, -1, None)}
     # Compare against None by identity rather than with ==.
     if f_parB is not None:
         self.filenames[2] = f_parB
         self.gffs[2] = None
         self.positions[2] = ('chr1', -1, -1, None)

     # Set up input/output files
     for idx, filename in self.filenames.items():
         self.gffs[idx] = gff.input(autozip.file_open(filename, 'r'))
def main():
    """Print the records of a GFF input with standardized allele counts.

    Reads the GFF input named by the first command line argument. Every
    record is relabelled as "SNP", and its "ref_counts"/"oth_counts"
    attributes are replaced by a single "counts" attribute that lines up
    with the "alleles" attribute.
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

    watson_gff = gff.input(sys.argv[1])

    for record in watson_gff:
        # standardize feature name
        record.feature = "SNP"

        # double check alleles and allele counts
        alleles = record.attributes["alleles"]
        ref_allele = record.attributes["ref_allele"]
        ref_counts = int(record.attributes["ref_counts"])
        oth_counts = int(record.attributes["oth_counts"])
        ref_listed_first = alleles.startswith(ref_allele)

        if ref_counts == 0:
            # homozygous for the other allele: drop the reference allele
            # from the list and report only the other allele's count
            alleles = alleles[-1] if ref_listed_first else alleles[0]
            counts = str(oth_counts)
        elif ref_listed_first:
            # reference allele is listed first, so its count goes first
            counts = "%s/%s" % (ref_counts, oth_counts)
        else:
            # this shouldn't happen, but in case, we do it the other way
            counts = "%s/%s" % (oth_counts, ref_counts)

        # now we modify the record and output
        record.attributes["alleles"] = alleles
        record.attributes["counts"] = counts
        del record.attributes["ref_counts"]
        del record.attributes["oth_counts"]
        print(record)
Exemplo n.º 13
0
def main():
    """Relabel GFF records as SNPs, merge their allele counts, print them.

    The GFF input comes from the first command line argument. For each
    record the "ref_counts" and "oth_counts" attributes are combined into
    one "counts" attribute ordered like the "alleles" attribute, and the
    two original attributes are removed.
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

    watson_gff = gff.input(sys.argv[1])

    for record in watson_gff:
        # standardize feature name
        record.feature = "SNP"

        # double check alleles and allele counts
        alleles = record.attributes["alleles"]
        ref_allele = record.attributes["ref_allele"]
        ref_counts = int(record.attributes["ref_counts"])
        oth_counts = int(record.attributes["oth_counts"])
        starts_with_ref = alleles.startswith(ref_allele)

        if ref_counts != 0:
            # keep both alleles; order the counts to match the allele list
            # (reference first is expected, the reverse is a fallback)
            if starts_with_ref:
                ordered_counts = (ref_counts, oth_counts)
            else:
                ordered_counts = (oth_counts, ref_counts)
            counts = "%s/%s" % ordered_counts
        else:
            # if we're homozygous for the other allele, then we exclude
            # the reference allele from the list of alleles
            if starts_with_ref:
                alleles = alleles[-1]
            else:
                alleles = alleles[0]
            counts = str(oth_counts)

        # now we modify the record and output
        record.attributes["alleles"] = alleles
        record.attributes["counts"] = counts
        del record.attributes["ref_counts"]
        del record.attributes["oth_counts"]
        print(record)
Exemplo n.º 14
0
def genome_metadata(gff_input, genome_stats_file, progresstracker):
    """Take GFF, track and record associated metadata, yield same GFF lines

    Required arguments:
    gff_input: file or GFF-formatted string generator
    genome_stats_file: str, path to a text file containing chromosome sizes
    progresstracker: ProgressTracker object

    The following keys will store metadata in progresstracker.metadata:
    chromosomes: list of str, chromosome names
    called_num: int, # of positions called
    match_num: int, # of positions called w/chr matching ref
    ref_all_num: int, # of positions in reference genome (includes unplaceable)
    ref_nogap_num: int, # of placeable positions in reference genome
    called_frac_all: float, fraction of reference called (includes unplaceable)
    called_frac_nogap: float, fraction of placeable reference called

    The two fractions are only stored when their denominator is above zero.
    All keys except 'chromosomes' are written after the last record has been
    yielded, so the generator must be consumed fully.

    Returns a generator, yielding same GFF-formatted strings as were inputed.
    """
    # Raw chromosome names seen so far (before editing to match ref_genome).
    chromosomes_raw = set()

    # 'called_num' counts total positions called, while 'match_num' only counts
    # positions which match a chromosome ID in the ref_genome data.
    called_num = 0
    match_num = 0

    # 'ref_all_num' and 'ref_nogap_num' increment total and placeable genome
    # sizes (respectively) when new chromosomes are seen (for example, the
    # lengths for chrY are only added if chrY was seen).
    ref_all_num = 0
    ref_nogap_num = 0

    # Set up gff_data.
    if isinstance(gff_input, str) and re.search(r"\.gz$", gff_input):
        gff_data = gff.input(gzip.open(gff_input))
    else:
        gff_data = gff.input(gff_input)

    # Get chromosome lengths (total and placeable) for reference genome.
    try:
        ref_genome = get_genome_stats(progresstracker.metadata['build'],
                                      genome_stats_file)
    except KeyError:
        ref_genome = get_genome_stats(DEFAULT_BUILD, genome_stats_file)

    # Initialize chromosomes list, we'll add them as we see them.
    progresstracker.metadata['chromosomes'] = list()

    # Progress through GFF input.
    header_done = False
    for record in gff_data:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_data.data[0]
            yield "##genome-build " + gff_data.data[1]
            yield "# Produced by: get_metadata.py"
            header_done = True

        # Standardize the chromosome name: try it as given, with "chr"
        # prepended, and with its first three characters replaced by "chr".
        chr_name = None
        for candidate in (record.seqname,
                          "chr" + record.seqname,
                          "chr" + record.seqname[3:]):
            if candidate in ref_genome:
                chr_name = candidate
                break

        # Record number of positions called.
        dist = (record.end - (record.start - 1))
        called_num += dist
        if chr_name is not None:
            match_num += dist

        # If this is a new chromosome: (1) Add it to our chromosomes list,
        # (2) increase genome size variables (ref_all_num and ref_nogap_num)
        # (3) call progresstracker.saw().
        if record.seqname not in chromosomes_raw:
            chromosomes_raw.add(record.seqname)
            if chr_name:
                progresstracker.metadata['chromosomes'].append(chr_name)
                # Look sizes up under the standardized name; the raw name
                # (e.g. "1") may not be a key of ref_genome.
                ref_all_num += ref_genome[chr_name]['seq_all']
                ref_nogap_num += ref_genome[chr_name]['seq_nogap']
                progresstracker.saw(chr_name)

        yield str(record)

    progresstracker.metadata['called_num'] = called_num
    progresstracker.metadata['match_num'] = match_num
    progresstracker.metadata['ref_all_num'] = ref_all_num
    progresstracker.metadata['ref_nogap_num'] = ref_nogap_num

    if ref_all_num > 0:
        called_frac_all = match_num * 1.0 / ref_all_num
        progresstracker.metadata['called_frac_all'] = called_frac_all

    if ref_nogap_num > 0:
        called_frac_nogap = match_num * 1.0 / ref_nogap_num
        progresstracker.metadata['called_frac_nogap'] = called_frac_nogap
Exemplo n.º 15
0
	# first, try to connect to the databases
	try:
		connection = MySQLdb.connect(host=DB_HOST, user=HGMD_USER, passwd=HGMD_PASSWD, db=HGMD_DATABASE)
		cursor = connection.cursor()
	except MySQLdb.OperationalError, message:
		sys.stderr.write ("Error %d while connecting to database: %s" % (message[0], message[1]))
		sys.exit()
	
	# make sure the required table is really there
	try:
		cursor.execute ('DESCRIBE mutation')
	except MySQLdb.Error:
		sys.stderr.write ("No mutation table => empty output")
		sys.exit()
	
	gff_file = gff.input(sys.argv[1])	
	for record in gff_file:
		# lightly parse alleles
		alleles = record.attributes["alleles"].strip("\"").split("/")
		ref_allele = record.attributes["ref_allele"].strip("\"")
		
		# determine zygosity
		if len(alleles) == 1:
			zygosity = "hom"
		else:
			zygosity = "het"
		
		# examine each amino acid change
		amino_acid_changes = record.attributes["amino_acid"].strip("\"").split("/")
		for a in amino_acid_changes:
			amino_acid = a.split(" ")
Exemplo n.º 16
0
def main():
	f = gff.input(sys.argv[1])
	for record in f:
		print record.id, record.attributes
Exemplo n.º 17
0
def report_uncovered(gff_input,
                     transcript_filename,
                     genetests_filename,
                     output_file=None,
                     progresstracker=None):
    """Compare GFF records to transcripts to find missing coding regions

    Reports missing regions, yielding JSON-formatted strings. If output_file
    is provided, instead yields the GFF-formatted strings from gff_input and
    writes the JSON-formatted report strings to file.

    Required arguments:
    gff_input: GFF-formatted strings, string generator or file (can be .gz)
    transcript_filename: transcripts file
    genetests_filename: genetests file

    Optional arguments:
    output_file: If provided, opens and writes to this location (see above);
        gzip-compressed when the name ends with ".gz"
    progresstracker: If provided, records metadata to progresstracker.metadata

    The three GFF header lines are yielded once the first record is read,
    whether or not output_file is provided.
    """
    # Set up GFF input. If it ends with '.gz', assume gzip compressed.
    if isinstance(gff_input, str) and re.search(r"\.gz$", gff_input):
        gff_data = gff.input(gzip.open(gff_input))
    else:
        gff_data = gff.input(gff_input)

    # set up transcript file input
    transcript_input = transcript.Transcript_file(transcript_filename)

    # grab genetests gene names
    genetests_names = set()
    with open(genetests_filename) as genetests_input:
        for line in genetests_input:
            if line.startswith("#"):
                continue
            data = line.split("\t")
            if data[4] == "na":
                continue
            if "Clinical" not in data[5]:
                # currently we require "clinical testing available"
                continue
            genetests_names.update(data[4].split("|"))

    # Set up optional output. The suffix must be tested at the END of the
    # name; re.match() anchors at the start and so never matched a real path.
    f_out = None
    if output_file:
        if output_file.endswith(".gz"):
            f_out = gzip.open(output_file, 'w')
        else:
            f_out = open(output_file, 'w')

    try:
        # If progresstracker was sent, track these for metadata.
        if progresstracker:
            progresstracker.metadata['ref_coding_n'] = 0
            progresstracker.metadata['ref_coding_clintest_n'] = 0
            progresstracker.metadata['called_coding_n'] = 0
            progresstracker.metadata['called_coding_clintest_n'] = 0

        # Store to-be-examined regions, we'll remove covered regions from
        # this list.
        # key: Transcript object
        # value: list of tuples (chr (string), start (int), end (int))
        # Note: Start is 1-based, not 0-based as is in transcript files
        examined_regions = {}

        header_done = False
        for record in gff_data:
            if not header_done:
                yield "##gff-version " + gff_data.data[0]
                yield "##genome-build " + gff_data.data[1]
                yield "# Produced by: call_missing.py"
                header_done = True

            if f_out is not None:
                yield str(record)

            # Move forward in transcripts until past record end.
            chromosome = std_chr_name(record.seqname)
            next_region = (chromosome, record.start, record.end)
            removed_transcripts = transcript_input.cover_next_position(
                next_region)

            for curr_ts in transcript_input.transcripts:
                # Add to examined_regions if new.
                if curr_ts not in examined_regions:
                    coding_ranges = zip(curr_ts.data["coding_starts"],
                                        curr_ts.data["coding_ends"])
                    examined_regions[curr_ts] = [
                        (curr_ts.data["chr"], start + 1, end)
                        for start, end in coding_ranges]
                # Examine regions and remove any covered by the record.
                curr_ts_regions = examined_regions[curr_ts]
                examined_regions[curr_ts] = remove_covered(curr_ts_regions,
                                                           record)

            # Process past transcripts.
            results = process_ts_missing(removed_transcripts,
                                         examined_regions,
                                         genetests_names, progresstracker)
            for gene_data in results:
                if gene_data["length"] > 0:
                    report_line = json.dumps(gene_data)
                    if f_out is not None:
                        f_out.write(report_line + '\n')
                    else:
                        yield report_line

        # Move through any remaining transcripts and return missing.
        beyond_end_hack = ("chrZ", 9999999999)
        removed_transcripts = transcript_input.cover_next_position(
            beyond_end_hack)
        remaining_transcripts = (removed_transcripts +
                                 transcript_input.transcripts)
        results = process_ts_missing(remaining_transcripts, examined_regions,
                                     genetests_names, progresstracker)
        for gene_data in results:
            if gene_data["length"] > 0:
                report_line = json.dumps(gene_data)
                if f_out is not None:
                    f_out.write(report_line + '\n')
                else:
                    yield report_line
    finally:
        # Flush and release the report file even if the consumer stops early.
        if f_out is not None:
            f_out.close()
Exemplo n.º 18
0
def main():
    """Print one JSON object per variant / amino acid change of a GFF input.

    Reads the GFF input named by the first command line argument. Each
    record needs "alleles" and "ref_allele" attributes; "db_xref" and
    "amino_acid" are optional. For every distinct amino acid change listed
    in "amino_acid" one JSON line is printed including "gene" and
    "amino_acid_change"; a record without amino acid changes produces one
    JSON line without those two keys. "dbSNP" is included when a non-empty
    dbSNP ID is available.
    """
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

    gff_file = gff.input(sys.argv[1])
    for record in gff_file:
        # lightly parse alleles
        alleles = record.attributes["alleles"].strip("\"").split("/")
        ref_allele = record.attributes["ref_allele"].strip("\"")

        # compress identical alleles like "A/A" into just "A"
        while len(alleles) > 1 and alleles[0].upper() == alleles[1].upper():
            alleles.pop(0)

        trait_allele = None

        # determine zygosity
        if len(alleles) == 1:
            zygosity = "hom"
            trait_allele = alleles[0]
        else:
            zygosity = "het"

        # List the reference allele first when it is one of the alleles.
        # NOTE(review): when the only allele is the reference allele this
        # produces a trailing "/" (e.g. "A/") -- confirm that is intended.
        genotype = "/".join(alleles)
        if ref_allele in alleles:
            leftover_alleles = list(alleles)
            leftover_alleles.remove(ref_allele)
            genotype = ref_allele + "/" + "/".join(leftover_alleles)
            if not trait_allele and len(leftover_alleles) == 1:
                trait_allele = leftover_alleles[0]

        # get dbSNP ID; remove the "dbsnp:" prefix as a prefix (lstrip()
        # would strip any run of those characters instead)
        dbSNP_ID = None
        if "db_xref" in record.attributes:
            dbSNP_ID = record.attributes["db_xref"]
            if dbSNP_ID.startswith("dbsnp:"):
                dbSNP_ID = dbSNP_ID[len("dbsnp:"):]

        # Test for the same key that is read below.
        if "amino_acid" in record.attributes:
            amino_acid_changes = record.attributes["amino_acid"].strip(
                "\"").split("/")
        else:
            amino_acid_changes = list()

        # The coordinates are the same for every line of this record.
        if record.start == record.end:
            coordinates = str(record.start)
        else:
            coordinates = str(record.start) + "-" + str(record.end)

        # examine each amino acid change
        for a in amino_acid_changes:
            amino_acid = a.split(" ")
            gene = amino_acid.pop(0)  # the first item is always the gene name

            # report each change only once per gene entry
            aa_done = set()
            for amino_acid_change_and_position in amino_acid:
                if amino_acid_change_and_position in aa_done:
                    continue
                aa_done.add(amino_acid_change_and_position)

                output = {
                    "chromosome": record.seqname,
                    "coordinates": coordinates,
                    "gene": gene,
                    "amino_acid_change": amino_acid_change_and_position,
                    "genotype": genotype,
                    "ref_allele": ref_allele,
                    "trait_allele": trait_allele,
                    "zygosity": zygosity,
                    "variant": str(record),
                }
                if dbSNP_ID:
                    output["dbSNP"] = dbSNP_ID

                print(json.dumps(output))

        # Print one json line if there were no amino acid changes
        if not amino_acid_changes:
            output = {
                "chromosome": record.seqname,
                "coordinates": coordinates,
                "genotype": genotype,
                "ref_allele": ref_allele,
                "trait_allele": trait_allele,
                "zygosity": zygosity,
                "variant": str(record)
            }
            if dbSNP_ID:
                output["dbSNP"] = dbSNP_ID
            print(json.dumps(output))
Exemplo n.º 19
0
def main():
    f = gff.input(sys.argv[1])
    for record in f:
        print record.id, record.attributes
Exemplo n.º 20
0
def match_getev(gff_in, getev_flat, transcripts_file=None,
                gene_out_file=None, output_file=None,
                progresstracker=None):
    """String generator returning JSON-formatted data from GET-Evidence

    Required inputs:
    gff_in: GFF-formated string generator, text, or .gz gzip-compressed
    getev_flat: JSON-formated text, or .gz gzip-compressed

    Optional inputs:
    transcripts_file: passed to read_transcripts; a gene report is only
        produced when both this and gene_out_file are set
    gene_out_file: path the gene report is written to (see transcripts_file)
    output_file: if set, print to this & generator instead yields GFF lines
    progresstracker: ProgressTracker object from progresstracker.py

    Each output line yielded is JSON-formatted and corresponds to data for a
    particular variant. It will always contain 'chromosome', 'coordinates',
    'GET-Evidence', 'genotype', 'ref_allele' and 'autoscore'. It may also
    contain 'gene' and 'amino_acid_change', 'dbSNP', 'phase', 'testable',
    'reviewed', and items copied by copy_output_data.

    A variant with an amino acid change is reported when its autoscore is at
    least 2 (or suff_eval accepts it); a variant without one is reported when
    its autoscore is at least 1 (or suff_eval accepts it). Variants whose
    JSON serialization fails are skipped.

    Any output files opened here are closed when the generator finishes, is
    closed early, or raises.
    """
    # Load data from GET-Evidence and Genetests files.
    getev_by_aa, getev_by_dbsnp = read_getev_flat(getev_flat)
    genetests_filepath = os.path.join(os.getenv('DATA'), GENETESTS_DATA)
    genetests_clin, genetests_rev = read_genetests(genetests_filepath)

    f_json_out = None
    f_gene_out = None
    # Maps UCSC transcript ID -> list of output dicts awaiting a gene report.
    gene_data = dict()
    transcripts = None
    try:
        # Set up optional output, will not be compressed.
        if output_file:
            f_json_out = open(output_file, 'w')
        if gene_out_file and transcripts_file:
            f_gene_out = open(gene_out_file, 'w')
            transcripts = read_transcripts(transcripts_file)

        # Set up BLOSUM100 matrix to score amino acid disruptiveness.
        blosum_matrix = blosum100()

        # Set up GFF data. Can be a string generator, text, or
        # (if it ends with '.gz') a gzip-compressed text.
        if isinstance(gff_in, str) and re.search(r'\.gz$', gff_in):
            gff_data = gff.input(gzip.open(gff_in))
        else:
            gff_data = gff.input(gff_in)

        for record in gff_data:
            # If outputing JSON to file, yield GFF data as it's read.
            if f_json_out:
                yield str(record)

            # Ignore regions called as matching reference.
            if record.feature == 'REF':
                continue

            # If producing a gene report, output finished genes
            if f_gene_out:
                to_remove = []
                for gene in gene_data:
                    if gene not in transcripts:
                        # Remove genes we don't recognize
                        to_remove.append(gene)
                    elif transcripts[gene]['end'] < record.end:
                        gene_report(f_gene_out, gene, gene_data[gene])
                        to_remove.append(gene)
                for gene in to_remove:
                    gene_data.pop(gene)

            # Track progress if a ProgressTracker was passed to us
            if progresstracker:
                progresstracker.saw(record.seqname)

            # Store data for JSON output as dict.
            output = dict()

            # Parse GFF attributes to find the alleles, reference allele,
            # phase, and dbSNP.
            alleles = record.attributes['alleles'].strip('"').split('/')
            if len(alleles) == 1:
                output['genotype'] = alleles[0]
            elif len(alleles) > 2:
                # Not sure what to do with >2 alleles! Skip it. (split()
                # always returns at least one item, so 0 cannot happen.)
                continue
            else:
                # Sort a copy only: the phase lookup below relies on the
                # original allele order.
                output['genotype'] = '/'.join(sorted(alleles))
            ref_allele = record.attributes['ref_allele'].strip('"')
            output['ref_allele'] = ref_allele
            if 'phase' in record.attributes:
                # Add phase attribute for the non-reference allele;
                # if both non-reference, treat as unphased.
                phase_data = record.attributes['phase'].strip().split('/')
                if len(alleles) == 2 and len(phase_data) == 2:
                    if alleles[0] == ref_allele:
                        output['phase'] = phase_data[1]
                    elif alleles[1] == ref_allele:
                        output['phase'] = phase_data[0]

            # Collect dbSNP IDs; 'db_xref' takes precedence over 'Dbxref'.
            dbsnp_ids = []
            xref_key = None
            if 'db_xref' in record.attributes:
                xref_key = 'db_xref'
            elif 'Dbxref' in record.attributes:
                xref_key = 'Dbxref'
            if xref_key:
                for entry in record.attributes[xref_key].split(','):
                    data = entry.strip().split(':')
                    # An entry without ':' has no ID part; skip it instead
                    # of raising IndexError on data[1].
                    if (len(data) > 1 and re.match('dbsnp', data[0])
                            and re.match('rs', data[1])):
                        dbsnp_ids.append(data[1])
                if dbsnp_ids:
                    output["dbSNP"] = ",".join(dbsnp_ids)

            # Default presence in GET-Evidence is false, set as true later
            # if a match is found.
            output['GET-Evidence'] = False

            # Store position data
            output['chromosome'] = record.seqname
            if record.start == record.end:
                output['coordinates'] = str(record.start)
            else:
                output['coordinates'] = (str(record.start) + "-"
                                         + str(record.end))

            if "amino_acid" in record.attributes:
                # Get gene and amino acid change, store in output.
                # Note: parse_aa_change will call sys.exit() if it's
                # misformatted.
                # TODO: analyze more than the first change, multiple are
                # split by /
                aa_changes = record.attributes['amino_acid'].split('/')
                aa_data = aa_changes[0].split()
                gene, aa_change_and_pos = aa_data[0:2]
                # "X" is preferred for stop, "*" can break things like URLs.
                aa_change_and_pos = re.sub(r'\*', r'X', aa_change_and_pos)
                (aa_from, aa_pos, aa_to) = parse_aa_change(aa_change_and_pos)
                output["gene"] = gene
                # The change is stored as read from the GFF; only the lookup
                # key below uses the "X"-substituted form.
                output["amino_acid_change"] = aa_data[1]

                # Check if the gene is in Genetests. If so, store result.
                if gene in genetests_clin:
                    output["testable"] = True
                    if gene in genetests_rev:
                        output["reviewed"] = True

                # Try to look up in GET-Evidence by amino acid change.
                aa_key = gene + "-" + aa_change_and_pos
                if aa_key in getev_by_aa:
                    copy_output_data(getev_by_aa[aa_key], output)
                    output["GET-Evidence"] = True
                else:
                    # If not in GET-Evidence by aa, try dbsnp ID.
                    for dbsnp_id in dbsnp_ids:
                        if dbsnp_id in getev_by_dbsnp:
                            output["GET-Evidence"] = True
                            copy_output_data(getev_by_dbsnp[dbsnp_id],
                                             output)
                            output["autoscore"] = autoscore(output,
                                                            blosum_matrix,
                                                            aa_from, aa_to)
                            # Quit after first hit passing threshold
                            if (output["autoscore"] >= 2
                                    or suff_eval(output)):
                                output["dbSNP"] = dbsnp_id
                                break

                # Calculate autoscore, yield json data if at least 2.
                output["autoscore"] = autoscore(output, blosum_matrix,
                                                aa_from, aa_to)
                if output["autoscore"] >= 2 or suff_eval(output):
                    # This barfs on Unicode sometimes; such variants are
                    # deliberately skipped. Exception (not a bare except)
                    # so Ctrl-C and SystemExit still propagate.
                    try:
                        json_output = str(json.dumps(output,
                                                     ensure_ascii=False))
                    except Exception:
                        continue
                    if f_json_out:
                        f_json_out.write(json_output + '\n')
                    else:
                        yield json_output

                # TODO: print when beyond end of gene, not when new one seen
                if f_gene_out and 'ucsc_trans' in record.attributes:
                    # We take 1st & ignore multiple transcripts (which are
                    # rare)
                    transcript_id = \
                        record.attributes['ucsc_trans'].split(',')[0]
                    gene_data.setdefault(transcript_id, []).append(output)
            else:
                # If no gene data at all, try dbsnp ID.
                for dbsnp_id in dbsnp_ids:
                    if dbsnp_id in getev_by_dbsnp:
                        output["GET-Evidence"] = True
                        copy_output_data(getev_by_dbsnp[dbsnp_id], output)
                        output["autoscore"] = autoscore(output)
                        # Narrow the reported ID only if this hit passes
                        # the threshold.
                        if output["autoscore"] >= 2 or suff_eval(output):
                            output["dbSNP"] = dbsnp_id
                        # Quit after first hit. (This break used to sit one
                        # level out, so IDs after the first were never
                        # examined even when the first was not a hit.)
                        break
                output["autoscore"] = autoscore(output)
                # Autoscore bar is lower here because you can only get
                # points if the dbSNP ID is in one of the variant specific
                # databases (max 2).
                if output["autoscore"] >= 1 or suff_eval(output):
                    # See the note on serialization failures above.
                    try:
                        json_output = str(json.dumps(output,
                                                     ensure_ascii=False))
                    except Exception:
                        continue
                    if f_json_out:
                        f_json_out.write(json_output + '\n')
                    else:
                        yield json_output
        # NOTE(review): genes still held in gene_data when the input ends are
        # never passed to gene_report -- confirm whether a final flush is
        # wanted.
    finally:
        # Runs on normal completion, on error, and when the generator is
        # closed before being exhausted.
        if f_json_out:
            f_json_out.close()
        if f_gene_out:
            f_gene_out.close()
def predict_nonsynonymous(gff_input,
                          twobit_path,
                          transcript_path,
                          progresstracker=False):
    """Yield GFF lines, adding amino acid predictions where they apply.

    Every record read from gff_input is yielded back as a string, preceded
    (once, ahead of the first record) by three header lines. Records whose
    feature is "REF", or for which no transcripts are returned, pass through
    unchanged. For the rest, infer_function is run against each returned
    transcript; when any result is "nonsynonymous coding" the record gains
    "amino_acid" and "ucsc_trans" attributes before it is yielded.

    gff_input: string ending in ".gz" (opened with gzip), or anything
        gff.input accepts directly
    twobit_path: handed to twobit.input
    transcript_path: handed to transcript_file
    progresstracker: optional; saw() is called with the chromosome name of
        each non-REF record
    """
    genome = twobit.input(twobit_path)
    transcript_source = transcript_file(transcript_path)

    # A string ending in ".gz" is opened with gzip first; anything else is
    # given to gff.input as it is.
    if isinstance(gff_input, str) and re.match(".*\.gz$", gff_input):
        gff_data = gff.input(gzip.open(gff_input))
    else:
        gff_data = gff.input(gff_input)

    need_header = True
    for record in gff_data:
        # The header can only be written once a first record has been
        # pulled, because that is what makes the reader go through the
        # header data.
        if need_header:
            need_header = False
            yield "##genome-build " + gff_data.data[1]
            yield "# Produced by: gff_nonsynonymous_filter.py"
            yield "# Date: " + datetime.datetime.now().isoformat(' ')

        if record.feature == "REF":
            yield str(record)
            continue

        # Bring the sequence name to the "chr..." spelling.
        seqname = record.seqname
        if seqname.startswith("chr"):
            chromosome = seqname
        elif seqname.startswith("Chr"):
            chromosome = "chr" + seqname[3:]
        else:
            chromosome = "chr" + seqname
        if progresstracker:
            progresstracker.saw(chromosome)

        # record.start is 1-based, UCSC annotation starts are 0-based.
        covering = transcript_source.cover_next_position(
            (chromosome, record.start - 1))
        if not covering:
            yield str(record)
            continue

        aa_changes = []
        # Splice results are gathered but not reported for now.
        splice_changes = []
        transcript_ids = []
        for row in covering:
            # Transcript columns used: 0 geneName, 3 strand, 6 cdsStart,
            # 7 cdsEnd, 9 exonStarts, 10 exonEnds.
            gene_name = row[0]
            result = infer_function(genome, record, gene_name, row[3],
                                    int(row[6]), int(row[7]), row[9],
                                    row[10])
            if result[0] == "nonsynonymous coding":
                aa_changes.append("%s %s" % (gene_name, result[2]))
                transcript_ids.append(row[1])
            elif result[0] == "splice site":
                splice_changes.append("%s %s " % (gene_name, result[2]))

        # Only nonsynonymous results change the record.
        if aa_changes:
            ordered = sorted(unique(aa_changes), key=str.lower)
            record.attributes["amino_acid"] = "/".join(ordered)
            record.attributes["ucsc_trans"] = ",".join(transcript_ids)
        yield str(record)
def predict_nonsynonymous(gff_input, twobit_path, transcript_path, progresstracker=False):
    """Generate GFF-formatted strings with nonsynonymous changes annotated.

    Three header lines are yielded before the first record, then one string
    per input record. A record is yielded unmodified when its feature is
    "REF" or when cover_next_position returns no transcripts for it.
    Otherwise infer_function is called once per returned transcript, and if
    any call reports "nonsynonymous coding" the record's "amino_acid" and
    "ucsc_trans" attributes are set before the record is yielded.

    gff_input: a string ending in ".gz" is opened with gzip; any other value
        is passed straight to gff.input
    twobit_path: argument for twobit.input
    transcript_path: argument for transcript_file
    progresstracker: if truthy, its saw() method receives the chromosome
        name of every non-REF record
    """
    sequence = twobit.input(twobit_path)
    transcript_reader = transcript_file(transcript_path)

    looks_gzipped = isinstance(gff_input, str) and re.match(".*\.gz$", gff_input)
    if looks_gzipped:
        gff_data = gff.input(gzip.open(gff_input))
    else:
        gff_data = gff.input(gff_input)

    header_pending = True

    for record in gff_data:
        if header_pending:
            # Written here rather than before the loop: the first record has
            # to be requested before the header data is available.
            for header_line in ("##genome-build " + gff_data.data[1],
                                "# Produced by: gff_nonsynonymous_filter.py",
                                "# Date: " + datetime.datetime.now().isoformat(" ")):
                yield header_line
            header_pending = False

        if record.feature == "REF":
            yield str(record)
            continue

        # Make the name start with lowercase "chr".
        name = record.seqname
        if name.startswith("chr"):
            chromosome = name
        else:
            suffix = name[3:] if name.startswith("Chr") else name
            chromosome = "chr" + suffix
        if progresstracker:
            progresstracker.saw(chromosome)

        # UCSC annotation starts are 0-based while record.start is 1-based.
        zero_based_start = record.start - 1
        hits = transcript_reader.cover_next_position((chromosome, zero_based_start))

        if not hits:
            yield str(record)
            continue

        gene_changes = []
        splice_notes = []  # kept for later use, currently not reported
        ucsc_ids = []

        for hit in hits:
            # geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds
            # (transcript file columns 0, 3, 6, 7, 9, 10)
            fields = (hit[0], hit[3], int(hit[6]), int(hit[7]), hit[9], hit[10])
            inferred = infer_function(sequence, record, *fields)
            category = inferred[0]
            if category == "nonsynonymous coding":
                gene_changes.append("%s %s" % (fields[0], inferred[2]))
                ucsc_ids.append(hit[1])
            elif category == "splice site":
                splice_notes.append("%s %s " % (fields[0], inferred[2]))

        if len(gene_changes) > 0:
            labels = unique(gene_changes)
            labels.sort(key=str.lower)
            record.attributes["amino_acid"] = "/".join(labels)
            record.attributes["ucsc_trans"] = ",".join(ucsc_ids)
        # Splice sites are not written to the record for now, so a
        # splice-only record leaves here unchanged.
        yield str(record)
Exemplo n.º 23
0
def genome_metadata(gff_input, genome_stats_file, progresstracker):
    """Take GFF, track and record associated metadata, yield same GFF lines

    Required arguments:
    gff_input: file or GFF-formatted string generator (a string ending in
        ".gz" is opened with gzip)
    genome_stats_file: str, path to a text file containing chromosome sizes
    progresstracker: ProgressTracker object

    The following keys will store metadata in progresstracker.metadata:
    chromosomes: list of str, chromosome names
    called_num: int, # of positions called
    match_num: int, # of positions called w/chr matching ref
    ref_all_num: int, # of positions in reference genome (includes unplaceable)
    ref_nogap_num: int, # of placeable positions in reference genome
    called_frac_all: float, fraction of reference called (includes unplaceable)
    called_frac_nogap: float, fraction of placeable reference called

    'chromosomes' is set up front; the counts are stored once the input is
    exhausted, and each fraction only if its denominator is positive.

    Returns a generator, yielding three header lines ahead of the first
    record and then the same GFF-formatted strings as were inputed.
    """
    # Maps every raw sequence name seen so far to the matching key in
    # ref_genome, or to "" when no spelling of the name is found there.
    chr_names = dict()

    # 'called_num' counts total positions called, while 'match_num' only counts
    # positions which match a chromosome ID in the ref_genome data.
    called_num = 0
    match_num = 0

    # 'ref_all_num' and 'ref_nogap_num' increment total and placeable genome
    # sizes (respectively) when new chromosomes are seen (for example, the
    # lengths for chrY are only added if chrY was seen).
    ref_all_num = 0
    ref_nogap_num = 0

    # Set up gff_data.
    if isinstance(gff_input, str) and re.match(r".*\.gz$", gff_input):
        gff_data = gff.input(gzip.open(gff_input))
    else:
        gff_data = gff.input(gff_input)

    # Get chromosome lengths (total and placeable) for reference genome.
    # A KeyError (e.g. no 'build' in the metadata) falls back to the default
    # build.
    try:
        ref_genome = get_genome_stats(progresstracker.metadata['build'],
                                      genome_stats_file)
    except KeyError:
        ref_genome = get_genome_stats(DEFAULT_BUILD, genome_stats_file)

    # Initialize chromosomes list, we'll add them as we see them.
    progresstracker.metadata['chromosomes'] = list()

    # Progress through GFF input.
    header_done = False
    for record in gff_data:
        # Have to do this after calling the first record to
        # get the iterator to read through the header data
        if not header_done:
            yield "##gff-version " + gff_data.data[0]
            yield "##genome-build " + gff_data.data[1]
            yield "# Produced by: get_metadata.py"
            header_done = True

        seqname = record.seqname

        # If this is a new chromosome: (1) work out its standardized name,
        # (2) increase genome size variables (ref_all_num and ref_nogap_num)
        # (3) call progresstracker.saw().
        if seqname not in chr_names:
            chr_name = ""
            for candidate in (seqname, "chr" + seqname,
                              "chr" + seqname[3:]):
                if candidate in ref_genome:
                    chr_name = candidate
                    break
            chr_names[seqname] = chr_name
            if chr_name:
                progresstracker.metadata['chromosomes'].append(chr_name)
                # Index by the standardized name: the raw seqname is not
                # necessarily a key of ref_genome (e.g. "1" vs "chr1").
                ref_all_num += ref_genome[chr_name]['seq_all']
                ref_nogap_num += ref_genome[chr_name]['seq_nogap']
                progresstracker.saw(chr_name)

        # Record number of positions called.
        dist = record.end - (record.start - 1)
        called_num += dist
        if chr_names[seqname]:
            match_num += dist

        yield str(record)

    progresstracker.metadata['called_num'] = called_num
    progresstracker.metadata['match_num'] = match_num
    progresstracker.metadata['ref_all_num'] = ref_all_num
    progresstracker.metadata['ref_nogap_num'] = ref_nogap_num

    if ref_all_num > 0:
        called_frac_all = match_num * 1.0 / ref_all_num
        progresstracker.metadata['called_frac_all'] = called_frac_all

    if ref_nogap_num > 0:
        called_frac_nogap = match_num * 1.0 / ref_nogap_num
        progresstracker.metadata['called_frac_nogap'] = called_frac_nogap
Exemplo n.º 24
0
def main():
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))

    # try opening the database connection; fail if unable to open
    try:
        dbsnp_connection = MySQLdb.connect(host=DB_HOST, user=DBSNP_USER, passwd=DBSNP_PASSWD, db=DBSNP_DATABASE)
        dbsnp_cursor = dbsnp_connection.cursor()
    except MySQLdb.OperationalError, message:
        print "Error %d while connecting to database: %s" % (message[0], message[1])
        sys.exit()

    # now read the file and loop through
    f = gff.input(sys.argv[1])
    for record in f:
        # the database shows unplaced SNPs as having 0-based position 0
        # (i.e. 1-based position 1), so looking up position 1 would be
        # unfortunate
        if record.start == 1:
            print record
            continue

        if record.seqname.startswith("chr"):
            chr = record.seqname[3:]
        else:
            chr = record.seqname

        # recall that record.start is 1-based, but the database is not
        dbsnp_cursor.execute(dbsnp_query, (chr, record.start - 1))
Exemplo n.º 25
0
def main():
    # return if we don't have the correct arguments
    if len(sys.argv) < 2:
        raise SystemExit(__doc__.replace("%prog", sys.argv[0]))
    
    gff_file = gff.input(sys.argv[1])    
    for record in gff_file:
        # lightly parse alleles
        alleles = record.attributes["alleles"].strip("\"").split("/")
        ref_allele = record.attributes["ref_allele"].strip("\"")

        # compress identical alleles like "A/A" into just "A"
        while len(alleles) > 1 and alleles[0].upper() == alleles[1].upper():
            alleles.pop(0)

        trait_allele = None;

        # determine zygosity
        if len(alleles) == 1:
            zygosity = "hom"
            trait_allele = alleles[0]
        else:
            zygosity = "het"

        genotype = "/".join(alleles)
        if ref_allele in alleles:
            leftover_alleles = copy(alleles)
            leftover_alleles.remove(ref_allele)
            genotype = ref_allele + "/" + "/".join(leftover_alleles)
            if not trait_allele and len(leftover_alleles) == 1:
                trait_allele = leftover_alleles[0]

        # examine each amino acid change
        amino_acid_changes = record.attributes["amino_acid"].strip("\"").split("/")
        for a in amino_acid_changes:
            amino_acid = a.split(" ")
            gene = amino_acid.pop(0) # the first item is always the gene name

            aa_done = {}
            for amino_acid_change_and_position in amino_acid:

                if amino_acid_change_and_position in aa_done:
                    continue
                aa_done[amino_acid_change_and_position] = 1

                if record.start == record.end:
                    coordinates = str(record.start)
                else:
                    coordinates = str(record.start) + "-" + str(record.end)

                output = {
                    "chromosome": record.seqname,
                    "coordinates": coordinates,
                    "gene": gene,
                    "amino_acid_change": amino_acid_change_and_position,
                    "genotype": genotype,
                    "ref_allele": ref_allele,
                    "trait_allele": trait_allele,
                    "zygosity": zygosity,
                    "variant": str(record),
                }
                print json.dumps(output)
Exemplo n.º 26
0
def report_uncovered(gff_input, transcript_filename, genetests_filename,
                     output_file=None, progresstracker=None):
    """Compare GFF records to transcripts to find missing coding regions

    Reports missing regions, yielding JSON-formatted strings. If output_file
    is provided, instead yields the GFF-formatted strings from gff_input and
    writes the JSON-formatted report strings to file.

    Required arguments:
    gff_input: GFF-formatted strings, string generator or file (can be .gz)
    transcript_filename: transcripts file
    genetests_filename: genetests file

    Optional arguments:
    output_file: If provided, opens and writes to this location (see above);
        gzip-compressed when the name ends with '.gz'
    progresstracker: If provided, records metadata to progresstracker.metadata

    The output file, if any, is closed when the generator finishes, is closed
    early, or raises.
    """
    # Set up GFF input. If it ends with '.gz', assume gzip compressed.
    if isinstance(gff_input, str) and re.match(r".*\.gz$", gff_input):
        gff_data = gff.input(gzip.open(gff_input))
    else:
        gff_data = gff.input(gff_input)

    # set up transcript file input
    transcript_input = transcript.Transcript_file(transcript_filename)

    # grab genetests gene names
    genetests_names = set()
    genetests_input = open(genetests_filename)
    try:
        for line in genetests_input:
            if line.startswith("#"):
                continue
            data = line.split("\t")
            if data[4] == "na":
                continue
            if not re.match(".*Clinical", data[5]):
                # currently we require "clinical testing available"
                continue
            # column 4 may hold several names separated by "|"
            genetests_names.update(data[4].split("|"))
    finally:
        genetests_input.close()

    f_out = None
    try:
        # Set up optional output. re.search, not re.match: match is anchored
        # at the start of the string, so it never recognized a real
        # "something.gz" path and the gzip branch was never taken.
        if output_file:
            if re.search(r'\.gz$', output_file):
                f_out = gzip.open(output_file, 'w')
            else:
                f_out = open(output_file, 'w')

        # If progresstracker was sent, track these for metadata.
        if progresstracker:
            progresstracker.metadata['ref_coding_n'] = 0
            progresstracker.metadata['ref_coding_clintest_n'] = 0
            progresstracker.metadata['called_coding_n'] = 0
            progresstracker.metadata['called_coding_clintest_n'] = 0

        # Store to-be-examined regions, we'll remove covered regions from
        # this list.
        # key: Transcript object
        # value: list of tuples (chr (string), start (int), end (int))
        # Note: Start is 1-based, not 0-based as is in transcript files
        examined_regions = {}

        for record in gff_data:
            if f_out:
                yield str(record)

            # Move forward in transcripts until past record end.
            chromosome = std_chr_name(record.seqname)
            next_region = (chromosome, record.start, record.end)
            removed_transcripts = \
                transcript_input.cover_next_position(next_region)

            for curr_ts in transcript_input.transcripts:
                # Add to examined_regions if new.
                if curr_ts not in examined_regions:
                    ts_chr = curr_ts.data["chr"]
                    # +1 converts the 0-based coding start to 1-based.
                    examined_regions[curr_ts] = [
                        (ts_chr, coding_start + 1, coding_end)
                        for coding_start, coding_end
                        in zip(curr_ts.data["coding_starts"],
                               curr_ts.data["coding_ends"])]
                # Examine regions and remove any covered by the record.
                examined_regions[curr_ts] = remove_covered(
                    examined_regions[curr_ts], record)

            # Process past transcripts.
            results = process_ts_missing(removed_transcripts,
                                         examined_regions,
                                         genetests_names, progresstracker)
            for gene_data in results:
                if gene_data["length"] > 0:
                    json_line = json.dumps(gene_data)
                    if f_out:
                        f_out.write(json_line + '\n')
                    else:
                        yield json_line

        # Move through any remaining transcripts and return missing.
        beyond_end_hack = ("chrZ", 9999999999)
        removed_transcripts = \
            transcript_input.cover_next_position(beyond_end_hack)
        remaining_transcripts = (removed_transcripts
                                 + transcript_input.transcripts)
        results = process_ts_missing(remaining_transcripts, examined_regions,
                                     genetests_names, progresstracker)
        for gene_data in results:
            if gene_data["length"] > 0:
                json_line = json.dumps(gene_data)
                if f_out:
                    f_out.write(json_line + '\n')
                else:
                    yield json_line
    finally:
        # Flushes and closes the report; for gzip output this is also what
        # completes the compressed stream.
        if f_out:
            f_out.close()
Exemplo n.º 27
0
def main():
	# parse options
	option, args = doc_optparse.parse(__doc__)
	
	if len(args) < 2:
		doc_optparse.exit()
	
	gff_files_1 = glob.glob(args[0])
	gff_files_2 = glob.glob(args[1])
	
	# create temporary files to store intersections
	temp_file_1 = TemporaryFile()
	temp_file_2 = TemporaryFile()
	
	if not option.enumerate:
		# use a wider column if we're going to need it
		if option.read_depth:
			col_width = 24
		elif option.verbose:
			col_width = 16
		else:
			col_width = 8
		
		# print column headings
		print " " * 8,
		for i in range(1, len(gff_files_1) + 1):
			print excel_column(i).ljust(col_width),
		print ""
	
	# initialize counter to print row headings
	file_number = 0
	
	# iterate through the second list of files
	for g2_path in gff_files_2:
		
		# print row heading
		if not option.enumerate:
			file_number += 1
			print str(file_number).ljust(8),
		
		# now iterate through the first list, do intersections and compare
		for g1_path in gff_files_1:
			
			# do the intersection one way
			g1 = gff.input(g1_path)
			g2 = gff.input(g2_path)
			for line in g1.intersect(g2):
				print >> temp_file_1, line
			
			# now do the intersection the other way
			g1_reverse = gff.input(g1_path)
			g2_reverse = gff.input(g2_path)
			for line in g2_reverse.intersect(g1_reverse):
				print >> temp_file_2, line
			
			# rewind each temporary file now storing intersection data
			temp_file_1.seek(0)
			temp_file_2.seek(0)
			
			# now go through the temporary files and work out concordancy
			g1_intx = gff.input(temp_file_1)
			g2_intx = gff.input(temp_file_2)
			matching_count = unmatching_count = 0
			# we cannot chain equal signs here, because the two would reference the
			# same list, and that would be bad...
			matching_read_depths, unmatching_read_depths = [], []
			
			for record1 in g1_intx:
				record2 = g2_intx.next()
				
				# these records should match in terms of the interval they represent
				if record2.seqname != record1.seqname or \
				  record2.start != record1.start or \
				  record2.end != record1.end:
				  	raise ValueError("files must be pre-sorted")
				
				# isolate the read depth info if we need to
				if option.read_depth:
					rd = []
					try:
						rd.append(int(record1.attributes["read_depth"].strip("\"")))
					except KeyError:
						pass
					try:
						rd.append(int(record2.attributes["read_depth"].strip("\"")))
					except KeyError:
						pass
				
				# now test if there's concordance
				try:
					if sorted(record2.attributes["alleles"].strip("\"").split("/")) != \
					  sorted(record1.attributes["alleles"].strip("\"").split("/")):
						unmatching_count += 1
						if option.enumerate:
							record1.attributes["concordant"] = "false"
							record2.attributes["concordant"] = "false"
							print record1
							print record2
						if option.read_depth:
							unmatching_read_depths.extend(rd)
					else:
						matching_count += 1
						if option.enumerate:
							record1.attributes["concordant"] = "true"
							record2.attributes["concordant"] = "true"
							print record1
							print record2
						if option.read_depth:
							matching_read_depths.extend(rd)
				# no alleles? not a SNP
				except KeyError:
					continue
			
			# now we print the result, being mindful of possible zero division problems, etc.
			if option.enumerate:
				pass
			elif option.read_depth:
				try:
					a = "%.1f" % mean(matching_read_depths)
					b = "%.1f" % median(matching_read_depths)
				except TypeError:
					a = "--"
					b = "--"
				try:
					c = "%.1f" % mean(unmatching_read_depths)
					d = "%.1f" % median(unmatching_read_depths)
				except TypeError:
					c = "--"
					d = "--"
				print ("%s %s : %s %s" % (a, b, c, d)).ljust(col_width),
			else:
				try:
					p = "%.1f%%" % (float(matching_count) / (matching_count + unmatching_count) * 100)
				except ZeroDivisionError:
					p = "--"
				if option.verbose:
					total_count = unmatching_count + matching_count
					print ("%s %s/%s" % (p, matching_count, total_count)).ljust(col_width),
				else:
					print p.ljust(col_width),
			
			# now we rewind, delete everything, and start again!
			temp_file_1.seek(0)
			temp_file_1.truncate()
			temp_file_2.seek(0)
			temp_file_2.truncate()
		
		# wrap up the line
		print ""
	
	# print the legend describing what the column and row headings mean
	if not option.enumerate:
		print "-" * 8
		file_number = 0
		for i in gff_files_1:
			file_number += 1
			print ("[%s]" % excel_column(file_number)).ljust(8),
			print i
		file_number = 0
		for i in gff_files_2:
			file_number += 1
			print ("[%s]" % file_number).ljust(8),
			print i
Exemplo n.º 28
0
        )
        cursor = connection.cursor()
    except MySQLdb.OperationalError, message:
        sys.stderr.write("Error %d while connecting to database: %s" % (message[0], message[1]))
        sys.exit()

        # make sure the required table is really there
    try:
        cursor.execute("DESCRIBE latest")
    except MySQLdb.Error:
        sys.stderr.write("No 'latest' table => empty output")
        sys.exit()

    found_aa_for_rsid = dict()

    gff_file = gff.input(sys.argv[1])
    for record in gff_file:
        # lightly parse to find the alleles and rs number
        alleles = record.attributes["alleles"].strip('"').split("/")
        ref_allele = record.attributes["ref_allele"].strip('"')
        xrefs = ()
        try:
            xrefs = record.attributes["db_xref"].strip('"').split(",")
        except KeyError:
            try:
                xrefs = record.attributes["Dbxref"].strip('"').split(",")
            except KeyError:
                pass

                # we wouldn't know what to do with this, so pass it up for now
        if len(alleles) > 2:
Exemplo n.º 29
0
def _dbsnp_ids_from_attributes(attributes):
    """Return the dbSNP rs IDs found in a 'db_xref' or 'Dbxref' attribute."""
    if 'db_xref' in attributes:
        xrefs = attributes['db_xref']
    elif 'Dbxref' in attributes:
        xrefs = attributes['Dbxref']
    else:
        return []
    dbsnp_ids = []
    for entry in xrefs.split(','):
        data = entry.strip().split(':')
        # An entry without a ':' separator cannot carry a dbSNP ID; skip it
        # instead of failing on the missing second field.
        if len(data) < 2:
            continue
        if re.match('dbsnp', data[0]) and re.match('rs', data[1]):
            dbsnp_ids.append(data[1])
    return dbsnp_ids


def _match_getev_by_dbsnp(output, getev_by_dbsnp, *autoscore_args):
    """Fill output from the first adequate GET-Evidence hit among its dbSNP IDs.

    Each ID listed in output['dbSNP'] that is present in getev_by_dbsnp is
    copied into output and scored; the search stops at the first hit whose
    autoscore is at least 2 or that is sufficiently evaluated. Extra
    positional arguments are passed on to autoscore after output.
    """
    for dbsnp_id in output["dbSNP"].split(","):
        if dbsnp_id not in getev_by_dbsnp:
            continue
        output["GET-Evidence"] = True
        copy_output_data(getev_by_dbsnp[dbsnp_id], output)
        output["autoscore"] = autoscore(output, *autoscore_args)
        output["suff_eval"] = suff_eval(output)
        output["dbSNP"] = dbsnp_id
        # Quit after first hit passing threshold
        if output["autoscore"] >= 2 or output["suff_eval"]:
            break


def match_getev(gff_in,
                getev_flat,
                transcripts_file=None,
                gene_out_file=None,
                output_file=None,
                progresstracker=None):
    """String generator returning JSON-formatted data from GET-Evidence

    Required inputs:
    gff_in: GFF-formatted string generator, text, or .gz gzip-compressed
    getev_flat: JSON-formatted text, or .gz gzip-compressed

    Optional inputs:
    transcripts_file: passed to read_transcripts; a gene report is only
        produced when both this and gene_out_file are set
    gene_out_file: file that gene_report writes to (needs transcripts_file)
    output_file: if set, print to this & generator instead yields GFF lines
    progresstracker: ProgressTracker object from progresstracker.py

    Each output line is JSON-formatted and corresponds to data for a
    particular variant. It always contains 'chromosome', 'coordinates',
    'GET-Evidence', 'genotype', 'ref_allele', 'autoscore' and at least one of
    these two possibilities: (1) 'gene' and 'amino_acid_change' or
    (2) 'dbSNP'. It may also contain 'phase', 'testable', 'reviewed',
    'suff_eval' and items copied by copy_output_data.

    Any output files opened here are closed when the generator finishes, is
    closed early, or fails.
    """
    # Load data from GET-Evidence and Genetests files.
    getev_by_aa, getev_by_dbsnp = read_getev_flat(getev_flat)
    genetests_filepath = os.path.join(os.getenv('DATA'), GENETESTS_DATA)
    genetests_clin, genetests_rev = read_genetests(genetests_filepath)

    f_json_out = None
    f_gene_out = None
    try:
        # Set up optional output, will not be compressed.
        if output_file:
            f_json_out = open(output_file, 'w')
        if gene_out_file and transcripts_file:
            gene_data = dict()
            f_gene_out = open(gene_out_file, 'w')
            transcripts = read_transcripts(transcripts_file)

        # Set up BLOSUM100 matrix to score amino acid disruptiveness.
        blosum_matrix = blosum100()

        # Set up GFF data. Can be a string generator, text, or
        # (if it ends with '.gz') a gzip-compressed text.
        if isinstance(gff_in, str) and re.search(r'\.gz$', gff_in):
            gff_data = gff.input(gzip.open(gff_in))
        else:
            gff_data = gff.input(gff_in)

        header_done = False

        for record in gff_data:
            # Have to do this after calling the first record to
            # get the iterator to read through the header data
            if (not header_done) and f_json_out:
                yield "##genome-build " + gff_data.data[1]
                yield ("# File creation date: " +
                       datetime.datetime.now().isoformat(' '))
                header_done = True

            # If outputing JSON to file, yield GFF data as it's read.
            if f_json_out:
                yield str(record)

            # Ignore regions called as matching reference.
            if record.feature == 'REF':
                continue

            # If producing a gene report, output finished genes
            # NOTE(review): genes still held in gene_data when the input ends
            # are never passed to gene_report -- confirm this is intended.
            if f_gene_out:
                to_remove = []
                for gene in gene_data:
                    if gene not in transcripts:
                        # Remove genes we don't recognize
                        to_remove.append(gene)
                    elif transcripts[gene]['end'] < record.end:
                        gene_report(f_gene_out, gene, gene_data[gene])
                        to_remove.append(gene)
                for gene in to_remove:
                    gene_data.pop(gene)

            # Track progress if a ProgressTracker was passed to us
            if progresstracker:
                progresstracker.saw(record.seqname)

            # Store data shared by every JSON line of this record as dict.
            output = dict()

            # Parse GFF attributes to find the alleles, reference allele,
            # phase, and dbSNP.
            alleles = record.attributes['alleles'].strip('"').split('/')
            if len(alleles) > 2:
                # Not sure what to do with >2 alleles! Skip it.
                continue
            if len(alleles) == 1:
                output['genotype'] = alleles[0]
            else:
                # Sort a copy: the original order is needed for phasing.
                output['genotype'] = '/'.join(sorted(alleles))
            ref_allele = record.attributes['ref_allele'].strip('"')
            output['ref_allele'] = ref_allele
            if 'phase' in record.attributes:
                # Add phase attribute for the non-reference allele;
                # if both non-reference, treat as unphased.
                phase_data = record.attributes['phase'].strip().split('/')
                if len(alleles) == 2 and len(phase_data) == 2:
                    if alleles[0] == ref_allele:
                        output['phase'] = phase_data[1]
                    elif alleles[1] == ref_allele:
                        output['phase'] = phase_data[0]
            dbsnp_ids = _dbsnp_ids_from_attributes(record.attributes)
            if dbsnp_ids:
                output["dbSNP"] = ",".join(dbsnp_ids)

            # Default presence in GET-Evidence is false, set as true later
            # if a match is found.
            output['GET-Evidence'] = False

            # Store position data
            output['chromosome'] = record.seqname
            if record.start == record.end:
                output['coordinates'] = str(record.start)
            else:
                output['coordinates'] = (str(record.start) + "-" +
                                         str(record.end))

            # If there are any amino acid changes reported, collect them,
            # dropping repeats within one gene entry.
            aa_changes = []
            if "amino_acid" in record.attributes:
                for gene_aa_aa in record.attributes['amino_acid'].split('/'):
                    aas = gene_aa_aa.split()
                    gene = aas.pop(0)
                    aa_seen = {}
                    for aa in aas:
                        if aa in aa_seen:
                            continue
                        aa_seen[aa] = 1
                        aa_changes.append([gene, aa])

            for gene, raw_aa_change in aa_changes:
                # Work on a private copy so that nothing found for one amino
                # acid change (autoscore, GET-Evidence fields, the narrowed
                # dbSNP ID, testable/reviewed) leaks into the next one.
                variant = dict(output)

                # "X" is preferred for stop, "*" can break things like URLs.
                aa_change_and_pos = re.sub(r'\*', r'X', raw_aa_change)
                # Note: parse_aa_change will call sys.exit() if it's
                # misformatted.
                aa_from, _, aa_to = parse_aa_change(aa_change_and_pos)
                variant["gene"] = gene
                variant["amino_acid_change"] = raw_aa_change

                # Check if the gene is in Genetests. If so, store result.
                if gene in genetests_clin:
                    variant["testable"] = True
                    if gene in genetests_rev:
                        variant["reviewed"] = True

                # Try to look up in GET-Evidence by amino acid change.
                aa_key = gene + "-" + aa_change_and_pos
                if aa_key in getev_by_aa:
                    copy_output_data(getev_by_aa[aa_key], variant)
                    variant["GET-Evidence"] = True
                elif "dbSNP" in variant:
                    # If not in GET-Evidence by aa, try dbsnp ID.
                    _match_getev_by_dbsnp(variant, getev_by_dbsnp,
                                          blosum_matrix, aa_from, aa_to)

                # Calculate autoscore, if not already done during dbSNP
                # selection process
                if "autoscore" not in variant:
                    variant["autoscore"] = autoscore(variant, blosum_matrix,
                                                     aa_from, aa_to)
                    if variant["GET-Evidence"]:
                        variant["suff_eval"] = suff_eval(variant)

                # This barfs on Unicode sometimes.
                try:
                    json_output = str(json.dumps(variant, ensure_ascii=False))
                except Exception:
                    variant['summary_short'] = (
                        'Summary for this variant not ' +
                        'displayed. It may contain a Unicode character ' +
                        'preventing it from being properly processed.')
                    json_output = str(json.dumps(variant, ensure_ascii=False))

                if f_json_out:
                    f_json_out.write(json_output + '\n')
                else:
                    yield json_output

                # TODO: print when beyond end of gene, not when new one seen
                if f_gene_out and 'ucsc_trans' in record.attributes:
                    # We take 1st & ignore multiple transcripts (which are
                    # rare)
                    gene = record.attributes['ucsc_trans'].split(',')[0]
                    if gene in gene_data:
                        gene_data[gene].append(variant)
                    else:
                        gene_data[gene] = [variant]

            if not aa_changes:
                # If no gene data at all, try dbsnp ID.
                if "dbSNP" in output:
                    _match_getev_by_dbsnp(output, getev_by_dbsnp)

                # If no gene data and dbSNP id is not listed in
                # GET-Evidence, don't output.
                if "autoscore" in output:
                    # This barfs on Unicode sometimes; such variants are
                    # skipped.
                    try:
                        json_output = str(
                            json.dumps(output, ensure_ascii=False))
                    except Exception:
                        continue
                    if f_json_out:
                        f_json_out.write(json_output + '\n')
                    else:
                        yield json_output
    finally:
        # Runs on normal exhaustion, on generator close and on errors.
        if f_json_out:
            f_json_out.close()
        if f_gene_out:
            f_gene_out.close()
Exemplo n.º 30
0
def main():
	# parse options
	option, args = doc_optparse.parse(__doc__)
	
	if len(args) < 2:
		doc_optparse.exit()
	
	flank = int(option.flank or 0)
	
	# try opening the file both ways, in case the arguments got confused
	try:
		gff_file = gff.input(args[1])
		twobit_file = twobit.input(args[0])
	except Exception:
		gff_file = gff.input(args[0])
		twobit_file = twobit.input(args[1])
	
	# initialize a set of variables to keep track of uniqueness, if we need them
	if option.unique:
		previous_record = None
		previous_ref_seq = None
		repetition_count = 1
	
	for record in gff_file:
		# if we're using the unique option, output the previous record only when
		# we're sure we've seen all repetitions of it
		if option.unique and record == previous_record:
			repetition_count += 1
			continue
		elif option.unique:
			if previous_record:
				previous_record.attributes["repetition_count"] = str(repetition_count)
				print FastaRecord(str(previous_record).replace("\t", "|"), previous_ref_seq)
			repetition_count = 1
			previous_record = record

		if record.seqname.startswith("chr"):
			chr = record.seqname
		else:
			chr = "chr" + record.seqname
		
		ref_seq = twobit_file[chr][(record.start - 1):record.end]

		if flank != 0:
			# calculate the flanks (these variables are 0-based)
			left_flank_start = record.start - flank - 1
			left_flank_end = record.start - 1
			if left_flank_start < 0:
				left_flank_start = 0
			
			right_flank_start = record.end
			right_flank_end = record.end + flank
			
			# now find them
			left_flank_seq = twobit_file[chr][left_flank_start:left_flank_end]
			right_flank_seq = twobit_file[chr][right_flank_start:right_flank_end]
			ref_seq = left_flank_seq + "\n\n" + ref_seq + "\n\n" + right_flank_seq
		
		if option.strand and record.strand == "-":
			ref_seq = reverse_complement(ref_seq)
		
		# we don't output the current record if we're using the unique option
		if option.unique:
			previous_ref_seq = ref_seq
		else:
			print FastaRecord(str(record).replace("\t", "|"), ref_seq)
	
	# we'll have one last record yet to output if we used the unique option
	if option.unique:
		previous_record.attributes["repetition_count"] = str(repetition_count)
		print FastaRecord(str(previous_record).replace("\t", "|"), previous_ref_seq)