def _load_json_file(path):
    """Return the JSON document stored in the file at `path`.

    The open handle is passed through vskip() before parsing, as the
    rest of the project does (presumably to step over the version
    tracking header -- confirm against vtrack).
    """
    handle = open(path)
    try:
        return json.load(vskip(handle))
    finally:
        # Release the file even when the JSON is malformed.
        handle.close()


def count(json_tar, json_specs, out=sys.stdout):
    """Print a count matrix of the target genes in a given GO category.

    Arguments:
      json_tar -- path of a JSON file mapping each protein to a list
          of target genes. It must hold a 'total' entry; an optional
          'NA' entry is discarded.
      json_specs -- path of a JSON file mapping each GO category to a
          list of genes.
      out -- object with a write() method receiving the table
          (sys.stdout by default).

    Output is tab-separated. There is one row per protein, sorted by
    name, followed by a 'total' row. A row holds its name, one count
    per GO category and finally the number of its genes that belong to
    at least one GO category. The header line only lists the GO
    categories followed by 'total', so it has one field fewer than
    the rows.

    Raises KeyError if the target file has no 'total' entry.
    """
    specs = _load_json_file(json_specs)
    tar = _load_json_file(json_tar)

    # First thing: drop the genes tagged "NA" if present.
    tar.pop('NA', None)
    # The mandatory "total" entry is kept apart: it makes the last row.
    total = tar.pop('total')

    # Turn to sets to remove potential duplicate entries.
    for key in specs:
        specs[key] = set(specs[key])
    for key in tar:
        tar[key] = set(tar[key])

    # Fix the column order once so header and rows always agree.
    categories = list(specs)

    # Get all GO-annotated genes.
    annotated = set()
    for genes in specs.values():
        annotated.update(genes)

    def make_row(name, genes):
        """Return the table row (list of strings) for one gene set."""
        per_category = [
            str(len(specs[GO].intersection(genes))) for GO in categories
        ]
        return [name] + per_category + [str(len(annotated.intersection(genes)))]

    # Proteins in line, GO terms in column. The last line contains the
    # counts for all the genes of the "total" entry.
    result = [make_row(prot, tar[prot]) for prot in sorted(tar)]
    result.append(make_row('total', total))

    # Print the table header, then the table itself. Nothing is written
    # before all the counts have been computed.
    out.write('\t'.join(categories) + '\ttotal\n')
    for line in result:
        out.write('\t'.join(line) + '\n')
def parseGeneAssociations(filename, comment_char='!', columns=(2,5)):
    """Parse a tab-separated gene association file.

    Arguments:
      filename -- path of the association file.
      comment_char -- prefix marking comment lines. The text of those
          lines is collected as version information.
      columns -- 1-based column numbers of the gene ID and of the GO
          term, in that order.

    Returns a dictionary with two entries:
      'associations' -- list of (GO term, gene) tuples in file order.
      'assoVersion' -- list of strings: a fixed title line followed by
          the comment lines, without their prefix and trailing
          whitespace.

    A line is skipped when its gene ID starts with '__' or when the
    text 'NOT' appears anywhere in the line.

    Raises IndexError if a data line has fewer columns than requested.
    """
    gene_index = columns[0] - 1
    term_index = columns[1] - 1
    # Strip the whole prefix, so multi-character prefixes work too.
    prefix_length = len(comment_char)

    assoVersion = ['-- associations version information --']
    pairlist = []

    handle = open(filename)
    try:
        for line in vskip(handle):
            if line.startswith(comment_char):
                assoVersion.append(line[prefix_length:].rstrip())
                continue
            # Parse by specified columns.
            items = line.split('\t')
            gene = items[gene_index]
            # Skip genes with no canonical ID (their ID starts with
            # '__') and lines with the 'NOT' keyword.
            if gene[:2] == '__' or 'NOT' in line:
                continue
            pairlist.append((items[term_index], gene))
    finally:
        # Release the file even if a line cannot be parsed.
        handle.close()

    return {
        'associations': pairlist,
        'assoVersion': assoVersion
    }
#!/usr/bin/env python # -*- coding: utf-8 -*- try: import json except ImportError: import simplejson as json import sys import random from vtrack import vheader, vskip def shuffle(jsontar): """Modify jsontar in place, using the 'total' field.""" sample = random.Random().sample for prot in jsontar: jsontar[prot] = sample(jsontar['total'], len(jsontar[prot])) if __name__ == '__main__': jsontar = json.load(vskip(open(sys.argv[1]))) shuffle(jsontar) sys.stdout.write(vheader(*sys.argv)) json.dump(jsontar, sys.stdout, indent=4)
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import json
import re

from vtrack import vskip

##########################
# get the mapping table  #
#     GOID    GOTERM     #
##########################
# Each key of the JSON dictionary given as first argument must start
# with a GO identifier ("GO:" followed by digits) and contain the term
# name between parentheses. One "GOID<tab>GOTERM" line is written to
# stdout per key.

# Compiled once; raw strings keep the backslashes literal.
GO_ID_PATTERN = re.compile(r'^GO:[0-9]+')
GO_TERM_PATTERN = re.compile(r'\((.*)\)')

# get the dict_go from G.F
with open(sys.argv[1]) as infile:
    dict_go = json.load(vskip(infile))

for key in dict_go:
    # A key that does not follow the expected format makes match() or
    # search() return None, hence an AttributeError here.
    goID = GO_ID_PATTERN.match(key).group(0)
    # Greedy match: from the first '(' to the last ')' of the key.
    goTERM = GO_TERM_PATTERN.search(key).group(1)
    sys.stdout.write("%s\t%s\n" % (goID, goTERM))
def _read_table_rows(path):
    """Return the rows of a tab-separated file as lists of strings.

    The handle is passed through vskip() first. Lines starting with
    '#' are ignored, trailing whitespace is stripped and every
    occurrence of 'chr' is removed from the line before splitting.
    """
    handle = open(path, 'r')
    try:
        return [
            line.rstrip().replace('chr', '').split('\t')
            for line in vskip(handle)
            if not line.startswith('#')
        ]
    finally:
        # Release the file even if reading fails.
        handle.close()


def JSONtargets(mappingfile, bindingfile):
    """Create a gene target set in JSON format from a gene mapping
    file and a discrete binding profile.

    Arguments:
      mappingfile -- path of a tab-separated file with columns gene ID,
          sequence name, start, end, strand ('+' or '-'). A header row
          is recognized by 'start' and 'end' in the third and fourth
          columns and skipped.
      bindingfile -- path of a tab-separated file whose header row
          gives feature names from the fifth column on, and whose
          other rows hold element ID, sequence name, start, end and
          one '0'/'1' flag per feature.

    Each gene is associated with the element returned for it by
    get_closest(). If dist() between the gene TSS and that element
    exceeds MAXDIST the gene goes to the 'NA' list, otherwise it goes
    to 'total' and to the list of every feature flagged '1' on that
    element.

    Nothing is returned: the version tracking header and the JSON
    dictionary are written to sys.stdout.

    Raises ValueError if a strand is neither '+' nor '-'.
    """
    # Read in gene mapping and remove the header if present.
    mapping = _read_table_rows(mappingfile)
    if mapping and mapping[0][2:4] == ['start', 'end']:
        mapping.pop(0)

    # Collect TSS, if gene is on +, TSS is on start, else on end.
    # Example: TSS['FBgn0031208'] = ('2L', 7529)
    TSS = {}
    for row in mapping:
        strand = row[4]
        if strand == '+':
            position = row[2]
        elif strand == '-':
            position = row[3]
        else:
            raise ValueError(
                "unknown strand %r for gene %s" % (strand, row[0])
            )
        TSS[row[0]] = (row[1], int(position))

    # Read in binding data.
    binding = _read_table_rows(bindingfile)
    # Get feature names and remove (pop) the header.
    # Example: features = ['D005', 'D007', ...]
    features = binding.pop(0)[4:]

    # "total" and "NA" are mutually exclusive lists of genes.
    targets = {'total': [], 'NA': []}
    for feature in features:
        targets[feature] = []

    # Collect mapping information (seqname, start, end) and
    # binding info (0/1).
    mapinfo = {}
    bindinfo = {}
    for row in binding:
        # Example: mapinfo['r5GATC2L00037'] = ('2L', 5301, 6026)
        mapinfo[row[0]] = (row[1], int(row[2]), int(row[3]))
        # Example: bindinfo['r5GATC2L00037'] = ['0','0','1',...]
        bindinfo[row[0]] = row[4:]

    # Get the closest feature to TSS.
    close_elt = get_closest(TSS, mapinfo, dist = dist)

    for geneID in close_elt:
        element = close_elt[geneID]
        if dist(TSS[geneID], mapinfo[element]) > MAXDIST:
            # The gene is too far. Push it to NA.
            targets['NA'].append(geneID)
        else:
            targets['total'].append(geneID)
            # The gene gets the status of the binding element closest
            # to its TSS.
            # Example pairs: [('D005', '0'), ('D007', '0'), ...]
            for feature, bound in zip(features, bindinfo[element]):
                if bound == '1':
                    targets[feature].append(geneID)

    # Print the version tracking header and the JSON data.
    sys.stdout.write(vheader(*sys.argv))
    json.dump(targets, sys.stdout, indent=4)
If the FBgn is not in the table, it is flanked by '__' which is something to grep for. """ from __future__ import with_statement import re import sys from vtrack import vheader, vskip canonID = {} # Read-in the lookup table in a dict. with open(sys.argv[1]) as lookup: for line in vskip(lookup): canonID.update([line.rstrip().split('\t')]) def to_canonID(FBmatch): return canonID.get( FBmatch.group(), '__' + FBmatch.group() ) # Write the vheader. sys.stdout.write(vheader(*sys.argv)) # Read-in arg file and update FBgn line by line. with open(sys.argv[2]) as argfile: for line in argfile: sys.stdout.write( re.sub('FBgn[0-9]{7}', to_canonID, line)