Exemplo n.º 1
0
def count(json_tar, json_specs, out=sys.stdout):
   """Write a tab-separated count matrix of target genes per GO category.

   Arguments:
      json_tar: path to a JSON file mapping each protein name to a
         list of target genes. It must contain a "total" key; an
         optional "NA" key is discarded.
      json_specs: path to a JSON file mapping each GO category to a
         list of genes.
      out: writable file-like object receiving the table (defaults
         to the standard output).

   The first line written is the header: the GO categories followed
   by "total". Every following line starts with a row name (the
   proteins in sorted order, then "total") followed by the number of
   its genes found in each GO category and, in the last column, the
   number of its genes present in at least one GO category.

   Raises KeyError if the target file has no "total" key.
   """

   # Both files go through vskip (from vtrack) before JSON parsing;
   # presumably it skips the version tracking header -- confirm.
   # The files are closed as soon as they are parsed.
   with open(json_specs) as specs_file:
      specs = json.load(vskip(specs_file))
   with open(json_tar) as tar_file:
      tar = json.load(vskip(tar_file))

   # First thing: remove the genes tagged "NA" if present.
   tar.pop('NA', None)
   # There must be a "total" key of the target dictionary
   # that will be used for the counts.
   total = set(tar.pop('total'))

   # Turn to sets to remove potential duplicate entries. The
   # dictionaries are updated in place to keep their key order.
   for key in specs: specs[key] = set(specs[key])
   for key in tar: tar[key] = set(tar[key])

   # Get all GO-annotated genes.
   annotated = set().union(*specs.values())

   def count_row(name, genes):
      """Return the table line for `genes`, labelled `name`."""
      return (
         [name] +
         [str(len(specs[GO].intersection(genes))) for GO in specs] +
         [str(len(annotated.intersection(genes)))]
      )

   # Proteins in line, GO terms in column. The last line contains
   # the total genes with a GO term.
   result = [count_row(prot, tar[prot]) for prot in sorted(tar)]
   result.append(count_row('total', total))

   # Print the table header. It has no entry for the row names, so
   # it is one field shorter than the lines below.
   out.write('\t'.join(specs) + '\ttotal\n')
   # Print the table.
   for line in result:
      out.write('\t'.join(line) + '\n')
Exemplo n.º 2
0
def parseGeneAssociations(filename, comment_char='!', columns=(2,5)):
   """Parse a tab-separated gene association file.

   Arguments:
      filename: path of the association file to read.
      comment_char: prefix marking the header lines. Their text,
         without the prefix and trailing whitespace, is collected as
         version information.
      columns: 1-based positions of the gene column and of the GO
         term column, in that order.

   Returns a dictionary with two keys: 'associations', the list of
   (GOterm, gene) tuples in file order, and 'assoVersion', a list
   made of a title line followed by the header lines.

   Lines whose gene starts with '__' (no canonical ID) and lines
   containing 'NOT' anywhere are skipped. Raises IndexError if a
   data line has fewer fields than the requested columns.
   """
   assoVersion = ['-- associations version information --']
   pairlist = []
   # The file is closed when parsing ends, even on error.
   with open(filename) as assofile:
      for line in vskip(assofile):
         if line.startswith(comment_char):
            # Drop the whole prefix, whatever its length.
            assoVersion.append(line[len(comment_char):].rstrip())
            continue
         # Parse by specified columns. The end of line is removed
         # first so that the last field does not carry a newline.
         items = line.rstrip('\r\n').split('\t')
         gene = items[columns[0]-1]
         # Skip gene with no canonical ID (prefixed with '__')
         # and lines with the 'NOT' keyword
         if gene.startswith('__') or 'NOT' in line:
            continue
         GOterm = items[columns[1]-1]
         pairlist.append((GOterm, gene))

   return {
         'associations': pairlist,
         'assoVersion': assoVersion
      }
Exemplo n.º 3
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

try:
   import json
except ImportError:
   import simplejson as json
import sys
import random
from vtrack import vheader, vskip

def shuffle(jsontar):
   """Randomize the targets of every protein of jsontar, in place.

   The list of each protein is replaced by a list of the same length
   drawn without replacement from jsontar['total'].

   The 'total' and 'NA' entries are bookkeeping lists, not proteins,
   and are left untouched: resampling 'NA' would fill it with genes
   of 'total' and fails when 'NA' is the longer list.

   Raises KeyError if jsontar has no 'total' entry and ValueError if
   a protein has more targets than 'total' has genes.
   """
   population = jsontar['total']
   sample = random.Random().sample
   for prot in jsontar:
      if prot in ('total', 'NA'):
         continue
      jsontar[prot] = sample(population, len(jsontar[prot]))

if __name__ == '__main__':
   # Usage: the first argument is the JSON target file to shuffle.
   # try/finally (rather than 'with') closes the input while staying
   # valid on the old interpreters the simplejson fallback is for.
   infile = open(sys.argv[1])
   try:
      jsontar = json.load(vskip(infile))
   finally:
      infile.close()
   shuffle(jsontar)
   # Write the version tracking header, then the shuffled targets.
   sys.stdout.write(vheader(*sys.argv))
   json.dump(jsontar, sys.stdout, indent=4)
Exemplo n.º 4
0
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import json
import re
from vtrack import vskip

##########################
# get the mapping table  #
# GOID   GOTERM          #

# get the dict_go from G.F
# The file is closed as soon as the dictionary is loaded.
with open(sys.argv[1]) as go_file:
   dict_go = json.load(vskip(go_file))

# Raw strings keep the backslashes of the patterns intact. The keys
# must start with a GO identifier and contain the term in parentheses.
goid_pattern = re.compile(r'^GO:[0-9]+')
goterm_pattern = re.compile(r'\((.*)\)')

# Write one "GOID<tab>GOTERM" line per key of the dictionary.
for key in dict_go:
   goID = goid_pattern.match(key).group(0)
   goTERM = goterm_pattern.search(key).group(1)
   sys.stdout.write("%s\t%s\n" % (goID, goTERM))
Exemplo n.º 5
0
def _read_tsv_rows(path):
   """Return the rows of a tab-separated file as lists of fields.

   Lines starting with '#' are dropped. Trailing whitespace and every
   occurrence of 'chr' are removed from the other lines.
   """
   with open(path, 'r') as handle:
      return [
            line.rstrip().replace('chr', '').split('\t')
            for line in vskip(handle)
            if line[0] != '#'
         ]


def JSONtargets(mappingfile, bindingfile):
   """Write the gene targets of each feature, in JSON format, to the
   standard output.

   Arguments:
      mappingfile: path to a tab-separated gene mapping file with
         columns gene ID, sequence name, start, end and strand
         ('+' or '-'). A header line with 'start' and 'end' in the
         third and fourth columns is ignored.
      bindingfile: path to a tab-separated discrete binding profile.
         Its first line is a header with the feature names from the
         fifth column on. The other lines hold an element ID, a
         sequence name, start, end and one '0'/'1' flag per feature.

   Every gene is assigned the binding element that get_closest
   associates with its TSS. Genes for which dist() exceeds MAXDIST go
   to the 'NA' list. The others go to 'total' and to the list of
   every feature flagged '1' on their element. The output is the
   version tracking header followed by the JSON dictionary.

   Relies on get_closest, dist and MAXDIST, defined outside of this
   function. Returns None. Raises ValueError if a strand is neither
   '+' nor '-'.
   """
   # Read in gene mapping. Skip comment lines and remove stray
   # 'chr' sometimes present in chromosome names.
   mapping = _read_tsv_rows(mappingfile)

   # Remove the header if present (recognized by 'start' and
   # 'end' in third and fourth columns).
   if mapping[0][2:4] == ['start','end']: mapping.pop(0)

   # Collect TSS, if gene is on +, TSS is on start (3rd column),
   # else on end (4th column).
   tss_column = {'+': 2, '-': 3}
   TSS = {}
   for row in mapping:
      if row[4] not in tss_column:
         raise ValueError(
               'unknown strand %r for gene %s' % (row[4], row[0])
            )
      # Arrange geneIDs by TSS in a dictionary.
      # Example: TSS['FBgn0031208'] = ('2L', 7529)
      TSS[row[0]] = (row[1], int(row[tss_column[row[4]]]))


   # Read in binding data. Skip comment lines and remove
   # 'chr' on chromosome names.
   binding = _read_tsv_rows(bindingfile)
   # Get feature names and remove (pop) the header.
   # Example: features = ['D005', 'D007', ...]
   features = binding.pop(0)[4:]
   # "total" and "NA" are mutually exclusive lists of genes.
   targets = {'total': [], 'NA': []}
   for feature in features:
      targets[feature] = []


   # Collect mapping information (seqname, start, end) and
   # binding info ('0'/'1').
   mapinfo = {}
   bindinfo = {}
   for row in binding:
      # Example: mapinfo['r5GATC2L00037'] = ('2L', 5301, 6026)
      mapinfo[row[0]] = (row[1], int(row[2]), int(row[3]))
      # Example: bindinfo['r5GATC2L00037'] = ['0','0','1',...]
      bindinfo[row[0]] = row[4:]


   # Get the closest feature to TSS (gene ID -> element ID).
   close_elt = get_closest(TSS, mapinfo, dist = dist)


   for geneID in close_elt:
      element = close_elt[geneID]
      if dist(TSS[geneID], mapinfo[element]) > MAXDIST:
         # The gene is too far. Push it to NA.
         targets['NA'].append(geneID)
         continue
      targets['total'].append(geneID)
      # The gene gets the status of the binding element closest
      # to its TSS.
      # Example of pairs: [('D005', '0'), ('D007', '0'), ...]
      for (feature, yes) in zip(features, bindinfo[element]):
         if yes == '1':
            targets[feature].append(geneID)


   # Print the version tracking header and the JSON data.
   sys.stdout.write(vheader(*sys.argv))
   json.dump(targets, sys.stdout, indent=4)
Exemplo n.º 6
0
If the FBgn is not in the table, it is prefixed with '__',
which is something to grep for.
"""

from __future__ import with_statement

import re
import sys

from vtrack import vheader, vskip

# Lookup table filled from the file given as first argument:
# first column -> second column.
canonID = {}

with open(sys.argv[1]) as lookup:
   for line in vskip(lookup):
      # Each line must hold exactly two tab-separated fields.
      key, value = line.rstrip().split('\t')
      canonID[key] = value

def to_canonID(FBmatch):
   """Return the replacement text for the regex match FBmatch.

   The matched text is looked up in canonID. When it is not a key of
   the table, the matched text itself is returned, prefixed with '__'.
   """
   matched = FBmatch.group()
   if matched in canonID:
      return canonID[matched]
   return '__' + matched

# Write the vheader.
sys.stdout.write(vheader(*sys.argv))
# Read-in arg file and update FBgn line by line.
with open(sys.argv[2]) as argfile:
   for line in argfile:
      sys.stdout.write(
            re.sub('FBgn[0-9]{7}', to_canonID, line)