Exemplo n.º 1
0
def launch_many(run_id):
    '''
Generate script paramaters and launch a bunch of bsub jobs.

Designed to be run on the cluster via an interactive shell.
Note: If this is not run on cluster, since it does not look
up a remote url for files, it won't be able to find expression
data.

'''
    print 'Launching all jobs!'

    #MAKE INPUTS 
    expr_filenames = ['soheil/expression_c4d_n4_tt_{0}.mat'.format(ttnum)
                      for ttnum in range(70)] + ['soheil/expression_c4d_n4_intercluster.mat']
    urls = [ cfg.dataURL(f) for f in expr_filenames ]
    remote_exprnames =[  cfg.dataPath(url) for url in urls ]

    inp_dicts = [dict(out_iter_num = out_iter_num,
                      in_iter_num = in_iter_num,
                      k = k,
                      beta = beta,
                      f_mix = f_mix,
                      f_sim = f_sim,
                      f_en_in = f_en_in,
                      f_en_out = f_en_out,
                      th_cor = th_cor,
                      trunc_value = trunc_value,
                      degree_bound = degree_bound,
                      filename = filename)
                 for out_iter_num in array([25],double)
                 for in_iter_num in array([100],double)
                 for k in array([6],double)
                 for beta in array([4],double)
                 for f_mix in array([2],double)
                 for f_sim in array([.8],double)
                 for f_en_in in array([1.],double)
                 for f_en_out in array([1.],double)
                 for th_cor in array([.6],double)
                 for trunc_value in array([3],double)
                 for degree_bound in array([3],double)
                 for filename in remote_exprnames ]

    

    #MAKE EYEBALL
    eyeball = bsub.eyeball(run_id, 
                           os.path.abspath(inspect.stack()[0][1]),
                           inp_dicts,
                           func = 'run_single',
                           name = 'mcmc_',
                           mem = 3)

    #LAUNCH EYEBALL JOBS
    eyeball.launch()

    
    #RETURN A LIST OF LAUNCHED JOBS
    return dict(cmds=eyeball.cmds,
                inputs = inp_dicts)
Exemplo n.º 2
0
def sort_prefixes(volume_name="cb"):
    prefix_path = config.dataPath(config.dataURL("genbank/prefixes"))
    for p in os.listdir(prefix_path):
        f = os.path.join(prefix_path, p)
        fopen = open(f)
        lines = fopen.readlines()
        lsort = sorted(lines)
        fopen.close()
        fopen = open(f, "w")
        fopen.writelines(lsort)
        fopen.close()
        print p
Exemplo n.º 3
0
def split_prefixes(volume_name="cb"):
    """splits the massive genebank accession list up by prefixes.
takes a volume name as a parameter in case the accesion list is
stored in an atypical location"""

    path = config.dataPath(config.dataURL("genbank/gb_acclist.genbank", volume_name=volume_name))
    path_home = os.path.dirname(path)
    fopen = open(path)
    prefixes = {}
    count = 0
    long_count = 0
    for l in fopen.xreadlines():
        if l[1].isdigit():
            pend = 1
        elif l[2].isdigit():
            pend = 2
        elif l[3].isdigit():
            pend = 3
        elif l[4].isdigit():
            pend = 4
        elif l[5].isdigit():
            pend = 5
        elif l[6].isdigit():
            pend = 6
        else:
            raise Exception()

        prefix = l[0:pend]

        if not prefixes.has_key(prefix):
            print "getting pre"
            prefixes[prefix] = open(os.path.join(path_home, "prefixes/" + prefix), "a")
        f = prefixes[prefix]
        f.write(l)

        count += 1
        if count > 100000:
            count = 0
            long_count += 1
            print prefix, l
            if long_count > 10:
                print prefixes
                long_count = 0
                while prefixes:
                    f = prefixes.pop(prefixes.keys()[0])
                    f.close()

    for k, p in prefixes.iteritems():
        p.close()
Exemplo n.º 4
0
def fill_all_rdb16s(reset = True):
  paths = []
  for r, ds, fs in os.walk(config.dataPath('alignments/16s')):
    for f in fs:
      if '.gbk' in f:
        paths.append(os.path.join(r,f))
  cbdb = compbio.projects.cbdb
  dbi = cbdb.getName('16s',
                     tables = get_tables(),
                     reset = np.mod(reset, 2))
  last_ofs = 0
  for p in paths:
    fopen = open(p)
    a = dbi.Alignment(file_name =config.dataURL(p))
    dbi.Session.add(a)
    dbi.Session.commit()
    count = 0 

    for rec in SeqIO.parse(fopen, 'genbank'):
      try:
        src_taxon = rec.features[0].qualifiers['db_xref'][0][6:]
      except Exception, e:
        src_taxon = None

      ann = sjson.dumps(rec.annotations, default = lambda x: x.__str__())
      seq = dbi.Sequence(name = rec.name,
                         file_name = p,
                         file_offset = last_ofs,
                         sequence = rec.seq.__str__(),
                         gb_accession = rec.id,
                         gb_accession_version = 1,
                         gb_id = None,
                         annotations = ann,
                         alignment = a,
                         source_taxon = src_taxon
                         )
      dbi.Session.add(seq)
      last_ofs = fopen.tell()
      if np.mod(count, 1000) == 0:
        print count, p, seq.source_organism
        dbi.Session.commit()
      count += 1
    dbi.Session.commit()
Exemplo n.º 5
0
def search_sorted(prefix_name, query, volume_name="cb"):
    """Performs a binary search within a sorted prefix file to find the
    genbank id for a given query accession.

    prefix_name: name of the sorted per-prefix file under prefixes/.
    query:       accession string (first comma-separated column).
    volume_name: data volume holding the genbank directory.
    Returns the third comma-separated column of the matching line.
    Raises Exception when the query is not present.
    """

    prefix_file = os.path.join(
        config.dataPath(config.dataURL("genbank", volume_name=volume_name)), "prefixes/" + prefix_name
    )
    fopen = open(prefix_file)
    # BUGFIX: the handle was previously leaked on both return and raise;
    # try/finally guarantees it is closed.
    try:
        size = os.path.getsize(prefix_file)
        start = 0
        stop = size

        hplast = 0
        while 1:
            halfpt = (start + stop) / 2
            fopen.seek(halfpt)
            if halfpt == 0:
                line = fopen.readline()
            else:
                # We usually land mid-line: discard the partial line and
                # use the next complete one.
                fopen.readline()
                line = fopen.readline()

            c0 = line.split(",")[0]

            if c0 == query:
                return line.split(",")[2].strip()
            elif c0 < query:
                start = halfpt
            else:
                stop = halfpt

            # No progress between iterations means the query is absent.
            # BUGFIX: message previously read "Query not for: %s".
            if halfpt == hplast:
                raise Exception("Query not found: %s" % query)
            hplast = halfpt
            if start == stop:
                raise Exception("Query not found: %s" % query)
    finally:
        fopen.close()
Exemplo n.º 6
0
#!/usr/bin/env python
import compbio.config as cfg
import sys

def usage():
	print '''
Usage: 
  pydatapath.py path volume
'''

if __name__ == '__main__':
    # pydatapath.py <path> <volume>: resolve a (path, volume) pair to a
    # local filesystem path and write it to stdout (no trailing newline,
    # so shell command substitution gets a clean value).
    path = sys.argv[1] if len(sys.argv) > 1 else ''
    volume = sys.argv[2] if len(sys.argv) > 2 else ''

    if path == 'usage':
        usage()
        exit(0)

    # BUGFIX/cleanup: removed the unused 'host' variable and the redundant
    # 'localpath' re-read of sys.argv[1].
    path = cfg.dataPath(cfg.dataURL(path, volume_name = volume))
    sys.stdout.write(path)
    exit(0)
Exemplo n.º 7
0
import os
import compbio.config as config

for r, d, fs in os.walk(config.dataPath(config.dataURL("unseen_data"))):
    for f in fs:
        if ".stk" in f:
            print f
Exemplo n.º 8
0
def fill_db( name = 'bacterial_genomes', reset = False,
              postgres = False, host = 'broad'):
    dbi = cbdb.getName(
                       name,
                       postgres = postgres,
                       tables = get_tables(),
                       reset = np.mod(reset, 2), 
                       host = host)


    paths = []
    for r,ds, fs in os.walk('/Volumes/ganymede/all.gbk/'):
      for f in fs:
        if 'gbk' in f: paths.append(os.path.join(r, f))
        count = 0 
    

    for p in paths:
      
      if count < 1668:
        count += 1
        continue
      count += 1
      fopen = open(p)
      for rec in SeqIO.parse(fopen, 'genbank'):
        f0 = rec.features[0]
        if f0.type == 'source':
          source_taxon = f0.qualifiers['db_xref'][0][6:]
          source_organism=f0.qualifiers['organism'][0]
        else:
          source_taxon = None
          source_organism = None
          
        fa_seqpath = 'genomes/'+rec.id+'.fa'
        fa_sequrl = config.dataURL(fa_seqpath)
        fa_seqfile = config.dataPath(fa_sequrl)
        fopen = open(fa_seqfile,'w')
        SeqIO.write(rec,fopen, 'fasta')
        fopen.close()

        adds = []
        genome = dbi.Genome(name = rec.name, 
                           seq_url =fa_sequrl,
                           source_taxon = source_taxon,
                           source_organism = source_organism,
                           gb_accession = rec.id,
                           annotations = rec.annotations.__str__())

        #adds.append(genome)
        print 'adding genome ' + source_organism
        dbi.Session.add(genome)
        print 'commiting update ' 
        dbi.Session.commit()
        print 'genome added! '
        for f in rec.features:
          feature = dbi.Feature(type = f.type,
                                start = f.location.start.position,
                                start_ext = f.location.start.extension,
                                end = f.location.end.position,
                                end_ext = f.location.end.extension,
                                strand = f.strand,
                                genomeobj = genome)
          #print 'adding feature ' + f.type
          #dbi.Session.add(feature)
          adds.append(feature)
          for k,v in f.qualifiers.iteritems():
            q = dbi.Qualifier(key = k,
                                      value = v.__str__(),
                                      featureobj = feature)
            #dbi.Session.add(q)
            adds.append(q)
          for sf in f.sub_features:
            sub = dbi.SubFeature(type = sf.type,
                                 start = sf.location.start.position,
                                 start_ext = sf.location.start.extension,
                                 end =sf.location.end.position,
                                 end_ext = sf.location.end.extension,
                                 strand = sf.strand,
                                 featureobj = feature)
            adds.append(sub)
            #dbi.Session.add(sub)
            for k,v in sf.qualifiers.iteritems():
              q = dbi.Qualifier(key = k,
                                value = v.__str__(),
                                subfeatureobj = sf)
              #Session.add(q)
              adds.append(q)
                                
        dbi.Session.add_all(adds)



        if np.mod(count, 2) == 0:
          print count
#print count, p , seq.source_organism
          print 'committing update'
          dbi.Session.commit()
          print 'update commited!'
      dbi.Session.commit()