Пример #1
0
class App(object):

  def __init__(self, host, user, passwd,
               study_label,
               maker, model, release):
    self.kb = KB(driver='omero')(host, user, passwd)
    self.mset = self.kb.get_snp_markers_set(maker, model, release)
    self.logger = logging.getLogger()
    if not self.mset:
      raise ValueError('SNPMarkersSet[%s,%s,%s] has not been defined.'
                       % (maker, model, release))
    #--
    alabel = 'load_genotypes-setup-%s' % time.time()
    self.asetup = self.kb.factory.create(self.kb.ActionSetup,
                                         {'label' : alabel,
                                          'conf'  : ''}).save()
    #--
    dmaker, dmodel, drelease = 'CRS4', 'load_genotypes', '0.1'
    dlabel = '%s-%s-%s' % (dmaker, dmodel, drelease)
    device = self.kb.get_device(dlabel)
    if not device:
      device = self.kb.factory.create(self.kb.Device,
                                      {'label' : dlabel,
                                       'maker' : dmaker,
                                       'model' : dmodel,
                                       'release' : drelease}).save()
    self.device = device
    #-- FIXME this will break if study is not defined.
    self.study = self.kb.get_study(study_label)

  def check_snp_markers_set(self, marker_types, marker_names):
    self.logger.info('start checking snp_markers_set')
    mdefs, msetc = self.kb.get_snp_markers_set_content(self.mset)
    rs_labels = mdefs['rs_label']
    for t, n in it.izip(marker_types, marker_names):
      if t == 'M':
        if not n in rs_labels:
          msg = 'marker %s is not in the specified SNPMarkersSet' % n
          self.logger.critical(msg)
          raise ValueError(msg)
    self.logger.info('done checking snp_markers_set')

  def create_action(self, target):
    conf = {'setup' : self.asetup,
            'device' : self.device,
            'actionCategory' : self.kb.ActionCategory.MEASUREMENT,
            'operator' : 'Alfred E. Neumann',
            'context'  : self.study,
            'target'   : target,
            }
    action = self.kb.factory.create(self.kb.ActionOnVessel, conf).save()
    return action

  def create_data_sample(self, action, label):
    conf = {'snpMarkersSet' : self.mset,
            'label' : label,
            'status' : self.kb.DataSampleStatus.USABLE,
            'action' : action}
    return self.kb.factory.create(self.kb.GenotypeDataSample, conf).save()

  def load(self, pedfile, datfile, conf_value=1.0):
    pr = PedReader(pedfile, datfile, conf_value)
    self.check_snp_markers_set(pr.marker_types, pr.marker_names)
    #--
    self.logger.info('start loading from pedfile %s' % pedfile.name)
    for x in pr:
      sample = self.kb.get_vessel(x['sample_label'])
      if not sample:
        self.logger.error('No sample with label %s in VL' % x['sample_label'])
        continue
      action = self.create_action(sample)
      avid = action.id
      action.unload()
      data_sample = self.create_data_sample(action, x['label'])
      data_object = self.kb.add_gdo_data_object(avid, data_sample,
                                                x['probs'], x['confs'])
      self.logger.info('-- loaded %s' % x['label'])
    self.logger.info('done loading from pedfile %s' % pedfile.name)
"""

mset.load_markers()
kb.update_snp_positions(mset.markers, ref_genome)
data_sample_by_id = {}
family = []
for i, ind in enumerate(kb.get_individuals(study)):
  family.append(ind)
  action = kb.create_an_action(study, target=ind, doc='fake dataset')
  conf = {'label' : 'taq-%03d' % i,
          'status' : kb.DataSampleStatus.USABLE,
          'action' : action,
          'snpMarkersSet' : mset}
  data_sample = kb.factory.create(kb.GenotypeDataSample, conf).save()
  probs, conf = make_fake_data(mset)
  do = kb.add_gdo_data_object(action, data_sample, probs, conf)
  data_sample_by_id[ind.id] = data_sample

""" ..

Note how we first create a DataSample object (GenotypeDataSample)
which basically keeps track of the fact that there exists a genotyping
data set defined on a given snp markers set, and then we provide an
actual data object that describes the physical object that contains
the real data. The idea is that there could be many instances, data
equivalent, that link to the same DataSample, e.g., on different file
systems, in different formats and so on.
"""

""" ..
Пример #3
0
class App(object):
    def __init__(self, host, user, passwd, study_label, maker, model, release):
        self.kb = KB(driver='omero')(host, user, passwd)
        self.mset = self.kb.get_snp_markers_set(maker, model, release)
        self.logger = logging.getLogger()
        if not self.mset:
            raise ValueError('SNPMarkersSet[%s,%s,%s] has not been defined.' %
                             (maker, model, release))
        #--
        alabel = 'load_genotypes-setup-%s' % time.time()
        self.asetup = self.kb.factory.create(self.kb.ActionSetup, {
            'label': alabel,
            'conf': ''
        }).save()
        #--
        dmaker, dmodel, drelease = 'CRS4', 'load_genotypes', '0.1'
        dlabel = '%s-%s-%s' % (dmaker, dmodel, drelease)
        device = self.kb.get_device(dlabel)
        if not device:
            device = self.kb.factory.create(
                self.kb.Device, {
                    'label': dlabel,
                    'maker': dmaker,
                    'model': dmodel,
                    'release': drelease
                }).save()
        self.device = device
        #-- FIXME this will break if study is not defined.
        self.study = self.kb.get_study(study_label)

    def check_snp_markers_set(self, marker_types, marker_names):
        self.logger.info('start checking snp_markers_set')
        mdefs, msetc = self.kb.get_snp_markers_set_content(self.mset)
        rs_labels = mdefs['rs_label']
        for t, n in it.izip(marker_types, marker_names):
            if t == 'M':
                if not n in rs_labels:
                    msg = 'marker %s is not in the specified SNPMarkersSet' % n
                    self.logger.critical(msg)
                    raise ValueError(msg)
        self.logger.info('done checking snp_markers_set')

    def create_action(self, target):
        conf = {
            'setup': self.asetup,
            'device': self.device,
            'actionCategory': self.kb.ActionCategory.MEASUREMENT,
            'operator': 'Alfred E. Neumann',
            'context': self.study,
            'target': target,
        }
        action = self.kb.factory.create(self.kb.ActionOnVessel, conf).save()
        return action

    def create_data_sample(self, action, label):
        conf = {
            'snpMarkersSet': self.mset,
            'label': label,
            'status': self.kb.DataSampleStatus.USABLE,
            'action': action
        }
        return self.kb.factory.create(self.kb.GenotypeDataSample, conf).save()

    def load(self, pedfile, datfile, conf_value=1.0):
        pr = PedReader(pedfile, datfile, conf_value)
        self.check_snp_markers_set(pr.marker_types, pr.marker_names)
        #--
        self.logger.info('start loading from pedfile %s' % pedfile.name)
        for x in pr:
            sample = self.kb.get_vessel(x['sample_label'])
            if not sample:
                self.logger.error('No sample with label %s in VL' %
                                  x['sample_label'])
                continue
            action = self.create_action(sample)
            avid = action.id
            action.unload()
            data_sample = self.create_data_sample(action, x['label'])
            data_object = self.kb.add_gdo_data_object(avid, data_sample,
                                                      x['probs'], x['confs'])
            self.logger.info('-- loaded %s' % x['label'])
        self.logger.info('done loading from pedfile %s' % pedfile.name)
class markers_set(unittest.TestCase):

  def __init__(self, name):
    super(markers_set, self).__init__(name)
    self.kill_list = []

  def setUp(self):
    self.kb = KB(driver='omero')(OME_HOST, OME_USER, OME_PASSWD)
    conf = {
      'label': 'TEST-%f' % time.time(),
      'description': 'unit test garbage',
      }
    self.study = self.kb.factory.create(self.kb.Study, conf).save()
    self.kill_list.append(self.study)
    self.action = self.kb.create_an_action(self.study)
    self.kill_list.append(self.action)

  def tearDown(self):
    self.kill_list.reverse()
    for x in self.kill_list:
      self.kb.delete(x)
    self.kill_list = []

  def create_markers(self, N):
    def marker_generator():
      for i in range(N):
        label = 'A%f-%d' % (time.time(), i)
        yield (label, label, 'ACCA[A/B]TACCA')
    source, context, release = 'unit_testing', 'markers_set', '%f' % time.time()
    ref_rs_genome, dbsnp_build = 'foo-rs-genome', 123000
    lvs = self.kb.create_markers(source, context, release,
                                 ref_rs_genome, dbsnp_build,
                                 marker_generator(), self.action)
    return lvs

  def create_snp_markers_set(self, lvs):
    label = 'ams-%f' % time.time()
    maker, model, release = 'FOO', 'FOO1', '%f' % time.time()
    markers_selection = [(v[1], i, False) for i, v in enumerate(lvs)]
    mset = self.kb.create_snp_markers_set(label, maker, model, release,
                                          len(lvs),
                                          markers_selection, self.action)
    self.kill_list.append(mset)
    return mset

  def create_alignments(self, mset, ref_genome, n_duplicates):
    mset.load_markers()
    self.assertTrue(len(mset) > 0)
    n_aligns = len(mset.markers) + n_duplicates
    pos = []
    def insert_duplicates(markers):
      count = 0
      for i, m in enumerate(markers):
        n_copies = 1
        if count < n_duplicates:
          count += 1
          n_copies = 2
          r = (m[0], random.randint(1,26), 22 + i*1000, True,
               'A' if (i%2)== 0 else 'B', n_copies)
          yield r
        r = (m[0], random.randint(1,26), 1 + i*1000, True,
             'A' if (i%2)== 0 else 'B', n_copies)
        pos.append((0,0) if n_copies > 1 else (r[1], r[2]))
        yield r
    aligns = [x for x in insert_duplicates(mset.markers)]
    random.shuffle(aligns)
    self.kb.align_snp_markers_set(mset, ref_genome, aligns, self.action)
    return pos

  def create_data_sample(self, mset, label):
    conf = {
      'label': label,
      'status': self.kb.DataSampleStatus.USABLE,
      'action': self.action,
      'snpMarkersSet': mset,
      }
    data_sample = self.kb.factory.create(self.kb.GenotypeDataSample,
                                         conf).save()
    self.kill_list.append(data_sample)
    return data_sample

  def create_data_object(self, data_sample, add_nan=False):
    probs, confs = make_fake_data(data_sample.snpMarkersSet, add_nan)
    do = self.kb.add_gdo_data_object(self.action, data_sample, probs, confs)
    self.kill_list.append(do)
    return probs, confs

  def create_aligned_mset(self, N, N_dups, ref_genome):
    lvs = self.create_markers(N)
    mset = self.create_snp_markers_set(lvs)
    pos = self.create_alignments(mset, ref_genome, N_dups)
    return mset, pos

  def test_creation_destruction(self):
    N = 32
    lvs = self.create_markers(N)
    mset = self.create_snp_markers_set(lvs)
    mset.load_markers()
    self.assertEqual(len(mset), N)
    for lv, m in it.izip(lvs, mset.markers):
      self.assertEqual(lv[1], m[0])

  def test_align(self):
    N = 16
    N_dups = 4
    ref_genome = 'g' + ('%f' % time.time())[-14:]
    mset, pos = self.create_aligned_mset(N, N_dups, ref_genome)
    mset.load_alignments(ref_genome)
    for p, m in it.izip(pos, mset.get_markers_iterator()):
      self.assertEqual(p, m.position)

  def test_read_ssc(self):
    N = 16
    N_dups = 4
    ref_genome = 'g' + ('%f' % time.time())[-14:]
    mset, pos = self.create_aligned_mset(N, N_dups, ref_genome)
    mset.load_markers(additional_fields=['label'])
    probs, confs = make_fake_data(mset)
    sample_id = 'ffoo-%f' % time.time()
    fn = tempfile.NamedTemporaryFile().name
    make_fake_ssc(mset, sample_id, probs, confs, fn)
    probs_1, confs_1 = gio.read_ssc(fn, mset)
    self.assertAlmostEqual(np.sum(np.abs(probs - probs_1)), 0.0)
    self.assertAlmostEqual(np.sum(np.abs(confs - confs_1)), 0.0)

  def test_gdo(self):
    N = 32
    lvs = self.create_markers(N)
    mset = self.create_snp_markers_set(lvs)
    mset.load_markers()
    data_sample = self.create_data_sample(mset, 'foo-data')
    probs, confs = self.create_data_object(data_sample)
    probs1, confs1 = data_sample.resolve_to_data()
    self.assertTrue((probs == probs1).all())
    self.assertTrue((confs == confs1).all())
    s = self.kb.get_gdo_iterator(mset, data_samples=[data_sample])
    for i, x in enumerate(s):
      self.assertTrue((probs == x['probs']).all())
      self.assertTrue((confs == x['confidence']).all())
    self.assertEqual(i, 0)
    indices = slice(N/4, N/2)
    s = self.kb.get_gdo_iterator(mset, data_samples=[data_sample],
                                 indices=indices)
    for i, x in enumerate(s):
      self.assertTrue((probs[:,indices] == x['probs']).all())
      self.assertTrue((confs[indices] == x['confidence']).all())
    self.assertEqual(i, 0)

  def test_define_range_selector(self):
    N, N_dups = 16, 0
    ref_genome = 'g' + ('%f' % time.time())[-14:]
    mset, pos = self.create_aligned_mset(N, N_dups, ref_genome)
    mset.load_alignments(ref_genome)
    pos.sort()
    if len(pos) > 2:
      low_pos, high_pos = pos[1], pos[-2]
    gc_range = (low_pos, high_pos)
    range_sel = self.kb.SNPMarkersSet.define_range_selector(mset, gc_range)
    i = 0
    for (i, m) in enumerate(mset.get_markers_iterator()):
      if i in range_sel:
        self.assertTrue(low_pos <= m.position <= high_pos)
      else:
        self.assertTrue(low_pos > m.position or high_pos < m.position)

  def test_intersect(self):
    ref_genome = 'g' + ('%f' % time.time())[-14:]
    N1 = 16
    M1 = 2
    N2 = N1/2
    M2 = 1
    lvs = self.create_markers(N1)
    mset1 = self.create_snp_markers_set(lvs)
    mset1.load_markers()
    aligns = [(m[0], random.randint(1,26), 1 + i*2000, True, 'A', 1)
              for i, m in enumerate(mset1.markers)]
    for i in range(M1):
      aligns[i] = (aligns[i][0], 0, 0, True, 'A', 0)
    self.kb.align_snp_markers_set(mset1, ref_genome, aligns, self.action)
    lvs = self.create_markers(N2)
    mset2 = self.create_snp_markers_set(lvs)
    mset2.load_markers()
    aligns = [(m[0], a[1], a[2], a[3], a[4], a[5])
              for m, a in it.izip(mset2.markers, aligns[:len(mset2)])]
    for i in range(M2):
      aligns[i] = (aligns[i][0], 0, 0, True, 'A', 0)
    self.kb.align_snp_markers_set(mset2, ref_genome, aligns, self.action)
    mset1.load_alignments(ref_genome)
    mset2.load_alignments(ref_genome)
    idx1, idx2 = self.kb.SNPMarkersSet.intersect(mset1, mset1)
    self.assertTrue(np.array_equal(idx1, idx2))
    self.assertEqual(len(idx1), len(mset1))
    self.assertEqual(len(idx1), N1)

    idx1, idx2 = self.kb.SNPMarkersSet.intersect(mset1, mset2)
    self.assertEqual(len(idx1), len(idx2))
    self.assertEqual(len(idx1), N2 - max(M1, M2))
    for i,j in it.izip(idx1, idx2):
      m1, m2 = mset1[i], mset2[j]
      self.assertEqual(m1.position, m2.position)
      self.assertTrue(m1.position > (0,0))

  def test_speed(self):
    ref_genome = 'g' + ('%f' % time.time())[-14:]
    N1 = 1024*1024
    N2 = N1/2
    beg = time.time()
    lvs = self.create_markers(N1)
    print ''
    print 'creating %d markers took %f' % (N1, time.time() - beg)
    beg = time.time()
    mset1 = self.create_snp_markers_set(lvs)
    print 'creating a markers set with %d markers took %f' % (N1,
                                                              time.time() - beg)
    beg = time.time()
    mset1.load_markers()
    print 'loading markers took %f' % (time.time() - beg)
    beg = time.time()
    aligns = [(m[0], random.randint(1,26), 1 + i*2000, True, 'A', 1)
              for i, m in enumerate(mset1.markers)]
    print 'creating %d aligns took %f' % (N1, time.time() - beg)
    beg = time.time()
    self.kb.align_snp_markers_set(mset1, ref_genome, aligns, self.action)
    print 'saving  %d aligns took %f' % (N1, time.time() - beg)
    beg = time.time()
    mset1.load_alignments(ref_genome)
    print 'loading  %d aligns took %f' % (N1, time.time() - beg)
    beg = time.time()
    idx1, idx2 = self.kb.SNPMarkersSet.intersect(mset1, mset1)
    print 'intersecting  %d aligns took %f' % (N1, time.time() - beg)

  def test_speed_gdo(self):
    N = 934968
    beg = time.time()
    lvs = self.create_markers(N)
    print ''
    print 'creating %d markers took %f' % (N, time.time() - beg)
    mset = self.create_snp_markers_set(lvs)
    beg = time.time()
    mset.load_markers()
    print 'loading %d markers took %f' % (N, time.time() - beg)
    beg = time.time()
    data_sample = self.create_data_sample(mset, 'foo-data')
    print 'creating a data sample took %f' % (time.time() - beg)
    beg = time.time()
    probs, confs = self.create_data_object(data_sample)
    print 'creating a data object took %f' % (time.time() - beg)
    beg = time.time()
    probs1, confs1 = data_sample.resolve_to_data()
    print 'resolving to data  took %f' % (time.time() - beg)
    self.assertTrue((probs == probs1).all())
    self.assertTrue((confs == confs1).all())
    beg = time.time()
    s = self.kb.get_gdo_iterator(mset, data_samples=[data_sample])
    for i, x in enumerate(s):
      self.assertTrue((probs == x['probs']).all())
      self.assertTrue((confs == x['confidence']).all())
    self.assertEqual(i, 0)
    print 'iterating took %f' % (time.time() - beg)
    indices = slice(N/4, N/2)
    beg = time.time()
    s = self.kb.get_gdo_iterator(mset, data_samples=[data_sample],
                                 indices=indices)
    for i, x in enumerate(s):
      self.assertTrue((probs[:,indices] == x['probs']).all())
      self.assertTrue((confs[indices] == x['confidence']).all())
    self.assertEqual(i, 0)
    print 'iterating with indices took %f' % (time.time() - beg)
Пример #5
0
mset.load_markers()
kb.update_snp_positions(mset.markers, ref_genome)
data_sample_by_id = {}
family = []
for i, ind in enumerate(kb.get_individuals(study)):
    family.append(ind)
    action = kb.create_an_action(study, target=ind, doc='fake dataset')
    conf = {
        'label': 'taq-%03d' % i,
        'status': kb.DataSampleStatus.USABLE,
        'action': action,
        'snpMarkersSet': mset
    }
    data_sample = kb.factory.create(kb.GenotypeDataSample, conf).save()
    probs, conf = make_fake_data(mset)
    do = kb.add_gdo_data_object(action, data_sample, probs, conf)
    data_sample_by_id[ind.id] = data_sample
""" ..

Note how we first create a DataSample object (GenotypeDataSample)
which basically keeps track of the fact that there exists a genotyping
data set defined on a given snp markers set, and then we provide an
actual data object that describes the physical object that contains
the real data. The idea is that there could be many instances, data
equivalent, that link to the same DataSample, e.g., on different file
systems, in different formats and so on.
"""
""" ..

As an example, we will now write out the information we have just
saved as a plink pedfile.