# Example 1
def calcLscoreFromHardSegmentation(origmatpath, doAppendTxtFile=True):
    ''' Compute variational objective from specific segmentation (as .mat file)

    Lscore : scalar

    Post Condition
    if doAppend, will append to evidence-saved-params.txt
    # Figure out dataName
    dataName = origmatpath.replace(os.environ['BNPYOUTDIR'], '')
    if dataName.startswith(os.path.sep):
        dataName = dataName[1:]
    dataName = dataName.split(os.path.sep)[0]

    # Load train data
    Data, TrueZ = loadDatasetByName(dataName)

    # Load the estimated zs and convert to flat 1D array
    SVars = scipy.io.loadmat(origmatpath)
    estZflat = convertStateSeq_list2flat(
    initLP = bnpy.init.FromTruth.convertLPFromHardToSoft(
        dict(Z=estZflat), Data)

    # Load a model with right hyperparameters
    # Requires similar run with memoized already started.
    memomatpath = origmatpath.replace('foxHDPHMMsampler', 'bnpyHDPHMMmemo')
    memomatpath = os.path.sep.join(memomatpath.split(os.path.sep)[:-1])
    print memomatpath
    model, lapQ = bnpy.ioutil.ModelReader.loadModelForLap(memomatpath, 0.0)

    # Run inference one step forward
    initLP = model.allocModel.initLPFromResp(Data, initLP)
    initSS = model.get_global_suff_stats(Data, initLP)
    LP = model.calc_local_params(Data)
    SS = model.get_global_suff_stats(Data, LP)

    # Compute evidence
    Lscore = model.calc_evidence(SS=SS)
    print Lscore, '<<< Lscore'

    # Write to text files!
    if doAppendTxtFile:
        outpathFields = origmatpath.split(os.path.sep)
        taskoutpath = os.path.sep.join(outpathFields[:-1])
        outtxtfile = os.path.join(taskoutpath, 'evidence-saved-params.txt')
        with open(outtxtfile, 'a') as f:
            f.write('%.6f\n' % (Lscore))
    return Lscore, model
# Example 2
def makeAlignedStateSeqsFromMATFile(origmatpath):
  # Figure out dataName
  dataName = origmatpath.replace(os.environ['BNPYOUTDIR'], '')
  if dataName.startswith(os.path.sep):
    dataName = dataName[1:]
  dataName = dataName.split(os.path.sep)[0]
  print 'Aligning state seqs for dataset: ', dataName

  # Load the estimated zs and convert to list format
  SVars = scipy.io.loadmat(origmatpath)
  estZlist = convertStateSeq_MAT2list(SVars['zHatBySeq'])

  # Find relabeling that aligns to true labels, and get back into MAT format
  alignedZlist, hdist = getAlignedStateSeqsAndHammingDist(dataName, estZlist)

  zHatBySeq = convertStateSeq_list2MAT(alignedZlist)

  # Save to MAT file
  outmatpath = origmatpath.replace('MAPStateSeqs', 'MAPStateSeqsAligned')
  scipy.io.savemat(outmatpath, dict(zHatBySeq=zHatBySeq), oned_as='rows')

  # Write to text files!
  outpathFields = origmatpath.split(os.path.sep)
  taskoutpath = os.path.sep.join(outpathFields[:-1])

  outtxtfile = os.path.join(taskoutpath, 'hamming-distance.txt')
  with open(outtxtfile, 'a') as f:
    f.write('%.6f\n' % (hdist))

  lap = float(outpathFields[-1][3:9])
  outtxtfile = os.path.join(taskoutpath, 'laps-saved-params.txt')
  with open(outtxtfile, 'a') as f:
    f.write('%d\n' % (lap))

  Keff = getKeff(alignedZlist)
  outtxtfile = os.path.join(taskoutpath, 'Keff-saved-params.txt')
  with open(outtxtfile, 'a') as f:
    f.write('%d\n' % (Keff))
  outtxtfile = os.path.join(taskoutpath, 'K-saved-params.txt')
  with open(outtxtfile, 'a') as f:
    f.write('%d\n' % (SVars['K']))
  print 'Done. Appended data from lap %.3f to \n  %s' % (
       lap, taskoutpath)
def makeBEDFileFromSegmentation(chrData, taskpath, lapFrac, ColorMap=[[255, 0, 0]]):
    """ Create .bed format file for display at genome.ucsc.edu

    Post Condition
    File exists at <taskpath>/Lap<Prefix>Segmentation.bed
    import bnpy
    import scipy.io
    from bnpy.util.StateSeqUtil import convertStateSeq_MAT2list

    ColorMap = np.asarray(ColorMap)
    header1 = "browser position chr7:116,260,000-116,361,000"
    header2 = 'track name="CD4T segmentation test" visibility=2 itemRgb="On" useScore=1'
    chrName = chrData.fileNames[0]
    slinePattern = "chr7 0 " + str(chrData.endLoc_bp) + " state??? 1000 . 0 0"
    segfpath = os.path.join(taskpath, "Lap%08.3fMAPStateSeqs.mat" % (lapFrac))
    SaveVars = scipy.io.loadmat(segfpath)
    zBySeqMAT = SaveVars["zHatBySeq"]
    zBySeq = convertStateSeq_MAT2list(zBySeqMAT)
    for zHat in zBySeq:
        uLabels = np.unique(zHat)
        for uLoc, uID in enumerate(uLabels):
            sline = slinePattern.replace("???", str(uID))
            blockSizes, blockStarts = extractBlocksFromMask(zHat == uID)
            blockSizes *= chrData.stepSize_bp
            blockStarts *= chrData.stepSize_bp
            blockStarts[1:] += chrData.startLoc_bp

            # Must add final singleton block to mark the endLoc
            blockSizes = np.hstack([blockSizes, 1])
            blockStarts = np.hstack([blockStarts, chrData.endLoc_bp - 1])

            blockSizeStr = ",".join(["%d" % (x) for x in blockSizes])
            blockStartStr = ",".join(["%d" % (x) for x in blockStarts])
            curColorRGB = ColorMap[uLoc % ColorMap.shape[0]]
            colorStr = ",".join(["%d" % (x) for x in curColorRGB])
            sline = sline + " %s %d %s %s" % (colorStr, len(blockSizes), blockSizeStr, blockStartStr)
            print sline
# Example 4
# Draw, for every requested sequence and task, an image of the saved state
# labels on its own axis, and return (axes, zHatBySeq).
#
# NOTE(review): this definition appears to have lost lines. The parameter
# list after `def plotSingleJob(` and the docstring quotes are missing, and
# several call continuations and `else:` / `try:` headers are absent.
# The code is reproduced unchanged; the comments below only mark the gaps.
# Names read before being assigned in the visible body (presumably
# parameters -- confirm): sequences, dataset, jobname, taskids, lap, maxT,
# aspectFactor, dispTrue, showELBOInTitle, specialStateIDs, colorManyToOne,
# cmap, seqNames.
def plotSingleJob(
    Returns the array of Data corresponding to a single sequence to display

    If dispTrue = True, the true labels will be shown underneath the
      estimated labels
    # Make sequences zero-indexed
    if isinstance(sequences, str):
        # NOTE(review): `args` is not assigned anywhere in the visible body;
        # possibly meant `sequences.split(',')` -- confirm. The continuation
        # of this call also appears to be missing.
        sequences = np.asarray([int(x) for x in args.sequences.split(',')],
    sequences = np.asarray(sequences, dtype=np.int32)
    if np.min(sequences) < 1:
        raise ValueError('Sequences need to be one-index.\n' +
                         'Valid values are 1,2,...N.')
    sequences -= 1

    # Determine the jobpath and taskids
    jobpath = os.path.join(os.path.expandvars('$BNPYOUTDIR'), dataset, jobname)
    if isinstance(taskids, str):
        if taskids.startswith('.'):
            taskids = [taskids]
            # NOTE(review): an `else:` probably preceded the next line;
            # as written it always runs after the assignment above.
            taskids = BNPYArgParser.parse_task_ids(jobpath, taskids)
    elif isinstance(taskids, int):
        taskids = [str(taskids)]

    # Optional per-job dataset preferences: a text file with one
    # "key value" pair per line, read into a dict of strings.
    # NOTE(review): the file-name argument of this call is missing.
    datasetPrefFile = os.path.join(jobpath, taskids[0],
    datasetPrefs = dict()
    if os.path.exists(datasetPrefFile):
        with open(datasetPrefFile, 'r') as f:
            for line in f.readlines():
                fields = line.strip().split(' ')
                if len(fields) != 2:
                # NOTE(review): the body of the `if` above is missing
                # (presumably it skipped malformed lines -- confirm).
                datasetPrefs[fields[0]] = fields[1]

    # Load Data from its python module
    Datamod = imp.load_source(
        dataset, os.path.expandvars('$BNPYDATADIR/' + dataset + '.py'))
    if dataset == 'SpeakerDiar':
        # This dataset is loaded one meeting at a time, so exactly one
        # sequence is allowed and it becomes index 0 of the loaded Data.
        if len(sequences) > 1:
            raise ValueError(
                'Joint modeling of several sequences makes no sense')
        Data = Datamod.get_data(meetingNum=sequences[0] + 1, **datasetPrefs)
        jobpath = jobpath.replace('SpeakerDiar',
                                  'SpeakerDiar' + str(sequences[0] + 1))
        sequences[0] = 0

        # NOTE(review): an `else:` probably preceded the next line; as
        # written it overwrites the Data loaded just above.
        Data = Datamod.get_data(**datasetPrefs)

    # Determine the maximum length among any of the sequences to be plotted
    if maxT is None:
        Ts = Data.doc_range[sequences + 1] - Data.doc_range[sequences]
        maxT = np.max(Ts)

    # Define the number of pixels used by vertical space of figure
    NUM_STACK = int(np.ceil(maxT / float(aspectFactor)))
    if dispTrue:
        # Under Python 2 this keeps NUM_STACK an int (floor division).
        NUM_STACK /= 2

    # NOTE(review): remaining arguments of this call are missing.
    f, axes = plt.subplots(len(sequences),

    # For singleton case, make sure that axes is index-able
    if len(sequences) == 1 and len(taskids) == 1:
        axes = [axes]

    for tt, taskidstr in enumerate(taskids):
        # NOTE(review): the body of this `if` (and likely an `else:`)
        # is missing.
        if tt == 0 and taskidstr.startswith('.'):

        path = os.path.join(jobpath, taskidstr) + os.path.sep

        # Figure out which lap to use
        if lap == 'final':
            # Use the last line of laps-saved-params.txt as the lap.
            lapsFile = open(path + 'laps-saved-params.txt')
            curLap = lapsFile.readlines()
            curLap = float(curLap[-1])
            # NOTE(review): an `else:` probably preceded the next line;
            # as written it overrides the value just read from file.
            curLap = int(lap)

        if showELBOInTitle:
            hdists = np.loadtxt(os.path.join(path, 'hamming-distance.txt'))
            hlaps = np.loadtxt(os.path.join(path, 'laps-saved-params.txt'))
            Keffvals = np.loadtxt(os.path.join(path, 'Keff-saved-params.txt'))
            # Determine scalar values to display
            # (pick the saved entry whose lap is closest to curLap)
            loc = np.argmin(np.abs(hlaps - curLap))
            hdist = hdists[loc]
            Kefffinal = Keffvals[loc]

                # NOTE(review): a `try:` header appears to be missing here.
                Kvals = np.loadtxt(os.path.join(path, 'K.txt'))
                ELBOscores = np.loadtxt(os.path.join(path, 'evidence.txt'))
                laps = np.loadtxt(os.path.join(path, 'laps.txt'))

                loc = np.argmin(np.abs(laps - curLap))
                ELBO = ELBOscores[loc]
                Kfinal = Kvals[loc]
            except IOError:
                # Fall back to defaults when the files cannot be read.
                ELBO = 0.0
                Kfinal = Kefffinal

        # Load in the saved Data from $BNPYOUTDIR
        # NOTE(review): a `try:` header appears to be missing here. The
        # aligned file is attempted first, then the unaligned one.
            filename = 'Lap%08.3fMAPStateSeqsAligned.mat' % curLap
            zHatBySeq = scipy.io.loadmat(path + filename)
            key1 = 'zHatBySeqAligned'
            key2 = 'zHatBySeq'
            if key1 in zHatBySeq:
                zHatBySeq = convertStateSeq_MAT2list(zHatBySeq[key1])
            elif key2 in zHatBySeq:
                zHatBySeq = convertStateSeq_MAT2list(zHatBySeq[key2])
                # NOTE(review): an `else:` probably preceded this raise.
                raise IOError
        except IOError:
            filename = 'Lap%08.3fMAPStateSeqs.mat' % curLap
            zHatBySeq = scipy.io.loadmat(path + filename)
            zHatBySeq = convertStateSeq_MAT2list(zHatBySeq['zHatBySeq'])

        if specialStateIDs is not None:
            zHatBySeq = relabelAllSequences(zHatBySeq, specialStateIDs)

        # Find maximum number of states we need to display
        nSeq = len(zHatBySeq)
        Kmax = np.max([zHatBySeq[i].max() for i in xrange(nSeq)])
        hasGroundTruth = False

        vmin = 0
        Kignore = 0
        if hasattr(Data, 'TrueParams') and 'Z' in Data.TrueParams:
            hasGroundTruth = True
            Kmax = np.maximum(Data.TrueParams['Z'].max(), Kmax)
            uLabels = np.unique(Data.TrueParams['Z'])
            # Count the distinct negative true labels and report how many
            # entries carry each of -1, -2, ..., -Kignore.
            Kignore = np.sum(uLabels < 0)
            if Kignore > 0:
                for k in range(1, Kignore + 1):
                    print 'ignoring state %d  Ttrue = %d' % (
                        -k, np.sum(Data.TrueParams['Z'] == -k))

            if colorManyToOne:
                # For each state in zHat, find best true sequence
                # (the true label with the largest overlap count) and
                # relabel the estimated state with that true label.
                Zflat = convertStateSeq_list2flat(zHatBySeq, Data)
                ZflatA = -1 * np.ones_like(Zflat)
                for uID in np.unique(Zflat):
                    overlap = np.zeros(uLabels.size)
                    for ii, trueID in enumerate(uLabels):
                        overlap[ii] = np.sum(
                            np.logical_and(Data.TrueParams['Z'] == trueID,
                                           Zflat == uID))
                    bestii = overlap.argmax()
                    ZflatA[Zflat == uID] = uLabels[bestii]
                zHatBySeq = convertStateSeq_flat2list(ZflatA, Data)

        # In case there's only one sequence, make sure it's index-able
        for ii, seqNum in enumerate(sequences):
            # Repeat the label row NUM_STACK times to give it height.
            image = np.tile(zHatBySeq[seqNum], (NUM_STACK, 1))

            # Add the true labels to the image (if they exist)
            if hasGroundTruth and dispTrue:
                start = Data.doc_range[seqNum]
                stop = Data.doc_range[seqNum + 1]
                img_trueZ = np.tile(Data.TrueParams['Z'][start:stop],
                                    (NUM_STACK, 1))
                if dispTrue == 2:
                    image = img_trueZ  # Show only true labels
                    # NOTE(review): an `else:` probably preceded the next
                    # line (stack estimated above true labels).
                    image = np.vstack((image, img_trueZ))

            image = image[:, :maxT]
            if len(sequences) == 1 or len(taskids) == 1:
                cur_ax = axes[ii + tt]
                # NOTE(review): an `else:` probably preceded the next line.
                cur_ax = axes[ii, tt]

            if hasattr(cmap, 'N'):
                vmax = cmap.N
                # NOTE(review): an `else:` probably preceded the next line.
                vmax = Kmax

            # NOTE(review): remaining arguments of this call are missing.
            cur_ax.imshow(Kignore + image + .0001,
            if tt == 0:
                # Label rows only in the first task column.
                if seqNames is not None:
                    h = cur_ax.set_ylabel('%s' % (seqNames[ii]), fontsize=13)

                elif len(sequences) > 4:
                    cur_ax.set_ylabel('%d' % (seqNum + 1), fontsize=13)
                    # NOTE(review): an `else:` probably preceded the next
                    # line.
                    cur_ax.set_ylabel('Seq. %d' % (seqNum + 1), fontsize=13)

            if ii == 0:
                if showELBOInTitle:
                    # More decimals are shown as the distance gets smaller.
                    fmtSpec = "ELBO: %.3f  K=%d Keff=%d  "
                    if hdist > 0.01:
                        fmtSpec += "dist=%.2f"
                    elif hdist > 0.001:
                        fmtSpec += "dist=%.3f"
                        # NOTE(review): an `else:` probably preceded the
                        # next line.
                        fmtSpec += "dist=%.4f"
                    # NOTE(review): `title` is built but not used in the
                    # visible code; a call applying it may be missing.
                    title = fmtSpec % (ELBO, Kfinal, Kefffinal, hdist)

            cur_ax.set_xlim([0, maxT])
            cur_ax.set_ylim([0, image.shape[0]])
            # ... end loop over sequences
    return axes, zHatBySeq