Exemplo n.º 1
0
 def vert_winner(self):
     """Look for a winning column on the 3x3 board in ``self.spaces``.

     All three columns are collected first; they are then tested in order
     with ``utils.all_same``. For the first column that passes,
     ``self.set_winner`` is called with that column's first cell.

     Returns:
         True if a column passed the check, otherwise False.
     """
     board = self.spaces
     columns = [[board[r][c] for r in range(3)] for c in range(3)]
     # Columns are freshly built lists, so None is a safe "not found" marker.
     winning = next((col for col in columns if utils.all_same(col)), None)
     if winning is None:
         return False
     self.set_winner(winning[0])
     return True
Exemplo n.º 2
0
 def vert_winner(self):
     """Report whether any column of ``self.spaces`` is a winning line.

     The board is read as 3x3. Every column is materialised before any of
     them is checked with ``utils.all_same``; on the first match the
     column's first cell is passed to ``self.set_winner``.

     Returns:
         True when a matching column exists, False otherwise.
     """
     columns = []
     for col_idx in range(3):
         columns.append([self.spaces[row_idx][col_idx]
                         for row_idx in range(3)])

     for column in columns:
         if not utils.all_same(column):
             continue
         self.set_winner(column[0])
         return True
     return False
Exemplo n.º 3
0
 def diag_winner(self):
     """Look for a winning diagonal on the 3x3 board in ``self.spaces``.

     Both diagonals are tested in turn with ``utils.all_same``. Because
     each diagonal passes through the centre cell, the winner is taken
     from ``self.spaces[1][1]``.

     Returns:
         True if a diagonal passed the check, otherwise False.
     """
     diagonals = (
         ((0, 0), (1, 1), (2, 2)),  # main diagonal
         ((0, 2), (1, 1), (2, 0)),  # anti-diagonal
     )
     for coords in diagonals:
         cells = [self.spaces[r][c] for r, c in coords]
         if not utils.all_same(cells):
             continue
         self.set_winner(self.spaces[1][1])
         return True
     return False
Exemplo n.º 4
0
 def diag_winner(self):
     """Report whether either diagonal of ``self.spaces`` is a winning line.

     The board is read as 3x3. Each diagonal is described by its column in
     row 0 and the column step per row; its cells are checked with
     ``utils.all_same``. The centre cell lies on both diagonals, so it is
     the value handed to ``self.set_winner``.

     Returns:
         True when a matching diagonal exists, False otherwise.
     """
     # (start column, column step): (0, +1) is the main diagonal,
     # (2, -1) is the anti-diagonal.
     for start_col, step in ((0, 1), (2, -1)):
         cells = [self.spaces[row][start_col + step * row]
                  for row in range(3)]
         if utils.all_same(cells):
             self.set_winner(self.spaces[1][1])
             return True
     return False
Exemplo n.º 5
0
def conservation_score(f_chromsizes, d_phastcons, in_gff, out_avcons):
    """Compute average conservation for the entries of a GFF file.

    The input is split per chromosome into a scratch directory,
    ``_conservation`` is run on the pieces, the per-chromosome results are
    concatenated, ``_average_conservation`` writes ``out_avcons`` and the
    result is finally sorted with the external ``sort`` command.

    Args:
        f_chromsizes: Path to a tab-separated file whose first column holds
            chromosome names. Names containing 'random', 'chrM' or 'chrUn'
            are skipped; the first three characters of every remaining name
            are dropped (presumably the 'chr' prefix -- confirm with the
            input format).
        d_phastcons: Passed through unchanged to ``_conservation``.
        in_gff: Path to the tab-separated input file; its first column is
            compared against the shortened chromosome names.
        out_avcons: Output path given to ``_average_conservation``; it is
            also the prefix of every intermediate path created here.

    Returns:
        The path of the sorted result, ``out_avcons + '.sorted.tmp'``.

    Raises:
        SystemExit: If, for at least one chromosome, the split GFF file and
            the conservation file differ in line count. The scratch
            directories are left in place for inspection in that case.
    """
    tmp = random_string(12)

    d_split = out_avcons + 'gff_by_chromosome_' + tmp
    d_cons = out_avcons + 'conservation_' + tmp
    f_cons = out_avcons + 'conservation.txt'

    for d in (d_split, d_cons):
        if not os.path.exists(d):
            os.makedirs(d)

    ## get chromosomes
    chromosomes = []
    with open(f_chromsizes) as f:
        for entry in f:
            name = entry.split('\t')[0]
            if ('random' not in name) and ('chrM' not in name) and ('chrUn' not in name):
                chromosomes.append(name[3:])

    ## separate infile by chromosome
    # A file is created for every chromosome, even when no line matches,
    # so that the line-count check below finds a file to count.
    for c in chromosomes:
        f_out = os.path.join(d_split, 'tss_filtered_all_'+c+'.gff')
        with open(f_out, 'w') as out:
            with open(in_gff) as f:
                for line in f:
                    chrom = line.split('\t')[0]
                    if chrom == c:
                        out.write(line)

    ## calculate conservation per chromosome
    _conservation(chromosomes, d_split, d_cons, d_phastcons)

    ## merge chromosomes
    # NOTE(review): paths are interpolated into shell commands unquoted;
    # paths containing spaces or shell metacharacters will break these calls.
    os.system("cat "+d_cons+"/conservation_all_*txt > "+f_cons)

    ## get average conservation
    _average_conservation(f_cons, out_avcons)

    ## cleanup
    is_same = []
    for c in chromosomes:
        n_gff = line_count(os.path.join(d_split, 'tss_filtered_all_'+c+'.gff'))
        n_con = line_count(os.path.join(d_cons,  'conservation_all_'+c+'.txt'))
        is_same.append(n_gff == n_con)
    # Every chromosome has to match. Testing that the flags are merely equal
    # to each other would also accept the case where all of them are False.
    if all(is_same):
        os.system('rm -r %s %s %s' % (d_split, d_cons, f_cons))
    else:
        not_equal = [c for c, same in zip(chromosomes, is_same) if not same]
        sys.exit('Error: Total number of positions does not match for chr: ' + ' '.join(not_equal))

    ## sort average conservation
    sorted_avcons = out_avcons + '.sorted.tmp'
    cmd = "sort -k1,2 -n "+out_avcons+" > "+sorted_avcons
    os.system(cmd)
    return sorted_avcons
Exemplo n.º 6
0
def build_features_matrix(sorted_gff, sorted_cpg, sorted_avcons, sorted_tata, f_out):
    """Merge four line-aligned feature files into one tab-separated matrix.

    Line *i* of every input is taken to describe the same record. For each
    line of ``sorted_gff`` one output line is written with the columns:
    GFF columns 0-2, peak start, peak stop, count (GFF column 5), strand
    (GFF column 6), a ``cpg:..;cons:..;tata:..`` feature string and a
    ``region_start:..;region_stop:..`` string built from GFF columns 3-4.

    Args:
        sorted_gff: Path to the tab-separated GFF-like file; column 8 is a
            ';'-separated list read with
            ``get_value_from_keycolonvalue_list`` for 'start' and 'stop'.
        sorted_cpg: Path to a tab-separated file; column 3 is the CpG value.
        sorted_avcons: Path to a tab-separated file; column 2 is the
            conservation value. '0' is used when that column is missing.
        sorted_tata: Path to a tab-separated file; column 7 is the affinity.
        f_out: Path of the output file (overwritten).

    Raises:
        SystemExit: If the four inputs do not all have the same line count.
    """
    ## check that all in files contain same number of data lines
    n_g = line_count(sorted_gff)
    n_c = line_count(sorted_cpg)
    n_a = line_count(sorted_avcons)
    n_t = line_count(sorted_tata)
    if not all_same([n_g, n_c, n_a, n_t]):
        # The arguments must be one tuple: without the parentheses '%' would
        # receive only n_g and raise TypeError instead of reporting the counts.
        sys.exit('Error: line count of feature files are not all equal:%s,%s,%s,%s' %
                 (n_g, n_c, n_a, n_t))

    ## create matrix
    with open(f_out, 'w') as out:
        with open(sorted_gff) as f:
            # linecache line numbers are 1-based, hence start=1.
            for lcount, raw in enumerate(f, 1):
                l = raw.strip().split('\t')
                c      = l[0]
                region_up   = l[3] #500bp   upstream of start
                region_down = l[4] #500bp downstream of start
                count  = l[5]
                strand = l[6]

                info = l[8].split(';')

                peak_start = get_value_from_keycolonvalue_list('start', info)
                peak_stop  = get_value_from_keycolonvalue_list('stop', info)

                CpG_value = linecache.getline(sorted_cpg, lcount).strip().split('\t')[3]
                try:
                    conservation = linecache.getline(sorted_avcons, lcount).strip().split('\t')[2]
                except IndexError:
                    # Line absent or too short: fall back to zero conservation.
                    conservation = '0'

                affinity = linecache.getline(sorted_tata, lcount).strip().split('\t')[7]

                features = ';'.join(['cpg:'+CpG_value, 'cons:'+conservation, 'tata:'+affinity])
                new_info = ';'.join(['region_start:'+region_up, 'region_stop:'+region_down])
                line = '\t'.join([c, l[1], l[2],
                                  peak_start, peak_stop, count, strand,
                                  features, new_info])
                out.write(line + '\n')
Exemplo n.º 7
0
 def elemset(self):
     """Return the cached element set, computing it on first use.

     On the first call, asserts (via ``all_same``) that every entry of
     ``self.forms`` has the same set of ``elemnames``, then stores a
     frozenset of the first form's ``elems`` in ``self._elemset``.
     """
     if self._elemset is not None:
         return self._elemset

     names_per_form = [frozenset(form.elemnames) for form in self.forms]
     assert all_same(names_per_form)
     self._elemset = frozenset(self.forms[0].elems)
     return self._elemset
Exemplo n.º 8
0
 def row_winner(self):
     """Look for a winning row in ``self.spaces``.

     Rows are tested in order with ``utils.all_same``; for the first row
     that passes, ``self.set_winner`` is called with its first cell.

     Returns:
         True if a row passed the check, otherwise False.
     """
     for cells in self.spaces:
         if not utils.all_same(cells):
             continue
         self.set_winner(cells[0])
         return True
     return False
Exemplo n.º 9
0
 def elemset(self):
     """Lazily build and return the frozenset stored in ``self._elemset``.

     When nothing is cached yet, the ``elemnames`` of every form in
     ``self.forms`` are asserted (via ``all_same``) to form the same set;
     the cache is then filled from the ``elems`` of the first form.
     """
     if self._elemset is None:
         name_sets = []
         for form in self.forms:
             name_sets.append(frozenset(form.elemnames))
         assert all_same(name_sets)
         first_form = self.forms[0]
         self._elemset = frozenset(first_form.elems)
     return self._elemset
Exemplo n.º 10
0
def flatten(dm):
    """
    Takes a DataMat whose elements are arrays and returns a flattened copy
    in which the DataMat element is the lowest atom of data: so no DataMat
    element contains time-indexed fields: all the time points are directly,
    flatly, accessible.
    Makes DataMat potentially extremely long, but eases merging, aligning,
    and maybe also analysis.

    Fields with object dtype whose first entry is iterable are treated as
    sequence fields and concatenated; every other field has its value
    repeated once per sequence entry of the element it came from.

    NOTE: masks of the sequence fields are collected but not yet applied to
    the result (a warning is emitted for each sequence field).

    Parameters:
        dm : DataMat
            The DataMat to flatten. Within one element, all sequence
            fields must have the same length (asserted).
    Returns:
        A new DataMat created with ``dm.copy_empty()`` and filled with the
        flattened fields.
    """
    seqfields = []
    dbg(2, 'will flatten DataMat with %d elements.' % (len(dm)))
    #Step 1. Determine which fields need flattening.
    # TODO: a better test for the sequence fields is needed here.
    for f in dm.fieldnames():
        # Builtin ``object`` is what the removed ``np.object`` alias
        # pointed to, so this comparison works on old and new NumPy alike.
        if (dm.__dict__[f].dtype == object) and isiterable(dm.__dict__[f][0]):
            seqfields += [f]
            dbg(3, "seqfield: %s, %s, %s" % (f,
                    type(dm.__dict__[f][0]),
                    dm.__dict__[f][0].dtype))

    #Step 2. Determine the amount of elements in the fields to be flattened.
    nelements = []
    for dmi in dm:
        elementn = [len(dmi.field(f)[0]) for f in seqfields]
        assert(all_same(elementn))
        nelements += [elementn[0]]
    total = sum(nelements)
    dbg(2, 'flattened DataMat will contain %d elements' % (total))

    newdm = dm.copy_empty()
    newdm._num_fix = total

    # seqfields is a subset of the field names, so the symmetric difference
    # is simply "every field that is not a sequence field".
    nonseqfields = set(seqfields).symmetric_difference(set(dm.fieldnames()))
    newdata = {}
    newmask = {}
    #Step 3. Create new, empty, arrays for each of the non-sequence fields.
    for f in nonseqfields:
        dbg(3, "creating empty non-seq field '%s'" % (f))
        #to avoid problems with uninitialised values, use ma_nans instead of
        # ma.empty(total, dtype=dm.field(f).dtype)
        if isiterable(dm.field(f)[0]):
            fdtype = object
        else:
            fdtype = dm.field(f).dtype

        newdata[f] = ma_nans(total).astype(fdtype)

    #Step 4. Expand all non-sequence fields into the new empty arrays.
    sidx = 0
    for idx, dmi in enumerate(dm):
        eidx = sidx + nelements[idx]
        dbg(4, '%d,%d' % (sidx, eidx))
        for f in nonseqfields:
            dbg(3, "element %d/%d: filling non-seq field '%s' [%d:%d] (%s)" % (idx,
                    len(dm),
                    f,
                    sidx, eidx,
                    str(dmi.field(f)[0])))
            if isiterable(dmi.field(f)[0]):
                # astype() is called per row on purpose: every row gets its
                # own copy instead of sharing a single array object.
                for ii in range(sidx, eidx):
                    newdata[f][ii] = \
                        dmi.field(f)[0].astype(object)
            else:
                newdata[f][sidx:eidx] = dmi.field(f)[0]
        sidx = eidx


    #Step 5. Stack all the sequence fields together.
    for f in seqfields:
        dbg(3, "stacking sequence field '%s'" % (f))
        newdata[f] = np.hstack(dm.field(f))
        newmask[f] = np.hstack(np.ma.getmaskarray(dm.field(f)))
        dbg(4, 'newmask[%s]: %s' % (f, newmask[f]))
        warn('todo: set mask correctly')

    #Step 6. Create the new DataMat
    for k, v in newdata.items():
        newdm.add_field(k, v)

    return newdm #newdata, newmask
Exemplo n.º 11
0
 def row_winner(self):
     """Report whether any row of ``self.spaces`` is a winning line.

     The first row accepted by ``utils.all_same`` determines the winner:
     its first cell is passed to ``self.set_winner``.

     Returns:
         True when a matching row exists, False otherwise.
     """
     not_found = object()  # private sentinel; cannot collide with a row
     winning_row = next(
         (row for row in self.spaces if utils.all_same(row)), not_found)
     if winning_row is not_found:
         return False
     self.set_winner(winning_row[0])
     return True