def vert_winner(self):
    spaces_by_col = [[self.spaces[i][j] for i in range(3)] for j in range(3)]
    for col in spaces_by_col:
        if utils.all_same(col):
            self.set_winner(col[0])
            return True
    return False

def diag_winner(self):
    diag1 = [(0, 0), (1, 1), (2, 2)]
    diag2 = [(0, 2), (1, 1), (2, 0)]
    diags = [diag1, diag2]
    for diag in diags:
        if utils.all_same([self.spaces[i][j] for i, j in diag]):
            self.set_winner(self.spaces[1][1])
            return True
    return False

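# The winner checks above, and several functions below, rely on an
# `all_same` helper that is not shown in these snippets. A minimal sketch
# of what it presumably does (hypothetical, not the project's actual code);
# the generator evaluates `items[0]` lazily, so an empty sequence is
# vacuously True rather than an IndexError:
def all_same(items):
    """Return True if every element of `items` equals the first."""
    items = list(items)
    return all(x == items[0] for x in items)
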
def conservation_score(f_chromsizes, d_phastcons, in_gff, out_avcons):
    tmp = random_string(12)
    d_split = out_avcons + 'gff_by_chromosome_' + tmp
    d_cons = out_avcons + 'conservation_' + tmp
    for d in [d_split, d_cons]:
        if not os.path.exists(d):
            os.makedirs(d)
    f_cons = out_avcons + 'conservation.txt'
    f_aver_cons = out_avcons

    ## get chromosomes
    chromosomes = []
    with open(f_chromsizes) as f:
        for l in f:
            c = l.split('\t')[0]
            if ('random' not in c) and ('chrM' not in c) and ('chrUn' not in c):
                chromosomes.append(c[3:])

    ## separate infile by chromosome
    for c in chromosomes:
        f_out = os.path.join(d_split, 'tss_filtered_all_' + c + '.gff')
        with open(f_out, 'w') as out:
            with open(in_gff) as f:
                for line in f:
                    chrom = line.split('\t')[0]
                    if chrom == c:
                        out.write(line)

    ## calculate conservation per chromosome
    _conservation(chromosomes, d_split, d_cons, d_phastcons)

    ## merge chromosomes
    os.system("cat " + d_cons + "/conservation_all_*txt > " + f_cons)

    ## get average conservation
    _average_conservation(f_cons, f_aver_cons)

    ## cleanup
    is_same = []
    for c in chromosomes:
        n_gff = line_count(os.path.join(d_split, 'tss_filtered_all_' + c + '.gff'))
        n_con = line_count(os.path.join(d_cons, 'conservation_all_' + c + '.txt'))
        is_same.append(n_gff == n_con)
    if all_same(is_same):
        os.system('rm -r %s %s %s' % (d_split, d_cons, f_cons))
    else:
        not_equal = [chromosomes[i] for i, v in enumerate(is_same) if not v]
        sys.exit('Error: Total number of positions does not match for chr: '
                 + ' '.join(not_equal))

    ## sort average conservation
    sorted_avcons = out_avcons + '.sorted.tmp'
    cmd = "sort -k1,2 -n " + out_avcons + " > " + sorted_avcons
    os.system(cmd)
    return sorted_avcons

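# conservation_score also assumes `random_string` and `line_count` helpers
# (plus the private _conservation/_average_conservation routines, which
# depend on the phastCons data layout and are not sketched here). Plausible
# sketches of the two small helpers, under those assumptions only:
import random
import string

def random_string(n):
    # n random alphanumeric characters, used above to build unique
    # temporary directory names.
    return ''.join(random.choice(string.ascii_letters + string.digits)
                   for _ in range(n))

def line_count(path):
    # Number of lines in a text file, used above to verify that the
    # per-chromosome GFF and conservation files stay in sync.
    with open(path) as f:
        return sum(1 for _ in f)
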
def build_features_matrix(sorted_gff, sorted_cpg, sorted_avcons, sorted_tata, f_out):
    ## check that all infiles contain the same number of data lines
    n_g = line_count(sorted_gff)
    n_c = line_count(sorted_cpg)
    n_a = line_count(sorted_avcons)
    n_t = line_count(sorted_tata)
    if not all_same([n_g, n_c, n_a, n_t]):
        sys.exit('Error: line counts of feature files are not all equal: %s,%s,%s,%s'
                 % (n_g, n_c, n_a, n_t))

    ## create matrix
    lcount = 0
    with open(f_out, 'w') as out:
        with open(sorted_gff) as f:
            for l in f:
                lcount += 1
                l = l.strip().split('\t')
                c = l[0]
                region_up = l[3]    # 500bp upstream of start; not used
                region_down = l[4]  # 500bp downstream of start; not used
                count = l[5]
                strand = l[6]
                info = l[8].split(';')
                #dist_score = '?'
                peak_start = get_value_from_keycolonvalue_list('start', info)
                peak_stop = get_value_from_keycolonvalue_list('stop', info)

                CpG_value = linecache.getline(sorted_cpg, lcount).strip().split('\t')[3]
                try:
                    conservation = linecache.getline(sorted_avcons, lcount).strip().split('\t')[2]
                except IndexError:
                    # missing or short line in the conservation file: default to 0
                    conservation = '0'
                affinity = linecache.getline(sorted_tata, lcount).strip().split('\t')[7]

                features = ';'.join(['cpg:' + CpG_value,
                                     'cons:' + conservation,
                                     'tata:' + affinity])
                new_info = ';'.join(['region_start:' + region_up,
                                     'region_stop:' + region_down])
                line = '\t'.join([c, l[1], l[2], peak_start, peak_stop,
                                  count, strand, features, new_info])
                out.write(line + '\n')

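# build_features_matrix assumes a `get_value_from_keycolonvalue_list` helper
# for the GFF attribute column, which is split on ';' into 'key:value'
# strings. A hypothetical sketch consistent with that usage:
def get_value_from_keycolonvalue_list(key, kv_list):
    # Return the value for `key` from a list like ['start:100', 'stop:600'],
    # or '' when the key is absent.
    for item in kv_list:
        k, _, v = item.partition(':')
        if k == key:
            return v
    return ''
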
def elemset(self):
    if self._elemset is None:
        elemnamesets = [frozenset(i.elemnames) for i in self.forms]
        assert all_same(elemnamesets)
        self._elemset = frozenset(self.forms[0].elems)
    return self._elemset

def row_winner(self):
    for row in self.spaces:
        if utils.all_same(row):
            self.set_winner(row[0])
            return True
    return False

def flatten(dm):
    """
    Takes a DataMat whose elements are arrays and returns a flattened
    copy in which the DataMat element is the lowest atom of data: no
    DataMat element contains time-indexed fields, and all the time
    points are directly, flatly, accessible.

    Makes the DataMat potentially extremely long, but eases merging,
    aligning, and maybe also analysis.
    """
    tmfields = dm.time_based_fields
    seqfields = []
    dbg(2, 'will flatten DataMat with %d elements.' % (len(dm)))
    #Step 1. Determine which fields need flattening.
    # TODO: a better test for the sequence fields is needed here.
    for f in dm.fieldnames():
        if (dm.__dict__[f].dtype == np.object) and isiterable(dm.__dict__[f][0]):
            seqfields += [f]
            dbg(3, "seqfield: %s, %s, %s" % (f,
                                             type(dm.__dict__[f][0]),
                                             dm.__dict__[f][0].dtype))
    #Step 2. Determine the number of elements in the fields to be flattened.
    nelements = []
    for dmi in dm:
        elementn = [len(dmi.field(f)[0]) for f in seqfields]
        assert all_same(elementn)
        nelements += [elementn[0]]
    dbg(2, 'flattened DataMat will contain %d elements' % (sum(nelements)))
    newdm = dm.copy_empty()
    newdm._num_fix = sum(nelements)
    nonseqfields = set(seqfields).symmetric_difference(set(dm.fieldnames()))
    newdata = {}
    newmask = {}
    #Step 3. Create new, empty arrays for each of the non-sequence fields.
    for f in nonseqfields:
        dbg(3, "creating empty non-seq field '%s'" % (f))
        #to avoid problems with uninitialised values, use ma_nans instead of
        # ma.empty(sum(nelements), dtype=dm.field(f).dtype)
        if isiterable(dm.field(f)[0]):
            fdtype = np.object
        else:
            fdtype = dm.field(f).dtype
        newdata[f] = ma_nans(sum(nelements)).astype(fdtype)
    #Step 4. Expand all non-sequence fields into the new empty arrays.
    sidx = 0
    for idx, dmi in enumerate(dm):
        eidx = sidx + nelements[idx]
        dbg(4, '%d,%d' % (sidx, eidx))
        for f in nonseqfields:
            dbg(3, "element %d/%d: filling non-seq field '%s' [%d:%d] (%s)" % (
                idx, len(dm), f, sidx, eidx, str(dmi.field(f)[0])))
            if isiterable(dmi.field(f)[0]):
                for ii in xrange(sidx, eidx):
                    newdata[f][ii] = dmi.field(f)[0].astype(np.object)
            else:
                newdata[f][sidx:eidx] = dmi.field(f)[0]
        sidx = eidx
    #Step 5. Stack all the sequence fields together.
    for f in seqfields:
        dbg(3, "stacking sequence field '%s'" % (f))
        newdata[f] = np.hstack(dm.field(f))
        newmask[f] = np.hstack(np.ma.getmaskarray(dm.field(f)))
        dbg(4, 'newmask[%s]: %s' % (f, newmask[f]))
        warn('todo: set mask correctly')
    #Step 6. Create the new DataMat.
    for k, v in newdata.iteritems():
        newdm.add_field(k, v)
    return newdm
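
# A minimal usage sketch (hypothetical; `dm` is assumed to be an existing
# DataMat whose object-dtype fields hold one array per element). Python 2
# print syntax, to match the xrange/iteritems idioms above:
flat = flatten(dm)
print 'elements: %d -> %d' % (len(dm), len(flat))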