示例#1
0
    def load_datasets(self,
                      path_pos,
                      path_neg,
                      background_alg,
                      SHOULD_SHORT=0,
                      HEADERS=False,
                      HARDMASK=False
                      ):
        """Load the positive set and load (or build) the background set.

        Args:
            path_pos: Passed to ``self.readfasta`` together with ``HARDMASK``;
                the lines it yields form the positive set.
            path_neg: When not None, passed to ``self.readfasta`` (without
                ``HARDMASK``) and the lines it yields form the background.
            background_alg: Only consulted when ``path_neg`` is None.
                ``"dinuclShuffle"`` builds one background entry per positive
                entry by shuffling its upper-cased text; any other value
                yields a single placeholder entry made of ``N`` characters.
            SHOULD_SHORT: Maximum number of non-header entries to keep from
                each file that is read. Converted with ``int()``; zero,
                negative or falsy values mean "no limit".
            HEADERS: When true, lines starting with ``'>'`` are collected
                separately and returned as well.
            HARDMASK: Forwarded to ``self.readfasta`` for the positive file.

        Returns:
            ``(positive, negative)``, or
            ``(positive, negative, header_pos, header_neg)`` when ``HEADERS``
            is true.
        """
        # Normalise the limit once so that string values such as "5" behave
        # the same on Python 2 and Python 3.
        limit = int(SHOULD_SHORT) if SHOULD_SHORT else 0

        def collect(lines):
            # Split the yielded lines into entries and (optionally) headers,
            # stopping as soon as `limit` entries have been gathered.
            entries = []
            headers = []
            for line in lines:
                # startswith() is safe on empty lines, unlike line[0].
                if HEADERS and line.startswith('>'):
                    headers.append(line)
                else:
                    entries.append(line)
                if limit > 0 and len(entries) == limit:
                    break
            return entries, headers

        # Single pre-formatted argument: prints identically whether `print`
        # is the Python 2 statement or the Python 3 function.
        print("HARDMASK: %s" % HARDMASK)

        # Load positive set from the path_pos FASTA
        positive, header_pos = collect(self.readfasta(path_pos, HARDMASK))

        header_neg = []
        if path_neg is not None:
            # Load background from the path_neg FASTA.
            # NOTE(review): HARDMASK is not forwarded here, as in the
            # original code -- confirm whether the background should be
            # hard-masked with the same setting as the positive set.
            negative, header_neg = collect(self.readfasta(path_neg))
        elif background_alg == "dinuclShuffle":
            # Build background as a shuffled copy of the positive set
            negative = [
                altschulEriksonDinuclShuffle.dinuclShuffle(line.upper())
                for line in positive
            ]
        else:
            # No background
            negative = ["NNNNNNNNNNNNNNNNNNNNNNNNNNN"]

        print("Lengths: %d %d" % (len(positive), len(negative)))
        if HEADERS:
            return positive, negative, header_pos, header_neg
        return positive, negative
def generate_sequences(seqs, nfold):
    """Print `nfold` shuffled background sequences for every input record.

    For each record, the text of ``record.seq`` is cut into pieces by
    ``split_seq``. Pieces that match ``N`` at their start are copied
    unchanged; every other non-empty piece is replaced by
    ``dinuclShuffle(piece)``. The rebuilt sequence is written to standard
    output in FASTA format under the id ``background_seq_for_<record.name>``.

    Args:
        seqs: Iterable of records exposing ``.seq`` and ``.name``.
        nfold: Number of background sequences to produce per record.

    Returns:
        A ``(gc_values, lengths)`` pair of lists holding, for every generated
        sequence in output order, the value of ``GC(sequence)`` and its
        length.
    """
    gc_values = []
    lengths = []
    for record in seqs:
        source_text = str(record.seq)
        for _ in range(nfold):
            pieces = []
            for chunk in split_seq(source_text):
                if re.match("N", chunk):
                    # Keep N stretches where they are instead of shuffling.
                    pieces.append(chunk)
                elif chunk:
                    pieces.append(dinuclShuffle(chunk))
            shuffled = "".join(pieces)
            background = SeqRecord(
                Seq(shuffled, generic_dna),
                id="background_seq_for_{0:s}".format(record.name),
                description="",
            )
            print(background.format("fasta"), end="")
            gc_values.append(GC(shuffled))
            lengths.append(len(shuffled))
    return gc_values, lengths
def generate_sequences(seqs, nfold):
    """Print `nfold` shuffled background sequences for every input record.

    For each record, the text of ``record.seq`` is cut into pieces by
    ``split_seq``. Pieces that match ``N`` at their start are copied
    unchanged; every other non-empty piece is replaced by
    ``dinuclShuffle(piece)``. Each rebuilt sequence is written to standard
    output in FASTA format with a running id ``background_seq_<counter>``
    (counter starts at 1 and spans all records) and a description naming the
    source record.

    Args:
        seqs: Iterable of records exposing ``.seq`` and ``.name``.
        nfold: Number of background sequences to produce per record.

    Returns:
        A ``(bg_gc_list, bg_lengths)`` pair of lists holding, for every
        generated sequence in output order, the value of ``GC(sequence)`` and
        its length.
    """
    # Local import keeps this block self-contained; sys.stdout.write works on
    # both Python 2 and Python 3, unlike the trailing-comma print statement.
    import sys

    cpt = 1
    bg_gc_list = []
    bg_lengths = []
    for record in seqs:
        seq = str(record.seq)
        descr = "Background sequence for {0:s}".format(record.name)
        for _ in range(nfold):
            # Collect pieces and join once instead of repeated string +=.
            parts = []
            for sequence in split_seq(seq):
                if re.match('N', sequence):
                    # Keep N stretches where they are instead of shuffling.
                    parts.append(sequence)
                elif sequence:
                    parts.append(dinuclShuffle(sequence))
            new_sequence = "".join(parts)
            new_seq = SeqRecord(Seq(new_sequence, generic_dna),
                                id="background_seq_{0:d}".format(cpt),
                                description=descr)
            # Written without an extra terminator, as the original
            # `print ...,` did; presumably the FASTA text already ends with
            # a newline -- verify against the SeqRecord implementation in use.
            sys.stdout.write(new_seq.format("fasta"))
            bg_gc_list.append(GC(new_sequence))
            bg_lengths.append(len(new_sequence))
            cpt += 1
    return bg_gc_list, bg_lengths
示例#4
0
def main(fileName, NUM):
    """Write NUM shuffled versions of the sequence in `fileName` to stdout.

    The sequence is obtained once through ``file2string(fileName)``. Each
    shuffle produced by ``dinuclShuffle`` is emitted as two lines: a header
    ``>k`` (k counting from 1 to NUM) followed by the shuffled sequence.
    """
    sequence = file2string(fileName)
    for record_number in range(1, NUM + 1):
        shuffled = dinuclShuffle(sequence)
        header_line = ">%d\n" % record_number
        body_line = "%s\n" % shuffled
        sys.stdout.write(header_line)
        sys.stdout.write(body_line)
def main(fileName, NUM):
    """Emit NUM dinucleotide shuffles of the sequence stored in `fileName`.

    ``file2string(fileName)`` supplies the input sequence; every call to
    ``dinuclShuffle`` yields one output record, written to stdout as a
    ``>k`` header line (k = 1..NUM) and a line with the shuffled sequence.
    """
    source = file2string(fileName)
    emit = sys.stdout.write
    count = 0
    for _ in range(NUM):
        count += 1
        shuffled_text = dinuclShuffle(source)
        emit(">%d\n" % count)
        emit("%s\n" % shuffled_text)
示例#6
0
def shuffle_window(ss, wl, step):
    """Shuffle `ss` window by window and return the result.

    Window start positions run from 0 up to (but not including)
    ``len(ss) - 1`` in increments of `step`. At each position the slice of
    width `wl` is replaced by ``dinuclShuffle`` of its current contents, so
    overlapping windows reshuffle material that was already shuffled.

    Args:
        ss: Sequence to shuffle; it is copied, not modified.
        wl: Window length.
        step: Distance between consecutive window starts.

    Returns:
        The shuffled copy of `ss`.
    """
    shuffled = ss[:]
    # The set of start positions is fixed up front from the initial length.
    for start in range(0, len(shuffled) - 1, step):
        stop = start + wl
        window = dinuclShuffle(shuffled[start:stop])
        shuffled = shuffled[:start] + window + shuffled[stop:]
    return shuffled
def shuffle_window(ss, wl, step):
    """Return a copy of `ss` shuffled through a sliding window.

    Starting at offset 0 and advancing by `step` while the offset is below
    ``len(ss) - 1``, the `wl`-wide slice at the current offset is passed to
    ``dinuclShuffle`` and the returned value is spliced back in its place.
    Each window operates on the output of the previous ones.
    """
    working = ss[:]
    offsets = range(0, len(working) - 1, step)
    for offset in offsets:
        window_end = offset + wl
        prefix = working[0:offset]
        reshuffled = dinuclShuffle(working[offset:window_end])
        suffix = working[window_end:]
        working = prefix + reshuffled + suffix
    return working