예제 #1
0
 def __init__(self, fileName, pseudocountProb=0.0):
     """Read motifs from ``fileName`` and hand them to the parent class.

     The file is processed line by line (trailing newlines trimmed via
     ``util.trim_newline``) with the action returned by
     ``self.getReadPwmAction``; every value stored in
     ``self.loadedMotifs`` is then finalised with ``pseudocountProb``.

     Arguments:
         fileName: passed to ``util.get_file_handle`` to open the file

         pseudocountProb: forwarded to each loaded motif's
             ``finalise`` call (defaults to 0.0)
     """
     self.fileName = fileName
     self.pseudocountProb = pseudocountProb
     self.loadedMotifs = OrderedDict()
     # NOTE(review): the action receives self.loadedMotifs and presumably
     # fills it in as lines are consumed - confirm in getReadPwmAction.
     util.perform_action_on_each_line_of_file(
         file_handle=util.get_file_handle(fileName),
         action=self.getReadPwmAction(self.loadedMotifs),
         transformation=util.trim_newline)
     for loadedPwm in self.loadedMotifs.values():
         loadedPwm.finalise(pseudocountProb=self.pseudocountProb)
     super(AbstractLoadedMotifsFromFile, self).__init__(self.loadedMotifs)
예제 #2
0
파일: dnase.py 프로젝트: yynst2/simdna
 def generateSequences(self):
     """Yield one generated sequence per data row of the simulation file.

     Opens ``self.dnaseSimulationFile``, skips the first line (title
     row) and, for every other line, splits it with
     ``util.default_tab_seppd`` into:

         column 0: the sequence name
         column 1: the string handed to ``ShuffledBackgroundGenerator``
         column 2: comma-separated embedder strings (empty entries are
             ignored), each parsed by ``parseDnaseMotifEmbedderString``

     The file handle is closed once the generator is exhausted, is
     closed early by the caller, or raises.

     Yields:
         the result of ``SingleDnaseSequenceGenerator(...)
         .generateSequence()`` for each data row
     """
     fileHandle = util.get_file_handle(self.dnaseSimulationFile)
     try:
         for lineNumber, line in enumerate(fileHandle):
             if lineNumber == 0:
                 continue  # ignore title
             inp = util.default_tab_seppd(line)
             sequenceName = inp[0]
             backgroundGenerator = ShuffledBackgroundGenerator(
                 string=inp[1], shuffler=self.shuffler)
             embedders = [
                 parseDnaseMotifEmbedderString(embedderString,
                                               self.loadedMotifs)
                 for embedderString in inp[2].split(",")
                 if len(embedderString) > 0
             ]
             yield SingleDnaseSequenceGenerator(
                 backgroundGenerator=backgroundGenerator,
                 dnaseMotifEmbedders=embedders,
                 sequenceName=sequenceName).generateSequence()
     finally:
         # Previously the handle was never closed; the finally clause
         # also runs when the generator is closed before exhaustion.
         fileHandle.close()
예제 #3
0
def read_simdata_file(simdata_file, one_hot_encode=False, ids_to_load=None):
    """Parse a simdata file into parallel per-row collections.

    Each line is split with ``util.default_tab_seppd``. Lines whose line
    number is not greater than 1 are skipped (NOTE(review): presumably
    the header line - depends on how the util helper numbers lines).
    For every kept row, column 0 is the id, column 1 the sequence,
    column 2 is parsed with ``getEmbeddingsFromString`` and the remaining
    columns are converted to integer labels.

    Arguments:
        simdata_file: passed to ``util.get_file_handle``

        one_hot_encode: accepted but not used by this function

        ids_to_load: optional iterable of ids; when given, only rows
            whose id is in it are kept

    Returns:
        ``util.enum`` built with ``ids``, ``sequences``, ``embeddings``
        (lists) and ``labels`` (a numpy array)
    """
    wanted_ids = None if ids_to_load is None else set(ids_to_load)
    kept_ids, kept_sequences = [], []
    kept_embeddings, kept_labels = [], []

    def record_row(inp, line_number):
        if line_number <= 1:
            return
        row_id = inp[0]
        if wanted_ids is not None and row_id not in wanted_ids:
            return
        kept_ids.append(row_id)
        kept_sequences.append(inp[1])
        kept_embeddings.append(getEmbeddingsFromString(inp[2]))
        kept_labels.append(list(map(int, inp[3:])))

    util.perform_action_on_each_line_of_file(
        file_handle=util.get_file_handle(simdata_file),
        action=record_row,
        transformation=util.default_tab_seppd)
    return util.enum(
        ids=kept_ids,
        sequences=kept_sequences,
        embeddings=kept_embeddings,
        labels=np.array(kept_labels))
예제 #4
0
def printSequences(outputFileName, sequenceSetGenerator,
                   includeEmbeddings=False, labelGenerator=None,
                   includeFasta=False, prefix=None):
    """Print a series of synthetic sequences.

    Given an output filename, and an instance of
        :class:`.AbstractSequenceSetGenerator`, will call the
        sequenceSetGenerator and print the generated sequences
        to the output file as tab-separated rows under a header line.
        Will also create an info file whose path is derived from
        outputFileName by appending "_info" to the file's core name and
        using a ".txt" extension; it contains the JSON-formatted
        information about sequenceSetGenerator.

    All output files are closed even if generation or writing fails.

    Arguments:
        outputFileName: string

        sequenceSetGenerator: instance of
            :class:`.AbstractSequenceSetGenerator`

        includeEmbeddings: a boolean indicating whether to print a
            column that lists the embeddings

        labelGenerator: optional instance of :class:`.LabelGenerator`

        includeFasta: optional boolean indicating whether to also
            print out the generated sequences in fasta format
            (the file will be produced with a .fa extension)

        prefix: string - this will be prefixed in front of the generated
            sequence ids, followed by a hyphen
    """
    # Loop-invariant: the same id prefix is used for every sequence.
    idPrefix = prefix + "-" if prefix is not None else ""

    ofh = util.get_file_handle(outputFileName, 'w')
    fastaOfh = None
    try:
        if includeFasta:
            fastaOfh = util.get_file_handle(util.get_file_name_parts(
                outputFileName).get_transformed_file_path(
                lambda x: x, extension=".fa"), 'w')

        headerColumns = ["seqName", "sequence"]
        if includeEmbeddings:
            headerColumns.append("embeddings")
        if labelGenerator is not None:
            # Appended as one pre-joined column so that an empty list of
            # label names still yields the trailing tab it always did.
            headerColumns.append("\t".join(labelGenerator.labelNames))
        ofh.write("\t".join(headerColumns) + "\n")

        # generateSequences() returns a generator
        for generatedSequence in sequenceSetGenerator.generateSequences():
            seqId = idPrefix + generatedSequence.seqName
            rowColumns = [seqId, generatedSequence.seq]
            if includeEmbeddings:
                rowColumns.append(",".join(
                    str(x) for x in generatedSequence.embeddings))
            if labelGenerator is not None:
                rowColumns.append("\t".join(
                    str(x) for x in labelGenerator.generateLabels(
                        generatedSequence)))
            ofh.write("\t".join(rowColumns) + "\n")
            if fastaOfh is not None:
                fastaOfh.write(">" + seqId + "\n")
                fastaOfh.write(generatedSequence.seq + "\n")
    finally:
        # Release both handles even when an error interrupts the output.
        try:
            ofh.close()
        finally:
            if fastaOfh is not None:
                fastaOfh.close()

    infoFilePath = (util.get_file_name_parts(outputFileName)
                        .get_transformed_file_path(
                          lambda x: x + "_info", extension=".txt"))

    infoOfh = util.get_file_handle(infoFilePath, 'w')
    try:
        infoOfh.write(
            util.format_as_json(sequenceSetGenerator.getJsonableObject()))
    finally:
        infoOfh.close()