def __init__(self, fileName, pseudocountProb=0.0):
    """Load motifs from ``fileName`` and hand them to the parent class.

    Every line of the file is passed (after ``util.trim_newline``) to the
    callback returned by ``self.getReadPwmAction``, which receives the
    ``loadedMotifs`` ordered dict (presumably to populate it -- confirm
    against the concrete subclass). Once the file has been consumed,
    ``finalise`` is called on every loaded entry with the configured
    pseudocount probability.

    Arguments:
        fileName: path of the motif file; opened via
            ``util.get_file_handle``
        pseudocountProb: value forwarded to each loaded motif's
            ``finalise`` call (defaults to 0.0)
    """
    self.fileName = fileName
    motifFileHandle = util.get_file_handle(fileName)
    self.pseudocountProb = pseudocountProb
    self.loadedMotifs = OrderedDict()
    # The per-line callback is supplied by getReadPwmAction.
    util.perform_action_on_each_line_of_file(
        file_handle=motifFileHandle,
        action=self.getReadPwmAction(self.loadedMotifs),
        transformation=util.trim_newline)
    # Finalise only after the whole file has been read.
    for loadedPwm in self.loadedMotifs.values():
        loadedPwm.finalise(pseudocountProb=self.pseudocountProb)
    super(AbstractLoadedMotifsFromFile, self).__init__(self.loadedMotifs)
def generateSequences(self):
    """Lazily yield one generated sequence per data line of the file.

    Reads ``self.dnaseSimulationFile``, skipping the first (title) line.
    Each remaining line is split with ``util.default_tab_seppd`` and
    interpreted as:

    - column 0: the sequence name
    - column 1: the string handed to :class:`.ShuffledBackgroundGenerator`
      together with ``self.shuffler``
    - column 2: a comma-separated list of embedder strings, each parsed by
      ``parseDnaseMotifEmbedderString`` against ``self.loadedMotifs``
      (empty entries are ignored)

    Yields:
        the result of ``SingleDnaseSequenceGenerator.generateSequence()``
        for each data line.

    The input file handle is closed when the generator is exhausted,
    explicitly closed, garbage collected, or when an error propagates
    out of it.
    """
    fileHandle = util.get_file_handle(self.dnaseSimulationFile)
    try:
        for lineNumber, line in enumerate(fileHandle):
            if lineNumber == 0:
                continue  # ignore title
            inp = util.default_tab_seppd(line)
            sequenceName = inp[0]
            backgroundGenerator = ShuffledBackgroundGenerator(
                string=inp[1], shuffler=self.shuffler)
            embedders = [
                parseDnaseMotifEmbedderString(
                    embedderString, self.loadedMotifs)
                for embedderString in inp[2].split(",")
                if embedderString  # skip empty entries
            ]
            yield SingleDnaseSequenceGenerator(
                backgroundGenerator=backgroundGenerator,
                dnaseMotifEmbedders=embedders,
                sequenceName=sequenceName).generateSequence()
    finally:
        # Runs on normal exhaustion as well as on GeneratorExit, so the
        # handle is released even if the consumer stops iterating early.
        fileHandle.close()
def read_simdata_file(simdata_file, one_hot_encode=False, ids_to_load=None):
    """Read a simdata file into parallel lists of ids, sequences,
    embeddings and labels.

    Each line is split with ``util.default_tab_seppd`` and interpreted as:
    column 0 = id, column 1 = sequence, column 2 = embeddings string
    (parsed by ``getEmbeddingsFromString``), columns 3+ = integer labels.
    Lines whose line number is not greater than 1 are skipped (presumably
    the header row -- depends on the numbering used by
    ``util.perform_action_on_each_line_of_file``).

    Arguments:
        simdata_file: path of the file; opened via ``util.get_file_handle``
        one_hot_encode: accepted for interface compatibility; not
            referenced in this function's body
        ids_to_load: optional iterable of ids; when given, only rows whose
            id is in it are kept

    Returns:
        the object built by ``util.enum`` with fields ``ids``,
        ``sequences``, ``embeddings`` (lists) and ``labels`` (numpy array).
    """
    ids, sequences, embeddings, labels = [], [], [], []
    # A set keeps the per-row membership test cheap.
    wanted_ids = None if ids_to_load is None else set(ids_to_load)

    def collect_row(inp, line_number):
        if line_number <= 1:
            return
        if wanted_ids is not None and inp[0] not in wanted_ids:
            return
        ids.append(inp[0])
        sequences.append(inp[1])
        embeddings.append(getEmbeddingsFromString(inp[2]))
        labels.append([int(label) for label in inp[3:]])

    util.perform_action_on_each_line_of_file(
        file_handle=util.get_file_handle(simdata_file),
        action=collect_row,
        transformation=util.default_tab_seppd)
    return util.enum(
        ids=ids,
        sequences=sequences,
        embeddings=embeddings,
        labels=np.array(labels))
def printSequences(outputFileName, sequenceSetGenerator,
                   includeEmbeddings=False, labelGenerator=None,
                   includeFasta=False, prefix=None):
    """Print a series of synthetic sequences.

    Given an output filename, and an instance of
    :class:`.AbstractSequenceSetGenerator`, will call the
    sequenceSetGenerator and print the generated sequences
    to the output file. Will also create an info file next to
    outputFileName (its base name with "_info" appended and a .txt
    extension, as computed by ``get_transformed_file_path``) that
    contains all the information about sequenceSetGenerator.

    The output file (and the fasta file, if requested) are closed even
    if sequence or label generation raises; in that case the exception
    propagates and the info file is not written.

    Arguments:
        outputFileName: string

        sequenceSetGenerator: instance of
            :class:`.AbstractSequenceSetGenerator`

        includeEmbeddings: a boolean indicating whether to print a
            column that lists the embeddings

        labelGenerator: optional instance of :class:`.LabelGenerator`

        includeFasta: optional boolean indicating whether to also
            print out the generated sequences in fasta format
            (the file will be produced with a .fa extension)

        prefix: string - this will be prefixed in front of the generated
            sequence ids, followed by a hyphen
    """
    # The id prefix is loop-invariant, so build it once.
    idPrefix = prefix + "-" if prefix is not None else ""

    ofh = util.get_file_handle(outputFileName, 'w')
    fastaOfh = None
    try:
        if includeFasta:
            fastaOfh = util.get_file_handle(
                util.get_file_name_parts(outputFileName)
                .get_transformed_file_path(lambda x: x, extension=".fa"),
                'w')

        # Header row: optional columns appear in the same order as the
        # corresponding fields of every data row below.
        headerFields = ["seqName", "sequence"]
        if includeEmbeddings:
            headerFields.append("embeddings")
        if labelGenerator is not None:
            headerFields.append("\t".join(labelGenerator.labelNames))
        ofh.write("\t".join(headerFields) + "\n")

        # generateSequences returns a generator
        for generatedSequence in sequenceSetGenerator.generateSequences():
            seqId = idPrefix + generatedSequence.seqName
            rowFields = [seqId, generatedSequence.seq]
            if includeEmbeddings:
                rowFields.append(
                    ",".join(str(x) for x in generatedSequence.embeddings))
            if labelGenerator is not None:
                rowFields.append("\t".join(
                    str(x) for x in
                    labelGenerator.generateLabels(generatedSequence)))
            ofh.write("\t".join(rowFields) + "\n")
            if fastaOfh is not None:
                fastaOfh.write(">" + seqId + "\n")
                fastaOfh.write(generatedSequence.seq + "\n")
    finally:
        # Close both handles even if generation failed part-way; the
        # nested finally ensures the fasta handle is closed even if
        # closing the main handle raises.
        try:
            ofh.close()
        finally:
            if fastaOfh is not None:
                fastaOfh.close()

    infoFilePath = (util.get_file_name_parts(outputFileName)
                    .get_transformed_file_path(
                        lambda x: x + "_info", extension=".txt"))
    infoOfh = util.get_file_handle(infoFilePath, 'w')
    try:
        infoOfh.write(
            util.format_as_json(sequenceSetGenerator.getJsonableObject()))
    finally:
        infoOfh.close()