def getDomains(self, sparkContext):
    """Run the domain prediction over the source sequences using Spark.

    The sequences at ``self.source_path`` are loaded, distributed as an RDD,
    optionally six-frame translated (when ``self.source_type`` contains
    "nucleotide"), passed to ``DomainFinder.main`` and finally post-processed
    with ``self.processDomainOutput``.

    Args:
        sparkContext: object exposing ``parallelize`` (presumably a
            ``pyspark.SparkContext`` -- confirm against the caller).

    Returns:
        Whatever ``collectAsMap()`` returns for the processed RDD.

    Raises:
        ValueError: if ``self.source_type`` does not contain "fasta".
    """
    # recover the species name for using in temp files
    self.species = Utils.getSpecies(self.source_path)
    domainFinder = DomainFinder.DomainFinder()

    # Only the fasta parser yields the per-sequence content that is
    # distributed below. Previously every other source type (including
    # "genbank") left that content unbound and crashed later with an
    # UnboundLocalError; fail early with an explicit message instead.
    # NOTE(review): wiring up genbank input needs a decision on the item
    # shape the later stages expect (they index each item with [0] and [1]).
    if "fasta" not in self.source_type:
        raise ValueError(
            "getDomains only supports fasta sources, got source type: "
            + repr(self.source_type))

    # the first returned value (list of entries) is not needed here
    _, file_content = Parsers.parseFastaToList(self.source_path, "")

    print('Processing domains...')

    # create RDD with source sequences
    sourceRDD = sparkContext.parallelize(file_content, numSlices=2000)

    if "nucleotide" in self.source_type:
        # execute sixFrame translation for each sequence in RDD
        sourceRDD = sourceRDD.map(lambda x: SixFrameTranslator.main(x))

    # execute Pfam domain prediction for each item in RDD
    domainsRDD = sourceRDD.map(lambda x: domainFinder.main(x[0], x[1]))
    processedRDD = domainsRDD.map(
        lambda x: self.processDomainOutput(x[0], x[1]))

    # recover Pfam domain prediction results from RDD
    result = processedRDD.collectAsMap()

    print('Done!')

    return result
def _firstQualifier(qualifiers, key):
    """Return the first value stored for qualifier *key*, or '' if absent/empty."""
    values = qualifiers.get(key)
    return values[0] if values else ''


def genBankToAminoacid(path):
    """Collect the CDS translations of GenBank file(s) as FASTA-style entries.

    Args:
        path: a single file, or otherwise a location that is handed to
            ``Utils.listFilesExt(path, 'gbk')`` to list the files to read.

    Returns:
        A tuple ``(entries, translations)``:

        * ``entries`` -- unique strings of the form
          ``'>locus|species|id|description\\ntranslation'`` in first-seen
          order, one per CDS feature;
        * ``translations`` -- the amino-acid sequences of those entries
          concatenated into a single string.
    """
    if os.path.isfile(path):
        files = [path]
    else:
        files = Utils.listFilesExt(path, 'gbk')

    entries = []
    seen = set()  # O(1) duplicate check instead of scanning ``entries``
    translationParts = []  # joined once at the end (only aminoacid sequence)

    for gbFile in files:
        species = Utils.getSpecies(gbFile)
        for record in parseGenBank(gbFile):
            locus = record.id
            for feature in record.features:
                if feature.type != "CDS":
                    continue

                qualifiers = feature.qualifiers
                translation = _firstQualifier(qualifiers, "translation")
                gene = _firstQualifier(qualifiers, "gene")
                locusTag = _firstQualifier(qualifiers, "locus_tag")
                proteinId = _firstQualifier(
                    qualifiers, "protein_id").replace('/', '')
                product = _firstQualifier(qualifiers, "product")
                function = _firstQualifier(qualifiers, "function")

                # priority for gene ID: locus_tag, then gene, then protein_id;
                # a candidate only counts when it is longer than one character
                entryId = next(
                    (candidate
                     for candidate in (locusTag, gene, proteinId)
                     if len(candidate) > 1),
                    '')

                description = product if product.strip() else ''
                # Append the function only when there is one. The previous
                # one-liner appended the description to itself whenever the
                # function was empty, duplicating the product text.
                if function.strip():
                    description += '|' + function

                header = '>' + '|'.join((locus, species, entryId, description))
                entry = header + '\n' + translation

                if entry not in seen:
                    seen.add(entry)
                    entries.append(entry)
                    translationParts.append(translation)

    return entries, ''.join(translationParts)
def __init__(self, source_type=None, source_path=None, result_path=None):
    """Configure the pipeline from the loaded config plus explicit overrides.

    Every argument left as ``None`` is read from the ``dataPipeline`` section
    of the configuration returned by ``Utils.loadConfig()``.

    Args:
        source_type: overrides the ``source.type`` option when given.
        source_path: overrides the ``source.path`` option when given.
        result_path: overrides the ``result.path`` option when given.

    Side effects:
        Creates the directory part of the normalized result path and a
        ``temp`` directory next to this module, if they do not exist yet.
    """
    section = 'dataPipeline'

    self.config = Utils.loadConfig()
    self.task = self.config.get(section, 'task')

    # explicit arguments win; the config is only consulted as a fallback
    if source_path is None:
        source_path = self.config.get(section, 'source.path')
    self.source_path = source_path

    if source_type is None:
        source_type = self.config.get(section, 'source.type')
    self.source_type = source_type

    if result_path is None:
        result_path = self.config.get(section, 'result.path')
    self.result_path = result_path
    self.result_path = Utils.normalizePath(self.result_path)

    # make sure the output location exists
    resultDir = os.path.dirname(self.result_path)
    os.makedirs(resultDir, exist_ok=True)

    # species name, used when naming temp files
    self.species = Utils.getSpecies(self.source_path)

    # temp dir used by sub-pipelines, located beside this source file
    moduleDir = os.path.dirname(os.path.realpath(__file__))
    self.path = moduleDir + '/temp/'
    os.makedirs(os.path.dirname(self.path), exist_ok=True)