Пример #1
0
    def getDomains(self, sparkContext):
        """Run domain prediction over the source sequences using Spark.

        The sequences loaded from ``self.source_path`` are distributed as an
        RDD. When ``self.source_type`` mentions "nucleotide" each element is
        first passed through ``SixFrameTranslator.main``. Every element is
        then passed to ``DomainFinder.main`` and the outcome to
        ``self.processDomainOutput``.

        Side effects: refreshes ``self.species`` and prints progress messages.

        Args:
            sparkContext: object whose ``parallelize`` method builds the RDD.

        Returns:
            The mapping produced by ``collectAsMap()`` on the processed RDD.

        Raises:
            ValueError: if ``self.source_type`` does not mention "fasta".
        """
        # recover the species name for using in temp files
        self.species = Utils.getSpecies(self.source_path)
        domainFinder = DomainFinder.DomainFinder()

        # Only the FASTA parser yields the per-sequence content that is
        # distributed below. The previous "genbank" branch never bound that
        # content (it stored the parser's return value in an unused local),
        # so any non-FASTA source died later with an UnboundLocalError.
        # Fail here with an explicit message instead.
        # NOTE(review): if GenBank input should be supported, decide what
        # should be parallelized for it; each element must be usable as
        # ``(x[0], x[1])`` by DomainFinder.main -- confirm with the parsers.
        if "fasta" not in self.source_type:
            raise ValueError(
                "getDomains can only build the sequence RDD from a FASTA "
                "source; got source type %r" % (self.source_type,))

        # the first returned value (the entry list) is not needed here
        _, file_content = Parsers.parseFastaToList(self.source_path, "")

        print('Processing domains...')

        # create RDD with source sequences
        sourceRDD = sparkContext.parallelize(file_content, numSlices=2000)

        if "nucleotide" in self.source_type:
            # execute sixFrame translation for each sequence in RDD
            sourceRDD = sourceRDD.map(lambda x: SixFrameTranslator.main(x))

        # execute Pfam domain prediction for each element of the RDD
        domainsRDD = sourceRDD.map(lambda x: domainFinder.main(x[0], x[1]))
        processedRDD = domainsRDD.map(
            lambda x: self.processDomainOutput(x[0], x[1]))

        # recover Pfam domain prediction results from RDD
        result = processedRDD.collectAsMap()

        print('Done!')

        return result
Пример #2
0
def _firstQualifier(qualifiers, key):
    """Return the first value stored under ``key`` in ``qualifiers``, or ''."""
    values = qualifiers.get(key)
    return values[0] if values else ''


def genBankToAminoacid(path):
    """Collect the CDS amino-acid translations found in GenBank input.

    Args:
        path: a single file, or a location handed to
            ``Utils.listFilesExt(path, 'gbk')`` to enumerate the files.

    Returns:
        A ``(entries, translations)`` tuple. ``entries`` is a list of unique
        FASTA-like strings in first-seen order: a header line
        ``>locus|species|id|description``, a newline, then the translation.
        ``translations`` is the concatenation of the translation of every
        entry that was kept.
    """
    if os.path.isfile(path):
        files = [path]
    else:
        files = Utils.listFilesExt(path, 'gbk')

    entries = []
    seen = set()            # constant-time duplicate check; list keeps order
    keptTranslations = []   # joined once at the end instead of repeated +=

    for filePath in files:
        species = Utils.getSpecies(filePath)

        for record in parseGenBank(filePath):
            locus = record.id

            for feature in record.features:
                if feature.type != "CDS":
                    continue

                # assumes qualifiers maps each key to a sequence of values
                # (as iterated with .items() and indexed with [0] before)
                qualifiers = feature.qualifiers
                translation = _firstQualifier(qualifiers, "translation")
                gene = _firstQualifier(qualifiers, "gene")
                locusTag = _firstQualifier(qualifiers, "locus_tag")
                proteinId = _firstQualifier(
                    qualifiers, "protein_id").replace('/', '')
                product = _firstQualifier(qualifiers, "product")
                function = _firstQualifier(qualifiers, "function")

                # identifier priority: locus_tag, then gene, then protein_id;
                # a candidate is only used when longer than one character
                entryId = next(
                    (candidate
                     for candidate in (locusTag, gene, proteinId)
                     if len(candidate) > 1),
                    '')

                description = product if product.strip() else ''
                # Append the function only when there is one. The previous
                # one-line conditional appended the description to itself
                # whenever the function was blank.
                if function.strip():
                    description += '|' + function

                entry = ('>' + locus + '|' + species + '|' + entryId + '|'
                         + description + '\n' + translation)
                if entry not in seen:
                    seen.add(entry)
                    entries.append(entry)
                    keptTranslations.append(translation)

    return entries, ''.join(keptTranslations)
Пример #3
0
 def __init__(self, source_type=None, source_path=None, result_path=None):
     """Initialise the pipeline from the 'dataPipeline' config section.

     Each argument, when given, takes precedence over the matching config
     option ('source.type', 'source.path', 'result.path'); the config is
     only queried for arguments left as None. The directory part of the
     normalized result path and a 'temp' directory next to this module are
     created if missing.
     """
     section = 'dataPipeline'
     self.config = Utils.loadConfig()
     self.task = self.config.get(section, 'task')

     # explicit arguments win over the configuration file
     if source_path is None:
         source_path = self.config.get(section, 'source.path')
     self.source_path = source_path

     if source_type is None:
         source_type = self.config.get(section, 'source.type')
     self.source_type = source_type

     if result_path is None:
         result_path = self.config.get(section, 'result.path')
     self.result_path = Utils.normalizePath(result_path)

     # make sure the output location exists
     result_dir = os.path.dirname(self.result_path)
     os.makedirs(result_dir, exist_ok=True)

     # species name, used when naming temp files
     self.species = Utils.getSpecies(self.source_path)

     # temp dir used by sub-pipelines, located beside this module
     module_dir = os.path.dirname(os.path.realpath(__file__))
     self.path = module_dir + '/temp/'
     os.makedirs(os.path.dirname(self.path), exist_ok=True)