Exemplo n.º 1
0
    def loadFilterMap(self, sparkContext):
        """Return a sorted list of gene names whose domain annotations match
        any Pfam ID listed in ``self.filterList``.

        Parameters:
            sparkContext: an active Spark context used to parallelize the
                parsed domain-map contents.

        Returns:
            list[str]: sorted gene names to be filtered out downstream.
        """
        filterLines = Utils.readFileLines(self.filterList)
        # Set gives O(1) membership tests inside the per-record RDD filter
        # below (the list version is O(n) per token).
        filterSet = set(filterLines)

        # returns tuple (((file, content), 'domains'))
        content = Parsers.parseDatasetContents(self.filterMap, 'domains',
                                               'domains')

        domRDD = sparkContext.parallelize(content, numSlices=1000)
        # key each record by its file name with the '.domains' suffix removed
        domainsRDD = domRDD.map(lambda x: (Utils.getFileName(x[0][0]).replace(
            '.domains', ''), x[0][1]))

        # keep genes that have any domain in filterSet
        # splitting on '.' discards the ".\d+" version suffix of Pfam IDs
        matching = domainsRDD.filter(lambda x: any(
            domain in filterSet for domain in re.split(r"[\n.]", x[1])))

        genes = sorted(matching.collectAsMap().keys())

        print('Loaded filter:', len(genes), ' genes will be filtered from',
              len(filterLines), 'domains.')
        return genes
Exemplo n.º 2
0
 def getTupleFilename(self, tup):
     """Return the base file name of the tuple's first element,
     truncated at the first '.' (i.e. without any extension)."""
     base = Utils.getFileName(tup[0])
     return base.partition('.')[0]