def loadFilterMap(self, sparkContext):
    """Load the domain filter and return the sorted gene names it matches.

    Reads the Pfam-ID filter file (``self.filterList``) and the per-gene
    domain annotations (``self.filterMap``), then uses Spark to find every
    gene whose annotation contains at least one filtered domain.

    Args:
        sparkContext: an active Spark context used to parallelize the
            parsed dataset contents.

    Returns:
        Sorted list of gene names (file basenames with the ``.domains``
        extension stripped) that should be filtered out downstream.
    """
    # Raw lines from the filter file; kept as-is so the summary print
    # below reports the same count as the file has lines.
    filter_lines = Utils.readFileLines(self.filterList)
    # Membership is tested once per split token per record inside the
    # Spark filter — a set makes each test O(1) instead of O(n).
    filter_ids = set(filter_lines)

    # parseDatasetContents returns tuples of ((file, content), 'domains')
    # — presumably one entry per .domains file; confirm against Parsers.
    content = Parsers.parseDatasetContents(self.filterMap, 'domains', 'domains')
    dom_rdd = sparkContext.parallelize(content, numSlices=1000)

    # Key each record by gene name: the file basename minus '.domains'.
    domains_rdd = dom_rdd.map(
        lambda x: (Utils.getFileName(x[0][0]).replace('.domains', ''), x[0][1]))

    # Keep genes that have any domain in the filter set. Splitting on
    # both newline and '.' also discards the ".\d+" version suffix of a
    # Pfam ID, so versioned IDs still match their unversioned form.
    # (Renamed from 'filter', which shadowed the builtin.)
    matched = domains_rdd.filter(lambda x: any(
        domain in filter_ids for domain in re.split(r"[\n.]", x[1])))

    genes = sorted(matched.collectAsMap().keys())
    print('Loaded filter:', len(genes), ' genes will be filtered from',
          len(filter_lines), 'domains.')
    return genes
def getTupleFilename(self, tup):
    """Return the base filename (text before the first '.') of tup's first element."""
    base_name = Utils.getFileName(tup[0])
    return base_name.partition('.')[0]