def extractRewardPerFeat(self, dataPath, outputPath, featType, sourceType, rewardType):
    """Build a per-feature reward vector across all files in the dataset.

    Each feature (e.g. a k-mer string) maps to a list with one slot per
    distinct file; the slot holds +1/-1 depending on the file's label.

    Args:
        dataPath: directory parsed by ``Parsers.parseDatasetContents``.
        outputPath: file to write the tab/comma-separated table to.
        featType, sourceType: forwarded to the parser.
        rewardType: if it contains ``'label'`` the slot is overwritten
            with the label sign; otherwise occurrences are accumulated.

    Returns:
        dict mapping feature content -> list of per-file reward values.
    """
    rewardperfeat = {}
    # parser yields tuples of shape ((file, content, id), 'kmers')
    resultLabel = Parsers.parseDatasetContents(dataPath, featType, sourceType)
    # NOTE(review): set() makes the column order nondeterministic across
    # runs (hash randomization) — confirm downstream consumers don't
    # depend on a stable column order before sorting this.
    fileindex = list(set([i[0][0] for i in resultLabel]))
    for item in resultLabel:
        filename = item[0][0]
        label = Utils.getLabel(filename)
        content = item[0][1]
        idx = fileindex.index(filename)
        # map label 1 -> +1 reward, anything else -> -1
        occ = 1 if label == 1 else -1
        # Create the per-file vector lazily; the update below is shared
        # instead of being duplicated in both branches.
        if content not in rewardperfeat:
            rewardperfeat[content] = [0] * len(fileindex)
        if 'label' in rewardType:
            rewardperfeat[content][idx] = occ    # overwrite with label sign
        else:
            rewardperfeat[content][idx] += occ   # accumulate occurrences
    # join instead of quadratic '+='; also avoids the trailing-newline trim
    outputstr = '\n'.join(
        k + '\t' + ','.join(map(str, v)) for k, v in rewardperfeat.items())
    Utils.writeFile(outputPath, outputstr)
    return rewardperfeat
def addInstance(self, info):
    """Turn one parsed record into an (instanceID, values) pair.

    ``info`` is a tuple in format ``(fileID, [list of extracted features])``;
    for pfam only the layout is inverted: ``(pfamID, [list of fileIDs])``.

    Returns:
        ``((fileID, size, label), instanceValues)`` where
        ``instanceValues`` holds the dictionary indices of the features
        that are present in ``self.dictionary`` (absent features are
        skipped, so ``size`` counts only the found ones).
    """
    is_pfam = 'pfam' in self.featType
    # pfam records carry the label in the joined file-ID list; otherwise
    # the label comes from the file ID itself.
    label = Utils.getLabel(''.join(info[1])) if is_pfam else Utils.getLabel(info[0])
    fileID = info[0]
    features = [info[0]] if is_pfam else info[1]
    # The original used a conditional expression as a statement
    # ('append(...) if feat in dict else ""'), discarding the "" arm.
    # Same behavior, written as an explicit filter: unknown features
    # are skipped, not appended as empty strings.
    instanceValues = [self.dictionary[feat] for feat in features
                      if feat in self.dictionary]
    size = len(instanceValues)
    instanceID = (fileID, int(size), str(label))
    return instanceID, instanceValues
def countOccurrence(self, dataPath, sparkContext):
    """Count feature occurrences per sequence in parallel with Spark.

    Args:
        dataPath: dataset directory parsed by ``Parsers.parseDatasetContents``.
        sparkContext: live ``SparkContext`` used to parallelize the work.

    Returns:
        Tuple ``(ids, occ, labels, parentDir)`` — numpy arrays of the
        merged instance ids, occurrence vectors and labels, plus the
        name of the dataset's parent directory.
    """
    feats = self.loadFeatures()
    contentIds = []
    listContents = Parsers.parseDatasetContents(dataPath, self.featType, self.sourceType)
    # name of the directory containing the first parsed file
    parentDir = os.path.split(os.path.dirname(listContents[0][0][0]))[1]
    for info in listContents:
        filename = info[0][0]
        content = info[0][1]
        featType = info[1]  # renamed from 'type' (shadowed a builtin)
        firstLine = Utils.readFileLines(filename)[0]
        # sequence id from the FASTA-style header; renamed from 'id'
        # (shadowed a builtin)
        seqId = firstLine.replace('>', '') if '|' in firstLine \
            else firstLine.split('.')[0].replace('>', '')
        label = Utils.getLabel(filename)
        # avoid cases in which test synthetic genes are long and in the
        # split different clusters share the same (gene) id: append one
        # '|' per already-registered clash so keys stay unique.
        for existing in contentIds:
            if seqId in existing[0] and featType in existing[1]:
                seqId = seqId + '|'
        contentIds.append((seqId, featType, content, label))
    sourceRDD = sparkContext.parallelize(contentIds, numSlices=1000)
    occRDD = sourceRDD.map(lambda x: self.occurrence(x, feats))
    # combine features with same ID and filter out instances with not
    # enough features
    reducedRDD = occRDD.reduceByKey(
        lambda x, y: self.mergeFeatsSameId(x, y))
    ids = reducedRDD.map(lambda x: x[0]).collect()
    occ = reducedRDD.map(lambda x: x[1][0]).collect()
    labels = reducedRDD.map(lambda x: x[1][1]).collect()
    print('Features loaded.')
    return np.array(ids), np.array(occ), np.array(labels), parentDir