Exemplo n.º 1
0
    def extractRewardPerFeat(self, dataPath, outputPath, featType, sourceType,
                             rewardType):
        """Build a per-feature reward vector across all dataset files.

        Parses the dataset at *dataPath*, then for every feature content
        string accumulates a reward per file: +1 for positive-label files,
        -1 otherwise.  When *rewardType* contains 'label' the reward is
        assigned (overwritten); otherwise occurrences are summed.

        The table is also written to *outputPath* as tab-separated
        ``content\\tv1,v2,...`` lines.

        Returns:
            dict mapping feature content -> list of rewards, one slot per
            distinct source file (order given by the de-duplicated file list).
        """
        rewardperfeat = {}
        # tuple of shape {(file, content, id),'kmers'}
        resultLabel = Parsers.parseDatasetContents(dataPath, featType,
                                                   sourceType)
        # NOTE(review): set() makes the file order non-deterministic across
        # runs; column order of the reward vectors depends on it.
        fileindex = list(set([i[0][0] for i in resultLabel]))

        for item in resultLabel:
            filename = item[0][0]
            label = Utils.getLabel(filename)
            content = item[0][1]
            idx = fileindex.index(filename)
            occ = 1 if label == 1 else -1

            # Initialize once, then apply the same update in both cases
            # (the original duplicated the inner if/else in both branches).
            if content not in rewardperfeat:
                rewardperfeat[content] = [0] * len(fileindex)
            if 'label' in rewardType:
                rewardperfeat[content][idx] = occ
            else:
                rewardperfeat[content][idx] += occ

        # join avoids quadratic += concatenation and the [:-1] strip hack
        lines = (k + '\t' + ','.join(map(str, v))
                 for k, v in rewardperfeat.items())
        Utils.writeFile(outputPath, '\n'.join(lines))

        return rewardperfeat
Exemplo n.º 2
0
    def addInstance(self, info):
        """Convert one parsed instance into (ID tuple, feature-value list).

        Args:
            info: tuple in format (fileID, [list of extracted features]);
                for pfam only the tuple is inverted: (pfamID, [list of fileIDs]).

        Returns:
            instanceID: (fileID, number of mapped features, label as str)
            instanceValues: dictionary values for every feature present in
                ``self.dictionary`` (unknown features are silently skipped).
        """
        instanceValues = []

        # pfam instances carry the label on the file-ID list, not on info[0]
        isPfam = 'pfam' in self.featType
        label = Utils.getLabel(''.join(info[1])) if isPfam else Utils.getLabel(info[0])
        fileID = info[0]
        features = [info[0]] if isPfam else info[1]

        # plain statement instead of the original ternary-as-statement misuse
        for feat in features:
            if feat in self.dictionary:
                instanceValues.append(self.dictionary[feat])

        # len() already returns int, no cast needed
        instanceID = (fileID, len(instanceValues), str(label))

        return instanceID, instanceValues
Exemplo n.º 3
0
    def countOccurrence(self, dataPath, sparkContext):
        """Count feature occurrences per sequence ID using Spark.

        Parses the dataset, derives a gene ID from each file's FASTA-style
        header line, maps feature occurrences in parallel and reduces
        entries sharing the same ID.

        Args:
            dataPath: dataset root passed to the parser.
            sparkContext: live SparkContext used for the RDD pipeline.

        Returns:
            (ids, occ, labels, parentDir) as numpy arrays plus the name of
            the dataset's parent directory.
        """
        feats = self.loadFeatures()
        contentIds = []

        listContents = Parsers.parseDatasetContents(dataPath, self.featType,
                                                    self.sourceType)
        parentDir = os.path.split(os.path.dirname(listContents[0][0][0]))[1]

        for info in listContents:
            filename = info[0][0]
            content = info[0][1]
            featKind = info[1]  # renamed: 'type' shadowed the builtin
            firstLine = Utils.readFileLines(filename)[0]
            # keep the whole header when it contains '|'; otherwise use the
            # accession prefix before the first '.'
            if '|' in firstLine:
                geneId = firstLine.replace('>', '')
            else:
                geneId = firstLine.split('.')[0].replace('>', '')
            label = Utils.getLabel(filename)

            # avoid cases in which test synthetic genes are long and
            # in the split different clusters share same (gene) id:
            # append '|' until the (id, kind) pair is unique
            for item in contentIds:
                if geneId in item[0] and featKind in item[1]:
                    geneId = geneId + '|'

            contentIds.append((geneId, featKind, content, label))

        sourceRDD = sparkContext.parallelize(contentIds, numSlices=1000)
        occRDD = sourceRDD.map(lambda x: self.occurrence(x, feats))

        # combine features with same ID and filter out instances with not enough features
        reducedRDD = occRDD.reduceByKey(
            lambda x, y: self.mergeFeatsSameId(x, y))

        ids = reducedRDD.map(lambda x: x[0]).collect()
        occ = reducedRDD.map(lambda x: x[1][0]).collect()
        labels = reducedRDD.map(lambda x: x[1][1]).collect()

        print('Features loaded.')
        return np.array(ids), np.array(occ), np.array(labels), parentDir