Exemplo n.º 1
0
    def __init__(self):
        """Read the evaluation settings and pre-compute gold-standard data.

        Loads the configuration through ``Utils.loadConfig``, reads the
        ``eval`` section, creates the Spark context and the Similarity /
        Filter / Merger collaborators, then reads the gold ID file and the
        list of result files so that later steps can reuse them.
        """
        cfg = Utils.loadConfig()
        self.config = cfg

        # Settings taken from the 'eval' section of the configuration.
        self.task = cfg.get('eval', 'task')
        self.gold = cfg.get('eval', 'goldID.path')
        self.result = cfg.get('eval', 'result.path')
        self.threshold = float(cfg.get('eval', 'threshold'))

        # One Spark context, also handed to the Filter collaborator below.
        spark = SparkContext(conf=Utils.getSparkConf('filter'))
        self.sparkContext = spark

        self.Similarity = Similarity.Similarity(cfg)
        self.Filter = Filter.Filter(cfg, sparkContext=spark)
        self.Merger = Merger.Merger(cfg)

        # The first line of the gold file is dropped (presumably a header
        # row -- confirm against the file format).
        goldLines = Utils.readFileLines(self.gold)
        self.goldIDs = goldLines[1:]
        self.resultFiles = Utils.listFilesExt(self.result, 'IDs.test')

        # Total number of gold genes.
        self.nbGoldGenes = len(self.goldIDs)

        # Gold entries folded per cluster, plus the flattened gene list.
        folded = Utils.foldClusterData(self.goldIDs, 'gold', 0)
        self.foldedGold = folded
        flattened = []
        for clusterGenes in folded.values():
            flattened.extend(clusterGenes)
        self.goldGenes = flattened

        # Total number of gold clusters.
        self.nbGoldClusters = len(folded)

        # Header rows for the tab-separated output and score files.
        self.outputheader = ('goldClusterID\tgoldGeneID\t'
                             'predictedClusterLabel\tpredictedClusterID\n')
        self.scoreheader = ('goldClusterID\tpredictedClusterID\t'
                            'clusterScore\n')
Exemplo n.º 2
0
# Ordered (keyword, colour) pairs: the first keyword contained in a gene's
# annotation text decides the colour of its per-gene record.
_ANNOTATION_COLORS = (
    ('backbone', '#EE0000'),  # red
    ('tailor', '#EE9300'),  # orange
    ('transcript', '#048014'),  # forest green
    ('transport', '#1888f0'),  # light blue
)
# Colour for genes that appear in the gold content but have no annotation.
_GOLD_ONLY_COLOR = '#9931f2'  # bright purple


def _pickGeneColor(gene, annotationList, annotationContent, goldContent):
    """Return the colour code for ``gene``, or None when no colour applies."""
    if not goldContent:
        # Colours are only assigned when gold content was supplied.
        return None

    if gene in annotationContent:
        # Substring match, as in the membership test above; first hit wins.
        firstMatch = next(
            (item for item in annotationList if gene in item), None)
        fields = firstMatch.split('\t') if firstMatch is not None else []
        # Tolerate annotation lines without a second tab-separated field.
        annot = fields[1] if len(fields) > 1 else ''
        for keyword, color in _ANNOTATION_COLORS:
            if keyword in annot:
                return color
        return None

    if gene in goldContent:
        return _GOLD_ONLY_COLOR
    return None


def _formatGffRecord(startFeature, endFeature, source_type, info):
    """Build one tab-separated output line spanning the two features."""
    columns = (
        startFeature['seqid'],
        source_type,
        startFeature['start'],
        endFeature['end'],
        '?',  # score placeholder, kept exactly as emitted before
        startFeature['strand'],
        startFeature['phase'],
        info,
    )
    # str() keeps integer coordinates (or a numeric phase) from breaking
    # the join.
    return '\t'.join(str(column) for column in columns) + '\n'


def clustersToGFF(clusterspath, gffpath, goldpath, annotpath, source_type):
    """Write per-cluster and per-gene GFF3 files for a cluster listing.

    The cluster file is folded into ``{cluster id: [gene names]}`` through
    ``Utils.foldClusterData`` and every gene is looked up among the
    ``mRNA`` features of the GFF file. Two files are written next to
    ``clusterspath`` (its extension replaced by ``.percluster.gff3`` and
    ``.pergene.gff3``): one record per cluster, spanning from the start of
    its first located gene to the end of its last located gene, and one
    record per located gene.

    Args:
        clusterspath: Path of the cluster file. When the path contains
            ``'score'`` the data is folded with ``("", 0.5)``, otherwise
            with ``("gold", "")``.
        gffpath: Path of the GFF3 file providing the gene coordinates.
        goldpath: Optional path of the gold file; when empty no colour
            attribute is added to the gene records.
        annotpath: Optional path of a tab-separated annotation file whose
            second field selects the gene colour.
        source_type: Text written verbatim between the seqid and start
            columns (presumably it carries both the source and the type
            column -- confirm with callers).

    Returns:
        The parsed ``Gff3`` object built from ``gffpath``.
    """
    gffcontent = Gff3(gffpath)

    clustercontent = Utils.readFileLines(clusterspath)
    if 'score' in clusterspath:
        clusters = Utils.foldClusterData(clustercontent, "", 0.5)
    else:
        clusters = Utils.foldClusterData(clustercontent, "gold", "")

    goldContent = '\t'.join(Utils.readFileLines(goldpath)) if goldpath else ""
    # Always a list, so the later scan never iterates over a bare string.
    annotationList = Utils.readFileLines(annotpath) if annotpath else []
    annotationContent = '\n'.join(annotationList)

    outputBase = clusterspath.rsplit('.', 1)[0]
    gffclusterfile = outputBase + '.percluster.gff3'
    gffgenefile = outputBase + '.pergene.gff3'

    # Keep only "mRNA" features: {gene name: gff line}.
    # NOTE(review): replace('.1', '') removes every '.1' occurrence, not
    # just a trailing suffix -- confirm gene names never contain '.1'
    # elsewhere.
    mRNAdict = {
        line['attributes']['Name'].replace('.1', ''): line
        for line in gffcontent.lines if line['type'] == 'mRNA'
    }

    # Records are collected in lists and joined once at the end.
    clusterRecords = ["##gff-version 3\n"]
    geneRecords = ["##gff-version 3\n"]

    # Clusters are emitted in key order.
    for key, value in sorted(clusters.items(), key=lambda item: item[0]):
        located = []
        for gene in value:
            gene = gene.replace('.1', '')
            thisgene = mRNAdict.get(gene)
            if thisgene is None:
                print('gene not found:', gene)
                continue
            located.append(thisgene)

            info = 'Name=' + gene + ';Note=' + key
            color = _pickGeneColor(gene, annotationList, annotationContent,
                                   goldContent)
            if color is not None:
                info += ';color=' + color
            geneRecords.append(
                _formatGffRecord(thisgene, thisgene, source_type, info))

        if not located:
            # Previously an empty cluster, or one whose boundary genes were
            # missing from the GFF, raised and no file was written at all.
            print('cluster skipped, no gene found in GFF:', key)
            continue

        # Boundaries come from the first and last genes present in the GFF.
        info = 'Name=' + key + ';Note=' + '|'.join(value)
        clusterRecords.append(
            _formatGffRecord(located[0], located[-1], source_type, info))

    Utils.writeFile(gffclusterfile, ''.join(clusterRecords))
    Utils.writeFile(gffgenefile, ''.join(geneRecords))

    return gffcontent