示例#1
0
 def __init__(self, file, skipInit=False, stream=open(os.devnull, 'w')):
     """Build a phylogenetic tree from prepared data or from a file.

     :param file: either a tuple ``(items, root, officialName)`` whose
         members are stored as-is on the instance, or a value accepted by
         ``myFile.openFile`` from which the tree is parsed.
     :param skipInit: when loading from a file, a true value skips the
         call to ``reinitTree()`` and writes "OK" to ``stream`` instead.
     :param stream: file-like object receiving the progress messages. The
         default is a handle on ``os.devnull`` created once, when the
         method is defined, and shared by every call.
     """
     # isinstance (rather than an exact type comparison) also accepts tuple
     # subclasses such as namedtuples.
     if isinstance(file, tuple):
         print >> stream, "Creation of the phylogenetic tree ...",
         (self.items, self.root, self.officialName) = file
         # NOTE(review): as the code stands, neither reinitTree() nor the
         # "OK" message is reached on this path -- confirm this is intended.
     else:
         print >> stream, "Loading phylogenetic tree %s ..." % file,
         self.officialName = {}
         self.items = self.newCommonNamesMapperInstance()
         # name and instance of file
         f = myFile.openFile(file, 'r')
         # opened objects without a ``name`` attribute fall back to the
         # argument itself
         self.name = getattr(f, 'name', file)
         f = myFile.firstLineBuffer(f)
         # try/finally: the file is closed even when a loader raises
         try:
             firstLine = f.firstLine
             if (';' in firstLine) or ('(' in firstLine):
                 # the first line looks like Newick: join all the lines
                 # into a single string and append a terminating " ;"
                 self.__loadFromNewick__(' '.join(f).replace('\n', '') + " ;")
             else:
                 self.__loadFromMyFormat__(f)
         finally:
             f.close()
         if not skipInit:
             self.reinitTree()
         else:
             print >> stream, "OK"
示例#2
0
    def __init__(self, *args, **kwargs):
        """Create an empty genome, or fill it from one source argument.

        Positional arguments: none (the genome stays empty) or exactly one:
          * a ``str``: name of a tab-separated file whose first line has
              - 3 columns: (c, s, gName),
              - 5 columns: (c, beg, end, s, gName),
              - 6 columns: (c, beg, end, s, gName, transcriptName);
            only (c, s, gName) is kept, and for the 5-column format only the
            first space-separated name of the gName column is kept;
          * a ``myGenomes.Genome``: the first name of each gene is used and
            chromosome keys are converted with ``str``;
          * a ``LightGenome``: genes and gene->position entries are rebuilt
            as new objects;
          * a ``dict``: chromosome -> iterable of (gName, strand).

        Keyword arguments:
          withDict -- if true, ``self.g2p`` maps each gene name to a
              ``GeneP(chromosome, index)`` (default False).

        Raises ``ValueError`` if the file has another number of columns or
        if the argument is of none of the types above.
        """
        self.name = None
        # this dict contains the sets chromosomes per type of contig (cf class ContigType)
        self.chrSet = collections.defaultdict(set)
        myTools.DefaultOrderedDict.__init__(self, default_factory=list)
        self.withDict = kwargs.get("withDict", False)
        if self.withDict:
            # dict 'G'ene to (pronounced '2') 'P'osition
            self.g2p = {}
        if len(args) == 0:
            return
        assert len(args) == 1, args
        arg = args[0]

        if isinstance(arg, str):
            fileName = arg
            self.name = fileName
            print >> sys.stderr, "Loading LightGenome from", fileName,
            # The first line is only used to choose the loading function;
            # readTabular re-opens the file by name, so this handle is
            # closed as soon as the first line has been inspected.
            flb = myFile.firstLineBuffer(myFile.openFile(fileName, 'r'))
            try:
                nbColumns = len(flb.firstLine.split("\t"))
            finally:
                flb.close()

            if nbColumns == 6:
                print >> sys.stderr, "(c, beg, end, s, gName, transcriptName) -> (c, s, gName)",
                # c, beg, end, s,  gName, transcriptName
                reader = myFile.myTSV.readTabular(fileName, [str, int, int, int, str, str])
                reader = ((c, strand, gName) for (c, beg, end, strand, gName, tName) in reader)
            elif nbColumns == 3:
                print >> sys.stderr, "(c, s, gName)",
                # c, s, gName
                reader = myFile.myTSV.readTabular(fileName, [str, int, str])
            elif nbColumns == 5:
                print >> sys.stderr, "(c, beg, end, s, gName) -> (c, s, gName)",
                # c, beg, end, s,  gName
                reader = myFile.myTSV.readTabular(fileName, [str, int, int, int, str])
                # The last column may contain several space-separated gene
                # names; only the first one is kept.  The former test on the
                # first line (len(split) > 0) was always true, so this is
                # what was effectively applied to every line already.
                reader = ((c, strand, gNames.split(' ')[0]) for (c, beg, end, strand, gNames) in reader)
            else:
                raise ValueError("%s file is badly formatted" % fileName)
            print >> sys.stderr, "...",
            # c = chromosome name
            # strand = value of the 's' column
            # gName = gene name
            for (c, strand, gName) in reader:
                self.chrSet[contigType(c)].add(c)
                genes = self[c]
                # The index of the new gene is the number of genes already
                # stored on c: this stays correct even when the lines of a
                # chromosome are not contiguous in the file.
                idx = len(genes)
                genes.append(OGene(gName, strand))
                if self.withDict:
                    # dict 'G'ene to (pronounced '2') 'P'osition
                    self.g2p[gName] = GeneP(c, idx)
            print >> sys.stderr, 'OK'
        elif isinstance(arg, myGenomes.Genome):
            genome = arg
            self.name = genome.name
            # NOTE(review): chrSet is shared with the source genome, not
            # copied -- confirm that this aliasing is intended.
            self.chrSet = arg.chrSet
            for (c, genes) in genome.lstGenes.iteritems():
                chrName = str(c)
                for (idx, g) in enumerate(genes):
                    self[chrName].append(OGene(g.names[0], g.strand))
                    if self.withDict:
                        self.g2p[g.names[0]] = GeneP(chrName, idx)
        elif isinstance(arg, LightGenome):
            self.name = arg.name
            # NOTE(review): chrSet is shared with the source genome whereas
            # genes and g2p are rebuilt -- confirm that this is intended.
            self.chrSet = arg.chrSet
            # the withDict setting of the source overrides the keyword
            self.withDict = arg.withDict
            for c in arg:
                self[c] = [OGene(gene.n, gene.s) for gene in arg[c]]
            if self.withDict:
                self.g2p = dict((gn, GeneP(gp.c, gp.idx)) for (gn, gp) in arg.g2p.iteritems())
        elif isinstance(arg, dict):
            # chrSet is not filled on this path
            genome = arg
            for c in genome:
                for (idx, (gName, strand)) in enumerate(genome[c]):
                    self[c].append(OGene(gName, strand))
                    if self.withDict:
                        # dict 'G'ene to (pronounced '2') 'P'osition
                        self.g2p[gName] = GeneP(c, idx)
        else:
            raise ValueError('Constructor needs a file')
示例#3
0
    def __init__(self, fichier, **kwargs):
        """Load a genome from a file, or build a filtered copy of a genome.

        :param fichier: a ``str`` file name passed to ``myFile.openFile``
            (the format is chosen from the first line of the file), or an
            existing genome object exposing ``name`` and ``lstGenes`` that
            is filtered into this one.
        :param kwargs: all of them are forwarded to ``self.init``; the
            following ones are also read here:
              ancGenes  -- (file input, "diags" formats) genome whose
                           ``lstGenes[None]`` list turns gene indices into
                           gene names;
              filterIn  -- (genome input) keep the genes having at least one
                           name in this collection;
              filterOut -- (genome input, only used when filterIn is absent)
                           keep the genes having no name in this collection.
        """
        if isinstance(fichier, str):
            print >> sys.stderr, "Loading genome of", fichier, "...",
            f = myFile.firstLineBuffer(myFile.openFile(fichier, 'r'))
            # try/finally: the file is closed even if a line fails to parse
            try:
                # list of genes per chromosome
                self.lstGenes = collections.defaultdict(list)
                hasAncGenes = 'ancGenes' in kwargs

                # choice of the loading function
                c = f.firstLine.split("\t")
                if f.firstLine.startswith(">") or f.firstLine.endswith("$"):
                    # GRIMM-Synteny format
                    ######################
                    if f.firstLine.startswith(">"):
                        # NOTE(review): this name is overwritten by the file
                        # name once loading is done -- confirm which one is
                        # wanted.
                        self.name = f.firstLine[1:].strip()
                    chrom = 1
                    for l in f:
                        l = l.strip()
                        # only the lines ending with "$" describe a chromosome
                        if not l.endswith("$"):
                            continue
                        for (i,x) in enumerate(l.replace("$","").split()):
                            strand = -1 if x.startswith("-") else 1
                            # the leading sign, if any, is not part of the name
                            self.addGene([x[1:] if x[0] in "-+" else x], chrom, i, i+1, strand)
                        chrom += 1
                    print >> sys.stderr, "(GRIMM)",

                elif len(c) == 1:
                    # ancestral genes: "NAMES"
                    ##########################
                    for (i,l) in enumerate(f):
                        self.lstGenes[None].append( Gene(None, i, i+1, 0, tuple(intern(x) for x in l.split())) )
                    print >> sys.stderr, "(ancestral genes)",

                elif (len(c) == 2) and not set(c[1]).issubset("01-"):
                    # ancestral genome: "CHR NAMES"
                    ###############################
                    # NOTE(review): a 2-column "LST-INDEX LST-STRANDS" first
                    # line whose second column contains a space (several
                    # strands) also passes this test -- confirm that the
                    # detection is as intended.
                    lastC = None
                    for (i,l) in enumerate(f):
                        c = l.split("\t")
                        if lastC != c[0]:
                            lastC = c[0]
                            # line index of the first gene of the chromosome
                            dec = i
                        self.addGene(c[1].split(), c[0], i-dec, i-dec+1, 0)
                    print >> sys.stderr, "(ancestral genome: chrom+noms)",

                elif (len(c) >= 5) and (" " not in c[3]) and (len(c[4]) > 0):
                    # Ensembl: "CHR BEG END STRAND NAMES"
                    #####################################
                    for l in f:
                        c = l.replace('\n', '').split('\t')
                        self.addGene(c[4].split(), c[0], int(c[1]), int(c[2]), int(c[3]) if c[3]!='None' else None)
                    print >> sys.stderr, "(Ensembl)",

                elif (len(c) == 4) and int(c[1]) < 2:
                    # ancestral genome: "CHR STRAND LST-INDEX LST-STRANDS"
                    ######################################################
                    if hasAncGenes:
                        ancGenes = kwargs["ancGenes"].lstGenes[None]
                    lastC = None
                    for l in f:
                        c = l.split("\t")
                        if lastC != c[0]:
                            lastC = c[0]
                            # positions restart at 0 on each new chromosome
                            pos = 0
                            currC = commonChrName(c[0])
                        data = zip([int(x) for x in c[2].split()], [int(x) for x in c[3].split()])
                        if int(c[1]) < 0:
                            # negative STRAND column: reverse the order of
                            # the genes and flip each gene strand
                            data = [(i,-s) for (i,s) in reversed(data)]
                        for (index,strand) in data:
                            if hasAncGenes:
                                self.lstGenes[currC].append( Gene(currC, pos, pos+1, strand, ancGenes[index].names) )
                            else:
                                self.lstGenes[currC].append( Gene(currC, pos, pos+1, strand, (index,)) )
                            pos += 1
                    print >> sys.stderr, "(ancestral genome: chrom+diags)",

                else:
                    # columns holding the list of indices and of strands
                    if len(c) == 2:
                        (ili,ils) = (0,1)
                    else:
                        assert len(c) >= 4
                        (ili,ils) = (2,3)
                        self.ancName = c[0]

                    if hasAncGenes:
                        ancGenes = kwargs["ancGenes"].lstGenes[None]

                    # ancestral genome: "LST-INDEX LST-STRANDS"
                    #############################################
                    for (i,l) in enumerate(f):
                        c = l.split("\t")
                        # one chromosome per line, numbered from 1
                        chrom = i+1
                        lchrom = self.lstGenes[chrom]
                        for (pos,(index,strand)) in enumerate(itertools.izip(c[ili].split(), c[ils].split())):
                            if hasAncGenes:
                                lchrom.append( Gene(chrom, pos, pos+1, int(strand), ancGenes[int(index)].names) )
                            else:
                                lchrom.append( Gene(chrom, pos, pos+1, int(strand), (int(index),) ) )
                    print >> sys.stderr, "(ancestral genome: diags)",
            finally:
                f.close()
            self.name = fichier

        else:
            genomeBase = fichier
            print >> sys.stderr, "Filtering of", genomeBase.name, "...",
            filterIn = set(kwargs["filterIn"]) if "filterIn" in kwargs else None
            filterOut = set(kwargs["filterOut"]) if "filterOut" in kwargs else None

            def filt(gene):
                # filterIn takes precedence: filterOut is only looked at
                # when filterIn is not given
                if filterIn is not None:
                    return any(s in filterIn for s in gene.names)
                if filterOut is not None:
                    return all(s not in filterOut for s in gene.names)
                return True

            self.lstGenes = {}
            for (chrom,l) in genomeBase.lstGenes.iteritems():
                l = [gene for gene in l if filt(gene)]
                # chromosomes left without any gene are dropped
                if len(l) > 0:
                    self.lstGenes[chrom] = l
            self.name = "Filter from " + genomeBase.name
            print >> sys.stderr, "%d genes -> %d genes" % (sum(len(x) for x in genomeBase.lstGenes.itervalues()), sum(len(x) for x in self.lstGenes.itervalues())),

        self.init(**kwargs)
        print >> sys.stderr, "OK"