Exemplo n.º 1
0
 def from_gff(cls, filename, name, description, build, organism):
     '''
         Import a RefGen from a GFF file.

         Parameters
         ----------
         filename : str
             Path to the GFF file to read.
         name : str
             Name passed to cls.create for the new dataset.
         description : str
             Description passed to cls.create for the new dataset.
         build : str
             Stored as the 'build' global and attached to every Gene.
         organism : str
             Stored as the 'organism' global and attached to every Gene.

         Returns
         -------
         The object returned by cls.create, after chromosomes and genes
         found in the file have been added to it.

         Raises
         ------
         ValueError
             If a non-comment line does not have nine columns.
         KeyError
             If a 'chromosome' or 'gene' record has no ID attribute.
     '''
     refgen = cls.create(name, description, type='RefGen')
     refgen._global('build', build)
     refgen._global('organism', organism)
     genes = list()
     with open(filename, 'r') as IN:
         for line in IN:
             line = line.strip()
             # Skip blank lines as well as comment/directive lines
             if not line or line.startswith('#'):
                 continue
             # GFF columns are tab separated; splitting on arbitrary
             # whitespace would break attribute values containing spaces.
             columns = line.split('\t')
             if len(columns) != 9:
                 # Tolerate whitespace-delimited files: the attributes
                 # column is last, so cap the number of splits.
                 columns = line.split(None, 8)
             if len(columns) != 9:
                 raise ValueError(
                     'Expected 9 columns in GFF line, found {}: {!r}'.format(
                         len(columns), line))
             (chrom, source, feature, start, end, score, strand, frame,
              raw_attributes) = [column.strip() for column in columns]
             attributes = {}
             for field in raw_attributes.strip(';').split(';'):
                 # Split on the first '=' only so values may contain '='
                 key, sep, value = field.strip().partition('=')
                 if sep:
                     attributes[key] = value
             if feature == 'chromosome':
                 refgen.log('Found a chromosome: {}', attributes['ID'])
                 refgen.add_chromosome(Chrom(attributes['ID'], end))
             elif feature == 'gene':
                 genes.append(
                     Gene(chrom,
                          int(start),
                          int(end),
                          attributes['ID'].upper(),
                          strand=strand,
                          build=build,
                          organism=organism))
     # Genes are collected first and added in a single call
     refgen.add_gene(genes)
     return refgen
Exemplo n.º 2
0
 def from_ids(self, gene_list, check_shape=False, enumerated=False):
     '''
         Return a list of Gene objects for an iterable of gene id strings.

         Parameters
         ----------
         gene_list : iterable of str
             Gene ids to look up. Ids are upper-cased before querying.
         check_shape : bool (default: False)
             If True, raise a ValueError when the number of genes found
             differs from the number of ids supplied.
         enumerated : bool (default: False)
             Currently unused by this method.

         Returns
         -------
         list of Gene, in whatever order the database returns the rows.

         Raises
         ------
         TypeError
             If an element of gene_list is not a str.
         ValueError
             If check_shape is True and the counts do not match.
     '''
     # Materialize once so generators work and len() is well defined
     ids = [str.upper(gene_id) for gene_id in gene_list]
     # The ids are embedded as SQL string literals (a single query keeps
     # us clear of SQLite's bound-variable limit for long lists), so any
     # single quote must be doubled to keep the literal intact.
     quoted = "','".join(gene_id.replace("'", "''") for gene_id in ids)
     rows = self.db.cursor().execute('''
         SELECT chromosome,start,end,id FROM genes WHERE id IN ('{}')
         '''.format(quoted))
     genes = [
         Gene(*row, build=self.build, organism=self.organism)
         for row in rows
     ]
     if check_shape and len(genes) != len(ids):
         raise ValueError('Some input ids do not have genes in reference')
     return genes
Exemplo n.º 3
0
def build_cob(args):
    '''
        Build a COB expression dataset from parsed CLI arguments.

        The expression table at args.filename is parsed once with
        pd.read_table (separator args.sep) to sanity check the separator
        and, when args.allow_non_membership is set, to add every index
        entry of the table as a gene to a temporary copy of the RefGen.
        Any existing 'Expr' dataset named args.name is deleted (honoring
        args.force) before co.COB.from_table is called with the remaining
        arguments. The summary of the resulting COB is printed.

        Returns None. If the table appears to have a single column a
        message is printed and the build is skipped.

        On any Exception, the 'Expr' dataset named args.name is deleted
        with force=True and the exception is re-raised.
    '''
    try:
        # Build the refgen
        refgen = co.RefGen(args.refgen)
        # Parse the table a single time; it is reused below
        table = pd.read_table(args.filename, sep=args.sep)
        # Check that the sep is likely right.
        if len(table.columns) == 1:
            print(("Detected only 1 column in {}, are you sure "
                   "colunms are separated by '{}'?").format(
                       args.filename, args.sep))
            return None
        if args.allow_non_membership:
            refgen = refgen.copy('{}_tmp'.format(refgen.name), 'temp refgen')
            # Add non membership genes
            for gid in table.index:
                refgen.add_gene(Gene(None, None, id=gid))

        # The CLI exposes "skip" flags; from_table expects "do" flags
        quality_control = not args.skip_quality_control
        normalize = not args.skip_normalization
        quantile = not args.skip_quantile

        # Check to see if this dataset is already built
        if available_datasets('Expr', args.name):
            print('Warning! This dataset has already been built.')
            co.Tools.del_dataset('Expr', args.name, force=args.force)

        # Basically just pass all the CLI arguments to the COB class method
        cob = co.COB.from_table(
            args.filename,
            args.name,
            args.description,
            refgen,
            # Optional arguments
            sep=args.sep,
            rawtype=args.rawtype,
            # Data Processing
            quality_control=quality_control,
            normalization=normalize,
            quantile=quantile,
            # Data processing parameters
            max_gene_missing_data=args.max_gene_missing_data,
            max_accession_missing_data=args.max_accession_missing_data,
            min_single_sample_expr=args.min_single_sample_expr,
            min_expr=args.min_expr,
            max_val=args.max_val,
            dry_run=args.dry_run,
            zscore_cutoff=args.zscore_cutoff,
            index_col=args.index_col)
        print(cob.summary())
    except Exception:
        print("Build failed. Rolling back: removing corrupted files...")
        co.Tools.del_dataset('Expr', args.name, force=True)
        # Bare raise keeps the original traceback intact
        raise
Exemplo n.º 4
0
 def __getitem__(self, item):
     '''
         Look up a gene by id, or a list of genes by an iterable of ids.

         The item is first tried as a single gene id. If that yields no
         gene (or fails for any reason), the item is treated as an
         iterable of ids and handed to from_ids. If a TypeError is raised
         along that path it is logged and None is returned.
     '''
     # Attempt 1: item is a single gene id
     try:
         row = self.db.cursor().execute(
             '''
             SELECT chromosome,start,end,id FROM genes WHERE id = ?
         ''', (item, )).fetchone()
         if row is not None:
             return Gene(*row, build=self.build, organism=self.organism)
     except Exception:
         # Best effort only: fall through to the iterable lookup
         pass
     # Attempt 2: item is an iterable of gene ids
     try:
         ids = list(item)
         return list(self.from_ids(ids))
     except TypeError as err:
         self.log('not iterable: {}', err)
     return None
Exemplo n.º 5
0
 def downstream_genes(self, locus, gene_limit=1000):
     '''
         Return the genes located downstream of a locus.

         A gene qualifies when it is on locus.chrom and its start lies
         strictly between locus.start and locus.downstream. Results are
         sorted by ascending start, so the nearest genes come first, and
         at most gene_limit genes are returned.
     '''
     query = ''' 
         SELECT chromosome,start,end,id FROM genes
         WHERE chromosome = ?
         AND start > ?
         AND start < ?
         ORDER BY start ASC
         LIMIT ?
         '''
     bounds = (locus.chrom, locus.start, locus.downstream, gene_limit)
     found = []
     for row in self.db.cursor().execute(query, bounds):
         found.append(Gene(*row, build=self.build, organism=self.organism))
     return found
Exemplo n.º 6
0
 def within_gene(self, locus):
     '''
         Return the first gene that contains the start of the locus.

         A gene contains the locus when it is on locus.chrom, starts
         before locus.start and ends after locus.start. None is returned
         when no gene matches or when anything goes wrong during lookup.
     '''
     try:
         position = (locus.chrom, locus.start, locus.start)
         rows = self.db.cursor().execute(
             ''' 
             SELECT chromosome,start,end,id FROM genes 
             WHERE chromosome = ?
             AND start < ?
             AND end > ?
         ''', position)
         containing = []
         for row in rows:
             containing.append(
                 Gene(*row, build=self.build, organism=self.organism))
         # An empty result raises IndexError, handled below
         return containing[0]
     except Exception:
         return None
Exemplo n.º 7
0
 def genes_within(self, loci, chain=True):
     '''
         Return the genes lying entirely inside a locus or several loci.

         If loci is iterable, each element is looked up recursively. With
         chain=True the per-locus results are flattened into one list;
         otherwise a list of per-locus results is returned.

         If loci is not iterable it is treated as a single locus and the
         genes on loci.chrom whose start is greater than loci.start and
         whose end is less than loci.end are returned as a list.
     '''
     try:
         per_locus = []
         for locus in iter(loci):
             per_locus.append(self.genes_within(locus, chain=chain))
         if not chain:
             return per_locus
         return list(itertools.chain.from_iterable(per_locus))
     except TypeError:
         # Not iterable: loci is a single locus
         rows = self.db.cursor().execute(
             ''' 
             SELECT chromosome,start,end,id FROM genes
             WHERE chromosome = ?
             AND start > ?
             AND end < ?
         ''', (loci.chrom, loci.start, loci.end))
         return [
             Gene(*row, build=self.build, organism=self.organism)
             for row in rows
         ]
Exemplo n.º 8
0
def build_cob(args):
    '''
        Build a COB expression dataset from parsed CLI arguments.

        The expression table at args.filename is parsed once with
        pd.read_table (separator args.sep). If it has a single column a
        message is printed and the build is skipped. If it has fewer than
        20 columns and args.non_interactive is not set, the user is asked
        to confirm; any answer other than "y"/"Y" exits with status 1.

        When args.allow_non_membership is set, every index entry of the
        table is added as a gene to a temporary copy of the RefGen. Any
        existing "Expr" dataset named args.name is deleted (honoring
        args.force) before co.COB.from_table is called with the remaining
        arguments. The summary of the resulting COB is printed.

        Returns None. On any Exception, the "Expr" dataset named
        args.name is deleted with force=True and the exception is
        re-raised.
    '''
    try:
        # Build the refgen
        refgen = co.RefGen(args.refgen)
        # Parse the table a single time; it is reused below
        table = pd.read_table(args.filename, sep=args.sep)
        num_columns = len(table.columns)
        # Check that the sep is likely right.
        if num_columns == 1:
            print(("Detected only 1 column in {}, are you sure "
                   "colunms are separated by '{}'?").format(
                       args.filename, args.sep))
            return None
        elif num_columns < 20 and not args.non_interactive:
            print((
                "Detected fewer than 20 accessions in the expression matrix. "
                "Calculating co-expression with this many datapoints is not advised"
            ))
            answer = input("are you sure you want to continue? [y/n]: ")
            if answer.upper() != "Y":
                # SystemExit is not an Exception, so no rollback happens
                sys.exit(1)
        if args.allow_non_membership:
            refgen = refgen.copy("{}_tmp".format(refgen.name), "temp refgen")
            # Add non membership genes
            for gid in table.index:
                refgen.add_gene(Gene(None, None, id=gid))

        # from_table expects "do" flags; two of the CLI flags are "skip"
        quality_control = not args.skip_quality_control
        normalize = not args.skip_normalization
        quantile = bool(args.quantile)

        # Check to see if this dataset is already built
        if available_datasets("Expr", args.name):
            print("Warning! This dataset has already been built.")
            co.Tools.del_dataset("Expr", args.name, force=args.force)

        # Basically just pass all the CLI arguments to the COB class method
        cob = co.COB.from_table(
            args.filename,
            args.name,
            args.description,
            refgen,
            # Optional arguments
            sep=args.sep,
            rawtype=args.rawtype,
            # Data Processing
            quality_control=quality_control,
            normalization=normalize,
            quantile=quantile,
            # Data processing parameters
            max_gene_missing_data=args.max_gene_missing_data,
            max_accession_missing_data=args.max_accession_missing_data,
            min_single_sample_expr=args.min_single_sample_expr,
            min_expr=args.min_expr,
            max_val=args.max_val,
            dry_run=args.dry_run,
            zscore_cutoff=args.zscore_cutoff,
            index_col=args.index_col,
        )
        print(cob.summary())
    except Exception:
        print(
            f"Build failed for {args.name}. Rolling back: removing corrupted files..."
        )
        co.Tools.del_dataset("Expr", args.name, force=True)
        # Bare raise keeps the original traceback intact
        raise
Exemplo n.º 9
0
 def iter_genes(self):
     '''
         Return a generator of Gene objects, one per row of the genes
         table. The query is issued when this method is called; Gene
         objects are built lazily as the generator is consumed.
     '''
     rows = self.db.cursor().execute('''
         SELECT chromosome,start,end,id FROM genes
         ''')
     return (
         Gene(*row, build=self.build, organism=self.organism)
         for row in rows
     )
Exemplo n.º 10
0
 def random_gene(self):
     '''
         Return a single Gene picked at random from the genes table.

         A random integer between 1 and self.num_genes() (inclusive) is
         drawn and used as a rowid. If no row has that rowid, the same
         number is used as a 1-based position in the table instead. The
         Gene carries this object's build and organism, like the genes
         returned by the other lookup methods.

         Raises
         ------
         ValueError
             From random.randint when self.num_genes() is less than 1.
         TypeError
             If neither lookup finds a row.
     '''
     index = random.randint(1, self.num_genes())
     cursor = self.db.cursor()
     row = cursor.execute(
         ''' 
         SELECT chromosome,start,end,id from genes WHERE rowid = ?
         ''', (index, )).fetchone()
     if row is None:
         # NOTE(review): presumably rowids have gaps (e.g. after deletes);
         # fall back to a positional lookup rather than crashing.
         row = cursor.execute(
             '''
             SELECT chromosome,start,end,id FROM genes LIMIT 1 OFFSET ?
             ''', (index - 1, )).fetchone()
     return Gene(*row, build=self.build, organism=self.organism)