示例#1
0
文件: genefamily.py 项目: ajm/glutton
def json_to_glutton(families) :
    """Convert a nested mapping of raw gene data into GeneFamily objects.

    `families` is indexed as families[famid][geneid] and every entry must
    unpack into a (name, sequence) pair. A family in which any gene carries
    the placeholder sequence 'Sequenceunavailable' is left out of the result
    entirely; the number of rejected families, and of the genes they
    contained, is reported once through the logger's error channel.

    Returns a dict of famid -> GeneFamily.
    """
    converted = {}
    rejected_families = 0
    rejected_genes = 0

    for famid in families :
        members = families[famid]
        genes = []
        unavailable = False

        for geneid in members :
            genename, geneseq = members[geneid]

            # do not stop at the first unavailable sequence: the complete
            # family is still collected so that its size can be reported
            if geneseq == 'Sequenceunavailable' :
                unavailable = True

            genes.append(Gene(genename, geneseq, id=geneid))

        if unavailable :
            rejected_families += 1
            rejected_genes += len(genes)
            continue

        converted[famid] = GeneFamily(genes, id=famid)

    if rejected_families > 0 :
        get_log().error("%d bad gene families (containing %d genes)" % (rejected_families, rejected_genes))

    return converted
示例#2
0
文件: blast.py 项目: ajm/glutton
    def makedb(db_fname, nucleotide=False) :
        """Build a BLAST database from `db_fname` by running makeblastdb.

        db_fname   -- path handed to makeblastdb through "-in"
        nucleotide -- when true the database type is "nucl", otherwise "prot"

        If makeblastdb finishes with a non-zero exit status, its return code
        and captured output are logged as fatal and the process exits with
        status 1.
        """
        c = ["makeblastdb", "-in", db_fname, "-dbtype", "nucl" if nucleotide else "prot"]

        get_log().debug(" ".join(c))

        try :
            # stderr is folded into stdout so that it ends up in cpe.output
            subprocess.check_output(c, stderr=subprocess.STDOUT, close_fds=True)

        except subprocess.CalledProcessError as cpe :
            get_log().fatal("%s returncode=%d\n%s" % (c[0], cpe.returncode, cpe.output))
            exit(1)
示例#3
0
def download_database_biomart(species, release, database_name='ensembl', nucleotide=True) :
    """Fetch sequences and homology data for `species` from biomart and group them into families.

    species       -- key used to look the species up in the result of get_all_species()
    release       -- release the caller expects; it must equal the release
                     reported for the species' mart
    database_name -- passed to get_marts(), get_all_species() and the query helpers
    nucleotide    -- forwarded unchanged to get_sequences() and get_homology_info()

    Logs a fatal error and exits with status 1 when the requested release
    differs from the one biomart currently serves.

    Returns whatever group_into_families() produces from the two downloads.
    """
    mart_name, schema_name, mart_release, table_name, species_desc = get_all_species(get_marts(database_name), database_name)[species]

    if mart_release != release :
        get_log().fatal("requested release (%d) does not match current biomart release (%d)" % (release, mart_release))
        exit(1)

    seq = get_sequences(species, database_name, schema_name, table_name, nucleotide)
    homology = get_homology_info(species, database_name, schema_name, table_name, nucleotide)

    return group_into_families(seq, homology)
示例#4
0
文件: main.py 项目: ajm/glutton
def main() :
    """Parse the command line, run the requested sub-command and log how long it took.

    The sub-command name is read from argv[1] and looked up in `commands`;
    whatever the selected command returns is handed back to the caller.
    """
    options = handle_args(argv[1:])
    generic_options(options)

    began = time.time()

    version = str(glutton.__version__)
    get_log().info("this is glutton version %s" % version)

    command_name = argv[1]
    outcome = commands[command_name](options)

    elapsed = duration_str(time.time() - began)
    get_log().info("%s took %s" % (command_name, elapsed))

    return outcome
示例#5
0
def list_command(args) :
    """Print a table of the species (and their release) found in an ensembl database.

    args -- parsed command line options. `args.database` selects the database
            and `args.suppress` is the release threshold passed to the
            downloader. When `args.suppress` is not set it is filled in from
            a per-database default, which is written back onto `args`.

    An EnsemblDownloadError raised while listing is logged as fatal and the
    process exits with status 1.
    """
    log = get_log()
    e = EnsemblDownloader()

    suppression_defaults = {
            'ensembl'   : 70, # errors with 69 and older (missing columns)
            'metazoa'   : 17,
            'protists'  : 17,
            'plants'    : 17,
            'bacteria'  : 17,
            'fungi'     : 17,
        }

    if not args.suppress :
        args.suppress = suppression_defaults[args.database]

    log.info("listing species in %s database" % args.database)
    log.info("suppressing releases prior to %d" % args.suppress)

    try :
        pretty_print_table(
            ('Species', 'Release'),
            e.get_all_species(db=args.database,
                              suppress=args.suppress))

    except EnsemblDownloadError as ede :
        log.fatal(ede.message)
        exit(1)
示例#6
0
def get_sequences(species, database_name, schema_name, table_name, nucleotide) :
    """Send the biomart FASTA query for the sequences of one dataset.

    database_name -- selects the URL (via get_URL) and the virtual schema:
                     'default' for 'ensembl', otherwise database_name + '_mart'
    table_name    -- name of the biomart dataset to query
    nucleotide    -- when true the 'coding' attribute is requested, otherwise 'peptide'

    `species` and `schema_name` are not referenced by the request-building
    code below.

    A urllib2.URLError while opening the URL is logged as fatal and the
    process exits with status 1.
    """
    global TIMEOUT

    log = get_log()
    log.debug("getting sequences...")

    payload_sequences = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="%s" formatter="FASTA" header="0" uniqueRows="0" count="" datasetConfigVersion="0.7">
    <Dataset name="%s" interface="default">
        <Attribute name="ensembl_gene_id" />
        <Attribute name="%s" />
    </Dataset>
</Query>"""

    # e.g. 'dipodomys_ordii': ('dordii_gene_ensembl', 'Dipodomys ordii genes (dipOrd1)')
    seq_type = 'coding' if nucleotide else 'peptide'
    #query = payload_sequences % (schema_name if database_name != 'ensembl' else 'default', table_name, seq_type)
    query = payload_sequences % (database_name + '_mart' if database_name != 'ensembl' else 'default', table_name, seq_type)
    params = urllib.urlencode({ 'query' : query })

    # NOTE(review): this echoes the whole query to stdout; it looks like
    # leftover debugging output - consider log.debug() instead
    print(query)

    sequences = {}

    try :
        # passing `params` as the data argument makes this a POST request
        f = urllib2.urlopen(get_URL(database_name), params)

    except urllib2.URLError as ue :
        log.fatal("biomart sequence query error (%s)" % str(ue))
        exit(1)
示例#7
0
文件: aligner.py 项目: ajm/glutton
    def __init__(self, top_level_directory, reference_fname, min_length, min_hitidentity, min_hitlength, max_evalue, batch_size, min_alignidentity, min_alignoverlap) :
        """Set up the alignment stage for the project in `top_level_directory`.

        top_level_directory -- project directory; alignments are kept in its
                               'alignments' sub-directory, created if missing
        reference_fname     -- file used to construct the GluttonDB reference
        batch_size          -- handed to All_vs_all_search, not stored here
        remaining arguments -- thresholds stored on the instance; the trailing
                               comment on each assignment names the tool it
                               belongs to
        """
        self.directory = join(top_level_directory, 'alignments')
        self.min_length = min_length # glutton
        self.min_hitidentity = min_hitidentity # blast 
        self.min_hitlength = min_hitlength # blast
        self.max_evalue = max_evalue # blast
        self.min_alignidentity = min_alignidentity # pagan
        self.min_alignoverlap = min_alignoverlap # pagan

        check_dir(self.directory, create=True)

        self.search = All_vs_all_search(batch_size)
        self.cleanup_files = []
        self.q = None

        # job counters start at zero; the lock is created alongside them
        self.lock = threading.Lock()
        self.complete_jobs = 0
        self.total_jobs = 0

        self.log = get_log()

        # the reference is registered with the parameters before asking
        # whether a resume is possible
        self.param = GluttonParameters(top_level_directory)
        self.db = GluttonDB(reference_fname)
        self.param.set_reference(self.db)

        self.resume = self.param.able_to_resume()

        # the resume decision is made before GluttonInformation is built and
        # before the full checksum is set
        self.info = GluttonInformation(self.directory, self.param, self.db, resume=self.resume)
        self.param.set_full_checksum()
示例#8
0
    def __init__(self, batch_size=100):
        """Create a search object with no thresholds configured and empty job state.

        batch_size -- stored on the instance as `batch_size`
        """
        self.nucleotide = False

        # thresholds are left unset here
        self.min_hitidentity = self.min_hitlength = self.max_evalue = None

        self.batch_size = batch_size
        self.log = get_log()

        self.cleanup_files = list()
        self.gene_assignments = dict()

        self.lock = threading.Lock()
        self.q = None

        # job bookkeeping starts from zero
        self.total_jobs = self.complete_jobs = 0
示例#9
0
def get_all_species_pycogent(db, suppress) :
    """List every species known to pycogent as (name, 'unknown') pairs.

    Neither `db` nor `suppress` is used: a warning is logged that pycogent
    cannot tell the ensembl projects apart, so everything is listed.

    pycogent is imported here, when the function is called; the import is
    not guarded, so an ImportError propagates to the caller.
    """
    log = get_log()

    from cogent.db.ensembl import Species

    log.warning("pycogent cannot differentiate between ensembl projects, listing everything...")

    # there appears to be no programmatic way of asking for the supported
    # species, so the printed form of `Species` is parsed instead: the first
    # three lines and the last line are dropped, and the final
    # whitespace-separated field of every remaining line is kept
    table_rows = str(Species).split('\n')[3:-1]

    listing = []
    for row in table_rows :
        listing.append((row.split()[-1], 'unknown'))

    return listing
示例#10
0
文件: queue.py 项目: ajm/glutton
    def __init__(self, qtimeout=1, maxsize=0):
        """Create the job queue and its workers, then start it.

        qtimeout -- stored as `q_timeout`
        maxsize  -- capacity of the underlying Queue; 0 selects twice the
                    value returned by num_threads()
        """
        capacity = num_threads() * 2 if maxsize == 0 else maxsize

        self.log = get_log()

        self.q = Queue.Queue(capacity)
        self.workers = self._init_workers(num_threads())
        self.q_timeout = qtimeout
        self.running = self.no_more_jobs = False

        self.jobs_completed = 0
        # job numbering begins at 1
        self.jobs_counter = itertools.count(1)

        self.start()
示例#11
0
文件: db.py 项目: ajm/glutton
    def __init__(self, fname=None) :
        """Create a database object, reading `fname` straight away when one is given.

        fname -- optional file to read; when omitted the object starts with
                 no metadata and no data

        A warning is logged if the file that was read is not complete.
        """
        self.fname       = fname
        self.compression = ZIP_DEFLATED
        self.metadata    = None
        self.data        = None     # dict of famid -> GeneFamily obj (list of Genes)
        self.seq2famid   = None     # dict of geneid -> famid
        self.dirty       = False
        self.lock        = threading.Lock()
        self.complete_jobs = self.total_jobs = 0

        self.log = get_log()

        # nothing to read without a file name
        if not self.fname :
            return

        self._read()

        if self.is_complete() :
            return

        self.log.warn("%s is not complete!" % self.fname)
示例#12
0
文件: info.py 项目: ajm/glutton
    def __init__(self, project_dir, create=False) :
        """Load the parameters stored for the project in `project_dir`.

        project_dir -- project directory, checked with check_dir()
        create      -- forwarded to check_dir() as its `create` argument

        When load() yields nothing, a fresh parameter dict is used in which
        every field is None and 'samples' is an empty dict.
        """
        self.directory = project_dir
        self.create = create
        check_dir(self.directory, create=self.create)

        self.log = get_log()

        self.params = self.load(self.parameter_filename)

        if self.params :
            return

        blank = dict.fromkeys(('db_species',
                               'db_release',
                               'db_filename',
                               'db_checksum',
                               'full_checksum',
                               'sample_checksum'))
        blank['samples'] = {}

        self.params = blank
示例#13
0
文件: info.py 项目: ajm/glutton
    def __init__(self, alignments_dir, parameters, db, resume=True) :
        """Track alignment progress for the alignments kept in `alignments_dir`.

        alignments_dir -- directory checked with check_dir()
        parameters     -- stored as `params`
        db             -- stored as `db`
        resume         -- when true, read_progress_files() is called once the
                          empty maps have been created
        """
        self.directory, self.params, self.db = alignments_dir, parameters, db

        check_dir(self.directory)

        self.log = get_log()

        # one function needs to re-acquire this lock, hence RLock not Lock
        self.lock = threading.RLock()

        # alignment can run for a long time, so it has to be restartable;
        # a restart must also use the same parameters (same reference
        # database and so on) as the run it continues
        self.contig_query_map = dict()          # file id -> contig id -> query id (the file id is the user-supplied 'label')
        self.query_gene_map = dict()            # query id -> (gene id, +/-) or None
        self.genefamily_filename_map = dict()   # gene family id -> filename

        if not resume :
            return

        self.read_progress_files()
示例#14
0
def get_all_species(marts, database_name) :
    """Query biomart for the datasets of every mart and index them by species name.

    marts         -- dict whose values are either falsy (skipped) or a
                     (mart_name, schema_name, release) triple
    database_name -- selects the base URL through get_URL()

    Every non-empty line of a response is split on tabs; field 1 is taken as
    the dataset name and field 2 as its description. The species key is the
    text before ' genes ' in the description, lower-cased with spaces turned
    into underscores (e.g. 'Dipodomys ordii genes (dipOrd1)' gives
    'dipodomys_ordii'); failing that it is the part of the dataset name
    before the first underscore.

    Each entry of `species` is
    (mart_name, schema_name, release, dataset name, description).

    A urllib2.URLError while opening a listing is logged as fatal and the
    process exits with status 1.
    """
    global TIMEOUT

    query = get_URL(database_name) + '?type=datasets&mart=%s'
    species = {}

    log = get_log()

    for k in marts :
        if not marts[k] :
            continue

        mart_name,schema_name,release = marts[k]

        try :
            f = urllib2.urlopen(query % mart_name, timeout=TIMEOUT)

        except urllib2.URLError :
            log.fatal("biomart dataset listing timed out")
            exit(1)

        # close the response even if a malformed line makes the parsing raise
        try :
            for line in f :
                line = line.strip()
                if not line :
                    continue

                data = line.split('\t')

                db_name = data[1]
                description = data[2]

                if ' genes ' in description :
                    species_name = description.split(' genes ')[0].lower().replace(' ', '_')
                else :
                    species_name = db_name.split('_')[0]

                species[species_name] = (mart_name, schema_name, release, db_name, description)

        finally :
            f.close()
示例#15
0
def build_command(args) :
    """Build a GluttonDB from the command line options in `args`.

    args -- parsed options; `output`, `species`, `release`, `database` and
            `download` are passed positionally to GluttonDB.build()

    While the build runs, SIGINT is handled by stopping the database object
    and terminating the process immediately with status 0.

    A GluttonDBBuildError is logged as fatal and the process exits with
    status 1.
    """
    log = get_log()
    gdb = GluttonDB()

    def _cleanup(signum, frame) :
        # called by the signal module as handler(signal number, stack frame)
        stderr.write("Killed by user, cleaning up...\n")
        gdb.stop()
        # os._exit terminates at once, without the normal interpreter shutdown
        os._exit(0)

    signal.signal(signal.SIGINT, _cleanup)

    try :
        gdb.build(args.output,
              args.species,
              args.release,
              args.database,
              True, #not args.protein,
              args.download)

    except GluttonDBBuildError as nmgfe :
        log.fatal(nmgfe.message)
        exit(1)
示例#16
0
    def __init__(self, top_level_directory, reference_fname, assembler_name, protein_identity, alignment_length, min_gene_coverage, do_not_trim=False, testmode='none') :
        """Set up post-processing for the project in `top_level_directory`.

        top_level_directory -- project directory; alignments are read from its
                               'alignments' sub-directory and results go
                               under 'postprocessing'
        reference_fname     -- file used to construct the GluttonDB reference
        assembler_name      -- passed to AssemblerOutput
        protein_identity, alignment_length, min_gene_coverage
                            -- stored unchanged on the instance
        do_not_trim         -- inverted and stored as `trim`
        testmode            -- stored unchanged

        The output directory and its 'scaffolds', 'genefamily_msa' and
        'gene_msa' sub-directories are created if missing.

        Exits with status 1 when the stored parameters do not refer to the
        same reference; only warns when some alignments were not run.
        """
        self.alignments_dir     = join(top_level_directory, 'alignments')
        self.output_dir         = join(top_level_directory, 'postprocessing')
        self.protein_identity   = protein_identity
        self.alignment_length   = alignment_length
        self.min_gene_coverage  = min_gene_coverage
        self.trim               = not do_not_trim
        self.testmode           = testmode

        self.scaffold_dir       = join(self.output_dir, 'scaffolds')
        self.genefamily_msa_dir = join(self.output_dir, 'genefamily_msa')
        self.gene_msa_dir       = join(self.output_dir, 'gene_msa')

        check_dir(self.output_dir, create=True)
        check_dir(self.scaffold_dir, create=True)
        check_dir(self.genefamily_msa_dir, create=True)
        check_dir(self.gene_msa_dir, create=True)

        self.log = get_log()

        self.param = GluttonParameters(top_level_directory)
        self.db = GluttonDB(reference_fname)
        self.info = GluttonInformation(self.alignments_dir, self.param, self.db)

        # check reference was the same
        if not self.param.same_reference(self.db) :
            # NOTE(review): self.db was just built from reference_fname, so
            # both values in this message appear to describe the current
            # reference rather than the one stored with the alignments -
            # confirm what `self.db.filename` yields
            self.log.error("current reference is %s, alignments were performed using %s" % (reference_fname, self.db.filename))
            exit(1)

        # perhaps slightly overambitious to exit, just stick to a warning
        pending,failures = self.info.num_alignments_not_done()
        if pending != 0 :
            self.log.warn("%d alignments were not run!" % pending)

        self.assembler = AssemblerOutput(assembler_name)

        # e.g. query39806_orf1
        # raw string: the pattern text is unchanged, but the backslashes are
        # no longer treated as (invalid) string escapes
        self.orfname_regex = re.compile(r"^(query\d+)\_orf(\d)$")
示例#17
0
def get_marts(database_name) :
    """Prepare the table of marts to look up and fetch the biomart registry.

    database_name -- when given, only that database is looked up; otherwise
                     'ensembl', 'metazoa', 'plants', 'protists', 'fungi' and
                     'bacteria' are all included. Every entry of `marts`
                     starts out as None.

    The registry is requested from get_URL(database_name) + '?type=registry'
    and handed to parse(). A urllib2.URLError is logged as fatal and the
    process exits with status 1.
    """
    global TIMEOUT

    if database_name :
        marts = { database_name : None }
    else :
        marts = {
                'ensembl'   : None,
                'metazoa'   : None,
                'plants'    : None,
                'protists'  : None,
                'fungi'     : None,
                'bacteria'  : None  }

    log = get_log()
    query = get_URL(database_name) + '?type=registry'

    try :
        dom = parse(urllib2.urlopen(query, timeout=TIMEOUT))

    except urllib2.URLError :
        log.fatal("biomart registry query timed out")
        exit(1)
示例#18
0
def get_homology_info(species, database_name, schema_name, table_name, nucleotide) :
    """Send the biomart TSV query for the paralogs of every gene in one dataset.

    database_name -- selects the URL (via get_URL) and the virtual schema:
                     'default' for 'ensembl', otherwise database_name + '_mart'
    table_name    -- name of the biomart dataset; the text before its first
                     underscore is used as the short species name when
                     building the paralog attribute

    `species`, `schema_name` and `nucleotide` are not referenced by the
    request-building code below.

    A urllib2.URLError while opening the URL is logged as fatal and the
    process exits with status 1.
    """
    global TIMEOUT

    log = get_log()
    log.debug("getting homology...")

    payload_homology = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="%s" formatter="TSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.7">
    <Dataset name="%s" interface="default">
        <Attribute name="ensembl_gene_id" />
        <Attribute name="%s" />
    </Dataset>
</Query>"""

    # is this always correct?
    short_species_name = table_name.split('_')[0]

    # this does not seem like a great idea, but until it breaks it will do...
    if '_eg_' in table_name :
        paralog_name = short_species_name + '_eg_paralog_gene'
    else :
        paralog_name = short_species_name + '_paralog_ensembl_gene'

    #query2 = payload_homology % (schema_name if database_name != 'ensembl' else 'default', table_name, paralog_name)
    query2 = payload_homology % (database_name + '_mart' if database_name != 'ensembl' else 'default', table_name, paralog_name)
    params2 = urllib.urlencode({ 'query' : query2 })

    # NOTE(review): this echoes the whole query to stdout; it looks like
    # leftover debugging output - consider log.debug() instead
    print(query2)

    # make api call
    try :
        # passing `params2` as the data argument makes this a POST request
        f = urllib2.urlopen(get_URL(database_name), params2)

    except urllib2.URLError as ue :
        log.fatal("biomart homology query error (%s)" % str(ue))
        exit(1)
示例#19
0
文件: info.py 项目: ajm/glutton
 def __init__(self) :
     """Store the logger returned by get_log() on the instance as `log`."""
     self.log = get_log()
示例#20
0
    def __init__(self) :
        """Look up the ensembl download method and report it at info level."""
        self.log = get_log()
        # the method is whatever get_ensembl_download_method() returns
        self.method = get_ensembl_download_method()

        self.log.info("ensembl download method is set to: %s" % self.method)
示例#21
0
文件: job.py 项目: ajm/glutton
 def __init__(self, callback) :
     """Create a job in the Job.QUEUED state.

     callback -- stored on the instance as `callback`; it is not called here
     """
     self.state = Job.QUEUED
     self.log = get_log()
     self.callback = callback
示例#22
0
文件: base.py 项目: ajm/glutton
 def __init__(self, location=None) :
     """Record where the binary for this tool lives.

     location -- explicit location of the binary; when it is empty or not
                 given, get_binary_path(self.name) is used instead
     """
     self.binary_location = location if location else get_binary_path(self.name)
     self.log = get_log()
示例#23
0
def download_database_pycogent(species, release, database_name='ensembl', nucleotide=False) :
    """Download the protein-coding genes of `species` through pycogent, grouped by paralogy.

    species       -- species name; registered with pycogent by hand when
                     pycogent does not already know it
    release       -- passed to get_missing_info(), whose returned release
                     replaces this value for the rest of the function
    database_name -- passed to get_missing_info()
    nucleotide    -- when true the CDS of each canonical transcript is
                     collected, otherwise its protein sequence

    Returns a list of families. Each family is a list of
    (stable id, sequence string) tuples: either a single gene for which no
    paralogs were returned, or all members of its paralog group.

    Progress is written to stderr. If a paralog has no canonical transcript
    the error is logged as fatal and the process exits with status 1.
    pycogent is imported here, when the function is called; the imports are
    not guarded, so an ImportError propagates to the caller.
    """
    log = get_log()

    #try :
    import cogent
    from cogent.db.ensembl import Species, Genome, Compara, HostAccount
    from cogent.db.ensembl.database import Database

    #except ImportError :
    #    log.fatal("pycogent import failed, exiting...")
    #    exit(1)

    if cogent.version_info != (1,5,3) :
        log.warning("only tested with pycogent version 1.5.3 (you are running %s)" % cogent.version)


    # from here on `release` is the value returned by get_missing_info()
    release, db_name, db_details = get_missing_info(species, release, database_name)

    account = HostAccount(
                db_details['hostname'],
                db_details['username'],
                db_details['password'],
                port=db_details['port'])

    if Species.getSpeciesName(species) == 'None' : # this is not an error, it returns the string "None"
        log.warning("%s not found in pycogent, attempting to add it manually" % species)
        Species.amendSpecies(species.capitalize().replace('_', ' '), species)

    genome = Genome(species, Release=release, account=account)
    compara = Compara([species], Release=release, account=account)



    # DON'T TRY THIS AT HOME!
    #
    # what happens is it searches for compara databases, but unfortunately finds more than one
    # in this situation pycogent just connects to the first one, which is always compara_bacteria
    # so one solution is to dig through all the compara objects internals to provide a connection
    # to the correct database ... obviously not the best solution, but at 6 lines of code definitely
    # the shortest ;-P
    #
    if db_name not in ('ensembl', 'bacteria') :
        log.warning("accessing compara from pycogent with species outside of ensembl-main and ensembl-bacteria is problematic, attempting to patch...")

        from cogent.db.ensembl.host import DbConnection
        from cogent.db.ensembl.name import EnsemblDbName
        import sqlalchemy

        # swap 'bacteria' for the wanted database in the compara database
        # name, then replace the private connection and metadata objects
        new_db_name = EnsemblDbName(compara.ComparaDb.db_name.Name.replace('bacteria', db_name))
        compara.ComparaDb._db = DbConnection(account=account, db_name=new_db_name)
        compara.ComparaDb._meta = sqlalchemy.MetaData(compara.ComparaDb._db)
    # end of DON'T TRY THIS AT HOME!



    genes = set()       # stable ids seen so far, either as a query gene or as a paralog
    families = []

    stderr.write("\r[downloading %s] got %d sequences " % ("CDS" if nucleotide else "protein", len(genes)))

    for gene in genome.getGenesMatching(BioType='protein_coding') :
        stableid = gene.StableId

        # ignore genes that have already been seen as members of other gene families
        if stableid in genes :
            continue

        genes.add(stableid)

        paralogs = compara.getRelatedGenes(StableId=stableid, Relationship='within_species_paralog')

        current = []

        if paralogs is None :
            # no paralogs returned: the gene becomes a family of one
            stderr.write("\r[downloading %s] got %d sequences " % ("CDS" if nucleotide else "protein", len(genes)))
            current.append((stableid, str(gene.CanonicalTranscript.Cds) if nucleotide else str(gene.CanonicalTranscript.ProteinSeq)))

        else :
            # the family is built from paralogs.Members; each member is
            # marked as seen so it is skipped by the outer loop later
            for paralog in paralogs.Members :
                paralogid = paralog.StableId
                genes.add(paralogid)

                stderr.write("\r[downloading %s] got %d sequences " % ("CDS" if nucleotide else "protein", len(genes)))

                try :
                    current.append((paralogid, str(paralog.CanonicalTranscript.Cds) if nucleotide else str(paralog.CanonicalTranscript.ProteinSeq)))

                except AttributeError :
                    log.fatal("pycogent did not find a canonical transcript for %s" % paralogid)
                    exit(1)

        #print ','.join([ i for i,j in current ])
        families.append(current)

    stderr.write("\r[downloading %s] got %d sequences\n" % ("CDS" if nucleotide else "protein", len(genes)))

    return families
示例#24
0
文件: pagan.py 项目: ajm/glutton
    def __init__(self):
        """Initialise the base class, then start with no alignment file names recorded."""
        super(Pagan, self).__init__()

        self.log = get_log()
        # both file names are unset until assigned elsewhere
        self.protein_alignment_fname = None
        self.nucleotide_alignment_fname = None