예제 #1
0
def make_tch(file):
    """
    build the tch database(s) for one taxonomy dump file, if missing

    file: key into the module-level `ncbi` mapping; ncbi[file] lists the tch
          file name(s) (relative to `taxdir`) derived from that dump file

    nothing is done unless check(<path of first tch>) returns False
    returns None; dump files and databases are closed even if parsing fails
    """
    tch_name = ncbi[file][0]
    tch_path = '%s/%s' % (taxdir, tch_name)
    if check(tch_path) is not False:
        return
    if tch_name == 'gi2taxid.tch':
        # gi -> taxid, merged from the protein and nucleotide dump files
        db = hash.Hash()
        db.open(tch_path)
        try:
            for dump in ('gi_taxid_prot.dmp', 'gi_taxid_nucl.dmp'):
                with open('%s/%s' % (taxdir, dump)) as handle:
                    for line in handle:
                        gi, taxid = line.strip().split()
                        db[gi] = taxid
        finally:
            db.close()
    elif file == 'names.dmp':
        # two databases: scientific name -> id, and id -> scientific name
        names2ids, ids2names = ['%s/%s' % (taxdir, i) for i in ncbi[file]]
        names2idsdb = hash.Hash()
        names2idsdb.open(names2ids)
        ids2namesdb = hash.Hash()
        ids2namesdb.open(ids2names)
        try:
            with open('%s/%s' % (taxdir, file)) as handle:
                for line in handle:
                    if 'scientific name' in line:
                        taxid, sciname = [i.strip() for i in line.strip().split('|')[0:2]]
                        names2idsdb[sciname] = taxid
                        ids2namesdb[taxid] = sciname
        finally:
            names2idsdb.close()
            ids2namesdb.close()
    elif file == 'nodes.dmp':
        # id -> '<second column> <third column>' of each '|'-separated line
        # NOTE(review): in the NCBI taxdump layout these columns are presumably
        # the parent tax id and the rank - confirm against the dump readme
        db = hash.Hash()
        db.open(tch_path)
        try:
            with open('%s/%s' % (taxdir, file)) as handle:
                for line in handle:
                    taxid, parent_id, third = [i.strip() for i in line.strip().split('|')[0:3]]
                    db[taxid] = '%s %s' % (parent_id, third)
        finally:
            db.close()
예제 #2
0
def get_db(database, blast):
    """
    return an id -> description lookup for a search database

    database: path to the database; a path ending in '.tch' is opened as a
              tch hash and the open handle is returned (caller closes it)
    blast:    search output, passed to get_subset() to restrict the lookup
              that headerid2desc() builds when database is not a tch file
    """
    # endswith() also copes with paths that contain no '.' at all, which
    # previously raised IndexError instead of reaching the non-tch branch
    if database.endswith('.tch'):
        id2desc = hash.Hash()
        id2desc.open(database)
        return id2desc
    return headerid2desc(database, get_subset(blast))
예제 #3
0
def combine_with_hits(clean, s_db, search_out, hits):
    """
    combine sequences with best hits from search

    clean:      fasta file of query sequences (copied to the output as is)
    s_db:       fasta file that was searched; '<s_db>.tch' is created with
                fasta2tch() if it does not exist yet
    search_out: path to the search output, parsed with numblast()
    hits:       number of hits passed to numblast(); also part of the
                output file name

    returns the path of the combined fasta file
    ('<clean without extension>.best<hits>refs.fa'); if that file already
    exists it is returned untouched and nothing else is read
    """
    combo_path = '%s.best%srefs.fa' % (clean.rsplit('.', 1)[0], hits)
    if os.path.exists(combo_path):
        return combo_path
    # ids of the best hits (first word of the second field of each hit)
    with open(search_out) as search:
        best = set(hit[1].split()[0] for hit in numblast(search, hits, False, False))
    # create/open tch for search database
    s_tch = '%s.tch' % (s_db)
    if not os.path.exists(s_tch):
        fasta2tch(s_db)
    id2seq = hash.Hash()
    id2seq.open(s_tch)
    try:
        with open(combo_path, 'w') as combo:
            for seq in parse_fasta(clean):
                combo.write('\n'.join(seq) + '\n')
            # get sequences for best hits from tch
            for hit in best:
                seq = id2seq[hit].split('\n')
                header = remove_char(seq[0].split()[0]).replace('>', '>best-hit_')
                combo.write('\n'.join([header, seq[1].upper()]) + '\n')
    finally:
        id2seq.close()
    return combo_path
예제 #4
0
def tch(fasta):
    """
    store every sequence of a fasta file in the tch database '<fasta>.tch'

    key:   first whitespace-delimited word after the '>' of the header
    value: the lines of the sequence record joined with newlines
    """
    store = hash.Hash()
    store.open('%s.tch' % (fasta))
    for record in fasta_parser.iterate_fasta(fasta):
        header = record[0]
        key = header.split('>')[1].split()[0]
        store[key] = '\n'.join(record)
    store.close()
예제 #5
0
def _get_tch(path):
    """
    return the pooled read-only tch handle for path, opening it on first use

    path: tch file; '.tch' is appended when the path does not end with it

    handles are kept in _TCH_POOL keyed by the (suffixed) path and are
    opened with the HDBOREADER | HDBONOLCK flags
    """
    if not path.endswith('.tch'):
        path += '.tch'
    if path not in _TCH_POOL:
        db = tch.Hash()
        db.open(path, tch.HDBOREADER | tch.HDBONOLCK)
        # db.setmutex()
        # cache only after open() succeeded, so a failed open does not leave
        # an unusable handle in the pool for later callers
        _TCH_POOL[path] = db
    return _TCH_POOL[path]
예제 #6
0
def get_tchs():
    """
    download the dump files, build their tch databases and open all of them

    returns a dictionary mapping each tch file name listed in `ncbi` to its
    open database handle (files are looked up in `taxdir`)
    """
    # fetch every dump file first, then build the databases
    for dump in ncbi:
        download_file(dump)
    for dump in ncbi:
        make_tch(dump)
    # unique tch file names over all dump files
    names = set()
    for group in ncbi.values():
        names.update(group)
    opened = {}
    for name in names:
        handle = hash.Hash()
        handle.open('%s/%s' % (taxdir, name))
        opened[name] = handle
    return opened
예제 #7
0
def _ko_functions(kegg, k):
    """
    return the '|'-separated functions stored for k in kegg, or ['n/a']
    """
    if k in kegg:
        return kegg[k].split('|')
    return ['n/a']


def ko2kegg(file, option, file_type):
    """
    look up the functions of the KOs found in file (generator)

    file:      what to scan; handed to parse_fasta() for 'fasta', iterated
               line by line for 'list', otherwise passed directly to find_ko()
    option:    passed to option2kegg() to select the tch database to open
    file_type: 'fasta', 'list', or anything else for the direct find_ko() mode

    yields, for 'fasta' and 'list': the header/line itself, followed by one
    '\\t<id>\\t<ko>\\t<function>' string per function of each KO found in it
    ('n/a' when the KO is not in the database);
    otherwise: [ko, function] lists

    the database is closed when the generator finishes, is closed early or
    fails
    """
    tch = option2kegg(option)
    kegg = hash.Hash()
    kegg.open(tch)
    try:
        if file_type == 'fasta':
            for sequence in parse_fasta(file):
                header = sequence[0].split('>')[1]
                seq_id = header.split()[0]
                yield header
                for k in set(find_ko(header.split())):
                    for function in _ko_functions(kegg, k):
                        # - id - k - function
                        yield '\t%s\t%s\t%s' % (seq_id, k, function)
        elif file_type == 'list':
            for line in file:
                line = line.strip()
                fields = line.split()
                if not fields:
                    # skip blank lines
                    continue
                seq_id = fields[0]
                yield line
                for k in set(find_ko(fields)):
                    for function in _ko_functions(kegg, k):
                        # - id - k - function
                        yield '\t%s\t%s\t%s' % (seq_id, k, function)
        else:
            for k in set(find_ko(file)):
                for function in _ko_functions(kegg, k):
                    yield [k, function]
    finally:
        # also runs on GeneratorExit, i.e. when the consumer stops early
        kegg.close()