def missing_species_names(names):
    """Report which of the given names have no known species name.

    Each name is looked up with ``Species.get_species_name``; a result equal
    to the string ``"None"`` is treated as "not found".
    # NOTE(review): presumably the lookup stringifies a missing value -- confirm.

    Parameters
    ----------
    names
        iterable of species names to check

    Returns
    -------
    A single-column Table (header "MISSING SPECIES") with one row per
    unknown name, or None when every name was resolved.
    """
    unknown = [
        [name] for name in names if Species.get_species_name(name) == "None"
    ]
    if not unknown:
        return None

    return make_table(header=["MISSING SPECIES"], data=unknown)
def get_chrom_names(ref_species, compara):
    """Return the reference chromosome names recorded for a species.

    Joins the ``genome_db`` and ``dnafrag`` tables of ``compara.ComparaDb``
    on ``genome_db_id`` and selects ``dnafrag.name`` for rows that

    - have ``coord_system_name == "chromosome"``,
    - belong to the genome whose name equals the Ensembl db prefix of
      ``ref_species``, and
    - are flagged ``is_reference == 1``.

    Parameters
    ----------
    ref_species
        species identifier passed to ``Species.get_ensembl_db_prefix``
    compara
        object exposing a ``ComparaDb`` attribute with a ``get_table`` method

    Returns
    -------
    list of the selected ``dnafrag.name`` values, in query order
    """
    genome_db = compara.ComparaDb.get_table("genome_db")
    dnafrag = compara.ComparaDb.get_table("dnafrag")

    same_genome = genome_db.c.genome_db_id == dnafrag.c.genome_db_id
    tables = genome_db.join(dnafrag, onclause=same_genome)

    db_prefix = Species.get_ensembl_db_prefix(ref_species)
    is_reference_chrom = sql.and_(
        dnafrag.c.coord_system_name == "chromosome",
        genome_db.c.name == db_prefix,
        dnafrag.c.is_reference == 1,
    )

    query = sql.select([dnafrag.c.name], is_reference_chrom).select_from(tables)
    return [row[0] for row in query.execute()]
def renamed_seqs(aln):
    """Rename every sequence in an alignment to its species common name.

    The latin name is extracted from each sequence label with
    ``get_latin_from_label`` and converted via ``Species.get_common_name``.
    The sequences in ``aln.seqs`` are renamed in place, even when None is
    returned.

    Parameters
    ----------
    aln
        alignment-like object exposing ``seqs``, each item having a ``name``

    Returns
    -------
    A new DNA alignment built from the renamed sequences, or None if any
    common name was assigned to more than one sequence.
    """
    seen = Counter()
    renamed = []
    for seq in aln.seqs:
        common = Species.get_common_name(get_latin_from_label(seq.name))
        seen[common] += 1
        seq.name = common
        # read the name back from the sequence, as the pairing key
        renamed.append((seq.name, seq))

    # a species occurs more than once
    if max(seen.values()) > 1:
        return None

    return make_aligned_seqs(data=renamed, moltype=DNA, array_align=False)
def display_available_dbs(account, release=None):
    """Tabulate the Ensembl databases available at the nominated host.

    Both "core" and "compara" databases are listed via ``get_db_name``.

    Parameters
    ----------
    account
        host account forwarded to ``get_db_name``
    release
        Ensembl release forwarded to ``get_db_name``; None applies no
        release value

    Returns
    -------
    Table with columns "Release", "Db Name", "Species", "Common Name",
    sorted by release then database name. Compara databases show "-" in
    both species columns. A database without a species shows None in both.
    """
    db_list = get_db_name(account=account, db_type="core", release=release)
    db_list += get_db_name(account=account, db_type="compara", release=release)

    rows = []
    for db_name in db_list:
        species_name = db_name.species
        # Reset on every row: otherwise a database without a species would
        # either raise UnboundLocalError (first row) or silently inherit the
        # common name computed for the previous database.
        common_name = None
        if species_name:
            common_name = Species.get_common_name(species_name, level="ignore")

        if "compara" in db_name.name:
            # compara databases span many species, so no single name applies
            species_name = common_name = "-"

        rows.append([db_name.release, db_name.name, species_name, common_name])

    table = make_table(
        header=["Release", "Db Name", "Species", "Common Name"], data=rows, space=2
    )
    table = table.sorted(["Release", "Db Name"])
    table.legend = (
        "Values of 'None' indicate cogent does not have a value for that database name."
    )
    return table
def get_one2one_orthologs(
    compara, ref_genes, outpath, not_strict, force_overwrite, test
):
    """Write one-to-one orthologs of protein coding genes to outpath.

    For every reference gene the "ortholog_one2one" related genes are
    queried from ``compara``. Genes with exactly one such group have the
    canonical-transcript CDS of each member (stop codon trimmed) written to
    ``<outpath>/<gene>.fa.gz``, each sequence named by species common name.
    Unless running in test mode, a ``metadata.tsv`` table describing the
    members is written to ``outpath`` when at least one gene was counted.

    Parameters
    ----------
    compara
        object exposing ``species`` and ``get_related_genes``
    ref_genes
        iterable of stable IDs of the reference genes
    outpath
        directory receiving the gzipped fasta files and metadata.tsv
    not_strict
        if False, a gene is skipped unless its ortholog group covers exactly
        the species in ``compara.species``
    force_overwrite
        if False, genes whose output file already exists are not
        recomputed; they are still counted as written
    test
        dry run: sequences are printed instead of written and no files are
        created

    Returns
    -------
    None
    """
    species = Counter(compara.species)
    written = 0
    records = []
    with click.progressbar(ref_genes, label="Finding 1to1 orthologs") as ids:
        for gene in ids:
            outfile_name = os.path.join(outpath, "%s.fa.gz" % gene)
            if os.path.exists(outfile_name) and not force_overwrite:
                written += 1
                continue

            syntenic = list(
                compara.get_related_genes(
                    stableid=gene, relationship="ortholog_one2one"
                )
            )
            if len(syntenic) != 1:
                continue

            syntenic = syntenic[0]
            if syntenic is None:
                # nothing to extract; previously only caught in strict mode,
                # so not_strict runs would fail on `.members` below
                continue

            if not not_strict and Counter(syntenic.get_species_set()) != species:
                # skipping, not all species had a 1to1 ortholog for this gene
                continue

            seqs = []
            for m in syntenic.members:
                records.append([gene, m.stableid, m.location, m.description])
                name = Species.get_common_name(m.genome.species)
                cds = m.canonical_transcript.cds.trim_stop_codon(allow_partial=True)
                cds.name = name
                seqs.append([name, cds])

            seqs = make_unaligned_seqs(data=seqs)
            if test:
                print()
                print(gene)
                print(seqs.to_fasta())
            else:
                with gzip.open(outfile_name, "wt") as outfile:
                    outfile.write(seqs.to_fasta() + "\n")
                LOGGER.output_file(outfile_name)

            written += 1

    if test:
        msg = "Would have written %d files to %s" % (written, outpath)
    else:
        msg = "Wrote %d files to %s" % (written, outpath)

    click.echo(msg)

    # A dry run must not create or overwrite metadata.tsv.
    if written > 0 and not test:
        # `data=` matches the other make_table calls in this file
        metadata = make_table(
            header=["refid", "stableid", "location", "description"], data=records
        )
        metadata.write(os.path.join(outpath, "metadata.tsv"))

    return