def writeOrgFile(org, output, compress=False):
    """
    Write a tab-separated description of every gene of one organism.

    The file is written to ``<output>/<org.name>.tsv`` and holds one header
    line followed by one line per gene, contig after contig.

    :param org: organism object; its ``name`` and ``contigs`` attributes are read
    :param output: directory prefix prepended to the file name
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    columns = ["gene", "contig", "start", "stop", "strand", "ori", "family", "nb_copy_in_org", "partition",
               "persistent_neighbors", "shell_neighbors", "cloud_neighbors"]
    with write_compressed_or_not(output + "/" + org.name + ".tsv", compress) as outfile:
        outfile.write("\t".join(columns) + "\n")
        for contig in org.contigs:
            for gene in contig.genes:
                family = gene.family

                # Tally the neighbouring families by partition; anything that is
                # neither persistent nor shell is counted as cloud.
                tally = {"persistent": 0, "shell": 0, "cloud": 0}
                for neighbor in family.neighbors:
                    partition = neighbor.namedPartition
                    if partition == "persistent" or partition == "shell":
                        tally[partition] += 1
                    else:
                        tally["cloud"] += 1

                # Fall back on the internal ID when no local identifier is set.
                if gene.local_identifier == "":
                    identifier = gene.ID
                else:
                    identifier = gene.local_identifier

                # The "ori" column flags genes whose name or product is dnaA.
                if gene.name.upper() == "DNAA" or gene.product.upper() == "DNAA":
                    is_ori = "T"
                else:
                    is_ori = "F"

                fields = [identifier,
                          contig.name,
                          gene.start,
                          gene.stop,
                          gene.strand,
                          is_ori,
                          family.name,
                          len(family.getGenesPerOrg(org)),
                          family.namedPartition,
                          tally["persistent"],
                          tally["shell"],
                          tally["cloud"]]
                outfile.write("\t".join(str(field) for field in fields) + "\n")
def writeSpotModules(output, compress):
    """
    Write the modules that are entirely contained in a spot.

    Reads the module-level ``pan`` object (defined outside this function) and
    writes ``<output>/modules_spots.tsv`` with one ``module_id``/``spot_id``
    pair per line.

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logger = logging.getLogger()
    logger.info("Writing modules to spot associations...")

    # Reverse index: gene family -> module it belongs to.
    family_to_module = {family: module for module in pan.modules for family in module.families}

    outname = output + "/modules_spots.tsv"
    with write_compressed_or_not(outname, compress) as fout:
        fout.write("module_id\tspot_id\n")
        for spot in pan.spots:
            # For each module seen in this spot, collect which of its families are present.
            seen_families = defaultdict(set)
            for rgp in spot.getUniqContent():
                for family in rgp.families:
                    module = family_to_module.get(family)
                    if module is None:
                        continue
                    seen_families[module].add(family)

            for module, families in seen_families.items():
                # if all the families in the module are found in the spot, write the association
                if families == module.families:
                    fout.write(f"module_{module.ID}\tspot_{spot.ID}\n")

    logger.info(f"Done writing module to spot associations to: {outname}")
def writeRegionsSequences(pangenome, output, compress, regions, fasta, anno, disable_bar=False):
    """
    Write the genomic nucleotide sequence of the RGPs as a FASTA file.

    The ``fasta`` (or, when it is None, ``anno``) file is read as a
    tab-separated list whose first two columns are stored as key and value of
    the mapping handed to ``read_genome_file``.
    The output goes to ``<output>/<regions>_rgp_genomic_sequences.fasta``.

    :param pangenome: pangenome object; its ``regions`` attribute is read
    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    :param regions: ``"complete"`` keeps only regions that are not on a contig
        border; any other value keeps every region
    :param fasta: path of the tab-separated genome list, or None
    :param anno: path used instead of ``fasta`` when ``fasta`` is None
    :param disable_bar: passed as ``disable`` to ``tqdm``
    """
    logger = logging.getLogger()
    organisms_file = anno if fasta is None else fasta

    org_dict = {}
    for line in read_compressed_or_not(organisms_file):
        columns = [column.strip() for column in line.split("\t")]
        if len(columns) <= 1:
            logger.error(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'")
            exit(1)
        org_dict[columns[0]] = columns[1]

    logger.info(f"Writing {regions} rgp genomic sequences...")
    if regions == "complete":
        selected = [rgp for rgp in pangenome.regions if not rgp.isContigBorder]
    else:
        selected = pangenome.regions
    # order regions by organism, so that we only have to read one genome at the time
    ordered_regions = sorted(selected, key=lambda rgp: rgp.organism.name)

    outname = output + f"/{regions}_rgp_genomic_sequences.fasta"
    with write_compressed_or_not(outname, compress) as fasta_out:
        current_genome = ""
        progress = tqdm(ordered_regions, unit="rgp", disable=disable_bar)
        for rgp in progress:
            organism_name = rgp.organism.name
            if organism_name != current_genome:
                # A new organism starts: load its genome, replacing the previous one.
                current_genome = organism_name
                genome_sequence = read_genome_file(org_dict, current_genome)
            contig_sequence = genome_sequence[rgp.contig.name]
            fasta_out.write(f">{rgp.name}\n")
            fasta_out.write(write_spaced_fasta(contig_sequence[rgp.start:rgp.stop], 60))
        progress.close()

    logger.info(f"Done writing the regions nucleotide sequences: '{outname}'")
def writeRegions(output, compress=False):
    """
    Write the list of regions of the module-level ``pan`` object.

    Output file: ``<output>/plastic_regions.tsv``. Regions are ordered by
    organism name, then contig name, then start position.

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    def sort_key(rgp):
        return rgp.organism.name, rgp.contig.name, rgp.start

    with write_compressed_or_not(output + "/plastic_regions.tsv", compress) as tab:
        tab.write("region\torganism\tcontig\tstart\tstop\tgenes\tcontigBorder\twholeContig\n")
        for rgp in sorted(pan.regions, key=sort_key):
            fields = [rgp.name,
                      rgp.organism,
                      rgp.contig,
                      rgp.start,
                      rgp.stop,
                      len(rgp.genes),
                      rgp.isContigBorder,
                      rgp.isWholeContig]
            tab.write('\t'.join(str(field) for field in fields) + "\n")
def summarize_spots(spots, output, compress):
    """
    Write per-spot summary statistics to ``<output>/summarize_spots.tsv``.

    Spots are written from the one with the most regions to the one with the
    fewest. Each line gives the number of RGPs, the number of distinct
    families, the size of ``getUniqContent()`` and the mean, standard
    deviation, maximum and minimum number of genes per RGP.

    :param spots: iterable of spot objects
    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    def r_and_s(value):
        """Return ``value`` as a string, floats being rounded to 3 decimals first."""
        return str(round(value, 3)) if isinstance(value, float) else str(value)

    outname = output + "/summarize_spots.tsv"
    with write_compressed_or_not(outname, compress) as fout:
        fout.write("spot\tnb_rgp\tnb_families\tnb_unique_family_sets\tmean_nb_genes\tstdev_nb_genes\t"
                   "max_nb_genes\tmin_nb_genes\n")
        for spot in sorted(spots, key=lambda s: len(s.regions), reverse=True):
            rgp_list = list(spot.regions)
            nb_uniq_content = len(spot.getUniqContent())

            all_families = set()
            gene_counts = []
            for rgp in rgp_list:
                all_families |= rgp.families
                gene_counts.append(len(rgp.genes))

            # The standard deviation needs at least two values; report 0 otherwise.
            spread = stdev(gene_counts) if len(gene_counts) > 1 else 0
            fields = [f"spot_{spot.ID}",
                      len(rgp_list),
                      len(all_families),
                      nb_uniq_content,
                      mean(gene_counts),
                      spread,
                      max(gene_counts),
                      min(gene_counts)]
            fout.write("\t".join(r_and_s(field) for field in fields) + "\n")

    logging.getLogger().info(f"Done writing spots in : '{outname}'")
def writeFastaGeneFam(pangenome, output, compress, gene_families, show_bar=True):
    """
    Write the representative nucleotide sequences of a selection of gene families.

    Output file: ``<output>/<gene_families>_nucleotide_families.fasta``.

    :param pangenome: pangenome object; ``geneFamilies``, ``regions`` and ``file`` are read
    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    :param gene_families: ``"all"``, ``"persistent"``, ``"shell"``, ``"cloud"`` or ``"rgp"``;
        any other value selects no family
    :param show_bar: forwarded to ``getGeneSequencesFromFile``
    """
    logger = logging.getLogger()
    outname = output + f"/{gene_families}_nucleotide_families.fasta"

    selected = set()
    if gene_families == 'all':
        logger.info("Writing all of the representative nucleotide sequences of the gene families...")
        selected = pangenome.geneFamilies
    elif gene_families in ('persistent', 'shell', 'cloud'):
        logger.info(f"Writing the representative nucleotide sequences of the {gene_families} gene families...")
        for family in pangenome.geneFamilies:
            if family.namedPartition == gene_families:
                selected.add(family)
    elif gene_families == "rgp":
        logger.info("Writing the representative nucleotide sequences of the gene families in RGPs...")
        for rgp in pangenome.regions:
            selected |= rgp.families

    with write_compressed_or_not(outname, compress) as fasta:
        family_names = [family.name for family in selected]
        getGeneSequencesFromFile(pangenome.file, fasta, family_names, show_bar=show_bar)

    logger.info(f"Done writing the representative nucleotide sequences of the gene families : '{outname}'")
def writeGeneSequences(pangenome, output, compress, genes, show_bar=True):
    """
    Write the nucleotide sequences of a selection of genes.

    Output file: ``<output>/<genes>_genes.fna``. The sequences are read from
    the pangenome file or from the loaded annotations depending on
    ``pangenome.status["geneSequences"]``.

    :param pangenome: pangenome object; ``genes``, ``regions``, ``status`` and ``file`` are read
    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    :param genes: ``"all"``, ``"persistent"``, ``"shell"``, ``"cloud"`` or ``"rgp"``;
        any other value selects no gene
    :param show_bar: forwarded to the sequence-writing helper
    :raises Exception: if the status is none of "inFile", "Computed", "Loaded"
    """
    logger = logging.getLogger()
    logger.info("Writing all the gene nucleic sequences...")
    outname = output + f"/{genes}_genes.fna"

    selected_genes = []
    if genes == 'all':
        logger.info("Writing all of the gene sequences...")
        selected_genes = pangenome.genes
    elif genes in ('persistent', 'shell', 'cloud'):
        logger.info(f"Writing all of the {genes} gene sequences...")
        for gene in pangenome.genes:
            if gene.family.namedPartition == genes:
                selected_genes.append(gene)
    elif genes == "rgp":
        logger.info("Writing all of the gene sequences in RGP...")
        for rgp in pangenome.regions:
            selected_genes.extend(rgp.genes)

    logger.info(f"There are {len(selected_genes)} genes to write")
    with write_compressed_or_not(outname, compress) as fasta:
        sequence_status = pangenome.status["geneSequences"]
        if sequence_status == "inFile":
            gene_ids = {gene.ID for gene in selected_genes}
            getGeneSequencesFromFile(pangenome.file, fasta, gene_ids, show_bar=show_bar)
        elif sequence_status in ("Computed", "Loaded"):
            writeGeneSequencesFromAnnotations(pangenome, fasta, selected_genes, show_bar=show_bar)
        else:
            # this should never happen if the pangenome has been properly checked
            # before launching this function.
            raise Exception("The pangenome does not include gene sequences")

    logger.info(f"Done writing the gene sequences : '{outname}'")
def writeGeneSequences(output, compress=False):
    """
    Write every gene nucleotide sequence to ``<output>/all_genes.fna``.

    The work is delegated to ``getGeneSequencesFromFile``, called with the
    module-level ``pan`` object and the opened output handle.

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logger = logging.getLogger()
    logger.info("Writing all the gene nucleic sequences...")

    outname = "/".join((output, "all_genes.fna"))
    with write_compressed_or_not(outname, compress) as handle:
        getGeneSequencesFromFile(pan, handle)

    logger.info(f"Done writing all the gene sequences : '{outname}'")
def writeGeneFamiliesTSV(output, compress=False):
    """
    Write the gene to gene-family association table.

    Output file: ``<output>/gene_families.tsv``, without header. Each line
    holds the family name, the gene identifier (the local identifier, or the
    internal ID when the local identifier is empty) and ``F`` when the gene is
    flagged as a fragment (empty otherwise).

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logger = logging.getLogger()
    logger.info("Writing the file providing the association between genes and gene families...")

    outname = output + "/gene_families.tsv"
    with write_compressed_or_not(outname, compress) as tsv:
        for family in pan.geneFamilies:
            for gene in family.genes:
                if gene.local_identifier == "":
                    identifier = gene.ID
                else:
                    identifier = gene.local_identifier
                fragment_flag = "F" if gene.is_fragment else ""
                tsv.write("\t".join([family.name, identifier, fragment_flag]) + "\n")

    logger.info(f"Done writing the file providing the association between genes and gene families : '{outname}'")
def spot2rgp(spots, output, compress):
    """
    Write the spot to RGP association table.

    Output file: ``<output>/spots.tsv`` with a ``spot_id``/``rgp_id`` header
    and one line per region of each spot.

    :param spots: iterable of spot objects; ``ID`` and ``regions`` are read
    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    with write_compressed_or_not(output + "/spots.tsv", compress) as fout:
        fout.write("spot_id\trgp_id\n")
        for spot in spots:
            for rgp in spot.regions:
                fout.write(f"spot_{spot.ID}\t{rgp.name}\n")
def writeJSON(output, compress):
    """
    Write the pangenome graph as ``<output>/pangenomeGraph.json``.

    The header, nodes and edges are emitted by ``writeJSONheader``,
    ``writeJSONnodes`` and ``writeJSONedges`` in that order, after which the
    closing ``}`` is written here.

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logger = logging.getLogger()
    logger.info("Writing the json file for the pangenome graph...")

    outname = output + "/pangenomeGraph.json"
    with write_compressed_or_not(outname, compress) as graph_file:
        for write_section in (writeJSONheader, writeJSONnodes, writeJSONedges):
            write_section(graph_file)
        graph_file.write("}")

    logger.info(f"Done writing the json file : '{outname}'")
def writeBorders(output, dup_margin, compress):
    """
    Write the bordering gene families of every spot and their sequences.

    Two files are produced from the module-level ``pan`` object:
    ``<output>/spot_borders.tsv`` (one line per border returned by
    ``spot.borders``) and ``<output>/border_protein_genes.fasta`` (name and
    ``sequence`` attribute of every family seen in a border).

    :param output: directory prefix of the output files
    :param dup_margin: forwarded to ``pan.get_multigenics``
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    multigenics = pan.get_multigenics(dup_margin=dup_margin)
    border_families = set()

    with write_compressed_or_not(output + "/spot_borders.tsv", compress) as fout:
        fout.write("spot_id\tnumber\tborder1\tborder2\n")
        # Spots holding the most regions are written first.
        ordered_spots = sorted(pan.spots, key=lambda s: len(s.regions), reverse=True)
        for spot in ordered_spots:
            for count, border in spot.borders(pan.parameters["spots"]["set_size"], multigenics):
                first_side = border[0]
                second_side = border[1]
                first_names = ",".join(fam.name for fam in first_side)
                second_names = ",".join(fam.name for fam in second_side)
                border_families.update(first_side)
                border_families.update(second_side)
                fout.write(f"{spot.ID}\t{count}\t{first_names}\t{second_names}\n")

    with write_compressed_or_not(output + "/border_protein_genes.fasta", compress) as fout:
        for fam in border_families:
            fout.write(f">{fam.name}\n")
            fout.write(f"{fam.sequence}\n")
def writeModules(output, compress):
    """
    Write the family content of every module.

    Output file: ``<output>/functional_modules.tsv`` with a
    ``module_id``/``family_id`` header and one line per (module, family) pair
    of the module-level ``pan`` object.

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logging.getLogger().info("Writing functional modules...")
    outname = output + "/functional_modules.tsv"
    # The context manager closes the handle; no explicit close() is needed.
    with write_compressed_or_not(outname, compress) as fout:
        fout.write("module_id\tfamily_id\n")
        for mod in pan.modules:
            for family in mod.families:
                fout.write(f"module_{mod.ID}\t{family.name}\n")

    logging.getLogger().info(f"Done writing functional modules to: '{outname}'")
def writeFastaGeneFam(pangenome, output, compress, gene_families, soft_core=0.95, disable_bar=False):
    """
    Write the representative nucleotide sequences of the selected gene families.

    Families are chosen by ``selectFamilies``; the sequences are then written
    by ``getGeneSequencesFromFile`` to
    ``<output>/<gene_families>_nucleotide_families.fasta``.

    :param pangenome: pangenome object; its ``file`` attribute is read
    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    :param gene_families: selection keyword forwarded to ``selectFamilies``
    :param soft_core: forwarded to ``selectFamilies``
    :param disable_bar: forwarded to ``getGeneSequencesFromFile``
    """
    description = "representative nucleotide sequences of the gene families"
    outname = output + f"/{gene_families}_nucleotide_families.fasta"
    families = selectFamilies(pangenome, gene_families, description, soft_core)

    with write_compressed_or_not(outname, compress) as handle:
        family_names = [family.name for family in families]
        getGeneSequencesFromFile(pangenome.file, handle, family_names, disable_bar=disable_bar)

    logging.getLogger().info(f"Done writing the {description} : '{outname}'")
def writeFastaGenFam(output, compress=False):
    """
    Write the representative nucleic sequence of every gene family.

    Output file: ``<output>/representative_gene_families.fna``. The family
    names of the module-level ``pan`` object are handed to
    ``getGeneSequencesFromFile``, which does the writing.

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logger = logging.getLogger()
    logger.info("Writing the representative nucleic sequences of all the gene families...")

    outname = output + "/representative_gene_families.fna"
    with write_compressed_or_not(outname, compress) as handle:
        family_names = [family.name for family in pan.geneFamilies]
        getGeneSequencesFromFile(pan, handle, family_names)

    logger.info(f"Done writing the representative nucleic sequences of all the gene families : '{outname}'")
def writeFastaProtFam(pangenome, output, compress, prot_families, soft_core=0.95, disable_bar=False):
    """
    Write the representative amino acid sequences of the selected gene families.

    Families are chosen by ``selectFamilies``; for each one a FASTA record made
    of its ``name`` and ``sequence`` attributes is written to
    ``<output>/<prot_families>_protein_families.faa``.

    :param pangenome: pangenome object forwarded to ``selectFamilies``
    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    :param prot_families: selection keyword forwarded to ``selectFamilies``
    :param soft_core: forwarded to ``selectFamilies``
    :param disable_bar: passed as ``disable`` to ``tqdm``
    """
    description = "representative amino acid sequences of the gene families"
    outname = output + f"/{prot_families}_protein_families.faa"
    families = selectFamilies(pangenome, prot_families, description, soft_core)

    with write_compressed_or_not(outname, compress) as handle:
        progress = tqdm(families, unit="prot families", disable=disable_bar)
        for family in progress:
            header_line = '>' + family.name + "\n"
            sequence_line = family.sequence + "\n"
            handle.write(header_line)
            handle.write(sequence_line)
        progress.close()

    logging.getLogger().info(f"Done writing the {description} : '{outname}'")
def writeGeneSequences(output, compress=False):
    """
    Write every gene nucleotide sequence to ``<output>/all_genes.fna``.

    The source of the sequences depends on ``pan.status["geneSequences"]``
    (``pan`` being a module-level object): ``"inFile"`` uses
    ``getGeneSequencesFromFile``; ``"Computed"`` or ``"Loaded"`` uses
    ``writeGeneSequencesFromAnnotations``.

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    :raises Exception: for any other status value
    """
    logger = logging.getLogger()
    logger.info("Writing all the gene nucleic sequences...")

    outname = output + "/all_genes.fna"
    with write_compressed_or_not(outname, compress) as handle:
        sequence_status = pan.status["geneSequences"]
        if sequence_status == "inFile":
            getGeneSequencesFromFile(pan, handle)
        elif sequence_status in ("Computed", "Loaded"):
            writeGeneSequencesFromAnnotations(pan, handle)
        else:
            # this should never happen if the pangenome has been properly checked
            # before launching this function.
            raise Exception("The pangenome does not include gene sequences")

    logger.info(f"Done writing all the gene sequences : '{outname}'")
def writeOrgModules(output, compress):
    """
    Write, for every module, the organisms that carry at least one of its families.

    Output file: ``<output>/modules_in_organisms.tsv``. The ``completion``
    column is the fraction of the module's families present in the organism,
    rounded to 2 decimals. Data come from the module-level ``pan`` object.

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logging.getLogger().info("Writing modules to organisms associations...")
    outname = output + "/modules_in_organisms.tsv"
    # The context manager closes the handle; no explicit close() is needed.
    with write_compressed_or_not(outname, compress) as fout:
        fout.write("module_id\torganism\tcompletion\n")
        for mod in pan.modules:
            mod_orgs = set()
            for fam in mod.families:
                mod_orgs |= fam.organisms
            # Loop-invariant for the organisms below. It is never zero when
            # used: a module without families yields no organism at all.
            nb_families = len(mod.families)
            for org in mod_orgs:
                completion = round(len(org.families & mod.families) / nb_families, 2)
                fout.write(f"module_{mod.ID}\t{org.name}\t{completion}\n")

    logging.getLogger().info(f"Done writing modules to organisms associations to: '{outname}'")
def writeGEXF(output, light=True, soft_core=0.95, compress=False):
    """
    Write the pangenome graph in GEXF format.

    Output file: ``<output>/pangenomeGraph_light.gexf`` when ``light`` is
    true, ``<output>/pangenomeGraph.gexf`` otherwise. The content is produced
    by ``writeGEXFheader``, ``writeGEXFnodes``, ``writeGEXFedges`` and
    ``writeGEXFend``, called in that order.

    :param output: directory prefix of the output file
    :param light: selects the file name and is forwarded to the header, node and edge writers
    :param soft_core: not used inside this function
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logger = logging.getLogger()
    if light:
        logger.info("Writing the light gexf file for the pangenome graph...")
    else:
        logger.info("Writing the gexf file for the pangenome graph...")

    suffix = "_light" if light else ""
    outname = output + "/pangenomeGraph" + suffix + ".gexf"
    with write_compressed_or_not(outname, compress) as handle:
        writeGEXFheader(handle, light)
        writeGEXFnodes(handle, light)
        writeGEXFedges(handle, light)
        writeGEXFend(handle)

    logger.info(f"Done writing the gexf file : '{outname}'")
def writeFastaProtFam(output, compress=False):
    """
    Write the representative proteic sequence of every gene family.

    Output file: ``<output>/representative_gene_families.faa``. One FASTA
    record (``name`` then ``sequence`` attribute) is written per family of the
    module-level ``pan`` object, with a manually updated progress bar.

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logger = logging.getLogger()
    logger.info("Writing the representative proteic sequences of all the gene families...")

    outname = output + "/representative_gene_families.faa"
    with write_compressed_or_not(outname, compress) as handle:
        # The bar is sized from the family count and advanced by hand below.
        progress = tqdm(range(pan.number_of_geneFamilies()), unit="prot families")
        families = list(pan.geneFamilies)
        for family in families:
            header_line = '>' + family.name + "\n"
            sequence_line = family.sequence + "\n"
            handle.write(header_line)
            handle.write(sequence_line)
            progress.update()
        progress.close()

    logger.info(f"Done writing the representative proteic sequences of all the gene families : '{outname}'")
def writeModuleSummary(output, compress):
    """
    Write one summary line per module of the module-level ``pan`` object.

    Output file: ``<output>/modules_summary.tsv``. For each module the line
    gives the number of families, the number of organisms holding at least one
    of its genes, the most frequent partition among its families, and the mean
    number of genes per organism divided by the number of families (rounded to
    3 decimals).

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logging.getLogger().info("Writing functional modules summary...")
    outname = output + "/modules_summary.tsv"
    # The context manager closes the handle; no explicit close() is needed.
    with write_compressed_or_not(outname, compress) as fout:
        fout.write("module_id\tnb_families\tnb_organisms\tpartition\tmean_number_of_occurrence\n")
        for mod in pan.modules:
            org_dict = defaultdict(set)  # organism -> genes of this module it carries
            partition_counter = Counter()
            for family in mod.families:
                partition_counter[family.namedPartition] += 1
                for gene in family.genes:
                    org_dict[gene.organism].add(gene)

            nb_families = len(mod.families)
            nb_organisms = len(org_dict)
            main_partition = partition_counter.most_common(1)[0][0]
            total_genes = sum(len(genes) for genes in org_dict.values())
            mean_occurrence = round((total_genes / nb_organisms) / nb_families, 3)
            fout.write(f"module_{mod.ID}\t{nb_families}\t{nb_organisms}\t{main_partition}\t"
                       f"{mean_occurrence}\n")

    logging.getLogger().info(f"Done writing module summary: '{outname}'")
def writeOrgFile(org, output, compress=False):
    """
    Write a tab-separated description of every gene of one organism.

    Output file: ``<output>/<org.name>.tsv``. The optional ``RGPs``, ``Spots``
    and ``Modules`` columns are added when the module-level flags
    ``needRegions``, ``needSpots`` and ``needModules`` (defined outside this
    function) are true. An optional column with nothing to report is written
    as the string ``None``.

    :param org: organism object; its ``name`` and ``contigs`` attributes are read
    :param output: directory prefix prepended to the file name
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    with write_compressed_or_not(output + "/" + org.name + ".tsv", compress) as outfile:
        header = ["gene", "contig", "start", "stop", "strand", "family", "nb_copy_in_org", "partition",
                  "persistent_neighbors", "shell_neighbors", "cloud_neighbors"]
        optional_columns = ((needRegions, "RGPs"), (needSpots, "Spots"), (needModules, "Modules"))
        for wanted, title in optional_columns:
            if wanted:
                header.append(title)
        outfile.write("\t".join(header) + "\n")

        for contig in org.contigs:
            for gene in contig.genes:
                family = gene.family

                # Tally the neighbouring families by partition; anything that is
                # neither persistent nor shell is counted as cloud.
                tally = {"persistent": 0, "shell": 0, "cloud": 0}
                for neighbor in family.neighbors:
                    partition = neighbor.namedPartition
                    if partition == "persistent" or partition == "shell":
                        tally[partition] += 1
                    else:
                        tally["cloud"] += 1

                # Fall back on the internal ID when no local identifier is set.
                if gene.local_identifier == "":
                    identifier = gene.ID
                else:
                    identifier = gene.local_identifier

                row = [identifier,
                       contig.name,
                       gene.start,
                       gene.stop,
                       gene.strand,
                       family.name,
                       len(family.getGenesPerOrg(org)),
                       family.namedPartition,
                       tally["persistent"],
                       tally["shell"],
                       tally["cloud"]]

                if needRegions:
                    rgp_field = None
                    if len(gene.RGP) > 0:
                        rgp_field = ','.join(str(region.name) for region in gene.RGP)
                    row.append(rgp_field)
                if needSpots:
                    spot_field = None
                    if len(family.spot) > 0:
                        spot_field = ','.join(str(s.ID) for s in family.spot)
                    row.append(spot_field)
                if needModules:
                    module_field = None
                    if len(family.modules) > 0:
                        module_field = ','.join("module_" + str(module.ID) for module in family.modules)
                    row.append(module_field)

                outfile.write("\t".join(str(field) for field in row) + "\n")
def writeGeneSequences(pangenome, output, compress, genes, soft_core=0.95, disable_bar=False):
    """
    Write the nucleotide sequences of the genes of the selected families.

    Families are chosen by ``selectFamilies`` and all their genes are written
    to ``<output>/<genes>_genes.fna``. The sequences are read from the
    pangenome file or from the loaded annotations depending on
    ``pangenome.status["geneSequences"]``.

    :param pangenome: pangenome object; ``status`` and ``file`` are read
    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    :param genes: selection keyword forwarded to ``selectFamilies``
    :param soft_core: forwarded to ``selectFamilies``
    :param disable_bar: forwarded to the sequence-writing helper
    :raises Exception: if the status is none of "inFile", "Computed", "Loaded"
    """
    logger = logging.getLogger()
    logger.info("Writing all the gene nucleotide sequences...")

    outname = output + f"/{genes}_genes.fna"
    families = selectFamilies(pangenome, genes, "gene nucleotide sequences", soft_core)
    selected_genes = [gene for family in families for gene in family.genes]
    logger.info(f"There are {len(selected_genes)} genes to write")

    with write_compressed_or_not(outname, compress) as handle:
        sequence_status = pangenome.status["geneSequences"]
        if sequence_status == "inFile":
            gene_ids = {gene.ID for gene in selected_genes}
            getGeneSequencesFromFile(pangenome.file, handle, gene_ids, disable_bar=disable_bar)
        elif sequence_status in ("Computed", "Loaded"):
            writeGeneSequencesFromAnnotations(pangenome, handle, selected_genes, disable_bar=disable_bar)
        else:
            # this should never happen if the pangenome has been properly checked
            # before launching this function.
            raise Exception("The pangenome does not include gene sequences")

    logger.info(f"Done writing the gene sequences : '{outname}'")
def writeGenePresenceAbsence(output, compress=False):
    """
    Write the gene-family presence/absence matrix.

    Output file: ``<output>/gene_presence_absence.Rtab``. The header is
    ``Gene`` followed by ``str()`` of each organism of the module-level
    ``pan`` object; each following line is a family name followed by ``1`` or
    ``0`` per organism.

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logging.getLogger().info("Writing the gene presence absence file ...")
    outname = output + "/gene_presence_absence.Rtab"
    with write_compressed_or_not(outname, compress) as matrix:
        matrix.write('\t'.join(['Gene'] + [str(org) for org in pan.organisms]) + "\n")

        # Template line: every organism starts as absent.
        default_genes = ["0"] * len(pan.organisms)
        # NOTE(review): columns are filled through pan.getIndex() while the header
        # follows pan.organisms; this assumes both use the same ordering - confirm.
        org_index = pan.getIndex()
        for fam in pan.geneFamilies:
            genes = default_genes.copy()
            for org in fam.organisms:
                genes[org_index[org]] = "1"
            matrix.write('\t'.join([fam.name] + genes) + "\n")

    logging.getLogger().info(f"Done writing the gene presence absence file : '{outname}'")
def writeRGPModules(output, compress):
    """
    Group the RGPs that contain exactly the same set of modules and write the groups.

    Output file: ``<output>/modules_RGP_lists.tsv``. Each line gives the name
    of the first RGP of the group, the number of distinct spots the group's
    RGPs belong to, the list of modules and the list of RGP names. RGPs
    without any module family are left out. Data come from the module-level
    ``pan`` object.

    The groups are computed before the file is opened, and the file is handled
    by a ``with`` block so it is closed even if writing fails.

    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logging.getLogger().info("Clustering RGPs based on module content...")
    outname = output + "/modules_RGP_lists.tsv"

    # Reverse indexes: family -> module and region -> spot.
    fam2mod = {}
    for mod in pan.modules:
        for fam in mod.families:
            fam2mod[fam] = mod
    region2spot = {}
    for spot in pan.spots:
        for region in spot.regions:
            region2spot[region] = spot

    # Group regions by the exact set of modules found among their families.
    mod_group2rgps = defaultdict(list)
    for region in pan.regions:
        curr_mod_list = set()
        for fam in region.families:
            mod = fam2mod.get(fam)
            if mod is not None:
                curr_mod_list.add(mod)
        if curr_mod_list:
            mod_group2rgps[frozenset(curr_mod_list)].append(region)

    with write_compressed_or_not(outname, compress) as lists:
        lists.write("representative_RGP\tnb_spots\tmod_list\tRGP_list\n")
        for mod_list, regions in mod_group2rgps.items():
            spot_list = set()
            for region in regions:
                myspot = region2spot.get(region)
                if myspot is not None:
                    spot_list.add(myspot)
            lists.write(f"{regions[0].name}\t{len(spot_list)}\t"
                        f"{','.join(['module_' + str(mod.ID) for mod in mod_list])}\t"
                        f"{','.join([reg.name for reg in regions])}\n")

    logging.getLogger().info(f"RGP and associated modules are listed in : {outname}")
def writeFastaProtFam(pangenome, output, compress, prot_families, show_bar=False):
    """
    Write the representative amino acid sequences of a selection of gene families.

    Output file: ``<output>/<prot_families>_protein_families.faa``; one FASTA
    record (``name`` then ``sequence`` attribute) per selected family.

    :param pangenome: pangenome object; ``geneFamilies`` and ``regions`` are read
    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    :param prot_families: ``"all"``, ``"persistent"``, ``"shell"``, ``"cloud"`` or ``"rgp"``;
        any other value selects no family
    :param show_bar: the ``tqdm`` bar is disabled when this is false
    """
    logger = logging.getLogger()
    outname = output + f"/{prot_families}_protein_families.faa"

    selected = set()
    if prot_families == 'all':
        logger.info("Writing all of the representative amino acid sequences of the gene families...")
        selected = pangenome.geneFamilies
    elif prot_families in ('persistent', 'shell', 'cloud'):
        logger.info(f"Writing the representative amino acid sequences of the {prot_families} gene families...")
        for family in pangenome.geneFamilies:
            if family.namedPartition == prot_families:
                selected.add(family)
    elif prot_families == "rgp":
        logger.info("Writing the representative amino acid sequences of the gene families in RGPs...")
        for rgp in pangenome.regions:
            selected |= rgp.families

    with write_compressed_or_not(outname, compress) as handle:
        progress = tqdm(selected, unit="prot families", disable=not show_bar)
        for family in progress:
            header_line = '>' + family.name + "\n"
            sequence_line = family.sequence + "\n"
            handle.write(header_line)
            handle.write(sequence_line)
        progress.close()

    logger.info(f"Done writing the representative amino acid sequences of the gene families : '{outname}'")
def writeStats(output, soft_core, dup_margin, compress=False):
    """
    Write the duplication statistics of persistent families and per-organism statistics.

    Two files are produced from the module-level ``pan`` object:

    - ``<output>/mean_persistent_duplication.tsv``: for each persistent
      family, the ratio of organisms holding more than one copy, the mean
      number of genes per organism, and whether the family is a single copy
      marker (duplication ratio strictly below ``dup_margin``);
    - ``<output>/organisms_statistics.tsv``: for each organism, family and gene
      counts per partition, exact-core and soft-core counts, and a
      completeness percentage based on the single copy markers (``NA`` when
      there is none).

    :param output: directory prefix of the output files
    :param soft_core: a family is soft core when it is present in at least
        ``soft_core`` times the number of organisms
    :param dup_margin: duplication ratio below which a persistent family is a
        single copy marker
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    """
    logger = logging.getLogger()
    logger.info("Writing pangenome statistics...")
    logger.info("Writing statistics on persistent duplication...")

    single_copy_markers = set()  # could use bitarrays if speed is needed
    with write_compressed_or_not(output + "/mean_persistent_duplication.tsv", compress) as outfile:
        outfile.write(f"#duplication_margin={round(dup_margin,3)}\n")
        outfile.write("\t".join(["persistent_family", "duplication_ratio", "mean_presence",
                                 "is_single_copy_marker"]) + "\n")
        for fam in pan.geneFamilies:
            if fam.namedPartition != "persistent":
                continue
            mean_pres = len(fam.genes) / len(fam.organisms)
            # Number of organisms that hold more than one gene of this family.
            nb_multi = sum(1 for gene_list in fam.getOrgDict().values() if len(gene_list) > 1)
            dup_ratio = nb_multi / len(fam.organisms)
            if dup_ratio < dup_margin:
                is_SCM = True
                single_copy_markers.add(fam)
            else:
                is_SCM = False
            fields = [fam.name, str(round(dup_ratio, 3)), str(round(mean_pres, 3)), str(is_SCM)]
            outfile.write("\t".join(fields) + "\n")
    logger.info("Done writing stats on persistent duplication")

    logger.info("Writing genome per genome statistics (completeness and counts)...")
    soft = set()  # could use bitarrays if speed is needed
    core = set()
    for fam in pan.geneFamilies:
        if len(fam.organisms) >= pan.number_of_organisms() * soft_core:
            soft.add(fam)
        if len(fam.organisms) == pan.number_of_organisms():
            core.add(fam)

    columns = ["organism", "nb_families", "nb_persistent_families", "nb_shell_families", "nb_cloud_families",
               "nb_exact_core", "nb_soft_core", "nb_genes", "nb_persistent_genes", "nb_shell_genes",
               "nb_cloud_genes", "nb_exact_core_genes", "nb_soft_core_genes", "completeness",
               "nb_single_copy_markers"]
    with write_compressed_or_not(output + "/organisms_statistics.tsv", compress) as outfile:
        outfile.write(f"#soft_core={round(soft_core,3)}\n")
        outfile.write(f"#duplication_margin={round(dup_margin,3)}\n")
        outfile.write("\t".join(columns) + "\n")
        for org in pan.organisms:
            fams = org.families

            # Families per partition; anything neither persistent nor shell is cloud.
            fam_tally = {"persistent": 0, "shell": 0, "cloud": 0}
            for fam in fams:
                partition = fam.namedPartition
                if partition == "persistent" or partition == "shell":
                    fam_tally[partition] += 1
                else:
                    fam_tally["cloud"] += 1

            # Genes per partition, plus soft-core and exact-core gene counts.
            gene_tally = {"persistent": 0, "shell": 0, "cloud": 0}
            nb_gene_soft = 0
            nb_gene_core = 0
            for gene in org.genes:
                partition = gene.family.namedPartition
                if partition == "persistent" or partition == "shell":
                    gene_tally[partition] += 1
                else:
                    gene_tally["cloud"] += 1
                if gene.family in soft:
                    nb_gene_soft += 1
                    # Exact-core genes are only counted among soft-core ones.
                    if gene.family in core:
                        nb_gene_core += 1

            if len(single_copy_markers) > 0:
                completeness = round((len(fams & single_copy_markers) / len(single_copy_markers))*100, 2)
            else:
                completeness = "NA"

            fields = [org.name,
                      len(fams),
                      fam_tally["persistent"],
                      fam_tally["shell"],
                      fam_tally["cloud"],
                      len(core & fams),
                      len(soft & fams),
                      org.number_of_genes(),
                      gene_tally["persistent"],
                      gene_tally["shell"],
                      gene_tally["cloud"],
                      nb_gene_core,
                      nb_gene_soft,
                      completeness,
                      len(fams & single_copy_markers)]
            outfile.write("\t".join(str(field) for field in fields) + "\n")
    logger.info("Done writing genome per genome statistics")
def writeMatrix(sep, ext, output, compress=False, geneNames=False):
    """
    Write the gene-family matrix of the module-level ``pan`` object.

    Output file: ``<output>/matrix.<ext>``. It has fourteen fixed quoted
    columns followed by one column per organism. An organism cell holds either
    the quoted ``str()`` of each gene of the family in that organism
    (``geneNames`` true) or the number of such genes (``geneNames`` false).

    The second column is the family's named partition when its ``partition``
    is not empty, otherwise the most frequent gene product. The gene length
    statistics use ``gene.stop - gene.start`` over all the family's genes.

    :param sep: column separator
    :param ext: file extension, also used in the log message
    :param output: directory prefix of the output file
    :param compress: forwarded unchanged to ``write_compressed_or_not``
    :param geneNames: write gene identifiers instead of gene counts
    """
    logging.getLogger().info(f"Writing the .{ext} file ...")
    outname = output + "/matrix." + ext
    with write_compressed_or_not(outname, compress) as matrix:
        fixed_columns = ['"Gene"',  # 1
                         '"Non-unique Gene name"',  # 2
                         '"Annotation"',  # 3
                         '"No. isolates"',  # 4
                         '"No. sequences"',  # 5
                         '"Avg sequences per isolate"',  # 6
                         '"Accessory Fragment"',  # 7
                         '"Genome Fragment"',  # 8
                         '"Order within Fragment"',  # 9
                         '"Accessory Order with Fragment"',  # 10
                         '"QC"',  # 11
                         '"Min group size nuc"',  # 12
                         '"Max group size nuc"',  # 13
                         '"Avg group size nuc"']  # 14
        organism_columns = ['"' + str(org) + '"' for org in pan.organisms]  # 15
        matrix.write(sep.join(fixed_columns + organism_columns) + "\n")

        # Template for the organism cells of a family absent everywhere.
        default_genes = ['""'] * len(pan.organisms) if geneNames else ["0"] * len(pan.organisms)
        # NOTE(review): cells are placed through pan.getIndex() while the header
        # follows pan.organisms; this assumes both use the same ordering - confirm.
        org_index = pan.getIndex()
        for fam in pan.geneFamilies:
            genes = default_genes.copy()
            product = Counter()
            for org, gene_list in fam.getOrgDict().items():
                if geneNames:
                    genes[org_index[org]] = " ".join(['"' + str(gene) + '"' for gene in gene_list])
                else:
                    genes[org_index[org]] = str(len(gene_list))
                for gene in gene_list:
                    product[gene.product] += 1

            main_product = str(product.most_common(1)[0][0])
            alt = fam.namedPartition if fam.partition != "" else main_product
            lengths = [gene.stop - gene.start for gene in fam.genes]

            matrix.write(sep.join(['"' + fam.name + '"',  # 1
                                   '"' + alt + '"',  # 2
                                   '"' + main_product + '"',  # 3
                                   '"' + str(len(fam.organisms)) + '"',  # 4
                                   '"' + str(len(fam.genes)) + '"',  # 5
                                   '"' + str(round(len(fam.genes) / len(fam.organisms), 2)) + '"',  # 6
                                   '"NA"',  # 7
                                   '"NA"',  # 8
                                   '""',  # 9
                                   '""',  # 10
                                   '""',  # 11
                                   '"' + str(min(lengths)) + '"',  # 12
                                   '"' + str(max(lengths)) + '"',  # 13
                                   '"' + str(round(sum(lengths) / len(lengths), 2)) + '"']  # 14
                                  + genes) + "\n")  # 15

    logging.getLogger().info(f"Done writing the matrix : '{outname}'")