Example #1
    def check(self):
        files = [
            f for f in os.listdir(MARKERS_FOLDER)
            if isfile(join(MARKERS_FOLDER, f))
        ]
        for marker_file in files:
            if not is_whitelist_file(marker_file):
                dendrogram_path = join(DENDROGRAMS_FOLDER,
                                       get_taxonomy_file_name(marker_file))
                print("DEND:" + dendrogram_path)
                if os.path.exists(dendrogram_path):
                    headers, marker_records = read_csv_to_dict(
                        join(MARKERS_FOLDER, marker_file), delimiter="\t")
                    if str(dendrogram_path).endswith(".json"):
                        dend = dend_json_2_nodes_n_edges(dendrogram_path)
                    else:
                        dend = nomenclature_2_nodes_n_edges(dendrogram_path)
                    dend_dict = index_dendrogram(dend)
                    self.check_cluster_name(dend_dict, marker_records,
                                            marker_file)
                else:
                    message = "Could not find taxonomy file '{}' for marker '{}'." \
                        .format(dendrogram_path, marker_file)
                    self.reports.append(message)
    def test_get_synonym_pairs(self):
        tree = dend_json_2_nodes_n_edges(PATH_DENDROGRAM_JSON)
        nodes = tree['nodes']

        node3_pairs = get_synonym_pairs(nodes[3]).split(PAIR_SEPARATOR)
        self.assertEqual("CS202002013_120", nodes[3]["cell_set_accession"])
        self.assertEqual(5, len(node3_pairs))
        self.assertTrue("cell_set_preferred_alias:''" in node3_pairs)
        self.assertTrue("original_label:n4" in node3_pairs)
        self.assertTrue("cell_set_label:RNAseq 001-091" in node3_pairs)
        self.assertTrue("cell_set_aligned_alias:''" in node3_pairs)
        self.assertTrue("cell_set_additional_aliases:''" in node3_pairs)

        node20_pairs = get_synonym_pairs(nodes[20]).split(PAIR_SEPARATOR)
        self.assertEqual("CS202002013_6", nodes[20]["cell_set_accession"])
        self.assertEqual(5, len(node20_pairs))
        self.assertTrue(
            "cell_set_preferred_alias:Lamp5 Pdlim5_2" in node20_pairs)
        self.assertTrue("original_label:Lamp5 Pdlim5_2" in node20_pairs)
        self.assertTrue("cell_set_label:RNAseq 006" in node20_pairs)
        self.assertTrue("cell_set_aligned_alias:''" in node20_pairs)
        self.assertTrue("cell_set_additional_aliases:''" in node20_pairs)

        node50_pairs = get_synonym_pairs(nodes[50]).split(PAIR_SEPARATOR)
        self.assertEqual("CS202002013_146", nodes[50]["cell_set_accession"])
        self.assertEqual(5, len(node50_pairs))
        self.assertTrue("cell_set_preferred_alias:''" in node50_pairs)
        self.assertTrue("original_label:n30" in node50_pairs)
        self.assertTrue("cell_set_label:RNAseq 022-025" in node50_pairs)
        self.assertTrue("cell_set_aligned_alias:''" in node50_pairs)
        self.assertTrue("cell_set_additional_aliases:''" in node50_pairs)
def generate_cross_species_template(taxonomy_file_path, output_filepath):
    taxon = extract_taxonomy_name_from_path(taxonomy_file_path)
    taxonomy_config = read_taxonomy_config(taxon)

    if taxonomy_config:
        if str(taxonomy_file_path).endswith(".json"):
            dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
        else:
            dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
        dend_tree = generate_dendrogram_tree(dend)
        subtrees = get_subtrees(dend_tree, taxonomy_config)
        cross_species_template = []

        headers, cs_by_preferred_alias = read_csv_to_dict(
            CROSS_SPECIES_PATH,
            id_column_name="cell_set_preferred_alias",
            id_to_lower=True)
        headers, cs_by_aligned_alias = read_csv_to_dict(
            CROSS_SPECIES_PATH,
            id_column_name="cell_set_aligned_alias",
            id_to_lower=True)

        for o in dend['nodes']:
            if o['cell_set_accession'] in set.union(
                    *subtrees) and (o['cell_set_preferred_alias']
                                    or o['cell_set_additional_aliases']):
                cross_species_classes = set()
                if o["cell_set_aligned_alias"] and str(
                        o["cell_set_aligned_alias"]).lower(
                        ) in cs_by_aligned_alias:
                    cross_species_classes.add(PCL_BASE + get_class_id(
                        cs_by_aligned_alias[str(o["cell_set_aligned_alias"]).
                                            lower()]["cell_set_accession"]))

                if "cell_set_additional_aliases" in o and o[
                        "cell_set_additional_aliases"]:
                    additional_aliases = str(
                        o["cell_set_additional_aliases"]).lower().split(
                            EXPRESSION_SEPARATOR)
                    for additional_alias in additional_aliases:
                        if additional_alias in cs_by_preferred_alias:
                            cross_species_classes.add(PCL_BASE + get_class_id(
                                cs_by_preferred_alias[additional_alias]
                                ["cell_set_accession"]))

                if len(cross_species_classes):
                    d = dict()
                    d['defined_class'] = PCL_BASE + get_class_id(
                        o['cell_set_accession'])
                    d['cross_species_classes'] = EXPRESSION_SEPARATOR.join(
                        cross_species_classes)

                    cross_species_template.append(d)

        class_robot_template = pd.DataFrame.from_records(
            cross_species_template)
        class_robot_template.to_csv(output_filepath, sep="\t", index=False)
def generate_marker_gene_set_template(taxonomy_file_path, output_filepath):
    taxon = extract_taxonomy_name_from_path(taxonomy_file_path)
    taxonomy_config = read_taxonomy_config(taxon)

    if taxonomy_config:
        if str(taxonomy_file_path).endswith(".json"):
            dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
        else:
            dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
        dend_tree = generate_dendrogram_tree(dend)
        subtrees = get_subtrees(dend_tree, taxonomy_config)

        if "Reference_gene_list" in taxonomy_config:
            gene_db_path = ENSEMBLE_PATH.format(
                str(taxonomy_config["Reference_gene_list"][0]).strip().lower())
            gene_names = read_gene_data(gene_db_path)
            minimal_markers = read_markers(
                MARKER_PATH.format(taxon.replace("CCN", "").replace("CS", "")),
                gene_names)

        else:
            minimal_markers = {}

        class_seed = [
            'defined_class', 'Marker_set_of', 'Minimal_markers',
            'Brain_region_abbv', 'Species_abbv', 'Brain_region', 'Parent'
        ]
        class_template = []

        for o in dend['nodes']:
            if o['cell_set_accession'] in set.union(
                    *subtrees) and (o['cell_set_preferred_alias']
                                    or o['cell_set_additional_aliases']):
                if o['cell_set_accession'] in minimal_markers:
                    d = dict()
                    d['defined_class'] = PCL_BASE + get_marker_gene_set_id(
                        o['cell_set_accession'])
                    d['Marker_set_of'] = o['cell_set_preferred_alias']
                    d['Minimal_markers'] = minimal_markers[
                        o['cell_set_accession']]
                    if 'Brain_region_abbv' in taxonomy_config:
                        d['Brain_region_abbv'] = taxonomy_config[
                            'Brain_region_abbv'][0]
                    if 'Species_abbv' in taxonomy_config:
                        d['Species_abbv'] = taxonomy_config['Species_abbv'][0]
                    d['Brain_region'] = taxonomy_config['Brain_region'][0]
                    d['Parent'] = "SO:0001260"  # sequence collection

                    for k in class_seed:
                        if not (k in d.keys()):
                            d[k] = ''
                    class_template.append(d)

        class_robot_template = pd.DataFrame.from_records(class_template)
        class_robot_template.to_csv(output_filepath, sep="\t", index=False)
Example #5
def main():
    log.info("Dendrogram validation started.")
    files = [f for f in os.listdir(DENDROGRAM_FOLDER) if isfile(join(DENDROGRAM_FOLDER, f))]
    is_valid = True
    for file in files:
        filename, file_extension = os.path.splitext(file)
        if file_extension == ".json":
            dend = dend_json_2_nodes_n_edges(join(DENDROGRAM_FOLDER, file))
            is_valid &= PrefAliasUniquenessChecker().check(filename, dend)

    if not is_valid:
        raise ValidationError("Dendrogram validation failed and issues logged.")
def generate_curated_class_template(dend_json_path, output_filepath):
    dend = dend_json_2_nodes_n_edges(dend_json_path)
    robot_class_curation_seed = {
        'ID': 'ID',
        'Label': 'LABEL',
        'PrefLabel': 'A skos:prefLabel',
        'Synonyms': 'A oboInOwl:has_exact_synonym',
        'Exemplar': "EC CL:0000003 and 'has_exemplar' value %",
        'Classification': 'SC %',
        'Comment': 'A rdfs:comment',
        'part of': "SC 'part of' some %",
        'part evidence comment': '>A rdfs:comment',
        'part evidence pub':
        ">A dc:reference SPLIT='|'",  # Bundle this with dbxrefs?
        'part evidence dbxref': ">A oio:hasDbXref SPLIT='|'",
        'location': "SC 'located in' some %",
        'location evidence comment': '>A rdfs:comment',
        'location evidence pub':
        ">A dc:reference SPLIT='|'",  # Bundle this with dbxrefs?
        'location evidence dbxref': ">A oio:hasDbXref SPLIT='|'",
        'has soma location': "SC 'has soma location' some %",
        'soma location evidence comment': '>A rdfs:comment',
        'soma location evidence pub':
        ">A dc:reference SPLIT='|'",  # Bundle this with dbxrefs?
        'soma location evidence dbxref': ">A oio:dbxref SPLIT='|'",
        'cell morphology phenotype': "SC 'bearer of' some %",
        'cell morphology evidence comment': '>A rdfs:comment',
        'cell morphology evidence pub':
        ">A dc:reference SPLIT='|'",  # Bundle this with dbxrefs?
        'cell morphology evidence dbxref': ">A oio:hasDbXref SPLIT='|'"
    }
    class_template = [robot_class_curation_seed]

    # not bothering with stable IDs for now:

    for o in dend['nodes']:
        ID = 'http://brain_data_standards/scratch_' + str(uuid.uuid1())
        d = dict()
        d['ID'] = ID
        d['Class_type'] = 'subclass'
        d['Exemplar'] = 'AllenDend:' + o['cell_set_accession']
        if o['cell_set_label']:
            d['Label'] = o['cell_set_label']
        if o['cell_set_preferred_alias']:
            d['PrefLabel'] = o['cell_set_preferred_alias']
        for k in robot_class_curation_seed.keys():
            if not (k in d.keys()):
                d[k] = ''
        class_template.append(d)

    class_robot_template = pd.DataFrame.from_records(class_template)
    class_robot_template.to_csv(output_filepath, sep="\t", index=False)
def generate_homologous_to_template(taxonomy_file_path, all_base_files,
                                    output_filepath):
    """
    Homologous_to relations require a separate template. If this operation is driven by the nomenclature tables,
    some dangling classes may be generated due to root classes that don't have a class and should not be aligned.
    So, instead of nomenclature tables, base files are used for populating homologous to relations. This ensures all
    alignments has a corresponding class.
    Args:
        taxonomy_file_path: path of the taxonomy file
        all_base_files: paths of the all class template base files
        output_filepath: template output file path
    """
    taxon = extract_taxonomy_name_from_path(taxonomy_file_path)
    taxonomy_config = read_taxonomy_config(taxon)

    other_taxonomy_aliases = index_base_files(
        [t for t in all_base_files if taxon not in t])

    if taxonomy_config:
        if str(taxonomy_file_path).endswith(".json"):
            dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
        else:
            dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
        dend_tree = generate_dendrogram_tree(dend)
        subtrees = get_subtrees(dend_tree, taxonomy_config)

        data_template = []

        for o in dend['nodes']:
            if o['cell_set_accession'] in set.union(
                    *subtrees) and (o['cell_set_preferred_alias']
                                    or o['cell_set_additional_aliases']):
                d = dict()
                d['defined_class'] = PCL_BASE + get_class_id(
                    o['cell_set_accession'])
                homologous_to = list()
                for other_aliases in other_taxonomy_aliases:
                    if "cell_set_aligned_alias" in o and o["cell_set_aligned_alias"] \
                            and str(o["cell_set_aligned_alias"]).lower() in other_aliases:
                        homologous_to.append(other_aliases[str(
                            o["cell_set_aligned_alias"]).lower()]
                                             ["defined_class"])
                d['homologous_to'] = "|".join(homologous_to)

                data_template.append(d)

        robot_template = pd.DataFrame.from_records(data_template)
        robot_template.to_csv(output_filepath, sep="\t", index=False)
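The homologous_to join above relies on the shape of the indexes returned by index_base_files. Below is a minimal, self-contained sketch of that lookup, assuming each base-file index is a dict keyed by the lower-cased cell_set_aligned_alias and holding a 'defined_class' column; the helper name and sample values are hypothetical, not part of the pipeline.

def sketch_homologous_to(node, other_taxonomy_aliases):
    # Collect the defined_class of every other taxonomy whose index contains this
    # node's aligned alias (compared case-insensitively), mirroring the loop above.
    homologous_to = []
    aligned = str(node.get("cell_set_aligned_alias", "")).lower()
    for other_aliases in other_taxonomy_aliases:
        if aligned and aligned in other_aliases:
            homologous_to.append(other_aliases[aligned]["defined_class"])
    return "|".join(homologous_to)

# Hypothetical usage:
# sketch_homologous_to({"cell_set_aligned_alias": "Lamp5"},
#                      [{"lamp5": {"defined_class": "PCL:0011005"}}])
# returns "PCL:0011005"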
def generate_marker_template(dend_json_path, output_filepath):
    dend = dend_json_2_nodes_n_edges(dend_json_path)
    robot_marker_template_seed = {
        'ID': 'ID',
        'Expresses': "TI 'expresses' % SPLIT = '|' ",
        'Evidence': "^A rdfs:comment"
    }
    template = [robot_marker_template_seed]
    for o in dend['nodes']:
        d = dict()
        d['ID'] = 'AllenDend:' + o['cell_set_accession']
        for k in robot_marker_template_seed.keys():
            if not (k in d.keys()):
                d[k] = ''
        template.append(d)
    class_robot_template = pd.DataFrame.from_records(template)
    class_robot_template.to_csv(output_filepath, sep="\t", index=False)
Example #9
    def check(self):
        files = [
            f for f in os.listdir(MARKERS_FOLDER)
            if isfile(join(MARKERS_FOLDER, f))
        ]
        for marker_file in files:
            if not is_whitelist_file(marker_file):
                dendrogram_path = join(DENDROGRAMS_FOLDER,
                                       get_taxonomy_file_name(marker_file))
                if os.path.exists(dendrogram_path):
                    marker_records = read_tsv(join(MARKERS_FOLDER,
                                                   marker_file))
                    if str(dendrogram_path).endswith(".json"):
                        dend = dend_json_2_nodes_n_edges(dendrogram_path)
                    else:
                        dend = nomenclature_2_nodes_n_edges(dendrogram_path)
                    self.check_all_nodes_exist(dend, marker_records,
                                               marker_file)
def generate_curated_class_template(taxonomy_file_path, output_filepath):
    taxon = extract_taxonomy_name_from_path(taxonomy_file_path)
    taxonomy_config = read_taxonomy_config(taxon)

    if taxonomy_config:
        if str(taxonomy_file_path).endswith(".json"):
            dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
        else:
            dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
        dend_tree = generate_dendrogram_tree(dend)
        subtrees = get_subtrees(dend_tree, taxonomy_config)

        class_curation_seed = [
            'defined_class', 'Curated_synonyms', 'Classification',
            'Classification_comment', 'Classification_pub', 'Expresses',
            'Expresses_comment', 'Expresses_pub', 'Projection_type', 'Layers',
            'Cross_species_text', 'Comment'
        ]
        class_template = []

        for o in dend['nodes']:
            if o['cell_set_accession'] in set.union(
                    *subtrees) and (o['cell_set_preferred_alias']
                                    or o['cell_set_additional_aliases']):
                d = dict()
                d['defined_class'] = PCL_BASE + get_class_id(
                    o['cell_set_accession'])
                if o['cell_set_preferred_alias']:
                    d['prefLabel'] = o['cell_set_preferred_alias']
                elif o['cell_set_additional_aliases']:
                    d['prefLabel'] = str(
                        o['cell_set_additional_aliases']).split(
                            EXPRESSION_SEPARATOR)[0]

                for k in class_curation_seed:
                    if not (k in d.keys()):
                        d[k] = ''
                class_template.append(d)

        class_robot_template = pd.DataFrame.from_records(class_template)
        class_robot_template.to_csv(output_filepath, sep="\t", index=False)
Example #11
    def check(self):
        files = [
            f for f in os.listdir(MARKERS_FOLDER)
            if isfile(join(MARKERS_FOLDER, f))
        ]
        for marker_file in files:
            if not is_whitelist_file(marker_file):
                dendrogram_path = join(DENDROGRAMS_FOLDER,
                                       get_taxonomy_file_name(marker_file))
                if os.path.exists(dendrogram_path):
                    headers, marker_records = read_csv_to_dict(
                        join(MARKERS_FOLDER, marker_file), delimiter="\t")
                    if str(dendrogram_path).endswith(".json"):
                        dend = dend_json_2_nodes_n_edges(dendrogram_path)
                    else:
                        dend = nomenclature_2_nodes_n_edges(dendrogram_path)
                    dend_dict = index_dendrogram(dend)
                    self.check_all_node_ids_valid(dend_dict, marker_records,
                                                  marker_file)
                    self.check_all_node_ids_unique(marker_file)
def generate_ind_template(dend_json_path, output_filepath):
    dend = dend_json_2_nodes_n_edges(dend_json_path)
    robot_template_seed = {
        'ID': 'ID',
        'Label': 'LABEL',
        'PrefLabel': 'A skos:prefLabel',
        'Entity Type': 'TI %',
        'TYPE': 'TYPE',
        'Property Assertions': "I BDSHELP:subcluster_of SPLIT='|'",
        'Synonyms': 'A oboInOwl:has_exact_synonym',
        'Function': 'TI capable_of some %'
    }
    dl = [robot_template_seed]

    synonym_properties = [
        'original_label', 'cell_set_aligned_alias',
        'cell_set_additional_aliases'
    ]

    for o in dend['nodes']:
        d = dict()
        d['ID'] = 'AllenDend:' + o['cell_set_accession']
        d['TYPE'] = 'owl:NamedIndividual'
        d['Label'] = o['cell_set_label'] + ' - ' + o['cell_set_accession']
        d['PrefLabel'] = o['cell_set_preferred_alias'] + ' - ' + o[
            'cell_set_accession']
        d['Entity Type'] = 'BDSHELP:Cluster'
        d['Synonyms'] = '|'.join([
            o[prop] for prop in synonym_properties
            if prop in o.keys() and o[prop]
        ])
        d['Property Assertions'] = '|'.join([
            'AllenDend:' + e[1] for e in dend['edges']
            if e[0] == o['cell_set_accession']
        ])
        # There should only be one!
        dl.append(d)
    robot_template = pd.DataFrame.from_records(dl)
    robot_template.to_csv(output_filepath, sep="\t", index=False)
def generate_pattern_table_denormalised_markers(dend_json_path, output_filepath):
    path_parts = dend_json_path.split(os.path.sep)
    taxon = path_parts[len(path_parts) - 1].split(".")[0]

    taxonomy_config = read_taxonomy_config(taxon)

    dend = dend_json_2_nodes_n_edges(dend_json_path)
    dend_tree = read_dendrogram_tree(dend_json_path)

    if taxonomy_config:
        # Resolve the marker and gene DB paths only when a taxonomy config exists;
        # indexing a missing config would otherwise raise before the guard.
        marker_path = MARKER_PATH.format(str(taxon).replace("CCN", ""))
        gene_db_path = ENSEMBLE_PATH.format(str(taxonomy_config["Reference_gene_list"][0]).strip().lower())

        subtrees = get_subtrees(dend_tree, taxonomy_config)
        gene_names = read_gene_data(gene_db_path)
        denorm_markers = get_denorm_markers(taxon, gene_names)
        minimal_markers = read_markers(marker_path, gene_names)

        dl = []
        for o in dend['nodes']:
            if o['cell_set_accession'] in set.union(*subtrees):
                d = dict()
                d['defined_class'] = ALLEN_DEND_ + o['cell_set_accession']
                d['gross_cell_type'] = get_gross_cell_type(o['cell_set_accession'], subtrees, taxonomy_config)
                d['taxon'] = taxonomy_config['Species'][0]
                d['brain_region'] = taxonomy_config['Brain_region'][0]

                if o['cell_set_accession'] in denorm_markers:
                    d['denorm_marker_list'] = denorm_markers[o['cell_set_accession']]
                    d['minimal_marker_list'] = minimal_markers[o['cell_set_accession']]
                else:
                    d['denorm_marker_list'] = ''
                    d['minimal_marker_list'] = ''

                dl.append(d)

        robot_template = pd.DataFrame.from_records(dl)
        robot_template.to_csv(output_filepath, sep="\t", index=False)
    def test_get_synonyms_from_taxonomy(self):
        tree = dend_json_2_nodes_n_edges(PATH_DENDROGRAM_JSON)
        nodes = tree['nodes']

        node8_synonyms = get_synonyms_from_taxonomy(
            nodes[8]).split(OR_SEPARATOR)
        node8_synonyms = list(filter(None, node8_synonyms))
        self.assertEqual("CS202002013_125", nodes[8]["cell_set_accession"])
        self.assertEqual(1, len(node8_synonyms))
        self.assertTrue("Lamp5" in node8_synonyms)

        node20_synonyms = get_synonyms_from_taxonomy(
            nodes[20]).split(OR_SEPARATOR)
        node20_synonyms = list(filter(None, node20_synonyms))
        self.assertEqual("CS202002013_6", nodes[20]["cell_set_accession"])
        self.assertEqual(1, len(node20_synonyms))
        self.assertTrue("Lamp5 Pdlim5_2" in node20_synonyms)

        node50_synonyms = get_synonyms_from_taxonomy(
            nodes[50]).split(OR_SEPARATOR)
        node50_synonyms = list(filter(None, node50_synonyms))
        self.assertEqual("CS202002013_146", nodes[50]["cell_set_accession"])
        self.assertEqual(0, len(node50_synonyms))
def generate_app_specific_template(taxonomy_file_path, output_filepath):
    if str(taxonomy_file_path).endswith(".json"):
        dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
    else:
        dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)

    robot_template_seed = {
        'ID': 'ID',
        'TYPE': 'TYPE',
        'cell_set_color': "A ALLENHELP:cell_set_color"
    }
    dl = [robot_template_seed]

    for o in dend['nodes']:
        if "cell_set_color" in o and o["cell_set_color"]:
            d = dict()
            d['ID'] = 'PCL:' + get_individual_id(o['cell_set_accession'])
            d['TYPE'] = 'owl:NamedIndividual'
            d['cell_set_color'] = str(o["cell_set_color"]).strip()
            dl.append(d)

    robot_template = pd.DataFrame.from_records(dl)
    robot_template.to_csv(output_filepath, sep="\t", index=False)
def generate_denormalised_marker_template(taxonomy_file_path, output_marker_path):
    """
    Enriches existing marker file based on inheritance relations extracted from dendrogram file.
    New maker table, following the same format as the input marker table, with each node associated with a
    non-redundant list of all markers associated with the term in the input + all markers associated with parent terms.
    Args:
        dend_json_path: Path of the dendrogram file
        output_marker_path: Path of the new marker file

    """
    path_parts = taxonomy_file_path.split(os.path.sep)
    taxon = path_parts[len(path_parts) - 1].split(".")[0]
    if str(taxonomy_file_path).endswith(".json"):
        dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
    else:
        dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
        taxon = path_parts[len(path_parts) - 1].split(".")[0].replace("nomenclature_table_", "")

    taxonomy_config = read_taxonomy_config(taxon)

    root_nodes = get_root_nodes(taxonomy_config)

    marker_path = MARKER_PATH.format(str(taxon).replace("CCN", "").replace("CS", ""))
    generate_denormalised_marker(dend, marker_path, output_marker_path, root_nodes)
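generate_denormalised_marker itself is not shown here; the sketch below illustrates, under the assumption of a simple child-to-parent map, the propagation the docstring describes: each node receives the non-redundant union of its own markers and those of its ancestors. All names and data shapes are illustrative.

def sketch_denormalise_markers(markers_by_node, parent_of):
    # For each node, walk up the child -> parent map and merge in the ancestors' markers.
    denormalised = {}
    for node, markers in markers_by_node.items():
        combined = set(markers)
        ancestor = parent_of.get(node)
        while ancestor is not None:
            combined.update(markers_by_node.get(ancestor, []))
            ancestor = parent_of.get(ancestor)
        denormalised[node] = sorted(combined)
    return denormalised

# Hypothetical usage:
# sketch_denormalise_markers({"n1": ["GeneA"], "n2": ["GeneB"]}, {"n2": "n1"})
# returns {"n1": ["GeneA"], "n2": ["GeneA", "GeneB"]}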
def generate_base_class_template(taxonomy_file_path, output_filepath):
    taxon = extract_taxonomy_name_from_path(taxonomy_file_path)
    taxonomy_config = read_taxonomy_config(taxon)

    if taxonomy_config:
        if str(taxonomy_file_path).endswith(".json"):
            dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
        else:
            dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
        dend_tree = generate_dendrogram_tree(dend)
        subtrees = get_subtrees(dend_tree, taxonomy_config)

        if "Reference_gene_list" in taxonomy_config:
            gene_db_path = ENSEMBLE_PATH.format(
                str(taxonomy_config["Reference_gene_list"][0]).strip().lower())
            gene_names = read_gene_data(gene_db_path)
            minimal_markers = read_markers(
                MARKER_PATH.format(taxon.replace("CCN", "").replace("CS", "")),
                gene_names)
            allen_markers = read_markers(
                ALLEN_MARKER_PATH.format(
                    taxon.replace("CCN", "").replace("CS", "")), gene_names)
        else:
            minimal_markers = {}
            allen_markers = {}

        class_seed = [
            'defined_class', 'prefLabel', 'Alias_citations',
            'Synonyms_from_taxonomy', 'Gross_cell_type', 'Taxon',
            'Brain_region', 'Minimal_markers', 'Allen_markers', 'Individual',
            'Brain_region_abbv', 'Species_abbv', 'Cluster_ID', 'part_of',
            'has_soma_location', 'aligned_alias', 'marker_gene_set'
        ]
        class_template = []

        for o in dend['nodes']:
            if o['cell_set_accession'] in set.union(
                    *subtrees) and (o['cell_set_preferred_alias']
                                    or o['cell_set_additional_aliases']):
                d = dict()
                d['defined_class'] = PCL_BASE + get_class_id(
                    o['cell_set_accession'])
                if o['cell_set_preferred_alias']:
                    d['prefLabel'] = o['cell_set_preferred_alias']
                elif o['cell_set_additional_aliases']:
                    d['prefLabel'] = str(
                        o['cell_set_additional_aliases']).split(
                            EXPRESSION_SEPARATOR)[0]
                d['Synonyms_from_taxonomy'] = get_synonyms_from_taxonomy(o)
                d['Gross_cell_type'] = get_gross_cell_type(
                    o['cell_set_accession'], subtrees, taxonomy_config)
                d['Taxon'] = taxonomy_config['Species'][0]
                d['Brain_region'] = taxonomy_config['Brain_region'][0]
                d['Cluster_ID'] = o['cell_set_accession']
                if 'cell_set_alias_citation' in o and o[
                        'cell_set_alias_citation']:
                    alias_citations = [
                        citation.strip() for citation in str(
                            o["cell_set_alias_citation"]).split("|")
                        if citation and citation.strip()
                    ]
                    d["Alias_citations"] = "|".join(alias_citations)
                if o['cell_set_accession'] in minimal_markers:
                    d['Minimal_markers'] = minimal_markers[
                        o['cell_set_accession']]
                if o['cell_set_accession'] in allen_markers:
                    d['Allen_markers'] = allen_markers[o['cell_set_accession']]
                else:
                    d['Allen_markers'] = ''
                if 'Brain_region_abbv' in taxonomy_config:
                    d['Brain_region_abbv'] = taxonomy_config[
                        'Brain_region_abbv'][0]
                if 'Species_abbv' in taxonomy_config:
                    d['Species_abbv'] = taxonomy_config['Species_abbv'][0]
                d['Individual'] = PCL_BASE + get_individual_id(
                    o['cell_set_accession'])

                for index, subtree in enumerate(subtrees):
                    if o['cell_set_accession'] in subtree:
                        location_rel = taxonomy_config['Root_nodes'][index][
                            'Location_relation']
                        if location_rel == "part_of":
                            d['part_of'] = taxonomy_config['Brain_region'][0]
                            d['has_soma_location'] = ''
                        elif location_rel == "has_soma_location":
                            d['part_of'] = ''
                            d['has_soma_location'] = taxonomy_config[
                                'Brain_region'][0]

                if "cell_set_aligned_alias" in o and o[
                        "cell_set_aligned_alias"]:
                    d['aligned_alias'] = o["cell_set_aligned_alias"]
                if o['cell_set_accession'] in minimal_markers:
                    d['marker_gene_set'] = PCL_PREFIX + get_marker_gene_set_id(
                        o['cell_set_accession'])

                for k in class_seed:
                    if not (k in d.keys()):
                        d[k] = ''
                class_template.append(d)

        class_robot_template = pd.DataFrame.from_records(class_template)
        class_robot_template.to_csv(output_filepath, sep="\t", index=False)
def normalize_raw_markers(raw_marker):
    """
    Raw marker files has different structure than the expected. Needs these modifications:
        - Extract Taxonomy_node_ID: clusterName matches cell_set_aligned_alias of the dendrogram.
        - Resolve markers: convert marker names to ensemble IDs from local DBs
    Args:
        raw_marker:
    """
    taxonomy_config = get_taxonomy_config(raw_marker)
    taxonomy_id = taxonomy_config["Taxonomy_id"]

    print("Taxonomy ID: " + taxonomy_id)
    if taxonomy_id == "CS1908210":
        print("Read dendrogram: " + taxonomy_id)
        dend = dend_json_2_nodes_n_edges(DENDROGRAM.format(taxonomy_id))
        nomenclature_indexes = [
                                index_dendrogram(dend, id_field_name="cell_set_preferred_alias", id_to_lower=True),
                                # index_dendrogram(dend, id_field_name="cell_set_aligned_alias", id_to_lower=True),
                                index_dendrogram(dend, id_field_name="cell_set_accession", id_to_lower=True),
                                index_dendrogram(dend, id_field_name="original_label", id_to_lower=True),
                                index_dendrogram(dend, id_field_name="cell_set_additional_aliases", id_to_lower=True)
                                ]
    else:
        print("Read nomenclature table: " + taxonomy_id)
        nomenclature_indexes = [read_csv_to_dict(NOMENCLATURE.format(taxonomy_id),
                                                 id_column_name="cell_set_preferred_alias", id_to_lower=True)[1],
                                read_csv_to_dict(NOMENCLATURE.format(taxonomy_id),
                                                 id_column_name="cell_set_aligned_alias", id_to_lower=True)[1],
                                read_csv_to_dict(NOMENCLATURE.format(taxonomy_id),
                                                 id_column_name="cell_set_accession", id_to_lower=True)[1],
                                read_csv_to_dict(NOMENCLATURE.format(taxonomy_id),
                                                 id_column_name="original_label", id_to_lower=True)[1],
                                read_csv_to_dict(NOMENCLATURE.format(taxonomy_id),
                                                 id_column_name="cell_set_additional_aliases", id_to_lower=True)[1],
                                ]

    gene_db_path = GENE_DB_PATH.format(str(taxonomy_config["Reference_gene_list"][0]).strip().lower())
    headers, genes_by_name = read_csv_to_dict(gene_db_path, id_column=2, delimiter="\t", id_to_lower=True)
    species_abbv = get_species_for_gene_db(gene_db_path).lower()

    unmatched_markers = set()
    normalized_markers = []

    if raw_marker.endswith(".csv"):
        headers, raw_marker_data = read_csv_to_dict(raw_marker, id_column_name="clusterName")
    else:
        headers, raw_marker_data = read_csv_to_dict(raw_marker, id_column_name="clusterName", delimiter="\t")

    for cluster_name in raw_marker_data:
        normalized_data = {}
        row = raw_marker_data[cluster_name]
        cluster_name_variants = [cluster_name.lower(), cluster_name.lower().replace("-", "/"),
                                 cluster_name.replace("Micro", "Microglia").lower()]

        nomenclature_node = search_terms_in_index(cluster_name_variants, nomenclature_indexes)
        if nomenclature_node:
            node_id = nomenclature_node["cell_set_accession"]
            marker_names = get_marker_names(row)
            marker_ids = []
            for name in marker_names:
                if name:
                    if species_abbv + " " + name.lower() in genes_by_name:
                        marker_ids.append(str(genes_by_name[species_abbv + " " + name.lower()]["ID"]))
                    elif species_abbv + " " + name.lower().replace("_", "-") in genes_by_name:
                        marker_ids.append(str(genes_by_name[species_abbv + " " + name.lower().replace("_", "-")]["ID"]))
                    else:
                        unmatched_markers.add(name)

            normalized_data["Taxonomy_node_ID"] = node_id
            normalized_data["clusterName"] = nomenclature_node["cell_set_preferred_alias"]
            normalized_data["Markers"] = "|".join(marker_ids)

            normalized_markers.append(normalized_data)
        else:
            log.error("Node with cluster name '{}' couldn't be found in the nomenclature.".format(cluster_name))
            # raise Exception("Node with cluster name {} couldn't be found in the nomenclature.".format(cluster_name))

    class_robot_template = pd.DataFrame.from_records(normalized_markers)
    class_robot_template.to_csv(OUTPUT_MARKER.format(taxonomy_id.replace("CCN", "").replace("CS", "")), sep="\t", index=False)
    log.error("Following markers could not be found in the db ({}): {}".format(len(unmatched_markers),
                                                                               str(unmatched_markers)))
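The marker-name resolution above tries a species-prefixed, lower-cased lookup in the local gene DB and falls back to an underscore-to-hyphen variant. A self-contained sketch of that lookup follows; the index layout and the sample entries are hypothetical.

def sketch_resolve_marker(name, species_abbv, genes_by_name):
    # Try the name as-is, then with underscores turned into hyphens, both prefixed
    # with the species abbreviation and lower-cased, as normalize_raw_markers does.
    for candidate in (name.lower(), name.lower().replace("_", "-")):
        key = species_abbv + " " + candidate
        if key in genes_by_name:
            return str(genes_by_name[key]["ID"])
    return None  # caller records the name as an unmatched marker

# Hypothetical usage:
# sketch_resolve_marker("Lamp_5", "mmus", {"mmus lamp-5": {"ID": "ensmusg:0000001"}})
# returns "ensmusg:0000001"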
def generate_ind_template(taxonomy_file_path, output_filepath):
    path_parts = taxonomy_file_path.split(os.path.sep)
    taxon = path_parts[len(path_parts) - 1].split(".")[0]

    if str(taxonomy_file_path).endswith(".json"):
        dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
    else:
        dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
        taxon = path_parts[len(path_parts) - 1].split(".")[0].replace(
            "nomenclature_table_", "")

    dend_tree = generate_dendrogram_tree(dend)
    taxonomy_config = read_taxonomy_config(taxon)
    allen_descriptions = read_allen_descriptions(
        ALLEN_DESCRIPTIONS_PATH, taxonomy_config['Species_abbv'][0])

    subtrees = get_subtrees(dend_tree, taxonomy_config)

    robot_template_seed = {
        'ID': 'ID',
        'Label': 'LABEL',
        'PrefLabel': 'A skos:prefLabel',
        'Entity Type': 'TI %',
        'TYPE': 'TYPE',
        'Property Assertions': "I 'subcluster of' SPLIT=|",
        'Synonyms': 'A oboInOwl:hasExactSynonym SPLIT=|',
        'Cluster_ID': "A 'cluster id'",
        'Function': 'TI capable_of some %',
        'cell_set_preferred_alias': "A n2o:cell_set_preferred_alias",
        'original_label': "A n2o:original_label",
        'cell_set_label': "A n2o:cell_set_label",
        'cell_set_aligned_alias': "A n2o:cell_set_aligned_alias",
        'cell_set_additional_aliases':
        "A n2o:cell_set_additional_aliases SPLIT=|",
        'cell_set_alias_assignee': "A n2o:cell_set_alias_assignee SPLIT=|",
        'cell_set_alias_citation': "A n2o:cell_set_alias_citation SPLIT=|",
        'Metadata': "A n2o:node_metadata",
        'Exemplar_of': "TI 'exemplar data of' some %",
        'Comment': "A rdfs:comment",
        'Aliases': "A oboInOwl:hasRelatedSynonym SPLIT=|",
        'Rank': "A 'cell_type_rank' SPLIT=|"
    }
    dl = [robot_template_seed]

    synonym_properties = [
        'cell_set_aligned_alias', 'cell_set_additional_aliases'
    ]

    for o in dend['nodes']:
        d = dict()
        d['ID'] = 'PCL:' + get_individual_id(o['cell_set_accession'])
        d['TYPE'] = 'owl:NamedIndividual'
        d['Label'] = o['cell_set_label'] + ' - ' + o['cell_set_accession']
        if 'cell_set_preferred_alias' in o and o['cell_set_preferred_alias']:
            d['PrefLabel'] = o['cell_set_preferred_alias']
        else:
            d['PrefLabel'] = o['cell_set_accession']
        d['Entity Type'] = 'PCL:0010001'  # Cluster
        d['Metadata'] = json.dumps(o)
        d['Synonyms'] = '|'.join([
            o[prop] for prop in synonym_properties
            if prop in o.keys() and o[prop]
        ])
        d['Property Assertions'] = '|'.join(
            sorted([
                'PCL:' + get_individual_id(e[1]) for e in dend['edges']
                if e[0] == o['cell_set_accession']
            ]))
        meta_properties = [
            'cell_set_preferred_alias', 'original_label', 'cell_set_label',
            'cell_set_aligned_alias', 'cell_set_additional_aliases',
            'cell_set_alias_assignee', 'cell_set_alias_citation'
        ]
        for prop in meta_properties:
            if prop in o.keys():
                d[prop] = '|'.join([
                    prop_val.strip() for prop_val in str(o[prop]).split("|")
                    if prop_val
                ])
            else:
                d[prop] = ''
        d['Cluster_ID'] = o['cell_set_accession']
        if o['cell_set_accession'] in set().union(
                *subtrees) and o['cell_set_preferred_alias']:
            d['Exemplar_of'] = PCL_BASE + get_class_id(o['cell_set_accession'])

        if "cell_type_card" in o:
            d['Rank'] = '|'.join([
                cell_type.strip().replace("No", "None")
                for cell_type in str(o["cell_type_card"]).split(",")
            ])

        if o['cell_set_accession'] in allen_descriptions:
            allen_data = allen_descriptions[o['cell_set_accession']]
            d['Comment'] = allen_data["summary"][0]
            if allen_data["aliases"][0]:
                d['Aliases'] = '|'.join([
                    alias.strip()
                    for alias in str(allen_data["aliases"][0]).split("|")
                ])

        # There should only be one!
        dl.append(d)
    robot_template = pd.DataFrame.from_records(dl)
    robot_template.to_csv(output_filepath, sep="\t", index=False)