def generate_cross_species_template(taxonomy_file_path, output_filepath):
    """
    Generate the cross-species template of a taxonomy and save it as TSV.

    Every node of the taxonomy that belongs to one of the configured subtrees
    and has a preferred or an additional alias is matched against the
    cross-species table (CROSS_SPECIES_PATH): its aligned alias is looked up
    among the table's aligned aliases and each of its additional aliases is
    looked up among the table's preferred aliases. A node with at least one
    match produces one row with the columns 'defined_class' and
    'cross_species_classes'.

    Nothing is written when no configuration is found for the taxonomy.

    Args:
        taxonomy_file_path: path of the taxonomy file. A ".json" file is read
            as a dendrogram, any other file as a nomenclature table.
        output_filepath: path of the tab-separated file to write.
    """
    taxon = extract_taxonomy_name_from_path(taxonomy_file_path)
    taxonomy_config = read_taxonomy_config(taxon)
    if not taxonomy_config:
        return

    if str(taxonomy_file_path).endswith(".json"):
        dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
    else:
        dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
    dend_tree = generate_dendrogram_tree(dend)
    subtrees = get_subtrees(dend_tree, taxonomy_config)
    # Merge the subtrees once instead of once per node. Starting from an empty
    # set also keeps this working when no subtree is configured.
    subtree_accessions = set().union(*subtrees)

    # The same table is indexed twice, by two different alias columns.
    _, cs_by_preferred_alias = read_csv_to_dict(
        CROSS_SPECIES_PATH,
        id_column_name="cell_set_preferred_alias",
        id_to_lower=True)
    _, cs_by_aligned_alias = read_csv_to_dict(
        CROSS_SPECIES_PATH,
        id_column_name="cell_set_aligned_alias",
        id_to_lower=True)

    cross_species_template = []
    for o in dend['nodes']:
        if o['cell_set_accession'] not in subtree_accessions:
            continue
        if not (o['cell_set_preferred_alias']
                or o['cell_set_additional_aliases']):
            continue

        cross_species_classes = set()

        aligned_alias = o["cell_set_aligned_alias"]
        if aligned_alias:
            aligned_key = str(aligned_alias).lower()
            if aligned_key in cs_by_aligned_alias:
                cross_species_classes.add(PCL_BASE + get_class_id(
                    cs_by_aligned_alias[aligned_key]["cell_set_accession"]))

        additional_aliases = o.get("cell_set_additional_aliases")
        if additional_aliases:
            for additional_alias in str(additional_aliases).lower().split(
                    EXPRESSION_SEPARATOR):
                if additional_alias in cs_by_preferred_alias:
                    cross_species_classes.add(PCL_BASE + get_class_id(
                        cs_by_preferred_alias[additional_alias]
                        ["cell_set_accession"]))

        if cross_species_classes:
            cross_species_template.append({
                'defined_class':
                    PCL_BASE + get_class_id(o['cell_set_accession']),
                # Sorted so that the generated file is identical across runs
                # (set iteration order of strings is not stable).
                'cross_species_classes':
                    EXPRESSION_SEPARATOR.join(sorted(cross_species_classes)),
            })

    class_robot_template = pd.DataFrame.from_records(cross_species_template)
    class_robot_template.to_csv(output_filepath, sep="\t", index=False)
def nomenclature_2_nodes_n_edges(taxonomy_file_path):
    """
    Read a nomenclature table and convert it to a nodes-and-edges structure.

    Args:
        taxonomy_file_path: path of the nomenclature table, indexed by its
            'cell_set_accession' column.

    Returns:
        dict with two entries: 'nodes', a list holding one dict per table
        record (one key per table header), and 'edges', a set of
        (child accession, parent accession) tuples.
    """
    headers, nomenclature_records = read_csv_to_dict(
        taxonomy_file_path, id_column_name='cell_set_accession')

    nodes = []
    cell_set_families = []
    for accession, record in nomenclature_records.items():
        nodes.append({column: record[column] for column in headers})

        raw_children = record['child_cell_set_accessions']
        # A record without listed children is treated as containing itself.
        if raw_children:
            members = set(raw_children.strip().split('|'))
        else:
            members = {accession}
        cell_set_families.append({
            "node_cell_set_accession": accession,
            "children": members
        })

    # Smallest child sets first.
    families_by_size = sorted(cell_set_families,
                              key=lambda family: len(family["children"]))

    out = {'nodes': nodes, 'edges': set()}
    for family in families_by_size:
        parent = find_next_inclusive_node(families_by_size, family)
        if parent:
            out['edges'].add((family["node_cell_set_accession"],
                              parent["node_cell_set_accession"]))

    fix_multi_inheritance_relations(out, families_by_size)
    return out
Пример #3
0
 def check(self):
     """
     Run the cluster name check on every non-whitelisted marker file.

     For each regular file in MARKERS_FOLDER the related taxonomy file is
     looked up in DENDROGRAMS_FOLDER. When it exists, the marker records
     and the indexed taxonomy are handed to check_cluster_name; otherwise
     a message is appended to self.reports.
     """
     marker_files = [
         entry for entry in os.listdir(MARKERS_FOLDER)
         if isfile(join(MARKERS_FOLDER, entry))
     ]
     for marker_file in marker_files:
         if is_whitelist_file(marker_file):
             continue

         dendrogram_path = join(DENDROGRAMS_FOLDER,
                                get_taxonomy_file_name(marker_file))
         print("DEND:" + dendrogram_path)
         if not os.path.exists(dendrogram_path):
             message = "Could not find taxonomy file '{}' for marker '{}'." \
                 .format(dendrogram_path, marker_file)
             self.reports.append(message)
             continue

         marker_path = join(MARKERS_FOLDER, marker_file)
         _, marker_records = read_csv_to_dict(marker_path, delimiter="\t")

         # ".json" taxonomy files are dendrograms, the rest are
         # nomenclature tables.
         if str(dendrogram_path).endswith(".json"):
             load_taxonomy = dend_json_2_nodes_n_edges
         else:
             load_taxonomy = nomenclature_2_nodes_n_edges
         dend_dict = index_dendrogram(load_taxonomy(dendrogram_path))

         self.check_cluster_name(dend_dict, marker_records, marker_file)
def add_cluster_name_to_marker(marker_path):
    """
    Rewrite a marker table with a 'clusterName' column taken from the
    nomenclature.

    The taxonomy id is the part of the marker file name before the first
    "_", with "CS" replaced by "CCN". For every marker record the
    'original_label' of the nomenclature record with the same accession is
    used as cluster name. The result is written as a tab-separated file to
    the path built from OUTPUT_MARKER.

    Args:
        marker_path: path of the tab-separated marker file, indexed by its
            'Taxonomy_node_ID' column.
    """
    file_name = marker_path.split(os.path.sep)[-1]
    taxonomy_id = file_name.split("_")[0].replace("CS", "CCN")

    _, marker_data = read_csv_to_dict(
        marker_path, id_column_name="Taxonomy_node_ID", delimiter="\t")
    _, nomenclature = read_csv_to_dict(
        NOMENCLATURE.format(taxonomy_id), id_column_name="cell_set_accession")

    normalized_markers = [
        {
            "Taxonomy_node_ID": accession_id,
            "clusterName": nomenclature[accession_id]["original_label"],
            "Markers": marker_data[accession_id]["Markers"],
        }
        for accession_id in marker_data
    ]

    # The output file name uses the taxonomy id without its CCN/CS prefix.
    output_id = taxonomy_id.replace("CCN", "").replace("CS", "")
    marker_table = pd.DataFrame.from_records(normalized_markers)
    marker_table.to_csv(OUTPUT_MARKER.format(output_id), sep="\t", index=False)
def index_base_files(base_files):
    """
    Index each base file by its lower-cased 'aligned_alias' column.

    Args:
        base_files: iterable of paths of tab-separated base files.

    Returns:
        list with the records of each file, in the order of base_files.
    """
    return [
        read_csv_to_dict(base_file,
                         delimiter="\t",
                         id_column_name="aligned_alias",
                         id_to_lower=True)[1]
        for base_file in base_files
    ]
def fix_gene_database_species(gene_db_path):
    """
    Write a copy of a gene database whose names carry the species suffix.

    The database is read indexed by its first column and written next to
    the original with ".tsv" replaced by "_2.tsv". Each row keeps its ID and
    TYPE values; " (<species abbreviation>)" is appended to its NAME.

    Args:
        gene_db_path: path of the tab-separated gene database.
    """
    headers, genes_by_id = read_csv_to_dict(gene_db_path, id_column=0, delimiter="\t")
    species_abbv = get_species_for_gene_db(gene_db_path)

    # newline='' is required for files handed to csv.writer; without it the
    # writer's own line endings get translated again (blank rows on Windows).
    with open(gene_db_path.replace(".tsv", "_2.tsv"), mode='w', newline='') as out:
        writer = csv.writer(out, delimiter="\t", quotechar='"')
        # Two header rows: column names, then the per-column template strings.
        writer.writerow(["ID", "TYPE", "NAME"])
        writer.writerow(["ID", "SC %", "A rdfs:label"])

        print(headers)
        for gene in genes_by_id:
            record = genes_by_id[gene]
            writer.writerow([record["ID"], record["TYPE"],
                             record["NAME"] + " (" + species_abbv + ")"])
def fix_gene_database(gene_db_path, gene_prefix):
    """
    Write a normalized copy of a gene database.

    The database is read indexed by its second column and written next to
    the original with ".tsv" replaced by "_2.tsv". Each output row consists
    of the prefixed index value (double quotes removed), the fixed type
    "SO:0000704" and the record's 'gene_name' followed by
    " (<species abbreviation>)".

    Args:
        gene_db_path: path of the tab-separated gene database.
        gene_prefix: string put in front of every gene identifier.
    """
    headers, genes_by_name = read_csv_to_dict(gene_db_path, id_column=1, delimiter="\t")
    species_abbv = get_species_for_gene_db(gene_db_path)

    # newline='' is required for files handed to csv.writer; without it the
    # writer's own line endings get translated again (blank rows on Windows).
    with open(gene_db_path.replace(".tsv", "_2.tsv"), mode='w', newline='') as out:
        writer = csv.writer(out, delimiter="\t", quotechar='"')
        # Two header rows: column names, then the per-column template strings.
        writer.writerow(["ID", "TYPE", "NAME"])
        writer.writerow(["ID", "SC %", "A rdfs:label"])

        print(headers)
        for gene in genes_by_name:
            writer.writerow([gene_prefix + gene.replace("\"", ""), "SO:0000704",
                             genes_by_name[gene]["gene_name"] + " (" + species_abbv + ")"])
Пример #8
0
 def check(self):
     """
     Run the marker name check on every non-whitelisted marker file.

     Only marker files whose related taxonomy file exists in
     DENDROGRAMS_FOLDER are checked; the others are skipped silently.
     """
     marker_files = [
         entry for entry in os.listdir(MARKERS_FOLDER)
         if isfile(join(MARKERS_FOLDER, entry))
     ]
     for marker_file in marker_files:
         if is_whitelist_file(marker_file):
             continue

         dendrogram_path = join(DENDROGRAMS_FOLDER,
                                get_taxonomy_file_name(marker_file))
         if not os.path.exists(dendrogram_path):
             continue

         marker_path = join(MARKERS_FOLDER, marker_file)
         _, marker_records = read_csv_to_dict(marker_path, delimiter="\t")
         self.check_marker_names(marker_records, marker_file)
Пример #9
0
 def check(self):
     """
     Compare the header row of every non-whitelisted marker file with
     self.expected_headers and report each file whose headers differ.
     """
     candidates = [
         entry for entry in os.listdir(MARKERS_FOLDER)
         if isfile(join(MARKERS_FOLDER, entry))
     ]
     for file_name in candidates:
         if is_whitelist_file(file_name):
             continue

         header_row, _ = read_csv_to_dict(join(MARKERS_FOLDER, file_name),
                                          delimiter="\t")
         if header_row == self.expected_headers:
             continue

         message = "Invalid column names: {} in file {}. Expected columns are: {}" \
             .format(header_row, file_name, self.expected_headers)
         self.reports.append(message)
def generate_datasets_template(dataset_metadata_path, output_filepath):
    """
    Generate the datasets template from a dataset metadata table.

    The taxonomy id is the part of the metadata file name before the first
    "_". The first output row maps every column to its template string; it
    is followed by one row per dataset record, numbered in reading order.

    Args:
        dataset_metadata_path: path of the dataset metadata table.
        output_filepath: path of the tab-separated file to write.
    """
    taxonomy_id = dataset_metadata_path.split(os.path.sep)[-1].split("_")[0]

    _, dataset_metadata = read_csv_to_dict(dataset_metadata_path,
                                           generated_ids=True)

    # Template header: column name -> template string of that column.
    template_header = {
        'ID': 'ID',
        'TYPE': 'TYPE',
        'Entity Type': 'TI %',
        'Label': 'LABEL',
        'Taxonomy': 'AI schema:includedInDataCatalog',
        'Cell Count': "AT 'cell_count'^^xsd:integer",
        'Nuclei Count': "AT 'nuclei_count'^^xsd:integer",
        'Description': "A rdfs:comment",
        'Download Link': "A schema:archivedAt",
        'Explore Link': "A schema:discussionUrl"
    }
    rows = [template_header]

    for dataset_index, dataset_key in enumerate(dataset_metadata):
        record = dataset_metadata[dataset_key]
        row = {
            'ID': 'PCL:' + get_dataset_id(taxonomy_id, dataset_index),
            'TYPE': 'owl:NamedIndividual',
            'Entity Type': 'schema:Dataset',
            'Label': record['Dataset'],
            'Taxonomy': 'PCL:' + get_taxonomy_id(taxonomy_id),
        }

        # The value mixes a number with the word "cells" or "nuclei"; only
        # its digits are kept as the count.
        cells_nuclei = record['cells/nuclei']
        if 'nuclei' in cells_nuclei:
            row['Nuclei Count'] = int(''.join(
                filter(str.isdigit, cells_nuclei)))
        elif 'cells' in cells_nuclei:
            row['Cell Count'] = int(''.join(
                filter(str.isdigit, cells_nuclei)))

        row['Description'] = record['text']
        row['Download Link'] = record['download_link']
        row['Explore Link'] = record['explore_link']
        rows.append(row)

    pd.DataFrame.from_records(rows).to_csv(output_filepath,
                                           sep="\t",
                                           index=False)
def generate_taxonomies_template(taxonomy_metadata_path, output_filepath):
    """
    Generate the taxonomies template from the taxonomy configurations.

    The first output row maps every column to its template string; it is
    followed by one row per configured taxonomy. The count, species, age and
    sex columns are only filled for taxonomies that have a record in the
    metadata table.

    Args:
        taxonomy_metadata_path: path of the taxonomy metadata table.
        output_filepath: path of the tab-separated file to write.
    """
    taxon_configs = read_taxonomy_details_yaml()
    _, taxonomies_metadata = read_csv_to_dict(taxonomy_metadata_path)

    # Template header: column name -> template string of that column.
    template_header = {
        'ID': 'ID',
        'TYPE': 'TYPE',
        'Entity Type': 'TI %',
        'Label': 'LABEL',
        'Number of Cell Types': "A 'cell_types_count'",
        'Number of Cell Subclasses': "A 'cell_subclasses_count'",
        'Number of Cell Classes': "A 'cell_classes_count'",
        'Anatomic Region': "A 'has_brain_region'",
        'Species Label': "A skos:prefLabel",
        'Age': "A 'has_age'",
        'Sex': "A 'has_sex'",
        'Primary Citation': "A oboInOwl:hasDbXref"
    }
    rows = [template_header]

    for taxon_config in taxon_configs:
        taxonomy_name = taxon_config["Taxonomy_id"]
        row = {
            'ID': 'PCL:' + get_taxonomy_id(taxonomy_name),
            'TYPE': 'owl:NamedIndividual',
            'Entity Type': 'PCL:0010002',  # Taxonomy
            'Label': taxonomy_name,
            # Only the first listed region and citation are used.
            'Anatomic Region': taxon_config['Brain_region'][0],
            'Primary Citation': taxon_config['PMID'][0],
        }

        if taxonomy_name in taxonomies_metadata:
            metadata = taxonomies_metadata[taxonomy_name]
            row['Number of Cell Types'] = metadata["Cell Types"]
            row['Number of Cell Subclasses'] = metadata["Cell Subclasses"]
            row['Number of Cell Classes'] = metadata["Cell Classes"]
            row['Species Label'] = metadata["Species"]
            row['Age'] = metadata["Age"]
            row['Sex'] = metadata["Sex"]

        rows.append(row)

    pd.DataFrame.from_records(rows).to_csv(output_filepath,
                                           sep="\t",
                                           index=False)
Пример #12
0
 def check(self):
     """
     Run the node id checks on every non-whitelisted marker file.

     Only marker files whose related taxonomy file exists in
     DENDROGRAMS_FOLDER are checked: first that all node ids are valid
     with respect to the indexed taxonomy, then that they are unique.
     """
     marker_files = [
         entry for entry in os.listdir(MARKERS_FOLDER)
         if isfile(join(MARKERS_FOLDER, entry))
     ]
     for marker_file in marker_files:
         if is_whitelist_file(marker_file):
             continue

         dendrogram_path = join(DENDROGRAMS_FOLDER,
                                get_taxonomy_file_name(marker_file))
         if not os.path.exists(dendrogram_path):
             continue

         marker_path = join(MARKERS_FOLDER, marker_file)
         _, marker_records = read_csv_to_dict(marker_path, delimiter="\t")

         # ".json" taxonomy files are dendrograms, the rest are
         # nomenclature tables.
         if str(dendrogram_path).endswith(".json"):
             load_taxonomy = dend_json_2_nodes_n_edges
         else:
             load_taxonomy = nomenclature_2_nodes_n_edges
         dend_dict = index_dendrogram(load_taxonomy(dendrogram_path))

         self.check_all_node_ids_valid(dend_dict, marker_records,
                                       marker_file)
         self.check_all_node_ids_unique(marker_file)
def normalize_raw_markers(raw_marker):
    """
    Normalize a raw marker file into the expected marker table layout.

    Raw marker files have a different structure than expected, so every row
    is rewritten as follows:
        - Taxonomy_node_ID: the row's clusterName (and a few spelling
          variants of it) is searched in the nomenclature indexes; the
          'cell_set_accession' of the matching node is used.
        - clusterName: the 'cell_set_preferred_alias' of the matching node.
        - Markers: marker names are resolved to the IDs found in the local
          gene database and joined with "|".

    Rows whose cluster cannot be found in the nomenclature are logged and
    left out. Marker names that cannot be resolved are left out of the row
    and reported together once at the end. The result is written as a
    tab-separated file to the path built from OUTPUT_MARKER.

    Args:
        raw_marker: path of the raw marker file. A ".csv" file is read
            without an explicit delimiter, any other file as tab-separated.
    """
    taxonomy_config = get_taxonomy_config(raw_marker)
    taxonomy_id = taxonomy_config["Taxonomy_id"]

    print("Taxonomy ID: " + taxonomy_id)
    # The order of the indexes is the order in which they are handed to
    # search_terms_in_index, so it is kept as is.
    if taxonomy_id == "CS1908210":
        # This taxonomy is indexed from its dendrogram. Its aligned aliases
        # are deliberately not indexed.
        print("Read dendrogram: " + taxonomy_id)
        dend = dend_json_2_nodes_n_edges(DENDROGRAM.format(taxonomy_id))
        index_fields = ("cell_set_preferred_alias",
                        "cell_set_accession",
                        "original_label",
                        "cell_set_additional_aliases")
        nomenclature_indexes = [
            index_dendrogram(dend, id_field_name=field, id_to_lower=True)
            for field in index_fields
        ]
    else:
        print("Read nomenclature table: " + taxonomy_id)
        nomenclature_path = NOMENCLATURE.format(taxonomy_id)
        index_fields = ("cell_set_preferred_alias",
                        "cell_set_aligned_alias",
                        "cell_set_accession",
                        "original_label",
                        "cell_set_additional_aliases")
        nomenclature_indexes = [
            read_csv_to_dict(nomenclature_path, id_column_name=field, id_to_lower=True)[1]
            for field in index_fields
        ]

    gene_db_path = GENE_DB_PATH.format(str(taxonomy_config["Reference_gene_list"][0]).strip().lower())
    _, genes_by_name = read_csv_to_dict(gene_db_path, id_column=2, delimiter="\t", id_to_lower=True)
    species_abbv = get_species_for_gene_db(gene_db_path).lower()

    if raw_marker.endswith(".csv"):
        _, raw_marker_data = read_csv_to_dict(raw_marker, id_column_name="clusterName")
    else:
        _, raw_marker_data = read_csv_to_dict(raw_marker, id_column_name="clusterName", delimiter="\t")

    unmatched_markers = set()
    normalized_markers = []

    for cluster_name in raw_marker_data:
        row = raw_marker_data[cluster_name]
        cluster_name_variants = [cluster_name.lower(), cluster_name.lower().replace("-", "/"),
                                 cluster_name.replace("Micro", "Microglia").lower()]

        nomenclature_node = search_terms_in_index(cluster_name_variants, nomenclature_indexes)
        if not nomenclature_node:
            log.error("Node with cluster name '{}' couldn't be found in the nomenclature.".format(cluster_name))
            continue

        marker_ids = []
        for name in get_marker_names(row):
            if not name:
                continue
            # Gene database keys have the form "<species> <gene name>". The
            # name is tried as is first, then with "_" replaced by "-".
            lowered = name.lower()
            for candidate in (lowered, lowered.replace("_", "-")):
                gene_key = species_abbv + " " + candidate
                if gene_key in genes_by_name:
                    marker_ids.append(str(genes_by_name[gene_key]["ID"]))
                    break
            else:
                unmatched_markers.add(name)

        normalized_markers.append({
            "Taxonomy_node_ID": nomenclature_node["cell_set_accession"],
            "clusterName": nomenclature_node["cell_set_preferred_alias"],
            "Markers": "|".join(marker_ids),
        })

    class_robot_template = pd.DataFrame.from_records(normalized_markers)
    class_robot_template.to_csv(OUTPUT_MARKER.format(taxonomy_id.replace("CCN", "").replace("CS", "")), sep="\t", index=False)

    # Only report when something is actually missing; an unconditional error
    # log produced a false alarm on fully resolved files.
    if unmatched_markers:
        log.error("Following markers could not be found in the db ({}): {}".format(len(unmatched_markers),
                                                                                   str(sorted(unmatched_markers))))