def __init__(self):
    """Create new Zenodo Graph Repository object."""
    super().__init__()
    self._data = {
        **compress_json.local_load("zenodo.json"),
        **self.load_wikidata_metatada()
    }
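A note that applies to every snippet below: compress_json.local_load resolves the given file name relative to the directory of the module that calls it, which is why bare names such as "zenodo.json" work without an absolute path. A minimal sketch of the roughly equivalent explicit call (file name reused from the example above, behavior described as an assumption about the library's internals):

import os

import compress_json

# Roughly what local_load does: resolve the path next to the calling module,
# then delegate to compress_json.load, which picks the codec from the extension.
here = os.path.dirname(os.path.abspath(__file__))
data = compress_json.load(os.path.join(here, "zenodo.json"))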
Example #2
def get_available_graphs_from_repository(repository: str) -> List[str]:
    """Return list of available graphs from the given repositories.

    Parameters
    ----------------------
    repository: str,
        The name of the repository to retrieve the graph from.

    Raises
    ----------------------
    ValueError,
        If the given repository is not available.
    """
    repositories = get_available_repositories()
    if not set_validator(repositories)(repository):
        raise ValueError((
            "The provided repository `{}` is not within the set "
            "of supported repositories, {}.\n"
            "Did you mean `{}`?"
        ).format(
            repository,
            ", ".join(repositories),
            closest(repository, repositories)
        ))

    return list(compress_json.local_load("{repository}.json.gz".format(
        repository=repository
    )).keys())
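A minimal usage sketch; the repository name "string" is an assumption, and any value returned by get_available_repositories() would work:

# Hypothetical call: list the graphs bundled with one repository's metadata.
graphs = get_available_graphs_from_repository("string")
print(len(graphs), "graphs available")

# Passing an unknown name raises a ValueError with a "did you mean" suggestion.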
Example #3

def __init__(self):
    """Create new Monarch Initiative Graph Repository object."""
    super().__init__()
    # We load the data that cannot be automatically scraped
    self._data = compress_json.local_load("monarch_initiative.json")
    # The argument keys used to load this graph
    general_kwargs = {
        "sources_column": "subject",
        "destinations_column": "object",
        "edge_list_edge_types_column": "predicate",
        "nodes_column": "id",
        "node_list_node_types_column": "category",
        "node_types_separator": "|",
        "name": "Monarch"
    }
    # We extend the data by scraping the Google Bucket
    base_url = "https://storage.googleapis.com/monarch-ingest/"
    xml = pd.read_xml(base_url).fillna("NaN")
    xml = xml[xml.Key.str.endswith("/monarch-kg.tar.gz")]
    for path in xml.Key:
        version = path.split("/")[0]
        self._data["Monarch"][version] = {
            "urls": [base_url + path],
            "arguments": {
                "edge_path": "monarch-kg/monarch-kg_edges.tsv",
                "node_path": "monarch-kg/monarch-kg_nodes.tsv",
                **general_kwargs
            }
        }
Example #4
def train(epigenomes, labels, models, kwargs, region, cell_line):
    epigenomes = epigenomes[region].values
    labels = labels[region]

    splits = 10
    holdouts = StratifiedShuffleSplit(n_splits=splits,
                                      test_size=0.2,
                                      random_state=42)

    if os.path.exists(cell_line + "/results_" + region + ".json"):
        results = compress_json.local_load(cell_line + "/results_" + region +
                                           ".json")
    else:
        results = []

    for i, (train, test) in tqdm(enumerate(holdouts.split(epigenomes, labels)),
                                 total=splits,
                                 desc="Computing holdouts",
                                 dynamic_ncols=True):
        for model, params in tqdm(zip(models, kwargs),
                                  total=len(models),
                                  desc="Training models",
                                  leave=False,
                                  dynamic_ncols=True):
            model_name = (model.__class__.__name__
                          if model.__class__.__name__ != "Sequential" else
                          model.name)
            if precomputed(results, model_name, i):
                continue
            model.fit(epigenomes[train], labels[train], **params)
            results.append({
                "model": model_name,
                "run_type": "train",
                "holdout": i,
                **report(labels[train], model.predict(epigenomes[train]))
            })
            results.append({
                "model": model_name,
                "run_type": "test",
                "holdout": i,
                **report(labels[test], model.predict(epigenomes[test]))
            })
            compress_json.local_dump(
                results, cell_line + "/results_" + region + ".json")

    df = pd.DataFrame(results)
    df = df.drop(columns=["holdout"])
    return df
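The precomputed helper is not shown in these snippets; a plausible sketch consistent with how it is called here (purely an assumption about its implementation):

def precomputed(results, model_name: str, holdout: int) -> bool:
    """Return whether results for this (model, holdout) pair were already stored."""
    return any(
        result["model"] == model_name and result["holdout"] == holdout
        for result in results
    )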
Example #5
def last_db_time_get():
    # find the most recently created database dump
    latest_file = max(glob.glob(
        '{}\\jsons\\DB_*.json.gz'.format(db_folder_path)),
                      key=os.path.getctime)
    # load it into a variable
    db_file = compress_json.local_load(latest_file)
    # if the database turned out to be a JSON string rather than a dict
    if not isinstance(db_file, dict):
        # convert it to a dict
        db_file = json.loads(db_file)
    # parse its generation timestamp
    db_time = db_file["generation_timestamp"]
    db_conv_time = datetime.datetime.strptime(db_time, "%Y-%m-%dT%H:%M:%S.%fZ")
    return db_conv_time
Example #6

    def load_graph_data(self, graph_name: str) -> Dict:
        """Return the data stored for the provided graph.

        Parameters
        -----------------------
        graph_name: str,
            Name of graph to retrieve data for.

        Returns
        -----------------------
        The stored data for this graph.
        """
        return compress_json.local_load(
            self.get_graph_data_path(graph_name)
        )
Example #7
def is_normalized_metric(metric: str) -> bool:
    """Return boolean representing if given metric is known to be between 0 and 1.

    Parameters
    ----------
    metric: str
        The metric to check.

    Returns
    -------
    Boolean representing if given metric is known to be between 0 and 1.
    """
    sanitized_metric = sanitize_ml_labels(metric)
    return any(
        candidate in sanitized_metric
        for candidate in compress_json.local_load("normalized_metrics.json"))
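An illustrative call, assuming the metric appears (after sanitization) among the entries of the bundled normalized_metrics.json:

# Hypothetical example: accuracy-style metrics are typically bounded in [0, 1],
# so a plot can fix its y-axis accordingly.
if is_normalized_metric("Accuracy"):
    y_limits = (0.0, 1.0)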
Example #8
def fantom_available_cell_lines(
    root: str = "fantom",
) -> pd.DataFrame:
    """Return supported cell lines available within FANTOM dataset.

    Parameters
    ---------------------------------------
    root: str = "fantom",
        Where to store / load from the downloaded data.

    Returns
    ---------------------------------------
    Return dataframe with the supported cell lines mapped to FANTOM name.
    """
    info = compress_json.local_load("fantom.json")
    path = f"{root}/cell_lines.tsv"
    download(info["cell_lines"], path, cache=True)
    df = pd.read_csv(
        path,
        sep="\t",
        header=None
    )
    cell_lines_names = df[0].str.split("cell line:", expand=True)
    cell_lines_names[1][
        cell_lines_names[0].str.startswith("H1") &
        cell_lines_names[0].str.contains("day00")
    ] = "H1"
    cell_lines_names[1][
        cell_lines_names[0].str.startswith("H9") &
        cell_lines_names[0].str.contains("H9ES")
    ] = "H9"
    nan_mask = pd.notnull(cell_lines_names[1])
    cell_lines_names = cell_lines_names[nan_mask]
    infected_mask = ~cell_lines_names[1].str.contains("infection")
    cell_lines_names = cell_lines_names[infected_mask]
    cell_lines_names[1] = cell_lines_names[1].str.split("/").str[0]
    cell_lines_names[1] = cell_lines_names[1].str.split(",").str[0]
    cell_lines_codes = pd.concat(
        objs=[
            cell_lines_names[1].apply(
                lambda x: x.split("ENCODE")[0].strip()
            ).str.upper().str.replace("-", ""),
            df[nan_mask][infected_mask][1],
        ],
        axis=1
    )
    cell_lines_codes.columns = ["cell_line", "code"]
    return cell_lines_codes.reset_index(drop=True).groupby("cell_line").first().reset_index()
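A short usage sketch; the root folder name is simply where the downloaded TSV is cached:

cell_lines = fantom_available_cell_lines(root="fantom")
# Two columns are returned: the normalized cell line name and its FANTOM code.
print(cell_lines.columns.tolist())  # ['cell_line', 'code']
print(cell_lines.head())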
Example #9
def test_compress_json():
    D = random_string_dict(10, 10)
    key = sha256(D)
    extensions = compress_json.compress_json._DEFAULT_EXTENSION_MAP.keys()
    for ext in extensions:
        path = f"random_dirs/test.json.{ext}"
        compress_json.dump(D, path)
        assert key == sha256(compress_json.load(path))

    shutil.rmtree("random_dirs")

    for ext in extensions:
        path = f"random_dirs/test.json.{ext}"
        compress_json.local_dump(D, path)
        assert key == sha256(compress_json.local_load(path))

    shutil.rmtree("tests/random_dirs")
Example #10
def get_vend_data(db_file, vend_array_to_fill, buy_array_to_fill):
    """
    Take a db file and fill two arrays with its data.
    Parameters
    ----------
    db_file : filepath
        db file itself.
    vend_array_to_fill : list
        array that will hold value for "V" shops.
    buy_array_to_fill : list
        array that will hold value for "B" shops.

    Returns
    -------
    None.

    """
    # open the compressed database file

    _shop = compress_json.local_load(db_file)
    if not isinstance(_shop, dict):
        # convert it to a dict
        _shop = json.loads(_shop)
    _shop = _shop['shops']
    #with open(db_file, encoding="utf8") as vend_data_file:
    # build an array containing only the shops
    #    _shop = np.array(json.load(vend_data_file)['shops'])
    for _i in _shop:
        # append each shop entry one after another
        _t = {
            "owner": _i["owner"],
            "location": _i["location"]["map"],
            "creation_date": datetime.datetime.strptime(_i["creation_date"],
                                                        "%Y-%m-%dT%H:%M:%SZ"),
            "items": _i["items"]
        }
        if _i['type'] == 'V':
            vend_array_to_fill.append(_t)
        else:
            buy_array_to_fill.append(_t)
Example #11
def get_available_versions_from_graph_and_repository(name: str, repository: str) -> List[str]:
    """Return list of available graphs from the given repositories.

    Parameters
    ----------------------
    name: str,
        The name of the graph to retrieve.
    repository: str,
        The name of the repository to retrieve the graph from.

    Raises
    ----------------------
    ValueError,
        If the given repository is not available.
    """
    return list(compress_json.local_load(
        "{}.json.gz".format(repository),
        use_cache=True
    )[name].keys())
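A minimal usage sketch; the graph and repository names below are illustrative assumptions:

# Hypothetical call: list the stored versions of a graph in a repository's metadata.
versions = get_available_versions_from_graph_and_repository(
    name="STRING",
    repository="string",
)
print(versions)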
Example #12
    def get_chembl_assays(self, start=0, end=100000, step=10000):
        """Get ChEMBL assays by querying the ChEMBL Assay Resource.

        Args:
            start: query start
            end: query end
            step: page size

        Returns:
            A list of ChEMBL assay records
        """
        url = 'https://www.ebi.ac.uk/chembl/elk/es/chembl_27_assay/_search'
        query_data = compress_json.local_load('chembl_assay_query.json')
        query_end = self.estimate_records(url, query_data, start, end)
        assays = []
        for i in range(start, query_end, step):
            assays.extend(self.get_records(url, query_data, i, min(i + step, query_end)))
        # Use a context manager so the output file is always closed
        with open(os.path.join(self.input_base_dir, 'chembl_assay_records.json'), 'w') as output:
            json.dump(assays, output)
        return assays
Example #13
def apply_replace_defaults(labels: List[str],
                           custom_defaults: Dict[str, List[str]]) -> List[str]:
    """Return list of labels with replaced defaults."""
    defaults = {
        **{
            key: ["(?<![a-z]){}(?![a-z])".format(val) for val in values]
            for key, values in compress_json.local_load("labels.json").items()
        },
        **custom_defaults
    }
    new_labels = []
    for label in labels:
        replace_candidates = []
        for default, targets in defaults.items():
            for target in targets:
                regex = re.compile(target, re.IGNORECASE)
                matches = regex.findall(label)
                if bool(matches):
                    for match in matches:
                        replace_candidates.append((match, default))

        # The following is required to avoid replacing substrings.

        replace_candidates = sorted(replace_candidates,
                                    key=lambda x: len(x[0]),
                                    reverse=False)

        replace_candidates = [(j, val)
                              for i, (j, val) in enumerate(replace_candidates)
                              if all(j not in k.lower()
                                     for _, k in replace_candidates[i + 1:])]

        replace_candidates = sorted(replace_candidates,
                                    key=lambda x: len(x[0]),
                                    reverse=True)

        for target, default in replace_candidates:
            label = label.replace(target, default)
        new_labels.append(label)
    return new_labels
Example #14

    def __init__(self,
                 name: str,
                 directed: bool = False,
                 verbose: int = 2,
                 cache_path: str = "graphs"):
        """Create new automatically retrieved graph.

        Parameters
        -------------------
        name: str,
            The name of the graph to be retrieved and loaded.
        directed: bool = False,
            Whether to load the graph as directed or undirected.
            By default false.
        verbose: int = 2,
            Whether to show loading bars.
        cache_path: str = "graphs",
            Where to store the downloaded graphs.

        Raises
        -------------------
        ValueError,
            If the given graph name is not available.
        """
        graphs = compress_json.local_load("graphs.json")
        if name not in graphs:
            raise ValueError(
                ("Requested graph `{}` is not currently available.\n"
                 "Open an issue on the EnsmallenGraph repository to ask "
                 "for this graph to be added.").format(name))
        self._graph = graphs[name]
        self._directed = directed
        self._name = name
        self._verbose = verbose
        self._cache_path = os.path.join(cache_path, name)
        self._downloader = BaseDownloader(auto_extract=True,
                                          target_directory=self._cache_path,
                                          verbose=self._verbose)
Example #15
def roadmap_available_cell_lines(root: str) -> pd.DataFrame:
    """Return Roadmap supported available cell lines.

    Parameters
    ---------------------------------------
    root: str,
        Where to store / load from the downloaded data.

    Returns
    ---------------------------------------
    Return dataframe with the cell lines available in the Roadmap dataset.
    """
    info = compress_json.local_load("roadmap.json")
    filename = f"{root}/cell_lines.tsv"
    download(info["cell_lines"], filename, cache=True)
    cell_lines_codes = pd.read_csv(filename, sep="\t")
    cell_lines_codes = cell_lines_codes[
        (cell_lines_codes.TYPE != "ESCDerived")
        & cell_lines_codes.GROUP.isin(["ENCODE2012", "ESC", "IMR90"])]
    cell_lines_codes["cell_line"] = cell_lines_codes.MNEMONIC.str.split(
        ".").str[1].str.replace("-", "")
    cell_lines_codes["code"] = cell_lines_codes.EID
    return cell_lines_codes[["cell_line", "code"]].reset_index(drop=True)
Example #16
    def run(self, data_file: Optional[str] = None) -> None:
        """Method is called and performs needed transformations to process
        protein-protein interactions from the STRING DB data.

        Args:
            data_file: data file to parse

        Returns:
            None.

        """
        if not data_file:
            data_file = os.path.join(self.input_base_dir,
                                     "9606.protein.links.full.v11.0.txt.gz")
        os.makedirs(self.output_dir, exist_ok=True)
        protein_node_type = "biolink:Protein"
        edge_label = "biolink:interacts_with"
        self.node_header = compress_json.local_load("node_header.json")
        edge_core_header = compress_json.local_load("edge_core_header.json")
        edge_additional_headers = compress_json.local_load(
            "edge_additional_headers.json")

        self.edge_header = edge_core_header + edge_additional_headers
        relation = 'RO:0002434'
        seen_proteins: Set = set()
        seen_genes: Set = set()

        # Required to align the node edge header of the gene
        # with the default header
        self.extra_header = [""] * (len(edge_additional_headers) + 1)

        # make string ENSP to Uniprot id mapping dict
        string_to_uniprot_id_map = uniprot_make_name_to_id_mapping(
            os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING))

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(data_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())
            for line in interactions:
                items_dict = parse_stringdb_interactions(line, header_items)
                proteins = []
                for protein_name in ('protein1', 'protein2'):
                    nat_string_id = get_item_by_priority(
                        items_dict, [protein_name])
                    protein = '.'.join(nat_string_id.split('.')[1:])
                    proteins.append(protein)

                    if protein in self.protein_gene_map:
                        gene = self.protein_gene_map[protein]
                        if gene not in seen_genes:
                            seen_genes.add(gene)
                            ensemble_gene = f"ENSEMBL:{gene}"
                            gene_informations = self.gene_info_map[
                                self.ensembl2ncbi_map[gene]]
                            write_node_edge_item(
                                fh=node,
                                header=self.node_header,
                                data=[
                                    ensemble_gene, gene_informations['symbol'],
                                    'biolink:Gene',
                                    gene_informations['description'],
                                    f"NCBIGene:{self.ensembl2ncbi_map[gene]}",
                                    self.source_name
                                ])
                            write_node_edge_item(
                                fh=edge,
                                header=self.edge_header,
                                data=[
                                    ensemble_gene, "biolink:has_gene_product",
                                    f"ENSEMBL:{protein}", "RO:0002205", "NCBI",
                                    ""
                                ] + self.extra_header)

                    # write node data
                    if protein not in seen_proteins:
                        seen_proteins.add(protein)

                        # if we have an equivalent Uniprot ID for this Ensembl protein
                        # ID make an xref edge, and a node for the Uniprot ID
                        uniprot_curie = ''
                        if protein in string_to_uniprot_id_map:
                            uniprot_curie = \
                                f"UniProtKB:{string_to_uniprot_id_map[protein]}"
                            uniprot_curie = collapse_uniprot_curie(
                                uniprot_curie)

                        write_node_edge_item(
                            fh=node,
                            header=self.node_header,
                            data=[
                                f"ENSEMBL:{protein}",
                                "",
                                protein_node_type,
                                "",
                                uniprot_curie,  # xref
                                self.source_name
                            ])

                # write edge data
                write_node_edge_item(
                    fh=edge,
                    header=self.edge_header,
                    data=[
                        f"ENSEMBL:{proteins[0]}", edge_label,
                        f"ENSEMBL:{proteins[1]}", relation, "STRING",
                        "biolink:Association", items_dict['combined_score']
                    ] + [
                        items_dict.get(header, "")
                        for header in edge_additional_headers
                    ])
Example #17

def __init__(self):
    """Create new FreeBase Graph Repository object."""
    super().__init__()
    self._data = compress_json.local_load("freebase.json")
Example #18
def train_sequence(epigenomes, labels, genome, cell_line, region, models):

    bed = epigenomes[region].reset_index()[epigenomes[region].index.names]

    splits = 2
    holdouts = StratifiedShuffleSplit(n_splits=splits,
                                      test_size=0.2,
                                      random_state=42)

    if os.path.exists(cell_line + "/sequence_" + region + ".json"):
        results = compress_json.local_load(cell_line + "/sequence_" + region +
                                           ".json")
    else:
        results = []

    for i, (train_index,
            test_index) in tqdm(enumerate(holdouts.split(bed, labels[region])),
                                total=splits,
                                desc="Computing holdouts",
                                dynamic_ncols=True):
        train, test = get_holdout(train_index, test_index, bed, labels[region],
                                  genome)
        for model in tqdm(models,
                          total=len(models),
                          desc="Training models",
                          leave=False,
                          dynamic_ncols=True):
            if precomputed(results, model.name, i):
                continue
            history = model.fit(train,
                                steps_per_epoch=train.steps_per_epoch,
                                validation_data=test,
                                validation_steps=test.steps_per_epoch,
                                epochs=100,
                                shuffle=True,
                                verbose=False,
                                callbacks=[
                                    EarlyStopping(monitor="val_loss",
                                                  mode="min",
                                                  patience=50),
                                ]).history
            scores = pd.DataFrame(history).iloc[-1].to_dict()
            results.append({
                "model": model.name,
                "run_type": "train",
                "holdout": i,
                **{
                    key: value
                    for key, value in scores.items() if not key.startswith("val_")
                }
            })
            results.append({
                "model": model.name,
                "run_type": "test",
                "holdout": i,
                **{
                    key[4:]: value
                    for key, value in scores.items() if key.startswith("val_")
                }
            })
            compress_json.local_dump(
                results, cell_line + "/sequence_" + region + ".json")

    df = pd.DataFrame(results)
    df = df.drop(columns=["holdout"])
    return df
Example #19
def get_df(cell_line, data_type, region_type):
    results = compress_json.local_load(cell_line + "_" + region_type + "_" +
                                       data_type + ".json")
    df = pd.DataFrame(results).drop(columns="holdout")
    return df[(df.run_type == "test")]
Example #20
def train_model_epi(models, epigenomes, nlabels, region_type, cell_line):
    # Reprod
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(42)

    y = nlabels[region_type].values.ravel()
    X = epigenomes[region_type]
    print("Num feature: " + str(X.shape[1]))
    splits = 51
    holdouts = StratifiedShuffleSplit(n_splits=splits,
                                      test_size=0.2,
                                      random_state=42)
    class_w = class_weight.compute_class_weight('balanced', np.unique(y), y)
    class_w = dict(enumerate(class_w))
    print("Class weights: " + str(class_w))

    if os.path.exists(cell_line + "_" + region_type + "_epigenomic.json"):
        results = compress_json.local_load(cell_line + "_" + region_type +
                                           "_epigenomic.json")
    else:
        results = []

    for i, (train, test) in tqdm(enumerate(holdouts.split(X, y)),
                                 total=splits,
                                 desc="Computing holdouts",
                                 dynamic_ncols=True):
        for model in tqdm(models,
                          total=len(models),
                          desc="Training models",
                          leave=False,
                          dynamic_ncols=True):
            model_name = (model.__class__.__name__
                          if model.__class__.__name__ != "Sequential" else
                          model.name)
            if precomputed(results, model_name, i):
                continue

            model.fit(X[train],
                      y[train],
                      epochs=1000,
                      shuffle=True,
                      verbose=False,
                      validation_split=0.1,
                      batch_size=1024,
                      class_weight=class_w,
                      callbacks=[
                          EarlyStopping(monitor="val_loss",
                                        mode="min",
                                        patience=50,
                                        restore_best_weights=True),
                      ])
            results.append({
                "model": model_name,
                "run_type": "train",
                "holdout": i,
                **report(y[train], model.predict(X[train]))
            })
            results.append({
                "model": model_name,
                "run_type": "test",
                "holdout": i,
                **report(y[test], model.predict(X[test]))
            })
            compress_json.local_dump(
                results, cell_line + "_" + region_type + "_epigenomic.json")

    # Build the summary dataframe once, after all holdouts have been processed
    df = pd.DataFrame(results)
    df = df.drop(columns=["holdout"])
    return df
Example #21
    def __init__(
        self,
        name: str,
        version: str,
        repository: str,
        directed: bool = False,
        preprocess: Union[bool, str] = "auto",
        load_nodes: bool = True,
        load_node_types: bool = True,
        load_edge_weights: bool = True,
        auto_enable_tradeoffs: bool = True,
        sort_tmp_dir: Optional[str] = None,
        verbose: int = 2,
        cache: bool = True,
        cache_path: Optional[str] = None,
        cache_sys_var: str = "GRAPH_CACHE_DIR",
        graph_kwargs: Dict = None,
        hash_seed: str = None,
        callbacks: List[Callable] = (),
        callbacks_arguments: List[Dict] = (),
    ):
        """Create new automatically retrieved graph.

        Parameters
        -------------------
        name: str
            The name of the graph to be retrieved and loaded.
        version: str
            The version of the graph to be retrieved.
        repository: str
            Name of the repository to load data from.
        directed: bool = False
            Whether to load the graph as directed or undirected.
            By default false.
        preprocess: Union[bool, str] = "auto"
            Whether to preprocess the node list and edge list
            to be loaded optimally in both time and memory.
            Will automatically preprocess in Linux and macOS
            and avoid doing this on Windows.
        load_nodes: bool = True
            Whether to load the nodes vocabulary or treat the nodes
            simply as a numeric range.
            This feature is only available when the preprocessing is enabled.
        load_node_types: bool = True
            Whether to load the node types if available or skip them entirely.
            This feature is only available when the preprocessing is enabled.
        load_edge_weights: bool = True
            Whether to load the edge weights if available or skip them entirely.
            This feature is only available when the preprocessing is enabled.
        auto_enable_tradeoffs: bool = True
            Whether to enable the Ensmallen time-memory tradeoffs in small graphs
            automatically. By default True, that is, if a graph has less than
            50 million edges. In such use cases the memory expenditure is minimal.
        sort_tmp_dir: Optional[str] = None
            Which folder to use to store the temporary files needed to sort in 
            parallel the edge list when building the optimal preprocessed file.
            This defaults to the same folder of the edge list when no value is 
            provided.
        verbose: int = 2
            Whether to show loading bars.
        cache: bool = True
            Whether to use cache, i.e. download files only once
            and preprocess them only once.
        cache_path: Optional[str] = None
            Where to store the downloaded graphs.
            If no path is provided, we first check whether the system variable
            provided below is set, and otherwise we use the directory `graphs`.
        cache_sys_var: str = "GRAPH_CACHE_DIR"
            The system variable with the default graph cache directory.
        graph_kwargs: Dict = None
            Eventual additional kwargs for loading the graph.
        hash_seed: str = None
            Seed to use for the hash.
        callbacks: List[Callable] = ()
            Eventual callbacks to call after download files.
        callbacks_arguments: List[Dict] = ()
            Eventual arguments for callbacks.

        Raises
        -------------------
        ValueError,
            If the given graph name is not available.
        ValueError,
            If the preprocess flag is provided but the system
            is Windows, which does not provide the sort command.
        """
        try:
            validate_graph_version(name, repository, version)

            all_versions = compress_json.local_load(
                "{}.json.gz".format(repository)
            )[name]

            self._graph = all_versions[version]
        except KeyError:
            raise ValueError(
                (
                    "Requested graph `{}` is not currently available.\n"
                    "Open an issue on the Graph repository to ask "
                    "for this graph to be added."
                ).format(name)
            )

        if preprocess == "auto":
            preprocess = is_macos() or is_linux()

        if preprocess and is_windows():
            raise ValueError(
                "Currently preprocessing to optimal edge list is not supported "
                "on Windows because the sorting step is based upon the `sort` "
                "command, which is only available to our knowledge on Linux and "
                "macOS systems."
            )

        # If the cache path was not provided
        # we either check the system variable
        # and if it is not set we use `graphs`
        if cache_path is None:
            cache_path = os.getenv(cache_sys_var, "graphs")

        cache_path = os.path.join(cache_path, repository)

        self._directed = directed
        self._preprocess = preprocess
        self._load_nodes = load_nodes
        self._load_node_types = load_node_types
        self._load_edge_weights = load_edge_weights
        self._name = name
        self._repository = repository
        self._version = version
        self._auto_enable_tradeoffs = auto_enable_tradeoffs
        self._sort_tmp_dir = sort_tmp_dir
        self._cache = cache
        self._verbose = verbose
        self._callbacks = callbacks
        if graph_kwargs is None:
            graph_kwargs = {}
        self._graph_kwargs = graph_kwargs
        self._callbacks_arguments = callbacks_arguments
        self._instance_hash = sha256({
            "hash_seed": hash_seed,
            **self._graph,
            **self._graph_kwargs,
        })
        self._cache_path = os.path.join(
            cache_path,
            name,
            version
        )
        self._downloader = BaseDownloader(
            auto_extract=True,
            cache=cache,
            target_directory=self._cache_path,
            verbose=self._verbose,
            process_number=1
        )
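How such a class might be instantiated, as a hedged sketch: the class name AutomaticallyRetrievedGraph and the graph/repository/version values below are assumptions, not taken from the snippet:

# Hypothetical instantiation: download, preprocess and cache the graph files;
# building the actual graph object is typically a separate call on this instance.
retrieved = AutomaticallyRetrievedGraph(
    name="STRING",
    version="11.5",
    repository="string",
    directed=False,
    cache_path="graphs",
)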
Example #22
def train(sess, env, actor, critic, actor_noise, buffer_size, min_batch, ep):

    if "--save" in sys.argv:
        saver = tf.compat.v1.train.Saver()

    if "--load" in sys.argv:
        print("loading weights")
        loader = tf.compat.v1.train.Saver()
        arg_index = sys.argv.index("--load")
        save_name = sys.argv[arg_index + 1]
        loader.restore(sess, "savedir/" + save_name + "/save")
        print("weights loaded")
    else:
        sess.run(tf.compat.v1.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(buffer_size, 0)

    if "--loadBuff" in sys.argv:
        arg_index = sys.argv.index("--loadBuff")
        buffPath = sys.argv[arg_index + 1]
        print("loading buffer")
        tempBuff = compress_json.local_load("preTrain/" + buffPath +
                                            ".json.gz")
        nb = buffer_size / len(tempBuff["action"])
        for i in range(int(nb)):
            for s, a, r, d, s1 in zip(tempBuff["state"], tempBuff["action"],
                                      tempBuff["reward"], tempBuff["done"],
                                      tempBuff["next_state"]):
                replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                                  np.reshape(a, (actor.a_dim, )), r, d,
                                  np.reshape(s1, (actor.s_dim, )))

        print("buffer loaded")

    max_episodes = ep
    max_steps = 200
    score_list = []
    tcostlist = []

    tic = time.time()

    for i in range(max_episodes):

        state = env.reset()
        state = np.concatenate([state["observation"], state["desired_goal"]])
        score = 0
        cost = 0
        costs = []
        actor_noise.reset()

        if (i % 10 == 0):
            #print("serious:")
            explo = 0
        else:
            explo = 1

        for j in range(max_steps):

            if '--visu' in sys.argv:
                env.render()

            action = np.clip(
                actor.predict(np.reshape(state, (1, actor.s_dim))) +
                actor_noise() * explo, -1, 1)

            #print(action)
            next_state, reward, done, info = env.step(action.reshape(4, ))
            next_state = np.concatenate(
                [next_state["observation"], next_state["desired_goal"]])
            replay_buffer.add(np.reshape(state, (actor.s_dim, )),
                              np.reshape(action, (actor.a_dim, )), reward,
                              done, np.reshape(next_state, (actor.s_dim, )))

            # updating the network in batch
            if replay_buffer.size() < min_batch:
                continue

            states, actions, rewards, dones, next_states = replay_buffer.sample_batch(
                min_batch)
            target_q = critic.predict_target(next_states,
                                             actor.predict_target(next_states))

            y = []
            for k in range(min_batch):
                y.append(rewards[k] + critic.gamma * target_q[k] *
                         (1 - dones[k]))

            # Update the critic given the targets
            predicted_q_value, _ = critic.train(states, actions,
                                                np.reshape(y, (min_batch, 1)))
            cost = y - predicted_q_value
            costs.append(cost)

            # Update the actor policy using the sampled gradient
            a_outs = actor.predict(states)
            grads = critic.action_gradients(states, a_outs)
            actor.train(states, grads[0])

            # Update target networks
            actor.update_target_network()
            critic.update_target_network()

            state = next_state
            score += reward

            tac = time.time()
            print("\033[0;1;4;97m", end='')
            print("Episode:", end="")
            print("\033[0;97m", end='')
            print(" {}    ".format(i), end='')
            print("\033[3;4;91m", end='')
            print("temps total : {} secondes\r".format(int(tac - tic)), end='')

            if done:
                break

        tcost = np.mean(costs)
        tcostlist.append(tcost)

        score_list.append(score)

        if i % 10 == 0:
            print("\033[0;1;4;97m", end='')
            print("Episode:", end="")
            print("\033[0;97m", end='')
            print(" {}                                             ".format(i))
            print("total reward: {:.5}  avg reward (last 10): {:.5}".format(
                score, np.mean(score_list[max(0, i - 10):(i + 1)])))
            print("cost: {:.5}  avg cost (last 10): {:.5}".format(
                tcost, np.mean(tcostlist[max(0, i - 10):(i + 1)])))
            if "--save" in sys.argv:
                arg_index = sys.argv.index("--save")
                save_name = sys.argv[arg_index + 1]
                saver.save(sess, "savedir/" + save_name + "/save")

    print("\033[3;4;91m", end='')
    print("temps total : {} secondes".format(int(tac - tic)))

    return score_list
Example #23
def roadmap(cell_lines: Union[List[str], str],
            window_size: int,
            genome: str = "hg19",
            root: str = "roadmap",
            states: int = 18,
            enhancers_labels: List[str] = ("7_Enh", "9_EnhA1", "10_EnhA2"),
            promoters_labels: List[str] = ("1_TssA", ),
            nrows: int = None):
    """Runs the pipeline over the roadmap raw data.

    Parameters
    -----------------------------
    cell_lines: List[str],
        List of cell lines to be considered.
    window_size: int,
        Window size to use for the various regions.
    genome: str= "hg19",
        Considered genome version. Currently supported only "hg19".
    states: int = 18,
        Number of the states of the model to consider. Currently supported only "15" and "18".
    enhancers_labels: List[str] = ("7_Enh", "9_EnhA1", "10_EnhA2"),
        Labels to encode as active enhancers.
    promoters_labels: List[str] = ("1_TssA",),
        Labels to enode as active promoters
    nrows:int=None,
        the number of rows to read, usefull when testing pipelines for creating smaller datasets.

    Raises
    -------------------------------
    ValueError:
        If given cell lines list is empty.
    ValueError:
        If given cell lines are not strings.
    ValueError:
        If given window size is not an integer.
    ValueError:
        If given window size is not a strictly positive integer.
    ValueError:
        If given genome version is not a string.
    ValueError:
        If given nrows parameter is not None or a strictly positive integer.
    ValueError:
        If the model with *states* states is not currently supported with given genome *genome*.

    Returns
    -------------------------------
    Tuple containing the enhancer and promoter dataframes for the chosen cell lines.
    """

    info = compress_json.local_load("roadmap.json")
    validate_common_parameters(cell_lines, [window_size], genome, info)
    cell_lines = normalize_cell_lines(cell_lines)
    if str(states) not in info[genome]["states_model"]:
        raise ValueError(
            "The model with {states} states is not currently supported with given genome {genome}."
            .format(states=states, genome=genome))

    cell_lines_names = filter_cell_lines(
        root,
        cell_lines,
    )

    url = info[genome]["states_model"][str(states)]
    enhancers_list, promoters_list = list(
        zip(*[(enhancers, promoters)
              for cell_line, code in tqdm(cell_lines_names.values,
                                          desc="Cell lines")
              for enhancers, promoters in (get_cell_line(
                  root, cell_line, states, genome, enhancers_labels,
                  promoters_labels, url.format(code=code), nrows), )
              if enhancers is not None and promoters is not None]))
    enhancers = pd.concat(enhancers_list, axis=1).fillna(0).astype(
        int)  # Encode inactive enhancers as zeros
    promoters = pd.concat(promoters_list, axis=1).fillna(0).astype(
        int)  # Encode inactive promoters as zeros

    # Adapt to given window size
    enhancers = enhancers.reset_index()
    promoters = promoters.reset_index()
    enhancers = center_window(enhancers, window_size)
    promoters = center_window(promoters, window_size)

    enhancers = normalize_bed_file(cell_lines, enhancers)
    promoters = normalize_bed_file(cell_lines, promoters)

    return enhancers, promoters
Example #24
    def run(self, data_file: str = None) -> None:
        """Method is called and performs needed transformations to process
        protein-protein interactions from the STRING DB data.

        Args:
            data_file: data file to parse

        Returns:
            None.

        """
        if not data_file:
            data_file = os.path.join(self.input_base_dir,
                                     "9606.protein.links.full.v11.0.txt.gz")
        os.makedirs(self.output_dir, exist_ok=True)
        protein_node_type = "biolink:Protein"
        edge_label = "biolink:interacts_with"
        self.node_header = compress_json.local_load("node_header.json")
        edge_core_header = compress_json.local_load("edge_core_header.json")
        edge_additional_headers = compress_json.local_load(
            "edge_additional_headers.json")

        self.edge_header = edge_core_header + edge_additional_headers
        relation = 'RO:0002434'
        seen_proteins: Set = set()
        seen_genes: Set = set()

        # Required to align the node edge header of the gene
        # with the default header
        extra_header = [""] * (len(edge_additional_headers) + 1)

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(data_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())
            for line in interactions:
                items_dict = parse_stringdb_interactions(line, header_items)
                proteins = []
                for protein_name in ('protein1', 'protein2'):
                    protein = get_item_by_priority(items_dict, [protein_name])
                    protein = '.'.join(protein.split('.')[1:])
                    proteins.append(protein)
                    if protein in self.protein_gene_map:
                        gene = self.protein_gene_map[protein]
                        if gene not in seen_genes:
                            seen_genes.add(gene)
                            ensemble_gene = f"ENSEMBL:{gene}"
                            gene_informations = self.gene_info_map[
                                self.ensembl2ncbi_map[gene]]
                            write_node_edge_item(
                                fh=node,
                                header=self.node_header,
                                data=[
                                    ensemble_gene, gene_informations['symbol'],
                                    'biolink:Gene',
                                    gene_informations['description'],
                                    f"NCBIGene:{self.ensembl2ncbi_map[gene]}"
                                ])
                            write_node_edge_item(
                                fh=edge,
                                header=self.edge_header,
                                data=[
                                    ensemble_gene,
                                    "biolink:has_gene_product",
                                    protein,
                                    "RO:0002205",
                                    "NCBI",
                                ] + extra_header)

                        # write node data
                        if protein not in seen_proteins:
                            seen_proteins.add(protein)
                            write_node_edge_item(fh=node,
                                                 header=self.node_header,
                                                 data=[
                                                     f"ENSEMBL:{protein}", "",
                                                     protein_node_type, "", ""
                                                 ])

                # write edge data
                write_node_edge_item(
                    fh=edge,
                    header=self.edge_header,
                    data=[
                        proteins[0], edge_label, proteins[1], relation,
                        "STRING", items_dict['combined_score']
                    ] + [
                        items_dict.get(header, "")
                        for header in edge_additional_headers
                    ])
Example #25
def sanitize_ml_labels(
        labels: Union[List[str], str],
        upper_case_consonants_clusters: bool = True,
        replace_with_spaces: List[str] = ("-", "_", ":", "<", ">"),
        detect_and_remove_homogeneous_descriptors: bool = True,
        replace_defaults: bool = True,
        soft_capitalization: bool = True,
        custom_defaults: Dict[str, Union[List[str], str]] = None) -> List[str]:
    """Return sanitized labels in standard way.

    Parameters
    ----------
    labels: Union[List[str], str]
        Either a label or a list of labels to sanitize.
    upper_case_consonants_clusters: bool = True
        Whether to convert detected initials to upper case.
    replace_with_spaces: List[str] = ("-", "_", ":", "<", ">")
        Characters to be replaced with spaces.
    detect_and_remove_homogeneous_descriptors: bool = True
        Whether to remove known descriptors when all terms contain them.
    replace_defaults: bool = True
        Whether to replace default terms.
    soft_capitalization: bool = True
        Whether to apply soft capitalization,
        replacing capitalization only when no capitalization is already present.
    custom_defaults: Dict[str, Union[List[str], str]] = None
        Optional custom mappings from a canonical term to the variants it should replace.

    Returns
    -------
    Sanitized labels.
    """

    try:
        iter(labels)
        is_iterable = True
    except TypeError:
        is_iterable = False

    single_label = not is_iterable or isinstance(labels, str)
    if single_label:
        labels = [labels]

    labels = to_string(labels)

    if detect_and_remove_homogeneous_descriptors:
        generic_words_cooccurring_with_descriptors = compress_json.local_load(
            "generic_words_cooccurring_with_descriptors.json")
        for descriptor in compress_json.local_load("descriptors.json"):
            if have_descriptor(labels, descriptor,
                               generic_words_cooccurring_with_descriptors):
                labels = remove_descriptor(labels, descriptor)

    if soft_capitalization:
        labels = apply_soft_capitalization(labels)

    if replace_defaults:
        if custom_defaults is None:
            custom_defaults = dict()

        custom_defaults = dict([(key, value) if isinstance(value, list) else
                                (key, [value])
                                for key, value in custom_defaults.items()])
        labels = apply_replace_defaults(labels, custom_defaults)

    labels = [
        targets_to_spaces(label, replace_with_spaces) for label in labels
    ]

    labels = clear_spaces(labels)

    if soft_capitalization:
        labels = apply_soft_capitalization(labels)

    if upper_case_consonants_clusters:
        labels = [consonants_to_upper(label) for label in labels]

    if single_label:
        return labels[0]
    return labels
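Illustrative calls; the exact outputs depend on the bundled labels.json and descriptors.json dictionaries, so the expected values in the comments are indicative rather than guaranteed:

print(sanitize_ml_labels("vanilla_mlp"))        # e.g. "Vanilla MLP"
print(sanitize_ml_labels(["auroc", "auprc"]))   # e.g. ["AUROC", "AUPRC"]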
Example #26
def __init__(self):
    """Create new JAX Graph Repository object."""
    super().__init__()
    self._data = compress_json.local_load("jax.json")
Example #27

def __init__(self):
    """Create new LINQS Graph Repository object."""
    super().__init__()
    self._data = compress_json.local_load("linqs.json")
Example #28

def _load_unsupported_graphs(self) -> Set[str]:
    """Return set of known unsupported graphs."""
    try:
        return compress_json.local_load(self.unsupported_graphs_path)
    except Exception:
        # Fall back to an empty set when no unsupported-graph file exists yet,
        # matching the annotated return type.
        return set()
Example #29
def main():
    with tf.compat.v1.Session() as sess:

        tic = time.time()

        env = customEnv()

        if "--mstep" in sys.argv:
            arg_index = sys.argv.index("--mstep")
            micro_stepping = int(sys.argv[arg_index + 1])
        else:
            micro_stepping = 1

        if "--ep" in sys.argv:
            arg_index = sys.argv.index("--ep")
            ep = int(sys.argv[arg_index + 1])
        else:
            ep = 10000

        tau = 0.001
        gamma = 0.99
        min_batch = 64
        actor_lr = 0.0001
        critic_lr = 0.001
        buffer_size = 1000000
        layers = [300]

        state_dim =  (env.observation_space["observation"].shape[0] + env.observation_space["desired_goal"].shape[0])*micro_stepping
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound, layers, actor_lr, tau, min_batch)
        critic = CriticNetwork(sess, state_dim, action_dim, layers, critic_lr, tau, gamma, actor.get_num_trainable_vars())

        action_wanted = tf.compat.v1.placeholder(tf.float32, (None, action_dim))
        reward_wanted = tf.compat.v1.placeholder(tf.float32, (None, 1))

        actor_target = tf.reduce_mean(tf.square(actor.out-action_wanted))
        critic_target = tf.reduce_mean(tf.square(critic.out-reward_wanted))

        actor_train = tf.compat.v1.train.AdamOptimizer(actor_lr).minimize(actor_target)
        critic_train = tf.compat.v1.train.AdamOptimizer(critic_lr).minimize(critic_target)


        update_target_network_actor = [actor.target_network_params[i].assign(actor.network_params[i]) for i in range(len(actor.target_network_params))]
        update_target_network_critic = [critic.target_network_params[i].assign(critic.network_params[i]) for i in range(len(critic.target_network_params))]

        print("\033[0;1;32m")
        print("===================")
        print("LE DEBUT")
        print("===================")

        print("loading buffer")
        arg_index = sys.argv.index("--loadBuff")
        buffPath = sys.argv[arg_index + 1]
        buffer = compress_json.local_load("preTrain/"+buffPath+".json.gz")
        print("buffer loaded")

        sess.run(tf.compat.v1.global_variables_initializer())

        saver = tf.compat.v1.train.Saver()

        i = 0
        while i < ep:
            i += 1
            states, actions, rewards = sample(buffer,min_batch)

            sess.run(actor_train,{actor.inputs: states, action_wanted: actions})
            sess.run(critic_train,{critic.inputs: states, critic.action: actions, reward_wanted: np.reshape(rewards,(min_batch,1))})

            print("\033[0;1;4;97m", end='')
            print("miniBatch {} / {}".format(i,ep), end='')
            print("\033[0;m     ", end='')
            tac = time.time()
            print("\033[3;91m", end='')
            print("{} secondes".format(int(tac - tic)), end='')
            print("\033[0;m                  \r", end='')


        sess.run(update_target_network_actor)
        sess.run(update_target_network_critic)

        arg_index = sys.argv.index("--save")
        save_name = sys.argv[arg_index + 1]
        saver.save(sess, "savedir/" + save_name+"/save")
        print("\033[0;1;32m")
        print("session saved at : " + save_name)


    return 0
Example #30
def train_model_seq(models, epigenomes, nlabels, region_type, cell_line):
    # Reprod
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(42)

    splits = 11
    holdouts = StratifiedShuffleSplit(
        n_splits=splits, test_size=0.2, random_state=42)
    genome = Genome("hg19")
    bed = to_bed(epigenomes[region_type])
    labels = nlabels[region_type].values.ravel()
    if os.path.exists(cell_line + "_" + region_type + "_sequence.json"):
        results = compress_json.local_load(
            cell_line + "_" + region_type + "_sequence.json")
    else:
        results = []
    class_w = class_weight.compute_class_weight(
        'balanced', np.unique(labels), labels)
    class_w = dict(enumerate(class_w))
    print("Class weights: " + str(class_w))

    for i, (train_index, test_index) in tqdm(enumerate(holdouts.split(bed, labels)), total=splits, desc="Computing holdouts", dynamic_ncols=True):
        train, test = get_holdout(
            train_index, test_index, bed, labels, genome, 1024)
        print("="*80)
        for model in tqdm(models, total=len(models), desc="Training models", leave=False, dynamic_ncols=True):
            if precomputed(results, model.name, i):
                continue
            history = model.fit(
                train,
                steps_per_epoch=train.steps_per_epoch,
                validation_data=test,
                validation_steps=test.steps_per_epoch,
                epochs=1000,
                shuffle=True,
                verbose=False,
                class_weight=class_w,
                callbacks=[
                    EarlyStopping(monitor="val_loss", mode="min",
                                  patience=50, restore_best_weights=True),
                ]
            ).history
            scores = pd.DataFrame(history).iloc[-1].to_dict()
            results.append({
                "model": model.name,
                "run_type": "train",
                "holdout": i,
                **{
                    key: value
                    for key, value in scores.items()
                    if not key.startswith("val_")
                }
            })
            results.append({
                "model": model.name,
                "run_type": "test",
                "holdout": i,
                **{
                    key[4:]: value
                    for key, value in scores.items()
                    if key.startswith("val_")
                }
            })
            compress_json.local_dump(
                results, cell_line + "_" + region_type + "_sequence.json")

    # Build the summary dataframe once, after all holdouts have been processed
    df = pd.DataFrame(results).drop(columns="holdout")
    return df