def __init__(self, ressource_name, version, dbfrom, db, ns_linking_id,
              ns_linked_id, ns_endpoint, primary_predicate,
              secondary_predicate, namespaces, timeout, ftp):
     self.dbfrom = dbfrom
     self.db = db
     self.file_index = 1
     self.namespaces = namespaces
     self.ressource_version = Database_ressource_version(
         ressource=ressource_name, version=version)
     self.ressource_version_endpoint = Database_ressource_version(
         ressource=ressource_name + "_endpoints", version=version)
     self.ns_linking_id = ns_linking_id
     self.ns_linked_id = ns_linked_id
     self.ns_endpoint = ns_endpoint
     self.primary_predicate = primary_predicate
     self.secondary_predicate = secondary_predicate
     self.g_linked_id = self.ressource_version.create_data_graph(
         namespace_list=[
             self.ns_linking_id[0], self.ns_linked_id[0],
             self.primary_predicate[0]
         ],
         namespace_dict=self.namespaces)
     self.g_linked_id_endpoint = self.ressource_version_endpoint.create_data_graph(
         namespace_list=[
             self.ns_linking_id[0], self.ns_linked_id[0],
             self.secondary_predicate[0], self.ns_endpoint[0], "obo",
             "dcterms"
         ],
         namespace_dict=self.namespaces)
     self.append_failure = list()
     self.request_failure = list()
     self.available_linked_ids = 0
     self.all_linked_ids = set()
     self.n_subjects_g_linked_id = 0
     self.n_triples_g_linked_id = 0
     self.n_subjects_g_linked_id_endpoint = 0
     self.n_triples_g_linked_id_endpoint = 0
     self.r_timeout = timeout
     self.ftp = ftp
namespace_s = namespaces[config['SUBJECTS'].get('namespace')]
namespace_o = namespaces[config['OBJECTS'].get('namespace')]
prefix_s = config['SUBJECTS'].get('prefix')
prefix_o = config['OBJECTS'].get('prefix')
predicate = namespaces[config['PREDICATES'].get('namespace')][config['PREDICATES'].get('name')]

# Get the subject and object columns
L = list([config['SUBJECTS'].get('name'), config['OBJECTS'].get('name')])

# Test if input file exists:
if not os.path.exists(input_table_path):
    print("The input table at " + input_table_path + " does not exists")
    sys.exit(3)

# Initialize graph:
ressource = Database_ressource_version(ressource = ressource_name, version = version)
g = ressource.create_data_graph(namespaces.keys(), namespaces)
f_i = 1

# Initialize counts:
n_subjects = 0
n_objects = 0

print("Starting read file by chunk of size " + str(chunk_size))
df_chunk = pd.read_csv(input_table_path, chunksize=chunk_size)
for chunk in df_chunk:
    print("Filtering chunk ... ", end = '')
    filtered_data = chunk[(chunk[column_parsed] <= threshold)]
    filtered_data = filtered_data[L]
    print("Ok\nConverting data to triples ... ", end = '')
    for i in range(len(filtered_data)):
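        # NOTE: the original snippet is truncated here; the loop body below is a hypothetical
        # reconstruction (assumed, not the original code): build one triple per row from the
        # configured subject/object namespaces and prefixes, then update the counters.
        subject_id, object_id = filtered_data.iloc[i]
        g.add((namespace_s[prefix_s + str(subject_id)],
               predicate,
               namespace_o[prefix_o + str(object_id)]))
        n_subjects += 1
        n_objects += 1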
Example #3
 def export_intra_eq(self, path_out, source):
     """
     This function is used to create a graph or URIs equivalences between the different URIs associated to a given ressource. E
     Between differents URIs of the same ressource (called intra-uris) a owl:sameAs relation is implemented
     - path_out: a path to out files
     - source : a string which defined the origin of the data stores in the IdMapping object, et may be SBML, MetaNetX, BiGG ...
     """
     ressource_version_intra = Database_ressource_version(
         ressource="Id_mapping/Intra/" + source, version=self.version)
     n_triples = 0
     subjects = set()
     path_out = path_out + source + "/" + ressource_version_intra.version + "/"
     if not os.path.exists(path_out):
         os.makedirs(path_out)
     for r_name in self.intra_ids_dict.keys():
         print("Treating " + r_name + ":")
         intra_ids = list(self.intra_ids_dict[r_name])
         if len(intra_ids) == 0:
             continue
         g_name = r_name + "_intra"
         current_graph = ressource_version_intra.create_data_graph([], None)
         current_graph.bind("owl", OWL)
         print("Create intra uris equivalences ... ", end='')
         for id in intra_ids:
             intra_uris = [
                 rdflib.URIRef(prefix + id)
                 for prefix in self.ressource_uris[r_name]
             ]
             for current_uri, next_uri in zip(intra_uris, intra_uris[1:]):
                 current_graph.add((current_uri, OWL['sameAs'], next_uri))
         print("Ok\nExport graph for resource " + r_name + " ... ", end='')
         current_graph.serialize(destination=path_out + g_name + ".ttl",
                                 format='turtle')
         print("Ok\nTry to compress file " + r_name + " ... ", end='')
         try:
              # Use gzip -f to force overwriting if the file already exists
             subprocess.run("gzip -f " + path_out + g_name + ".ttl",
                            shell=True,
                            check=True,
                            stderr=subprocess.PIPE)
             ressource_version_intra.add_DataDump(g_name + ".ttl.gz",
                                                  self.ftp)
         except subprocess.CalledProcessError as e:
             print("Error while trying to compress files")
             print(e)
             sys.exit(3)
         print("Ok\nIncrement metadata ... ", end='')
         subjects = subjects.union(
             set([s for s in current_graph.subjects()]))
         n_triples += len(current_graph)
         print("Ok")
     print("Write metadata graph ... ", end='')
     ressource_version_intra.version_graph.bind("void", VOID)
     ressource_version_intra.add_version_attribute(
         DCTERMS["description"],
         rdflib.Literal("URIs equivalence inside a ressource"))
     ressource_version_intra.add_version_attribute(
         DCTERMS["title"],
         rdflib.Literal("URIs equivalence inside a ressource"))
     for source_uris in self.sources:
         ressource_version_intra.add_version_attribute(
             DCTERMS["source"], rdflib.URIRef(source_uris))
     ressource_version_intra.add_version_attribute(
         VOID["triples"], rdflib.Literal(n_triples, datatype=XSD.long))
     ressource_version_intra.add_version_attribute(
         VOID["distinctSubjects"],
         rdflib.Literal(len(subjects), datatype=XSD.long))
     ressource_version_intra.version_graph.serialize(destination=path_out +
                                                     "void.ttl",
                                                     format='turtle')
     print("Ok")
     return ressource_version_intra.uri_version
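A minimal, hypothetical usage sketch (not part of the original code): it assumes `mapper` is an already-populated IdMapping-like instance exposing `export_intra_eq`, with its `intra_ids_dict`, `ressource_uris`, `version` and `ftp` attributes filled in.

# Hypothetical usage sketch; `mapper` and the paths below are assumed.
intra_uri_version = mapper.export_intra_eq(path_out="./out/Id_mapping/Intra/", source="MetaNetX")
print("Intra-equivalence metadata graph URI: " + str(intra_uri_version))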
Example #4
 def create_graph_from_pubchem_type(self, pubchem_graph, path_out,
                                    graph_uri):
     """
     This function is ised to create a mapping graph from information contains in type pubchem graphs which can contains links between PubChem CID and ChEBI
     - pubchem_graph: a rdflib object graph associated to the PubChem type RDF graph
     - path_out: path to the output directory
     """
     ressource_version_PubChem = Database_ressource_version(
         ressource="Id_mapping/Inter/PubChem", version=self.version)
     n_triples = 0
     subjects = set()
     path_out = path_out + ressource_version_PubChem.version + "/"
     if not os.path.exists(path_out):
         os.makedirs(path_out)
     selected_ressource = [
         r for r in self.uri_PubChem.keys()
         if len(self.uri_PubChem[r]) > 0 and r != "pubchem"
     ]
     uri_PubChem = self.uri_PubChem["pubchem"]
     for ressource in selected_ressource:
         request_uri = self.uri_PubChem[ressource]
         print("Treating ressource " + ressource + " :")
         g_name = ("PubChem_" + ressource)
         current_graph = ressource_version_PubChem.create_data_graph([],
                                                                     None)
         current_graph.bind("skos", SKOS)
         print("Get ids mapping ... ", end='')
         PubChem_ids, ressource_ids = self.get_pubchem_mapping(
             pubchem_graph, uri_PubChem, request_uri)
         if PubChem_ids is None or ressource_ids is None:
             print(
                 "Impossible to process information for identifiers equivalence between MetaNetX and "
                 + ressource + "\n")
             continue
         n_ids = len(PubChem_ids)
         print("Ok\nCompute equivalences ... ", end='')
         for id_index in range(n_ids):
              # Write inter-resource equivalences only for one URI of each resource; links to the other URIs are handled through the intra-resource equivalences
             uri_1, uri_2 = rdflib.URIRef(
                 self.ressource_uris["pubchem"][0] +
                 PubChem_ids[id_index]), rdflib.URIRef(
                     self.ressource_uris[ressource][0] +
                     ressource_ids[id_index])
             current_graph.add((uri_1, SKOS['closeMatch'], uri_2))
         print("Ok\nExport graph for resource " + ressource + " ... ",
               end='')
         current_graph.serialize(destination=path_out + g_name + ".ttl",
                                 format='turtle')
         try:
              # Use gzip -f to force overwriting if the file already exists
             subprocess.run("gzip -f " + path_out + g_name + ".ttl",
                            shell=True,
                            check=True,
                            stderr=subprocess.PIPE)
             ressource_version_PubChem.add_DataDump(g_name + ".ttl.gz",
                                                    self.ftp)
         except subprocess.CalledProcessError as e:
             print("Error while trying to compress files")
             print(e)
             sys.exit(3)
         print("Ok\nIncrement metadata ... ", end='')
         n_triples += len(current_graph)
         subjects = subjects.union(
             set([s for s in current_graph.subjects()]))
          # If the resource has several URIs, prepare the identifiers for the intra-resource correspondences
         print("Ok\nAdd ids to intra equivalences table ... ", end='')
         if len(self.ressource_uris["pubchem"]) > 1:
             self.intra_ids_dict["pubchem"] = self.intra_ids_dict[
                 "pubchem"].union(PubChem_ids)
         if len(self.ressource_uris[ressource]) > 1:
             self.intra_ids_dict[ressource] = self.intra_ids_dict[
                 ressource].union(ressource_ids)
         print("Ok")
     self.sources.append(graph_uri)
     print("Write metadata graph ... ", end='')
     ressource_version_PubChem.version_graph.bind("void", VOID)
     ressource_version_PubChem.add_version_attribute(
         DCTERMS["description"],
         rdflib.Literal(
             "Ids correspondances between differents ressources from PubChem"
         ))
     ressource_version_PubChem.add_version_attribute(
         DCTERMS["title"],
         rdflib.Literal("Ids correspondances from PubChem"))
     ressource_version_PubChem.add_version_attribute(
         DCTERMS["source"], rdflib.URIRef(graph_uri))
     ressource_version_PubChem.add_version_attribute(
         VOID["triples"], rdflib.Literal(n_triples, datatype=XSD.long))
     ressource_version_PubChem.add_version_attribute(
         VOID["distinctSubjects"],
         rdflib.Literal(len(subjects), datatype=XSD.long))
     ressource_version_PubChem.version_graph.serialize(
         destination=path_out + "void.ttl", format='turtle')
     print("Ok")
     return ressource_version_PubChem.uri_version
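A minimal, hypothetical usage sketch (assumed object, file names and URI, not part of the original code): `mapper` is an IdMapping-like instance and the PubChem type graph is assumed to be available locally as Turtle.

# Hypothetical usage sketch; the names below are assumed.
import rdflib

pubchem_type_graph = rdflib.Graph()
pubchem_type_graph.parse("pc_compound_type.ttl", format="turtle")  # assumed local dump of the PubChem type graph
pubchem_mapping_uri = mapper.create_graph_from_pubchem_type(
    pubchem_graph=pubchem_type_graph,
    path_out="./out/Id_mapping/Inter/PubChem/",
    graph_uri="http://rdf.ncbi.nlm.nih.gov/pubchem/compound")  # assumed source graph URI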
Example #5
 def create_graph_from_MetaNetX(self, graph_metaNetX, path_out, graph_uri):
     """
     This function is used to create a graph or uri equivalences between MetaNetX identifiers and other ressources. Equivalence information are fetch from the MetaNetX RDF graph. 
     Between ressource a skos:closeMatch relation is implemented (to avoid propaging false information)
     - graph_metaNetX: a rdflib object graph associated to the MetaNetX RDF graph
     - path_out: a path to out files
     """
     ressource_version_MetaNetX = Database_ressource_version(
         ressource="Id_mapping/Inter/MetaNetX", version=self.version)
     n_triples = 0
     subjects = set()
     path_out = path_out + ressource_version_MetaNetX.version + "/"
     if not os.path.exists(path_out):
         os.makedirs(path_out)
     selected_ressource = [
         r for r in self.uri_MetaNetX.keys()
         if len(self.uri_MetaNetX[r]) > 0 and r != "metanetx"
     ]
     for ressource in selected_ressource:
          # Create the MetaNetX vs. resource graph
         print("Treating resource: " + ressource + " with MetaNetX :")
         g_name = ("MetaNetX_" + ressource)
         print("Get ids mapping ... ", end='')
         current_graph = ressource_version_MetaNetX.create_data_graph([],
                                                                      None)
         current_graph.bind("skos", SKOS)
         metaNetX_ids, ressource_ids = self.get_mapping_from_MetanetX(
             graph_metaNetX, ressource)
         if metaNetX_ids is None or ressource_ids is None:
             print(
                 "Impossible to process information for identifiers equivalence between MetaNetX and "
                 + ressource + "\n")
             continue
         n_ids = len(metaNetX_ids)
         for id_index in range(n_ids):
              # Write inter-resource equivalences only for one URI of each resource; links to the other URIs are handled through the intra-resource equivalences
             uri_1, uri_2 = rdflib.URIRef(
                 self.ressource_uris["metanetx"][0] +
                 metaNetX_ids[id_index]), rdflib.URIRef(
                     self.ressource_uris[ressource][0] +
                     ressource_ids[id_index])
             current_graph.add((uri_1, SKOS['closeMatch'], uri_2))
          # Write the graph:
         print("Ok\nExport graph for resource " + ressource + " ... ",
               end='')
         current_graph.serialize(destination=path_out + g_name + ".ttl",
                                 format='turtle')
         try:
              # Use gzip -f to force overwriting if the file already exists
             subprocess.run("gzip -f " + path_out + g_name + ".ttl",
                            shell=True,
                            check=True,
                            stderr=subprocess.PIPE)
             ressource_version_MetaNetX.add_DataDump(
                 g_name + ".ttl.gz", self.ftp)
         except subprocess.CalledProcessError as e:
             print("Error while trying to compress files")
             print(e)
             sys.exit(3)
         n_triples += len(current_graph)
         subjects = subjects.union(
             set([s for s in current_graph.subjects()]))
         print("Ok\nAdd ids to intra equivalences table ... ", end='')
          # If the resource has several URIs, prepare the identifiers for the intra-resource correspondences
         if len(self.ressource_uris["metanetx"]) > 1:
             self.intra_ids_dict["metanetx"] = self.intra_ids_dict[
                 "metanetx"].union(metaNetX_ids)
         if len(self.ressource_uris[ressource]) > 1:
             self.intra_ids_dict[ressource] = self.intra_ids_dict[
                 ressource].union(ressource_ids)
         print("Ok")
     self.sources.append(graph_uri)
      # Create the inter-resource graphs from the MetaNetX information:
      print("Creating inter-resource equivalences from MetaNetX: ")
     cbn_resource = itertools.combinations(selected_ressource, 2)
     for ressource_pair in cbn_resource:
         r1 = ressource_pair[0]
         r2 = ressource_pair[1]
         g_name = ("metanetx_" + r1 + "_" + r2)
         print("Treating : " + r1 + " - " + r2 + " with MetaNetX :")
         print("Get ids mapping ...", end='')
         current_graph = ressource_version_MetaNetX.create_data_graph([],
                                                                      None)
         current_graph.bind("skos", SKOS)
          # The web service seemed unreliable, so a new approach based on downloading the dump from the FTP server is used instead:
         ids_r1, ids_r2 = self.get_mapping_from_MetanetX_inter_ressource(
             graph_metaNetX, self.uri_MetaNetX[r1], self.uri_MetaNetX[r2])
         if ids_r1 is None or ids_r2 is None:
             print(
                 "Impossible to process information for identifiers equivalence between ressource "
                 + r1 + " and " + r2 + " with MetaNetX\n")
             continue
         n_ids = len(ids_r1)
         for id_index in range(n_ids):
              # Write inter-resource equivalences only for one URI of each resource; links to the other URIs are handled through the intra-resource equivalences
             uri_1, uri_2 = rdflib.URIRef(self.ressource_uris[r1][0] +
                                          ids_r1[id_index]), rdflib.URIRef(
                                              self.ressource_uris[r2][0] +
                                              ids_r2[id_index])
             current_graph.add((uri_1, SKOS['closeMatch'], uri_2))
          # Write the graph:
          print("Ok\nExport graph for resources " + r1 + " - " + r2 + " ...",
               end='')
         current_graph.serialize(destination=path_out + g_name + ".ttl",
                                 format='turtle')
         try:
              # Use gzip -f to force overwriting if the file already exists
             subprocess.run("gzip -f " + path_out + g_name + ".ttl",
                            shell=True,
                            check=True,
                            stderr=subprocess.PIPE)
             ressource_version_MetaNetX.add_DataDump(
                 g_name + ".ttl.gz", self.ftp)
         except subprocess.CalledProcessError as e:
             print("Error while trying to compress files")
             print(e)
             sys.exit(3)
         n_triples += len(current_graph)
         subjects = subjects.union(
             set([s for s in current_graph.subjects()]))
         print("Ok")
          # No need to check whether these ids must be added to the intra-resource dict: they were necessarily added during the MetaNetX vs. resource run
     print("Write metadata graph ...", end='')
     ressource_version_MetaNetX.version_graph.bind("void", VOID)
     ressource_version_MetaNetX.add_version_attribute(
         DCTERMS["description"],
         rdflib.Literal(
             "Ids correspondances between differents ressources from MetaNetX"
         ))
     ressource_version_MetaNetX.add_version_attribute(
         DCTERMS["title"],
         rdflib.Literal("Ids correspondances from MetaNetX"))
     ressource_version_MetaNetX.add_version_attribute(
         DCTERMS["source"], rdflib.URIRef(graph_uri))
     ressource_version_MetaNetX.add_version_attribute(
         VOID["triples"], rdflib.Literal(n_triples, datatype=XSD.long))
     ressource_version_MetaNetX.add_version_attribute(
         VOID["distinctSubjects"],
         rdflib.Literal(len(subjects), datatype=XSD.long))
     ressource_version_MetaNetX.version_graph.serialize(
         destination=path_out + "void.ttl", format='turtle')
     print("Ok")
     return ressource_version_MetaNetX.uri_version
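A minimal, hypothetical usage sketch (assumed names, not part of the original code): the MetaNetX dump is assumed to be available locally as a gzipped Turtle file and `mapper` is an IdMapping-like instance.

# Hypothetical usage sketch; the names below are assumed.
import gzip
import rdflib

graph_metaNetX = rdflib.Graph()
with gzip.open("metanetx.ttl.gz", "rb") as f_metanetx:  # assumed local MetaNetX dump
    graph_metaNetX.parse(f_metanetx, format="turtle")
metanetx_mapping_uri = mapper.create_graph_from_MetaNetX(
    graph_metaNetX=graph_metaNetX,
    path_out="./out/Id_mapping/Inter/MetaNetX/",
    graph_uri="https://rdf.metanetx.org/")  # assumed source graph URI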
Example #6
def download_MeSH(out_dir, out_log):
    """
    This function is used to download the last version of the MeSH RDF from NIH ftp server, the void.ttl file is also use to bring metadata information about the dowloaded version.
    But contrary to PubChem the modification date is not include in the void.ttl file. So, version is determine from the modification date of the file.
    Ressource is named 'MeSHRDF' as indicate in the void.ttl
    - out_dir: a path to an directory to write output files
    - namespace_list: a list of the namespaces that should be associated to the graph
    The function return the version and the uri of this new version.
    """
    # Initialize .log file
    with open(out_log + "dl_mesh.log", "wb") as f_log:
        pass
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    print("Trying to check last available version of MeSH RDF on ftp ...", end = '')
    # Connect ftp server to check mesh.nt last modification date
    try:
        ftp = ftplib.FTP("ftp.nlm.nih.gov")
        ftp.login()
        mesh_mdtm = ftp.voidcmd("MDTM /online/mesh/rdf/mesh.nt")[4:]
        ftp.quit()
    except ftplib.all_errors as ftplib_e:
        print("Errors while trying to connect to NCBI mesh FTP server at ftp.nlm.nih.gov, check dl_mesh.log")
        with open(out_log + "dl_mesh.log", "a") as f_log:
            f_log.write("\n" + str(ftplib_e) + "\n")
        sys.exit(3)
    # parse date to get last version
    mesh_last_v = parser.parse(mesh_mdtm)
    mesh_last_v = mesh_last_v.strftime('%Y-%m-%d')
    print(" Ok\nLast MeSH RDF version found on ftp server is : " + mesh_last_v)
    print("Check if MeSH RDF version " + mesh_last_v + " was already download: ", end = '')
    test_r_info = glob.glob(out_dir + mesh_last_v + "/" + "void.ttl")
    # From last version date, if associated void.ttl file already exists, exit and return mesh last version and associated uri
    if len(test_r_info) == 1:
        print("Yes\nMeSH RDF version " + mesh_last_v + " is already downloaded, end.\n\n")
        ressource_version = Database_ressource_version(ressource = "MeSHRDF", version = mesh_last_v)
        print("=================================================================================\n")
        return mesh_last_v, str(ressource_version.uri_version)
    else:
        print("No\nTrying to dowload MeSH RDF version " + mesh_last_v + "\n\n")
    # Create version output directory
    out_path = out_dir + mesh_last_v + "/"
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    # Download the MeSH void.ttl metadata file
    try:
        subprocess.run("wget -P " + out_path + " ftp://ftp.nlm.nih.gov/online/mesh/rdf/void_1.0.0.ttl", shell = True, check=True, stderr = subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print("Error during trying to download MeSH void.ttl file, check dl_mesh.log")
        print(e)
        with open(out_log + "dl_mesh.log", "ab") as f_log:
            f_log.write(e.stderr)
        sys.exit(3)
    print("Trying to read MeSH void.ttl file ...", end = '')
    g_metadata = rdflib.Graph()
    g_metadata.parse(out_path + "void_1.0.0.ttl", format = 'turtle')
    print(" Ok\nTrying to dowload MeSH RDF file ...", end = '')
    # Download MeSH RDF
    try:
        subprocess.run("wget -P " + out_path + " ftp://ftp.nlm.nih.gov/online/mesh/rdf/mesh.nt", shell = True, check=True, stderr = subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print("Error during trying to download MeSH mesh.nt file, check dl_mesh.log")
        print(e)
        with open(out_log + "dl_mesh.log", "ab") as f_log:
            f_log.write(e.stderr)
        sys.exit(3)
    print(" Ok\nTrying to parse MeSH original metadata ...", end = '')
    # Create the new MeSH resource
    ressource_version = Database_ressource_version(ressource = "MeSHRDF", version = mesh_last_v)
    ressource_version.version_graph.namespace_manager = g_metadata.namespace_manager
    for s,p,o in g_metadata.triples((rdflib.URIRef("http://id.nlm.nih.gov/mesh/void#MeSHRDF"), None, None)):
        # The 'created' attribute in the original void file corresponds to the original creation date of the file (around 2014); we want the creation date of our resource to match the modification date of the file instead
        if (p == VOID['dataDump']):
            if str(o) == "ftp://ftp.nlm.nih.gov/online/mesh/rdf/mesh.nt":
                ressource_version.add_version_attribute(predicate = p, object = o)
            else:
                continue
        elif (p != DCTERMS["created"]):
            ressource_version.add_version_attribute(predicate = p, object = o)
        else:
            continue
    g_metadata = None
    # Create the data graph:
    print(" Ok\nTrying to create MeSH new ressource version ...", end = '')
    mesh_graph = ressource_version.create_data_graph([], None)
    mesh_graph.parse(out_path + "mesh.nt", format = "nt")
    ressource_version.add_version_attribute(VOID["triples"], rdflib.Literal( len(mesh_graph), datatype=XSD.long ))
    ressource_version.add_version_attribute(VOID["distinctSubjects"], rdflib.Literal( len(set([str(s) for s in mesh_graph.subjects()])), datatype=XSD.long ))
    # Clear graph
    mesh_graph = None
    # Write the resource graph
    ressource_version.version_graph.serialize(out_path + "void.ttl", format = 'turtle')
    # Remove the original void file
    try:
        subprocess.run("rm " + out_path + "void_1.0.0.ttl ", shell = True, check=True, stderr = subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print("Error during trying to remove file, check dl_mesh.log")
        print(e)
        with open(out_log + "dl_mesh.log", "ab") as f_log:
            f_log.write(e.stderr)
        sys.exit(3)
    print(" Ok\nEnd")
    print("=================================================================================\n")
    return ressource_version.version, str(ressource_version.uri_version)
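A minimal, hypothetical usage sketch for the function above (the output and log directories are assumed):

# Hypothetical usage sketch; the paths below are assumed.
mesh_version, mesh_uri_version = download_MeSH(out_dir="./data/MeSHRDF/", out_log="./logs/")
print("MeSH RDF version " + mesh_version + " -> " + mesh_uri_version)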
Example #7
def download_pubChem(dir, request_ressource, out_path, out_log):
    """
    This function is used to download PubChem rdf files from the ftp server and create a new version of the associated ressource.
    - dir: the path to the directory/file to fetch in the ftp server from ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF/
    - request_ressource: the name of the ressource as indicated in the void.ttl file.
    - out_path: a path to a directory to write output files.
    The function return the version created and the uri of this new version. in case of errors during wget downloading, errors are printed in dl_pubchem_*.log
    """
    # Initialize .log file
    with open(out_log + "dl_pubchem_" + request_ressource + ".log", "wb") as f_log:
        pass
    # Download the void file and the data
    print("Trying to check last available version of PubChem RDF on ftp ...", end = '')
    # Connect ftp server to check void.ttl last modification date
    try:
        ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov")
        ftp.login()
        pubchem_mdtm = ftp.voidcmd("MDTM /pubchem/RDF/void.ttl")[4:]
        ftp.quit()
    except ftplib.all_errors as ftplib_e:
        print("Errors while trying to connect to NCBI PubChem FTP server at ftp.ncbi.nlm.nih.gov, check dl_pubchem_" + request_ressource + ".log")
        with open(out_log + "dl_pubchem_" + request_ressource + ".log", "ab") as f_log:
            f_log.write("\n" + str(ftplib_e) + "\n")
        sys.exit(3)
    # Parse the date to build the PubChem version
    pubchem_last_v = parser.parse(pubchem_mdtm)
    pubchem_last_v = pubchem_last_v.strftime('%Y-%m-%d')
    print(" Ok\nLast PubChem " + request_ressource + "RDF version found on ftp server is : " + pubchem_last_v)
    print("Check if PubChem " + request_ressource + " RDF version " + pubchem_last_v + " was already download: ", end = '')
    # From last version date, if associated void.ttl file already exists, exit and return pubchem last version and associated uri
    test_r_info = glob.glob(out_path + request_ressource + "/" + pubchem_last_v + "/" + "void.ttl")
    if len(test_r_info) == 1:
        print("Yes\nPubChem " + request_ressource + " RDF version " + pubchem_last_v + " is already downloaded, end.\n\n")
        ressource_version = Database_ressource_version(ressource = "PubChem/" + request_ressource, version = pubchem_last_v)
        print("=================================================================================\n")
        return pubchem_last_v, str(ressource_version.uri_version)
    else:
        print("No\nTrying to dowload PubChem " + request_ressource + " RDF version " + pubchem_last_v + "\n\n")
    # Download PubChem data
    print("Trying to dowload PubChem void.ttl file ...", end = '')
    # Create output directory for requested ressource and last available version
    version_path = out_path + request_ressource + "/" + pubchem_last_v + "/"
    if not os.path.exists(version_path):
        os.makedirs(version_path)
    try:
        subprocess.run("wget -P " + version_path + " ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF/void.ttl", shell = True, check=True, stderr = subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print("Error during trying to download PubChem void.ttl file, check dl_pubchem_" + request_ressource + ".log")
        print(e)
        with open(out_log + "dl_pubchem_" + request_ressource + ".log", "ab") as f_log:
            f_log.write(e.stderr)
        sys.exit(3)
    print(" Ok\nTrying to read Pubchem void.ttl file ...", end = '')
    # Parse the metadata file
    g_metadata = rdflib.Graph()
    g_metadata.parse(version_path + "void.ttl", format='turtle')
    try:
        subprocess.run("rm " + version_path + "void.ttl", shell = True, check=True, stderr = subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print("Error during trying to remove PubChem void.ttl file, check dl_pubchem_" + request_ressource + ".log")
        print(e)
        with open(out_log + "dl_pubchem_" + request_ressource + ".log", "ab") as f_log:
            f_log.write(e.stderr)
        sys.exit(3)
    print(" Ok\nTrying to dowload Pubchem " + dir + " directory ...", end = '')
    # On récupère les données que l'on enregistre dans le directory créée
    try:
        subprocess.run("wget -r -A ttl.gz -nH" + " -P " + version_path + " --cut-dirs=5 " + "ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF/" + dir, shell = True, check=True, stderr = subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print("Error during trying to dowload PubChem " + dir + " directory, check dl_pubchem_" + request_ressource + ".log")
        print(e)
        with open(out_log + "dl_pubchem_" + request_ressource + ".log", "ab") as f_log:
            f_log.write(e.stderr)
        sys.exit(3)
    print(" Ok\nTrying to build Pubchem " + request_ressource + " new ressource version ...", end = '')
    # Use the metadata description of the downloaded directory to create the graph that will be associated with the resource
    ressource_version = Database_ressource_version(ressource = "PubChem/" + request_ressource, version = pubchem_last_v)
    ressource_version.version_graph.namespace_manager = g_metadata.namespace_manager
    # Annotate the new version with the information from the void file
    for s,p,o in g_metadata.triples((rdflib.URIRef("http://rdf.ncbi.nlm.nih.gov/pubchem/void.ttl#" + request_ressource), None, None)):
        if p == VOID['dataDump'] and not str(o).startswith("ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF/" + dir):
            continue
        ressource_version.add_version_attribute(predicate = p, object = o)
    # Write the graph to a file
    ressource_version.version_graph.serialize(version_path + "void.ttl", format = 'turtle')
    g_metadata = None
    print(" Ok\nEnd !")
    print("=================================================================================\n")
    return ressource_version.version, str(ressource_version.uri_version)
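A minimal, hypothetical usage sketch for the function above; the `dir` and `request_ressource` values are assumptions and must match a directory under ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF/ and a resource name declared in its void.ttl.

# Hypothetical usage sketch; all arguments below are assumed.
pubchem_version, pubchem_uri_version = download_pubChem(
    dir="compound/general/",
    request_ressource="compound",
    out_path="./data/PubChem/",
    out_log="./logs/")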
Example #8
def download_MetaNetX(out_dir, out_log, version):
    # Initialize logs
    with open(out_log + "dl_metanetx.log", "wb") as f_log:
        pass
    version_path = out_dir + version + "/"
    print("Check if MetaNetX version " + version + " was already download: ", end = '')
    test_r_info = glob.glob(version_path + "void.ttl")
    if len(test_r_info) == 1:
        print("Yes\nMetaNetX RDF version " + version + " is already downloaded, end.\n\n")
        ressource_version = Database_ressource_version(ressource = "MetaNetX", version = version)
        print("=================================================================================\n")
        return str(ressource_version.uri_version)
    # Else, download:
    print("No\nTrying to dowload MetaNetX RDF file ... ", end = '')
    if not os.path.exists(version_path):
        os.makedirs(version_path)
    # Download MetaNetX RDF
    try:
        subprocess.run("wget -P " + version_path + " ftp://ftp.vital-it.ch/databases/metanetx/MNXref/" + version + "/metanetx.ttl.gz", shell = True, check=True, stderr = subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print("Error during trying to download MetaNetX metanetx.ttl.gz file version " + version + ", check dl_metanetx.log")
        print(e)
        with open(out_log + "dl_metanetx.log", "ab") as f_log:
            f_log.write(e.stderr)
        sys.exit(3)
    print("Ok\nCreate new MetaNetX resource: ")
    ressource_version = Database_ressource_version(ressource = "MetaNetX", version = version)
    print("Try to parse MetaNetX graph to extract metadata ... ", end = '')
    g_MetaNetX = rdflib.Graph()
    with gzip.open(version_path + "metanetx.ttl.gz", "rb") as f_MetaNetX:
        g_MetaNetX.parse(f_MetaNetX, format="turtle")
    print("Ok\nExtract metadata ... ", end = '')
    ressource_version.add_version_attribute(predicate = RDF["type"], object = VOID["Dataset"])
    ressource_version.add_version_attribute(predicate = DCTERMS["description"], object = rdflib.Literal("MetaNetX is a repository of genome-scale metabolic networks (GSMNs) and biochemical pathways from a number of major resources imported into a common namespace of chemical compounds, reactions, cellular compartments--namely MNXref--and proteins."))
    ressource_version.add_version_attribute(predicate = DCTERMS["title"], object = rdflib.Literal("MetaNetX v." + version))
    ressource_version.add_version_attribute(predicate = VOID["dataDump"], object = rdflib.URIRef("ftp://ftp.vital-it.ch/databases/metanetx/MNXref/" + version + "/metanetx.ttl.gz"))
    ressource_version.add_version_attribute(predicate = VOID["triples"], object = rdflib.Literal(len(g_MetaNetX), datatype=XSD.long ))
    ressource_version.add_version_attribute(predicate = VOID["distinctSubjects"], object = rdflib.Literal(len(set([str(s) for s in g_MetaNetX.subjects()]))))
    ressource_version.version_graph.serialize(version_path + "void.ttl", format = 'turtle')
    # Clear memory
    g_MetaNetX = None
    print("Ok\nEnd")
    print("=================================================================================\n")
    return str(ressource_version.uri_version)
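A minimal, hypothetical usage sketch for the function above (the version string and paths are assumed; the version must exist under ftp://ftp.vital-it.ch/databases/metanetx/MNXref/):

# Hypothetical usage sketch; the arguments below are assumed.
metanetx_uri_version = download_MetaNetX(out_dir="./data/MetaNetX/", out_log="./logs/", version="4.4")
print("MetaNetX version graph URI: " + metanetx_uri_version)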
class Elink_ressource_creator:
    """This class represent an ensembl of Pccompound objects:
    - dbfrom: The NCBI Entrez database for linking ids
    - db: The NCBI Entrez database for linked ids
    - namespaces: a dict containing namespace names as keys and associated rdflib.Namespace() objects as values
    - ressource_version: Database_ressource_version objects representing a new version of the association between linking_ids and linked ids association
    - ressource_version_endpoint: Database_ressource_version objects representing a new version of the additionnal information about associations between linking_ids and linked ids association
    - ns_linking_id: a tuple representing namespace name and associated prefix (if one should be added next before the id) for linking ids
    - ns_linked_id: a tuple representing namespace name and associated prefix (if one should be added next before the id) for linked ids
    - ns_endpoint: a tuple representing namespace name and associated prefix (if one should be added next before the id) for endpoints ids
    - primary_predicate: a tuple representing the primary predicate with namespace name and predicate name, that will be used to illutrate the relation between linking ids and linked ids
    - secondary_predicate: a tuple representing the secondary predicate with namespace name and predicate name, that will be used to illutrate the additionnal relation between linking ids and linked ids in the endpoint graph
    - g_linked_id: a rdflib graph storing association between linking ids and linked ids using the primary_predicate
    - g_linked_id_endpoint: a rdflib graph storing describing associations between linking ids and linked ids using the secondary_predicate
    - append_failure: a list of the linking ids for which the NCBI eutils request succeeded but for which there was no associated linked ids
    - request_failure: a list of linkings ids for which there was a eutils.EutilsError or a RequestException. For all ids contained in the request failure, a new attempt will be processed until th request succeded. 
    - available_linked_ids: a variable that store the current number of linking ids added to graphs
    - all_linked_ids: a set of all the linked ids which were added to graphs 
    - n_subjects_g_linked_id: the number of subjects in the g_linked_id graph
    - n_triples_g_linked_id: the total number of triples in the g_linked_id graph
    - n_subjects_g_linked_id_endpoint: the number of subjects in the g_linked_id_endpoint graph
    - n_triples_g_linked_id_endpoint: the total number of triples in the g_linked_id_endpoint graph
    - ftp: ftp server adress on which data will be uploaded. A valid adress is not mandatory as data will not be automatically upload to the ftp server, an empty string can thus be used.  
    """
    def __init__(self, ressource_name, version, dbfrom, db, ns_linking_id,
                 ns_linked_id, ns_endpoint, primary_predicate,
                 secondary_predicate, namespaces, timeout, ftp):
        self.dbfrom = dbfrom
        self.db = db
        self.file_index = 1
        self.namespaces = namespaces
        self.ressource_version = Database_ressource_version(
            ressource=ressource_name, version=version)
        self.ressource_version_endpoint = Database_ressource_version(
            ressource=ressource_name + "_endpoints", version=version)
        self.ns_linking_id = ns_linking_id
        self.ns_linked_id = ns_linked_id
        self.ns_endpoint = ns_endpoint
        self.primary_predicate = primary_predicate
        self.secondary_predicate = secondary_predicate
        self.g_linked_id = self.ressource_version.create_data_graph(
            namespace_list=[
                self.ns_linking_id[0], self.ns_linked_id[0],
                self.primary_predicate[0]
            ],
            namespace_dict=self.namespaces)
        self.g_linked_id_endpoint = self.ressource_version_endpoint.create_data_graph(
            namespace_list=[
                self.ns_linking_id[0], self.ns_linked_id[0],
                self.secondary_predicate[0], self.ns_endpoint[0], "obo",
                "dcterms"
            ],
            namespace_dict=self.namespaces)
        self.append_failure = list()
        self.request_failure = list()
        self.available_linked_ids = 0
        self.all_linked_ids = set()
        self.n_subjects_g_linked_id = 0
        self.n_triples_g_linked_id = 0
        self.n_subjects_g_linked_id_endpoint = 0
        self.n_triples_g_linked_id_endpoint = 0
        self.r_timeout = timeout
        self.ftp = ftp

    def append_linked_ids(self, id_packed_list, index_list, query_builder,
                          pack_size, add_f_out_path):
        """This function append a new Pccompound to the pccompound_list attribute. Using the cid, this function send a request to NCBI server via Eutils to get PMID association
        - id_packed_list: a list of pack of ids
        - index_list: the index of the current pack
        - query_builder: a eutils.QueryService object parameterized with cache, retmax, retmode, usehistory and especially the api_key
        - pack_size: the pack ids size
        """
        id_pack = id_packed_list[index_list]
        # Get the linked ids associated with each linking id. The try block tests whether the request fails; if it does, the function returns False and the ids are later recorded as request failures
        print("Send request ...")
        # Initialize signal timeout:
        signal.signal(signal.SIGALRM, alarm_handler)
        signal.alarm(self.r_timeout)
        try:
            response = query_builder.elink({
                "dbfrom": self.dbfrom,
                "db": self.db,
                "id": id_pack
            })
        except TimeOutException:
            print("\nRequest timeout was reached !")
            with open(add_f_out_path + "elink.log", "a") as f_log:
                f_log.write("from id " + str(index_list * pack_size + 1) +
                            " to id " + str((index_list + 1) * pack_size) +
                            " :\n")
                f_log.write("Request Timeout\n")
                signal.alarm(0)
                return False
        except eutils.EutilsError as fail_request:
            print(
                "\nRequest on Eutils for current compound pack has failed during process, with error name: %s \n"
                % (fail_request))
            with open(add_f_out_path + "elink.log", "a") as f_log:
                f_log.write("from id " + str(index_list * pack_size + 1) +
                            " to id " + str((index_list + 1) * pack_size) +
                            " :\n")
                f_log.write(str(fail_request) + "\n")
                signal.alarm(0)
            return False
        except (ValueError, requests.exceptions.RequestException) as e:
            print(
                "\nThere was an request error: %s \n-- Compound cids is added to request_failure list"
                % (e))
            with open(add_f_out_path + "elink.log", "a") as f_log:
                f_log.write("from id " + str(index_list * pack_size + 1) +
                            " to id " + str((index_list + 1) * pack_size) +
                            " :\n")
                f_log.write(str(e) + "\n")
                signal.alarm(0)
            return False
        signal.alarm(0)
        print("Try to parse request results ...", end='')
        root = ET.fromstring(response)
        # Exploring sets
        for id_Element in root.findall("./LinkSet"):
            # For each LinkSet, get the associated linking_id :
            linking_id = id_Element.find("./IdList/Id").text
            linked_id_by_link_name = {}
            for linked_id_set in id_Element.findall("./LinkSetDb"):
                # Each source is assigned as a Key value and linked_id list as values
                linked_id_by_link_name[(
                    linked_id_set.find("./LinkName").text)] = [
                        set.text for set in linked_id_set.findall("./Link/Id")
                    ]
            # If no references can be found for the linking_id, skip it and add it to the append_failure list
            if len(linked_id_by_link_name) == 0:
                self.append_failure.append(linking_id)
                continue
            # Create Union and prepare associated link_name
            linked_id_list = list(
                set().union(*(linked_id_by_link_name.values())))
            link_name_list = [list() for i in range(len(linked_id_list))]
            # For each linked_id in the union set, determine the original link_names of the association.
            for link_name in linked_id_by_link_name.keys():
                a = numpy.array(
                    numpy.isin(linked_id_list,
                               linked_id_by_link_name[link_name])).nonzero()
                for index in a[0].tolist():
                    link_name_list[index].append(link_name)
            # Add to the graphs:
            self.fill_ids_link_graph(linking_id, linked_id_list)
            self.fill_ids_link_endpoint_graph(linking_id, linked_id_list,
                                              link_name_list)
            # Increment the number of added linked ids:
            self.available_linked_ids += len(linked_id_list)
        return True

    def fill_ids_link_graph(self, linking_id, linked_id_list):
        """This function fill the g_linked_id graph with linking ids and linked ids associations.
        - linking_id: The linking identifier
        - linked_id_list: the linked id list from the request result
        """
        # Add all triples to graph
        for linked_id in linked_id_list:
            self.g_linked_id.add(
                (self.namespaces[self.ns_linking_id[0]][self.ns_linking_id[1] +
                                                        linking_id],
                 self.namespaces[self.primary_predicate[0]][
                     self.primary_predicate[1]],
                 self.namespaces[self.ns_linked_id[0]][self.ns_linked_id[1] +
                                                       linked_id]))

    def fill_ids_link_endpoint_graph(self, linking_id, linked_id_list,
                                     link_name_list):
        """This function create a rdflib graph containing all the cid - pmid endpoints associations contains in the Ensemble_pccompound object.
        - linking_id: The linking identifier
        - linked_id_list: the linked id list from the request result
        - link_name_list: the link_name list from the request result
        """
        for linked_id_index in range(0, len(linked_id_list)):
            linked_id = linked_id_list[linked_id_index]
            subject = self.ns_linking_id[
                1] + linking_id + "_" + self.ns_linked_id[1] + linked_id
            # Add to graph
            self.g_linked_id_endpoint.add(
                (self.namespaces[self.ns_endpoint[0]][self.ns_endpoint[1] +
                                                      subject],
                 self.namespaces["obo"]['IAO_0000136'],
                 self.namespaces[self.ns_linking_id[0]][self.ns_linking_id[1] +
                                                        linking_id]))
            self.g_linked_id_endpoint.add(
                (self.namespaces[self.ns_endpoint[0]][self.ns_endpoint[1] +
                                                      subject],
                 self.namespaces[self.secondary_predicate[0]][
                     self.secondary_predicate[1]],
                 self.namespaces[self.ns_linked_id[0]][self.ns_linked_id[1] +
                                                       linked_id]))
            for link_name in link_name_list[linked_id_index]:
                self.g_linked_id_endpoint.add(
                    (self.namespaces[self.ns_endpoint[0]][self.ns_endpoint[1] +
                                                          subject],
                     self.namespaces["dcterms"]['contributor'],
                     rdflib.Literal(link_name)))

    def get_all_linked_ids(self):
        """this function allows to extract the union of all linked_ids, the objects of the g_linked_id graph"""
        all_linked_ids = set([
            str(s).split(self.namespaces[self.ns_linked_id[0]] +
                         self.ns_linked_id[1])[1]
            for s in self.g_linked_id.objects()
        ])
        return all_linked_ids

    def get_all_linking_ids(self):
        """this function allows to extract of all linking ids, the subjects of the g_linked_id graph"""
        all_linking_ids = set([
            str(s).split(self.namespaces[self.ns_linking_id[0]] +
                         self.ns_linking_id[1])[1]
            for s in self.g_linked_id.subjects()
        ])
        return all_linking_ids

    def get_all_linked_id_endpoints(self):
        """this function allows to extract the union of all linked_enpoint ids, the subjects of the g_linked_id_endpoint"""
        all_linked_ids_endpoints = set([
            str(s).split(self.namespaces[self.ns_endpoint[0]] +
                         self.ns_endpoint[1])[1]
            for s in self.g_linked_id_endpoint.subjects()
        ])
        return all_linked_ids_endpoints

    def clean(self):
        """This function allow to clean and empty memory for bulky attributes"""
        self.g_linked_id = None
        self.g_linked_id = self.ressource_version.create_data_graph(
            namespace_list=[
                self.ns_linking_id[0], self.ns_linked_id[0],
                self.primary_predicate[0]
            ],
            namespace_dict=self.namespaces)
        self.g_linked_id_endpoint = None
        self.g_linked_id_endpoint = self.ressource_version_endpoint.create_data_graph(
            namespace_list=[
                self.ns_linking_id[0], self.ns_linked_id[0],
                self.secondary_predicate[0], self.ns_endpoint[0], "obo",
                "dcterms"
            ],
            namespace_dict=self.namespaces)
        self.append_failure = None
        self.append_failure = list()
        self.available_linked_ids = 0

    def export_ressource_metatdata(self, out_dir, uri_targeted_ressources):
        """
        This function is used to export the metadata graph.
        - out_dir: a path to the output directory
        - uri_targeted_ressources: a list of URIs of the targeted resources. As the association graph provides links between two resources, it can be defined as a LinkSet. The targeted resources for which the graph provides associations are declared in the metadata graph.
        """
        path_out_1 = out_dir + self.ressource_version.ressource + "/" + self.ressource_version.version + "/"
        path_out_2 = out_dir + self.ressource_version_endpoint.ressource + "/" + self.ressource_version_endpoint.version + "/"
        # Add the information for the first resource:
        self.ressource_version.add_version_attribute(RDF["type"],
                                                     VOID["Linkset"])
        for uri_targeted_ressource in uri_targeted_ressources:
            self.ressource_version.add_version_attribute(
                VOID["target"], uri_targeted_ressource)
        self.ressource_version.add_version_attribute(
            DCTERMS["description"],
            rdflib.Literal(
                "This subset contains RDF triples providind link between Entrez Ids from the NCBI database "
                + self.dbfrom + " to the " + self.db + " database"))
        self.ressource_version.add_version_attribute(
            DCTERMS["title"],
            rdflib.Literal(self.dbfrom + " to " + self.db + " RDF triples"))
        # Add the information for the second resource, the endpoints:
        self.ressource_version_endpoint.add_version_attribute(
            DCTERMS["description"],
            rdflib.Literal(
                "This subset contains additionnal informations describing relations between Entrez Ids from the NCBI database "
                + self.dbfrom + " to the " + self.db + " database"))
        self.ressource_version_endpoint.add_version_attribute(
            DCTERMS["title"],
            rdflib.Literal(self.dbfrom + " to " + self.db +
                           " endpoint RDF triples"))
        # Export the metadata graph:
        print("Export version graph with metadata ...", end='')
        self.ressource_version.add_version_attribute(
            VOID["triples"],
            rdflib.Literal(self.n_triples_g_linked_id, datatype=XSD.long))
        self.ressource_version.add_version_attribute(
            VOID["distinctSubjects"],
            rdflib.Literal(self.n_subjects_g_linked_id, datatype=XSD.long))
        self.ressource_version_endpoint.add_version_attribute(
            VOID["triples"],
            rdflib.Literal(self.n_triples_g_linked_id_endpoint,
                           datatype=XSD.long))
        self.ressource_version_endpoint.add_version_attribute(
            VOID["distinctSubjects"],
            rdflib.Literal(self.n_subjects_g_linked_id_endpoint,
                           datatype=XSD.long))
        self.ressource_version.version_graph.serialize(destination=path_out_1 +
                                                       "void.ttl",
                                                       format='turtle')
        self.ressource_version_endpoint.version_graph.serialize(
            destination=path_out_2 + "void.ttl", format='turtle')
        print(" Ok")

    def create_ressource(self, out_dir, id_list, pack_size, query_builder,
                         max_size, add_f_out_path):
        """
        This function is used to create a new version of the CID_PMID and CID_PMID_endpoint resources, by building all the associated resource and data graphs from the information contained in the object.
        - out_dir: a path to a directory to write output files
        - id_list: a list of input Entrez identifiers that will be used as linking ids
        - pack_size: the size of the id packs sent per request; refer to https://eutils.ncbi.nlm.nih.gov/entrez/query/static/entrezlinks.html
        - query_builder: a eutils.QueryService object parameterized with cache, retmax, retmode, usehistory and especially the api_key
        - max_size: the maximal number of linked ids per exported file
        - add_f_out_path: the path to the directory where log files and additional files are written
        """
        # Initialize .log file:
        with open(add_f_out_path + "elink.log", "w") as f_log:
            pass
        # Create the output files:
        add_files_path = add_f_out_path + "additional_files/" + self.ressource_version.version + "/"
        if not os.path.exists(add_files_path):
            os.makedirs(add_files_path)
        # Prepare the output directories:
        path_out_1 = out_dir + self.ressource_version.ressource + "/" + self.ressource_version.version + "/"
        path_out_2 = out_dir + self.ressource_version_endpoint.ressource + "/" + self.ressource_version_endpoint.version + "/"
        if not os.path.exists(path_out_1):
            os.makedirs(path_out_1)
        if not os.path.exists(path_out_2):
            os.makedirs(path_out_2)
        id_packed_list = [
            id_list[i * pack_size:(i + 1) * pack_size]
            for i in range((len(id_list) + pack_size - 1) // pack_size)
        ]
        print("There are %d packed lists" % (len(id_packed_list)))
        # Reset the request-failure file and the attribute (after initializing packed_list, in case this is a run starting from self.request_failure):
        with open(add_files_path + "linking_ids_request_failed.txt", 'w') as f:
            pass
        self.request_failure = list()
        # Initialize the first two instances of the g_linked_id and g_linked_id_endpoint graphs:
        g_linked_id_name, g_linked_id_endpoint_name = self.ressource_version.ressource + "_" + str(
            self.file_index
        ), self.ressource_version_endpoint.ressource + "_" + str(
            self.file_index)
        for index_list in range(0, len(id_packed_list)):
            print(
                "-- Start getting pmids of list %d !\nTry to append compounds ..."
                % (index_list + 1))
            # Append the linked ids: if False is returned, the request failed; otherwise continue
            test_append = self.append_linked_ids(id_packed_list, index_list,
                                                 query_builder, pack_size,
                                                 add_f_out_path)
            if not test_append:
                print(
                    " <!!!> Fail <!!!> \n There was an issue while querying NCBI server, check parameters. Try to continue to the next packed list. All ids are exported to request failure file."
                )
                self.request_failure.extend(id_packed_list[index_list])
                with open(add_files_path + "linking_ids_request_failed.txt",
                          'a') as f_request_failure:
                    for id_fail in id_packed_list[index_list]:
                        f_request_failure.write("%s\n" % (id_fail))
                continue
            print(" Ok")
            if self.available_linked_ids > max_size or (
                    index_list == len(id_packed_list) - 1):
                if index_list == len(id_packed_list) - 1:
                    print(
                        "\t-> End was reached with %d new linking_id - linked_id association, start to export graph:"
                        % (self.available_linked_ids))
                else:
                    print(
                        "\t-> Maximal size (%d) was reached with %d new linking_id - linked_id association, start to export graph:"
                        % (max_size, self.available_linked_ids))
                print(
                    "\tTry to write and compress graph as .ttl in %s and %s ..."
                    % (path_out_1, path_out_2),
                    end='')
                # Export the graphs:
                try:
                    self.g_linked_id.serialize(destination=path_out_1 +
                                               g_linked_id_name + ".ttl",
                                               format='turtle')
                except Exception as e:
                    print(
                        "Error while trying to serialize linked id graph at " +
                        path_out_1 + g_linked_id_name + " : " + str(e))
                    sys.exit(3)
                try:
                    self.g_linked_id_endpoint.serialize(
                        destination=path_out_2 + g_linked_id_endpoint_name +
                        ".ttl",
                        format='turtle')
                except Exception as e:
                    print(
                        "Error while trying to serialize linked id graph endpoint at "
                        + path_out_2 + g_linked_id_endpoint_name + " : " +
                        str(e))
                    sys.exit(3)
                # Gzip the files and add the dataDump:
                try:
                    subprocess.run("gzip -f " + path_out_1 + g_linked_id_name +
                                   ".ttl" + " " + path_out_2 +
                                   g_linked_id_endpoint_name + ".ttl",
                                   shell=True,
                                   check=True,
                                   stderr=subprocess.PIPE)
                    self.ressource_version.add_DataDump(
                        g_linked_id_name + ".ttl.gz", self.ftp)
                    self.ressource_version_endpoint.add_DataDump(
                        g_linked_id_endpoint_name + ".ttl.gz", self.ftp)
                except subprocess.CalledProcessError as e:
                    print(
                        "Error while trying to compress files and add dataDump at "
                        + path_out_1 + g_linked_id_name + " and " +
                        path_out_2 + g_linked_id_endpoint_name + " : " +
                        str(e))
                    sys.exit(3)
                # Increment the numbers of subjects and triples:
                print(
                    " Ok\n\tIncrement numbers of triples and subjects from added triples ...",
                    end='')
                self.n_triples_g_linked_id += len(self.g_linked_id)
                self.n_triples_g_linked_id_endpoint += len(
                    self.g_linked_id_endpoint)
                self.n_subjects_g_linked_id += len(self.get_all_linking_ids())
                self.n_subjects_g_linked_id_endpoint += len(
                    self.get_all_linked_id_endpoints())
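                # Persist the running counts (triple counts then subject counts, for the linked-id graph and the endpoint graph) so they survive between iterations: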
                with open(add_files_path + "s_metdata.txt",
                          "w") as s_metadata_f:
                    s_metadata_f.write("%d\n" % (self.n_triples_g_linked_id))
                    s_metadata_f.write("%d\n" %
                                       (self.n_triples_g_linked_id_endpoint))
                    s_metadata_f.write("%d\n" % (self.n_subjects_g_linked_id))
                    s_metadata_f.write("%d\n" %
                                       (self.n_subjects_g_linked_id_endpoint))
                # Export the successful linking ids:
                print(" Ok\n\tTry to export successful linking ids in " +
                      add_files_path + "successful_linking_ids.txt ...",
                      end='')
                with open(add_files_path + "successful_linking_ids.txt",
                          'a') as f_success:
                    for success_id in self.get_all_linking_ids():
                        f_success.write("%s\n" % (success_id))
                print(
                    " Ok\n\tTry to export linking ids without linked_ids in " +
                    add_files_path + "linking_ids_without_linked_ids.txt ...",
                    end='')
                # Export the append failures:
                with open(
                        add_files_path + "linking_ids_without_linked_ids.txt",
                        'a') as f_append_failure:
                    for append_failure_id in self.append_failure:
                        f_append_failure.write("%s\n" % (append_failure_id))
                print(
                    " Ok\n\tTry to append new linked ids to the global set ...",
                    end='')
                self.all_linked_ids = self.all_linked_ids.union(
                    self.get_all_linked_ids())
                print(" Ok\n\tExport all linked ids ...", end='')
                # Use write ('w') instead of append ('a') to overwrite the file on each call of the function: only the union of linked_ids should be mapped, and appending could create duplicates when there are additional tries for request failures
                with open(add_files_path + "all_linked_ids.txt",
                          'w') as f_all_linked_ids:
                    for linked_id in self.all_linked_ids:
                        f_all_linked_ids.write("%s\n" % (linked_id))
                print(" Ok\n\tTry to clear objects for next iteration ...",
                      end='')
                # Clear the graphs and objects:
                self.clean()
                # Increment the file index:
                self.file_index += 1
                if index_list != len(id_packed_list) - 1:
                    print(" Ok\n\tCreate new graphs")
                    # Create two new graphs:
                    g_linked_id_name, g_linked_id_endpoint_name = self.ressource_version.ressource + "_" + str(
                        self.file_index
                    ), self.ressource_version_endpoint.ressource + "_" + str(
                        self.file_index)
        print(" Ok\nEnd !")
Exemplo n.º 10
def create_annotation_graph_ressource_version(path_to_annot_graphs_dir, version, ressource_name, desc, title, sources):
    """
    This function is used to create the ressource_info file associated with the version of the created annotation_graph.
    - path_to_annot_graphs_dir: a path to annotation graphs directory
    - version: the version of the annotation graph
    - ressource_name: the name of the associated ressource for the annotation graph
    - desc: a description of the graph
    - title: a title for the graph
    - sources: a list of URIs which were used to build the annotation graph
    """
    ressource_version = Database_ressource_version(ressource = ressource_name, version = version)
    n_triples = 0
    subjects = set()
    for annot_graph in os.listdir(path_to_annot_graphs_dir):
        if not annot_graph.endswith(".ttl"):
            continue
        if annot_graph == "void.ttl":
            continue
        sub_g = rdflib.ConjunctiveGraph()
        sub_g.parse(path_to_annot_graphs_dir + annot_graph, format = 'turtle')
        n_triples += len(sub_g)
        subjects = subjects.union(set([s for s in sub_g.subjects()]))
        ressource_version.add_DataDump(annot_graph)
    for source in sources:
        ressource_version.add_version_attribute(DCTERMS["source"], rdflib.URIRef(source))
    ressource_version.add_version_attribute(DCTERMS["description"], rdflib.Literal(desc))
    ressource_version.add_version_attribute(DCTERMS["title"], rdflib.Literal(title))
    ressource_version.add_version_attribute(VOID["triples"], rdflib.Literal(n_triples, datatype=XSD.long ))
    ressource_version.add_version_attribute(VOID["distinctSubjects"], rdflib.Literal(len(subjects), datatype=XSD.long ))
    ressource_version.version_graph.serialize(destination=path_to_annot_graphs_dir + "void.ttl", format = 'turtle')
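
# Hypothetical usage sketch: only the signature comes from the definition above,
# every argument value below is illustrative.
create_annotation_graph_ressource_version(
    path_to_annot_graphs_dir="annotation_graphs/",
    version="test-version",
    ressource_name="annotation_graph",
    desc="Test annotation graph",
    title="Test annotation graph",
    sources=["https://example.org/source_graph"])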
Exemplo n.º 11
ftp = config['FTP'].get('ftp')
path_to_share = args.out + "/"
path_out = args.log + "/"

# Init output and log files:
with open(path_out + "classyFire.log", "w") as f_log:
    pass

with open(path_out + "classyFire_error_ids.log", "w") as f_log:
    pass

with open(path_out + "ids_no_classify.log", "w") as f_log:
    pass

# Initialize the ressources:
ClassyFire_direct_p = Database_ressource_version(
    ressource="ClassyFire/direct-parent", version=version)
ClassyFire_alternative_p = Database_ressource_version(
    ressource="ClassyFire/alternative-parents", version=version)

# Initialize the paths where the graphs will be exported:
path_direct_p = path_to_share + ClassyFire_direct_p.ressource + "/" + ClassyFire_direct_p.version + "/"
path_alternative_p = path_to_share + ClassyFire_alternative_p.ressource + "/" + ClassyFire_alternative_p.version + "/"

# Check if a previous version already exists:
if (len(glob.glob(path_direct_p + "void.ttl")) == 1) and (len(
        glob.glob(path_alternative_p + "void.ttl")) == 1):
    print("This version already exist, skip computation.")
else:
    pmids_cids_graph_list = get_graph_list(path_to_share, "PMID_CID/",
                                           "*.ttl.gz")
    inchikeys_graph_list = get_graph_list(path_to_share,