def countExt():
    """Return the number of Lodex endpoints that have at least one extraction.

    Iterates every endpoint known to MongoDB and counts those whose
    extraction lookup (``getExtById``) comes back non-empty.
    """
    # Truthiness replaces the original `len(p) != 0`; equivalent because the
    # original called len() on the result, so it must be a sized container.
    return sum(
        1
        for end in mongo.getAllEndopoinLodex()
        if mongo.getExtById(end['_id'])
    )
def countExt_month_year(month, year):
    """Return how many endpoints' last extraction run falls in (month, year).

    Only endpoints that have at least one extraction are considered; for
    each, the date of the last run is compared against *month* and *year*.
    """
    count = 0
    for end in mongo.getAllEndopoinLodex():
        # Skip endpoints with no extraction at all (original: len(p) != 0).
        if not mongo.getExtById(end['_id']):
            continue
        last_run = mongo.getLastRunById(end['_id'])
        # getLastRunById can return None (see the batch script in this file);
        # the original would have crashed on e['date'] in that case.
        if last_run is None:
            continue
        run_date = last_run['date']
        if run_date.month == month and run_date.year == year:
            count += 1
    return count
def automaticExtraction(argv):
    """Run the extraction pipeline for one endpoint or for all of them.

    ``argv[0]`` is either the literal string ``'all'`` (process every
    endpoint in MongoDB) or an endpoint URL already stored in MongoDB.
    """
    if argv[0] == 'all':
        for end in mongo.getAllEndopoinLodex():
            endpointExtraction(end['_id'])
            print("Generating schema summary")
            # NOTE(review): this branch passes the raw _id while the URL
            # branch below passes [str(_id)] -- confirm which form
            # generateSS/generateCS actually expect.
            generateSS(end['_id'])
            generateCS(end['_id'])
    elif isinstance(argv[0], str):
        url = argv[0]
        end = mongo.getEndopointByUrl(url)
        # Dropped the original unused `p = mongo.getExtById(end['_id'])`:
        # its result was never read, so it was just a wasted DB round-trip.
        endpointExtraction(end['_id'])
        print("Generating schema summary ")
        generateSS([str(end['_id'])])
        generateCS([str(end['_id'])])
    else:
        print("Something awful happened")
from extractor import SchemaExtractorTestV3 as se
import threading
import time
from extractor.util import mongo
import pymongo as pm

# Launch one schema-extraction thread per endpoint whose last run has not
# reached the 'finish' phase, throttling to at most ~10 live threads.
threads = []
# TODO choose number of thread
for a in mongo.getAllEndopoinLodex():
    e = mongo.getLastRunById(a['_id'])
    logs = set()
    if e is not None:
        logs = set(l['phase'] for l in e['log'])
    if 'finish' not in logs:
        # print() call syntax works on both Python 2 and 3 for a single
        # argument; the original Py2-only `print x` statements were the only
        # ones in the file, the rest already uses print().
        print('------------------')
        print(len(threads))
        thread = threading.Thread(target=se.ExtractSchema, args=(a, False))
        thread.start()
        threads.append(thread)
        # Bug fix: the original `while len(threads) > 10: time.sleep(1)`
        # never removed finished threads, so after the 11th start it spun
        # forever. Prune dead threads each second so the throttle can
        # actually make progress.
        while len(threads) > 10:
            time.sleep(1)
            threads = [t for t in threads if t.is_alive()]
def downloadDataset(argv):
    """Discover datasets/endpoints from a SPARQL portal and register new ones.

    ``argv[0]`` is a SPARQL endpoint URL. Three known portals each get a
    dedicated discovery query; any other URL is treated as a single endpoint
    to test-connect and, if it is new, add to MongoDB and then extract.

    NOTE(review): the known-portal branch contains an unconditional
    ``return`` right after the debug ``pprint`` calls, so all of the
    registration logic below it in that branch is currently dead code --
    it looks like leftover debugging; confirm whether the return should go.
    """
    if argv[0] in [
        "https://www.europeandataportal.eu/sparql",
        "https://io.datascience-paris-saclay.fr/sparql",
        "http://data.europa.eu/euodp/sparqlep"
    ]:
        # Known portal: select the portal-specific discovery query.
        sparql = SPARQLWrapper(argv[0])
        q = util.queryGenerator.QueryGenerator()
        if argv[0] == "https://www.europeandataportal.eu/sparql":
            sparql.setQuery(q.EuDownload().query)
        elif argv[0] == "https://io.datascience-paris-saclay.fr/sparql":
            sparql.setQuery(q.dataScienceParisDownload().query)
        elif argv[0] == "http://data.europa.eu/euodp/sparqlep":
            sparql.setQuery(q.dataEuDownload().query)
        sparql.setReturnFormat(XML)
        """print(sparql)"""
        # print("Extraction endpoints")
        results = sparql.queryAndConvert()
        # print("Parsing results\n")
        pprint(results)
        pprint(
            se.parseResponseForDatasetExtr(None, results, "test_connection", False))
        print("-----")
        # NOTE(review): unconditional return -- everything below in this
        # branch never runs.
        return
        if se.parseResponseForDatasetExtr(None, results, "test_connection", False):
            endArr = []
            endDIct = {}
            # Group dataset titles by endpoint URL.
            for end in se.parseResponseForDatasetExtr(
                None, results, "test_connection", False
            ):
                # end is an object with the endpoint's 'dataset', 'title'
                # and 'url'.
                if 'title' in end:
                    if end['url'] in endDIct:
                        tmp = endDIct[end['url']]
                        tmp['name'].append(end['title'])
                        endDIct[end['url']] = tmp
                    else:
                        endDIct[end['url']] = {'name': [end['title']]}
            datasets = []
            urls = []
            count = mongo.getLastIdEndpointsLodex()
            copy = False
            # Keep only the URLs not already present in MongoDB.
            for key in endDIct:
                endpoint = mongo.getAllEndopoinLodex()
                for e in endpoint:
                    if e["url"] == key:
                        copy = True
                if copy == False:
                    ds = {}
                    ds = {
                        'url': key,
                        '_id': count,
                        'name': endDIct[key]['name'][0]
                    }
                    urls.append(key)
                    count = count + 1
                    ds['datasets'] = [{'name': endDIct[key]['name'][0]}]
                    datasets.append(ds)
                copy = False
            # String for parsing.
            print("Ricerca di nuovi dataset sul portale " + argv[0])
            print("Trovati " + str(len(datasets)) + " nuovi datasets")
            print(datasets)
            if len(datasets) > 0:
                mongo.inserLodexDatasets(datasets)
                for i in range(0, len(datasets)):
                    url = urls[i]
                    automaticExtraction([url])
    else:
        # Unknown URL: test it as a single endpoint.
        url = argv[0]
        sparql = SPARQLWrapper(url)
        q = util.queryGenerator.QueryGenerator()
        id = mongo.startTestNew(url)
        print(id)
        """in runInfo, id è il numero dentro a ObjectId"""
        copy = False
        count = mongo.getLastIdEndpointsLodex()
        datasets = []
        if se.testConnection(url, q, sparql, id):
            # Check whether the endpoint is already registered.
            endpoint = mongo.getAllEndopoinLodex()
            for e in endpoint:
                if e["url"] == url:
                    copy = True
            if copy == False:
                # New endpoint: derive its display name from the part of the
                # URL after '//'.
                ds = {}
                trash, name = itemgetter(0, 1)(url.split('//', 1))
                ds = {'url': url, '_id': count, 'name': name}
                datasets.append(ds)
            else:
                print("-----")
                print(url + " e' un endpoint valido, ")
                print("ma e' gia' presente in MongoDB; non lo aggiungo.")
                print("L'estrazione viene evitata in quanto sarebbe inutile.")
        else:
            print("-----")
            print(
                url +
                " non e' un endpoint valido o non e' al momento raggiungibile."
            )
            print("Estrazione fallita.")
        if len(datasets) > 0:
            # Register the new endpoint, clear any stale extraction for this
            # id, then run the full extraction pipeline on it.
            mongo.inserLodexDatasets(datasets)
            mongo.deleteExtById(count)
            print(datasets)
            automaticExtraction([argv[0]])
            print("-----")
            print(url + " e' un endpoint valido, ")
            print("non presente su MongoDB; lo aggiungo.")
            print("Estrazione andata a buon fine.")
def countDataset():
    """Return the total number of Lodex endpoints stored in MongoDB."""
    # Count by consuming the iterable once; identical to the original
    # manual `count = count + 1` loop.
    return sum(1 for _ in mongo.getAllEndopoinLodex())
from extractor import SchemaExtractorTestV3 as se import threading import time from extractor.util import mongo import pymongo as pm threads = [] # TODO choose number of thread for a in mongo.getAllEndopoinLodex(): # time.sleep(1) e = mongo.getLastRunById(a['_id']) logs = set() if e is not None: logs = set([l['phase'] for l in e['log']]) if 'finish' not in logs: print '------------------' print len(threads) thread = threading.Thread(target=se.ExtractSchema, args=(a, False)) thread.start() threads.append(thread) while len(threads) > 10: time.sleep(1) for t in threads: