Пример #1
0
    def executeQuery(self, query, queue=Queue(), limit=-1, offset=0):

        if len(self.mappings) == 0:
            print("Empty Mapping")
            queue.put('EOF')
            return []

        self.query = qp.parse(query)
        self.prefixes = getPrefs(self.query.prefs)

        if limit > -1 or offset > -1:
            self.query.limit = limit
            self.query.offset = offset

        pipeline, db, col, coltotemplates, projvartocols = self.translate()
        if self.query.limit > 0:
            if self.query.offset > 0:
                pipeline.append({"$limit": int(self.query.limit) + int(self.query.offset)})
                pipeline.append({"$skip": int(self.query.offset)})
            else:
                pipeline.append({"$limit": int(self.query.limit)})
        # print("MongoDB query pipeline:")
        # pprint(pipeline)
        db, collection = self.mongo_client.get_db_coll_obj(db, col)
        result = collection.aggregate(pipeline, useCursor=True, batchSize=1000, allowDiskUse=True)
        for doc in result:
            for r in doc:
                if isinstance(doc[r], int):
                    doc[r] = str(doc[r])
                if r in projvartocols and r in coltotemplates:
                    doc[r] = coltotemplates[r].replace('{' + projvartocols[r] + '}', doc[r].replace(" ", '_'))
            queue.put(doc)
        queue.put("EOF")

        return result
Пример #2
0
 def __init__(self, sparql, mapping, datasource, rdfmts, star):
     self.mapping = mapping
     self.sparql = qp.parse(sparql)
     self.datasource = datasource
     self.rdfmts = rdfmts
     self.star = star
     self.prefixes = getPrefs(self.sparql.prefs)
Пример #3
0
 def __init__(self, query, config, pushdownssqjoins=True):
     if isinstance(query, str):
         self.query = queryParser.parse(query)
     else:
         self.query = query
     self.prefixes = utils.getPrefs(self.query.prefs)
     self.config = config
     self.relevant_mts = {}
     self.decomposition = {}
     self.pushdownssqjoins = pushdownssqjoins
Пример #4
0
    def executeQuery(self, query, queue=Queue(), limit=-1, offset=0):

        self.query = qp.parse(query)
        if limit > -1 or offset > -1:
            self.query.limit = limit
            self.query.offset = offset

        sparql = self.query

        pipeline, db, col, coltotemplates, projvartocols = self.translate()
        '''mquery, mproj, cmpquery = self.rewrite(sparql)
        mproj["_id"] = 0

        pipeline = []
        if len(mquery) > 0:
            pipeline.append({"$match": mquery})

        pipeline.append({"$project": mproj})
        '''
        if sparql.limit > 0:
            if sparql.offset > 0:
                pipeline.append({"$limit": int(sparql.limit) + int(sparql.offset)})
                pipeline.append({"$skip": int(sparql.offset)})
            else:
                pipeline.append({"$limit": int(sparql.limit)})
        print(pipeline)
        db, collection = self.mongo_client.get_db_coll_obj(db, col)
        result = collection.aggregate(pipeline, useCursor=True, batchSize=1000, allowDiskUse=True)
        for doc in result:
            for r in doc:
                if isinstance(doc[r], int):
                    doc[r] = str(doc[r])
                if projvartocols[r] in coltotemplates:
                    doc[r] = coltotemplates[projvartocols[r]].replace('{' + projvartocols[r] + '}', doc[r].replace(" ", '_'))
            queue.put(doc)
        queue.put("EOF")

        return result
Пример #5
0
    def executeQuery(self, query, queue=Queue(), limit=-1, offset=0):
        """
        Entry point for query execution on csv files
        :param querystr: string query
        :return:
        """
        if len(self.mappings) == 0:
            print("Empty Mapping")
            queue.put('EOF')
            return []
        # querytxt = query
        self.query = qp.parse(query)
        self.prefixes = getPrefs(self.query.prefs)

        if limit > -1 or offset > -1:
            self.query.limit = limit
            self.query.offset = offset
        ds = self.star['datasource']

        sqlquery, coltotemplates, projvartocols, filenametablename, filenameiteratormap = self.translate(
        )
        print(sqlquery)
        try:
            if self.username is None:
                self.mysql = connector.connect(user='******', host=self.url)
            else:
                self.mysql = connector.connect(user=self.username,
                                               password=self.password,
                                               host=self.host,
                                               port=self.port)
        except connector.Error as err:
            if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
                print("Something is wrong with your user name or password")
            elif err.errno == errorcode.ER_BAD_DB_ERROR:
                print("Database does not exist")
            else:
                print(err)
        except Exception as ex:
            print("Exception while connecting to Mysql", ex)

        try:
            cursor = self.mysql.cursor()
            db = ""
            for fn in filenameiteratormap:
                db = filenameiteratormap[fn]['iterator']
            cursor.execute("use " + db)
            cursor.execute(sqlquery)
            header = [h[0] for h in cursor._description]

            for line in cursor:
                row = {}
                res = {}
                skip = False
                for i in range(len(line)):
                    row[header[i]] = str(line[i])

                    for r in row:
                        if row[r] == 'null':
                            skip = True
                            break
                        if '_' in r and r[:r.find("_")] in projvartocols:
                            s = r[:r.find("_")]
                            if s in res:
                                val = res[s]
                                res[s] = val.replace(
                                    '{' + r[r.find("_") + 1:] + '}',
                                    row[r].replace(" ", '_'))
                            else:
                                res[s] = coltotemplates[s].replace(
                                    '{' + r[r.find("_") + 1:] + '}',
                                    row[r].replace(" ", '_'))
                        elif r in projvartocols and r in coltotemplates:
                            res[r] = coltotemplates[r].replace(
                                '{' + projvartocols[r] + '}',
                                row[r].replace(" ", '_'))
                        else:
                            res[r] = row[r]
                        # if '_' in r and r[:r.find("_")] in projvartocols:
                        #     s = r[:r.find("_")]
                        #     p = r[r.find("_") + 1:]
                        #     if s in res:
                        #         val = res[s]
                        #         res[s] = val.replace('{' + p + '}', row[r].replace(" ", '_'))
                        #     else:
                        #         if '[' in coltotemplates[s]:
                        #             coltotemplates[s] = coltotemplates[s].replace(
                        #                 coltotemplates[s][coltotemplates[s].rfind('['): coltotemplates[s].rfind(']') + 1],
                        #                 '')
                        #         if '[' in coltotemplates[s]:
                        #             coltotemplates[s] = coltotemplates[s].replace(
                        #                 coltotemplates[s][coltotemplates[s].rfind('['): coltotemplates[s].rfind(']') + 1],
                        #                 '')
                        #         res[s] = coltotemplates[s].replace('{' + p + '}', row[r].replace(" ", '_'))
                        # elif r in projvartocols and r in coltotemplates:
                        #     if '[' in coltotemplates[r]:
                        #         coltotemplates[r] = coltotemplates[r].replace(
                        #             coltotemplates[r][coltotemplates[r].find('['): coltotemplates[r].find(']') + 1], '')
                        #     if '[' in coltotemplates[r]:
                        #         coltotemplates[r] = coltotemplates[r].replace(
                        #             coltotemplates[r][coltotemplates[r].find('['): coltotemplates[r].find(']') + 1], '')
                        #
                        #     p = coltotemplates[r][coltotemplates[r].find('{') + 1: coltotemplates[r].find('}')]
                        #
                        #     res[r] = coltotemplates[r].replace('{' + p + '}', row[r].replace(" ", '_'))
                        # else:
                        #     res[r] = row[r]
                if not skip:
                    queue.put(res)
        except Exception as e:
            print("Exception ", e)
            pass

        queue.put("EOF")
Пример #6
0
    def executeQuery(self, query, queue=Queue(), limit=-1, offset=0):
        """
        Entry point for query execution on csv files
        :param querystr: string query
        :return:
        """
        from time import time
        # start = time()
        # print("Start:", start)
        if len(self.mappings) == 0:
            print("Empty Mapping")
            queue.put('EOF')
            return []
        # querytxt = query
        self.query = qp.parse(query)
        self.prefixes = getPrefs(self.query.prefs)

        query_filters = [f for f in self.query.body.triples[0].triples if isinstance(f, Filter)]

        if limit > -1 or offset > -1:
            self.query.limit = limit
            self.query.offset = offset

        sqlquery, projvartocols, coltotemplates, filenametablename = self.translate(query_filters)
        # print(sqlquery)
        # totalres = 0
        if sqlquery is None or len(sqlquery) == 0:
            queue.put("EOF")
            return []
        try:
            start = time()
            try:
                self.drill = PyDrill(host=self.host, port=self.port)
            except Exception as ex:
                print("Exception while connecting to Drill", ex)
                queue.put("EOF")
                return
            if not self.drill.is_active():
                print('Exception: Please run Drill first')
                queue.put("EOF")
                return
            # print("Drill Initialization cost:", time() - start)
            logger.info("Drill Initialization cost:" + str(time() - start))
            start = time()
            if isinstance(sqlquery, list):
                sqlquery = [sql for sql in sqlquery if sql is not None and len(sql) > 0]
                if len(sqlquery) > 3:
                    sqlquery = " UNION ".join(sqlquery)
            if isinstance(sqlquery, list):
                sqlquery = [sql for sql in sqlquery if sql is not None and len(sql) > 0]
                # logger.info(" UNION ".join(sqlquery))
                processqueues = []
                processes = []
                res_dict = []
                for sql in sqlquery:
                    # processquery = Queue()
                    # self.run_union(sql, queue, projvartocols, coltotemplates, limit, processquery, res_dict)
                    # print(sql)
                    processquery = Queue()
                    processqueues.append(processquery)
                    p = Process(target=self.run_union, args=(sql, queue, projvartocols, coltotemplates, limit, processquery, res_dict,))
                    p.start()
                    processes.append(p)

                while len(processqueues) > 0:
                    toremove = []
                    try:
                        for q in processqueues:
                            if q.get(False) == 'EOF':
                                toremove.append(q)
                        for p in processes:
                            if p.is_alive():
                                p.terminate()
                    except:
                        pass
                    for q in toremove:
                        processqueues.remove(q)
                logger.info("Done running:")
                sw = " UNION ".join(sqlquery)
                logger.info(sw)
            else:
                card = 0
                # if limit == -1:
                limit = 1000
                if offset == -1:
                    offset = 0
                logger.info(sqlquery)
                # print(sqlquery)
                while True:
                    query_copy = sqlquery + " LIMIT " + str(limit) + " OFFSET " + str(offset)
                    cardinality = self.process_result(query_copy, queue, projvartocols, coltotemplates)
                    card += cardinality
                    if cardinality < limit:
                        break

                    offset = offset + limit
            # print("Exec in Drill took:", time() - start)
            logger.info("Exec in Drill took:" + str(time() - start))
        except Exception as e:
            print("Exception ", e)
            pass
        # print('End:', time(), "Total results:", totalres)
        # print("Drill finished after: ", (time()-start))
        queue.put("EOF")
Пример #7
0
    def executeQuery(self, query, queue, limit=-1, offset=0):
        """
        Entry point for query execution on csv files
        :param querystr: string query
        :return:
        """
        from time import time
        if len(self.mappings) == 0:
            print("Empty Mapping")
            queue.put('EOF')
            return []
        # querytxt = query
        # print(query)
        self.query = qp.parse(query)
        self.prefixes = getPrefs(self.query.prefs)

        # print("Connection time: ", time()-start)
        query_filters = [
            f for f in self.query.body.triples[0].triples
            if isinstance(f, Filter)
        ]

        # if limit > -1 or offset > -1:
        #     self.query.limit = limit
        #     self.query.offset = offset
        start = time()
        sqlquery, projvartocols, coltotemplates, filenametablename = self.translate(
            query_filters)
        # print(sqlquery)
        if sqlquery is None or len(sqlquery) == 0:
            queue.put("EOF")
            return []

        # print(sqlquery, '\tTranslate took: ', time() - start)
        try:
            if isinstance(sqlquery, list):
                sqlquery = [
                    sql for sql in sqlquery if sql is not None and len(sql) > 0
                ]
                if len(sqlquery) > 3:
                    sqlquery = " UNION ".join(sqlquery)
            if isinstance(sqlquery, list):
                try:
                    sqlquery = [
                        sql for sql in sqlquery
                        if sql is not None and len(sql) > 0
                    ]
                    # logger.info(" UNION ".join(sqlquery))
                    processqueues = []
                    processes = []
                    res_dict = []
                    # logger.info("UNION started" + str(len(sqlquery)))
                    for sql in sqlquery:
                        processquery = Queue()
                        # self.run_union(sql, filenametablename, queue, projvartocols, coltotemplates, limit, processquery,res_dict)
                        processqueues.append(processquery)
                        p = Process(target=self.run_union,
                                    args=(
                                        sql,
                                        filenametablename,
                                        queue,
                                        projvartocols,
                                        coltotemplates,
                                        limit,
                                        processquery,
                                        res_dict,
                                    ))
                        p.daemon = True
                        p.start()
                        processes.append(p)

                    while len(processqueues) > 0:
                        toremove = []
                        try:
                            for q in processqueues:
                                v = q.get(False)
                                if v == 'EOF':
                                    toremove.append(q)
                            for p in processes:
                                if p.is_alive():
                                    p.terminate()
                        except Empty:
                            pass
                        except Exception as e:
                            print("Exception: ", e, len(processqueues))
                            pass
                        for q in toremove:
                            processqueues.remove(q)
                    # logger.info("UNION Finished")
                    logger.info("MySQL Done running UNION:")
                    sw = " UNION ".join(sqlquery)
                    logger.info(sw)
                except Exception as e:
                    logger.error(" UNION ".join(sqlquery))
                    logger.error("Exception while running query" + str(e))
                    pass
                except IOError:
                    logger.error("IOError while running query")
                    pass
            else:
                try:
                    if self.username is None:
                        self.mysql = connector.connect(user='******',
                                                       host=self.url)
                    else:
                        self.mysql = connector.connect(user=self.username,
                                                       password=self.password,
                                                       host=self.host,
                                                       port=self.port)
                except connector.Error as err:
                    if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
                        print(
                            "Something is wrong with your user name or password"
                        )
                        logger.error(
                            "Something is wrong with your user name or password"
                        )
                    elif err.errno == errorcode.ER_BAD_DB_ERROR:
                        print("Database does not exist")
                        logger.error("Database does not exist")
                    else:
                        print(err)
                        logger.error("Error:" + str(err))
                    queue.put('EOF')
                    return
                except Exception as ex:
                    print("Exception while connecting to Mysql", ex)
                    logger.error("Exception while connecting to Mysql" +
                                 str(ex))
                    queue.put('EOF')
                    return

                cursor = self.mysql.cursor()
                db = filenametablename
                cursor.execute("use " + db + ';')
                card = 0
                # if limit == -1:
                limit = 100
                if offset == -1:
                    offset = 0
                logger.info(sqlquery)
                # print(sqlquery)
                try:
                    # rs = time()
                    while True:
                        query_copy = sqlquery + " LIMIT " + str(
                            limit) + " OFFSET " + str(offset)
                        cursor.execute(query_copy)
                        cardinality = self.process_result(
                            cursor, queue, projvartocols, coltotemplates)
                        card += cardinality
                        # if (time()-rs) > 20:
                        #     print(card, 'results found -..')
                        if cardinality < limit:
                            break

                        offset = offset + limit
                    logger.info("Running query: " + str(sqlquery) +
                                " Non UNION DONE" + str(card))
                except Exception as e:
                    print("EXception: ", e)
                    logger.error("Exception while running query" + str(e))
                    pass
                except IOError as ie:
                    print("IO ERROR:", ie)
                    logger.error("IOError while running query" + str(ie))
                    pass
        except Exception as e:
            print("Exception ", e)
            pass
        logger.info("Running query: " + str(query) + " DONE")
        logger.info("MySQL finished after: " + str(time() - start))
        queue.put("EOF")
Пример #8
0
    def executeQuery(self, query, queue=Queue(), limit=-1, offset=0):

        if len(self.mappings) == 0:
            print("Empty Mapping")
            queue.put('EOF')
            return []

        self.query = qp.parse(query)
        self.prefixes = getPrefs(self.query.prefs)

        if limit > -1 or offset > -1:
            self.query.limit = limit
            self.query.offset = offset
        cypher, coltotemplates, projvartocols = self.translate()
        print("Cypher: ", cypher)
        print(coltotemplates, '||', projvartocols)
        if cypher is not None and len(cypher) > 5:
            session = self.neo_client.driver.session()
            results = session.run(cypher)
            for r in results:
                rr = dict(r)
                # check if collect is used in one of the variables
                lstps = []
                for p in rr:
                    if isinstance(rr[p], list):
                        lstps.append(p)
                resulsts = []
                if len(lstps) > 0:
                    reslist = [rr.copy()]
                    for p1 in lstps:
                        rrp1 = rr[p1]
                        for v in rrp1:
                            for res in reslist:
                                res[p1] = v
                                resulsts.append(res.copy())
                        reslist = resulsts
                        resulsts = []
                    resulsts = reslist
                    for row in resulsts:
                        res = {}
                        for r in row:
                            if '_' in r and r[:r.find("_")] in projvartocols:
                                s = r[:r.find("_")]
                                if s in res:
                                    val = res[s]
                                    res[s] = val.replace(
                                        '{' + r[r.find("_") + 1:] + '}',
                                        row[r].replace(" ", '_'))
                                else:
                                    res[s] = coltotemplates[s].replace(
                                        '{' + r[r.find("_") + 1:] + '}',
                                        row[r].replace(" ", '_'))

                            elif r in projvartocols and r in coltotemplates:
                                res[r] = coltotemplates[r].replace(
                                    '{' + projvartocols[r] + '}',
                                    row[r].replace(" ", '_'))
                            else:
                                res[r] = row[r]
                        queue.put(res)
                else:
                    res = {}
                    row = rr
                    for r in row:
                        if '_' in r and r[:r.find("_")] in projvartocols:
                            s = r[:r.find("_")]
                            if s in res:
                                val = res[s]
                                res[s] = val.replace(
                                    '{' + r[r.find("_") + 1:] + '}',
                                    row[r].replace(" ", '_'))
                            else:
                                res[s] = coltotemplates[s].replace(
                                    '{' + r[r.find("_") + 1:] + '}',
                                    row[r].replace(" ", '_'))

                        elif r in projvartocols and r in coltotemplates:
                            res[r] = coltotemplates[r].replace(
                                '{' + projvartocols[r] + '}',
                                row[r].replace(" ", '_'))
                        else:
                            res[r] = row[r]
                    queue.put(res)

        queue.put("EOF")
        return []
Пример #9
0
    def executeQuery(self, query, queue=Queue(), limit=-1, offset=0):
        """
        Entry point for query execution on csv files
        :param querystr: string query
        :return:
        """
        from time import time
        if len(self.mappings) == 0:
            print("Empty Mapping")
            queue.put('EOF')
            return []
        # querytxt = query
        self.query = qp.parse(query)
        self.prefixes = getPrefs(self.query.prefs)

        # query_filters = [f for f in self.query.body.triples[0].triples if isinstance(f, Filter)]

        if limit > -1 or offset > -1:
            self.query.limit = limit
            self.query.offset = offset

        sparql2sql = SPARQL2SQL(query, self.mappings, self.datasource,
                                self.rdfmts, self.star)

        sqlquery, projvartocols, coltotemplates, filenametablename = sparql2sql.translate(
        )
        # sqlquery, projvartocols, coltotemplates, filenametablename = self.translate(query_filters)
        # print(sqlquery)
        if self.spark is None:
            # url = 'spark://node3.research.tib.eu:7077' # self.mapping['url']
            # params = {
            #     "spark.driver.cores": "4",
            #     "spark.executor.cores": "4",
            #     "spark.cores.max": "6",
            #     "spark.default.parallelism": "4",
            #     "spark.executor.memory": "6g",
            #     "spark.driver.memory": "6g",
            #     "spark.driver.maxResultSize": "6g",
            #     "spark.python.worker.memory": "4g",
            #     "spark.local.dir": "/tmp",
            #     "spark.debug.maxToStringFields": "500"
            # }
            params = self.params
            start = time()
            # self.config['params']
            self.spark = SparkSession.builder.master('local[*]') \
                .appName("OntarioSparkWrapper" + str(self.datasource.url) + query)
            for p in params:
                self.spark = self.spark.config(p, params[p])

            self.spark = self.spark.getOrCreate()
            logger.info("SPARK Initialization cost:" + str(time() - start))
        start = time()

        for filename, tablename in filenametablename.items():
            # schema = self.make_schema(schemadict[filename])
            # filename = "hdfs://node3.research.tib.eu:9000" + filename
            # filename = self.datasource.url + "/" + filename
            # filename = "/media/kemele/DataHD/LSLOD-flatfiles/" + filename
            # print(filename, tablename)
            if self.datasource.dstype == DataSourceType.LOCAL_JSON or \
                    self.datasource.dstype == DataSourceType.SPARK_JSON:
                df = self.spark.read.json(filename)
            elif self.datasource.dstype == DataSourceType.HADOOP_JSON:
                filename = "hdfs://node3.research.tib.eu:9000" + filename
                df = self.spark.read.json(filename)
            elif self.datasource.dstype == DataSourceType.HADOOP_TSV or \
                    self.datasource.dstype == DataSourceType.HADOOP_CSV:
                filename = "hdfs://node3.research.tib.eu:9000" + filename
                df = self.spark.read.csv(filename,
                                         inferSchema=True,
                                         sep='\t' if self.datasource.dstype
                                         == DataSourceType.HADOOP_TSV else ',',
                                         header=True)
            else:
                df = self.spark.read.csv(
                    filename,
                    inferSchema=True,
                    sep='\t'
                    if self.datasource.dstype == DataSourceType.LOCAL_TSV
                    or self.datasource.dstype == DataSourceType.SPARK_TSV else
                    ',',
                    header=True)

            df.createOrReplaceTempView(tablename)

        logger.info("time for reading file" + str(filenametablename) +
                    str(time() - start))

        totalres = 0
        try:
            runstart = time()
            if isinstance(sqlquery, list):  # and len(sqlquery) > 3:
                sqlquery = " UNION ".join(sqlquery)
            # print(sqlquery)
            if isinstance(sqlquery, list):
                logger.info(" UNION ".join(sqlquery))
                res_dict = []
                for sql in sqlquery:
                    cardinality = self.process_result(sql, queue,
                                                      projvartocols,
                                                      coltotemplates, res_dict)
                    totalres += cardinality
            else:
                logger.info(sqlquery)
                cardinality = self.process_result(sqlquery, queue,
                                                  projvartocols,
                                                  coltotemplates)
                totalres += cardinality
            logger.info("Exec in SPARK took:" + str(time() - runstart))
        except Exception as e:
            print("Exception ", e)
            pass
        # print('SPARK End:', time(), "Total results:", totalres)
        # print("SPARK finished after: ", (time()-start))
        queue.put("EOF")
        self.spark.stop()