Exemplo n.º 1
0
def QueryExe(hql, name, dates):
    """Run one Hive query over Thrift and return every fetched row.

    :param hql: HiveQL statement passed to ``client.execute``.
    :param name: first component of the lock-file name.
    :param dates: second component of the lock-file name.
    :return: the value returned by ``client.fetchAll()``.

    On a Thrift error the message is logged, the lock file
    ``join(lpath, name + '_' + dates + '.lock')`` is removed and the
    process exits with status 1.
    """
    lock_file = join(lpath, name + '_' + dates + '.lock')
    try:
        transport = TTransport.TBufferedTransport(TSocket.TSocket(ips, 10001))
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        try:
            transport.open()
            logger.info('Query sql is:\n%s', hql)
            client.execute(hql)
            query = client.fetchAll()
            logger.info('Query sql result is:\n%s', query)
            return query
        finally:
            # Runs on success and on failure, so the socket is never left
            # open when execute()/fetchAll() raises.
            transport.close()
    except Thrift.TException as tx:
        logger.error(u'程序执行过程中发生异常, 错误信息如下\n%s', tx.message)
        os.remove(lock_file)
        logger.error(u'程序正在退出. 删除锁文件  %s', lock_file)
        sys.exit(1)
Exemplo n.º 2
0
def hiveExe(sql):
    """Execute ``sql`` on the configured Hive server and return all rows.

    Connects to ``hive_server_ip``:``hive_server_port``.

    :param sql: HiveQL statement passed to ``client.execute``.
    :return: the value returned by ``client.fetchAll()``; ``None`` when a
        Thrift error occurred (its message is printed instead).
    """
    try:
        transport = TTransport.TBufferedTransport(
            TSocket.TSocket(hive_server_ip, hive_server_port))
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        try:
            transport.open()
            client.execute(sql)
            return client.fetchAll()
        finally:
            # Close the connection even when execute()/fetchAll() raises.
            transport.close()
    except Thrift.TException as tx:
        print('%s' % (tx.message))
Exemplo n.º 3
0
def HiveExe(hql, name, dates):
    """Execute a sequence of Hive statements over one Thrift connection.

    :param hql: iterable of HiveQL statements, executed in order; results
        are not fetched.
    :param name: first component of the lock-file name.
    :param dates: second component of the lock-file name.

    On a Thrift error the message is logged, the lock file
    ``join(lpath, name + '_' + dates + '.lock')`` is removed and the
    process exits with status 1.
    """
    lock_file = join(lpath, name + '_' + dates + '.lock')
    try:
        transport = TTransport.TBufferedTransport(TSocket.TSocket(ips, 10001))
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        try:
            transport.open()
            for sql in hql:
                logger.info('Executive sql is:\n%s', sql)
                client.execute(sql)
                logger.info('Successful implementation of this Sql')
        finally:
            # Close the connection even when one of the statements fails.
            transport.close()
    except Thrift.TException as tx:
        logger.error(u'程序执行过程中发生异常, 错误信息如下\n%s', tx.message)
        os.remove(lock_file)
        logger.error(u'程序正在退出. 删除锁文件  %s', lock_file)
        sys.exit(1)
Exemplo n.º 4
0
def execsql(sql):
    """Execute ``sql`` on the Hive server from ``conf['hive']`` and print the rows.

    :param sql: HiveQL statement passed to ``client.execute``.
    :return: ``True`` when the statement ran and its result was printed,
        ``False`` when a Thrift error occurred (its message is printed).
    """
    try:
        transport = TTransport.TBufferedTransport(
            TSocket.TSocket(conf['hive']['host'], conf['hive']['port']))
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        try:
            transport.open()
            print("hive connect")

            client.execute(sql)
            print(client.fetchAll())
        finally:
            # Close the connection on the error path as well.
            transport.close()
        print("close hive connect")
        return True

    except Thrift.TException as tx:
        print('%s' % (tx.message))
        return False
Exemplo n.º 5
0
    def query(self, hsql, callback):
        """Run ``hsql`` against ``self.host``:``self.port`` and hand the rows to ``callback``.

        :param hsql: HiveQL statement passed to ``client.execute``.
        :param callback: called with the value of ``client.fetchAll()`` on
            success, or with ``None`` when a Thrift error occurred (the
            error message is then printed).
        """
        try:
            transport = TTransport.TBufferedTransport(
                TSocket.TSocket(self.host, self.port))
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
            client = ThriftHive.Client(protocol)
            try:
                transport.open()
                # Fetch the data records from the table
                client.execute(hsql)
                callback(client.fetchAll())
            finally:
                # Close the connection even when the query or callback raises.
                transport.close()

        except Thrift.TException as tx:
            callback(None)
            print('%s' % (tx.message))


#
# app=hiveDB('182.92.183.76',9084)
# app.query()
Exemplo n.º 6
0
def executeSql(host, command):
    """Run every ';'-separated statement in ``command`` on ``host``:10000.

    :param host: Hive server host name or address.
    :param command: text containing one or more statements separated by ';'.
    :return: a flat list holding, for each non-empty statement, its fetched
        rows followed by a ``"----------Time: ...s----------"`` marker. If
        any exception is raised, a one-element list with ``str(exception)``
        is returned instead.
    """
    try:
        transport = TTransport.TBufferedTransport(TSocket.TSocket(host, 10000))
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        try:
            transport.open()
            result = []
            # NOTE(review): CRLF pairs are deleted, not replaced by a space,
            # so tokens on either side of a line break are joined together --
            # confirm this is intended.
            for sql in command.replace("\r\n", "").split(";"):
                sql = sql.strip()
                if not sql:
                    continue
                start = time.time()
                client.execute(sql)
                lines = client.fetchAll()
                end = time.time()
                # Grow the list in place instead of rebuilding it per statement.
                result.extend(lines)
                result.append("----------Time: %.3fs----------" % (end - start))
            return result
        finally:
            # Close the connection even when a statement fails.
            transport.close()
    except Exception as e:
        return [str(e)]
Exemplo n.º 7
0
    def __init__(self, server='localhost', port=10001, db='default'):
        """Initialize the Hive Client.

        :parameter server(string): server to connect to. Default- localhost
        :parameter port(int): port to connect to. Default- 10001
        :parameter db(string): database name. Default- default

        :return: None
        :raises AssertionError: if ``get_database(db)`` returns a falsy value.

        """
        transport = TSocket.TSocket(server, port)
        self.__transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(self.__transport)

        self.__client = ThriftHive.Client(protocol)

        self.__db = db

        # make sure this DB exists!
        # An explicit check is used instead of ``assert`` so the lookup still
        # runs under ``python -O``; AssertionError is kept so existing callers
        # see the same exception type.
        with openclose(self.__transport):
            if not self.__client.get_database(db):
                raise AssertionError('Hive database not found: %s' % (db,))
Exemplo n.º 8
0
def fetch_db_info_from_hive(hive_server_addr, port=10000):
    """Collect the column names and types of every table in every database.

    Runs ``show databases``, then for each database ``use <db>`` and
    ``show tables``, then ``describe <table>`` for each table.

    :param hive_server_addr: Hive server host name or address.
    :param port: Thrift port of the Hive server.
    :return: ``{database: {table: {column_name: column_type}}}``; ``None``
        when a Thrift error occurred (its message is printed instead).
    """
    try:
        transport = TTransport.TBufferedTransport(
            TSocket.TSocket(hive_server_addr, port))
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        try:
            transport.open()

            # Fetch databases
            client.execute("show databases")
            dbs = client.fetchAll()

            # Fetch tables
            db_tbl_map = {}
            for db in dbs:
                client.execute("use " + db)
                client.execute("show tables")
                tbls = client.fetchAll()

                tbl_col_map = {}
                for tbl in tbls:
                    col_map = {}

                    # Fetch table column name and type
                    client.execute("describe " + tbl)
                    for col in client.fetchAll():
                        words = col.split()
                        # Blank rows and '#' header rows (e.g. the partition
                        # section of a describe listing) are not columns;
                        # indexing words[1] on them would raise IndexError
                        # or record a bogus '#' column.
                        if len(words) < 2 or words[0].startswith('#'):
                            continue
                        col_map[words[0]] = words[1]

                    tbl_col_map[tbl] = col_map
                db_tbl_map[db] = tbl_col_map

            return db_tbl_map
        finally:
            # Close the connection even when one of the statements fails.
            transport.close()

    except Thrift.TException as tx:
        print('%s' % (tx.message))
Exemplo n.º 9
0
    def get_metastore_client(self):
        """
        Build and return a ThriftHive client for ``self.metastore_conn``.

        The socket targets the connection's host and port. When the 'core'
        'security' setting is 'kerberos' and the connection's
        'authMechanism' extra is 'GSSAPI' (the default in that mode), the
        socket is wrapped in a SASL transport; otherwise a plain buffered
        transport is used. The transport is not opened here.
        """
        from thrift.transport import TSocket, TTransport
        from thrift.protocol import TBinaryProtocol
        from hive_service import ThriftHive

        conn = self.metastore_conn
        kerberos_enabled = configuration.get('core', 'security') == 'kerberos'

        # The default auth mechanism depends on whether kerberos is enabled.
        if kerberos_enabled:
            auth_mechanism = conn.extra_dejson.get('authMechanism', 'GSSAPI')
            kerberos_service_name = conn.extra_dejson.get(
                'kerberos_service_name', 'hive')
        else:
            auth_mechanism = conn.extra_dejson.get('authMechanism', 'NOSASL')

        socket = TSocket.TSocket(conn.host, conn.port)

        if not (kerberos_enabled and auth_mechanism == 'GSSAPI'):
            transport = TTransport.TBufferedTransport(socket)
        else:
            try:
                import saslwrapper as sasl
            except ImportError:
                import sasl

            def sasl_factory():
                # Configure a SASL client for the metastore host and service.
                sasl_client = sasl.Client()
                sasl_client.setAttr("host", conn.host)
                sasl_client.setAttr("service", kerberos_service_name)
                sasl_client.init()
                return sasl_client

            from thrift_sasl import TSaslClientTransport
            transport = TSaslClientTransport(sasl_factory, "GSSAPI", socket)

        return ThriftHive.Client(TBinaryProtocol.TBinaryProtocol(transport))
Exemplo n.º 10
0
    return (outputFilename, dt)
    


if __name__ == "__main__":
    ###
    ### Main function gets the current IP list and upload it to the Hive database
    ###

    tmpFilename = downloadIPList()
    (csvFilename, dt) = convert2csv(tmpFilename)

    # upload data to the Hive server

    try:
        transport = TTransport.TBufferedTransport(
            TSocket.TSocket('localhost', 10000))
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        try:
            transport.open()

            client.execute("create table if not exists isc_daily_sources (source_ip string, target_port int, protocol int,  reports bigint, targets bigint, first_seen string, last_seen string, hostname string) partitioned by(dt string) row format delimited fields terminated by '\t'")
            client.execute("load data local inpath '{csvFile}' overwrite into table isc_daily_sources partition (dt='{date}')".format(csvFile=csvFilename,date=dt))
        finally:
            # Close the connection even when one of the statements fails.
            transport.close()

    except Thrift.TException as tx:
        sys.stderr.write('%s\n' % (tx.message))


Exemplo n.º 11
0
def findHeavyHitters(table, today=None, verbose=False):
    """
  Find heavy hitters in the given traffic (table) and store the results in the 'suspiciousheavyhitters' Hive table.

  :param table: traffic table name; passed through scrub() and expected to
      start with "netflow" or "sflow" (otherwise the process exits with -1).
  :param today: date to analyse (datetime.date); defaults to the current
      date at call time.
  :param verbose: when true, progress messages are written to stdout.

  For each endpoint type, endpoints whose packet or byte count today exceeds
  mean + 3 * std over all endpoints are compared against their own
  per-day statistics over the previous 15 days; those still above
  avg + 3 * std are written to a tab-separated file in outputDirectory,
  which is then loaded into Hive.
  """
    # Resolve the default here: a default of datetime.date.today() in the
    # signature is evaluated only once, when the function is defined.
    if today is None:
        today = datetime.date.today()

    histNbDay = 15
    date = "%d%02d%02d" % (today.year, today.month, today.day)
    dates = [
        "%d%02d%02d" % (x.year, x.month, x.day)
        for x in pd.date_range(today - datetime.timedelta(histNbDay),
                               today - datetime.timedelta(1))
    ]
    table = scrub(table)

    ## set some variables regarding the input data
    if table.startswith("netflow"):
        endpointTypes = [("dstip", "da"), ("srcip", "sa")]
        req0 = "select {endpoint}, sum(ipkt) nbpkt, sum(ibyt) nbbyte from {table} where dt=%s group by {endpoint}"
        req1 = "select {genericLabel}, avg(nbpkt) as avgpkt, stddev_samp(nbpkt) as stdpkt, avg(nbbyt) as avgbyt, stddev_samp(nbbyt) as stdbyt from(select {endpointType} as {genericLabel}, dt, sum(ipkt) as nbpkt, sum(ibyt) as nbbyt from {table} where {endpointType} IN ({suspiciousIP}) and dt IN ({dates}) group by {endpointType}, dt order by {endpointType}, dt) group by {genericLabel}"
    elif table.startswith("sflow"):
        endpointTypes = [("dstip", "dstip"), ("srcip", "srcip"),
                         ("dstip", "dstip6"), ("srcip", "srcip6")]
        req0 = "select {endpoint}, count(*) nbpkt, sum(ipsize) nbbyte from {table} where dt=%s and {endpoint}<>'' group by {endpoint}"
        req1 = "select {genericLabel}, avg(nbpkt) as avgpkt, stddev_samp(nbpkt) as stdpkt, avg(nbbyt) as avgbyt, stddev_samp(nbbyt) as stdbyt from(select {endpointType} as {genericLabel}, dt, count(*) as nbpkt, sum(ipsize) as nbbyt from {table} where {endpointType} IN ({suspiciousIP}) and dt IN ({dates}) group by {endpointType}, dt order by {endpointType}, dt) group by {genericLabel}"
    else:
        sys.stderr.write("Data type unknown!")
        sys.exit(-1)

    # Quoted, comma-separated list of the past days; identical for every
    # batch, so it is built once. Quoting each value explicitly replaces the
    # repr()-plus-character-deletion trick, which also stripped every
    # 'u', '[' and ']' from the values themselves.
    quotedDates = ", ".join("'%s'" % d for d in dates)

    outputPath = "%s/suspiciousheavyhitters_%s_%s.txt" % (outputDirectory,
                                                          table, date)
    # The with-block guarantees the file is closed if a query fails midway.
    with open(outputPath, "w") as outputFile:
        cursor = presto.connect('localhost').cursor()
        for genericLabel, endpointType in endpointTypes:
            if verbose:
                sys.stdout.write("Looking for %s heavy hitters... (%s,%s)\n" %
                                 (date, table, genericLabel))
            suspiciousIP = set()
            # get today's data
            formatedReq = req0.format(endpoint=endpointType, table=table)
            cursor.execute(formatedReq, [date])
            res = cursor.fetchall()

            if len(res) == 0:
                continue

            data = pd.DataFrame(res, columns=[genericLabel, "nbpkt", "nbbyt"])
            data.index = data.pop(genericLabel)

            # find today's heavy hitter
            for aggType in ["nbpkt", "nbbyt"]:
                threshold = data[aggType].mean() + 3 * data[aggType].std()
                suspiciousIP.update(
                    data.loc[data[aggType] > threshold].index.tolist())

            # check in past data if they had similar behavior
            if verbose:
                sys.stdout.write("Retrieve past data...\n")
            suspiciousIP = list(suspiciousIP)
            # Query the history in batches of 100 endpoints.
            for i in range(0, len(suspiciousIP), 100):
                susIP = suspiciousIP[i:i + 100]
                formatedReq1 = req1.format(
                    genericLabel=genericLabel,
                    endpointType=endpointType,
                    table=table,
                    suspiciousIP=", ".join("'%s'" % ip for ip in susIP),
                    dates=quotedDates)
                cursor.execute(formatedReq1)
                res = cursor.fetchall()

                if verbose:
                    sys.stdout.write("Register suspicious IPs...\n")
                for ip, avgpkt, stdpkt, avgbyt, stdbyt in res:
                    currData = data.loc[ip]
                    if genericLabel == "dstip":
                        dstip = ip
                        srcip = ""
                    else:
                        dstip = ""
                        srcip = ip
                    try:
                        if currData["nbpkt"] > avgpkt + 3 * stdpkt or currData[
                                "nbbyt"] > avgbyt + 3 * stdbyt:
                            outputFile.write(
                                "%s\t%s\t%s\t%s\t%s\t\n" %
                                (srcip, dstip, currData["nbpkt"],
                                 currData["nbbyt"],
                                 confidence(currData["nbpkt"], avgpkt, stdpkt,
                                            currData["nbbyt"], avgbyt,
                                            stdbyt)))
                    except TypeError:
                        # The arithmetic above raises TypeError when the
                        # statistics are not numeric; the endpoint is then
                        # recorded with "MED" confidence.
                        if verbose:
                            sys.stdout.write(
                                "!!Warning!! no past data for %s (avgpkt=%s, stdpkt=%s, avgbyt=%s, stdbyt=%s)\n"
                                % (ip, avgpkt, stdpkt, avgbyt, stdbyt))
                        outputFile.write("%s\t%s\t%s\t%s\t%s\t\n" %
                                         (srcip, dstip, currData["nbpkt"],
                                          currData["nbbyt"], "MED"))
                        continue

    # Store results in Hive
    try:
        transport = TTransport.TBufferedTransport(
            TSocket.TSocket('localhost', 10000))
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        try:
            transport.open()

            client.execute(
                "create table if not exists suspiciousheavyhitters (srcip string, dstip string, pkt bigint, byte bigint, confidence string) partitioned by(dt string, dataSrc string) row format delimited fields terminated by '\t'"
            )
            client.execute(
                "load data local inpath '{dir}/suspiciousheavyhitters_{table}_{date}.txt' overwrite into table suspiciousheavyhitters partition (dt='{date}', dataSrc='{table}')"
                .format(table=table, date=date, dir=outputDirectory))
        finally:
            # Close the connection even when one of the statements fails.
            transport.close()

    except Thrift.TException as tx:
        sys.stderr.write('%s\n' % (tx.message))
Exemplo n.º 12
0
Arquivo: tdw.py Projeto: zsmj513/tdw
 self.ip = iplist[num]
 while indexOfRetrytime + 1 < ipcounter and historyip.count(
         self.ip) > 0:
     num = random.randint(0, ipcounter - 1)
     self.ip = iplist[num]
 print "%d time retry execute connect to hive ip:%s" % (
     indexOfRetrytime + 1, self.ip)
 self.WriteLog("%d time retry execute connect to hive ip:%s" %
               (indexOfRetrytime + 1, self.ip))
 historyip.append(self.ip)
 self.transport = TSocket.TSocket(self.ip, self.port)
 #add by cherry end
 #self.transport = TSocket.TSocket(self.server, self.port)
 self.transport = TTransport.TBufferedTransport(self.transport)
 self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
 self.cli = ThriftHive.Client(self.protocol)
 self.transport.open()
 self.cli.audit(self.usrname, self.passwd, self.dbname)
 sname = self.cli.createSession("")
 self.session = sname[0]
 #print "create: %s" %(self.session)
 self.authid = sname[1]
 res = self.cli.execute("set plcretry=%d" %
                        (indexOfRetrytime + 1))
 self.WriteLog("plcretry: %d" % (indexOfRetrytime + 1))
 self.WriteLog("new session: " + self.session)
 self.WriteLog("new session server: " + self.server)
 self.WriteLog("new session ip: " + self.ip)
 self.WriteLog(
     time.strftime('%Y-%m-%d %H:%M:%S',
                   time.localtime(time.time())))
Exemplo n.º 13
0
def findNtpAmplifiers(table, today=None, verbose=False):
    """
  Find NTP amplifiers in the given traffic (table) and store the results in the 'ntpamplifiers' Hive table.

  :param table: traffic table name; passed through scrub() and expected to
      start with "netflow" or "sflow" (otherwise the process exits with -1).
  :param today: date to analyse (datetime.date); defaults to the current
      date at call time.
  :param verbose: when true, a progress message is written to stdout.

  Returns without writing anything when the query yields no rows.
  """
    # Resolve the default here: a default of datetime.date.today() in the
    # signature is evaluated only once, when the function is defined.
    if today is None:
        today = datetime.date.today()

    date = "%d%02d%02d" % (today.year, today.month, today.day)
    table = scrub(table)

    ## set some variables regarding the input data
    if table.startswith("netflow"):
        req0 = "select sa, sum(ibyt), sum(ipkt) from %s where sp=123 and dt='%s' and pr='UDP' and ibyt/ipkt=468 group by sa" % (
            table, date)
    elif table.startswith("sflow"):
        req0 = "select srcip, sum(ipsize), count(*) from %s where udpsrcport=123 and ipprotocol=17 and ipsize=468 and dt='%s' group by srcip" % (
            table, date)
    else:
        sys.stderr.write("Data type unknown!")
        sys.exit(-1)

    cursor = presto.connect('localhost').cursor()
    if verbose:
        sys.stdout.write("Looking for %s NTP amplifiers... (%s)\n" %
                         (date, table))

    # get today's data
    cursor.execute(req0)
    res = cursor.fetchall()

    if len(res) == 0:
        return

    data = pd.DataFrame(res, columns=["srcip", "nbbyt", "nbpkt"])

    # add the confidence score:
    data["confidence"] = "LOW"
    data.loc[data.nbpkt >= 100, "confidence"] = "MED"
    data.loc[data.nbpkt >= 1000, "confidence"] = "HIGH"

    # Select the output columns by indexing rather than through to_csv's
    # ``cols`` keyword, which later pandas releases no longer accept. The
    # with-block closes the file even if writing fails.
    outputPath = "%s/ntpamplifiers_%s_%s.txt" % (outputDirectory, table, date)
    with open(outputPath, "w") as outputFile:
        data[["srcip", "nbbyt", "nbpkt", "confidence"]].to_csv(
            outputFile, sep="\t", header=False, index=False)

    # Store results in Hive
    try:
        transport = TTransport.TBufferedTransport(
            TSocket.TSocket('localhost', 10000))
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = ThriftHive.Client(protocol)
        try:
            transport.open()

            client.execute(
                "create table if not exists ntpamplifiers (srcip string, byte bigint,  pkt bigint, confidence string) partitioned by(dt string, dataSrc string) row format delimited fields terminated by '\t'"
            )
            client.execute(
                "load data local inpath '{dir}/ntpamplifiers_{table}_{date}.txt' overwrite into table ntpamplifiers partition (dt='{date}', dataSrc='{table}')"
                .format(table=table, date=date, dir=outputDirectory))
        finally:
            # Close the connection even when one of the statements fails.
            transport.close()

    except Thrift.TException as tx:
        sys.stderr.write('%s\n' % (tx.message))