Example #1
    def ship_udf(ic, function, hdfs_path=None, udf_name=None, database=None,
                 overwrite=False):
        # extract some information from the function
        if udf_name is None:
            udf_name = function.name
        symbol = function.llvm_func.name
        ir = function.llvm_module.to_bitcode()
        return_type = udf_to_impala_type[function.signature.return_type.name]
        arg_types = [udf_to_impala_type[arg.name]
                     for arg in function.signature.args[1:]]

        # ship the IR to the cluster
        hdfs_client = PyWebHdfsClient(host=ic._nn_host, port=ic._webhdfs_port,
                                      user_name=ic._hdfs_user)
        if hdfs_path is None:
            hdfs_path = os.path.join(ic._temp_dir, udf_name + '.ll')
        if not hdfs_path.endswith('.ll'):
            raise ValueError("The HDFS file name must end with .ll")
        hdfs_client.create_file(hdfs_path.lstrip('/'), ir, overwrite=overwrite)

        # register the function in Impala
        if database is None:
            database = ic._temp_db
        impala_name = '%s.%s(%s)' % (database, udf_name, ', '.join(arg_types))
        if overwrite:
            ic._cursor.execute("DROP FUNCTION IF EXISTS %s" % impala_name)
        register_query = ("CREATE FUNCTION %s RETURNS %s "
                          "LOCATION '%s' SYMBOL='%s'") % (impala_name,
                                                          return_type,
                                                          hdfs_path, symbol)
        ic._cursor.execute(register_query)
Example #2
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/usr/hdp/current/hbase-client/lib/hbase-client.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-common.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-protocol.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-server.jar',
                     '/usr/hdp/current/hbase-client/lib/htrace-core-3.1.0-incubating.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-hadoop-compat.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-it.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-prefix-tree.jar',
                     '/usr/hdp/current/hbase-client/lib/zookeeper.jar',
                     '/usr/hdp/current/pig-client/piggybank.jar',
                     '/usr/hdp/current/spark-client/lib/spark-examples.jar']

    # Setup a connection with hdfs using namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Create a new file on HDFS and write contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print 'Copying source file: %s to HDFS path %s' % (path, platform_file)
        with open(path, 'rb') as file_data:
            try:
                hdfs_client.create_file(platform_file, file_data, overwrite=True)
            except PyWebHdfsException:
                print 'retrying HDFS copy command for %s' % platform_file
                time.sleep(5)
                file_data.seek(0)  # rewind in case the failed attempt consumed part of the stream
                hdfs_client.create_file(platform_file, file_data, overwrite=True)
Example #3
def solarLog_call(epoch_time):
    conn = http.client.HTTPConnection("")
    r = requests.get("http://winsun.solarlog-web.ch/api?cid=" + pfadheimBaarCID + "&locale=de_ch&username=277555406&password=5a03cdf0a3ff42de09bc85361d8a2f0f&function=dashboard&format=jsonh&solarlog=9112&tiles=Yield|true,Grafic|true,Env|true,Weather|true&ctime=" + epoch_time)
    logging.info("Response: " + str(r.status_code) + " " + r.reason)

    data = r.json()  # This will return entire content.
    data['timestamp'] = epoch_time
    # Remove keys with complex JSON structures
    del data['cur_production_per_wrid']
    del data['invEnergyType']
    #del data['decimalseperator']
    logging.debug(data)

    #write data to .json
    with open('/home/claude/repo/bda-solar/data/data_timestamp/pfadibaar_solarlog_' + epoch_time + '.json', 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)

    # write the same data as .csv since it is easier to handle with HDFS
    with open('/home/claude/repo/bda-solar/data/data_timestamp/pfadibaar_solarlog_' + epoch_time + '.csv', 'w') as f:  # Just use 'w' mode in 3.x
        w = csv.DictWriter(f, data.keys(), dialect=csv.excel_tab)
        w.writeheader()
        w.writerow(data)

    # write the same data as .csv to HDFS since it is easier to handle
    hdfs = PyWebHdfsClient(host='193.246.208.147', port='50079', user_name='hdfs')
    #hdfs_path = 'user/hdfs/from_python'
    hdfs.create_file('user/hdfs/pfadibaar_solarlog.csv', '0100')
    #with open('pfadibaar_solarlog_' + epoch_time + '.csv') as file_data:
    #    hdfs.create_file(hdfs_path, data=file_data)

    conn.close()
Example #4
def upload_file():
    """
    Upload file
    ---
    tags:
        - Files
    consumes: "multipart/form-data"
    parameters:
        -   name: file
            in: formData
            required: true
            paramType: body
            dataType: file
            type: file
    responses:
        200:
            description: Return a successful message
        401:
            description: Unauthorized
        400:
            description: Bad Request
        500:
            description: Server Internal error
    """
    # Hard-coded config information. You should improve it.
    hdfs = PyWebHdfsClient(host='webhdfs', port='50070', user_name='thanhson1085')
    if request.method == 'POST':
        file = request.files['file']
        if file and allowed_file(file.filename):
            filename = secure_filename(str(time.time()) + file.filename)
            my_file = 'tmp/thanhson1085/data/' + filename
            hdfs.create_file(my_file, file)
            return jsonify({'success':'true'})

    return jsonify({'success':'false'})
Example #5
def update_raw_stage(output, delivery_tag):

    #context = zmq.Context()

    #confirm = context.socket(zmq.PUSH)
    #confirm.connect(confirm_host)

    hdfs = PyWebHdfsClient(host=webhdfs_host, port=webhdfs_port, user_name=webhdfs_user)
    impala_conn = connect(host=impala_host, port=int(impala_port))
    cur = impala_conn.cursor()

    start_time = time.time()

    for k, v in output.iteritems():

        if (time.time() - start_time)/60 > sink_minutes:
            sink_logger.warning('ETL process running longer than sink timeout: {0} minutes'.format((time.time() - start_time)/60))
        try:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))

        except hdfs_err.PyWebHdfsException:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.create_file(file_name, '')
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))

    #confirm.send(delivery_tag)
    sink_logger.info('ETL process finished in {0} minutes'.format((time.time() - start_time)/60))
    sink_logger.info('ETL process finished with {0} delivery_tag'.format(delivery_tag))
Example #6
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar',
                     '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar',
                     '/opt/cloudera/parcels/CDH/lib/spark/lib/spark-examples.jar']

    # Setup a connection with hdfs using namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Create a new file on HDFS and write contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        logging.info('Copying source file: %s to HDFS path %s', path, platform_file)
        with open(path, 'rb') as file_data:
            hdfs_client.create_file(platform_file, file_data, overwrite=True)
Example #7
    def submit(self, bund, files=[]):
        hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')

        for f in files:
            hdfs.create_file("{}/{}".format(bund.path, f.name), f.read())  

        doc, tag, text = Doc().tagtext()
        with tag("configuration"):
            with tag("property"):
                with tag("name"):
                    text("user.name")
                with tag("value"):
                    text("oozie")

            with tag("property"):
                with tag("name"):
                    text("oozie.bundle.application.path")
                with tag("value"):
                    text("/"+bund.path + "/" + bund.name)

        configuration = doc.getvalue()
        response = post("{0}/oozie/v1/jobs".format(self.url), data=configuration, headers={'Content-Type': 'application/xml'})

        if response.status_code > 399:
            print response.headers["oozie-error-message"]
        print response.status_code
        print response.content
Example #8
def saveToStore(path,meta):
    con=happybase.Connection(MasterHbase)
    con.open()
    metaTable= con.table('MetaTable')
    if meta['size'] < largeSize:
        # save to Hbase
        encTable = con.table('EncTable')
        with open(path,'rb') as f:
            encTable.put(meta['rowkey'],{'enc:data': f.read()})
        metaTable.put(str(meta['rowkey']),{
                'pp:name': str(meta['filename']),
                'pp:checksum': str(meta['checksum']),
                'pp:size': str(meta['size']),
                'pp:often': str(meta['often']),
                'pp:des': str(meta['description'])
                }
              )
        app.logger.debug('%s is saved to Hbase',meta['rowkey'])
    else:
        # save to HDFS
        hdfs = PyWebHdfsClient(host=Master,port='50070', timeout=None,user_name='hduser')
        with open(path, 'rb') as f:
            hdfs.create_file(HDFSMainPath+meta['rowkey'], f)
        metaTable.put(str(meta['rowkey']),{
                'pp:name': str(meta['filename']),
                'pp:checksum': str(meta['checksum']),
                'pp:size': str(meta['size']),
                'pp:HDFSpath': str(HDFSMainPath + meta['rowkey']),
                'pp:often': str(meta['often']),
                'pp:des': str(meta['description'])
                }
              )
        app.logger.debug('%s is saved to HDFS',meta['rowkey'])
    con.close()
Example #9
    def ship_udf(ic, function, hdfs_path=None, udf_name=None, database=None,
            overwrite=False):
        # extract some information from the function
        if udf_name is None:
            udf_name = function.name
        symbol = function.llvm_func.name
        ir = function.llvm_module.to_bitcode()
        return_type = udf_to_impala_type[function.signature.return_type.name]
        arg_types = [udf_to_impala_type[arg.name]
                        for arg in function.signature.args[1:]]

        # ship the IR to the cluster
        hdfs_client = PyWebHdfsClient(host=ic._nn_host, port=ic._webhdfs_port,
                user_name=ic._hdfs_user)
        if hdfs_path is None:
            hdfs_path = os.path.join(ic._temp_dir, udf_name + '.ll')
        if not hdfs_path.endswith('.ll'):
            raise ValueError("The HDFS file name must end with .ll")
        hdfs_client.create_file(hdfs_path.lstrip('/'), ir, overwrite=overwrite)

        # register the function in Impala
        if database is None:
            database = ic._temp_db
        impala_name = '%s.%s(%s)' % (database, udf_name, ', '.join(arg_types))
        if overwrite:
            ic._cursor.execute("DROP FUNCTION IF EXISTS %s" % impala_name)
        register_query = "CREATE FUNCTION %s RETURNS %s LOCATION '%s' SYMBOL='%s'" % (impala_name,
                return_type, hdfs_path, symbol)
        ic._cursor.execute(register_query)
Example #10
def to_hdfs(file_path, hdfs_path):
    hdfs = PyWebHdfsClient(host='hdfs-v1',
                           port='50070',
                           user_name='hdfs',
                           timeout=100)
    with open(file_path, 'rb') as f:
        hdfs.create_file(hdfs_path, f, overwrite=True)
Example #11
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = [
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar',
        '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar'
    ]

    # Setup a connection with hdfs using namenode.
    hdfs_client = PyWebHdfsClient(host=name_node,
                                  port=webhdfs_port,
                                  user_name=webhdfs_user,
                                  timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Create a new file on HDFS and write contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        logging.info('Copying source file: %s to HDFS path %s', path,
                     platform_file)
        with open(path, 'rb') as file_data:
            hdfs_client.create_file(platform_file, file_data, overwrite=True)
Example #12
def save_extracted_subgraph(elements, args: application_args):
    pair, subgraph, _ = elements
    path = args.get_folder_results_path()
    hdfs = PyWebHdfsClient(host=args.hdfs_host, port=args.hdfs_port)
    file = os.path.join(path, f"graph_{str(pair[0])}_{str(pair[1])}")
    pickled = pkl.dumps(subgraph)
    hdfs.create_file(file, pickled, overwrite=True)
Example #13
 def test_create_throws_exception_for_not_created(self, mock_put):
     webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                               user_name=self.user_name)
     self.init_response.status_code = http_client.TEMPORARY_REDIRECT
     self.response.status_code = http_client.BAD_REQUEST
     mock_put.side_effect = [self.init_response, self.response]
     with self.assertRaises(errors.PyWebHdfsException):
         webhdfs.create_file(self.path, self.file_data)
Example #14
    def test_webhdfs_csv(self):
        from pywebhdfs.webhdfs import PyWebHdfsClient
        dfs = PyWebHdfsClient(host='localhost',port='9870', user_name='hadoop')
        dfs.make_dir("/temp")

        with open("tests/data/data.csv") as input_file:
            dfs.create_file("/temp/data.csv", file_data=input_file, overwrite=True)

        dfs.delete_file_dir("/temp", recursive=True)
Example #15
 def put_file_contents(self,
                       hdfs_path,
                       file_contents,
                       user_name='trifacta',
                       httpfs_port='14000'):
     hdfs = PyWebHdfsClient(host=urlparse(self.trifacta_base_url).netloc,
                            port=httpfs_port,
                            user_name=user_name)
     hdfs.create_file(hdfs_path, file_contents, overwrite=True)
     return True
Example #16
class WhenTestingCreateOperation(unittest.TestCase):
    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host,
                                       port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.location = 'redirect_uri'
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.init_response = MagicMock()
        self.init_response.headers = {'location': self.location}
        self.response = MagicMock()
        self.expected_headers = {'content-type': 'application/octet-stream'}

    def test_create_throws_exception_for_no_redirect(self):

        self.init_response.status_code = http_client.BAD_REQUEST
        self.response.status_code = http_client.CREATED
        self.requests.put.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.create_file(self.path, self.file_data)

    def test_create_throws_exception_for_not_created(self):

        self.init_response.status_code = http_client.TEMPORARY_REDIRECT
        self.response.status_code = http_client.BAD_REQUEST
        self.requests.put.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.create_file(self.path, self.file_data)

    def test_create_returns_file_location(self):

        self.init_response.status_code = http_client.TEMPORARY_REDIRECT
        self.response.status_code = http_client.CREATED
        self.put_method = MagicMock(
            side_effect=[self.init_response, self.response])
        self.requests.put = self.put_method
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.create_file(self.path, self.file_data)
        self.assertTrue(result)
        self.put_method.assert_called_with(self.location,
                                           headers=self.expected_headers,
                                           data=self.file_data)
Example #17
class WhenTestingCreateOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.location = 'redirect_uri'
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.init_response = MagicMock()
        self.init_response.headers = {'location': self.location}
        self.response = MagicMock()
        self.expected_headers = {'content-type': 'application/octet-stream'}

    def test_create_throws_exception_for_no_redirect(self):

        self.init_response.status_code = httplib.BAD_REQUEST
        self.response.status_code = httplib.CREATED
        self.requests.put.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.create_file(self.path, self.file_data)

    def test_create_throws_exception_for_not_created(self):

        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.put.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.create_file(self.path, self.file_data)

    def test_create_returns_file_location(self):

        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.CREATED
        self.put_method = MagicMock(
            side_effect=[self.init_response, self.response])
        self.requests.put = self.put_method
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.create_file(self.path, self.file_data)
        self.assertTrue(result)
        self.put_method.assert_called_with(
            self.location, headers=self.expected_headers, data=self.file_data)
Example #18
def create_data_from_station_data(first, second):
    """this function creates the data analyzing the two stations in comparison"""
    global hdfs; #global hdfs object
    global hbase; #global hbase object
    
    if(hdfs is None): 
        from pywebhdfs.webhdfs import PyWebHdfsClient; 
        hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu',port='50070', user_name='uacharya'); 
   
    if(hbase is None):
        import happybase;
        hbase = happybase.ConnectionPool(size=1,host='cshadoop.boisestate.edu');
 
    date_for_comparision = first["Date"].strip();

   # creating directory for each date
    try:
        hdfs.get_file_dir_status('user/uacharya/simulation/'+date_for_comparision);
    except Exception:
        # directory to hold dataset in csv file for reach node in wall display starting from 1 to 9    
        for index in range(1, 10):
            content = 'Date,ID,Source,Destination,S_Lat,S_Lon,D_Lat,D_Lon,Wind_Lat,Wind_Lon,Wind_Velocity\n';
            try:
                hdfs.create_file('user/uacharya/simulation/'+date_for_comparision+'/node'+str(index)+'/output.csv',content,replication=1);
            except Exception:
                continue;
   
    
    dataset = {'node_1':[],'node_2':[],'node_3':[],'node_4':[],'node_5':[],'node_6':[],'node_7':[],'node_8':[],'node_9':[]};
   
    for data in broadcast_variable.value:
        compare_data_between(date_for_comparision, first, data,dataset);

#    for key in dataset:
#        if(len(dataset[key])!=0):
#            content = "\n".join(dataset[key]);
#            content +="\n";
#            while(True):
#                try:
#                    hdfs.append_file('user/uacharya/simulation/'+date+'/'+key+'/output.csv',content,buffersize=4096);
#                    break;
#                except Exception:
#                    time.sleep(0.2);
#                    continue;

    
    dataset.clear(); #clearing the dictionary
    # append over here after all the global variable has been made        
    return second;
Example #19
def sharedlib_install(name_node, webhdfs_port, authentic_user, platform_dir,
                      lib_path_list):
    # Setup a connection with hdfs using namenode.
    hdfs = PyWebHdfsClient(host=name_node,
                           port=webhdfs_port,
                           user_name=authentic_user,
                           timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs.make_dir(platform_dir)
    # Create a new file on HDFS and write contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print >> sys.stdout, 'Copying source file: %s to HDFS path %s' %\
                             (path, platform_file)
        with open(path, 'rb') as file_data:
            hdfs.create_file(platform_file, file_data, overwrite=True)
Example #20
class HdfsHandler:
    def __init__(self, hadoopHost, hadoopPort='50070', user='******'):
        # self.hdfs = PyWebHdfsClient(host='52.14.121.163', port='50070', user_name='hadoop')
        self.hdfs = PyWebHdfsClient(host=hadoopHost,
                                    port=hadoopPort,
                                    user_name=user)
        self.s3_client = boto3.client('s3')

    def copyToHDFS(self, src_path, hdfs_path):
        if hdfs_path.startswith("hdfs"):
            temp_path = hdfs_path.split("8020")
            self.new_hdfs_path = temp_path[1] + '/lib'
            print "New Path: %s" % self.new_hdfs_path
        # create a new client instance
        # print "New Path: %s" % self.new_hdfs_path[1]
        jar_name = os.path.basename(src_path)
        print src_path
        fileContent = open(src_path, 'rb').read()

        # copies file to local for testing purpose
        # with open("E:/temp/java-0.0.2.jar", "wb") as jarfile:
        #     jarfile.write(fileContent)

        # create a new file on hdfs
        print('making new file at: {0}\n'.format(jar_name))
        result = self.hdfs.create_file(self.new_hdfs_path + "/" + jar_name,
                                       fileContent,
                                       overwrite=True)
        print "HDFS Copy Result: %s" % result
        return result

    def list_hdfs_dir(self, hdfs_path):
        print self.hdfs.list_dir(hdfs_path)
Example #21
def writeFileToHdfs(hostName, userName, writePath, dataframe, fileName):
    auth = HTTPKerberosAuth()
    hdfsClient = PyWebHdfsClient(host=hostName,
                                 port='50070',
                                 user_name=userName,
                                 request_extra_opts={'auth': auth})

    outputPath = writePath
    stringDF = dataframe.to_csv(columns=[
        "env_name", "db_name", "tab_name", "col_name", "data_type", "comment"
    ],
                                index=False)

    hdfsClient.create_file(path=outputPath + fileName + ".csv",
                           file_data=stringDF,
                           overwrite=True)
Example #22
File: bdf.py  Project: fkaufer/impyla
def from_pandas(ic, df, table=None, path=None, method='in_query',
        file_format='TEXTFILE', field_terminator='\t', line_terminator='\n',
        escape_char='\\',
        hdfs_host=None, webhdfs_port=50070, hdfs_user=None, overwrite=False):
    """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala
    
    path is the dir, not the filename
    """
    # TODO: this is not atomic
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (ic._temp_db, temp_table)
    if path is None:
        path = os.path.join(ic._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    columns = list(df.columns)
    types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes]
    schema = zip(columns, types)
    create_stmt = _create_table(table_name, schema, path=path,
            file_format=file_format, field_terminator=field_terminator,
            line_terminator=line_terminator, escape_char=escape_char)
    ic._cursor.execute(create_stmt)
    if method == 'in_query':
        query = "INSERT INTO %s VALUES " % table_name.to_sql()
        query += ', '.join(['(%s)' % ', '.join(map(_py_to_sql_string, row)) for row in df.values])
        ic._cursor.execute(query)
    elif method == 'webhdfs':
        if file_format != 'TEXTFILE':
            raise ValueError("only TEXTFILE format supported for webhdfs")
        if path is None:
            raise ValueError("must supply a path for EXTERNAL table for webhdfs")
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs_client = PyWebHdfsClient(host=hdfs_host, port=webhdfs_port,
                user_name=hdfs_user)
        raw_data = StringIO()
        df.to_csv(raw_data, sep=field_terminator,
                line_terminator=line_terminator, quoting=csv.QUOTE_NONE, escapechar=escape_char, header=False, index=False)
        hdfs_client.create_file(os.path.join(path, 'data.txt').lstrip('/'), raw_data.getvalue(), overwrite=overwrite)
        raw_data.close()
    else:
        raise ValueError("method must be 'in_query' or 'webhdfs'; got %s" % method)
    return from_sql_table(ic, table_name.to_sql())
Example #23
 def test_create_returns_file_location(self, mock_put):
     webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                               user_name=self.user_name)
     self.init_response.status_code = http_client.TEMPORARY_REDIRECT
     self.response.status_code = http_client.CREATED
     mock_put.side_effect = [self.init_response, self.response]
     result = webhdfs.create_file(self.path, self.file_data)
     self.assertTrue(result)
     mock_put.assert_called_with(
         self.location, headers=self.expected_headers, data=self.file_data)
Example #24
def save_prediction_results(results, time, args: application_args):
    # get hdfs path
    path = args.get_folder_results_path()
    # save data on pod
    chained = list(itertools.chain.from_iterable(results))
    file = os.path.join(workdir,
                        'prediction_' + args.get_folder_results_name())
    np.savetxt(file, chained, fmt=['%d', '%d', '%1.2f'])
    # access it to read linewise
    predictions = ''
    with open(file, 'r') as f:
        for line in f:
            predictions += line.strip() + '\n'
    os.remove(file)
    # save results on hdfs
    hdfs = PyWebHdfsClient(host=args.hdfs_host, port=args.hdfs_port)
    file = os.path.join(path, "predictions")
    hdfs.create_file(file, predictions)
    file = os.path.join(path, "resulting_prediction_time")
    hdfs.create_file(file, str(time))
Example #25
def write_data_to_hdfs(username, records):
    global hdfs_namenodes
    to_return = {}
    file_path = "/jobs_to_do/" + username + ".txt"
    result_path = "/jobs_done/" + username
    logger.debug("Writing file " + file_path + " to HDFS")
    try:
        logger.debug("Trying to connect to " + hdfs_namenodes[0] + " namenode")
        hdfs_client = PyWebHdfsClient(host=hdfs_namenodes[0],
                                      port='50070',
                                      user_name='xnet',
                                      timeout=100)
        logger.debug("Trying to erase " + file_path)
        logger.debug("Trying to erase " + result_path)
        hdfs_client.delete_file_dir(file_path, recursive=True)
        hdfs_client.delete_file_dir(result_path, recursive=True)
        hdfs_client.create_file(file_path, records.encode("utf-8"))
    except (ConnectionError, PyWebHdfsException) as ce:
        to_return["details_1"] = str(ce)
        logger.debug("Failed connecting to" + hdfs_namenodes[0] + " namenode")
        try:
            logger.debug("Trying to connect to " + hdfs_namenodes[1] +
                         " namenode")
            hdfs_client = PyWebHdfsClient(host=hdfs_namenodes[1],
                                          port='50070',
                                          user_name='xnet',
                                          timeout=100)
            logger.debug("Trying to erase " + file_path)
            logger.debug("Trying to erase " + result_path)
            hdfs_client.delete_file_dir(file_path, recursive=True)
            hdfs_client.delete_file_dir(result_path, recursive=True)
            hdfs_client.create_file(file_path, records.encode("utf-8"))
        except (ConnectionError, PyWebHdfsException) as ce:
            to_return[
                "error"] = "There was a problem while trying to connect to HDFS namenode."
            to_return["details_2"] = str(ce)
            logger.debug(str(to_return))
            return False, to_return

    return True, None
Example #26
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = [
        '/usr/hdp/current/hbase-client/lib/hbase-client.jar',
        '/usr/hdp/current/hbase-client/lib/hbase-common.jar',
        '/usr/hdp/current/hbase-client/lib/hbase-protocol.jar',
        '/usr/hdp/current/hbase-client/lib/hbase-server.jar',
        '/usr/hdp/current/hbase-client/lib/htrace-core-3.1.0-incubating.jar',
        '/usr/hdp/current/hbase-client/lib/hbase-hadoop-compat.jar',
        '/usr/hdp/current/hbase-client/lib/hbase-it.jar',
        '/usr/hdp/current/hbase-client/lib/hbase-prefix-tree.jar',
        '/usr/hdp/current/hbase-client/lib/zookeeper.jar',
        '/usr/hdp/current/pig-client/piggybank.jar',
        '/usr/hdp/current/spark-client/lib/spark-examples.jar'
    ]

    # Setup a connection with hdfs using namenode.
    hdfs_client = PyWebHdfsClient(host=name_node,
                                  port=webhdfs_port,
                                  user_name=webhdfs_user,
                                  timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Create a new file on HDFS and write contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print 'Copying source file: %s to HDFS path %s' % (path, platform_file)
        with open(path, 'rb') as file_data:
            try:
                hdfs_client.create_file(platform_file,
                                        file_data,
                                        overwrite=True)
            except PyWebHdfsException:
                print 'retrying HDFS copy command for %s' % platform_file
                time.sleep(5)
                file_data.seek(0)  # rewind in case the failed attempt consumed part of the stream
                hdfs_client.create_file(platform_file,
                                        file_data,
                                        overwrite=True)
Example #27
 def save(self, workflow_name="workflow.xml"):
     hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')
     workflow_path = "{0}/{1}/workflow.xml".format(self.path, self.name)
     hdfs.make_dir(self.path)
     hdfs.create_file(workflow_path, self.as_xml())
Example #28
source = requests.get("https://resources.lendingclub.com/LoanStats3d.csv.zip",
                      verify=False)
stringio = StringIO.StringIO(source.content)
unzipped = zipfile.ZipFile(stringio)
import pandas as pd
from pywebhdfs.webhdfs import PyWebHdfsClient

subselection_csv = pd.read_csv(unzipped.open('LoanStats3d.csv'),
                               skiprows=1,
                               skipfooter=2,
                               engine='python')
stored_csv = subselection_csv.to_csv('./stored_csv.csv')
hdfs = PyWebHdfsClient(user_name="hdfs", port=50070, host="sandbox")
hdfs.make_dir('chapter5')
with open('./stored_csv.csv') as file_data:
    hdfs.create_file('chapter5/LoanStats3d.csv', file_data, overwrite=True)
print(hdfs.get_file_dir_status('chapter5/LoanStats3d.csv'))

from pyspark.sql import HiveContext

# sc = SparkContext()
sqlContext = HiveContext(sc)
data = sc.textFile("/chapter5/LoanStats3d.csv")
parts = data.map(lambda r: r.split(','))
firstline = parts.first()
datalines = parts.filter(lambda x: x != firstline)


def cleans(row):
    row[7] = str(float(row[7][:-1]) / 100)
    return [s.encode('utf8').replace(r"_", " ").lower() for s in row]
Example #29
    HTMLFILE=str(line[1])+'.htm'
    TEXTFILE=str(line[1])+'.txt'
    HADOOP_HTMLFILE='user/root/crawls/'+str(ANET)+'/'+str(BNET)+'/'+HTMLFILE
    HADOOP_TEXTFILE='user/root/texts/'+str(ANET)+'/'+str(BNET)+'/'+TEXTFILE
    print "-======= site: "+str(url)+" =======-"
    try:
      soup = BeautifulSoup(html)
    except:
      print " soup exception"
      continue
    HFP=open(HTMLFILE,'w')
    HFP.write(soup.encode('utf-8'))
    HFP.close()
    with open(HTMLFILE) as hfp:
      try:
        client.create_file(HADOOP_HTMLFILE,hfp)
      except:
        client.delete_file_dir(HADOOP_HTMLFILE)
        client.create_file(HADOOP_HTMLFILE,hfp)

    TFP=open(TEXTFILE,'w')
    WRITEOUT=unicode(soup.get_text())
    WORDLIST=re.sub(r'[^a-zA-Z0-9 ]',r' ',WRITEOUT)
    WORDLIST=WORDLIST.strip().split()
    TFP.write(WRITEOUT.encode('utf-8'))
    TFP.close()
    PAGETITLE=''
    try:
      PAGETITLE=soup.title.string
    except:
      pass
Example #30
 def save(self):
   hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')
   coordinator_path = "{0}/{1}/coordinator.xml".format(self.path, self.name)
   hdfs.make_dir(self.path)
   hdfs.create_file(coordinator_path, self.as_xml())
Example #31
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(
            host=host, port=port, user_name=user, timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None):

        if exclude is None:
            exclude = []

        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path)

        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize(
                        '%s/%s/%s' %
                        (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path)

            for fname in fnames:
                if fname not in exclude:
                    data = file(
                        canonicalize(
                            '%s/%s/%s' %
                            (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize(
                        '%s/%s/%s' %
                        (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path, data, overwrite=True)
                    data.close()

    def make_dir(self, path):

        logging.debug('make_dir: %s', path)

        self._hdfs.make_dir(canonicalize(path))

    def create_file(self, data, remote_file_path):

        logging.debug('create_file: %s', remote_file_path)

        sio = StringIO.StringIO(data)

        self._hdfs.create_file(
            canonicalize(remote_file_path),
            sio,
            overwrite=True)

    def append_file(self, data, remote_file_path):

        logging.debug('append to: %s', remote_file_path)

        self._hdfs.append_file(canonicalize(remote_file_path), data)


    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10*1024*1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):

        data = self._hdfs.read_file(canonicalize(remote_file_path))

        return data

    def remove(self, path, recursive=False):

        logging.debug('remove: %s', path)

        self._hdfs.delete_file_dir(canonicalize(path), recursive)
Example #32
class HdfsApi:
    def __init__(self,
                 request_timeout=10,
                 logger=logging,
                 active_nn_host='localhost',
                 kerberos=False):
        self.timeout = request_timeout
        self.hdfs_schema = os.environ.get('HDFS_NAMENODE_SCHEMA', 'http')
        self.hdfs_host = active_nn_host
        self.hdfs_port = os.environ.get('HDFS_NAMENODE_PORT', 50070)
        if kerberos:
            extra_opts = {
                'auth':
                HTTPKerberosAuth(mutual_authentication=OPTIONAL,
                                 sanitize_mutual_error_response=False,
                                 force_preemptive=True)
            }
        else:
            extra_opts = {}
        self.webhdfs = PyWebHdfsClient(host=self.hdfs_host,
                                       port=self.hdfs_port,
                                       request_extra_opts=extra_opts)
        self.logger = logger

    def request_namenode(self, path, method='GET', headers=None, **kwargs):
        self.logger.info("Calling HDFS API ({0})".format(path))
        if headers is None:
            headers = dict()

        if path.startswith('http'):
            hdfs_url = path
        else:
            hdfs_url = '{0}://{1}:{2}/{3}'.format(self.hdfs_schema,
                                                  self.hdfs_host,
                                                  self.hdfs_port, path)
        self.logger.debug(hdfs_url)
        r = requests.request(method,
                             hdfs_url,
                             headers=headers,
                             timeout=self.timeout,
                             verify=False,
                             auth=HTTPKerberosAuth(),
                             **kwargs)
        return self._check_response_status(r)

    def request_webhdfs_status(self, path):
        return self.webhdfs.get_file_dir_status(path)

    def _check_response_status(self, response):
        self.logger.debug(response.text)
        if response.status_code >= 400:
            self.logger.error(
                "HdfsResponse returned with error status [{0}], response was: {1}"
                .format(response.status_code, response.text))
            raise HdfsRequestError(
                "HdfsResponse returned with error status [{0}]".format(
                    response.status_code))
        return response

    def get_block_info_for_file(self, file_path):
        path = "fsck"
        params = {'files': 0, 'racks': 1, 'blocks': 0, 'path': file_path}

        response = self.request_namenode(path, params=params)
        return response

    @staticmethod
    def get_first_block_info(filename, block_info):
        regex = r"^{0}.*\n(.*)\n".format(filename)
        info_of_first_block = re.findall(regex, block_info, re.MULTILINE)
        if len(info_of_first_block) < 1:
            raise HdfsRequestError(
                "No block information found for file {0} in {1}".format(
                    filename, block_info))
        return info_of_first_block[0]

    @staticmethod
    def get_location_of_first_block(block_info):
        ip_regex = r"(?<!\-)(\d{2,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?=:)"
        block_locations = re.findall(ip_regex, block_info)
        if len(block_locations) < 1:
            raise HdfsRequestError(
                "No block location information found in {0}".format(
                    block_info))
        return block_locations[0]

    @staticmethod
    def get_host_by_ip(ip):
        host_info = socket.gethostbyaddr(ip)
        if len(host_info) < 1:
            raise HdfsRequestError(
                "Unable to get hostname form ip {0}".format(ip))
        return host_info[0]

    @staticmethod
    def calculate_md5(file, block_size=65536):
        hash_builder = hashlib.md5()
        for block in iter(lambda: file.read(block_size), b""):
            hash_builder.update(block)
        md5 = hash_builder.hexdigest()
        file.seek(0)
        return md5

    @staticmethod
    def create_temp_file():
        return tempfile.NamedTemporaryFile(suffix='.temporary',
                                           prefix='hdfs-smoketest-api-')

    def create_temp_file_of_size(self, temp_file_size):
        tmp = self.create_temp_file()
        tmp.seek(temp_file_size * 1024 * 1024)
        tmp.write(b'1')
        tmp.seek(0)

        return tmp

    def copy_to_hdfs(self, remote_path, tmpfile):
        self.webhdfs.create_file(remote_path,
                                 file_data=tmpfile,
                                 overwrite=True)

    def create_hdfs_file_of_size_in_mb(self, path, size=300):
        with self.create_temp_file_of_size(size) as tmp_file:
            md5_of_tmp_file = self.calculate_md5(tmp_file)
            self.copy_to_hdfs(path, tmp_file)

        return md5_of_tmp_file

    def get_remote_file(self, path):
        return self.webhdfs.read_file(path)

    def write_remote_file_to_local_temp(self, remote_path):
        local = self.create_temp_file()
        file = self.get_remote_file(remote_path)
        local.write(file)
        local.seek(0)
        return local

    def get_hdfsfile_and_calc_md5(self, path):
        with self.write_remote_file_to_local_temp(path) as temp_file:
            return self.calculate_md5(temp_file)

    def cleanup_remote_file(self, path, recursive=False):
        self.webhdfs.delete_file_dir(path, recursive=recursive)

    def get_host_location_of_first_block(self, filename):
        file_block_info = self.get_block_info_for_file(filename)
        file_first_block_info = self.get_first_block_info(
            filename, file_block_info.text)
        file_block_ip = self.get_location_of_first_block(file_first_block_info)
        return self.get_host_by_ip(file_block_ip)
Example #33
for function in ll.functions:
    try:
        symbol = function.name
        log("Loading types for function %s" % symbol)
        # skip the first argument, which is FunctionContext*
        arg_types = tuple([llvm2impala[arg.pointee.name] for arg in function.type.pointee.args[1:]])
        functions.append((symbol, arg_types))
    except (AttributeError, KeyError):
        # this process could fail for non-UDF helper functions...just ignore them,
        # because we're not going to be registering them anyway
        log("Had trouble with function %s; moving on..." % symbol)
        pass

# transfer the LLVM module to HDFS
hdfs_client = PyWebHdfsClient(host=args.nn_host, port=args.webhdfs_port, user_name=args.user)
hdfs_client.create_file(args.hdfs_path.lstrip('/'), bc, overwrite=args.force)
log("Transferred LLVM IR to HDFS at %s" % args.hdfs_path)

# register the functions with impala
conn = impala.dbapi.connect(host=args.impala_host, port=args.impala_port)
cursor = conn.cursor(user=args.user)
log("Connected to impalad: %s" % args.impala_host)
if args.db:
    cursor.execute('USE %s' % args.db)
cursor.execute("SHOW FUNCTIONS")
registered_functions = cursor.fetchall()
for (udf_name, return_type) in zip(args.name, args.return_type):
    log("Registering function %s" % udf_name)
    # find matching LLVM symbols to the current UDF name
    matches = [pair for pair in functions if udf_name in pair[0]]
    if len(matches) == 0:
Example #34
while True:
    a_net = randint(1, 255)
    ROW = t.row(str(a_net))
    if len(ROW) > 0:
        for key, value in ROW.items():
            if value != str(-1):
                START = randint(1, 255)
                continue
    t.put(str(a_net), {'data:user': '******'})
    print 'scanning the major ' + str(a_net) + '.0.0.0/8 subnet'
    for bnet in range(0, 256):
        if a_net == 10:
            continue
        elif a_net == 192 and bnet == 168:
            continue
        elif a_net == 172 and bnet == 16:
            continue
        elif a_net == 127:
            continue
        IPADDR = str(a_net) + '.' + str(bnet) + '.0.0/16'
        OFILE = str(a_net) + '-' + str(bnet) + '-p80.log'
        A = subprocess.Popen(
            ['masscan', '-p80', '-oG', OFILE, IPADDR, '--rate=2000'])
        A.wait()
        time.sleep(2)
        HADOOP_FILE_NAME = 'user/root/scans/' + str(a_net) + '/' + OFILE
        with open(OFILE) as ofp:
            hdfs.create_file(HADOOP_FILE_NAME, ofp)
        subprocess.Popen(['rm', OFILE])
    t.put(str(a_net), {'data:user': '******'})
Example #35
def from_pandas(ic,
                df,
                table=None,
                path=None,
                method='in_query',
                file_format='TEXTFILE',
                field_terminator='\t',
                line_terminator='\n',
                escape_char='\\',
                hdfs_host=None,
                webhdfs_port=50070,
                hdfs_user=None,
                overwrite=False):
    """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala

    path is the dir, not the filename
    """
    # TODO: this is not atomic
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (ic._temp_db, temp_table)
    if path is None:
        path = os.path.join(ic._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    columns = list(df.columns)
    types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes]
    schema = zip(columns, types)
    create_stmt = _create_table(table_name,
                                schema,
                                path=path,
                                file_format=file_format,
                                field_terminator=field_terminator,
                                line_terminator=line_terminator,
                                escape_char=escape_char)
    ic._cursor.execute(create_stmt)
    if method == 'in_query':
        query = "INSERT INTO %s VALUES " % table_name.to_sql()
        query += ', '.join([
            '(%s)' % ', '.join(map(_py_to_sql_string, row))
            for row in df.values
        ])
        ic._cursor.execute(query)
    elif method == 'webhdfs':
        if file_format != 'TEXTFILE':
            raise ValueError("only TEXTFILE format supported for webhdfs")
        if path is None:
            raise ValueError(
                "must supply a path for EXTERNAL table for webhdfs")
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs_client = PyWebHdfsClient(host=hdfs_host,
                                      port=webhdfs_port,
                                      user_name=hdfs_user)
        raw_data = StringIO()
        df.to_csv(raw_data,
                  sep=field_terminator,
                  line_terminator=line_terminator,
                  quoting=csv.QUOTE_NONE,
                  escapechar=escape_char,
                  header=False,
                  index=False)
        hdfs_client.create_file(os.path.join(path, 'data.txt').lstrip('/'),
                                raw_data.getvalue(),
                                overwrite=overwrite)
        raw_data.close()
    else:
        raise ValueError("method must be 'in_query' or 'webhdfs'; got %s" %
                         method)
    return from_sql_table(ic, table_name.to_sql())
Example #36
        _FULLURL = url + link.get('href')
        if _FULLURL.endswith('.pdf') and _FULLURL.startswith(
                "https://www.basf.com/documents/"):
            urls.append(_FULLURL)
            names.append(link.get('href').rsplit('/', 1)[-1])

names_urls = zip(names, urls)

for name, url in names_urls:
    if not os.path.isfile("files/" + name):
        rq = urllib.Request(url)
        try:
            res = urllib.urlopen(rq)
            pdf = open("files/" + name, 'wb')
            pdf.write(res.read())
            pdf.close()
            print("Download: " + url)

        except:
            continue

    else:
        print(url)

    try:
        with open('files/' + name, "rb") as file_data:  # UTF-8 Latin-1
            hdfs.create_file("/user/data/" + name, file_data)
        print("Upload to HDFS " + name)
    except:
        print("Upload to HDFS Failed")
Example #37
 def create_or_overwrite_file(self, path, f, hdfs_user='******', request_extra_opts={}):
     hdfs = PyWebHdfsClient(host='10.1.94.54', port=14000, user_name=hdfs_user, request_extra_opts=request_extra_opts)
     return hdfs.create_file(path, f, overwrite=True)
Example #38
import ujson
from impala.dbapi import connect
import datetime
import re

hdfs_row = []
bad_str =  """?NaDDi?\\""" #

hdfs_row.append('blablabla')
hdfs_row.append(re.sub("""(\n|\t|\r)""", '?', bad_str))
hdfs_row.append('blablabla')

#try:
hdfs = PyWebHdfsClient(host='al1.zmeke.com', port=50070, user_name='k.kraynov')
#data = 'test,test,test'
hdfs.create_file('user/k.kraynov/test/test.txt', 'blabla')
#hdfs.delete_file_dir('user/k.kraynov/test.txt')
#hdfs.append_file('user/k.kraynov/test2.txt', data+'\n')
#hdfs.append_file('etl/500.txt', 'test')
#hdfs.make_dir('etl/stage/log_{0}')
#conn = connect(host='al1.zmeke.com', port=21050)
#cur = conn.cursor()
#cur.execute('show tables in stage;')
#cur.execute('REFRESH analytics.test')
#result = cur.fetchall()
#for i in result:
#print hdfs.list_dir('user/k.kraynov/')
#dir = hdfs.list_dir('etl/stage/log_102/2')
#for dir_file in dir['FileStatuses']['FileStatus']:
 #   print dir_file['pathSuffix']
#except
Example #39
File: main.py  Project: aidmoit/collect
    opendata3mDataMetada = getUrlFromOpendata3M(inputCSV)
    jsonfile = open(pathToSaveDownloadedMeta, "w")
    jsonfile.write(json.dumps(opendata3mDataMetada))
    jsonfile.close()
    """Download File"""
    nboffiledl = downloadOpendata3MFiles(opendata3mDataMetada,
                                         pathToSaveDownloadedData)
    """Insert files inside HDFS and store file"""
    # connect to HDFS
    hdfs = PyWebHdfsClient(host=namenode,
                           port=namenodePort,
                           user_name=hdfsuser)
    for file in os.listdir(pathToSaveDownloadedData):
        if (str(file) != ".forgit"):
            try:
                # upload the downloaded file to HDFS under its own name
                local_path = os.path.join(pathToSaveDownloadedData, str(file))
                with open(local_path, 'rb') as file_data:
                    hdfs.create_file(str(file), file_data)
            except Exception as e:
                print('Failed to upload in HDFS: ' + str(e))
    """Build and insert iso19139 xml to geonetwork"""
    try:
        subprocess.call("/usr/bin/Rscript  addServicesToGN.R")
    except:
        print("R error due to OSM ? Try re-launched")
        subprocess.call("R -f addServicesToGN.R", shell=True)

    print(
        str(nboffiledl) + " files downloaded in : " + pathToSaveDownloadedData)
    print("AIDMOIt ingestion module ends")
Example #40
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(host=host,
                                     port=port,
                                     user_name=user,
                                     timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self,
                       local_path,
                       remote_path,
                       exclude=None,
                       permission=755):

        if exclude is None:
            exclude = []

        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path, permission=permission)

        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize('%s/%s/%s' %
                                          (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path, permission=permission)

            for fname in fnames:
                if fname not in exclude:
                    data = file(
                        canonicalize('%s/%s/%s' %
                                     (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize('%s/%s/%s' %
                                          (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path,
                                           data,
                                           overwrite=True,
                                           permission=permission)
                    data.close()

    def make_dir(self, path, permission=755):

        logging.debug('make_dir: %s', path)

        self._hdfs.make_dir(canonicalize(path), permission=permission)

    def create_file(self, data, remote_file_path, permission=755):

        logging.debug('create_file: %s', remote_file_path)

        sio = BytesIO(data)

        self._hdfs.create_file(canonicalize(remote_file_path),
                               sio,
                               overwrite=True,
                               permission=permission)

    def append_file(self, data, remote_file_path):

        logging.debug('append to: %s', remote_file_path)

        self._hdfs.append_file(canonicalize(remote_file_path), data)

    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10 * 1024 * 1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path),
                                        offset=offset,
                                        length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path),
                                            offset=offset,
                                            length=chunk_size)

    def read_file(self, remote_file_path):

        data = self._hdfs.read_file(canonicalize(remote_file_path))

        return data

    def remove(self, path, recursive=False):

        logging.debug('remove: %s', path)

        self._hdfs.delete_file_dir(canonicalize(path), recursive)

    def file_exists(self, path):

        try:
            self._hdfs.get_file_dir_status(path)
            return True
        except:
            return False
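
# A minimal usage sketch for the HDFS wrapper above (illustrative only: the host, port,
# user, and paths are assumptions, and the canonicalize() path helper used by the class
# is expected to come from the original module):
if __name__ == '__main__':
    client = HDFS(host='localhost', port='50070', user='hdfs')
    client.make_dir('/tmp/hdfs_wrapper_demo')
    client.create_file(b'hello from the wrapper\n', '/tmp/hdfs_wrapper_demo/hello.txt')
    print(client.read_file('/tmp/hdfs_wrapper_demo/hello.txt'))
    client.remove('/tmp/hdfs_wrapper_demo', recursive=True)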
Example #41
t=conn.table('anet')
while True:
  a_net=randint(1,255)
  ROW=t.row(str(a_net))
  if len(ROW) > 0:
    for key, value in ROW.items():
        if value != str(-1):
          START=randint(1,255)
          continue
  t.put(str(a_net),{'data:user':'******'})
  print 'scanning the major '+str(a_net)+'.0.0.0/8 subnet'
  for bnet in range(0,256):
    if a_net==10:
       continue
    elif a_net==192 and bnet==168:
       continue
    elif a_net==172 and bnet==16:
       continue
    elif a_net==127:
       continue
    IPADDR=str(a_net)+'.'+str(bnet)+'.0.0/16'
    OFILE=str(a_net)+'-'+str(bnet)+'-p80.log'
    A=subprocess.Popen(['masscan','-p80','-oG',OFILE,IPADDR,'--rate=2000'])
    A.wait()
    time.sleep(2)
    HADOOP_FILE_NAME='user/root/scans/'+str(a_net)+'/'+OFILE
    with open(OFILE) as ofp:
      hdfs.create_file(HADOOP_FILE_NAME,ofp)
    subprocess.Popen(['rm',OFILE])
  t.put(str(a_net),{'data:user':'******'})
#1 imports

from pywebhdfs.webhdfs import PyWebHdfsClient

#2 make connection with hadoop file system

hdfs = PyWebHdfsClient(user_name="hdfs",port=50070,host="sandbox.hortonworks.com")

#3 remove the old file if it already exists

hdfs.delete_file_dir('chapter5/LoanStats3d.csv',recursive=True)

#4 recreate the chapters directory

hdfs.make_dir('chapter5')

#5 upload the csv file

with open('./data/stored_csv.csv') as file_data:
    hdfs.create_file('chapter5/LoanStats3d.csv',file_data, overwrite=True)

#6 print the status to see if this succeeded.
print hdfs.get_file_dir_status('chapter5/LoanStats3d.csv')
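
#7 (optional; not part of the original listing) read back the first kilobyte to spot-check the upload

print hdfs.read_file('chapter5/LoanStats3d.csv', offset=0, length=1024)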
Example #43
import os

from pywebhdfs.webhdfs import PyWebHdfsClient

hdfs = PyWebHdfsClient(host='s12m.westeurope.cloudapp.azure.com', port='50070', user_name='data', timeout=10)

for root, dirs, files in os.walk("./ExxonMobil"):
    for filename in files:
        try:
            txtname = filename.split('.')[0] + '.txt'
            text = convert_pdf_to_txt_1("ExxonMobil/"+filename)
            text_file = open("text/" + txtname, 'w', encoding="utf-8")
            text_file.write(text)
            text_file.close()
            print(filename)
            
            try:
                with open('text/' + txtname, "rb") as file_data:  # UTF-8 Latin-1
                    hdfs.create_file("/user/data/txt_file/" + txtname, file_data)
                print("Upload to HDFS " + filename)
            except:
                print("Upload to HDFS Failed")

        except:
            continue
        

'''
filename = "BASF_Creating-Chemistry_07.pdf"
text = convert_pdf_to_txt_1("files/"+filename)
print(text)
text_file = open("text/" + filename.split('.')[0] + '.txt', 'w', encoding="utf-8")
text_file.write(text)
text_file.close()
'''
Example #44
# create a new client instance
hdfs = PyWebHdfsClient(host='localhost', port='50070',
                       user_name='hduser')

# create a new directory for the example
print('making new HDFS directory at: {0}\n'.format(example_dir))
hdfs.make_dir(example_dir)

# get a dictionary of the directory's status
dir_status = hdfs.get_file_dir_status(example_dir)
print(dir_status)

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# get the checksum for the file
file_checksum = hdfs.get_file_checksum(example_file)
print(file_checksum)

# append to the file created in previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)
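
# Optional cleanup sketch (not part of the original example): removing the directory
# created above lets the walkthrough be re-run from scratch; delete_file_dir() is the
# same client call used elsewhere in this collection.
print('removing example directory at: {0}\n'.format(example_dir))
hdfs.delete_file_dir(example_dir, recursive=True)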
Example #45
def create_data_from_station_data(first, second):
    """this function creates the data analyzing the two stations in comparison"""
    global hdfs   # module-level HDFS client, created below on first use
    global hbase  # module-level HBase connection pool, created below on first use

    if (hdfs is None):
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu',
                               port='50070',
                               user_name='uacharya')

    if (hbase is None):
        import happybase
        hbase = happybase.ConnectionPool(size=1,
                                         host='cshadoop.boisestate.edu')

    date_for_comparision = first["Date"].strip()

    # creating directory for each date
    try:
        hdfs.get_file_dir_status('user/uacharya/single_screen/' +
                                 date_for_comparision)
    except Exception:
        # directory to hold the dataset as a CSV file for each node in the wall display, numbered 1 to 9
        content = 'Date,ID,Source,Destination,S_Lat,S_Lon,D_Lat,D_Lon,Wind_Lat,Wind_Lon,Wind_Velocity\n'
        try:
            hdfs.create_file('user/uacharya/single_screen/' +
                             date_for_comparision + '/data/output.csv',
                             content,
                             replication=1)
        except Exception:
            pass

    dataset = {
        'node_1': [],
        'node_2': [],
        'node_3': []
    }

    for data in broadcast_variable.value:
        compare_data_between(date_for_comparision, first, data, dataset)


#    for key in dataset:
#        if(len(dataset[key])!=0):
#            content = "\n".join(dataset[key]);
#            content +="\n";
#            while(True):
#                try:
#                    hdfs.append_file('user/uacharya/simulation/'+date+'/'+key+'/output.csv',content,buffersize=4096);
#                    break;
#                except Exception:
#                    time.sleep(0.2);
#                    continue;

    dataset.clear()  # clear the dictionary before returning
    # append here once all the global variables have been set up
    return second
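
# The commented-out block above sketches a retry loop around append_file(). A hedged
# refactor of that idea into a standalone helper might look like this; the buffer size
# and sleep interval mirror the commented code, `import time` is assumed to exist at
# the top of the original module, and the helper name is my own.
def append_with_retry(hdfs_client, path, content, retry_delay=0.2):
    """Retry an HDFS append until it succeeds, as in the commented loop above."""
    while True:
        try:
            hdfs_client.append_file(path, content, buffersize=4096)
            return
        except Exception:
            time.sleep(retry_delay)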
Example #46
File: DMS.py  Project: lukkiddd/DMSHadoop
class DMS:
    def __init__(self, debug=0):
        ''' This function use to init a class. To show error messages, debug
        should be 1.
        :param : debug - 1, show an error or success message. 0 otherwise
        :return: Nothing.
        '''
        self.debug = debug
        pass

    def hbase_connection(self, host, port, table='dms'):
        ''' This function use to establish a connection to hbase, for preparing to
        insert, remove, and fetch data from hbase. We use starbase to connect to hbase
        via rest api.(See more: https://github.com/barseghyanartur/starbase)
        :param : host - hbase rest host
        :param : port - hbase rest running port
        :param : table - DMS table on hbase (default: 'dms')
        :return: Nothing.
        '''
        self.hbase = hbaseConnection(host=host, port=port)
        t = self.hbase.table(table)
        if (not t.exists()):
            t.create('meta_data','file')
        self.hbase_table = t

    def hdfs_connection(self, host, port, user_name, hdfs_path='/tmp/'):
        ''' This function use to establish a connection to hdfs, for preparing to
        create, retrieve, update, delete file in hdfs. We use pywebhdfs in order to
        do this task via hdfs rest api.(See more: http://pythonhosted.org/pywebhdfs/)
        :param : host - hdfs rest host
        :param : port - hdfs rest running port
        :param : user_name - hdfs username (for authentication)
        :param : hdfs_path - location to store files. (default: '/tmp/')
        :return: Nothing.
        '''
        self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
        self.hdfs_path = hdfs_path

    def solr_connection(self, host, port, collection):
        ''' This function use to establish a connection to solr, for query or
        search any text on a system.
        :param : host - solr's host
        :param : port - solr's running port
        :param : collection - solr's collection for searching
        '''
        self.solr = ''.join(['http://',host,':',port,'/solr/',collection])

    def extract(self, file):
        ''' This function use to extract meta data from a file. We use hachoir3 library
        to extract them. (See more: http://hachoir3.readthedocs.org)
        :param : file - file for extract
        :return: meta data as dict for success, 0 if fail.
        '''
        try:
            filename, realname = unicodeFilename(file), file
            parser = createParser(filename, realname)
            meta_data = extractMetadata(parser)
            meta_data_text = meta_data.exportPlaintext()
            meta_list = dict()
            for i in range(1,len(meta_data_text)):
                meta_split = meta_data_text[i].split(":")
                column = meta_split[0].replace('- ','')
                value = meta_split[1].lstrip()
                meta_list.update({column:value})
            return meta_list
        except:
            if self.debug:
                print "Something went wrong, the meta data of",file,"could not be extracted."
            return None


    def upload(self, file):
        ''' This function use to upload a file to hdfs and store meta data on hbase
        Meta data consist of 2 main parts: file's meta data and hdfs's file's meta data.
        This function will increase the file version if it is already stored in hbase.
        :param : file - file's name
        :return: True if success otherwise False.
        '''
        version = 1
        key = ''.join(['v',str(version),'.',file])
        path = ''.join([self.hdfs_path,key])

        # Read a file
        try:
            f = open(file,'r')
            file_content = f.read()
            f.close()
        except:
            print "Cannot read file:",file

        # Check file's version
        while self.hbase_table.fetch(key) != None:
            version = int(self.get_lastest_version(file)) + 1
            key = ''.join(['v',str(version),'.',file])
            path = ''.join([self.hdfs_path,key])

        # Try to upload file.
        try:
            self.hdfs.create_file(path,file_content)
            hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus']
            file_meta = self.extract(file)
            t = self.hbase_table
            status = t.insert(
                key,
                {
                    'file': {'content': file_content}
                }
            )
            if status != 200:
                if self.debug:
                    print "Error inserting: file content"
            # save hbase meta data
            for i in range(0,len(file_meta.keys())):
                status = t.insert(
                    key,
                    {
                        'meta_data': {file_meta.keys()[i]: file_meta[file_meta.keys()[i]]}
                    }
                )
                if status != 200:
                    if self.debug:
                        print "Error inserting:", file_meta.keys()[i]
            # save hdfs meta data
            for i in range(0,len(hdfs_meta.keys())):
                status = t.insert(
                    key,
                    {
                        'meta_data': {hdfs_meta.keys()[i]: hdfs_meta[hdfs_meta.keys()[i]]}
                    }
                )
                if status != 200:
                    if self.debug:
                        print "Error inserting:", hdfs_meta.keys()[i]
            # save version
            status = t.insert(
                key,
                {
                    'meta_data': {'version': version}
                }
            )
            if status != 200:
                if self.debug:
                    print "Error inserting: version"
        except:
            if self.debug:
                print "Upload failed."
            return False
        if self.debug:
            print "[Uploaded]", file, "version:", version
        return True

    def download(self, file, version=None, download_dir=''):
        ''' This function use to retrieve or download file from hdfs. Then save
        it as a new file named (v[version].[file] - For example, v1.mytext.txt).
        You can specify the directory of downloaded file. You can also specify
        file's version for downloading if not it will be version 1.
        :param : file - file's name
        :param : version - file's version (default: 1)
        :param : download_dir - download directory (default: '' or current directory
                 NOTE: it must end with '/' - For example, '../download/')
        :return: True if success otherwise false.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v',str(version),'.',file])
        path = ''.join([self.hdfs_path,key])
        downloaded_file = ''.join([download_dir,key])
        try:
            f = open(downloaded_file, 'w')
            f.write(self.hdfs.read_file(path))
            f.close()
        except:
            if self.debug:
                print "Cannot download a file:", file
            return False
        if self.debug:
            print "[Downloaded]",key
        return True

    def update(self, file, version=None):
        ''' This function use to update file to hdfs and data stored in hbase by
        overwrite that file on hdfs, and also insert new data to hbase too. You can
        specify a file's version in order to update it.
        :param : file - file's name
        :param : version - file's version
        :return: True if success otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v',str(version),'.',file])
        path = ''.join([self.hdfs_path,key])

        # Read a file
        try:
            f = open(file,'r')
            file_content = f.read()
            f.close()
        except:
            print "Cannot read file:",file

        # Try to upload file.
        try:
            self.hdfs.create_file(path,file_content,overwrite=True)
            hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus']
            file_meta = self.extract(file)
            t = self.hbase_table
            status = t.insert(
                key,
                {
                    'file': {'content': file_content,
                             'name': file}
                }
            )
            if status != 200:
                if self.debug:
                    print "Error inserting: file content"

            # save hbase meta data
            for i in range(0,len(file_meta.keys())):
                status = t.insert(
                    key,
                    {
                        'meta_data': {file_meta.keys()[i]: file_meta[file_meta.keys()[i]]}
                    }
                )
                if status != 200:
                    if self.debug:
                        print "Error inserting:", file_meta.keys()[i]
            # save hdfs meta data
            for i in range(0,len(hdfs_meta.keys())):
                status = t.insert(
                    key,
                    {
                        'meta_data': {hdfs_meta.keys()[i]: hdfs_meta[hdfs_meta.keys()[i]]}
                    }
                )
                if status != 200:
                    if self.debug:
                        print "Error inserting:", hdfs_meta.keys()[i]
            # save version
            status = t.insert(
                key,
                {
                    'meta_data': {'version': version}
                }
            )
            if status != 200:
                if self.debug:
                    print "Error inserting: version"
        except:
            if self.debug:
                print "Update failed."
            return False
        if self.debug:
            print "[Updated]", file, "version:", version
        return True

    def delete(self, file, version=None):
        ''' This function use to delete file in hbase, and hdfs. You can specify
        file's version in order to delete it.
        :param : file - file's name
        :param : version - file's version
        :return: True if success otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v',str(version),'.',file])
        path = ''.join([self.hdfs_path,key])

        # Check if file exists
        if self.hbase_table.fetch(key) == None:
            if self.debug:
                print "Cannot delete.",key,"is not exists."
            return False

        # Remove row on hbase
        t = self.hbase_table
        if t.remove(key) != 200:
            if self.debug:
                print "[HBASE] cannot remove a row key:",key
            return False

        # Delete file on hdfs
        if not self.hdfs.delete_file_dir(path):
            if self.debug:
                print "[HDFS] Cannot remove a file path:",path
            return False
        if self.debug:
            print "[Deleted]", file, "version:", version
        return True

    def get_file_meta_data(self, file, version=None):
        ''' This function use to get all file's meta_data from hbase. You can
        specify a file's version.
        :param : file - file's name
        :param : version - file's version
        :return: meta data as dict for success, 0 if fail
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v',str(version),'.',file])
        if not self.hbase_table.fetch(key):
            if self.debug:
                print key,"is not exists"
            return False
        return self.hbase_table.fetch(key)['meta_data']

    def get_file_content(self, file, version=None):
        ''' This function use to get all file's content from hbase. You can
        specify a file's version.
        :param : file - file's name
        :param : version - file's version
        :return: meta data as dict for success, 0 if fail
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v',str(version),'.',file])
        if not self.hbase_table.fetch(key):
            if self.debug:
                print key,"is not exists"
            return False
        return self.hbase_table.fetch(key)['file']

    def search(self, text):
        ''' This function will search in xxxx via solr rest api.
        :param : text - text for searching
        :return: json response from solr, False for not found.
        '''
        query = urlopen(''.join([self.solr,'/select?q=',text,'&wt=json']))
        response = simplejson.load(query)
        if response['response']['numFound'] == 0:
            if self.debug:
                print text,"not found!"
            return False
        return response

    def get_all_file(self):
        ''' This function return all files that stored on Hbase in a list format.
        :param : Nothing.
        :return: fetch result as a list.
        '''
        rf = '{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": ""}}'
        t = self.hbase_table
        result = t.fetch_all_rows(with_row_id=True, filter_string=rf)
        return list(result)

    def get_file_version(self, file):
        ''' This function will fetch data from file name then return them.
        :param : file - file's name
        :return: file_list with version as a dict.
        '''
        rf = ''.join(['{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": "',file,'"}}'])
        t = self.hbase_table
        result = t.fetch_all_rows(with_row_id=True, filter_string=rf)
        lsr = list(result)
        file_version = list()
        for i in range(0,len(lsr)):
            file_version.append(lsr[i].keys()[0].split('.')[0].split('v')[1])
        file_list = dict()
        file_list['name'] = file
        file_list['version'] = file_version
        return file_list

    def get_lastest_version(self, file):
        ''' This function will return the latest version number as an integer.
        :param : file - file's name
        :return: version number as an integer.
        '''
        file_version = self.get_file_version(file)
        file_version['version'].sort()
        return file_version['version'][len(file_version['version'])-1]

    def delete_all_version(self, file):
        ''' This function will delete all file's version in an hbase and HDFS
        :param : file - file's name
        :return: True if success otherwise False
        '''
        self.get_file_version(file)['version'].sort()
        for version in self.get_file_version(file)['version']:
            try:
                self.delete(file,version)
            except:
                return False
        return True

    def delete_all(self):
        ''' This function will delete all the files on an hbase and hdfs.
        :param : Nothing
        :return: True if success otherwise False
        '''
        for full_file in self.get_all_file():
            file = full_file.keys()[0].split('.')[1]
            version = full_file.keys()[0].split('.')[0].split('v')[1]
            try:
                self.delete(file,version)
            except:
                return False
        return True
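
# A short usage sketch for the DMS class above (hosts, ports, and the file name are
# placeholders rather than values from the original project; the HBase REST gateway,
# WebHDFS, and Solr services must already be running for these calls to succeed):
dms = DMS(debug=1)
dms.hbase_connection('localhost', '8000')
dms.hdfs_connection('localhost', '50070', 'hduser')
dms.solr_connection('localhost', '8983', 'dms')
dms.upload('mytext.txt')
print(dms.get_file_meta_data('mytext.txt'))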
Example #47
class DMS:
    def __init__(self, debug=0):
        ''' This function use to init a class. To show error messages, debug
        should be 1.
        :param : debug - 1, show an error or success message. 0 otherwise
        :return: Nothing.
        '''
        self.debug = debug
        pass

    def hbase_connection(self, host, port, table='dms'):
        ''' This function use to establish a connection to hbase, for preparing to
        insert, remove, and fetch data from hbase. We use starbase to connect to hbase
        via rest api.(See more: https://github.com/barseghyanartur/starbase)
        :param : host - hbase rest host
        :param : port - hbase rest running port
        :param : table - DMS table on hbase (default: 'dms')
        :return: Nothing.
        '''
        self.hbase = hbaseConnection(host=host, port=port)
        t = self.hbase.table(table)
        if (not t.exists()):
            t.create('meta_data', 'file')
        self.hbase_table = t

    def hdfs_connection(self, host, port, user_name, hdfs_path='/tmp/'):
        ''' This function use to establish a connection to hdfs, for preparing to
        create, retrieve, update, delete file in hdfs. We use pywebhdfs in order to
        do this task via hdfs rest api.(See more: http://pythonhosted.org/pywebhdfs/)
        :param : host - hdfs rest host
        :param : port - hdfs rest running port
        :param : user_name - hdfs username (for authentication)
        :param : hdfs_path - location to store files. (default: '/tmp/')
        :return: Nothing.
        '''
        self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
        self.hdfs_path = hdfs_path

    def solr_connection(self, host, port, collection):
        ''' This function use to establish a connection to solr, for query or
        search any text on a system.
        :param : host - solr's host
        :param : port - solr's running port
        :param : collection - solr's collection for searching
        '''
        self.solr = ''.join(['http://', host, ':', port, '/solr/', collection])

    def extract(self, file):
        ''' This function use to extract meta data from a file. We use hachoir3 library
        to extract them. (See more: http://hachoir3.readthedocs.org)
        :param : file - file for extract
        :return: meta data as dict for success, 0 if fail.
        '''
        try:
            filename, realname = unicodeFilename(file), file
            parser = createParser(filename, realname)
            meta_data = extractMetadata(parser)
            meta_data_text = meta_data.exportPlaintext()
            meta_list = dict()
            for i in range(1, len(meta_data_text)):
                meta_split = meta_data_text[i].split(":")
                column = meta_split[0].replace('- ', '')
                value = meta_split[1].lstrip()
                meta_list.update({column: value})
            return meta_list
        except:
            if self.debug:
                print "Something went wrong, meta data of", file, "could not extract."
            return None

    def upload(self, file):
        ''' This function use to upload a file to hdfs and store meta data on hbase
        Meta data consist of 2 main parts: file's meta data and hdfs's file's meta data.
        This function will increase the file version if it is already stored in hbase.
        :param : file - file's name
        :return: True if success otherwise False.
        '''
        version = 1
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])

        # Read a file
        try:
            f = open(file, 'r')
            file_content = f.read()
            f.close()
        except:
            print "Cannot read file:", file

        # Check file's version
        while self.hbase_table.fetch(key) != None:
            version = int(self.get_lastest_version(file)) + 1
            key = ''.join(['v', str(version), '.', file])
            path = ''.join([self.hdfs_path, key])

        # Try to upload file.
        try:
            self.hdfs.create_file(path, file_content)
            hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus']
            file_meta = self.extract(file)
            t = self.hbase_table
            status = t.insert(key, {'file': {'content': file_content}})
            if status != 200:
                if self.debug:
                    print "Error inserting: file content"
            # save hbase meta data
            for i in range(0, len(file_meta.keys())):
                status = t.insert(
                    key, {
                        'meta_data': {
                            file_meta.keys()[i]: file_meta[file_meta.keys()[i]]
                        }
                    })
                if status != 200:
                    if self.debug:
                        print "Error inserting:", file_meta.keys()[i]
            # save hdfs meta data
            for i in range(0, len(hdfs_meta.keys())):
                status = t.insert(
                    key, {
                        'meta_data': {
                            hdfs_meta.keys()[i]: hdfs_meta[hdfs_meta.keys()[i]]
                        }
                    })
                if status != 200:
                    if self.debug:
                        print "Error inserting:", hdfs_meta.keys()[i]
            # save version
            status = t.insert(key, {'meta_data': {'version': version}})
            if status != 200:
                if self.debug:
                    print "Error inserting: version"
        except:
            if self.debug:
                print "Upload failed."
            return False
        if self.debug:
            print "[Uploaded]", file, "version:", version
        return True

    def download(self, file, version=None, download_dir=''):
        ''' This function use to retrieve or download file from hdfs. Then save
        it as a new file named (v[version].[file] - For example, v1.mytext.txt).
        You can specify the directory of downloaded file. You can also specify
        file's version for downloading if not it will be version 1.
        :param : file - file's name
        :param : version - file's version (default: 1)
        :param : download_dir - download directory (default: '' or current directory
                 NOTE: it must end with '/' - For example, '../download/')
        :return: True if success otherwise false.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])
        downloaded_file = ''.join([download_dir, key])
        try:
            f = open(downloaded_file, 'w')
            f.write(self.hdfs.read_file(path))
            f.close()
        except:
            if self.debug:
                print "Cannot download a file:", file
            return False
        if self.debug:
            print "[Downloaded]", key
        return True

    def update(self, file, version=None):
        ''' This function use to update file to hdfs and data stored in hbase by
        overwrite that file on hdfs, and also insert new data to hbase too. You can
        specify a file's version in order to update it.
        :param : file - file's name
        :param : version - file's version
        :return: True if success otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])

        # Read a file
        try:
            f = open(file, 'r')
            file_content = f.read()
            f.close()
        except:
            print "Cannot read file:", file

        # Try to upload file.
        try:
            self.hdfs.create_file(path, file_content, overwrite=True)
            hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus']
            file_meta = self.extract(file)
            t = self.hbase_table
            status = t.insert(
                key, {'file': {
                    'content': file_content,
                    'name': file
                }})
            if status != 200:
                if self.debug:
                    print "Error inserting: file content"

            # save hbase meta data
            for i in range(0, len(file_meta.keys())):
                status = t.insert(
                    key, {
                        'meta_data': {
                            file_meta.keys()[i]: file_meta[file_meta.keys()[i]]
                        }
                    })
                if status != 200:
                    if self.debug:
                        print "Error inserting:", file_meta.keys()[i]
            # save hdfs meta data
            for i in range(0, len(hdfs_meta.keys())):
                status = t.insert(
                    key, {
                        'meta_data': {
                            hdfs_meta.keys()[i]: hdfs_meta[hdfs_meta.keys()[i]]
                        }
                    })
                if status != 200:
                    if self.debug:
                        print "Error inserting:", hdfs_meta.keys()[i]
            # save version
            status = t.insert(key, {'meta_data': {'version': version}})
            if status != 200:
                if self.debug:
                    print "Error inserting: version"
        except:
            if self.debug:
                print "Update failed."
            return False
        if self.debug:
            print "[Updated]", file, "version:", version
        return True

    def delete(self, file, version=None):
        ''' This function use to delete file in hbase, and hdfs. You can specify
        file's version in order to delete it.
        :param : file - file's name
        :param : version - file's version
        :return: True if success otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])

        # Check if file exists
        if self.hbase_table.fetch(key) == None:
            if self.debug:
                print "Cannot delete.", key, "is not exists."
            return False

        # Remove row on hbase
        t = self.hbase_table
        if t.remove(key) != 200:
            if self.debug:
                print "[HBASE] cannot remove a row key:", key
            return False

        # Delete file on hdfs
        if not self.hdfs.delete_file_dir(path):
            if self.debug:
                print "[HDFS] Cannot remove a file path:", path
            return False
        if self.debug:
            print "[Deleted]", file, "version:", version
        return True

    def get_file_meta_data(self, file, version=None):
        ''' This function use to get all file's meta_data from hbase. You can
        specify a file's version.
        :param : file - file's name
        :param : version - file's version
        :return: meta data as dict for success, 0 if fail
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        if not self.hbase_table.fetch(key):
            if self.debug:
                print key, "is not exists"
            return False
        return self.hbase_table.fetch(key)['meta_data']

    def get_file_content(self, file, version=None):
        ''' This function use to get all file's content from hbase. You can
        specify a file's version.
        :param : file - file's name
        :param : version - file's version
        :return: meta data as dict for success, 0 if fail
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        if not self.hbase_table.fetch(key):
            if self.debug:
                print key, "is not exists"
            return False
        return self.hbase_table.fetch(key)['file']

    def search(self, text):
        ''' This function will search in xxxx via solr rest api.
        :param : text - text for searching
        :return: json response from solr, False for not found.
        '''
        query = urlopen(''.join([self.solr, '/select?q=', text, '&wt=json']))
        response = simplejson.load(query)
        if response['response']['numFound'] == 0:
            if self.debug:
                print text, "not found!"
            return False
        return response

    def get_all_file(self):
        ''' This function return all files that stored on Hbase in a list format.
        :param : Nothing.
        :return: fetch result as a list.
        '''
        rf = '{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": ""}}'
        t = self.hbase_table
        result = t.fetch_all_rows(with_row_id=True, filter_string=rf)
        return list(result)

    def get_file_version(self, file):
        ''' This function will fetch data from file name then return them.
        :param : file - file's name
        :return: file_list with version as a dict.
        '''
        rf = ''.join([
            '{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": "',
            file, '"}}'
        ])
        t = self.hbase_table
        result = t.fetch_all_rows(with_row_id=True, filter_string=rf)
        lsr = list(result)
        file_version = list()
        for i in range(0, len(lsr)):
            file_version.append(lsr[i].keys()[0].split('.')[0].split('v')[1])
        file_list = dict()
        file_list['name'] = file
        file_list['version'] = file_version
        return file_list

    def get_lastest_version(self, file):
        ''' This function will return the latest version number as an integer.
        :param : file - file's name
        :return: version number as an integer.
        '''
        file_version = self.get_file_version(file)
        file_version['version'].sort()
        return file_version['version'][len(file_version['version']) - 1]

    def delete_all_version(self, file):
        ''' This function will delete all file's version in an hbase and HDFS
        :param : file - file's name
        :return: True if success otherwise False
        '''
        self.get_file_version(file)['version'].sort()
        for version in self.get_file_version(file)['version']:
            try:
                self.delete(file, version)
            except:
                return False
        return True

    def delete_all(self):
        ''' This function will delete all the files on an hbase and hdfs.
        :param : Nothing
        :return: True if success otherwise False
        '''
        for full_file in self.get_all_file():
            file = full_file.keys()[0].split('.')[1]
            version = full_file.keys()[0].split('.')[0].split('v')[1]
            try:
                self.delete(file, version)
            except:
                return False
        return True
Example #48
rename_dir = 'user/hdfs/example_rename'

# create a new client instance
hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hduser')

# create a new directory for the example
print('making new HDFS directory at: {0}\n'.format(example_dir))
hdfs.make_dir(example_dir)

# get a dictionary of the directory's status
dir_status = hdfs.get_file_dir_status(example_dir)
print dir_status

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print file_status

# append to the file created in previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print file_status

# read in the data for the file
print('reading data from file at: {0}\n'.format(example_file))
file_data = hdfs.read_file(example_file)
print file_data
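
# The rename_dir defined at the top of this example is otherwise unused; the original
# pywebhdfs walkthrough presumably continues with a rename step along these lines
# (rename_file_dir() takes the source path and an absolute destination path):
print('renaming directory from {0} to {1}\n'.format(example_dir, rename_dir))
hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir))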
Example #49
    def save(self):
        hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')
        deployment_path = "user/oozie/bundles/{0}".format(self.name)
        bundle_path = "{0}/bundle.xml".format(deployment_path)

        hdfs.create_file(bundle_path, self.as_xml())
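
# A hedged follow-up (assumption, not from the original project): create_file() defaults
# to overwrite=False, so calling save() twice for the same bundle name would fail. A
# re-deployable variant could pass the flag explicitly:
#
#     hdfs.create_file(bundle_path, self.as_xml(), overwrite=True)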