Example #1
import os

from hdfs.ext.kerberos import KerberosClient


def save_file_hdfs(rdd, dir_files_pdf, server_hdfs, user_name_hdfs):
    # Each record is an (id, technical info, file bytes) triple.
    n_file_id = int(rdd[0])
    n_info_tec = rdd[1].replace("/", "-")  # "/" would break the target path
    n_file = rdd[2]
    hdfsclient = KerberosClient(server_hdfs)
    hdfsclient.write(os.path.join(dir_files_pdf,
                                  '{}_{}.pdf'.format(n_file_id, n_info_tec)),
                     n_file,
                     overwrite=True)
    return rdd
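
save_file_hdfs is written to be applied record by record, each record being an (id, technical info, file bytes) triple; a minimal PySpark driver sketch, where the target directory, namenode URL and user name are placeholder assumptions:

files_rdd.foreach(lambda row: save_file_hdfs(row,
                                             '/user/project/pdfs',
                                             'http://namenode.example.com:50070',
                                             'hdfs_user'))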
Example #2
from io import BytesIO

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from pyproj import Proj
from hdfs.ext.kerberos import KerberosClient


def transform(outDir, image, x, y, dt):
    # Render off-screen; no display is available on the cluster.
    plt.switch_backend('agg')
    plt.figure(figsize=(25, 15), dpi=100)
    # satHeight, satLongitude and satSweep are assumed to be module-level
    # constants describing the GOES-16 geostationary projection.
    p = Proj(proj='geos', h=satHeight, lon_0=satLongitude, sweep=satSweep)
    XX, YY = np.meshgrid(x, y)
    lons, lats = p(XX, YY, inverse=True)
    mH = Basemap(resolution='i', projection='lcc', area_thresh=1500,
                 width=1800 * 3000, height=1060 * 3000,
                 lat_1=38.5, lat_2=38.5,
                 lat_0=38.5, lon_0=-97.5)
    xH, yH = mH(lons, lats)
    # Normalise the RGB image and flatten it into one RGBA tuple per cell,
    # as expected by pcolormesh's `color` argument.
    rgb = image[1][:, :-1, :]
    rgb = rgb / 256.0
    colorTuple = rgb.reshape((rgb.shape[0] * rgb.shape[1]), 3)
    colorTuple = np.insert(colorTuple, 3, 1.0, axis=1)
    newmap = mH.pcolormesh(xH, yH, image[1][:, :, 0], color=colorTuple, linewidth=0)
    # Clear the scalar array so the explicit face colours are used.
    newmap.set_array(None)
    mH.drawstates()
    mH.drawcountries()
    mH.drawcoastlines()
    # plt.title('GOES-16 Pseudo Color\n%s' % dt.strftime('%B %d, %Y UTC'))
    buf = BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
    buf.seek(0)
    client = KerberosClient('http://hc.gps.stthomas.edu:50070')
    with client.write(outDir + '/TRANSFORM_' + image[0].split("/")[-1], overwrite=True) as writer:
        writer.write(buf.getvalue())
    buf.close()
Example #3
from io import BytesIO

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from hdfs.ext.kerberos import KerberosClient


def addMap(outDir, image, satLongitude, xmin, xmax, ymin, ymax, dt):
    # Render off-screen; no display is available on the cluster.
    plt.switch_backend('agg')
    plt.figure(figsize=(25, 15), dpi=100)
    m = Basemap(projection='geos', lon_0=satLongitude,
                resolution='i', area_thresh=1000,
                llcrnrx=xmin, llcrnry=ymin,
                urcrnrx=xmax, urcrnry=ymax)
    m.imshow(np.flipud(image[1]))
    m.drawcoastlines()
    m.drawcountries()
    m.drawstates()
    # plt.title('GOES-16 Pseudo Color\n%s' % dt.strftime('%B %d, %Y UTC'))
    buf = BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
    buf.seek(0)
    client = KerberosClient('http://hc.gps.stthomas.edu:50070')
    with client.write(outDir + '/MAP_' + image[0].split("/")[-1], overwrite=True) as writer:
        writer.write(buf.getvalue())
    buf.close()
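
Both transform and addMap take image as a (source path, pixel array) pair and stream the rendered PNG straight to HDFS rather than writing to local disk; a minimal driver sketch, where decoded_images and the bound variables are placeholder assumptions:

for image in decoded_images:  # each item is (source_path, numpy pixel array)
    addMap(outDir, image, satLongitude, xmin, xmax, ymin, ymax, dt)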
Example #4
# `client` (presumably an hdfs KerberosClient), `mlb`, `vectorizer`, `X`, `y`
# and `train_keys` are built earlier in the training script.
print('Fitting model to data...')
clf = OneVsRestLogisticRegression(negative_column_index=NEGATIVE_COLUMN_INDEX,
                                  class_weight='balanced')
clf.fit(X, y)

print('Saving to HDFS...')
mlb_pickle = pickle.dumps(mlb)
vectorizer_pickle = pickle.dumps(vectorizer)
clf_pickle = pickle.dumps(clf)

formatted_hdfs_path = "/".join(HDFS_MODEL_DIR.split('/')[5:])
current_time = datetime.now().strftime('%Y%m%d%H%M%S')

client.write('{}/{}/model/mlb_binarizer.pkl'.format(formatted_hdfs_path,
                                                    current_time),
             mlb_pickle,
             overwrite=True)
client.write('{}/{}/model/vectorizer.pkl'.format(formatted_hdfs_path,
                                                 current_time),
             vectorizer_pickle,
             overwrite=True)
client.write('{}/{}/model/model.pkl'.format(formatted_hdfs_path, current_time),
             clf_pickle,
             overwrite=True)

keys_string = 'SNCA_DK\n' + "\n".join([str(int(k)) for k in train_keys])
client.write('{}/{}/model/train_keys.csv'.format(formatted_hdfs_path,
                                                 current_time),
             keys_string,
             overwrite=True)
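
Reading the artifacts back later follows the same pattern in reverse; a minimal sketch, assuming the same directory layout as above (the namenode URL and timestamp are placeholders):

import pickle

from hdfs.ext.kerberos import KerberosClient

client = KerberosClient('http://namenode.example.com:50070')
model_dir = '{}/{}/model'.format(formatted_hdfs_path, current_time)

with client.read('{}/mlb_binarizer.pkl'.format(model_dir)) as reader:
    mlb = pickle.loads(reader.read())
with client.read('{}/vectorizer.pkl'.format(model_dir)) as reader:
    vectorizer = pickle.loads(reader.read())
with client.read('{}/model.pkl'.format(model_dir)) as reader:
    clf = pickle.loads(reader.read())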
Example #5
# y has to be given in str format here, so as to store the results as tuples
# even if they have a single element
df_results = pd.DataFrame(
    np.concatenate(
        (df[ID_COLUMN].values.reshape(-1, 1),
            np.array([str(p) for p in y]).reshape(-1, 1)),
        axis=1),
    columns=[ID_COLUMN, LABEL_COLUMN]
)

print('Writing results to HDFS...')
formatted_hdfs_path = "/".join(HDFS_MODEL_DIR.split('/')[5:])
current_time = datetime.now().strftime('%Y%m%d%H%M%S')
client.write(
    '{}/{}/results/{}.csv'.format(formatted_hdfs_path,
                                  most_recent_date,
                                  current_time),
    df_results.to_csv(index=False),
    overwrite=True)

# Should only commit everything at the end, in a single transaction
conn.jconn.setAutoCommit(False)

set_module_and_client(curs, 'DUNANT IA')

# Some applications of the model should not update the database tables
if UPDATE_TABLES:
    print('Writing results to tables...')
    for labels, snca_dk in zip(y, df[ID_COLUMN].values):
        update_motivo_declarado(curs, snca_dk, labels)
        update_atividade_sindicancia(
            curs, snca_dk, ROBOT_NAME, ROBOT_NUMBER)
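
The single commit mentioned above is not part of this excerpt; a minimal sketch of how the transaction might be closed, assuming the same jaydebeapi-style conn and curs:

try:
    conn.commit()
except Exception:
    conn.rollback()
    raise
finally:
    curs.close()
    conn.close()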
Example #6
    py_logger.info("hive execution completed")

    client = KerberosClient(hdfs_url)

    # `session` is assumed to be a boto3 Session created earlier in the job.
    s3 = session.client('s3', use_ssl=False, verify=False)
    counter = 0

    for file_path in file_list_arr:

        file_path = source_directory + file_path

        file_name = os.path.basename(file_path)
        key_name = s3_folder_name + file_name

        # Stream the S3 object straight into a new HDFS file.
        with client.write(file_path) as f:
            s3.download_fileobj(bucket_name, key_name, f)

        counter = counter + 1
        py_logger.info("File: " + file_path + " downloaded from s3 bucket")
        
    py_logger.info("S3 script execution completed. No.of Files downloaded: " + str(counter))

    # Compress the log files that are more than 30 days old.
    today = date.today()
    current_day = datetime.now().strftime('%d')
    log_directory = log_file_path.rpartition('/')[0] + log_file_path.rpartition('/')[1]
    tarFileName = log_directory + today.strftime("%d-%m-%Y") + '.tar.gz'

    if current_day == "30":
        # writing files to a compressed file
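        # The excerpt ends here; a minimal sketch of how this step could be
        # finished with the standard tarfile module (the *.log pattern and
        # `import tarfile, glob` at module level are assumptions):
        with tarfile.open(tarFileName, "w:gz") as tar:
            for log_file in glob.glob(log_directory + "*.log"):
                tar.add(log_file, arcname=os.path.basename(log_file))
        py_logger.info("Compressed log files into " + tarFileName)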
Example #7
from hdfs.ext.kerberos import KerberosClient


class OperateHDFS:
    def __init__(self, url):
        '''

        :param url: Hostname or IP address of the HDFS namenode, prefixed
            with the protocol and followed by the namenode's WebHDFS port;
            several URLs separated by semicolons may also be given for
            high-availability support.
        '''
        # Instantiate the HDFS web client using Kerberos authentication.
        self.client = KerberosClient(url)

    def file_list(self, file_path):
        '''

        :param file_path: Remote HDFS directory path.
        :return: All files contained in the remote directory.
        '''
        file_detail = self.client.list(hdfs_path=file_path)
        return file_detail

    def file_read(self, file_path):
        '''
        Read a file from HDFS.
        :param file_path: Remote HDFS file path.
        :return: The file's lines, stripped of surrounding whitespace.
        '''
        lines = []
        with self.client.read(hdfs_path=file_path,
                              encoding='utf-8',
                              delimiter='\n') as reader:
            for item in reader:
                lines.append(item.strip())
        return lines

    def file_create_write(self, file_path, data_write):
        '''
        Create a new file in HDFS and write content to it.
        :param file_path: Remote HDFS file path.
        :param data_write: Data to write to the file.
        :return:
        '''
        self.client.write(hdfs_path=file_path,
                          data=data_write,
                          encoding='utf-8')

    def file_append_write(self, file_path, data_append):
        '''
        Append content to a file in HDFS; the file must already exist.
        :param file_path: Remote HDFS file path.
        :param data_append: Data to append to the file.
        :return:
        '''
        self.client.write(hdfs_path=file_path,
                          data=data_append,
                          encoding='utf-8',
                          append=True)

    def file_rename(self, src_file_path, dst_file_path):
        '''
        Rename or move a file or folder.
        :param src_file_path: Source file path.
        :param dst_file_path: Destination file path.
        :return:
        '''
        self.client.rename(hdfs_src_path=src_file_path,
                           hdfs_dst_path=dst_file_path)

    def mkdir(self, file_path):
        '''
        Create a remote directory in HDFS, recursively if necessary.
        :param file_path: Path (including name) of the folder to create.
        :return:
        '''
        self.client.makedirs(hdfs_path=file_path)

    def upload_files(self, file_path, local_path):
        '''
        Upload a file or directory to HDFS.
        :param file_path: Target HDFS path. If it already exists and is a
            directory, files will be uploaded into it.
        :param local_path: Local path of the file or folder. If a folder, all
            files inside it are uploaded (note that this means a folder
            containing no files will not be created remotely).
        :return: hdfs_path_return: On success, the remote upload path.
        '''
        hdfs_path_return = self.client.upload(hdfs_path=file_path,
                                              local_path=local_path)
        return hdfs_path_return

    def download_files(self, file_path, local_path):
        '''
        Download a file or folder from HDFS and save it locally.
        :param file_path: Path of the file or folder to download from HDFS.
            If a folder, all files under it are downloaded.
        :param local_path: Local path. If it already exists and is a
            directory, the files are downloaded into it.
        :return: local_path_return: On success, the local download path.
        '''
        local_path_return = self.client.download(hdfs_path=file_path,
                                                 local_path=local_path)
        return local_path_return

    def delete_files(self, file_path):
        '''
        Delete a file or directory from HDFS.
        :param file_path: Path of the file or directory to delete in HDFS.
        :return: True if the deletion succeeded, False if no file or directory
            previously existed at `hdfs_path`.
        '''
        # recursive: recursively delete files and directories. By default this
        # method raises an HdfsError when asked to delete a non-empty directory.
        # skip_trash: when set to False, the deleted path is moved to the
        # corresponding trash folder rather than removed. This requires
        # Hadoop 2.9+ and trash enabled on the cluster.
        return self.client.delete(hdfs_path=file_path,
                                  recursive=False,
                                  skip_trash=True)

    def set_files_permission(self, file_path, permission):
        '''
        Change a file's permissions.
        :param file_path: Path of the file whose permissions will change.
        :param permission: New octal permission string for the file.
        :return:
        '''
        self.client.set_permission(hdfs_path=file_path, permission=permission)
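
A minimal usage sketch for the wrapper above (the WebHDFS URL and paths are placeholders):

hdfs_ops = OperateHDFS('http://namenode.example.com:50070')
hdfs_ops.mkdir('/tmp/demo')
hdfs_ops.file_create_write('/tmp/demo/hello.txt', 'hello hdfs\n')
hdfs_ops.file_append_write('/tmp/demo/hello.txt', 'appended line\n')
print(hdfs_ops.file_read('/tmp/demo/hello.txt'))
print(hdfs_ops.file_list('/tmp/demo'))
hdfs_ops.delete_files('/tmp/demo/hello.txt')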