Пример #1
0
import os
import sys

from settings import PATH, HIVE_SCRIPT_PATH, HIVE_DB, HDFS_PATH, HIVE_QUERY_SCRIPTS_PATH
from common import create_dir, color, get_files_in_dir_with_extension
        
# Collect the Hive scripts (*.hive) that the queries are generated from.
DATA_FILES = get_files_in_dir_with_extension(HIVE_SCRIPT_PATH, '.hive')

# Nothing to do without Hive scripts. Report the directory that was actually
# searched (HIVE_SCRIPT_PATH, not PATH) and stop.
if not DATA_FILES:
    print(color.RED + "No hive files found in directory " + HIVE_SCRIPT_PATH + color.END)
    sys.exit(0)

# Prefix for every `hive -e` command: select the database and enable
# column headers in the CLI output.
HIVE_SETUP = "use " + HIVE_DB + "; set hive.cli.print.header=true; "

# Output directory for the generated query scripts
# (create_dir presumably creates it when missing -- confirm in common.py).
create_dir(HIVE_QUERY_SCRIPTS_PATH)

def generate_hive_queries(fields, tablename):
    """Write a bash script of Hive queries for *tablename*.

    The script is created at HIVE_QUERY_SCRIPTS_PATH/<tablename>.sh. The
    commands it contains redirect their output into files under
    HIVE_QUERY_SCRIPTS_PATH/<tablename>/.

    fields    -- iterable of indexable records; element 0 of each record is
                 read as the column name.
    tablename -- Hive table name, also used to build file and directory names.
    """
    # Directory the generated script will write its results into.
    results_dir = HIVE_QUERY_SCRIPTS_PATH + '/' + tablename
    create_dir(results_dir)

    table_file = results_dir + '/' + tablename + '.txt'
    hive_prefix = "hive -e \"" + HIVE_SETUP
    commands = [
        # First command creates/truncates the per-table result file.
        hive_prefix + " select count(*) as all_rows from " + tablename + ";\" > " + table_file + '\n',
        "echo \"----\" >>" + table_file + '\n',
        # Append here: a plain '>' would truncate the file and throw away the
        # row count and separator produced by the two commands above.
        hive_prefix + " describe " + tablename + ";\" >> " + table_file + '\n',
    ]

    # The context manager closes the script file even if a write fails.
    with open(HIVE_QUERY_SCRIPTS_PATH + '/' + tablename + '.sh', "w") as query_file:
        query_file.write("#!/bin/bash \n")
        query_file.write("".join(commands))
        for field in fields:
            column_name = field[0]
            # NOTE(review): the per-column result path is computed but not used
            # in the visible code; the rest of this loop appears to be missing.
            file_name = results_dir + '/' + column_name + '.txt'
Пример #2
0
# import required packages
import os
import pandas as pd
from settings import PATH, HIVE_SCRIPT_PATH, HIVE_DB, HDFS_PATH, BIG_HIVE_SCRIPT, dry_run
from common import create_dir, color, get_files_in_dir_with_extension
import subprocess
import sys

# Gather the data files from the configured directory: CSV first, then TXT.
DATA_FILES = get_files_in_dir_with_extension(PATH, '.csv')
DATA_FILES += get_files_in_dir_with_extension(PATH, '.txt')

# Without any CSV or TXT input there is nothing to process: report and exit.
if not DATA_FILES:
    message = "No csv or txt found in directory " + PATH
    print(color.RED + message + color.END)
    raise SystemExit(0)

# Make sure the directory for the Hive scripts is present.
create_dir(HIVE_SCRIPT_PATH)

# define a bunch of nice helper functions
'''
For each file, create a directory, because in Hive we need to put the data into a folder named after the table
'''


def generate_folder(datafile):
    """Create a directory named after *datafile* minus its extension.

    Returns the path of that directory (the file path with the extension
    stripped by os.path.splitext).
    """
    directory, _extension = os.path.splitext(datafile)
    create_dir(directory)
    print("Created the following directory : " + directory)
    return directory
Пример #3
0
# import required packages
import os
import pandas as pd
from settings import PATH, HIVE_SCRIPT_PATH, HIVE_DB, HDFS_PATH, BIG_HIVE_SCRIPT, dry_run
from common import create_dir, color, get_files_in_dir_with_extension
import subprocess
import sys

# Look up the input files in PATH: the CSV files, followed by the TXT files.
DATA_FILES = get_files_in_dir_with_extension(PATH, '.csv')
TXT_FILES = get_files_in_dir_with_extension(PATH, '.txt')
DATA_FILES.extend(TXT_FILES)
del TXT_FILES  # temporary only; keep the module namespace as it was

# No CSV or TXT files found means there is no work to do, so stop here.
if not DATA_FILES:
    warning = color.RED + "No csv or txt found in directory " + PATH + color.END
    print(warning)
    sys.exit(0)

# The Hive scripts are written to this directory; make sure it is there.
create_dir(HIVE_SCRIPT_PATH)

# define a bunch of nice helper functions

'''
For each file, create a directory, because in Hive we need to put the data into a folder named after the table
'''

def generate_folder(datafile):
    """Make the folder that corresponds to *datafile* and return its path.

    The folder path is the data file's path with the extension removed.
    """
    folder = os.path.splitext(datafile)[0]
    create_dir(folder)
    notice = "Created the following directory : " + folder
    print(notice)
    return folder