Example #1
def read(fname):
    "Internal API to read data from HDFS files"
    # we need the data file schema; a proper default should be provided later
    uri = os.getenv('WMA_SCHEMA', '/user/valya/test/fwjr_proc.avsc')
    schemaData = hdfs.load(uri)
    schema = avro.schema.parse(schemaData)

    out = []
    data = hdfs.load(fname)
    bytes_reader = io.BytesIO(data)

    if fname.endswith('.gz'):
        # wrap the BytesIO stream in a gzip reader
        gzip_reader = gzip.GzipFile(fileobj=bytes_reader)
        decoder = avro.io.BinaryDecoder(gzip_reader)
    else:
        # use non-compressed reader
        decoder = avro.io.BinaryDecoder(bytes_reader)

    reader = avro.io.DatumReader(schema)
    while True:
        try:
            rec = reader.read(decoder)
            out.append(rec)
        except Exception:
            # stop once no more records can be decoded
            break
    # close gzip stream if necessary
    if fname.endswith('.gz'):
        gzip_reader.close()

    # close bytes stream
    bytes_reader.close()
    return out
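A minimal usage sketch for the read() helper above; the data path below is hypothetical, and the schema location simply mirrors the function's default:

import os

# point the helper at an explicit schema file (hypothetical path)
os.environ['WMA_SCHEMA'] = '/user/valya/test/fwjr_proc.avsc'
# read records from a gzip-compressed Avro file on HDFS (hypothetical path)
records = read('/user/valya/test/data/fwjr.avro.gz')
print(len(records))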
Example #2
    def __cp_file(self, wd):
        fn = "%s/fn" % wd
        hdfs.dump(self.data, fn, mode="wb")
        dest_dir = "%s/dest_dir" % wd
        hdfs.mkdir(dest_dir)
        fn_copy_on_wd = "%s/fn_copy" % wd
        hdfs.cp(fn, fn_copy_on_wd, mode="wb")
        self.assertEqual(hdfs.load(fn_copy_on_wd), self.data)
        self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd)
        fn_copy_on_dest_dir = "%s/fn" % dest_dir
        hdfs.cp(fn, dest_dir, mode="wb")
        self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data)
        self.assertRaises(IOError, hdfs.cp, fn, dest_dir)
Example #3
def read_avro(fname, schema):
    "Internal API to read data from HDFS files"
    out = []
    data = hdfs.load(fname)
    bytes_reader = io.BytesIO(data)

    if fname.endswith('.gz'):
        # wrap the BytesIO stream in a gzip reader
        gzip_reader = gzip.GzipFile(fileobj=bytes_reader)
        decoder = avro.io.BinaryDecoder(gzip_reader)
    else:
        # use non-compressed reader
        decoder = avro.io.BinaryDecoder(bytes_reader)

    reader = avro.io.DatumReader(schema)
    while True:
        try:
            rec = reader.read(decoder)
            out.append(rec)
        except Exception:
            # stop once no more records can be decoded
            break
    # close gzip stream if necessary
    if fname.endswith('.gz'):
        gzip_reader.close()

    # close bytes stream
    bytes_reader.close()
    return out
Example #4
def read_job_conf(path: str) -> Dict[Any, Any]:
    """
    The configuration file is passed as a path on HopsFS.
    The file is a JSON document whose contents depend on the op type.
    """
    file_content = hdfs.load(path)
    return json.loads(file_content)
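A short usage sketch for read_job_conf(); the HopsFS path and the 'op' key are illustrative assumptions only:

# hypothetical JSON job configuration stored on HopsFS
job_conf = read_job_conf('Resources/jobs/my_job/config.json')
op_type = job_conf.get('op')  # assumed key; the actual fields depend on the op type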
Example #5
    def _read(self, spec, fields=None):
        "Internal read API"
        if PAT_UID.match(str(spec)):  # requested to read concrete file
            out = []
            year, month, _ = today()
            hdir = '%s/%s/%s' % (self.hdir, year, month)
            fname = file_name(hdir, spec, self.compress)
            data = hdfs.load(fname)
            bytes_reader = io.BytesIO(data)

            if self.compress:
                # wrap the BytesIO stream in a gzip reader
                gzip_reader = gzip.GzipFile(fileobj=bytes_reader)
                decoder = avro.io.BinaryDecoder(gzip_reader)
            else:
                # use non-compressed reader
                decoder = avro.io.BinaryDecoder(bytes_reader)

            reader = avro.io.DatumReader(self.schema)
            while True:
                try:
                    rec = reader.read(decoder)
                    out.append(rec)
                except Exception:
                    # stop once no more records can be decoded
                    break
            # close gzip stream if necessary
            if self.compress:
                gzip_reader.close()
            # close bytes stream
            bytes_reader.close()
            return out
        return self.empty_data
Example #6
    def read_schema(self, context):
        sfile = context.get_job_conf().get('avro.schema', None)
        try:
            schemaData = hdfs.load(sfile)
        except ValueError:
            # if not sfile:
            # else:
            sys.exit(1)

        self.schema = avro.schema.parse(schemaData)
Example #7
    def __cp_recursive(self, wd):
        src_t = self.__make_tree(wd)
        src = src_t.name
        copy_on_wd = "%s_copy" % src
        src_bn, copy_on_wd_bn = [
            hdfs.path.basename(d) for d in (src, copy_on_wd)
        ]
        hdfs.cp(src, copy_on_wd)
        for t in src_t.walk():
            copy_name = t.name.replace(src_bn, copy_on_wd_bn)
            self.assertTrue(hdfs.path.exists(copy_name))
            if t.kind == 0:
                self.assertEqual(hdfs.load(copy_name), self.data)
        hdfs.cp(src, copy_on_wd)
        for t in src_t.walk():
            copy_name = t.name.replace(src_bn,
                                       "%s/%s" % (copy_on_wd_bn, src_bn))
            self.assertTrue(hdfs.path.exists(copy_name))
            if t.kind == 0:
                self.assertEqual(hdfs.load(copy_name), self.data)
Example #8
def load(hdfs_path):
    """
    Read the content of hdfs_path and return it.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS).

    Returns:
        the read contents of hdfs_path
    """
    hdfs_path = _expand_path(hdfs_path)
    return hdfs.load(hdfs_path)
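A usage sketch for the load() wrapper above; both paths are hypothetical, the first relative to the project's HDFS directory and the second a full hdfs pathname:

content = load('Resources/notes.txt')              # relative to the Project's path
raw = load('/Projects/demo/Resources/notes.txt')   # full hdfs pathname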
Example #9
    def __init__(self, uri, compress=True):
        "ctor with hdfs uri: hdfsio:/path/schema.avsc"
        Storage.__init__(self, uri)
        schema = self.uri
        if not hdfs.ls(schema):
            raise Exception("No avro schema file found in provided uri: %s" %
                            uri)
        self.hdir = self.uri.rsplit('/', 1)[0]
        if not hdfs.path.isdir(self.hdir):
            raise Exception('HDFS path %s does not exist' % self.hdir)
        schema_doc = hdfs.load(schema)
        self.schema = avro.schema.parse(schema_doc)
        self.compress = compress
Example #10
    def __cp_recursive(self, wd):
        src_t = self.__make_tree(wd)
        src = src_t.name
        copy_on_wd = "%s_copy" % src
        src_bn, copy_on_wd_bn = [
            hdfs.path.basename(d) for d in (src, copy_on_wd)
        ]
        hdfs.cp(src, copy_on_wd, mode="wb")
        exp_t = self.__make_tree(wd, root=copy_on_wd_bn, create=False)
        for t, exp_t in czip(src_t.walk(), exp_t.walk()):
            self.assertTrue(hdfs.path.exists(exp_t.name))
            if t.kind == 0:
                self.assertEqual(hdfs.load(exp_t.name), self.data)
        # check semantics when target dir already exists
        hdfs.rmr(copy_on_wd)
        hdfs.mkdir(copy_on_wd)
        hdfs.cp(src, copy_on_wd, mode="wb")
        exp_t = self.__make_tree(copy_on_wd, root=src_bn, create=False)
        for t, exp_t in czip(src_t.walk(), exp_t.walk()):
            self.assertTrue(hdfs.path.exists(exp_t.name))
            if t.kind == 0:
                self.assertEqual(hdfs.load(exp_t.name), self.data)
Example #11
def wordcount():
    f = request.files['file']
    f.save(
        os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(f.filename)))
    with open(f.filename, 'r') as fopen:
        hdfs.dump(fopen.read(), '/user/input_wordcount/text')
    os.system(
        'pydoop script -c combiner wordcount.py /user/input_wordcount /user/output_wordcount'
    )
    list_files = hdfs.hdfs().list_directory('/user/output_wordcount')
    return json.dumps([
        hdfs.load(file['name'], mode='rt') for file in list_files
        if 'SUCCESS' not in file['name']
    ])
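A hedged client-side sketch for exercising the handler above; it assumes the view is registered at a hypothetical /wordcount route on a local Flask development server (the route decorator is not shown in the snippet):

import requests

with open("sample.txt", "rb") as fh:
    # upload a file under the 'file' form field expected by request.files['file']
    resp = requests.post("http://localhost:5000/wordcount", files={"file": fh})
print(resp.json())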
Example #12
def lowercase():
    f = request.files['file']
    f.save(
        os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(f.filename)))
    with open(f.filename, 'r') as fopen:
        hdfs.dump(fopen.read(), '/user/input_lowercase/text')
    os.system(
        "pydoop script --num-reducers 0 -t '' lowercase.py /user/input_lowercase /user/output_lowercase"
    )
    list_files = hdfs.hdfs().list_directory('/user/output_lowercase')
    return json.dumps([
        hdfs.load(file['name'], mode='rt') for file in list_files
        if 'SUCCESS' not in file['name']
    ])
Example #13
    def __init__(self, file_prefix, loadexist=False, readonly=False):
        CustomStorage.__init__(self)
        if not loadexist:
            if hdfs.path.exists('{0}_0'.format(file_prefix)):
                file_prefix += '_0'
            while hdfs.path.exists('{0}_0'.format(file_prefix)):
                insert_index = file_prefix.rfind('_')
                file_prefix = '{0}_{1}'.format(file_prefix[:insert_index], int(file_prefix[insert_index + 1:]) + 1)
        self.file_prefix = file_prefix
        self.read_only = readonly
        self.clear()
        logger.info('init hdfs storage from hdfs file_prefix {0}'.format(self.file_prefix))
        try:
            total_start = timeit.default_timer()
            prefix_split = hdfs.path.splitpath(self.file_prefix)
            folder_path = prefix_split[0]
            real_prefix = prefix_split[1] + '_'
            if not hdfs.path.exists(folder_path):
                hdfs.mkdir(folder_path)

            files_info = hdfs.lsl(folder_path)
            # files_info = hdfs.lsl('{0}_*'.format(self.file_prefix))
            logger.debug('files_info:{0}'.format(files_info))
            sizecount = 0
            for file_info in files_info:
                start_time = timeit.default_timer()
                file_name = hdfs.path.splitpath(file_info['path'])[1]
                if file_name.startswith(real_prefix) and file_info['kind'] == 'file':
                    logger.debug('file info: {0}'.format(file_info))
                    page_id = file_name[len(real_prefix):]
                    if not page_id.isdigit():
                        continue
                    logger.debug('file {0} page id :{1}#'.format(file_info['path'],
                                                                 page_id))
                    # if page_id.isdigit():
                    logger.info('load {0}# page file {1}'.format(page_id,
                                                                 file_info['path']))
                    content = hdfs.load(file_info['path'], mode='r')
                    # logger.debug('{0}# page content:{1}'.format(page_id, content))
                    self.pagedict[int(page_id)] = content
                    logger.debug('{0}# page load complete'.format(page_id))
                    end_time = timeit.default_timer()
                    eval(generate_timer_log_str.format(
                        'load {0} {1} byte'.format(file_name, len(self.pagedict[int(page_id)])),
                        start_time,
                        end_time))
                    sizecount += len(self.pagedict[int(page_id)])
        except IOError:
            logger.debug(traceback.format_exc())
Example #14
    def __init__(self, uri, wmauri, yarn=''):
        "ctor with LTS uri (hdfs:///path/schema.avsc) and WMArchive uri"
        self.uri = uri
        if not hdfs.ls(self.uri):
            raise Exception("No avro schema file found in provided uri: %s" %
                            uri)
        self.hdir = self.uri.rsplit('/', 1)[0]
        if not hdfs.path.isdir(self.hdir):
            raise Exception('HDFS path %s does not exist' % self.hdir)
        schema_doc = hdfs.load(self.uri)
        self.schema = avro.schema.parse(schema_doc)
        self.taskmgr = TaskManager()
        self.wmauri = wmauri  # WMArchive URL which will be used by submit
        if not self.wmauri.endswith('/wmarchive/data'):
            self.wmauri = '%s/wmarchive/data' % self.wmauri
        self.yarn = yarn
Example #15
import sys
import pydoop.hdfs as hdfs

#create package
#date = 'data/'+str(sys.argv[1])[2:]+'/'
st = '['
for x in hdfs.ls("data/18-02-21/"):
    #for x in hdfs.ls("date"):
    st = st + hdfs.load(x)

st = st.replace("\n", ",")
st = st[:-1]
st = st + ']'

#string4 = '{"input":"'+str(sys.argv[1])+'"}'

hdfs.dump(st, "test/hello.txt")
hdfs.get("test/hello.txt", "/tmp/tmp.txt")
Example #16
import pydoop.hdfs as hdfs

hdfs.mkdir("test")
hdfs.dump("hello hadoop", "test/hello.txt")

text = hdfs.load("test/hello.txt")
print(text)
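A slightly extended round-trip sketch built from the same pydoop.hdfs calls used elsewhere on this page (dump, load, rmr); the paths are arbitrary:

import pydoop.hdfs as hdfs

hdfs.mkdir("test")
hdfs.dump(b"hello hadoop", "test/hello.bin", mode="wb")  # write raw bytes
data = hdfs.load("test/hello.bin")                       # read them back
print(data)
hdfs.rmr("test")                                         # recursive cleanup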
Example #17
    def get_data_from_hdfs(self, file_location_hdfs):
        if self.file_exist(file_location_hdfs) == False:
            return -1
        data = hdfs.load(file_location_hdfs)
        return data
Example #18
    def load(self):
        for test_path in self.hdfs_paths[0], self.local_paths[0]:
            hdfs.dump(self.data, test_path, mode="wb")
            rdata = hdfs.load(test_path)
            self.assertEqual(rdata, self.data)
Example #19
import pydoop
import pydoop.hdfs as hdfs

if __name__ == "__main__":
    print(hdfs.ls("/user"))
    print(hdfs.ls("."))
    files = hdfs.ls(".")
    text = hdfs.load(files[0])
    print(text[0:20])
    print(hdfs.path.isdir("/user"))
    basename = hdfs.path.basename(files[0])

    #hdfs.get(basename, "/tmp/" + basename)
    #with open("/tmp/" + basename) as f:
    #    print(f.read())

    #hdfs.put("/tmp/" + basename, basename + ".copy")
    #print(hdfs.load(basename + ".copy"))

    print(hdfs.ls("."))
Example #20
def main(input_path, output_attribute_index, scikit_output_path,
         spark_output_path):

    # Instantiate the Passive Aggressive Regressor model
    regressor = PassiveAggressiveRegressor()
    for file_path in hdfs.ls(input_path):
        # Load the file content and build a string matrix from it
        content = hdfs.load(file_path)
        temp = content.split("\n")
        temp = list(map(lambda x: x.split(","), temp))
        temp = list(filter(lambda x: len(x) > 1, temp))
        raw_matrix = np.array(temp)
        # Load the numpy matrix and parse it into a matrix of real values,
        # which is then used to train the model
        # raw_matrix = np.genfromtxt(file_path, delimiter=',', dtype='string')
        input_matrix = raw_matrix[1:, 3:-5].astype('float64')
        output_vector = raw_matrix[1:, -5 +
                                   output_attribute_index].astype('float64')
        # The model is trained through incremental (partial) fitting
        regressor.partial_fit(input_matrix, output_vector)
        # Print the path of the processed file to the console
        print(file_path)

    # Save the created model to the output path
    # that was passed in as an argument
    # (binary mode is needed for pickle)
    with hdfs.open(scikit_output_path, 'wb') as opened_file:
        pickle.dump(regressor, opened_file)

    # Initialize the application's Spark configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Trainer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")
    # Initialize the session
    # (required in order to save the model)
    session = SparkSession(context)

    # Load the RDD data from the input path
    input_data = context.textFile(input_path)
    # Split each row into fields
    input_data = input_data.map(lambda x: x.split(","))
    # Skip the headers
    input_data = input_data.filter(lambda x: x[0] != "Timestamp")
    # Drop the first three columns (Timestamp, Latitude and Longitude)
    # and select the appropriate output column
    # (depending on the output_attribute_index variable)
    input_data = input_data.map(lambda x: list(map(lambda y: float(y), x[
        3:-5])) + [float(x[-5 + output_attribute_index])])

    # Build the corresponding DataFrame object
    # (VectorAssembler is used to build the feature columns
    # required by the linear regression fit method)
    input_cols = []
    for i in range(15):
        input_cols.append("_" + str(i + 1))
    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data_frame = assembler.transform(input_data.toDF())

    # Instantiate a LinearRegression object, train it,
    # and then save it to the given path
    regression = LinearRegression(featuresCol='features', labelCol='_16')
    model = regression.fit(data_frame)
    model.write().overwrite().save(spark_output_path)
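A hypothetical invocation of main() above; all four arguments are placeholders that only illustrate the expected call shape:

if __name__ == "__main__":
    main(
        "/user/demo/sensor_data",               # HDFS directory of CSV part files
        0,                                      # which of the last five columns to predict
        "/user/demo/models/pa_regressor.pkl",   # output path for the pickled scikit-learn model
        "/user/demo/models/spark_lr",           # output path for the Spark LinearRegression model
    )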