Example #1
 def open(self):
   for test_path in self.hdfs_paths[0], self.local_paths[0]:
     with hdfs.open(test_path, "w") as f:
       f.write(self.data)
     f.fs.close()
     with hdfs.open(test_path) as f:
       self.assertEqual(f.read(), self.data)
     f.fs.close()
Example #2
def xml_from_hdfs(url):
    with hdfs.open(url, "r") as f:
        lines = f.read().strip().split('\n')
        docs, doc = [], None
        for line in lines:
            if line.startswith('<doc'):
                doc = line
            elif line.startswith('</doc>'):
                docs.append(doc + line)
            else:
                #line = line.replace('&', '').replace('"', "'")
                doc += line.replace('"', "'")

        for doc in docs:
            dom = bs(doc).find('doc')
            doc = {}
            try:
                doc['id'] = dom.attrs['id']
                doc['url'] = dom.attrs['url']
                doc['title'] = dom.attrs['title']
            except AttributeError:
                continue
            doc['content'] = dom.text
            doc['md5'] = hashlib.md5(str(doc)).hexdigest()
            yield doc
Example #3
 def dump(self):
   for test_path in self.hdfs_paths[0], self.local_paths[0]:
     hdfs.dump(self.data, test_path)
     with hdfs.open(test_path) as fi:
       rdata = fi.read()
     fi.fs.close()
     self.assertEqual(rdata, self.data)
Example #4
 def __init__(self, ctx):
     super(AvroReader, self).__init__(ctx)
     isplit = ctx.input_split
     self.region_start = isplit.offset
     self.region_end = isplit.offset + isplit.length
     self.reader = SeekableDataFileReader(hdfs.open(isplit.filename),
                                          DatumReader())
     self.reader.align_after(isplit.offset)
Example #5
 def __init__(self, context):
     super(AvroWriter, self).__init__(context)
     job_conf = context.job_conf
     part = int(job_conf['mapreduce.task.partition'])
     outdir = job_conf["mapreduce.task.output.dir"]
     outfn = "%s/part-r-%05d.avro" % (outdir, part)
     wh = hdfs.open(outfn, "w")
     self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
Example #6
 def map(self, ctx):
     p = BioImgPlane(ctx.value)
     pixels = p.get_xy()
     bn = '%s-z%04d-c%04d-t%04d.npy' % (p.name, p.z, p.c, p.t)
     fn = hdfs.path.join(self.out_dir, p.name, bn)
     with hdfs.open(fn, 'w') as fo:
         np.save(fo, pixels)
     ctx.emit(fn, '%s\t%s' % (p.dimension_order, pixels.shape))
Example #7
 def put(self):
   src = hdfs.path.split(self.local_paths[0])[-1]
   dest = self.hdfs_paths[0]
   with open(src, "w") as f:
     f.write(self.data)
   hdfs.put(src, dest)
   with hdfs.open(dest) as fi:
     rdata = fi.read()
   self.assertEqual(rdata, self.data)
Example #8
 def __init__(self, context):
   super(Reader, self).__init__()
   self.isplit = pp.InputSplit(context.getInputSplit())
   self.file = hdfs.open(self.isplit.filename)
   self.file.seek(self.isplit.offset)
   self.bytes_read = 0
   if self.isplit.offset > 0:
     discarded = self.file.readline()  # read by reader of previous split
     self.bytes_read += len(discarded)
Example #9
 def __init__(self, context):
     super(Writer, self).__init__(context)
     self.logger = LOGGER.getChild("Writer")
     jc = context.job_conf
     outfn = context.get_default_work_file()
     self.logger.info("writing to %s", outfn)
     hdfs_user = jc.get("pydoop.hdfs.user", None)
     self.sep = jc.get("mapreduce.output.textoutputformat.separator", "\t")
     self.file = hdfs.open(outfn, "wt", user=hdfs_user)
Example #10
def json_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    for fi in file_lists:
        with hdfs.open(fi, "r") as f:
            items = f.read().strip().split('\n')
            for it in items:
                it = loads(it)
                it['md5'] = hashlib.md5(str(it)).hexdigest()
                yield it
Example #11
 def __init__(self, context):
   super(Writer, self).__init__(context)
   self.logger = logging.getLogger("Writer")
   jc = context.getJobConf()
   jc_configure_int(self, jc, "mapred.task.partition", "part")
   jc_configure(self, jc, "mapred.work.output.dir", "outdir")
   jc_configure(self, jc, "mapred.textoutputformat.separator", "sep", "\t")
   jc_configure(self, jc, "pydoop.hdfs.user", "hdfs_user", None)
   self.outfn = "%s/part-%05d" % (self.outdir, self.part)
   self.file = hdfs.open(self.outfn, "w", user=self.hdfs_user)
Example #12
 def __init__(self, context):
     super(Writer, self).__init__(context)
     self.logger = LOGGER.getChild("Writer")
     jc = context.job_conf
     part = jc.get_int("mapred.task.partition")
     out_dir = jc["mapred.work.output.dir"]
     outfn = "%s/part-%05d" % (out_dir, part)
     hdfs_user = jc.get("pydoop.hdfs.user", None)
     self.file = hdfs.open(outfn, "w", user=hdfs_user)
     self.sep = jc.get("mapred.textoutputformat.separator", "\t")
Example #13
 def _choose_break_points(cls, args):
     n_records, n_breakpoints, path = args
     block_size = n_records * RECORD_LENGTH
     with hdfs.open(path, 'r') as f:
         data = f.read(block_size)
     assert len(data) == block_size
     step = max(n_records // n_breakpoints, 1)
     keys = sorted([data[k:k + KEY_LENGTH]
                    for k in range(0, block_size, RECORD_LENGTH)])
     return [_ for _ in it.islice(keys, step, n_records, step)]
Example #14
 def __init__(self, context):
     super(Writer, self).__init__(context)
     self.logger = LOGGER.getChild("Writer")
     jc = context.job_conf
     part = jc.get_int("mapred.task.partition")
     out_dir = jc["mapred.work.output.dir"]
     self.logger.debug("part: %d", part)
     self.logger.debug("outdir: %s", out_dir)
     outfn = "%s/part-%05d" % (out_dir, part)
     hdfs_user = jc.get("pydoop.hdfs.user", None)
     self.file = hdfs.open(outfn, "wb", user=hdfs_user)
Example #15
def processLine(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        for i, line in enumerate(handle):
            #strip line
            line = line.strip()
            
            #Submit data (my function)
            submitLine(topic, line, trials=3)
            
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" %(i, myfile["name"]))
Example #16
File: avrolib.py Project: wtj/pydoop
 def __init__(self, context):
     super(AvroWriter, self).__init__(context)
     self.logger = LOGGER.getChild('AvroWriter')
     job_conf = context.job_conf
     part = int(job_conf['mapreduce.task.partition'])
     outdir = job_conf["mapreduce.task.output.dir"]
     outfn = "%s/part-r-%05d.avro" % (outdir, part)
     wh = hdfs.open(outfn, "w")
     self.logger.debug('created hdfs file %s', outfn)
     self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
     self.logger.debug('opened AvroWriter')
Example #17
 def __init__(self, context):
   super(Reader, self).__init__()
   self.logger = logging.getLogger("Reader")
   self.isplit = pp.InputSplit(context.getInputSplit())
   for a in "filename", "offset", "length":
     self.logger.debug("isplit.%s = %r" % (a, getattr(self.isplit, a)))
   self.file = hdfs.open(self.isplit.filename)
   self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
   self.file.seek(self.isplit.offset)
   self.bytes_read = 0
   if self.isplit.offset > 0:
     discarded = self.file.readline()  # read by reader of previous split
     self.bytes_read += len(discarded)
Example #18
 def __init__(self, context):
     super(Reader, self).__init__(context)
     self.logger = LOGGER.getChild("Reader")
     self.logger.debug('started')
     self.isplit = context.input_split
     for a in "filename", "offset", "length":
         self.logger.debug(
             "isplit.{} = {}".format(a, getattr(self.isplit, a))
         )
     remainder = self.isplit.offset % RECORD_LENGTH
     self.bytes_read = 0 if remainder == 0 else RECORD_LENGTH - remainder
     self.file = hdfs.open(self.isplit.filename)
     self.file.seek(self.isplit.offset + self.bytes_read)
Example #19
def mapper(_, record, writer, conf):
    out_dir = conf.get('out.dir', utils.make_random_str())
    if not hdfs.path.isdir(out_dir):
        hdfs.mkdir(out_dir)
        hdfs.chmod(out_dir, 'g+rwx')
    img_path = record.strip()
    a = get_array(img_path)
    out_a = calc_features(a)
    out_path = hdfs.path.join(out_dir, '%s.out' % hdfs.path.basename(img_path))
    with hdfs.open(out_path, 'w') as fo:
        np.save(fo, out_a)  # actual output
    hdfs.chmod(out_path, 'g+rw')
    writer.emit(img_path, fo.name)  # info (tab-separated input-output)
Example #20
File: hadut.py Project: crs4/pydoop
def collect_output(mr_out_dir, out_file=None):
    """
    Return all mapreduce output in ``mr_out_dir``.

    Append the output to ``out_file`` if provided.  Otherwise, return
    the result as a single string (it is the caller's responsibility to
    ensure that the amount of data retrieved fits into memory).
    """
    if out_file is None:
        output = []
        for fn in iter_mr_out_files(mr_out_dir):
            with hdfs.open(fn, "rt") as f:
                output.append(f.read())
        return "".join(output)
    else:
        block_size = 16777216
        with open(out_file, 'a') as o:
            for fn in iter_mr_out_files(mr_out_dir):
                with hdfs.open(fn) as f:
                    data = f.read(block_size)
                    while len(data) > 0:
                        o.write(data)
                        data = f.read(block_size)
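
A minimal usage sketch for the collect_output function above, assuming it is importable as pydoop.hadut.collect_output (per the file/project noted in the header) and that a MapReduce job has already written its part files; the output directory path below is purely illustrative:

import pydoop.hadut as hadut

# Hypothetical job output directory (replace with a real one).
mr_out_dir = "hdfs://namenode:8020/user/me/wordcount_output"

# Small outputs: get everything back as a single in-memory string.
text = hadut.collect_output(mr_out_dir)

# Large outputs: append to a local file instead of holding the data in memory.
hadut.collect_output(mr_out_dir, out_file="wordcount_output.txt")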
Example #21
def read(readFlag):
    print(readFlag);
    if (readFlag == True):
        targetFile = config.targetFile.strip()
        targetDirectory = config.targetDirectory.strip()
        targetPath = config.targetPath
        
        print(targetPath)
        
        # instantiate hadoop
        hdfs.hdfs()
        
        # read from hadoop
        fileToRead = hdfs.open(targetPath)
        print(fileToRead.read())
Example #22
 def __init__(self, context):
     super(Reader, self).__init__(context)
     self.logger = LOGGER.getChild("Reader")
     self.logger.debug('started')
     self.isplit = context.input_split
     for a in "filename", "offset", "length":
         self.logger.debug(
             "isplit.{} = {}".format(a, getattr(self.isplit, a))
         )
     self.file = hdfs.open(self.isplit.filename)
     self.file.seek(self.isplit.offset)
     self.bytes_read = 0
     if self.isplit.offset > 0:
         discarded = self.file.readline()
         self.bytes_read += len(discarded)
Example #23
def main(argv=None):
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pteracheck'
    args.module = 'pteracheck'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = False
    args.num_reducers = 1
    args.upload_file_to_cache = ['pteracheck.py', 'ioformats.py']
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
    path = os.path.join(args.output, 'part-r-00000')
    with hdfs.open(path, 'rb') as f:
        data = f.read()
    check_rows(data.split(b'\n')[:-1])
Example #24
File: check.py Project: crs4/pydoop
def check_transpose(mr_out_dir):
    output = []
    for fn in hadut.iter_mr_out_files(mr_out_dir):
        with hdfs.open(fn, "rt") as f:
            for line in f:
                row = line.rstrip().split("\t")
                index = int(row.pop(0))
                output.append((index, row))
    output = [_[1] for _ in sorted(output)]
    exp_output = []
    in_fn = os.path.join(THIS_DIR, "data", "transpose_input", "matrix.txt")
    with open(in_fn) as f:
        for line in f:
            for i, item in enumerate(line.split()):
                try:
                    exp_output[i].append(item)
                except IndexError:
                    exp_output.append([item])
    return output == exp_output
Example #25
def processChunk(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        data = []
        
        for i, line in enumerate(handle):
            #strip line
            line = line.strip()
            data += [line]
            
            if i % 5000 == 0:
                #Submit data (my function)
                submitChunk(topic, data, trials=3)
                data = []
            
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" %(i, myfile["name"]))
                
        #for every line
        #submit the rest of the data
        submitChunk(topic, data, trials=3)
        data = []
Example #26
def xml_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    #for fi in file_lists:
    for i in xrange(0, 1):
        fi = '/datasets/corpus/enwiki-11g/wiki_912'
        with hdfs.open(fi, "r") as f:
            lines = f.read().strip().split('\n')
            docs, doc = [], None
            for line in lines:
                if line.startswith('<doc'):
                    doc = line
                elif line.startswith('</doc>'):
                    docs.append(doc + line)
                else:
                    #line = line.replace('&', '').replace('"', "'")
                    doc += line.replace('"', "'")

            for doc in docs:
                dom = bs(doc).find('doc')
                doc = dom.attrs
                doc['content'] = dom.text
                doc['md5'] = hashlib.md5(str(doc)).hexdigest()
                yield doc
Example #27
        sys.exit(1)
    else:
        return ratings

def computeRmse(model, data, n):
    """
    Compute RMSE (Root Mean Squared Error).
    """
    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))
    predictionsAndRatings = predictions.map(lambda x: ((x[0], x[1]), x[2])) \
      .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \
      .values()
    return sqrt(predictionsAndRatings.map(lambda x: (x[0] - x[1]) ** 2).reduce(add) / float(n))
for n in userArray:
    with open(uFile, "w") as fi:
        with hdfs.open('/user/cloudera/medium/ratings.dat') as f:
            for line in f:
                data = line
                userid = line.split("::")
                if int(userid[0]) == int(n):
                    fi.write(data)
                    print n

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print "Usage: /path/to/spark/bin/spark-submit --driver-memory 2g " + \
            "MovieLensALS.py movieLensDataDir"
        sys.exit(1)

    # set up environment
    conf = SparkConf() \
Example #28
import os
import pydoop.hdfs as hd
import datetime
import forecastio as fo
import pandas as pd

with hd.open("hdfs://quickstart.cloudera:8020/user/cloudera/python/cities_location.csv") as f:
    df =  pd.read_csv(f)
    
    
    df=pd.read_csv('/user/cloudera/python/cities_location.csv') 
    df.head()
    api_key = "459009d8daa503cef1e11b190c961ce5"
    #selecting the specific date
    date = datetime.datetime(2015,11,1,2,0,0)
    for i in range(len(df)):
        col = ["cities", "time",  "temperatureMin", "temperatureMax"]
        lat=df["latitude"].iloc[i]
        lng=df["longitude"].iloc[i]
        #accessing the forecast.io API
        forecast = fo.load_forecast(api_key, lat, lng, time=date)
        day = forecast.daily()
        #retrieving information for the current day
        Day=day.data[0]
        data={"cities": df["cities"].iloc[i], "time" : Day.time, "temperatureMin" : Day.temperatureMin, "temperatureMax" : Day.temperatureMax}
        if i==0 :
            weather = pd.DataFrame(data, index=[0], columns= col)
        else:
            weather1 = pd.DataFrame(data, index=[0], columns= col)
            weather = pd.concat([weather, weather1], ignore_index=True)
        
Example #29
        result = math.pow(math.e, -0.5 * (x_mu * inverse * x_mu.T))
        return norm_const * result
    else:
        raise NameError("The dimensions of the input don't match")


#import pydoop.hdfs as hdfs
k = 5

#using Hadoop system file
#with hdfs.open('/Users/ming/centroids.txt') as fp:

weights = []
means = []
sigmas = []
with hdfs.open('/Users/user06/parameters.txt') as file:
    for line in file:
        params = line.strip().split("\t")
        weights.append(float(params[0]))
        means.append(np.array(params[1].split(), float))
        sigmas.append(np.array(params[2].split(), float))

for line in sys.stdin:
    line = line.strip()
    point = np.array(line.split(), float)
    p = weights[0] * norm_pdf_multivariate(point, means[0], sigmas[0].reshape(
        (2, 2)))
    nearest = 0
    for i in range(1, k):
        q = weights[i] * norm_pdf_multivariate(point, means[i],
                                               sigmas[i].reshape((2, 2)))
Example #30
import pydoop.hdfs as hdfs


b = hdfs.path.isdir("/data")

want_file = 'traffic.csv'

if b == True:
    print("---get test ---")
    lines = []
    with hdfs.open("hdfs://127.0.0.1:9000/data/"+want_file) as f:
        for line in f:
            # print(line, type(line))
            l = line.decode("utf-8")
            if l is not None and l != "":
                lines.append(l)
    print(lines)
    print("---end get----")

    with open("i8predict_flow/"+want_file, "wb") as myfile:
        myfile.write(str(lines))
Example #31
##Importing Required Packages
import numpy as np
import pydoop.hdfs as hd
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, auc, roc_curve, precision_recall_curve, average_precision_score

##Loading Credit Card Dataset
with hd.open("/user/hduser/creditcard.csv") as f:
    CreditCardData = pd.read_csv(f, header=0)

##Reducing the number of records of the Original Dataset in case we wish to work on a smaller subset of the Dataset
ReducedData = CreditCardData.iloc[:, :]

##Shape of Credit Card Dataset, i.e. number of rows & columns present in Dataset
print("\nShape of Credit Card Dataset (rows, columns): " +
      str(ReducedData.shape))

##Removing Duplicate Records (if any)
FinalData = ReducedData.drop_duplicates()
print(
    "\nShape of Credit Card Dataset after removing duplicate records (rows, columns): "
    + str(FinalData.shape))

##Checking for missing values
Example #32
import pydoop.hdfs as hdfs
import boto3
import botocore

s3 = boto3.resource('s3')

BUCKET = "bd-mindbenders12345"

file = hdfs.open("hdfs://localhost:9000/test.txt")

s3.Bucket(BUCKET).put_object(Key="test.txt", Body=file)         
Example #33
File: hdfs.py Project: clrke/hdfs-test
import pydoop.hdfs as hdfs
import config.hdfs

with hdfs.open(config.hdfs['ur']) as f:
    for line in f:
        print(line)

Example #34
i = 0
import math
from collections import Counter

import numpy as np
import pydoop.hdfs as hdfs
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss, accuracy_score


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


vocab = Counter()
labels = Counter()

with hdfs.open(
        '/user/ds222/assignment-1/DBPedia.verysmall/verysmall_train.txt') as f:
    for line in f:
        first, next = line.split(' ', 1)
        for label in first.split(','):
            labels[label] += 1
        words = next.strip().lower().split()
        for word in words:
            if (len(word) >= 4):
                if (word[0] != '<'):
                    vocab[word] += 1
        i = i + 1
#print(i)
#print(counter)


#Convert words to indexes
Example #35
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pickle
import io
from collections import Counter

import pydoop.hdfs as hdfs
from pterasort import Partitioner

RECORD_LENGTH = 91
KEY_LENGTH = 10

fname = Partitioner.initialize_break_points(5, 1000,
                                            '/user/root/genrecords_output')
with io.open('__break_point_cache_file', 'rb') as f:
    data = f.read()
sel = pickle.loads(data)

block_size = 20000 * RECORD_LENGTH
path = '/user/root/genrecords_output/part-m-00000'
with hdfs.open(path, 'rb') as f:
    data = f.read(block_size)
keys = (data[k:k + 10] for k in range(0, block_size, RECORD_LENGTH))
partitions = Counter(map(sel.select_partition, keys))
print(partitions)
Example #36
    HDFSfiles.append(hdFiles[41:])

fileNames = []

indexName = 'music'
typeName = 'songs'
#IdField = 'songID'

bulkData = []

i = 1
for name in HDFSfiles:
    dataDict = {}
    fopen = hdfs.open("/gaana/gaanaLyrics/" + name)
    header = fopen.read()
    header = re.sub('[^a-zA-Z]', ' ', header)
    header = header.replace("Advertisements", " ")
    header = ''.join([item.lower() for item in header])
    songAndMovie = []
    dlim = "lyrics"
    # nameNew = name.replace("-", " ")
    songAndMovie.append(name)
    dataDict[name] = header
    metaDict = {}
    dataDict = {}
    for elements in songAndMovie:
        songsName = []
        # if "lyrics" in elements:
        songName = elements.split('-')
Example #37
import json
import os

import pandas as pd
import pydoop.hdfs as hdfs
from pandasql import sqldf


login=''
senha=''

os.system('echo '+senha+' | kinit '+login)
dir = '/ranger/audit/hiveServer2/'
list = hdfs.ls(dir)

df = pd.DataFrame()
for pasta in list:
    for i in range(len(hdfs.ls(pasta))):
        try:
            with hdfs.open(hdfs.ls(pasta)[i], 'r') as f:
                jsn = [json.loads(line) for line in f]
                df = df.append([pd.DataFrame(jsn)], sort=True)
            
        except:
                print("Leitura do arquivo json em " + hdfs.ls(pasta)[i] + " não foi bem sucedida")

df1 = df[['evtTime','reqUser','resource','access','reqData']]
df1['reqUser'] = df1['reqUser'].str.upper()
df1 = df1[df1['access']=='SELECT']
# exclusion of service users
exclusao = pd.DataFrame(['HIVE','RANGERLOOKUP'])
df1 = df1[~df1.reqUser.isin(exclusao.iloc[:,0])]
df1['evtTime'] = pd.to_datetime(df1['evtTime'].str[0:16], format='%Y-%m-%d %H:%M')

spark_df = spark.createDataFrame(df1)
            {"$group": {"_id": {'source':"$source",'tags':"$tags",'year': "$year_posted",'month':"$month_posted",'day':"$day_posted"}, "count": {"$sum": 1},"countNegative":{"$sum":"$Negative"},"countNeutral":{"$sum":"$Neutral"},"countPositive":{"$sum":"$Positive"}}},
            {"$sort": SON([("count", -1), ("_id", -1)])}
        ])
            #use reportdate for the filename
        filename = startdate.strftime('%Y-%m-%d')
        print(filename)
        for result_obj in daily_totals['result']:
            data_dict = result_obj['_id']
            date = (str(data_dict['year']) + "-" +  str(data_dict['month']) + "-" + str(data_dict['day']))
            tag = data_dict['tags']
            source = data_dict['source']
            count = result_obj['count']
            countPositive = result_obj['countPositive']
            countNegative = result_obj['countNegative']
            countNeutral = result_obj['countNeutral']
            data = (str(date) + "|" + str(tag) + "|" + str(source) + "|" + str(count) + "|" + str(countPositive) + "|" + str(countNegative) + "|" + str(countNeutral)+'\n')
            print(data)
            hdfs_path = '/socialmedia/sentiment/' + filename
            hdfs_path = settings.HDFS_HOST_NAME + ':' + settings.HDFS_PORT + settings.HDFS_ROOT_FOLDER + \
                        '/socialmedia/sentiment' + filename + '.in'
            logger.info('HDFS file path: %s' % hdfs_path)
            logger.debug('Data: %s' % data)

            try:
                hdfs_file = hdfs.open(hdfs_path, mode='a')
                hdfs_file.write(data.encode('utf-8'))
            except IOError as e:
                logger.debug("IOError: " + e.message)
                logger.debug("Caught Exception. Will create a new file on hdfs.")
                hdfs_file = hdfs.open(hdfs_path, mode='w')
                hdfs_file.write(data.encode('utf-8'))
Example #39
        colourImg = PIL.Image.open(imgFile)
        #imshow(np.asarray(colourImg))
        nparray = np.asarray(colourImg)
        image = cv2.cvtColor(nparray, cv2.COLOR_RGB2BGR)

    return image


brand = "logitech"
mode = "image"

os.chdir("/tmp/")
myMachine = kpath.abspath('/tmp/data/input/racetrack/image/')
print(myMachine)

with hpath.open(myMachine + "driving_log.csv") as csvFile:
    df = pd.read_csv(csvFile,
                     names=[
                         "image_center", "image_left", "image_right",
                         "steering", "speed"
                     ])

#next(df.iterrows())[1]
df.iterrows()

# read and store multiple cameras and steering angles from driving_log.csv
# all three camera images will be used to train the model
images = []
steering_measurements = []

for index, row in df.iterrows():
Example #40
	

import pydoop.hdfs as hdfs 
import logging
logging.basicConfig(level = logging.DEBUG)

# the test checks that each line is present in the output files
# for each line, its match is looked up in the directory for the corresponding date

with open('file1.csv','r') as in_f:
	for it,in_line in enumerate(in_f):
		date=in_line.strip().split(',')[0]
		for part in [1,2,3]:
			with hdfs.open('/data/archive/'+date+'/part-0000'+str(part)) as out_f:
				matching=[]
				for out_line in out_f:
					a=set(out_line.strip().split(','))
					if a==set(in_line.strip().split(',')):
						matching.append(True) 
						break
					else:
						matching.append(False)
			if any(matching):
				matching=True
				break
		if not matching:
			logging.debug("Error on line %s ,%s",it,in_line)
Example #41
 def __missing__(self, path):
     f = hdfs.open(path, "wb")
     self[path] = f
     return f
Example #42
import pydoop.hdfs as hdfs

for part in [1, 2, 3]:
    with hdfs.open('/data/archive/2014-04-29/part-0000' + str(part)) as out_f:
        with open('file1.csv', 'r') as in_f:
            for out_line in out_f:
                for in_line in in_f:
                    a = set(out_line.strip().split(','))
                    if a == set(in_line.strip().split(',')):
                        print True
                    else:
                        print False
                        print a
                        print set(in_line.strip().split(','))
Example #43
 def __init__(self, context):
     super(Reader, self).__init__()
     self.logger = logging.getLogger("Reader")  #formatted logger obtained
     self.file = hdfs.open('HD-2004-2014-d.csv')
     self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
Example #44
#!/usr/bin/python3
"""mapper.py"""
import sys
import csv
import pydoop.hdfs as hdfs
import json
import costanct as C

azienda_map = {}
with hdfs.open('input/historical_stocks.csv', 'rt') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count > 0:
            ticker, _, name, _, _ = row
            azienda_map[ticker] = {'name': name}
        line_count += 1


def toJson(azione):
    dic = {
        "ticker": azione[0],
        "name": azienda_map[azione[0]],
        "close": azione[2],
        "date": azione[7],
    }
    return json.dumps(dic)


for line in sys.stdin:
    azione = line.split(',')
Example #45
        context.setStatus("initializing")

    def map(self, context):
        k = context.getInputKey()
        tmp_data = csv.reader(f)
        words = context.getInputValue().split()
        for w in words:
            context.emit(w, "1")
            context.incrementCounter(self.inputWords, len(words))

    def close(self):
        self.logger.info("all done")


print "Prediction on HD 30 year data:"
f = hdfs.open('/HD-1984-2014-d.csv')
tmp_data = csv.reader(f)

my_data = list()
for item in tmp_data:
    tmp_item = list()
    for i in item:
        tmp_item.append(i)
    my_data.append(tmp_item)
data = my_data[1:]
X = list()
training_indices = list()
for i in xrange(int(len(data) * 0.9)):
    training_indices.append(i)

test_indices = list()
Example #46
import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession

import pandas as pd
import pydoop.hdfs as hd

# create a spark session
#sparkSession = SparkSession.builder.master("local").appName("draw heat map").getOrCreate()
#df_load = sparkSession.read.csv('hdfs://dumbo/user/gx271/pubgETL/mir_death.csv')

with hd.open("hdfs://dumbo/user/gx271/pubgETL/mir_death.csv/part-00006") as f:
    df = pd.read_csv(f)

# convert DataFrame to np array

dat = df.as_matrix()

# dat = np.loadtxt('mydata.csv')

x, y = dat[:,0], dat[:,1]

heatmap, xedges, yedges = np.histogram2d(x, y, bins=50)  
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]  
plt.clf()  
plt.imshow(heatmap, extent=extent)  
# plt.show()
Example #47
def main(input_path, output_attribute_index, scikit_output_path,
         spark_output_path):

    # Instantiate the Passive Aggressive Regressor model
    regressor = PassiveAggressiveRegressor()
    for file_path in hdfs.ls(input_path):
        # Load the file contents and build a string matrix from them
        content = hdfs.load(file_path)
        temp = content.split("\n")
        temp = list(map(lambda x: x.split(","), temp))
        temp = list(filter(lambda x: len(x) > 1, temp))
        raw_matrix = np.array(temp)
        # Load the numpy matrix and parse it into a matrix of real values
        # which is then used to train the model
        # raw_matrix = np.genfromtxt(file_path, delimiter=',', dtype='string')
        input_matrix = raw_matrix[1:, 3:-5].astype('float64')
        output_vector = raw_matrix[1:, -5 +
                                   output_attribute_index].astype('float64')
        # The model is trained through iterative improvement
        regressor.partial_fit(input_matrix, output_vector)
        # Print the path of the processed file to the console
        print(file_path)

    # Save the trained model to the output path
    # that was passed in as an argument
    with hdfs.open(scikit_output_path, 'w') as opened_file:
        pickle.dump(regressor, opened_file)

    # Initialize the application's configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Trainer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")
    # Initialize the session
    # (required in order to write the model)
    session = SparkSession(context)

    # Load the RDD data from the input path
    input_data = context.textFile(input_path)
    # Split each row into fields
    input_data = input_data.map(lambda x: x.split(","))
    # Skip the header rows
    input_data = input_data.filter(lambda x: x[0] != "Timestamp")
    # Skip the first three columns (Timestamp, Latitude and Longitude)
    # and select the appropriate output column
    # (depending on the output_attribute_index variable)
    input_data = input_data.map(lambda x: list(map(lambda y: float(y), x[
        3:-5])) + [float(x[-5 + output_attribute_index])])

    # Build the corresponding DataFrame object
    # (VectorAssembler is used to build the columns
    # that allow the linear regression fit method to be used)
    input_cols = []
    for i in range(15):
        input_cols.append("_" + str(i + 1))
    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data_frame = assembler.transform(input_data.toDF())

    # Instantiate a LinearRegression object, train it,
    # and then save it to the given path
    regression = LinearRegression(featuresCol='features', labelCol='_16')
    model = regression.fit(data_frame)
    model.write().overwrite().save(spark_output_path)
Example #48
bucket = 'enhance-it'
import boto3
import pydoop.hdfs as hdfs 

s3 = boto3.resource('s3')


file = hdfs.open('hdfs://master:9000/data_for_db/currency=USD/part-00000-f65c545f-baa0-4bf0-8aa9-0b14957848c4.c000.json')
s3.Bucket(bucket).put_object(Key='lituation/data_from_hdfs.csv', Body=file)
Example #49
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pickle
import io
from collections import Counter

import pydoop.hdfs as hdfs
from pterasort import Partitioner

RECORD_LENGTH = 91
KEY_LENGTH = 10

fname = Partitioner.initialize_break_points(
    5, 1000, '/user/root/genrecords_output'
)
with io.open('__break_point_cache_file', 'rb') as f:
    data = f.read()
sel = pickle.loads(data)

block_size = 20000 * RECORD_LENGTH
path = '/user/root/genrecords_output/part-m-00000'
with hdfs.open(path, 'rb') as f:
    data = f.read(block_size)
keys = (data[k:k + 10] for k in range(0, block_size, RECORD_LENGTH))
partitions = Counter(map(sel.select_partition, keys))
print(partitions)
Example #50
#! /usr/bin/env python

import sys
from pydoop import hdfs
from DataPoint import DataPoint

#print "Start"

# read sys.argv[1] and sys.argv[2]
# put em in lists

if len(sys.argv)<3:
	print "Error: Insufficient Arguments"
	sys.exit(-1)

oldCentroidsFile = hdfs.open(sys.argv[1])
newCentroidsFile = hdfs.open(sys.argv[2])

oldCentroids = []
newCentroids = []

for line in oldCentroidsFile:
	if line.find("\t") != -1:
		(key,value) = line.strip().split("\t")
		oldCentroid = DataPoint(value)
	else:
		oldCentroid = DataPoint(line.strip()) 
	oldCentroids.append(oldCentroid)

for line in newCentroidsFile:
	(key,value) = line.strip().split("\t")
Example #51
#! /usr/bin/env python

import sys
import DataPoint
from pydoop import hdfs

# Check for sufficient arguments
if len(sys.argv) < 2:
    print("ERROR: Insufficient arguments")
    sys.exit(-1)

# List to hold canopy centers
canopyCenters = []

# Read canopy center file
file = hdfs.open(sys.argv[1])
for line in file:
    if line.find("Warning:") == 0:
        continue
    (key, value) = line.split("\t")
    dp = DataPoint.DataPoint(value.strip())
    canopyCenters.append(dp)

# Assign points to canopies
for line in sys.stdin:
    dp = DataPoint.DataPoint(line.strip())
    insert = True
    for canopyCenter in canopyCenters:
        if dp.checkT1(canopyCenter):
            print(canopyCenter.toString() + "\t" + dp.toString())
Example #52
### Part that reads the data from HDFS ###
import pandas as pd
import numpy as np
import pydoop.hdfs as hd
from lxml import objectify

with hd.open("/user/datostiempo/20160525_1341.xml") as archivo:
    parsed = objectify.parse(archivo)

root = parsed.getroot()
prob_precipitacion = []
estado_cielo = []
viento = []
temperatura = []
tempmax = []
tempmin = []
iteraccion = 0
errores = []
print "root : ", root
for row in root.prediccion.dia:
    for row_precipitacion in row.prob_precipitacion:
        aux_precipitacion = []
        if (row_precipitacion != ''):
            aux_precipitacion.append(row_precipitacion)
        else:
            errores.append(1)
    prob_precipitacion.append(
        str(sum(aux_precipitacion) / float(len(aux_precipitacion))))
    for row_cielo in row.estado_cielo:
        aux_cielo = []
        if (row_cielo != ''):
Example #53
import os
import pandas as pd
import pydoop.hdfs as hdfs
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

conf = SparkConf().setAppName("drunk detection").setMaster("yarn")
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)

csv_file_path = "hdfs:///drunkdetection/train_data48.csv"
predictor_path = "hdfs:///drunkdetection/shape_predictor_68_face_landmarks.dat"
image_path = "hdfs:///drunkdetection/drunk3.jpg"
model_path = "hdfs:///drunkdetection/rf48.pickle"

with hdfs.open("/drunkdetection/train_data48.csv") as csv:
    df = pd.read_csv(csv, index_col=0)
print(df.columns)
df_y = df['label'] == 3
df_X = df[['x' + str(i)
           for i in range(1, 49)] + ['y' + str(j) for j in range(1, 49)]]
X_train, X_test, y_train, y_test = train_test_split(df_X,
                                                    df_y,
                                                    test_size=0.2,
                                                    random_state=15)

# Feature Scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
Example #54
import os
import pydoop.hdfs as hd
import datetime
import forecastio as fo
import pandas as pd

with hd.open(
        "hdfs://quickstart.cloudera:8020/user/cloudera/python/cities_location.csv"
) as f:
    df = pd.read_csv(f)

    df = pd.read_csv('/user/cloudera/python/cities_location.csv')
    df.head()
    api_key = "459009d8daa503cef1e11b190c961ce5"
    #selecting the specific date
    date = datetime.datetime(2015, 11, 1, 2, 0, 0)
    for i in range(len(df)):
        col = ["cities", "time", "temperatureMin", "temperatureMax"]
        lat = df["latitude"].iloc[i]
        lng = df["longitude"].iloc[i]
        #accessing the forecast.io API
        forecast = fo.load_forecast(api_key, lat, lng, time=date)
        day = forecast.daily()
        #retrieving information for the current day
        Day = day.data[0]
        data = {
            "cities": df["cities"].iloc[i],
            "time": Day.time,
            "temperatureMin": Day.temperatureMin,
            "temperatureMax": Day.temperatureMax
        }