Example #1
 def open(self):
   for test_path in self.hdfs_paths[0], self.local_paths[0]:
     with hdfs.open(test_path, "w") as f:
       f.write(self.data)
     f.fs.close()
     with hdfs.open(test_path) as f:
       self.assertEqual(f.read(), self.data)
     f.fs.close()
Example #2
def xml_from_hdfs(url):
    with hdfs.open(url, "r") as f:
        lines = f.read().strip().split('\n')
        docs, doc = [], None
        for line in lines:
            if line.startswith('<doc'):
                doc = line
            elif line.startswith('</doc>'):
                docs.append(doc + line)
            else:
                #line = line.replace('&', '').replace('"', "'")
                doc += line.replace('"', "'")

        for doc in docs:
            dom = bs(doc).find('doc')
            doc = {}
            try:
                doc['id'] = dom.attrs['id']
                doc['url'] = dom.attrs['url']
                doc['title'] = dom.attrs['title']
            except AttributeError:
                continue
            doc['content'] = dom.text
            doc['md5'] = hashlib.md5(str(doc)).hexdigest()
            yield doc
Example #3
 def dump(self):
   for test_path in self.hdfs_paths[0], self.local_paths[0]:
     hdfs.dump(self.data, test_path)
     with hdfs.open(test_path) as fi:
       rdata = fi.read()
     fi.fs.close()
     self.assertEqual(rdata, self.data)
Example #4
 def __init__(self, ctx):
     super(AvroReader, self).__init__(ctx)
     isplit = ctx.input_split
     self.region_start = isplit.offset
     self.region_end = isplit.offset + isplit.length
     self.reader = SeekableDataFileReader(hdfs.open(isplit.filename),
                                          DatumReader())
     self.reader.align_after(isplit.offset)
Example #5
 def __init__(self, context):
     super(AvroWriter, self).__init__(context)
     job_conf = context.job_conf
     part = int(job_conf['mapreduce.task.partition'])
     outdir = job_conf["mapreduce.task.output.dir"]
     outfn = "%s/part-r-%05d.avro" % (outdir, part)
     wh = hdfs.open(outfn, "w")
     self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
Example #6
 def map(self, ctx):
     p = BioImgPlane(ctx.value)
     pixels = p.get_xy()
     bn = '%s-z%04d-c%04d-t%04d.npy' % (p.name, p.z, p.c, p.t)
     fn = hdfs.path.join(self.out_dir, p.name, bn)
     with hdfs.open(fn, 'w') as fo:
         np.save(fo, pixels)
     ctx.emit(fn, '%s\t%s' % (p.dimension_order, pixels.shape))
Example #7
 def put(self):
   src = hdfs.path.split(self.local_paths[0])[-1]
   dest = self.hdfs_paths[0]
   with open(src, "w") as f:
     f.write(self.data)
   hdfs.put(src, dest)
   with hdfs.open(dest) as fi:
     rdata = fi.read()
   self.assertEqual(rdata, self.data)
Example #8
 def __init__(self, context):
   super(Reader, self).__init__()
   self.isplit = pp.InputSplit(context.getInputSplit())
   self.file = hdfs.open(self.isplit.filename)
   self.file.seek(self.isplit.offset)
   self.bytes_read = 0
   if self.isplit.offset > 0:
     discarded = self.file.readline()  # read by reader of previous split
     self.bytes_read += len(discarded)
Example #9
 def __init__(self, context):
     super(Writer, self).__init__(context)
     self.logger = LOGGER.getChild("Writer")
     jc = context.job_conf
     outfn = context.get_default_work_file()
     self.logger.info("writing to %s", outfn)
     hdfs_user = jc.get("pydoop.hdfs.user", None)
     self.sep = jc.get("mapreduce.output.textoutputformat.separator", "\t")
     self.file = hdfs.open(outfn, "wt", user=hdfs_user)
Example #10
def json_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    for fi in file_lists:
        with hdfs.open(fi, "r") as f:
            items = f.read().strip().split('\n')
            for it in items:
                it = loads(it)
                it['md5'] = hashlib.md5(str(it)).hexdigest()
                yield it
Example #11
 def __init__(self, context):
   super(Writer, self).__init__(context)
   self.logger = logging.getLogger("Writer")
   jc = context.getJobConf()
   jc_configure_int(self, jc, "mapred.task.partition", "part")
   jc_configure(self, jc, "mapred.work.output.dir", "outdir")
   jc_configure(self, jc, "mapred.textoutputformat.separator", "sep", "\t")
   jc_configure(self, jc, "pydoop.hdfs.user", "hdfs_user", None)
   self.outfn = "%s/part-%05d" % (self.outdir, self.part)
   self.file = hdfs.open(self.outfn, "w", user=self.hdfs_user)
Example #12
 def __init__(self, context):
     super(Writer, self).__init__(context)
     self.logger = LOGGER.getChild("Writer")
     jc = context.job_conf
     part = jc.get_int("mapred.task.partition")
     out_dir = jc["mapred.work.output.dir"]
     outfn = "%s/part-%05d" % (out_dir, part)
     hdfs_user = jc.get("pydoop.hdfs.user", None)
     self.file = hdfs.open(outfn, "w", user=hdfs_user)
     self.sep = jc.get("mapred.textoutputformat.separator", "\t")
Example #13
 def _choose_break_points(cls, args):
     n_records, n_breakpoints, path = args
     block_size = n_records * RECORD_LENGTH
     with hdfs.open(path, 'r') as f:
         data = f.read(block_size)
     assert len(data) == block_size
     step = max(n_records // n_breakpoints, 1)
     keys = sorted([data[k:k + KEY_LENGTH]
                    for k in range(0, block_size, RECORD_LENGTH)])
     return [_ for _ in it.islice(keys, step, n_records, step)]
Example #14
 def __init__(self, context):
     super(Writer, self).__init__(context)
     self.logger = LOGGER.getChild("Writer")
     jc = context.job_conf
     part = jc.get_int("mapred.task.partition")
     out_dir = jc["mapred.work.output.dir"]
     self.logger.debug("part: %d", part)
     self.logger.debug("outdir: %s", out_dir)
     outfn = "%s/part-%05d" % (out_dir, part)
     hdfs_user = jc.get("pydoop.hdfs.user", None)
     self.file = hdfs.open(outfn, "wb", user=hdfs_user)
Example #15
def processLine(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        for i, line in enumerate(handle):
            #strip line
            line = line.strip()
            
            #Submit data (my function)
            submitLine(topic, line, trials=3)
            
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" %(i, myfile["name"]))
Example #16
File: avrolib.py Project: wtj/pydoop
 def __init__(self, context):
     super(AvroWriter, self).__init__(context)
     self.logger = LOGGER.getChild('AvroWriter')
     job_conf = context.job_conf
     part = int(job_conf['mapreduce.task.partition'])
     outdir = job_conf["mapreduce.task.output.dir"]
     outfn = "%s/part-r-%05d.avro" % (outdir, part)
     wh = hdfs.open(outfn, "w")
     self.logger.debug('created hdfs file %s', outfn)
     self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
     self.logger.debug('opened AvroWriter')
Example #17
 def __init__(self, context):
   super(Reader, self).__init__()
   self.logger = logging.getLogger("Reader")
   self.isplit = pp.InputSplit(context.getInputSplit())
   for a in "filename", "offset", "length":
     self.logger.debug("isplit.%s = %r" % (a, getattr(self.isplit, a)))
   self.file = hdfs.open(self.isplit.filename)
   self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
   self.file.seek(self.isplit.offset)
   self.bytes_read = 0
   if self.isplit.offset > 0:
     discarded = self.file.readline()  # read by reader of previous split
     self.bytes_read += len(discarded)
Example #18
 def __init__(self, context):
     super(Reader, self).__init__(context)
     self.logger = LOGGER.getChild("Reader")
     self.logger.debug('started')
     self.isplit = context.input_split
     for a in "filename", "offset", "length":
         self.logger.debug(
             "isplit.{} = {}".format(a, getattr(self.isplit, a))
         )
     remainder = self.isplit.offset % RECORD_LENGTH
     self.bytes_read = 0 if remainder == 0 else RECORD_LENGTH - remainder
     self.file = hdfs.open(self.isplit.filename)
     self.file.seek(self.isplit.offset + self.bytes_read)
Example #19
def mapper(_, record, writer, conf):
    out_dir = conf.get('out.dir', utils.make_random_str())
    if not hdfs.path.isdir(out_dir):
        hdfs.mkdir(out_dir)
        hdfs.chmod(out_dir, 'g+rwx')
    img_path = record.strip()
    a = get_array(img_path)
    out_a = calc_features(a)
    out_path = hdfs.path.join(out_dir, '%s.out' % hdfs.path.basename(img_path))
    with hdfs.open(out_path, 'w') as fo:
        np.save(fo, out_a)  # actual output
    hdfs.chmod(out_path, 'g+rw')
    writer.emit(img_path, fo.name)  # info (tab-separated input-output)
Example #20
File: hadut.py Project: crs4/pydoop
def collect_output(mr_out_dir, out_file=None):
    """
    Return all mapreduce output in ``mr_out_dir``.

    Append the output to ``out_file`` if provided.  Otherwise, return
    the result as a single string (it is the caller's responsibility to
    ensure that the amount of data retrieved fits into memory).
    """
    if out_file is None:
        output = []
        for fn in iter_mr_out_files(mr_out_dir):
            with hdfs.open(fn, "rt") as f:
                output.append(f.read())
        return "".join(output)
    else:
        block_size = 16777216
        with open(out_file, 'a') as o:
            for fn in iter_mr_out_files(mr_out_dir):
                with hdfs.open(fn) as f:
                    data = f.read(block_size)
                    while len(data) > 0:
                        o.write(data)
                        data = f.read(block_size)
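
A minimal usage sketch for the collect_output function above, assuming it is importable as pydoop.hadut.collect_output (per the file/project noted in the header) and that a MapReduce job has already written its part files; the output directory path below is purely illustrative:

import pydoop.hadut as hadut

# Hypothetical job output directory (replace with a real one).
mr_out_dir = "hdfs://namenode:8020/user/me/wordcount_output"

# Small outputs: get everything back as a single in-memory string.
text = hadut.collect_output(mr_out_dir)

# Large outputs: append to a local file instead of holding the data in memory.
hadut.collect_output(mr_out_dir, out_file="wordcount_output.txt")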
Example #21
def read(readFlag):
    print(readFlag);
    if (readFlag == True):
        targetFile = config.targetFile.strip()
        targetDirectory = config.targetDirectory.strip()
        targetPath = config.targetPath
        
        print(targetPath)
        
        # instantiate hadoop
        hdfs.hdfs()
        
        # read from hadoop
        fileToRead = hdfs.open(targetPath)
        print(fileToRead.read())
Example #22
 def __init__(self, context):
     super(Reader, self).__init__(context)
     self.logger = LOGGER.getChild("Reader")
     self.logger.debug('started')
     self.isplit = context.input_split
     for a in "filename", "offset", "length":
         self.logger.debug(
             "isplit.{} = {}".format(a, getattr(self.isplit, a))
         )
     self.file = hdfs.open(self.isplit.filename)
     self.file.seek(self.isplit.offset)
     self.bytes_read = 0
     if self.isplit.offset > 0:
         discarded = self.file.readline()
         self.bytes_read += len(discarded)
Example #23
def main(argv=None):
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pteracheck'
    args.module = 'pteracheck'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = False
    args.num_reducers = 1
    args.upload_file_to_cache = ['pteracheck.py', 'ioformats.py']
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
    path = os.path.join(args.output, 'part-r-00000')
    with hdfs.open(path, 'rb') as f:
        data = f.read()
    check_rows(data.split(b'\n')[:-1])
Example #24
File: check.py Project: crs4/pydoop
def check_transpose(mr_out_dir):
    output = []
    for fn in hadut.iter_mr_out_files(mr_out_dir):
        with hdfs.open(fn, "rt") as f:
            for line in f:
                row = line.rstrip().split("\t")
                index = int(row.pop(0))
                output.append((index, row))
    output = [_[1] for _ in sorted(output)]
    exp_output = []
    in_fn = os.path.join(THIS_DIR, "data", "transpose_input", "matrix.txt")
    with open(in_fn) as f:
        for line in f:
            for i, item in enumerate(line.split()):
                try:
                    exp_output[i].append(item)
                except IndexError:
                    exp_output.append([item])
    return output == exp_output
Example #25
def processChunk(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        data = []
        
        for i, line in enumerate(handle):
            #strip line
            line = line.strip()
            data += [line]
            
            if i % 5000 == 0:
                #Submit data (my function)
                submitChunk(topic, data, trials=3)
                data = []
            
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" %(i, myfile["name"]))
                
        #for every line
        #submit the rest of the data
        submitChunk(topic, data, trials=3)
        data = []
Example #26
def xml_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    #for fi in file_lists:
    for i in xrange(0, 1):
        fi = '/datasets/corpus/enwiki-11g/wiki_912'
        with hdfs.open(fi, "r") as f:
            lines = f.read().strip().split('\n')
            docs, doc = [], None
            for line in lines:
                if line.startswith('<doc'):
                    doc = line
                elif line.startswith('</doc>'):
                    docs.append(doc + line)
                else:
                    #line = line.replace('&', '').replace('"', "'")
                    doc += line.replace('"', "'")

            for doc in docs:
                dom = bs(doc).find('doc')
                doc = dom.attrs
                doc['content'] = dom.text
                doc['md5'] = hashlib.md5(str(doc)).hexdigest()
                yield doc
Example #27
        sys.exit(1)
    else:
        return ratings

def computeRmse(model, data, n):
    """
    Compute RMSE (Root Mean Squared Error).
    """
    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))
    predictionsAndRatings = predictions.map(lambda x: ((x[0], x[1]), x[2])) \
      .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \
      .values()
    return sqrt(predictionsAndRatings.map(lambda x: (x[0] - x[1]) ** 2).reduce(add) / float(n))
for n in userArray:
    with open(uFile, "w") as fi:
        with hdfs.open('/user/cloudera/medium/ratings.dat') as f:
            for line in f:
                data = line
                userid = line.split("::")
                if int(userid[0]) == int(n):
                    fi.write(data)
                    print n

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print "Usage: /path/to/spark/bin/spark-submit --driver-memory 2g " + \
            "MovieLensALS.py movieLensDataDir"
        sys.exit(1)

    # set up environment
    conf = SparkConf() \
Example #28
import os
import pydoop.hdfs as hd
import datetime
import forecastio as fo
import pandas as pd

with hd.open("hdfs://quickstart.cloudera:8020/user/cloudera/python/cities_location.csv") as f:
    df =  pd.read_csv(f)
    
    
    df=pd.read_csv('/user/cloudera/python/cities_location.csv') 
    df.head()
    api_key = "459009d8daa503cef1e11b190c961ce5"
    #selecting the specific date
    date = datetime.datetime(2015,11,1,2,0,0)
    for i in range(len(df)):
        col = ["cities", "time",  "temperatureMin", "temperatureMax"]
        lat=df["latitude"].iloc[i]
        lng=df["longitude"].iloc[i]
        #accessing the forecast.io API
        forecast = fo.load_forecast(api_key, lat, lng, time=date)
        day = forecast.daily()
        #retrieving information for the current day
        Day=day.data[0]
        data={"cities": df["cities"].iloc[i], "time" : Day.time, "temperatureMin" : Day.temperatureMin, "temperatureMax" : Day.temperatureMax}
        if i==0 :
            weather = pd.DataFrame(data, index=[0], columns= col)
        else:
            weather1 = pd.DataFrame(data, index=[0], columns= col)
            weather = pd.concat([weather, weather1], ignore_index=True)
        
Example #29
        result = math.pow(math.e, -0.5 * (x_mu * inverse * x_mu.T))
        return norm_const * result
    else:
        raise NameError("The dimensions of the input don't match")


#import pydoop.hdfs as hdfs
k = 5

#using Hadoop system file
#with hdfs.open('/Users/ming/centroids.txt') as fp:

weights = []
means = []
sigmas = []
with hdfs.open('/Users/user06/parameters.txt') as file:
    for line in file:
        params = line.strip().split("\t")
        weights.append(float(params[0]))
        means.append(np.array(params[1].split(), float))
        sigmas.append(np.array(params[2].split(), float))

for line in sys.stdin:
    line = line.strip()
    point = np.array(line.split(), float)
    p = weights[0] * norm_pdf_multivariate(point, means[0], sigmas[0].reshape(
        (2, 2)))
    nearest = 0
    for i in range(1, k):
        q = weights[i] * norm_pdf_multivariate(point, means[i],
                                               sigmas[i].reshape((2, 2)))
Example #30
import pydoop.hdfs as hdfs


b = hdfs.path.isdir("/data")

want_file = 'traffic.csv'

if b == True:
    print("---get test ---")
    lines = []
    with hdfs.open("hdfs://127.0.0.1:9000/data/"+want_file) as f:
        for line in f:
            # print(line, type(line))
            l = line.decode("utf-8")
            if l is not None and l != "":
                lines.append(l)
    print(lines)
    print("---end get----")

    with open("i8predict_flow/"+want_file, "wb") as myfile:
        myfile.write(str(lines))
Example #31
##Importing Required Packages
import numpy as np
import pydoop.hdfs as hd
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, auc, roc_curve, precision_recall_curve, average_precision_score

##Loading Credit Card Dataset
with hd.open("/user/hduser/creditcard.csv") as f:
    CreditCardData = pd.read_csv(f, header=0)

##Reducing the number of records of the Original Dataset in case we wish to work on a smaller subset of the Dataset
ReducedData = CreditCardData.iloc[:, :]

##Shape of Credit Card Dataset, i.e. number of rows & columns present in Dataset
print("\nShape of Credit Card Dataset (rows, columns): " +
      str(ReducedData.shape))

##Removing Duplicate Records (if any)
FinalData = ReducedData.drop_duplicates()
print(
    "\nShape of Credit Card Dataset after removing duplicate records (rows, columns): "
    + str(FinalData.shape))

##Checking for missing values
Example #32
import pydoop.hdfs as hdfs
import boto3
import botocore

s3 = boto3.resource('s3')

BUCKET = "bd-mindbenders12345"

file = hdfs.open("hdfs://localhost:9000/test.txt")

s3.Bucket(BUCKET).put_object(Key="test.txt", Body=file)         
Example #33
File: hdfs.py Project: clrke/hdfs-test
import pydoop.hdfs as hdfs
import config.hdfs

with hdfs.open(config.hdfs['ur']) as f:
    for line in f:
        print(line)

Example #34
i = 0
import math
from collections import Counter

import numpy as np
import pydoop.hdfs as hdfs
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss, accuracy_score


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


vocab = Counter()
labels = Counter()

with hdfs.open(
        '/user/ds222/assignment-1/DBPedia.verysmall/verysmall_train.txt') as f:
    for line in f:
        first, next = line.split(' ', 1)
        for label in first.split(','):
            labels[label] += 1
        words = next.strip().lower().split()
        for word in words:
            if (len(word) >= 4):
                if (word[0] != '<'):
                    vocab[word] += 1
        i = i + 1
#print(i)
#print(counter)


#Convert words to indexes
Example #35
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pickle
import io
from collections import Counter

import pydoop.hdfs as hdfs
from pterasort import Partitioner

RECORD_LENGTH = 91
KEY_LENGTH = 10

fname = Partitioner.initialize_break_points(5, 1000,
                                            '/user/root/genrecords_output')
with io.open('__break_point_cache_file', 'rb') as f:
    data = f.read()
sel = pickle.loads(data)

block_size = 20000 * RECORD_LENGTH
path = '/user/root/genrecords_output/part-m-00000'
with hdfs.open(path, 'rb') as f:
    data = f.read(block_size)
keys = (data[k:k + 10] for k in range(0, block_size, RECORD_LENGTH))
partitions = Counter(map(sel.select_partition, keys))
print(partitions)
Example #36
    HDFSfiles.append(hdFiles[41:])

fileNames = []

indexName = 'music'
typeName = 'songs'
#IdField = 'songID'

bulkData = []

i = 1
for name in HDFSfiles:
    dataDict = {}
    fopen = hdfs.open("/gaana/gaanaLyrics/" + name)
    header = fopen.read()
    header = re.sub('[^a-zA-Z]', ' ', header)
    header = header.replace("Advertisements", " ")
    header = ''.join([item.lower() for item in header])
    songAndMovie = []
    dlim = "lyrics"
    # nameNew = name.replace("-", " ")
    songAndMovie.append(name)
    dataDict[name] = header
    metaDict = {}
    dataDict = {}
    for elements in songAndMovie:
        songsName = []
        # if "lyrics" in elements:
        songName = elements.split('-')
Example #37
import json
import os

import pandas as pd
import pydoop.hdfs as hdfs
from pandasql import sqldf


login=''
senha=''

os.system('echo '+senha+' | kinit '+login)
dir = '/ranger/audit/hiveServer2/'
list = hdfs.ls(dir)

df = pd.DataFrame()
for pasta in list:
    for i in range(len(hdfs.ls(pasta))):
        try:
            with hdfs.open(hdfs.ls(pasta)[i], 'r') as f:
                jsn = [json.loads(line) for line in f]
                df = df.append([pd.DataFrame(jsn)], sort=True)
            
        except:
                print("Leitura do arquivo json em " + hdfs.ls(pasta)[i] + " não foi bem sucedida")

df1 = df[['evtTime','reqUser','resource','access','reqData']]
df1['reqUser'] = df1['reqUser'].str.upper()
df1 = df1[df1['access']=='SELECT']
# exclusion of service users
exclusao = pd.DataFrame(['HIVE','RANGERLOOKUP'])
df1 = df1[~df1.reqUser.isin(exclusao.iloc[:,0])]
df1['evtTime'] = pd.to_datetime(df1['evtTime'].str[0:16], format='%Y-%m-%d %H:%M')

spark_df = spark.createDataFrame(df1)
            {"$group": {"_id": {'source':"$source",'tags':"$tags",'year': "$year_posted",'month':"$month_posted",'day':"$day_posted"}, "count": {"$sum": 1},"countNegative":{"$sum":"$Negative"},"countNeutral":{"$sum":"$Neutral"},"countPositive":{"$sum":"$Positive"}}},
            {"$sort": SON([("count", -1), ("_id", -1)])}
        ])
            #use reportdate for the filename
        filename = startdate.strftime('%Y-%m-%d')
        print(filename)
        for result_obj in daily_totals['result']:
            data_dict = result_obj['_id']
            date = (str(data_dict['year']) + "-" +  str(data_dict['month']) + "-" + str(data_dict['day']))
            tag = data_dict['tags']
            source = data_dict['source']
            count = result_obj['count']
            countPositive = result_obj['countPositive']
            countNegative = result_obj['countNegative']
            countNeutral = result_obj['countNeutral']
            data = (str(date) + "|" + str(tag) + "|" + str(source) + "|" + str(count) + "|" + str(countPositive) + "|" + str(countNegative) + "|" + str(countNeutral)+'\n')
            print(data)
            hdfs_path = '/socialmedia/sentiment/' + filename
            hdfs_path = settings.HDFS_HOST_NAME + ':' + settings.HDFS_PORT + settings.HDFS_ROOT_FOLDER + \
                        '/socialmedia/sentiment' + filename + '.in'
            logger.info('HDFS file path: %s' % hdfs_path)
            logger.debug('Data: %s' % data)

            try:
                hdfs_file = hdfs.open(hdfs_path, mode='a')
                hdfs_file.write(data.encode('utf-8'))
            except IOError as e:
                logger.debug("IOError: " + e.message)
                logger.debug("Caught Exception. Will create a new file on hdfs.")
                hdfs_file = hdfs.open(hdfs_path, mode='w')
                hdfs_file.write(data.encode('utf-8'))
Example #39
        colourImg = PIL.Image.open(imgFile)
        #imshow(np.asarray(colourImg))
        nparray = np.asarray(colourImg)
        image = cv2.cvtColor(nparray, cv2.COLOR_RGB2BGR)

    return image


brand = "logitech"
mode = "image"

os.chdir("/tmp/")
myMachine = kpath.abspath('/tmp/data/input/racetrack/image/')
print(myMachine)

with hpath.open(myMachine + "driving_log.csv") as csvFile:
    df = pd.read_csv(csvFile,
                     names=[
                         "image_center", "image_left", "image_right",
                         "steering", "speed"
                     ])

#next(df.iterrows())[1]
df.iterrows()

# read and store multiple cameras and steering angles from driving_log.csv
# all three camera images will be used to train the model
images = []
steering_measurements = []

for index, row in df.iterrows():
Example #40
	

import pydoop.hdfs as hdfs 
import logging
logging.basicConfig(level = logging.DEBUG)

# the test checks that each line is present in the output files
# for each line, its match is looked up in the directory for the corresponding date

with open('file1.csv','r') as in_f:
	for it,in_line in enumerate(in_f):
		date=in_line.strip().split(',')[0]
		for part in [1,2,3]:
			with hdfs.open('/data/archive/'+date+'/part-0000'+str(part)) as out_f:
				matching=[]
				for out_line in out_f:
					a=set(out_line.strip().split(','))
					if a==set(in_line.strip().split(',')):
						matching.append(True) 
						break
					else:
						matching.append(False)
			if any(matching):
				matching=True
				break
		if not matching:
			logging.debug("Error on line %s ,%s",it,in_line)
Example #41
 def __missing__(self, path):
     f = hdfs.open(path, "wb")
     self[path] = f
     return f
Example #42
import pydoop.hdfs as hdfs

for part in [1, 2, 3]:
    with hdfs.open('/data/archive/2014-04-29/part-0000' + str(part)) as out_f:
        with open('file1.csv', 'r') as in_f:
            for out_line in out_f:
                for in_line in in_f:
                    a = set(out_line.strip().split(','))
                    if a == set(in_line.strip().split(',')):
                        print True
                    else:
                        print False
                        print a
                        print set(in_line.strip().split(','))
Example #43
 def __init__(self, context):
     super(Reader, self).__init__()
     self.logger = logging.getLogger("Reader")  #formatted logger obtained
     self.file = hdfs.open('HD-2004-2014-d.csv')
     self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
Example #44
#!/usr/bin/python3
"""mapper.py"""
import sys
import csv
import pydoop.hdfs as hdfs
import json
import costanct as C

azienda_map = {}
with hdfs.open('input/historical_stocks.csv', 'rt') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count > 0:
            ticker, _, name, _, _ = row
            azienda_map[ticker] = {'name': name}
        line_count += 1


def toJson(azione):
    dic = {
        "ticker": azione[0],
        "name": azienda_map[azione[0]],
        "close": azione[2],
        "date": azione[7],
    }
    return json.dumps(dic)


for line in sys.stdin:
    azione = line.split(',')
Example #45
        context.setStatus("initializing")

    def map(self, context):
        k = context.getInputKey()
        tmp_data = csv.reader(f)
        words = context.getInputValue().split()
        for w in words:
            context.emit(w, "1")
            context.incrementCounter(self.inputWords, len(words))

    def close(self):
        self.logger.info("all done")


print "Prediction on HD 30 year data:"
f = hdfs.open('/HD-1984-2014-d.csv')
tmp_data = csv.reader(f)

my_data = list()
for item in tmp_data:
    tmp_item = list()
    for i in item:
        tmp_item.append(i)
    my_data.append(tmp_item)
data = my_data[1:]
X = list()
training_indices = list()
for i in xrange(int(len(data) * 0.9)):
    training_indices.append(i)

test_indices = list()
Example #46
import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession

import pandas as pd
import pydoop.hdfs as hd

# create a spark session
#sparkSession = SparkSession.builder.master("local").appName("draw heat map").getOrCreate()
#df_load = sparkSession.read.csv('hdfs://dumbo/user/gx271/pubgETL/mir_death.csv')

with hd.open("hdfs://dumbo/user/gx271/pubgETL/mir_death.csv/part-00006") as f:
    df = pd.read_csv(f)

# convert DataFrame to np array

dat = df.as_matrix()

# dat = np.loadtxt('mydata.csv')

x, y = dat[:,0], dat[:,1]

heatmap, xedges, yedges = np.histogram2d(x, y, bins=50)  
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]  
plt.clf()  
plt.imshow(heatmap, extent=extent)  
# plt.show()
Example #47
def main(input_path, output_attribute_index, scikit_output_path,
         spark_output_path):

    # Instantiate the Passive Aggressive Regressor model
    regressor = PassiveAggressiveRegressor()
    for file_path in hdfs.ls(input_path):
        # Load the file contents and build a string matrix from them
        content = hdfs.load(file_path)
        temp = content.split("\n")
        temp = list(map(lambda x: x.split(","), temp))
        temp = list(filter(lambda x: len(x) > 1, temp))
        raw_matrix = np.array(temp)
        # Load the numpy matrix and parse it into a matrix of real values
        # which is then used to train the model
        # raw_matrix = np.genfromtxt(file_path, delimiter=',', dtype='string')
        input_matrix = raw_matrix[1:, 3:-5].astype('float64')
        output_vector = raw_matrix[1:, -5 +
                                   output_attribute_index].astype('float64')
        # The model is trained through iterative improvement
        regressor.partial_fit(input_matrix, output_vector)
        # Print the path of the processed file to the console
        print(file_path)

    # Save the trained model to the output path
    # that was passed in as an argument
    with hdfs.open(scikit_output_path, 'w') as opened_file:
        pickle.dump(regressor, opened_file)

    # Initialize the application's configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Trainer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")
    # Initialize the session
    # (required in order to write the model)
    session = SparkSession(context)

    # Load the RDD data from the input path
    input_data = context.textFile(input_path)
    # Split each row into fields
    input_data = input_data.map(lambda x: x.split(","))
    # Skip the header rows
    input_data = input_data.filter(lambda x: x[0] != "Timestamp")
    # Skip the first three columns (Timestamp, Latitude and Longitude)
    # and select the appropriate output column
    # (depending on the output_attribute_index variable)
    input_data = input_data.map(lambda x: list(map(lambda y: float(y), x[
        3:-5])) + [float(x[-5 + output_attribute_index])])

    # Build the corresponding DataFrame object
    # (VectorAssembler is used to build the columns
    # that allow the linear regression fit method to be used)
    input_cols = []
    for i in range(15):
        input_cols.append("_" + str(i + 1))
    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data_frame = assembler.transform(input_data.toDF())

    # Instantiate a LinearRegression object, train it,
    # and then save it to the given path
    regression = LinearRegression(featuresCol='features', labelCol='_16')
    model = regression.fit(data_frame)
    model.write().overwrite().save(spark_output_path)
Example #48
bucket = 'enhance-it'
import boto3
import pydoop.hdfs as hdfs 

s3 = boto3.resource('s3')


file = hdfs.open('hdfs://master:9000/data_for_db/currency=USD/part-00000-f65c545f-baa0-4bf0-8aa9-0b14957848c4.c000.json')
s3.Bucket(bucket).put_object(Key='lituation/data_from_hdfs.csv', Body=file)
Example #49
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pickle
import io
from collections import Counter

import pydoop.hdfs as hdfs
from pterasort import Partitioner

RECORD_LENGTH = 91
KEY_LENGTH = 10

fname = Partitioner.initialize_break_points(
    5, 1000, '/user/root/genrecords_output'
)
with io.open('__break_point_cache_file', 'rb') as f:
    data = f.read()
sel = pickle.loads(data)

block_size = 20000 * RECORD_LENGTH
path = '/user/root/genrecords_output/part-m-00000'
with hdfs.open(path, 'rb') as f:
    data = f.read(block_size)
keys = (data[k:k + 10] for k in range(0, block_size, RECORD_LENGTH))
partitions = Counter(map(sel.select_partition, keys))
print(partitions)
Example #50
#! /usr/bin/env python

import sys
from pydoop import hdfs
from DataPoint import DataPoint

#print "Start"

# read sys.argv[1] and sys.argv[2]
# put em in lists

if len(sys.argv)<3:
	print "Error: Insufficient Arguments"
	sys.exit(-1)

oldCentroidsFile = hdfs.open(sys.argv[1])
newCentroidsFile = hdfs.open(sys.argv[2])

oldCentroids = []
newCentroids = []

for line in oldCentroidsFile:
	if line.find("\t") != -1:
		(key,value) = line.strip().split("\t")
		oldCentroid = DataPoint(value)
	else:
		oldCentroid = DataPoint(line.strip()) 
	oldCentroids.append(oldCentroid)

for line in newCentroidsFile:
	(key,value) = line.strip().split("\t")
Example #51
#! /usr/bin/env python

import sys
import DataPoint
from pydoop import hdfs

# Check for sufficient arguments
if len(sys.argv) < 2:
    print("ERROR: Insufficient arguments")
    sys.exit(-1)

# List to hold canopy centers
canopyCenters = []

# Read canopy center file
file = hdfs.open(sys.argv[1])
for line in file:
    if line.find("Warning:") == 0:
        continue
    (key, value) = line.split("\t")
    dp = DataPoint.DataPoint(value.strip())
    canopyCenters.append(dp)

# Assign points to canopies
for line in sys.stdin:
    dp = DataPoint.DataPoint(line.strip())
    insert = True
    for canopyCenter in canopyCenters:
        if dp.checkT1(canopyCenter):
            print(canopyCenter.toString() + "\t" + dp.toString())
Example #52
### Part that reads the data from HDFS ###
import pandas as pd
import numpy as np
import pydoop.hdfs as hd
from lxml import objectify

with hd.open("/user/datostiempo/20160525_1341.xml") as archivo:
    parsed = objectify.parse(archivo)

root = parsed.getroot()
prob_precipitacion = []
estado_cielo = []
viento = []
temperatura = []
tempmax = []
tempmin = []
iteraccion = 0
errores = []
print "root : ", root
for row in root.prediccion.dia:
    for row_precipitacion in row.prob_precipitacion:
        aux_precipitacion = []
        if (row_precipitacion != ''):
            aux_precipitacion.append(row_precipitacion)
        else:
            errores.append(1)
    prob_precipitacion.append(
        str(sum(aux_precipitacion) / float(len(aux_precipitacion))))
    for row_cielo in row.estado_cielo:
        aux_cielo = []
        if (row_cielo != ''):
Example #53
import os
import pandas as pd
import pydoop.hdfs as hdfs
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

conf = SparkConf().setAppName("drunk detection").setMaster("yarn")
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)

csv_file_path = "hdfs:///drunkdetection/train_data48.csv"
predictor_path = "hdfs:///drunkdetection/shape_predictor_68_face_landmarks.dat"
image_path = "hdfs:///drunkdetection/drunk3.jpg"
model_path = "hdfs:///drunkdetection/rf48.pickle"

with hdfs.open("/drunkdetection/train_data48.csv") as csv:
    df = pd.read_csv(csv, index_col=0)
print(df.columns)
df_y = df['label'] == 3
df_X = df[['x' + str(i)
           for i in range(1, 49)] + ['y' + str(j) for j in range(1, 49)]]
X_train, X_test, y_train, y_test = train_test_split(df_X,
                                                    df_y,
                                                    test_size=0.2,
                                                    random_state=15)

# Feature Scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
Example #54
import os
import pydoop.hdfs as hd
import datetime
import forecastio as fo
import pandas as pd

with hd.open(
        "hdfs://quickstart.cloudera:8020/user/cloudera/python/cities_location.csv"
) as f:
    df = pd.read_csv(f)

    df = pd.read_csv('/user/cloudera/python/cities_location.csv')
    df.head()
    api_key = "459009d8daa503cef1e11b190c961ce5"
    #selecting the specific date
    date = datetime.datetime(2015, 11, 1, 2, 0, 0)
    for i in range(len(df)):
        col = ["cities", "time", "temperatureMin", "temperatureMax"]
        lat = df["latitude"].iloc[i]
        lng = df["longitude"].iloc[i]
        #accessing the forecast.io API
        forecast = fo.load_forecast(api_key, lat, lng, time=date)
        day = forecast.daily()
        #retrieving information for the current day
        Day = day.data[0]
        data = {
            "cities": df["cities"].iloc[i],
            "time": Day.time,
            "temperatureMin": Day.temperatureMin,
            "temperatureMax": Day.temperatureMax
        }