def load_from_hdfs(data_package, file_name):
#def load_from_hdfs(data_package, file_name='CThead_uchar.raw'):
	# stream_hdfs_file_name looks like "http://host:50070/path/to/file";
	# rfind('0/') splits on the trailing zero of the port to separate address and path
	hdfs_str  = data_package.stream_hdfs_file_name
	hdfs_addr = hdfs_str[:hdfs_str.rfind('0/')+1]
	hdfs_path = hdfs_str[hdfs_str.rfind('0/')+2:]

	if log_type in ['time','all']: st = time.time()
	client = InsecureClient(hdfs_addr, user=getpass.getuser())


	with client.read('%s/%s'%(hdfs_path, file_name)) as reader:
		data = numpy.array(Image.open(StringIO(reader.read())))

	print_purple("LOADED")

	return data
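
A quick hedged sketch of how that rfind-based split behaves; the URL below is hypothetical, and the trick only works because WebHDFS ports conventionally end in '0' (e.g. 50070):

# minimal sketch with a made-up URL
hdfs_str = 'http://namenode:50070/ct/CThead_uchar.raw'
idx = hdfs_str.rfind('0/')
print(hdfs_str[:idx+1])   # http://namenode:50070
print(hdfs_str[idx+2:])   # ct/CThead_uchar.raw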
Example #2
def read_hdfs(filename, root_dir='data'):
    data_dir = os.path.join(root_dir, filename)
    client_hdfs = InsecureClient('http://' + os.environ['IP_HDFS'] + ':50070')

    with client_hdfs.read(data_dir, encoding='latin-1') as reader:
        df = pd.read_csv(reader, index_col=0)
    return df
Example #3
class Storage:
    def __init__(self, protocol: str = 'webHDFS', *args, **kwargs):
        self.protocol, self.client = protocol.lower(), None
        if self.protocol == 'webhdfs':
            from hdfs import InsecureClient
            self.client = InsecureClient(*args, **kwargs)
            # alias the generic verbs to their webHDFS-specific implementations
            for f in 'upload download list status delete'.split():
                setattr(self, f, getattr(self, '%s_%s' % (f, self.protocol)))

    def upload_webhdfs(self, local_path: str, remote_path: str, **kwargs):
        to_screen("upload %s -> %s" % (local_path, remote_path))
        return self.client.upload(local_path=local_path,
                                  hdfs_path=remote_path,
                                  **kwargs)

    def download_webhdfs(self, remote_path: str, local_path: str, **kwargs):
        mkdir_for(local_path)
        to_screen("download %s -> %s" % (remote_path, local_path))
        return self.client.download(local_path=local_path,
                                    hdfs_path=remote_path,
                                    overwrite=True,
                                    **kwargs)

    def list_webhdfs(self, remote_path: str, **kwargs):
        return self.client.list(hdfs_path=remote_path, **kwargs)

    def status_webhdfs(self, remote_path: str, **kwargs):
        return self.client.status(hdfs_path=remote_path, **kwargs)

    def delete_webhdfs(self, remote_path: str, **kwargs):
        return self.client.delete(hdfs_path=remote_path, **kwargs)
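
A hedged usage sketch for this facade; the namenode URL, user, and paths are hypothetical, and to_screen/mkdir_for are assumed to be helpers from the surrounding module:

# minimal sketch, assuming a namenode at http://namenode:50070
storage = Storage('webHDFS', 'http://namenode:50070', user='alice')
storage.upload('model.bin', '/models/model.bin')  # dispatches to upload_webhdfs
print(storage.list('/models'))                    # dispatches to list_webhdfs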
Example #4
    def post(self):
        # use Python's time() so every saved image gets a unique name
        image_name = int(time.time())
        image_path = "/root/ZAGA/ZoraOD/Images_bbx/{}.jpg".format(image_name)
        with open(image_path, 'wb') as image:
            # the image carried in request.data is saved locally
            image.write(request.data)
        # result is the output of object detection: either the string the robot
        # should speak, or the candidate labels for an object in the image when
        # the score falls between two thresholds. A null vector is returned if
        # no object was found.
        result = obj_detection.find_result(image_path, image_name)

        # if HDFS is reachable, save the image there too; use the socket module
        # to check whether the HDFS port is accepting connections
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        port_result = sock.connect_ex(('localhost', 50070))
        # connect_ex returns 0 if the port is open, a nonzero value otherwise
        if port_result == 0:
            client_hdfs = InsecureClient('http://localhost:50070')
            # move the image into HDFS
            client_hdfs.upload(
                '/zora-object-detection/images/{}.jpg'.format(image_name),
                image_path)
            os.remove(image_path)

        return result  # the result is sent back to the robot
Example #5
 def __init__(self, redis_ip, redis_port):
     self.redis_ip = redis_ip
     self.port = redis_port
     self.r = redis.StrictRedis(host=redis_ip, port=redis_port, db=0)
     self.client = InsecureClient('http://cdh1:50070/',
                                  'admin',
                                  root='/user/admin/open_data')
Example #6
def crop():
    # Check pictures folders
    if request.args.get('from') is None:
        return 'No "from" directory given.'

    if request.args.get('to') is None:
        return 'No "to" directory given.'

    directory_from = request.args.get('from')
    directory_to = request.args.get('to')

    dask_client = Client('192.168.1.4:8786')
    hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')

    with hdfs_client.read('/' + directory_from + 'data.csv') as reader:
        data = pd.read_csv(reader)
        data = dd.from_pandas(data, npartitions=24)

    data.map_partitions(compute_crop,
                        directory_from,
                        directory_to,
                        meta='dask.dataframe.core.Series').compute()

    create_csv(directory_to=directory_to)

    return "Crop finished"
Example #7
def save_to_hdfs(docs, project, index=0):
    client = InsecureClient(HDFS_URL, user=HDFS_USER)
    filename = get_filename(project)
    print("Going to write %s into %s_%s" % (len(docs), filename, index))
    text_to_write = "\n".join([json.dumps(doc) for doc in docs])
    with client.write(filename + "_" + str(index), encoding='utf-8') as writer:
        writer.write(text_to_write)
Example #8
 def download_directory(self, directory_url):
     '''Downloads a directory from remote HDFS to local storage, archives it
     and returns the gzipped tar of the directory'''
     logger.log_info("Downloading the directory {0} ".format(directory_url))
     # Remove the base url from the absolute directory path provided as parameter
     # For example, if the absolute path is hdfs://alpha:9000/configuration/12345/drift,
     # the below statement will return /configuration/12345/drift
     directory_name_with_path = urllib3.util.parse_url(directory_url).path
     directory_name = os.path.split(directory_name_with_path)[1]
     web_hdfs_url = Environment().get_web_hdfs_url()
     session = SwSessionManager().get_session()
     user_name = session.get_username()
     client = InsecureClient(web_hdfs_url, user_name)
     try:
         with tempfile.TemporaryDirectory() as temp:
             client.download(hdfs_path=directory_name_with_path,
                             local_path=temp,
                             n_threads=5)
             tmp_archive = temp  # base name; shutil appends the .tar.gz extension
             data = io.BytesIO()
             with open(shutil.make_archive(tmp_archive, 'gztar', temp),
                       "rb") as output_data:
                 data.write(output_data.read())
             data.seek(0)
         return send_file(data,
                          as_attachment=True,
                          attachment_filename=directory_name + ".tar.gz")
     except Exception as e:
         raise ServiceError(
             "Downloading the folder from HDFS failed with the error: {0}".
             format(str(e)))
Example #9
class HDFSService(object):
    def __init__(self):
        self.hdfs = InsecureClient('http://127.0.0.1:9870', user='******')
        self.base_path = '/users/root'

    def mkdir(self, path):
        return self.hdfs.makedirs(path)

    def list(self, path):
        try:
            return self.hdfs.list(path)
        except HdfsError as e:
            print(e)
            return []

    def get(self, path):
        pass

    def upload(self, path, local_path=None, data=None):
        path = self.base_path + path
        if data is not None:
            return self.hdfs.write(path, data=data)
        elif local_path is not None:
            return self.hdfs.upload(path, local_path)
        return False

    def download(self, path):
        path = self.base_path + path
        with self.hdfs.read(path) as reader:
            print(path)
            buf = reader.read()
        print(len(buf))
        return buf
Example #10
    def write_to_hdfs(rows: List[Tuple[str, str]]):
        conn: Connection = Connection.get_connection_from_secrets('local_hdfs')
        uri = conn.get_uri()
        pat = re.compile(r"http://(\w+(:\w+)?)?@")  # raw string avoids an invalid-escape warning
        print(conn.get_uri())

        uri = pat.sub("http://", uri)
        print(uri)
        print(conn.login)
        client = InsecureClient(uri, user=conn.login)
        sch = avro.schema.make_avsc_object({
            'type':'record',
            'name':'Video',
            'fields': [
                {'type': {'type': 'string', 'avro.java.string': 'String'}, 'name': 'title'},
                {'type': ["null", {'type': 'string', 'avro.java.string': 'String'}], 'name': 'description'},
            ]
        })
        local_file_name = 'videos.avro'
        writer = DataFileWriter(open(local_file_name, "wb"), DatumWriter(), sch)
        for row in rows:
            print(row)
            writer.append({"title":row[0], "description":row[1]})
        writer.close()
        client.upload('/tmp/videos.avro', local_file_name)
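
To sanity-check the file that was just uploaded, a hedged read-back sketch of the local copy using avro's DataFileReader:

# minimal sketch: iterate the records written to videos.avro above
from avro.datafile import DataFileReader
from avro.io import DatumReader

reader = DataFileReader(open('videos.avro', 'rb'), DatumReader())
for record in reader:
    print(record['title'], record['description'])
reader.close()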
Example #11
def normalize():

	# Check pictures folders
	if request.args.get('from') is None:
		return 'No "from" directory given.'

	if request.args.get('to') is None:
		return 'No "to" directory given.'

	dask_client = Client('192.168.1.4:8786')
	hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')	

	from_directory = request.args.get('from')
	to_directory = request.args.get('to')

	with hdfs_client.read('/' + from_directory + 'data.csv') as reader:
		data = pd.read_csv(reader)
		data = dd.from_pandas(data, npartitions=24)

	data.map_partitions(compute_norm,
						from_directory,
						to_directory,
						meta='dask.dataframe.core.Series').compute()
	
	create_csv(directory_to=to_directory)

	return 'Normalization done.'
Example #12
    def process_item(self, item, spider):
        folder = "media/"

        # create the media folder if it does not already exist
        os.makedirs(os.path.join(BASE_PATH, folder), exist_ok=True)

        # if not Article.objects.filter(url=item['url']).exists() and item['content']:
        if item['content']:
            content = remove_tags(item['content'], which_ones=('script', 'noscript', 'iframe', 'pre', 'link', 'frame',
                                                               'meta', 'form'))
            content = dedent(content)
            # content = content.replace('</pre>', '')
            content = content.replace('</article>', '')
            # content = content.replace('<', '&lt;')
            while '\n\n' in content:
                content = content.replace('\n\n', '\n')

            # filename = item['url'].replace('/', '_') + '.md'
            filename = str(uuid.uuid4()) + '.md'
            full_filename = os.path.join(BASE_PATH, folder, filename)

            with open(full_filename, "w", encoding="utf-8") as text_file:
                text_file.write(content)
            spark_data = {'content': item.get('content', ''), 'domain': item.get('domain', ''),
                          'url': item.get('url', ''), 'title': item.get('title', ''), }
            item['content'] = full_filename
            item.save()

            client = InsecureClient(HDFS_ADRESS, user='******')
            with client.write(str(uuid.uuid4()) + '.json', encoding='utf-8') as writer:
                writer.write(json.dumps(spark_data))

        return item
Example #13
def robot_out_of_stock(date):
    conn = base_hook('robot_out_of_stock')

    url = conn['host']
    token = Variable.get('robot_token')

    headers = {
        'content-type': 'application/json',
        'Authorization': 'JWT ' + token
    }
    data = {'date': date}

    r = requests.get(url, data=json.dumps(data), headers=headers, timeout=10)
    r = r.json()

    if r['message'] == "No out_of_stock items for this date":
        print(f'There are no out_of_stock items for {date}')
        logging.info(f'There are no out_of_stock items for {date}')

    client = InsecureClient('http://127.0.0.1:50070/', user='******')

    with client.write(os.path.join('/bronze/out_of_stock',
                                   date + '_out_of_stock.json'),
                      encoding='utf-8') as f:
        json.dump(r, f)
        logging.info('File loaded ' +
                     os.path.join('/bronze/out_of_stock', date +
                                  '_out_of_stock.json'))
Example #14
 def __init__(self, input_folder, model_folder, img_size=240):
     self.input_folder = input_folder
     self.model_folder = model_folder
     self.hdfs_client = InsecureClient('http://192.168.1.4:9870',
                                       user='******')
     self.imgs, self.labels = self.read_images(input_folder, img_size)
     self.default = "svm"
Example #15
def load_from_hdfs(data_package, hdfs_addr, hdfs_path):
#def load_from_hdfs(data_package, file_name='CThead_uchar.raw'):
	if log_type in ['time','all']: st = time.time()
	dp = data_package
	ds = dp.data_range
	ds_seq = [ds[elem][1]-ds[elem][0] for elem in ['z', 'y', 'x'] if elem in ds]


	while True:
		try:
			client = InsecureClient(hdfs_addr, user=getpass.getuser())
			
		
			file_python_dtype = Vivaldi_dtype_to_python_dtype(dp.file_dtype)
			file_bytes = get_bytes(file_python_dtype)
		
			#print "START TO CONNECT HDFS"
			bef = time.time()
			# read only this task's z-slab: skip ds['z'][0] full xy-planes,
			# then read z*y*x voxels' worth of bytes
			with client.read(hdfs_path, offset=(ds_seq[1]*ds_seq[2]*ds['z'][0]*file_bytes),length=ds_seq[0]*ds_seq[1]*ds_seq[2]*file_bytes) as reader:
				buf = reader.read()
			aft = time.time()
		
			diff = aft - bef
		
			print_bold( "DATA LOADING ENDS from %s -- time elapsed = %.03f (sec) , reading speed = %.03f MB/sec"%(socket.gethostname(), diff, len(buf) / diff * (1024 ** -2)))
			data = numpy.frombuffer(buf, dtype=file_python_dtype).reshape(ds_seq)

			break

		except:
			print(bcolors.WARNING + "Connection Broken" + bcolors.ENDC)
	

	return data
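
A hedged worked example of the offset/length arithmetic above, with hypothetical dimensions:

# minimal sketch: a 512x512 uint8 volume (1 byte/voxel) with ds['z'] = (10, 20)
# gives ds_seq = [10, 512, 512]; skip 10 full xy-planes, then read 10 more
offset = 512 * 512 * 10 * 1   # 2621440 bytes skipped
length = 10 * 512 * 512 * 1   # 2621440 bytes read
print(offset, length)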
Example #16
def delete_data(request):
    response_content = {}
    response = HttpResponse()
    try:
        proj_id = request.GET.get('proj_id')
        data_id = request.GET.get('data_id')
        user_id = request.GET.get('user_id')
        fetched = Datasets.objects.filter(proj_id=proj_id,
                                          data_id=data_id,
                                          user_id=user_id).values('hdfs_path')
        if len(fetched) == 0:
            raise Exception('Oops! No access!')
        if list(fetched)[0]['hdfs_path']:
            client = InsecureClient("http://hdfs.neurolearn.com:50070",
                                    user="******")
            client.delete(list(fetched)[0]['hdfs_path'], recursive=True)
        Datasets.objects.filter(proj_id=proj_id,
                                data_id=data_id,
                                user_id=user_id).delete()

        response_content['msg'] = 'success'
        response_content['error_num'] = 0
    except Exception as e:
        response_content['msg'] = str(e)
        response_content['error_num'] = 1

    response.write(json.dumps(response_content))

    return response
Example #17
    def upload_directory(self, directory_path, archive_directory_data):
        '''Untars the archive_directory_data provided as input,
        and uploads all the contents of the tar to the directory path
        specified on HDFS.
        '''
        logger.log_info("Uploading the directory to HDFS")
        web_hdfs_url = Environment().get_web_hdfs_url()
        hdfs_file_base_url = Environment().get_hdfs_file_base_url()
        session = SwSessionManager().get_session()
        user_name = session.get_username()
        client = InsecureClient(web_hdfs_url, user_name)
        directory_name_with_path = "/" + directory_path
        directory_name = os.path.split(directory_path)[1]
        try:
            with tempfile.TemporaryDirectory() as temp:
                local_dir_path = temp + "/" + directory_name + ".tar.gz"
                with open(local_dir_path, "wb") as dir_archive:
                    dir_archive.write(archive_directory_data)
                with tarfile.open(local_dir_path, "r:gz") as tar:
                    tar.extractall(temp)
                os.remove(local_dir_path)
                response = client.upload(hdfs_path=directory_name_with_path,
                                         local_path=temp)
                logger.log_info(
                    "Successfully uploaded the directory {0} to HDFS".format(
                        response))
            return hdfs_file_base_url + directory_name_with_path

        except Exception as e:
            raise ServiceError(
                "Uploading the directory to HDFS failed with the error: {0}".
                format(str(e)))
Example #18
    def __init__(self,
                 path: str,
                 mode: str = 'r',
                 encoding: str = 'utf-8',
                 host: str = HDFS_HOST,
                 port: int = HDFS_PORT,
                 user: str = HDFS_USER):
        self.client = InsecureClient(url=f'http://{host}:{port}', user=user)

        self.path = path
        self.name = path.split('\\')[-1]
        self.mode = mode
        self.encoding = encoding

        if self.mode[0] == 'r':
            self.__cache_content()
            self.fptr = 0
        elif self.mode[0] == 'w':
            self.content = self.__binary_helper('')
            self.fptr = 0
        elif self.mode[0] == 'a':
            self.__cache_content()
            self.fptr = len(self.content)
        else:
            raise UnsupportedMode(f'unsupported mode {self.mode}')
Example #19
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert isinstance(data_path, str)
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'

        with open(self.data_path) as data_file:
            data = json.load(data_file)
            print("Data: ", data)
            self.hdfs_client = InsecureClient(
                url='http://' + data['namenode_url'] + ':' + str(data['port']),
                user=data['user'],
                root=data['root_path'])
            print("hdfs client: ", self.hdfs_client)
            self.img_dir = data['img_dir']
            print("img dir: ", self.img_dir)

        if not self.img_dir.endswith('/'):
            self.img_dir += '/'

        self.file_name = 1
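
The expected keys of connect_info.json can be read off the constructor; a hedged sketch that writes a matching config, with every value a placeholder:

import json

# hypothetical values; only the key names come from the constructor above
config = {
    'namenode_url': 'namenode.example.com',
    'port': 50070,
    'user': 'hdfs',
    'root_path': '/user/hdfs',
    'img_dir': 'images/',
}
with open('./config/connect_info.json', 'w') as f:
    json.dump(config, f, indent=2)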
Example #20
 def __init__(self, service_id, user_override=None):
     self.service_id = service_id
     self.webhdfs_url = HADOOPS[service_id]['webhdfs_url']
     self.webhdfs_user = HADOOPS[service_id]['webhdfs_user']
     if user_override:
         self.webhdfs_user = user_override
     self.id_prefix = HADOOPS[service_id]['id_prefix']
     self.client = InsecureClient(self.webhdfs_url, self.webhdfs_user)
Example #21
    def __init__(self, namenode_url, username, submission_id, camera_id):
        """Initialize an internal client

        This constructor initializes an HDFS client.

        """

        self._internal_client = InsecureClient(
            namenode_url,
            user='******',
            root='/'.join(['/users', username, str(submission_id), str(camera_id)]))
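
Because the client is built with root pointing at the per-camera directory, relative paths resolve beneath it. A hedged sketch with hypothetical values:

# minimal sketch: with this root, 'frame_0001.jpg' resolves to
# /users/alice/42/7/frame_0001.jpg
client = InsecureClient('http://namenode:50070', user='alice',
                        root='/users/alice/42/7')
with client.write('frame_0001.jpg', overwrite=True) as writer:
    writer.write(b'...')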
Example #22
    def __init__(self, input_folder, image_path, output_folder):
        logging.info('ImageCropper.init')

        self.input_folder = input_folder
        self.image_path = image_path
        self.output_folder = output_folder
        self.hdfs_client = InsecureClient('http://192.168.1.4:9870',
                                          user='******')
Example #23
 def connect(self):
     self.conn = InsecureClient(f"http://{self.host}:{self.port}",
                                user=self.user)
     if os.environ.get("KAFKA_BOOTSTRAP", None):
         self.producer = KafkaProducer(bootstrap_servers=os.environ.get(
             "KAFKA_BOOTSTRAP", "localhost:1234"))
     else:
         self.producer = None
Example #25
 def upload_file(self):
     ip_address = self.ip_input.toPlainText()
     port_number = self.port_input.toPlainText()
     user_name = self.user_input.toPlainText()
     upload_file = self.dir_input.toPlainText()
     host_address = 'http://' + ip_address + ':' + port_number
     hadoop = InsecureClient(host_address, user_name)
     hadoop.upload('', upload_file)
Example #26
    def __init__(self, location=None, base_url=None):
        self.hdfs_hosts = settings.HDFS_STORAGE['hosts']
        self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root'])
        self.media_root = settings.MEDIA_ROOT
        self.media_url = self.fix_slashes(settings.MEDIA_URL)

        self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root)
        self.client = InsecureClient(self.hdfs_hosts)
Example #27
def save_dataframe_as_hdfs(filename, df, root_dir='data', enc='utf8'):
    data_dir = os.path.join(root_dir, filename)
    if not os.path.exists(root_dir):
        os.makedirs(root_dir)
    client_hdfs = InsecureClient('http://' + os.environ['IP_HDFS'] + ':50070')

    with client_hdfs.write(data_dir, encoding=enc) as writer:
        df.to_csv(writer)
    return True
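
A hedged round-trip sketch pairing this writer with the read_hdfs helper from Example #2 (assumes IP_HDFS is set and the namenode is reachable):

# minimal sketch: write a DataFrame to HDFS, then read it back
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
save_dataframe_as_hdfs('demo.csv', df)
df_back = read_hdfs('demo.csv')
print(df_back.equals(df))  # True if dtypes survive the CSV round trip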
Example #28
    def __init__(self, dir_algo, algo, path_img):
        logging.info('prediction_ML.init')
        self.directory_algo = dir_algo
        self.path_img = path_img
        self.algo = algo

        self.hdfs_client = InsecureClient('http://192.168.1.4:9870',
                                          user='******')
        self.image = self.read_image(self.path_img, 240)
Example #29
    def __init__(self, url, user):
        u = urlsplit(url)
        if u.scheme not in ('http', 'https'):
            raise ValueError("Invalid name node address")

        self.url = urlunparse((u.scheme, u.netloc, '', '', '', ''))
        self.client = InsecureClient(self.url, user=user)
        self.localdir = u.path
        self.prefix = 'HDFS'
Example #30
 def upload_file(self):
     ip_address = self.ip_input.toPlainText()
     port_number = self.port_input.toPlainText()
     user_name = self.user_input.toPlainText()
     file_name = self.File_directory_display.toPlainText()
     dir_name = self.dir_input.toPlainText()
     host_address = 'http://' + ip_address + ':' + port_number
     hadoop = InsecureClient(host_address, user_name)
     hadoop.upload(dir_name, file_name)
Example #31
    def __init__(self, url, user, base_path=""):
        self._logger = logging.getLogger(self.__class__.__name__)
        self._url = url
        self._user_ = user
        self._base_path = base_path
        self._client = InsecureClient(url, user)

        if not self._exist(base_path):
            self._mkdir(base_path)
Example #32
 def mkdir_hdfs(self):
     ip_address = self.ip_input.toPlainText()
     port_number = self.port_input.toPlainText()
     user_name = self.user_input.toPlainText()
     dir_name = self.dir_input.toPlainText()
     target_name = dir_name + '/' + self.mkdir_input.toPlainText()
     host_address = 'http://' + ip_address + ':' + port_number
     hadoop = InsecureClient(host_address, user_name)
     hadoop.makedirs(target_name)
Example #33
def put_in_hdfs(hdfs_path, local_path):
    print('uploading...')
    client = InsecureClient('http://quickstart.cloudera:50070', user='******')
    client.upload(hdfs_path=hdfs_path,
                  local_path=local_path,
                  # progress is invoked with (path, bytes transferred so far)
                  progress=lambda x, y: print(x, y),
                  overwrite=True,
                  temp_dir='/tmp/{}'.format(local_path))
    print('done!')
Example #34
def load_file_list_from_hdfs(data_package):
	if log_type in ['time','all']: st = time.time()
	# split "http://host:50070/path" on the trailing '0/' of the port (see Example #1)
	hdfs_str  = data_package.stream_hdfs_file_name
	hdfs_addr = hdfs_str[:hdfs_str.rfind('0/')+1]
	hdfs_path = hdfs_str[hdfs_str.rfind('0/')+2:]


	client = InsecureClient(hdfs_addr, user=getpass.getuser())
	return client.list(hdfs_path), hdfs_path
Example #36
class HDFSStorage(Storage):
    """
    HDFS storage
    """

    def fix_slashes(self, path):
        sep = os.path.sep
        if path[0] != sep:
            path = sep + path
        if path[-1] != sep:
            path = path + sep
        return path

    def __init__(self, location=None, base_url=None):
        self.hdfs_hosts = settings.HDFS_STORAGE['hosts']
        self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root'])
        self.media_root = settings.MEDIA_ROOT
        self.media_url = self.fix_slashes(settings.MEDIA_URL)

        self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root)
        self.client = InsecureClient(self.hdfs_hosts)

    def _open(self, name, mode='rb'):
        local_path = os.path.join(settings.MEDIA_ROOT, name.replace('/', os.path.sep))
        if not os.path.exists(local_path):
            remote_path = self.path(name)
            local_dir = os.path.dirname(local_path)
            if not os.path.exists(local_dir):
                os.mkdir(local_dir)
            print(self.client.download(remote_path, local_path=local_path, overwrite=True,
                                       temp_dir=tempfile.gettempdir()))
        return File(open(local_path, mode))

    def _save(self, name, content):
        print("_save(%s, %s, %s)" % (self, name, content))
        local_path = content.name
        hdfs_path = self.path(name)  # os.path.basename(local_path))
        print(hdfs_path, local_path)
        self.client.write(hdfs_path, data=content, overwrite=True)
        return name

    def url(self, name):
        return self.fetch_url % name

    def delete(self, name):
        return self.client.delete(self.path(name))

    def listdir(self, path):
        file_list = []
        dir_list = []
        for name, status in self.client.list(self.path(path), status=True):
            if status['type'] == 'DIRECTORY':
                dir_list.append(name)
            elif status['type'] == 'FILE':
                file_list.append(name)
        return dir_list, file_list

    def size(self, name):
        return self.client.status(self.path(name))['length']

    def exists(self, name):
        try:
            return bool(self.client.status(self.path(name)))
        except HdfsError:
            return False

    def path(self, name):
        return (self.hdfs_root + name).replace('\\', '/')
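
A hedged Django settings sketch for wiring this backend in; the host, paths, and dotted module path are placeholders:

# settings.py (sketch) -- all values are hypothetical
HDFS_STORAGE = {
    'hosts': 'http://namenode:50070',  # first host is used to build fetch URLs
    'root': '/media',
}
MEDIA_ROOT = '/var/cache/django-media'
MEDIA_URL = '/media/'
DEFAULT_FILE_STORAGE = 'myapp.storages.HDFSStorage'  # hypothetical dotted path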
Example #37
# -*- coding: utf-8 -*-
#
# Copyright © 2018 white <*****@*****.**>
#
# Distributed under terms of the MIT license.

"""
https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.client
"""
from hdfs import InsecureClient

hdfs_url = "http://192.168.30.125:50070"
hdfs_user = "******"
c = InsecureClient(hdfs_url, user=hdfs_user)

c.write("/test_write", data="string")
c.delete("/test_write")
c.makedirs("/new/path") # creates intermediate directories recursively

with c.read("f.txt", encoding="utf-8") as f:
    content = f.read()

c.write("/test.txt", "test string")
Example #38
def stream(string, lines, t):
	"""
	Stream tweets from twitter and save them to file every hour

	Args:
		string - track query passed to the Twarc stream
		lines - array of streaming words
		t - Twarc class

	Returns:
		boolean - True (OK) / False (Error)
	"""
	words = lines

	hour_keywords = {}

	# make timestamps
	timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
	datestr = time.strftime("%Y-%m-%d")

	# get total time for check time
	start_time = time.time()

	# create directories and files for keywords
	tweets_to_write = {}
	indexes = {}
	client = InsecureClient('http://192.168.1.12:50070', user='******')
	for word in words:
		dir_word = word.replace(" ", "_")

		# for statistics
		if not os.path.isdir("data/statistics"):
			os.makedirs("data/statistics")

		# for statistics date
		if not os.path.isdir("data/statistics/"+datestr):
			os.makedirs("data/statistics/"+datestr)

		# for keyword
		if not os.path.isdir("data/"+dir_word):
			os.makedirs("data/"+dir_word)

		# for date
		if not os.path.isdir("data/"+dir_word+"/"+datestr):
			os.makedirs("data/"+dir_word+"/"+datestr)

		# create json file for writing data
		with open("data/"+dir_word+"/"+datestr+"/"+timestr+".json", "w") as fw:
			fw.write("[")

		tweets_to_write[dir_word] = []
		indexes[dir_word] = 0

	minutes = 1
	while True:
		try:
			# find lines in twitter
			print("String query: %s" % string)
			for tweet in t.stream(string):
				# regex to find keyword
				for word in words:
					dir_word = word.replace(" ", "_")
					filename = "data/"+dir_word+"/"+datestr+"/"+timestr
					# create list of words in keyword
					wlist = word.split()
					# length of this list
					w_length = len(wlist)
					check = 0
					# for every word in keyword
					for w in wlist:
						# check if word is in tweet
						keyword = re.search("%s" % w, tweet["text"], re.IGNORECASE)
						if keyword:
							check += 1
					# if every word from keyword is in tweet, save to file
					if check == w_length:
						print("Tweet language: %s" % tweet['lang'])
						if tweet['lang'] in languages:
							dumped_json = json.dumps(tweet)
							tweets_to_write[dir_word].append(dumped_json)
							with open(filename + ".json", "a") as fw:
								fw.write(dumped_json)
								fw.write(",")

							# counting total
							if word in total_keywords:
								total_keywords[word] += 1
							else:
								total_keywords[word] = 1
							# counting hourly
							if word in hour_keywords:
								hour_keywords[word] += 1
							else:
								hour_keywords[word] = 1
							# flush a batch to HDFS after every 10 matching tweets
							if len(tweets_to_write[dir_word]) % 10 == 0:
								print("Going to write into %s_%s" % (filename, indexes[dir_word]))
								with client.write(filename + "_" + str(indexes[dir_word]), encoding='utf-8') as writer:
									writer.write("\n".join(tweets_to_write[dir_word]))
								indexes[dir_word] = indexes[dir_word] + 1
								tweets_to_write[dir_word] = []

				# exit every hour and start function again
				if start_time+3600 < time.time():
					for word in words:
						dir_word = word.replace(" ", "_")
						with open("data/"+dir_word+"/"+datestr+"/"+timestr+".json", "a+") as fw:
							fw.seek(-1, os.SEEK_END)
							if fw.read() == ",":
								fw.seek(-1, os.SEEK_END)
								fw.truncate()
							fw.write("]")
					# hour statistics
					with open("data/statistics"+"/"+datestr+"/"+timestr+".txt", "w") as fw:
						for word in hour_keywords:
							fw.write(str(word) + " : " + str(hour_keywords[word]) + "\n")
					# total statistics
					with open("data/statistics/statistics.txt", "w") as fw:
						for word in total_keywords:
							fw.write(str(word) + " : " + str(total_keywords[word]) + "\n")
					return True

		# except for quit application
		except KeyboardInterrupt:
			for word in words:
				dir_word = word.replace(" ", "_")
				with open("data/"+dir_word+"/"+datestr+"/"+timestr+".json", "a+") as fw:
					fw.seek(-1, os.SEEK_END)
					if fw.read() == ",":
						fw.seek(-1, os.SEEK_END)
						fw.truncate()
					fw.write("]")
			# hour statistics
			with open("data/statistics"+"/"+datestr+"/"+timestr+".txt", "w") as fw:
				for word in hour_keywords:
					fw.write(str(word) + " : " + str(hour_keywords[word]) + "\n")
			# total statistics
			with open("data/statistics/statistics.txt", "w") as fw:
				for word in total_keywords:
					fw.write(str(word) + " : " + str(total_keywords[word]) + "\n")
			sys.stdout.write("QUIT\n")
			sys.exit(0)
		# except for problems with key
		except KeyError:
			# exit every hour and start function again
			if start_time+3600 < time.time():
				for word in words:
					dir_word = word.replace(" ", "_")
					with open("data/"+dir_word+"/"+datestr+"/"+timestr+".json", "a+") as fw:
						fw.seek(-1, os.SEEK_END)
						if fw.read() == ",":
							fw.seek(-1, os.SEEK_END)
							fw.truncate()
						fw.write("]")
				# hour statistics
				with open("data/statistics"+"/"+datestr+"/"+timestr+".txt", "w") as fw:
					for word in hour_keywords:
						fw.write(str(word) + " : " + str(hour_keywords[word]) + "\n")
				# total statistics
				with open("data/statistics/statistics.txt", "w") as fw:
					for word in total_keywords:
						fw.write(str(word) + " : " + str(total_keywords[word]) + "\n")
				return True
			continue
	# error
	return False