def load_from_hdfs(data_package, file_name):
    # def load_from_hdfs(data_package, file_name='CThead_uchar.raw'):
    hdfs_str = data_package.stream_hdfs_file_name
    hdfs_addr = hdfs_str[:hdfs_str.rfind('0/') + 1]
    hdfs_path = hdfs_str[hdfs_str.rfind('0/') + 2:]
    if log_type in ['time', 'all']:
        st = time.time()
    client = InsecureClient(hdfs_addr, user=getpass.getuser())
    with client.read('%s/%s' % (hdfs_path, file_name)) as reader:
        data = numpy.array(Image.open(StringIO(reader.read())))
    print_purple("LOADED")
    return data
def read_hdfs(filename, root_dir='data'):
    data_dir = os.path.join(root_dir, filename)
    client_hdfs = InsecureClient('http://' + os.environ['IP_HDFS'] + ':50070')
    with client_hdfs.read(data_dir, encoding='latin-1') as reader:
        df = pd.read_csv(reader, index_col=0)
    return df
class Storage:

    def __init__(self, protocol: str = 'webHDFS', *args, **kwargs):
        self.protocol, self.client = protocol.lower(), None
        if protocol.lower() == 'webHDFS'.lower():
            from hdfs import InsecureClient
            self.client = InsecureClient(*args, **kwargs)
        for f in 'upload download list status delete'.split():
            setattr(self, f, getattr(self, '%s_%s' % (f, protocol.lower())))

    def upload_webhdfs(self, local_path: str, remote_path: str, **kwargs):
        to_screen("upload %s -> %s" % (local_path, remote_path))
        return self.client.upload(local_path=local_path, hdfs_path=remote_path, **kwargs)

    def download_webhdfs(self, remote_path: str, local_path: str, **kwargs):
        mkdir_for(local_path)
        to_screen("download %s -> %s" % (remote_path, local_path))
        return self.client.download(local_path=local_path, hdfs_path=remote_path, overwrite=True, **kwargs)

    def list_webhdfs(self, remote_path: str, **kwargs):
        return self.client.list(hdfs_path=remote_path, **kwargs)

    def status_webhdfs(self, remote_path: str, **kwargs):
        return self.client.status(hdfs_path=remote_path, **kwargs)

    def delete_webhdfs(self, remote_path: str, **kwargs):
        return self.client.delete(hdfs_path=remote_path, **kwargs)
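# A minimal usage sketch for the Storage wrapper above. The namenode URL and user
# below are assumptions for illustration, not values taken from the original code.
# __init__ rebinds the protocol-neutral names (upload, download, list, status,
# delete) to their *_webhdfs implementations via setattr, so callers never spell
# out the protocol:
if __name__ == '__main__':
    storage = Storage('webHDFS', 'http://namenode:50070', user='hdfs')  # hypothetical endpoint
    print(storage.list('/tmp'))  # dispatches to Storage.list_webhdfs -> client.list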
def post(self):
    # use time() so that every image gets a unique name
    image_name = int(time.time())
    image_path = "/root/ZAGA/ZoraOD/Images_bbx/{}.jpg".format(image_name)
    with open(image_path, 'wb') as image:
        # the image contained in request.data is saved locally
        image.write(request.data)
    # result is the outcome of the object detection. It can be the string the robot
    # has to pronounce, or the possible labels identifying an object in the image
    # when the score falls between two thresholds. The null vector is returned if
    # no object was found.
    result = obj_detection.find_result(image_path, image_name)
    # if HDFS is reachable, also store the image there;
    # the socket module is used to check whether the HDFS port accepts connections
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    port_result = sock.connect_ex(('localhost', 50070))
    # connect_ex returns 0 if the port is open, a non-zero value otherwise
    if port_result == 0:
        client_hdfs = InsecureClient('http://localhost:50070')
        # move the image to HDFS
        client_hdfs.upload('/zora-object-detection/images/{}.jpg'.format(image_name), image_path)
        os.remove(image_path)
    return result  # the result is sent back to the robot
def __init__(self, redis_ip, redis_port):
    self.redis_ip = redis_ip
    self.port = redis_port
    self.r = redis.StrictRedis(host=redis_ip, port=redis_port, db=0)
    self.client = InsecureClient('http://cdh1:50070/', 'admin', root='/user/admin/open_data')
def crop():
    # Check pictures folders
    if request.args.get('from') is None:
        return 'No "from" directory given.'
    if request.args.get('to') is None:
        return 'No "to" directory given.'
    directory_from = request.args.get('from')
    directory_to = request.args.get('to')
    dask_client = Client('192.168.1.4:8786')
    hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
    with hdfs_client.read('/' + directory_from + 'data.csv') as reader:
        data = pd.read_csv(reader)
    data = dd.from_pandas(data, npartitions=24)
    data.map_partitions(compute_crop, directory_from, directory_to,
                        meta='dask.dataframe.core.Series').compute()
    create_csv(directory_to=directory_to)
    return "Crop finished"
def save_to_hdfs(docs, project, index=0):
    client = InsecureClient(HDFS_URL, user=HDFS_USER)
    filename = get_filename(project)
    print("Going to write %s into %s_%s" % (len(docs), filename, index))
    text_to_write = "\n".join([json.dumps(doc) for doc in docs])
    with client.write(filename + "_" + str(index), encoding='utf-8') as writer:
        writer.write(text_to_write)
def download_directory(self, directory_url):
    '''Downloads a directory from remote HDFS to local storage, archives it and
    returns the gzipped tar of the directory.'''
    logger.log_info("Downloading the directory {0} ".format(directory_url))
    # Remove the base url from the absolute directory path provided as parameter.
    # For example, if the absolute path is hdfs://alpha:9000/configuration/12345/drift,
    # the statement below returns /configuration/12345/drift
    directory_name_with_path = urllib3.util.parse_url(directory_url).path
    directory_name = os.path.split(directory_name_with_path)[1]
    web_hdfs_url = Environment().get_web_hdfs_url()
    session = SwSessionManager().get_session()
    user_name = session.get_username()
    client = InsecureClient(web_hdfs_url, user_name)
    try:
        with tempfile.TemporaryDirectory() as temp:
            client.download(hdfs_path=directory_name_with_path, local_path=temp, n_threads=5)
            tmp_archive = os.path.join(temp)
            data = io.BytesIO()
            with open(shutil.make_archive(tmp_archive, 'gztar', temp), "rb") as output_data:
                data.write(output_data.read())
            data.seek(0)
            return send_file(data, as_attachment=True,
                             attachment_filename=directory_name + ".tar.gz")
    except Exception as e:
        raise ServiceError(
            "Downloading the folder from HDFS failed with the error: {0}".format(str(e)))
class HDFSService(object):

    def __init__(self):
        self.hdfs = InsecureClient('http://127.0.0.1:9870', user='******')
        self.base_path = '/users/root'

    def mkdir(self, path):
        return self.hdfs.makedirs(path)

    def list(self, path):
        try:
            return self.hdfs.list(path)
        except HdfsError as e:
            print(e)
            return []

    def get(self, path):
        pass

    def upload(self, path, local_path=None, data=None):
        path = self.base_path + path
        if data is not None:
            return self.hdfs.write(path, data=data)
        elif local_path is not None:
            return self.hdfs.upload(path, local_path)
        return False

    def download(self, path):
        path = self.base_path + path
        with self.hdfs.read(path) as reader:
            print(path)
            buf = reader.read()
            print(len(buf))
            return buf
def write_to_hdfs(rows: List[Tuple[str, str]]):
    conn: Connection = Connection.get_connection_from_secrets('local_hdfs')
    uri = conn.get_uri()
    pat = re.compile(r"http://(\w+(:\w+)?)?@")
    print(conn.get_uri())
    uri = pat.sub("http://", uri)
    print(uri)
    print(conn.login)
    client = InsecureClient(uri, user=conn.login)
    sch = avro.schema.make_avsc_object({
        'type': 'record',
        'name': 'Video',
        'fields': [
            {'type': {'type': 'string', 'avro.java.string': 'String'}, 'name': 'title'},
            {'type': ["null", {'type': 'string', 'avro.java.string': 'String'}], 'name': 'description'},
        ]
    })
    local_file_name = 'videos.avro'
    writer = DataFileWriter(open(local_file_name, "wb"), DatumWriter(), sch)
    for row in rows:
        print(row)
        writer.append({"title": row[0], "description": row[1]})
    writer.close()
    client.upload('/tmp/videos.avro', local_file_name)
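# A minimal usage sketch for write_to_hdfs above. It assumes an Airflow connection
# named 'local_hdfs' is configured and a reachable WebHDFS endpoint; the sample
# rows are made up for illustration only:
if __name__ == '__main__':
    write_to_hdfs([
        ('Intro to WebHDFS', 'Uploading an Avro file with the hdfs InsecureClient'),
        ('Untitled', 'No description yet'),
    ])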
def normalize():
    # Check pictures folders
    if request.args.get('from') is None:
        return 'No "from" directory given.'
    if request.args.get('to') is None:
        return 'No "to" directory given.'
    dask_client = Client('192.168.1.4:8786')
    hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
    from_directory = request.args.get('from')
    to_directory = request.args.get('to')
    with hdfs_client.read('/' + from_directory + 'data.csv') as reader:
        data = pd.read_csv(reader)
    data = dd.from_pandas(data, npartitions=24)
    data.map_partitions(compute_norm, from_directory, to_directory,
                        meta='dask.dataframe.core.Series').compute()
    create_csv(directory_to=to_directory)
    return 'Normalization done.'
def process_item(self, item, spider):
    folder = "media/"
    try:
        os.mkdir(os.path.join(BASE_PATH, folder))
    except OSError:
        pass
    # if not Article.objects.filter(url=item['url']).exists() and item['content']:
    if item['content']:
        content = remove_tags(item['content'],
                              which_ones=('script', 'noscript', 'iframe', 'pre',
                                          'link', 'frame', 'meta', 'form'))
        content = dedent(content)
        # content = content.replace('</pre>', '')
        content = content.replace('</article>', '')
        # content = content.replace('<', '<')
        while '\n\n' in content:
            content = content.replace('\n\n', '\n')
        # filename = item['url'].replace('/', '_') + '.md'
        filename = str(uuid.uuid4()) + '.md'
        full_filename = os.path.join(BASE_PATH, folder, filename)
        with open(full_filename, "wb") as text_file:
            text_file.write(content.encode('utf8'))
        spark_data = {
            'content': item.get('content', ''),
            'domain': item.get('domain', ''),
            'url': item.get('url', ''),
            'title': item.get('title', ''),
        }
        item['content'] = full_filename
        item.save()
        client = InsecureClient(HDFS_ADRESS, user='******')
        with client.write(str(uuid.uuid4()) + '.json', encoding='utf-8') as writer:
            writer.write(json.dumps(spark_data))
    return item
def robot_out_of_stock(date):
    conn = base_hook('robot_out_of_stock')
    url = conn['host']
    token = Variable.get('robot_token')
    headers = {
        'content-type': 'application/json',
        'Authorization': 'JWT ' + token
    }
    data = {'date': date}
    r = requests.get(url, data=json.dumps(data), headers=headers, timeout=10)
    r = r.json()
    if r['message'] == "No out_of_stock items for this date":
        print(f'There are no out_of_stock items for {date}')
        logging.info(f'There are no out_of_stock items for {date}')
    client = InsecureClient('http://127.0.0.1:50070/', user='******')
    with client.write(os.path.join('/bronze/out_of_stock', date + '_out_of_stock.json'),
                      encoding='utf-8') as f:
        json.dump(r, f)
    logging.info('File loaded ' + os.path.join('/bronze/out_of_stock', date + '_out_of_stock.json'))
def __init__(self, input_folder, model_folder, img_size=240):
    self.input_folder = input_folder
    self.model_folder = model_folder
    self.hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
    self.imgs, self.labels = self.read_images(input_folder, img_size)
    self.default = "svm"
def load_from_hdfs(data_package, hdfs_addr, hdfs_path):
    # def load_from_hdfs(data_package, file_name='CThead_uchar.raw'):
    if log_type in ['time', 'all']:
        st = time.time()
    dp = data_package
    ds = dp.data_range
    ds_seq = [ds[elem][1] - ds[elem][0] for elem in ['z', 'y', 'x'] if elem in ds]
    while True:
        try:
            client = InsecureClient(hdfs_addr, user=getpass.getuser())
            file_python_dtype = Vivaldi_dtype_to_python_dtype(dp.file_dtype)
            file_bytes = get_bytes(file_python_dtype)
            # print("START TO CONNECT HDFS")
            bef = time.time()
            with client.read(hdfs_path,
                             offset=(ds_seq[1] * ds_seq[2] * ds['z'][0] * file_bytes),
                             length=ds_seq[0] * ds_seq[1] * ds_seq[2] * file_bytes) as reader:
                buf = reader.read()
            aft = time.time()
            diff = aft - bef
            print_bold("DATA LOADING ENDS from %s -- time elapsed = %.03f (sec) , reading speed = %.03f MB/sec"
                       % (socket.gethostname(), diff, len(buf) / diff * (1024 ** -2)))
            data = numpy.fromstring(buf, dtype=file_python_dtype).reshape(ds_seq)
            break
        except:
            print(bcolors.WARNING + "Connection Broken" + bcolors.ENDC)
    return data
def delete_data(request):
    response_content = {}
    response = HttpResponse()
    try:
        proj_id = request.GET.get('proj_id')
        data_id = request.GET.get('data_id')
        user_id = request.GET.get('user_id')
        fetched = Datasets.objects.filter(proj_id=proj_id, data_id=data_id,
                                          user_id=user_id).values('hdfs_path')
        if len(fetched) == 0:
            raise Exception('Oops! No access!')
        if list(fetched)[0]['hdfs_path']:
            client = InsecureClient("http://hdfs.neurolearn.com:50070", user="******")
            client.delete(list(fetched)[0]['hdfs_path'], recursive=True)
        Datasets.objects.filter(proj_id=proj_id, data_id=data_id, user_id=user_id).delete()
        response_content['msg'] = 'success'
        response_content['error_num'] = 0
    except Exception as e:
        response_content['msg'] = str(e)
        response_content['error_num'] = 1
    response.write(json.dumps(response_content))
    return response
def upload_directory(self, directory_path, archive_directory_data):
    '''Untars the archive_directory_data provided as input, and uploads all the
    contents of the tar to the directory path specified on HDFS.
    '''
    logger.log_info("Uploading the directory to HDFS")
    web_hdfs_url = Environment().get_web_hdfs_url()
    hdfs_file_base_url = Environment().get_hdfs_file_base_url()
    session = SwSessionManager().get_session()
    user_name = session.get_username()
    client = InsecureClient(web_hdfs_url, user_name)
    directory_name_with_path = "/" + directory_path
    directory_name = os.path.split(directory_path)[1]
    try:
        with tempfile.TemporaryDirectory() as temp:
            local_dir_path = temp + "/" + directory_name + ".tar.gz"
            with open(local_dir_path, "wb") as dir_archive:
                dir_archive.write(archive_directory_data)
            with tarfile.open(local_dir_path, "r:gz") as tar:
                tar.extractall(temp)
            os.remove(local_dir_path)
            response = client.upload(hdfs_path=directory_name_with_path, local_path=temp)
            logger.log_info("Successfully uploaded the directory {0} to HDFS".format(response))
            return hdfs_file_base_url + directory_name_with_path
    except Exception as e:
        raise ServiceError(
            "Uploading the directory to HDFS failed with the error: {0}".format(str(e)))
def __init__(self, path: str, mode: str = 'r', encoding: str = 'utf-8',
             host: str = HDFS_HOST, port: int = HDFS_PORT, user: str = HDFS_USER):
    self.client = InsecureClient(url=f'http://{host}:{port}', user=user)
    self.path = path
    self.name = path.split('\\')[-1]
    self.mode = mode
    self.encoding = encoding
    if self.mode[0] == 'r':
        self.__cache_content()
        self.fptr = 0
    elif self.mode[0] == 'w':
        self.content = self.__binary_helper('')
        self.fptr = 0
    elif self.mode[0] == 'a':
        self.__cache_content()
        self.fptr = len(self.content)
    else:
        raise UnsupportedMode(f'unsupported mode {self.mode}')
def __init__(self, data_path=None):
    if data_path is None:
        self.data_path = r'./config/connect_info.json'
    else:
        assert type(data_path) == str
        self.data_path = data_path
    if not os.path.exists(self.data_path):
        self.data_path = r'./connect_info.json'
    with open(self.data_path) as data_file:
        data = json.load(data_file)
    print("Data: ", data)
    self.hdfs_client = InsecureClient(
        url='http://' + data['namenode_url'] + ':' + str(data['port']),
        user=data['user'],
        root=data['root_path'])
    print("hdfs client: ", self.hdfs_client)
    self.img_dir = data['img_dir']
    print("img dir: ", self.img_dir)
    if self.img_dir[-1] != '/':
        self.img_dir += '/'
    self.file_name = 1
def __init__(self, service_id, user_override=None):
    self.service_id = service_id
    self.webhdfs_url = HADOOPS[service_id]['webhdfs_url']
    self.webhdfs_user = HADOOPS[service_id]['webhdfs_user']
    if user_override:
        self.webhdfs_user = user_override
    self.id_prefix = HADOOPS[service_id]['id_prefix']
    self.client = InsecureClient(self.webhdfs_url, self.webhdfs_user)
def __init__(self, namenode_url, username, submission_id, camera_id):
    """Initialize an internal client

    This constructor initializes an HDFS client.
    """
    self._internal_client = InsecureClient(
        namenode_url,
        user='******',
        root='/'.join(['/users', username, str(submission_id), str(camera_id)]))
def __init__(self, input_folder, image_path, output_folder):
    logging.info('ImageCropper.init')
    self.input_folder = input_folder
    self.image_path = image_path
    self.output_folder = output_folder
    self.hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
def connect(self):
    self.conn = InsecureClient(f"http://{self.host}:{self.port}", user=self.user)
    if os.environ.get("KAFKA_BOOTSTRAP", None):
        self.producer = KafkaProducer(
            bootstrap_servers=os.environ.get("KAFKA_BOOTSTRAP", "localhost:1234"))
    else:
        self.producer = None
def __init__(self, protocol: str = 'webHDFS', *args, **kwargs):
    self.protocol, self.client = protocol.lower(), None
    if protocol.lower() == 'webHDFS'.lower():
        from hdfs import InsecureClient
        self.client = InsecureClient(*args, **kwargs)
    for f in 'upload download list status delete'.split():
        setattr(self, f, getattr(self, '%s_%s' % (f, protocol.lower())))
def upload_file(self):
    ip_address = self.ip_input.toPlainText()
    port_number = self.port_input.toPlainText()
    user_name = self.user_input.toPlainText()
    upload_file = self.dir_input.toPlainText()
    host_address = 'http://' + ip_address + ':' + port_number
    hadoop = InsecureClient(host_address, user_name)
    hadoop.upload('', upload_file)
def __init__(self, location=None, base_url=None):
    self.hdfs_hosts = settings.HDFS_STORAGE['hosts']
    self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root'])
    self.media_root = settings.MEDIA_ROOT
    self.media_url = self.fix_slashes(settings.MEDIA_URL)
    self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root)
    self.client = InsecureClient(self.hdfs_hosts)
def save_dataframe_as_hdfs(filename, df, root_dir='data', enc='utf8'):
    data_dir = os.path.join(root_dir, filename)
    if not os.path.exists(root_dir):
        os.makedirs(root_dir)
    client_hdfs = InsecureClient('http://' + os.environ['IP_HDFS'] + ':50070')
    with client_hdfs.write(data_dir, encoding=enc) as writer:
        df.to_csv(writer)
    return True
def __init__(self, dir_algo, algo, path_img):
    logging.info('prediction_ML.init')
    self.directory_algo = dir_algo
    self.path_img = path_img
    self.algo = algo
    self.hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
    self.image = self.read_image(self.path_img, 240)
def __init__(self, url, user):
    u = urlsplit(url)
    if u.scheme != 'http' and u.scheme != 'https':
        raise ValueError("Invalid name node address")
    self.url = urlunparse((u.scheme, u.netloc, '', '', '', ''))
    self.client = InsecureClient(self.url, user=user)
    self.localdir = u.path
    self.prefix = 'HDFS'
def upload_file(self):
    ip_address = self.ip_input.toPlainText()
    port_number = self.port_input.toPlainText()
    user_name = self.user_input.toPlainText()
    file_name = self.File_directory_display.toPlainText()
    dir_name = self.dir_input.toPlainText()
    host_address = 'http://' + ip_address + ':' + port_number
    hadoop = InsecureClient(host_address, user_name)
    hadoop.upload(dir_name, file_name)
def __init__(self, url, user, base_path=""):
    self._logger = logging.getLogger(self.__class__.__name__)
    self._url = url
    self._user_ = user
    self._base_path = base_path
    self._client = InsecureClient(url, user)
    if not self._exist(base_path):
        self._mkdir(base_path)
def mkdir_hdfs(self):
    ip_address = self.ip_input.toPlainText()
    port_number = self.port_input.toPlainText()
    user_name = self.user_input.toPlainText()
    dir_name = self.dir_input.toPlainText()
    target_name = dir_name + '/' + self.mkdir_input.toPlainText()
    host_address = 'http://' + ip_address + ':' + port_number
    hadoop = InsecureClient(host_address, user_name)
    hadoop.makedirs(target_name)
def put_in_hdfs(hdfs_path, local_path):
    print('uploading...')
    client = InsecureClient('http://quickstart.cloudera:50070', user='******')
    client.upload(hdfs_path=hdfs_path, local_path=local_path,
                  progress=lambda x, y: print(x, y), overwrite=True,
                  temp_dir='/tmp/{}'.format(local_path))
    print('done!')
def load_file_list_from_hdfs(data_package):
    if log_type in ['time', 'all']:
        st = time.time()
    hdfs_str = data_package.stream_hdfs_file_name
    hdfs_addr = hdfs_str[:hdfs_str.rfind('0/') + 1]
    hdfs_path = hdfs_str[hdfs_str.rfind('0/') + 2:]
    client = InsecureClient(hdfs_addr, user=getpass.getuser())
    return client.list(hdfs_path), hdfs_path
class HDFSStorage(Storage):
    """HDFS storage"""

    def fix_slashes(self, path):
        sep = os.path.sep
        if path[0] != sep:
            path = sep + path
        if path[-1] != sep:
            path = path + sep
        return path

    def __init__(self, location=None, base_url=None):
        self.hdfs_hosts = settings.HDFS_STORAGE['hosts']
        self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root'])
        self.media_root = settings.MEDIA_ROOT
        self.media_url = self.fix_slashes(settings.MEDIA_URL)
        self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root)
        self.client = InsecureClient(self.hdfs_hosts)

    def _open(self, name, mode='rb'):
        local_path = os.path.join(settings.MEDIA_ROOT, name.replace('/', os.path.sep))
        if not os.path.exists(local_path):
            remote_path = self.path(name)
            local_dir = os.path.dirname(local_path)
            if not os.path.exists(local_dir):
                os.mkdir(local_dir)
            print(self.client.download(remote_path, local_path=local_path,
                                       overwrite=True, temp_dir=tempfile.gettempdir()))
        return File(open(local_path, mode))

    def _save(self, name, content):
        print("_save(%s, %s, %s)" % (self, name, content))
        local_path = content.name
        hdfs_path = self.path(name)  # os.path.basename(local_path))
        print(hdfs_path, local_path)
        self.client.write(hdfs_path, data=content, overwrite=True)
        return name

    def url(self, name):
        return self.fetch_url % name

    def delete(self, name):
        return self.client.delete(self.path(name))

    def listdir(self, path):
        file_list = []
        dir_list = []
        for name, status in self.client.list(self.path(path), status=True):
            if status['type'] == 'DIRECTORY':
                dir_list.append(name)
            elif status['type'] == 'FILE':
                file_list.append(name)
        return dir_list, file_list

    def size(self, name):
        return self.client.status(self.path(name))['length']

    def exists(self, name):
        try:
            return True if self.client.status(self.path(name)) else False
        except HdfsError:
            return False

    def path(self, name):
        return (self.hdfs_root + name).replace('\\', '/')
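# A minimal Django settings sketch for the HDFSStorage backend above. The host,
# root path and dotted storage path are assumptions for illustration; the class
# itself only reads HDFS_STORAGE['hosts'] and HDFS_STORAGE['root'], plus
# MEDIA_ROOT / MEDIA_URL for its local cache.
HDFS_STORAGE = {
    'hosts': 'http://namenode:50070',  # first comma-separated entry is used to build fetch_url
    'root': '/media',                  # prepended to every name by HDFSStorage.path()
}
MEDIA_ROOT = '/tmp/media-cache'        # local cache directory used by _open()
MEDIA_URL = '/media/'
DEFAULT_FILE_STORAGE = 'myapp.storage.HDFSStorage'  # hypothetical dotted path to the class above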
# -*- coding: utf-8 -*-
#
# Copyright © 2018 white <*****@*****.**>
#
# Distributed under terms of the MIT license.
"""
https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.client
"""
from hdfs import InsecureClient

hdfs_url = "http://192.168.30.125:50070"
hdfs_user = "******"
c = InsecureClient(hdfs_url, user=hdfs_user)

c.write("/test_write", data="string")
c.delete("/test_write")

c.makedirs("/new/path")  # parent directories are created recursively

with c.read("f.txt", encoding="utf-8") as f:
    content = f.read()

c.write("/test.txt", "test string")
def stream(string, lines, t):
    """
    Stream tweets from twitter and save them to file every hour

    Args:
        string - query string passed to the Twarc stream
        lines - array of streaming words
        t - Twarc class
    Returns:
        boolean - True (OK) / False (Error)
    """
    words = lines
    string = string
    hour_keywords = {}
    # make timestamps
    timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
    datestr = time.strftime("%Y-%m-%d")
    # get total time for check time
    start_time = time.time()
    # create directories and files for keywords
    tweets_to_write = {}
    indexes = {}
    client = InsecureClient('http://192.168.1.12:50070', user='******')
    for word in words:
        dir_word = word.replace(" ", "_")
        # for statistics
        if not os.path.isdir("data/statistics"):
            os.makedirs("data/statistics")
        # for statistics date
        if not os.path.isdir("data/statistics/" + datestr):
            os.makedirs("data/statistics/" + datestr)
        # for keyword
        if not os.path.isdir("data/" + dir_word):
            os.makedirs("data/" + dir_word)
        # for date
        if not os.path.isdir("data/" + dir_word + "/" + datestr):
            os.makedirs("data/" + dir_word + "/" + datestr)
        # create json file for writing data
        with open("data/" + dir_word + "/" + datestr + "/" + timestr + ".json", "w") as fw:
            fw.write("[")
        tweets_to_write[dir_word] = []
        indexes[dir_word] = 0
    minutes = 1
    while True:
        try:
            # find lines in twitter
            print("String query: %s" % string)
            for tweet in t.stream(string):
                # regex to find keyword
                for word in words:
                    dir_word = word.replace(" ", "_")
                    filename = "data/" + dir_word + "/" + datestr + "/" + timestr
                    # create list of words in keyword
                    wlist = word.split()
                    # length of this list
                    w_length = len(wlist)
                    check = 0
                    # for every word in keyword
                    for w in wlist:
                        # check if word is in tweet
                        keyword = re.search("%s" % w, tweet["text"], re.IGNORECASE)
                        if keyword:
                            check += 1
                    # if every word from keyword is in tweet, save to file
                    if check == w_length:
                        print("Tweet language: %s" % tweet['lang'])
                        if tweet['lang'] in languages:
                            dumped_json = json.dumps(tweet)
                            tweets_to_write[dir_word].append(dumped_json)
                            with open(filename + ".json", "a") as fw:
                                fw.write(dumped_json)
                                fw.write(",")
                            # counting total
                            if word in total_keywords:
                                total_keywords[word] += 1
                            else:
                                total_keywords[word] = 1
                            # counting hourly
                            if word in hour_keywords:
                                hour_keywords[word] += 1
                            else:
                                hour_keywords[word] = 1
                            if len(tweets_to_write[dir_word]) % 10 == 0:
                                print("Going to write into %s_%s" % (filename, indexes[dir_word]))
                                with client.write(filename + "_" + str(indexes[dir_word]), encoding='utf-8') as writer:
                                    writer.write("\n".join(tweets_to_write[dir_word]))
                                indexes[dir_word] = indexes[dir_word] + 1
                                tweets_to_write[dir_word] = []
                # exit every hour and start function again
                if start_time + 3600 < time.time():
                    for word in words:
                        dir_word = word.replace(" ", "_")
                        with open("data/" + dir_word + "/" + datestr + "/" + timestr + ".json", "a+") as fw:
                            fw.seek(-1, os.SEEK_END)
                            if fw.read() == ",":
                                fw.seek(-1, os.SEEK_END)
                                fw.truncate()
                            fw.write("]")
                    # hour statistics
                    with open("data/statistics" + "/" + datestr + "/" + timestr + ".txt", "w") as fw:
                        for word in hour_keywords:
                            fw.write(str(word) + " : " + str(hour_keywords[word]) + "\n")
                    # total statistics
                    with open("data/statistics/statistics.txt", "w") as fw:
                        for word in total_keywords:
                            fw.write(str(word) + " : " + str(total_keywords[word]) + "\n")
                    return True
        # except for quit application
        except KeyboardInterrupt:
            for word in words:
                dir_word = word.replace(" ", "_")
                with open("data/" + dir_word + "/" + datestr + "/" + timestr + ".json", "a+") as fw:
                    fw.seek(-1, os.SEEK_END)
                    if fw.read() == ",":
                        fw.seek(-1, os.SEEK_END)
                        fw.truncate()
                    fw.write("]")
            # hour statistics
            with open("data/statistics" + "/" + datestr + "/" + timestr + ".txt", "w") as fw:
                for word in hour_keywords:
                    fw.write(str(word) + " : " + str(hour_keywords[word]) + "\n")
            # total statistics
            with open("data/statistics/statistics.txt", "w") as fw:
                for word in total_keywords:
                    fw.write(str(word) + " : " + str(total_keywords[word]) + "\n")
            sys.stdout.write("QUIT\n")
            sys.exit(0)
        # except for problems with key
        except KeyError:
            # exit every hour and start function again
            if start_time + 3600 < time.time():
                for word in words:
                    dir_word = word.replace(" ", "_")
                    with open("data/" + dir_word + "/" + datestr + "/" + timestr + ".json", "a+") as fw:
                        fw.seek(-1, os.SEEK_END)
                        if fw.read() == ",":
                            fw.seek(-1, os.SEEK_END)
                            fw.truncate()
                        fw.write("]")
                # hour statistics
                with open("data/statistics" + "/" + datestr + "/" + timestr + ".txt", "w") as fw:
                    for word in hour_keywords:
                        fw.write(str(word) + " : " + str(hour_keywords[word]) + "\n")
                # total statistics
                with open("data/statistics/statistics.txt", "w") as fw:
                    for word in total_keywords:
                        fw.write(str(word) + " : " + str(total_keywords[word]) + "\n")
                return True
            continue
    # error
    return False