def _success_hook(self):
    success_file = self.get_job_dir().rstrip("/") + "/_SUCCESS"
    if is_gcs_path(self.get_job_dir()):
        from luigi.contrib.gcs import GCSClient
        client = GCSClient()
        client.put_string("", success_file)
    else:
        # assume local filesystem otherwise
        open(success_file, "a").close()
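A hedged sketch of an output() that could pair with this hook, so Luigi's completeness check looks for the same _SUCCESS marker the hook writes. The task name and the gs:// prefix test are assumptions standing in for the original is_gcs_path() helper.

import luigi
from luigi import LocalTarget
from luigi.contrib.gcs import GCSClient, GCSFlagTarget

class ExampleJob(luigi.Task):  # hypothetical task name
    job_dir = luigi.Parameter()

    def get_job_dir(self):
        return self.job_dir

    def output(self):
        path = self.get_job_dir().rstrip("/") + "/"
        if path.startswith("gs://"):  # stand-in for is_gcs_path()
            # GCSFlagTarget reports existence once <path>_SUCCESS exists.
            return GCSFlagTarget(path, client=GCSClient())
        return LocalTarget(path + "_SUCCESS")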
def run(self):
    service_account_info = json.loads(
        os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON'))
    credentials = service_account.Credentials.from_service_account_info(
        service_account_info)
    client = GCSClient(oauth_credentials=credentials)
    file_path = f'{self.bucket}/{self.filename}'
    fp = client.download(file_path)
    self.output().makedirs()
    os.replace(fp.name, self.output().path)
def run(self):
    result_folder = self.input()['results'].path
    with open(self.input()['run_name'].path, 'r') as f:
        run_name = f.read()
    zipfile_name = shutil.make_archive(run_name, 'zip', result_folder)

    service_account_info = json.loads(
        os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON'))
    credentials = service_account.Credentials.from_service_account_info(
        service_account_info)
    client = GCSClient(oauth_credentials=credentials)
    client.put(zipfile_name, f"{self.bucket}/{run_name}.zip")
def __init__(self, path, format=None, client=None, flag='_SUCCESS'):
    if format is None:
        format = luigi.format.get_default_format()
    if path[-1] != "/":
        raise ValueError("GCSFlagTarget requires the path to be to a "
                         "directory. It must end with a slash ( / ).")
    # This is the only line that's different
    super(GCSFlagTarget2, self).__init__(path, client=client)
    self.format = format
    self.fs = client or GCSClient()
    self.flag = flag
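A minimal usage sketch for this flag target; the bucket path is illustrative, and exists() comes from the full GCSFlagTarget2 class shown in a later example.

# Hypothetical usage of GCSFlagTarget2.
target = GCSFlagTarget2("gs://my-bucket/output/run-1/", client=GCSClient())
# True only once gs://my-bucket/output/run-1/_SUCCESS has been written.
print(target.exists())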
def run(self):
    tmp_txt_file = GCSClient().download(self.input()[0].path)
    text_arr = split_into_sentences(open(tmp_txt_file.name, "r").read())
    print(text_arr)

    tmp_db = atomic_file(self.temp_result_file_path)
    tmp_db.generate_tmp_path(tmp_db.path)
    db = Db(sqlite3.connect(tmp_db.tmp_path), Sql())
    db.setup(2)

    SENTENCE_SEPARATOR = '\n'
    WORD_SEPARATOR = ' '
    Parser("notused", db, SENTENCE_SEPARATOR,
           WORD_SEPARATOR).parse_array(text_arr)
    tmp_db.close()

    GCSClient().put(self.temp_result_file_path, self.output().path)
    tmp_txt_file.close()
def run(self):
    if not self.gs_source_path.startswith(
            'https://www.youtube.com/watch?v='):
        return
    yt = YouTube(self.gs_source_path)
    temp_result_file = yt.streams.filter(
        file_extension='mp4').first().download('../tmp', self.video_id)
    print("UploadFileOnStorage.run downloaded to: ", temp_result_file)
    GCSClient().put(temp_result_file, self.output().path)
def run(self):
    tmp_video_file = GCSClient().download(self.gs_path_video)
    tmp_audio_file = GCSClient().download(self.requires()[0].output().path)

    # 'ffmpeg -i Late_For_Work.mp4 -i Late_For_Work.mp4.voice.mp3 -c:v copy -map 0:v:0 -map 1:a:0 result.mp4'
    cmd = [
        'ffmpeg', '-y', '-i', tmp_video_file.name, '-i', tmp_audio_file.name,
        '-c:v', 'copy', '-map', '0:v:0', '-map', '1:a:0',
        self.temp_result_file
    ]
    print(cmd)
    subprocess.check_call(cmd)

    tmp_video_file.close()
    tmp_audio_file.close()
    GCSClient().put(self.temp_result_file, self.output().path)
def run(self):
    client = GCSClient()
    if self.string:
        client.put_string(contents=self.string,
                          dest_path=self.gs_path,
                          mimetype=self.mime_type)
    elif self.file_path:
        client.put(filename=self.file_path,
                   dest_path=self.gs_path,
                   mimetype=self.mime_type)
        if self.remove_file:
            os.remove(self.file_path)
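A hedged companion sketch for reading the uploaded object back through the same client; the object path below is illustrative.

from luigi.contrib.gcs import GCSClient, GCSTarget

client = GCSClient()
gs_path = "gs://my-bucket/uploads/example.txt"  # hypothetical object
if client.exists(gs_path):
    # GCSTarget.open('r') streams the object contents back down.
    with GCSTarget(gs_path, client=client).open("r") as fh:
        print(fh.read())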
def _get_input_schema(self):
    """Arbitrarily picks an object in input and reads the Avro schema from it."""
    assert avro, 'avro module required'

    input_target = flatten(self.input())[0]
    input_fs = input_target.fs if hasattr(input_target, 'fs') else GCSClient()
    input_uri = self.source_uris()[0]
    if '*' in input_uri:
        file_uris = list(input_fs.list_wildcard(input_uri))
        if file_uris:
            input_uri = file_uris[0]
        else:
            raise RuntimeError('No match for ' + input_uri)

    schema = []
    exception_reading_schema = []

    def read_schema(fp):
        # fp contains the part of the file downloaded so far. We rely on the
        # DataFileReader initializing itself as soon as the file header with the
        # schema has been downloaded, without requiring the remainder of the file...
        try:
            reader = avro.datafile.DataFileReader(fp, avro.io.DatumReader())
            schema[:] = [BigQueryLoadAvro._get_writer_schema(reader.datum_reader)]
        except Exception as e:
            # Save but assume benign unless schema reading ultimately fails. The benign
            # exception for an insufficiently large downloaded file part seems to be:
            # TypeError('ord() expected a character, but string of length 0 found',).
            exception_reading_schema[:] = [e]
            return False
        return True

    input_fs.download(input_uri, 64 * 1024, read_schema).close()

    if not schema:
        raise exception_reading_schema[0]

    return schema[0]
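The early exit above relies on GCSClient.download accepting a chunk size and a per-chunk callback; returning True from the callback stops the transfer. A minimal sketch of that pattern, with an assumed object path and header size:

from luigi.contrib.gcs import GCSClient

def read_header(fp):
    # fp holds the bytes downloaded so far; peek without disturbing the write position.
    pos = fp.tell()
    fp.seek(0)
    header = fp.read(16)
    fp.seek(pos)
    return len(header) >= 16  # True -> stop fetching further chunks

# Hypothetical object; download in 64 KiB chunks, stopping once the header is available.
GCSClient().download("gs://my-bucket/data/part-00000.avro", 64 * 1024, read_header).close()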
def _make_target_classes(self):
    '''Create client and target objects for the storage service.'''
    if self.storage_service == 'local':
        client = LocalFileSystem()
        target = LocalTarget
        flag_target = LocalTarget  # just use a normal target
    elif self.storage_service == 's3':
        client = S3Client(
            aws_access_key_id=self.config['aws_access_key_id'],
            aws_secret_access_key=self.config['aws_secret_access_key'])
        target = S3Target
        flag_target = S3FlagTarget
    elif self.storage_service == 'gcs':
        # Use GCP Service Account private key credentials to
        # authenticate with the GCS API. Service Accounts
        # are unique to the GCP project.
        key_json = json.loads(
            self.config['gcs_service_account_private_key_json'])
        cred = ServiceAccountCredentials.from_json_keyfile_dict(key_json)
        client = GCSClient(oauth_credentials=cred)

        # Hack around a bug in Luigi's GCS module
        class GCSFlagTarget2(GCSTarget):
            fs = None

            def __init__(self, path, format=None, client=None, flag='_SUCCESS'):
                if format is None:
                    format = luigi.format.get_default_format()
                if path[-1] != "/":
                    raise ValueError("GCSFlagTarget requires the path to be to a "
                                     "directory. It must end with a slash ( / ).")
                # This is the only line that's different
                super(GCSFlagTarget2, self).__init__(path, client=client)
                self.format = format
                self.fs = client or GCSClient()
                self.flag = flag

            def exists(self):
                flag_target = self.path + self.flag
                return self.fs.exists(flag_target)

        target = GCSTarget
        flag_target = GCSFlagTarget2
    else:
        raise EnvironmentError('Please add a known file_storage value to the config.')

    # Targets will be initialized many times in Luigi tasks.
    # Subclass the chosen Target and add the Client to it, so
    # you don't have to pass the Client to the DAG.
    init_kwargs = {'format': MixedUnicodeBytesFormat()}
    if self.storage_service != 'local':
        init_kwargs['client'] = client

    class TargetWithClient(target):
        def __init__(self, path):
            super(TargetWithClient, self).__init__(path, **init_kwargs)

    class FlagTargetWithClient(flag_target):
        def __init__(self, path):
            super(FlagTargetWithClient, self).__init__(path, **init_kwargs)

    self.client = client
    self.target_class = TargetWithClient
    self.flag_target_class = FlagTargetWithClient
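A hedged sketch of how a task might consume the classes built here; `storage` stands for an instance of the class that defines _make_target_classes(), and the bucket path is illustrative.

# Sketch only: inside a Luigi task that has access to the configured object above.
def output(self):
    # The bound class already carries the client and MixedUnicodeBytesFormat,
    # so only the path is passed here.
    return storage.target_class(
        "gs://my-bucket/reports/{}.csv".format(self.date.isoformat()))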
def run(self):
    client = GCSClient()
    source_path = f"resources/csv/{self.date.isoformat()}.csv"
    client.put(source_path,
               dest_path=f"gs://luigi_example/{self.date.isoformat()}.csv")
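A hedged companion sketch: declaring the uploaded object as the task's output lets Luigi skip the upload once it already exists. The bucket comes from the snippet above; the method is an assumption about the surrounding task, with GCSTarget imported from luigi.contrib.gcs.

def output(self):
    # Same gs:// path as the put() above, so completeness tracks the upload.
    return GCSTarget(f"gs://luigi_example/{self.date.isoformat()}.csv",
                     client=GCSClient())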
def run(self): print(">>>> Run GenVoiceFile") random.seed(self.random_seed) #with self.input()[0].open() as json_data: # d = json.load(json_data) # labels = self.json_labels_to_pd(d) labels = pd.read_csv(self.input()['labels_csv'].open(), header=None, names=['Label', 'category', 'start', 'end']) print(labels) if self.text_generator == 'markov': tmp_file_db = GCSClient().download(self.input()['markov_db'].path) print("GenVoiceFile.run markov_db tmp file path: ", tmp_file_db.name) self.generator = MarkovTextGenerator( tmp_file_db.name) #'combined.db') usedBefore = set() fullTrack = self.textToAudioSegment("") curr_time_mksec = fullTrack.duration_seconds * 1000000 video_duration_mksec = max(labels['end']) * 1000000 print('video_duration_mksec:' + str(video_duration_mksec)) while (curr_time_mksec < video_duration_mksec): observedBefore = set( labels[labels['start'] * 1000000 < curr_time_mksec]['Label']) candidates = list(observedBefore - usedBefore) seedWords = 'Hmmm' if len(candidates) > 0: seedWords = candidates[0] elif len(usedBefore) > 0: # Need to do this if list of detected words is too small seedWords = random.sample(usedBefore, 1)[0] print("RANDOM seed word!") seedWordsToGen = seedWords.lower() print('curr_time_mksec:' + str(curr_time_mksec) + ' ' + seedWordsToGen) wordsToSay = self.generator.get_text("TODO: full text so far", seedWordsToGen) acceptable_len = 140 shortest_wordsToSay = wordsToSay for _ in range(5): print(f"Generated sentence length: {len(wordsToSay)}") if len(wordsToSay) < acceptable_len: shortest_wordsToSay = wordsToSay break wordsToSay = self.generator.get_text("TODO: full text so far", seedWordsToGen) if len(wordsToSay) < len(shortest_wordsToSay): shortest_wordsToSay = wordsToSay acceptable_len = acceptable_len + 10 wordsToSay = shortest_wordsToSay print(wordsToSay) wordsToSay_corrected = str(TextBlob(wordsToSay).correct()) if wordsToSay_corrected != wordsToSay: wordsToSay = wordsToSay_corrected print("CORRECTED to: " + wordsToSay) print(f"Generated sentence length, final: {len(wordsToSay)}") segment = self.textToAudioSegment(wordsToSay) print(segment.duration_seconds) print(fullTrack.duration_seconds) print("---------") if (segment.duration_seconds + fullTrack.duration_seconds ) * 1000000 > video_duration_mksec: break fullTrack = fullTrack + segment # print seedWords in usedBefore usedBefore = usedBefore.union(set([seedWords])) curr_time_mksec = fullTrack.duration_seconds * 1000000 fullTrack.export(self.temp_result_file, format="mp3") GCSClient().put(self.temp_result_file, self.output().path)