def _success_hook(self):
     """Create an empty ``_SUCCESS`` marker file inside the job directory.

     When ``is_gcs_path`` reports the job directory as a GCS path the marker
     is written with ``GCSClient.put_string``; otherwise the directory is
     treated as a local path and the file is created with an append-mode
     ``open`` (an existing marker is left untouched).
     """
     marker_path = "/".join((self.get_job_dir().rstrip("/"), "_SUCCESS"))
     if not is_gcs_path(self.get_job_dir()):
         # Anything that is not GCS is assumed to be the local filesystem.
         with open(marker_path, "a"):
             pass
         return

     # Imported only on this branch, so local job dirs never load the module.
     from luigi.contrib.gcs import GCSClient
     GCSClient().put_string("", marker_path)
예제 #2
0
    def run(self):
        """Download ``{self.bucket}/{self.filename}`` from GCS to the output path.

        Service-account credentials are parsed from the JSON stored in the
        ``GOOGLE_APPLICATION_CREDENTIALS_JSON`` environment variable. The
        downloaded temporary file is moved onto ``self.output().path``.

        Raises:
            RuntimeError: if ``GOOGLE_APPLICATION_CREDENTIALS_JSON`` is unset.
        """
        raw_credentials = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
        if raw_credentials is None:
            # json.loads(None) would fail with an opaque TypeError; name the
            # missing setting instead.
            raise RuntimeError(
                'GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable '
                'is not set')
        service_account_info = json.loads(raw_credentials)
        credentials = service_account.Credentials.from_service_account_info(
            service_account_info)

        client = GCSClient(oauth_credentials=credentials)

        file_path = f'{self.bucket}/{self.filename}'

        fp = client.download(file_path)
        try:
            self.output().makedirs()

            os.replace(fp.name, self.output().path)
        finally:
            # Release the handle on success and failure alike. After a
            # successful move the temp path no longer exists, so a
            # delete-on-close handle may have nothing left to remove.
            try:
                fp.close()
            except FileNotFoundError:
                pass
예제 #3
0
    def run(self):
        """Zip the results folder and upload the archive to GCS.

        The run name is read verbatim from the ``run_name`` input file and is
        used both as the archive base name and in the destination object name
        ``{self.bucket}/{run_name}.zip``. Credentials are parsed from the
        JSON in the ``GOOGLE_APPLICATION_CREDENTIALS_JSON`` environment
        variable.

        Raises:
            RuntimeError: if ``GOOGLE_APPLICATION_CREDENTIALS_JSON`` is unset.
        """
        result_folder = self.input()['results'].path

        with open(self.input()['run_name'].path, 'r') as f:
            run_name = f.read()

        # make_archive returns the path of the archive it created; that path
        # is what gets uploaded below.
        zipfile_name = shutil.make_archive(run_name, 'zip', result_folder)

        raw_credentials = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
        if raw_credentials is None:
            # json.loads(None) would fail with an opaque TypeError; name the
            # missing setting instead.
            raise RuntimeError(
                'GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable '
                'is not set')
        service_account_info = json.loads(raw_credentials)
        credentials = service_account.Credentials.from_service_account_info(
            service_account_info)

        client = GCSClient(oauth_credentials=credentials)

        client.put(zipfile_name, f"{self.bucket}/{run_name}.zip")
예제 #4
0
 def __init__(self, path, format=None, client=None, flag='_SUCCESS'):
     """Initialise a flag target rooted at a directory-style path.

     Args:
         path: Target path; must end with ``/``.
         format: Luigi format; defaults to ``luigi.format.get_default_format()``.
         client: Filesystem client; a default ``GCSClient`` is built when
             none (or a falsy value) is given.
         flag: Name of the flag file, stored on ``self.flag``.

     Raises:
         ValueError: if ``path`` does not end with a slash (including an
             empty path).
     """
     if format is None:
         format = luigi.format.get_default_format()
     # endswith() rejects an empty path with the ValueError below, where
     # indexing path[-1] would raise IndexError instead.
     if not path.endswith("/"):
         raise ValueError("GCSFlagTarget requires the path to be to a "
                          "directory.  It must end with a slash ( / ).")
     # Resolve the client once up front so the parent receives the same
     # instance that is stored on self.fs.
     client = client or GCSClient()
     # This is the only line that's different
     super(GCSFlagTarget2, self).__init__(path, client=client)
     self.format = format
     self.fs = client
     self.flag = flag
예제 #5
0
 def run(self):
     """Parse the input text into a SQLite database and upload it to GCS.

     Downloads the first input from GCS, splits its text into sentences,
     feeds them through ``Parser`` into a database opened at the atomic
     file's temporary path, then uploads ``self.temp_result_file_path`` to
     ``self.output().path``.
     """
     tmp_txt_file = GCSClient().download(self.input()[0].path)
     try:
         # Read through a context manager so the handle is always closed.
         with open(tmp_txt_file.name, "r") as text_file:
             text_arr = split_into_sentences(text_file.read())
     finally:
         # The download is only needed for the read above; close it even
         # if reading or sentence splitting fails.
         tmp_txt_file.close()

     print(text_arr)

     tmp_db = atomic_file(self.temp_result_file_path)
     # NOTE(review): the return value is discarded here; kept as in the
     # original - confirm whether this call is needed at all.
     tmp_db.generate_tmp_path(tmp_db.path)
     db = Db(sqlite3.connect(tmp_db.tmp_path), Sql())
     db.setup(2)

     SENTENCE_SEPARATOR = '\n'
     WORD_SEPARATOR = ' '

     Parser("notused", db, SENTENCE_SEPARATOR, WORD_SEPARATOR).parse_array(text_arr)
     # NOTE(review): presumably closing the atomic file moves the temp DB
     # to self.temp_result_file_path before the upload - confirm.
     tmp_db.close()

     GCSClient().put(self.temp_result_file_path, self.output().path)
예제 #6
0
    def run(self):
        """Download a YouTube video as mp4 and upload it to the output path.

        Does nothing when ``self.gs_source_path`` does not start with
        ``https://www.youtube.com/watch?v=``.
        """
        if self.gs_source_path.startswith('https://www.youtube.com/watch?v='):
            video = YouTube(self.gs_source_path)
            mp4_stream = video.streams.filter(file_extension='mp4').first()

            # Whatever download() returns is what gets uploaded below.
            temp_result_file = mp4_stream.download('../tmp', self.video_id)
            print("UploadFileOnStorage.run downloaded to: ", temp_result_file)

            GCSClient().put(temp_result_file, self.output().path)
예제 #7
0
    def run(self):
        """Mux the video with the generated audio track and upload the result.

        Downloads the video (``self.gs_path_video``) and the audio produced
        by the first required task, runs ffmpeg to copy the video stream and
        take the audio from the second input, writes
        ``self.temp_result_file`` and uploads it to ``self.output().path``.

        Raises:
            subprocess.CalledProcessError: if ffmpeg exits with a non-zero
                status.
        """
        tmp_video_file = GCSClient().download(self.gs_path_video)
        try:
            tmp_audio_file = GCSClient().download(
                self.requires()[0].output().path)
            try:
                #'ffmpeg -i Late_For_Work.mp4 -i Late_For_Work.mp4.voice.mp3 -c:v copy -map 0:v:0 -map 1:a:0 result.mp4'
                cmd = [
                    'ffmpeg', '-y', '-i', tmp_video_file.name, '-i',
                    tmp_audio_file.name, '-c:v', 'copy', '-map', '0:v:0',
                    '-map', '1:a:0', self.temp_result_file
                ]
                print(cmd)
                subprocess.check_call(cmd)
            finally:
                # Close the download even when ffmpeg fails.
                tmp_audio_file.close()
        finally:
            # Close the download even when the audio fetch or ffmpeg fails.
            tmp_video_file.close()

        GCSClient().put(self.temp_result_file, self.output().path)
예제 #8
0
 def run(self):
     """Upload either an in-memory string or a local file to ``self.gs_path``.

     A truthy ``self.string`` takes precedence and is uploaded with
     ``put_string``. Otherwise a truthy ``self.file_path`` is uploaded with
     ``put`` and, when ``self.remove_file`` is set, deleted locally
     afterwards. If neither is set nothing is uploaded.
     """
     client = GCSClient()
     if self.string:
         client.put_string(contents=self.string, dest_path=self.gs_path, mimetype=self.mime_type)
         return
     if not self.file_path:
         return

     client.put(filename=self.file_path, dest_path=self.gs_path, mimetype=self.mime_type)
     if self.remove_file:
         os.remove(self.file_path)
예제 #9
0
    def _get_input_schema(self):
        """Arbitrarily picks an object in input and reads the Avro schema from it."""
        assert avro, 'avro module required'

        first_target = flatten(self.input())[0]
        if hasattr(first_target, 'fs'):
            input_fs = first_target.fs
        else:
            input_fs = GCSClient()

        input_uri = self.source_uris()[0]
        if '*' in input_uri:
            # Expand the wildcard and settle on the first match.
            matches = list(input_fs.list_wildcard(input_uri))
            if not matches:
                raise RuntimeError('No match for ' + input_uri)
            input_uri = matches[0]

        # One-element lists act as mutable cells the callback can write into.
        schema = []
        exception_reading_schema = []

        def read_schema(fp):
            # fp holds only the part of the file downloaded so far. This relies on
            # DataFileReader initialising fine once the header with the schema has
            # arrived, without needing the rest of the file.
            try:
                reader = avro.datafile.DataFileReader(fp, avro.io.DatumReader())
                writer_schema = BigQueryLoadAvro._get_writer_schema(
                    reader.datum_reader)
                schema[:] = [writer_schema]
            except Exception as e:
                # Remember the error but treat it as benign unless reading the
                # schema ultimately fails. With too small a downloaded part the
                # benign exception seems to be:
                # TypeError('ord() expected a character, but string of length 0 found',).
                exception_reading_schema[:] = [e]
                return False
            else:
                return True

        downloaded = input_fs.download(input_uri, 64 * 1024, read_schema)
        downloaded.close()
        if schema:
            return schema[0]
        raise exception_reading_schema[0]
예제 #10
0
    def _make_target_classes(self):
        '''Create client and target objects for storage service.

        Selects a client, a target class and a flag-target class according
        to ``self.storage_service`` (``'local'``, ``'s3'`` or ``'gcs'``) and
        stores them on ``self.client``, ``self.target_class`` and
        ``self.flag_target_class``. The stored classes take only ``path``
        and supply the format (and, for non-local storage, the client)
        themselves.

        Raises:
            EnvironmentError: if ``self.storage_service`` is not one of the
                known values.
        '''

        if self.storage_service == 'local':
            client = LocalFileSystem()
            target = LocalTarget
            flag_target = LocalTarget # just use a normal target

        elif self.storage_service == 's3':
            client = S3Client(
                aws_access_key_id=self.config['aws_access_key_id'],
                aws_secret_access_key=self.config['aws_secret_access_key'])
            target = S3Target
            flag_target = S3FlagTarget

        elif self.storage_service == 'gcs':

            # Use GCP Service Account private key credentials to
            # authenticate with the GCS API. Service Accounts
            # are unique to the GCP project.
            key_json = json.loads(
                    self.config['gcs_service_account_private_key_json'])
            cred = ServiceAccountCredentials.from_json_keyfile_dict(key_json)
            client = GCSClient(oauth_credentials=cred)

            # Hack around a bug in Luigi's GCS module
            class GCSFlagTarget2(GCSTarget):
                fs = None
                def __init__(self, path, format=None, client=None, flag='_SUCCESS'):
                    if format is None:
                        format = luigi.format.get_default_format()
                    # endswith() rejects an empty path with the ValueError
                    # below, where path[-1] would raise IndexError instead.
                    if not path.endswith("/"):
                        raise ValueError("GCSFlagTarget requires the path to be to a "
                                         "directory.  It must end with a slash ( / ).")
                    # Resolve the client once so the parent receives the
                    # same instance that is stored on self.fs.
                    client = client or GCSClient()
                    # This is the only line that's different
                    super(GCSFlagTarget2, self).__init__(path, client=client)
                    self.format = format
                    self.fs = client
                    self.flag = flag
                def exists(self):
                    # The target exists once its flag file does.
                    flag_target = self.path + self.flag
                    return self.fs.exists(flag_target)

            target = GCSTarget
            flag_target = GCSFlagTarget2
        else:
            raise EnvironmentError('Please add known file_storage value to config.')

        # Targets will be initialized many times in luigi tasks.
        # Subclass the chosen Target and add the Client to it, so
        # you don't have to pass the Client to the DAG.

        init_kwargs = {
            'format': MixedUnicodeBytesFormat()
        }
        if self.storage_service != 'local':
            # Local targets are constructed without a client argument.
            init_kwargs['client'] = client

        class TargetWithClient(target):
            def __init__(self, path):
                super(TargetWithClient, self).__init__(path, **init_kwargs)

        class FlagTargetWithClient(flag_target):
            def __init__(self, path):
                super(FlagTargetWithClient, self).__init__(path, **init_kwargs)


        self.client = client
        self.target_class = TargetWithClient
        self.flag_target_class = FlagTargetWithClient
예제 #11
0
 def run(self):
     """Upload the CSV named after ``self.date`` to the ``luigi_example`` bucket.

     The local file ``resources/csv/<date>.csv`` is uploaded to
     ``gs://luigi_example/<date>.csv``, where ``<date>`` is
     ``self.date.isoformat()``.
     """
     client = GCSClient()
     day = self.date.isoformat()
     local_csv = f"resources/csv/{day}.csv"
     remote_csv = f"gs://luigi_example/{day}.csv"
     client.put(local_csv, dest_path=remote_csv)
예제 #12
0
    def run(self):
        """Generate a spoken audio track for the labelled video and upload it.

        Reads the label rows (``Label``, ``category``, ``start``, ``end``)
        from the ``labels_csv`` input, then keeps generating sentences seeded
        by labels whose ``start`` precedes the current track position,
        appending each synthesized segment until the track would exceed the
        largest ``end`` value. The track is exported as mp3 to
        ``self.temp_result_file`` and uploaded to ``self.output().path``.

        When ``self.text_generator`` is ``'markov'`` the ``markov_db`` input
        is downloaded and used to build ``self.generator``; otherwise
        ``self.generator`` must already be set by the caller.
        """
        print(">>>> Run GenVoiceFile")

        random.seed(self.random_seed)

        #with self.input()[0].open() as json_data:
        #    d = json.load(json_data)
        #    labels = self.json_labels_to_pd(d)

        # Close the labels handle once pandas has consumed it.
        with self.input()['labels_csv'].open() as labels_file:
            labels = pd.read_csv(labels_file,
                                 header=None,
                                 names=['Label', 'category', 'start', 'end'])
        print(labels)

        if self.text_generator == 'markov':
            tmp_file_db = GCSClient().download(self.input()['markov_db'].path)
            print("GenVoiceFile.run markov_db tmp file path: ",
                  tmp_file_db.name)
            self.generator = MarkovTextGenerator(
                tmp_file_db.name)  #'combined.db')

        usedBefore = set()

        fullTrack = self.textToAudioSegment("")

        curr_time_mksec = fullTrack.duration_seconds * 1000000

        video_duration_mksec = max(labels['end']) * 1000000
        print('video_duration_mksec:' + str(video_duration_mksec))
        while (curr_time_mksec < video_duration_mksec):
            observedBefore = set(
                labels[labels['start'] * 1000000 < curr_time_mksec]['Label'])
            candidates = list(observedBefore - usedBefore)

            seedWords = 'Hmmm'

            if len(candidates) > 0:
                seedWords = candidates[0]
            elif len(usedBefore) > 0:
                # Need to do this if list of detected words is too small.
                # random.sample() rejects sets on Python 3.11+, and set
                # order is not stable across runs; sample from a sorted
                # list so the seeded RNG gives a reproducible pick.
                seedWords = random.sample(sorted(usedBefore), 1)[0]
                print("RANDOM seed word!")

            seedWordsToGen = seedWords.lower()
            print('curr_time_mksec:' + str(curr_time_mksec) + ' ' +
                  seedWordsToGen)
            wordsToSay = self.generator.get_text("TODO: full text so far",
                                                 seedWordsToGen)

            # Retry up to five times for a sentence under the length limit,
            # relaxing the limit by 10 each round and remembering the
            # shortest candidate seen so far.
            acceptable_len = 140
            shortest_wordsToSay = wordsToSay

            for _ in range(5):
                print(f"Generated sentence length: {len(wordsToSay)}")
                if len(wordsToSay) < acceptable_len:
                    shortest_wordsToSay = wordsToSay
                    break
                wordsToSay = self.generator.get_text("TODO: full text so far",
                                                     seedWordsToGen)
                if len(wordsToSay) < len(shortest_wordsToSay):
                    shortest_wordsToSay = wordsToSay
                acceptable_len = acceptable_len + 10

            wordsToSay = shortest_wordsToSay
            print(wordsToSay)
            wordsToSay_corrected = str(TextBlob(wordsToSay).correct())
            if wordsToSay_corrected != wordsToSay:
                wordsToSay = wordsToSay_corrected
                print("CORRECTED to: " + wordsToSay)

            print(f"Generated sentence length, final: {len(wordsToSay)}")

            segment = self.textToAudioSegment(wordsToSay)

            print(segment.duration_seconds)
            print(fullTrack.duration_seconds)
            print("---------")

            # Stop before the track would run past the end of the video.
            if (segment.duration_seconds + fullTrack.duration_seconds
                ) * 1000000 > video_duration_mksec:
                break
            fullTrack = fullTrack + segment

            # print seedWords in usedBefore

            usedBefore.add(seedWords)

            curr_time_mksec = fullTrack.duration_seconds * 1000000

        fullTrack.export(self.temp_result_file, format="mp3")

        GCSClient().put(self.temp_result_file, self.output().path)