def upload_folder(root, folder='', bucket_name='illiad-audio', make_public=False):
    """
    Uploads a folder to Google Cloud Storage.
    :param root: path to the directory that contains the folder
    :param folder: name of the folder to upload
    :param bucket_name: name of the bucket to upload to
    :param make_public: if True, make each uploaded blob publicly readable
    :return: list of gs:// blob names and list of public URLs
    """
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob_names = []
    urls = []
    files = os.listdir(os.path.join(root, folder))
    for i, file in enumerate(files):
        progress_bar(i, len(files), text="uploading file:")
        blob_name = "%s/%s" % (folder, file)
        blob = bucket.blob(blob_name)
        with open(os.path.join(root, folder, file), 'rb') as f:
            blob.upload_from_file(f)
        blob_names.append("gs://%s/%s" % (bucket_name, blob_name))
        if make_public:
            print("making %s public" % blob_name)
            blob.make_public()
            urls.append(unquote(blob.public_url))
    return blob_names, urls
def upload_files(file_list, bucket_name='illiad-audio', make_public=False):
    """
    Uploads a list of files to Google Cloud Storage.
    :param file_list: list of paths to the files
    :param bucket_name: name of the bucket to upload to
    :param make_public: if True, make each uploaded blob publicly readable
    :return: list of gs:// blob names and list of public URLs
    """
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob_names = []
    urls = []
    for i, path in enumerate(file_list):
        progress_bar(i, len(file_list), text="uploading file:")
        path = os.path.abspath(path)
        # use the parent directory and file name as the blob name
        blob_name = "/".join(path.split("/")[-2:])
        blob = bucket.blob(blob_name)
        with open(path, 'rb') as f:
            blob.upload_from_file(f)
        blob_names.append("gs://%s/%s" % (bucket_name, blob_name))
        if make_public:
            print("making %s public" % blob_name)
            blob.make_public()
            urls.append(unquote(blob.public_url))
    return blob_names, urls
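# Example usage (a minimal sketch; the local paths below are assumptions, not values
# from this repo):
#
#   blob_names, urls = upload_folder("/data/recordings", folder="interview_01_split",
#                                    bucket_name="illiad-audio", make_public=True)
#   blob_names, urls = upload_files(["/data/recordings/intro.flac"], make_public=False)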
def slice_file(file):
    rate, data = sf.read(file)
    # promote mono audio to two identical channels
    if len(data.shape) == 1:
        data = np.vstack((data, data)).T
    all_midpoints, all_startpoints = find_divisions(data, SILENCE, rate)
    print("Number of found midpoints is \n"
          "channel1: %d \n"
          "channel2: %d \n" % (len(all_midpoints[0]), len(all_midpoints[1])))
    directory, file = os.path.split(file)
    name = file.split(".")[0]
    flac_dir = "%s/%s_split" % (directory, name)
    if not os.path.isdir(flac_dir):
        os.mkdir(flac_dir)
    for i, channel in enumerate(all_midpoints):
        # pair consecutive midpoints so each slice spans one segment of the channel
        for j, pair in enumerate(zip([0] + channel, channel + [data.shape[0]])):
            segment = data[pair[0]:pair[1], i]
            startpoint = all_startpoints[i][j]
            time_in_sec = float(startpoint) / float(rate)
            outfile = encode_filename(name, channel=i, timestamp=time_in_sec, extension="flac")
            progress_bar(j, len(channel) + 1, "writing flac file: ")
            sf.write(os.path.join(flac_dir, outfile), segment, rate)
    return flac_dir
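# Example usage (a minimal sketch; the input path is an assumption):
#
#   flac_dir = slice_file("/data/recordings/interview_01.wav")
#   print("slices written to", flac_dir)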
def test(model):
    model.eval()  # disable dropout
    test_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, labels) in enumerate(valloader):
            if use_gpu:
                inputs = Variable(data.cuda())
                labels = Variable(labels.cuda())
            else:
                inputs, labels = Variable(data), Variable(labels)

            # forward
            outputs = model(inputs)
            _, preds = torch.max(outputs.data, 1)
            loss = criterion(outputs, labels)

            # statistics
            test_loss += loss.item()
            correct += preds.eq(labels).sum().item()
            total = len(valset)

            progress_bar(batch_idx, len(valloader),
                         'Test Loss: %.3f | Test Acc: %.3f%% (c:%d/t:%d)'
                         % (test_loss / (batch_idx + 1), 100. * correct / total, correct, total))

    # save the model when this accuracy improves
    acc = 100. * correct / total
    return acc
def train(model, criterion, optimizer, scheduler):
    model.train()  # enable dropout
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (data, labels) in enumerate(trainloader):
        if use_gpu:  # if a GPU is available
            inputs = Variable(data.cuda())
            labels = Variable(labels.cuda())
        else:
            inputs, labels = Variable(data), Variable(labels)

        optimizer.zero_grad()

        # forward
        outputs = model(inputs)
        _, preds = torch.max(outputs.data, 1)
        loss = criterion(outputs, labels)
        loss.backward()   # back propagation
        optimizer.step()  # the scheduler multiplies the learning rate by m every n epochs

        train_loss += loss.item()
        correct += preds.eq(labels).sum().item()
        total = len(trainset)

        progress_bar(batch_idx, len(trainloader),
                     'Train Loss: %.3f | Train Acc: %.3f%% (c:%d/t:%d)'
                     % (train_loss / (batch_idx + 1), 100. * correct / total, correct, total))

    # loss and accuracy at the end of the epoch
    print('Train Loss: {:.4f}, Train Acc: {:.4f} %'.format(
        train_loss / (batch_idx + 1), 100. * correct / total))
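# Example epoch loop (a minimal sketch; NUM_EPOCHS, best_acc, and the checkpoint
# path are assumptions, not defined in this module):
#
#   best_acc = 0.0
#   for epoch in range(NUM_EPOCHS):
#       train(model, criterion, optimizer, scheduler)
#       scheduler.step()                  # adjust the learning rate every n epochs
#       acc = test(model)
#       if acc > best_acc:                # save the model when accuracy improves
#           best_acc = acc
#           torch.save(model.state_dict(), "best_model.pth")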
def transcribe_slices(uri_list, name="", save_intermediate=True):
    """
    Transcribe a list of uri's that are all part of the same recording
    :param uri_list: list of gs:// uris pointing to the audio slices
    :param name: base name used for word ids (overridden by filename metadata)
    :param save_intermediate: periodically write progress to TEMP_FILE
    :return: list of word dicts forming the transcript
    """
    transcript = []
    word_num = 0
    complete_operations = np.zeros((len(uri_list),))
    for i, uri in enumerate(uri_list):
        progress_bar(i, len(uri_list), text="transcribing uri: ")
        alternatives = get_google_transcription(uri)
        file = uri.split("//")[-1]
        metadata = decode_filename(file)
        for j, alternative in enumerate(alternatives):
            if not alternative:
                continue
            word = {"text": alternative.transcript,
                    "confidence": alternative.confidence}
            if "channel" in metadata:
                word["speaker"] = metadata["channel"]
            if "timestamp" in metadata:
                word["starttime"] = metadata["timestamp"]
            if "name" in metadata:
                name = metadata["name"]
            word["id"] = "%s_%d" % (name, word_num)
            transcript.append(word)
            word_num += 1
        complete_operations[i] = 1
        if i % INTERMEDIATE_SAVE_TIMESTEPS == 0 and save_intermediate:
            intermediate = {"uri_list": uri_list,
                            "complete_uris": list(complete_operations),
                            "transcript": transcript}
            with open(TEMP_FILE, 'w') as f:
                f.write(json.dumps(intermediate))
    if save_intermediate:
        os.remove(TEMP_FILE)
    return transcript
def transcribe_in_parallel(uri_list, name=None, save_intermediate=True):
    transcript = []
    word_num = 0
    uri_index = 0
    clients = []
    for i in range(0, MAX_CLIENTS):
        clients.append(speech.Client())
    # process MAX_CLIENTS uris at a time
    for k in range(0, len(uri_list), MAX_CLIENTS):
        operations = []
        if k + MAX_CLIENTS < len(uri_list):
            uris = uri_list[k:k + MAX_CLIENTS]
        else:
            uris = uri_list[k:]
        # start one long-running recognize operation per uri in the batch
        for i, uri in enumerate(uris):
            audio_sample = clients[i].sample(content=None, source_uri=uri, encoding='FLAC')
            operations.append(audio_sample.long_running_recognize('en-US'))
        # poll until every operation in the batch has finished
        complete_operations = np.zeros((len(operations),))
        while np.sum(complete_operations) != len(operations):
            time.sleep(2)
            incomplete_operations = np.where(complete_operations == 0)[0]
            for index in incomplete_operations:
                operation = operations[index]
                try:
                    operation.poll()
                except ValueError:
                    print("valueerror")
                if operation.complete:
                    complete_operations[index] = 1
                    results = operation.results
                    uri_index += 1
                    if results:
                        file = uri_list[k + index].split("//")[-1]
                        metadata = decode_filename(file)
                        for alternative in results:
                            if not alternative:
                                continue
                            word = {"text": alternative.transcript,
                                    "confidence": alternative.confidence}
                            if "channel" in metadata:
                                word["speaker"] = metadata["channel"]
                            if "timestamp" in metadata:
                                word["starttime"] = metadata["timestamp"]
                            if "name" in metadata:
                                name = metadata["name"]
                            word["id"] = "%s_%d" % (name, word_num)
                            transcript.append(word)
                            word_num += 1
                    progress_bar(uri_index, len(uri_list), text="transcribed uris: ")
        if save_intermediate:
            intermediate = {"uri_list": uri_list,
                            "complete_uris": list(complete_operations),
                            "transcript": transcript}
            with open(TEMP_FILE, 'w') as f:
                f.write(json.dumps(intermediate))
    if save_intermediate:
        os.remove(TEMP_FILE)
    return transcript
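# Example usage (a minimal sketch; it assumes the flac slices were already uploaded
# with upload_folder above, and the output path is arbitrary):
#
#   blob_names, _ = upload_folder("/data/recordings", folder="interview_01_split")
#   transcript = transcribe_in_parallel(blob_names, name="interview_01")
#   with open("interview_01_transcript.json", "w") as f:
#       f.write(json.dumps(transcript))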
headers['Accept-Language'] = 'en-US,en;q=0.9'
res = utility.request_url(url, headers)
length = res.getheader('content-length')
if length:
    length = int(length)
if os.path.isfile(file):
    if os.path.getsize(file) == length:
        logger.debug('same file {} with the same size exists, do not download again'.format(file))
        print('file {} already exists, cancelling download.'.format(file))
        exit(0)
block_size = 1024 * 16
if length:
    utility.progress_bar(0, length, prefix='Progress:', suffix='Complete', length=50)
with open(dn_file, "wb") as f:
    size = 0
    while True:
        data = res.read(block_size)
        if not data:
            if length and size != length:
                raise Exception('{} downloaded size is not the same as the expected length.'.format(dn_file))
            break
        f.write(data)
        size += len(data)
        if length:
            utility.progress_bar(size, length, prefix='Progress:', suffix='Complete', length=50)
if os.path.isfile(file):
                ((long_regions[:, 1] - long_regions[:, 0]) / 2
                 + long_regions[:, 0]).astype("int32")))
        all_midpoints.append(midpoints)
    return all_midpoints


if __name__ == "__main__":
    rate, data = wavfile.read(path)
    all_midpoints = find_divisions(data, SILENCE, rate)
    print("Number of found midpoints is \n"
          "channel1: %d \n"
          "channel2: %d \n" % (len(all_midpoints[0]), len(all_midpoints[1])))
    directory, file = os.path.split(path)
    name = file.split(".")[0]
    flac_dir = "%s/%s_split" % (directory, name)
    if not os.path.isdir(flac_dir):
        os.mkdir(flac_dir)
    for i, channel in enumerate(all_midpoints):
        for j, midpoint in enumerate(channel):
            if j == 0:
                slice = data[0:midpoint, i]
            elif j == len(channel) - 1:
                slice = data[midpoint:-1, i]
            else:
                slice = data[channel[j - 1]:midpoint, i]
            time_in_sec = float(midpoint) / float(rate)
            outfile = "%s/%s_channel_%d_timestamp_%d.flac" % (
                flac_dir, name, i, int(100 * time_in_sec))
            progress_bar(j, len(channel))
            sf.write(outfile, slice, rate)
def download_tasks(size):
    global all_topics
    cnt_waiting, cnt_doing, cnt_completed, cnt_terminated, cnt_killed = update_download_tasks_status()
    cnt_topic_waiting, cnt_topic_completed, cnt_topic_downloading, cnt_topic_failed = count_topic_download_status()
    free_space = size - cnt_doing - cnt_waiting
    cnt_topic_remains = 0
    cnt = 0
    for topic in all_topics:
        if cnt_topic_completed + cnt_topic_failed < len(all_topics):  # to avoid double progress bar
            utility.progress_bar(cnt_topic_completed + cnt_topic_failed, len(all_topics),
                                 prefix='Scanning:', suffix='Complete', length=50)
        if 'download' not in topic:
            if free_space > 0:
                cnt += 1
                utility.progress_bar(cnt_topic_completed + cnt_topic_failed, len(all_topics),
                                     prefix='Loading :', suffix='{}/{} '.format(cnt, free_space),
                                     length=50)
                mid = topic['mid']
                aid = topic['aid']
                cid = topic['cid']
                title = topic['title']
                videos = get_videos(mid, aid, cid, title)
                if videos is None:
                    topic['url'] = ''
                    save_failed_download(topic)
                else:
                    for video in videos:
                        video['status'] = 0  # waiting to start
                        video['process'] = None
                    all_downloads.extend(videos)
                    topic['download'] = videos
                    free_space -= len(videos)
            else:
                cnt_topic_remains += 1
    trigger_downloads(size)
    cnt_waiting, cnt_doing, cnt_completed, cnt_terminated, cnt_killed = update_download_tasks_status()
    cnt_topic_waiting, cnt_topic_completed, cnt_topic_downloading, cnt_topic_failed = count_topic_download_status()
    utility.progress_bar(cnt_topic_completed + cnt_topic_failed, len(all_topics),
                         prefix='Progress:', suffix='Complete', length=50)
    # reschedule itself while there is still work queued or in flight
    if cnt_topic_remains > 0 or cnt_waiting > 0 or cnt_doing > 0:
        timer = threading.Timer(2.0, download_tasks, [size])
        timer.start()
def load_topics(urls, words=None):
    if not check_urls(urls):
        exit(-1)
    results = []
    idx = 0
    for url in urls:
        idx += 1
        print('processing url(s) {}/{} : {}'.format(idx, len(urls), url))
        mid = (url.split('/'))[3]
        if len(mid) == 0:
            logger.warning('error : failed to locate mid')
        else:
            print('loading topics primary data ...')
            page = 1
            topics = get_topics(mid, page)
            if topics is not None:
                pages = topics[0]['pages']
                count = topics[0]['count']
                print('{} topics in {} pages to be loaded'.format(count, pages))
                # reload and apply keywords
                topics = []
                utility.progress_bar(0, pages, prefix='Progress:', suffix='Complete', length=50)
                page = 0
                while page < pages:
                    page += 1
                    new_page_topics = get_topics(mid, page, words)
                    if new_page_topics is not None:
                        topics.extend(new_page_topics)
                    utility.progress_bar(page, pages, prefix='Progress:', suffix='Complete', length=50)
                if len(topics) > 0:
                    print('loading cid for {} topic(s)'.format(len(topics)))
                    step = 0
                    utility.progress_bar(step, len(topics), prefix='Progress:', suffix='Complete', length=50)
                    for topic in topics:
                        step += 1
                        ref = topic['ref']
                        title = topic['title']
                        aid = topic['aid']
                        cid = get_cid(aid, ref)
                        if cid is not None:
                            results.append(dict(mid=mid, aid=aid, cid=cid, title=title, url=url))
                        else:
                            # failed to get cid, save to error file
                            save_failed_download(dict(mid=mid, aid=aid, cid='', title=title, url=url))
                        utility.progress_bar(step, len(topics), prefix='Progress:', suffix='Complete', length=50)
    print('{} topics loaded.'.format(len(results)))
    return results
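# Example usage (a minimal sketch; the space URL and keyword are assumptions, and it
# assumes the caller stores the result in the module-level all_topics that
# download_tasks() reads):
#
#   all_topics = load_topics(['https://space.bilibili.com/123456/video'], words=['lecture'])
#   download_tasks(4)   # keep at most 4 download tasks queued or running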