def download(video_url, path):
    """Download the best available progressive mp4 stream of a video.

    Args:
        video_url: full YouTube url of the video to download.
        path: output directory the stream is written to.

    Returns:
        The file name (with ``.mp4`` extension) the video was saved under.
    """
    try:
        # Prefer the highest-resolution progressive (audio+video) mp4 stream.
        video_stream = PyTube(video_url).streams \
            .filter(progressive=True, file_extension='mp4') \
            .order_by('resolution') \
            .desc() \
            .first()
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still work.
        # Fall back to whatever stream pytube lists first.
        video_stream = PyTube(video_url).streams.first()
    try:
        # Sanitize the default name; strip the extension because pytube
        # re-appends it on download.
        filename = video_stream.default_filename \
            .replace(' ', '_') \
            .replace('.mp4', '')
    except KeyError:
        # No usable title metadata — use a random unique name instead.
        filename = str(uuid4())
    video_stream.download(filename=filename, output_path=path)
    return filename + '.mp4'
def retrieve_captions_old(url, keyword):
    """Return English SRT caption chunks if the video mentions *keyword*.

    Args:
        url: YouTube video id / url fragment, expanded via yp.get_youtube_url.
        keyword: keyword that must appear in at least one caption chunk.

    Returns:
        list of lower-cased SRT caption blocks, or None when the video
        cannot be read, has no English captions, or never mentions the
        keyword.
    """
    try:
        video = PyTube(yp.get_youtube_url(url))
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception:
        # Narrowed from a bare `except:`; pytube raises assorted errors
        # for removed/region-locked videos.
        cp.print_color(
            cp.ColorEnum.YELLOW,
            "failed to generate PyTube representation for the video")
        return None

    caption = video.captions.get_by_language_code('en')
    if not caption:
        cp.print_color(cp.ColorEnum.YELLOW,
                       "no caption available for the video")
        return None

    try:
        srt_captions = caption.generate_srt_captions().lower().split('\n\n')
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception:
        cp.print_color(cp.ColorEnum.YELLOW, "failed to retrieve for the video")
        return None

    # make sure the keyword appears in the captions before crawling
    # (previously a loop variable here shadowed `caption` above)
    if any(contain_keyword(keyword, chunk) for chunk in srt_captions):
        return srt_captions

    cp.print_color(cp.ColorEnum.YELLOW, "captions do not contain the keyword")
    return None
def __init__(self, url):
    """Store the url and build its PyTube representation.

    Args:
        url: YouTube video identifier; expanded to a full url via
            get_youtube_url before being handed to PyTube.
    """
    # NOTE(review): PyTube() performs network access here — construction
    # can raise for unavailable videos; callers appear to wrap this.
    self.url = url
    self.video = PyTube(get_youtube_url(url))
def main():
    """Collect up to --size YouTube urls whose captions mention --keyword.

    Searches YouTube via the data v3 API, keeps only videos with English
    captions that contain the keyword (or its plural), and writes the urls
    to ``<keyword>_url_<size>.txt``, one per line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--keyword", type=str, required=True,
                        help="target keyword to generate data for")
    parser.add_argument("-s", "--size", type=int, default=100,
                        help="number of url to collect")
    parser.add_argument("-a", "--api_key", type=str, required=True,
                        help="API key for youtube data v3 API")
    args = parser.parse_args()

    keyword = args.keyword.lower()
    cp.print_progress("keyword is ", keyword)
    url_fetcher = YoutubeSearcher(args.api_key, keyword)
    urls = []
    plural = inflect.engine()

    while len(urls) < args.size:
        fetched = url_fetcher.next()
        # Bug fix: guard before indexing and actually stop when the searcher
        # runs dry (the warning previously fell through without a break).
        if not fetched or not fetched[0]:
            cp.print_warning("there are no more urls to process")
            break
        url = fetched[0]
        if url in urls:
            cp.print_warning("video is already added", url)
            continue
        try:
            video = PyTube(utils.get_youtube_url(url))
        except Exception:
            cp.print_error(
                "failed to generate PyTube representation for video - ", url)
            continue
        caption = video.captions.get_by_language_code('en')
        if not caption:
            cp.print_warning("no caption available for video - ", url)
            continue
        try:
            srt_captions = caption.generate_srt_captions().lower().split(
                '\n\n')
        except Exception:
            cp.print_error("failed to retrieve for video - ", url)
            continue
        # Require the keyword (or its plural) somewhere in the captions.
        keyword_exist = any(
            keyword in captions or plural.plural(keyword) in captions
            for captions in srt_captions)
        if not keyword_exist:
            cp.print_warning("keywords never appear in the video - ", url)
            continue
        urls.append(url)
        cp.print_progress(len(urls), " / ", args.size, " - ", url)

    cp.print_warning(len(urls), "urls are collected for ", keyword)
    # Bug fix: args.size is an int — concatenating it to str raised TypeError.
    with open(keyword + "_url_" + str(args.size) + ".txt", 'w') as output_file:
        for url in urls:
            output_file.write(url + "\n")
def main():
    """Interactively annotate keyword occurrences in YouTube audio clips.

    For each candidate video (from --url_file or a keyword search), plays
    every caption segment that contains --keyword and asks the operator how
    many times the keyword was actually spoken. Results are appended as
    ``[url, start_ms, end_ms, text, cc_count, audio_count]`` rows via
    CsvWriter.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--keyword", type=str, required=True,
                        help="target keyword to generate data for")
    parser.add_argument("-f", "--url_file", type=str,
                        help="file containing urls of the video")
    parser.add_argument("-s", "--size", type=int, default=100,
                        help="number of videos to consider")
    parser.add_argument("-l", "--video_length", type=int, default=3600,
                        help="length of maximum length for a video (s)")
    parser.add_argument("-a", "--api_key", type=str, required=True,
                        help="API key for youtube data v3 API")
    parser.add_argument("-c", "--continue_from", type=str,
                        help="url to start from in the given url file")
    parser.add_argument("-o", "--output_file", type=str,
                        help="csv file to append output to")
    args = parser.parse_args()

    keyword = args.keyword.lower()
    sd.default.samplerate = SAMPLE_RATE
    cp.print_progress("keyword is ", keyword)
    plural = inflect.engine()

    if args.url_file:
        # read in from the file
        print('fetching urls from the given file : ', args.url_file)
        url_fetcher = FileReader(args.url_file)
    else:
        # fetch using keywords
        print('fetching urls by searching youtube with keywords : ', keyword)
        url_fetcher = YoutubeSearcher(args.api_key, keyword)

    csv_writer = CsvWriter(keyword, args.output_file)
    total_cc_count = 0
    total_audio_count = 0
    # Skip urls until --continue_from is seen (resume support).
    continuing = args.continue_from is not None
    url_set = set()

    # Hoisted loop invariants (previously rebuilt for every url).
    translator = str.maketrans('', '', string.punctuation)  # strip punctuation
    srt_tag_re = re.compile(r"<.*?>|\(.*?\)|\[.*?\]")  # srt markup/asides

    for i in range(args.size):
        url = url_fetcher.next()
        if not url:
            cp.print_warning("there are no more urls to process")
            break
        url = url[0]
        if continuing:
            if url != args.continue_from:
                continue
            continuing = False
        # (a duplicated, unreachable `continuing` check was removed here)
        cp.print_progress(i + 1, " / ", args.size, " - ", url)
        if url in url_set:
            cp.print_warning("video is already processed", url)
            continue
        url_set.add(url)

        try:
            video = PyTube(utils.get_youtube_url(url))
        except Exception as exception:
            cp.print_error(
                "failed to generate PyTube representation for video ", url)
            cp.print_error(exception)
            continue
        if int(video.length) > args.video_length:
            continue
        caption = video.captions.get_by_language_code('en')
        if not caption:
            cp.print_warning("no caption available for video - ", url)
            continue
        try:
            srt_captions = caption.generate_srt_captions().split('\n\n')
        except Exception as exception:
            cp.print_error("failed to retrieve srt for video - ", url)
            cp.print_error(exception)
            continue

        # Cheap pre-filter: skip videos whose captions never mention the
        # keyword (or its plural) before downloading any audio.
        keyword_exist = False
        for captions in srt_captions:
            if keyword in captions or plural.plural(keyword) in captions:
                keyword_exist = True
                break
        if not keyword_exist:
            cp.print_warning("keywords never appear in the video - ", url)
            continue

        try:
            crawler = YoutubeCrawler(url)
            audio_data = crawler.get_audio()
        except Exception as exception:
            cp.print_error("failed to download audio file for video ", url)
            cp.print_error(exception)
            continue

        collected_data = []
        video_cc_count = 0
        video_audio_count = 0
        for captions in srt_captions:
            cc_split = captions.split('\n')
            # An SRT block is (index, time range, text); tolerate a leading
            # blank line left over from the double-newline split.
            if len(cc_split) == 4 and cc_split[0] == '':
                cc_split = (cc_split[1], cc_split[2], cc_split[3])
            elif len(cc_split) != 3:
                cp.print_warning(
                    "srt format is not interpretable for video - ", cc_split)
                continue
            _, cc_time, cc_text = cc_split
            cc_text = srt_tag_re.sub('', cc_text)
            # clean up punctuation
            cc_text = cc_text.translate(translator)
            cc_text = cc_text.lower().strip().replace(',', '')
            words = cc_text.strip().split()
            # skip segments without the target keyword
            if keyword not in words and plural.plural(keyword) not in words:
                continue

            # occurrence in audio — ask the operator; "r" replays the clip
            start_ms, end_ms = utils.parse_srt_time(cc_time)
            cp.print_instruction(
                "How many time was the keyword spoken? "
                "(\"r\" to replay audio)\n",
                "[ " + cc_text + " ]")
            while True:
                try:
                    time.sleep(0.5)
                    sd.play(audio_data[start_ms:end_ms], blocking=True)
                    sd.stop()
                    user_input = input()
                    audio_count = int(user_input)
                except ValueError:
                    # Anything other than "r" or an integer is rejected;
                    # "r" simply loops and replays the clip.
                    if user_input != "r":
                        cp.print_error("Invalid Input. Expect Integer")
                    continue
                else:
                    break

            # occurrence in captions (keyword plus naive plural forms)
            cc_count = 0
            for word in words:
                if keyword == word or keyword + "s" == word \
                        or keyword + "es" == word:
                    cc_count += 1

            collected_data.append(
                [url, start_ms, end_ms, cc_text, cc_count, audio_count])
            video_cc_count += cc_count
            video_audio_count += audio_count

        print(url, "- cc_count : ", video_cc_count,
              ", audio_count : ", video_audio_count)
        total_cc_count += video_cc_count
        total_audio_count += video_audio_count
        csv_writer.write(collected_data)

    print("total cc_count : ", total_cc_count,
          ", total audio_count : ", total_audio_count)
    # Typo fix: "sotred" -> "stored"
    cp.print_progress("collected data stored in ", keyword + ".csv")