def maybe_download_audio_track(yt_video_id):
    is_ytdownloader_installed()
    video_data_path = os.path.join(const.VIDEO_DATA_DIR, yt_video_id)
    ensure_dir(video_data_path)
    video = pafy.new(yt_video_id)

    # download audio
    audio_lowest_size = sorted(video.audiostreams, key=lambda x: x.get_filesize())[0]
    # print 'audio lowest download size: ' + str(audio_lowest_size.get_filesize())
    if audio_lowest_size.get_filesize() > 500000000:
        raise Exception("audio_file_is_too_big")

    audio_path = os.path.join(video_data_path, "audio." + audio_lowest_size.extension)
    if not os.path.exists(audio_path):
        print 'downloading audio ' + audio_path
        audio_lowest_size.download(filepath=audio_path, quiet=True)
    if not os.path.exists(audio_path):
        raise Exception("audio_download_failed")
    return audio_path
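# Example call for the helper above (a sketch, not from the original module): the
# video id is a placeholder and const.VIDEO_DATA_DIR is assumed to be configured.
#
#   audio_path = maybe_download_audio_track("EXAMPLE_VIDEO_ID")
#   # -> <VIDEO_DATA_DIR>/EXAMPLE_VIDEO_ID/audio.<ext>; raises if the smallest
#   #    audio stream is larger than ~500 MB or the download fails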
def start_parsing(threads_num):
    try:
        ensure_dir(const.VIDEO_DATA_DIR)
        video_parser_threads = []

        # start parsing threads
        for i in range(0, threads_num):
            # print 'start parsing thread ' + str(i)
            thr = Thread(target=video_parser_thread_loop)
            thr.daemon = True
            thr.start()
            video_parser_threads.append(thr)

        # wait for threads
        while True:
            if not any([thr.isAlive() for thr in video_parser_threads]):
                break
            time.sleep(5)
    except (KeyboardInterrupt, SystemExit):
        print '\n! Received keyboard interrupt, quitting threads.\n'

    print "DONE"
    stats_util.show_global_stats()
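# Minimal entry-point sketch (assumptions: video_parser_thread_loop and stats_util are
# defined in this module, and the thread count of 4 is an arbitrary example value).
if __name__ == '__main__':
    start_parsing(4)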
def main():
    parser = ArgumentParser()
    parser.add_argument("-y", "--youtube_api_key", type=str, required=True,
                        help="API key for youtube data v3 API")
    parser.add_argument("-w", "--words_api_key", type=str, required=True,
                        help="API key for words API")
    parser.add_argument("-k", "--keyword_list", nargs='+', type=str, required=True,
                        help="list of keywords to collect")
    parser.add_argument("-s", "--samples_per_keyword", type=int, default=10,
                        help="number of samples to collect per keyword")
    parser.add_argument("-o", "--output_dir", type=str, default="./generated_keyword_audios",
                        help="path to the output dir")
    args = parser.parse_args()

    file_utils.ensure_dir(TEMP_DIR)
    file_utils.ensure_dir(args.output_dir)

    collection_result = {}
    for index, keyword in enumerate(args.keyword_list):
        cp.print_color(
            cp.ColorEnum.BOLD,
            f"collecting {args.samples_per_keyword} audio samples of keyword: {keyword}"
        )
        count = generate_dataset(args.youtube_api_key, args.words_api_key, keyword,
                                 args.samples_per_keyword, args.output_dir)
        collection_result[keyword] = count

    for keyword, count in collection_result.items():
        cp.print_color(cp.ColorEnum.BOLD, f"collected {count} samples of keyword: {keyword}")

    file_utils.remove_dir(TEMP_DIR)
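# Entry-point and invocation sketch (assumptions: the script name below is a placeholder
# and the API keys are dummies; main() itself is defined above).
#
#   python collect_keyword_audio.py -y <YOUTUBE_API_KEY> -w <WORDS_API_KEY> \
#       -k cat dog bird -s 25 -o ./generated_keyword_audios
if __name__ == '__main__':
    main()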
def __init__(self, utterance_sep, path_output_lu_parses, path_output_parses, parser_path,
             cfg_rules_path, pos_tagger_path=None, path_to_freq_norms=None,
             path_to_image_norms=None, path_to_dictionary=None, lu_analyzer_path=None,
             path_to_anew=None, path_to_warringer=None, do_wnic=False,
             path_to_rst_python=None, path_to_rst=None, path_output_rst=None,
             path_to_stanford_cp=None, path_to_mpqa_lexicon=None, path_to_lda_model=None,
             path_to_lda_wordids=None, do_lexical=True, do_syntactic=True,
             do_semantic=True, do_pragmatic=False, lexical_list=None, syntactic_list=None,
             semantic_list=None, pragmatic_list=None):
    '''Parameters:
    source_transcript : list of strings. Full paths to directories containing transcripts (with no filler annotations)
    source_transcript_fillers : list of strings. Full paths to directories containing transcripts with filler annotations
    utterance_sep : string. The string that delimits utterance boundaries in the transcript
    path_output_lu_parses : string. The absolute path to a directory that will store the Lu features and parses.
    path_output_parses : string. The absolute path to a directory that will store the parse trees produced for the data.
    parser_path : string. The absolute path to a directory containing a Stanford lexparser
    cfg_rules_path : string. The absolute path to a file containing cfg productions to be extracted (one per line)
    path_output_lda_topics : string. The absolute path to the csv file where key-value topics will be stored.
    pos_tagger_path : optional, string. Full path to a directory containing a Stanford POS tagger
    path_to_freq_norms : optional, string. Full path to a file containing frequency norms
    path_to_image_norms : optional, string. Full path to a file containing imageability norms
    path_to_dictionary : optional, string. Full path to a file containing valid words for the language
    lu_analyzer_path : optional
    path_to_rst_python : optional, string. Full path to virtualenv python, for RST
    path_to_rst : optional, string. Full path to folder with RST's 'parse.py'
    path_output_rst : optional, string. Full path to where RST stores its results
    path_to_lda_model : string. Full path to trained LDA model.
    path_to_lda_wordids : string. Full path to word IDs used in trained LDA model.
    '''
    self.utterance_sep = utterance_sep
    self.output_rst_dir = os.path.abspath(path_output_rst)
    self.output_parse_dir = os.path.abspath(path_output_parses)
    self.output_lu_parse_dir = os.path.abspath(path_output_lu_parses)
    self.pos_tagger_path = pos_tagger_path
    self.parser_path = parser_path
    self.cfg_rules_path = cfg_rules_path
    self.path_to_mpqa_lexicon = path_to_mpqa_lexicon
    self.path_to_rst_python = path_to_rst_python
    self.path_to_rst = path_to_rst
    self.path_to_stanford_cp = path_to_stanford_cp
    self.path_to_lda_model = path_to_lda_model
    self.path_to_lda_wordids = path_to_lda_wordids
    self.do_lexical = do_lexical
    self.do_syntactic = do_syntactic
    self.do_semantic = do_semantic
    self.do_pragmatic = do_pragmatic
    self.lexical_list = lexical_list
    self.syntactic_list = syntactic_list
    self.semantic_list = semantic_list
    self.pragmatic_list = pragmatic_list

    file_utils.ensure_dir(self.output_parse_dir)
    file_utils.ensure_dir(self.output_lu_parse_dir)
    file_utils.ensure_dir(self.output_rst_dir)

    # self.transcript_set = transcript.TranscriptSet(dataset=[])

    # Get lexical norms
    if path_to_freq_norms is not None:
        self.norms_freq = functions.get_frequency_norms(path_to_freq_norms)
    else:  # default
        self.norms_freq = functions.get_frequency_norms()

    if path_to_image_norms is not None:
        self.norms_image = functions.get_imageability_norms(path_to_image_norms)
    else:  # default
        self.norms_image = functions.get_imageability_norms()

    if path_to_anew is not None:
        self.norms_anew = functions.get_anew_norms(path_to_anew)
    else:  # default
        self.norms_anew = None

    # Warringer
    if path_to_warringer is not None:
        self.norms_warringer = functions.get_warringer_norms(path_to_warringer)
    else:  # default
        self.norms_warringer = functions.get_warringer_norms()

    # MPQA
    if path_to_mpqa_lexicon is not None:
        [self.mpqa_words, self.mpqa_types, self.mpqa_polarities] = functions.get_mpqa_lexicon(path_to_mpqa_lexicon)
    else:  # default
        [self.mpqa_words, self.mpqa_types, self.mpqa_polarities] = functions.get_mpqa_lexicon()

    # Set up the dictionary of valid words for the language
    if path_to_dictionary is not None:
        source_dict = path_to_dictionary
    else:
        source_dict = os.path.abspath("../feature_extraction/text/american-english")  # default
    with open(source_dict, 'r') as fin_dict:
        words = fin_dict.readlines()
    self.dictionary_words = set(word.strip().lower() for word in words)

    self.prondict = cmudict.dict()

    if lu_analyzer_path is not None:
        self.lu_analyzer_path = lu_analyzer_path
    else:
        self.lu_analyzer_path = os.path.abspath('../L2SCA-2011-10-10/')

    # semantics
    if do_wnic:
        self.brown_ic = wnic.ic('ic-brown.dat')  # FR: it would be nice to have a dat based on normative data, baby
        self.semcor_ic = wnic.ic('ic-semcor.dat')
    else:
        self.brown_ic = []
        self.semcor_ic = []
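# Instantiation sketch (assumptions: FeatureExtractor is a placeholder name for the
# enclosing class and every path below is a dummy value; path_output_rst is passed
# explicitly because __init__ calls os.path.abspath on it, so leaving it None would fail).
#
#   extractor = FeatureExtractor(utterance_sep='.',
#                                path_output_lu_parses='/tmp/lu_parses',
#                                path_output_parses='/tmp/parses',
#                                parser_path='/opt/stanford-parser',
#                                cfg_rules_path='/opt/cfg_rules.txt',
#                                path_output_rst='/tmp/rst_out')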
if __name__ == '__main__':
    # lianjia_root_site = 'http://bj.fang.lianjia.com'
    # lianjia = LianJiaCrawler(lianjia_root_site)
    # debug code
    # print lianjia._url_dict
    # print lianjia._html_dict
    # print lianjia._html_dict[u'HuaiRou'][0]
    # lianjia._cal_average_price_from_html(lianjia._html_dict[u'HuaiRou'][0])
    # print lianjia._price_dict
    from datetime import date
    from utils.file_utils import ensure_dir

    ensure_dir(PROJECT_DIR + '/data/json/crawler/housing')
    json_out_path = PROJECT_DIR + '/data/json/crawler/housing/{0}_lianjia_housing.json'.format(str(date.today()))

    json_dict = {}
    for city, confg in LIANJIA_MAP.items():
        lianjia = LianJiaCrawler(confg['website'], confg['area_map'])
        lianjia.get_price_dict()
        # print lianjia._price_dict
        json_dict[city] = lianjia._price_dict

    # save the price to the json file
    with open(json_out_path, 'w') as f:
        json.dump(json_dict, f)
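    # Follow-up sketch (an added sanity check, not part of the original block): read the
    # dump back and report how many entries were stored per city key from LIANJIA_MAP.
    with open(json_out_path) as f:
        saved_prices = json.load(f)
    for city, price_dict in saved_prices.items():
        print city, len(price_dict)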
def maybe_download_subtitiles(yt_video_id, auto_subs=False):
    is_ytdownloader_installed()
    subs_name = "autosubs" if auto_subs else "subs"

    # download subtitles
    video_data_path = os.path.join(const.VIDEO_DATA_DIR, yt_video_id)
    ensure_dir(video_data_path)
    _subs_path_tmp = os.path.join(video_data_path, subs_name + ".vtt")
    subs_path = os.path.join(video_data_path, "%s.%s.vtt" % (subs_name, const.LANGUAGE))
    if not os.path.exists(subs_path):
        print 'downloading subtitles to ' + subs_path
        p = subprocess.Popen([
            "youtube-dl",
            "--write-auto-sub" if auto_subs else "--write-sub",
            "--sub-lang", const.LANGUAGE,
            "--skip-download",
            "-o", _subs_path_tmp,
            'https://www.youtube.com/watch?v=' + yt_video_id
        ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = p.communicate()
        try:
            p.kill()
        except:
            pass
        if p.returncode != 0:
            print 'ERROR: %s' % err
            raise Exception(subs_name + "_error_downloading_subtitles")

    if not os.path.exists(subs_path):
        raise Exception(subs_name + "_subtitles_not_available")

    subs = pyvtt.WebVTTFile.open(subs_path)

    # fix youtube autosubs
    if auto_subs:
        fixed_subs = []
        for s in subs:
            # print "--> " + s.text
            rows = s.text.split('\n')
            # take last row (bugfix)
            s.text = rows[-1]
            timecodes = [
                pyvtt.WebVTTTime.from_string(x).ordinal
                for x in re.findall(r'<(\d+:\d+:\d+.\d+)>', s.text)
            ]
            words_str = re.sub(r'<[^>]*>', '', s.text)
            words = re.compile(r'[\s]+').split(words_str)
            if len(rows) < 2 and len(timecodes) == 0:
                continue
            if len(words) > 1 and len(timecodes) == 0:
                # s.text = "[BAD] " + s.text
                continue
            fixed_subs.append(s)
        subs = fixed_subs

    return subs
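# Example call for the helper above (a sketch; EXAMPLE_VIDEO_ID is a placeholder). Falling
# back to auto-generated captions when uploaded subtitles are missing is one way the two
# modes could be combined, not necessarily how the original pipeline does it.
#
#   try:
#       subs = maybe_download_subtitiles("EXAMPLE_VIDEO_ID")
#   except Exception:
#       subs = maybe_download_subtitiles("EXAMPLE_VIDEO_ID", auto_subs=True)
#   for s in subs:
#       print s.text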
def __init__(self, name, out_dir, **kwargs):
    self.out_dir = os.path.join(out_dir, name)
    file_utils.ensure_dir(self.out_dir)
    super().__init__(name, **kwargs)
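# Construction sketch (assumptions: Logger is a placeholder name for the class this
# __init__ belongs to, and its base class accepts `name` plus arbitrary kwargs).
#
#   logger = Logger("experiment_1", out_dir="./runs")
#   # logger.out_dir == "./runs/experiment_1" and that directory now exists on disk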