def maybe_download_audio_track(yt_video_id):
    is_ytdownloader_installed()

    video_data_path = os.path.join(const.VIDEO_DATA_DIR, yt_video_id)

    ensure_dir(video_data_path)

    video = pafy.new(yt_video_id)
    # download audio
    audio_lowest_size = sorted(video.audiostreams,
                               key=lambda x: x.get_filesize())[0]
    #print 'audio lowest download size: ' + str(audio_lowest_size.get_filesize())
    if audio_lowest_size.get_filesize() > 500000000:  # reject audio tracks larger than ~500 MB
        raise Exception("audio_file_is_too_big")

    audio_path = os.path.join(video_data_path,
                              "audio." + audio_lowest_size.extension)

    if not os.path.exists(audio_path):
        print('downloading audio ' + audio_path)
        audio_lowest_size.download(filepath=audio_path, quiet=True)

    if not os.path.exists(audio_path):
        raise Exception("audio_download_failed")

    return audio_path


def start_parsing(threads_num):
    try:
        ensure_dir(const.VIDEO_DATA_DIR)

        video_parser_threads = []
        # start parsing threads

        for i in range(0, threads_num):
            #print 'start parsing thread ' + str(i)
            thr = Thread(target=video_parser_thread_loop)
            thr.daemon = True
            thr.start()
            video_parser_threads.append(thr)

        # wait for the threads; poll instead of join() so the main thread
        # stays responsive to KeyboardInterrupt
        while True:
            if not any(thr.is_alive() for thr in video_parser_threads):
                break
            time.sleep(5)

    except (KeyboardInterrupt, SystemExit):
        print('\n! Received keyboard interrupt, quitting threads.\n')

    print("DONE")

    stats_util.show_global_stats()
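
The ensure_dir / file_utils.ensure_dir helper that all of these examples rely on is not shown on this page. A minimal sketch of the pattern it presumably wraps (an assumption, not the project's actual implementation):

import os

def ensure_dir(path):
    # Assumed behaviour: create the directory (and any missing parents) if it
    # does not exist yet; the real file_utils.ensure_dir may differ in details.
    if not os.path.exists(path):
        os.makedirs(path)
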
Example #3
def main():
    parser = ArgumentParser()

    parser.add_argument("-y",
                        "--youtube_api_key",
                        type=str,
                        required=True,
                        help="API key for youtube data v3 API")

    parser.add_argument("-w",
                        "--words_api_key",
                        type=str,
                        required=True,
                        help="API key for words API")

    parser.add_argument("-k",
                        "--keyword_list",
                        nargs='+',
                        type=str,
                        required=True,
                        help="list of keywords to collect")

    parser.add_argument("-s",
                        "--samples_per_keyword",
                        type=int,
                        default=10,
                        help="number of samples to collect per keyword")

    parser.add_argument("-o",
                        "--output_dir",
                        type=str,
                        default="./generated_keyword_audios",
                        help="path to the output dir")

    args = parser.parse_args()
    file_utils.ensure_dir(TEMP_DIR)

    file_utils.ensure_dir(args.output_dir)

    collection_result = {}

    for index, keyword in enumerate(args.keyword_list):
        cp.print_color(
            cp.ColorEnum.BOLD,
            f"collecting {args.samples_per_keyword} audio samples of keyword : {keyword}"
        )

        count = generate_dataset(args.youtube_api_key, args.words_api_key,
                                 keyword, args.samples_per_keyword,
                                 args.output_dir)

        collection_result[keyword] = count

    for keyword, count in collection_result.items():
        cp.print_color(cp.ColorEnum.BOLD,
                       f"collected {count} keywords of {keyword}")

    file_utils.remove_dir(TEMP_DIR)
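
A hypothetical command-line invocation of this script (the file name collect_keyword_audios.py is an assumption; the flags come from the argparse definitions above):

$ python collect_keyword_audios.py -y <YOUTUBE_API_KEY> -w <WORDS_API_KEY> -k hello world -s 20 -o ./generated_keyword_audios
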
Example #4
    def __init__(self, utterance_sep, path_output_lu_parses, path_output_parses,
                 parser_path, cfg_rules_path, pos_tagger_path=None, path_to_freq_norms=None, path_to_image_norms=None,
                 path_to_dictionary=None, lu_analyzer_path=None, path_to_anew=None, path_to_warringer=None, do_wnic=False,
                 path_to_rst_python=None, path_to_rst=None, path_output_rst=None, path_to_stanford_cp=None,
                 path_to_mpqa_lexicon=None, path_to_lda_model=None, path_to_lda_wordids=None, do_lexical=True,
                 do_syntactic=True, do_semantic=True, do_pragmatic=False, lexical_list=None, syntactic_list=None,
                 semantic_list=None, pragmatic_list=None):
        '''Parameters:
        source_transcript : list of strings. Full paths to directories containing transcripts (with no filler annotations)
        source_transcript_fillers : list of strings. Full paths to directories containing transcripts with filler annotations
        utterance_sep : string. The string that delimits utterance boundaries in the transcript
        path_output_lu_parses : string. The absolute path to a directory that will store the Lu features and parses.
        path_output_parses : string. The absolute path to a directory that will store the parse trees produced for the data.
        parser_path : string. The absolute path to a directory containing a Stanford lexparser
        cfg_rules_path : string. The absolute path to a file containing cfg productions to be extracted (one per line)
        path_output_lda_topics: string. The absolute path to the csv file where key-value topics will be stored.
        pos_tagger_path : optional, string. Full path to a directory containing a Stanford POS tagger
        path_to_freq_norms : optional, string. Full path to a file containing frequency norms
        path_to_image_norms : optional, string. Full path to a file containing imageability norms
        path_to_dictionary : optional, string. Full path to a file containing valid words for the language
        lu_analyzer_path : optional
        path_to_rst_python : optional, string. Full path to virtualenv python, for RST
        path_to_rst : optional, string. Full path to folder with RST's 'parse.py'
        path_output_rst: optional, string. Full path to where RST stores its results
        path_to_lda_model : string. Full path to trained LDA model.
        path_to_lda_wordids : string. Full path to word IDs used in trained LDA model.
        '''

        self.utterance_sep = utterance_sep

        self.output_rst_dir = os.path.abspath(path_output_rst)
        self.output_parse_dir = os.path.abspath(path_output_parses)
        self.output_lu_parse_dir = os.path.abspath(path_output_lu_parses)

        self.pos_tagger_path = pos_tagger_path
        self.parser_path = parser_path
        self.cfg_rules_path = cfg_rules_path
        self.path_to_mpqa_lexicon = path_to_mpqa_lexicon
        self.path_to_rst_python = path_to_rst_python
        self.path_to_rst = path_to_rst
        self.path_to_stanford_cp = path_to_stanford_cp
        self.path_to_lda_model = path_to_lda_model
        self.path_to_lda_wordids = path_to_lda_wordids

        self.do_lexical = do_lexical
        self.do_syntactic = do_syntactic
        self.do_semantic = do_semantic
        self.do_pragmatic = do_pragmatic
        self.lexical_list = lexical_list
        self.syntactic_list = syntactic_list
        self.semantic_list = semantic_list
        self.pragmatic_list = pragmatic_list

        file_utils.ensure_dir(self.output_parse_dir)
        file_utils.ensure_dir(self.output_lu_parse_dir)
        file_utils.ensure_dir(self.output_rst_dir)

        # self.transcript_set = transcript.TranscriptSet(dataset=[])

        # Get lexical norms
        if path_to_freq_norms is not None:
            self.norms_freq = functions.get_frequency_norms(path_to_freq_norms)
        else: # default
            self.norms_freq = functions.get_frequency_norms()

        if path_to_image_norms is not None:
            self.norms_image = functions.get_imageability_norms(path_to_image_norms)
        else: # default
            self.norms_image = functions.get_imageability_norms()

        if path_to_anew is not None:
            self.norms_anew = functions.get_anew_norms(path_to_anew)
        else: # default
            self.norms_anew = None

        # Warringer
        if path_to_warringer is not None:
            self.norms_warringer = functions.get_warringer_norms(path_to_warringer)
        else: # default
            self.norms_warringer = functions.get_warringer_norms()

        # MPQA
        if path_to_mpqa_lexicon is not None:
            [self.mpqa_words, self.mpqa_types, self.mpqa_polarities] = functions.get_mpqa_lexicon(path_to_mpqa_lexicon)
        else: # default
            [self.mpqa_words, self.mpqa_types, self.mpqa_polarities] = functions.get_mpqa_lexicon()

        # Set up the dictionary of valid words for the language
        if path_to_dictionary is not None:
            source_dict = path_to_dictionary
        else:
            source_dict = os.path.abspath("../feature_extraction/text/american-english") # default
        with open(source_dict, 'r') as fin_dict:
            words = fin_dict.readlines()
            self.dictionary_words = set(word.strip().lower() for word in words)
        self.prondict = cmudict.dict()

        if lu_analyzer_path is not None:
            self.lu_analyzer_path = lu_analyzer_path
        else:
            self.lu_analyzer_path = os.path.abspath('../L2SCA-2011-10-10/')

        # semantics
        if do_wnic:
            self.brown_ic = wnic.ic('ic-brown.dat')      # FR: it would be nice to have a dat based on normative data, baby
            self.semcor_ic = wnic.ic('ic-semcor.dat')
        else:
            self.brown_ic = []
            self.semcor_ic = []
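
A hypothetical instantiation of this extractor (the enclosing class name FeatureExtractor and every path below are assumptions; only the required positional arguments plus path_output_rst are shown, since __init__ calls os.path.abspath and ensure_dir on path_output_rst even though it defaults to None):

extractor = FeatureExtractor(
    utterance_sep=" . ",                          # assumed utterance delimiter
    path_output_lu_parses="/tmp/out/lu_parses",   # hypothetical output directories
    path_output_parses="/tmp/out/parses",
    parser_path="/opt/stanford-parser",           # hypothetical Stanford lexparser install
    cfg_rules_path="/opt/cfg_rules.txt",          # hypothetical CFG production list
    path_output_rst="/tmp/out/rst",
)
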
Example #5
if __name__ == '__main__':
    # lianjia_root_site = 'http://bj.fang.lianjia.com'
    # lianjia = LianJiaCrawler(lianjia_root_site)


    # debug code
    # print lianjia._url_dict
    # print lianjia._html_dict
    # print lianjia._html_dict[u'HuaiRou'][0]
    # lianjia._cal_average_price_from_html(lianjia._html_dict[u'HuaiRou'][0])
    # print lianjia._price_dict

    import json
    from datetime import date
    from utils.file_utils import ensure_dir
    ensure_dir(PROJECT_DIR + '/data/json/crawler/housing')
    json_out_path = PROJECT_DIR + '/data/json/crawler/housing/{0}_lianjia_housing.json'.format(str(date.today()))

    json_dict = {}
    for city, confg in LIANJIA_MAP.items():
        lianjia = LianJiaCrawler(confg['website'], confg['area_map'])
        lianjia.get_price_dict()
        # print lianjia._price_dict
        json_dict[city] = lianjia._price_dict


    # save the price to the json file
    with open(json_out_path, 'w') as f:
        json.dump(json_dict, f)

def maybe_download_subtitiles(yt_video_id, auto_subs=False):
    is_ytdownloader_installed()

    subs_name = "autosubs" if auto_subs else "subs"

    # download subtitles
    video_data_path = os.path.join(const.VIDEO_DATA_DIR, yt_video_id)

    ensure_dir(video_data_path)

    _subs_path_tmp = os.path.join(video_data_path, subs_name + ".vtt")
    subs_path = os.path.join(video_data_path,
                             "%s.%s.vtt" % (subs_name, const.LANGUAGE))

    if not os.path.exists(subs_path):
        print('downloading subtitles to ' + subs_path)

        p = subprocess.Popen([
            "youtube-dl", "--write-auto-sub" if auto_subs else "--write-sub",
            "--sub-lang", const.LANGUAGE, "--skip-download", "-o",
            _subs_path_tmp, 'https://www.youtube.com/watch?v=' + yt_video_id
        ],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        out, err = p.communicate()

        try:
            p.kill()
        except Exception:
            # the process has normally already exited after communicate()
            pass

        if p.returncode != 0:
            print('ERROR: %s' % err)
            raise Exception(subs_name + "_error_downloading_subtitles")

        if not os.path.exists(subs_path):
            raise Exception(subs_name + "_subtitles_not_available")

    subs = pyvtt.WebVTTFile.open(subs_path)

    # fix youtube autosubs
    if auto_subs:
        fixed_subs = []

        for s in subs:
            # print "--> "+s.text
            rows = s.text.split('\n')

            # take last row (bugfix)
            s.text = rows[-1]

            timecodes = [
                pyvtt.WebVTTTime.from_string(x).ordinal
                for x in re.findall(r'<(\d+:\d+:\d+\.\d+)>', s.text)
            ]

            words_str = re.sub(r'<[^>]*>', '', s.text)
            words = re.compile(r'[\s]+').split(words_str)

            if len(rows) < 2 and len(timecodes) == 0:
                continue

            if len(words) > 1 and len(timecodes) == 0:
                #s.text = "[BAD] "+s.text
                continue

            fixed_subs.append(s)

        subs = fixed_subs

    return subs
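
A hypothetical driver tying together the two YouTube helpers on this page (maybe_download_subtitiles above and maybe_download_audio_track from the first example). The fallback to auto-generated subtitles and the function name are assumptions, not part of the original code:

def maybe_download_video_assets(yt_video_id):
    # Prefer uploaded subtitles; fall back to YouTube's auto-generated ones.
    try:
        subs = maybe_download_subtitiles(yt_video_id, auto_subs=False)
    except Exception:
        subs = maybe_download_subtitiles(yt_video_id, auto_subs=True)

    audio_path = maybe_download_audio_track(yt_video_id)
    return subs, audio_path
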
Example #7
    def __init__(self, name, out_dir, **kwargs):
        # create a per-component output directory and make sure it exists
        self.out_dir = os.path.join(out_dir, name)
        file_utils.ensure_dir(self.out_dir)
        super().__init__(name, **kwargs)
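
A self-contained sketch of how this __init__ might sit in a small class hierarchy (the class names DirectoryComponent and Component, and the use of os.makedirs in place of file_utils.ensure_dir, are assumptions for illustration):

import os

class Component:
    # hypothetical base class consumed by the super().__init__(name, **kwargs) call
    def __init__(self, name):
        self.name = name

class DirectoryComponent(Component):
    # hypothetical enclosing class for the __init__ shown above
    def __init__(self, name, out_dir, **kwargs):
        self.out_dir = os.path.join(out_dir, name)
        os.makedirs(self.out_dir, exist_ok=True)  # stands in for file_utils.ensure_dir
        super().__init__(name, **kwargs)

comp = DirectoryComponent("reports", "/tmp/output")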