Example #1
    def __init__(self, start_ratio=0.0, end_ratio=0.9, sample_ratio=1.0):
        # load data from wmt_news
        zh_en_dict = load_json(filtered_union_zh_en_dict_path)
        en_zh_dict = load_json(filtered_union_en_zh_dict_path)

        data = []
        for zh, val in zh_en_dict.items():
            if not val or 'translation' not in val or not val['translation']:
                continue

            for en in val['translation']:
                data.append([zh, en])
                data.append([en, zh])

        for en, val in en_zh_dict.items():
            if not val or 'translation' not in val or not val['translation']:
                continue

            for zh in val['translation']:
                data.append([zh, en])
                data.append([en, zh])

        # TODO: remove duplicates

        # reproduce the shuffling that nmt performs when building its train set
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # sample data
        data = self.sample_data(data, sample_ratio)

        # split dataset
        data = self.__split_data(data, start_ratio, end_ratio)

        self.__src_data, self.__tar_data = list(zip(*data))
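A possible sketch of the two helpers called above, inferred only from how they are used (sample_data keeps a fraction of the shuffled pairs, __split_data slices a ratio range); this is an assumption, not the repository's actual code:

    def sample_data(self, data, sample_ratio):
        # assumed behavior: keep only the first `sample_ratio` fraction of the shuffled pairs
        if sample_ratio >= 1.0:
            return data
        return data[:int(len(data) * sample_ratio)]

    def __split_data(self, data, start_ratio, end_ratio):
        # assumed behavior: slice out the [start_ratio, end_ratio) portion, e.g. 0.0-0.9 for a train split
        start = int(len(data) * start_ratio)
        end = int(len(data) * end_ratio)
        return data[start:end]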
Example #2
 def loader():
     if not Indexer._indexer:
         Indexer._indexer = load_binary(Indexer.index_path)
     if not Indexer._download:
         Indexer._download = load_json(Indexer.download_path)
     if not Indexer._analysis:
         Indexer._analysis = load_json(Indexer.analysis_path)
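Every example on this page relies on small I/O helpers (load_json, save_json, load_binary, save_binary) that are defined elsewhere in the respective projects. A minimal sketch of what such helpers might look like, assuming UTF-8 JSON files and pickle for the binary index:

import json
import pickle


def load_json(path):
    # assumed helper: read a UTF-8 encoded JSON file into a Python object
    with open(path, 'r', encoding='utf8') as f:
        return json.load(f)


def save_json(obj, path):
    # assumed helper: write a Python object as UTF-8 JSON
    with open(path, 'w', encoding='utf8') as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)


def load_binary(path):
    # assumed helper: deserialize a pickled object (e.g. the index)
    with open(path, 'rb') as f:
        return pickle.load(f)


def save_binary(obj, path):
    # assumed helper: serialize an object with pickle
    with open(path, 'wb') as f:
        pickle.dump(obj, f)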
Example #3
    def update(self, t_weight=6, d_weight=3, c_weight=1, forced=False):
        indexer = load_binary(self.index_path)
        indexer_readable = load_json(self.index_readable_path)
        download = load_json(self.download_path)
        analysis = load_json(self.analysis_path)

        total_count = Counter()
        id_noun_count = {}

        for video_id, content in download.items():
            title = analysis[video_id]['nouns']['title'].split() * t_weight
            description = analysis[video_id]['nouns']['description'].split() * d_weight
            caption = analysis[video_id]['nouns']['caption'].split() * c_weight

            state = content.get('state')

            if state == 'update' or forced:
                self._delete_item(indexer, video_id)
                self._add_item(indexer, video_id,
                               title + description + caption)
                download[video_id]['state'] = 'complete'
            elif state == 'new':
                self._add_item(indexer, video_id,
                               title + description + caption)
                download[video_id]['state'] = 'complete'

        save_binary(indexer, self.index_path)
        save_json(indexer, self.index_readable_path)
        save_json(download, self.download_path)
        self._indexer = load_binary(self.index_path)
Example #4
def update(forced=False):

    download_path = path['download']
    analysis_path = path['analysis']
    doc2vec_src_path = path['doc2vec']['src']
    doc2vec_model_path = path['doc2vec']['model']

    download = load_json(download_path)
    analysis = load_json(analysis_path)

    model = gensim.models.doc2vec.Doc2Vec
    model = model.load(doc2vec_model_path)
    corpus = list(CorpusGensim(doc2vec_src_path))

    for video_id, content in download.items():
        state = content.get('state', 'new')
        if state == 'new' or state == 'update' or forced:
            words = analysis[video_id]['nouns']['all'].split()
            dt = get_similarity(corpus, model, words, 0, 0.1) + words
            dr = get_similarity(corpus, model, words, 0.9, 1)  # + words
            keyword = keywords(words, dt, dr).get_keywords(10, 2)

            analysis[video_id]['keywords'] = ' '.join(keyword)

    save_json(analysis, analysis_path)
Example #5
    def test_no_overlap(self):

        train_meta = utils.load_json(config.TRAIN_METADATA_PATH)
        valid_meta = utils.load_json(config.VAL_METADATA_PATH)
        test_meta = utils.load_json(config.TEST_METADATA_PATH)

        train_videos = list(train_meta.keys())
        valid_videos = list(valid_meta.keys())
        test_videos = list(test_meta.keys())

        self.assertTrue(set(train_videos).isdisjoint(valid_videos))
        self.assertTrue(set(train_videos).isdisjoint(test_videos))
        self.assertTrue(set(valid_videos).isdisjoint(test_videos))
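These assertions hinge on set.isdisjoint, which returns True only when the two collections share no element; a quick illustration with made-up video ids:

# set.isdisjoint returns True when there is no overlap at all
assert {'vid_a', 'vid_b'}.isdisjoint({'vid_c'})        # no shared ids -> disjoint
assert not {'vid_a', 'vid_b'}.isdisjoint({'vid_b'})    # 'vid_b' appears in both -> not disjoint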
Example #6
def main2():
    # ./module-table-analysis.py -f /home/qinshulei/projects/huawei/githubs/test_result_dict.json -t /home/qinshulei/projects/huawei/githubs/test-definitions
    # test_dir = '/home/qinshulei/projects/huawei/githubs/test-definitions'
    # result_file = '/home/qinshulei/projects/huawei/githubs/test_result_dict.json'
    # generate_module_dict(result_json_dict, test_dir)
    # get args
    parser = argparse.ArgumentParser(prog='PROG')
    parser.add_argument('-f',
                        '--file',
                        required=True,
                        help='The data file path to load.')
    parser.add_argument('-t',
                        "--testDir",
                        required=True,
                        help="specific test case dir")

    # TODO : save result to a file
    parser.add_argument('-o',
                        '--output_file',
                        help='allow output the result to a file')
    config = vars(parser.parse_args())
    test_dir = config.get("testDir")
    # test_result_dict.json
    result_file = config.get("file")
    result_json_dict = utils.load_json(result_file)
    # job_result_dict = result_json_dict
    module_dict = generate_module_dict(result_json_dict, test_dir)
    print(print_scope_result(module_dict))
Example #7
def traverse_dict_and_merge(_dict_dir, _merged_dict):
    for file_name in os.listdir(_dict_dir):
        file_path = os.path.join(_dict_dir, file_name)

        print(f'\nloading dictionary from {file_path} ...')

        tmp_dict = load_json(file_path)

        print(f'merging dict {file_name} ...')

        mode = 0 if '_v_all' not in file_name else 1
        # if mode == 1:
        #     continue

        length = len(tmp_dict)
        i = 0
        for key, val in tmp_dict.items():
            if i % 50 == 0:
                progress = float(i + 1) / length * 100.
                print('\rprogress: %.2f%% ' % progress, end='')

            _merged_dict = __merge_dict(_merged_dict, key, val, mode)
            i += 1

    return _merged_dict
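An illustrative invocation, assuming a directory of per-file dictionaries and the write_json(path, obj) helper seen in the other examples (both paths are hypothetical):

merged_dict = traverse_dict_and_merge('data/dictionaries', {})   # hypothetical input directory
write_json('data/merged_dict.json', merged_dict)                 # cache the merged result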
Example #8
def build_unpaired_dataset(db_path, batch_size, training=True):

    def preprocess(fake_noise, fake_label, root, real_images, training):
        fake_noise = tf.strings.join([root, fake_noise], '/')
        fake_label = tf.strings.join([root, fake_label], '/')
        # wrap np.random.choice in a callable so a fresh real image is drawn for each element
        real_image = tf.py_function(lambda: np.random.choice(real_images), [], tf.string)
        real_image = tf.strings.join([root, real_image], '/')
        fake_noise = read_image(fake_noise)
        fake_label = read_image(fake_label)
        real_image = read_image(real_image)
        if training:
            fake_noise, fake_label = augument_image(fake_noise, fake_label)
        return fake_noise, real_image, fake_label

    db = load_json(db_path)
    ds = tf.data.Dataset.from_tensor_slices((db['fake_input'], db['fake_label']))
    if training:
        ds = ds.shuffle(SHUFFLE_BUFFER_SIZE)
    ds = ds.map(functools.partial(preprocess, 
                                  root=db['root'], 
                                  real_images=db['real_image'],
                                  training=training),
                num_parallel_calls=NUM_PARALLEL_CALLS)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(PREFETCH_BUFFER_SIZE)
    return ds
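A minimal way to consume the resulting tf.data pipeline, assuming the module-level constants and the JSON manifest at db_path exist (the path and batch size below are illustrative):

ds = build_unpaired_dataset('data/unpaired_db.json', batch_size=8, training=True)
for fake_noise, real_image, fake_label in ds.take(2):
    # each element is a batch of decoded image tensors
    print(fake_noise.shape, real_image.shape, fake_label.shape)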
Example #9
    def __load_test(self):
        """ load test data """
        print('\nStart loading test data ...')

        if os.path.isfile(self.__test_emb_pkl_path):
            self.__test_X, self.__test_y, _, _ = load_pkl(
                self.__test_emb_pkl_path)

        else:
            print('loading test doc list ...')

            # load the doc_list
            emb_json_path = os.path.join(path.PATH_TMP_DIR,
                                         'emb_test_data.json')
            if os.path.isfile(emb_json_path):
                docs = load_json(emb_json_path)
            else:
                path_list = self.__get_path_list('test')
                docs = self.__load_docs(path_list, emb_json_path)

            print('converting test docs to trainable test data format ...')

            # convert the doc list to trainable data format
            self.__test_X, self.__test_y = self.__convert(
                docs, self.__test_emb_pkl_path)

        print('Finish loading test data')
    def __load_from_dict():
        # load data from files
        # zh_en_dict = load_json(filtered_pos_union_en_zh_dict_path)
        zh_en_dict = load_json(filtered_pos_union_zh_en_dict_path)
        zh_en_list = list(
            filter(lambda x: 'translation' in x[1] and x[1]['translation'],
                   zh_en_dict.items()))
        zh_en_list = list(
            map(
                lambda x: [[x[0]] * len(x[1]['translation']), x[1][
                    'translation']], zh_en_list))
        # data = reduce(lambda x, y: [x[0] + y[0], x[1] + y[1]], zh_en_list)

        zh_data = []
        en_data = []
        length = len(zh_en_list)
        for i, val in enumerate(zh_en_list):
            if i % 50 == 0:
                progress = float(i + 1) / length * 100.
                print('\rprogress: %.2f%% ' % progress, end='')

            zh_data += val[0]
            en_data += val[1]

        return list(zip(zh_data, en_data))
Example #11
def get_board_type(directory, filename):
    strinfo = re.compile('.txt')
    json_name = strinfo.sub('.json', filename)
    test_info = utils.load_json(os.path.join(directory, json_name))
    if 'board' in test_info.keys():
        # for dummy-ssh board
        board_type = ''
        try:
            if re.search('ssh', test_info['board_instance']):
                board_type = test_info['board_instance'].split('_')[0]
            else:
                board_verify = test_info['board'].split(',')[0]
                for key in device_map.keys():
                    if device_map[key][0] == board_verify:
                        board_type = key
                        break
                    else:
                        board_type = ''
        except KeyError:
            try:
                board_verify = test_info['board'].split(',')[0]
            except:
                board_verify = test_info['board']
            for key in device_map.keys():
                if device_map[key][0] == board_verify:
                    board_type = key
                    break
                else:
                    board_type = ''
        return board_type
    return ''
Example #12
 def __init__(self, file_path):
     self.file_path = file_path
     self._data = utils.load_json(file_path)
     self._data["save_directory"] = os.path.normpath(
         self._data["save_directory"])
     self._data["users"] = list(dict.fromkeys(self._data["users"]))
     self._data["bookmarks"] = list(dict.fromkeys(self._data["bookmarks"]))
Example #13
    def __init__(self,
                 start_ratio=0.0,
                 end_ratio=0.98,
                 _sample_rate=1.0,
                 data_params={},
                 tokenizer_pl=[],
                 encoder_pl=[],
                 _tokenizer_dir='cdlm',
                 _dataset='cdlm'):
        # initialize variables
        self.__data_params = data_params
        self.__tokenizer_pl = tokenizer_pl
        self.__encoder_pl = encoder_pl
        self.__sample_rate = _sample_rate

        self.__tokenizer_path = os.path.join(
            create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
        self.__processed_dir_path = create_dir(data_dir, 'preprocessed',
                                               _dataset)

        # load data from files
        # zh_en_dict = load_json(filtered_pos_union_en_zh_dict_path)
        zh_en_dict = load_json(filtered_pos_union_zh_en_dict_path)
        zh_en_list = list(
            filter(lambda x: 'translation' in x[1] and x[1]['translation'],
                   zh_en_dict.items()))
        zh_en_list = list(
            map(
                lambda x: [[x[0]] * len(x[1]['translation']), x[1][
                    'translation']], zh_en_list))
        # data = reduce(lambda x, y: [x[0] + y[0], x[1] + y[1]], zh_en_list)

        zh_data = []
        en_data = []
        length = len(zh_en_list)
        for i, val in enumerate(zh_en_list):
            if i % 50 == 0:
                progress = float(i + 1) / length * 100.
                print('\rprogress: %.2f%% ' % progress, end='')

            zh_data += val[0]
            en_data += val[1]

        data = list(zip(zh_data, en_data))

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get tokenizer
        if os.path.isfile(self.__tokenizer_path):
            self.__tokenizer = load_pkl(self.__tokenizer_path)
        else:
            self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
            self.get_tokenizer()

        # get the data set (train or validation or test)
        data = self.__split_data(data, start_ratio, end_ratio)

        self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
Example #14
    def __load_train(self):
        """ load train data """
        print('\nStart loading train data')

        if os.path.isfile(self.__emb_pkl_path):
            self.__train_X, self.__train_y, self.dict, self.voc_size = load_pkl(
                self.__emb_pkl_path)

        else:
            print('loading doc list ...')

            # load the doc_list
            emb_json_path = os.path.join(path.PATH_TMP_DIR, 'emb_data.json')
            if os.path.isfile(emb_json_path):
                docs = load_json(emb_json_path)
            else:
                path_list = self.__get_path_list()
                docs = self.__load_docs(path_list, emb_json_path)

            print('generating dictionary ...')

            # generate the dictionary which maps the bond_id to index
            self.dict, self.voc_size = self.__gen_dict(docs)

            print('converting docs to trainable data format ...')

            # convert the doc list to trainable data format
            self.__train_X, self.__train_y = self.__convert(
                docs, self.__emb_pkl_path)

        print('Finish loading train data')
    def __gen_topics_mask(self):
        topics = utils.load_json(path.TOPIC_BONDS_JSON)
        topics = self.dict.doc2idx(topics)
        while -1 in topics:
            topics.remove(-1)

        self.__topic_mask = self.__2_sum_one_hot(topics)
        self.__topic_mask[self.__topic_mask > 0] = 1
    def __load_dir(dir_path):
        """
        Load all the data in "dir_path" and fill in empty entries for dates on which no transaction happened
        :return
            data: (list)
            e.g. [ # include transactions happen in many days
                ['bond_a', 'bond_b', ...], # represent transaction happen in one day
                ['bond_a', 'bond_b', ...],
                ...
            ]
        """
        data = []

        # load the date list
        date_list = os.listdir(dir_path)
        date_list.sort()

        # generate a date dict so that we can check whether any transaction happened on a given date
        date_dict = utils.list_2_dict(date_list)

        # find out the start and end date of all the transactions
        start_date = date_list[0][len('doc_'):-len('.json')]
        end_date = date_list[-1][len('doc_'):-len('.json')]

        # convert the dates to timestamps
        cur_timestamp = utils.date_2_timestamp(start_date)
        end_timestamp = utils.date_2_timestamp(end_date) + 86000

        # traverse all the dates between the start date and the end date, skipping holidays
        while cur_timestamp < end_timestamp:
            _date = utils.timestamp_2_date(cur_timestamp)
            file_name = f'doc_{_date}.json'

            # check if there is any transaction
            if file_name in date_dict:
                file_path = os.path.join(dir_path, file_name)

                # remove nan in doc
                tmp_doc = list(
                    map(lambda x: x if isinstance(x, str) else '',
                        utils.load_json(file_path)))
                while '' in tmp_doc:
                    tmp_doc.remove('')

                data.append(tmp_doc)

            # if it is a holiday, skip it
            elif date.is_holiday(_date):
                pass

            # if no transaction happened on that date
            else:
                data.append([])

            # move to the next day
            cur_timestamp += 86400

        return data
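utils.date_2_timestamp and utils.timestamp_2_date are assumed to round-trip between date strings and POSIX timestamps; a plausible sketch (the real utils module may use a different date format than the 'YYYY-MM-DD' assumed here):

from datetime import datetime


def date_2_timestamp(date_str):
    # assumed helper: 'YYYY-MM-DD' string -> POSIX timestamp in seconds
    return int(datetime.strptime(date_str, '%Y-%m-%d').timestamp())


def timestamp_2_date(timestamp):
    # assumed helper: POSIX timestamp -> 'YYYY-MM-DD' string
    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')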
Example #17
def get_board_instance(directory, filename):
    strinfo = re.compile('.txt')
    json_name = strinfo.sub('.json', filename)
    #with open(os.path.join(directory, json_name), "r") as lines:
    test_info = utils.load_json(os.path.join(directory, json_name))
    if 'board_instance' in test_info.keys():
        board_instance = test_info['board_instance']
        return board_instance
    return ''
Example #19
def normalizing(forced=False):

    download_path = path['download']
    analysis_path = path['analysis']
    normed_path = path['norm']

    download = load_json(download_path)
    analysis = load_json(analysis_path)

    with open(normed_path, 'w', encoding='utf8') as f:
        for video_id, content in download.items():
            state = content.get('state', 'new')
            if state == 'new' or state == 'update' or forced:

                norm_title = normalize(content.get('title', '').lower(),
                                       english=True,
                                       number=True,
                                       punctuation=False,
                                       remains={'+', '#'})
                norm_description = normalize(content.get('description',
                                                         '').lower(),
                                             english=True,
                                             number=True,
                                             punctuation=False,
                                             remains={'+', '#'})
                norm_caption = normalize(content.get('caption', '').lower(),
                                         english=True,
                                         number=True,
                                         punctuation=False,
                                         remains={'+', '#'})
                f.write('{}\n'.format(norm_title + norm_description +
                                      norm_caption))

                analysis[video_id] = {}

                norm = {}

                norm['title'] = norm_title
                norm['description'] = norm_description
                norm['caption'] = norm_caption
                norm['trackKind'] = content.get('trackKind', '')
                analysis[video_id]['norm'] = norm
        save_json(analysis, analysis_path)
def main(args):

    if args.subset == constants.TRAIN:
        root = config.TRAIN_SOUND_ROOT
        cls_dirs = True
    elif args.subset == constants.VALID:
        root = config.VALID_SOUND_ROOT
        cls_dirs = True
    elif args.subset == constants.TEST:
        root = config.TEST_SOUND_ROOT
        cls_dirs = False
    else:
        raise ValueError("Invalid subset.")

    convert_to_tfrecords(utils.load_json(args.meta_path),
                         utils.load_json(args.classes_path),
                         root,
                         args.save_path,
                         args.sampling_rate,
                         class_dirs=cls_dirs)
Example #21
def parse_json(json):
    jobs = utils.load_json(json)
    url = utils.validate_input(jobs['username'], jobs['token'], jobs['server'])
    connection = utils.connect(url)
    duration = jobs['duration']
    # Remove unused data
    jobs.pop('duration')
    jobs.pop('username')
    jobs.pop('token')
    jobs.pop('server')
    return connection, jobs, duration
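An illustrative call with a hypothetical jobs file; after the connection fields are popped, the returned jobs dict holds only the job definitions themselves:

connection, jobs, duration = parse_json('jobs.json')   # hypothetical input file
print('submitting {} job(s), duration={}'.format(len(jobs), duration))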
Example #22
def load_branches(json_file):
    """This function loads branches' info from a Json file and dumps it
    into a list of dictionaries

    :param filename: File to be loaded
    :return: List of dictionaries
    """
    logger.info("Loading branches from file %s", json_file)
    repo_dict = load_json(json_file)
    logger.info("Branches loaded")
    return repo_dict
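Illustrative usage, assuming a JSON file of branch dictionaries produced earlier in the pipeline (the file name and the 'name' key are hypothetical):

branches = load_branches('branches.json')
for branch in branches:
    print(branch.get('name'))   # 'name' is an assumed key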
Example #24
def update(forced=False):
    def get_nouns(text):
        nouns = komoran3.nouns(text)
        return [
            noun for noun in nouns if len(noun) > 1 and not noun.isnumeric()
        ]

    user_dict_path = path['user_dictionary']

    komoran3 = Komoran('./lib/komoran/komoran/models',
                       './lib/komoran/komoran/libs')
    komoran3.set_user_dictionary(user_dict_path)

    download_path = path['download']
    analysis_path = path['analysis']

    download = load_json(download_path)
    analysis = load_json(analysis_path)

    for video_id, content in download.items():
        state = content.get('state', 'new')
        if state == 'new' or state == 'update' or forced:

            norm = analysis[video_id]['norm']

            nouns = {}
            nouns['title'] = ' '.join(get_nouns(norm.get('title', '')))
            nouns['description'] = ' '.join(
                get_nouns(norm.get('description', '')))
            nouns['caption'] = ' '.join(get_nouns(norm.get('caption', '')))
            nouns['all'] = nouns['title'] + ' ' + nouns[
                'description'] + ' ' + nouns['caption']

            analysis[video_id]['nouns'] = nouns

    save_json(analysis, analysis_path)
Example #25
def get_board_type(directory, filename):
    strinfo = re.compile('.txt')
    json_name = strinfo.sub('.json', filename)
    test_info = utils.load_json(os.path.join(directory, json_name))
    if 'board' in test_info.keys():
        # for dummy-ssh board
        board_type = ''
        try:
            if re.search('ssh', test_info['board_instance']):
                board_type = test_info['board_instance'].split('_')[0]
            else:
                if ',' in test_info['board']:
                    board_verify = test_info['board'].split(',')[0]
                    for key in device_map.keys():
                        if device_map[key][0] == board_verify:
                            board_type = key
                            break
                        else:
                            board_type = ''
                else:
                    # for dummy_ssh_{board_type}
                    board_type = test_info['board'].split('_')[-1]
        except KeyError:
            if ',' in test_info['board']:
                try:
                    board_verify = test_info['board'].split(',')[0]
                except:
                    board_verify = test_info['board']
                for key in device_map.keys():
                    if device_map[key][0] == board_verify:
                        board_type = key
                        break
                    else:
                        board_type = ''
            else:
                # for boards like dummy_ssh_{board_type}
                board_type = test_info['board'].split('_')[-1]
        return board_type
    return ''
Example #26
def build_paired_dataset(db_path, batch_size, training=True):

    def preprocess(noise, label, root, training):
        noise = tf.strings.join([root, noise], '/')
        label = tf.strings.join([root, label], '/')
        noise = read_image(noise)
        label = read_image(label)
        if training:
            noise, label = augument_image(noise, label)
        return noise, label


    db = load_json(db_path)
    ds = tf.data.Dataset.from_tensor_slices((db['noise'], db['label']))
    if training:
        ds = ds.shuffle(SHUFFLE_BUFFER_SIZE)
    ds = ds.map(functools.partial(preprocess, 
                                  root=db['root'], 
                                  training=training),
                num_parallel_calls=NUM_PARALLEL_CALLS)
    ds = ds.batch(batch_size)
    return ds
def main(args):

    metadata = utils.load_json(args.train_metadata)

    r_avg = utils.StreamingAverage()
    g_avg = utils.StreamingAverage()
    b_avg = utils.StreamingAverage()

    for video_id, cls in metadata.items():

        video_folder_path = os.path.join(config.TRAIN_FRAMES_ROOT,
                                         utils.class_name_to_dir_name(cls),
                                         video_id)

        frame_paths = [
            os.path.join(video_folder_path, frame_path)
            for frame_path in os.listdir(video_folder_path)
        ]

        for frame_path in frame_paths:

            frame = cv2.imread(frame_path)

            assert len(frame.shape) == 3
            assert frame.shape[-1] == 3

            # opencv loads images in BGR
            b = np.mean(frame[..., 0])
            g = np.mean(frame[..., 1])
            r = np.mean(frame[..., 2])

            r_avg.add(r)
            g_avg.add(g)
            b_avg.add(b)

    means = [r_avg.avg, g_avg.avg, b_avg.avg]
    np.save(args.save_path, means)
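utils.StreamingAverage is not shown; a minimal running-mean class with the add/avg interface used above could look like this (an assumption, not the project's actual implementation):

class StreamingAverage:
    """Running mean that never stores all the samples."""

    def __init__(self):
        self.avg = 0.0
        self.count = 0

    def add(self, value):
        # incremental mean update: avg += (x - avg) / n
        self.count += 1
        self.avg += (value - self.avg) / self.count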
Example #28
def gen_group_according_to(file_path):
    print('loading data ...')
    dict_dealer_index_2_group = utils.load_json(file_path)

    data, d_dealers, total_volume, total_transaction_count, bound_timestamp, d_new_bonds = utils.load_pkl(
        os.path.join(path.ROOT_DIR, 'runtime', 'tmp123.pkl'))

    utils.write_pkl(
        os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'), d_dealers)
    # d_dealers = utils.load_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'))

    labels = set(list(map(lambda x: x[1], dict_dealer_index_2_group.items())))
    group_list = [{} for i in range(len(labels))]

    print('traversing data ...')

    length = len(d_dealers)
    cur = 0
    for dealer_index, trace_list in d_dealers.items():
        # show progress
        if cur % 5 == 0:
            progress = float(cur + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')
        cur += 1

        if dealer_index not in dict_dealer_index_2_group:
            continue

        group_index = dict_dealer_index_2_group[dealer_index]
        group_list[group_index][dealer_index] = trace_list

    print('\rprogress: 100.0%  \nsaving data ...')

    plan_name = os.path.splitext(os.path.split(file_path)[1])[0] + '.json'
    group_path = os.path.join(path.DATA_ROOT_DIR, 'groups', plan_name)
    utils.write_json(group_path, group_list)
Example #29
    def __load_docs(path_list, emb_json_path):
        """ load all the data from the path list """
        docs = []
        length = len(path_list)

        # traverse the path list to load all the data
        for i, _path in enumerate(path_list):
            # show progress
            if i % 5 == 0:
                progress = float(i + 1) / length * 100.
                print('\rprogress: %.2f%% ' % progress, end='')

            # remove nan in doc
            tmp_doc = list(
                map(lambda x: x
                    if isinstance(x, str) else '', load_json(_path)))
            while '' in tmp_doc:
                tmp_doc.remove('')

            docs.append(tmp_doc)

        # cache data for faster processing next time
        write_json(emb_json_path, docs)
        return docs
Example #30
def main(args):

  # load video classes
  classes = utils.load_json(config.CLASSES_PATH)

  # load lists of videos
  train_metadata = utils.load_json(config.TRAIN_METADATA_PATH)
  val_metadata = utils.load_json(config.VALID_METADATA_PATH)
  test_metadata = utils.load_json(config.TEST_METADATA_PATH)

  num_found = 0
  total = 0

  total_train_present = 0
  total_train_missing = 0

  total_val_present = 0
  total_val_missing = 0

  # load subset
  subset = None
  if args.subset:
    subset = utils.load_json(args.subset)

  # count train and validation videos
  for cls in classes:

    if subset is not None and cls not in subset:
      continue

    total += 1

    cls_train_path = os.path.join(config.TRAIN_ROOT, cls.replace(" ", "_"))
    cls_valid_path = os.path.join(config.VALID_ROOT, cls.replace(" ", "_"))

    train_found = False
    valid_found = False

    if os.path.isdir(cls_train_path):
      train_present, train_missing = count_present_and_missing(cls, cls_train_path, train_metadata)
      train_found = True
      total_train_present += train_present
      total_train_missing += train_missing

    if os.path.isdir(cls_valid_path):
      valid_present, valid_missing = count_present_and_missing(cls, cls_valid_path, val_metadata)
      valid_found = True
      total_val_present += valid_present
      total_val_missing += valid_missing

    if train_found or valid_found:
      num_found += 1

      if args.details:
        print("class {}".format(cls))

        if train_found:
          print("train: {} / {}".format(train_present, train_present + train_missing))

        if valid_found:
          print("valid: {} / {}".format(valid_present, valid_present + valid_missing))

        print()

  # count test videos
  test_present, test_missing = count_present_and_missing(None, config.TEST_ROOT, test_metadata)

  # print
  train_percent_found = 0
  if total_train_present > 0:
    train_percent_found = (total_train_present * 100) / (total_train_present + total_train_missing)

  valid_percent_found = 0
  if total_val_present > 0:
    valid_percent_found = (total_val_present * 100) / (total_val_present + total_val_missing)

  test_percent_found = 0
  if test_present > 0:
    test_percent_found = (test_present * 100) / (test_present + test_missing)

  print("class stats:")
  print("\t{:d} / {:d} classes found".format(num_found, total))

  print()

  print("video stats (only for found classes):")
  print("\t{:d} / {:d} ({:.2f}%) train videos found".format(
    total_train_present, total_train_present + total_train_missing, train_percent_found))
  print("\t{:d} / {:d} ({:.2f}%) valid videos found".format(
    total_val_present, total_val_present + total_val_missing, valid_percent_found))
  print("\t{:d} / {:d} ({:.2f}%) test videos found".format(
    test_present, test_present + test_missing, test_percent_found))
Example #31
        if dealer_index not in dict_dealer_index_2_group:
            continue

        group_index = dict_dealer_index_2_group[dealer_index]
        group_list[group_index][dealer_index] = trace_list

    print('\rprogress: 100.0%  \nsaving data ...')

    plan_name = os.path.splitext(os.path.split(file_path)[1])[0] + '.json'
    group_path = os.path.join(path.DATA_ROOT_DIR, 'groups', plan_name)
    utils.write_json(group_path, group_list)


# gen_group_according_to(os.path.join(
#     path.ROOT_DIR,
#     'group',
#     # 'group_K-means_without_original_stat.json'
#     # 'group_Spectral_Clustering_without_original_stat_with_model_input_features.json'
#     # 'group_K-means_filter_lower_5.json'
#     'group_Spectral_Clustering_filter_lower_5_with_model_input_features.json'
# ))
#
# print('done')

_path = r'D:\Data\share_mine_laptop\community_detection\data\groups\group_Spectral_Clustering_filter_lower_5_with_model_input_features.json'
data = utils.load_json(_path)

for i, v in enumerate(data):
    print(i, len(v))
Example #32
def caption_download():
    if 'credentials' not in session:
        return redirect('authorize')
    credentials = google.oauth2.credentials.Credentials(
        **session['credentials'])

    youtube = googleapiclient.discovery.build(API_SERVICE_NAME,
                                              API_VERSION,
                                              credentials=credentials)

    channel_id = request.args.get('channel_id')
    resume = request.args.get('resume', 0)
    resume = int(resume) if resume else 0

    max_results = 50

    try:
        # 1. extract page info
        search_list = youtube.search().list(part='id',
                                            channelId=channel_id,
                                            maxResults=1,
                                            type='video',
                                            fields='items,pageInfo').execute()
    except Exception as e:
        print(e)
        return jsonify({'error': str(e)})

    caption_count = 0
    total_results = search_list['pageInfo']['totalResults']
    sys.stdout.write('\rprogress... {:4}/{:4} '.format(caption_count,
                                                       total_results))

    pages = total_results // max_results + 1
    next_page_token = ''

    caption_time_pattern = re.compile(
        r'\d:\d\d:\d\d\.\d\d\d,\d:\d\d:\d\d\.\d\d\d')

    for page in range(pages):
        # 2. extract video id, title, and description
        search_list = youtube.search().list(
            part='snippet',
            channelId=channel_id,
            maxResults=max_results,
            pageToken=next_page_token,
            type='video',
            fields='items(etag,id,snippet(description,title)),nextPageToken'
        ).execute()

        next_page_token = search_list.get('nextPageToken', None)

        for item in search_list['items']:
            caption_count += 1

            if caption_count < resume:
                sys.stdout.write('\rprogress... {:4}/{:4} '.format(
                    caption_count, total_results))
                continue

            caption_dump = load_json(download_path)

            video_id = get_depth_dict(item, ('id', 'videoId'), None)
            if not video_id:
                print('error : There is no video_id.')
                sys.stdout.write('\rprogress... {:4}/{:4} '.format(
                    caption_count, total_results))
                continue
            title = get_depth_dict(item, ('snippet', 'title'), None)
            description = get_depth_dict(item, ('snippet', 'description'),
                                         None)

            # 3. extract caption id
            caption_list = youtube.captions().list(
                part='snippet',
                videoId=video_id,
                fields='items(etag,id,snippet(language,trackKind,lastUpdated))'
            ).execute()

            caption_kind = {}
            for item in caption_list['items']:
                track_kind = item['snippet']['trackKind']
                caption_kind[track_kind] = {
                    'id': item['id'],
                    'lastUpdated': item['snippet']['lastUpdated']
                }

            # 4. pick the preferred trackKind
            caption_id = None
            track_kind_preference = ['standard', 'ASR', 'forced']
            for preference in track_kind_preference:
                if caption_kind.get(preference, None):
                    caption_id = caption_kind[preference]['id']
                    new_last_updated = caption_kind[preference]['lastUpdated']
                    track_kind = preference
                    break

            if not caption_id:
                print('error : There is no caption. video_id : ', video_id)
                update_info = {
                    'title': title,
                    'description': description,
                    'error': 'There is no caption.'
                }
                if caption_dump.get(video_id, None):
                    update_info['state'] = 'update'
                else:
                    update_info['state'] = 'new'
                caption_dump[video_id] = update_info
                save_json(caption_dump, download_path)
                sys.stdout.write('\rprogress... {:4}/{:4} '.format(
                    caption_count, total_results))
                continue

            # 5. check caption lastUpdated
            old_last_updated = get_depth_dict(caption_dump,
                                              (video_id, 'lastUpdated'), None)
            if old_last_updated and new_last_updated == old_last_updated:
                sys.stdout.write('\rprogress... {:4}/{:4} '.format(
                    caption_count, total_results))
                continue

            #6. download
            try:
                caption = youtube.captions().download(
                    id=caption_id, tfmt='sbv').execute().decode("utf-8")
            except Exception as e:
                print(e)
                print('video_id : ', video_id)
                update_info = {
                    'title': title,
                    'description': description,
                    'error': str(e)
                }
                if caption_dump.get(video_id, None):
                    update_info['state'] = 'update'
                else:
                    update_info['state'] = 'new'
                caption_dump[video_id] = update_info
                save_json(caption_dump, download_path)
                sys.stdout.write('\rprogress... {:4}/{:4} '.format(
                    caption_count, total_results))
            else:
                caption = caption_time_pattern.sub('', caption)
                update_info = {
                    'lastUpdated': new_last_updated,
                    'trackKind': track_kind,
                    'caption': caption,
                    'title': title,
                    'description': description
                }
                if caption_dump.get(video_id, None):
                    update_info['state'] = 'update'
                else:
                    update_info['state'] = 'new'
                caption_dump[video_id] = update_info
                save_json(caption_dump, download_path)
                sys.stdout.write('\rprogress... {:4}/{:4} '.format(
                    caption_count, total_results))

            time.sleep(0.5)

    return redirect(url_for('index', _external=True))
Example #33
filtered_en_ro_dict_path = os.path.join(dictionary_dir, 'filtered_en_ro_merged.json')

delete_ro_keys = []
delete_en_keys = []


def __check_has_val(val):
    for k, l in val.items():
        if l:
            return True
    return False


print('\nloading ro_en_dict ...')

ro_en_dict = load_json(merged_ro_en_dict_path)
ro_en_dict = filter_duplicate(ro_en_dict)

print('filtering ro_en_dict ...')

for ro, val in ro_en_dict.items():
    if 'translation' not in val:
        continue

    translations = val['translation']
    translations = list(filter(lambda x: x in en_word_dict, translations))

    if not translations:
        del ro_en_dict[ro]['translation']

        if not __check_has_val(ro_en_dict[ro]):
Example #34
 def __init__(self, file_path):
     self.file_path = file_path
     self._data = utils.load_json(file_path)
     self._data['save_directory'] = os.path.normpath(self._data['save_directory'])
     self._data['users'] = list(dict.fromkeys(self._data['users']))