Пример #1
0
    def evaluate(self,
                 prefix,
                 X1,
                 X2,
                 Y,
                 record_details=False,
                 names_1=None,
                 names_2=None):
        """Score the model on one data split and log the results.

        Calls ``self.model.test`` with ``(X1, X2)`` and ``Y`` and logs every
        ``indicator: score`` pair it returns under ``f'{prefix}_evaluation'``.
        When ``record_details`` is true, one JSON record per sample is logged
        as well (outcome, predicted label, ground truth and the two names).

        Args:
            prefix: Label of the split; used to build the evaluation log key.
            X1: First element of the input pair handed to the model.
            X2: Second element of the input pair handed to the model.
            Y: Ground-truth labels, indexable by sample position.
            record_details: Whether to log a record for every sample.
            names_1: Optional sequence aligned with the samples; ``None``
                is logged for every sample when it is omitted.
            names_2: Optional sequence aligned with the samples; ``None``
                is logged for every sample when it is omitted.
        """
        ret = self.model.test((X1, X2), Y)

        logs.new_line(True)
        for indicator, score in ret.items():
            logs.add(self.M.name, f'{prefix}_evaluation',
                     f'{indicator}: {score}', logs.LEVEL_RET, True)

        if not record_details:
            return

        # Predict on the inputs that were passed in so the predictions line up
        # with Y; predicting on the stored test split would compare unrelated
        # samples whenever another split is being evaluated.
        predict_y = self.model.predict_label((X1, X2))

        logs.new_line(True)
        for i, v in enumerate(predict_y):
            truth = Y[i]
            # NOTE(review): the log key stays 'test_samples' for every prefix
            # so existing log consumers keep working.
            logs.add(
                self.M.name, 'test_samples',
                json.dumps({
                    'ret': "success" if v == truth else "fail",
                    'predict': int(v),
                    'ground_truth': int(truth),
                    # The name arguments default to None; do not index them then
                    'name_1': names_1[i] if names_1 is not None else None,
                    'name_2': names_2[i] if names_2 is not None else None,
                }), logs.LEVEL_DETAIL, True)
    def __init__(self):
        """Load the train, validation and test splits and log their shapes.

        Three ``Loader`` objects are built from ``self.M.data_params``: one
        for the range ``0 .. train_ratio``, one for
        ``train_ratio .. train_ratio + val_ratio`` and one for the remainder
        up to ``1.0``. Each loader's ``all()`` result is unpacked into five
        private attributes (X1, X2, Y, names_1, names_2).
        """
        params = self.M.data_params

        train_loader = Loader(params['neg_rate_train'],
                              0,
                              params['train_ratio'],
                              use_cache=False)
        val_loader = Loader(params['neg_rate_val'],
                            params['train_ratio'],
                            params['train_ratio'] + params['val_ratio'],
                            use_cache=False)
        test_loader = Loader(params['neg_rate_test'],
                             params['train_ratio'] + params['val_ratio'],
                             1.0,
                             use_cache=False)

        (self.__train_X1, self.__train_X2, self.__train_Y,
         self.__train_names_1, self.__train_names_2) = train_loader.all()
        (self.__val_X1, self.__val_X2, self.__val_Y,
         self.__val_names_1, self.__val_names_2) = val_loader.all()
        (self.__test_X1, self.__test_X2, self.__test_Y,
         self.__test_names_1, self.__test_names_2) = test_loader.all()

        logs.new_paragraph(True)

        # Only X1 is reported for the inputs of each split
        shapes = {
            'train_x': self.__train_X1.shape,
            'train_y': self.__train_Y.shape,
            'val_x': self.__val_X1.shape,
            'val_y': self.__val_Y.shape,
            'test_x': self.__test_X1.shape,
            'test_y': self.__test_Y.shape,
        }
        logs.add(self.M.name, 'data_shape', json.dumps(shapes),
                 logs.LEVEL_DATA, True)
def statistic():
    """Log and plot how many competitors each pair of competitors shares.

    Walks the module-level ``d_name_2_competitors`` mapping (name -> set of
    competitor names). For every unordered pair ``(name, competitor)`` that
    has not been counted yet it records the size of the intersection of the
    two competitor sets, or 0 when the competitor has no entry of its own.
    The mean/std/max/min of those counts are logged, and a histogram is
    saved under ``runtime/analysis/figures`` and shown.
    """
    print('\nstatistic the shared competitors for competitors ... ')

    # record statistics
    shared_competitor_counts = []

    # Unordered pairs that were already counted. Tuples are used as keys
    # because a joined string could collide for names containing the separator.
    counted_pairs = set()

    length = len(d_name_2_competitors)
    for _i, (_name_1, competitors) in enumerate(d_name_2_competitors.items()):
        if _i % 2 == 0:
            progress = float(_i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        for _name_2 in competitors:
            # remove duplicate statistic
            pair = (min(_name_1, _name_2), max(_name_1, _name_2))
            if pair in counted_pairs:
                continue
            counted_pairs.add(pair)

            if _name_2 not in d_name_2_competitors:
                shared_competitor_counts.append(0)
                continue

            shared_num = len(competitors.intersection(d_name_2_competitors[_name_2]))
            shared_competitor_counts.append(shared_num)

    logs.new_line()
    logs.add('statistics', 'total count of competitors companies', f'{len(d_name_2_competitors)}', output=True)
    for label, reducer in (('mean', np.mean), ('std', np.std), ('max', np.max), ('min', np.min)):
        logs.add('statistics', f'{label} of shared competitors',
                 f'among competitors: {reducer(shared_competitor_counts)}',
                 output=True)

    # NOTE(review): counts above the last bin edge (52) are not drawn
    bins = list(range(0, 53, 1))
    plt.figure(figsize=(18, 8))
    plt.hist(shared_competitor_counts, bins=bins, edgecolor='white')
    plt.title(
        'histogram for count of shared competitors among competitors',
        fontsize=22)
    plt.xlabel('count of shared competitors for each similar company pair', fontsize=16)
    plt.ylabel('count of company pairs', fontsize=16)
    plt.xticks(bins)
    plt.savefig(path_lib.get_relative_file_path(
        'runtime', 'analysis', 'figures', 'hist_for_shared_competitor_among_competitors.png'),
        dpi=300)
    plt.show()
    plt.close()
    def train(self, use_cache=True):
        """Instantiate the model, train it and log the elapsed training time.

        Args:
            use_cache: Forwarded unchanged as the third argument of
                ``self.model.train``.
        """
        building_msg = '\nBuilding model ({}) ...'.format(self.M.TIME)
        print(building_msg)
        self.model = self.M()

        print('\nTraining model ...')
        began_at = time.time()
        self.model.train(self.__X, self.__names, use_cache)
        train_time = time.time() - began_at
        print('\nFinish training')

        # The duration is logged in seconds, as returned by time.time()
        logs.add(self.M.name, 'training_time', f'{train_time}')
Пример #5
0
    def train(self):
        """Instantiate the model, train it with validation data and log the time.

        The model receives the training pair ``(X1, X2)`` with its labels,
        followed by the validation pair with its labels.
        """
        building_msg = '\nBuilding model ({}) ...'.format(self.M.TIME)
        print(building_msg)
        self.model = self.M()

        print('\nTraining model ...')
        began_at = time.time()
        train_inputs = (self.__train_X1, self.__train_X2)
        val_inputs = (self.__val_X1, self.__val_X2)
        self.model.train(train_inputs, self.__train_Y, val_inputs, self.__val_Y)
        train_time = time.time() - began_at
        print('\nFinish training')

        # The duration is logged in seconds, as returned by time.time()
        logs.add(self.M.name, 'training_time', f'{train_time}')
Пример #6
0
    def __init__(self):
        """Load the train, validation and test splits and log their shapes.

        A single cached ``Loader`` is built with the configured negative
        rate; its ``train_val_test`` call returns three 5-tuples
        (X1, X2, Y, names_1, names_2) that are stored in private attributes.
        """
        o_loader = Loader(negative_rate=self.M.data_params['neg_rate'],
                          use_cache=True)
        train_part, val_part, test_part = o_loader.train_val_test(
            self.M.data_params['train_ratio'],
            self.M.data_params['val_ratio'])

        (self.__train_X1, self.__train_X2, self.__train_Y,
         self.__train_names_1, self.__train_names_2) = train_part
        (self.__val_X1, self.__val_X2, self.__val_Y,
         self.__val_names_1, self.__val_names_2) = val_part
        (self.__test_X1, self.__test_X2, self.__test_Y,
         self.__test_names_1, self.__test_names_2) = test_part

        logs.new_paragraph(True)

        # Only X1 is reported for the inputs of each split
        shapes = {
            'train_x': self.__train_X1.shape,
            'train_y': self.__train_Y.shape,
            'val_x': self.__val_X1.shape,
            'val_y': self.__val_Y.shape,
            'test_x': self.__test_X1.shape,
            'test_y': self.__test_Y.shape,
        }
        logs.add(self.M.name, 'data_shape', json.dumps(shapes),
                 logs.LEVEL_DATA, True)
Пример #7
0
 def __log(self):
     """Log this instance's parameter dictionaries and output directories."""
     logs.new_line()

     # Each parameter dict is JSON-encoded and logged under its attribute name
     for attr in ('data_params', 'train_params', 'model_params',
                  'monitor_params'):
         logs.add(self.name, attr, json.dumps(getattr(self, attr)),
                  logs.LEVEL_PARAM, True)

     # (log key, attribute holding the path)
     for key, attr in (('model_dir', 'model_dir'),
                       ('tensorboard_dir', 'tb_dir')):
         logs.add(self.name, key, getattr(self, attr), logs.LEVEL_PATH, True)
Пример #8
0
def statistic(_top_k_similar):
    """Log and plot the shared competitors among each company's top-k neighbours.

    For every entry of the module-level ``names`` the last ``_top_k_similar``
    columns of its row in ``top_k_idx`` select the similar companies. For
    every unordered pair (company, similar company) that has not been counted
    yet, the size of the intersection of their competitor sets from
    ``d_name_2_competitors`` is recorded, or 0 when either company has no
    entry. The mean/std/max/min are logged and a histogram of the non-zero
    counts is saved under ``runtime/analysis/figures`` and shown.

    Args:
        _top_k_similar: Number of trailing columns of ``top_k_idx`` to use
            for each company.
    """
    # Keep the last k columns of every row and reverse them within the row.
    # A plain [::-1] would reverse the row axis instead and pair company i
    # with the neighbours of company len(names) - 1 - i.
    # assumes top_k_idx is 2-D with one row per entry of names - TODO confirm
    _top_k_idx = top_k_idx[:, -_top_k_similar:][:, ::-1]

    print(f'\nstatistic the shared competitors for top {_top_k_similar} similar companies of all Linkedin companies ... ')

    # record statistics
    shared_competitor_counts = []

    # Unordered pairs that were already counted. Tuples are used as keys
    # because a joined string could collide for names containing the separator.
    counted_pairs = set()

    length = len(names)
    for _i, _name_1 in enumerate(names):

        if _i % 2 == 0:
            progress = float(_i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        similar_names = names[_top_k_idx[_i]]

        for _name_2 in similar_names:

            # remove duplicate statistic
            pair = (min(_name_1, _name_2), max(_name_1, _name_2))
            if pair in counted_pairs:
                continue
            counted_pairs.add(pair)

            if _name_1 not in d_name_2_competitors or _name_2 not in d_name_2_competitors:
                shared_competitor_counts.append(0)
                continue

            competitor_set_1 = d_name_2_competitors[_name_1]
            competitor_set_2 = d_name_2_competitors[_name_2]

            shared_competitor_counts.append(
                len(competitor_set_1.intersection(competitor_set_2)))

    logs.new_line()
    logs.add('statistics', 'total count of companies', f'{len(names)}', output=True)
    for label, reducer in (('mean', np.mean), ('std', np.std), ('max', np.max), ('min', np.min)):
        logs.add('statistics', f'{label} of shared competitors',
                 f'among top {_top_k_similar} similar companies: {reducer(shared_competitor_counts)}',
                 output=True)

    # The zero bucket dominates, so it is reported in the title and left out
    # of the histogram itself.
    num_0 = shared_competitor_counts.count(0)
    shared_competitor_counts = [count for count in shared_competitor_counts if count > 0]

    plt.figure(figsize=(14, 8))
    plt.hist(shared_competitor_counts, bins=[0.1, 1, 2, 3, 4, 5, 10, 20, 40], edgecolor='white')
    plt.title(
        f'histogram for count of shared competitors among top {_top_k_similar} similar companies of all Linkedin companies\n(spike for ({num_0} zero shared competitors) is removed)',
        fontsize=22)
    plt.xlabel('count of shared competitors for each similar company pair', fontsize=16)
    plt.ylabel('count of company pairs', fontsize=16)
    plt.xticks([0, 1, 2, 3, 4, 5, 10, 20, 40])
    plt.savefig(
        path_lib.get_relative_file_path('runtime', 'analysis', 'figures',
                                        f'hist_for_shared_competitor_among_top_{_top_k_similar}_similar_companies.png'),
        dpi=300)
    plt.show()
    plt.close()