예제 #1
0
    def __init__(self,
                 hparams,
                 train_size: int,
                 class_weight: Optional[Tensor] = None):
        """Create the VAE model together with its optimizer, scheduler and logs.

        Args:
            hparams: hyper-parameter container. This constructor reads
                ``learning_rate``, ``eps``, ``weight_decay``, ``scheduler``
                (expanded as keyword arguments), ``device``, ``out_device``,
                ``logdir`` and ``print_params`` from it.
            train_size: accepted but not used inside this constructor.
            class_weight: accepted but not used inside this constructor.
        """
        # network to train
        self.model = VAE()

        # Adam optimizer driven by a plateau-based learning-rate scheduler
        trainable = self.model.parameters()
        self.optimizer = torch.optim.Adam(
            trainable,
            lr=hparams.learning_rate,
            eps=hparams.eps,
            weight_decay=hparams.weight_decay,
        )
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            **hparams.scheduler,
        )
        # element-wise (unreduced) binary cross-entropy on logits
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

        # device setup; the returned value is forwarded to summary() below
        summary_device = self.__init_device(hparams.device,
                                            hparams.out_device)

        # tensorboard writer; the files below are only created once per logdir
        self.writer = SummaryWriter(logdir=hparams.logdir)

        # TODO: the (40, 11) input size handed to summary() is a placeholder
        summary_file = Path(self.writer.logdir, 'summary.txt')
        if not summary_file.exists():
            print_to_file(summary_file,
                          summary,
                          (self.model, (40, 11)),
                          {'device': summary_device})

        # hyper-parameter dump
        hparams_file = Path(self.writer.logdir, 'hparams.txt')
        if not hparams_file.exists():
            print_to_file(hparams_file, hparams.print_params)
예제 #2
0
    def __init__(self, hp):
        """Set up the WaveUNet model, its optimizer and the log directory.

        Args:
            hp: hyper-parameter container. This constructor reads ``model``
                (for its ``'ch_double'`` entry), ``learning_rate``,
                ``weight_decay``, ``device``, ``out_device``, ``logdir``,
                ``dummy_input`` and ``print_params`` from it.
        """
        # TODO: model, criterion
        self.loss = nn.L1Loss(reduction='none')
        self.model = waveunet_residual.WaveUNet(
            ch_double=hp.model['ch_double'],
        )

        # optimizer (no scheduler is created here)
        self.optimizer = AdamW(self.model.parameters(),
                               lr=hp.learning_rate,
                               weight_decay=hp.weight_decay)

        # device setup; the returned value is forwarded to summary() below
        summary_device = self._init_device(hp.device, hp.out_device)

        # tensorboard writer; the files below are only created once per logdir
        self.writer = SummaryWriter(log_dir=hp.logdir)

        summary_file = Path(self.writer.log_dir, 'summary.txt')
        if not summary_file.exists():
            print_to_file(summary_file,
                          summary,
                          (self.model, hp.dummy_input),
                          {'device': summary_device})

        hp_file = Path(self.writer.log_dir, 'hp.txt')
        if not hp_file.exists():
            print_to_file(hp_file, hp.print_params)

        # names of the evaluation metrics
        self.metric = ['SegSNR', 'fwSegSNR', 'PESQ', 'STOI']
예제 #3
0
파일: enewstree.py 프로젝트: qlmgg/rap
def post_enewstree_forum(post_url, src):
    """ Publish a new thread on an enewstree (Discuz) forum board.

    @param post_url:   board URL, e.g.
                       http://enewstree.com/discuz/forum.php?mod=forumdisplay&fid=47
    @type post_url:    str

    @param src:        user name, password, subject, thread body, etc.
                       ('subject' and 'content' are read here)
    @type src:         dict

    @return:           (thread URL, log text); the URL is '' on any failure
    @rtype:            str,str

    """
    logger = utils.RAPLogger(post_url)
    sess = utils.RAPSession(src)
    # Step 1: log in
    if not login_enewstree(post_url, sess, src):
        logger.error(' Login Error')
        return ('', str(logger))
    logger.info(' Login OK')

    # Step 2: scrape the board id and the form token. Both come from regex
    # matches; a missing match is reported as a post error instead of
    # escaping as an IndexError.
    fid_match = re.search(r'fid=(\d*)', post_url)
    if fid_match is None:
        logger.error(' Post Error')
        return ('', str(logger))
    fid = fid_match.group(1)

    new_thread_url = ('http://enewstree.com/discuz/forum.php'
                      '?mod=post&action=newthread&fid=' + fid)
    resp = sess.get(new_thread_url)
    hash_match = re.search(r'formhash=(.*?)"', resp.content)
    if hash_match is None:
        logger.error(' Post Error')
        return ('', str(logger))
    formhash = hash_match.group(1)
    logger.info('formhash:' + formhash)

    # NOTE(review): the sec*/seccode* values are hard-coded; presumably they
    # were captured from one browser session -- confirm the forum accepts them.
    payload = {
        'formhash': formhash,
        'posttime': int(time.time()),
        'wysiwyg': '1',
        'subject': src['subject'].decode('utf8').encode(CHARSET, 'ignore'),
        'message': src['content'].decode('utf8').encode(CHARSET, 'ignore'),
        'allownoticeauthor': '1',
        'usesig': '1',
        'secqaahash': 'qSxqZkkW',
        'secanswer': '1776',
        'seccodehash': 'cSxqZkkW',
        'seccodemodid': 'forum::post',
        'seccodeverify': 'ec48',
        'save': '',
        'uploadalbum': '-2',
        'newalbum': '',
    }

    # Step 3: submit the new-thread form
    resp = sess.post(new_thread_url + '&extra=&topicsubmit=yes',
                     data=payload,
                     headers={
                         'Referer': post_url
                     })
    utils.print_to_file(resp.content)
    # The post counts as successful when the subject shows up in the response
    if src['subject'].decode('utf8') not in resp.content.decode(CHARSET):
        logger.error(' Post Error')
        return ('', str(logger))
    logger.info(' Post OK')
    url = resp.url
    # parenthesised form prints the same text under Python 2 and Python 3
    print(url)
    return (url, str(logger))
예제 #4
0
def main(test_epoch: int):
    """Optionally train, then evaluate on the test set and store the result.

    Args:
        test_epoch: ``-1`` runs the full training loop first and then tests
            with the index of the last trained epoch; any other value skips
            training and is forwarded to ``runner.test`` unchanged.
    """
    train_loader, valid_loader, test_loader = data_manager.get_dataloader(hp)
    runner = Runner(hp)

    if test_epoch == -1:
        # TODO: add all the evaluation metrics
        # tensorboard layout: one multi-line chart for the losses and one
        # "_out" vs "_x" chart per evaluation metric
        eval_charts = {
            metric: ['Multiline', [f'eval/{metric}_out', f'eval/{metric}_x']]
            for metric in ('PESQ', 'STOI', 'SegSNR', 'fwSegSNR')
        }
        loss_charts = {'loss': ['Multiline', ['loss/train', 'loss/valid']]}
        runner.writer.add_custom_scalars({'train': loss_charts,
                                          'valid': eval_charts})

        last_epoch = 0  # stays 0 when hp.num_epochs is 0
        print(f'Training on {runner.str_device}')
        for last_epoch in range(hp.num_epochs):
            # training pass; the loss is logged only when it is a finite number
            train_loss, _ = runner.run(train_loader, 'train', last_epoch)
            if (train_loss is not None
                    and torch.isfinite(torch.tensor(train_loss))):
                runner.writer.add_scalar('loss/train', train_loss, last_epoch)

            # one checkpoint per epoch, named after the epoch index
            checkpoint = Path(runner.writer.log_dir, f'{last_epoch}.pt')
            torch.save(runner.model.module.state_dict(), checkpoint)

            # validation pass, logged under the same condition
            valid_loss, _ = runner.run(valid_loader, 'valid', last_epoch)
            if (valid_loss is not None
                    and torch.isfinite(torch.tensor(valid_loss))):
                runner.writer.add_scalar('loss/valid', valid_loss, last_epoch)

        print('Training Finished')
        test_epoch = last_epoch

    # test
    test_eval_outs, test_eval_xs = runner.test(test_loader, test_epoch)

    # TODO: write test result
    # pair every metric name with its output / input score
    scored = list(zip(['SegSNR', 'fwSegSNR', 'PESQ', 'STOI'],
                      test_eval_outs,
                      test_eval_xs))
    print_eval_outs = {metric: out for metric, out, _ in scored}
    print_eval_xs = {metric: x for metric, _, x in scored}

    print(f'Test - Input Eval: {print_eval_xs}')
    print(f'Test - Out Eval: {print_eval_outs}')

    # the result file is written once and never overwritten
    path_eval = Path(hp.logdir, f'test_{test_epoch}', 'test_eval.txt')
    if not path_eval.exists():
        print_to_file(path_eval, print_eval, (print_eval_xs, print_eval_outs))

    runner.writer.close()
예제 #5
0
def e_step():
    """E-step: rebuild the module-level ``h_ik`` list in place.

    For every context ``i`` in ``P_cs`` a dict is appended that maps each
    sense to ``P_cs[i][sense] / P_c[i]``. The finished table is then handed
    to ``utils.print_to_file`` under the name ``'h_ik.json'``.
    """
    h_ik.clear()
    print("E-step...")
    for idx, sense_scores in enumerate(P_cs):
        h_ik.append({
            sense: score / P_c[idx]
            for sense, score in sense_scores.items()
        })
    utils.print_to_file('h_ik.json', h_ik)
예제 #6
0
def select_k_best(X, y, k):
    """Select the ``k`` features with the best chi-squared score.

    Args:
        X: feature table; ``X.columns`` provides the feature names
            (presumably a pandas DataFrame -- confirm against callers).
        y: target values passed to ``SelectKBest.fit``.
        k: number of features to keep.

    Returns:
        List of the selected feature names, in column order. When ``X`` has
        at most ``k`` columns there is nothing to select: all column names
        are returned and nothing is written to ``select_k_best.txt``.
    """
    features = list(X.columns)
    if len(features) <= k:
        # Return plain names, like the regular path below. This branch used
        # to return (name, 0) tuples, which broke callers that treat the
        # result as a list of column names (e.g. joining them with ',').
        return features
    selector = SelectKBest(chi2, k=k)
    selector.fit(X, y)
    # get_support() yields one boolean per column: True means "kept"
    new_features = [
        name for name, kept in zip(features, selector.get_support()) if kept
    ]
    print_to_file('List of {} features selected by select k best'.format(k),
                  'select_k_best.txt')
    for feat in new_features:
        print_to_file(feat, 'select_k_best.txt')
    return new_features
    def __init__(self,
                 hparams,
                 train_size: int,
                 class_weight: Optional[Tensor] = None):
        """Create the UNet model with its optimizer, scheduler and logs.

        Args:
            hparams: hyper-parameter container. This constructor reads
                ``model`` (expanded into the UNet keyword arguments and
                indexed as ``model['stride'][1]``), ``hop_size``,
                ``sample_rate``, ``learning_rate``, ``weight_decay``,
                ``batch_size``, ``scheduler`` (expanded as keyword
                arguments), ``device``, ``out_device`` and ``logdir``.
            train_size: passed to the scheduler as ``epoch_size``.
            class_weight: stored unchanged in ``self.class_weight``.
        """
        # model, criterion, and prediction
        self.model = UNet(ch_in=2, ch_out=1, **hparams.model)
        self.sigmoid = torch.nn.Sigmoid()
        self.criterion = torch.nn.BCEWithLogitsLoss(reduction='none')
        self.class_weight = class_weight

        # for prediction: hop size over sample rate (presumably seconds per
        # frame -- confirm), and the frame counts derived from 6 and 12
        self.frame2time = hparams.hop_size / hparams.sample_rate
        self.T_6s, self.T_12s = (
            round(duration / self.frame2time) - 1 for duration in (6, 12)
        )
        self.metrics = ('precision', 'recall', 'F1')

        # optimizer and scheduler; the scheduler is stepped once right away
        self.optimizer = AdamW(self.model.parameters(),
                               lr=hparams.learning_rate,
                               weight_decay=hparams.weight_decay)
        self.scheduler = CosineLRWithRestarts(
            self.optimizer,
            batch_size=hparams.batch_size,
            epoch_size=train_size,
            **hparams.scheduler,
        )
        self.scheduler.step()

        self.f1_last_restart = -1

        # device setup; the returned value is forwarded to summary() below
        summary_device = self._init_device(hparams.device,
                                           hparams.out_device)

        # tensorboard writer; the files below are only created once per logdir
        self.writer = SummaryWriter(logdir=hparams.logdir)

        summary_file = Path(self.writer.logdir, 'summary.txt')
        if not summary_file.exists():
            dummy_shape = (2, 128, 16 * hparams.model['stride'][1]**4)
            print_to_file(summary_file,
                          summary,
                          (self.model, dummy_shape),
                          {'device': summary_device})

        # one "name: value" line per attribute stored on hparams
        hparams_file = Path(self.writer.logdir, 'hparams.txt')
        if not hparams_file.exists():
            with hparams_file.open('w') as f:
                for name in vars(hparams):
                    f.write(f'{name}: {getattr(hparams, name)}\n')
    def __init__(self, path_state_dict=''):
        """Build the model named by ``hp.model_name`` plus its training state.

        Args:
            path_state_dict: optional checkpoint path. When non-empty, a
                ``(model_state, optimizer_state)`` pair is loaded from it
                with ``torch.load`` and applied to the model and optimizer.

        Raises:
            Exception: if the loaded states cannot be applied to the model
                or the optimizer. The underlying error is chained as
                ``__cause__``.
        """
        self.model_name = hp.model_name
        # NOTE(review): eval() resolves the class from a configuration string;
        # this is only safe while hp.model_name comes from a trusted source.
        module = eval(hp.model_name)

        # the constructor arguments live in the hp attribute of the same name
        self.model = module(**getattr(hp, hp.model_name))
        self.criterion = nn.MSELoss(reduction='none')
        self.optimizer = AdamW(self.model.parameters(),
                               lr=hp.learning_rate,
                               weight_decay=hp.weight_decay,
                               )

        # presumably sets self.in_device / self.str_device, which are read
        # below -- confirm in __init_device
        self.__init_device(hp.device, hp.out_device)

        self.scheduler: Optional[CosineLRWithRestarts] = None
        self.max_epochs = hp.n_epochs
        self.loss_last_restart = float('inf')

        self.writer: Optional[CustomWriter] = None

        self.valid_eval_sample: Dict[str, Any] = dict()

        # Load State Dict
        if path_state_dict:
            st_model, st_optim = torch.load(path_state_dict, map_location=self.in_device)
            try:
                # a wrapped model keeps the real network under ``.module``
                if hasattr(self.model, 'module'):
                    self.model.module.load_state_dict(st_model)
                else:
                    self.model.load_state_dict(st_model)
                self.optimizer.load_state_dict(st_optim)
            except Exception as err:
                # ``except Exception`` (not a bare except) lets
                # KeyboardInterrupt/SystemExit through; ``from err`` keeps
                # the original failure visible in the traceback.
                raise Exception(
                    'The model is different from the state dict.') from err

        # Both files are written only when summary.txt does not exist yet;
        # hparams.txt has no existence check of its own.
        path_summary = hp.logdir / 'summary.txt'
        if not path_summary.exists():
            print_to_file(
                path_summary,
                summary,
                (self.model, hp.dummy_input_size),
                dict(device=self.str_device[:4])
            )
            with (hp.logdir / 'hparams.txt').open('w') as f:
                f.write(repr(hp))
예제 #9
0
def compute_likelihood():
    """Recompute the context likelihoods and return the corpus log-likelihood.

    Works on module-level state: ``P_cs`` and ``P_c`` are cleared and
    refilled in place (one entry per sentence of ``corpus``), reading
    ``possible_senses``, ``P_fs`` and ``P_s``. Both tables are handed to
    ``utils.print_to_file`` once they are complete.

    Returns:
        The sum of ``log(P_c[i])`` over all contexts. Summing the logarithms
        equals the logarithm of the product but does not underflow to
        ``log(0)`` when many small probabilities are multiplied.
    """
    # P(c_i|s_k) = PROD_{f_j in c_i} P(f_j|s_k)
    P_cs.clear()
    P_c.clear()
    for text in corpus:
        for sentence in text:
            sense_scores = {}
            # only sense-tagged words take part, as targets and as features
            tagged = [w for w in sentence if w['sense'] != '']
            for word in tagged:
                try:
                    for possible_sense in possible_senses[word['lemma']]:
                        product = 1
                        for other in tagged:
                            lemma = other['lemma']
                            if lemma == word['lemma']:
                                continue
                            # a feature unknown for this sense zeroes the
                            # whole product
                            if lemma in P_fs[possible_sense]:
                                product *= P_fs[possible_sense][lemma]
                            else:
                                product = 0
                        sense_scores[possible_sense] = product
                except KeyError:
                    # the remaining senses of this word are skipped
                    print('KeyError: ' + word['lemma'])
            P_cs.append(sense_scores)
    utils.print_to_file('P_cs.json', P_cs)

    # P(c_i) = SUM_k P(c_i|s_k) P(s_k)
    for context in P_cs:
        P_c.append(sum(score * P_s[sense]
                       for sense, score in context.items()))
    utils.print_to_file('P_c.json', P_c)

    # l(C) -> log-likelihood of the corpus, accumulated in log space
    likelihood = sum((log(p) for p in P_c), 0.0)
    print("Likelihood:", likelihood)
    return likelihood
    def __init__(self, path_state_dict=''):
        """Build the model named by ``hp.model_name`` with losses and optimizer.

        Args:
            path_state_dict: optional checkpoint path. When non-empty, a
                ``(model_state, optimizer_state)`` pair is loaded from it
                with ``torch.load`` and applied to the model and optimizer.

        Raises:
            Exception: if applying the loaded states raises ``RuntimeError``.
        """
        # NOTE(review): eval() on configuration strings (here and for the
        # criterion names below) is only safe while hp is trusted.
        module = eval(hp.model_name)
        self.model = module(**getattr(hp, hp.model_name))

        # a single criterion name is treated like a one-element tuple
        wanted = hp.criterion_names
        criterion_names = (wanted, ) if isinstance(wanted, str) else wanted
        self.criterions = [
            eval(f'nn.{name}')(reduction='sum') for name in criterion_names
        ]

        # presumably sets self.in_device / self.str_device, which are read
        # below -- confirm in __init_device
        self.__init_device(hp.device, hp.out_device)

        self.writer: CustomWriter = None

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=hp.learning_rate,
                                    weight_decay=hp.weight_decay)

        # restore model and optimizer from the checkpoint, if one was given
        if path_state_dict:
            st_model, st_optim = torch.load(path_state_dict, self.in_device)
            # a DataParallel wrapper keeps the real network under ``.module``
            if isinstance(self.model, nn.DataParallel):
                target = self.model.module
            else:
                target = self.model
            try:
                target.load_state_dict(st_model)
                self.optimizer.load_state_dict(st_optim)
            except RuntimeError:
                raise Exception('The model is different from the state dict.')

        # Both files are written only when summary.txt does not exist yet.
        summary_file = hp.logdir / 'summary.txt'
        if not summary_file.exists():
            print_to_file(summary_file,
                          summary,
                          (self.model, hp.dummy_input_size),
                          {'device': self.str_device[:4]})
            with (hp.logdir / 'hparams.txt').open('w') as f:
                f.write(repr(hp))
예제 #11
0
def mutal_information_filter(data, feature_list, features_map):
    """Drop features that share too much mutual information with others.

    A lower-triangular score matrix is filled for every pair of features
    that have the same type: ``mutual_info_regression`` for continuous
    pairs, ``normalized_mutual_info_score`` otherwise. Pairs of different
    types keep a score of 0. Then, as long as any row still holds a score
    above ``FILTER_THRESHOLD``, the feature whose row has the most such
    scores is removed. Progress is logged to ``mutual.txt``.

    Args:
        data: table indexed by feature name (a pandas DataFrame is assumed,
            since ``data.drop(..., axis=1)`` is used).
        feature_list: list of feature names; pruned in place.
        features_map: mapping from feature name to an object exposing
            ``get_type()``; pruned in place.

    Returns:
        ``(data, feature_list, features_map)`` where ``data`` no longer
        contains the removed columns.
    """
    print_to_file(' Computing Mutual Information Scores', 'mutual.txt')
    scores = np.zeros((len(feature_list), len(feature_list)))
    print_to_file(',{}'.format(','.join(feature_list)), 'mutual.txt')
    for i, name_i in enumerate(feature_list):
        for j in range(i):
            name_j = feature_list[j]
            type_i = features_map[name_i].get_type()
            if type_i != features_map[name_j].get_type():
                continue
            if type_i == FeatureType.CONTINUOUS:
                # expects a 2-D feature table and returns one score per column
                scores[i][j] = mutual_info_regression(data[[name_i]],
                                                      data[name_j])[0]
            else:
                scores[i][j] = normalized_mutual_info_score(
                    data[name_i], data[name_j])
        print_to_file(
            '{},{}'.format(name_i, ','.join(str(x) for x in scores[i])),
            'mutual.txt')

    def count_over_threshold(matrix):
        """Number of scores above FILTER_THRESHOLD in every row."""
        return [sum(1 for s in row if s > FILTER_THRESHOLD) for row in matrix]

    over_threshold = count_over_threshold(scores)
    while any(count > 0 for count in over_threshold):
        max_index = over_threshold.index(max(over_threshold))
        dropped = feature_list[max_index]
        print_to_file(
            f' Deleting feature number {max_index} which is {dropped}',
            'mutual.txt')
        # remove the feature's row and column from the score matrix
        scores = np.delete(scores, max_index, 0)
        scores = np.delete(scores, max_index, 1)
        # drop() returns a new frame; keep it so the returned data matches
        # the pruned feature list
        data = data.drop(dropped, axis=1)
        features_map.pop(dropped)
        feature_list.pop(max_index)
        over_threshold = count_over_threshold(scores)
    print_to_file(
        f' Feature list after filter {feature_list} for total of {len(feature_list)} features',
        'mutual.txt')
    return data, feature_list, features_map
예제 #12
0
def sfs(X, y, features, feature_map, clf, outfile, eps=1e-8):
    """Greedy sequential forward selection driven by cross-validation.

    In every round each remaining candidate is added to the columns chosen
    so far and scored with 5-fold ``cross_val_score``; the best candidate is
    kept. The search stops once the best score of a round improves on the
    previous round by less than ``eps``.

    Args:
        X: table indexable by a list of column names.
        y: target values for ``cross_val_score``.
        features: candidate feature names (copied, not modified).
        feature_map: maps a feature name to its descriptor; a
            ``CategoricalFeature`` contributes its ``sub_features`` as
            columns, any other feature contributes its own name.
        clf: estimator passed to ``cross_val_score``.
        outfile: log target forwarded to ``print_to_file``.
        eps: minimum score improvement required to continue.

    Returns:
        The selected feature names in the order they were picked.
    """

    def columns_of(name):
        """Columns of X that represent feature ``name``."""
        descriptor = feature_map[name]
        if isinstance(descriptor, CategoricalFeature):
            return list(descriptor.sub_features)
        return [name]

    selected_features = []
    selected_columns = []
    selected_scores = []
    previous_best = 0
    gain = 1
    round_no = 0
    print_to_file('Sequential forward selection search:', outfile)
    pool = features.copy()
    while gain > eps:
        round_no += 1
        winner, winner_score = None, 0
        # forward search: try every remaining candidate on top of the
        # columns selected so far
        for candidate in pool:
            trial_columns = selected_columns + columns_of(candidate)
            score = cross_val_score(estimator=clf,
                                    X=X[trial_columns],
                                    y=y,
                                    cv=5).mean()
            if score > winner_score:
                winner, winner_score = candidate, score
        gain = winner_score - previous_best
        if gain < eps:
            break
        previous_best = winner_score
        selected_features.append(winner)
        selected_scores.append(previous_best)
        pool.remove(winner)
        selected_columns.extend(columns_of(winner))
        # progress indicator on stdout
        print(f'{round_no},', end='')

    # report the picks ordered by score, best first
    ranking = sorted(zip(selected_features, selected_scores),
                     key=lambda pair: pair[1],
                     reverse=True)
    print_to_file('Features and scores for Sequential forward search:',
                  outfile)
    print_to_file('Feature,Score', outfile)
    for name, score in ranking:
        print_to_file('{},{}'.format(name, score), outfile)

    return selected_features
예제 #13
0
def m_step():
    """M-step: re-estimate ``P_fs`` and ``P_s`` from the posteriors ``h_ik``.

    Works on module-level state (``corpus``, ``h_ik``, ``P_fs``, ``P_s``)
    and hands both updated tables to ``utils.print_to_file``.

    * ``P_fs[sense][feature]`` becomes the posterior mass of ``sense`` over
      all contexts containing ``feature``, divided by ``Z[feature]``, the
      same mass summed over every sense. Previously ``Z[feature]`` was reset
      for each sense, so the ratio was always 1 (or 0/0).
      NOTE(review): confirm the per-feature normalisation is the intended
      EM update.
    * ``P_s[sense]`` becomes the sense's share of the total posterior mass.

    Raises:
        ZeroDivisionError: if the total posterior mass in ``h_ik`` is zero.
    """
    print("M-step...")

    # Lemma set of every context, flattened across texts in corpus order so
    # that position i lines up with h_ik[i] (h_ik holds one entry per
    # sentence over all texts, not per sentence of a single text).
    context_lemmas = [{w['lemma'] for w in sentence}
                      for text in corpus
                      for sentence in text]

    # expected[sense][feature] = SUM_{i: feature in c_i} h_ik
    expected = {}
    Z = {}
    for sense in P_fs:
        expected[sense] = {}
        for feature in P_fs[sense]:
            mass = 0
            for lemmas, posterior in zip(context_lemmas, h_ik):
                if feature in lemmas:
                    # a context carries no mass for senses it does not list
                    mass += posterior.get(sense, 0)
            expected[sense][feature] = mass
            Z[feature] = Z.get(feature, 0) + mass

    for sense, masses in expected.items():
        for feature, mass in masses.items():
            # a feature without any posterior mass gets probability 0
            P_fs[sense][feature] = mass / Z[feature] if Z[feature] else 0
    utils.print_to_file('P_fs.json', P_fs)

    # P(s_k) = SUM_i h_ik / SUM_k SUM_i h_ik
    sense_mass = {
        sense: sum(posterior.get(sense, 0) for posterior in h_ik)
        for sense in P_fs
    }
    total = sum(sense_mass.values())
    for sense, mass in sense_mass.items():
        P_s[sense] = mass / total
    utils.print_to_file('P_s.json', P_s)
예제 #14
0
def explore_repository(repo_name, tree_str='', files_dict=None, depth=0):
    """
    Main routine of the application.
    Recursively walks the repository.

    Args:
        repo_name: repository (or sub-folder) reference handed to
            ``pull_folder_content``.
        tree_str: tree text accumulated so far.
        files_dict: per-extension statistics accumulated so far; a new dict
            is created when it is None.
        depth: recursion depth, 0 for the repository root.

    Returns:
        ``(tree_str, files_dict)``, or None when ``pull_folder_content``
        returns nothing for ``repo_name``.
    """
    repository_content = pull_folder_content(repo_name)
    if not repository_content:
        return

    if depth == 0:
        print('[+] Scraping no repositório ' + repo_name + ' iniciado...')
    if files_dict is None:
        files_dict = {}
    folders, files = extract_hrefs(repository_content)
    for f in folders:
        tree_str += generate_str_with_spaces(depth,
                                             get_folder_or_file_name(f),
                                             is_folder=True)
        subtree = explore_repository(f,
                                     tree_str=tree_str,
                                     files_dict=files_dict,
                                     depth=depth + 1)
        # A sub-folder without content yields None; keep what has been
        # collected so far instead of failing on the tuple unpacking.
        if subtree is not None:
            tree_str, files_dict = subtree
    for f in files:
        filename, lines, bytes_, extension = explore_file(f)
        files_dict = include_extension_in_files_dict(files_dict,
                                                     lines=lines,
                                                     bytes_=bytes_,
                                                     extension=extension)
        tree_str += generate_str_with_spaces(depth,
                                             filename,
                                             is_folder=False,
                                             loc=lines)
    if depth == 0:
        # only the root call writes the report
        print_to_file(repo_name, tree_str, files_dict)
        print('[+] Scraping no repositório ' + repo_name + ' finalizado!')
    return tree_str, files_dict
예제 #15
0
	def classify_reviews(self, folderpath):
		negc = 0
		posc = 0
		undc = 0
		total = 0
		filenames = glob.glob(folderpath)
		totalnumfiles = len(filenames)

		print("Testing data set")
		print("Number of files:{0}".format(totalnumfiles))

		if totalnumfiles == 0:
			print("No files to classify, exiting...")
			return

		for f in filenames:
			with open(f, 'r') as infile:
				clsfy = self.classify(infile)
				total += clsfy
				if clsfy > 0:
					posc += 1
				elif clsfy < 0:
					negc += 1
				elif clsfy == 0:
					undc += 1

		results = """
Results...
Average classification: {0}
Percentage of positve classifications: %{1}
Percentage of negative classifications: %{2}
Percentage of undertermined classifications: %{3}
		""".format(total/totalnumfiles,utils.percentage(posc, len(filenames)),utils.percentage(negc, len(filenames)),utils.percentage(undc, len(filenames)))
		print results

		utils.print_to_file("testResults.txt", "\nFolderpath: {0}\nNumber of Files: {1}\n{2}".format(folderpath,totalnumfiles,results))
예제 #16
0
def iterative_k_best(data, clf, eps=1e-8):
    """Grow the k-best feature set until the CV score stops improving.

    For k = 1, 2, ... the k best features are selected with
    ``select_k_best`` and the estimator is scored with 5-fold
    cross-validation on exactly those columns. The search ends when the
    score improves on the previous k by at most ``eps`` or when every
    column has been included. All evaluated sets are logged to ``itk.csv``.

    Previously the score was computed on all columns and the reference
    score was never updated, so the score did not depend on k and the loop
    could not terminate for any score above ``eps``.

    Args:
        data: input handed to ``data_get_label``, which yields ``(X, y)``.
        clf: estimator passed to ``cross_val_score``.
        eps: minimum score improvement required to try a larger k.
    """
    X, y = data_get_label(data)
    last_score = 0
    res_for_k = []
    for k in range(1, len(X.columns) + 1):
        # select_k_best may hand back (name, score) tuples when k covers
        # every column; only the names are needed here
        features = [
            item[0] if isinstance(item, tuple) else item
            for item in select_k_best(X, y, k)
        ]
        new_score = cross_val_score(estimator=clf,
                                    X=X[features],
                                    y=y,
                                    cv=5).mean()
        res_for_k.append((new_score, features))
        if new_score - last_score <= eps:
            break
        last_score = new_score
    print_to_file('Feature sets and scores for iterative best k features:',
                  'itk.csv')
    print_to_file('K,Score,Features', 'itk.csv')
    for k, (score, features) in enumerate(res_for_k, start=1):
        print_to_file(
            '{},{},{}'.format(k, score,
                              ','.join(str(name) for name in features)),
            'itk.csv')
예제 #17
0
def new_mutual_info_filter(data, feature_list, features_map):
    """Score pairwise mutual information and drop strongly related features.

    Every feature is used once as the target: ``mutual_info_regression``
    for continuous targets, ``mutual_info_classif`` otherwise, against all
    remaining features. For each pair with a score above 3 in either
    direction, the feature at the higher index is removed from
    ``feature_list``. Progress is logged to ``mutual.txt``.

    Args:
        data: table indexable by feature name and by a list of names.
        feature_list: list of feature names; pruned in place.
        features_map: mapping from feature name to an object exposing
            ``get_type()``.

    Returns:
        ``(scores, feature_list)``: the full score matrix, indexed by the
        positions in the original list, and the pruned list.
    """
    threshold = 3
    features_num = len(feature_list)
    scores = np.zeros((features_num, features_num))
    print_to_file(' Computing Mutual Information Scores', 'mutual.txt')
    print_to_file(',{}'.format(','.join(feature_list)), 'mutual.txt')
    for i, target in enumerate(feature_list):
        # every feature except the one at position i
        others = feature_list[:i] + feature_list[i + 1:]
        X = data[others]
        discrete_features = [
            features_map[feature].get_type() != FeatureType.CONTINUOUS
            for feature in others
        ]
        if features_map[target].get_type() == FeatureType.CONTINUOUS:
            f_scores = mutual_info_regression(
                X, data[target], discrete_features=discrete_features)
        else:
            f_scores = mutual_info_classif(X,
                                           data[target].astype(int),
                                           discrete_features=discrete_features)
        # f_scores has no entry for the target itself: shift the tail by one
        scores[i, :i] = f_scores[:i]
        scores[i, i + 1:] = f_scores[i:]
        print_to_file(
            '{},{}'.format(target, ','.join(str(x) for x in scores[i])),
            'mutual.txt')
    print_to_file(
        'overall there are {} pairs of strongly corelated features'.format(
            len(scores[scores > threshold]) / 2), 'mutual.txt')

    # unordered index pairs over the threshold, each reported once
    rows, cols = np.nonzero(scores > threshold)
    pairs = sorted({(int(min(r, c)), int(max(r, c)))
                    for r, c in zip(rows, cols)})

    # The pairs index the ORIGINAL list. Resolve names from a snapshot so
    # that removals do not shift the positions of later pairs.
    names = list(feature_list)
    for first, second in pairs:
        print_to_file(
            'Features {} and {} with score {}'.format(names[first],
                                                      names[second],
                                                      scores[first][second]),
            'mutual.txt')
        if names[second] in feature_list:
            print_to_file('Dropping feature {}'.format(names[second]),
                          'mutual.txt')
            feature_list.remove(names[second])
    return scores, feature_list
예제 #18
0
answers = 0
correct_answers = 0

# Load the artifacts named below through utils.read_from_file.
allwords = utils.read_from_file('allwords.json')
allwords_with_sensekey = utils.read_from_file('allwords_with_sensekey.json')
possible_senses = utils.read_from_file('possibile_senses.json')
corpus = utils.read_from_file('corpus.json')

print("Counting occurrencies...")

# count(w_j): occurrences of every word, ordered by word, descending
count_w = sorted(
    reduce.reduceByKey(lambda x, y: x + y, [(word, 1) for word in allwords]),
    key=lambda pair: pair[0],
    reverse=True)
utils.print_to_file('count_w.json', count_w)

# count(s_i, w_j): occurrences of every (word, sense) entry that carries a
# sense, ordered by count, descending
tagged_pairs = [(entry, 1)
                for entry in utils.matrix_to_array(allwords_with_sensekey)
                if entry[1] != '']
count_ws = sorted(
    reduce.reduceByKey(lambda x, y: x + y, tagged_pairs),
    key=lambda pair: pair[1],
    reverse=True)
utils.print_to_file('count_sw.json', count_ws)

# count(s_i): occurrences of every sense, ordered by sense, descending
sense_pairs = [(entry[1], 1)
               for entry in utils.matrix_to_array(allwords_with_sensekey)
               if entry[1] != '']
count_s = sorted(
    reduce.reduceByKey(lambda x, y: x + y, sense_pairs),
    key=lambda pair: pair[0],
    reverse=True)
utils.print_to_file('count_s.json', count_s)
                             suffix='Complete',
                             length=50)

    allwords_with_sensekey_formatted = list(
        map(
            lambda w: {
                "word": w[0],
                "sense": w[1]
            },
            filter(lambda w: w[1] != '',
                   utils.matrix_to_array(allwords_with_sensekey))))
    corpus.append([[{
        "lemma": y[0],
        "sense": y[1]
    } for y in x] for x in allwords_with_sensekey])

    for x in allwords_with_sensekey_formatted:
        try:
            possible_senses[x['word']].append(x['sense'])
        except KeyError:
            possible_senses[x['word']] = [x['sense']]

    possible_senses_copy = possible_senses.copy()
    for key in possible_senses_copy.keys():
        possible_senses[key] = list(set(possible_senses[key]))

# Hand the collected artifacts to utils.print_to_file under their .json names.
# NOTE: 'possibile_senses.json' is misspelled, but another snippet in this
# file reads the same name back, so the spelling has to stay as it is.
utils.print_to_file('allwords.json', allwords)
utils.print_to_file('allwords_with_sensekey.json', allwords_with_sensekey)
utils.print_to_file('possibile_senses.json', possible_senses)
utils.print_to_file('corpus.json', corpus)