예제 #1
0
    returns:
        model: a dictionary mapping words to word-vectors (embeddings).
    """
    if word2vec_format:
        return gensim.models.KeyedVectors.load_word2vec_format(filepath,
                                                               binary=True)
    else:  #own pretrained model
        return gensim.models.Word2Vec.load(filepath)


if __name__ == "__main__":

    # Labels to keep when building the dataset (these look like arXiv
    # category tags -- confirm against the data). The order is preserved
    # as written, in case the one-hot encoding depends on list position.
    inc_categories = [
        'cond-mat.mes-hall',
        'cond-mat.mtrl-sci',
        'cond-mat.stat-mech',
        'cond-mat.str-el',
        'cond-mat.supr-con',
        'cond-mat.soft',
        'quant-ph',
        'cond-mat.dis-nn',
        'cond-mat.quant-gas',
        'hep-th',
    ]

    # Locations of the JSON dumps; only the training file is read here.
    trainpath = 'train_data/train_data.json'
    testpath = 'test_data/test_data.json'
    traindata = dp.loadfile(trainpath)

    # Turn the raw records into (X, y) arrays. A fixed shuffle seed is
    # passed, presumably so that repeated runs see the same ordering.
    train_X, train_y = dp.generate_Xy_data_categories(
        traindata,
        inc_categories,
        ignore_others=True,
        shuffle_seed=0,
        ydatatype='onehot',
        clean_x=True,
        keep_latex_tags=True,
    )
예제 #2
0
        returns: the predicted labels or probabilities of docs
        """
        probabilities = self.model.predict(X_ints)

        if return_probabilities:
            return probabilities
        else:
            return np.round(probabilities)


if __name__ == "__main__":

    ### load data:
    trainpath = 'train_data/train_data.json'
    testpath = 'test_data/test_data.json'
    traindata, testdata = dp.loadfile(trainpath), dp.loadfile(testpath)

    inc_categories = [
        'cond-mat.mes-hall', 'cond-mat.mtrl-sci', 'cond-mat.stat-mech',
        'cond-mat.str-el', 'cond-mat.supr-con', 'cond-mat.soft', 'quant-ph',
        'cond-mat.dis-nn', 'cond-mat.quant-gas', 'hep-th'
    ]

    train_X, train_y = dp.generate_Xy_data_categories(traindata,
                                                      inc_categories,
                                                      ignore_others=True,
                                                      shuffle_seed=0,
                                                      ydatatype='onehot',
                                                      clean_x=True,
                                                      keep_latex_tags=True)
    test_X, test_y = dp.generate_Xy_data_categories(testdata,