Example #1
def assess_model(is_final=False):
    """Assess each trained doc2vec model by checking, for every training
    document, the rank at which it retrieves itself.

    Args:
        is_final (bool, optional): Whether to load the final-model artifacts.
    """
    corpus_paths = [
        helper.get_processed_train_path(DATA_DIR + f, is_final)
        for f in CORPUS_FILES
    ]
    # load the corpus into a list of sentences
    train_corpus = list(iterdocuments(corpus_paths))

    for type_model in ['dm', 'dbow']:
        # Assessing Model
        model_path = helper.get_doc2vec_model_path(type_model, is_final)
        model = gensim.models.doc2vec.Doc2Vec.load(model_path)
        ranks = []
        second_ranks = []
        for doc_id in range(len(train_corpus)):
            inferred_vector = model.infer_vector(train_corpus[doc_id].words)
            sims = model.docvecs.most_similar([inferred_vector],
                                              topn=len(model.docvecs))
            rank = [docid for docid, sim in sims].index(doc_id)
            ranks.append(rank)

            second_ranks.append(sims[1])
            sys.stdout.write('\r')
            sys.stdout.write(str(doc_id))
            sys.stdout.flush()

        sys.stdout.write('\r')
        logger.info(collections.Counter(ranks))
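The self-similarity check above is easier to follow on a toy corpus. Below is a minimal, self-contained sketch of the same idea, assuming gensim 3.x (where document vectors live on model.docvecs; in gensim 4.x the attribute is model.dv). A well-fitted model should rank most documents as most similar to their own inferred vector.

import collections

import gensim

toy_texts = [
    'the quick brown fox jumps over the lazy dog'.split(),
    'i love machine learning and natural language processing'.split(),
    'word embeddings map tokens to dense vectors'.split(),
]
train_corpus = [
    gensim.models.doc2vec.TaggedDocument(words, [doc_id])
    for doc_id, words in enumerate(toy_texts)
]

model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=1, epochs=40)
model.build_vocab(train_corpus)
model.train(train_corpus,
            total_examples=model.corpus_count,
            epochs=model.epochs)

# for every training document, check at which rank it retrieves itself
ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector],
                                      topn=len(model.docvecs))
    ranks.append([docid for docid, sim in sims].index(doc_id))

print(collections.Counter(ranks))  # ideally most of the mass sits at rank 0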
Example #2
def train_tfidf(min_count=1, chunksize=5000, ngrams=(1, 1), is_final=False):
    """Summary

    Args:
        min_count (int, optional): Description
        chunksize (int, optional): Description
        ngrams (tuple, optional): Description
    """
    train_files = [
        helper.get_processed_train_path(DATA_DIR + f, is_final)
        for f in CORPUS_FILES
    ]
    dictionary_path = helper.get_dictionary_path(min_count, is_final)
    tfidf_model_path = helper.get_tfidf_model_path(min_count, is_final)
    chunked_corpus = ChunkedCorpus(train_files,
                                   min_count=min_count,
                                   chunksize=chunksize,
                                   ngrams=ngrams)
    chunked_corpus.dictionary.save(dictionary_path)

    # build tf-idf model
    logger.info('build tf-idf model')
    tfidf = TfidfGensimVectorizer(dictionary_file=dictionary_path,
                                  ngrams=ngrams)
    tfidf.fit()
    tfidf.save(tfidf_model_path)
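ChunkedCorpus and TfidfGensimVectorizer are project-specific wrappers. A rough standalone equivalent of the same dictionary-then-tf-idf pipeline with plain gensim primitives (the corpus texts and file names here are made up) looks like this:

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

texts = [
    'km nạp thẻ trong hôm_nay'.split(),
    'cảm_ơn thông_tin của bạn'.split(),
]

dictionary = Dictionary(texts)
# no_below mirrors min_count; no_above=1.0 keeps even ubiquitous terms
dictionary.filter_extremes(no_below=1, no_above=1.0)
dictionary.save('dictionary.dict')

bow_corpus = [dictionary.doc2bow(t) for t in texts]
tfidf = TfidfModel(bow_corpus, id2word=dictionary)
tfidf.save('tfidf.model')

print(tfidf[bow_corpus[0]])  # sparse (term_id, weight) pairs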
Example #3
def train_model(min_count, use_external_data=False, is_final=False):
    """
    Train word embedding using Skipgram model with gensim

    Args:
        min_count (TYPE): Description
        size (None, optional): Description
        use_external_data (bool, optional): Description
        is_final (bool, optional): Description
    """
    logger.info('train word embedding')

    corpus_paths = [helper.get_processed_train_path(DATA_DIR + f, is_final)
                    for f in CORPUS_FILES]
    output_path = helper.get_w2v_model_path(min_count, is_final)
    if use_external_data:
        output_path = helper.get_external_w2v_model_path()
        corpus_paths.extend(
            [helper.get_processed_train_path(DATA_DIR + f, is_final)
             for f in EXTERNAL_CORPUS_FILES])
    w2v_params = SKIPGRAM_PARAMS.copy()
    w2v_params['min_count'] = min_count

    # load the corpora into a list of sentences
    documents = []
    for corpus_path in corpus_paths:
        df = pd.read_csv(corpus_path, header=None, names=['text'])
        logger.info('corpus: %s, size: %s',
                    os.path.basename(corpus_path),
                    df.shape[0])
        documents.extend(df.text.tolist())

    logger.info('total number of documents: %s', len(documents))
    documents = [d.split() for d in documents]

    # train model
    logger.info('training skipgram on corpus: %s', ','.join(corpus_paths))
    skipgram_model = Word2Vec(documents, **w2v_params)
    skipgram_model.wv.save_word2vec_format(output_path)
    logger.info('saved skipgram model as %s', output_path)
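SKIPGRAM_PARAMS is defined in src.constants and is not shown here; the sketch below runs the same training standalone with illustrative parameter values (gensim 3.x names; size became vector_size in gensim 4.x). The parameter dict is a guess, not the project's actual settings.

from gensim.models import Word2Vec

# illustrative guesses, not the project's actual SKIPGRAM_PARAMS
SKIPGRAM_PARAMS = {
    'sg': 1,        # 1 = skipgram, 0 = CBOW
    'size': 300,    # embedding dimensionality
    'window': 5,
    'workers': 4,
}

documents = [
    'km nạp thẻ trong hôm_nay'.split(),
    'cảm_ơn thông_tin của bạn'.split(),
]

w2v_params = SKIPGRAM_PARAMS.copy()
w2v_params['min_count'] = 1

skipgram_model = Word2Vec(documents, **w2v_params)
skipgram_model.wv.save_word2vec_format('skipgram.w2v')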
Example #4
def train(min_count, is_final=False):
    """Summary

    Args:
        min_count (TYPE): Description
    """
    logger.info('start training char2vec using the fastText package')
    unified_corpus = DATA_DIR + 'unified_corpus.preprocessed.train'
    corpus_paths = [
        helper.get_processed_train_path(DATA_DIR + f, is_final)
        for f in CORPUS_FILES
    ]
    frames = []

    tqdm.pandas()
    for corpus_path in corpus_paths:
        df = pd.read_csv(corpus_path, header=None, names=['text'])
        # df.text = df.progress_apply(
        #     lambda x: x['text'].replace('_', ''),
        #     axis=1)
        frames.append(df)
    pd.concat(frames).to_csv(unified_corpus, index=False, header=False)

    w2v_params = FASTTEXT_PARAMS.copy()
    w2v_params['minCount'] = min_count
    logger.info('parameters: %s', w2v_params)

    model = train_unsupervised(input=unified_corpus, **w2v_params)
    model.save_model(helper.get_char2vec_model_path(w2v_params['minCount']))
    logger.info('done training char2vec using the fastText package')

    print(
        'get_sentence_vector:',
        model.get_sentence_vector(
            'tạ_cảnh cảm_ơn thông_tin của bạn ad đã ghi_nhận mong bạn thông_cảm .'
        ))
    print(
        'get_sentence_vector:',
        model.get_sentence_vector(
            'km nạp thẻ trong hôm_nay áp_dụng cho các số trả trước hòa_mạng trong năm number_token . thuê_bao của bạn kích_hoạt từ number_token bạn ạ .'
        ))
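FASTTEXT_PARAMS also lives in src.constants, so the values in this minimal standalone sketch are illustrative guesses. Character n-grams (minn/maxn) are what make this a "char2vec"-style model: sentence vectors are built from subword pieces as well as whole tokens.

from fasttext import train_unsupervised

# illustrative guesses, not the project's actual FASTTEXT_PARAMS
FASTTEXT_PARAMS = {
    'model': 'skipgram',
    'dim': 100,
    'minn': 2,    # shortest character n-gram
    'maxn': 5,    # longest character n-gram
}

params = FASTTEXT_PARAMS.copy()
params['minCount'] = 1

# assumes unified_corpus.txt exists with one preprocessed sentence per line
model = train_unsupervised(input='unified_corpus.txt', **params)
model.save_model('char2vec.bin')
print(model.get_sentence_vector('cảm_ơn thông_tin của bạn'))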
Example #5
import logging

import tensorflow as tf

from src import helper
from src.constants import *
from src.text_cnn import TextCNN

logger = logging.getLogger(__name__)

# Parameters
# ==================================================

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1,
                      "Percentage of the training data to use for validation")
tf.flags.DEFINE_string(
    "positive_data_file",
    helper.get_processed_train_path(DATA_DIR +
                                    'Positive_train.csv.preprocessed'),
    "Data source for the positive data.")
tf.flags.DEFINE_string(
    "negative_data_file",
    helper.get_processed_train_path(DATA_DIR +
                                    'Negative_train.csv.preprocessed'),
    "Data source for the negative data.")
tf.flags.DEFINE_string(
    "neutral_data_file",
    helper.get_processed_train_path(DATA_DIR +
                                    'Neutral_train.csv.preprocessed'),
    "Data source for the neutral data.")

# Model Hyperparameters
tf.flags.DEFINE_integer(
    "embedding_dim", 128,
    "Dimensionality of character embedding (default: 128)")
Example #6
def compute_train_test_matrix(min_count, method, is_final=False):
    """Summary

    Returns:
        TYPE: Description

    Args:
        min_count (TYPE): Description
        method (TYPE): Description
        is_final (bool, optional): Description
    """
    logger.info('prepare train data')

    # load corpus
    logger.info('load corpus and build corpus matrix')
    train_corpus_paths = [
        helper.get_processed_train_path(DATA_DIR + f, is_final)
        for f in CORPUS_FILES
    ]
    # for the final model, there is no test dataset
    test_corpus_paths = []
    if not is_final:
        test_corpus_paths = [
            helper.get_processed_test_path(DATA_DIR + f) for f in CORPUS_FILES
        ]

    # extract labels from file name
    labels = [f.split('_')[0].lower() for f in CORPUS_FILES]
    logger.info('labels: %s', labels)

    X_train = []
    y_train = []
    X_test = []
    y_test = []
    precomputed_path = helper.get_precomputed_matrix(min_count, method,
                                                     is_final)
    if os.path.isfile(precomputed_path):
        logger.info('load train & test precomputed norm matrix from file')
        data = np.load(precomputed_path)
        X_train = data['X_train']
        y_train = data['y_train']
        X_test = data['X_test']
        y_test = data['y_test']
    else:
        logger.info('build train matrix from corpus')
        transformer = Transformer(min_count, method, is_final)

        X_train, y_train = convert_preprocessed_text_to_vector(
            train_corpus_paths, labels, transformer)
        logger.info('X_train.shape: %s, y_train.shape: %s', X_train.shape,
                    y_train.shape)

        if not is_final:
            X_test, y_test = convert_preprocessed_text_to_vector(
                test_corpus_paths, labels, transformer)
            logger.info('X_test.shape: %s, y_test.shape: %s', X_test.shape,
                        y_test.shape)

        # save matrix into file
        np.savez(precomputed_path,
                 X_train=X_train,
                 y_train=y_train,
                 X_test=X_test,
                 y_test=y_test)

    return X_train, y_train, X_test, y_test
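The load-or-compute caching around np.savez/np.load is worth seeing in isolation. A minimal sketch with made-up shapes and a hypothetical cache path:

import os

import numpy as np

precomputed_path = 'matrices.npz'  # hypothetical cache file
if os.path.isfile(precomputed_path):
    # cache hit: reload the arrays saved by a previous run
    data = np.load(precomputed_path)
    X_train, y_train = data['X_train'], data['y_train']
else:
    # cache miss: compute once, then persist for subsequent runs
    X_train = np.random.rand(100, 300)       # stand-in for the real features
    y_train = np.random.randint(0, 3, 100)   # stand-in for the real labels
    np.savez(precomputed_path, X_train=X_train, y_train=y_train)

print(X_train.shape, y_train.shape)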