Example #1
def gen_data_for_clf(wv_url, save_url):
    """ build averaged word-vector features for the train and test sets and dump them to save_url """
    train_df = load_to_df(TRAIN_URL)
    test_df = load_to_df(TEST_URL)
    # average the word vectors of each segmented document into a fixed-length feature vector
    X = infer_avg_wvs(wv_url, train_df['word_seg'].apply(str.split))
    y = train_df['class'].values
    X_test = infer_avg_wvs(wv_url, test_df['word_seg'].apply(str.split))
    joblib.dump((X, y, X_test), save_url)
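A minimal usage sketch, assuming gen_data_for_clf and its helpers are importable from this project; the two paths below are hypothetical placeholders, not paths taken from the source:

import joblib
from sklearn.linear_model import LogisticRegression

# hypothetical paths; point them at a trained word-vector file and a cache location
wv_url = 'embedding_model/models/wv_300.txt'
save_url = 'processed_data/avg_wv_data.pk'

gen_data_for_clf(wv_url, save_url)      # build and cache averaged word-vector features
X, y, X_test = joblib.load(save_url)    # reload later without recomputing
clf = LogisticRegression(max_iter=1000).fit(X, y)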
Example #2
def ft_process(data_url=None):
    """ process data into what ft model need, and save it into './processed_data' dir

    Args:
        data_url: url to original .csv data

    Returns:
        str: url to saved processed data

    """
    # fall back to the default training set when no data_url is given
    src_url = TRAIN_URL if data_url is None else data_url
    save_filename = basename(src_url).replace('.csv', '_ft.csv')
    save_url = from_project_root("embedding_model/processed_data/" + save_filename)

    # return the cached file if this data has already been processed
    if exists(save_url):
        return save_url
    if data_url is not None:
        labels, sentences = load_raw_data(data_url)
    else:
        train_df = load_to_df(TRAIN_URL)
        labels = train_df['class'].values
        sentences = train_df['word_seg'].apply(str.split)  # split into token lists so the join below works

    with open(save_url, "w", encoding='utf-8', newline='\n') as ft_file:
        for i in range(len(labels)):
            label = FT_LABEL_PREFIX + str(labels[i])
            sentence = ' '.join(sentences[i])
            ft_file.write('{} {}\n'.format(label, sentence))
    return save_url
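For orientation, a sketch of calling ft_process and peeking at the output format; FT_LABEL_PREFIX is a project constant (commonly '__label__' for fastText, but that is an assumption here):

ft_url = ft_process()            # defaults to the training set behind TRAIN_URL
with open(ft_url, encoding='utf-8') as f:
    print(f.readline().strip())  # "<FT_LABEL_PREFIX><class id> tok tok tok ..."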
Example #3
def gen_data_for_stacking(args, column='word_seg', n_splits=5, random_state=None):
    """

    Args:
        args:
        column:
        n_splits:
        random_state:

    Returns:

    """

    train_df = load_to_df(TRAIN_URL)
    y = train_df['class'].values
    X = train_df[column].values
    X_test = load_to_df(TEST_URL)[column].values

    skf = StratifiedKFold(n_splits=n_splits, shuffle=bool(random_state), random_state=random_state)
    y_pred = np.zeros((X.shape[0],))  # for printing score of each fold
    y_pred_proba = np.zeros((X.shape[0], N_CLASSES))
    y_test_pred_proba = np.zeros((X_test.shape[0], N_CLASSES))

    with tempfile.NamedTemporaryFile() as t_file:
        for ind, (train_index, cv_index) in enumerate(skf.split(X, y)):  # cv split
            X_train, X_cv = X[train_index], X[cv_index]
            y_train, y_cv = y[train_index], y[cv_index]

            with open(t_file.name, "w", encoding='utf-8', newline='\n') as ft_file:
                for i in range(len(y_train)):
                    label = FT_LABEL_PREFIX + str(y_train[i])
                    ft_file.write('{} {}\n'.format(label, X_train[i]))

            clf = ft.supervised(t_file.name, output="tmp", thread=N_JOBS, label_prefix=FT_LABEL_PREFIX, **args)

            y_pred[cv_index] = [int(label[0]) for label in clf.predict(X_cv)]
            y_pred_proba[cv_index] = [[t[1] for t in sorted(proba, key=lambda x: int(x[0]))]
                                      for proba in clf.predict_proba(X_cv, N_CLASSES)]

            print("%d/%d cv macro f1 :" % (ind + 1, n_splits), f1_score(y_cv, y_pred[cv_index], average='macro'))
            y_test_pred_proba += [[t[1] for t in sorted(proba, key=lambda x: int(x[0]))]
                                  for proba in clf.predict_proba(X_test, N_CLASSES)]

    print("macro f1:", f1_score(y, y_pred, average='macro'))  # calc macro_f1 score
    y_test_pred_proba /= n_splits  # average the fold predictions so each row sums to 1

    return y_pred_proba, y, y_test_pred_proba
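A sketch of how these out-of-fold probabilities could feed a second-level model; the fastText hyper-parameters below are illustrative only and should be replaced with whatever ft.supervised accepts in the installed fastText version:

from sklearn.linear_model import LogisticRegression

# illustrative args forwarded to ft.supervised via **args
ft_args = {'epoch': 10, 'lr': 0.1, 'word_ngrams': 2, 'dim': 100}
X_meta, y, X_test_meta = gen_data_for_stacking(ft_args, n_splits=5, random_state=233)

# the per-class probabilities become features for a level-2 classifier
stacker = LogisticRegression(max_iter=1000).fit(X_meta, y)
test_pred = stacker.predict(X_test_meta)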
Example #4
def train_w2v_model(data_url=None, kwargs=None):
    """ get or train a new d2v_model

    Args:
        data_url: url to data file, None to train use
        kwargs: args for d2v model

    Returns:
        w2v_model

    """
    model_url = args_to_url(kwargs)
    if exists(model_url):
        return Word2Vec.load(model_url)

    if data_url is not None:
        _, sequences = load_raw_data(data_url)

    # use data from all train text and test text
    else:
        train_df = load_to_df(TRAIN_URL)
        test_df = load_to_df(TEST_URL)
        sequences = train_df['word_seg'].append(test_df['word_seg'],
                                                ignore_index=True)
        sequences = sequences.apply(str.split)

    print("Word2Vec model is training...\n trained model will be saved at \n ",
          model_url)
    s_time = time()
    # more info here [https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec]
    model = Word2Vec(sequences, workers=N_JOBS, **kwargs)
    e_time = time()
    print("training finished in %.3f seconds" % (e_time - s_time))
    model.save(model_url)
    # save wv of model
    wv_save_url = model_url.replace('.bin', '.txt').replace('w2v', 'wv')
    model.wv.save_word2vec_format(wv_save_url, binary=False)
    return model
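A usage sketch; the kwargs names follow older gensim releases (newer gensim renamed size to vector_size and iter to epochs), and args_to_url is this project's helper that maps the args to a model path:

# illustrative Word2Vec args for an older gensim release
w2v_args = {'size': 300, 'window': 5, 'min_count': 3, 'sg': 1, 'iter': 10}
model = train_w2v_model(kwargs=w2v_args)   # trains on train + test text when data_url is None

# query the trained vectors; '520477' is a placeholder token id, not from the source
print(model.wv.most_similar('520477', topn=5))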
Example #5
def tfidf_transform(train_url,
                    test_url,
                    column='word_seg',
                    sublinear_tf=True,
                    max_n=MAX_N,
                    min_df=MIN_DF,
                    max_df=MAX_DF,
                    max_features=MAX_FEATURES):
    """ vectorize use TfidfVectorizer

    Args:
        train_url: url to train data
        test_url: url to test data
        column: column to use
        sublinear_tf: use 1 + log(tf) instead of tf
        max_n: upper bound of ngram_range
        min_df: min_df for TfidfVectorizer
        max_df: max_df for TfidfVectorizer
        max_features: max_features for TfidfVectorizer

    Returns:
        X, y, X_test: vectorized data

    """
    # override token_pattern (default r'(?u)\b\w\w+\b') to keep single-char tokens
    vectorizer = TfidfVectorizer(min_df=min_df,
                                 max_df=max_df,
                                 max_features=max_features,
                                 ngram_range=(1, max_n),
                                 sublinear_tf=sublinear_tf,
                                 token_pattern=r'(?u)\w+')
    train_df = load_to_df(train_url)
    X = vectorizer.fit_transform(train_df[column])
    y = np.asarray(train_df['class'])
    X_test = None
    if test_url:
        X_test = vectorizer.transform(load_to_df(test_url)[column])
    return X, y, X_test
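Assuming TRAIN_URL and TEST_URL point at the competition csv files, the tf-idf matrices can be fed straight into a linear model, e.g.:

from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

X, y, X_test = tfidf_transform(TRAIN_URL, TEST_URL)
svc = LinearSVC(C=1.0)
print(cross_val_score(svc, X, y, cv=3, scoring='f1_macro'))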
Example #6
def train_d2v_model(data_url=None, kwargs=None):
    """ get or train a new d2v_model

    Args:
        data_url: url to data file, None to train on all train and test text
        kwargs: args for d2v model

    Returns:
        d2v_model

    """
    model_url = args_to_url(kwargs)
    if exists(model_url):
        return Doc2Vec.load(model_url)

    if data_url is not None:
        _, sequences = load_raw_data(data_url)

    # use data from all train text and test text
    else:
        train_df = load_to_df(TRAIN_URL)
        test_df = load_to_df(TEST_URL)
        sequences = train_df['word_seg'].append(test_df['word_seg'],
                                                ignore_index=True)
        sequences = sequences.apply(str.split)

    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sequences)]
    print("Doc2Vec model is training...\n trained model will be saved at \n ",
          model_url)
    # more info here [https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.Doc2Vec]
    s_time = time()
    model = Doc2Vec(documents, workers=N_JOBS, **kwargs)
    model.save(model_url)
    e_time = time()
    print("training finished in %.3f seconds" % (e_time - s_time))
    return model
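A sketch of using the trained Doc2Vec model to embed an unseen, already-segmented document; the kwargs names depend on the installed gensim version and are illustrative only:

# illustrative Doc2Vec args (gensim 3.x/4.x style names)
d2v_args = {'vector_size': 300, 'window': 5, 'min_count': 3, 'epochs': 10}
d2v_model = train_d2v_model(kwargs=d2v_args)

tokens = "520477 816903 995362".split()      # placeholder segmented document
doc_vec = d2v_model.infer_vector(tokens)     # fixed-length embedding for a new document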
Example #7
def generate_meta_feature(data_url, normalize=True):
    """ generate meta feature

    Args:
        data_url: url to data
        normalize: normalize result into [0, 1]

    Returns:
        generated meta DataFrame

    """
    save_url = data_url.replace('.csv', '_meta_df.pk')
    if exists(save_url):
        return joblib.load(save_url)

    data_df = load_to_df(data_url)
    meta_df = pd.DataFrame()

    for level in ('word_seg', 'article'):
        # token num
        meta_df[level + '_num'] = data_df[level].apply(lambda x: len(x.split()))
        # distinct token num
        meta_df[level + '_unique'] = data_df[level].apply(lambda x: len(set(x.split())))
        # most common token and its count
        meta_df[[level + '_common', level + '_common_num']] = pd.DataFrame(
            data_df[level].apply(lambda x: Counter(x.split()).most_common(1)[0]).tolist()).astype(int)

    # average phrase len (article token num / word_seg token num)
    meta_df['avg_phrase_len'] = meta_df['article_num'] / meta_df['word_seg_num']

    # normalization
    if normalize:
        for col in meta_df:
            meta_df[col] -= meta_df[col].min()
            meta_df[col] /= meta_df[col].max()

    joblib.dump(meta_df, save_url)
    return meta_df
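These dense meta features can be concatenated with sparse text features; a sketch combining them with the tf-idf matrix from Example #5 (TRAIN_URL is the project constant used above):

from scipy.sparse import hstack, csr_matrix

meta_df = generate_meta_feature(TRAIN_URL)        # cached as *_meta_df.pk after the first call
X, y, X_test = tfidf_transform(TRAIN_URL, None)   # tf-idf features, no test set needed here
X_all = hstack([X, csr_matrix(meta_df.values)], format='csr')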
Example #8
def generate_vectors(train_url,
                     test_url=None,
                     column='article',
                     trans_type=None,
                     max_n=1,
                     min_df=1,
                     max_df=1.0,
                     max_features=1,
                     sublinear_tf=True,
                     balanced=False,
                     re_weight=0,
                     verbose=False,
                     drop_words=0):
    """ generate X, y, X_test vectors with csv(with header) url use pandas and CountVectorizer

    Args:
        train_url: url to train csv
        test_url: url to test csv, set to None if X_test is not needed
        column: column to use as feature
        trans_type: specific transformer, {'dc','idf'}
        max_n: max_n for ngram_range
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        sublinear_tf: sublinear_tf for default TfdcTransformer
        balanced: balanced for default TfdcTransformer, for idf transformer, it is use_idf
        re_weight: re_weight for TfdcTransformer
        verbose: True to show more information
        drop_words: probability of randomly deleting some words from a sentence, 0 to disable

    Returns:
        X, y, X_test

    """
    verbose and print("loading '%s' level data from %s with pandas" %
                      (column, train_url))

    train_df = load_to_df(train_url)

    # vectorizer
    vec = CountVectorizer(ngram_range=(1, max_n),
                          min_df=min_df,
                          max_df=max_df,
                          max_features=max_features,
                          token_pattern=r'\w+')
    s_time = time()
    verbose and print("finish loading, vectorizing")
    verbose and print("vectorizer params:", vec.get_params())

    sequences = train_df[column]
    # randomly delete some words from a fraction of the sentences
    if drop_words > 0:
        for i, row in enumerate(sequences):
            if np.random.ranf() < drop_words:
                row = np.array(row.split())
                sequences.at[i] = ' '.join(row[np.random.ranf(row.shape) > 0.35])

    X = vec.fit_transform(sequences)
    e_time = time()
    verbose and print("finish vectorizing in %.3f seconds, transforming" %
                      (e_time - s_time))
    # transformer
    if trans_type is None or trans_type == 'idf':
        trans = TfidfTransformer(sublinear_tf=sublinear_tf, use_idf=balanced)
    else:
        trans = TfdcTransformer(sublinear_tf=sublinear_tf,
                                balanced=balanced,
                                re_weight=re_weight)

    verbose and print("transformer params:", trans.get_params())
    y = np.array((train_df["class"]).astype(int))
    X = trans.fit_transform(X, y)

    X_test = None
    if test_url:
        verbose and print("transforming test set")
        test_df = load_to_df(test_url)
        X_test = vec.transform(test_df[column])
        X_test = trans.transform(X_test)

    s_time = time()
    verbose and print("finish transforming in %.3f seconds\n" %
                      (s_time - e_time))
    return X, y, X_test
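A sketch comparing the idf and dc weighting paths; the hyper-parameter values are illustrative, and TfdcTransformer is this project's own transformer, so the 'dc' branch only works where it is importable:

# tf-idf style weighting (balanced is forwarded as use_idf on this branch)
X, y, X_test = generate_vectors(TRAIN_URL, TEST_URL, column='word_seg',
                                trans_type='idf', max_n=2, min_df=3, max_df=0.8,
                                max_features=200000, balanced=True, verbose=True)

# tf-dc style weighting via the project's TfdcTransformer
X_dc, _, X_test_dc = generate_vectors(TRAIN_URL, TEST_URL, column='word_seg',
                                      trans_type='dc', max_n=2, min_df=3, max_df=0.8,
                                      max_features=200000, balanced=True)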
Example #9
def dict_transform(tw_dict,
                   train_url=TRAIN_URL,
                   test_url=None,
                   column='word_seg',
                   max_n=MAX_N,
                   min_df=MIN_DF,
                   max_df=MAX_DF,
                   max_features=MAX_FEATURES,
                   normalize=True,
                   sublinear_tf=True,
                   re_weight=0):
    """ use offline dict to transform data into vector

    Args:
        train_url: url to train data (with header)
        test_url: url to test data (with header)
        sentences: list of sentence to be vectorized
        tw_dict: term weighting dict to use
        max_n: max_n for CountVectorizer
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        normalize: normalize the vector or not
        sublinear_tf: use 1 + log(tf) instead of tf
        max_features: max_features for CountVectorizer
        re_weight: if re_weight > 0, use (1-re_weight) + (re_weight) * weights instead of weight
        column: column to use in dataframe

    Returns:
        X, y, X_test: vectorized data

    """
    print("transforming...")
    train_df = load_to_df(train_url)
    vectorizer = CountVectorizer(min_df=min_df,
                                 max_df=max_df,
                                 ngram_range=(1, max_n),
                                 token_pattern=r'(?u)\w+',
                                 max_features=max_features)
    X_train = vectorizer.fit_transform(
        train_df[column])  # use train data to get vocab
    y_train = np.asarray(train_df['class'])
    X_test = vectorizer.transform(
        load_to_df(test_url)[column]) if test_url else None

    # get the words that the matrix columns represent
    words = vectorizer.get_feature_names()
    # get weights of words
    weights = np.array([tw_dict[word] for word in words])
    if re_weight > 0:
        weights = 1 + re_weight * weights

    # apply weights, optional sublinear tf and row normalization, collecting the
    # results so the rebound X is not silently discarded
    results = []
    for X in (X_train, X_test):
        if X is None:
            results.append(None)
            continue

        # sublinear_tf like tf-idf
        if sublinear_tf:
            X.data = np.log(X.data) + 1

        X = X.multiply(weights).tocoo()  # can not use * to multiply
        if normalize:
            norm = sp.sparse.linalg.norm(X, axis=1)
            for i, row in enumerate(X.row):
                X.data[i] /= norm[row]
        results.append(X)

    X_train, X_test = results
    return X_train, y_train, X_test
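dict_transform expects tw_dict to hold a weight for every token the CountVectorizer keeps; a toy sketch with uniform weights (purely illustrative, so the result degenerates to sublinear, normalized tf):

from collections import defaultdict

# every kept token gets weight 1.0
uniform_dict = defaultdict(lambda: 1.0)
X, y, X_test = dict_transform(uniform_dict, train_url=TRAIN_URL, test_url=TEST_URL,
                              max_n=1, sublinear_tf=True, normalize=True)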