def create_word2vec_features(data, col1, col2, pref=''):

    logging.info('Creating Word2Vec features.')
    feature_class = pref + 'word2vec'
    if check_if_exists(feature_class):
        logging.info('Word2Vec features are already created.')
        return

    models = []

    # Create our own model.
    corpus = list(data[col1]) + list(data[col2])
    models.append(Word2VecModel(corpus=corpus, name='Corpus'))

    # Load pre-trained models.
    for file in os.listdir(MODELS_DIR):
        if file.endswith('.txt') or file.endswith('.bin'):
            models.append(Word2VecModel(path=os.path.join(MODELS_DIR, file),
                                        name=file.split('.', 1)[0]))

    res = pd.DataFrame()
    for model in models:
        estimator = Word2VecEstimator(model)
        res['%s_n_similarity' % model.name] = data.apply(
            lambda x: estimator.get_n_similarity(x[col1], x[col2]), axis=1)
        res['%s_n_similarity_imp' % model.name] = data.apply(
            lambda x: estimator.get_n_similarity_imp(x[col1], x[col2]), axis=1)
        res['%s_centroid_rmse' % model.name] = data.apply(
            lambda x: estimator.get_centroid_rmse(x[col1], x[col2]), axis=1)
        res['%s_centroid_rmse_imp' % model.name] = data.apply(
            lambda x: estimator.get_centroid_rmse_imp(x[col1], x[col2]), axis=1)

    add_features(feature_class, res.columns.tolist())
    dump_features(feature_class, res)
    logging.info('Word2Vec features are created and saved to pickle file.')
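
# The Word2VecModel / Word2VecEstimator wrappers used above are defined elsewhere
# in the project. As a rough, hypothetical sketch (assuming a gensim KeyedVectors
# backend, which the real classes may or may not use), the similarity and centroid
# features could be computed roughly like this:
import numpy as np


class Word2VecEstimatorSketch:
    def __init__(self, keyed_vectors):
        self.kv = keyed_vectors

    def _known(self, words):
        # Keep only words present in the embedding vocabulary.
        return [w for w in words if w in self.kv]

    def get_n_similarity(self, words1, words2):
        w1, w2 = self._known(words1), self._known(words2)
        if not w1 or not w2:
            return 0.0
        return float(self.kv.n_similarity(w1, w2))

    def get_centroid_rmse(self, words1, words2):
        w1, w2 = self._known(words1), self._known(words2)
        if not w1 or not w2:
            return 0.0
        c1 = np.mean([self.kv[w] for w in w1], axis=0)
        c2 = np.mean([self.kv[w] for w in w2], axis=0)
        return float(np.sqrt(np.mean((c1 - c2) ** 2)))
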
def create_most_common_words_features(df_all, col1, col2,
                                      max_features=MAX_FEATURES, pref=''):

    logging.info('Creating most common words features.')
    feature_class = pref + 'most_common_words'
    if check_if_exists(feature_class):
        logging.info('Most common words features already created.')
        return

    count_vectorizer = CountVectorizer(min_df=MIN_DF,
                                       max_df=MAX_DF,
                                       max_features=max_features,
                                       strip_accents='unicode',
                                       analyzer='word',
                                       token_pattern=TOKEN_PATTERN,
                                       ngram_range=NGRAM_RANGE,
                                       stop_words='english',
                                       binary=True,
                                       vocabulary=None)
    documents = pd.concat([df_all[col1], df_all[col2]], axis=0)
    X = count_vectorizer.fit_transform(documents)

    logging.debug(count_vectorizer.get_feature_names())

    X_col1 = X[0:len(df_all)]
    X_col2 = X[len(df_all):2 * len(df_all)]

    res = X_col1 + X_col2

    dump_features(feature_class, res)
    logging.info('Most common words features are created and saved to pickle file.')
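
# Because binary=True above, the element-wise sum X_col1 + X_col2 is 2 where a
# vocabulary word occurs in both questions of a pair, 1 where it occurs in only
# one of them, and 0 otherwise. A tiny illustration on made-up data:
from sklearn.feature_extraction.text import CountVectorizer

q1 = ['how to learn python fast']
q2 = ['best way to learn python']
cv = CountVectorizer(binary=True)
X = cv.fit_transform(q1 + q2)
vocab = sorted(cv.vocabulary_, key=cv.vocabulary_.get)
print(vocab)
print((X[0] + X[1]).toarray())  # 2 for 'learn', 'python', 'to'; 1 elsewhere
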
def create_common_words_count_features(data, pref=''):

    logging.info('Creating common words features')
    feature_class = pref + 'common_words'
    if check_if_exists(feature_class):
        logging.info('Common words features already created')
        return

    res = pd.DataFrame()
    res['common_words'] = data.apply(
        lambda x: common_words_count(x['words1'], x['words2']), axis=1)
    res['len1'] = data['words1'].apply(lambda x: len(x))
    res['len2'] = data['words2'].apply(lambda x: len(x))
    res['lenunion'] = data.apply(
        lambda x: union_words_count(x['words1'], x['words2']), axis=1)
    res['distance1'] = res['common_words'] / res['lenunion']
    res['distance2'] = res['common_words'] / (res['len1'] + res['len2'])

    res['common_words_len'] = data.apply(
        lambda x: common_words_len(x['words1'], x['words2']), axis=1)
    res['abs_len1'] = data['words1'].apply(lambda x: words_len(x))
    res['abs_len2'] = data['words2'].apply(lambda x: words_len(x))
    res['abs_lenunion'] = data.apply(
        lambda x: union_words_len(x['words1'], x['words2']), axis=1)
    res['absdistance1'] = res['common_words_len'] / res['abs_lenunion']
    res['absdistance2'] = res['common_words_len'] / (res['abs_len1'] +
                                                     res['abs_len2'])

    features = res.columns.tolist()
    add_features(feature_class, features)
    dump_features(feature_class, res)
    logging.info('Common words features are created and saved to pickle file.')
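
# The word-overlap helpers referenced above (common_words_count and friends) are
# defined elsewhere in the project; the hypothetical versions below show one
# plausible set-based reading of them, assuming each argument is a list of tokens.
def common_words_count(words1, words2):
    return len(set(words1) & set(words2))


def union_words_count(words1, words2):
    return len(set(words1) | set(words2))


def words_len(words):
    # Total number of characters over all tokens.
    return sum(len(w) for w in words)


def common_words_len(words1, words2):
    return sum(len(w) for w in set(words1) & set(words2))


def union_words_len(words1, words2):
    return sum(len(w) for w in set(words1) | set(words2))
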
def create_raw_tfidf_features(df_all, columns, pref=''):

    logging.info('Creating raw tfidf features.')
    feature_class = '%sraw_tfidf_%s' % (pref, columns[0])
    if check_if_exists(feature_class):
        logging.info('Raw tfidf features already created.')
        return

    for c in columns:

        tfidf_vectorizer = TfidfVectorizer(min_df=MIN_DF,
                                           max_df=MAX_DF,
                                           max_features=None,
                                           strip_accents='unicode',
                                           analyzer='word',
                                           token_pattern=TOKEN_PATTERN,
                                           ngram_range=NGRAM_RANGE,
                                           use_idf=1,
                                           smooth_idf=1,
                                           sublinear_tf=1,
                                           stop_words='english',
                                           norm=NORM,
                                           vocabulary=None)

        X = tfidf_vectorizer.fit_transform(df_all[c])
        logging.info('Shape of Tfidf transform matrix is %s' % str(X.shape))

        feature_class = '%sraw_tfidf_%s' % (pref, c)
        dump_features(feature_class, X)

    logging.info('Raw tfidf features are created and saved to pickle file.')
def create_svd_tfidf_features(columns, n_components=N_COMPONENTS, pref=''):

    logging.info('Creating svd tfidf features.')
    feature_class = pref + 'svd_tfidf'
    if check_if_exists(feature_class):
        logging.info('SVD tfidf features already created.')
        return

    data = []
    svd = TruncatedSVD(n_components=n_components, n_iter=15)

    for c in columns:
        X = load_features('%sraw_tfidf_%s' % (pref, c))
        X_transformed = svd.fit_transform(X)
        svd_columns = [
            'tfidf_svd_' + c + '_' + str(i) for i in range(n_components)
        ]
        data.append(pd.DataFrame(X_transformed, columns=svd_columns))

    # Keep the per-column svd feature names (ignore_index would turn them into
    # integer labels).
    df = pd.concat(data, axis=1)
    add_features(feature_class, df.columns.tolist())
    dump_features(feature_class, df)

    logging.info('Shape of svd tfidf features is %s' % str(df.shape))
    logging.info('Svd tfidf features are created.')
def create_logistic_features(metafeatures_dir=METAFEATURES_DIR,
                             preds_dir=PRED_DIR,
                             pref=''):

    logging.info('Creating logistic features.')
    feature_class = pref + 'logistic'
    if check_if_exists(feature_class):
        logging.info('Logistic features (%s) already created.' % feature_class)
        return

    metafeatures_filenames = filenames_in_dir(metafeatures_dir, '.pickle')
    preds_filenames = filenames_in_dir(preds_dir, '.csv')
    common_filenames = set(metafeatures_filenames).intersection(
        set(preds_filenames))
    common_filenames = sorted(common_filenames)

    # We are only interested in logistic metafeatures.
    common_filenames = [
        f for f in common_filenames if f.startswith('Logistic')
    ]

    train_data = []
    for filename in common_filenames:

        with open(os.path.join(metafeatures_dir, filename + '.pickle'),
                  'rb') as file:
            loaded = pickle.load(file)
        try:
            # Some pickles hold a 2-D array; reduce it to a single column.
            metafeature = np.sum(loaded, axis=1)
        except Exception:
            metafeature = loaded
        metafeature = rescale_preds(metafeature, a=B, b=A)
        train_data.append(metafeature)

    train_data = np.stack(train_data, axis=1)
    train_data = pd.DataFrame(train_data, columns=common_filenames)

    # Load preds.
    test_data = []
    for filename in common_filenames:
        file = os.path.join(preds_dir, filename + '.csv')
        preds = pd.read_csv(file, usecols=['is_duplicate'])
        # We need to rescale predictions back to avoid double rescaling.
        # TODO: think about a better way to do it.
        preds = rescale_preds(preds, a=B, b=A)
        test_data.append(preds.values)

    test_data = np.concatenate(test_data, axis=1)
    test_data = pd.DataFrame(test_data, columns=common_filenames)

    data = pd.concat([train_data, test_data])

    add_features(feature_class, common_filenames)
    dump_features(feature_class, data)
    logging.info('Logistic features are created and saved to pickle file.')
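
# rescale_preds and the constants A and B are defined elsewhere. A plausible form
# (an assumption, not the project's confirmed code) is the usual class-rebalancing
# transform; with that form, calling it with a=B, b=A exactly undoes a previous
# call with a=A, b=B, which is consistent with the "avoid double rescaling" note.
def rescale_preds(p, a, b):
    # Map a probability p through p' = a*p / (a*p + b*(1 - p)).
    return a * p / (a * p + b * (1 - p))
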
def create_magic_features(df_all, pref=''):

    logging.info('Creating magic features.')
    feature_class = pref + 'magic'
    if check_if_exists(feature_class):
        logging.info('Magic features (%s) already created.' %
                     feature_class)
        return

    # 1. Creating questions dictionary: question -> hash_value.
    logging.debug('Creating questions dictionary...')
    questions1 = df_all[['question1', 'question2']].copy()
    questions2 = df_all[['question2', 'question1']].copy()
    questions2.rename(columns={'question1': 'question2', 'question2': 'question1'},
                      inplace=True)
    questions = pd.concat([questions1, questions2])
    questions.reset_index(inplace=True, drop=True)

    unique_questions = questions.drop_duplicates(subset=['question1'])
    unique_questions.reset_index(inplace=True, drop=True)
    questions_dict = pd.Series(unique_questions.index.values,
                               index=unique_questions['question1'].values).to_dict()
    # 2. Creating hash values.
    logging.debug('Creating hash dictionary...')
    # res = pd.DataFrame()
    questions['q1hash'] = questions['question1'].map(questions_dict)
    questions['q2hash'] = questions['question2'].map(questions_dict)

    # 3. Creating edges: for each question, the list of its paired questions.
    logging.debug('Creating edges...')
    questions['l1hash'] = questions['q1hash'].apply(lambda x: [x])
    questions['l2hash'] = questions['q2hash'].apply(lambda x: [x])
    questions['edges1'] = questions.groupby('q1hash')['l2hash'].transform(sum)
    questions['edges2'] = questions.groupby('q2hash')['l1hash'].transform(sum)

    # 4. Keep the relevant columns for the original (non-mirrored) rows only.
    wanted_cols = ['l1hash', 'l2hash', 'edges1', 'edges2', 'q1hash', 'q2hash']
    res = questions[wanted_cols].copy()[0:len(df_all)]

    # 5. Creating intersection features.
    logging.debug('Creating intersection features...')
    res['common_edges'] = res.apply(
        lambda x: len(set(x.edges1).intersection(set(x.edges2))), axis=1)

    # 6. Whether question 2 ever appears in the question 1 column.
    logging.debug('Creating q2 in q1 feature...')
    questions1 = set(res['q1hash'].values)
    res['q2inq1'] = res['q2hash'].apply(lambda x: int(x in questions1))

    res.drop(['l1hash', 'l2hash', 'edges1', 'edges2'], axis=1, inplace=True)
    logging.debug(res.head())
    add_features(feature_class, res.columns.tolist())
    dump_features(feature_class, res)
    logging.info('Magic features are created and saved to pickle file.')
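
# The "magic" feature above is the size of the intersection of the two questions'
# neighbour sets in the question co-occurrence graph. A toy illustration of the
# same idea on made-up data (not the project's pipeline):
import pandas as pd

toy = pd.DataFrame({'question1': ['a', 'a', 'b'],
                    'question2': ['b', 'c', 'c']})
mirrored = toy.rename(columns={'question1': 'question2',
                               'question2': 'question1'})
both = pd.concat([toy, mirrored], ignore_index=True)
neighbours = both.groupby('question1')['question2'].apply(set)
common_edges = toy.apply(
    lambda r: len(neighbours[r['question1']] & neighbours[r['question2']]),
    axis=1)
print(common_edges.tolist())  # [1, 1, 1]: every pair shares exactly one neighbour
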
def create_tfidf_features(df_all, columns, qcol, unique=False, pref=''):

    logging.info('Creating tfidf features')
    feature_class = pref + 'tfidf'
    if check_if_exists(feature_class):
        logging.info('Tfidf features already created.')
        return

    df = pd.DataFrame()
    df['id'] = df_all['id']

    if TFIDF_ANALYSIS:

        for c in columns:

            logging.info("Doing TFIDF Analysis, it may take some time")
            if unique:
                create_idf(df_all[c].unique())
            else:
                create_idf(df_all[c])

            # different types of tfidf
            types = ('binary', 'freq', 'log_freq', 'dnorm')

            # different ways of aggregating term tfidf in query
            indexes, prefixes = (0, 1, 2), ('s', 'm', 'a')

            # two different functions - one exact match, other - common words
            funcs, suffixes = [tfidf1, tfidf2], ('1', '2')

            for (func, suffix) in zip(funcs, suffixes):
                if (func == tfidf2) and (not TFIDF2):
                    continue

                df['temp'] = df_all.apply(
                    lambda x: func(x[qcol], x[c], type='all'), axis=1)

                ind = 0
                for t in types:
                    for prefix in prefixes:
                        name = qcol + prefix + t + '_tfidf_' + c + '_' + suffix
                        df[name] = df['temp'].map(lambda x: x[ind])
                        ind += 1

            df.drop(['temp'], axis=1, inplace=True)

            logging.info('TFIDF analysis is finished')

    df.drop('id', axis=1, inplace=True)
    add_features(feature_class, df.columns.tolist())
    dump_features(feature_class, df)
    logging.info('Tfidf features are created and saved to pickle file.')
def create_distance_tfidf_features(col1, col2, pref=''):

    logging.info('Creating distance tfidf features.')
    feature_class = pref + 'distance_tfidf'
    if check_if_exists(feature_class):
        logging.info('Distance tfidf features already created.')
        return

    X_col1 = load_features('%scommon_vocabulary_raw_tfidf_%s' % (pref, col1))
    X_col2 = load_features('%scommon_vocabulary_raw_tfidf_%s' % (pref, col2))

    res = pd.DataFrame()
    res['cosine_similarity_%s_%s' % (col1, col2)] = (list(
        map(cosine_sim, X_col1, X_col2)))
    res['rmse_%s_%s' % (col1, col2)] = (list(map(rmse, X_col1, X_col2)))

    add_features(feature_class, res.columns.tolist())
    dump_features(feature_class, res)

    logging.info('Distance tfidf features are created.')
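
# cosine_sim and rmse are project helpers that are not shown here. Plausible
# row-wise versions (assuming each argument is a single sparse row of the tfidf
# matrices, which is what iterating a scipy CSR matrix yields) could look like:
import numpy as np


def cosine_sim(x, y):
    denom = np.sqrt(x.multiply(x).sum()) * np.sqrt(y.multiply(y).sum())
    return x.multiply(y).sum() / denom if denom else 0.0


def rmse(x, y):
    diff = (x - y).toarray().ravel()
    return float(np.sqrt(np.mean(diff ** 2)))
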
def create_common_vocabulary_raw_tfidf_features(df_all, col1, col2, pref=''):

    logging.info('Creating common vocabulary raw tfidf features.')
    feature_class = pref + 'common_vocabulary_raw_tfidf'
    if check_if_exists(feature_class):
        logging.info('Common vocabulary raw tfidf features already created.')
        return

    tfidf_vectorizer = TfidfVectorizer(min_df=MIN_DF,
                                       max_df=MAX_DF,
                                       max_features=None,
                                       strip_accents='unicode',
                                       analyzer='word',
                                       token_pattern=TOKEN_PATTERN,
                                       ngram_range=NGRAM_RANGE,
                                       use_idf=1,
                                       smooth_idf=1,
                                       sublinear_tf=1,
                                       stop_words='english',
                                       norm=NORM,
                                       vocabulary=None)
    documents = pd.concat([df_all[col1], df_all[col2]], axis=0)
    X = tfidf_vectorizer.fit_transform(documents)

    X_col1 = X[0:len(df_all)]
    X_col2 = X[len(df_all):2 * len(df_all)]

    res = X_col1.multiply(X_col2)

    dump_features('%scommon_vocabulary_raw_tfidf_%s' % (pref, col1), X_col1)
    dump_features('%scommon_vocabulary_raw_tfidf_%s' % (pref, col2), X_col2)
    dump_features(feature_class, res)
    logging.info('Common vocabulary raw tfidf features are created and saved '
                 'to pickle file.')
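
# The element-wise product above keeps, for every pair, the tfidf weight products
# of the terms shared by the two questions; summing a row therefore gives the
# (unnormalised) dot product of the two question vectors. A quick check on
# made-up vectors:
import numpy as np
from scipy import sparse

a = sparse.csr_matrix([[0.5, 0.0, 0.3]])
b = sparse.csr_matrix([[0.4, 0.2, 0.0]])
assert np.isclose(a.multiply(b).sum(),
                  a.toarray().ravel() @ b.toarray().ravel())  # both equal 0.2
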
def create_count_features(df_all, pref=''):

    logging.info('Creating count features.')
    feature_class = pref + 'count'
    if check_if_exists(feature_class):
        logging.info('Count features (%s) already created.' % feature_class)
        return

    df_q1_q2 = df_all[['question1', 'question2']].reset_index(drop=True)
    # Mirror the pairs so every question is counted in both roles.
    df_q2_q1 = df_all[['question1', 'question2']].reset_index(drop=True)
    df_q2_q1 = df_q2_q1.rename(columns={
        'question1': 'question2',
        'question2': 'question1'
    })
    df = pd.concat([df_q1_q2, df_q2_q1], axis=0, ignore_index=True)

    # Create count of q1 and q2 features.
    res = pd.DataFrame()
    grouper1 = df.reset_index().groupby('question1')
    grouper2 = df.reset_index().groupby('question2')
    res['q1count'] = grouper1['question2'].transform('count')
    res['q2count'] = grouper2['question1'].transform('count')
    res['q1rank'] = grouper1['question2'].rank()
    res['q2rank'] = grouper2['question1'].rank()
    # res['hash1'] = grouper1['index'].transform(lambda x: x.iloc[0])
    # res['hash2'] = grouper2['index'].transform(lambda x: x.iloc[0])
    res = res[0:len(df_q1_q2)]

    # Number of sentences count.
    res['sent1count'] = df_q1_q2['question1'].apply(
        lambda x: len(create_sentences(x)))
    res['sent2count'] = df_q1_q2['question2'].apply(
        lambda x: len(create_sentences(x)))

    add_features(feature_class, res.columns.tolist())
    dump_features(feature_class, res)
    logging.info('Count features are created and saved to pickle file.')
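
# create_sentences is defined elsewhere in the project; a reasonable guess (only
# an assumption) is a thin wrapper around NLTK's sentence tokenizer:
from nltk.tokenize import sent_tokenize


def create_sentences(text):
    return sent_tokenize(str(text))
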
def create_wordnet_features(data, pref=''):

    feature_class = pref + 'wordnet'
    logging.info('Creating wordnet (%s) features' % feature_class)
    if check_if_exists(feature_class):
        logging.info('Wordnet (%s) features already created' % feature_class)
        return

    res = pd.DataFrame()
    logging.info('Creating synonyms count...')
    res['synonyms_count'] = data.apply(
        lambda x: synonyms_count(x['words1'], x['words2']), axis=1)
    logging.info('Creating antonyms count...')
    res['antonyms_count'] = data.apply(
        lambda x: antonyms_count(x['words1'], x['words2']), axis=1)
    #    logging.info('Creating hyponyms count...')
    #    res['hyponyms_count'] = data.apply(
    #        lambda x: hyponyms_count(x['words1'], x['words2']), axis=1)
    #    logging.info('Creating hypernyms count...')
    #    res['hypernyms_count'] = data.apply(
    #        lambda x: hypernyms_count(x['words1'], x['words2']), axis=1)
    logging.info('Calculating synonyms and antonyms distances...')
    len1 = data['words1'].apply(lambda x: len(x))
    len2 = data['words2'].apply(lambda x: len(x))
    lenunion = data.apply(
        lambda x: union_words_count(x['words1'], x['words2']), axis=1)

    res['syn_distance1'] = res['synonyms_count'] / lenunion
    res['syn_distance2'] = res['synonyms_count'] / (len1 + len2)

    res['anton_distance1'] = res['antonyms_count'] / lenunion
    res['anton_distance2'] = res['antonyms_count'] / (len1 + len2)

    features = res.columns.tolist()
    add_features(feature_class, features)
    dump_features(feature_class, res)
    logging.info('Wordnet features are created and saved to pickle file.')
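
# synonyms_count and antonyms_count come from elsewhere in the project. The
# hypothetical NLTK-based sketch below shows one way such counts could be
# computed; the real helpers may differ.
from nltk.corpus import wordnet


def _synonym_lemmas(word):
    return {lemma.name() for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()}


def synonyms_count(words1, words2):
    # Count words of the second question that are WordNet synonyms of some
    # word of the first question.
    lemmas = set()
    for w in set(words1):
        lemmas |= _synonym_lemmas(w)
    return sum(1 for w in set(words2) if w in lemmas)


def antonyms_count(words1, words2):
    antonyms = {a.name() for w in set(words1) for s in wordnet.synsets(w)
                for lemma in s.lemmas() for a in lemma.antonyms()}
    return sum(1 for w in set(words2) if w in antonyms)
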
def create_common_vocabulary_svd_tfidf_features(n_components=2 * N_COMPONENTS,
                                                pref=''):

    logging.info('Creating common vocabulary svd tfidf features.')
    feature_class = pref + 'common_vocabulary_svd_tfidf'
    if check_if_exists(feature_class):
        logging.info('Common Vocabulary SVD tfidf features already created.')
        return

    svd = TruncatedSVD(n_components=n_components, n_iter=15)

    X = load_features(pref + 'common_vocabulary_raw_tfidf')
    X_transformed = svd.fit_transform(X)
    svd_columns = [
        'common_vocabulary_tfidf_svd_' + str(i) for i in range(n_components)
    ]
    data = pd.DataFrame(X_transformed, columns=svd_columns)

    add_features(feature_class, data.columns.tolist())
    dump_features(feature_class, data)

    logging.info('Shape of common vocabulary svd tfidf features is %s' %
                 str(data.shape))
    logging.info('Common vocabulary SVD tfidf features are created.')
def create_specific_word_counts(df_all,
                                specific_words=SPECIFIC_WORDS,
                                pref=''):

    logging.info('Creating specific word features.')
    feature_class = pref + 'specific_words'
    if check_if_exists(feature_class):
        logging.info('Specific word features (%s) already created.' %
                     feature_class)
        return

    # Do some preprocessing here so we do not rely on whether the data was
    # already preprocessed when it was supplied.
    df_all['question1'] = df_all['question1'].apply(lambda x: str(x).lower())
    df_all['question2'] = df_all['question2'].apply(lambda x: str(x).lower())

    res = pd.DataFrame()
    for word in specific_words:
        res[word +
            'in_q1'] = df_all['question1'].apply(lambda x: int(word in x))
        res[word +
            'in_q2'] = df_all['question2'].apply(lambda x: int(word in x))
        res[word] = res[word + 'in_q1'] + res[word + 'in_q2']

    res['neg_in_q1'], res['neg_in_q2'], res['neg'] = 0, 0, 0
    for word in NEGATION_WORDS:
        res['neg_in_q1'] = res['neg_in_q1'] + df_all['question1'].apply(
            lambda x: int(word in x))
        res['neg_in_q2'] = res['neg_in_q2'] + df_all['question2'].apply(
            lambda x: int(word in x))
    res['neg'] = res['neg_in_q1'] + res['neg_in_q2']

    add_features(feature_class, res.columns.tolist())
    dump_features(feature_class, res)
    logging.info(
        'Specific word counts features are created and saved to pickle file.')
def create_grouping_features(df_all, pref=''):

    logging.info('Creating grouping features.')
    feature_class = pref + 'grouping'
    if check_if_exists(feature_class):
        logging.info('Grouping features (%s) already created.' % feature_class)
        return

    columns = ['distance1', 'distance2', 'absdistance1', 'absdistance2']
    common_words = (load_features(pref + 'common_words')[columns].reset_index(
        drop=True))

    if check_if_exists(pref + 'distance_tfidf'):
        distance_tfidf_features = (load_features(pref +
                                                 'distance_tfidf').reset_index(
                                                     drop=True))
        columns += distance_tfidf_features.columns.tolist()
    else:
        distance_tfidf_features = pd.DataFrame()

    if check_if_exists(pref + 'word2vec'):
        word2vec_features = load_features(pref +
                                          'word2vec').reset_index(drop=True)
        columns += word2vec_features.columns.tolist()
    else:
        word2vec_features = pd.DataFrame()

    if check_if_exists(pref + 'wordnet'):
        wordnet_features = load_features(pref +
                                         'wordnet').reset_index(drop=True)
        columns += wordnet_features.columns.tolist()
    else:
        wordnet_features = pd.DataFrame()

    df_q1_q2 = pd.concat([
        common_words, distance_tfidf_features, word2vec_features,
        wordnet_features, df_all[['question1', 'question2'
                                  ]].reset_index(drop=True)
    ],
                         axis=1)
    df_q2_q1 = pd.concat([
        common_words, distance_tfidf_features, word2vec_features,
        wordnet_features, df_all[['question2', 'question1'
                                  ]].reset_index(drop=True)
    ],
                         axis=1)
    df_q2_q1 = df_q2_q1.rename(columns={
        'question1': 'question2',
        'question2': 'question1'
    })
    df = pd.concat([df_q1_q2, df_q2_q1], axis=0, ignore_index=True)

    # GroupBy objects.
    groupby_q1 = df.groupby('question1')
    groupby_q2 = df.groupby('question2')

    df['q1count'] = groupby_q1['question2'].transform('count')
    df['q2count'] = groupby_q2['question1'].transform('count')
    inds_q1_gr_q2 = (df['q1count'] > df['q2count'])[0:len(df_q1_q2)]
    inds_q2_gr_q1 = ~inds_q1_gr_q2

    res = pd.DataFrame()

    groupers = ['min', 'max', 'mean']
    for grouper in groupers:
        for col in columns:

            res[grouper + '_by_q1_' +
                col] = (groupby_q1[col].transform(grouper)[0:len(df_q1_q2)])
            res[grouper + '_by_q2_' +
                col] = (groupby_q2[col].transform(grouper)[0:len(df_q1_q2)])

            res[col] = df[col][0:len(df_q1_q2)]
            res['rel_q1_' + col] = res.apply(lambda x: np_utils.try_to_divide(
                x[col], x[grouper + '_by_q1_' + col]),
                                             axis=1)
            res['rel_q2_' + col] = res.apply(lambda x: np_utils.try_to_divide(
                x[col], x[grouper + '_by_q2_' + col]),
                                             axis=1)

            res[grouper + '_by_' + col] = 0
            res.loc[inds_q1_gr_q2,
                    grouper + '_by_' + col] = res[grouper + '_by_q1_' + col]
            res.loc[inds_q2_gr_q1,
                    grouper + '_by_' + col] = res[grouper + '_by_q2_' + col]
            res['rel_' + col] = res.apply(lambda x: np_utils.try_to_divide(
                x[col], x[grouper + '_by_' + col]),
                                          axis=1)

            del res[col]

    add_features(feature_class, res.columns.tolist())
    dump_features(feature_class, res)
    logging.info('Grouping features are created and saved to pickle file.')
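
# A plausible call order for the builders above (illustrative only: df_all holds
# the raw question pairs, data holds the tokenised 'words1'/'words2' columns, and
# both names are assumptions about the surrounding pipeline code).
create_common_words_count_features(data)
create_raw_tfidf_features(df_all, ['question1', 'question2'])
create_svd_tfidf_features(['question1', 'question2'])
create_common_vocabulary_raw_tfidf_features(df_all, 'question1', 'question2')
create_common_vocabulary_svd_tfidf_features()
create_distance_tfidf_features('question1', 'question2')
create_word2vec_features(data, 'words1', 'words2')
create_wordnet_features(data)
create_count_features(df_all)
create_magic_features(df_all)
create_grouping_features(df_all)  # relies on the feature classes dumped above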