示例#1
0
 def prepare(self):
     """Fit ``self.vectorizer`` on every question of the train and test files.

     The files registered under ``'train'`` and ``'test'`` in
     ``self.input_files`` are loaded through ``nltk_stemming``. Their
     ``question1`` and ``question2`` columns are concatenated (train first,
     then test), cast to ``str`` and passed to ``self.vectorizer.fit``.
     """
     frames = (
         nltk_stemming(self.input_files['train']),
         nltk_stemming(self.input_files['test']),
     )
     corpus = []
     for frame in frames:
         for column in ('question1', 'question2'):
             corpus.extend(frame[column].tolist())
     questions = pd.Series(corpus).astype(str)
     self.vectorizer.fit(questions.values)
示例#2
0
def main():
    """Fit a TF-IDF vectorizer on all questions and build features per file.

    The data files are resolved from the ``data_prefix`` command-line option.
    The ``question1``/``question2`` columns of the train and test files
    (loaded through ``nltk_stemming``) form the fitting corpus, and
    ``create_feature`` is then invoked for every resolved file with the
    fitted vectorizer.
    """
    options = common_feature_parser().parse_args()

    # Resolve the (key, file name) pairs once and reuse them for both the
    # train/test lookup and the final loop.
    # NOTE(review): assumes generate_filename_from_prefix is deterministic
    # for a given prefix - confirm.
    data_files = list(generate_filename_from_prefix(options.data_prefix))
    files_by_key = dict(data_files)

    df_train = nltk_stemming(files_by_key['train'])
    df_test = nltk_stemming(files_by_key['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, norm='l2')
    # Only the fitted state is needed here; fit() skips building the
    # transformed matrix that fit_transform() would return and that was
    # thrown away.
    vectorizer.fit(train_qs.values)

    for _, file_name in data_files:
        create_feature(data_file=file_name, vectorizer=vectorizer)
示例#3
0
def main():
    """Fit a TF-IDF + truncated-SVD pipeline and build features per file.

    The train and test files resolved from the ``data_prefix`` command-line
    option are loaded through ``nltk_stemming``; all of their ``question1``
    and ``question2`` values (train first, then test) are cast to ``str``
    and used to fit the pipeline. ``create_feature`` is then called for
    every resolved file with the fitted pipeline as ``vectorizer``.
    """
    options = common_feature_parser().parse_args()

    train_path = dict(generate_filename_from_prefix(options.data_prefix))['train']
    df_train = nltk_stemming(train_path)
    test_path = dict(generate_filename_from_prefix(options.data_prefix))['test']
    df_test = nltk_stemming(test_path)

    all_questions = [
        question
        for frame in (df_train, df_test)
        for column in ('question1', 'question2')
        for question in frame[column].tolist()
    ]
    train_qs = pd.Series(all_questions).astype(str)

    tfidf = TfidfVectorizer(max_df=0.5, min_df=2, norm='l2', ngram_range=(1, 2))
    svd = TruncatedSVD(n_components=10)
    pipeline = make_pipeline(tfidf, svd)
    pipeline.fit(train_qs.values)

    for _key, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
def create_word_match_feature(data_file):
    """Write the ``word_match_share`` feature of ``data_file`` to a CSV file.

    Nothing is computed when the output file returned by
    ``feature_output_file`` already exists. Otherwise the file is loaded
    through ``nltk_stemming``, ``word_match_share`` is applied to every row,
    and the resulting single column is saved without the index, formatted
    with five decimals. The column is named ``'f'`` followed by the part of
    the output file's base name that precedes the first underscore.
    """
    output_path = feature_output_file(data_file)
    if os.path.exists(output_path):
        print('File exists {}.'.format(output_path))
        return

    df = nltk_stemming(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)

    feature_id = os.path.basename(output_path).split('_')[0]
    column_name = 'f{0}'.format(feature_id)
    df[column_name] = df.apply(word_match_share, axis=1, raw=True)

    result = df[[column_name]]
    result.to_csv(output_path, index=False, float_format='%.5f')
示例#5
0
 def read_data(self, data_file):
     """Return one value per row: the summed weights of the shared counts.

     ``data_file`` is loaded through ``nltk_stemming``; missing questions
     are replaced by empty strings. Both question columns are transformed
     with ``self.count_vectorizer``, the ``minimum`` of the two resulting
     matrices is passed through ``self.tfidf_transformer`` and each row is
     summed. The result is a flat NumPy array.
     """
     frame = nltk_stemming(data_file)
     first_questions = frame['question1'].fillna("").tolist()
     second_questions = frame['question2'].fillna("").tolist()

     first_counts = self.count_vectorizer.transform(first_questions)
     second_counts = self.count_vectorizer.transform(second_questions)

     shared_counts = first_counts.minimum(second_counts)
     weighted = self.tfidf_transformer.transform(shared_counts)
     row_totals = weighted.sum(axis=1)
     return np.array(row_totals).flatten()
示例#6
0
def create_feature(data_file, vectorizer):
    """Write the row-wise dot product of both question vectors to a CSV file.

    Nothing is computed when the output file returned by
    ``feature_output_file`` already exists. Otherwise ``data_file`` is loaded
    through ``nltk_stemming``, ``question1`` and ``question2`` are cast to
    ``str`` and transformed with ``vectorizer``, and the dot product of the
    two vectors of each row is saved, formatted with five decimals.

    Args:
        data_file: Path of the input file handed to ``nltk_stemming``.
        vectorizer: Fitted object exposing ``transform``; both SciPy sparse
            and dense 2-D outputs are supported.
    """
    from scipy import sparse

    output_file = feature_output_file(data_file)
    if os.path.exists(output_file):
        print('File exists {}.'.format(output_file))
        return

    df = nltk_stemming(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(os.path.basename(output_file).split('_')[0])
    X1 = vectorizer.transform(df.question1.values.astype(str))
    X2 = vectorizer.transform(df.question2.values.astype(str))

    # np.dot is not sparse-aware: on sparse rows it produced 1x1 matrix
    # objects rather than numbers. Compute every row's dot product at once
    # as the row sums of the element-wise product instead.
    if sparse.issparse(X1):
        values = np.asarray(X1.multiply(X2).sum(axis=1)).ravel()
    else:
        values = np.einsum('ij,ij->i', np.asarray(X1), np.asarray(X2))

    df[column_name] = values
    # NOTE(review): a Series is written here; older pandas versions omit the
    # header row for Series.to_csv by default - confirm what readers expect.
    df[column_name].to_csv(output_file, index=False, float_format='%.5f')
示例#7
0
def create_feature(data_file, vectorizer):
    """Write the concatenated question vectors of ``data_file`` to a CSV file.

    Nothing is computed when the output file returned by
    ``feature_output_file`` already exists. Otherwise ``data_file`` is loaded
    through ``nltk_stemming``, ``question1`` and ``question2`` are cast to
    ``str`` and transformed with ``vectorizer``, and the two results are
    stacked side by side. Column ``i`` of the stacked matrix is saved as
    ``'<prefix>.<i>'`` where ``<prefix>`` is ``'f'`` followed by the part of
    the output file's base name that precedes the first underscore.

    Args:
        data_file: Path of the input file handed to ``nltk_stemming``.
        vectorizer: Fitted object exposing ``transform``.
    """
    output_file = feature_output_file(data_file)
    if os.path.exists(output_file):
        print('File exists {}.'.format(output_file))
        return

    df = nltk_stemming(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(os.path.basename(output_file).split('_')[0])
    X1 = vectorizer.transform(df.question1.values.astype(str))
    X2 = vectorizer.transform(df.question2.values.astype(str))
    # NOTE(review): np.hstack presumably requires dense transform output
    # (e.g. a pipeline ending in a decomposition step) - confirm.
    X = np.hstack((X1, X2))

    column_names = [
        '{0}.{1}'.format(column_name_prefix, i) for i in range(X.shape[1])
    ]
    # Build the output frame in one step; the columns were previously added
    # to ``df`` one by one and then assigned a second time in bulk.
    features = pd.DataFrame(X, columns=column_names)
    features.to_csv(output_file, index=False, float_format='%.5f')
def create_feature(data_file, vectorizer: TfidfVectorizer):
    """Write the row-wise dot product of both TF-IDF vectors to a CSV file.

    Nothing is computed when the output file returned by
    ``feature_output_file`` already exists. Otherwise ``data_file`` is loaded
    through ``nltk_stemming``, non-string questions are replaced by empty
    strings, both question columns are transformed with ``vectorizer`` and
    the dot product of the two vectors of each row is saved, formatted with
    five decimals.

    Args:
        data_file: Path of the input file handed to ``nltk_stemming``.
        vectorizer: Fitted TF-IDF vectorizer.
    """
    output_file = feature_output_file(data_file)
    if os.path.exists(output_file):
        print('File exists {}.'.format(output_file))
        return
    df = nltk_stemming(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)

    def _as_text(value):
        # isinstance also accepts str subclasses (e.g. numpy.str_), which an
        # exact ``type(x) == str`` comparison would blank out.
        return value if isinstance(value, str) else ''

    q1vec = vectorizer.transform(df['question1'].apply(_as_text).values)
    q2vec = vectorizer.transform(df['question2'].apply(_as_text).values)

    # Row sums of the element-wise product equal the per-row dot products and
    # avoid slicing the sparse matrices one row at a time. Rounding to five
    # decimals is left to ``float_format`` below.
    cosine_values = np.asarray(q1vec.multiply(q2vec).sum(axis=1)).ravel()

    column_name = 'f{0}'.format(
        os.path.basename(output_file).split('_')[0])
    df[column_name] = cosine_values
    df[[column_name]].to_csv(output_file,
                             index=False,
                             float_format='%.5f')
示例#9
0
 def read_data(self, data_file):
     """Return the per-row sum of the vectorized ``question2`` column.

     ``data_file`` is loaded through ``nltk_stemming``, missing questions
     are replaced by empty strings, the column is transformed with
     ``self.vectorizer`` and every row of the result is summed. The sums
     are returned as a one-dimensional NumPy array.
     """
     questions = nltk_stemming(data_file)['question2'].fillna("").tolist()
     vectors = self.vectorizer.transform(questions)
     row_sums = vectors.sum(axis=1)
     return np.array(row_sums).reshape(-1, )
 def read_data(self, data_file):
     """Return, for every row, the vectors of ``question1`` and ``question2``.

     ``data_file`` is loaded through ``nltk_stemming`` and missing questions
     are replaced by empty strings before both columns are transformed with
     ``self.vectorizer``.

     Returns:
         A list with one ``(row, column)`` tuple per input row: ``row`` is
         row ``i`` of the ``question1`` matrix and ``column`` is column ``i``
         of the transposed ``question2`` matrix.
     """
     # Load the file once; it was previously loaded separately for each
     # question column.
     data = nltk_stemming(data_file)
     X1 = self.vectorizer.transform(data['question1'].fillna("").tolist())
     X2 = self.vectorizer.transform(data['question2'].fillna("").tolist()).T
     return [(X1.getrow(i), X2.getcol(i)) for i in range(X1.shape[0])]