def prepare(self):
    # Fit the vectorizer on the union of all question text from train and test.
    df_train = nltk_stemming(self.input_files['train'])
    df_test = nltk_stemming(self.input_files['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    self.vectorizer.fit(train_qs.values)
def main():
    options = common_feature_parser().parse_args()
    # Build the filename map once instead of regenerating it per lookup.
    file_names = dict(generate_filename_from_prefix(options.data_prefix))
    df_train = nltk_stemming(file_names['train'])
    df_test = nltk_stemming(file_names['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, norm='l2')
    # fit() is sufficient; the transformed matrix was never used.
    vectorizer.fit(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer)
def main():
    options = common_feature_parser().parse_args()
    file_names = dict(generate_filename_from_prefix(options.data_prefix))
    df_train = nltk_stemming(file_names['train'])
    df_test = nltk_stemming(file_names['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    # Reduce the sparse TF-IDF space to 10 dense dimensions per question.
    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2', ngram_range=(1, 2)),
        TruncatedSVD(n_components=10)
    )
    pipeline.fit(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
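# A minimal, self-contained sketch of what the TF-IDF + TruncatedSVD pipeline
# above produces. The toy questions and n_components=2 are hypothetical (the
# real pipeline uses n_components=10); the point is that each question becomes
# one dense low-dimensional row, which create_feature can write out column by
# column.
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

toy = ['how do we learn python', 'how do we learn guitar',
       'what is the best editor', 'which editor is best']
pipe = make_pipeline(TfidfVectorizer(), TruncatedSVD(n_components=2))
print(pipe.fit_transform(toy).shape)  # (4, 2): one dense row per question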
def create_word_match_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_stemming(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])
    # raw=True would hand word_match_share a bare ndarray on modern pandas,
    # so pass labeled rows instead.
    df[column_name] = df.apply(word_match_share, axis=1)
    df[[column_name]].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
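# word_match_share is not defined in this section; below is a minimal sketch
# of the ratio that name usually denotes in Quora-pairs kernels (an
# assumption, not necessarily this repository's exact implementation).
# Requires nltk.download('stopwords') once beforehand.
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))

def word_match_share_sketch(row):
    # Fraction of non-stopword tokens shared by the two questions.
    q1_words = {w for w in str(row['question1']).lower().split() if w not in stops}
    q2_words = {w for w in str(row['question2']).lower().split() if w not in stops}
    if not q1_words or not q2_words:
        return 0.0
    shared = len(q1_words & q2_words)
    return 2.0 * shared / (len(q1_words) + len(q2_words))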
def read_data(self, data_file):
    data = nltk_stemming(data_file)
    q1s = data['question1'].fillna("").tolist()
    q2s = data['question2'].fillna("").tolist()
    X1 = self.count_vectorizer.transform(q1s)
    X2 = self.count_vectorizer.transform(q2s)
    # Element-wise minimum keeps only the counts common to both questions,
    # so the row sum is the TF-IDF mass of the shared words.
    return np.array(
        self.tfidf_transformer.transform(X1.minimum(X2)).sum(axis=1)).flatten()
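# A minimal sketch of the X1.minimum(X2) trick above, with hypothetical toy
# questions; the CountVectorizer/TfidfTransformer pairing mirrors the class
# attributes read_data assumes.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ['how do we learn python', 'how do we learn guitar']
cv = CountVectorizer().fit(docs)
tf = TfidfTransformer().fit(cv.transform(docs))
X1 = cv.transform([docs[0]])
X2 = cv.transform([docs[1]])
shared = X1.minimum(X2)             # counts present in both questions
print(tf.transform(shared).sum())   # TF-IDF mass of the shared words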
def create_feature(data_file, vectorizer):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_stemming(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])
    X1 = vectorizer.transform(df.question1.values.astype(str))
    X2 = vectorizer.transform(df.question2.values.astype(str))
    values = []
    for i in tqdm(range(X1.shape[0])):
        # Row-by-row dot product; np.dot is not sparse-aware, so use scipy's
        # element-wise multiply and sum instead.
        values.append(float(X1[i].multiply(X2[i]).sum()))
    df[column_name] = values
    df[[column_name]].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
def create_feature(data_file, vectorizer):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_stemming(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])
    # The TF-IDF + TruncatedSVD pipeline returns dense arrays, so hstack works directly.
    X1 = vectorizer.transform(df.question1.values.astype(str))
    X2 = vectorizer.transform(df.question2.values.astype(str))
    X = np.hstack((X1, X2))
    column_names = []
    for i in range(X.shape[1]):
        column_name = '{0}.{1}'.format(column_name_prefix, i)
        df[column_name] = X[:, i]
        column_names.append(column_name)
    df[column_names].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
def create_feature(data_file, vectorizer: TfidfVectorizer):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_stemming(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    q1vec = vectorizer.transform(
        df['question1'].apply(lambda x: x if isinstance(x, str) else '').values)
    q2vec = vectorizer.transform(
        df['question2'].apply(lambda x: x if isinstance(x, str) else '').values)
    cosine_values = []
    for i in range(df.shape[0]):
        # With norm='l2' rows, the dot product of the two rows is the cosine
        # similarity; np.dot is not sparse-aware, so use scipy operations.
        cosine_values.append(round(float(q1vec[i].multiply(q2vec[i]).sum()), 5))
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    df[column_name] = cosine_values
    df[[column_name]].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
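# A minimal check of the cosine identity used above: with norm='l2', the
# row-wise dot product of TF-IDF vectors equals sklearn's cosine_similarity.
# The toy sentences are hypothetical.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vec = TfidfVectorizer(norm='l2').fit(['what is machine learning',
                                      'what is deep learning'])
a = vec.transform(['what is machine learning'])
b = vec.transform(['what is deep learning'])
assert abs(float(a.multiply(b).sum()) - float(cosine_similarity(a, b)[0, 0])) < 1e-9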
def read_data(self, data_file):
    # Row sums of the TF-IDF matrix for question2: total TF-IDF mass per question.
    X = self.vectorizer.transform(
        nltk_stemming(data_file)['question2'].fillna("").tolist())
    return np.array(X.sum(axis=1)).reshape(-1, )
def read_data(self, data_file):
    # Stem once and reuse; the original stemmed the file twice.
    data = nltk_stemming(data_file)
    X1 = self.vectorizer.transform(data['question1'].fillna("").tolist())
    X2 = self.vectorizer.transform(data['question2'].fillna("").tolist()).T
    # Pair each question1 row with the matching question2 column for later
    # row-by-row dot products.
    return [(X1.getrow(i), X2.getcol(i)) for i in range(X1.shape[0])]
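# A sketch of how the (row, column) pairs can be consumed downstream (an
# assumption about the caller, not shown in this section): a sparse 1xN row
# times an Nx1 column is a 1x1 matrix holding that pair's dot product. The
# toy vectors below are hypothetical.
from scipy.sparse import csr_matrix

X1 = csr_matrix([[1.0, 0.0, 2.0], [0.0, 3.0, 0.0]])
X2 = csr_matrix([[1.0, 1.0, 1.0], [0.0, 1.0, 0.0]]).T
pairs = [(X1.getrow(i), X2.getcol(i)) for i in range(X1.shape[0])]
sims = [float((row * col)[0, 0]) for row, col in pairs]
print(sims)  # [3.0, 3.0]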