def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    # Represent each question pair by the word pairs that differ between
    # the two questions, then learn NMF topic features over those pairs.
    train_diff_pairs = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df_train.question1.astype(str), df_train.question2.astype(str)))
    ]
    test_diff_pairs = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df_test.question1.astype(str), df_test.question2.astype(str)))
    ]
    # The identity tokenizer keeps the precomputed diff pairs as tokens.
    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5, min_df=100, dtype=np.int32,
                        tokenizer=lambda a: a, lowercase=False),
        NMF(n_components=10, random_state=1, l1_ratio=.15, verbose=True))
    pipeline.fit(train_diff_pairs + test_diff_pairs)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
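# The helpers shared by all of these scripts (common_feature_parser,
# generate_filename_from_prefix, feature_output_file, nltk_tokenize) are
# defined outside this section. A minimal sketch of plausible
# implementations, for context only -- the file layout, argument names, and
# exact tokenization are assumptions, not the project's confirmed code:
import argparse
import os
import sys

import nltk
import pandas as pd


def common_feature_parser():
    # One shared CLI switch selecting which data files to featurize.
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_prefix', default='data/input/')
    return parser


def generate_filename_from_prefix(data_prefix):
    # Yields ('train', <path>) and ('test', <path>) pairs; callers use it
    # both as an iterable and via dict(...).
    for k in ('train', 'test'):
        yield k, '{0}{1}.csv'.format(data_prefix, k)


def feature_output_file(data_file):
    # One output CSV per (script, input file) combination.
    return 'data/features/{0}_{1}'.format(
        os.path.basename(sys.argv[0]), os.path.basename(data_file))


def nltk_tokenize(data_file):
    # Loads a question-pair CSV and replaces each question with its
    # space-joined NLTK tokens (so later code can simply .split()).
    df = pd.read_csv(data_file)
    for col in ('question1', 'question2'):
        df[col] = df[col].astype(str).map(
            lambda q: ' '.join(nltk.word_tokenize(q)))
    return df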
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    # Count how often each diff pair occurs among known duplicates ...
    dup_counter = Counter()
    for q1, q2, dup in tqdm(zip(df_train.question1.astype(str),
                                df_train.question2.astype(str),
                                df_train.is_duplicate)):
        if dup:
            dup_counter.update(list_diff_pairs(q1, q2))
    # ... and keep only the pairs that are frequent enough among duplicates.
    train_diff_pairs_ = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df_train.question1.astype(str), df_train.question2.astype(str)))
    ]
    train_diff_pairs = [pair for pair in chain.from_iterable(train_diff_pairs_)
                        if dup_counter[pair] >= MIN_FREQ]
    test_diff_pairs_ = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df_test.question1.astype(str), df_test.question2.astype(str)))
    ]
    test_diff_pairs = [pair for pair in chain.from_iterable(test_diff_pairs_)
                       if dup_counter[pair] >= MIN_FREQ]
    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5, min_df=MIN_FREQ, dtype=np.int32,
                        tokenizer=lambda a: a, lowercase=False),
        NMF(n_components=10, random_state=1, l1_ratio=.15, verbose=True))
    pipeline.fit(train_diff_pairs + test_diff_pairs)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
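# list_diff_pairs is used by both variants above but defined elsewhere. A
# minimal sketch of the idea, assuming it extracts the token spans that
# differ between the two (already tokenized) questions and encodes each
# mismatch as one hashable token; the real alignment and encoding may differ:
from difflib import SequenceMatcher


def list_diff_pairs(q1, q2):
    # Align the two token sequences and keep only the non-equal opcodes,
    # joining each (from-span, to-span) pair into a single token such as
    # 'dog_cat' so Counter and CountVectorizer can count it directly.
    words1, words2 = q1.split(), q2.split()
    diff_pairs = []
    for tag, i1, i2, j1, j2 in SequenceMatcher(a=words1, b=words2).get_opcodes():
        if tag != 'equal':
            diff_pairs.append('{0}_{1}'.format(' '.join(words1[i1:i2]),
                                               ' '.join(words2[j1:j2])))
    return diff_pairs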
def prepare(self):
    df_train = nltk_tokenize(self.input_files['train'])
    df_test = nltk_tokenize(self.input_files['test'])
    # Fit the vectorizer on the union of all train and test questions.
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    self.vectorizer.fit(train_qs.values)
def prepare(self):
    df_train = nltk_tokenize('data/input/train.csv')
    df_test = nltk_tokenize('data/input/test.csv')
    # Build a shared vocabulary over every question in train and test.
    vectorizer = CountVectorizer(min_df=2)
    qs = pd.Series(df_train.question1.astype(str).tolist() +
                   df_train.question2.astype(str).tolist() +
                   df_test.question1.astype(str).tolist() +
                   df_test.question2.astype(str).tolist())
    vectorizer.fit(qs)
    self.vocab = vectorizer.vocabulary_
def prepare(self):
    # GloVe vectors, pre-converted to word2vec binary format.
    self.model = gensim.models.KeyedVectors.load_word2vec_format(
        'data/input/glove.840B.300d.bin', binary=True)
    df_train = nltk_tokenize(self.input_files['train'])
    df_test = nltk_tokenize(self.input_files['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    self.vectorizer.fit(train_qs.values)
def prepare(self):
    self.model = gensim.models.KeyedVectors.load_word2vec_format(
        'data/input/GoogleNews-vectors-negative300.bin', binary=True)
    # L2-normalize the vectors in place to save memory.
    self.model.init_sims(replace=True)
    df_train = nltk_tokenize(self.input_files['train'])
    df_test = nltk_tokenize(self.input_files['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    self.vectorizer.fit(train_qs.values)
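# The three prepare() methods above belong to feature classes whose
# constructors are not shown in this section. An assumed skeleton of the
# shared state they rely on (attribute names are taken from the methods;
# everything else is hypothetical):
class PairFeatureBase:
    def __init__(self, input_files, vectorizer):
        self.input_files = input_files  # e.g. {'train': ..., 'test': ...}
        self.vectorizer = vectorizer    # an unfitted sklearn vectorizer
        self.model = None               # set by prepare() where needed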
def create_feature(data_file, vectorizer, pipeline):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    # Vectorize each question, then reduce it with the fitted pipeline;
    # question1 and question2 features sit side by side in the output.
    X1 = pipeline.transform(vectorizer.transform(df.question1.values.astype(str)))
    X2 = pipeline.transform(vectorizer.transform(df.question2.values.astype(str)))
    X = np.hstack((X1, X2))
    column_names = []
    for i in tqdm(range(X.shape[1])):
        column_name = '{0}.{1}'.format(column_name_prefix, i)
        df[column_name] = X[:, i]
        column_names.append(column_name)
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False, float_format='%.5f')
def create_feature(data_file, vectorizer):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    diff_pairs = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df.question1.astype(str), df.question2.astype(str)))
    ]
    X = vectorizer.transform(diff_pairs)
    column_names = []
    for i in range(X.shape[1]):
        column_name = '{0}.{1}'.format(column_name_prefix, i)
        df[column_name] = X[:, i]
        column_names.append(column_name)
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False, float_format='%.5f')
def create_features_files(train_path, test_path):
    print(train_path, test_path)
    if os.path.exists(feature_output_file(train_path)) and \
            os.path.exists(feature_output_file(test_path)):
        print('Files exist {}, {}.'.format(feature_output_file(train_path),
                                           feature_output_file(test_path)))
        return
    print('Preprocessing')
    # Count diff pairs over known duplicates so rare pairs can be dropped.
    train = nltk_tokenize(train_path)
    for q1, q2, dup in tqdm(zip(train.question1.astype(str),
                                train.question2.astype(str),
                                train.is_duplicate)):
        if dup:
            dup_counter.update(list_diff_pairs(q1, q2))
    print('features >= MIN_FREQ: {}'.format(
        sum(1 for t, freq in dup_counter.most_common() if freq >= MIN_FREQ)))
    print('Creating feature for train')
    create_features(train_path)
    print('Creating feature for test')
    create_features(test_path)
def read_data(self, data_file):
    df = nltk_tokenize(data_file)
    X1 = self.vectorizer.transform(df['question1'].fillna("").tolist())
    X2 = self.vectorizer.transform(df['question2'].fillna("").tolist())
    # Materialize one sparse row per question for pairwise processing.
    X1rows = [X1.getrow(i) for i in tqdm(range(X1.shape[0]))]
    X2rows = [X2.getrow(i) for i in tqdm(range(X2.shape[0]))]
    return list(zip(df['question1'].tolist(), df['question2'].tolist(),
                    X1rows, X2rows))
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    # Bag-of-words -> tf-idf -> 10-dimensional LSA features.
    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5, min_df=2, max_features=200),
        TfidfTransformer(norm='l2'),
        TruncatedSVD(n_components=10))
    pipeline.fit(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
def create_features(data_path):
    data = nltk_tokenize(data_path)
    # Compute the per-pair feature dicts in parallel across all cores.
    feature_dicts = Parallel(n_jobs=-1, verbose=3)(
        delayed(create_feature)(q1, q2)
        for q1, q2 in zip(data.question1.astype(str),
                          data.question2.astype(str)))
    df = pd.DataFrame(feature_dicts)
    df.to_csv(feature_output_file(data_path), index=False, float_format='%.5f')
def create_features(data_path):
    print('data_path file: {}'.format(data_path))
    data = nltk_tokenize(data_path)  # append [:1000] for a quick debug run
    features = Parallel(n_jobs=-1, verbose=5)(
        delayed(create_feature)(q1, q2)
        for q1, q2 in zip(data.question1.astype(str),
                          data.question2.astype(str)))
    df = pd.DataFrame(features)
    df.to_csv(feature_output_file(data_path), index=False, float_format='%.5f')
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    # tf-idf weights are fractional, so the matrix needs a float dtype.
    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2', dtype=np.float32),
        NMF(n_components=150, random_state=1, l1_ratio=.15))
    pipeline.fit(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
def create_word_match_feature(data_file, model: gensim.models.KeyedVectors):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    # Word Mover's Distance between the two questions of each row.
    df[column_name] = df.apply(wmd, axis=1, raw=True, model=model)
    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False, float_format='%.5f')
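# wmd is applied row-wise above but defined elsewhere. A plausible sketch
# wrapping gensim's KeyedVectors.wmdistance; because apply(..., raw=True)
# passes a bare ndarray, the question columns must be addressed by position,
# and the indices used here (3 and 4, as in the Quora train CSV) are an
# assumption:
def wmd(row, model):
    q1 = str(row[3]).split()  # assumed question1 position
    q2 = str(row[4]).split()  # assumed question2 position
    return model.wmdistance(q1, q2)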
def create_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    df[column_name] = df.apply(calc_feature, axis=1, raw=True)
    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False, float_format='%.5f')
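# calc_feature is defined elsewhere in the script. As an illustration of the
# kind of single-column feature this template supports, here is the classic
# shared-word ratio; both the formula and the row positions are assumptions,
# not the author's confirmed feature:
def calc_feature(row):
    words1 = set(str(row[3]).split())  # assumed question1 position
    words2 = set(str(row[4]).split())  # assumed question2 position
    if not words1 or not words2:
        return 0.0
    return 2.0 * len(words1 & words2) / (len(words1) + len(words2))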
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)
    # n_topics was renamed to n_components in newer scikit-learn releases.
    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2'),
        LatentDirichletAllocation(n_components=10, random_state=1))
    pipeline.fit(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
def create_feature(data_file, model):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    values = np.zeros(df.shape[0])
    for i in tqdm(range(df.shape[0])):
        q1 = df.question1.values[i]
        q2 = df.question2.values[i]
        values[i] = calculate_distance(q1, q2, model)
    df[column_name] = values
    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False, float_format='%.5f')
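# calculate_distance is not shown in this section. A minimal sketch,
# assuming it takes the cosine distance between the mean word vectors of
# the two questions and skips out-of-vocabulary words; the real metric may
# differ:
from scipy.spatial.distance import cosine


def calculate_distance(q1, q2, model):
    def mean_vector(question):
        vecs = [model[w] for w in str(question).split() if w in model]
        return np.mean(vecs, axis=0) if vecs else None

    v1, v2 = mean_vector(q1), mean_vector(q2)
    if v1 is None or v2 is None:
        return 1.0  # treat a fully out-of-vocabulary question as maximally far
    return cosine(v1, v2)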
def create_word_match_feature(data_file, model: gensim.models.Doc2Vec):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    # Side-by-side document vectors for question1 and question2.
    X1 = calc_document_vector(df.question1.values.astype(str).tolist(), model)
    X2 = calc_document_vector(df.question2.values.astype(str).tolist(), model)
    X = np.hstack((X1, X2))
    column_names = []
    for i in tqdm(range(X.shape[1])):
        column_name = '{0}.{1}'.format(column_name_prefix, i)
        df[column_name] = X[:, i]
        column_names.append(column_name)
    print('Start to write dataset')
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False, float_format='%.5f')
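# calc_document_vector is defined elsewhere. A sketch assuming it infers one
# embedding per question with Doc2Vec.infer_vector and stacks them into an
# (n_questions, vector_size) matrix:
def calc_document_vector(questions, model):
    return np.vstack([model.infer_vector(q.split()) for q in tqdm(questions)])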
def create_features_files(train_path, test_path):
    print(train_path, test_path)
    if os.path.exists(feature_output_file(train_path)) and \
            os.path.exists(feature_output_file(test_path)):
        print('Files exist {}, {}.'.format(feature_output_file(train_path),
                                           feature_output_file(test_path)))
        return
    print('Preprocessing')
    # Count every word over all training questions, and separately over the
    # questions of duplicate pairs only.
    train = nltk_tokenize(train_path)
    for q1, q2, dup in tqdm(zip(train.question1.astype(str),
                                train.question2.astype(str),
                                train.is_duplicate)):
        words1 = q1.split()
        words2 = q2.split()
        all_counter.update(words1)
        all_counter.update(words2)
        if dup:
            dup_counter.update(words1)
            dup_counter.update(words2)
    print('Creating feature for train')
    create_features(train_path)
    print('Creating feature for test')
    create_features(test_path)
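# all_counter and dup_counter are module-level Counters filled by the loop
# above, and create_features dispatches to a per-pair create_feature like
# the Parallel variants earlier in this section. A sketch of the kind of
# feature those counts enable -- for each shared word, the fraction of its
# occurrences that came from duplicate pairs, averaged over the pair (the
# exact formula is an assumption):
def create_feature(q1, q2):
    shared = set(q1.split()) & set(q2.split())
    rates = [dup_counter[w] / all_counter[w] for w in shared if all_counter[w]]
    return {'dup_rate': sum(rates) / len(rates) if rates else 0.0}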
def read_data(self, data_file):
    return nltk_tokenize(data_file)