Example #1
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])

    train_diff_pairs = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df_train.question1.astype(str), df_train.question2.astype(
                str)))
    ]
    test_diff_pairs = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df_test.question1.astype(str), df_test.question2.astype(str)))
    ]

    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5,
                        min_df=100,
                        dtype=np.int32,
                        tokenizer=lambda a: a,
                        lowercase=False),
        NMF(n_components=10, random_state=1, l1_ratio=.15, verbose=True))
    pipeline.fit(train_diff_pairs + test_diff_pairs)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
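Example #1 relies on a list_diff_pairs helper that is not part of this listing; because the CountVectorizer is configured with tokenizer=lambda a: a, the helper presumably returns a ready-made list of tokens describing how the two questions differ. A minimal sketch under that assumption (the real implementation may pair tokens differently):

# Hypothetical sketch of list_diff_pairs: collect the tokens unique to each
# question and pair them up, so the vectorizer above can consume the list as-is.
def list_diff_pairs(q1, q2):
    tokens1 = q1.lower().split()
    tokens2 = q2.lower().split()
    only_in_q1 = [t for t in tokens1 if t not in tokens2]
    only_in_q2 = [t for t in tokens2 if t not in tokens1]
    return ['{}_{}'.format(a, b) for a in only_in_q1 for b in only_in_q2]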
Example #2
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(dict(generate_filename_from_prefix(options.data_prefix))['test'])

    dup_counter = Counter()
    for q1, q2, dup in tqdm(zip(df_train.question1.astype(str), df_train.question2.astype(str),
                                df_train.is_duplicate)):
        if dup:
            dup_counter.update(list_diff_pairs(q1, q2))

    train_diff_pairs_ = [list_diff_pairs(q1, q2)
                        for q1, q2 in tqdm(zip(df_train.question1.astype(str), df_train.question2.astype(str)))]
    train_diff_pairs = [pair for pair in list(chain.from_iterable(train_diff_pairs_)) if dup_counter[pair] >= MIN_FREQ]

    test_diff_pairs_ = [list_diff_pairs(q1, q2)
                       for q1, q2 in tqdm(zip(df_test.question1.astype(str), df_test.question2.astype(str)))]
    test_diff_pairs = [pair for pair in list(chain.from_iterable(test_diff_pairs_)) if dup_counter[pair] >= MIN_FREQ]

    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5, min_df=MIN_FREQ, dtype=np.int32, tokenizer=lambda a: a, lowercase=False),
        NMF(n_components=10, random_state=1, l1_ratio=.15, verbose=True)
    )
    pipeline.fit(train_diff_pairs + test_diff_pairs)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #3
def prepare(self):
    df_train = nltk_tokenize(self.input_files['train'])
    df_test = nltk_tokenize(self.input_files['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    self.vectorizer.fit(train_qs.values)
Example #4
def prepare(self):
    df_train = nltk_tokenize('data/input/train.csv')
    df_test = nltk_tokenize('data/input/test.csv')
    vectorizer = CountVectorizer(min_df=2)
    qs = pd.Series(df_train.question1.astype(str).tolist() + df_train.question2.astype(str).tolist() +
                   df_test.question1.astype(str).tolist() + df_test.question2.astype(str).tolist())
    vectorizer.fit(qs)
    self.vocab = vectorizer.vocabulary_
Example #5
def prepare(self):
    self.model = gensim.models.KeyedVectors.load_word2vec_format('data/input/glove.840B.300d.bin',
                                                                 binary=True)
    df_train = nltk_tokenize(self.input_files['train'])
    df_test = nltk_tokenize(self.input_files['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    self.vectorizer.fit(train_qs.values)
Example #6
def prepare(self):
    self.model = gensim.models.KeyedVectors.load_word2vec_format(
        'data/input/GoogleNews-vectors-negative300.bin', binary=True)
    self.model.init_sims(replace=True)
    df_train = nltk_tokenize(self.input_files['train'])
    df_test = nltk_tokenize(self.input_files['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    self.vectorizer.fit(train_qs.values)
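All of the examples load their data through nltk_tokenize, which is not reproduced in this listing. A minimal sketch, assuming it simply reads the Quora-pairs CSV with pandas and runs NLTK's word tokenizer over both question columns (the real helper may cache results or tokenize differently):

import pandas as pd
import nltk

# Hypothetical sketch of nltk_tokenize: load the CSV and replace each question
# with its whitespace-joined NLTK tokenization.
def nltk_tokenize(data_file):
    df = pd.read_csv(data_file)
    for column in ('question1', 'question2'):
        df[column] = df[column].astype(str).map(
            lambda text: ' '.join(nltk.word_tokenize(text)))
    return df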
Example #7
def create_feature(data_file, vectorizer, pipeline):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    X1 = pipeline.transform(
        vectorizer.transform(df.question1.values.astype(str)))
    X2 = pipeline.transform(
        vectorizer.transform(df.question2.values.astype(str)))
    X = np.hstack((X1, X2))

    column_names = []
    for i in tqdm(range(X.shape[1])):
        column_name = column_name_prefix + '.' + str(i)
        df[column_name] = X[:, i]
        column_names.append(column_name)
    column_names = pd.Index(column_names)
    df[column_names] = X
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False,
                            float_format='%.5f')
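feature_output_file is another helper that does not appear in this listing; it evidently maps an input CSV path to the path of the feature CSV to be written. The naming scheme below is purely an assumption for illustration:

import os
import sys

# Hypothetical sketch: write the feature CSV under data/features/, prefixed with
# the name of the generating script (the real path layout may differ).
def feature_output_file(data_file):
    script_name = os.path.splitext(os.path.basename(sys.argv[0]))[0]
    return os.path.join('data', 'features',
                        '{}_{}'.format(script_name, os.path.basename(data_file)))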
Example #8
def create_feature(data_file, vectorizer):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    diff_pairs = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df.question1.astype(str), df.question2.astype(str)))
    ]
    X = vectorizer.transform(diff_pairs)

    column_names = []
    for i in range(X.shape[1]):
        column_name = column_name_prefix + '.' + str(i)
        df[column_name] = X[:, i]
        column_names.append(column_name)
    column_names = pd.Index(column_names)
    df[column_names] = X
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False,
                            float_format='%.5f')
Example #9
def create_features_files(train_path, test_path):
    print(train_path, test_path)
    if os.path.exists(feature_output_file(train_path)) and os.path.exists(
            feature_output_file(test_path)):
        print('File exists {}.'.format(feature_output_file(train_path)) +
              ", " + feature_output_file(test_path))
        return

    print('Preprocessing')
    train = nltk_tokenize(train_path)
    for q1, q2, dup in tqdm(
            zip(train.question1.astype(str), train.question2.astype(str),
                train.is_duplicate)):
        if dup:
            diff_pairs = list_diff_pairs(q1, q2)
            dup_counter.update(diff_pairs)

    print('features >= MIN_FREQ: {}'.format(
        sum(1 for t, freq in dup_counter.most_common() if freq >= MIN_FREQ)))

    print('Creating feature for train')
    create_features(train_path)

    print('Creating feature for test')
    create_features(test_path)
Example #10
def read_data(self, data_file):
    df = nltk_tokenize(data_file)
    X1 = self.vectorizer.transform(df['question1'].fillna("").tolist())
    X2 = self.vectorizer.transform(df['question2'].fillna("").tolist())
    X1rows = [X1.getrow(i) for i in tqdm(range(X1.shape[0]))]
    X2rows = [X2.getrow(i) for i in tqdm(range(X2.shape[0]))]
    return list(zip(df['question1'].tolist(), df['question2'].tolist(), X1rows, X2rows))
Example #11
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)

    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5, min_df=2, max_features=200),
        TfidfTransformer(norm='l2'), TruncatedSVD(n_components=10))
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
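As a small usage illustration (the two questions below are invented), the fitted pipeline maps each input string to a 10-dimensional TF-IDF/SVD embedding:

# Illustration only: transform a couple of questions with the fitted pipeline.
embeddings = pipeline.transform([
    'How do I learn Python quickly?',
    'What is the fastest way to learn Python?',
])
print(embeddings.shape)  # (2, 10): one 10-dimensional vector per question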
Example #12
def create_features(data_path):
    data = nltk_tokenize(data_path)

    feature_dicts = Parallel(n_jobs=-1, verbose=3)(
        delayed(create_feature)(q1, q2) for q1, q2 in zip(
            data.question1.astype(str), data.question2.astype(str)))

    df = pd.DataFrame(feature_dicts)
    df.to_csv(feature_output_file(data_path), index=False, float_format='%.5f')
Example #13
def create_features(data_path):
    print('data_path file: {}'.format(data_path))
    data = nltk_tokenize(data_path)  #[:1000]

    features = Parallel(n_jobs=-1, verbose=5)(
        delayed(create_feature)(q1, q2) for q1, q2 in zip(
            data.question1.astype(str), data.question2.astype(str)))
    df = pd.DataFrame(features)
    df.to_csv(feature_output_file(data_path), index=False, float_format='%.5f')
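Examples #12 and #13 parallelize a per-pair create_feature(q1, q2) that returns a dict of scalar features; that function is not included here. A hedged sketch of the shape such a function could take (the actual features are unknown):

# Hypothetical per-pair feature function: returning a dict lets
# pd.DataFrame(list_of_dicts) build one column per feature.
def create_feature(q1, q2):
    words1 = set(q1.lower().split())
    words2 = set(q2.lower().split())
    shared = words1 & words2
    return {
        'word_match': len(shared) / max(len(words1 | words2), 1),
        'len_diff': abs(len(q1) - len(q2)),
    }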
Example #14
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)

    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2', dtype=np.int32),
        NMF(n_components=150, random_state=1, l1_ratio=.15))
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #15
def create_word_match_feature(data_file, model: gensim.models.KeyedVectors):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)

    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])
    df[column_name] = df.apply(wmd, axis=1, raw=True, model=model)
    df[[column_name]].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
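The wmd row function is not shown. Presumably it computes the Word Mover's Distance between the two questions with the loaded KeyedVectors; a sketch under that assumption (the column positions inside the raw row are also assumed):

# Hypothetical sketch of wmd: with raw=True the row arrives as a plain array;
# question1/question2 are assumed to sit at positions 3 and 4 of the Quora CSV.
def wmd(row, model):
    q1, q2 = str(row[3]), str(row[4])
    return model.wmdistance(q1.lower().split(), q2.lower().split())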
Example #16
def create_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    df[column_name] = df.apply(calc_feature, axis=1, raw=True)
    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False,
                             float_format='%.5f')
Example #17
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])

    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)

    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2'),
        LatentDirichletAllocation(n_components=10, random_state=1))
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #18
def create_feature(data_file, model):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])

    values = np.zeros((df.shape[0]))
    for i in tqdm(range(df.shape[0])):
        q1 = df.question1.values[i]
        q2 = df.question2.values[i]
        values[i] = calculate_distance(q1, q2, model)

    df[column_name] = values
    df[[column_name]].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
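calculate_distance is not part of the listing either; one plausible minimal version scores the two questions with gensim's n_similarity over in-vocabulary words (an assumption, not necessarily the repository's actual metric):

# Hypothetical sketch of calculate_distance: cosine similarity between the mean
# word vectors of the two questions, skipping out-of-vocabulary words.
def calculate_distance(q1, q2, model):
    words1 = [w for w in str(q1).lower().split() if w in model]
    words2 = [w for w in str(q2).lower().split() if w in model]
    if not words1 or not words2:
        return 0.0
    return float(model.n_similarity(words1, words2))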
Example #19
def create_word_match_feature(data_file, model: gensim.models.Doc2Vec):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])

    X1 = calc_document_vector(df.question1.values.astype(str).tolist(), model)
    X2 = calc_document_vector(df.question2.values.astype(str).tolist(), model)
    X = np.hstack((X1, X2))

    column_names = []
    for i in tqdm(range(X.shape[1])):
        column_name = column_name_prefix + '.' + str(i)
        df[column_name] = X[:, i]
        column_names.append(column_name)
    column_names = pd.Index(column_names)
    print('Start to write dataset')
    df[column_names].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
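calc_document_vector is assumed to turn each question into a fixed-size Doc2Vec embedding; a minimal sketch under that assumption:

import numpy as np

# Hypothetical sketch of calc_document_vector: infer one Doc2Vec vector per
# question and stack them into an (n_questions, vector_size) matrix.
def calc_document_vector(questions, model):
    return np.vstack([model.infer_vector(q.lower().split()) for q in questions])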
Example #20
def create_features_files(train_path, test_path):
    print(train_path, test_path)
    if os.path.exists(feature_output_file(train_path)) and os.path.exists(feature_output_file(test_path)):
        print('File exists {}.'.format(feature_output_file(train_path)) + ", " + feature_output_file(test_path))
        return

    print('Preprocessing')
    train = nltk_tokenize(train_path)
    for q1, q2, dup in tqdm(zip(train.question1.astype(str), train.question2.astype(str), train.is_duplicate)):
        words1 = q1.split()
        words2 = q2.split()
        all_counter.update(words1)
        all_counter.update(words2)
        if dup:
            dup_counter.update(words1)
            dup_counter.update(words2)

    print('Creating feature for train')
    create_features(train_path)

    print('Creating feature for test')
    create_features(test_path)
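The create_features called at the end presumably turns the two counters into per-pair features; one hedged possibility is a word-match score in which each shared word is weighted by how often it appeared in duplicate pairs relative to its overall frequency (the repository's actual feature may differ):

# Hypothetical sketch of a counter-based feature using the globals built above.
def duplicate_word_weight(word):
    if all_counter[word] == 0:
        return 0.0
    return dup_counter[word] / all_counter[word]

def weighted_word_match(q1, q2):
    shared = set(q1.lower().split()) & set(q2.lower().split())
    return sum(duplicate_word_weight(w) for w in shared)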
Example #21
def read_data(self, data_file):
    return nltk_tokenize(data_file)