Example #1
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(dict(generate_filename_from_prefix(options.data_prefix))['test'])

    dup_counter = Counter()
    for q1, q2, dup in tqdm(zip(df_train.question1.astype(str), df_train.question2.astype(str),
                                df_train.is_duplicate)):
        if dup:
            dup_counter.update(list_diff_pairs(q1, q2))

    train_diff_pairs_ = [list_diff_pairs(q1, q2)
                        for q1, q2 in tqdm(zip(df_train.question1.astype(str), df_train.question2.astype(str)))]
    train_diff_pairs = [pair for pair in list(chain.from_iterable(train_diff_pairs_)) if dup_counter[pair] >= MIN_FREQ]

    test_diff_pairs_ = [list_diff_pairs(q1, q2)
                       for q1, q2 in tqdm(zip(df_test.question1.astype(str), df_test.question2.astype(str)))]
    test_diff_pairs = [pair for pair in list(chain.from_iterable(test_diff_pairs_)) if dup_counter[pair] >= MIN_FREQ]

    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5, min_df=MIN_FREQ, dtype=np.int32, tokenizer=lambda a: a, lowercase=False),
        NMF(n_components=10, random_state=1, l1_ratio=.15, verbose=True)
    )
    pipeline.fit(train_diff_pairs + test_diff_pairs)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
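Example #1 above and Example #2 below both depend on a project helper list_diff_pairs that is not included in the listing. Judging only from how it is used (its return value is fed to a Counter and then treated as tokens by CountVectorizer with tokenizer=lambda a: a), one plausible, purely illustrative sketch is the following; the real helper may be defined differently.

from itertools import product


def list_diff_pairs(q1, q2):
    """Illustrative sketch only: pair up the tokens unique to each question."""
    tokens1 = set(str(q1).split())
    tokens2 = set(str(q2).split())
    only1 = tokens1 - tokens2
    only2 = tokens2 - tokens1
    # Each sorted (a, b) tuple marks a word substitution that distinguishes q1 from q2.
    return [tuple(sorted(pair)) for pair in product(only1, only2)]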
Example #2
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])

    train_diff_pairs = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df_train.question1.astype(str), df_train.question2.astype(
                str)))
    ]
    test_diff_pairs = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df_test.question1.astype(str), df_test.question2.astype(str)))
    ]

    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5,
                        min_df=100,
                        dtype=np.int32,
                        tokenizer=lambda a: a,
                        lowercase=False),
        NMF(n_components=10, random_state=1, l1_ratio=.15, verbose=True))
    pipeline.fit(train_diff_pairs + test_diff_pairs)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
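All of these snippets share two helpers whose definitions are not shown: common_feature_parser and generate_filename_from_prefix. From the call sites alone (an argparse parser with --data_prefix, --train_only and --n_threads flags, and something that dict() turns into a {'train': path, 'test': path} mapping), a minimal sketch of the assumed interface looks like this; the default paths and file naming here are guesses, not the project's actual values.

import argparse


def common_feature_parser():
    # Sketch of the shared CLI parser; only the flags observed in the call sites.
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_prefix', type=str, default='data/input/')
    parser.add_argument('--train_only', action='store_true')
    parser.add_argument('--n_threads', type=int, default=1)
    return parser


def generate_filename_from_prefix(data_prefix):
    # Sketch: yield (split, file path) pairs so dict(...) maps 'train'/'test' to CSV files.
    for split in ('train', 'test'):
        yield split, '{}{}.csv'.format(data_prefix, split)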
Example #3
def main():
    options = common_feature_parser().parse_args()
    train_file = dict(generate_filename_from_prefix(
        options.data_prefix))['train']
    test_file = dict(generate_filename_from_prefix(
        options.data_prefix))['test']

    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    test_df['is_duplicate'] = -1
    test_df.rename(columns={'test_id': 'id'}, inplace=True)
    df = pd.concat([train_df, test_df])  # DataFrame.append was removed in pandas 2.x
    df.reset_index(inplace=True)
    from collections import Counter
    see_later1 = []
    see_later2 = []
    sentence_counter1 = Counter()
    sentence_counter2 = Counter()

    for i in tqdm(range(df.shape[0])):
        row = df.iloc[-i - 1]
        q1 = str(row['question1'])
        q2 = str(row['question2'])
        see_later1.append(sentence_counter1[q1])
        see_later2.append(sentence_counter2[q2])
        sentence_counter1[q1] += 1
        sentence_counter2[q2] += 1
    df['see_later1'] = list(reversed(see_later1))
    df['see_later2'] = list(reversed(see_later2))
    create_feature(data_file=train_file, df=df[df.is_duplicate >= 0])
    create_feature(data_file=test_file, df=df[df.is_duplicate < 0])
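The loop above walks the combined frame from the last row to the first, so see_later1 for a given row ends up counting how many later rows reuse the same question1 (and see_later2 does the same for question2). A toy illustration of that counting idea, separate from the project code:

from collections import Counter

questions = ['a', 'b', 'a', 'a']   # toy question1 column
seen, see_later = Counter(), []
for q in reversed(questions):
    see_later.append(seen[q])
    seen[q] += 1
see_later = list(reversed(see_later))
print(see_later)  # [2, 0, 1, 0]: the first 'a' reappears twice later, the second one once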
Example #4
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming_without_stopwords(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = get_test_df(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])

    vectorizer = TfKLdVectorizer(alpha=1,
                                 divergence='js',
                                 ngram_range=(1, 2),
                                 max_df=0.4,
                                 min_df=5)
    train_q1s = pd.Series(df_train['question1'].tolist() +
                          df_test['question1'].tolist()).astype(str)
    train_q2s = pd.Series(df_train['question2'].tolist() +
                          df_test['question2'].tolist()).astype(str)
    train_ys = pd.Series(df_train['is_duplicate'].tolist() +
                         df_test['is_duplicate'].tolist()).astype(int)
    vectorizer.fit(train_q1s, train_q2s, train_ys)

    train_qs = pd.Series(train_q1s.tolist() + train_q2s.tolist()).astype(str)
    value_qs = vectorizer.transform(train_qs)
    print(value_qs)

    pipeline = make_pipeline(TruncatedSVD(n_components=10))
    pipeline.fit(value_qs)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name,
                       vectorizer=vectorizer,
                       pipeline=pipeline)
Example #5
def main():
    options = common_feature_parser().parse_args()
    # from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain/notebook
    train_orig = pd.read_csv(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    test_orig = pd.read_csv(dict(generate_filename_from_prefix(options.data_prefix))['test'])

    df1 = train_orig[['question1']].copy()
    df2 = train_orig[['question2']].copy()
    df1_test = test_orig[['question1']].copy()
    df2_test = test_orig[['question2']].copy()

    df2.rename(columns={'question2': 'question1'}, inplace=True)
    df2_test.rename(columns={'question2': 'question1'}, inplace=True)

    train_questions = pd.concat([df1, df2, df1_test, df2_test])  # DataFrame.append was removed in pandas 2.x
    train_questions.drop_duplicates(subset=['question1'], inplace=True)

    train_questions.reset_index(inplace=True, drop=True)
    questions_dict = pd.Series(train_questions.index.values, index=train_questions.question1.values).to_dict()
    train_cp = train_orig.copy()
    test_cp = test_orig.copy()
    train_cp.drop(['qid1', 'qid2'], axis=1, inplace=True)

    test_cp['is_duplicate'] = -1
    test_cp.rename(columns={'test_id': 'id'}, inplace=True)
    comb = pd.concat([train_cp, test_cp])

    comb['q1_hash'] = comb['question1'].map(questions_dict)
    comb['q2_hash'] = comb['question2'].map(questions_dict)
    q1_vc = comb.q1_hash.value_counts().to_dict()
    q2_vc = comb.q2_hash.value_counts().to_dict()
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, questions_dict=questions_dict, q1_vc=q1_vc, q2_vc=q2_vc)
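Example #5 only builds the lookup tables; create_feature itself is not shown. The Kaggle kernel referenced in the comment derives its "magic" features from how often each question id occurs across both columns of the combined data. A hedged sketch of that computation, using the questions_dict, q1_vc and q2_vc objects built above (the function name and exact output columns are assumptions):

import pandas as pd


def magic_frequency_features(df, questions_dict, q1_vc, q2_vc):
    # Sketch: map each question to its id, then to its total occurrence count.
    q1_hash = df['question1'].map(questions_dict)
    q2_hash = df['question2'].map(questions_dict)
    out = pd.DataFrame(index=df.index)
    out['q1_freq'] = q1_hash.map(lambda h: q1_vc.get(h, 0) + q2_vc.get(h, 0))
    out['q2_freq'] = q2_hash.map(lambda h: q1_vc.get(h, 0) + q2_vc.get(h, 0))
    return out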
Example #6
def main():
    file_prefix = '0.1925_lstm_leak_203_101_0.26_0.35_36'
    options = common_feature_parser().parse_args()
    input_files = dict(generate_filename_from_prefix(options.data_prefix))
    train_file = os.path.join(os.path.dirname(input_files['train']), '../lstm/', file_prefix + '.stacking.csv')
    test_file = os.path.join(os.path.dirname(input_files['train']), '../lstm/', file_prefix + '.submission.csv')
    neighbor_sets, neighbor_weights = prepare_graph_with_filenames(options, train_file, test_file)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, neighbor_sets=neighbor_sets, neighbor_weights=neighbor_weights)
Example #7
def main():
    options = common_feature_parser().parse_args()

    df_train = nltk_stemming(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, norm='l2')
    vectorizer.fit_transform(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer)
Example #8
def main():
    options = common_feature_parser().parse_args()
    print(sys.argv[0], file=sys.stderr)
    df_train = convert(nltk_pos_tag(dict(generate_filename_from_prefix(options.data_prefix))['train']))
    print(df_train.head(), file=sys.stderr)

    train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, norm='l2')
    vectorizer.fit_transform(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer)
Example #9
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming(dict(generate_filename_from_prefix(options.data_prefix))['train'])

    train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)

    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2', ngram_range=(1, 2)),
        NMF(n_components=10, random_state=1, l1_ratio=.15)
    )
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #10
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming_without_stopwords(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])

    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)

    pipeline = make_pipeline(CountVectorizer(max_df=0.5, min_df=2),
                             TruncatedSVD(n_components=10))
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #11
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])

    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)

    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2'),
        LatentDirichletAllocation(n_components=10, random_state=1))  # n_components was named n_topics in older scikit-learn
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #12
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming_without_stopwords(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])

    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)
    vectorizer = CountVectorizer(max_df=0.5,
                                 min_df=2,
                                 ngram_range=(2, 2),
                                 binary=True)
    vectorizer.fit_transform(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer)
Example #13
def main():
    options = common_feature_parser().parse_args()
    # from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain/notebook
    train_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    test_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    counter = Counter()
    features = {
        'train': calculate_features(train_df, counter),
        'test': calculate_features(test_df, counter)
    }

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, features=features[k])
Example #14
def dump_graph(options):
    node_file = get_node_filename(options)
    edge_file = get_edge_filename(options)
    if os.path.exists(node_file) and os.path.exists(edge_file):
        print("File exists {} and {}".format(node_file, edge_file))
        return

    train_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    test_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    edges, question2id = build_graph(test_df, train_df)
    dump_load_file(node_file, question2id)
    with open(edge_file, 'w') as f:
        for e in edges:
            print(e, file=f)
Example #15
def prepare_graph_with_filenames(options, train_file, test_file):
    input_files = dict(generate_filename_from_prefix(options.data_prefix))
    print('Stacking ingredients: {} and {}'.format(train_file, test_file),
          file=sys.stderr)
    neighbor_sets = defaultdict(set)
    neighbor_weights = defaultdict(dict)

    dfs = []
    df_train = pd.read_csv(input_files['train'])
    df_train['prob'] = pd.read_csv(train_file)['prediction']
    dfs.append(df_train)

    df_test = pd.read_csv(input_files['test'])
    df_test['prob'] = pd.read_csv(test_file)['is_duplicate']
    dfs.append(df_test)

    for df in dfs:
        for i, (q1, q2, value) in tqdm(
                enumerate(
                    zip(df.question1.astype(str), df.question2.astype(str),
                        df.prob))):
            neighbor_sets[q1].add(q2)
            neighbor_weights[q1][q2] = value
            neighbor_sets[q2].add(q1)
            neighbor_weights[q2][q1] = value
    return neighbor_sets, neighbor_weights
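The two dictionaries returned here describe a question graph: neighbor_sets[q] holds every question q was paired with, and neighbor_weights[q][n] stores the predicted duplicate probability of that edge. How the downstream create_feature consumes them is not shown; one natural per-pair feature over such a graph, given purely as a sketch, is the total edge weight to shared neighbours:

def shared_neighbor_weight(q1, q2, neighbor_sets, neighbor_weights):
    # Sketch: sum the predicted probabilities on edges from q1 and q2 to their common neighbours.
    common = neighbor_sets[q1] & neighbor_sets[q2]
    return sum(neighbor_weights[q1][n] + neighbor_weights[q2][n] for n in common)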
Example #16
def prepare_graph(options, file_prefix):
    input_files = dict(generate_filename_from_prefix(options.data_prefix))
    train_file = os.path.join(os.path.dirname(input_files['train']),
                              '../output/', file_prefix + '.model.train.pred')
    test_file = os.path.join(os.path.dirname(input_files['train']),
                             '../output/', file_prefix + '.submission.csv')
    return prepare_graph_with_filenames(options, train_file, test_file)
Example #17
def main():
    parser = common_feature_parser()
    parser.add_argument('--google_word2vec', default='data/input/GoogleNews-vectors-negative300.bin', type=str)
    options = parser.parse_args()

    model = gensim.models.KeyedVectors.load_word2vec_format(options.google_word2vec, binary=True)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, model=model)
Example #18
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_stemming(dict(generate_filename_from_prefix(options.data_prefix))['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)

    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2', ngram_range=(1, 2), dtype=np.float32),
        TruncatedSVD(n_components=50, n_iter=10)
    )
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #19
def main():
    file_prefix = 'xgb_cross_0.json.xgb_cross_0.json'
    options = common_feature_parser().parse_args()
    neighbor_sets, neighbor_weights = prepare_graph(options, file_prefix)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name,
                       neighbor_sets=neighbor_sets,
                       neighbor_weights=neighbor_weights)
Example #20
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)

    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5, min_df=2, max_features=200),
        TfidfTransformer(norm='l2'), TruncatedSVD(n_components=10))
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #21
def main():
    options = common_feature_parser().parse_args()
    # from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain/notebook
    train_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    test_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    uf = UnionFind()
    for i, row in tqdm(train_df.iterrows()):
        uf.unite(str(row['question1']), str(row['question2']))
    for i, row in tqdm(test_df.iterrows()):
        uf.unite(str(row['question1']), str(row['question2']))

    features = {
        'train': calculate_features(train_df, uf),
        'test': calculate_features(test_df, uf)
    }
    joblib.dump(uf, 'tmp-union-find.pkl')

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, features=features[k])
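Example #21 relies on a UnionFind class that the listing does not define. Below is a minimal, self-contained disjoint-set sketch that satisfies the call sites (string keys, a unite method, picklable for joblib.dump); the project's own implementation may differ.

class UnionFind:
    # Sketch: disjoint-set over arbitrary hashable keys with path halving and union by size.
    def __init__(self):
        self.parent = {}
        self.size = {}

    def find(self, x):
        if x not in self.parent:          # create singleton sets lazily
            self.parent[x] = x
            self.size[x] = 1
        while self.parent[x] != x:        # path halving on the way to the root
            self.parent[x] = self.parent[self.parent[x]]
            x = self.parent[x]
        return x

    def unite(self, x, y):
        root_x, root_y = self.find(x), self.find(y)
        if root_x == root_y:
            return
        if self.size[root_x] < self.size[root_y]:
            root_x, root_y = root_y, root_x
        self.parent[root_y] = root_x      # attach the smaller tree under the larger
        self.size[root_x] += self.size[root_y]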
Example #22
def generate_data_files(config, options):
    for feature_id in config['features']:
        feature_creator_file = get_feature_creator_file(feature_id)
        if check_feature_existence(feature_creator_file,
                                   config['data_prefix']):
            print('Feature file for {} exists.'.format(feature_creator_file),
                  file=sys.stderr)
            continue
        commands = [
            "python3", feature_creator_file, "--data_prefix",
            config['data_prefix']
        ]
        if options.train_only:
            commands.append("--train_only")
        subprocess.call(commands)
    data_files = dict(generate_filename_from_prefix(config['data_prefix']))
    return data_files
Example #23
def main():
    m = gensim.models.Doc2Vec.load('../data/input/enwiki_dbow/doc2vec.bin')
    options = common_feature_parser().parse_args()
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_word_match_feature(data_file=file_name, model=m)
Example #24
def main():
    options = common_feature_parser().parse_args()
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name)
Example #25
def __init__(self, options):
    self.input_files = dict(
        generate_filename_from_prefix(options.data_prefix))
    self.train_only = options.train_only
    self.n_threads = options.n_threads
Example #26
def main():
    options = common_feature_parser().parse_args()
    model = gensim.models.KeyedVectors.load_word2vec_format(
        'data/input/GoogleNews-vectors-negative300.bin', binary=True)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_word_match_feature(data_file=file_name, model=model)
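Examples #17 and #26 hand a pretrained Google News KeyedVectors model to the feature builder, whose body is not shown. One common word-level feature in this competition is the embedding similarity of the two questions' in-vocabulary tokens; the sketch below illustrates that idea and is an assumption, not necessarily what create_word_match_feature actually computes.

def word2vec_similarity(q1, q2, model):
    # Sketch: cosine similarity between the mean embeddings of in-vocabulary tokens.
    words1 = [w for w in str(q1).split() if w in model]
    words2 = [w for w in str(q2).split() if w in model]
    if not words1 or not words2:
        return 0.0
    return float(model.n_similarity(words1, words2))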
Example #27
def check_feature_existence(feature_creator_file, data_prefix):
    for k, file_name in generate_filename_from_prefix(data_prefix):
        if not os.path.exists(
                feature_output_file(file_name, feature_creator_file)):
            return False
    return True