def get_complete_feature_set(dataframe):
    summary_list = list(dataframe['Summary'].values)
    text_list = list(dataframe['Text'].values)
    summary_features = hashfeatures.FeatureHash(
        max_feature_num=100).get_feature_set(summary_list)
    text_features = hashfeatures.FeatureHash(
        max_feature_num=400).get_feature_set(text_list)
    consolidated_feature_list = np.hstack((summary_features, text_features))
    return consolidated_feature_list
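
# None of these snippets define hashfeatures.FeatureHash itself. A minimal
# sketch of what such a wrapper might look like, assuming it hashes raw
# text into fixed-width vectors with scikit-learn's HashingVectorizer; the
# class name, constructor argument and method come from the calls above,
# while the internals (including the default width) are assumptions, not
# the original implementation.
from sklearn.feature_extraction.text import HashingVectorizer


class FeatureHash(object):
    def __init__(self, max_feature_num=1000):  # default width is a guess
        # n_features fixes the dimensionality of every hashed vector.
        self.vectorizer = HashingVectorizer(n_features=max_feature_num)

    def get_feature_set(self, text_list):
        # Return a dense (n_samples, max_feature_num) array so callers can
        # combine blocks with np.hstack.
        return self.vectorizer.transform(text_list).toarray()
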
def get_total_features(search_frame, first_col_name='product_title',
                       second_col_name='search_term'):
    feature_hash_first_text = hashfeatures.FeatureHash()
    feature_hash_second_text = hashfeatures.FeatureHash(max_feature_num=100)
    first_text_list = preprocess.text_clean_pipeline_list(
        list(search_frame[first_col_name].values))
    second_text_list = preprocess.text_clean_pipeline_list(
        list(search_frame[second_col_name].values))
    first_feature_set = feature_hash_first_text.get_feature_set(first_text_list)
    second_feature_set = feature_hash_second_text.get_feature_set(
        second_text_list)
    final_consolidated_feature_list = np.hstack((first_feature_set,
                                                 second_feature_set))
    return final_consolidated_feature_list
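
# preprocess.text_clean_pipeline_list is likewise never shown. A plausible
# minimal version, assuming the pipeline lowercases, strips non-letter
# characters and collapses whitespace (the real module may also remove
# stopwords or stem):
import re


def text_clean_pipeline_list(text_list):
    cleaned_list = []
    for text in text_list:
        text = str(text).lower()               # also stringifies NaN floats
        text = re.sub(r'[^a-z\s]', ' ', text)  # keep letters only
        text = re.sub(r'\s+', ' ', text).strip()
        cleaned_list.append(text)
    return cleaned_list
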
def get_full_hash_features(dataframe):
    title_features = hashfeatures.FeatureHash(
        max_feature_num=20).get_feature_set(list(dataframe['title'].values))
    desc_features = hashfeatures.FeatureHash(
        max_feature_num=450).get_feature_set(
            list(dataframe['description'].values))
    attr_features = hashfeatures.FeatureHash(
        max_feature_num=100).get_feature_set(list(dataframe['attrs'].values))
    dataframe.drop(['title', 'description', 'attrs'], axis=1, inplace=True)
    full_features = np.hstack(
        (title_features, desc_features, attr_features, dataframe.values))
    return full_features
Example #4
def get_full_feature_set(dataframe):
    title_list = list(dataframe['Title'].values)
    body_list = list(dataframe['BodyMarkdown'].values)
    clean_title_list = preprocess.text_clean_pipeline_list(title_list)
    clean_body_list = preprocess.text_clean_pipeline_list(body_list)
    title_feature = hashfeatures.FeatureHash(max_feature_num=100)
    body_feature = hashfeatures.FeatureHash(max_feature_num=400)
    title_hash_features = title_feature.get_feature_set(clean_title_list)
    body_hash_features = body_feature.get_feature_set(clean_body_list)
    del dataframe['Title']
    del dataframe['BodyMarkdown']
    full_feature_set = np.hstack(
        (title_hash_features, body_hash_features, dataframe.values))
    return full_feature_set
def get_hash_features(dataframe, column_dim_dict):
    text_features_master_list = list([])
    for key in column_dim_dict.keys():
        text_list = list(dataframe[key].values)
        text_features = hashfeatures.FeatureHash(
            max_feature_num=column_dim_dict[key]).get_feature_set(text_list)
        text_features_master_list.append(text_features)
    return reduce(lambda x, y: np.hstack((x, y)), text_features_master_list)
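
# Hypothetical usage of get_hash_features: each named column is hashed to
# its own width, then the blocks are stacked column-wise with reduce
# (a builtin in Python 2; Python 3 needs `from functools import reduce`):
#
#     column_dim_dict = {'Title': 100, 'BodyMarkdown': 400}
#     text_features = get_hash_features(dataframe, column_dim_dict)
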
def get_label_encoded_features(rent_frame, text_columns):
    description_list = list(rent_frame['description'].values)
    features_list = list(rent_frame['features'].values)
    # Each entry in 'features' is a list of strings; join into one document.
    features_list = map(lambda s: " ".join(s), features_list)
    address_list = list(rent_frame['display_address'].values)
    street_list = list(rent_frame['street_address'].values)
    # One 150-dimensional hasher is reused for all four text columns.
    feature_hash = hashfeatures.FeatureHash(max_feature_num=150)
    description_hash = feature_hash.get_feature_set(description_list)
    features_list_hash = feature_hash.get_feature_set(features_list)
    address_hash = feature_hash.get_feature_set(address_list)
    street_hash = feature_hash.get_feature_set(street_list)
    rent_frame.drop(text_columns, axis=1, inplace=True)
    numerical_features = rent_frame.values
    return np.hstack((numerical_features, description_hash,
                      features_list_hash, address_hash, street_hash))
Example #7
def get_hashed_features(tweet_list):
    feat = hashfeatures.FeatureHash(max_feature_num=5000)
    hash_feature_set = feat.get_feature_set(tweet_list)
    return hash_feature_set


def get_classifiers():
    # Stand-in definition (assumption): the original list of classifiers
    # was truncated in the source. Any scikit-learn classifiers paired
    # with display names work with the training loops below.
    classifier_list = [LogisticRegression(random_state=42),
                       RandomForestClassifier(n_estimators=51,
                                              random_state=42)]
    classifier_name_list = ['Logistic Regression', 'Random Forests']
    return classifier_list, classifier_name_list


def report_classification_metrics(trained_model, X_test, y_test):
    predicted_values = trained_model.predict(X_test)
    print metrics.classification_report(y_test, predicted_values)
    print metrics.accuracy_score(y_test, predicted_values)


filename = 'C:\\Users\\rupachak\\Desktop\\Kaggle Data\\DonorsChoose\\train.csv'
train_frame = pd.read_csv(filename)
train_frame = train_frame.head(25000)
class_labels = list(train_frame['project_is_approved'].values)
summary_text = preprocess.text_clean_pipeline_list(
    list(train_frame['project_resource_summary'].values))
feature_set = hashfeatures.FeatureHash(
    max_feature_num=2000).get_feature_set(summary_text)

del train_frame
del summary_text
X_train, X_test, y_train, y_test = train_test_split(feature_set,
                                                    class_labels,
                                                    test_size=0.2,
                                                    random_state=42)
del class_labels
del feature_set

classifier_list, classifier_name_list = get_classifiers()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print "---------- For Classifier: ", classifier_name, " --------------------\n"
    report_classification_metrics(classifier, X_test, y_test)
Example #9
def get_ensemble_models():
    # rf, bagg and extra were truncated in the source; these definitions
    # are assumptions matching the display names below.
    rf = RandomForestClassifier(n_estimators=51, random_state=42)
    bagg = BaggingClassifier(n_estimators=51, random_state=42)
    extra = ExtraTreesClassifier(n_estimators=51, random_state=42)
    ada = AdaBoostClassifier(n_estimators=51, random_state=42)
    grad = GradientBoostingClassifier(n_estimators=101, random_state=42)
    classifier_list = [rf, bagg, extra, ada, grad]
    classifier_name_list = ['Random Forests', 'Bagging', 'Extra Trees',
                            'AdaBoost', 'Gradient Boost']
    return classifier_list, classifier_name_list


def print_evaluation_metrics(trained_model, trained_model_name, X_test,
                             y_test):
    print '--------- For Model : ------------', trained_model_name
    predicted_values = trained_model.predict(X_test)
    print metrics.classification_report(y_test, predicted_values)
    print "Accuracy Score : ", metrics.accuracy_score(y_test, predicted_values)
    print "---------------------------------------\n"


filename = 'train.csv'
author_frame = pd.read_csv(filename)
class_labels = list(author_frame['author'].values)
del author_frame['id']
del author_frame['author']
text_list = list(author_frame['text'].values)
cleaned_text_list = preprocess.text_clean_pipeline_list(text_list)
feat_hash = hashfeatures.FeatureHash(max_feature_num=1000)
text_features = feat_hash.get_feature_set(cleaned_text_list)
X_train, X_test, y_train, y_test = train_test_split(text_features,
                                                    class_labels,
                                                    test_size=0.2,
                                                    random_state=42)
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)

from collections import Counter


def fill_nan_in_string(dataframe):
    # The header and the word-count step were truncated in the source; this
    # reconstruction (an assumption) replaces NaN entries in each text
    # column with a string of that column's most frequent words.
    for column in dataframe.columns:
        word_dict = Counter(
            " ".join(map(str, dataframe[column].dropna().values)).split())
        top_keys = get_top_n_dict_keys(word_dict, top_n=7)
        top_key_string = " ".join(top_keys)
        dataframe[column] = map(lambda x: top_key_string if x is np.nan else x,
                                dataframe[column].values)
    return dataframe
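
# get_top_n_dict_keys is not defined in the snippet. A plausible version,
# assuming word_dict maps words to counts:
def get_top_n_dict_keys(word_dict, top_n=7):
    # Keys sorted by descending count; keep the first top_n.
    sorted_keys = sorted(word_dict, key=word_dict.get, reverse=True)
    return sorted_keys[:top_n]
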


filename = 'training.csv'
train_frame = pd.read_csv(filename)
name_desc_cap_frame = train_frame[['name', 'description', 'caption']]
target_class_labels = train_frame['good'].values
train_frame.drop(['name', 'description', 'caption', 'good'],
                 axis=1,
                 inplace=True)
name_desc_cap_frame = fill_nan_in_string(name_desc_cap_frame)
name_features = hashfeatures.FeatureHash(max_feature_num=100).get_feature_set(
    name_desc_cap_frame['name'].values)
desc_features = hashfeatures.FeatureHash(max_feature_num=500).get_feature_set(
    name_desc_cap_frame['description'].values)
caption_features = hashfeatures.FeatureHash(
    max_feature_num=200).get_feature_set(name_desc_cap_frame['caption'].values)
train_features = Imputer().fit_transform(train_frame.values)
final_features = np.hstack(
    (name_features, desc_features, caption_features, train_features))
X_train, X_test, y_train, y_test = train_test_split(final_features,
                                                    target_class_labels,
                                                    test_size=0.1,
                                                    random_state=42)
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)


def print_evaluation_metrics(trained_model, trained_model_name, X_test,
                             y_test):
    # The function header and the first metric were truncated in the
    # source; this reconstruction follows the pattern of the lines below.
    print '--------- For Model : ', trained_model_name
    predicted_values = trained_model.predict(X_test)
    print "Mean Absolute Error : ", metrics.mean_absolute_error(
        y_test, predicted_values)
    print "Median Absolute Error : ", metrics.median_absolute_error(
        y_test, predicted_values)
    print "Mean Squared Error : ", metrics.mean_squared_error(
        y_test, predicted_values)
    print "R2 Score : ", metrics.r2_score(y_test, predicted_values)
    print "---------------------------------------\n"


filename = 'train.csv'
train_frame = pd.read_csv(filename)
columns_to_delete = ['Id', 'LocationRaw', 'ContractType']
train_frame.drop(columns_to_delete, axis=1, inplace=True)
train_frame.dropna(inplace=True)

title_features = hashfeatures.FeatureHash(max_feature_num=100).get_feature_set(
    list(train_frame['Title'].values))
description_features = hashfeatures.FeatureHash(
    max_feature_num=900).get_feature_set(
        list(train_frame['FullDescription'].values))
target_values = train_frame['SalaryNormalized'].values
train_frame.drop(['Title', 'FullDescription', 'SalaryNormalized'],
                 axis=1,
                 inplace=True)
train_frame = label_encode_frame(train_frame)
final_feature_set = np.hstack(
    (title_features, description_features, train_frame.values))
X_train, X_test, y_train, y_test = train_test_split(final_feature_set,
                                                    target_values,
                                                    test_size=0.2,
                                                    random_state=42)
regressor_list, regressor_name_list = get_ensemble_models()
# The training loop was truncated in the source; reconstructed to match
# the other snippets.
for regressor, regressor_name in zip(regressor_list, regressor_name_list):
    regressor.fit(X_train, y_train)
    print_evaluation_metrics(regressor, regressor_name, X_test, y_test)


def get_data_frame(filename):
    # Body reconstructed (assumption): the training_text companion file to
    # training_variants is '||'-delimited with one header line and ID, Text
    # fields.
    with open(filename) as f:
        lines = f.read().splitlines()[1:]
    text_list = [line.split('||', 1) for line in lines]
    text_frame = pd.DataFrame(text_list, columns=['ID', 'Text'])
    text_frame['ID'] = text_frame['ID'].astype(int)
    return text_frame


text_filename = 'training_text'
mutation_filename = 'training_variants'
text_frame = get_data_frame(text_filename)
mutation_frame = pd.read_csv(mutation_filename)
mutation_frame['ID'] = mutation_frame['ID'].astype(int)
final_frame = pd.merge(text_frame,
                       mutation_frame,
                       left_on='ID',
                       right_on='ID',
                       how='outer')
class_labels = list(final_frame['Class'].values)
gene_text = list(final_frame['Text'].values)
gene_features = hashfeatures.FeatureHash(
    max_feature_num=5000).get_feature_set(gene_text)
del final_frame['Class']
del final_frame['Text']
del final_frame['ID']
final_frame = label_encode_frame(final_frame)
final_feature_set = np.hstack((gene_features, final_frame.values))
X_train, X_test, y_train, y_test = train_test_split(final_feature_set,
                                                    class_labels,
                                                    test_size=0.2,
                                                    random_state=42)
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)

script_file_path = 'C:\\Users\\rupachak\\Desktop\\Kaggle Data\\Seinfield Scripts\\scripts.csv'
script_frame = pd.read_csv(script_file_path)
character_group_series = script_frame['Character'].value_counts()
filtered_character_list = []
for character, count in character_group_series.iteritems():
    if count > 300:
        filtered_character_list.append(character)

filtered_script_frame = script_frame[script_frame['Character'].isin(
    filtered_character_list)]
del script_frame
character_list = list(filtered_script_frame['Character'].values)
dialogue_list = preprocess.text_clean_pipeline_list(
    list(filtered_script_frame['Dialogue'].values))
hash_feature_set = hashfeatures.FeatureHash(
    max_feature_num=1000).get_feature_set(dialogue_list)
del filtered_script_frame
X_train, X_test, y_train, y_test = train_test_split(hash_feature_set,
                                                    character_list,
                                                    test_size=0.2,
                                                    random_state=42)
del character_list
del dialogue_list
del hash_feature_set
classifier_list, classifier_name_list = get_classifiers()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    report_classification_metrics(classifier, X_test, y_test)


def get_ensemble_models():
    # rf, bagg and extra were truncated in the source; these definitions
    # are assumptions matching the display names below.
    rf = RandomForestClassifier(n_estimators=51, random_state=42)
    bagg = BaggingClassifier(n_estimators=51, random_state=42)
    extra = ExtraTreesClassifier(n_estimators=51, random_state=42)
    ada = AdaBoostClassifier(n_estimators=51, random_state=42)
    grad = GradientBoostingClassifier(n_estimators=101, random_state=42)
    classifier_list = [rf, bagg, extra, ada, grad]
    classifier_name_list = [
        'Random Forests', 'Bagging', 'Extra Trees', 'AdaBoost',
        'Gradient Boost'
    ]
    return classifier_list, classifier_name_list


def print_evaluation_metrics(trained_model, trained_model_name, X_test,
                             y_test):
    print '--------- For Model : ', trained_model_name
    predicted_values = trained_model.predict(X_test)
    print metrics.classification_report(y_test, predicted_values)
    print "Accuracy Score : ", metrics.accuracy_score(y_test, predicted_values)
    print "---------------------------------------\n"


filename = 'train.csv'
imperial_frame = pd.read_csv(filename)
feature_hash = hashfeatures.FeatureHash(max_feature_num=5000)
insult_features = feature_hash.get_feature_set(
    list(imperial_frame['Comment'].values))
class_labels = list(imperial_frame['Insult'].values)
rf_embed_features = RandomTreesEmbedding(n_estimators=151, random_state=42)
insult_features = rf_embed_features.fit_transform(insult_features)
X_train, X_test, y_train, y_test = train_test_split(insult_features,
                                                    class_labels,
                                                    test_size=0.1,
                                                    random_state=42)
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)
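
# Note: RandomTreesEmbedding.fit_transform returns a high-dimensional
# sparse indicator matrix (one column per tree leaf), so the hashed comment
# features are re-encoded before classification; scikit-learn's tree
# ensembles can fit on that sparse input directly.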