Example #1
def concatenate_crowdtangle_group_data(suffix):

    if suffix == "fake_news_2021":
        df_list = []
        for file_index in range(5):
            df_list.append(
                import_data(folder="crowdtangle_group",
                            file_name="posts_" + suffix + "_group_" +
                            str(file_index + 1) + ".csv"))
        posts_group_df = pd.concat(df_list)
    else:
        posts_group_df = import_data(folder="crowdtangle_group",
                                     file_name="posts_" + suffix +
                                     "_group.csv")

    print('\nThere are {} Facebook groups about {}.'.format(
        posts_group_df.account_id.nunique(), suffix))

    posts_page_df = import_data(folder="crowdtangle_group",
                                file_name="posts_" + suffix + "_page.csv")
    print('There are {} Facebook pages about {}.'.format(
        posts_page_df.account_id.nunique(), suffix))

    posts_df = pd.concat([posts_group_df, posts_page_df])

    posts_df['date'] = pd.to_datetime(posts_df['date'])

    return posts_df
Example #2
def import_crowdtangle_group_data():

    posts_wi_date_df = import_data(folder="crowdtangle_group",
                                   file_name="posts_self_declared_wi_date.csv")
    print('\nThere are {} Facebook pages with the last strike date visible on the screenshot.'.\
        format(posts_wi_date_df.account_id.nunique()))

    posts_wo_date_df = import_data(folder="crowdtangle_group",
                                   file_name="posts_self_declared_wo_date.csv")
    list_wo_name = [
        'Artists For A Free World', 'Terrence K Williams',
        'Ben Garrison Cartoons', 'Wendy Bell Radio',
        'New Independence Network', 'Pruden POD & Post', 'PR Conservative',
        'Org of Conservative Trump Americans', 'Con Ciencia Indigena',
        'Republican Party of Lafayette County',
        'The Daily Perspective Podcast', 'Freedom Memes',
        'White Dragon Society', 'Robertson Family Values'
    ]
    posts_wo_date_df = posts_wo_date_df[~posts_wo_date_df['account_name'].
                                        isin(list_wo_name)]
    print('There are {} Facebook pages without the last strike date visible on the screenshot.'.\
        format(posts_wo_date_df.account_id.nunique()))

    posts_df = pd.concat([posts_wi_date_df, posts_wo_date_df])

    posts_df['date'] = pd.to_datetime(posts_df['date'])

    return posts_df
Example #3
def run_test(mode='tfidf', model='linear', regularizer='ridge',\
                 train_data_path='data/intuit_data',\
                 test_data_path='data/intuit_test_data', augment=False):
    """
    Prints out score report of model under given featurization 
    """

    modes = {'bow': 'Bag of Words', 'tfidf': 'TF-IDF'}

    print("Using featurization " + modes[mode] + "...")
    print("Training " + model.upper() + " model with " + regularizer.upper() +\
                 " regularization...")

    if augment:
        print("Importing Word2Vec Model...")
        w2v_model = Word2Vec.load_word2vec_format('w2v.bin', binary=True)

    print(
        "-------------------------------------------------------------------------"
    )

    emails_train, y_train = import_data(train_data_path)
    transform = generate_featurizer(emails_train, mode=mode)
    X_train = transform(emails_train)
    eff_labels = np.unique(y_train)
    if augment:
        auxillary_features = [featurize(email, eff_labels, w2v_model)\
                                 for email in emails_train]
        auxillary_features = np.vstack(auxillary_features)
        X_train = hstack((X_train, auxillary_features))

    clf = generate_model(X_train, y_train, model=model,\
                             regularizer=regularizer)

    emails_test, y_test = import_data(test_data_path)
    X_test = transform(emails_test)
    if augment:
        auxillary_features = [featurize(email, eff_labels, w2v_model)\
                                 for email in emails_test]
        auxillary_features = np.vstack(auxillary_features)
        X_test = hstack((X_test, auxillary_features))

    y_pred = clf.predict(X_test)
    labels = np.unique(y_test)
    print(classification_report(y_test, y_pred))
    accuracy = str(np.around(accuracy_score(y_test, y_pred), decimals=3))
    print("accuracy: " + accuracy)
    if model != 'linear':
        regularizer = 'No'
    generate_confusion_matrix(
        y_test, y_pred, eff_labels,
        model.upper() + " model -  " + regularizer.upper() +
        " regularization - " + modes[mode] + " featurization - " + accuracy,
        model + '-' + mode + '-' + regularizer.lower() + '.png', True)
    print(
        "-------------------------------------------------------------------------"
    )
Example #4
def build_subwords_vocab(target_vocab_size=10000):
    train_neg = import_data('./train/neg')
    train_pos = import_data('./train/pos')
    train_raw = train_neg + train_pos
    train_clean = [clean_data(t) for t in train_raw]

    vocab_encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(train_clean, target_vocab_size)
    vocab_encoder.save_to_file('vocab')

    print(vocab_encoder.vocab_size)
Example #5
    def __init__(self):
        self.iv = bo.random_AES_key()
        self.key = bo.random_AES_key()

        file_name = "data_S3C17.txt"
        self.data = encode(
            random.choice(ut.import_data(file_name).splitlines()))
Example #6
    def challenge_6():
        print(f"\n-- Challenge 6 - Break repeating-key XOR --")
        print(f"-- Part 1 --")

        data_1 = encode("this is a test")
        data_2 = encode("wokka wokka!!!")
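        # For these two Cryptopals test strings the Hamming (edit) distance should come out as 37.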

        print(f"String 1      : {decode(data_1)}")
        print(f"String 2      : {decode(data_2)}")
        print(f"Edit distance : {bo.edit_distance(data_1, data_2)}")
        print(f"-- Part 2 --")

        B64_ciphertext = ut.import_data("data_S1C6.txt")
        data = b64decode(B64_ciphertext)
        likely_key_sizes = bo.find_key_size(40, data)

        # Find most likely key.
        def key_comparison():
            for key_size in likely_key_sizes[0:3]:
                key = bo.key_finder(key_size, data)
                secret = bo.repeating_key_xor(data, key)
                score = bo.text_scorer(secret).score()
                yield score, key, secret

        score, key, secret = max(key_comparison())

        print(f"Most likely key sizes : {likely_key_sizes[0:3]}")
        print(f"Highest score         : {score}")
        print(f"Corresponding Key     : {decode(key)}")
        print(f"Secret                : \n{decode(secret[:90])}...")
Example #7
    def challenge_8():
        print(f"\n-- Challenge 8 - Detect AES in ECB mode --")
        print(f"-- Method 1 --")

        hex_ciphertext = ut.import_data("data_S1C8.txt")

        def text_breaker():
            for line_index, line in enumerate(hex_ciphertext.splitlines()):
                data = bytes.fromhex(line)
                unique_char_instances = len(list(Counter(data).items()))
                yield unique_char_instances, line_index

        unique_char_instances, line_index = min(text_breaker())
        print(
            f"Assume ECB 1:1 mapping has low diversity of characters compared"
            " to random data")
        print(f"Lowest number of unique chars : {unique_char_instances}")
        print(f"Corresponding line            : {line_index}")
        print(f"-- Method 2 --")

        # Find if data contains duplicate blocks.
        for line_index2, line in enumerate(hex_ciphertext.splitlines()):
            if bo.ECB_mode_check(bytes.fromhex(line)):
                break

        print(f"Find line with duplicate blocks")
        print(f"Corresponding line            : {line_index2}")
Example #8
    def challenge_10():
        print(f"\n-- Challenge 10 - Implement CBC mode --")

        data_p = bo.pad(16, b"This is a secret message! TOP SECRET")
        key = b"PASSWORDPASSWORD"
        iv = b"1122334455667788"

        ECB_1 = ocl.AESECB(key)
        CBC_1 = ocl.AESCBC(iv, key)

        ECB_ciphertext = ECB_1.encrypt(data_p)
        ECB_plaintext = bo.depad(ECB_1.decrypt(ECB_ciphertext))
        CBC_ciphertext = CBC_1.encrypt(data_p)
        CBC_plaintext = bo.depad(CBC_1.decrypt(CBC_ciphertext))

        print(f"Padded Secret Message : {data_p}")
        print(f"Key                   : {key}")
        print(f"ECB encrypted message : {ECB_ciphertext}")
        print(f"ECB decrypted message : {ECB_plaintext}")
        print(f"iv                    : {iv}")
        print(f"CBC encrypted message : {CBC_ciphertext}")
        print(f"CBC decrypted message : {CBC_plaintext}")
        print("----- Part 2 ------")

        data = b64decode(ut.import_data("data_S2C10.txt"))
        key = b"YELLOW SUBMARINE"
        iv = bytes([0]) * 16
        CBC_2 = ocl.AESCBC(iv, key)
        decrypted = decode(bo.depad(CBC_2.decrypt(data)))
        print(f"CBC decrypted message : \n{decrypted[0:90]}...")
Example #9
def main():
    model = create_model()
    model.summary()

    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)

    # Expand data dimension for kernel to convolve over
    X_train = np.expand_dims(X_train, axis=2)  # (None, 46, 1)
    X_test = np.expand_dims(X_test, axis=2)  # (None, 46, 1)

    # create model
    model = KerasClassifier(build_fn=create_model, verbose=0)

    # Operational Phase
    scorer = make_scorer(f1_score, pos_label='+')
    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_CNN(model, X_train, Y_train, X_test,
                                            Y_test, scorer)
    Y_pred_grid_search = np.squeeze(Y_pred_grid_search)
    print()
    print()
    print(Y_pred_grid_search)
    print()
    print(Y_test)
    print()
    print_scores(Y_test, Y_pred_grid_search)
Example #10
    def challenge_7():
        print(f"\n-- Challenge 7 - AES in ECB mode --")

        key = encode("YELLOW SUBMARINE")
        data = b64decode(ut.import_data("data_S1C7.txt"))
        plaintext = ocl.AESECB(key).decrypt(data)

        print(f"Key    : {decode(key)}")
        print(f"Secret : \n{decode(plaintext[:90])}...")
Example #11
def print_individual_drops_statistics():
    df = import_data(folder="crowdtangle_list",
                     file_name="account_list_part_1.csv")
    df = df.dropna(subset=['june_drop'])
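    # The 'june_drop' values are strings ending in a non-numeric character (presumably a '%' sign),
    # so the last character is stripped before casting to int.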
    df['june_drop'] = df['june_drop'].astype(str).apply(
        lambda x: x[:-1]).astype(int)
    print(
        '\nThere are {} accounts for which we can calculate the drop.'.format(
            len(df)))
    print('Among them, {} accounts have a drop (decrease).'.format(
        len(df[df['june_drop'] < 0])))
Example #12
def test_filter(clf, transform, test_data_path):
    print("Testing event filter...")
    print(
        "---------------------------------------------------------------------"
    )
    email_texts, y_test = import_data(test_data_path)
    y_test[np.where(y_test != 'no event')] = 'event'
    X_test = transform(email_texts)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))
    print("accuracy: " + str(accuracy))
    print(
        "---------------------------------------------------------------------"
    )
Example #13
def main():
    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)

    # Operational Phase
    scorer = make_scorer(f1_score, pos_label='+')
    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_mlp(X_train, Y_train, X_test, Y_test,
                                            scorer)

    print()
    print()
    print(Y_pred_grid_search)
    print()
    print(Y_test)
    print()
    print_scores(Y_test, Y_pred_grid_search)
Example #14
    def challenge_4():
        print("\n-- Challenge 4 - Detect single-char XOR --")

        file_name = "data_S1C4.txt"
        hex_ciphertext = ut.import_data(file_name)

        def text_breaker():
            for line_index, line in enumerate(hex_ciphertext.splitlines()):
                data = bytes.fromhex(line)
                score, byte = bo.single_byte_xor_breaker(data)
                yield score, byte, line_index, data

        score, byte, line_index, data = max(text_breaker())
        plaintext = bo.single_byte_xor(byte, data)

        print(f"Hex data file                    : {file_name}")
        print(f"Highest frequency analysis score : {score}")
        print(f"Corresponding line               : {line_index}")
        print(f"Corresponding key                : {decode(byte)}")
        print(f"Decrypted plaintext              : {decode(plaintext)}")
Example #15
def main():
	# Building Phase
	data = import_data(
		"./dataset/crx_clean.data.txt"
		)
	X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)
	clf_entropy = train_using_entropy(X_train, Y_train)

	# Operational Phase
	print("\n### SINGLE TRAIN-TEST SPLIT ###\n")
	Y_pred_entropy = prediction(X_test, clf_entropy)
	print_scores(Y_test, Y_pred_entropy)

	print("\n### CROSS VAL USING STRATIFIED K FOLD ###\n")
	fold_scores = cv_with_entropy(X, Y)
	print("Cross Validate: ", fold_scores)
	print("Best F1_score: ", max(fold_scores)*100)

	scorer = make_scorer(f1_score, pos_label='+')
	print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
	Y_pred_grid_search = grid_search_cv_DT(X_train, Y_train, X_test, Y_test, scorer)
	print_scores(Y_test, Y_pred_grid_search)
Example #16
randgain = args.randgain
if randgain == 1:
    gainmin = 0.1
    gainmax = 0.8  # scaling the input range ~ [-1.25,1.25] in [-1,1]
    print(
        'at training, for every forward, apply random gain to [xref,xper] between ',
        gainmin, gainmax)
    print(
        'test data is loaded with random gains, kept fixed throughout the training'
    )
    rgains = [gainmin, gainmax]
else:
    rgains = False

train_loader, test_loader, train_refloader, test_refloader = import_data(
    data_path, subsets, Lsize, batch_size, train_ratio=0.8, rgains=rgains)

###############################################################################
### BUILD MODEL

nconv = args.nconv
nchan = args.nchan
dist_dp = args.dist_dp
dist_act = args.dist_act
ndim = [args.ndim0, args.ndim1]
classif_dp = args.classif_dp
classif_BN = args.classif_BN
classif_act = args.classif_act
minit = args.minit
print(
    '\nBUILDING with settings nconv,nchan,dist_dp,dist_act,ndim,classif_dp,classif_BN,classif_act,minit'
)
Example #17
    train_data.Survived[(train_data.Sex == 'male')
                        & (train_data.Pclass == 3)].value_counts(
                            normalize=True).plot.bar(alpha=0.5)
    plt.xticks(rotation='horizontal')
    plt.title('Poor men survived')
    # Rich women
    plt.subplot2grid((3, 4), (2, 2))
    train_data.Survived[(train_data.Sex == 'female')
                        & (train_data.Pclass == 1)].value_counts(
                            normalize=True).plot.bar(alpha=0.5,
                                                     color='#FA0000')
    plt.xticks(rotation='horizontal')
    plt.title('Rich women survived')
    # Poor women
    plt.subplot2grid((3, 4), (2, 3))
    train_data.Survived[(train_data.Sex == 'female')
                        & (train_data.Pclass == 3)].value_counts(
                            normalize=True).plot.bar(alpha=0.5,
                                                     color='#FA0000')
    plt.xticks(rotation='horizontal')
    plt.title('Poor women survived')

    plt.show()


if __name__ == "__main__":
    train_data, test_data = import_data()
    #train_data, test_data = data_wrangling(train_data, test_data)
    plot_basics(train_data)
    plot_gender(train_data)
Example #18
def main():

    # Import the dataset tables and the graphs
    stop_times, trips, routes, exceptions_service, calendar, stops, trips_with_stop_times = import_data(
    )
    stop_times_load = stop_times.copy().drop(['stop_sequence'], axis=1)

    graph_with_routes, graph_no_multiple_edges = import_graphs(
        'XML files//Complete_TrenordNetwork.xml',
        'XML files//CompleteGraph_NoMultipleEdges.xml')

    # Run the various stages of the analysis
    print('Pre-analysis in progress')
    do_pre_analysis(graph_with_routes)
    print('Load analysis in progress')
    do_load_analysis('1841', 'monday', exceptions_service, stops,
                     trips_with_stop_times)
    print('Shortest-path analysis in progress')
    do_min_path_analysis('1581', '1711', '09:00:00', 'monday', 0,
                         trips_with_stop_times, stops, stop_times, trips,
                         graph_no_multiple_edges)
    print('Attack-handling analysis in progress')
    do_attack_handling_analysis(graph_with_routes, graph_no_multiple_edges)
    print('Done!')
Example #19
    gainmax = 0.8  # scaling the input range ~ [-1.25,1.25] in [-1,1]
    print(
        'at training, for every forward, apply random gain to [xref,xper] between ',
        gainmin, gainmax)
    print(
        'test data is loaded with random gains, kept fixed throughout the training'
    )
    rgains = [gainmin, gainmax]
else:
    rgains = False
#train_loader,test_loader,train_refloader,test_refloader = import_data(data_path,subsets,Lsize,batch_size,train_ratio=0.8,rgains=rgains)

if args.use_npy == 0:
    train_loader, test_loader, train_refloader, test_refloader = import_data(
        Lsize,
        batch_size,
        train_ratio=0.8,
        dummy_test=args.dummy_test,
        audio_inputs_normalise=args.audio_inputs_normalise)
else:
    train_loader, test_loader, train_refloader, test_refloader = import_data(
        data_path,
        subsets,
        Lsize,
        batch_size,
        train_ratio=0.8,
        rgains=rgains,
        dummy_test=args.dummy_test,
        audio_inputs_normalise=args.audio_inputs_normalise)

###############################################################################
### BUILD MODEL
Example #20
def _convert_crf_output_to_json(crf_output):
    return json.dumps(utils.import_data(crf_output), indent=2, sort_keys=True)
Example #21
    url_df = url_df.dropna(subset=['scientific_topic'])

    return url_df


def keep_only_topic_data(url_df, TOPIC):
    if TOPIC in ["climate", "health", "covid"]:
        return url_df[url_df["scientific_topic"] == TOPIC]
    else:
        return url_df


if __name__ == "__main__":

    DATE = sys.argv[1]
    TOPIC = sys.argv[2] if len(sys.argv) >= 3 else ""

    url_df = import_data(folder="sciencefeedback",
                         file_name="Appearances-Grid view " + DATE + ".csv")
    url_df = keep_only_the_urls_considered_fake_by_facebook(url_df)
    url_df = clean_url_format(url_df)
    url_df = add_info_from_fact_check_table(url_df)
    url_df = keep_only_topic_data(url_df, TOPIC)
    url_df = url_df[[
        'url', 'url_cleaned', 'domain_name', 'Item reviewed',
        'Date of publication', 'scientific_topic'
    ]]
    print("There are {} fake news urls.".format(len(url_df)))
    export_data(url_df, 'sciencefeedback',
                "appearances_" + DATE + "_" + TOPIC + ".csv")
Example #22
def clean_columns(df):
    clean_df = pd.DataFrame(columns=[
        "account_name", "account_id", "date", "share", "comment", "reaction"
    ])

    clean_df['account_name'] = df['account_name'].astype(str)
    clean_df['account_id'] = df['account_id'].astype(int)

    clean_df['date'] = pd.to_datetime(df['date'])

    clean_df["share"] = df[["actual_share_count"]].astype(int)
    clean_df["comment"] = df[["actual_comment_count"]].astype(int)

    clean_df["reaction"] = df[[
        "actual_like_count", "actual_favorite_count", "actual_love_count",
        "actual_wow_count", "actual_haha_count", "actual_sad_count",
        "actual_angry_count", "actual_thankful_count"
    ]].sum(axis=1).astype(int)

    return clean_df


if __name__ == "__main__":

    DATE = sys.argv[1]
    SUFFIX = sys.argv[2]

    df = import_data(folder="crowdtangle_group",
                     file_name='posts_group_' + DATE + '.csv')
    clean_df = clean_columns(df)
    export_data(clean_df, 'crowdtangle_group', 'posts_' + SUFFIX + '.csv')
Example #23
pd.options.display.max_colwidth = 300

def create_template_csv_from_serie(serie, list_name):

    df = pd.DataFrame(columns=["Page or Account URL", "List"])
    df["Page or Account URL"] = serie.index
    df["List"] = list_name

    export_data(df, 'crowdtangle_list', list_name + '.csv')

    return df


if __name__=="__main__":

    df = import_data(folder="crowdtangle_url", file_name="posts_url_2021-01-04_.csv")    
    df = df.drop_duplicates(subset=['url', 'account_id'])
    s = df["account_url"].value_counts()

    top1_df = create_template_csv_from_serie(s[s > 45], "heloise_fake_news_groups_1")
    top2_df = create_template_csv_from_serie(s[(s <= 45) & (s > 35)], "heloise_fake_news_groups_2")
    top3_df = create_template_csv_from_serie(s[(s <= 35) & (s > 29)], "heloise_fake_news_groups_3")
    top4_df = create_template_csv_from_serie(s[(s <= 29) & (s > 26)], "heloise_fake_news_groups_4")
    top5_df = create_template_csv_from_serie(s[(s <= 26) & (s > 23)], "heloise_fake_news_groups_5")

    print(len(top1_df))
    print(len(top2_df))
    print(len(top3_df))
    print(len(top4_df))
    print(len(top5_df))
Example #24
from utils import import_data

dataset_path = "data"
dataset_version = "fake-v1.0"

fake_dataset = import_data(dataset_path, dataset_version)

dataset_path = "data"
dataset_version = "automated-v1.0"

automated_dataset = import_data(dataset_path, dataset_version)
Example #25
        if (group_index % 10 == 9) | (group_index
                                      == posts_df['account_id'].nunique() - 1):
            plt.tight_layout()
            save_figure(
                'z_part_2_all_groups_{}'.format(int(group_index / 10) + 1),
                folder='ip&m',
                dpi=100)

        group_index += 1


if __name__ == "__main__":

    posts_df = import_crowdtangle_group_data()
    pages_df = import_data(folder="crowdtangle_list",
                           file_name="page_list_part_2.csv")
    pages_df['date'] = pd.to_datetime(
        pages_df['reduced_distribution_start_date'])

    save_figure_4(posts_df, pages_df)
    save_supplementary_figure_2(posts_df, pages_df)
    save_figure_5(posts_df, pages_df)
    save_figure_5(posts_df, pages_df, period_length=30)

    screenshot_df = import_data(folder="crowdtangle_post_by_id",
                                file_name='screenshot_posts.csv')
    print_statistics_screenshot_posts(screenshot_df)

    # save_all_groups_figures(posts_df, pages_df)
Example #26
    def __init__(self):
        self.key = bo.random_AES_key()
        self.secret = b64decode(ut.import_data("data_S2C12.txt"))
Example #27
    def combine_predictions(self, y_filter, y_clf, event_indices):
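        # Merge the event classifier's output back into the filter's output: positions listed in
        # event_indices take the next prediction from y_clf, all other positions take the next
        # prediction from y_filter.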
        y_pred = []
        y_filter = list(y_filter)
        y_clf = list(y_clf)
        for i in range(len(y_filter) + len(y_clf)):
            if i in event_indices:
                y_pred.append(y_clf.pop(0))
            else:
                y_pred.append(y_filter.pop(0))
        return np.array(y_pred)


if __name__ == "__main__":
    from argparse import ArgumentParser

    email_texts, y_train = import_data('data/intuit_data')
    transform = generate_featurizer(email_texts)
    X_train = transform(email_texts)

    model = HierachicalClassifier('rf')
    model.fit(X_train, y_train)

    email_texts, y_test = import_data('data/intuit_test_data')
    X_test = transform(email_texts)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print("accuracy: " + str(accuracy_score(y_test, y_pred)))

    generate_confusion_matrix(y_test, y_pred, np.unique(y_train),\
                                 'Hierachical Classification', 'hc.png', True)
Example #28
import nltk
import os

from utils import import_data, tag_data, extract_important_words, extract_simple_model

FILE = "model2"


def prepare_tools():
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')


if __name__ == "__main__":
    prepare_tools()
    path = "{}/data/{}.txt".format(os.curdir, FILE)
    data = import_data(path)
    tagged_data = tag_data(data)
    extracted_data = extract_important_words(tagged_data)
    model = extract_simple_model(extracted_data)
    print(model)
Example #29
###############
# ## SETUP ## #
###############

# load experiment config
with open(CONFIG_FILE) as file:
    config = json.load(file)
# directory for experiment results
exp_dir = config['exp_dir'] + '_' + datetime.datetime.now().strftime('%d-%m-%Y_%I-%M-%S_%p') + '_/'
# setup folders, save code, set seed and get device
setup_exp(exp_dir, config['seed'], ['log'], ['bpda_eot_attack.py', 'nets.py', 'utils.py', CONFIG_FILE])

print('Loading data and nets.')
# data loader
data, num_classes = import_data(config['data_type'], False, False)
attack_loader = DataLoader(data, batch_size=config['batch_size'], shuffle=config['subset_shuffle'], num_workers=0)
# get clf and ebm networks and load saved weights
clf = WideResNet(num_classes=num_classes).cuda()
clf.load_state_dict(t.load(config['clf_weight_path'], map_location=lambda storage, loc: storage.cuda()))
clf.eval()
if config['langevin_steps'] > 0:
    ebm = EBM().cuda()
    ebm.load_state_dict(t.load(config['ebm_weight_path'], map_location=lambda storage, loc: storage.cuda()))
    ebm.eval()

# cross-entropy loss function to generate attack gradients
criterion = t.nn.CrossEntropyLoss()

# rescale adversarial parameters for attacks on images with pixel intensities in the range [-1, 1]
config['adv_eps'] *= 2.0 / 255.0
Example #30
import sys
sys.path.append("..")
import utils as u


# In[4]:


# change this string to match the path on your computer
path_to_root = "/Users/mcapizzi/Github/dynet_tutorial/"


# In[5]:


trainX, trainY, testX, testY = u.import_data(path_to_root)


# In[6]:


trainX.shape, trainY.shape


# In[7]:


testX.shape, testY.shape


# The labels are either `1` or `0` where `1=Spam` and `0=Ham`
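
# As a quick sanity check (an added sketch, not part of the original notebook; it assumes `trainY` and
# `testY` are NumPy arrays of 0/1 labels returned by `u.import_data`), the class balance can be inspected:


# In[8]:


import numpy as np

print(np.unique(trainY, return_counts=True))  # expected: (array([0, 1]), per-class counts) for Ham/Spam
print(np.unique(testY, return_counts=True))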