Example #1
def test_dump_json_to_file():
    obj = {"hello": "world"}
    dump_json_to_file(obj, "/tmp/dump_json_to_file/example.json")

    with open("/tmp/dump_json_to_file/example.json", "r") as f:
        actual = json.loads(f.read())

    assert actual == obj
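
The implementation of dump_json_to_file is not shown on this page. A minimal sketch consistent with the (obj, path) argument order used in Example #1 might look like the code below; the directory-creation step is an assumption (inferred from the test writing into /tmp/dump_json_to_file/), and several later examples (#3, #5, #6, #7) pass the path as the first argument instead, since they come from different codebases.

import json
import os


def dump_json_to_file(obj, path):
    # Sketch only: serialize obj as JSON and write it to path.
    # Assumption: create the parent directory if it does not exist,
    # so callers like Example #1 can write into /tmp/dump_json_to_file/.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w") as f:
        json.dump(obj, f)
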
Example #2
 def log_article(self, stock_name, search_result, content):
     """
     Log the news article to logs/article.log.json
     :param stock_name: name of the stock which was used as the search query
     :param search_result: a tuple with the first element being the title of
     the search result and the second element being the url.
     :param content: the text content of the article
     """
     title, url = search_result
     data = read_json_file(self.article_log)
     if data.get(stock_name) is None:
         data[stock_name] = dict()
     data[stock_name][url] = dict(title=title, content=content)
     dump_json_to_file(data, self.article_log)
     self.log(
         "Saved content of article {} to article.log.json. ({})".format(
             title, stock_name))
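
Example #2 also depends on a read_json_file helper that is not shown here. A plausible counterpart (the behaviour below is an assumption, not taken from the original code) would return an empty dict when the log file is missing or empty, so the first article for a stock can still be recorded:

import json
import os


def read_json_file(path):
    # Hypothetical counterpart to dump_json_to_file (assumed behaviour).
    # A missing or empty file is treated as an empty JSON object.
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return {}
    with open(path, "r") as f:
        return json.load(f)
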
Example #3
def run_burst_detection(file_name_ds,
                        file_name_ex,
                        window_size_minutes=1,
                        train=True):
    windows = create_windows(file_name_ds,
                             window_size_minutes=window_size_minutes)

    print("Got ", len(windows), " time windows from ", file_name_ds)

    if len(windows) == 0:
        print("No tweets found, will be created an empty dataset")
        database = {}
        dump_json_to_file(file_name_ds, database)

    try:
        expectation_fd = open(file_name_ex, mode='r')
        expectation = json.load(expectation_fd)
        expectation_fd.close()
    except FileNotFoundError as FNE:
        print(str(FNE))
        print("No model found, creating an Empty one")
        expectation = {}
        dump_json_to_file(file_name_ex, expectation)

    for window in windows:
        out_ids = get_bursty_tweet(window, expectation, train=train)
        print("Number of tweets in window: ", len(window))
        print("Number of bursty tweets: ", len(out_ids))
        if (train):
            dump_json_to_file(file_name_ex, expectation)
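
Examples #3 and #7 repeat the same load-or-create pattern: try to load a JSON file and, on FileNotFoundError, initialise it with a default value via dump_json_to_file. A small hypothetical helper could consolidate it; the name load_json_or_default is not part of the original code, and the (path, obj) argument order is assumed to match Examples #3 and #7:

import json

from utils import dump_json_to_file  # assumed import path, as in Example #8


def load_json_or_default(path, default):
    # Sketch of the recurring pattern in Examples #3 and #7.
    try:
        with open(path, mode='r') as fd:
            return json.load(fd)
    except FileNotFoundError:
        dump_json_to_file(path, default)
        return default

With such a helper, the try/except blocks in run_burst_detection and build_dataset_file would each collapse to a single call like expectation = load_json_or_default(file_name_ex, {}).
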
Example #4
 def clear_article_log(self):
     with open(self.article_log, "w"):
         pass
     dump_json_to_file({}, self.article_log)
Example #5
def run_clustering(day_name,
                   delta,
                   time_delta,
                   use_burst=False,
                   expectation_file="",
                   use_tfidf=False):

    if use_tfidf:
        s1 = 'tfidf'
        vec = TfidfVectorizer(stop_words='english', tokenizer=tokenize_tweet)
    else:
        s1 = 'we'
        vec = None

    out = "" + s1 + "_" + str(delta) + "_" + str(time_delta) + ".csv"

    expectation_fd = open(expectation_file, mode='r')
    expectation = json.load(expectation_fd)
    expectation_fd.close()
    print("Expectation Model Loaded...")

    database_fd = open(day_name, mode='r')
    database = json.load(database_fd)
    database_fd.close()
    print("Database File Loaded...")

    monthArray = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
        "Nov", "Dic"
    ]
    try:
        for k in database:
            database[k]['id_str'] = str(k)
            database[k]['full_text'] = database[k]['text']
            date = database[k]['created_at'].split(" ")
            if len(date) == 2:
                date = date[0] + " " + date[1]
            else:
                date = str(date[5]) + "-" + str(monthArray.index(date[1]) +
                                                1) + "-" + str(
                                                    date[2]) + " " + str(
                                                        date[3])
            database[k]['created_at'] = date
    except:
        print("Dataset Fixed")

    database_2 = database

    try:
        database_2 = {}

        for k in database:
            if database[k]['annotations'] != ['None']:
                database_2[k] = database[k]
    except:
        database_2 = database

    dump_json_to_file(out, database_2)
    windows = create_windows(out)

    active_clusters = []
    inactive_clusters = []

    i = 0

    for window in windows:
        print("--------\nSize of Window: ", len(window))
        if use_burst:
            bursty_ids = get_bursty_tweet(window, expectation, train=False)
            print("Bursty tweets Found: ", len(bursty_ids))

            bursty_tweets = [database[str(ids)] for ids in bursty_ids]
            b_t = [(tweet['created_at'], tweet['id'])
                   for tweet in bursty_tweets]
            b_t.sort()
            bursty_tweets = [
                database[str(tweet_tuple[1])] for tweet_tuple in b_t
            ]
        else:
            bursty_tweets = window
        print("Clustering...")
        clustering_on_window(bursty_tweets, active_clusters, inactive_clusters,
                             delta, time_delta, cosine_similarity, use_tfidf,
                             vec)
        print("Active clusters: " + str(len(active_clusters)))
        print("Inctive clusters: " + str(len(inactive_clusters)))
        i += 1

    helper = {}

    for el in database:
        helper[database[el]['full_text']] = (el, database[el].get(
            'annotations', []))

    out_fs = open(out, mode='w')

    for c in active_clusters:
        textes = c.docs
        cluster_id = c.id
        for text in textes:
            info = helper[text]
            print(info[0],
                  '"' + text + '"',
                  info[1],
                  cluster_id,
                  sep=';',
                  file=out_fs)
    for c in inactive_clusters:
        textes = c.docs
        cluster_id = c.id
        for text in textes:
            info = helper[text]
            print(info[0],
                  '"' + text + '"',
                  info[1],
                  cluster_id,
                  sep=';',
                  file=out_fs)

    out_fs.close()

    ret = print_silhouette(active_clusters, inactive_clusters, use_tfidf, vec)
    print(ret)
Example #6
         game_type_html = get_html_from_url(base_node['link'])
         game_item_links = utils.get_href_in_td(game_type_html, 1)
         base_node[GAME_TREE_GAME_COLLECTION] = []
         for link in game_item_links:
             if len(str(link['text'])) > 1:
                 base_node[GAME_TREE_GAME_COLLECTION].append({
                     GAME_TREE_GAME_TIEBA_NAME:
                     link['text'],
                     GAME_TREE_GAME_TIEBA_LINK:
                     urljoin(START_PAGE, link['href'])
                 })
                 print("game added: ", link['text'], " type: ",
                       base_node[GAME_TREE_BASE_NAME])
                 game_count += 1
     print('gametree phase 2 done')
     utils.dump_json_to_file(JSON_TEMP_FILE_NAME, game_tree)
 # Fill in GameTree part 3: follower and topic counts for each game's tieba
 timestamp_start = time.time()
 finished_count = 0
 for base_node in game_tree[GAME_TREE_BASE_TYPE]:
     for game_tieba in base_node[GAME_TREE_GAME_COLLECTION]:
         try:
             # The previous synchronous crawling code (kept below) was painfully slow
             # if GAME_TREE_GAME_TIEBA_FOLLOWERS in game_tieba:
             #     continue
             # if len(str(game_tieba[GAME_TREE_GAME_TIEBA_NAME])) <= 1:
             #     game_tieba[GAME_TREE_GAME_TIEBA_FOLLOWERS] = 0
             #     game_tieba[GAME_TREE_GAME_TIEBA_TOPICS] = 0
             #     continue
             # game_tieba_detail_url = TIEBA_DETAIL_URL.format(
             #     quote_plus(game_tieba[GAME_TREE_GAME_TIEBA_NAME]))
Example #7
def build_dataset_file(input_file, log_dir, output_dir, twitter_api):

    # get the base file name, without its directory
    file_name = input_file.split('/')[-1]

    # read the dataset from output_dir; it should fit in main memory with no problem
    out_path = join(output_dir, file_name)

    try:
        dataset_fd = open(out_path, mode='r')
        dataset = json.load(dataset_fd)
        dataset_fd.close()
    except FileNotFoundError:
        dataset = dict()
        dump_json_to_file(out_path, dataset)

    # open the log to find the last downloaded tweet in the file
    log_path = join(log_dir, file_name)

    try:
        log_fd = open(log_path, mode='r')
        last_line_read = json.load(log_fd)
        log_fd.close()
    except FileNotFoundError:
        last_line_read = 0
        dump_json_to_file(log_path, last_line_read)

    current_row = 1
    # read the input file row by row
    with open(input_file) as index_fd:
        for line in index_fd:

            # skip already-processed rows until the last saved position
            if (current_row < last_line_read):
                current_row += 1
                continue

            if (current_row % 300 == 0):
                print("Dumping File")
                last_line_read = current_row
                #dump dataset and log
                dump_json_to_file(out_path, dataset)
                dump_json_to_file(log_path, last_line_read)

            #Get the id of the tweet to retrieve
            tweet_id = line.split("\t")[0].split(":")[2]

            #retrieve the tweet
            try:
                retrieved_tweet = twitter_api.get_status(tweet_id,
                                                         tweet_mode="extended")
                dataset[tweet_id] = retrieved_tweet._json
            except tweepy.RateLimitError as rle:
                print(str(rle))
                print("Exiting, the dataset will be updated")
                break
            except tweepy.TweepError as e:
                code = e.api_code
                if not (code == 34 or code == 63 or code == 144
                        or code == 179):
                    print("Unexpected error, quitting...")
                    print(str(e))
                    break
                #print(str(e))
                #print("TweetLost:" + str(tweet_id))

            current_row += 1

        #file finished
        last_line_read = current_row
        dump_json_to_file(out_path, dataset)
        dump_json_to_file(log_path, last_line_read)
    print("Quit:" + file_name)
Example #8
import argparse
import json
import os
import sys

import numpy as np

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from datasets import mnist
from models import fc_100_100_10, pca_filtered_model, train, load_from_file
from attacks import adversarial_example, adversarial_score
from utils import dump_json_to_file

argument_parser = argparse.ArgumentParser()
argument_parser.add_argument('--model', nargs="+", dest="model_paths",
                             help='path to models to attack')
argument_parser.add_argument('--eta', nargs='+', type=float, dest="eta_list",
                             default=np.arange(0, 0.25, 0.01),
                             help='values of eta for generating adversarial examples')
args = argument_parser.parse_args()

PREFIX = os.environ.get('PREFIX', '.')

X_train, y_train, X_test, y_test = mnist()

for model_path in args.model_paths:
    model = load_from_file(model_path)
    print(f"Computing adversarial score against {model.name}...", file=sys.stderr)
    adversarial_score_dictionary = {}
    for eta in args.eta_list:
        score = round(adversarial_score(model, X_test, y_test, eta), 3)
        adversarial_score_dictionary[eta] = score

    print(json.dumps(adversarial_score_dictionary))
    dump_json_to_file(adversarial_score_dictionary, f"{PREFIX}/attack/{model.name}/score.json")