def test_dump_json_to_file():
    obj = {"hello": "world"}
    dump_json_to_file(obj, "/tmp/dump_json_to_file/example.json")
    with open("/tmp/dump_json_to_file/example.json", "r") as f:
        actual = json.loads(f.read())
    assert actual == obj
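# The helper under test is not shown in these examples. A minimal sketch that
# would satisfy this test (object first, path second, parent directory created
# on demand) might look like the following; the signature and behavior here are
# assumptions, not the project's actual implementation.
import json
import os


def dump_json_to_file(obj, path):
    """Hypothetical sketch: write obj as JSON to path, creating parent dirs."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as f:
        json.dump(obj, f)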
def log_article(self, stock_name, search_result, content):
    """
    Log the news article to logs/article.log.json

    :param stock_name: name of the stock which was used as the search query
    :param search_result: a tuple whose first element is the title of the
        search result and whose second element is the url
    :param content: the text content of the article
    """
    title, url = search_result
    data = read_json_file(self.article_log)
    if data.get(stock_name) is None:
        data[stock_name] = dict()
    data[stock_name][url] = dict(title=title, content=content)
    dump_json_to_file(data, self.article_log)
    self.log(
        "Saved content of article {} to article.log.json. ({})".format(
            title, stock_name))
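# For illustration only: a call to log_article might look like this. The
# logger instance, stock name, title, url, and article text below are all
# hypothetical and not taken from the project.
logger.log_article(
    "ACME Corp",
    ("ACME shares rally after earnings", "https://example.com/acme-earnings"),
    "ACME Corp reported quarterly earnings that beat expectations...",
)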
def run_burst_detection(file_name_ds, file_name_ex, window_size_minutes=1, train=True):
    windows = create_windows(file_name_ds, window_size_minutes=window_size_minutes)
    print("Got", len(windows), "time windows from", file_name_ds)
    if len(windows) == 0:
        print("No tweets found, an empty dataset will be created")
        database = {}
        dump_json_to_file(file_name_ds, database)
    try:
        expectation_fd = open(file_name_ex, mode='r')
        expectation = json.load(expectation_fd)
        expectation_fd.close()
    except FileNotFoundError as fnf_error:
        print(str(fnf_error))
        print("No model found, creating an empty one")
        expectation = {}
        dump_json_to_file(file_name_ex, expectation)
    for window in windows:
        out_ids = get_bursty_tweet(window, expectation, train=train)
        print("Number of tweets in window:", len(window))
        print("Number of bursty tweets:", len(out_ids))
        if train:
            dump_json_to_file(file_name_ex, expectation)
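# Note on argument order: this example and several below (run_clustering, the
# tieba crawler, build_dataset_file) pass the file path first and the object
# second, the reverse of the test and article-log examples above. A minimal
# sketch of that path-first variant, assuming a plain json.dump with no
# directory creation, could be:
import json


def dump_json_to_file(filename, data):
    # Assumed path-first variant; simply overwrites filename with the JSON
    # serialization of data.
    with open(filename, mode='w') as fd:
        json.dump(data, fd)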
def clear_article_log(self):
    with open(self.article_log, "w"):
        pass
    dump_json_to_file({}, self.article_log)
def run_clustering(day_name, delta, time_delta, use_burst=False, expectation_file="", use_tfidf=False):
    if use_tfidf:
        s1 = 'tfidf'
        vec = TfidfVectorizer(stop_words='english', tokenizer=tokenize_tweet)
    else:
        s1 = 'we'
        vec = None
    out = s1 + "_" + str(delta) + "_" + str(time_delta) + ".csv"
    expectation_fd = open(expectation_file, mode='r')
    expectation = json.load(expectation_fd)
    expectation_fd.close()
    print("Expectation Model Loaded...")
    database_fd = open(day_name, mode='r')
    database = json.load(database_fd)
    database_fd.close()
    print("Database File Loaded...")
    monthArray = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
    ]
    try:
        for k in database:
            database[k]['id_str'] = str(k)
            database[k]['full_text'] = database[k]['text']
            date = database[k]['created_at'].split(" ")
            if len(date) == 2:
                date = date[0] + " " + date[1]
            else:
                # rebuild "YYYY-M-D HH:MM:SS" from Twitter's created_at format
                date = str(date[5]) + "-" + str(monthArray.index(date[1]) + 1) + \
                    "-" + str(date[2]) + " " + str(date[3])
            database[k]['created_at'] = date
    except:
        print("Dataset Fixed")
    database_2 = database
    try:
        database_2 = {}
        for k in database:
            if database[k]['annotations'] != ['None']:
                database_2[k] = database[k]
    except:
        database_2 = database
    dump_json_to_file(out, database_2)
    windows = create_windows(out)
    active_clusters = []
    inactive_clusters = []
    i = 0
    for window in windows:
        print("--------\nSize of Window: ", len(window))
        if use_burst:
            bursty_ids = get_bursty_tweet(window, expectation, train=False)
            print("Bursty tweets Found: ", len(bursty_ids))
            bursty_tweets = [database[str(ids)] for ids in bursty_ids]
            b_t = [(tweet['created_at'], tweet['id']) for tweet in bursty_tweets]
            b_t.sort()
            bursty_tweets = [
                database[str(tweet_tuple[1])] for tweet_tuple in b_t
            ]
        else:
            bursty_tweets = window
        print("Clustering...")
        clustering_on_window(bursty_tweets, active_clusters, inactive_clusters,
                             delta, time_delta, cosine_similarity, use_tfidf, vec)
        print("Active clusters: " + str(len(active_clusters)))
        print("Inactive clusters: " + str(len(inactive_clusters)))
        i += 1
    helper = {}
    for el in database:
        helper[database[el]['full_text']] = (el, database[el].get('annotations', []))
    out_fs = open(out, mode='w')
    for c in active_clusters:
        textes = c.docs
        cluster_id = c.id
        for text in textes:
            info = helper[text]
            print(info[0], '"' + text + '"', info[1], cluster_id, sep=';', file=out_fs)
    for c in inactive_clusters:
        textes = c.docs
        cluster_id = c.id
        for text in textes:
            info = helper[text]
            print(info[0], '"' + text + '"', info[1], cluster_id, sep=';', file=out_fs)
    out_fs.close()
    ret = print_silhouette(active_clusters, inactive_clusters, use_tfidf, vec)
    print(ret)
game_type_html = get_html_from_url(base_node['link'])
game_item_links = utils.get_href_in_td(game_type_html, 1)
base_node[GAME_TREE_GAME_COLLECTION] = []
for link in game_item_links:
    if len(str(link['text'])) > 1:
        base_node[GAME_TREE_GAME_COLLECTION].append({
            GAME_TREE_GAME_TIEBA_NAME: link['text'],
            GAME_TREE_GAME_TIEBA_LINK: urljoin(START_PAGE, link['href'])
        })
        print("game added: ", link['text'], " type: ", base_node[GAME_TREE_BASE_NAME])
        game_count += 1
print('gametree phase 2 done')
utils.dump_json_to_file(JSON_TEMP_FILE_NAME, game_tree)

# GameTree phase 3: fill in the follower and topic counts for each game's tieba
timestamp_start = time.time()
finished_count = 0
for base_node in game_tree[GAME_TREE_BASE_TYPE]:
    for game_tieba in base_node[GAME_TREE_GAME_COLLECTION]:
        try:
            # Old synchronous crawling code, unbearably slow:
            # if GAME_TREE_GAME_TIEBA_FOLLOWERS in game_tieba:
            #     continue
            # if len(str(game_tieba[GAME_TREE_GAME_TIEBA_NAME])) <= 1:
            #     game_tieba[GAME_TREE_GAME_TIEBA_FOLLOWERS] = 0
            #     game_tieba[GAME_TREE_GAME_TIEBA_TOPICS] = 0
            #     continue
            # game_tieba_detail_url = TIEBA_DETAIL_URL.format(
            #     quote_plus(game_tieba[GAME_TREE_GAME_TIEBA_NAME]))
def build_dataset_file(input_file, log_dir, output_dir, twitter_api):
    # get the file name within its directory
    file_name = input_file.split('/')[-1]
    # read the dataset from output_dir (it should fit in main memory with no problem)
    out_path = join(output_dir, file_name)
    try:
        dataset_fd = open(out_path, mode='r')
        dataset = json.load(dataset_fd)
        dataset_fd.close()
    except FileNotFoundError:
        dataset = dict()
        dump_json_to_file(out_path, dataset)
    # open the log to check which tweet was downloaded last for this file
    log_path = join(log_dir, file_name)
    try:
        log_fd = open(log_path, mode='r')
        last_line_read = json.load(log_fd)
        log_fd.close()
    except FileNotFoundError:
        last_line_read = 0
        dump_json_to_file(log_path, last_line_read)
    current_row = 1
    # read each row from the input file
    with open(input_file) as index_fd:
        for line in index_fd:
            # skip rows until we reach the last position recorded in the log
            if current_row < last_line_read:
                current_row += 1
                continue
            if current_row % 300 == 0:
                print("Dumping File")
                last_line_read = current_row
                # dump dataset and log
                dump_json_to_file(out_path, dataset)
                dump_json_to_file(log_path, last_line_read)
            # the tweet id is the third ':'-separated field of the first tab-separated column
            tweet_id = line.split("\t")[0].split(":")[2]
            # retrieve the tweet
            try:
                retrieved_tweet = twitter_api.get_status(tweet_id, tweet_mode="extended")
                dataset[tweet_id] = retrieved_tweet._json
            except tweepy.RateLimitError as rle:
                print(str(rle))
                print("Exiting, the dataset will be updated")
                break
            except tweepy.TweepError as e:
                code = e.api_code
                if not (code == 34 or code == 63 or code == 144 or code == 179):
                    print("Unexpected error, quitting...")
                    print(str(e))
                    break
                # print(str(e))
                # print("TweetLost:" + str(tweet_id))
            current_row += 1
    # file finished
    last_line_read = current_row
    dump_json_to_file(out_path, dataset)
    dump_json_to_file(log_path, last_line_read)
    print("Quit:" + file_name)
import argparse
import json
import os
import sys

import numpy as np

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from datasets import mnist
from models import fc_100_100_10, pca_filtered_model, train, load_from_file
from attacks import adversarial_example, adversarial_score
from utils import dump_json_to_file

argument_parser = argparse.ArgumentParser()
argument_parser.add_argument('--model', nargs="+", dest="model_paths",
                             help='path to models to attack')
argument_parser.add_argument('--eta', nargs='+', type=float, dest="eta_list",
                             default=np.arange(0, 0.25, 0.01),
                             help='values of eta for generating adv. examples')
args = argument_parser.parse_args()

PREFIX = os.environ.get('PREFIX', '.')

X_train, y_train, X_test, y_test = mnist()

for model_path in args.model_paths:
    model = load_from_file(model_path)
    print(f"Computing adversarial score against {model.name}...", file=sys.stderr)

    adversarial_score_dictionary = {}
    for eta in args.eta_list:
        score = round(adversarial_score(model, X_test, y_test, eta), 3)
        adversarial_score_dictionary[eta] = score

    print(json.dumps(adversarial_score_dictionary))
    dump_json_to_file(adversarial_score_dictionary,
                      f"{PREFIX}/attack/{model.name}/score.json")