Пример #1
0
# Two sample records (north/east values plus a name); passed to
# data_import.dump_errors by the "dump_json" test further down.
test_dict = [
    {"north": 1.22, "east": 2.3, "name": "Lebron James"},
    {"north": 1.35, "east": 3.5, "name": "Kevin Durant"},
]

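# Disabled scratch code: reads the sample JSON file either as a single
# document or as one JSON object per line (skipping blank lines).
#
# test_json_path = "data/5minute.json"
#
# test = json.load(open(test_json_path, encoding="utf8"))
#
# with open(test_json_path, encoding="utf8") as in_json:
#     for line in in_json:
#         if line != "\n":
#             print(json.loads(line))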

def _test_requested(test_name):
    """Return True if ``test_name`` or the catch-all "all" is in RUN_THESE_TESTS."""
    return any(flag in RUN_THESE_TESTS for flag in (test_name, "all"))


# Import tests: each one rebinds twitter_mongo with a different third element
# (presumably the target collection name -- confirm against data_import) and
# imports a single sample file.
if _test_requested("import_csv"):
    twitter_mongo = (test_mongo[0], test_mongo[1], "csv_import")
    data_import.import_files(
        "data/subset.csv",
        mongo_connection=twitter_mongo,
        mongo_address=mongo_address,
    )

if _test_requested("import_json"):
    twitter_mongo = (test_mongo[0], test_mongo[1], "json_import")
    data_import.import_files(
        "data/5minute.json",
        mongo_connection=twitter_mongo,
        mongo_address=mongo_address,
    )

# Dump tests: hand sample data to dump_errors with an output folder built
# from test_output.
if _test_requested("dump_csv"):
    data_import.dump_errors(
        test_list, "test_csv", "hello/test/1.csv", test_output + "csv_dump/"
    )

if _test_requested("dump_json"):
    # NOTE(review): the JSON dump is written to the same "csv_dump/" folder as
    # the CSV dump above -- confirm this is intended and not a copy-paste slip.
    data_import.dump_errors(
        test_dict, "test_j", "hello/test/1.json", test_output + "csv_dump/"
    )

Пример #2
0
                 ("192.168.0.97:28003", "twitter", "address"))

# Destination for the imported tweets (presumably host:port, database,
# collection -- confirm against data_import).
twitter_mongo = ("192.168.0.97:30000", "twitter", "tweets")

# Source CSV files, gathered into a tuple so they can be looped over.
harvested_tweets = "data/input/final_data/Tweets_Apr12_Aug14.csv"
april_tweets = "data/input/final_data/GNIP_April.csv"
aug_oct_tweets = "data/input/final_data/GNIP_August_October.csv"
files = (harvested_tweets, april_tweets, aug_oct_tweets)

# Folder that receives the sliced-up CSV files; slicing allows for parallel
# inserts and address lookups.
output_folder = "data/input/new_data/chunks/"

# Slice each source file in turn, printing a timestamped start message first.
for file_name in files:
    print("Start slicing: %s at: %s" % (file_name, datetime.now()))
    data_import.create_partition_csv(
        file_name,
        output_folder=output_folder,
        num_rows=-1,
        chunk_size=10000,
    )

# Import every file in the output folder. The first argument may also be a
# single file, in which case only that file is imported.
data_import.import_files(
    output_folder,
    mongo_connection=twitter_mongo,
    mongo_address=mongo_address,
)