def main(start_date):
    """Called by the script to do the processing.

    Loads the database name, the metadata table name and the NYT API key
    from the JSON settings files under ``$CAPSTONE_DIR/settings``, opens a
    MongoDB connection and hands the metadata table to ``paginate_by_date``.

    Args:
        start_date: start date specified at runtime; forwarded unchanged to
            ``paginate_by_date`` as ``start_date``.
    """
    # set the location of the settings files
    cap_dir = os.getenv("CAPSTONE_DIR")
    api_settings = os.path.join(cap_dir, 'settings', 'api_settings.json')
    project_settings = os.path.join(cap_dir, 'settings', 'project_settings.json')

    # set some settings
    database = load_setting(project_settings, 'db_name')  # name of mongodb database
    metadata_tab = load_setting(project_settings, "nyt_metadata_table")
    nyt_api_key = load_setting(api_settings, 'NYT_API_KEY')  # NYTimes API Key

    # open the connection to the database
    client = MongoClient()
    try:
        # get the table
        db = client[database]
        table = db[metadata_tab]

        # paginate through the search results, starting at the start date
        # specified at runtime and a 2000 day initial window
        paginate_by_date(nyt_api_key, table,
                         start_date=start_date, init_window=2000)
    finally:
        # close the connection even when pagination raises, so a failed run
        # does not leave the client open
        client.close()
def main(fileout):
    """Write the ``web_url`` of every stored metadata document to *fileout*.

    The database name comes from ``project_settings.json`` under
    ``$CAPSTONE_DIR/settings``; the documents are read from the
    ``nyt-recipe-metadata`` collection. One URL is written per line.

    Args:
        fileout: path of the text file to create (opened with mode ``'w'``,
            so an existing file is overwritten).

    Raises:
        KeyError: if a document has no ``web_url`` field.
    """
    # set the location of the settings file
    settings_file = os.path.join(os.getenv("CAPSTONE_DIR"),
                                 'settings/project_settings.json')

    # set some settings
    database = load_setting(settings_file, 'db_name')  # name of mongodb database

    # open the connection to the database
    client = MongoClient()
    try:
        # get the table
        db = client[database]
        table = db['nyt-recipe-metadata']

        # get all results in the table
        results = table.find()

        with open(fileout, 'w') as f:
            for result in results:
                f.write(result['web_url'] + '\n')
    finally:
        # the original never released the connection; close it on every path
        client.close()
def main(fileout):
    """Write the ``web_url`` of every stored metadata document to *fileout*.

    The database name comes from ``project_settings.json`` under
    ``$CAPSTONE_DIR/settings``; the documents are read from the
    ``nyt-recipe-metadata`` collection. One URL is written per line.

    Args:
        fileout: path of the text file to create (opened with mode ``'w'``,
            so an existing file is overwritten).

    Raises:
        KeyError: if a document has no ``web_url`` field.
    """
    # set the location of the settings file
    settings_file = os.path.join(os.getenv("CAPSTONE_DIR"),
                                 'settings/project_settings.json')

    # set some settings
    database = load_setting(settings_file, 'db_name')  # name of mongodb database

    # open the connection to the database
    client = MongoClient()
    try:
        # get the table
        db = client[database]
        table = db['nyt-recipe-metadata']

        # get all results in the table
        results = table.find()

        with open(fileout, 'w') as f:
            for result in results:
                f.write(result['web_url'] + '\n')
    finally:
        # the original never released the connection; close it on every path
        client.close()


if __name__ == "__main__":
    cap_dir = os.getenv("CAPSTONE_DIR")

    # make sure the output directory exists before main() opens the file
    data_dir = os.path.join(cap_dir, 'data')
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)

    # the output file name is configured under 'nyt_url_file'
    file_out = load_setting(
        os.path.join(cap_dir, 'settings', 'project_settings.json'),
        'nyt_url_file')
    outfile = os.path.join(data_dir, file_out)

    main(outfile)
# NOTE(review): the call and the commented-out calls directly below are the
# tail of a function whose header lies above this chunk. They are carried over
# unchanged; restore their original indentation when merging into the file.
pickler(features, feature_pickle)
# pickler(wc, bag_pickle)
# pickler(vocab, vocab_pickle)
# pickler(components, comp_pickle)


if __name__ == "__main__":
    # set the location of the settings file
    cap_dir = os.getenv("CAPSTONE_DIR")
    settings_file = os.path.join(cap_dir, 'settings', 'project_settings.json')

    # load in the tables to use
    database = load_setting(settings_file, 'db_name')  # name of mongodb database
    table = load_setting(settings_file, 'nyt_recipe_extracted')

    # name the pickles
    # bag_pickle = os.path.join(cap_dir, 'data/pickles/bag_of_ingredients.pkl')
    # vocab_pickle = os.path.join(cap_dir, 'data/pickles/vocabulary.pkl')
    # comp_pickle = os.path.join(cap_dir, 'data/pickles/recipe_components.pkl')
    # feature_pickle is read as a module-level name by the pickler() call
    # above, so it has to be bound before main() runs
    feature_pickle = os.path.join(cap_dir, 'data/pickles/features.pkl')

    # connect to the client and database/collection
    client = MongoClient()
    try:
        db = client[database]
        tab = db[table]

        main(tab)
    finally:
        # the original never released the connection; close it on every path
        client.close()
# Module-level setup for the Flask application: settings, API keys, the
# pickled model artefacts and the MongoDB collection that stores feedback.
app = Flask(__name__)
# NOTE(review): debug mode is switched on unconditionally; confirm this is
# intended for every environment the app runs in.
app.debug = True

# capstone directory: taken from the environment, with a fixed fallback path
cap_dir = os.getenv("CAPSTONE_DIR", '/home/ubuntu/gschool-capstone')

# settings files live in <cap_dir>/settings
_settings_dir = os.path.join(cap_dir, 'settings')
settings_file = os.path.join(_settings_dir, 'project_settings.json')
api_file = os.path.join(_settings_dir, 'api_settings.json')

# pickles live in <cap_dir>/data/pickles
pickle_path = os.path.join(cap_dir, 'data', 'pickles')

# name of the mongodb database
database = load_setting(settings_file, item='db_name')

# google captcha credentials
google_captcha_key = load_setting(api_file, item='GOOGLE_CAPTCHA')
google_site_key = load_setting(api_file, item="GOOGLE_SITEKEY")

# NOTE(review): pickle.load executes whatever the file encodes; these files
# must only ever come from a trusted source.
_features_file = os.path.join(pickle_path, 'features.pkl')
with open(_features_file, 'rb') as f:
    features = pickle.load(f)

_topics_file = os.path.join(pickle_path, 'topics.pkl')
with open(_topics_file, 'rb') as f:
    topics = pickle.load(f)

# one client for the lifetime of the module; 'feedback' is the collection used
client = MongoClient()
db = client[database]
tab = db['feedback']
if verbose: print('already have recipe') # status message for output if verbose: print("Processed {} out of {} recipes\n".format(i+1, n_urls)) # take a longer break every 100 recipes if i % 100 == 0: if verbose: print("Taking a break...\n") time.sleep(60) if __name__ == "__main__": # set the location of the settings file cap_dir = os.getenv("CAPSTONE_DIR") settings_file = os.path.join(cap_dir, 'settings', 'project_settings.json') database = load_setting(settings_file, 'db_name') # name of mongodb database recipe_tab = load_setting(settings_file, 'nyt_recipe_html') url_filename = load_setting(settings_file, 'nyt_url_file') # get a list of the urls to work on url_file = os.path.join(cap_dir, 'data', url_filename) with open(url_file, 'r') as f: urls = [line for line in f] # connect to the client and database/collection client = MongoClient() db = client[database] tab = db[recipe_tab] # get all the recipes from the url list
def main(i, o):
    """Run ``parse_and_insert`` on every document of *i* and report the tally.

    Args:
        i: input collection; only ``find()`` is called on it.
        o: output collection; forwarded unchanged to ``parse_and_insert``.

    The return value of ``parse_and_insert`` is added to the success counter,
    so it is presumably 1/True on success and 0/False otherwise (verify
    against its definition).
    """
    success = 0
    # Count from 1 and pre-bind the total: the original printed the last
    # zero-based index (one short of the real total) and crashed on an empty
    # collection because the loop variable was never assigned.
    total = 0
    for total, recipe in enumerate(i.find(), 1):
        success += parse_and_insert(recipe, o)

    print("FINISHED. Successfully parsed {} out of {} recipes".format(success, total))


if __name__ == "__main__":
    # set the location of the settings file
    cap_dir = os.getenv("CAPSTONE_DIR")
    settings_file = os.path.join(cap_dir, "settings", "project_settings.json")

    database = load_setting(settings_file, "db_name")  # name of mongodb database
    in_table = load_setting(settings_file, "nyt_recipe_html")
    out_table = load_setting(settings_file, "nyt_recipe_extracted")

    # connect to the client and database/collection
    client = MongoClient()
    try:
        db = client[database]
        itab = db[in_table]
        otab = db[out_table]

        # empty the output collection before parsing
        # NOTE(review): Collection.remove() is deprecated since PyMongo 3.0
        # and removed in 4.0; switch to delete_many({}) once the installed
        # PyMongo version is confirmed.
        otab.remove({})

        main(itab, otab)
    finally:
        # the original never released the connection; close it on every path
        client.close()