def main(test_args=None):
    """CLI entry point: stitch a structure from a JSON spec and save as CIF.

    Parses ``sys.argv`` (or *test_args* when supplied for testing), runs the
    Stitcher on a ``.json`` spec file, and writes the result next to the
    input (or to ``--out-file``) with a ``.cif`` extension.
    """
    args = parse_args(sys.argv[1:] if test_args is None else test_args)

    # rfind returns -1 when there is no dot at all; the old slice then took
    # just the last character. Treat a dotless name as "no extension".
    dot = args.input_file.rfind('.')
    input_ext = args.input_file[dot:].lower() if dot != -1 else ''

    if input_ext != '.json':
        print('Unknown input file type: \"{}\"'.format(input_ext))
        # Was a bare exit(): report failure to the shell with a status code.
        sys.exit(1)

    spec = utils.read_json(args.input_file)
    xdb = utils.read_json(args.xdb)
    struct = Stitcher(
        spec,
        xdb,
        args.pdb_dir,
        args.cappings_dir,
        args.metadata_dir,
        args.show_fusion,
        args.disable_capping,
        args.skip_unused
    ).run()

    # Default the output path to the input path, then swap the extension.
    if args.out_file == '':
        args.out_file = args.input_file
    args.out_file = '.'.join(args.out_file.split('.')[:-1] + ['cif'])

    print('Saving to:', args.out_file)
    pdb_utils.save_cif(struct=struct, path=args.out_file)
def update_hash_in_partitioned_files(
        data_directory,
        file_name,
        suffix_1='_partition_',
        suffix_2='_updated',
        mapping_file_with_hash='mapping_with_hash.json'):
    """Copy content hashes from the mapping file into each partition file.

    Reads the mapping (key -> {partition_index, id_index, content_hash}),
    regroups it per partition, then rewrites every partition file as
    ``<file_name>_with_hash_partition_<k>.json`` with the hashes attached.

    :param data_directory: directory holding mapping and partition files.
    :param file_name: base name of the partition files.
    :param suffix_1: partition-number infix of the partition file names.
    :param suffix_2: suffix of partitions already rewritten by adjust_greedy.
    :param mapping_file_with_hash: mapping file produced by update_hash().
    """
    mapping_json = utilities.read_json(
        os.path.join(data_directory, mapping_file_with_hash))

    # partition_index -> {id_index -> {'content_hash': ...}}
    dict_per_partition = {}
    empty = 0
    non_empty = 0
    pbar = tqdm(total=len(mapping_json))
    for entry in mapping_json.values():
        # The original code had identical if/else branches here; setdefault
        # expresses the intent (create the partition bucket once) directly.
        partition = dict_per_partition.setdefault(entry['partition_index'], {})
        if entry['content_hash'].strip() == '':
            empty += 1
        else:
            non_empty += 1
        partition[entry['id_index']] = {'content_hash': entry['content_hash']}
        pbar.update(1)
    pbar.close()
    print('empty, non-empty:', empty, non_empty)

    pbar = tqdm(total=len(dict_per_partition))
    for partition_index, id_hashes in dict_per_partition.items():
        # The last partition was never rewritten with the *_updated suffix,
        # so it is read from the original partition file instead.
        if partition_index == len(dict_per_partition):
            partition_path = os.path.join(
                data_directory,
                file_name + suffix_1 + str(partition_index) + '.json')
        else:
            partition_path = os.path.join(
                data_directory,
                file_name + suffix_1 + str(partition_index) + suffix_2 +
                '.json')
        current_json = utilities.read_json(partition_path)
        for id_index, info in id_hashes.items():
            current_json[id_index]['content_hash'] = info['content_hash']
        utilities.write_json(
            os.path.join(
                data_directory,
                file_name + '_with_hash_partition_' + str(partition_index) +
                '.json'),
            current_json)
        pbar.update(1)
    pbar.close()
    return
def collect_core_selection_data(username, password, use_json=False):
    """Collect, fill and filter DeGiro core-selection ETF data.

    Downloads the core-selection set, enriches each symbol with ETF data
    points, writes the full set to ``DeGiro/core_selection_filled.json``
    (or reads it back when *use_json* is True), then keeps only symbols
    whose summary is present and writes those to
    ``DeGiro/core_selection_filtered.json``.

    :return: the filtered ``{symbol: data}`` dictionary.
    """
    data_set = get_core_selection_data(username, password)
    try:
        os.mkdir("DeGiro")
    except FileExistsError:
        # Narrowed from a blanket `except Exception`: only an already
        # existing folder is expected and benign; permission errors etc.
        # now surface instead of being silently misreported.
        print("DeGiro folder already exists")

    if use_json:
        data_set_filled = read_json(r"DeGiro/core_selection_filled.json")
    else:
        print("Filling JSON data_set_filled..")
        data_set_filled = {}
        for symbol in tqdm(data_set):
            data_set_filled[symbol] = fill_data_points_etfs(data_set[symbol])
            data_set_filled[symbol]['ISIN'] = data_set[symbol]['ISIN']
        with open(r"DeGiro/core_selection_filled.json", 'w') as handle:
            sorted_data_set_filled = sort_dictionary(data_set_filled)
            json.dump(sorted_data_set_filled, handle, indent=4)

    # Make legit data set: keep only entries that actually have a summary.
    print("Filling JSON data_set_filtered..")
    data_set_filtered = {}
    for symbol in data_set_filled:
        if data_set_filled[symbol]['summary'] is not None:
            data_set_filtered[symbol] = data_set_filled[symbol]
            data_set_filtered[symbol]['ISIN'] = data_set[symbol]['ISIN']
    with open(r"DeGiro/core_selection_filtered.json", 'w') as handle:
        sorted_data_set_filtered = sort_dictionary(data_set_filtered)
        json.dump(sorted_data_set_filtered, handle, indent=4)
    print("Done!")
    return data_set_filtered
def stitch_images(self, whattostitch):
    """Walk the results tree and stitch every tiled image set found.

    Any directory containing a ``tile_list.json`` is treated as one
    stitching job and handed to ``just_stitch_them``.
    """
    print(colored("Image stitching started: {}".format(whattostitch),
                  color='blue'))
    for folder, _subfolders, files in os.walk(self.path_RESULTS):
        if 'tile_list.json' not in files:
            continue
        tiles = read_json(folder + '/tile_list.json')
        self.just_stitch_them(tiles, folder, whattostitch)
    print(colored("Image stitching completed", color='green'))
def taste(food):
    '''
    The main public interface of the system.
    This function returns a JSON object that contains six taste
    scores, on a scale of 0 to 10: bitter, rich, salt, sour, sweet and umami
    '''
    food = append_parsed(food)
    nutrients = get_nutrients(food)
    tastes = {
        "bitter": round(bitter(food, nutrients), 3),
        "rich": round(rich(nutrients), 3),
        "salt": round(salt(nutrients), 3),
        "sour": round(sour(food, nutrients), 3),
        "sweet": round(sweet(nutrients), 3),
        # BUG FIX: the 3 was previously passed as umami()'s third positional
        # argument (overriding PROTEIN_SUPPLEMENT_MULTIPLIER) instead of as
        # round()'s ndigits, matching the other five scores.
        "umami": round(umami(food, nutrients), 3)
    }
    # Optional per-taste calibration offsets produced by the validator.
    if os.path.exists("adjustment_factors.json"):
        for taste_name, adjustment \
                in utilities.read_json("adjustment_factors.json").items():
            # renamed loop variable: `taste` shadowed this function's name
            tastes[taste_name] -= adjustment
    if not os.path.exists("../Utilities/Team 2"):
        os.mkdir("../Utilities/Team 2")
    # Append this dish's scores (columns sorted by taste name) for analysis.
    with open("../Utilities/Team 2/tastes.csv", "a") as csvfile:
        csvfile.write(",".join(
            [str(food['dish_id'])] +
            [str(round(tastes[key], 3))
             for key in sorted(tastes.keys())]) + "\n")
    return tastes
def bitter(food, nutrition_data, LEVEL1_MULTIPLIER=0.80,
           LEVEL2_MULTIPLIER=1.40, MULTI_WORD_MULTIPLIER=2.3):
    '''
    This function computes the bitter score for the food item.
    This computation is performed with three approaches:
    - The iron content is used a fraction of the total active nutrient
      weight
    - Three groups of descriptors are used, each of which has a different
      weightage towards the final scoring.
    - The bitter words are divided into two levels, each corresponding to
      an particular intensity of bitterness. Each of these levels again
      has a different weightage.
    Finally, a real value from 0-10 is returned as a bitter score for the
    food item.
    '''
    try:
        descriptors = utilities.read_json("bitter_descriptors.json")
        scores = match_descriptors(food['ingredient_str'], descriptors)
        # Iron as a fraction of the total active nutrient weight.
        bitterscore = (nutrition_data["iron"] /
                       total_nutrient_weight(nutrition_data))
        # Each descriptor group contributes with its own weight.
        weights = {
            "bitter_l1": LEVEL1_MULTIPLIER,
            "bitter_l2": LEVEL2_MULTIPLIER,
            "multi_words": MULTI_WORD_MULTIPLIER,
        }
        for label, weight in weights.items():
            if label in scores:
                bitterscore += weight * scores[label]
    except Exception:
        # Best-effort: any missing data yields a zero score.
        bitterscore = 0
    return round(bitterscore / 1.4571, 3)
def sour(food, nutrition_data, SOURNESS_FACTOR_X=0.5,
         SOURNESS_FACTOR_Y=0.15, SOURNESS_FACTOR_Z=0.35):
    '''
    For sourness, an approach similar to how bitter was calculated is
    applied. Here, the nutrient directly affecting the sourness of a food
    item is the vitamin C content. Therefore, this is given the maximum
    weightage of 50% that counts towards the final sour score for the
    dish. The other two metrics are calculated from comparing against an
    internally maintained database of ingredients and keywords, each of
    which is given its own weightage. These are divided into two levels:
    sour and too sour. Naturally, too sour keywords have a higher
    weightage than keywords that are tagged just sour.
    '''
    food_words = food['ingredient_str'].upper().split(' ')
    try:
        vitamin_c = nutrition_data['vitamin_c'] * 1000
    except KeyError:
        vitamin_c = 0.0
    # Keyword databases, keyed by first letter for fast lookup.
    # Locals renamed: the old name `sour` shadowed this function.
    sour_words = utilities.read_json("sour.json")
    too_sour_words = utilities.read_json("too_sour.json")
    try:
        sour_score_x = vitamin_c / nutrition_data['weight']
    except ZeroDivisionError:
        sour_score_x = 0
    sour_score_y = 0
    sour_score_z = 0
    for word in food_words:
        if not word:
            # Repeated spaces produce empty tokens; word[0] would raise.
            continue
        # .get() guards words whose first character has no bucket in the
        # JSON database (digits, punctuation) — previously a KeyError.
        if word in sour_words.get(word[0], ()):
            sour_score_y += 1
        if word in too_sour_words.get(word[0], ()):
            sour_score_z += 1
    sour_score = round(((SOURNESS_FACTOR_X * sour_score_x) +
                        (SOURNESS_FACTOR_Y * sour_score_y) +
                        (SOURNESS_FACTOR_Z * sour_score_z)) / 1.43, 3)
    # Clamp to [0, 1] before scaling to the 0-10 range.
    if sour_score > 1:
        sour_score = 1
    return round(sour_score * 10, 3)
def adjust_greedy(partitioned_data_directory,
                  partitioned_file_name='javascript_partition_',
                  remaining_file_name='remaining.json'):
    """Merge leftover ("remaining") records back into their partition files.

    For every partition listed in the remaining file, copies the record
    metadata fields and appends the extra information entries, then writes
    the partition back out with an ``_updated`` suffix.
    """
    remaining = utilities.read_json(
        os.path.join(partitioned_data_directory, remaining_file_name))
    progress = tqdm(total=len(remaining))
    for partition_key, records in remaining.items():
        partition = utilities.read_json(
            os.path.join(partitioned_data_directory,
                         partitioned_file_name + partition_key + '.json'))
        for record_id, record in records.items():
            # Records without a visit_id carry no data to merge.
            if 'visit_id' not in record:
                continue
            target = partition[record_id]
            for field in ('visit_id', 'top_url', 'script_url',
                          'script_line', 'script_col'):
                target[field] = record[field]
            # Append the leftover information entries to the existing list.
            for item in record['information']:
                target['information'].append(item)
        utilities.write_json(
            os.path.join(
                partitioned_data_directory,
                partitioned_file_name + partition_key + '_updated.json'),
            partition,
            indent_length=0)
        progress.update(1)
    return
def _loadLayoutFile(path: str) -> pd.DataFrame:
    """Load a keyboard layout JSON file into a typed DataFrame.

    An empty CSV template fixes the column order and dtypes (KEY_SPEC);
    the layout's key rows are appended to it, and the keyboard geometry
    is attached as plain attributes on the returned frame.
    """
    import io  # local import: only needed for the JSON buffer below

    df: pd.DataFrame = pd.read_csv(get_resource('empty.csv'),
                                   names=KEY_SPEC.keys(),
                                   dtype=KEY_SPEC)
    assert len(df.columns) == len(KEY_SPEC)
    keyboard_json = read_json(path)
    # Literal JSON strings to read_json are deprecated since pandas 2.1 —
    # hand over a file-like buffer instead.
    layout = pd.read_json(io.StringIO(json.dumps(keyboard_json['keys'])))
    # FIX: DataFrame.append was removed in pandas 2.0; concat is the
    # drop-in replacement with the same default index behaviour.
    df = pd.concat([df, layout])
    assert len(df.columns) == len(KEY_SPEC)
    # NOTE(review): plain attributes are lost when the frame is copied;
    # df.attrs would be sturdier, but callers read these names directly,
    # so the existing contract is kept.
    setattr(df, 'keyboard_left', keyboard_json['left'])
    setattr(df, 'keyboard_top', keyboard_json['top'])
    setattr(df, 'keyboard_width', keyboard_json['width'])
    setattr(df, 'keyboard_height', keyboard_json['height'])
    return df
def request_token(self, credentials_path):
    """Request a fresh auth token and persist it to ``path_token``.

    Side effects: sets ``self.TOKEN`` (with a ``timestamp``) and installs
    the bearer token on the module-level ``headers``. The token is written
    to disk before the status check, mirroring the original best-effort
    behaviour.
    """
    credentials = read_json(credentials_path)
    task = Task(self.loop)
    request_headers = {'Content-Type': 'application/json'}
    response = task.do_the_task(self.TOKEN_REQ_URL, request_headers,
                                json.dumps(credentials))
    self.TOKEN = response[0]
    self.TOKEN['timestamp'] = time.time()
    headers.update(
        {'Authorization': 'Bearer {}'.format(self.TOKEN['id_token'])})
    write_json(self.TOKEN, path_token)
    succeeded = ('status_code' in response[0]
                 and response[0]['status_code'] == 200)
    if succeeded:
        print(
            colored("New token saved in: {}".format(path_token),
                    color='green'))
    else:
        print(
            colored("New token request Failed, status code: {}".format(
                response[0]['status_code']), color='red'))
def check_token(self, TOKEN_path):
    """Validate a stored token against the authorisation endpoint.

    Returns True when the saved token authenticates successfully; on
    success also installs it on ``self.TOKEN`` and the global ``headers``.
    Any failure (missing file, stale token, request error) yields False so
    the caller can request a fresh token.
    """
    valid = False
    try:
        TOKEN = read_json(TOKEN_path)
        authentication = Task(self.loop)
        if token_seems_valid(TOKEN):
            hdrs = copy.deepcopy(headers)
            hdrs.update(
                {'Authorization': 'Bearer {}'.format(TOKEN['id_token'])})
            auth = authentication.do_the_task(
                self.AUTHORISE_URL, hdrs,
                json.dumps({"token": TOKEN['id_token']}))
            if 'status_code' in auth[0] and auth[0]['status_code'] == 200:
                valid = True
                print(
                    colored("Token authentication successful ",
                            color='green'))
                self.TOKEN = TOKEN
                headers.update({
                    'Authorization':
                    'Bearer {}'.format(self.TOKEN['id_token'])
                })
            else:
                print(
                    colored("Token authentication Failed, status code:{}".
                            format(auth[0]['status_code']), color='red'),
                    colored("\nbut, new Token request will be called",
                            color='blue'))
    except Exception:
        # FIX: narrowed from a bare `except:` (which also swallowed
        # SystemExit/KeyboardInterrupt) and dropped the `finally: return`
        # pattern that could mask in-flight exceptions.
        print(colored("Token Not found!", color='red'),
              colored("but, new request will be called", color='blue'))
    return valid
def count_cars(self):
    """Count detected cars across all result tiles and print totals.

    Walks the results tree: every ``detections.geojson`` contributes the
    summed 'count' of its car-classified features; ``nof_maps`` is taken
    from the last directory level that contains a ``mapId_0`` subfolder.
    """
    print(colored("Cars counting started", color='blue'))
    cars = []
    nof_maps = 0
    for dirName, subdirList, fileList in os.walk(self.path_RESULTS):
        if 'mapId_0' in subdirList:
            nof_maps = len(subdirList)
        if 'detections.geojson' in fileList:
            cars_in_tile = 0
            detection = read_json(dirName + '/' + 'detections.geojson')
            for feature in detection['features']:
                props = dict(feature["properties"])
                # Only count features classified as cars.
                if props.get('class') and 'cars' in props.values():
                    cars_in_tile += int(props['count'])
            cars.append(cars_in_tile)
    total = sum(cars)
    print(colored(
        "Total # of cars counted in the desired period: {}".format(total),
        color='magenta', on_color='on_white', attrs=['bold']))
    # FIX: guard the average against division by zero when no mapId_0
    # folder (and hence no map count) was found in the results tree.
    if nof_maps > 0:
        print(colored(
            "counted in {} maps, resulting in {} of cars in average.".format(
                nof_maps, total / nof_maps),
            color='magenta', on_color='on_white', attrs=['bold']))
def umami(food, nutrition_data, PROTEIN_SUPPLEMENT_MULTIPLIER=0.80,
          VEGETABLES_MULTIPLIER=7, MEAT_MULTIPLIER=3, STRING_MULTIPLIER=9.45):
    '''
    Calculation of umami score is similar to that of bitter - The
    presense of iron along with the added effects of different categories
    of keywords, such as savoury vegetables, the savouriness of different
    meats as well as naturally occuring sources of protein, such as
    casein, all have their own weights that determines their contribution
    to the final score.
    '''
    # Normalize missing nutrient values so the arithmetic below is safe.
    # NOTE(review): this mutates the caller's dict in place — confirm
    # callers rely on (or at least tolerate) that side effect.
    for key in nutrition_data:
        if nutrition_data[key] is None:
            nutrition_data[key] = 0
    try:
        # FIX: moved the descriptor-file read inside the try so a missing
        # or corrupt file degrades to a zero score, matching bitter()'s
        # error handling instead of raising.
        umami_descriptors = utilities.read_json("umami_descriptors.json")
        descriptor_score = match_descriptors(food['ingredient_str'],
                                             umami_descriptors)
        # Protein as a fraction of the total active nutrient weight.
        umamiscore = (nutrition_data["protein"] /
                      total_nutrient_weight(nutrition_data))
        pairings = zip([
            PROTEIN_SUPPLEMENT_MULTIPLIER, VEGETABLES_MULTIPLIER,
            MEAT_MULTIPLIER, STRING_MULTIPLIER
        ], ["protein_supps", "vegetables", "meat", "savory_strings"])
        for multiplier, label in pairings:
            if label in descriptor_score:
                umamiscore += multiplier * descriptor_score[label]
    except Exception:
        umamiscore = 0
    # Scores are capped at 10.
    return round(umamiscore, 3) if umamiscore <= 10 else 10
def extract_features(directory_path, result_path, keywords_file,
                     feature_type_to_read):
    """Extract raw features from every JSON file in *directory_path*.

    Writes one ``.txt`` result per input file, mirrored under
    *result_path*. *feature_type_to_read* selects the walker:
    ALL, NO_NAMES, or KEYWORD (which additionally uses *keywords_file*).
    """
    all_files = utilities.get_files_in_a_directory(directory_path)
    keywords_list = [line.strip()
                     for line in utilities.read_file(keywords_file)]
    for f_name in all_files:
        try:
            file_data = utilities.read_json(f_name)
            if feature_type_to_read == ALL:
                raw_features = new_walk(file_data)
            elif feature_type_to_read == NO_NAMES:
                raw_features = new_walk_no_names(file_data)
            elif feature_type_to_read == KEYWORD:
                raw_features = new_walk_reserved_words(file_data,
                                                       keywords_list)
            else:
                # FIX: an unknown feature type previously fell through and
                # raised (a masked) NameError on raw_features; skip instead.
                continue
            # BUG FIX: this previously referenced the undefined name
            # 'result_directory' instead of the 'result_path' parameter,
            # raising NameError that the broad except silently logged.
            utilities.append_list(
                f_name.replace(directory_path,
                               result_path).replace('json', 'txt'),
                raw_features)
        except Exception as e:
            # Best-effort batch processing: log the failing file, move on.
            print(f_name)
            print(str(e))
    return
vector = vectorizer.fit_transform(all_recipes) for index in test_indices: test_dish = training_set[test_indices[index]] neighbors = get_neighbors(test_dish, training_set, vector, similarity_measure) neighbors_cuisines = [(get_cuisine_tags(dish_name[0]), dish_name[1]) for dish_name in neighbors][:7] cuisines_dict[test_dish['dish_name']] = knn(neighbors_cuisines) return cuisines_dict if __name__ == '__main__': if len(sys.argv) == 3 or len(sys.argv) == 4: all_recipes = list() sample_size = 1300 if len(sys.argv) == 4: sample_size = int(sys.argv[3]) all_dishes = read_json(sys.argv[1]) test_dishes = read_json(sys.argv[2]) for dish, value in classify_cuisine(all_dishes[:sample_size], test_dishes, cosine_similarity).items(): print(dish) print("------------") print(value) print() else: print( "python3 cuisine_classifier.py <path_do_dish_database> <path_to_test_dishes> <sample_size>(OPTIONAL)" )
# Session fields that must be present in every input file.
_REQUIRED_FIELDS = [
    'experiment_name', 'experiment_paradigm', 'date', 'neuroid',
    'f_sampling', 'f_low', 'f_high', 'ellip_order', 'threshold_sd',
    'chunks_for_threshold', 'start_time', 'stop_time', 'spikes',
    'baseline', 'n_trials', 'n_channels', 'stim_on_time', 'stim_off_time',
    'stim_on_delay', 'inter_trial_interval', 'stim_size',
    'fixation_point_size', 'fixation_window_size',
]

# Fields whose values must be identical across all sessions to merge them.
_MATCHING_FIELDS = [
    'experiment_name', 'experiment_paradigm', 'f_sampling', 'f_low',
    'f_high', 'ellip_order', 'threshold_sd', 'chunks_for_threshold',
    'start_time', 'stop_time', 'n_channels', 'stim_on_time',
    'stim_off_time', 'stim_on_delay', 'inter_trial_interval', 'stim_size',
    'fixation_point_size', 'fixation_window_size',
]


def _validate_session(session_data):
    """Assert that one session dict has every field the merge relies on."""
    # TODO: Maybe make a function to do this, since it's used often?
    for field in _REQUIRED_FIELDS:
        assert field in session_data
    assert 'animal' in session_data['neuroid']
    for field in ('spikes', 'n_grey', 'n_other'):
        assert field in session_data['baseline']


def _check_sessions_match(data):
    """Assert that the fields which must agree do agree across sessions."""
    first = data[0]
    for field in _MATCHING_FIELDS:
        assert all(sd[field] == first[field] for sd in data)
    assert all(sd['neuroid']['animal'] == first['neuroid']['animal']
               for sd in data)
    for field in ('n_grey', 'n_other'):
        assert all(sd['baseline'][field] == first['baseline'][field]
                   for sd in data)


def _merge_trials(data, item_dict_getter):
    """Concatenate per-trial dicts from all sessions, renumbering 1..N.

    *item_dict_getter* maps one session dict to the {trial: trial_data}
    dict that should be merged.
    """
    merged = {}
    trial_counter = 0
    for session_data in data:
        for trial_data in item_dict_getter(session_data).values():
            trial_counter += 1
            merged[trial_counter] = trial_data
    return merged


def main(files):
    """Merge several single-session recording JSON files into data.json.

    Validates that the sessions are compatible, concatenates their trials
    (spikes and baseline spikes), merges dates / trial counts, and adds a
    grouping_idx field mapping each recording date to its trial indexes.
    """
    # We're done if there's only one file.
    if len(files) == 1:
        return

    # Check if files exist.
    for file in files:
        assert os.path.isfile(file)

    data = []
    for i, file in enumerate(files):
        # Load files and check necessary fields are present in all of them.
        # NOTE(review): validation via assert is stripped under `python -O`.
        data.append(read_json(file))
        _validate_session(data[i])

    # Check if necessary field values match in all files.
    # TODO: Check if neuroid ids match too?
    _check_sessions_match(data)

    # Populate the new dictionary which will contain all the merged data.
    concatenated_data = dict()
    for key, value in data[0].items():
        if key in ['spikes', 'baseline', 'trial_times', 'date', 'n_trials']:
            continue
        concatenated_data[key] = value

    # Merge dates on which the experiment was run.
    # FIX: dict.fromkeys dedupes while preserving first-seen order; the old
    # list(set(...)) made the date string's order nondeterministic and
    # potentially out of step with grouping_idx computed below.
    dates = list(dict.fromkeys(sd['date'] for sd in data))
    concatenated_data['date'] = ', '.join(dates)

    # Merge the number of trials.
    concatenated_data['n_trials'] = sum(sd['n_trials'] for sd in data)

    # Create a grouping_idx field so it is easy to identify which trials
    # were run on which days for normalization purposes.
    grouping_dates = []  # Temporary list of dates used to compute groupings
    grouping = []  # n_trials per date (combines sessions run on same day)
    for session_data in data:
        if session_data['date'] in grouping_dates:
            grouping[grouping_dates.index(session_data['date'])] += \
                session_data['n_trials']
        else:
            grouping_dates.append(session_data['date'])
            grouping.append(session_data['n_trials'])

    # We update the grouping variable so now it stores a list of indexes
    # that can be used to select appropriate coordinates from the trial
    # dimension of a PSTH XArray.
    for i, group in enumerate(grouping):
        group = list(range(group))
        if i != 0:
            # Offset past the last index of the previous date's block.
            group = [idx + grouping[i - 1][-1] + 1 for idx in group]
        grouping[i] = group  # FIX: removed leftover debug print(group)

    assert len(grouping) == len(concatenated_data['date'].split(','))
    concatenated_data['grouping_idx'] = grouping

    # Merge spikes.
    # TODO: a more efficient way?
    concatenated_data['spikes'] = {}
    for channel in data[0]['spikes']:
        concatenated_data['spikes'][channel] = {}
        for item in data[0]['spikes'][channel]:
            concatenated_data['spikes'][channel][item] = _merge_trials(
                data, lambda sd: sd['spikes'][channel][item])

    # Merge baseline.
    concatenated_data['baseline'] = {
        'n_grey': data[0]['baseline']['n_grey'],
        'n_other': data[0]['baseline']['n_other'],
        'spikes': {},
    }
    for channel in data[0]['baseline']['spikes']:
        concatenated_data['baseline']['spikes'][channel] = {}
        for item in data[0]['baseline']['spikes'][channel]:
            concatenated_data['baseline']['spikes'][channel][item] = \
                _merge_trials(
                    data,
                    lambda sd: sd['baseline']['spikes'][channel][item])

    # Store data.
    # TODO: Store in a braintree directory?
    with open('data.json', 'w') as f:
        json.dump(concatenated_data, f, indent=4)
    return
def convert_to_json(db_addr, data_directory, file_name, partition_size=10):
    """Stream the sqlite ``javascript`` table into partitioned JSON files.

    Rows are grouped under a composite key ``visit_id|top_level_url|
    script_url``. Each unique key is assigned an integer ``id_index`` and
    lives in exactly one partition file
    (``<file_name>_partition_<k>.json``) of roughly *partition_size*
    unique keys. Only one partition is held in memory (``js_data``); when
    a row belongs to a key stored in a different partition, the current
    one is flushed to disk and the required one is loaded back. Finally a
    ``mapping.json`` (key -> {id_index, partition_index}) is written.

    :param db_addr: path to the sqlite database.
    :param data_directory: output directory for partition/mapping files.
    :param file_name: base name used for the partition files.
    :param partition_size: number of unique keys per partition file.
    """
    con = sqlite3.connect(db_addr)
    # Row factory enables name-based access (used for 'max_id' below).
    con.row_factory = sqlite3.Row
    cur = con.cursor()
    # MAX(id) only sizes the progress bar; ids may be sparse.
    cur.execute("SELECT MAX(id) as max_id FROM javascript")
    total_rows = cur.fetchone()['max_id']
    pbar = tqdm(total=total_rows)
    cur.execute(
        'SELECT visit_id, script_url, top_level_url, symbol, arguments, value, script_line, script_col FROM javascript'
    )
    js_data = {}  # contents of the partition currently held in memory
    # unique_scripts_keymap = {}
    id_index = 0  # running counter of unique composite keys seen so far
    # key_counter = 0
    id_key_map = {}  # composite key -> {'id_index', 'partition_index'}
    # unique_scripts = []
    current_partition_index = 1  # partition whose data is in js_data
    required_partition_index = 1  # partition an already-seen key lives in
    next_partition_index = 1  # index of the partition being filled
    next_check = True  # True while the filling partition awaits its flush
    for row in cur:
        # Composite identity: visit_id | top_level_url | script_url.
        str_id = str(row[0]) + '|' + row[2] + '|' + row[1]
        if str_id not in id_key_map:
            # New key: it must be appended to the partition currently
            # being filled, so swap it in if something else is loaded.
            if next_partition_index != current_partition_index:
                # check if the file exists read from there, otherwise
                # create a new file/object
                utilities.write_json(os.path.join(
                    data_directory, file_name + '_partition_' +
                    str(current_partition_index) + '.json'),
                                     js_data,
                                     indent_length=0)
                if os.path.exists(
                        os.path.join(
                            data_directory, file_name + '_partition_' +
                            str(next_partition_index) + '.json')):
                    js_data = utilities.read_json(
                        os.path.join(
                            data_directory, file_name + '_partition_' +
                            str(next_partition_index) + '.json'))
                    print(
                        '39: Writing:',
                        file_name + '_partition_' +
                        str(current_partition_index), 'Reading:',
                        file_name + '_partition_' +
                        str(next_partition_index), 'Next Partition: ',
                        next_partition_index)
                else:
                    print(
                        '39: Writing:',
                        file_name + '_partition_' +
                        str(current_partition_index), 'Not Reading:',
                        file_name + '_partition_' +
                        str(next_partition_index), 'Next Partition: ',
                        next_partition_index)
                current_partition_index = next_partition_index
                next_check = True
            id_index += 1
            str_id_index = str(id_index)
            id_key_map[str_id] = {}
            id_key_map[str_id]['id_index'] = str_id_index
            id_key_map[str_id]['partition_index'] = current_partition_index
            js_data[str_id_index] = {}
            js_data[str_id_index]['information'] = []
        else:
            # Known key: load the partition it was assigned to, if that is
            # not the one currently in memory.
            str_id_index = id_key_map[str_id]['id_index']
            required_partition_index = id_key_map[str_id]['partition_index']
            if required_partition_index != current_partition_index:
                utilities.write_json(os.path.join(
                    data_directory, file_name + '_partition_' +
                    str(current_partition_index) + '.json'),
                                     js_data,
                                     indent_length=0)
                js_data = utilities.read_json(
                    os.path.join(
                        data_directory, file_name + '_partition_' +
                        str(required_partition_index) + '.json'))
                print(
                    '58: Writing:',
                    file_name + '_partition_' +
                    str(current_partition_index), 'Reading:',
                    file_name + '_partition_' +
                    str(required_partition_index), 'Next Partition: ',
                    next_partition_index)
                current_partition_index = required_partition_index
        # make sure we have the right json object for the partition
        js_data[str_id_index]['visit_id'] = row[0]
        js_data[str_id_index]['top_url'] = row[2]
        js_data[str_id_index]['script_url'] = row[1]
        js_data[str_id_index]['script_line'] = row[6]
        js_data[str_id_index]['script_col'] = row[7]
        js_data[str_id_index]['information'].append({
            'symbol': row[3],
            'argument': row[4],
            'value': row[5]
        })
        if id_index % partition_size == 0:
            # Because you may not update id_index and might still read
            # rows before the partition.
            # if not os.path.exists(os.path.join(data_directory, file_name + '_partition_' + str(next_partition_index) + '.json')):
            if next_check:
                # The filling partition reached its size: flush it once
                # and start a fresh in-memory partition.
                utilities.write_json(os.path.join(
                    data_directory, file_name + '_partition_' +
                    str(next_partition_index) + '.json'),
                                     js_data,
                                     indent_length=0)
                next_partition_index += 1
                print('76: Writing:',
                      file_name + '_partition_' +
                      str(current_partition_index), 'Next Partition: ',
                      next_partition_index)
                current_partition_index = next_partition_index
                required_partition_index = next_partition_index
                js_data = {}
                next_check = False
        pbar.update(1)
    utilities.write_json(os.path.join(data_directory, 'mapping.json'),
                         id_key_map,
                         indent_length=4)
    return
i.strip() for i in ingredient.lower().split(' ') if i not in adjectives ] measurement = str() prev_word = str() for word in ingredient_tokens: if len(difflib.get_close_matches(word, measurements, cutoff=0.9)) > 0: measurement = prev_word + ' ' + word prev_word = word if len(measurement) == 0: measurement = re.match(r'\d+(.\d+)?', ingredient) if measurement is not None: measurement = measurement.group(0) else: measurement = str() ingredient_tokens = [i.strip() for i in ingredient_tokens] ingredient = ' '.join(ingredient_tokens).strip() ingredient = ingredient.replace(measurement, '') ingredient = rejector.process(ingredient) ingredient_dict['measurement'] = measurement ingredient_dict['ingredient'] = rejector.process(ingredient) return ingredient_dict if __name__ == '__main__': for file in sys.argv[1:]: for item in utilities.read_json(file): print(json.dumps(parse_recipe(item), indent=" "))
def update_hash(db_addr, partitioned_data_directory,
                mapping_file_name='mapping.json'):
    """Attach http_responses content hashes to the script mapping.

    Builds two auxiliary lookups from the original mapping keys
    (``visit_id|top_url|script_url``):

    * ``visit_id|script_url`` — exact match against a response row
    * ``script_url`` — URL-only fallback match

    fills their ``content_hash`` from the ``http_responses`` table, then
    writes ``mapping_updated.json``, ``url_only_mapping_updated.json``
    and ``mapping_with_hash.json`` into the partitioned data directory.
    """
    mapping_json = utilities.read_json(
        os.path.join(partitioned_data_directory, mapping_file_name))
    con = sqlite3.connect(db_addr)
    con.row_factory = sqlite3.Row
    cur = con.cursor()
    # MAX(id) only sizes the progress bar; ids may be sparse.
    cur.execute("SELECT MAX(id) as max_id FROM http_responses")
    total_rows = cur.fetchone()['max_id']
    pbar = tqdm(total=total_rows)
    cur.execute('SELECT visit_id, url, content_hash FROM http_responses')
    updated_mapping_object = {}
    url_updated_mapping_object = {}
    total_keys_matched = 0
    for key in mapping_json:
        # Original keys look like 'visit_id|top_url|script_url'.
        parts = key.split('|', 2)
        new_key = parts[0].strip() + '|' + parts[2].strip()
        updated_mapping_object[new_key] = {
            'old_key': key,
            'id_index': mapping_json[key]['id_index'],
            'partition_index': mapping_json[key]['partition_index'],
            'content_hash': '',
        }
        mapping_json[key]['new_id_url_key'] = new_key
        url_key = parts[2].strip()
        url_updated_mapping_object[url_key] = {
            'old_key': key,
            'id_index': mapping_json[key]['id_index'],
            'partition_index': mapping_json[key]['partition_index'],
            'content_hash': '',
        }
        mapping_json[key]['new_url_key'] = url_key
    for row in cur:
        # FIX: identity comparison (`is not None`) instead of `!= None`;
        # also removed a block of commented-out dead code that followed.
        if row[2] != '' and row[2] is not None:
            db_row_key = str(row[0]) + '|' + row[1].strip()
            if db_row_key in updated_mapping_object:
                updated_mapping_object[db_row_key]['content_hash'] = \
                    row[2].strip()
                total_keys_matched += 1
            if row[1].strip() != '' and \
                    row[1].strip() in url_updated_mapping_object:
                url_updated_mapping_object[
                    row[1].strip()]['content_hash'] = row[2].strip()
                total_keys_matched += 1
        pbar.update(1)
    pbar.close()
    print('Total keys matched:', total_keys_matched)
    utilities.write_json(os.path.join(partitioned_data_directory,
                                      'mapping_updated.json'),
                         updated_mapping_object,
                         indent_length=0)
    utilities.write_json(os.path.join(partitioned_data_directory,
                                      'url_only_mapping_updated.json'),
                         url_updated_mapping_object,
                         indent_length=0)
    # Prefer the exact visit_id|url match; fall back to the URL-only one.
    for key in mapping_json:
        mapping_json[key]['content_hash'] = updated_mapping_object[
            mapping_json[key]['new_id_url_key']]['content_hash']
        if mapping_json[key]['content_hash'] == '':
            mapping_json[key]['content_hash'] = url_updated_mapping_object[
                mapping_json[key]['new_url_key']]['content_hash']
    utilities.write_json(os.path.join(partitioned_data_directory,
                                      'mapping_with_hash.json'),
                         mapping_json,
                         indent_length=0)
    return
ha, distance_ax = "right", 1 ax.text(angle_rad, 10 + distance_ax, cat[i], size=10, horizontalalignment=ha, verticalalignment="center") # Show polar plot plt.show() if arguments.profile: for dishfile in arguments.profile: for dish in utilities.read_json(dishfile): print("\n" + dish["dish_name"] + "\n" + "=" * len(dish["dish_name"])) data = taster.taste(dish) print(json.dumps(data, sort_keys=True, indent=" ")) show_graph(data) if arguments.validate: from validator import Validator adjustment = dict() if os.path.exists("adjustment_factors.json"): adjustment = utilities.read_json("adjustment_factors.json") for vjob in arguments.validate: gendata = json.load(open(vjob[1])) survdata = json.load(open(vjob[2])) vobj = Validator(gendata, survdata)