def clean_multi():
    """Delete the multilingual data from the db.

    Drops the multilingual-parallels collection, then clears the
    ``parallel_ids_multi`` field of every affected segment and resets each
    file's ``available_lang`` list.
    """
    db = get_database()
    db.delete_collection(COLLECTION_PARALLELS_MULTI)
    files_col = db.collection(COLLECTION_FILES)
    segments_col = db.collection(COLLECTION_SEGMENTS)
    for file_doc in files_col.all():
        # Only files that actually carry multilingual data need touching.
        if "available_lang" not in file_doc or len(file_doc["available_lang"]) == 0:
            continue
        for seg_key in file_doc["segment_keys"]:
            segment_doc = segments_col.get(seg_key)
            try:
                if segment_doc:
                    segment_doc["parallel_ids_multi"] = []
                    segments_col.update(segment_doc)
            except (KeyError, AttributeError) as err:
                print(
                    "Could not remove multilingual parallels from segment. Error: ",
                    err)
            except DocumentInsertError as err:
                print(
                    f"Could not remove multilingual segment {seg_key}. Error: ",
                    err)
        file_doc["available_lang"] = []
        files_col.update(file_doc)
def clean_all_lang_db(current_lang):
    """Delete every document belonging to *current_lang* from all collections.

    :param current_lang: language code whose data should be removed
    """
    print("Cleaning data for language", current_lang)
    db = get_database()
    # (collection, name of the field that stores the language) — the deletion
    # order matches the original call sequence.
    targets = (
        (COLLECTION_SEGMENTS, "lang"),
        (COLLECTION_PARALLELS, "src_lang"),
        (COLLECTION_PARALLELS_SORTED_BY_FILE, "lang"),
        (COLLECTION_MENU_CATEGORIES, "language"),
        (COLLECTION_MENU_COLLECTIONS, "language"),
        (COLLECTION_FILES_PARALLEL_COUNT, "language"),
        (COLLECTION_FILES, "language"),
    )
    for collection_name, lang_field in targets:
        db.collection(collection_name).delete_match({lang_field: current_lang})
    print("Cleaning data done.")
def clean_totals_collection_db():
    """Clear the categories_parallel_count collection.

    Dropping and recreating the collection is the fastest way to empty it.
    """
    db = get_database()
    db.delete_collection(COLLECTION_CATEGORIES_PARALLEL_COUNT)
    db.create_collection(COLLECTION_CATEGORIES_PARALLEL_COUNT)
    print("totals collection cleaned.")
def load_menu_files(c):
    """Load menu categories and collections and build their graph.

    :param c: invoke.py context object (unused, required by the task runner)
    """
    print("Loading menu collections...")
    db = get_database()
    load_all_menu_categories(db)
    load_all_menu_collections(db)
    create_collections_categories_graph(db)
    print("Menu data loading completed!")
def clean_search_index_db():
    """
    Clear all the search index views and collections.

    Each collection/view is handled individually so that a single failure
    does not abort the rest of the cleanup.
    """
    db = get_database()
    for name in INDEX_COLLECTION_NAMES:
        try:
            if db.has_collection(name):
                db.delete_collection(name)
        except CollectionDeleteError as e:
            print("Error deleting collection %s: " % name, e)
    for name in INDEX_VIEW_NAMES:
        # Bug fix: a missing view used to raise a ViewDeleteError that the
        # old CollectionDeleteError handler did not catch, crashing the
        # cleanup. ignore_missing makes the deletion idempotent.
        db.delete_view(name, ignore_missing=True)
    clean_analyzers(db)
    print("search index cleaned.")
def load_parallel_counts(source_name: str, target_name: str,
                         total_length_count: list):
    """Store the aggregated parallel counts for one collection pair.

    :param source_name: name of the source collection
    :param target_name: name of the target collection
    :param total_length_count: aggregated counts; nothing is stored if empty
    """
    if not total_length_count:
        return
    db = get_database()
    totals_collection = db.collection(COLLECTION_CATEGORIES_PARALLEL_COUNT)
    entry = {
        "_key": f"{source_name}_{target_name}",
        "sourcecollection": source_name,
        "targetcollection": target_name,
        "totallengthcount": total_length_count,
    }
    try:
        # Non-unique hash index speeds up lookups by source collection.
        totals_collection.add_hash_index(["sourcecollection"], unique=False)
        totals_collection.insert(entry)
    except (DocumentInsertError, IndexCreateError) as err:
        print("Could not load file. Error: ", err)
def clean_all_collections_db():
    """
    Clear all the database collections completely.

    Each collection is deleted in its own try/except so one missing
    collection no longer aborts deletion of the remaining collections and
    the graph (the old single try/except stopped at the first
    CollectionDeleteError, leaving later collections untouched).
    """
    db = get_database()
    for name in COLLECTION_NAMES:
        try:
            db.delete_collection(name)
        except CollectionDeleteError as e:
            print("Error deleting collection %s: " % name, e)
    for name in EDGE_COLLECTION_NAMES:
        try:
            db.delete_collection(name)
        except CollectionDeleteError as e:
            print("Error deleting collection %s: " % name, e)
    try:
        db.delete_graph(GRAPH_COLLECTIONS_CATEGORIES)
    except GraphDeleteError as e:
        print("couldn't remove graph. It probably doesn't exist.", e)
    print("all collections cleaned.")
def load_multilingual_file(filepath):
    """Load one gzipped JSON file of multilingual parallels into the db.

    :param filepath: path to a ``.json.gz`` file containing a list of
        parallel records
    """
    db = get_database()
    multi_collection = db.collection(COLLECTION_PARALLELS_MULTI)
    segments_collection = db.collection(COLLECTION_SEGMENTS)
    print("Loading", filepath)
    with gzip.open(filepath, 'r') as infile:
        parallels = json.load(infile)
    if len(parallels) == 0:
        return
    # The filename is the part of the first root segment id before the colon.
    filename = parallels[0]['root_segnr'][0].split(':')[0]
    update_filename(filename, parallels[0]['tgt_lang'], db)
    for record in parallels:
        record["_key"] = record["id"]
    try:
        multi_collection.insert_many(parallels)
    except (DocumentInsertError, IndexCreateError) as err:
        print(f"Could not save multilingual parallels. Error: ", err)
    add_multi_parallels_to_segments(parallels, segments_collection)
def clean_pali(c):
    """
    Clear all the pali data from the database.
    :param c: invoke.py context object
    """
    # NOTE(review): this body is byte-identical to clean_all_collections_db()
    # and deletes EVERY collection in COLLECTION_NAMES / EDGE_COLLECTION_NAMES
    # plus the collections/categories graph — not just Pali data. The
    # docstring looks copy-pasted; confirm the intended scope before relying
    # on this as a Pali-only cleanup.
    db = get_database()
    # current_name tracks the collection being deleted so the error message
    # can name it if the loop aborts.
    current_name = ""
    try:
        for name in COLLECTION_NAMES:
            current_name = name
            db.delete_collection(name)
        for name in EDGE_COLLECTION_NAMES:
            current_name = name
            db.delete_collection(name)
        db.delete_graph(GRAPH_COLLECTIONS_CATEGORIES)
    except CollectionDeleteError as e:
        # A single failed deletion aborts the remaining deletions.
        print("Error deleting collection %s: " % current_name, e)
    except GraphDeleteError as e:
        print("couldn't remove graph. It probably doesn't exist.", e)
    print("all collections cleaned.")
def create_search_index(
    c,
    index_url_skt=DEFAULT_SOURCE_URL + "/search_index_sanskrit.json.gz",
    index_url_pli=DEFAULT_SOURCE_URL + "/search_index_pali.json.gz",
    index_url_tib=DEFAULT_SOURCE_URL + "/search_index_tibetan.json.gz",
    index_url_chn=DEFAULT_SOURCE_URL + "/search_index_chn.json.gz",
):
    """
    Load index data for search index from path defined in .env.

    :param c: invoke.py context object
    :param index_url_skt: URL of the gzipped Sanskrit index data
    :param index_url_pli: URL of the gzipped Pali index data
    :param index_url_tib: URL of the gzipped Tibetan index data
    :param index_url_chn: URL of the gzipped Chinese index data
    """
    db = get_database()
    create_analyzers(db)
    for collection_name in INDEX_COLLECTION_NAMES:
        db.create_collection(collection_name)
    # Load order: Sanskrit, Pali, Chinese, Tibetan.
    load_search_index_skt(index_url_skt, db)
    load_search_index_pli(index_url_pli, db)
    load_search_index_chn(index_url_chn, db)
    load_search_index_tib(index_url_tib, db)
    create_search_views(db)
    print("Search index data loading completed.")
def clean_segment_collections_db():
    """Clear the segment database collections completely.

    Removes the files/segments and files/parallels graphs, then empties the
    segment-related collections.
    """
    db = get_database()
    segment_collections = (
        COLLECTION_SEGMENTS,
        COLLECTION_PARALLELS,
        COLLECTION_FILES,
        COLLECTION_FILES_PARALLEL_COUNT,
    )
    try:
        db.delete_graph(GRAPH_FILES_SEGMENTS)
        db.delete_graph(GRAPH_FILES_PARALLELS)
        for collection_name in segment_collections:
            empty_collection(collection_name, db)
    except (GraphDeleteError, CollectionDeleteError):
        print(
            f"couldn't remove graph: {GRAPH_FILES_SEGMENTS}. It probably doesn't exist."
        )
    print("segment collections cleaned.")
def create_collections(
    c, collections=COLLECTION_NAMES, edge_collections=EDGE_COLLECTION_NAMES
):
    """
    Create empty collections in database
    :param c: invoke.py context object
    :param collections: Array of collection names to be created
    :param edge_collections: Array of edge collection names to be created
    """
    db = get_database()
    for name in collections:
        try:
            db.create_collection(name)
        except CollectionCreateError as e:
            print(f"Error creating collection {name}: ", e)
    for name in edge_collections:
        try:
            db.create_collection(name, edge=True)
        except CollectionCreateError as e:
            # Fix: include the failing name, consistent with the branch above
            # (the old message did not say which edge collection failed).
            print(f"Error creating edge collection {name}: ", e)
    print(f"created {collections} collections")
def load_segments_and_parallels_data_from_menu_file(menu_file_json,
                                                    lang: str,
                                                    root_url: str) -> None:
    """Fetch one menu file's gzipped data and load segments and parallels.

    :param menu_file_json: menu entry dict; its ``filename`` key selects the
        remote file
    :param lang: language code used to build the remote URL
    :param root_url: base URL of the remote data source
    """
    file_url = f"{root_url}{lang}/{menu_file_json['filename']}.json.gz"
    db = get_database()
    # Guard clause: only gzipped files are processed.
    if not file_url.endswith("gz"):
        print(f"{file_url} is not a gzip file. Ignoring.")
        return
    segments, parallels = get_segments_and_parallels_from_gzipped_remote_file(
        file_url)
    if segments:
        segment_keys, length_counts, file_length_counts = load_segments(
            segments, parallels, db)
        load_files_collection(menu_file_json, segment_keys, lang, db)
        load_file_parallel_counts(menu_file_json, length_counts,
                                  file_length_counts, db)
    if parallels:
        load_parallels(parallels, db)
        load_parallels_sorted(parallels, db, menu_file_json['filename'])
def clean_menu_collections_db():
    """Clear the menu database collections completely.

    Removes the collections/categories graph, then empties the menu document
    collections and their edge collections.
    """
    db = get_database()
    document_collections = (
        COLLECTION_MENU_COLLECTIONS,
        COLLECTION_MENU_CATEGORIES,
        COLLECTION_LANGUAGES,
    )
    edge_collections = (
        EDGE_COLLECTION_LANGUAGE_HAS_COLLECTIONS,
        EDGE_COLLECTION_COLLECTION_HAS_CATEGORIES,
        EDGE_COLLECTION_CATEGORY_HAS_FILES,
    )
    try:
        db.delete_graph(GRAPH_COLLECTIONS_CATEGORIES)
        for collection_name in document_collections:
            empty_collection(collection_name, db)
        for collection_name in edge_collections:
            empty_collection(collection_name, db, edge=True)
    except (GraphDeleteError, CollectionDeleteError):
        print(
            f"couldn't remove object {GRAPH_COLLECTIONS_CATEGORIES}. It probably doesn't exist."
        )
    print("menu data collections cleaned.")
def add_sources(c):
    """Load source information from the default source URL into the db.

    :param c: invoke.py context object (unused, required by the task runner)
    """
    db = get_database()
    print("adding source information")
    load_sources(db, DEFAULT_SOURCE_URL)
def add_indices(c):
    """Create the database indices.

    :param c: invoke.py context object (unused, required by the task runner)
    """
    db = get_database()
    print("Creating Indices")
    create_indices(db)
    print("Creation of indices done.")
def calculate_parallel_totals():
    # This function goes over all the data and groups it into totals for the
    # visual view. This takes some time to run on the full dataset.
    #
    # For every (source collection, target collection) pair within the same
    # language it sums the per-category parallel counts of all files in each
    # source category and stores the result via load_parallel_counts().
    db = get_database()
    collection_query_cursor = db.aql.execute(
        menu_queries.QUERY_CATEGORIES_PER_COLLECTION)
    collections = [doc for doc in collection_query_cursor]
    # for each collection, the totals to each other collection of that same
    # language are calculated
    for col in collections:
        language = col["language"]
        source_collection = col["collection"]
        # Flatten the list of {category: display_name} dicts into one dict.
        source_col_dict = {}
        for source_cat in col["categories"]:
            source_col_dict.update(source_cat)
        language_collection_list = get_collection_list_for_language(
            language, collections)
        for target_collection in language_collection_list:
            # {category_key: display_name} for the target collection.
            selected_category_dict = get_categories_for_language_collection(
                target_collection, collections)
            counted_parallels = []
            for category, cat_name in source_col_dict.items():
                all_files_cursor = db.aql.execute(
                    menu_queries.QUERY_FILES_PER_CATEGORY,
                    batch_size=100000,
                    bind_vars={
                        "category": category,
                        "language": language
                    },
                )
                all_files = [doc for doc in all_files_cursor]
                add_category_totals_to_db(
                    all_files,
                    category,
                    target_collection,
                    selected_category_dict,
                    language,
                )
                # Sum each target category's counts over all files of the
                # current source category; categories absent from a file's
                # counts contribute 0.
                total_par_list = {}
                for filename in all_files:
                    parallel_count = filename["totallengthcount"]
                    for categoryname in selected_category_dict:
                        if categoryname not in total_par_list.keys():
                            if categoryname not in parallel_count.keys():
                                total_par_list[categoryname] = 0
                            else:
                                total_par_list[categoryname] = parallel_count[
                                    categoryname]
                        elif categoryname in parallel_count.keys():
                            total_par_list[categoryname] += parallel_count[
                                categoryname]
                # Rows are [source label, target label, count], labels built
                # from display names plus the raw category key.
                for key, value in total_par_list.items():
                    counted_parallels.append([
                        cat_name + " (" + category + ")",
                        selected_category_dict[key].rstrip() + "_(" + key +
                        ")",
                        value,
                    ])
            # NOTE(review): placed after the category loop — one stored doc
            # per (source, target) pair, matching load_parallel_counts'
            # "source_target" _key; calling it per category would attempt
            # duplicate keys. Confirm against the original formatting.
            load_parallel_counts(source_collection, target_collection,
                                 counted_parallels)