def add_collection(self, row_ser):
    ''' Add the collection name(s) to the list for each object '''
    new_collection_field = []
    ids = row_ser.loc['jade_collection']
    if not isinstance(ids, list):
        ids = [ids]
    try:
        for coll_id in ids:
            matches = self.collection_df.at[coll_id, 'collection_name']
            if isinstance(matches, np.ndarray):
                match_list = matches.tolist()
            elif isinstance(matches, str):
                match_list = [matches]
            else:
                print("Unrecognized type of collection", type(matches))
                match_list = []
            for name in match_list:
                if name not in new_collection_field:
                    new_collection_field.append(name)
        row_ser['jade_collection'] = new_collection_field
        return row_ser
    except Exception:
        # Collection id not found (or otherwise unusable); leave the row unchanged
        return row_ser
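# Usage sketch (illustrative, not from the original file): add_collection takes a
# single row as a Series and returns it, so it is meant to be mapped over the
# objects DataFrame row by row, assuming self.objects holds one object per row
# with a 'jade_collection' column:
#     self.objects = self.objects.apply(self.add_collection, axis=1)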
def quantify_properties(self):
    ''' Run counts of properties '''
    # Iterate through properties identified for faceting
    props = list(DATASET_OPTIONS['SUBSET_PROPERTIES_AND_QUANTITIES'].items())
    prop_iter = tqdm(props)
    prop_iter.set_description("Quantifying subsets by facet")
    for prop, lim in prop_iter:
        if prop in self.objects.columns.values:
            # Special case: dates are reduced to a four-digit year
            if prop in ['dcterms_date']:
                dc_dates_ser = self.objects[prop]
                dc_dates_ser = dc_dates_ser.apply(unwrap_list)
                dc_dates_ser = dc_dates_ser.dropna()
                for obj_id in dc_dates_ser.index.values:
                    try:
                        date_val = dc_dates_ser[obj_id]
                        if not isinstance(date_val, list):
                            date_list = [date_val]
                        else:
                            date_list = date_val
                        for date_string in date_list:
                            if not isinstance(date_string, str):
                                date_string = str(date_string)
                            yearlike = date_string.split('-')[0]
                            if (len(yearlike) == 4) and (int(yearlike[0]) == 1) and (yearlike[3] in '0123456789'):
                                dc_dates_ser[obj_id] = yearlike
                            else:
                                print('Dropped unrecognized date value:', obj_id, date_string)
                                dc_dates_ser = dc_dates_ser.drop(obj_id, errors='ignore')
                    except Exception:
                        print('Dropped unrecognized date value:', obj_id)
                        dc_dates_ser = dc_dates_ser.drop(obj_id, errors='ignore')
                if len(dc_dates_ser) > 1:
                    self.add_to_quant(dc_dates_ser, sort_on_property_name=False)
            # All others / standard structure
            else:
                ser = self.objects[prop]
                ser = ser.dropna()
                if len(ser) > 1:
                    self.add_to_quant(ser)
def quantify_relations(self):
    '''
    Make a list of unique relation triples and a table of the most common subject–object pairs
    '''
    # Iterate through relations in the Dataset
    uniq_rels = {}
    count_df_index = []
    count_df_columns = []
    obj_iter = tqdm(self.objects.index.values)
    obj_iter.set_description("Counting unique relations")
    for subjId in obj_iter:
        row = self.objects.loc[subjId]
        row_rels_dict = row.loc['jade_relation']
        if not pd.isnull(row_rels_dict):
            for relLabel, objIdList in row_rels_dict.items():
                for objId in objIdList:
                    # Find the types of each subject and object
                    subjType = subjId.split('_')[0].capitalize()
                    objType = objId.split('_')[0].capitalize()
                    # Count the unique combinations of subject, relation, and object
                    rel = " ".join([subjType, relLabel, objType])
                    if rel not in uniq_rels:
                        uniq_rels[rel] = 1
                    else:
                        uniq_rels[rel] += 1
                    # Make the dimensions for a dataframe
                    if subjType not in count_df_index:
                        count_df_index.append(subjType)
                    if objType not in count_df_columns:
                        count_df_columns.append(objType)
    # Sort and output simple list
    self.quant["unique_relation_list"] = pd.DataFrame.from_dict(
        dict(sort_by_item_counts(uniq_rels)), orient='index')
    # Make the dataframe
    count_df = pd.DataFrame(data=0, index=count_df_index, columns=count_df_columns)
    for rel in list(uniq_rels.keys()):
        count = uniq_rels[rel]
        try:
            subjType, relLabel, objType = rel.split(' ')
            count_df.at[subjType, objType] += count
        except ValueError:
            # Relation labels containing spaces cannot be unpacked into three parts
            print("Error counting relation:", rel)
    self.quant["unique_relation_table"] = count_df
def identify_format(form_string):
    ''' Identify a book format from a free-text format/description string '''
    formats = {
        "paper": ['paperback', 'pbk', 'soft', 'paper : alk. paper'],
        "hardcover": ['hard', 'cloth', 'hb'],
        "ebook": ['ebook', 'e-book', 'electronic', 'computer', 'online', 'remote']
    }
    returnable = 'unknown'
    for fmat in list(formats.keys()):
        for desc in formats[fmat]:
            if desc in form_string.lower():
                if returnable != 'unknown' and returnable != fmat:
                    print(f'Two different formats recognized: {returnable} and {fmat} in {form_string.lower()}')
                returnable = fmat
    return returnable
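# Illustrative checks of identify_format (the example strings are hypothetical):
# descriptor substrings are matched case-insensitively, and when two formats are
# recognized in the same string the later one in the formats dict wins.
#     identify_format('Paperback : alk. paper')   # -> 'paper'
#     identify_format('electronic resource')      # -> 'ebook'
#     identify_format('lithograph')               # -> 'unknown'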
def compile_json(data, subj, relLabel, obj, obj2=None):
    '''
    Nest the passed objects into a JSON-style tree, assuming that "data" is an
    existing dictionary. If four objects are passed, the fourth is appended to a
    list under the relation label; if five are passed, the fourth becomes a second
    relation label and the fifth is appended to a list under it. Nothing is
    returned because the dictionary is mutated in place.
    '''
    if subj not in data:
        data[subj] = {}
    if obj2 is None:
        try:
            if relLabel not in data[subj]:
                data[subj][relLabel] = []
        except Exception:
            print(subj, relLabel, obj)
        if obj not in data[subj][relLabel]:
            data[subj][relLabel].append(obj)
    else:
        secondRelLabel = obj
        if relLabel not in data[subj]:
            data[subj][relLabel] = {}
        if secondRelLabel not in data[subj][relLabel]:
            data[subj][relLabel][secondRelLabel] = []
        if obj2 not in data[subj][relLabel][secondRelLabel]:
            data[subj][relLabel][secondRelLabel].append(obj2)
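# Minimal sketch of how compile_json accumulates a tree (the identifiers below are
# hypothetical): the same dict is passed in repeatedly and mutated in place.
#     tree = {}
#     compile_json(tree, 'article_1', 'hasAuthor', 'person_7')
#     compile_json(tree, 'article_1', 'hasTag', 'subject', 'history')
#     # tree == {'article_1': {'hasAuthor': ['person_7'],
#     #                        'hasTag': {'subject': ['history']}}}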
def rename(file, format_spec):
    """Rename article with specified format"""
    file = pathlib.Path(file)
    print("Parsing {name}...".format(name=file.name))
    article = Article(file.read_bytes())
    new_file = format_spec.format(
        article=article,
        title=article.getTitle(),
        author=article.getAuthor(),
        board=article.getBoard(),
        time=article.getTime() or format_dummy
    )
    new_file = safe_file_name(new_file)
    new_file = file.with_name(new_file)
    if file == new_file:
        print("Same file name!\n")
        return
    if new_file.exists():
        num = 2
        while True:
            temp_file = "{name} ({num}){ext}".format(
                num=num,
                name=new_file.stem,
                ext=new_file.suffix
            )
            temp_file = new_file.with_name(temp_file)
            if file == temp_file:
                print("Same file name!\n")
                return
            if not temp_file.exists():
                new_file = temp_file
                break
            num += 1
    print("Rename to {name}...\n".format(name=new_file.name))
    file.rename(new_file)
def rename(file, format_spec, dir=DIR()):
    """Rename article with specified format"""
    file = pathlib.Path(file)
    print("Parsing {name}...".format(name=file.name))
    article = Article(file.read_bytes())
    new_file = file.with_name(format_filename(
        article,
        file=file,
        dir=dir,
        format=format_spec
    ))
    if file == new_file:
        print("Same file name!\n")
        return
    if new_file.exists():
        num = 2
        while True:
            temp_file = "{name} ({num}){ext}".format(
                num=num,
                name=new_file.stem,
                ext=new_file.suffix
            )
            temp_file = new_file.with_name(temp_file)
            if file == temp_file:
                print("Same file name!\n")
                return
            if not temp_file.exists():
                new_file = temp_file
                break
            num += 1
    print("Rename to {name}...\n".format(name=new_file.name))
    file.rename(new_file)
def add_location_types(self, row):
    '''
    Look for null type values and add the "Location" type when the jade_id prefix is "location"
    '''
    try:
        if pd.isnull(row.loc['jade_type']):
            if isinstance(row.loc['jade_id'], str):
                if row.loc['jade_id'].split("_")[0] == "location":
                    row.loc['jade_type'] = "Location"
                else:
                    print("Type null but not location:", row)
            else:
                print('Dropped type not included:', row['jade_url'])
        return row
    except Exception:
        print("Unknown problem during adding location type for:", row)
# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
from safeprint import print

print(sys.version)
print(u"你好世界!こんにちは世界안녕하세요세계")
print("你好世界!こんにちは世界안녕하세요세계")
import datetime
import json
import os

import mysql.connector
from diskcache import Cache
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm
from safeprint import print

''' Options '''

try:
    # Options file setup credit Sam Sciolla
    with open(os.path.join('options.json')) as env_file:
        ENV = json.loads(env_file.read())
except FileNotFoundError:
    print('"options.json" not found; please add "options.json" to the current directory.')

''' SQL Connection '''

DB = mysql.connector.connect(host=ENV['SQL']['HOST'],
                             user=ENV['SQL']['USER'],
                             passwd=ENV['SQL']['PASSWORD'],
                             database=ENV['SQL']['DATABASE'])
CUR = DB.cursor(buffered=True)

''' Setup '''

BEGIN = datetime.datetime.now()
TS = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
ITEM_ELEMENTS = ENV['ELEMENT_DICTIONARY']['DCTERMS_IN_USE']
def print_tree(self):
    fs = open(self._tmp_file, "w+", encoding="utf-8")
    files_list = self.list_dir(self._root_path)
    need_line = []
    for file_node in files_list:
        if int(self.level) != 0 and file_node.level > int(self.level):
            continue
        if not file_node.is_end:
            if file_node.level not in need_line:
                need_line.append(file_node.level)
        else:
            if file_node.level in need_line:
                need_line.remove(file_node.level)
        # Indent the entry according to its depth in the tree
        for i in range(0, file_node.level):
            if i in need_line:
                print(" ", end="", file=fs)
            print(' ', end="", file=fs)
        if file_node.is_end:
            print("* " + self.get_md_file_link(file_node), end="", file=fs)
        else:
            print("* " + self.get_md_file_link(file_node), end="", file=fs)
        max_line_words = 100 - len(need_line) * 5 - file_node.basename_len()
        first_line = True
        if len(file_node.comments):
            comments = file_node.comments.copy()
            while len(comments):
                # Wrap long comments at max_line_words characters
                if len(comments[0]) > max_line_words:
                    line_content = comments[0][0:max_line_words]
                    comments[0] = comments[0][max_line_words:]
                else:
                    line_content = comments[0]
                    del comments[0]
                if first_line:
                    print(" : " + line_content + "<br/>", file=fs)
                    first_line = False
                else:
                    if file_node.is_end:
                        for i in range(0, file_node.level + 1):
                            if i in need_line:
                                print(" ", end="", file=fs)
                            print(' ', end="", file=fs)
                        for i in range(0, file_node.basename_len() + 1):
                            print(' ', end="", file=fs)
                        print(" ", end="", file=fs)
                        print(line_content, file=fs)
                    else:
                        for i in range(0, file_node.level + 1):
                            if i in need_line:
                                print(" ", end="", file=fs)
                            if i != file_node.level:
                                print(' ', end="", file=fs)
                        for i in range(0, file_node.basename_len() + 3):
                            print(" ", end="", file=fs)
                        print(" ", end="", file=fs)
                        print(line_content + "<br/>", file=fs)
        else:
            print("", file=fs)
    fs.close()
    target_update_file = os.path.join(self._root_path, "README.md")
    if not os.path.exists(target_update_file) or os.path.isdir(target_update_file):
        open(target_update_file, 'a', encoding="utf-8").close()
    with open(self._tmp_file, 'r', encoding="utf-8") as f:
        source_file = f.read()
    with open(target_update_file, 'r', encoding="utf-8") as f:
        target_file = f.read()
    try:
        target_start = target_file.index(self.start_line)
        target_end = target_file.index(self.end_line, target_start)
        target_end += len(self.end_line)
    except ValueError:
        # Markers not present yet; append the generated block instead of replacing
        target_end = target_start = 0
    source_file = "{}\n\n{}\n{}".format(self.start_line, source_file, self.end_line)
    if target_end == 0 and target_start == 0:
        target_file += '\n' + source_file
    else:
        target_file = target_file.replace(target_file[target_start:target_end], source_file)
    safeprint.print(target_file)
    with open(target_update_file, 'w', encoding="utf-8") as f:
        f.write(target_file)
def identify_books() -> None:
    # Load input data
    input_path = os.path.join(*BOOKS_CSV_PATH_ELEMS)
    if '.xlsx' in BOOKS_CSV_PATH_ELEMS[-1]:
        press_books_df = pd.read_excel(input_path, dtype=str, index_col='ID')
        # press_books_df = press_books_df.iloc[1:]  # Remove dummy record
    else:
        press_books_df = pd.read_csv(input_path, dtype=str, index_col='ID')
    # print(press_books_df)

    matches_df = pd.DataFrame({}, columns=ENV["OUTPUT_COLUMNS"])
    if ALREADY_CSV_PATH_ELEMS[-1] != "":
        already_input_path = os.path.join(*ALREADY_CSV_PATH_ELEMS)
        if '.xlsx' in ALREADY_CSV_PATH_ELEMS[-1]:
            already_books_df = pd.read_excel(already_input_path, dtype=str, index_col=0)
        else:
            already_books_df = pd.read_csv(already_input_path, dtype=str, index_col=0)
        # print(press_books_df)
        # for id in already_books_df.index.to_list():
        #     print(id.split("_")[0])
        #     press_books_df.drop(id.split("_")[0])
        # print(press_books_df)
        matches_df = matches_df.append(already_books_df)
        # print(matches_df)

    # Crosswalk to consistent column names
    # press_books_df = press_books_df.rename(columns=INPUT_TO_IDENTIFY_CW)
    # logger.debug(press_books_df.columns)

    # Limit number of records for testing purposes
    if TEST_MODE_OPTS['ON']:
        # logger.info('TEST_MODE is ON.')
        press_books_df = press_books_df.iloc[:len(matches_df) + TEST_MODE_OPTS['NUM_RECORDS']]

    # For each record, fetch WorldCat data, compare to record, analyze and accumulate matches
    non_matching_books = {}
    num_books_with_matches = 0
    book_iter = tqdm(press_books_df.iterrows())
    for press_book_row_tup in book_iter:
        book_iter.set_description("Looking up books")
        new_book_dict = press_book_row_tup[1].to_dict()
        new_book_dict['ID'] = press_book_row_tup[0]
        uncat_isbn_string = new_book_dict['Uncategorized ISBN']
        if isinstance(uncat_isbn_string, str):
            # Sort each uncategorized ISBN into a format-specific column
            uncat_isbns = uncat_isbn_string.split(' ; ')
            new_book_dict['Uncategorized ISBN'] = ''
            new_book_dict['ebook ISBN'] = ''
            new_book_dict['paper ISBN'] = ''
            new_book_dict['hardcover ISBN'] = ''
            for isbn_string in uncat_isbns:
                canon_isbn = get_canon_isbn(isbn_string)
                isbn_fmat = identify_format(isbn_string)
                if isbn_fmat == 'unknown':
                    isbn_fmat = 'Uncategorized'
                already_there = new_book_dict[f'{isbn_fmat} ISBN']
                if canon_isbn not in already_there:
                    if len(already_there) > 0:
                        new_book_dict[f'{isbn_fmat} ISBN'] += " ; "
                    new_book_dict[f'{isbn_fmat} ISBN'] += canon_isbn
        if new_book_dict['ID'] not in matches_df['ID']:
            # logger.info(new_book_dict)
            matching_records_df = look_up_book_in_resource(new_book_dict)
            matches_df = matches_df.append(pd.Series(new_book_dict, name=new_book_dict['ID']))
            if not matching_records_df.empty:
                matches_df = matches_df.append(matching_records_df)

    # logger.debug('Matching Manifests')
    # logger.debug(matches_df.describe())
    # matches_df = matches_df[ENV["OUTPUT_COLUMNS"]]
    # print(matches_df)

    # Add stats for copyright holder
    holders = {}
    for book_id in matches_df.index.values:
        if "_" not in book_id:
            rightsholder = str(matches_df.at[book_id, 'Copyright Holder'])
            if rightsholder not in holders:
                holders[rightsholder] = 1
            else:
                holders[rightsholder] += 1
            publisher = str(matches_df.at[book_id, 'Publisher'])
            if publisher + ' - ' + rightsholder in ENV['PUBLISHER_RIGHTSHOLDER_MATCHES']:
                new_rightsholder = False
            elif publisher != rightsholder:
                new_rightsholder = True
                print(publisher, " != ", rightsholder)
            else:
                new_rightsholder = False
            matches_df.at[book_id, 'New Rightsholder'] = new_rightsholder
    for book_id in matches_df.index.values:
        if "_" not in book_id:
            rightsholder = matches_df.at[book_id, 'Copyright Holder']
            if not pd.isnull(rightsholder):
                matches_df.at[book_id, 'Rightsholder Rank'] = holders[rightsholder]
    # Generate Excel output
    if not matches_df.empty:
        try:
            save_excel(matches_df, 'output')
        except Exception:
            save_csv(matches_df, 'output')
        # matches_df.to_csv(os.path.join('data', 'matched_manifests.csv'), index=False)

    # if non_matching_books:
    #     no_isbn_matches_df = pd.DataFrame.from_dict(non_matching_books, orient='index')
    #     no_isbn_matches_df = no_isbn_matches_df[ENV["OUTPUT_COLUMNS"]]
    #     try:
    #         save_excel(no_isbn_matches_df, 'not_matched')
    #     except:
    #         save_csv(no_isbn_matches_df, 'not_matched')
    #     no_isbn_matches_df.to_csv(os.path.join('data', 'no_isbn_matches.csv'), index=False)

    # Log Summary Report
    report_str = '** Summary Report from identify.py **\n\n'
    report_str += f'-- Total number of books included in search: {len(press_books_df)}\n'
    report_str += f'-- Number of books successfully matched with records with ISBNs: {num_books_with_matches}\n'
    report_str += f'-- Number of books with no matching records: {len(non_matching_books)}\n'
    # logger.info(f'\n\n{report_str}')
    return None
    normalize_univ, \
    NA_PATTERN

from db_cache import make_request_using_cache  # , set_up_database

# Initialize settings and global variables
BEGIN = datetime.now()
TS = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# logger = logging.getLogger(__name__)

try:
    with open(os.path.join('config', 'env.json')) as env_file:
        ENV = json.loads(env_file.read())
except FileNotFoundError:
    print('Configuration file could not be found')
    # logger.error('Configuration file could not be found; please add env.json to the config directory.')

# logging.basicConfig(level=ENV.get('LOG_LEVEL', 'NOTSET'))

# # Set up database if necessary
# if not os.path.isfile(os.path.join(*ENV['DB_CACHE_PATH'])):
#     set_up_database()

BOOKS_CSV_PATH_ELEMS = ENV['BOOKS_CSV_PATH']
ALREADY_CSV_PATH_ELEMS = ENV['ALREADY_CSV_PATH']

worldcat_config = ENV['RESOURCE']
API_KEY = worldcat_config['BIB_RESOURCE_KEY']
BIB_BASE_URL = worldcat_config['BIB_RESOURCE_BASE_URL']

TEST_MODE_OPTS = ENV['TEST_MODE']
    # get_close_matches is a function from the difflib library that measures
    # how closely the user's input resembles each word in the JSON file
    elif len(get_close_matches(w, data.keys())) > 0:
        # Take the first value, because get_close_matches returns a list of candidates
        yn = input(
            "Did you mean %s instead? Enter Y if yes or N if no: "
            % get_close_matches(w, data.keys())[0])
        yn = yn.lower()
        if yn == "y":
            return data[get_close_matches(w, data.keys())[0]]
        elif yn == "n":
            return "Sorry, this word doesn't exist."
        else:
            return "I didn't understand your entry."
    else:
        return "Sorry, this word doesn't exist."


word = input("Enter the word : ")
output = Translate(word)
if isinstance(output, list):
    for item in output:
        print(item)
else:
    print(output)