Example #1
    def add_collection(self, row_ser):
        '''
        Replace each object's collection ids with the matching collection names
        '''

        new_collection_field = []
        ids = row_ser.loc['jade_collection']
        if not isinstance(ids, list):
            ids = [ids]

        try:
            for coll_id in ids:
                matches = self.collection_df.at[coll_id, 'collection_name']

                if isinstance(matches, np.ndarray):
                    match_list = matches.tolist()
                elif isinstance(matches, str):
                    match_list = [matches]
                else:
                    print("Unrecognized type of collection", type(matches))
                    match_list = []  # avoid a NameError in the loop below
                for name in match_list:
                    if name not in new_collection_field:
                        new_collection_field.append(name)
            row_ser['jade_collection'] = new_collection_field
            return row_ser
        except KeyError:
            # Collection id missing from collection_df; leave the row as-is
            return row_ser
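
The method assumes collection_df is indexed by collection id, and a duplicated id makes the lookup return several names at once (which the method flattens). A minimal sketch of that lookup, with invented frame contents:

import pandas as pd

collection_df = pd.DataFrame(
    {'collection_name': ['Letters', 'Letters (dup)', 'Photographs']},
    index=['coll_1', 'coll_1', 'coll_2'])

print(collection_df.at['coll_2', 'collection_name'])   # a single string
print(collection_df.loc['coll_1', 'collection_name'])  # several names at once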
Example #2
    def quantify_properties(self):
        '''
        Run counts of values for each property identified for faceting
        '''

        # Iterate through properties identified for faceting
        props = list(
            DATASET_OPTIONS['SUBSET_PROPERTIES_AND_QUANTITIES'].items())
        prop_iter = tqdm(props)
        prop_iter.set_description("Quantifying subsets by facet")
        for prop, lim in prop_iter:
            if prop in self.objects.columns.values:

                # Special cases
                if prop in ['dcterms_date']:

                    # Date
                    dc_dates_ser = self.objects[prop]
                    dc_dates_ser = dc_dates_ser.apply(unwrap_list)
                    dc_dates_ser = dc_dates_ser.dropna()
                    for obj_id in dc_dates_ser.index.values:
                        try:
                            date_val = dc_dates_ser[obj_id]
                            if not isinstance(date_val, list):
                                date_list = [date_val]
                            else:
                                date_list = date_val
                            for date_string in date_list:
                                if not isinstance(date_string, str):
                                    date_string = str(date_string)
                                yearlike = date_string.split('-')[0]
                                if (len(yearlike) == 4 and yearlike[0] == '1'
                                        and yearlike[3] in '0123456789'):
                                    dc_dates_ser[obj_id] = yearlike
                                else:
                                    print('Dropped unrecognized date value:',
                                          obj_id, dc_dates_ser[obj_id])
                                    # drop() returns a copy, so reassign it
                                    dc_dates_ser = dc_dates_ser.drop(obj_id)
                        except (ValueError, KeyError):
                            print('Dropped unrecognized date value:', obj_id)
                            dc_dates_ser = dc_dates_ser.drop(obj_id,
                                                             errors='ignore')
                    if len(dc_dates_ser) > 1:
                        self.add_to_quant(dc_dates_ser,
                                          sort_on_property_name=False)

                # All others / standard structure
                else:
                    ser = self.objects[prop]
                    ser = ser.dropna()
                    if len(ser) > 1:
                        self.add_to_quant(ser)
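
The unwrap_list helper is not shown in this snippet; a plausible sketch consistent with how it is used above (the real helper may differ):

def unwrap_list(value):
    # Hypothetical helper: collapse one-element lists to their sole item so
    # the series holds scalars wherever possible
    if isinstance(value, list) and len(value) == 1:
        return value[0]
    return value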
Example #3
    def quantify_relations(self):
        '''
        Make a list of unique relation triples and a table of the most common subject–object pairs
        '''

        # Iterate through relations in the Dataset
        uniq_rels = {}
        count_df_index = []
        count_df_columns = []
        obj_iter = tqdm(self.objects.index.values)
        obj_iter.set_description("Counting unique relations")
        for subjId in obj_iter:
            row = self.objects.loc[subjId]
            row_rels_dict = row.loc['jade_relation']
            if not pd.isnull(row_rels_dict):
                for relLabel, objIdList in row_rels_dict.items():
                    for objId in objIdList:

                        # Find the types of each subject and object
                        subjType = subjId.split('_')[0].capitalize()
                        objType = objId.split('_')[0].capitalize()

                        # Count the unique combinations of subject, relation, and object
                        rel = " ".join([subjType, relLabel, objType])

                        if rel not in uniq_rels:
                            uniq_rels[rel] = 1
                        else:
                            uniq_rels[rel] += 1

                        # Make the dimensions for a dataframe
                        if subjType not in count_df_index:
                            count_df_index.append(subjType)
                        if objType not in count_df_columns:
                            count_df_columns.append(objType)

        # Sort and output simple list
        self.quant["unique_relation_list"] = pd.DataFrame.from_dict(
            dict(sort_by_item_counts(uniq_rels)), orient='index')

        # Make the dataframe
        count_df = pd.DataFrame(data=0,
                                index=count_df_index,
                                columns=count_df_columns)
        for rel, count in uniq_rels.items():
            try:
                subjType, relLabel, objType = rel.split(' ')
                count_df.at[subjType, objType] += count
            except ValueError:
                # relLabel itself contained a space, making the split ambiguous
                print("Error counting relation:", rel)
        self.quant["unique_relation_table"] = count_df
Example #4
def identify_format(form_string):
    formats = {
        "paper": ['paperback', 'pbk', 'soft', 'paper : alk. paper'],
        "hardcover": ['hard', 'cloth', 'hb'],
        "ebook": ['ebook', 'e-book', 'electronic', 'computer', 'online',
                  'remote']
    }

    returnable = 'unknown'
    for fmat, descs in formats.items():
        for desc in descs:
            if desc in form_string.lower():
                if returnable not in ('unknown', fmat):
                    print(f'Two different formats recognized: {returnable} and {fmat} in {form_string.lower()}')
                returnable = fmat
    return returnable
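
Usage example: descriptors are matched case-insensitively as substrings, and conflicting matches are reported before the last one wins.

print(identify_format('Paperback : alk. paper'))  # -> 'paper'
print(identify_format('Online resource'))         # -> 'ebook'
print(identify_format('Vinyl LP'))                # -> 'unknown'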
Example #5
def compile_json(data, subj, relLabel, obj, obj2=None):
    '''
    Nest the passed objects into a JSON-style tree, assuming that "data" is
    an existing dictionary. With four arguments, obj is appended to a list
    under data[subj][relLabel]; with five, obj is treated as a second relation
    label and obj2 is appended to a list one level deeper. Nothing is
    returned, because dictionaries are mutable and "data" is updated in place.
    '''

    if subj not in data:
        data[subj] = {}
    if obj2 is None:
        try:
            if relLabel not in data[subj]:
                data[subj][relLabel] = []
        except TypeError:
            # data[subj] is not a dict (e.g. overwritten by an earlier call)
            print(subj, relLabel, obj)
        if obj not in data[subj][relLabel]:
            data[subj][relLabel].append(obj)
    else:
        secondRelLabel = obj
        if relLabel not in data[subj]:
            data[subj][relLabel] = {}
        if secondRelLabel not in data[subj][relLabel]:
            data[subj][relLabel][secondRelLabel] = []
        if obj2 not in data[subj][relLabel][secondRelLabel]:
            data[subj][relLabel][secondRelLabel].append(obj2)
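
Usage example: the tree is built up by mutating data in place.

data = {}
compile_json(data, 'person_1', 'wrote', 'letter_1')
compile_json(data, 'person_1', 'wrote', 'letter_2')
compile_json(data, 'person_1', 'sent', 'letter_1', 'letter_2')
print(data)
# {'person_1': {'wrote': ['letter_1', 'letter_2'],
#               'sent': {'letter_1': ['letter_2']}}}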
Example #6
def rename(file, format_spec):
	"""Rename article with specified format"""
	file = pathlib.Path(file)
	
	print("Parsing {name}...".format(name=file.name))
	
	article = Article(file.read_bytes())
	
	new_file = format_spec.format(
		article = article,
		title = article.getTitle(),
		author = article.getAuthor(),
		board = article.getBoard(),
		time = article.getTime() or format_dummy
	)
	new_file = safe_file_name(new_file)
	new_file = file.with_name(new_file)
	
	if file == new_file:
		print("Same file name!\n")
		return
	
	if new_file.exists():
		num = 2
		
		while True:
			temp_file = "{name} ({num}){ext}".format(
				num = num,
				name = new_file.stem,
				ext = new_file.suffix
			)
			temp_file = new_file.with_name(temp_file)
			
			if file == temp_file:
				print("Same file name!\n")
				return
				
			if not temp_file.exists():
				new_file = temp_file
				break
				
			num += 1
	
	print("Rename to {name}...\n".format(name=new_file.name))
	
	file.rename(new_file)
Example #7
def rename(file, format_spec, dir=DIR()):
	"""Rename article with specified format"""
	file = pathlib.Path(file)

	print("Parsing {name}...".format(name=file.name))
	article = Article(file.read_bytes())

	new_file = file.with_name(format_filename(
		article,
		file=file,
		dir=dir,
		format=format_spec
	))

	if file == new_file:
		print("Same file name!\n")
		return

	if new_file.exists():
		num = 2

		while True:
			temp_file = "{name} ({num}){ext}".format(
				num = num,
				name = new_file.stem,
				ext = new_file.suffix
			)
			temp_file = new_file.with_name(temp_file)

			if file == temp_file:
				print("Same file name!\n")
				return

			if not temp_file.exists():
				new_file = temp_file
				break

			num += 1

	print("Rename to {name}...\n".format(name=new_file.name))

	file.rename(new_file)
Example #8
    def add_location_types(self, row):
        '''
        Look for null type values and add "Location" when the jade_id prefix
        is "location"
        '''

        try:
            if pd.isnull(row.loc['jade_type']):
                if type(row.loc['jade_id']) == type(""):
                    if row.loc['jade_id'].split("_")[0] == "location":
                        row.loc['jade_type'] = "Location"
                    else:
                        print("Type null but not location:", row)

                else:
                    print('Dropped type not included:', row['jade_url'])
            return row
        except Exception:
            print("Unknown problem during adding location type for:", row)
            return row  # still return the row so apply does not yield None
Example #9
# -*- coding: utf-8 -*-
from __future__ import print_function

import sys

from safeprint import print

print(sys.version)
print(u"你好世界!こんにちは世界안녕하세요세계")
print("你好世界!こんにちは世界안녕하세요세계")
Example #10
import datetime
import json
import os

import mysql.connector
from diskcache import Cache
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm
from safeprint import print
'''
Options
'''
try:  # Options file setup credit Sam Sciolla
    with open(os.path.join('options.json')) as env_file:
        ENV = json.loads(env_file.read())
except FileNotFoundError:
    print(
        '"options.json" not found; please add "options.json" to the current directory.'
    )
'''
SQL Connection
'''
DB = mysql.connector.connect(host=ENV['SQL']['HOST'],
                             user=ENV['SQL']['USER'],
                             passwd=ENV['SQL']['PASSWORD'],
                             database=ENV['SQL']['DATABASE'])
CUR = DB.cursor(buffered=True)
'''
Setup
'''
BEGIN = datetime.datetime.now()
TS = BEGIN.strftime("%Y-%m-%d_%H-%M-%S")
ITEM_ELEMENTS = ENV['ELEMENT_DICTIONARY']['DCTERMS_IN_USE']
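
The snippet expects an options.json shaped roughly like this (all values invented; only the keys accessed above are shown):

{
    "SQL": {
        "HOST": "localhost",
        "USER": "user",
        "PASSWORD": "secret",
        "DATABASE": "jade"
    },
    "ELEMENT_DICTIONARY": {
        "DCTERMS_IN_USE": {}
    }
}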
Example #11
    def print_tree(self):
        fs = open(self._tmp_file, "w+", encoding="utf-8")
        files_list = self.list_dir(self._root_path)
        need_line = []
        for file_node in files_list:
            if int(self.level) != 0 and file_node.level > int(self.level):
                continue
            if not file_node.is_end:
                if file_node.level not in need_line:
                    need_line.append(file_node.level)
            else:
                if file_node.level in need_line:
                    need_line.remove(file_node.level)
            for i in range(0, file_node.level):
                if i in need_line:
                    print(" ", end="", file=fs)
                print('   ', end="", file=fs)
            # Leaf and directory nodes render the same way, as list items
            print("* " + self.get_md_file_link(file_node), end="", file=fs)

            max_line_words = (100 - len(need_line) * 5
                              - file_node.basename_len())
            first_line = True
            if len(file_node.comments):
                comments = file_node.comments.copy()
                while len(comments):
                    if len(comments[0]) > max_line_words:
                        line_content = comments[0][0:max_line_words]
                        comments[0] = comments[0][max_line_words:]
                    else:
                        line_content = comments[0]
                        del comments[0]
                    if first_line:
                        print("&nbsp;:&nbsp;" + line_content + "<br/>",
                              file=fs)
                        first_line = False
                    else:
                        if file_node.is_end:
                            for i in range(0, file_node.level + 1):
                                if i in need_line:
                                    print(" ", end="", file=fs)
                                print('   ', end="", file=fs)
                            for i in range(0, file_node.basename_len() + 1):
                                print('&nbsp;', end="", file=fs)
                            print("&nbsp;&nbsp;", end="", file=fs)
                            print(line_content, file=fs)  # write to fs, not stdout
                        else:
                            for i in range(0, file_node.level + 1):
                                if i in need_line:
                                    print(" ", end="", file=fs)
                                if i != file_node.level:
                                    print('   ', end="", file=fs)
                            for i in range(0, file_node.basename_len() + 3):
                                print("&nbsp;", end="", file=fs)
                            print("&nbsp;&nbsp;", end="", file=fs)
                            print(line_content + "<br/>", file=fs)
            else:
                print("", file=fs)

        fs.close()

        target_update_file = os.path.join(self._root_path, "README.md")
        if not os.path.exists(target_update_file) \
                or os.path.isdir(target_update_file):
            open(target_update_file, 'a', encoding="utf-8").close()

        with open(self._tmp_file, 'r', encoding="utf-8") as f:
            source_file = f.read()

        with open(target_update_file, 'r', encoding="utf-8") as f:
            target_file = f.read()

        try:
            target_start = target_file.index(self.start_line)
            target_end = target_file.index(self.end_line, target_start)
            target_end += len(self.end_line)
        except ValueError:
            # Markers not found; fall through to appending the whole tree
            target_end = target_start = 0

        source_file = "{}\n\n{}\n{}".format(self.start_line, source_file,
                                            self.end_line)

        if target_end == 0 and target_start == 0:
            target_file += '\n' + source_file
        else:
            target_file = target_file.replace(
                target_file[target_start:target_end], source_file)

        safeprint.print(target_file)

        with open(target_update_file, 'w', encoding="utf-8") as f:
            f.write(target_file)
Example #12
def identify_books() -> None:
    # Load input data
    input_path = os.path.join(*BOOKS_CSV_PATH_ELEMS)
    if '.xlsx' in BOOKS_CSV_PATH_ELEMS[-1]:
        press_books_df = pd.read_excel(input_path, dtype=str, index_col='ID')
        # press_books_df = press_books_df.iloc[1:]  # Remove dummy record
    else:
        press_books_df = pd.read_csv(input_path, dtype=str, index_col='ID')

    # print(press_books_df)

    matches_df = pd.DataFrame(columns=ENV["OUTPUT_COLUMNS"])

    if ALREADY_CSV_PATH_ELEMS[-1] != "":
        already_input_path = os.path.join(*ALREADY_CSV_PATH_ELEMS)
        if '.xlsx' in ALREADY_CSV_PATH_ELEMS[-1]:
            already_books_df = pd.read_excel(already_input_path, dtype=str, index_col=0)
        else:
            already_books_df = pd.read_csv(already_input_path, dtype=str, index_col=0)

        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        matches_df = pd.concat([matches_df, already_books_df])
        # print(matches_df)

    # Crosswalk to consistent column names
    # press_books_df = press_books_df.rename(columns=INPUT_TO_IDENTIFY_CW)
    # logger.debug(press_books_df.columns)

    # Limit number of records for testing purposes
    if TEST_MODE_OPTS['ON']:
        # logger.info('TEST_MODE is ON.')
        press_books_df = press_books_df.iloc[:len(matches_df)+TEST_MODE_OPTS['NUM_RECORDS']]

    # For each record, fetch WorldCat data, compare to record, analyze and accumulate matches
    non_matching_books = {}
    num_books_with_matches = 0

    book_iter = tqdm(press_books_df.iterrows())
    book_iter.set_description("Looking up books")
    for press_book_row_tup in book_iter:
        new_book_dict = press_book_row_tup[1].to_dict()
        new_book_dict['ID'] = press_book_row_tup[0]


        uncat_isbn_string = new_book_dict['Uncategorized ISBN']
        if isinstance(uncat_isbn_string, str):
            uncat_isbns = uncat_isbn_string.split(' ; ')
            new_book_dict['Uncategorized ISBN'] = ''
            new_book_dict['ebook ISBN'] = ''
            new_book_dict['paper ISBN'] = ''
            new_book_dict['hardcover ISBN'] = ''

            for isbn_string in uncat_isbns:
                canon_isbn = get_canon_isbn(isbn_string)
                isbn_fmat = identify_format(isbn_string)
                if isbn_fmat == 'unknown':
                    isbn_fmat = 'Uncategorized'
                already_there = new_book_dict[f'{isbn_fmat} ISBN']
                if canon_isbn not in already_there:
                    if len(already_there) > 0:
                        new_book_dict[f'{isbn_fmat} ISBN'] += " ; "
                    new_book_dict[f'{isbn_fmat} ISBN'] += canon_isbn

        # Membership must be tested against the index, where book IDs live
        if new_book_dict['ID'] not in matches_df.index:
            # logger.info(new_book_dict)

            matching_records_df = look_up_book_in_resource(new_book_dict)

            matches_df = pd.concat([
                matches_df,
                pd.Series(new_book_dict, name=new_book_dict['ID']).to_frame().T
            ])

            if not matching_records_df.empty:
                matches_df = pd.concat([matches_df, matching_records_df])
                num_books_with_matches += 1
            else:
                non_matching_books[new_book_dict['ID']] = new_book_dict

    # logger.debug('Matching Manifests')
    # logger.debug(matches_df.describe())

    # matches_df = matches_df[ENV["OUTPUT_COLUMNS"]]
    # print(matches_df)

    # Add stats for copyright holder
    holders = {}
    for row_id in matches_df.index.values:
        if "_" not in row_id:
            rightsholder = str(matches_df.at[row_id, 'Copyright Holder'])
            if rightsholder not in holders:
                holders[rightsholder] = 1
            else:
                holders[rightsholder] += 1

            publisher = str(matches_df.at[row_id, 'Publisher'])

            if publisher + ' - ' + rightsholder in ENV['PUBLISHER_RIGHTSHOLDER_MATCHES']:
                new_rightsholder = False
            elif publisher != rightsholder:
                new_rightsholder = True
                print(publisher, " != ", rightsholder)
            else:
                new_rightsholder = False

            # Set the flag inside the book-row branch; outside it, the value
            # would be stale or undefined for non-book rows
            matches_df.at[row_id, 'New Rightsholder'] = new_rightsholder

    for row_id in matches_df.index.values:
        if "_" not in row_id:
            rightsholder = matches_df.at[row_id, 'Copyright Holder']
            if not pd.isnull(rightsholder):
                matches_df.at[row_id, 'Rightsholder Rank'] = holders[str(rightsholder)]

    # Generate Excel output
    if not matches_df.empty:
        try:
            save_excel(matches_df, 'output')
        except Exception:
            # Fall back to CSV when Excel output fails
            save_csv(matches_df, 'output')
        # matches_df.to_csv(os.path.join('data', 'matched_manifests.csv'), index=False)

    # if non_matching_books:
    #     no_isbn_matches_df = pd.DataFrame.from_dict(non_matching_books,orient='index')
    #     no_isbn_matches_df = no_isbn_matches_df[ENV["OUTPUT_COLUMNS"]]
    #     try:
    #         save_excel(no_isbn_matches_df,'not_matched')
    #     except:
    #         save_csv(no_isbn_matches_df,'not_matched')
        # no_isbn_matches_df.to_csv(os.path.join('data', 'no_isbn_matches.csv'), index=False)

    # Log Summary Report
    report_str = '** Summary Report from identify.py **\n\n'
    report_str += f'-- Total number of books included in search: {len(press_books_df)}\n'
    report_str += f'-- Number of books successfully matched with records with ISBNs: {num_books_with_matches}\n'
    report_str += f'-- Number of books with no matching records: {len(non_matching_books)}\n'
    # logger.info(f'\n\n{report_str}')
    return None
Example #13
                    normalize_univ, \
                    NA_PATTERN
from db_cache import make_request_using_cache # , set_up_database


# Initialize settings and global variables
BEGIN = datetime.now()
TS = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# logger = logging.getLogger(__name__)

try:
    with open(os.path.join('config', 'env.json')) as env_file:
        ENV = json.loads(env_file.read())
except FileNotFoundError:
    print('Configuration file could not be found')
    # logger.error('Configuration file could not be found; please add env.json to the config directory.')

# logging.basicConfig(level=ENV.get('LOG_LEVEL', 'NOTSET'))

# # Set up database if necessary
# if not os.path.isfile(os.path.join(*ENV['DB_CACHE_PATH'])):
#     set_up_database()

BOOKS_CSV_PATH_ELEMS = ENV['BOOKS_CSV_PATH']
ALREADY_CSV_PATH_ELEMS = ENV['ALREADY_CSV_PATH']

worldcat_config = ENV['RESOURCE']
API_KEY = worldcat_config['BIB_RESOURCE_KEY']
BIB_BASE_URL = worldcat_config['BIB_RESOURCE_BASE_URL']
TEST_MODE_OPTS = ENV['TEST_MODE']
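
For reference, config/env.json is expected to contain at least these keys (all values invented; the identify_books example above additionally reads OUTPUT_COLUMNS and PUBLISHER_RIGHTSHOLDER_MATCHES):

{
    "BOOKS_CSV_PATH": ["data", "books.csv"],
    "ALREADY_CSV_PATH": ["data", ""],
    "RESOURCE": {
        "BIB_RESOURCE_KEY": "<your-api-key>",
        "BIB_RESOURCE_BASE_URL": "https://example.org/bibs"
    },
    "TEST_MODE": {"ON": false, "NUM_RECORDS": 5},
    "OUTPUT_COLUMNS": ["ID", "Title", "Publisher", "Copyright Holder"],
    "PUBLISHER_RIGHTSHOLDER_MATCHES": []
}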
Example #14
    # get_close_matches is a function from the difflib library that scores
    # how closely what the user typed resembles each word in the json file
    elif len(get_close_matches(w, data.keys())) > 0:

        # Take the first value, since get_close_matches returns a list
        yn = input(
            "Did you mean %s instead? Enter Y for yes or N for no: " %
            get_close_matches(w, data.keys())[0])
        yn = yn.lower()
        if yn == "y":
            return data[get_close_matches(w, data.keys())[0]]
        elif yn == "n":
            return "Sorry, this word doesn't exist."
        else:
            return "I didn't understand your entry."
    else:
        return "Sorry, this word doesn't exist."


word = input("Enter the word : ")
output = Translate(word)
if isinstance(output, list):
    for item in output:
        print(item)
else:
    print(output)  # reuse the result instead of calling Translate again
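
For reference, difflib.get_close_matches returns the closest candidates first, or an empty list when nothing clears the similarity cutoff; a quick demo with an invented word list:

from difflib import get_close_matches

words = ['apple', 'ample', 'maple', 'happily']
print(get_close_matches('appel', words))       # closest candidates first
print(get_close_matches('appel', words, n=1))  # at most one suggestion
print(get_close_matches('zzzz', words))        # [] when nothing is close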