Пример #1
0
def mangle_isbn(raw):
    """Collect the masked ISBNs found in a whitespace-separated string.

    Every token of ``raw`` is tested with isbnlib. A token accepted as an
    ISBN-13 is stored under ``'isbn13'``; otherwise a token accepted as an
    ISBN-10 is stored under ``'isbn10'``. Values are the result of
    ``isbnlib.mask``. When several tokens of the same kind occur, the last
    one wins; all other tokens are ignored.

    :param raw: text containing zero or more ISBN candidates.
    :returns: dict with at most the keys ``'isbn13'`` and ``'isbn10'``.
    """
    found = {}
    for token in raw.strip().split():
        if isbnlib.is_isbn13(token):
            key = 'isbn13'
        elif isbnlib.is_isbn10(token):
            key = 'isbn10'
        else:
            # Neither kind of ISBN: nothing to record for this token.
            continue
        found[key] = isbnlib.mask(token)
    return found
Пример #2
0
    async def check(self, entry):
        """Report an ISBN that is not written in the configured format.

        The expected format is read from the ``isbn_format`` option via
        ``self._cfg.get``. Nothing is reported when the option is unset,
        when the entry has no ``isbn`` value, or when the value does not
        clean up to a valid ISBN.

        :param entry: object exposing a ``data`` mapping with an optional
            ``'isbn'`` key.
        :returns: an empty list, or a one-element list holding the tuple
            ``(NAME, problem, suggestion)``.
        :raises ConfigurationError: if ``isbn_format`` is set to anything
            other than ``'canonical'`` or ``'masked'`` (only detected once
            a valid ISBN has been found, as before).
        """
        fmt = self._cfg.get('isbn_format', entry)
        if not fmt:
            return []

        isbn = entry.data.get('isbn')
        if not isbn:
            return []

        clean_isbn = clean(isbn)
        if not clean_isbn or notisbn(clean_isbn):
            return []

        if fmt not in ('canonical', 'masked'):
            # Adjacent literals instead of a backslash continuation inside
            # the string, which used to embed the source indentation in the
            # message shown to the user.
            raise ConfigurationError(
                "The option 'isbn_format' must be "
                "either of 'canonical' or 'masked'.")

        # Both formats are reported the same way; only the converter and
        # the wording differ.
        if fmt == 'canonical':
            expected = canonical(clean_isbn)
        else:
            expected = mask(clean_isbn)

        if expected == isbn:
            return []

        return [(type(self).NAME,
                 "ISBN '{}' is not in {} format.".format(isbn, fmt),
                 "{} format would be '{}'".format(fmt.capitalize(),
                                                  expected))]
Пример #3
0
 def get_meta(self):
     """
     Look up the book's metadata and return it as a dictionary.

     The lookup is done by ``meta`` on the canonical form of
     ``self.isbnlike``. Before the dictionary is returned, its 'ISBN-13'
     entry is replaced by the masked form of that same value.
     """
     bare_isbn = canonical(self.isbnlike)
     record = meta(bare_isbn)
     record['ISBN-13'] = mask(record['ISBN-13'])
     return record
Пример #4
0
def is_valid(isbn_id):
    """
    Tell whether a string is a valid ISBN in an accepted spelling.

    The string must pass ``isbnlib.notisbn`` and must be written either
    exactly as its canonical form or exactly as the masked version of that
    canonical form.

    :param isbn_id: the isbn to be checked.
    :returns: boolean indicating whether the isbn is valid or not.

    >>> is_valid("978-3-16-148410-0")
    True

    >>> is_valid("9783161484100")
    True

    >>> is_valid("9783161484100aa")
    False

    >>> is_valid("abcd")
    False

    >>> is_valid("0136091814")
    True

    >>> is_valid("0136091812")
    False

    >>> is_valid("9780136091817")
    False

    >>> is_valid("123456789X")
    True
    """
    if isbnlib.notisbn(isbn_id):
        return False
    bare = isbnlib.get_canonical_isbn(isbn_id)
    if bare == isbn_id:
        return True
    return isbnlib.mask(bare) == isbn_id
Пример #5
0
def fix_isbn(entry):
    """Rewrite ``entry['isbn']`` as a hyphen-masked ISBN-13, in place.

    Entries without an ``'isbn'`` key are returned untouched. A value that
    isbnlib accepts as an ISBN-10 is first converted with ``to_isbn13``.

    :param entry: mapping describing one record; ``entry['ID']`` is only
        read to build the error message.
    :returns: the same ``entry`` object.
    :raises Exception: if the (possibly converted) value is not an ISBN-13.
    """
    if 'isbn' not in entry:
        return entry

    isbn = entry['isbn']
    if isbnlib.is_isbn10(isbn):
        isbn = isbnlib.to_isbn13(isbn)

    if isbnlib.is_isbn13(isbn):
        entry['isbn'] = isbnlib.mask(isbn, separator='-')
        return entry

    # The message deliberately shows the value as stored in the entry,
    # not the converted one.
    raise Exception(f'invalid isbn in {entry["ID"]}: {entry["isbn"]}')
Пример #6
0
def classify_isbn(isbnlike):
    """Classify an ISBN-like string and return its normalised forms.

    Checks are tried in this order:

    1. ``isbnlike`` itself is an ISBN-10;
    2. its canonical form becomes an ISBN-10 once ``'0'`` or ``'00'`` is
       prepended (the padded value then replaces ``'canon'``);
    3. its canonical form is an ISBN-13.

    :param isbnlike: text that may contain an ISBN.
    :returns: dict with ``'canon'`` and ``'type'`` (``'isbn10'``,
        ``'isbn13'`` or ``'invalid?'``). Every valid result also carries
        ``'masked'``; previously the plain ISBN-10 case omitted that key,
        unlike all other valid cases.
    """
    canon = ib.canonical(isbnlike)
    kind = None

    if ib.is_isbn10(isbnlike):
        kind = 'isbn10'
    else:
        for padding in ('0', '00'):
            if ib.is_isbn10(padding + canon):
                canon = padding + canon
                kind = 'isbn10'
                break
        else:
            # No padded variant matched: last chance is a plain ISBN-13.
            if ib.is_isbn13(canon):
                kind = 'isbn13'

    if kind is None:
        return {'canon': canon, 'type': 'invalid?'}
    return {'canon': canon, 'masked': ib.mask(canon), 'type': kind}
Пример #7
0
def check_isbns(isbns, return_value="last"):
    """Pick valid ISBNs out of a sequence of matches and mask them with '-'.

    Only the first element of each item in ``isbns`` is examined; items
    whose first element is neither an ISBN-10 nor an ISBN-13 are skipped.

    :param isbns: iterable of indexable items (e.g. regex match tuples).
    :param return_value: ``"last"`` returns the last valid ISBN,
        ``"first"`` the first one, ``"all"`` a list of all of them. Any
        other value yields ``None``.
    :returns: a masked ISBN string, ``None`` when nothing valid was found,
        or a (possibly empty) list for ``"all"``.
    """
    latest = None
    collected = []

    for match in isbns:
        candidate = match[0]
        if not (isbnlib.is_isbn10(candidate) or isbnlib.is_isbn13(candidate)):
            continue

        # An unknown mode is only noticed once a valid ISBN shows up.
        if return_value not in ("last", "first", "all"):
            return None

        masked = isbnlib.mask(candidate, separator='-')
        if return_value == "first":
            return masked
        if return_value == "last":
            latest = masked
        else:
            collected.append(masked)

    if return_value == "all":
        return collected
    return latest
def isbn2bibtex(isbn):
    """Return a BibTeX entry for ``isbn``, or a Finnish status message.

    The ISBN is first resolved to a record id with ``isbn2id``. If that
    yields ``False``, the Google Books service of isbnlib (``'goob'``) is
    tried instead; otherwise the entry is produced by ``id2bibtex``.
    Every lookup is attempted with the masked ISBN first and retried with
    the bare, hyphen-free ISBN if the first attempt raises.

    :param isbn: ISBN-10 or ISBN-13, with or without hyphens.
    :returns: BibTeX text, or a human-readable message when the input is
        not an ISBN or no source knows the book.
    """
    if not (isbnlib.is_isbn10(isbn) or isbnlib.is_isbn13(isbn)):
        return "Koodi ei ole ISBN-koodi :("

    bare_isbn = isbn.replace('-', '')

    # `except Exception` keeps the best-effort retry but no longer swallows
    # KeyboardInterrupt / SystemExit the way a bare `except:` does.
    try:
        bookID = isbn2id(isbnlib.mask(bare_isbn))
    except Exception:
        bookID = isbn2id(bare_isbn)

    # `== False` is kept on purpose so the comparison semantics of the
    # original check are unchanged.
    if bookID == False:  # noqa: E712
        print('Kokeillaan googlea')
        bibtex = bibformatters['bibtex']
        try:
            return bibtex(isbnlib.meta(bare_isbn, 'goob'))
        except Exception:
            return 'Kirjaa ei löydy Googlen eikä Finnan tietokannoista :('

    try:
        return id2bibtex(bookID, isbnlib.mask(bare_isbn))
    except Exception:
        return id2bibtex(bookID, bare_isbn)
Пример #9
0
 def extract_isbn(value):
     """Extract the single ISBN contained in ``value`` and return it masked.

     :param value: text expected to contain exactly one ISBN.
     :returns: the ISBN as formatted by ``isbnlib.mask``.
     :raises ValidationError: if no ISBN-like text can be extracted, if
         more than one candidate is found, or if the candidate is neither
         a valid ISBN-10 nor a valid ISBN-13.
     """
     try:
         isbns = isbnlib.get_isbnlike(value)
         isbn = isbns[0]
     except Exception as err:
         # Covers both an unusable input and an empty candidate list
         # (IndexError); the cause is chained for debugging.
         raise ValidationError(f"Bad format {value}") from err

     if len(isbns) > 1:
         raise ValidationError("Too much ISBN numbers")

     # Direct validity checks replace the former `not to_isbn13(isbn)`
     # truthiness test, which only worked because to_isbn13 returns an
     # empty string for invalid input.
     if not (isbnlib.is_isbn10(isbn) or isbnlib.is_isbn13(isbn)):
         raise ValidationError("It is not ISBN number")

     return isbnlib.mask(isbn)
Пример #10
0
def check_isbn(entry, try_fix):
    """Check the ISBN of an entry against its publication year.

    More information about ISBN:
    https://en.wikipedia.org/wiki/International_Standard_Book_Number

    Problems are reported through ``err_message``:

    * an ISBN-10 on an entry dated 2007 or later (returns ``False``);
    * a '978' ISBN-13 on an entry dated before 2007 (still returns
      ``True``);
    * a value that is no ISBN at all, unless it is the placeholder
      ``'TODO'`` (returns ``False``).

    :param entry: mapping with an ``'isbn'`` key and, normally, a
        ``'year'`` key.
    :param try_fix: accepted for interface compatibility; currently unused
        (see the note at the end of the function).
    :returns: ``True`` if the ISBN is acceptable, ``False`` otherwise,
        including when the year is missing or not an integer.
    """
    isbn_string = entry['isbn']

    is_isbn10 = isbnlib.is_isbn10(isbn_string)
    if not is_isbn10 and not isbnlib.is_isbn13(isbn_string):
        if isbn_string != 'TODO':
            err_message(entry, "Invalid ISBN {}".format(isbn_string))
        # TODO try to look up isbn using isbnlib.goom()
        # intitle:Understanding+inauthor:McLuhan&tbs=,
        #        cdr:1,cd_min:Jan+1_2+1964,cd_max:Dec+31_2+1974&num=10
        return False

    # Only the year parsing is guarded. The former bare `except:` around
    # the whole branch also hid programming errors, including the
    # misspelled `starstwith` that made every pre-2007 ISBN-13 fail.
    try:
        year = int(entry['year'])
    except (KeyError, TypeError, ValueError):
        return False

    if is_isbn10:
        if year >= 2007:
            err_message(entry,
                        ("ISBN10 ({}) were issued only before 2007,"
                         " year is actually {}").format(
                             isbn_string, entry['year']))
            return False
        return True

    if year < 2007 and isbn_string.startswith('978'):
        err_message(entry,
                    ("ISBN13 ({}) were issued only after 2007,"
                     " year is actually {}").format(
                         isbn_string, entry['year']))
    return True

    # NOTE(review): the previous version ended with a call to
    # `_fix_based_on_isbn(isbn_string, entry)` (when `try_fix` was set) and
    # `entry['isbn'] = isbnlib.mask(isbn_string)`, but every branch above
    # already returned, so that code never ran. It is left out rather than
    # enabled because enabling it would start mutating entries; confirm
    # the intended behaviour before restoring it.
Пример #11
0
def normalizeISBN(isbn):
    """
    Return ``isbn`` hyphenated by isbnlib, or unchanged if that fails.

    The input is reduced to its canonical form and then masked. If isbnlib
    raises ``NotValidISBNError`` along the way, the original string is
    returned as is.

    >>> normalizeISBN('978800105473-4')
    '978-80-01-05473-4'

    >>> normalizeISBN('80978800105473-4')
    '80978800105473-4'

    >>> normalizeISBN('988800105473-4')
    '988800105473-4'

    >>> normalizeISBN('978-80-254-94677')
    '978-80-254-9467-7'
    """
    try:
        bare = isbnlib.canonical(isbn)
        hyphenated = isbnlib.mask(bare)
    except isbnlib.NotValidISBNError:
        return isbn
    return hyphenated
Пример #12
0
    def generate(self):
        """Populate ``self.sfnt``, ``self.ctnt`` and ``self.reft``.

        ``self.dictionary`` must be filled in before this is called. It is
        passed through ``dict_cleanup`` and then ``encode_for_template``,
        and the result of each step is stored back on the instance. If it
        holds an ``'isbn'`` entry and ``isbnlib.mask`` returns a truthy
        value for it, the entry is replaced by that masked form.
        """
        self.dictionary = dict_cleanup(self.dictionary)
        self.dictionary = encode_for_template(self.dictionary)

        fields = self.dictionary
        if 'isbn' in fields:
            hyphenated = isbnlib.mask(fields['isbn'])
            # A falsy result means the ISBN could not be masked; keep the
            # stored value in that case.
            if hyphenated:
                fields['isbn'] = hyphenated

        self.sfnt = generator.sfn_template(self.dictionary)
        self.ctnt = generator.citation_template(
            self.dictionary, self.date_format)
        self.reft = generator.reference_tag(
            self.dictionary, self.sfnt, self.ctnt)
Пример #13
0
 def search(self, **kwargs):
     """Run the query and store its outcome on the instance.

     The request parameters start from ``noVariants='true'`` and are
     extended with ``kwargs``. Without an ISBN (``self.isbn is None``) the
     search is by ``titleStartsWith`` using ``self.querystring`` and the
     page count is computed as well; with an ISBN the search is by the
     '-'-masked ISBN and ``self.pages`` is left untouched.

     :returns: ``self``, with ``query_dict``, ``total``, ``results`` and
         ``mapping`` set.
     """
     query = {'noVariants': 'true'}
     self.query_dict = query
     query.update(kwargs)

     by_title = self.isbn is None
     if by_title:
         query['titleStartsWith'] = self.querystring
     else:
         query['isbn'] = isbnlib.mask(self.isbn, '-')

     payload = self.do_request(query)
     self.total = payload['data']['total']
     if by_title:
         self.pages = self.get_page_count(self.total)
     self.results = self.parse_results(payload)
     self.mapping = self.raw_mapping(payload)
     return self
Пример #14
0
def fix_isbn(prop, isbn_version, is_isbnversion):
    """
    Re-hyphenate the stored ISBNs of one property and list the invalid ones.

    For every row returned by ``get_isbn_list(prop)``:

    * if ``is_isbnversion`` accepts the stored value and its masked form
      differs from it, ``set_mask`` is called and its return value is
      added to the count of fixed hyphenations;
    * if ``is_isbnversion`` rejects the value, the pair ``(qid, value)``
      is recorded as wrong.

    :param prop: property identifier passed to ``get_isbn_list`` and
        ``set_mask``.
    :param isbn_version: label used only in the progress message.
    :param is_isbnversion: callable validating one ISBN string.
    :returns: list of ``(qid, isbn)`` tuples for the rejected values.
    """
    print(colored('\n== Fixing {}s =='.format(isbn_version), 'yellow'))

    rejected = []
    fixed_count = 0
    for row in get_isbn_list(prop):
        stored = row['isbn']['value']
        qid = get_qid(row['book']['value'])

        if not is_isbnversion(stored):
            rejected.append((qid, stored))
            continue

        hyphenated = isbnlib.mask(stored)
        if hyphenated != stored:
            fixed_count += set_mask(qid, prop, stored, hyphenated)

    print(colored('{} wrong ISBN hyphenation(s) fixed.'.format(fixed_count), 'blue'))
    return rejected
Пример #15
0
def is_valid(isbn_id):
    """
    Decide whether a string is a valid, conventionally written ISBN.

    Besides passing ``isbnlib.notisbn``, the string has to equal either
    its canonical form or the masked rendering of that canonical form.

    :param isbn_id: the isbn to be checked.
    :returns: boolean indicating whether the isbn is valid or not.

    >>> is_valid("978-3-16-148410-0")
    True

    >>> is_valid("9783161484100")
    True

    >>> is_valid("9783161484100aa")
    False

    >>> is_valid("abcd")
    False

    >>> is_valid("0136091814")
    True

    >>> is_valid("0136091812")
    False

    >>> is_valid("9780136091817")
    False

    >>> is_valid("123456789X")
    True
    """
    if isbnlib.notisbn(isbn_id):
        return False

    canonical_form = isbnlib.get_canonical_isbn(isbn_id)
    accepted_as_is = canonical_form == isbn_id
    return accepted_as_is or isbnlib.mask(canonical_form) == isbn_id
Пример #16
0
def normalize_isbn(val):
    """Return ``val`` as formatted by ``mask``."""
    masked = mask(val)
    return masked
Пример #17
0
def handle_isbn(val):
    """Return ``val`` as a masked ISBN-13, or ``None`` for a falsy ``val``.

    The value is converted to text, passed through ``to_isbn13`` and the
    result is formatted with ``mask``.
    """
    if not val:
        return None
    isbn13 = to_isbn13(str(val))
    return mask(isbn13)
Пример #18
0
def normalize_isbn(val):
    """Normalize an ISBN identifier by passing it through ``mask``."""
    normalized = mask(val)
    return normalized
Пример #19
0
 def isbn_mask(self):
     """Return ``self.isbn`` formatted by ``isbnlib.mask`` with '-' as separator."""
     hyphenated = isbnlib.mask(self.isbn, '-')
     return hyphenated
Пример #20
0
# Namespace prefix of every element looked up in the MARCXML tree.
_MARC_NS = '{http://www.loc.gov/MARC21/slim}'

# Characters tested against 008 positions 24-27; each one yields a boolean
# 'F008_2427_<char>' column.
_F008_2427_CODES = 'abcdefgijklmnopqrstuvwyz256'

# For 336/337/338: substrings tested against the joined $b codes; each one
# yields a boolean 'F33x_b_<flag>' column.
_TYPE_CODE_FLAGS = (
    ('336', ('txt', 'sti', 'cri', 'spw', 'tct')),
    ('337', ('c', 'h', 'n', 's')),
    ('338', ('cd', 'cr', 'hd', 'he', 'nb', 'sd')),
)

# 6xx tags that are harvested (when ind2 == "0"), mapped to the output
# list that receives their $a subfield.
_SUBJECT_A_TARGET = {
    '600': 'a', '610': 'a', '611': 'a', '630': 'a', '650': 'a',
    '651': 'z',
    '655': 'v',
}

# A 260/264 $b matching this pattern cancels the field's contribution to
# the F260_is / F264_is / F26x_is counters.
_NON_PUBLISHER_PATTERN = "printed by |distributed by |distributed in "


def _join_unique(values):
    """Join the distinct values with ' ;; ', keeping first-seen order."""
    return " ;; ".join(dict.fromkeys(values))


def _add_008_columns(row, value):
    """Copy the fixed positions of an 008 control field into ``row``."""
    cont_nature = value[24:28]
    row['F008_06'] = value[6]
    row['F008_0710'] = value[7:11]
    row['F008_1114'] = value[11:15]
    row['F008_1517'] = value[15:18]
    row['F008_22'] = value[22]
    for code in _F008_2427_CODES:
        row['F008_2427_' + code] = code in cont_nature
    row['F008_28'] = value[28]
    row['F008_33'] = value[33]
    row['F008_3537'] = value[35:38]
    row['F008_39'] = value[39]


def _add_isbn(isbn, isbn_list, isbn_tag_list):
    """Record a 020 $a value and its two leading mask groups, if valid.

    Accepted are 10-character ISBN-10s and 13-character ISBN-13s starting
    with "978"; for the latter the tag is taken from the ISBN-10 form.
    """
    if isbn is None:
        # Empty <subfield/> element: nothing to validate.
        return
    if len(isbn) == 10 and is_isbn10(isbn):
        masked = mask(isbn)
    elif len(isbn) == 13 and isbn[0:3] == "978" and is_isbn13(isbn):
        masked = mask(to_isbn10(isbn)) if mask(isbn) is not None else None
    else:
        return
    if masked is None:
        return
    isbn_list.append(isbn)
    isbn_tag_list.append('--'.join(masked.split("-")[0:2]))


def _add_imprint(subfields, names, dates, all_names, all_years):
    """Collect $b / $c of one 260 or 264 field.

    Returns how many $b values match ``_NON_PUBLISHER_PATTERN``.
    """
    non_publishers = 0
    for s in subfields:
        code = s.get('code')
        if code == 'b':
            names.append(s.text)
            all_names.append(s.text)
            if re.search(_NON_PUBLISHER_PATTERN, s.text.lower()):
                non_publishers += 1
        if code == 'c':
            dates.append(s.text)
            all_years.extend(re.findall(r"\d{4}", s.text))
    return non_publishers


def _rda_type_code(subfields, code_table, a_col, b_col):
    """Return the $b code of one 336/337/338 field whose $2 mentions 'rda'.

    $b is used when present; otherwise $a is translated through
    ``code_table`` (column ``a_col`` -> column ``b_col``). Returns ``None``
    when there is no usable code or no 'rda' source in THIS field, so a
    value can never leak in from an earlier field or record.
    """
    codes = [s.get('code') for s in subfields]
    b_value = None
    source = None
    if 'b' in codes:
        for s in subfields:
            if s.get('code') == 'b':
                b_value = s.text
            if s.get('code') == '2':
                source = s.text
    elif 'a' in codes:
        for s in subfields:
            if s.get('code') == 'a' and s.text in code_table[a_col].values:
                b_value = code_table.loc[code_table[a_col] == s.text,
                                         b_col].values[0]
            if s.get('code') == '2':
                source = s.text
    if b_value is None or source is None or "rda" not in source.lower():
        return None
    return b_value


def _marc_record_row(index, record, type_tables):
    """Build the feature dict for one <record> element."""
    row = {}

    leader = record.find(_MARC_NS + 'leader')
    row['leader_6'] = leader.text[6]
    row['leader_17'] = leader.text[17]
    row['leader_18'] = leader.text[18]

    # ---- control fields ----
    count_006 = 0
    count_007 = 0
    for c in record.findall(_MARC_NS + 'controlfield'):
        tag = c.get('tag')
        if tag == '001':
            row['F001_a'] = c.text
        elif tag == '006':
            count_006 += 1
        elif tag == '007':
            count_007 += 1
        elif tag == '008':
            _add_008_columns(row, c.text)
    row['006_is'] = 1 if count_006 > 0 else 0
    row['007_is'] = 1 if count_007 > 0 else 0

    # ---- data fields: accumulate ----
    isbn_list, isbn_tag_list = [], []
    f040_rda = False
    f041_count, f041_ind1, f041_a, f041_h = 0, None, [], []
    f050_a1, f050_a2 = [], []
    f082_a1, f082_a2 = [], []
    imprint = {'260': {'is': 0, 'b': [], 'c': []},
               '264': {'is': 0, 'b': [], 'c': []}}
    f26x_is, f26x_b, f26x_c = 0, [], []
    type_seen = {tag: 0 for tag in type_tables}
    type_codes = {tag: [] for tag in type_tables}
    f490_count, f490_a = 0, []
    subjects = {'a': [], 'v': [], 'y': [], 'z': []}

    for d in record.findall(_MARC_NS + 'datafield'):
        tag = d.get('tag')
        print("---------------------  " + str(index) + "---- " + tag)
        subfields = d.findall(_MARC_NS + 'subfield')

        if tag == '020':
            for s in subfields:
                if s.get('code') == 'a':
                    _add_isbn(s.text, isbn_list, isbn_tag_list)

        elif tag == '040':
            for s in subfields:
                if s.get('code') == 'e' and s.text in ('rda', 'RDA'):
                    f040_rda = True

        elif tag == '041':
            f041_count += 1
            f041_ind1 = d.get('ind1')
            for s in subfields:
                if s.get('code') == 'a':
                    f041_a.append(s.text)
                if s.get('code') == 'h':
                    f041_h.append(s.text)

        elif tag == '050':
            for s in subfields:
                if s.get('code') == 'a':
                    text = str(s.text)
                    letters = re.search(r'^[A-Z]{1,3}', text)
                    # [A-Za-z] instead of [A-z]: the latter also matches
                    # the punctuation between 'Z' and 'a'.
                    number = re.search(
                        r'^[A-Z]{1,3}[0-9]{1,}(?=\.|[A-Za-z]|$| )', text)
                    if letters and number:
                        f050_a1.append(letters.group())
                        f050_a2.append(number.group())

        elif tag == '082':
            for s in subfields:
                if s.get('code') == 'a':
                    digits = re.search(r'^[0-9]{3}', str(s.text))
                    if digits:
                        f082_a1.append(digits.group()[0])
                        f082_a2.append(digits.group())

        elif tag == '260' or (tag == '264' and d.get('ind2') == '1'):
            group = imprint[tag]
            # The field counts once, minus one per printer/distributor $b.
            kept = 1 - _add_imprint(subfields, group['b'], group['c'],
                                    f26x_b, f26x_c)
            group['is'] += kept
            f26x_is += kept

        elif tag in type_tables:
            type_seen[tag] += 1
            code = _rda_type_code(subfields, type_tables[tag],
                                  tag + '_a', tag + '_b')
            if code is not None:
                type_codes[tag].append(code)

        elif tag == '490':
            f490_count += 1
            for s in subfields:
                if s.get('code') == 'a':
                    f490_a.append(s.text)

        elif tag in _SUBJECT_A_TARGET and d.get('ind2') == "0":
            for s in subfields:
                code = s.get('code')
                if code == 'a':
                    code = _SUBJECT_A_TARGET[tag]
                if code in subjects:
                    subjects[code].append(clean_text(s.text))

    # ---- data fields: emit columns (insertion order = column order) ----
    if isbn_tag_list:
        row['isbn'] = _join_unique(isbn_list)
        row['isbn_tag'] = _join_unique(isbn_tag_list)
        row['isbn1'] = isbn_list[0]
        row['isbn_tag1'] = isbn_tag_list[0]
    else:
        row['isbn'] = "NA"
        row['isbn_tag'] = "NA"
        row['isbn1'] = "NA"
        row['isbn_tag1'] = "NA"

    row['F040_e'] = 1 if f040_rda else 0

    if f041_count > 0:
        row['F041_ind1'] = f041_ind1
        row['F041_a'] = " ;; ".join(f041_a)
        row['F041_h'] = " ;; ".join(f041_h)
    else:
        row['F041_ind1'] = "NA"
        row['F041_a'] = "NA"
        row['F041_h'] = "NA"

    row['F050_a1'] = _join_unique(f050_a1) if f050_a1 else "NA"
    row['F050_a2'] = _join_unique(f050_a2) if f050_a1 else "NA"

    row['F082_a1'] = _join_unique(f082_a1) if f082_a1 else "NA"
    row['F082_a2'] = _join_unique(f082_a2) if f082_a1 else "NA"

    f260, f264 = imprint['260'], imprint['264']
    row['F260_is'] = f260['is']
    if f260['is'] > 0:
        row['F260_b'] = " ;; ".join(f260['b'])
        row['F260_c'] = " ;; ".join(f260['c'])
    else:
        row['F260_b'] = "NA"
        row['F260_c'] = "NA"

    row['F264_is'] = f264['is']
    if f264['is'] > 0:
        # NOTE(review): the slice length is the combined 260/264 counter,
        # exactly as before; possibly the 264 counter was meant - confirm.
        row['F264_b'] = " ;; ".join(f264['b'][0:f26x_is])
        row['F264_c'] = " ;; ".join(f264['c'])
    else:
        row['F264_b'] = "NA"
        row['F264_c'] = "NA"

    row['F26x_is'] = f26x_is
    if f26x_is > 0:
        row['F26x_b'] = _join_unique(f26x_b[0:f26x_is])
        row['F26x_c'] = _join_unique(f26x_c)
    else:
        row['F26x_b'] = "NA"
        row['F26x_c'] = "NA"

    for tag, flags in _TYPE_CODE_FLAGS:
        column = 'F' + tag + '_b'
        if type_seen[tag] > 0:
            joined = " ;; ".join(type_codes[tag])
            row[column] = joined
            for flag in flags:
                row[column + '_' + flag] = flag in joined
        else:
            row[column] = "NA"
            for flag in flags:
                row[column + '_' + flag] = ""

    row['F490_a'] = " ;; ".join(f490_a) if f490_count > 0 else "NA"

    for code in ('a', 'v', 'y', 'z'):
        values = subjects[code]
        row['F6xx_' + code] = _join_unique(values) if values else "NA"

    return row


def marcxml_parsing(x):
    """Parse a MARCXML collection into a DataFrame with one row per record.

    For every child of the document root the function extracts selected
    leader positions, control fields (001, 006, 007, 008) and data fields
    (020, 040, 041, 050, 082, 260, 264, 336-338, 490 and the 6xx subject
    fields with ind2 == "0"). Multi-valued columns are joined with
    ' ;; '; columns without data hold "NA". A progress line is printed
    for every record and every data field.

    Differences from the previous implementation:

    * 336/337/338 codes are taken only from the field being processed. A
      field without a usable $b/$a or without an 'rda' $2 is skipped
      instead of reusing values from an earlier field or record (or
      failing with an unbound variable).
    * An empty 020 $a is skipped instead of raising ``TypeError``.
    * De-duplicated columns keep first-seen order instead of arbitrary
      ``set`` order, so repeated runs give identical output.
    * The 050 look-ahead uses ``[A-Za-z]`` rather than ``[A-z]``.

    :param x: path or file object accepted by ``ElementTree.parse``.
    :returns: ``pandas.DataFrame`` built from the per-record dicts.

    The lookup tables are read from ``./raw_data/33{6,7,8}_code.csv``
    relative to the current working directory.
    """
    tree = ElementTree.parse(x)
    collection = tree.getroot()

    type_tables = {
        '336': pd.read_csv("./raw_data/336_code.csv"),
        '337': pd.read_csv("./raw_data/337_code.csv"),
        '338': pd.read_csv("./raw_data/338_code.csv"),
    }

    features = []
    for i, record in enumerate(collection):
        print("---------------------  " + str(i))
        features.append(_marc_record_row(i, record, type_tables))

    return pd.DataFrame(features)
def com_isbn_mastk(isbn_string, isbn_seperator='-'):
    """
    Return ``isbn_string`` masked (hyphenated) by ``isbnlib.mask``.

    ``isbn_seperator`` is forwarded to ``isbnlib.mask`` as its
    ``separator`` keyword argument; it defaults to ``'-'``.
    """
    masked_isbn = isbnlib.mask(isbn_string, separator=isbn_seperator)
    return masked_isbn
Пример #22
0
def get_default_isbn(isbn_list):
    """Return the masked form of the first valid ISBN in ``isbn_list``.

    Entries are examined in order. The first one accepted by
    ``isbnlib.is_isbn13`` or ``isbnlib.is_isbn10`` is passed through
    ``isbnlib.mask`` and returned. When no entry qualifies (including an
    empty ``isbn_list``) an empty string is returned.
    """
    for candidate in isbn_list:
        if not (isbnlib.is_isbn13(candidate) or isbnlib.is_isbn10(candidate)):
            # Not a valid ISBN-13 or ISBN-10: keep looking.
            continue
        return isbnlib.mask(candidate)
    return ""
Пример #23
0
#!/usr/bin/env python

import sys
import os
import yaml
import isbnlib

# Usage: <script> METAFILE KEY [mask]
#
# Reads the YAML file METAFILE, collects the ISBN-13 of every entry under
# "identifier" that carries a "key", and prints the ISBN stored under KEY.
# If KEY is unknown, a built-in fallback ISBN is printed instead. When the
# third argument is "mask" the ISBN is printed in masked (hyphenated) form.
metafile = sys.argv[1]
# Context manager so the file handle is closed as soon as it has been read.
with open(metafile, 'r') as metafile_handle:
    metadata = metafile_handle.read()

# NOTE: the original called yaml.load() without a Loader. That form can
# construct arbitrary Python objects from the input, is deprecated since
# PyYAML 5.1 and raises TypeError on PyYAML 6+. safe_load() only builds plain
# data, which is all this script reads. An empty document loads as None, so
# fall back to an empty mapping.
yamldata = yaml.safe_load(metadata) or {}

identifier = {}

if "identifier" in yamldata:
    # "identifier:" with no value loads as None; treat it as no entries.
    for entry in yamldata["identifier"] or ():
        if "key" not in entry or "text" not in entry:
            continue
        # str(): YAML parses an unquoted all-digit ISBN as an int, while
        # get_isbnlike() needs text to search in.
        candidates = isbnlib.get_isbnlike(str(entry["text"]))
        if not candidates:
            # Nothing ISBN-like in this entry; skip it instead of failing
            # with IndexError on the [0] lookup.
            continue
        isbnlike = candidates[0]
        if isbnlib.is_isbn13(isbnlike):
            identifier[entry["key"]] = isbnlib.EAN13(isbnlike)

# Fallback ISBN used when the requested key has no valid ISBN-13.
isbn = identifier.get(sys.argv[2], "9786056644504")

if len(sys.argv) >= 4 and sys.argv[3] == "mask":
    print(isbnlib.mask(isbn))
else:
    print(isbn)
Пример #24
0
def handle_isbn(val):
    """Return ``val`` as a masked ISBN-13.

    ``val`` is converted to ``str``, passed through ``to_isbn13`` and the
    result is handed to ``mask``.
    """
    as_text = str(val)
    isbn13 = to_isbn13(as_text)
    return mask(isbn13)
Пример #25
0
 def td_format(self, content):
     """Return ``content`` passed through ``mask``, or unchanged on failure.

     Best-effort formatting: if ``mask`` raises for this value, the
     original ``content`` is returned as-is.
     """
     try:
         return mask(content)
     except Exception:
         # Deliberately broad (any masking error falls back to the raw
         # value), but no longer a bare ``except:`` so KeyboardInterrupt
         # and SystemExit still propagate.
         return content
Пример #26
0
    # Rewrite the ISBN of every item in the batch to its masked (hyphenated)
    # form and push the changed items back through ``zot.update_item``.
    for item in batch:
        try:
            isbn = item['data']['ISBN']
        except KeyError:
            # Not a book-ish item
            continue

        if not isbn:
            # No ISBN listed
            continue

        # Transform to canonical (bare) form, then return to standardised
        # form with hyphens.
        canisbn = get_canonical_isbn(isbn.replace(' ', ''))

        if not canisbn:
            # This most likely means that the ISBN given is bogus
            # (e.g. has a faulty checksum). Some books have bogus
            # ISBNs printed on them, so they are used for cataloguing
            # by some libraries despite being formally invalid.
            print("Error extracting ISBN from "+str(isbn))
            continue

        newisbn = mask(canisbn)
        if newisbn != isbn:
            # Explicit check instead of a bare ``assert``: asserts are
            # stripped under ``python -O``, which would let an empty ISBN be
            # written back to Zotero. The exception type is kept so existing
            # error handling sees the same failure.
            if not newisbn:
                raise AssertionError(
                    "mask() produced no result for "+str(canisbn))
            print("Updating "+str(isbn)+" to "+str(newisbn))
            item['data']['ISBN'] = newisbn
            if not zot.update_item(item):
                raise Exception("Zotero write failed for "+str(isbn))
Пример #27
0
 def isbn_str(self):
     """Return ``self._isbn`` masked by ``isbnlib.mask``.

     An empty string is returned when ``self._isbn`` is falsy.
     """
     return isbnlib.mask(self._isbn) if self._isbn else ''