def _validate_book_id(self):
    """Validate ``self.book_id`` according to ``self.book_id_type``.

    For 'ISBN' ids both ``self.isbn10`` and ``self.isbn13`` are populated
    via pyisbn; for 'ASIN' ids ``self.asin`` is set after a format check.

    Raises:
        ValidationError: if the id is invalid or the type is not allowed.
    """
    # TODO: add check constraints
    # Validate book identifier
    if self.book_id_type == 'ISBN':
        if pyisbn.validate(self.book_id):
            # Keep both forms; pyisbn.convert switches 10 <-> 13 digits.
            if len(self.book_id) == 10:
                self.isbn10 = self.book_id
                self.isbn13 = pyisbn.convert(self.book_id)
            else:
                self.isbn13 = self.book_id
                self.isbn10 = pyisbn.convert(self.book_id)
        else:
            raise ValidationError(
                {'book_id': '{} is an invalid ISBN'.format(self.book_id)})
    elif self.book_id_type == 'ASIN':
        # ASINs are exactly 10 uppercase alphanumeric characters.
        regex = r"[A-Z0-9]{10}"
        # Remove whitespaces
        self.book_id = self.book_id.strip()
        if len(self.book_id) == 10 and re.fullmatch(regex, self.book_id):
            self.asin = self.book_id
        else:
            raise ValidationError(
                {'book_id': '{} is an invalid ASIN'.format(self.book_id)})
    else:
        raise ValidationError({
            'book_id_type': 'Allowed Book Id types: {}'.format(self.allowed_book_id_types)
        })
def combine_title():
    """Merge non-empty titles from isbn2title2.pickle into isbn2title4.pickle.

    Entries with an empty title are dropped from the base mapping, keys from
    the secondary mapping are normalised to ISBN-13 before merging, and the
    combined mapping is written back to isbn2title4.pickle.
    """
    import pyisbn
    with open('isbn2title4.pickle', 'rb') as f:
        data = pickle.load(f)
    print(len(data))
    # Drop entries whose title is empty.
    pop_key = [isbn for isbn in data if data[isbn] == '']
    for key in pop_key:
        data.pop(key, None)
    print(len(data))
    with open('isbn2title2.pickle', 'rb') as f:
        data2 = pickle.load(f)
    print(len(data))
    for isbn in data2:
        orig_isbn = isbn
        if not isinstance(isbn, str):
            # keys may be numeric (e.g. from a spreadsheet import)
            isbn = str(int(isbn))
        if len(isbn) not in (10, 13):
            continue
        if len(isbn) != 13:
            try:
                isbn13 = pyisbn.convert(isbn)
            except Exception:
                # was a bare except:; skip inconvertible ISBNs
                continue
        else:
            isbn13 = isbn
        # Only add new keys with a non-empty title.
        if isbn13 not in data and len(str(data2[orig_isbn])) != 0:
            data[isbn13] = data2[orig_isbn]
    print(len(data))
    with open('isbn2title4.pickle', 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
def parse_isbn(isbn):
    """Produce both the 10- and 13-digit forms of an ISBN.

    If conversion fails, the original input is returned under both keys.

    :param isbn: the raw ISBN value
    """
    try:
        first = pyisbn.convert(str(isbn))
        second = pyisbn.convert(first)
    except pyisbn.IsbnError:
        return {'isbn10': isbn, 'isbn13': isbn}
    return {'isbn%d' % len(first): first, 'isbn%d' % len(second): second}
def __init__(self, line=None, period=None, metric=None):
    """Initialise a CounterBook record.

    :param line: optional raw report row; column 3 holds the ISBN and
        column 4 the ISSN
    :param period: reporting period, passed through to the base class
    :param metric: metric name, passed through to the base class
    """
    super(CounterBook, self).__init__(line, period, metric)
    if line is not None:
        # Normalise the ISBN: strip whitespace and hyphens, upgrade 10 -> 13.
        self.isbn = line[3].strip().replace('-', '')
        if len(self.isbn) == 10:
            self.isbn = pyisbn.convert(self.isbn)
        self.issn = line[4].strip()
    # NOTE(review): eissn appears to be unconditional (mirrors the extended
    # __init__ elsewhere in this file) — confirm against the original layout.
    self.eissn = None
def __init__(self, *args, **kwargs):
    """Create a Book, automatically generating ``key_name`` from the ISBN.

    When ``isbn13`` is not supplied it is derived from ``isbn10``.
    """
    # automatically generate key_name
    isbn10 = kwargs.get("isbn10")
    if "isbn13" in kwargs:
        isbn13 = kwargs["isbn13"]
    else:
        # BUG FIX: the previous dict.get(default) form converted isbn10
        # eagerly, doing needless work (and possibly raising IsbnError)
        # even when isbn13 was already supplied.
        isbn13 = pyisbn.convert(isbn10) if isbn10 else None
    if isbn13:
        kwargs["key_name"] = "book:%s" % isbn13
    super(Book, self).__init__(*args, **kwargs)
    self.links = LinkAttribute(self)
def normalize(element, version='13'):
    """Return *element* as an ISBN of the requested *version* ('10' or '13').

    The value is round-tripped through pyisbn so invalid checksums raise,
    exactly as before.

    :raises ValueError: for an unknown version or a length other than 10/13
    """
    element = element.replace('-', '')
    if version not in ('10', '13'):
        raise ValueError('version can only be one of 10 or 13')
    size = len(element)
    if size not in (10, 13):
        raise ValueError('ISBN is neither 10 or 13 chars long')
    if version == '13':
        if size == 10:
            return pyisbn.convert(element)
        # already 13 digits: double-convert to force checksum validation
        return pyisbn.convert(pyisbn.convert(element))
    # version == '10'
    if size == 10:
        # double-convert to force checksum validation
        return pyisbn.convert(pyisbn.convert(element))
    return pyisbn.Isbn13(element).convert()
def parse_isbn(raw):
    """Return the ISBN in 10-digit, 13-digit and hyphenated forms.

    Every key falls back to the raw input whenever conversion or
    hyphenation fails.

    :param raw: the raw ISBN value
    """
    a, b = raw, raw
    isbn = {
        'isbn10': raw,
        'isbn13': raw,
        'isbn10-hyphen': raw,
        'isbn13-hyphen': raw
    }
    with ignores(pyisbn.IsbnError):
        a = pyisbn.convert(raw)
        b = pyisbn.convert(a)
        # BUG FIX: update() keeps the '-hyphen' fallback entries; the
        # previous code rebuilt the whole dict here and silently dropped
        # them, so a later hyphenation failure left those keys missing.
        isbn.update({'isbn%d' % len(i): i for i in [a, b]})
    with ignores(isbn_hyphenate.IsbnMalformedError):
        isbn['isbn10-hyphen'] = isbn_hyphenate.hyphenate(isbn['isbn10'])
        isbn['isbn13-hyphen'] = isbn_hyphenate.hyphenate(isbn['isbn13'])
    return isbn
def get_queryset(self):
    """Free-text search across books and related models.

    Matches the 'query' parameter against id, title, publisher, creator,
    subject, ISBN (normalised to 13 digits), language, doe and place.
    Returns a (possibly duplicated) list of Book instances.
    """
    query = self.request.query_params.get('query', None)
    if not query:
        return []
    results = []
    # by id
    try:
        _id = int(query)
    except ValueError:
        # query is not a numeric id; was a bare except:
        pass
    else:
        results += Book.objects.filter(id=_id)
    # by title
    results += Book.objects.filter(title__icontains=query)
    # by publisher name
    for publisher in Publisher.objects.filter(name__icontains=query):
        results += publisher.books.all()
    # by creator name
    for creator in Creator.objects.filter(name__icontains=query):
        results += creator.books.all()
    # by subject
    for subject in Subject.objects.filter(name__icontains=query):
        results += subject.books.all()
    # by isbn
    try:
        # clean the query isbn first
        _isbn = query.replace('-', '')
        if len(_isbn) == 10:
            # convert to isbn 13
            _isbn = pyisbn.convert(_isbn)
        results += Book.objects.filter(isbn_clean__icontains=_isbn)
    except Exception:
        # best-effort ISBN lookup; was a bare except:
        pass
    # by lang
    results += Book.objects.filter(lang__icontains=query)
    # by doe
    results += Book.objects.filter(doe__icontains=query)
    # by place
    results += Book.objects.filter(place__icontains=query)
    return results
def to_python(self, value):
    """Normalise an ISBN string to its 13-digit form.

    Hyphens are stripped; 10-digit values are converted to 13 digits.

    :raises ValidationError: on a bad length or checksum
    """
    value = value.replace("-", "")
    length = len(value)
    if length not in (10, 13):
        raise ValidationError("ISBN has to be either 10 or 13 digits long")
    if not Isbn(value).validate():
        raise ValidationError("ISBN did not validate")
    return pyisbn.convert(value) if length == 10 else value
def search(request):
    """Search endpoint: look up Sources by title and books by ISBN.

    Reads the 'title' and 'isbn' query parameters; local Source matches and
    Google Books API matches are collected into ``results``.
    """
    results = {
        'sources': {},
        'error': {},
    }
    '''
    more: false,
    results: [
        { text: "Western", children: [
            { id: "CA", text: "California" },
            { id: "AZ", text: "Arizona" }
        ] },
        { text: "Eastern", children: [
            { id: "FL", text: "Florida" }
        ] }
    ]
    '''
    title = request.QUERY_PARAMS.get('title', False)
    if title:
        sources = Source.objects.filter(title__icontains=title)
        source_serializer = SourceSerializer(sources, many=True)
        results['sources']['spuqi'] = source_serializer.data
    isbn_str = request.QUERY_PARAMS.get('isbn', False)
    if isbn_str:
        try:
            # Constructing an Isbn object acts as a syntax check.
            pyisbn.Isbn(isbn_str)
        except pyisbn.IsbnError:
            results['error'] = {
                'message': _('ISBN number must contain only digit-numbers'),
            }
        else:
            isbn_number = pyisbn.convert(isbn_str)
            if not pyisbn.validate(isbn_number):
                results['error'] = {
                    'message': _('A valid ISBN number is required')
                }
            # NOTE(review): the Google Books call below runs even when the
            # converted ISBN failed validation above — confirm intended.
            #try:
            #    if googlebooks_api.list('isbn:%s' % isbn)['totalItems'] > 0:
            #        results['sources']['googlebooks'] = googlebooks_api.list(
            #            'isbn:%s' % isbn)['items']
            results['sources']['googlebooks'] = googlebooks_api.list(
                'isbn:%s' % isbn_number)
            # except ConnectionError:
            #     results.errors.append(_('Could not connect to Google Book API'))
    return Response({"results": results})
def parse_isbns(s):
    """Find all unique ISBNs in *s* and return them as 13-digit strings.

    10-digit candidates are converted to ISBN-13; conversion failures are
    logged and skipped. Candidates of any other length are ignored.

    :param s: arbitrary text possibly containing ISBNs
    :return: list of unique ISBN-13 strings
    """
    # BUG FIX: the docstring used to sit after the import, where it is just
    # a discarded string literal rather than the function's docstring.
    import pyisbn
    pattern = re.compile('[0-9X-]{10,25}')
    isbns = set()
    for candidate in pattern.findall(s):
        candidate = candidate.replace('-', '').replace(' ', '')
        if len(candidate) == 10:
            try:
                isbns.add(pyisbn.convert(candidate))
            except pyisbn.IsbnError as err:
                logger.error('%s: %s' % (s, err))
        elif len(candidate) == 13:
            isbns.add(candidate)
    return list(isbns)
def __init__(self, line=None, period=None, metric=None, month_data=None,
             title="", platform="", publisher="", isbn=None, issn=None):
    """Initialise a CounterBook record.

    :param line: optional raw report row; column 3 holds the ISBN and
        column 4 the ISSN
    :param period: reporting period, passed through to the base class
    :param metric: metric name, passed through to the base class
    :param month_data: per-month usage data, passed through
    :param title: book title, passed through
    :param platform: platform name, passed through
    :param publisher: publisher name, passed through
    :param isbn: explicit ISBN; overrides the value parsed from ``line``
    :param issn: explicit ISSN; overrides the value parsed from ``line``
    """
    super(CounterBook, self).__init__(line, period, metric, month_data,
                                      title, platform, publisher)
    self.eissn = None
    if line is not None:
        # Normalise the ISBN: strip whitespace and hyphens, upgrade 10 -> 13.
        self.isbn = line[3].strip().replace('-', '')
        if len(self.isbn) == 10:
            self.isbn = pyisbn.convert(self.isbn)
        self.issn = line[4].strip()
    # Explicit keyword arguments take precedence over parsed values.
    if isbn is not None:
        self.isbn = isbn
    if issn is not None:
        self.issn = issn
def combine_ratings():
    """Merge ratings from isbn2rating3.pickle into isbn2rating4.pickle.

    Empty ratings are dropped, remaining ratings coerced to float, keys
    from the secondary mapping normalised to ISBN-13, and the merged
    mapping written back to isbn2rating4.pickle.
    """
    import pyisbn
    with open('isbn2rating4.pickle', 'rb') as f:
        data = pickle.load(f)
    print(len(data))
    # Drop entries with an empty rating, then coerce the rest to float.
    pop_key = [isbn for isbn in data if data[isbn] == '']
    for key in pop_key:
        data.pop(key, None)
    for isbn in data:
        data[isbn] = float(data[isbn])
    print(len(data))
    with open('isbn2rating3.pickle', 'rb') as f:
        data2 = pickle.load(f)
    print(len(data))
    for isbn in data2:
        orig_isbn = isbn
        if not isinstance(isbn, str):
            # keys may be numeric (e.g. from a spreadsheet import)
            isbn = str(int(isbn))
        if len(isbn) not in (10, 13):
            continue
        if len(isbn) != 13:
            try:
                isbn13 = pyisbn.convert(isbn)
            except Exception:
                # was a bare except:; skip inconvertible ISBNs
                continue
        else:
            isbn13 = isbn
        # Only add new keys with a non-empty rating.
        if isbn13 not in data and len(str(data2[orig_isbn])) != 0:
            data[isbn13] = float(data2[orig_isbn])
    print(len(data))
    with open('isbn2rating4.pickle', 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
def get_by_isbn(cls, isbn, create=True, defaults=None):
    """Fetch (or create) a Book by ISBN, memcache-backed.

    :param isbn: 10- or 13-digit ISBN; always normalised to ISBN-13
    :param create: when True, insert the book if it does not exist yet
    :param defaults: extra fields for a newly created book; note this dict
        is mutated in place (``isbn13`` is added) — caller beware
    :raises ValueError: if the ISBN fails validation
    :return: the Book instance, or None when not found and create=False
    """
    if len(isbn) == 10:
        # NOTE: always use ISBN-13
        # this should also raise an exception if isbn is invalid
        isbn = pyisbn.convert(isbn)
    if not pyisbn.validate(isbn):
        raise ValueError("Invalid ISBN: %s" % isbn)
    memkey = "book:%s" % isbn
    book = memcache.get(memkey)
    if book:
        return book
    if not create:
        book = Book.get_by_key_name(memkey)
    else:
        defaults = defaults or {}
        defaults["isbn13"] = isbn
        book = Book.get_or_insert(memkey, **defaults)
    if book:
        # refresh the cache entry on every datastore hit
        memcache.set(book.memkey, book, Book.MEMCACHE_TTL)
    return book
def convert(isbn):
    """Return the ISBN-13 form of *isbn*; 13-character values pass through."""
    if len(isbn) == 13:
        return isbn
    return pyisbn.convert(isbn)
isbns_per_rule = 100 # DO NOT change the value of any variables below this line! start_string = "t.ISSN IN ('" join_string = "','" end_string = "')\n" isbns = set() rules = [] isbnfile = open(isbn_filename, 'r') for line in isbnfile: isbn = line.strip() isbns.add(isbn) if len(isbn) == 10: isbns.add(pyisbn.convert(isbn)) isbnfile.close() temp_list = [] for isbn in isbns: temp_list.append(isbn) # batch isbns into strings if len(temp_list) >= isbns_per_rule: rules.append(start_string + join_string.join(temp_list) + end_string) temp_list = [] # batch any remainder into a final string if temp_list:
def test_convert_invalid():
    # A 979-prefixed ISBN-13 has no ISBN-10 equivalent, so convert() must
    # raise with the documented message.
    with raises(IsbnError, match='Only ISBN-13s with 978 Bookland code can be '
                'converted to ISBN-10.'):
        convert('9790000000001')
title = line.strip().replace(isbn, '').strip() testisbns.append(isbn) print "Success!" isbns = [] for testisbn in testisbns: isbns.append(str(testisbn.strip().replace('-', ''))) print len(isbns) urlstup = [] for isbn in isbns: isbnraw = isbn[:] if len(isbn) == 10: pass elif len(isbn) == 13: try: isbn = pyisbn.convert(isbn) except pyisbn.IsbnError as e: print "Invalid ISBN: ", isbn continue else: print "Length of ISBN not 13 or 10: ", isbn continue isbn_title_author = (isbnraw, isbn, title, author) urlstup.append(fullurl) print len(urls) urls = [] for isbn in isbns: if len(isbn) == 10: pass elif len(isbn) == 13: try:
soup = BeautifulSoup(review, 'html.parser') review = ''.join(soup.findAll(text=True)) review = review.strip() if review == "": continue else: pass reviews_parsed.append(review) reviews_tup.append(reviews_parsed) reviews_dict = dict() reviews_dict["reviews"] = reviews_parsed reviews_dict["isbn"] = row[0] reviews_dict["avg_rating"] = float(row[2].split()[0]) reviews_dicts.append(reviews_dict) reviews_dict = dict() reviews_dict["reviews"] = reviews_parsed reviews_dict["isbn"] = pyisbn.convert(row[0]) reviews_dict["avg_rating"] = float(row[2].split()[0]) reviews_dicts.append(reviews_dict) print "total rows read {}".format(len(results1)) print "len reviews_tup {}".format(len(reviews_tup)) print "time took to process: {}".format(time.time() - start) books = dict() books["books"] = reviews_dicts print type(books) with open('isbn_avgrating_reviews.json', 'w') as f: for chunk in json.JSONEncoder().iterencode(books): f.write(chunk)
def isbn_10(self):
    """Return the 10-digit form of this record's ISBN."""
    isbn = self.isbn
    return isbn if len(isbn) == 10 else pyisbn.convert(isbn)
def find_books(self, direct_search_only=False):
    """
    Lookup book details on google books.

    Utilising book info web services to look up related isbn or title.

    :ISBN lookup: First lookup the ISBN on google books; if that yielded no
        results, try Amazon. Unless *direct_search_only* is set, fall back
        to related-ISBN lookup (xISBN / LibraryThing) and finally to a
        title lookup derived from the ISBN.
    :Title lookup: If provided, books will also be looked up on google
        books by title.

    :param direct_search_only: skip the related-ISBN / title fallbacks
    :return: de-duplicated list of found books, sorted by title
    """
    found_books = []
    lookup_results = []
    errors = []
    found_books += self.lookup_by_title()
    if bool(self.isbns):
        # NOTE(review): lookup_results is never reset between ISBNs, so
        # results accumulate across iterations and the "if not
        # lookup_results" fallbacks only ever fire for the first ISBN —
        # confirm whether this is intended.
        for isbn in self.isbns:
            try:
                isbn = isbn.strip()
                # first search on google books
                lookup_results += self.search_google_books(isbn, None)
                # if not on google, search amazon
                if not lookup_results:
                    lookup_results += self.search_amazon(isbn)
                # if dirct_search_only is not set, then try related
                # and title search
                if not direct_search_only and not lookup_results:
                    lookup_results += self.lookup_by_related_isbn(isbn)
                    if not lookup_results:
                        lookup_results = self.lookup_by_title(
                            self.lookup_title(isbn))
                # ensure that all results have the targeted isbn
                # pylint: disable=C0200
                for i in range(len(lookup_results)):
                    if len(isbn) == 13:
                        lookup_results[i].isbn13 = isbn
                        lookup_results[i].isbn10 = pyisbn.convert(isbn)
                    else:
                        lookup_results[i].isbn10 = isbn
                        lookup_results[i].isbn13 = pyisbn.convert(isbn)
                found_books += lookup_results
            except Exception as err:
                # collect per-ISBN failures and keep going
                errors.append(
                    time.strftime('%Y-%m-%d_%H:%M:%S - ') + str(err))
                continue
    found_books = sorted(set(found_books), key=lambda b: b.title)
    if errors:
        flash(errors)
    return found_books
def convert_entry(self, entry):
    """Convert one import row into a Goodreads-style CSV tuple.

    Maps the source columns (via ``self.headers``) onto the Goodreads
    export fields; derives both ISBN forms, splits authors, and resolves
    shelf/rating/review fields depending on whether the row is a wishlist
    entry (has a Priority column) or a bookshelf entry.

    :param entry: dict of raw column values for one book
    :return: 15-tuple in Goodreads CSV column order
    """
    ISBN, TITLE, AUTHOR, FORMAT = 'ISBN', 'Title', 'Author', 'Format'
    NUM_OF_PAGES, PRIVATE_NOTE = 'Number of pages', 'Private Note'
    PUBLISHER, PUB_DATE = 'Publisher', 'Publication date'
    COMMENT_TITLE, COMMENT_CONTENT = 'Comment title', 'Comment content'
    STATUS, STARS = 'Status', 'Stars'
    PRIORITY = 'Priority'
    TAGS = 'Tags'
    # Re-bind each constant to the source file's actual header name.
    ISBN = self.headers[ISBN]
    TITLE = self.headers[TITLE]
    AUTHOR = self.headers[AUTHOR]
    FORMAT = self.headers[FORMAT]
    NUM_OF_PAGES = self.headers[NUM_OF_PAGES]
    PUBLISHER = self.headers[PUBLISHER]
    PUB_DATE = self.headers[PUB_DATE]
    PRIVATE_NOTE = self.headers[PRIVATE_NOTE]
    COMMENT_TITLE = self.headers[COMMENT_TITLE]
    COMMENT_CONTENT = self.headers[COMMENT_CONTENT]
    STATUS = self.headers[STATUS]
    STARS = self.headers[STARS]
    PRIORITY = self.headers[PRIORITY]
    TAGS = self.headers[TAGS]
    title = entry.get(TITLE)
    author, additional_authors = None, None
    if AUTHOR in entry:
        # First author goes into 'author'; the rest are joined into
        # 'additional_authors'.
        all_authors = list(map(str.strip, entry[AUTHOR].split(',')))
        if len(all_authors) > 0:
            author = all_authors[0]
        if len(all_authors) > 1:
            additional_authors = ', '.join(all_authors[1:])
    isbn13 = entry.get(ISBN)
    isbn10 = None
    if isbn13:
        # NOTE(review): [1:-1] presumably strips surrounding quote
        # characters from the raw field — confirm against the input format.
        isbn13 = isbn13[1:-1]
        try:
            isbn10 = pyisbn.convert(isbn13)
            # If the source value was actually 10 digits, swap so the
            # variables match their names.
            if len(isbn13) == 10 and len(isbn10) == 13:
                isbn13, isbn10 = isbn10, isbn13
        except pyisbn.IsbnError:
            # ignore inconvertible ISBNs
            pass
    publisher = entry.get(PUBLISHER)
    binding = entry.get(FORMAT)
    num_of_pages = entry.get(NUM_OF_PAGES)
    year_published = entry.get(PUB_DATE)
    if year_published:
        # strip surrounding quotes and normalise the date separator
        year_published = year_published[1:-1].replace('-', '/')
    private_notes = self._convert_linebreak(entry.get(PRIVATE_NOTE))
    # wishlist
    if PRIORITY in entry:
        bookshelves = ['to-read']
        my_rating = my_review = date_read = date_added = None
    # bookshelve
    else:
        my_rating = entry.get(STARS)
        my_review = self._convert_comment(
            entry.get(COMMENT_TITLE), entry.get(COMMENT_CONTENT))
        tags = entry.get(TAGS)
        status = entry.get(STATUS)
        date_read, date_added, bookshelves = self._convert_status(status, tags)
        if len(bookshelves) == 0:
            logging.warning('cannot parse %s: %s', title, status)
    if self.only_isbn:
        # ISBN-only export: blank out all descriptive fields.
        title = ''
        author = ''
        additional_authors = ''
        publisher = ''
        binding = ''
        num_of_pages = ''
        year_published = ''
    return (title, author, additional_authors, isbn10, isbn13, my_rating,
            publisher, binding, num_of_pages, year_published, date_read,
            date_added, ','.join(bookshelves), my_review, private_notes)
def test_convert(isbn):
    # Converting twice must round-trip back to the original value.
    round_tripped = convert(convert(isbn))
    assert round_tripped == isbn
def test_convert(isbn):
    # Double conversion round-trips, modulo stripped hyphens.
    round_tripped = convert(convert(isbn))
    expect(round_tripped) == isbn.replace('-', '')
def post(request):
    """
    receives a json file and creates book objects
    :param request: the HTTP request carrying 'json_file'
    :return: 400 when no file was uploaded, otherwise {"status": "done"}
    """
    json_file = request.FILES.get('json_file', None)
    if not json_file:
        return Response({"error": "No files"},
                        status=status.HTTP_400_BAD_REQUEST)
    # open the json file, decode as utf-8 and parse it
    # (with-statement fixes the file handle leak of the original)
    with open(json_file.temporary_file_path(), 'r', encoding='utf-8') as f:
        books = json.loads(f.read())
    # used to show progress
    counter = 0
    for book in books:
        # show progress
        if counter % 10000 == 0:
            print(counter, len(books))
        try:
            try:
                # if the book exists don't go through converting json
                Book.objects.get(id=book['book_id'])
            except Book.DoesNotExist:
                print("new book")
                title = book['title']
                book_id = book['book_id']
                isbn = book['isbn']
                image = book['image_link']
                pdf = book['pdf_link']
                page_count = book['pages']
                edition = book['edition']
                count = book['count']
                lang = book['lang']
                doe = book['doe']
                place = book['place']
                issue_date_str = book['issue_date']
                volume = book['volume']
                # clean isbn does not have '-' and is converted to isbn 13
                isbn_clean = book['isbn'].replace('-', '')
                # BUG FIX: was `len(isbn_clean == 10)` which raised
                # TypeError (and thus the outer Exception) for every
                # new book.
                if len(isbn_clean) == 10:
                    try:
                        isbn_clean = pyisbn.convert(isbn_clean)
                    except Exception:
                        # keep the 10-digit form when conversion fails
                        pass
                try:
                    # issue date might be blank or in wrong format
                    date = issue_date_str.split('/')
                    date[0] = '13' + date[0]
                    issue_date = jdatetime.date(int(date[0]), int(date[1]),
                                                int(date[2]))
                except Exception:
                    # if no valid issue_date found, set that to None
                    issue_date = None
                try:
                    # price might be blank or in the wrong format
                    price = int(book['price'])
                except Exception:
                    # set None if no valid price
                    price = None
                if book['publisher']:
                    # if book has publisher available
                    try:
                        # if publisher already in database
                        publisher = Publisher.objects.get(
                            id=book['publisher']['id'])
                    except Publisher.DoesNotExist:
                        # if publisher not in database, create it
                        publisher = Publisher.objects.create(
                            id=book['publisher']['id'],
                            name=book['publisher']['name'])
                else:
                    # if no publisher available set it to None
                    publisher = None
                # construct the book object
                the_book = Book(title=title, publisher=publisher, id=book_id,
                                isbn=isbn, issue_date=issue_date, price=price,
                                image=image, pdf=pdf, page_count=page_count,
                                edition=edition, count=count, lang=lang,
                                place=place, doe=doe, volume=volume,
                                isbn_clean=isbn_clean)
                the_book.save()
                try:
                    # subjects may not be present
                    for subject in book['subjects']:
                        # get or create subject objects
                        try:
                            subject = Subject.objects.get(id=subject['id'])
                        except Subject.DoesNotExist:
                            subject = Subject.objects.create(
                                id=subject['id'], name=subject['title'])
                        the_book.subjects.add(subject)
                except Exception:
                    pass
                try:
                    # creators may not be present
                    for creator in book['authors']:
                        # get or create, creators
                        try:
                            creator = Creator.objects.get(id=creator['id'])
                        except Creator.DoesNotExist:
                            creator = Creator.objects.create(
                                id=creator['id'], name=creator['name'],
                                type=creator['type'])
                        the_book.creators.add(creator)
                except Exception:
                    pass
                # save the book object into database
                the_book.save()
        except Exception:
            # if any uncaught exception occurs, re-raise with the book id
            # that caused it
            raise Exception('book id: ' + book['book_id'])
        counter += 1
    return Response({"status": "done"})
def scrape(name):
    """Scrape course rows and their required/recommended book lists.

    Parses the course table from page.txt, then for each course fetches its
    reading-list page and collects title/author/ISBN/price data per book.
    Results are appended as a JSON-ish list to text5.txt.

    NOTE(review): the *name* parameter is unused (page.txt is hard-coded)
    and is later shadowed by the course-name local — confirm intent.
    """
    #page = open(name)
    page = open('page.txt')
    soup = BeautifulSoup(page)
    table = soup.find('table')
    rows = table.findAll('tr')
    length = len(rows)
    #for each class
    finallist = []
    first = True
    previous = []
    p = 1329
    for i in range(1329, length):
        print p
        p += 1
        currentrow = {}
        thisrow = rows[i]
        columns = thisrow.findAll('td')
        #make list of course designations i.e. MAE 305
        classes = columns[1].find('u')
        classes = classes.findAll(text = quote)
        course_desig = []
        #print classes
        for c in classes:
            c = re.sub(' +', '', c)
            c = re.sub('\n', '', c)
            if (str(c) != '<br/>'):
                if str(c) != "":
                    course_desig.append(c)
        # skip duplicate rows for the same course designation
        if course_desig == previous:
            continue
        else:
            previous = course_desig
        currentrow['coursedesig'] = (course_desig)
        #course name
        name = columns[2].contents[0]
        name = name.encode('utf-8').strip()
        currentrow['coursename'] = (name)
        #get page url for reading lists
        pageurl = columns[11].find('a')['href']
        #print pageurl
        text = get_books_page(pageurl)
        #get the required books information for this class
        thiscoursesbooks = []
        required = text.find(id = 'requiredList')
        if required != None:
            required = required.findAll(class_='viewReading')
            for g in required:
                thisbook = {}
                title = g.find(text = "Title: ")
                if title != None:
                    title = title.parent.findNext('td')
                    if title != None:
                        title = title.text
                else:
                    # a book entry without a title is skipped entirely
                    continue
                thisbook['title'] = title
                author = g.find(text = "Author: ")
                if author != None:
                    author = author.parent.findNext('td')
                    if author != None:
                        author = author.text
                #print author
                thisbook['author'] = author
                isbn10 = g.find(text= "ISBN: ")
                if isbn10 != None:
                    isbn10 = isbn10.parent.findNext('td')
                    if isbn10 != None:
                        isbn10 = isbn10.text
                #print isbn10
                # only purely numeric ISBN-10s can be converted (this also
                # skips ISBNs ending in 'X')
                if isbn10.isdigit():
                    isbn13 = pyisbn.convert(isbn10)
                else:
                    continue
                thisbook['isbn10'] = isbn10
                thisbook['isbn13'] = isbn13
                labprice = get_labyrinth_price(g)
                thisbook['labprice'] = labprice
                thisbook['amazonprice']=(get_amazon_price(isbn13))
                thisbook['image'] = (get_amazon_image())
                thisbook['edition'] = (get_amazon_edition())
                thisbook['required'] = True
                thiscoursesbooks.append(thisbook)
        # same extraction again for the recommended reading list
        recommended = text.find(id = 'recommendedListContainer')
        if recommended != None:
            recommended = recommended.findAll(class_='viewReading')
            for g in recommended:
                thisbook = {}
                title = g.find(text = "Title: ")
                if title != None:
                    title = title.parent.findNext('td')
                    if title != None:
                        title = title.text
                else:
                    continue
                thisbook['title'] = title
                author = g.find(text = "Author: ")
                if author != None:
                    author = author.parent.findNext('td')
                    if author != None:
                        author = author.text
                thisbook['author'] = author
                isbn10 = g.find(text= "ISBN: ")
                if isbn10 != None:
                    isbn10 = isbn10.parent.findNext('td')
                    if isbn10 != None:
                        isbn10 = isbn10.text
                if isbn10.isdigit():
                    isbn13 = pyisbn.convert(isbn10)
                else:
                    continue
                thisbook['isbn10'] = isbn10
                thisbook['isbn13'] = isbn13
                labprice = get_labyrinth_price(g)
                thisbook['labprice'] = labprice
                thisbook['amazonprice']=(get_amazon_price(isbn13))
                thisbook['image'] = (get_amazon_image())
                thisbook['edition'] = (get_amazon_edition())
                thisbook['required'] = False
                thiscoursesbooks.append(thisbook)
        currentrow['booklist'] = (thiscoursesbooks)
        # open the output file lazily on the first emitted row
        if (first == True):
            f = open('text5.txt', 'r+')
            f.write( "[\n")
            first = False
        f.write(str(currentrow)+',\n')
        print currentrow
        finallist.append(currentrow)
    # close the JSON-style list after the loop
    f.write( "]")
def update_to_goodreads(entries, cookies, disk_cache, limit, wait):
    """Update book entries to Goodreads.

    :param entries: list of books
    :param cookies: login cookie for Goodreads
    :param disk_cache: cache of updated books ('' = success, 'e' = error)
    :param limit: stop after this many successful updates (None = unlimited)
    :param wait: unused here; kept for interface compatibility
    :return: tuple (success, error) of processed entries
    """
    session = requests.Session()
    success = []
    error = []
    for entry in entries:
        isbn13 = entry['isbn13']
        # BUG FIX: isbn10 used to be unbound on the first iteration (and
        # stale from the previous entry on later ones) whenever the
        # conversion failed; the unused `isbns` list was also dropped.
        isbn10 = None
        try:
            isbn10 = pyisbn.convert(isbn13)
        except Exception:
            pass  # keep isbn10 = None when the ISBN-13 cannot be converted
        resp = check_exists(session, (isbn10, isbn13), cookies)
        if not resp:
            logging.warning('{} couldn\'t be found'.format(repr_book(entry)))
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
            random_wait(2)
            continue
        url = get_edit_url(resp)
        if not url:
            logging.warning('{}\' url is not found'.format(repr_book(entry)))
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
            random_wait(2)
            continue
        submit_url, form_data = get_form_data(session, cookies, url)
        if not form_data:
            logging.warning('{}\' form data is not found'.format(repr_book(
                entry)))
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
            random_wait(2)
            continue
        # Do not cause any updates
        form_data['review[cog_explicit]'] = '0'
        for key in ('add_to_blog', 'add_update'):
            if key in form_data:
                form_data[key] = '0'
        # sanity check
        if len([key for key in form_data
                if 'readingSessionDatePicker' in key]) != 10:
            logging.warning('{}\' date is problematic'.format(repr_book(
                entry)))
            logging.warning(form_data)
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
            continue
        if update_book(entry, form_data, submit_url, session, cookies):
            success.append(entry)
            disk_cache[entry['isbn13']] = ''
        else:
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
        if limit is not None and len(success) >= limit:
            break
        random_wait()
    return success, error
def test_convert_invalid():
    # An ISBN-13 whose Bookland prefix is not 978 (here all zeros) cannot
    # be converted to ISBN-10 and must raise with the documented message.
    with expect.raises(IsbnError,
                       'Only ISBN-13s with 978 Bookland code can be converted '
                       'to ISBN-10.'):
        convert('0000000000000')
def load(self, directory, index="crossref", doc_type="crossref",
         bulk_size=100000):
    """
    Load data for a file into Es-Index

    :param directory: The path for the directory with the data
    :param index: Name of the index
    :param doc_type: Name of the doc_type
    :param bulk_size: The bulksize for committing the data into the es index
    :return:
    """
    cache = list()
    counter = 0
    for root, dir_names, file_names in os.walk(directory):
        for filename in file_names:
            if not filename.endswith('json.xz'):
                continue
            print("OPEN FILE")
            with lzma.open(os.path.join(root, filename), 'rt',
                           encoding='utf-8') as f:
                for line in f:
                    json_object = json.loads(line)
                    doc_id = json_object['DOI']
                    # convert isbn numbers (10 -> 13 digits)
                    # NOTE(review): entries that are already 13 digits are
                    # dropped from the list here (matching the original
                    # behaviour) — confirm whether they should be kept.
                    if 'ISBN' in json_object.keys():
                        isbn_list = list()
                        for isbn in json_object['ISBN']:
                            if len(isbn.replace('-', '')) == 10:
                                isbn_list.append(pyisbn.convert(isbn))
                        json_object['ISBN'] = isbn_list
                    json_object = self.remove_affiliation(
                        json_object, 'author')
                    json_object = self.remove_affiliation(
                        json_object, 'editor')
                    json_object = self.remove_unused_fields(json_object)
                    data = dict()
                    data['_op_type'] = 'index'
                    data['_index'] = index
                    data['_type'] = doc_type
                    data['_id'] = doc_id
                    data['_source'] = json_object
                    cache.append(data)
                    counter += 1
                    if counter >= bulk_size:
                        self.batch(cache)
                        cache = []
                        counter = 0
    # BUG FIX: flush the final partial batch; the original silently dropped
    # up to bulk_size - 1 trailing documents.
    if cache:
        self.batch(cache)
    # write down what keys were deleted
    with open('logging/deleted_keys.txt', 'w', encoding='utf-8') as file:
        for key in self.deleted_key:
            file.write(key + '\n')