def get_book(start, end, cat_pk, task_target):
    """Feed books of one category slice to *task_target*.

    target is 'get_task'

    For each book in category *cat_pk* (slice [start:end] ordered by pk)
    whose directory is absent under BOOK_PATH, send a
    (name, author, item-basenames) tuple to the consumer.
    """
    category_books = Book.objects.filter(
        category__id=int(cat_pk)
    ).order_by('pk')[start:end]
    for book in category_books:
        book_dir = os.path.join(BOOK_PATH, book.name).encode('utf-8')
        if os.path.exists(book_dir):
            continue
        book_id = book.pk
        item_model = get_bookitem_model(book_id)
        contents = item_model.objects.values_list(
            'content', flat=True
        ).filter(book__id=book_id).order_by('pk')
        # Keep only the trailing path component of each stored content path.
        raw_names = set(c.rsplit('/', 1)[-1] for c in contents)
        if raw_names:
            task_target.send((book.name, book.author, raw_names))
def get_book():
    """Yield (name, author, reserved) for every book directory under
    BOOK_PATH that is missing items recorded in the database.

    ``reserved`` is the set of item basenames present in the DB but not
    found as files inside BOOK_PATH/<name>.
    """
    for name in os.listdir(BOOK_PATH):
        path = os.path.join(BOOK_PATH, name)
        crawled = set(os.listdir(path))
        book = Book.objects.get(name=name)
        pk = book.pk
        cls = get_bookitem_model(pk)
        items = cls.objects.values_list(
            'content', flat=True
        ).filter(book__id=pk).order_by('pk')
        # Compare only the trailing path component (the item basename).
        raws = set(item.rsplit('/', 1)[-1] for item in items)
        reserved = raws.difference(crawled)
        if reserved:
            # Dead code removed: the original also computed
            # md5(name+author).hexdigest() here but never used the result.
            yield (book.name, book.author, reserved)
def load_full_detail(data_path):
    """Load one book's detail JSON from *data_path* and persist it.

    Looks the Book up by the 'name' field of the data, stores its intro,
    and dispatches the 'content' mapping to the multi-part or flat-part
    processor depending on how many keys it has.
    """
    item = get_data(data_path)
    book = Book.objects.get(name=item.get('name', ''))
    item_model = get_bookitem_model(book.pk)
    book.intro = item.get('intro', '')
    content = item.get('content', '')
    # More than one key in the content mapping means the book is split
    # into named parts; a single key means a flat chapter list.
    multi_part = len(content.keys()) > 1
    if multi_part:
        process_has_part(content, item_model, book)
    else:
        process_null_part(content, item_model, book)
    book.has_part = multi_part
    book.save()
def print_miss():
    """Report books that have no items stored in the database.

    Books whose detail data file exists on disk are appended to a
    per-partition file '<pk%%10>.part'; books with no data file at all
    are appended to 'lost.txt'.
    """
    book = Book.objects.values('id', 'name')
    for b in book:
        pk = b['id']
        # NOTE(review): partition count is hard-coded to 10 here, while
        # get_book_chapters uses settings.BOOKITEM_PARTITION — confirm
        # these are meant to agree.
        part = pk % 10
        filename = '%s.part' % part
        cls = get_bookitem_model(pk)
        count = cls.objects.filter(book__id=pk).count()
        if count == 0:
            name = b['name']
            path = DATA_PATTERN % name
            if os.path.exists(path):
                with open(filename, 'a') as f:
                    # NOTE(review): decode() here vs encode() in the else
                    # branch — presumably `name` is a UTF-8 byte string
                    # (Python 2); the asymmetry looks accidental, verify.
                    f.write('%s\n'% name.decode('utf-8'))
            else:
                with open('lost.txt', 'a') as f1:
                    f1.write('%s\n'% name.encode('utf-8'))
def get_book(start, end, filename, task_target):
    """ target is 'get_task' """
    # Read book names from lines [start, end] of *filename* (1-based,
    # inclusive) and send (name, author, missing-items) for each book
    # whose directory lacks items recorded in the database.
    for line_no in range(start, end + 1):
        name = linecache.getline(filename, line_no).strip()
        path = os.path.join(BOOK_PATH, name)
        crawled = set(os.listdir(path))
        book = Book.objects.get(name=name)
        item_model = get_bookitem_model(book.pk)
        contents = item_model.objects.values_list(
            'content', flat=True
        ).filter(book__id=book.pk).order_by('pk')
        raw_names = set(c.rsplit('/', 1)[-1] for c in contents)
        missing = raw_names - crawled
        if missing:
            task_target.send((book.name, book.author, missing))
def load_signle_detail(pk, sleep):
    """Load the detail JSON for one book (by *pk*) and persist it,
    passing *sleep* through to the part processors.
    """
    data_dir = "/var/www/wwwroot/ireader/pybook/detail/"
    book = Book.objects.get(pk=pk)
    item_model = get_bookitem_model(pk)
    data_path = os.path.join(data_dir, '%s.json' % book.name.encode('utf-8'))
    item = get_data(data_path)
    book.intro = item.get('intro', '')
    content = item.get('content', '')
    # Multiple keys in the content mapping -> book has named parts.
    multi_part = len(content.keys()) > 1
    if multi_part:
        process_has_part(content, item_model, book, sleep)
    else:
        process_null_part(content, item_model, book, sleep)
    book.has_part = multi_part
    book.save()
def print_lost():
    """Append a TSV line (name, sub, url) to 'lost_url.log' for every DB
    item whose content file is missing from both mirror directories.

    The source URL is rebuilt from ``adict`` (md5(name+author) -> URL
    prefix) joined with '<item pk>.html'.
    """
    dir1 = '/mnt/data1/book'
    dir2 = '/data'
    book = Book.objects.all()
    # Context manager so the log file is closed even if a lookup raises;
    # the original opened the file without `with` and leaked the handle
    # on error.
    with open('lost_url.log', 'a') as f:
        for b in book:
            cls = get_bookitem_model(b.pk)
            s = '%s%s' % (b.name.encode('utf-8'), b.author.encode('utf-8'))
            key = md5(s).hexdigest()
            prefix = adict.get(key)
            items = cls.objects.filter(book__id=b.pk)
            for item in items:
                content = item.content
                path1 = os.path.join(dir1, content)
                path2 = os.path.join(dir2, content)
                if not os.path.exists(path1) and not os.path.exists(path2):
                    # content layout is '<sub>/<name>/<pk>' per the split.
                    sub, name, pk = content.split('/')
                    suffix = '%s.html' % pk
                    url = urlparse.urljoin(prefix, suffix)
                    f.write('%s\t%s\t%s\n' % (
                        name.encode('utf-8'), sub.encode('utf-8'),
                        url.encode("utf-8")))
def get_book(pk):
    """Append to 'new<pk>.txt' the name of every book directory under
    BOOK_PATH that is missing items recorded in the database.

    *pk* only selects the output file name. The scan is best-effort:
    a failure on one book (missing dir, missing DB row) skips that book
    and continues with the next.
    """
    out_name = 'new%s.txt' % pk
    for name in os.listdir(BOOK_PATH):
        try:
            path = os.path.join(BOOK_PATH, name)
            crawled = set(os.listdir(path))
            book = Book.objects.get(name=name)
            book_id = book.pk
            cls = get_bookitem_model(book_id)
            items = cls.objects.values_list(
                'content', flat=True
            ).filter(book__id=book_id).order_by('pk')
            raws = set(item.rsplit('/', 1)[-1] for item in items)
            reserved = raws.difference(crawled)
            if reserved:
                with open(out_name, 'a') as f:
                    f.write('%s%s' % (name, os.linesep))
        except Exception:
            # Was a bare `except: pass`, which also swallowed
            # KeyboardInterrupt/SystemExit; keep the best-effort skip but
            # only for ordinary exceptions.
            continue
def get_book_chapters(pk):
    """Return (book, object_list, partition, recom_list) for book *pk*.

    ``object_list`` is a list of {part_name: chapter_values} mappings;
    books without parts get a single pseudo-part keyed 'A'.
    """
    values = ('id', 'name')
    book = get_single_book(pk)
    partition = pk % settings.BOOKITEM_PARTITION
    itemcls = get_bookitem_model(pk)
    object_list = []
    if book.get('has_part', False):
        book_parts = BookPart.objects.filter(book__id=pk).order_by('pk')
        for part in book_parts:
            chapters = itemcls.objects.values(*values).filter(
                part__id=part.pk
            ).order_by('pk')
            object_list.append({part.name: chapters})
    else:
        chapters = itemcls.objects.values(*values).filter(
            book__id=pk
        ).order_by('pk')
        object_list.append({'A': chapters})
    recom_list = get_recommand_books(pk)
    return book, object_list, partition, recom_list