示例#1
0
 def __init__(self, novel_url, category):
     """Initialise the per-novel state of this instance.

     ``novel_url`` is forwarded to the parent initialiser, ``category`` is
     stored as given, and fresh ``Novel()`` / ``Author()`` instances are
     attached. ``finished_flag`` starts as False and ``chapter_list_iter``
     as None.
     """
     super().__init__(novel_url)
     self.category = category
     self.finished_flag = False
     # Novel() is constructed before Author(), as in the original order.
     self.novel, self.author = Novel(), Author()
     self.chapter_list_iter = None
示例#2
0
 def updateAuthors(self, authors):
     """Synchronise ``self.authors`` with the given author entries.

     Args:
         authors: Iterable of indexable entries where ``entry[0]`` is the
             author name and ``entry[1]`` the primary flag. Entries whose
             name is None are ignored, and repeated entries count once.

     Existing ``Author`` objects whose ``(name, primary)`` pair is still
     requested are kept as they are; those no longer requested are dropped;
     requested pairs not yet present are appended as new ``Author`` objects
     in the order they appear in ``authors``. Nothing is touched when the
     requested pairs already match the current ones.
     """
     current = {(a.name, a.primary) for a in self.authors}

     # Materialise the wanted pairs once. A lazy ``filter`` object can be
     # consumed only a single time and never compares equal to a list.
     wanted = []
     seen = set()
     for entry in authors:
         key = (entry[0], entry[1])
         if key[0] is None or key in seen:
             continue
         seen.add(key)
         wanted.append(key)

     if seen == current:
         return

     # Compare full (name, primary) pairs; a bare name never equals a tuple.
     self.authors = [a for a in self.authors if (a.name, a.primary) in seen]
     for name, primary in wanted:
         if (name, primary) not in current:
             self.authors.append(Author(name=name, primary=primary))
 def __init__(self, _id, content=None, **kwargs):
     """Prepare the page object for the author identified by ``_id``.

     ``_id`` is stored as a string and used to build an ``Author``. When
     that author already has a ``full_name`` nothing else happens. Otherwise
     the instance is marked valid and, if the author's ``insert_flag`` is
     set, the page markup is taken from ``content`` or downloaded from
     ``host + _id`` and parsed into ``self.selector``. An empty download is
     logged and marks the instance invalid. ``**kwargs`` is accepted but
     not used.
     """
     self._id = str(_id)
     self.author = Author(_id=self._id)

     # Guard: an author that already carries a name is left untouched.
     if self.author.full_name:
         return
     self.valid = True

     if not self.author.insert_flag:
         return

     if content is not None:
         self.content = content
     else:
         self.content = Downloader(host + self._id)()
         if not self.content:
             self.logger.error("当前网页为空,无法进行解析\t_id:" + self._id)
             self.valid = False
             return
         # ``valid`` is already True here, so no further assignment is needed.
     self.selector = etree.HTML(self.content)
async def get_book_with_isbn(isbn: str):
    """Return a fixed sample ``Book``.

    The ``isbn`` argument is accepted but not consulted: the returned book
    always carries the same hard-coded fields and the same sample ``Author``.
    """
    sample_author = Author(name="name1", book=["book1", "book2"])
    return Book(
        name="mr kishan",
        isbn="124",
        author=sample_author,
        year=1292,
    )
示例#5
0
 def addAuthor(self, authors):
     """Append an ``Author`` to ``self.authors`` for every named entry.

     Each entry is indexed as ``entry[0]`` (name) and ``entry[1]`` (primary
     flag). Entries whose name is None are reported with ``print`` and
     skipped.
     """
     for entry in authors:
         author_name = entry[0]
         if author_name is not None:
             self.authors.append(Author(name=author_name, primary=entry[1]))
         else:
             print('No author name! for {}'.format(self.uuid))
def add_author(name, biography):
    """Run ``validate_params`` on the inputs, then insert a new ``Author``.

    The ``Author`` is built positionally from ``name`` and ``biography`` and
    handed to ``insert``. Nothing is returned.
    """
    validate_params(name, biography)
    insert(Author(name, biography))
def existing_author1():
    """Return an ``Author`` for Herman M Melville with ``author_id=1``."""
    return Author(
        author_id=1,
        first_name='Herman',
        last_name='Melville',
        middle_name='M',
    )
def new_author2():
    """Return an ``Author`` for Ursela K LeGuin without an ``author_id``."""
    return Author(
        first_name='Ursela',
        last_name='LeGuin',
        middle_name='K',
    )
def new_author1():
    """Return an ``Author`` for Herman M Melville without an ``author_id``."""
    return Author(
        first_name='Herman',
        last_name='Melville',
        middle_name='M',
    )
def existing_author2():
    """Return an ``Author`` for Ursela K LeGuin with ``author_id=2``."""
    return Author(
        author_id=2,
        first_name='Ursela',
        last_name='LeGuin',
        middle_name='K',
    )
class AuthorPage(object):
    """Parse the pages of one author and store what was extracted.

    Construction obtains the author's main page (passed in or downloaded);
    ``run`` then parses the main, collaborators and publications pages, saves
    the ``Author`` and registers its publication ids.
    """

    # Whether the page data is usable; invalid pages are neither parsed nor stored.
    valid = False
    logger = get_logger(__name__, __name__)

    def __init__(self, _id, content=None, **kwargs):
        """Prepare the page object for the author identified by ``_id``.

        Args:
            _id: Author identifier. Stored as a string and appended to
                ``host`` to form the download address.
            content: Markup of the main page if the caller already has it;
                when None the page is downloaded.
            **kwargs: Accepted but not used by this initialiser.
        """
        self._id = str(_id)
        self.author = Author(_id=self._id)
        if self.author.full_name:
            # The author already has a name: ``valid`` keeps its class-level
            # default of False, so ``run`` will skip this author.
            return
        self.valid = True

        # NOTE(review): when ``insert_flag`` is falsy no selector is created
        # although ``valid`` stays True. Confirm ``Author`` never starts in
        # that state; ``main_page`` would fail on the missing ``self.selector``.
        if not self.author.insert_flag:
            return

        if content is None:
            self.content = Downloader(host + self._id)()
            if not self.content:
                self.logger.error("当前网页为空,无法进行解析\t_id:" + self._id)
                self.valid = False
                return
        else:
            self.content = content
        self.selector = etree.HTML(self.content)

    def run(self):
        """Parse all pages of the author and persist the result.

        Does nothing beyond logging when the instance is not valid. Stops
        after ``main_page`` when that step cleared ``author.insert_flag``.
        """
        if not self.valid:
            self.logger.info("该author已存在\t_id:" + self._id)
            return
        self.logger.info("开始解析author:" + self._id)
        self.main_page()
        if not self.author.insert_flag:
            self.logger.info("无效网页,已剔除:" + self._id)
            return
        self.get_partners()
        self.get_papers()
        self.author.save()
        self.save_publications()
        self.logger.info("完成author:" + self._id)

    def main_page(self):
        """Fill name, publications text and affiliations from the main page.

        The name is the page title with the "AAN: " prefix removed. When the
        title contains "ValueError", the author is flagged as not insertable,
        the instance is marked invalid and nothing else is read.
        """
        self.author.full_name = deep_select(self.selector, 0, xpath="//head/title/text()").replace("AAN: ", "")
        if "ValueError" in self.author.full_name:
            self.author.insert_flag = False
            self.valid = False
            return
        self.author.publications = deep_select(self.selector, 0, xpath="//table/tbody/tr[1]/td/text()")
        self.author.affiliations = deep_select(self.selector, return_type="list",
                                               xpath="//table/tbody/tr[5]/td/ul/li/text()")

    def get_partners(self):
        """Download the collaborators page and record every collaborator.

        Replaces ``self.selector`` with the collaborators page. For each
        table row one dict with the keys ``author``, ``num`` and
        ``papers_id`` is appended to ``author.collaborators``. Skipped when
        the instance is not valid.
        """
        if not self.valid:
            return
        self.selector = etree.HTML(Downloader('http://aan.how/browse/author/collaborators/' + self._id)())
        names = deep_select(self.selector, return_type="list",
                            xpath="//tr[@class='gradeA']/td[1]/a/text()")
        self.author.partners_full_name = names
        # Number of co-authored papers, one value per collaborator row.
        counts = deep_select(self.selector, return_type="list",
                             xpath="//tr[@class='gradeA']/td[2]/text()")
        # XPath positions are 1-based, hence ``start=1``. Pairing with zip
        # stops at the shorter list instead of raising IndexError when the
        # two columns differ in length.
        for row, (partner, count) in enumerate(zip(names, counts), start=1):
            papers_id = deep_select(self.selector, return_type="list",
                                    xpath="//tr[@class='gradeA'][" + str(row) + "]/td[3]/a/text()")
            self.author.collaborators.append({"author": partner, "num": count, "papers_id": papers_id})

        partners_id = deep_select(self.selector, return_type="list",
                                  xpath="//tr[@class='gradeA']/td[1]/a/@href")
        self.author.partners_id = [to_num(x) for x in partners_id]

    def get_papers(self):
        """Download the publications page and record the author's paper ids.

        Replaces ``self.selector`` with the publications page and overwrites
        ``author.publications`` (set to page text by ``main_page``) with the
        list of ids obtained by applying ``to_num`` to each link. Skipped
        when the instance is not valid.
        """
        if not self.valid:
            return
        self.selector = etree.HTML(Downloader('http://aan.how/browse/author/publications/' + self._id)())
        papers_url_id = deep_select(self.selector, return_type="list",
                                    xpath="///tr[@class='gradeA']/td[2]/a/@href")
        self.author.papers_count = len(papers_url_id)
        self.author.publications = [to_num(x) for x in papers_url_id]
        # Crawling the paper pages with multiple threads is currently disabled:
        # paper_thread = PaperPageThread(self.author.publications, 10)
        # paper_thread.start()

    def save_many(self, papers: list):
        """Store several paper objects at once through ``col_paper``.

        Args:
            papers: Objects exposing ``_id``; each one's ``__dict__`` is
                inserted as a document.

        Any exception raised by the insert is logged together with the
        affected ids rather than propagated.
        """
        if not papers:
            # Nothing to insert.
            return
        # Collect the ids from the items; the list itself has no ``_id``.
        ids = [x._id for x in papers]
        try:
            col_paper.insert_many([x.__dict__ for x in papers])
        except Exception as e:
            self.logger.error("ids:%s\t%s" % (ids, e))

    def save_publications(self):
        """Mark every publication id of the author as unused in ``col_page``.

        Issues one ``update_one`` per id, setting ``used`` to False.
        """
        for x in self.author.publications:
            # Third positional argument is presumably the upsert flag; confirm
            # against the collection API in use.
            col_page.update_one({"_id": x}, {'$set': {"used": False}}, True)