Example #1
from util.Log import get_logger
from util.dao import col_paper


class Paper:
    logger = get_logger(TAG=__name__, file_name=__name__)

    def __init__(self):
        # ACL ID
        self._id = ""
        self.url_id = ""
        self.title = ""
        self.authors = []
        self.authors_full_name = []
        self.venue = ""
        self.year = 0
        self.abstract = ""
        # Papers this paper cites --> paper._id
        self.out_citations = []
        # Papers that cite this paper --> paper._id
        self.in_citations = []
        # Each citation entry is a dict with: paper_id (--> paper._id),
        # line (the line number), sentence (the citing sentence)
        self.citing_sentences = []
        self.session = ""

    def save(self):
        # Write to MongoDB; inserts a single document
        try:
            col_paper.insert_one(self.__dict__)
            self.logger.info("写入\tid:" + self._id)
        except Exception as e:
            self.logger.error("id:%s\turl_id:%s\t%s" %
                              (self._id, self.url_id, e))
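
A minimal usage sketch for the Paper class above; every field value here is a made-up placeholder, and col_paper must already be configured in util.dao.

paper = Paper()
paper._id = "P08-1001"            # placeholder ACL-style id
paper.title = "An Example Title"  # placeholder
paper.authors = ["Jane Doe"]
paper.year = 2008
paper.save()                      # insert_one(paper.__dict__) into col_paper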
Example #2
    def __init__(self, length, thread_num=6):
        self.thread_num = thread_num
        self.length = length
        self.loggers = [
            get_logger(__name__ + str(x), __name__ + str(x))
            for x in range(1, thread_num + 1)
        ]
        logger.info("tasks: %s\tthreads: %s" % (str(length), str(thread_num)))
Example #3
    def __init__(self, things, thread_num=6):
        """

        :param things: 作者页的链接list
        :param thread_num:
        """
        self.thread_num = thread_num
        self.things = things
        self.loggers = [
            get_logger(__name__ + str(x), __name__ + str(x))
            for x in range(1, thread_num + 1)
        ]
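
A sketch of how the per-thread logger list from Examples #2 and #3 might be consumed by a worker; the crawl method and its index arithmetic are assumptions, not part of the original code.

    def crawl(self, thread_index, start, end):
        # Hypothetical worker: the loggers were built for x in 1..thread_num
        log = self.loggers[thread_index - 1]
        for url in self.things[start:end]:
            log.info("fetching %s" % url)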
Example #4
import time

import requests
from fake_useragent import UserAgent  # UserAgent().random suggests the fake-useragent package


class Downloader:
    logger = get_logger(__name__)

    def __init__(self, url, delay=5, user_agent=r"Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)",
                 num_retries=0):
        """

        :param delay:
        :param user_agent:默认使用随机ua
        :param prxies:
        :param num_retries:
        :param cache:
        """
        self.throttle = Throttle(delay)  # Throttle is defined elsewhere in this repo
        self.user_agent = UserAgent()
        self.num_retries = num_retries
        self.url = url

    def __call__(self):
        return self.dow(self.url)

    def dow(self, url):
        result = self.download(url=url, headers={"User-Agent": self.user_agent.random}, num_retries=self.num_retries)
        return result

    def download(self, url, headers, num_retries):
        self.logger.info("Downloading: " + url)
        try:
            req = requests.get(url, headers=headers)
            html = req.text
            if 500 <= req.status_code < 600:
                # On a server error, ignore any cache and retry once after a pause
                time.sleep(2)
                req = requests.get(url, headers=headers)
                html = req.text
                if 500 <= req.status_code < 600:
                    html = ""
        except requests.exceptions.RequestException as e:
            self.logger.error(e)
            html = None
            if num_retries > 0:
                if e.response is not None and 500 <= e.response.status_code < 600:
                    html = self.download(url=url, headers=headers, num_retries=num_retries - 1)
        # if html is None:
        #     self.throttle.wait(url)
        #     html = self.download(url=url, headers=headers, num_retries=num_retries)
        return html
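
A usage sketch for Downloader; the URL is a placeholder. The instance is callable, so one call performs the download with the configured retries.

downloader = Downloader("http://aan.how/browse/paper/1", num_retries=2)
html = downloader()   # page text; "" after repeated 5xx, None on a request error
if html:
    print(html[:100])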
Example #5
   Author :        Carl
   Author_email:   [email protected]
   File Name :     meta_data.py
   Description :
-------------------------------------------------
#  If this run wrong, don't ask me , I don't know why;
#  If this run right, thank god, and I don't know why.
#  Maybe the answer, my friend, is blowing in the wind.
-------------------------------------------------
"""
__author__ = 'Carl'
from util.dao import col_paper, col_author
from util.Log import get_logger
import re

logger = get_logger(TAG="paper", file_name="paper")


class MetaData:
    def __init__(self):
        self.paper_id = None
        self.title = None
        self.authors = None
        self.venue = None
        self.year = None


def write_data():
    with open("../data_set/acl-metadata.txt", "r", encoding='UTF-8') as f:
        while True:
            try:
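
The snippet breaks off inside the read loop. As a minimal sketch, assuming acl-metadata.txt stores records as key = {value} lines separated by blank lines (the regex and field names below are assumptions), one record could be parsed like this:

def parse_record(block):
    # Hypothetical helper: map "key = {value}" lines onto a MetaData object
    fields = dict(re.findall(r"(\w+)\s*=\s*\{(.*?)\}", block))
    meta = MetaData()
    meta.paper_id = fields.get("id")
    meta.title = fields.get("title")
    meta.authors = fields.get("author")
    meta.venue = fields.get("venue")
    meta.year = fields.get("year")
    return meta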
Example #6
# -*- coding:utf8 -*-
import threading
from time import ctime, time
from spider.paper_page2 import PaperPage
from util.Log import get_logger
from logging import Logger

logger = get_logger("mutil_thread_paper")
lock = threading.Lock()


class PageThread(object):
    def __init__(self, length, thread_num=6):
        self.thread_num = thread_num
        self.length = length
        self.loggers = [
            get_logger(__name__ + str(x), __name__ + str(x))
            for x in range(1, thread_num + 1)
        ]
        logger.info("任务数:%s\t线程数:%s" % (str(length), str(thread_num)))

    def get_range(self):
        # Split [0, length) evenly across the threads
        ranges = []
        length = self.length
        offset = int(length) // self.thread_num
        for i in range(self.thread_num):
            if i == (self.thread_num - 1):
                # The last thread absorbs the remainder
                ranges.append((i * offset, length))
            else:
                ranges.append((i * offset, (i + 1) * offset))
        return ranges
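
For example, PageThread(10, thread_num=3).get_range() returns [(0, 3), (3, 6), (6, 10)]; the last thread absorbs the remainder:

pt = PageThread(10, thread_num=3)
print(pt.get_range())   # [(0, 3), (3, 6), (6, 10)]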
Example #7
from util.Log import get_logger
from util.dao import col_author


class Author:
    logger = get_logger(TAG=__name__, file_name=__name__)

    def __init__(self, _id=None, full_name=None):
        """

        :param _id: id为字符串类型
        :param full_name:
        """
        self._id = str(_id)
        self.full_name = full_name
        self.papers_count = 0
        # For convenience in later computation, partners include the author themself
        self.partners_full_name = []
        self.partners_id = []
        self.affiliations = []
        self.collaborators = []
        # Papers published by the author --> paper.url_id
        self.publications = []
        self.insert_flag = True
        self.search()

    def search(self):
        """
        根据ID查询数据库中的信息
        :return:
        """
        try:
            au = col_author.find_one({"_id": self._id})
            if au is not None:
                if not self.full_name:
                    self.full_name = au["full_name"]
                self.partners_full_name = au['partners_full_name']
                self.partners_id = au['partners_id']
                self.papers_count = au['papers_count']
                self.affiliations = au['affiliations']
                self.collaborators = au['collaborators']
                # Papers published by the author --> paper.url_id
                self.publications = au['publications']
                self.insert_flag = False
        except Exception as e:
            self.logger.error("id:%s\t%s" % (self._id, e))

    def add_partner_full_name(self, authors_name: list):
        rs = set(self.partners_full_name)
        rs = rs.union(authors_name)
        self.partners_full_name = list(rs)

    def add_partner_id(self, _id: list):
        rs = set(self.partners_id)
        rs = rs.union(_id)
        self.partners_id = list(rs)

    def save(self):
        # Write to MongoDB; insert if new, otherwise replace the existing document
        try:
            if self.insert_flag:
                col_author.insert_one(self.__dict__)
            else:
                col_author.replace_one({"_id": self._id}, self.__dict__)
            self.logger.info("写入\tid:" + self._id)
        except Exception as e:
            self.logger.error("id:%s\t%s" % (self._id, e))
Example #8
class AuthorPage(object):
    valid = False  # Whether the page data is valid; invalid pages are neither parsed nor stored
    logger = get_logger(__name__, __name__)

    def __init__(self, _id, content=None, **kwargs):
        self._id = str(_id)
        self.author = Author(_id=self._id)
        if not self.author.full_name:
            # No full_name in the DB means this author has not been crawled yet
            self.valid = True
        else:
            return
        if self.author.insert_flag:
            if content is None:
                self.content = Downloader(host + self._id)()
                if self.content:
                    self.valid = True
                else:
                    self.logger.error("当前网页为空,无法进行解析\t_id:" + self._id)
                    self.valid = False
                    return
            else:
                self.valid = True
                self.content = content
            self.selector = etree.HTML(self.content)

    def run(self):
        if not self.valid:
            self.logger.info("该author已存在\t_id:" + self._id)
            return
        self.logger.info("开始解析author:" + self._id)
        self.main_page()
        if not self.author.insert_flag:
            self.logger.info("无效网页,已剔除:" + self._id)
            return
        self.get_partners()
        self.get_papers()
        self.author.save()
        self.save_publications()
        self.logger.info("完成author:" + self._id)

    def main_page(self):
        self.author.full_name = deep_select(self.selector, 0, xpath="//head/title/text()").replace("AAN: ", "")
        if "ValueError" in self.author.full_name:
            self.author.insert_flag = False
            self.valid = False
            return
        self.author.publications = deep_select(self.selector, 0, xpath="//table/tbody/tr[1]/td/text()")
        self.author.affiliations = deep_select(self.selector, return_type="list",
                                               xpath="//table/tbody/tr[5]/td/ul/li/text()")

    def get_partners(self):
        if not self.valid:
            return
        self.selector = etree.HTML(Downloader('http://aan.how/browse/author/collaborators/' + self._id)())
        name = deep_select(self.selector, return_type="list",
                           xpath="//tr[@class='gradeA']/td[1]/a/text()")
        self.author.partners_full_name = name
        # Number of co-authored papers per partner
        num = deep_select(self.selector, return_type="list",
                          xpath="//tr[@class='gradeA']/td[2]/text()")
        for x in range(len(name)):
            papers_id = deep_select(self.selector, return_type="list",
                                    xpath="//tr[@class='gradeA'][" + str(x + 1) + "]/td[3]/a/text()")
            self.author.collaborators.append({"author": name[x], "num": num[x], "papers_id": papers_id})

        partners_id = deep_select(self.selector, return_type="list",
                                  xpath="//tr[@class='gradeA']/td[1]/a/@href")
        self.author.partners_id = [to_num(x) for x in partners_id]

    def get_papers(self):
        if not self.valid:
            return
        self.selector = etree.HTML(Downloader('http://aan.how/browse/author/publications/' + self._id)())
        papers_url_id = deep_select(self.selector, return_type="list",
                                    xpath="///tr[@class='gradeA']/td[2]/a/@href")
        self.author.papers_count = len(papers_url_id)
        self.author.publications = [to_num(x) for x in papers_url_id]
        # Crawl the paper pages with multiple threads
        # paper_thread = PaperPageThread(self.author.publications, 10)
        # paper_thread.start()

    def save_many(self, papers: list):
        # Insert multiple paper documents at once
        ids = [x._id for x in papers]
        try:
            col_paper.insert_many([x.__dict__ for x in papers])
        except Exception as e:
            self.logger.error("ids:%s\t%s" % (ids, e))

    def save_publications(self):
        for x in self.author.publications:
            col_page.update_one({"_id": x}, {'$set': {"used": False}}, True)
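
A sketch of driving the parser above end-to-end; the id is a placeholder.

page = AuthorPage(_id=42)   # downloads host + "42" unless content is passed in
page.run()                  # no-op when the author already exists or the page is empty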
Example #9
   Description :
-------------------------------------------------
#  If this run wrong, don't ask me , I don't know why;
#  If this run right, thank god, and I don't know why.
#  Maybe the answer, my friend, is blowing in the wind.                 
-------------------------------------------------
"""
__author__ = 'Carl'
from util.downloader import Downloader
from util.Log import get_logger
from model.paper import Paper
from util.stringUtil import *
from util.dao import col_paper
from lxml import etree

logger = get_logger("paper_page", "paper_page")
host = "http://aan.how/browse/paper/"


class PaperPage(object):
    valid = False  # Whether the page data is valid; invalid pages are neither parsed nor stored

    def __init__(self, _id, content=None, **kwargs):
        self._id = str(_id)
        self.paper = Paper()
        page_data = col_paper.find_one({"url_id": self._id})
        if page_data:
            # Already in the database; return without re-parsing
            return
        if content is None:
            self.content = Downloader(host + self._id)()