    def extract_paper_from_IEEE(self, req, filename):
        """
        Access an already-opened IEEE page and locate the link to the PDF.
        Attention: WORKS ONLY IN THE TU DELFT NETWORK or VPN.
        :param req: open urllib response for the paper page (resolved from its DOI)
        :param filename: local file name for the downloaded PDF
        :return: the result of tools.downloadFile (falsy if an existing local copy was kept)
        """
        # the request to the URL must carry a User-Agent header to avoid
        # HTTP Error 403 Forbidden: the site will lock you out as a robot!

        webpage = req.read()
        # parse the html code
        soup = BeautifulSoup(webpage, 'html.parser')
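        # IEEE abstract pages embed the paper metadata as a JSON blob assigned to
        # "global.document.metadata" inside a <script> tag; the regex below pulls
        # that blob out of the page text so json.loads() can parse it. The
        # "iel" -> "ielx" swap further down rewrites pdfPath to the path that
        # serves the raw PDF (an assumption based on the page layout this
        # scraper targets).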
        menus = json.loads(
            re.search(r"global.document.metadata\s*=\s*(.*);",
                      soup.getText()).group(1))
        pdfpath = str(menus['pdfPath']).replace("iel", "ielx")
        pdf_link = "http://ieeexplore.ieee.org" + pdfpath

        print("Accessing " + pdf_link)
        return tools.downloadFile(url=pdf_link,
                                  folder=cfg.folder_pdf,
                                  overwrite=False,
                                  localfilename=filename,
                                  printOutput=False)
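    # Minimal usage sketch (assumes the caller has already resolved the DOI and
    # opened the page; the same calling convention holds for the SPRINGER, ACM
    # and ICWSM extractors below, while extract_paper_from_AAAI takes the URL
    # string itself):
    #   req = urlopen(Request(paper_url, headers={'User-Agent': 'Mozilla/5.0'}))
    #   self.extract_paper_from_IEEE(req, paper['dblpkey'] + ".pdf")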
    def extract_paper_from_SPRINGER(self, req, filename):
        """
        Access an already-opened Springer page and locate the link to the PDF.
        Attention: WORKS ONLY IN THE TU DELFT NETWORK or VPN.
        :param req: open urllib response for the paper page, e.g. resolved from "http://dx.doi.org/10.1007/BF00264597"
        :param filename: local file name for the downloaded PDF
        :return: the result of tools.downloadFile (falsy if an existing local copy was kept)
        """
        # the request to the URL must carry a User-Agent header to avoid
        # HTTP Error 403 Forbidden: the site will lock you out as a robot!
        # req = Request(paper_url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = req.read()
        # parse the html code
        soup = BeautifulSoup(webpage, 'html.parser')
        # select only the link tags
        for link in soup.find_all('a'):
            # Springer marks the full-text link with an href ending in ".pdf",
            # for instance: "/content/pdf/10.1007%2FBF00264597.pdf"
            if str(link.get('href')).endswith('.pdf'):
                href_link = link.get('href')
                prefix = "http://link.springer.com"
                pdf_link = prefix + href_link

                print("Accessing " + pdf_link)
                return tools.downloadFile(url=pdf_link,
                                          folder=cfg.folder_pdf,
                                          overwrite=False,
                                          localfilename=filename,
                                          printOutput=False)
        raise BaseException(
            req.geturl() + ' does not contain a valid SPRINGER download link.')
    def extract_paper_from_AAAI(self, req, filename):
        """
        Access a given AAAI URL and locate the link to the PDF.
        Attention: WORKS ONLY IN THE TU DELFT NETWORK or VPN.
        :param req: URL of the AAAI abstract page, e.g. "http://www.aaai.org/ocs/index.php/ICWSM/ICWSM16/paper/view/13130"
        :param filename: local file name for the downloaded PDF
        :return: the result of tools.downloadFile (falsy if an existing local copy was kept)
        """
        # the request to the URL must carry a User-Agent header to avoid
        # HTTP Error 403 Forbidden: the site will lock you out as a robot!
        paper_url = req
        if "viewPaper" not in paper_url:
            paper_url = paper_url.replace("view", "viewPaper")
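        # AAAI's OCS site exposes two abstract URLs per paper: ".../paper/view/<id>"
        # and ".../paper/viewPaper/<id>". We normalise to the viewPaper form, whose
        # page carries a "PDF" anchor; swapping "view" for "download" in that anchor
        # (done below) yields the direct PDF link.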

        req = Request(paper_url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        # parse the html code
        soup = BeautifulSoup(webpage, 'html.parser')
        # select only the link tags
        for link in soup.find_all('a', href=True, text='PDF'):
            # the anchor labelled "PDF" links to the viewer page;
            # swapping "view" for "download" gives the direct PDF URL
            pdf_link = link.get("href").replace("view", "download")
            print("Accessing " + pdf_link)
            return tools.downloadFile(url=pdf_link,
                                      folder=cfg.folder_pdf,
                                      overwrite=False,
                                      localfilename=filename,
                                      printOutput=False)
        raise BaseException(paper_url +
                            ' does not contain a valid AAAI download link.')
    def extract_paper_from_ICWSM(self, req, filename):
        """
        Access an already-opened ICWSM page and locate the link to the PDF.
        Attention: WORKS ONLY IN THE TU DELFT NETWORK or VPN.
        :param req: open urllib response for the paper page, e.g. "http://www.icwsm.org/papers/paper54.html"
        :param filename: local file name for the downloaded PDF
        :return: the result of tools.downloadFile (falsy if an existing local copy was kept)
        """
        # the request to the URL must carry a User-Agent header to avoid
        # HTTP Error 403 Forbidden: the site will lock you out as a robot!
        # req = Request(paper_url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = req.read()
        # parse the html code
        soup = BeautifulSoup(webpage, 'html.parser')
        # ICWSM paper pages link the PDF relative to /papers/
        # via an anchor labelled "PDF"
        for link in soup.find_all('a', href=True, text='PDF'):
            prefix = "http://www.icwsm.org/papers/"
            suffix = link.get("href")
            pdf_link = prefix + suffix
            print("Accessing " + pdf_link)
            return tools.downloadFile(url=pdf_link,
                                      folder=cfg.folder_pdf,
                                      overwrite=False,
                                      localfilename=filename,
                                      printOutput=False)
        raise BaseException(req.geturl() +
                            ' does not contain a valid ICWSM download link.')
    def extract_paper_from_ACM(self, req, filename):
        """
        Access an already-opened ACM page and locate the link to the PDF.
        Attention: WORKS ONLY IN THE TU DELFT NETWORK or VPN.
        :param req: open urllib response, resolved from the DOI link of an ACM page
        :param filename: local file name for the downloaded PDF
        :return: the result of tools.downloadFile (falsy if an existing local copy was kept)
        """
        # the request to the URL must carry a User-Agent header to avoid
        # HTTP Error 403 Forbidden: the site will lock you out as a robot!
        # req = Request(paper_url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = req.read()
        # parse the html code
        soup = BeautifulSoup(webpage, 'html.parser')
        # select only the link tags
        for link in soup.find_all('a'):
            # ACM marks the full-text link with a name attribute such as "FullTextPDF"
            if str(link.get('name')).lower().endswith('pdf'):
                href_link = link.get('href')
                prefix = "http://dl.acm.org/"
                pdf_link = prefix + href_link
                # To avoid file-name conflicts, take the id and the ftid from the
                # link and concatenate them. The offsets +3 and +5 skip over
                # "id=" and "ftid=" respectively.
                pdf_id = href_link[href_link.find("id=") + 3: href_link.find("&f")]
                file_id = href_link[href_link.find("ftid=") + 5: href_link.find("&d")]
                # localfilename = pdf_id + '_' + file_id + '.pdf'
                # folder = "C:/Users/User/Documents/acm_pdfs/"
                print("download file " + pdf_link)
                return tools.downloadFile(url=pdf_link, folder=cfg.folder_pdf, overwrite=False,
                                          localfilename=filename, printOutput=False)
        raise BaseException(req.geturl() + ' does not contain a valid ACM download link.')
    def download_and_store(self, paper, db):
        """
        Downloads the PDF for one DBLP paper entry and records the outcome in MongoDB.
        :param paper: dict parsed from a single DBLP XML entry
        :param db: open MongoDB database handle
        :return:
        """
        # global filters

        global skip
        # the ee XML tag indicates that this paper has some kind of source attached (this will usually be a URL)
        if 'ee' in paper:
            # Do we want to skip this file? There are lots of reasons, see below...
            # Skipping means we will not try to download it

            skip = False
            # filters have been set
            """
      if len(filters) > 0:
        for k, v in filters.items():
          if k == 'scraper':
            self.enabledScrapers.add(v)
            continue
          if not (k in paper and paper[k] == v):
            skip = True
        if not skip:
          if "dblpkey" in paper:
            print("Filter matched: " + str(paper["dblpkey"]))
      """

            # do NOT skip if paper has a key, an ee entry
            if (not skip and type(paper['dblpkey']) is str and type(paper['ee']) is str) and \
                    (('book_title' in paper and paper['book_title'] in self.book_titles) or
                     ('journal' in paper and paper['journal'] in self.journals)):
                # check if it is one of our supported types. IMPORTANT: ADD NEW TYPES HERE IF WE HAVE THEM!
                # we also check whether the resolved DOI belongs to one of our crawlers:
                # if yes, we proceed with the download; otherwise we store the URL in a file
                # listing the not-supported repositories

                # check if the paper was already successfully downloaded
                # downloadinfo is the dictionary which is later stored in the Mongo "downloads" collection to memorize
                # which URLs have been accessed, and if that was successful or not
                downloadinfo = {
                    '_id': paper['ee'],
                    'url': paper['ee'],
                    'dblpkey': paper['dblpkey'],
                    'lastaccessed': datetime.datetime.now(),
                    'success': True
                }
                print("Publication matched: " + str(paper["dblpkey"]))
                skip = False
                filename = paper['dblpkey'] + ".pdf"
                req = ""
                actual_url = ""
                url_open = ""
                # down_info = db.downloads.find_one({'dblpkey': paper['dblpkey']})

                if self.skipPreviouslyAccessedURLs and self.storeToMongo:
                    result = db.downloads.find_one(
                        {'_id': downloadinfo['_id']})

                    if result is None:
                        skip = False
                    # if it wasn't successful try once more
                    elif result['success'] is False:
                        last_access = result['lastaccessed']
                        current_Date = downloadinfo['lastaccessed']
                        days_previous_check = (current_Date - last_access).days
                        skip = True
                        # TODO: make this retry window configurable
                        if days_previous_check >= 30:
                            skip = False
                            print()
                            print("Paper: {}, Last Check: {} days ago!".format(
                                paper['dblpkey'], days_previous_check))
                            print()

                    # check if the download date was greater than 30 days

                    # elif number_of_Days < 2:
                    #  skip = False
                    else:
                        skip = True
                        if result['success']:
                            skip = True
                            global numOfPDFobtained
                            global paperCounter
                            global numOfPDFobtainedInThisSession
                            numOfPDFobtained += 1
                            if numOfPDFobtained % self.statusEveryXdownloads == 0:
                                logging.info(
                                    'DBLP XML PROGRESS: XML Paper Entries {}      PDFs {}     PDFs in this Session {} '
                                    .format(paperCounter, numOfPDFobtained,
                                            numOfPDFobtainedInThisSession))
                else:
                    skip = False  # url not in download collection of mongo db

                if skip is False:

                    pub_info = db.publications.find_one(
                        {'dblpkey': paper['dblpkey']})
                    if pub_info is None:
                        skip = False

                        try:
                            req = Request(
                                paper['ee'],
                                headers={'User-Agent': 'Mozilla/5.0'})
                            url_open = urlopen(req)
                            # if url_open.status != 200:
                            #  skip = True
                            # raise BaseException("HTTPError {}".format(url_open.status))
                            # else:
                            # downloadinfo = {}
                            actual_url = url_open.geturl()
                            global num_of_access
                            # add a time delay because we access the publishers repeatedly:
                            # sleep for a random duration between 60 and 360 seconds

                            rndm_time = int(random.uniform(60, 360))
                            print(
                                "Crawler sleeps for {} min - repository accesses so far: {}"
                                .format(rndm_time / 60, num_of_access))

                            num_of_access += 1
                            time.sleep(rndm_time)
                            if (paper['ee'].lower().endswith("pdf") and "pdf" in self.enabledScrapers) or (
                                    "ieee" in str(actual_url)) or ("springer" in actual_url) or (
                                    "acm" in actual_url) or paper['ee'].startswith("http://www.aaai.org") \
                                    or paper['ee'].startswith("http://www.icwsm.org"):
                                filename = paper['dblpkey'] + ".pdf"
                                skip = False
                                # decide if we want to skip this entry
                                # (e.g., it has been accessed before and we are in the mood for skipping)
                            else:
                                skip = True  # this ee entry is not interesting to us
                                print(
                                    "{}, Repository not supported: {}".format(
                                        paper['dblpkey'], actual_url))
                                downloadinfo['success'] = False
                                downloadinfo[
                                    'error'] = "{}, Repository not supported: {}".format(
                                        paper['dblpkey'], actual_url)
                                db.downloads.replace_one(
                                    {'_id': downloadinfo['_id']},
                                    downloadinfo,
                                    upsert=True)
                                with open(cfg.folder_log +
                                          "not_supported_repos.txt",
                                          'a',
                                          encoding='UTF-8') as f:
                                    f.write(actual_url)
                                    f.write("\n")
                        except BaseException:
                            logging.exception('Cannot download or store ' +
                                              paper['ee'] + " with dblpkey: " +
                                              paper['dblpkey'],
                                              exc_info=True)
                            skip = True  # error with the url_open so skip the download
                            print("first try catch!!! skip: {}".format(skip))
                            if self.storeToMongo:
                                downloadinfo['success'] = False
                                ex = sys.exc_info()
                                downloadinfo['error'] = repr(ex)
                                db.downloads.replace_one(
                                    {'_id': downloadinfo['_id']},
                                    downloadinfo,
                                    upsert=True)
                    else:
                        db.downloads.replace_one({'_id': downloadinfo['_id']},
                                                 downloadinfo)
                        skip = True
                else:
                    print("{} already in DB".format(paper['dblpkey']))
                    skip = True  # already exist in the db

                # Do the Download and store to MongoDB
                # print("Proceed with: {} : the download and store: Skip: {}".format(paper['dblpkey'],skip))
                if not skip:
                    try:

                        # download based on type. IMPORTANT: Add supported types here, and also a few lines above!
                        if paper['ee'].lower().endswith(
                                "pdf") and "pdf" in self.enabledScrapers:
                            # Normal PDF download
                            self.newPapersIn = True  # There are new additions
                            skipped = not tools.downloadFile(
                                downloadinfo['url'],
                                overwrite=False,
                                folder=cfg.folder_pdf,
                                localfilename=filename)

                        elif "springer" in actual_url:
                            # go to springer crawler
                            self.newPapersIn = True  # There are new additions
                            global num_of_access_in_springer
                            num_of_access_in_springer += 1
                            print(
                                "{}, publisher: Springer, #Access: {}".format(
                                    paper['dblpkey'],
                                    num_of_access_in_springer))
                            skipped = not self.extract_paper_from_SPRINGER(
                                url_open, filename)

                        elif "acm" in actual_url:
                            # go to acm crawler
                            self.newPapersIn = True  # There are new additions
                            global num_of_access_in_acm
                            num_of_access_in_acm += 1
                            print("{}, publisher: ACM, #Access: {}".format(
                                paper['dblpkey'], num_of_access_in_acm))
                            skipped = not self.extract_paper_from_ACM(
                                url_open, filename)

                        elif "ieee" in actual_url:
                            # go to ieee crawler
                            self.newPapersIn = True  # There are new additions
                            global num_of_access_in_ieee
                            num_of_access_in_ieee += 1
                            print("{}, publisher: IEEE, #Access: {}".format(
                                paper['dblpkey'], num_of_access_in_ieee))
                            skipped = not self.extract_paper_from_IEEE(
                                url_open, filename)

                        elif paper['ee'].startswith("http://www.aaai.org"):
                            # go to aaai crawler
                            self.newPapersIn = True  # There are new additions
                            global num_of_access_in_aaai
                            num_of_access_in_aaai += 1
                            print("{}, publisher: AAAI, #Access: {}".format(
                                paper['dblpkey'], num_of_access_in_aaai))
                            skipped = not self.extract_paper_from_AAAI(
                                actual_url, filename)

                        elif paper['ee'].startswith("http://www.icwsm.org"):
                            # go to icwsm crawler
                            self.newPapersIn = True  # There are new additions
                            global num_of_access_in_icwsm
                            num_of_access_in_icwsm += 1
                            print("{}, publisher: ICWSM, #Access: {}".format(
                                paper['dblpkey'], num_of_access_in_icwsm))
                            skipped = not self.extract_paper_from_ICWSM(
                                url_open, filename)

                        else:
                            skipped = True

                        if skipped:
                            logging.info(' Used local PDF copy for ' +
                                         paper['dblpkey'])
                        else:
                            logging.info(' Downloaded ' + paper['dblpkey'])
                            # global numOfPDFobtainedInThisSession
                            numOfPDFobtainedInThisSession += 1
                            # store
                            if self.storeToMongo:
                                # set additional data
                                paper['_id'] = paper['dblpkey']
                                # store to mongo
                                db.publications.replace_one(
                                    {'_id': paper['_id']}, paper, upsert=True)
                                db.downloads.replace_one(
                                    {'_id': downloadinfo['_id']},
                                    downloadinfo,
                                    upsert=True)
                    except BaseException:
                        logging.exception('Cannot download or store ' +
                                          paper['ee'] + " with dblpkey: " +
                                          paper['dblpkey'],
                                          exc_info=True)
                        print("second try catch")
                        if self.storeToMongo:
                            downloadinfo['success'] = False
                            ex = sys.exc_info()
                            downloadinfo['error'] = repr(ex)
                            db.downloads.replace_one(
                                {'_id': downloadinfo['_id']},
                                downloadinfo,
                                upsert=True)
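
# Minimal driving sketch (a sketch, not part of the original file; the names
# "crawler" and "parsed_dblp_entries" are hypothetical): download_and_store is
# meant to be fed one parsed DBLP XML entry at a time, together with an open
# MongoDB handle, e.g.
#   db = MongoClient().dblp
#   for paper in parsed_dblp_entries:
#       crawler.download_and_store(paper, db)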
Example #7
from pyhelpers import tools
from html.parser import HTMLParser
import json
import requests


print(tools.downloadFile('http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=5089263',
                         localfilename="1.pdf", overwrite=True))
print(tools.downloadFile('http://dl.acm.org/ft_gateway.cfm?id=2964799&ftid=1751125&dwn=1&CFID=847673629&CFTOKEN=22464829',
                         localfilename="2.pdf", overwrite=True))
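# NOTE: these two calls are direct-download smoke tests for
# pyhelpers.tools.downloadFile. The ACM URL carries session-bound
# CFID/CFTOKEN parameters and, like the IEEE stamp URL, is only expected
# to resolve from inside a subscribed network (e.g. TU Delft or VPN).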

class PageParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.doc = {'references': [], 'citedby': []}
        self.section = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            d = dict(attrs)
            if 'name' in d and d['name'].startswith('citation_'):
                name = d['name'][9:]
                content = d['content']
                self.doc[name] = content
        elif tag == 'a':
            d = dict(attrs)
            if 'name' in d:
                name = d['name']
                self.section = name if name in ['references', 'citedby'] else None
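            # ACM citation pages mark their "references" and "citedby" blocks
            # with <a name="..."> anchors; remembering the current block lets
            # the citation.cfm links below be attributed to the right list.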
            if self.section is not None and 'href' in d and d['href'].startswith('citation.cfm?'):
                query = d['href'][13:]
                for p in query.split('&'):
                    a = p.split('=')