Example #1
import random
import time


class PaperAbstractCrawler(AbstractCrawler):
    """
    Class :  PaperAbstractCrawler
    Description: PaperAbstractCrawler searches Google Scholar for each paper name stored in the PaperNames
    table (joined with Persons) and parses and stores the results. AbstractCrawler and MysqlMessager are
    provided by the surrounding project.
    """
    def __init__(self):
        """ Set up mysql messenger
        @param self Pointer to class
        """
        super(PaperAbstractCrawler, self).__init__()
        self.mm = MysqlMessager("PaperLinks")
        self.libraries = []

    def crawl(self):
        """ Crawl information from the page
        @param self Pointer to class
        """
        #self.mm.clear_table()
        sql = "SELECT PaperNames.PaperName,PaperNames.Paper_ID, Persons.FirstName, LastName FROM PaperNames inner join Persons on PaperNames.P_ID = Persons.ID"
        self.mm.execute_sql(sql)
        rows = self.mm.fetch()
        for row in rows:
            # Skip rows with Paper_ID <= 795, presumably already processed in an earlier run
            if row[1] > 795:
                self.name = row[0]
                url_name = '+'.join(self.name.split(' '))
                url = "http://scholar.google.fi/scholar?as_q=%s&as_occt=title&hl=en" % url_name
                print url
                try:
                    soup = self._downloader(url)
                    self._parse_and_store(soup, row[1])
                except Exception, e:
                    print e
                # Wait a random interval between requests to avoid being rate limited by Google Scholar
                time.sleep(random.randint(60, 130))
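
A minimal driver sketch for the class above (not part of the original listing), assuming PaperAbstractCrawler
and its project-level dependencies (AbstractCrawler, MysqlMessager) are importable and the PaperNames and
Persons tables are already populated:

# Hypothetical driver: run the Google Scholar crawler once over the joined PaperNames/Persons rows.
if __name__ == "__main__":
    crawler = PaperAbstractCrawler()
    crawler.crawl()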
Example #2
import codecs
import time
from urllib2 import urlopen  # Python 2 standard library


class PaperNameCrawler(AbstractCrawler):
    """
    Class :  PaperNameCrawler
    Description: PaperNameCrawler crawls the names of papers from the Tuhat database of the University of
    Helsinki, given the name of the author. AbstractCrawler and MysqlMessager are provided by the
    surrounding project.
    """
    def __init__(self):
        """ Set up mysql messenger
        @param self Pointer to class
        """
        super(PaperNameCrawler, self).__init__()
        self.mm = MysqlMessager("PaperNames")

    def crawl(self):
        """ Crawl information from the page
        @param self Pointer to class
        """
        self.mm.clear_table()
        sql = "SELECT * FROM Persons"
        self.mm.execute_sql(sql)
        rows = self.mm.fetch()
        for row in rows:
            url = row[3]  # column 3 holds the person's Tuhat profile URL
            soup = self._downloader(url)
            self._parse_and_store(soup, row[0])  # column 0 is the person ID (foreign key)

    def _downloader(self, url, out_folder="doc/"):
        """ Download the web page and store it python data structure
        @param self Pointer to class
        @param url URL to be downloaded
        @param out_folder Folder that stores information
        """ ""
        return super(PaperNameCrawler, self)._downloader(url)

    def _parse_and_store(self, soup, foreign_key):
        """ Parse a person's Tuhat profile page and store each publication's name and link
        @param self Pointer to class
        @param soup Parsed HTML (BeautifulSoup) of the person's profile page
        @param foreign_key Person ID stored as P_ID in the PaperNames table
        """
        def doi2url(doi):
            """
            Resolve a DOI link to the URL it redirects to.
            ##TODO: Not working for now
            """
            try:
                link = urlopen(doi).geturl()
            except Exception, e:
                # Error occurred while resolving the DOI address; fall back to the raw DOI link
                print "Exception happened while processing doi: %s" % e
                link = doi
            print link
            return link

        super(PaperNameCrawler, self)._parse_and_store(soup)
        print self.log_dir
        log_file = codecs.open(
            self.log_dir + "paper_name_crawler_log_file.txt", "w", "utf-8")
        for p in soup.findAll('p', {'class': 'uh_relationlist'}):
            # Only the relation list that links to the person's publications page is of interest,
            # so check the link before downloading it
            if 'publications.html' == p.a['href'].split('/')[-1]:
                inner_soup = self._downloader(p.a['href'])
                for inner_link in inner_soup.findAll('h2', {'class': 'title'}):
                    paper_names = inner_link.a.span.contents[0]
                    paper_link = inner_link.a['href']
                    paper_soup = self._downloader(paper_link)
                    paper_out_link_resolved = "default"
                    try:
                        doi = paper_soup.findAll(
                            'ul',
                            {'class': 'relations digital_object_identifiers'
                             })[0].li.a['href']
                        paper_out_link_resolved = u"\"" + doi2url(doi) + u"\""
                        #paper_out_link_resolved =  doi2url(doi)
                        time.sleep(60)
                    except:
                        # can not find any links in the web page
                        try:
                            # then tries to find whether there is any link connects to the paper
                            for h in paper_soup.findAll(
                                    'h3', {'class': 'subheader'}):
                                if h.contents[0] == "Links":
                                    print "Links"
                                    paper_out_link_resolved = u"\"" + h.parent.ul.li.a[
                                        'href'] + u"\""
                        except:
                            # can not find links either
                            pass
                    sql = u"INSERT INTO PaperNames (Paper_ID, PaperName, Link, P_ID) VALUES ( default,\"" +\
                          paper_names + u"\", " + paper_out_link_resolved + u", " + str(foreign_key) + u")"
                    print sql
                    self.mm.execute_sql(sql, log_file)
        log_file.close()
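
Similarly, a hedged usage sketch for PaperNameCrawler (not part of the original listing), assuming the Persons
table already holds each person's Tuhat profile URL in its fourth column:

# Hypothetical driver: rebuild the PaperNames table by walking every person's publications.html page.
if __name__ == "__main__":
    name_crawler = PaperNameCrawler()
    name_crawler.crawl()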