示例#1
0
    def __get_paper_from_acm (self, entry_url):
        """Build a Paper (title, abstract, authors) from an ACM DL entry page.

        Two requests are made through self.op: one for the entry page itself
        and one for the abstract tab whose link is found in the page source.

        Args:
            entry_url: URL of the paper's entry page.

        Returns:
            A Paper created from the title and abstract text, with one
            Author (name, affiliation) added per row of the author table.
            The affiliation is "" when a row carries none.

        Raises:
            IndexError: if the 'divmain' block, the title node, the author
                table or an author's name link is not found in the page.
            AttributeError: if no abstract link is found in the page source.
        """
        resp_body = self.op.open (entry_url).read ()
        root = sp.fromstring (resp_body)

        divmain = root.xpath ("//div[@id='divmain']")[0]

        title = divmain.xpath ("div/h1/strong")[0].text

        # the abstract lives on a separate tab page; its relative link is
        # pulled from the raw page source with a regex
        abst_url = re.compile (r"tab.abstract.cfm[^']*").search (resp_body).group (0)
        abst_url = 'http://dl.acm.org/' + abst_url
        abst_body = self.op.open (abst_url).read ()

        # concatenate every text node found under the abstract paragraphs
        abst = ''.join (sp.fromstring (abst_body).xpath ('//div/p/div/p/descendant-or-self::*/text()'))

        # instantiate a Paper class
        paper = Paper (title, abst)

        # locate the author table block (second nested table under divmain)
        author_table = divmain.xpath ("table/tr/td/table")[1]

        # add each author
        for author_row in author_table.xpath ('tr'):
            name = author_row.xpath ('td/a/text()')[0]

            # The affiliation is inside <a> only when it has its own link;
            # otherwise it sits directly in <td>, or is missing altogether.
            # Indexing [0] unconditionally would raise IndexError for those rows.
            affn = ""
            for affn_query in ('td/a/small/text()', 'td/small/text()'):
                found = author_row.xpath (affn_query)
                if found:
                    affn = found[0]
                    break

            paper.add_author (Author (name, affn))

        return paper
示例#2
0
    def __get_paper_from_ms (self, entry_url):
        """Build a Paper (title and authors, no abstract) from an entry page.

        The page is fetched through self.op and passed through
        self.__deljs_html before parsing (presumably to strip scripts --
        confirm against that helper).

        Args:
            entry_url: URL of the paper's entry page.

        Returns:
            A Paper created from the title text, with one author added for
            every author link found in the paper description block.

        Raises:
            IndexError: if the title span or the description block is not
                found in the page.
        """
        raw_html = self.op.open (entry_url).read ()
        root = sp.fromstring (self.__deljs_html (raw_html))

        # the title is the text of a span with a fixed id
        title_span = root.xpath ("//span[@id='ctl00_MainContent_PaperItem_title']")[0]
        paper = Paper (title_span.text)

        # the second child div of the paper item holds the author links
        child_divs = root.xpath ("//div[@id='ctl00_MainContent_PaperItem_divPaper']/div")
        description_div = child_divs[1]

        # each author link is resolved into an author by a sibling helper
        author_links = description_div.xpath ("a[@class='author-name-tooltip']/@href")
        for link in author_links:
            author = self.__get_author_from_ms (link)
            paper.add_author (author)

        return paper
示例#3
0
    def __get_paper_from_acm (self, entry_url):
        """Build a Paper (title and authors, no abstract) from an ACM entry page.

        The URL is passed through self.__wrapper before being opened with
        self.op, and the response goes through self.__deljs_html before
        parsing. The abstract is intentionally not fetched.

        Args:
            entry_url: URL of the paper's entry page.

        Returns:
            A Paper created from the title text, with one
            Author (name, affiliation) added per row of the author table.
            The affiliation is "" when a row carries none.

        Raises:
            IndexError: if the 'divmain' block, the title node, the author
                table or an author's name link is not found in the page.
        """
        page = self.op.open (self.__wrapper (entry_url)).read ()
        root = sp.fromstring (self.__deljs_html (page))

        main_div = root.xpath ("//div[@id='divmain']")[0]
        heading = main_div.xpath ("div/h1/strong")[0]

        paper = Paper (heading.text)

        # the author table is the second nested table under the main div
        authors_tbl = main_div.xpath ("table/tr/td/table")[1]

        for row in authors_tbl.xpath ('tr'):
            author_name = row.xpath ('td/a/text()')[0]

            # The affiliation is inside <a> when it has its own link and
            # directly inside <td> otherwise; the first query that yields
            # text wins, and "" is used when neither does.
            affiliation = ""
            for query in ('td/a/small/text()', 'td/small/text()'):
                hits = row.xpath (query)
                if len (hits) > 0:
                    affiliation = hits[0]
                    break

            paper.add_author (Author (author_name, affiliation))

        return paper