Example #1
import pandas as pd

def getLinksECTA(startYear, endYear):
    """
    Helper that gets a dataframe of links to ECTA articles in the given timeframe.
    startYear: int
        The first year for which ECTA info is requested
    endYear: int
        The last year (inclusive) for which ECTA info is requested
    """
    rootUrl = "https://onlinelibrary.wiley.com/toc/14680262/"
    years = range(startYear, endYear + 1)
    issues = range(1, 7)
    # the journal is bimonthly: issues 1-6 appear in Jan, Mar, May, Jul, Sep, Nov
    issueMap = {1: '01', 2: '03', 3: '05', 4: '07', 5: '09', 6: '11'}
    linkList = []
    for year in years:
        vol = year - 1932  # volume 1 of Econometrica appeared in 1933
        for issue in issues:
            url = rootUrl + str(year) + "/" + str(vol) + "/" + str(issue)
            month = issueMap[issue]
            issueDate = str(year) + '-' + month + '-01'
            # quickSoup is a shared helper (sketched below) that fetches and parses the page
            soup = quickSoup(url)
            all_abs = soup.find_all('a', attrs={'title': 'Abstract'})
            urls = [a['href'] for a in all_abs]
            dates = [issueDate for i in range(len(urls))]
            compiled = list(zip(dates, urls))
            linkList = linkList + compiled
    ecta_links = pd.DataFrame(linkList, columns = ['Date', 'ArticleURL'])
    ecta_links['ArticleURL'] = ecta_links['ArticleURL'].apply(lambda x: x.replace('/abs/', '/full/'))
    ecta_links['ArticleURL'] = "https://onlinelibrary.wiley.com" + ecta_links['ArticleURL']
    return ecta_links
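All of these helpers rely on a shared quickSoup utility that the snippets do not show. A minimal sketch of what it presumably does, assuming it simply wraps requests and BeautifulSoup (the headers and parser choice here are guesses, not the project's actual code):

import requests
from bs4 import BeautifulSoup

def quickSoup(url):
    """Fetch a URL and return its parsed HTML (sketch; the real helper may differ)."""
    # A browser-like User-Agent is often needed so publisher sites do not reject the request.
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')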
Example #2
import pandas as pd
from datetime import date

def getLinksAER():
    """
    Helper to get the links for AER articles.
    """
    rootUrl = "https://www.aeaweb.org/journals/aer/issues"
    rootSoup = quickSoup(rootUrl)
    issues = rootSoup.find_all('div', attrs={'style': 'margin-top:5px;'})
    bigList = []
    for issue in issues:
        littleList = []
        # the issue label reads like "<bullet>Month Year ..."; the leading character
        # is not part of the month name, so it is dropped below
        year = int(issue.get_text().split(" ")[1])
        monthName = issue.get_text().split(" ")[0][1:]
        monthList = [
            "January", "February", "March", "April", "May", "June", "July",
            "August", "September", "October", "November", "December"
        ]
        monthNum = monthList.index(monthName) + 1
        mmyy = date(year=year, month=monthNum, day=1)
        littleList.append(mmyy)
        href = issue.find('a')['href']
        littleList.append("https://www.aeaweb.org" + href)
        bigList.append(littleList)

    rootDf = pd.DataFrame(bigList, columns=['Date', 'ArticleURL'])
    return rootDf
Example #3
def getArticles(row):
    """
    Helper that collects the article links from one AER issue row.
    Appends [Date, ArticleURL, Title] entries to the module-level aerList.
    """
    issueSoup = quickSoup(row['URL'])
    articles = issueSoup.find_all('h3')
    for art in articles:
        if art.find('a') is not None:
            artUrl = "https://www.aeaweb.org" + art.find('a')['href']
            artTitle = art.find('a').get_text()
            newRow = [row['Date'], artUrl, artTitle]
            aerList.append(newRow)
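getArticles is written to be applied row by row and appends into a module-level aerList instead of returning a value. Note that it reads row['URL'] while getLinksAER names that column 'ArticleURL', so the caller presumably renames it first. A hedged sketch of how the two pieces might be wired together (the rename and the final column names are assumptions):

aerList = []  # module-level accumulator that getArticles appends into

issueDf = getLinksAER().rename(columns={'ArticleURL': 'URL'})
issueDf.apply(getArticles, axis=1)  # fills aerList as a side effect

aerDf = pd.DataFrame(aerList, columns=['Date', 'ArticleURL', 'Title'])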
Example #4
def getAbstractAER(url):
    """
    Helper to get the abstract given a URL.
    """
    page = quickSoup(url)
    abst = page.find("section",
                     attrs={"class": "article-information abstract"})
    if abst is not None:
        cleanedAbs = abst.get_text().strip().replace("Abstract", "").strip()
        return cleanedAbs
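With the article list in hand, the abstract scraper can simply be mapped over the URL column. A one-line sketch, assuming the aerDf frame built in the previous step:

aerDf['Abstract'] = aerDf['ArticleURL'].apply(getAbstractAER)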
Example #5
def getDetailsECTA(url):
    """
    Helper that gets the title and abstract from an ECTA article.
    url: string
        Location of the ECTA article being scraped
    """
    try:
        soup = quickSoup(url)
        abst = soup.find('div', attrs={'class': 'article-section__content en main'})
        if abst is None:
            return None
        abstText = abst.get_text().strip()
        title = soup.find('h2', attrs={'class': 'citation__title'}).get_text()
        return (url, title, abstText)
    except Exception as e:
        print(e)
        raise
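The ECTA side follows the same pattern: getLinksECTA builds the URL frame and getDetailsECTA is mapped over it. A sketch under the assumption that articles whose abstract block is missing should be dropped (the year range is illustrative):

links = getLinksECTA(2015, 2020)
details = [getDetailsECTA(u) for u in links['ArticleURL']]
detailsDf = pd.DataFrame([d for d in details if d is not None],
                         columns=['ArticleURL', 'Title', 'Abstract'])
ectaDf = links.merge(detailsDf, on='ArticleURL')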