from datetime import date

import pandas as pd


def getLinksECTA(startYear, endYear):
    """
    Helper that gets a dataframe of links to ECTA articles in the given timeframe.

    startYear: int
        The first year for which ECTA info is requested
    endYear: int
        The last year (inclusive) for which ECTA info is requested
    """
    rootUrl = "https://onlinelibrary.wiley.com/toc/14680262/"
    years = range(startYear, endYear + 1)
    issues = range(1, 7)
    # ECTA publishes six issues per year in alternating months.
    issueMap = {1: '01', 2: '03', 3: '05', 4: '07', 5: '09', 6: '11'}
    linkList = []
    for year in years:
        # Volume numbering starts at 1 in 1933, so volume = year - 1932.
        vol = year - 1932
        for issue in issues:
            url = rootUrl + str(year) + "/" + str(vol) + "/" + str(issue)
            month = issueMap[issue]
            issueDate = str(year) + '-' + month + '-01'
            soup = quickSoup(url)
            # Each abstract link on the table-of-contents page points at one article.
            all_abs = soup.find_all('a', attrs={'title': 'Abstract'})
            urls = [a['href'] for a in all_abs]
            dates = [issueDate] * len(urls)
            linkList = linkList + list(zip(dates, urls))
    ecta_links = pd.DataFrame(linkList, columns=['Date', 'ArticleURL'])
    # Swap the abstract path for the full-text path and make the URLs absolute.
    ecta_links['ArticleURL'] = ecta_links['ArticleURL'].apply(lambda x: x.replace('/abs/', '/full/'))
    ecta_links['ArticleURL'] = "https://onlinelibrary.wiley.com" + ecta_links['ArticleURL']
    return ecta_links
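# Every helper in this module calls quickSoup, which is defined elsewhere.
# A minimal sketch of it, assuming it simply fetches a URL and returns a
# parsed BeautifulSoup object (the real helper may set headers or retry);
# kept commented out so it does not shadow the actual definition:
#
# import requests
# from bs4 import BeautifulSoup
#
# def quickSoup(url):
#     response = requests.get(url)
#     response.raise_for_status()
#     return BeautifulSoup(response.text, 'html.parser')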
def getLinksAER():
    """
    Helper to get the links for AER articles.
    """
    rootUrl = "https://www.aeaweb.org/journals/aer/issues"
    rootSoup = quickSoup(rootUrl)
    # Each issue on the index page sits in a div styled with a 5px top margin.
    issues = rootSoup.find_all('div', attrs={'style': 'margin-top:5px;'})
    bigList = []
    for issue in issues:
        # The div's text looks like "\nMonth Year ...", so the first token
        # carries a leading newline character that is stripped below.
        year = int(issue.get_text().split(" ")[1])
        monthName = issue.get_text().split(" ")[0][1:]
        monthList = [
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December"
        ]
        monthNum = monthList.index(monthName) + 1
        mmyy = date(year=year, month=monthNum, day=1)
        href = issue.find('a')['href']
        bigList.append([mmyy, "https://www.aeaweb.org" + href])
    rootDf = pd.DataFrame(bigList, columns=['Date', 'ArticleURL'])
    return rootDf
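# Illustrative usage (not from the original module): the frame returned by
# getLinksAER has one row per issue, pairing the issue date with the URL of
# that issue's index page.
#
# aerIssues = getLinksAER()
# aerIssues.head()  # columns: Date, ArticleURL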
def getArticles(row):
    """
    Helper that collects every article link from a single AER issue page.
    Expects a row with 'URL' and 'Date' fields and appends
    [Date, ArticleURL, Title] entries to the module-level aerList.
    """
    issueSoup = quickSoup(row['URL'])
    # Article titles on the issue page are h3 headings wrapping a link.
    articles = issueSoup.find_all('h3')
    for art in articles:
        if art.find('a') is not None:
            artUrl = "https://www.aeaweb.org" + art.find('a')['href']
            artTitle = art.find('a').get_text()
            aerList.append([row['Date'], artUrl, artTitle])
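# Sketch of the glue between getLinksAER and getArticles. Since getArticles
# reads row['URL'] and appends to a module-level aerList, the issue frame's
# ArticleURL column is presumably renamed and the function applied row by
# row. This reconstruction is an assumption; buildArticlesAER is a
# hypothetical name, not a function from the original module.
aerList = []

def buildArticlesAER():
    issueDf = getLinksAER().rename(columns={'ArticleURL': 'URL'})
    issueDf.apply(getArticles, axis=1)  # each call appends rows to aerList
    return pd.DataFrame(aerList, columns=['Date', 'ArticleURL', 'Title'])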
def getAbstractAER(url):
    """
    Helper to get the abstract given a URL.
    """
    page = quickSoup(url)
    abst = page.find("section", attrs={"class": "article-information abstract"})
    if abst is not None:
        cleanedAbs = abst.get_text().strip().replace("Abstract", "").strip()
        return cleanedAbs
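# Illustrative usage (assumed, not from the original module): attach
# abstracts to the article frame built above. A polite delay between
# requests may be warranted when scraping at scale.
#
# aerDf = buildArticlesAER()
# aerDf['Abstract'] = aerDf['ArticleURL'].apply(getAbstractAER)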
def getDetailsECTA(url):
    """
    Helper that gets the title and abstract from an ECTA article.

    url: string
        Location of the ECTA article being scraped
    """
    try:
        soup = quickSoup(url)
        abst = soup.find('div', attrs={'class': 'article-section__content en main'})
        if abst is None:
            # Some pages (e.g. front matter) have no main content section.
            return None
        abstText = abst.get_text().strip()
        title = soup.find('h2', attrs={'class': 'citation__title'}).get_text()
        return (url, title, abstText)
    except Exception as e:
        # Print the error for the failing URL before re-raising, so a long
        # scrape reports which page broke.
        print(e)
        raise
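# Illustrative usage (assumed, not from the original module): collect the
# title and abstract for each ECTA article found by getLinksECTA, dropping
# pages where getDetailsECTA found no main content section.
#
# ecta_links = getLinksECTA(2000, 2005)
# details = [getDetailsECTA(u) for u in ecta_links['ArticleURL']]
# ecta_df = pd.DataFrame([d for d in details if d is not None],
#                        columns=['ArticleURL', 'Title', 'Abstract'])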