/
getMetaArticles.py
82 lines (74 loc) · 2.14 KB
/
getMetaArticles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from bs4 import BeautifulSoup
import os
import reader
import re
# Meta articles are collected from the Oxford Journals site by searching for
# "cancer meta analysis".
# Text file that stores the collected article URLs, one URL per line.
metaList = "oxfordJournalMeta.txt"
# Site root that is prefixed to the hrefs scraped from search results.
baseURL = "http://jnci.oxfordjournals.org"
def addMetaURL(url, path=None):
    """Append one article URL to the meta-article list file.

    The list file (``oxfordJournalMeta.txt`` by default) contains URLs of
    full meta articles in the Oxford Journals, one per line.

    Args:
        url: Article URL to record; it is written followed by a newline.
        path: Optional file to append to. When omitted, the module-level
            ``metaList`` file is used.
    """
    if path is None:
        path = metaList
    # The context manager closes the handle even if the write fails.
    with open(path, 'a') as listFile:
        listFile.write(url + "\n")
def removeMetaDuplicates(path=None):
    """Rewrite the meta-article list file with duplicate URLs removed.

    The first occurrence of every line is kept and the original order of
    the remaining lines is preserved. Each line is written back terminated
    by a newline.

    Args:
        path: Optional list file to de-duplicate in place. When omitted,
            the module-level ``metaList`` file is used.
    """
    if path is None:
        path = metaList
    with open(path, 'r') as listFile:
        # Compare lines without their terminator so that a final line
        # lacking "\n" still matches an earlier copy of itself.
        entries = [line.rstrip('\n') for line in listFile]
    seen = set()
    unique = []
    for entry in entries:
        if entry not in seen:
            seen.add(entry)
            unique.append(entry)
    with open(path, 'w') as listFile:
        listFile.writelines(entry + "\n" for entry in unique)
def addAllLinks(url):
    """Record every full-text article link found on a search-results page.

    ``url`` is a search result page on Oxford Journals that lists meta
    articles. Each ``<a rel="full-text">`` anchor on the page has its href
    cut at the "?sid" search id, is prefixed with ``baseURL`` and is
    appended to the list file through ``addMetaURL``. Afterwards the list
    file is de-duplicated with ``removeMetaDuplicates``.

    Args:
        url: Address of the search-results page to scan.
    """
    source = reader.readURL(url)
    # NOTE(review): no parser is named here, so bs4 picks whichever one is
    # installed; consider pinning one so results are reproducible.
    soup = BeautifulSoup(source)
    for link in soup.find_all("a", {"rel": "full-text"}):
        href = link.get('href')
        if not href:
            # An anchor without a target has nothing to record.
            continue
        articlePath = href.split("?sid")[0]  # getting rid of search id
        addMetaURL(baseURL + articlePath)
    removeMetaDuplicates()
#addAllLinks(url)
def writeSource(url, source):
    """Save an article's page source as a text file in the ``meta`` folder.

    The file is named after the page title (made filename-safe with
    ``cleanTitle``). Its first line is the URL and the remainder is the
    page source. Nothing is written when the page has no usable title or
    when a file with that name already exists.

    Args:
        url: Address the page source was fetched from.
        source: Page source of the article.
    """
    metaFolder = 'meta'
    soup = BeautifulSoup(source)
    # ``.string`` is None when <title> is empty or wraps more than one
    # node; such pages cannot be named, exactly like pages with no <title>.
    if soup.title is None or soup.title.string is None:
        return
    title = cleanTitle(soup.title.string)
    newName = '{}/{}.txt'.format(metaFolder, title)
    if os.path.isfile(newName):
        return
    # Create the output folder on first use instead of failing in open().
    if not os.path.isdir(metaFolder):
        os.makedirs(metaFolder)
    with open(newName, 'w') as newFile:
        newFile.write(url)
        newFile.write("\n")
        newFile.write(source)
def cleanTitle(title):
    """Make a title into a valid filename.

    Leading and trailing whitespace is stripped first; then every character
    that is not a word character, '-', '_', '.' or a space is replaced by a
    single space.

    Args:
        title: Raw title text.

    Returns:
        The sanitized title string.
    """
    # Raw string so the regex escapes are not parsed as (invalid) string
    # escape sequences.
    return re.sub(r'[^\w\-_\. ]', ' ', title.strip())
def readMeta():
    """Fetch and store every article listed in the meta-article list file.

    Reads the ``metaList`` file one URL per line, downloads each page with
    ``reader.readURL`` and hands the URL and its source to ``writeSource``.
    """
    with open(metaList) as listFile:
        links = [line.rstrip('\n') for line in listFile]
    for link in links:
        if not link.strip():
            # Blank lines carry no URL to fetch.
            continue
        source = reader.readURL(link)
        writeSource(link, source)
# Script entry point: fetch and store every article listed in the URL file.
# NOTE(review): this call is unguarded, so it also runs whenever the module
# is imported.
readMeta()