-
Notifications
You must be signed in to change notification settings - Fork 0
/
post_spider.py
149 lines (111 loc) · 3.83 KB
/
post_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
:Name:
post_spider.py
:Authors:
Soufian Salim (soufi@nsal.im)
:Date:
October 28th, 2014
:Description:
forum.ubuntu-fr.org post spider
"""
from datetime import datetime
from scrapy import Spider, Selector, Item, Field, Request, FormRequest
from settings import start_date, end_date, thread_json_file, username, password
from utility import extract_identifier, compute_date
import codecs, json
class Post(Item):
    """
    Post

    One scraped forum message, as populated by PostSpider.parse_page.
    """
    author = Field()        # display name of the poster (link text, or plain text for unlinked authors)
    author_id = Field()     # numeric identifier parsed from the author's profile link, or None when no link exists
    thread = Field()        # identifier of the enclosing thread, parsed from the thread URL
    number = Field()        # position of the post within the thread (the "#N" label, as int)
    datetime = Field()      # posting date, stringified result of compute_date()
    content = Field()       # post body HTML, signature and edit-notice stripped
    signature = Field()     # poster's signature HTML, or False when absent
    modification = Field()  # last-edit date string parsed from the edit notice, or False if never edited
class PostSpider(Spider):
    """
    This spider crawls forum.ubuntu-fr.org threads to scrape post information.

    Workflow: log in with the credentials from settings, load the thread list
    from ``thread_json_file``, keep only threads whose last post falls inside
    [start_date, end_date], then fetch every page of each kept thread and
    yield one Post item per message.
    """
    name = "post"
    allowed_domains = ["forum.ubuntu-fr.org"]
    start_urls = ["http://forum.ubuntu-fr.org"]

    def parse(self, response):
        """
        Master function: entry point for the start URL, delegates to login.
        """
        return self.login(response)

    def login(self, response):
        """
        Attempts to login by submitting the forum's top login form.

        :param response: the forum index page containing the login form
        :return: a FormRequest whose callback continues with request_threads
        """
        return FormRequest.from_response(
            response,
            formxpath="//form[@id=\"login_top\"]",
            formdata={
                "req_username": username,
                "req_password": password,
            },
            callback=self.request_threads,
        )

    def request_threads(self, response):
        """
        Sends a request to each thread url whose last post is in the window.
        """
        # Check that the login succeeded before going on: the logout link
        # only appears for authenticated users.  Compare against decoded
        # text, not response.body, which is bytes on Python 3 / modern
        # Scrapy and would make the unicode membership test always fail.
        body_text = response.text if hasattr(response, "text") else response.body
        if "title=\"Déconnexion\"" not in body_text:
            print("error: could not login")
            return
        with codecs.open(thread_json_file, "r", "utf-8") as json_data:
            json_threads = json.load(json_data)
        for thread in json_threads:
            thread_last_post_date = datetime.strptime(
                thread["last_post_date"], "%Y-%m-%d %H:%M:%S")
            # Skip threads whose last activity is outside the target window.
            if thread_last_post_date < start_date or thread_last_post_date > end_date:
                continue
            yield Request(thread["url"], callback=self.parse_thread)

    def parse_thread(self, response):
        """
        Parses the http://forum.ubuntu-fr.org thread pages: reads the page
        count from the pagination block, filters by the thread's first post
        date, and schedules one request per page.
        """
        # Single-page threads may lack a pagination block entirely; indexing
        # [0] unconditionally would raise IndexError, so default to 1 page.
        pagelinks = response.css(".pagelink").extract()
        page_count = 1
        if pagelinks:
            link_selector = Selector(text=pagelinks[0])
            links = link_selector.xpath("//a/text()").extract()
            # The final link is the "next" arrow; the one before it carries
            # the last page number.  Guard with > 1 (not > 0): a one-element
            # list would make links[-2] raise IndexError.
            if len(links) > 1:
                page_count = int(links[-2])
        bp_selector = Selector(text=response.css(".blockpost")[0].extract())
        first_post_date = compute_date(
            bp_selector.xpath("//h2/span/a/text()").extract()[0])
        # Drop the whole thread if it was started outside the window.
        if first_post_date > end_date or first_post_date < start_date:
            return
        # range(), not the Python-2-only xrange(): the latter is a NameError
        # on Python 3.
        for page_number in range(1, page_count + 1):
            yield Request("{0}&p={1}".format(response.request.url, page_number),
                          callback=self.parse_page)

    def parse_page(self, response):
        """
        Parses one page of the forum, yielding a Post item per message block.
        """
        for bp in response.css(".blockpost"):
            bp_selector = Selector(text=bp.extract())
            # Post body with the signature div and the "edited" notice
            # excluded at the XPath level.
            message = "".join(bp_selector.xpath(
                "//div[@class='postmsg']/node()[not(local-name() = 'div' and @class='postsignature') and not(local-name() = 'p' and @class='postedit')]"
            ).extract()).strip()
            signature = bp_selector.css(".postsignature").xpath("p/node()").extract()
            modification = bp_selector.css(".postedit").extract()
            if len(modification) > 0:
                # The edit date sits between parentheses in the edit notice.
                s = modification[0]
                modification = str(compute_date(s[s.find("(") + 1:s.find(")")]))
            else:
                modification = False
            # Registered authors are rendered as links; others as plain text.
            author_link_list = bp_selector.xpath("//strong/a/@href").extract()
            post = Post(
                author=bp_selector.xpath("//strong/a/text()").extract()[0] if len(author_link_list) > 0 else bp_selector.xpath("//strong/text()").extract()[0],
                author_id=extract_identifier(author_link_list[0]) if len(author_link_list) > 0 else None,
                number=int(bp_selector.xpath("//h2/span/span/text()").extract()[0][1:]),
                datetime=str(compute_date(bp_selector.xpath("//h2/span/a/text()").extract()[0])),
                content=message,
                signature="".join(signature).strip() if len(signature) > 0 else False,
                modification=modification,
                thread=extract_identifier(response.request.url),
            )
            yield post