-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_danbooru.py
123 lines (107 loc) · 4.32 KB
/
scrape_danbooru.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import abc
import Utility as utl
from Network import Network
from AbstractScraper import AbstractScraper
class DanbooruScraper(AbstractScraper):
    """Scraper for posts hosted on danbooru.donmai.us.

    Resolves a post ID from an MD5 (or from a file name that embeds the
    MD5), fetches the post's JSON record and flattens it into the dict
    returned by ``extractPostInfo``.
    """

    def __init__(self, config, network):
        """Store the service URLs and register authentication on *network*.

        Args:
            config: Passed unchanged to ``network.setupAuth``.
            network: Object providing ``setupAuth``, ``urlRequest`` and
                ``htmlEncode`` (presumably a ``Network`` instance).
        """
        self.searchForMD5 = 'http://danbooru.donmai.us/posts?utf8=%E2%9C%93&tags=md5%3A'
        self.urlPostBase = 'http://danbooru.donmai.us/posts/'
        self.urlDataBase = 'http://danbooru.donmai.us/data/'
        self.network = network
        self.network.setupAuth('donmai.us', config)
        self.localFile = ''

    def setLocalFile(self, target):
        """Remember *target* as the local file the next extracted post refers to."""
        self.localFile = target

    def findPostByMD5(self, md5):
        """Given an MD5, return the postID. May not be implemented, depending on the service.

        Returns:
            The post ID taken from the first ``<article>`` element of the
            search result page, or ``0`` when the request does not answer
            with 200 or no such element/attribute can be read.
        """
        url = self.searchForMD5 + md5
        response, data = self.network.urlRequest(url, 'html')
        if response != 200:
            return 0

        data = self.network.htmlEncode(data)
        try:
            # The first five characters of the id attribute are dropped;
            # presumably a "post_" prefix -- confirm against the page markup.
            return data.article['id'][5:]
        except (AttributeError, KeyError, TypeError, IndexError):
            # No <article> in the page (attribute is None/missing) or the
            # element carries no usable id.
            print('Error finding postID for MD5: ' + md5)
            return 0

    def findPostByFileName(self, filename):
        """Return the post ID for a file whose name is the post's MD5.

        The MD5 is obtained through ``utl.fileName(filename)``. When
        *filename* contains ``sample``, the ``sample_`` / ``sample-``
        markers are removed from it first. The lookup itself, including
        the ``0`` failure value, is that of ``findPostByMD5``.
        """
        md5 = utl.fileName(filename)
        if 'sample' in filename:
            md5 = md5.replace('sample_', '').replace('sample-', '')
        return self.findPostByMD5(md5)

    def postExists(self, postID):
        """Return True when requesting the post page answers with 200."""
        url = self.urlPostBase + str(postID)
        response, _ = self.network.urlRequest(url, 'html')
        return response == 200

    def generateRawData(self, postID):
        """Given a postID, get the html/json/xml text containing the data we want.

        The response code is not inspected; whatever ``urlRequest`` hands
        back as data for ``<postID>.json`` is returned as is.
        """
        # str() so that integer IDs work here just like in postExists().
        url = self.urlPostBase + str(postID) + '.json'
        _, rawData = self.network.urlRequest(url, 'json')
        return rawData

    def extractPostInfo(self, rawData):
        """Given raw html/json/xml, generate all data for the post.

        Args:
            rawData: Mapping holding the post record; expected to be the
                decoded JSON returned by ``generateRawData``.

        Returns:
            dict with the keys ``id``, ``source``, ``md5``, ``rating``,
            ``width``, ``height``, ``extension``, ``pool``, ``file_size``,
            the four ``tag_string_*`` category strings, ``large_loc``,
            ``local_file``, ``tag_string`` and ``target_file``.

        Raises:
            KeyError: If a mandatory field is missing from *rawData*.
        """
        info = {}
        info['id'] = rawData['id']
        try:
            info['source'] = rawData['source']
        except KeyError:
            # Fall back to the post page itself; the id may be an int, so
            # it has to be converted before concatenation.
            info['source'] = self.urlPostBase + str(info['id'])
        info['md5'] = rawData['md5']
        info['rating'] = rawData['rating']
        info['width'] = rawData['image_width']
        info['height'] = rawData['image_height']
        info['extension'] = rawData['file_ext']
        info['pool'] = rawData['pool_string']
        info['file_size'] = rawData['file_size']

        # Per-category tag strings are optional and default to ''.
        for key in ('tag_string_artist', 'tag_string_character',
                    'tag_string_copyright', 'tag_string_general'):
            info[key] = rawData.get(key, '')

        info['large_loc'] = self._fullSizeUrl(rawData['large_file_url'])
        info['local_file'] = self.localFile
        info['tag_string'] = rawData['tag_string']
        info['target_file'] = self._targetPath(rawData['md5'], rawData['file_ext'])

        if rawData['is_pending']:
            print("This file is yet to be approved on danbooru")
            print("Hence, it will show as malformed and will be redownloaded, even if the file is fine")
        return info

    def _fullSizeUrl(self, largeFileUrl):
        """Build the absolute URL of the full-size image from ``large_file_url``."""
        if 'sample' in largeFileUrl:
            # If this is a sample, get the full version: keep only what
            # follows the last '-' and look for it directly under data/.
            # NOTE(review): this reuses the sample's extension; confirm it
            # always matches the original file's extension.
            relative = 'data/' + largeFileUrl[largeFileUrl.rfind('-') + 1:]
        else:
            # Drop leading slashes so the result has no '//' after the host.
            relative = largeFileUrl.lstrip('/')
        return 'http://danbooru.donmai.us/' + relative

    def _targetPath(self, md5, extension):
        """Return ``<md5>.<extension>`` placed in the directory of ``self.localFile``.

        The directory is everything before the last backslash. When
        ``localFile`` contains no backslash there is no directory part, and
        the bare file name is returned instead of slicing with the -1 that
        ``rfind`` would yield (which cut off the last character).
        """
        fileName = md5 + '.' + extension
        directory, separator, _ = self.localFile.rpartition('\\')
        if not separator:
            return fileName
        return directory + '\\' + fileName