-
Notifications
You must be signed in to change notification settings - Fork 0
/
Functions.py
256 lines (236 loc) · 12.9 KB
/
Functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
# -*- coding: utf-8 -*-
#Program written by Milenko A. Fadic
#May 2015, Bologna
#IMPORT MODULES
import webbrowser
import cookielib
import urllib, urllib2
from urllib2 import urlopen
from bs4 import BeautifulSoup,NavigableString
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import html5lib
import csv
import time
from datetime import datetime
from datetime import date
from time import gmtime, strftime
#########################Functions###################################################
#This part of the program lets me set a download rate. On weekdays I pause (e.g. 20 seconds) after each loop so as not to overload the server.
#The delay can be adjusted below.
# opendriver opens the page in the driver and reloads it so that the calendar icon works. If the page does not open, the load times out.
def opendriver(driver, sitescrap):
    """Load ``sitescrap`` in ``driver``, retrying once after a timeout.

    Args:
        driver: object exposing ``get(url)`` and ``set_page_load_timeout(s)``
            (a selenium webdriver in this program).
        sitescrap: URL of the page to open.

    A ``TimeoutException`` raised by the first ``get`` triggers exactly one
    more ``get``; an exception from that second attempt propagates to the
    caller.  Afterwards the driver's page-load timeout is set to 45 seconds.

    NOTE(review): the original indentation was lost; the timeout call is
    assumed to run after both the success and the retry path - confirm.
    """
    loaded_first_try = True
    try:
        driver.get(sitescrap)
    except TimeoutException:
        # Single retry; no further handling if this one fails as well.
        loaded_first_try = False
        driver.get(sitescrap)

    if loaded_first_try:
        print("BROWSER SHOULD BE OPEN, 3O SECONDS IS THE TIMEOUT RATE")

    driver.set_page_load_timeout(45)
#CHECKS THE DAY AND THE HOUR, THEN PAUSES ACCORDING TO THE HIGH-DOWNLOAD OR LOW-DOWNLOAD RATE
def hourpeak(time_sleep_low_rate, time_sleep_high_rate ):
    """Pause for a delay chosen from the current day and hour.

    The "lowdownload" rate is used Monday to Friday when the GMT hour is
    greater than 10 or less than 2; any other moment is "highdownload".

    Args:
        time_sleep_low_rate: seconds to sleep during "lowdownload" periods.
        time_sleep_high_rate: seconds to sleep during "highdownload" periods.

    Returns:
        None.  The selected rate name is printed after the pause.

    NOTE(review): the hour comes from ``gmtime()`` (GMT) while the weekday
    comes from ``date.today()`` (local date); this mirrors the original
    behaviour - confirm that mixing the two clocks is intended.
    """
    gmt_hour = gmtime().tm_hour
    weekday = date.today().isoweekday()  # 1 = Monday ... 7 = Sunday

    if weekday < 6 and (gmt_hour > 10 or gmt_hour < 2):
        peak = "lowdownload"
        delay = time_sleep_low_rate
    else:
        peak = "highdownload"
        delay = time_sleep_high_rate

    time.sleep(delay)
    print("This is the rate of download %s" % peak)
#SCRAPES THE FIRST TAB
def scrap_2(soup, sys, driver, company_ruc, ActionChains, admin_a):
    """Append the rows of every ``z-listbox`` table of the first tab to a CSV.

    Args:
        soup: parsed page; must support ``find_all(name, attrs)``.
        sys: the ``sys`` module (used only to print ``sys.exc_info()``).
        driver: selenium webdriver showing the page; used when a table has
            several pages.
        company_ruc: identifier stored at the start of every row.
        ActionChains: the selenium ``ActionChains`` class, used to click the
            table's "next page" button.
        admin_a: path of the CSV file, opened in append mode.

    For each listbox the page count is read from its ``z-paging-text`` spans
    (text with "/" removed).  The count is compared numerically, so "10"
    correctly beats "9"; labels that are not plain digits are ignored.

    * One page: each ``z-listitem`` row of the listbox is written as a single
      CSV field holding the row's list of ``(value, "|")`` tuples.
    * Several pages: for every page, ``driver.page_source`` is re-parsed, each
      ``z-listitem`` row found in it is written with one tuple per CSV field,
      then the listbox's "next" button is clicked and 3 seconds are waited.

    Any ``Exception`` raised while handling a listbox (including a listbox
    without a page label) is printed and the next listbox is processed.

    NOTE(review): the original indentation was lost; the nesting of the file
    writes was reconstructed from the statement order - confirm.
    """
    table_num = 0
    for tables in soup.find_all('div', {'class': 'z-listbox'}):
        try:
            # Collect the page counts shown in the paging labels.
            page_counts = []
            for label in tables.find_all('span', {'class': 'z-paging-text'}):
                digits = label.text.strip().replace("/", "").strip()
                if digits.isdigit():
                    page_counts.append(int(digits))
            mpg = max(page_counts)  # ValueError when no label was found
            print("This is the number of pages %s" % mpg)

            if mpg < 2:
                print("Div only has one page")
                resultk = []
                for tr in tables.find_all('tr', {'class': 'z-listitem'}):
                    result = [(company_ruc, '|')]
                    for elem in tr:
                        result.append(([[elem.text.strip().encode('utf-8')]], "|"))
                    resultk.append([result])
                with open(admin_a, 'a') as f:
                    csv.writer(f).writerows(resultk)
            else:
                print("Admin has more than two pages")
                for _ in range(mpg):
                    # The DOM changes after each click, so parse it again.
                    page_soup = BeautifulSoup(driver.page_source)
                    page_rows = []
                    for tr in page_soup.find_all('tr', {'class': 'z-listitem'}):
                        result = [(company_ruc, '|')]
                        for elem in tr:
                            result.append(([[elem.text.strip().encode('utf-8')]], "|"))
                        page_rows.append(result)
                    if page_rows:
                        with open(admin_a, 'a') as f:
                            csv.writer(f).writerows(page_rows)
                    button_driver = driver.find_elements_by_class_name('z-paging-next')
                    ActionChains(driver).move_to_element(button_driver[table_num]).click().perform()
                    time.sleep(3)
        except Exception:
            # Best effort: report the problem and go on with the next table.
            print(sys.exc_info())
        finally:
            table_num = table_num + 1
#FOR SCRAP 3, 4, AND 1 THE PROCEDURE IS SIMILAR TO SCRAP 2. THERE ARE SUBTLE DIFFERENCES, SUCH AS WHERE TO PLACE THE IDENTIFIER, BUT THE LOGIC IS THE SAME. THEY ARE NOT ANNOTATED
#BUT SERVE FOR ADDITIONAL DETAILS.
def scrap_3(soup, sys, driver, company_ruc, ActionChains, kardex):
    """Append the rows of every ``z-listbox`` table to the ``kardex`` CSV.

    Args:
        soup: parsed page; must support ``find_all(name, attrs)``.
        sys: unused here; kept for a signature parallel to ``scrap_2``.
        driver: selenium webdriver showing the page; used when a table has
            several pages.
        company_ruc: identifier stored at the start of every row.
        ActionChains: the selenium ``ActionChains`` class, used to click the
            table's "next page" button.
        kardex: path of the CSV file, opened in append mode.

    The page count of each listbox is the numeric maximum of its
    ``z-paging-text`` labels ("/" removed, non-digit labels ignored).

    * One page: for each ``z-listbox-body`` div, every ``z-listitem`` row is
      written as a single CSV field holding the row's list of tuples.
    * Several pages: for every page, ``driver.page_source`` is re-parsed, each
      ``z-listitem`` row is written with one tuple per CSV field, then the
      "next" button with the same index as the current listbox is clicked and
      3 seconds are waited.  (The index is now advanced per listbox, as in
      ``scrap_2``; before, the first button was always clicked.)

    Raises:
        ValueError: when a listbox has no numeric page label.

    NOTE(review): the original indentation was lost; the nesting of the file
    writes was reconstructed from the statement order - confirm.
    """
    listboxes = soup.find_all('div', {'class': 'z-listbox'})
    for table_num, tables in enumerate(listboxes):
        print("tables found")
        page_counts = []
        for label in tables.find_all('span', {'class': 'z-paging-text'}):
            digits = label.text.strip().replace("/", "").strip()
            if digits.isdigit():
                page_counts.append(int(digits))
        mpg = max(page_counts)

        if mpg < 2:
            print("Has only 1 table")
            for mytable in tables.find_all('div', {'class': 'z-listbox-body'}):
                resulte = []
                print("FOUND ALL DIVS")
                for tr in mytable.find_all('tr', {'class': 'z-listitem'}):
                    result = [(company_ruc, '@|')]
                    # Strip and encode the text of each element of the row.
                    for elem in tr:
                        result.append(([[elem.text.strip().encode('utf-8')]], "@|"))
                    resulte.append([result])
                with open(kardex, 'a') as f:
                    csv.writer(f).writerows(resulte)
        else:
            for _ in range(mpg):
                print("Has Multiple Tables")
                # The DOM changes after each click, so parse it again.
                page_soup = BeautifulSoup(driver.page_source)
                page_rows = []
                for tr in page_soup.find_all('tr', {'class': 'z-listitem'}):
                    result = [(company_ruc, '@')]
                    for elem in tr:
                        result.append(([[elem.text.strip().encode('utf-8')]], "@|"))
                    page_rows.append(result)
                if page_rows:
                    with open(kardex, 'a') as f:
                        csv.writer(f).writerows(page_rows)
                button_driver = driver.find_elements_by_class_name('z-paging-next')
                ActionChains(driver).move_to_element(button_driver[table_num]).click().perform()
                time.sleep(3)
def scrap_4(soup, sys, driver, company_ruc, ActionChains, documents):
    """Append the rows shown for the third ``z-tabpanel`` to ``documents``.

    Args:
        soup: parsed page; must support ``find_all(name, attrs)``.
        sys: unused here; kept for a signature parallel to ``scrap_2``.
        driver: selenium webdriver showing the page; its ``page_source`` is
            re-parsed for every page that is written.
        company_ruc: identifier stored at the start of every row.
        ActionChains: the selenium ``ActionChains`` class, used to click the
            "next page" button.
        documents: path of the CSV file, opened in append mode.

    Each child of the third ``z-tabpanel`` div is treated as a table.  Its
    page count is the numeric maximum of its ``z-paging-text`` labels ("/"
    removed, non-digit labels ignored); previously only the last label was
    kept because the list was reset inside the loop.

    * One page: every ``z-listitem`` row of the current page source is
      written as a single CSV field holding the row's list of tuples.
    * Several pages: for every page the rows are written with one tuple per
      CSV field, then the third "next" button on the page is clicked and
      3 seconds are waited.

    Raises:
        IndexError: when the page has fewer than three ``z-tabpanel`` divs.
        ValueError: when a table has no numeric page label.

    NOTE(review): the original indentation was lost; the nesting of the
    prints and file writes was reconstructed from statement order - confirm.
    """
    tab_pop_up = soup.find_all('div', {'class': 'z-tabpanel'})
    for tables in tab_pop_up[2]:
        print("Found Pop up")
        page_counts = []
        for label in tables.find_all('span', {'class': 'z-paging-text'}):
            digits = label.text.strip().replace("/", "").strip()
            if digits.isdigit():
                page_counts.append(int(digits))
        print("found number of pages")
        mpg = max(page_counts)

        if mpg < 2:
            print("only 1 page")
            resultj = []
            page_soup = BeautifulSoup(driver.page_source)
            for tr in page_soup.find_all('tr', {'class': 'z-listitem'}):
                result = [(company_ruc, '@')]
                # Strip and encode the text of each element of the row.
                for elem in tr:
                    result.append(([[elem.text.strip().encode('utf-8')]], "@|"))
                resultj.append([result])
            with open(documents, 'a') as f:
                csv.writer(f).writerows(resultj)
            print("finished one page")
        else:
            for _ in range(mpg):
                print("many pages")
                # The DOM changes after each click, so parse it again.
                page_soup = BeautifulSoup(driver.page_source)
                resultk = []
                for tr in page_soup.find_all('tr', {'class': 'z-listitem'}):
                    result = [(company_ruc, '@')]
                    for elem in tr:
                        result.append(([[elem.text.strip().encode('utf-8')]], "@|"))
                    resultk.append(result)
                with open(documents, 'a') as f:
                    csv.writer(f).writerows(resultk)
                button_driver = driver.find_elements_by_class_name('z-paging-next')
                # Index 2 is hard-coded in the original: third "next" button.
                ActionChains(driver).move_to_element(button_driver[2]).click().perform()
                time.sleep(3)
def scrap_1(soup,sys,driver, company_ruc, ActionCHains, info_general_a, info_general_b):
    """Write the general-information boxes of the page to two CSV files.

    Args:
        soup: parsed page; must support ``find_all(name, attrs)``.
        sys, driver, ActionCHains: unused here; kept for a signature parallel
            to the other ``scrap_*`` functions.
        company_ruc: identifier stored at the start of every tuple.
        info_general_a: CSV path (append mode) receiving the even-positioned
            cells of each row (their ``contents``).
        info_general_b: CSV path (append mode) receiving the odd-positioned
            cells of each row (their ``text``).

    For every ``z-groupbox-3d`` div, each ``z-row`` contributes one CSV row to
    each file; the row is a single field holding the list of tuples built
    from its ``z-row-inner`` cells.  ``<br>`` tags inside a cell are replaced
    by "@" first.  A running counter across the whole page is stored after
    "|Total", and the 1-based cell position within the row after it.

    NOTE(review): the original indentation was lost; the running counter is
    assumed to advance once per cell and the files to be written once per
    groupbox - confirm.
    """
    running_total = 1
    for groupbox in soup.find_all('div', {'class': 'z-groupbox-3d'}):
        value_rows = []
        label_rows = []
        for row in groupbox.find_all('tr', {'class': 'z-row'}):
            row_values = []
            row_labels = []
            cells = row.find_all('td', {'class': 'z-row-inner'})
            for position, cell in enumerate(cells, 1):
                for line_break in cell.find_all('br'):
                    line_break.replaceWith("@")
                if position % 2:
                    # Odd cells hold the description.
                    row_labels.append((company_ruc, "|Total", running_total, "|", position, "|", [cell.text], "|"))
                else:
                    # Even cells hold the response, i.e. the value.
                    row_values.append((company_ruc, "|Total", running_total, "|Fila", position, "|", [cell.contents], "|"))
                running_total += 1
            value_rows.append([row_values])
            label_rows.append([row_labels])
        for target, rows in ((info_general_a, value_rows), (info_general_b, label_rows)):
            with open(target, 'a') as f:
                csv.writer(f).writerows(rows)
##