def guancha(keyword, page):
    """Scrape guancha.cn news search results for ``keyword`` into a CSV file.

    The search page is opened in a PhantomJS browser and the element with
    class ``index-add-more`` is clicked ``page - 1`` times (with a 3 second
    pause after each click) so that additional results are loaded before
    the page source is parsed.

    One row per ``div.list-item`` is written to a file named after the
    current timestamp (``YYYYmmddHHMMSS.csv``) in the working directory,
    with the columns ``title``, ``url`` and ``date``. The date is derived
    from the last path segment of the article URL: the ``.shtml`` suffix
    and the trailing ``_<id>`` part are dropped and the remaining ``_`` are
    replaced by ``/``.

    :param keyword: Search term; it is URL-quoted before being sent.
    :param page: Number of result pages to load (the first one is loaded
        by the initial request, so ``page - 1`` clicks are performed).
    """
    driver = webdriver.PhantomJS()
    try:
        url = ('https://user.guancha.cn/main/search?click=news&keyword='
               + urllib.quote(keyword))
        driver.get(url)
        for x in range(1, page):
            print('Start fetching page ' + str(x) + '...')
            driver.find_element_by_class_name('index-add-more').click()
            # give the page time to append the next batch of results
            time.sleep(3)
        html = driver.page_source
    finally:
        # always shut the headless browser down, even when a click fails,
        # otherwise every call leaves a PhantomJS process behind
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    title_tag = soup.find_all('div', class_='list-item')
    s = "%Y%m%d%H%M%S"
    with open(datetime.datetime.now().strftime(s) + '.csv', 'w') as csvFile:
        writer = csv.writer(csvFile, dialect='excel')
        writer.writerow(['title', 'url', 'date'])
        for tag in title_tag:
            h4 = tag.find('h4')
            link = h4.a if h4 is not None else None
            if link is None or 'href' not in link.attrs:
                # not an article entry; skip it instead of aborting the export
                continue
            href = link.attrs['href']
            # NOTE: the local is deliberately not called ``time``; rebinding
            # that name used to clobber the ``time`` module for later calls.
            stamp = str(href).rsplit('/', 1)[-1].replace('.shtml', '')
            date = stamp.rsplit('_', 1)[0].replace('_', '/')
            print(date + ' ' + link.text)
            writer.writerow([link.text.encode('utf-8'), href, date])
def parse_traza(self, traza):
    """@brief Split a trace file name into file, case and timestamp.

    The name is expected to consist of three ':'-separated fields,
    ``<file>:<case>:<timestamp>``. Everything after the last '.' of the
    timestamp field (e.g. a ``.log`` extension) is dropped.

    According to the original notes the timestamp is in seconds, as
    produced by ``time.time()``.

    NOTE(review): the example in the original notes,
    ``LoanApprovalProcess.bpts:LargeAmount-1267033799.94.log``, contains a
    single ':' and is therefore rejected by this parser -- confirm the
    real naming scheme.

    @param traza The trace file name.
    @retval The three values ``(fich, caso, time)``, or ``("", "", "")``
            when the name does not follow the expected format.
    """
    try:
        fich, caso, stamp = traza.split(":")
    except (AttributeError, TypeError, ValueError):
        # Malformed (wrong number of fields) or non-string name: callers
        # get empty fields rather than an exception.
        return "", "", ""
    return fich, caso, stamp.rsplit(".", 1)[0]
def _parse_iso_8601(value):
    """
    Parse an ISO8601:2004 date time string.

    Supported date layouts (hyphens are optional):

    * calendar date ``YYYY-MM-DD``
    * ordinal date ``YYYY-DDD``
    * week date ``YYYY-Www-D``

    The optional time part follows a ``T`` and may be ``hh``, ``hh:mm`` or
    ``hh:mm:ss`` (colons are optional). A decimal fraction after ``.`` is
    added to the result as a fraction of a second. A zone designator
    (``Z``, ``+hh[:mm]`` or ``-hh[:mm]``) is honoured by shifting the
    result to UTC.

    :param value: The ISO8601 string to parse.
    :return: ``UTCDateTime`` built from the parsed date and time, corrected
        by the zone offset and the fractional seconds.
    :raises ValueError: If the date or time part does not match one of the
        supported layouts.
    """
    # a 'Z' only states that the time is already UTC
    value = value.replace('Z', '')
    # split between date and time
    try:
        (date_str, time_str) = value.split("T")
    except ValueError:
        # no (or more than one) 'T': treat everything as the date part
        date_str = value
        time_str = ""
    # Remember how the date itself was punctuated. Counting hyphens on the
    # whole value would also count the sign of a negative UTC offset.
    date_hyphens = date_str.count('-')
    # remove all hyphens in date
    date_str = date_str.replace('-', '')
    # remove colons in time
    time_str = time_str.replace(':', '')
    # guess date pattern
    length_date = len(date_str)
    if date_str.count('W') == 1 and length_date == 8:
        # we got a week date: YYYYWwwD
        date_str = date_str.replace('W', '')
        year = int(date_str[0:4])
        # [Www] is the week number, W01 through W53
        week = int(date_str[4:6])
        # [D] is the weekday number, 1 (Monday) through 7 (Sunday)
        day = int(date_str[6])
        if not (1 <= week <= 53 and 1 <= day <= 7):
            raise ValueError("Wrong or incomplete ISO8601:2004 date format")
        # ISO weeks cannot be mapped onto strptime's %W by a fixed offset:
        # week 01 is the week containing 4 January, whereas %W week 1
        # starts on the first Monday of the year. Resolve the calendar date
        # explicitly instead.
        try:
            jan4 = datetime.date(year, 1, 4)
            week1_monday = jan4 - datetime.timedelta(
                days=jan4.isoweekday() - 1)
            resolved = week1_monday + datetime.timedelta(
                weeks=week - 1, days=day - 1)
        except OverflowError:
            raise ValueError("Wrong or incomplete ISO8601:2004 date format")
        if tuple(resolved.isocalendar())[:2] != (year, week):
            # e.g. W53 requested for a year that only has 52 weeks
            raise ValueError("Wrong or incomplete ISO8601:2004 date format")
        date_str = "%04d%02d%02d" % (resolved.year, resolved.month,
                                     resolved.day)
        date_pattern = "%Y%m%d"
    elif length_date == 7 and date_str.isdigit() and date_hyphens != 2:
        # we got a ordinal date: YYYYDDD
        # (two hyphens would mean a non zero-padded calendar date)
        date_pattern = "%Y%j"
    elif length_date == 8 and date_str.isdigit():
        # we got a calendar date: YYYYMMDD
        date_pattern = "%Y%m%d"
    else:
        raise ValueError("Wrong or incomplete ISO8601:2004 date format")
    # check for time zone information
    # note that the zone designator is the actual offset from UTC and
    # does not include any information on daylight saving time
    if time_str.count('+') == 1 and '+' in time_str[-6:]:
        (time_str, tz) = time_str.rsplit('+')
        # local time is ahead of UTC -> subtract the offset
        delta = -1
    elif time_str.count('-') == 1 and '-' in time_str[-6:]:
        (time_str, tz) = time_str.rsplit('-')
        # local time is behind UTC -> add the offset
        delta = 1
    else:
        delta = 0
    if delta:
        # pad an hours-only designator ("hh") so the minutes slice exists
        while len(tz) < 3:
            tz += '0'
        delta = delta * (int(tz[0:2]) * 60 * 60 + int(tz[2:]) * 60)
    # split fractional seconds
    ms = 0
    if '.' in time_str:
        (time_str, ms) = time_str.split(".")
        ms = float('0.' + ms.strip())
    # guess time pattern
    length_time = len(time_str)
    if length_time == 6 and time_str.isdigit():
        time_pattern = "%H%M%S"
    elif length_time == 4 and time_str.isdigit():
        time_pattern = "%H%M"
    elif length_time == 2 and time_str.isdigit():
        time_pattern = "%H"
    elif length_time == 0:
        time_pattern = ""
    else:
        raise ValueError("Wrong or incomplete ISO8601:2004 time format")
    # parse patterns
    dt = datetime.datetime.strptime(date_str + 'T' + time_str,
                                    date_pattern + 'T' + time_pattern)
    # add fractional seconds and eventually correct time zone
    return UTCDateTime(dt) + (float(delta) + ms)
def _parseISO8601(value):
    """
    Parse an ISO8601:2004 date time string.

    Supported date layouts (hyphens are optional):

    * calendar date ``YYYY-MM-DD``
    * ordinal date ``YYYY-DDD``
    * week date ``YYYY-Www-D``

    The optional time part follows a ``T`` and may be ``hh``, ``hh:mm`` or
    ``hh:mm:ss`` (colons are optional). A decimal fraction after ``.`` is
    added to the result as a fraction of a second. A zone designator
    (``Z``, ``+hh[:mm]`` or ``-hh[:mm]``) is honoured by shifting the
    result to UTC.

    :param value: The ISO8601 string to parse.
    :return: ``UTCDateTime`` built from the parsed date and time, corrected
        by the zone offset and the fractional seconds.
    :raises ValueError: If the date or time part does not match one of the
        supported layouts.
    """
    # a 'Z' only states that the time is already UTC
    value = value.replace('Z', '')
    # split between date and time
    try:
        (date_str, time_str) = value.split("T")
    except ValueError:
        # no (or more than one) 'T': treat everything as the date part
        date_str = value
        time_str = ""
    # Remember how the date itself was punctuated. Counting hyphens on the
    # whole value would also count the sign of a negative UTC offset.
    date_hyphens = date_str.count('-')
    # remove all hyphens in date
    date_str = date_str.replace('-', '')
    # remove colons in time (this also covers a "+hh:mm" zone designator)
    time_str = time_str.replace(':', '')
    # guess date pattern
    length_date = len(date_str)
    if date_str.count('W') == 1 and length_date == 8:
        # we got a week date: YYYYWwwD
        date_str = date_str.replace('W', '')
        year = int(date_str[0:4])
        # [Www] is the week number, W01 through W53
        week = int(date_str[4:6])
        # [D] is the weekday number, 1 (Monday) through 7 (Sunday)
        day = int(date_str[6])
        if not (1 <= week <= 53 and 1 <= day <= 7):
            raise ValueError("Wrong or incomplete ISO8601:2004 date format")
        # ISO weeks cannot be mapped onto strptime's %W by a fixed offset:
        # week 01 is the week containing 4 January, whereas %W week 1
        # starts on the first Monday of the year. Resolve the calendar date
        # explicitly instead.
        try:
            jan4 = datetime.date(year, 1, 4)
            week1_monday = jan4 - datetime.timedelta(
                days=jan4.isoweekday() - 1)
            resolved = week1_monday + datetime.timedelta(
                weeks=week - 1, days=day - 1)
        except OverflowError:
            raise ValueError("Wrong or incomplete ISO8601:2004 date format")
        if tuple(resolved.isocalendar())[:2] != (year, week):
            # e.g. W53 requested for a year that only has 52 weeks
            raise ValueError("Wrong or incomplete ISO8601:2004 date format")
        date_str = "%04d%02d%02d" % (resolved.year, resolved.month,
                                     resolved.day)
        date_pattern = "%Y%m%d"
    elif length_date == 7 and date_str.isdigit() and date_hyphens != 2:
        # we got a ordinal date: YYYYDDD
        # (two hyphens would mean a non zero-padded calendar date)
        date_pattern = "%Y%j"
    elif length_date == 8 and date_str.isdigit():
        # we got a calendar date: YYYYMMDD
        date_pattern = "%Y%m%d"
    else:
        raise ValueError("Wrong or incomplete ISO8601:2004 date format")
    # check for time zone information
    # note that the zone designator is the actual offset from UTC and
    # does not include any information on daylight saving time
    if time_str.count('+') == 1 and '+' in time_str[-6:]:
        (time_str, tz) = time_str.rsplit('+')
        # local time is ahead of UTC -> subtract the offset
        delta = -1
    elif time_str.count('-') == 1 and '-' in time_str[-6:]:
        (time_str, tz) = time_str.rsplit('-')
        # local time is behind UTC -> add the offset
        delta = 1
    else:
        delta = 0
    if delta:
        # pad an hours-only designator ("hh") so the minutes slice exists
        while len(tz) < 3:
            tz += '0'
        delta = delta * (int(tz[0:2]) * 60 * 60 + int(tz[2:]) * 60)
    # split fractional seconds
    ms = 0
    if '.' in time_str:
        (time_str, ms) = time_str.split(".")
        ms = float('0.' + ms.strip())
    # guess time pattern
    length_time = len(time_str)
    if length_time == 6 and time_str.isdigit():
        time_pattern = "%H%M%S"
    elif length_time == 4 and time_str.isdigit():
        time_pattern = "%H%M"
    elif length_time == 2 and time_str.isdigit():
        time_pattern = "%H"
    elif length_time == 0:
        time_pattern = ""
    else:
        raise ValueError("Wrong or incomplete ISO8601:2004 time format")
    # parse patterns
    dt = datetime.datetime.strptime(date_str + 'T' + time_str,
                                    date_pattern + 'T' + time_pattern)
    # add fractional seconds and eventually correct time zone
    return UTCDateTime(dt) + (float(delta) + ms)