def _process_dates(self):
    """Parse gcal_url for start and end date info and set
    _start_date_arrow and _end_date_arrow to Arrow instances.
    """
    # Don't rerun if _start_date_arrow or _end_date_arrow is already set,
    # or if no gcal_url was found.
    if (self._start_date_arrow or self._end_date_arrow) or not self.gcal_url:
        return
    gcal_url = self.gcal_url
    gcal_url_date_time_match = self.gcal_url_date_time_pattern.search(gcal_url)
    if not gcal_url_date_time_match:
        return
    (gcal_url_start_date_str, gcal_url_end_date_str) = gcal_url_date_time_match.groups()
    # Add a midnight time component if no time was specified.
    if 'T' not in gcal_url_start_date_str:
        gcal_url_start_date_str += 'T000000'
    if 'T' not in gcal_url_end_date_str:
        gcal_url_end_date_str += 'T000000'
    self._start_date_arrow = Arrow.strptime(gcal_url_start_date_str,
                                            self.gcal_url_date_time_format,
                                            tzinfo=self.event_timezone)
    self._end_date_arrow = Arrow.strptime(gcal_url_end_date_str,
                                          self.gcal_url_date_time_format,
                                          tzinfo=self.event_timezone)

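# A minimal standalone sketch of the date extraction above. The real
# gcal_url_date_time_pattern and gcal_url_date_time_format live on the class
# and are not shown here; the pattern, format, URL, and timezone below are
# assumptions for illustration only.
import re

from arrow import Arrow

gcal_url_date_time_pattern = re.compile(r'dates=(\d{8}(?:T\d{6})?)/(\d{8}(?:T\d{6})?)')
gcal_url_date_time_format = '%Y%m%dT%H%M%S'

url = 'https://calendar.google.com/calendar/render?action=TEMPLATE&dates=20240101/20240102'
start_str, end_str = gcal_url_date_time_pattern.search(url).groups()
if 'T' not in start_str:
    start_str += 'T000000'  # all-day events carry no time component
print(Arrow.strptime(start_str, gcal_url_date_time_format, tzinfo='UTC'))
# -> <Arrow [2024-01-01T00:00:00+00:00]>
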
def run(self, symbol):
    uid = self.get_uuid(symbol)
    if uid is None:
        return
    url = 'http://www.newrank.cn/xdnphb/detail/getAccountArticle'
    params = {
        'uuid': uid,
    }
    r = self.req_post(url, data=params)
    datas = r.json()
    try:
        infos = datas['value']['lastestArticle']
        for info in infos:
            source_url = self.parse_url(info.get('url'))
            if self.repeat_check(source_url):
                continue
            title = info.get('title')
            wx_id = info.get('account')
            author = info.get('author')
            post_time = info.get('publicTime')
            post_time = Arrow.strptime(post_time, '%Y-%m-%d %H:%M:%S',
                                       tzinfo='Asia/Shanghai').timestamp
            summary = info.get('summary')
            content, img = self.get_content(source_url)
            if info.get('imageUrl') is None:
                image = img
            else:
                image = info.get('imageUrl')
            self.add_result(title=title, author=author, post_time=post_time,
                            source_name=author, source_url=source_url,
                            summary=summary, spider_name=self.spider_name,
                            content=content, image=image,
                            category=self.category, aid=wx_id)
    except Exception as e:
        self.log.error(e)

def parse_stamps(self, expr=STAMP_RE, fmt='%H:%M, %d %B %Y (%Z)'):
    stamps = []
    algo = self.archiver.config['algo']
    try:
        maxage = str2time(re.search(r"^old\((\w+)\)$", algo).group(1))
    except AttributeError as e:
        e.args = ("Malformed archive algorithm",)
        raise ArchiveError(e)
    for thread in self.threads:
        if mwp_parse(thread['header']).get(0).level != 2:
            # The header is not level 2; skip this thread.
            stamps = []
            continue
        for stamp in expr.finditer(thread['content']):
            # This loop can probably be optimised, but ain't nobody
            # got time fo' dat
            # if stamp.group(1) in MONTHS:
            try:
                stamps.append(Arrow.strptime(stamp.group(0), fmt))
            except ValueError:
                # Invalid stamps should not be parsed, ever
                continue
        if stamps:
            # The most recent stamp decides whether the thread is old
            # enough to archive.
            most_recent = max(stamps)
            thread['stamp'] = most_recent
            thread['oldenough'] = Arrow.utcnow() - most_recent > maxage
        # Reset for the next thread; if no stamps were found, the thread
        # is abandoned.
        stamps = []

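# A quick standalone check of the signature parsing in parse_stamps above,
# using the default fmt. The sample stamp is hypothetical; STAMP_RE itself is
# defined elsewhere and not shown.
from arrow import Arrow

print(Arrow.strptime('12:34, 5 June 2017 (UTC)', '%H:%M, %d %B %Y (%Z)'))
# -> <Arrow [2017-06-05T12:34:00+00:00]>
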
def parse(self, kind, aid, summary):
    url = 'http://api.smzdm.com/v1/%s/articles/%s' % (kind, aid)
    if self.blf.exist(url):
        return
    self.blf.add(url)
    try:
        r = self.req_get(url)
        data = r.json().get('data')
        title = data.get('article_title')
        author = data.get('article_referrals')
        post_time = data.get('article_date')
        post_time = Arrow.strptime(post_time, '%Y-%m-%d %H:%M:%S',
                                   tzinfo='Asia/Shanghai').timestamp
        source_url = data.get('article_url')
        content = data.get('article_filter_content')
        try:
            content = self.get_img(
                BeautifulSoup('<div>%s</div>' % content, 'lxml'), 'src')
        except Exception as e:
            self.log.exception(e)
        image = data.get('article_pic')
        self.add_result(title=title, author=author, post_time=post_time,
                        source_name='什么值得买', source_url=source_url,
                        summary=summary, spider_name=self.spider_name,
                        content=content, image=image,
                        category=self.category, aid=kind)
    except Exception as e:
        self.log.error(e)

def get_datestr_and_dateint(self, datestr_area):
    # Python 2: datestr_area may be unicode or str.
    rt = dict(datestr='', dateint=0)
    if isinstance(datestr_area, (unicode, str)):
        for time_re, arrow_fmt in time_formats:
            findall = time_re.findall(datestr_area)
            if findall:
                ar = Arrow.strptime(findall[0].encode('utf8'), arrow_fmt,
                                    'Asia/Shanghai')
                # Formats without a year parse as 1900; substitute the
                # current year.
                if ar.year < 2000:
                    ar = ar.replace(year=Arrow.now().year)
                rt = dict(datestr=findall[0], dateint=ar.timestamp)
                break
    return rt

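# time_formats above is assumed to be a module-level list of
# (compiled regex, strptime format) pairs; a hypothetical definition, with a
# yearless entry that exercises the year-1900 fix above:
import re

time_formats = [
    (re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'), '%Y-%m-%d %H:%M:%S'),
    (re.compile(r'\d{2}-\d{2} \d{2}:\d{2}'), '%m-%d %H:%M'),  # no year: parses as 1900
]
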
def __init__(self, status_json):
    self.metadata = status_json
    self.id = status_json["id"]
    self.userName = status_json["user"]["screen_name"]
    self.userID = status_json["user"]["id"]
    # creation converts, e.g., "Thu May 25 15:18:25 +0000 2017" to an int timestamp
    self.creation = Arrow.strptime(status_json["created_at"],
                                   "%a %b %d %H:%M:%S %z %Y").timestamp
    self.text = status_json["full_text"]
    # source converts the HTML "a" tag string to its inner text
    self.source = re.search(">.*?<", status_json["source"])[0].strip("><")
    self.favoriteCount = status_json["favorite_count"]
    self.retweets = status_json["retweet_count"]
    self.language = status_json["lang"]
    self.mentions = self.getMentions()
    self.hashtags = self.getHashtags()
    self.url = self.getUrls()
    self.medias = self.getMedia()
    self.filename = datetime.utcfromtimestamp(self.creation).strftime(
        "%Y-%m-%d_%H-%M-%S_UTC")

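# The created_at conversion above, as a standalone sketch. Note these snippets
# target arrow < 1.0, where Arrow.timestamp is a property; in arrow >= 1.0 it
# became a method (.timestamp()).
from arrow import Arrow

created_at = "Thu May 25 15:18:25 +0000 2017"
print(Arrow.strptime(created_at, "%a %b %d %H:%M:%S %z %Y").timestamp)
# -> 1495725505
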
def parse_time(time_str):
    t = ct.search(time_str).group(0)
    return Arrow.strptime(t, '%Y-%m-%d %H:%M:%S', tzinfo='Asia/Shanghai').timestamp

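# ct above is a precompiled pattern defined elsewhere; presumably something
# like the hypothetical sketch below, which pulls a datetime out of
# surrounding text (the sample string means "published at ..."):
import re

ct = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')

print(parse_time('发布于 2017-05-25 15:18:25'))
# -> 1495696705 (15:18:25 in UTC+8 is 07:18:25 UTC)
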
def isostrptime(stamp):
    """I'm lazy, and can never remember the format string."""
    return Arrow.strptime(stamp, "%Y-%m-%dT%H:%M:%SZ")

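# Hypothetical usage of the helper above; with no tzinfo argument the result
# is an Arrow in UTC:
print(isostrptime("2017-05-25T15:18:25Z"))
# -> <Arrow [2017-05-25T15:18:25+00:00]>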