def keyword_traffics(self, filters, start_date, end_date):
    if not self.account:
        raise GoogleException('Please set account id.', 504)
    if isinstance(start_date, str):
        start_date = dateParse(start_date)
    if isinstance(end_date, str):
        end_date = dateParse(end_date)
    if isinstance(filters, str):
        # Parse a filter list literal safely (avoids eval on untrusted input)
        filters = ast.literal_eval(filters)
    data = self.account.get_data(
        start_date,
        end_date,
        metrics=['visits'],
        dimensions=['keyword'],
        filters=filters,
        sort=['-visits'],
        max_results=GoogleAnalytics.MAX_RESULTS)
    return data
def parse_author(self, response): published_time = dateParse( response.css( 'meta[property="article:published_time"]::attr(content)'). extract_first()).replace(tzinfo=None) try: modified_time = dateParse( response.css( 'meta[property="article:modified_time"]::attr(content)'). extract_first()).replace(tzinfo=None) except: modified_time = published_time todays_date = datetime.now() if published_time.date() < todays_date.date(): return None for item in ( response.css('body::attr(class)').extract_first()).split(' '): if 'postid' in item: id_constructor = item.split('-') qmfashionItem = QmfashionItem( _id='siddysays' + '-' + id_constructor[len(id_constructor) - 1], published_time=published_time, modified_time=modified_time, url=response.request.url, title=response.css('.title a::text').extract_first(), opening_text=extract_first_paragraph( response, "div.blogpost div.posttext div.sentry"), news_source="Siddysays", posted=False) return qmfashionItem
def parse_author(self, response):
    published_time = dateParse(
        response.css(
            'meta[property="article:published_time"]::attr(content)').
        extract_first()).replace(tzinfo=None)
    try:
        modified_time = dateParse(
            response.css(
                'meta[property="article:modified_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)
    except Exception:
        modified_time = published_time
    todays_date = datetime.now()
    if published_time.date() < todays_date.date():
        return None
    id_constructor = response.css(
        'body article.the-post::attr(id)').extract_first().split('-')
    qmfashionItem = QmfashionItem(
        _id='karachista' + '-' + id_constructor[-1],
        published_time=published_time,
        modified_time=modified_time,
        url=response.request.url,
        title=response.css(
            'body article.the-post header.post-header div.post-meta .post-title::text'
        ).extract_first(),
        opening_text=extract_first_paragraph(
            response, 'body article.the-post div.post-content'),
        news_source="Karachista",
        posted=False)
    return qmfashionItem
def parse_author(self, response):
    global scrape_next_page
    published_time = dateParse(
        response.css('meta[property="article:published_time"]::attr(content)')
        .extract_first())
    todays_date = datetime.now()
    if published_time.date() < todays_date.date():
        scrape_next_page = False
        return None
    try:
        modified_time = dateParse(
            response.css('meta[property="article:modified_time"]::attr(content)')
            .extract_first())
    except Exception:
        modified_time = published_time
    post_id = response.css('article::attr(id)').extract_first().split('-')[-1]
    first_paragraph = extract_summary(response, "div.post-" + post_id)
    category = response.css(
        'meta[property="article:tag"]::attr(content)').extract()
    category.append('Business')
    newsterItem = NewsterItem(
        _id='brecorder' + '-' + post_id,
        url=response.request.url,
        published_time=published_time,
        modified_time=modified_time,
        title=response.css('title::text').extract_first().split('|')[0],
        category=list(set(category)),
        content='\n\n'.join(
            response.css('div.post-' + post_id + ' p *::text').extract()),
        image_link=response.css(
            'meta[property="og:image"]::attr(content)').extract_first(),
        summary=first_paragraph)
    return newsterItem
def parseFlight(_class, string, date):
    # Remove keywords from flight string
    removeKeywords = ['Departing flight', 'depart', 'arrive',
                      'Change Planes in', 'stop', 'stops', 'Plane Change']
    regex = '|'.join(removeKeywords)
    # Turn into list and filter out blank [""] elements
    infoList = [el for el in re.sub(regex, "", string).split(' ') if el != ""]
    # Parse number of layovers
    stops = int(infoList[4]) if infoList[4] != 'Non' else 0
    # Parse departure and arrival times
    departureDT = dateParse("%s %s" % (date, infoList[2]))
    arrivalDT = dateParse("%s %s" % (date, infoList[3]))
    # If your flight goes past midnight, it must arrive the next day
    if arrivalDT < departureDT:
        arrivalDT += timedelta(days=1)
    price = infoList[1].split('$')[-1]
    # Build flight info dict
    flight = {
        'flights': tuple(infoList[0].split('/')),
        'price': price,
        'depDate': departureDT,
        'arrDate': arrivalDT,
        'stops': stops,
    }
    return flight
def parse_author(self, response):
    meta = response.css('head meta')
    published_time = dateParse(
        meta.css('[property="article:published_time"]::attr(content)').
        extract_first())
    modified_time = dateParse(
        meta.css('[property="article:modified_time"]::attr(content)').
        extract_first())
    first_paragraph = extract_summary(response,
                                      "article.story .story__content")
    category = response.css(
        'meta[property="article:section"]::attr(content)').extract()
    category.append(response.request.meta['category'])
    newsterItem = NewsterItem(
        _id='dawn' + '-' +
        response.css('.story__title::attr(data-id)').extract_first(),
        url=response.request.url,
        published_time=published_time,
        modified_time=modified_time,
        title=response.css('.story__title a::text').extract_first(),
        category=list(set(category)),
        content='\n\n'.join(
            response.css('article.story .story__content p *::text').extract()),
        image_link=meta.css(
            '[property="og:image"]::attr(content)').extract_first(),
        summary=first_paragraph)
    return newsterItem
def parse_author(self, response):
    published_time = dateParse(
        response.css(
            'meta[property="article:published_time"]::attr(content)').
        extract_first()).replace(tzinfo=None)
    try:
        modified_time = dateParse(
            response.css(
                'meta[property="article:modified_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)
    except Exception:
        modified_time = published_time
    todays_date = datetime.now()
    if published_time.date() < todays_date.date():
        return None
    article_title = response.css(
        'meta[property="og:title"]::attr(content)').extract_first()
    qmfashionItem = QmfashionItem(
        _id='secretcloset' + '-' +
        hashlib.md5(article_title.encode('utf-8')).hexdigest(),
        published_time=published_time,
        modified_time=modified_time,
        url=response.request.url,
        title=article_title,
        opening_text=extract_first_paragraph(
            response, 'div.main-container div.blogs div.blog-details'),
        news_source="Secretcloset",
        posted=False)
    return qmfashionItem
def parseFlight(_class, string, date, points=None):
    """
    General format:
    Departing flight 123(/456) $0000 12:30AM depart 7:25AM arrive (Non/1/2)stop (Change planes in XXX)
    [always]  [flt1/2]  [price]  [departure]  [arrival]  [# stops]  [connection]
    """
    removeKeywords = ['Departing flight', 'depart', 'arrive',
                      'Change Planes in', 'stop', 'stops', 'Plane Change']
    regex = '|'.join(removeKeywords)
    infoList = [el for el in re.sub(regex, "", string).split(' ') if el != ""]
    stops = int(infoList[4]) if infoList[4] != 'Non' else 0
    if stops == 0:
        connecting_arpts = []
    elif infoList[5] not in SWAFareSpider.cities:
        connecting_arpts = []
    else:
        connecting_arpts = list(infoList[5].split('/'))
    departureDT = dateParse("%s %s" % (date, infoList[2]))
    arrivalDT = dateParse("%s %s" % (date, infoList[3]))
    # A flight that crosses midnight arrives the next day
    if arrivalDT < departureDT:
        arrivalDT += timedelta(days=1)
    flight = {
        'flight': tuple(infoList[0].split('/')),
        'price': int(infoList[1][1:].replace(",", "")),
        'depart': departureDT,
        'arrive': arrivalDT,
        'depart_date': date,
        'stops': stops,
        'connecting_arpts': connecting_arpts,
        'fare_validity_date': datetime.now(),
        # Guard against the default of None before parsing the points string
        'points': int(points.replace(",", "")) if points else None,
    }
    return flight
def is_date(string):
    try:
        dateParse(string)
        return True
    except ValueError:
        return False
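# A minimal usage sketch for is_date above. dateutil raises ValueError on
# unparseable strings, which is the case this helper covers.
print(is_date("2017-06-13"))   # True
print(is_date("not a date"))   # False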
def parse_author(self, response): published_time = dateParse( response.css( 'meta[property="og:article:published_time"]::attr(content)'). extract_first()).astimezone(get_localzone()).replace(tzinfo=None) todays_date = datetime.datetime.now(datetime.timezone.utc).astimezone( get_localzone()) if published_time.date() < todays_date.date(): return None try: modified_time = dateParse( response.css( 'meta[property="og:article:modified_time"]::attr(content)' ).extract_first()).astimezone( get_localzone()).replace(tzinfo=None) except: modified_time = published_time id_extractor = str(response.request.url).split('-') first_paragraph = extract_summary( response, "div.container_17wb1 div.body_1gnLA") if len(first_paragraph) < 7: first_paragraph = extract_summary( response, "div.StandardArticleBody_container div.StandardArticleBody_body" ) category = response.css( 'meta[property="og:article:section"]::attr(content)').extract() category.append('Tigrosa-Internation') content = response.css( 'div.container_17wb1 div.body_1gnLA p *::text').extract() if len(content) < 10: content = response.css( 'div.StandardArticleBody_container div.StandardArticleBody_body p *::text' ).extract() newsterItem = NewsterItem( _id='reuters' + '-' + id_extractor[len(id_extractor) - 1], url=response.request.url, published_time=published_time, modified_time=modified_time, title=response.css( 'head title::text').extract_first().lstrip().split('|')[0], category=category, content='\n\n'.join(content), image_link=response.css( 'meta[property="og:image"]::attr(content)').extract_first(), summary=first_paragraph) return newsterItem
def test_it_should_handle_time_interval(self):
    r = get_entity_value({
        'kind': 'TimeInterval',
        'from': '2017-06-07 18:00:00 +02:00',
        'to': '2017-06-08 00:00:00 +02:00',
    })
    expected_from = dateParse('2017-06-07 18:00:00 +02:00')
    expected_to = dateParse('2017-06-08 00:00:00 +02:00')
    expect(r).to.be.a(tuple)
    expect(r[0]).to.equal(expected_from)
    expect(r[1]).to.equal(expected_to)
def parse_author(self, response):
    global scrape_next_page
    meta = response.css('head meta')
    try:
        published_time = dateParse(
            meta.css('[property="article:published_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)
    except Exception:
        return None
    todays_date = datetime.now()
    if published_time.date() < todays_date.date():
        scrape_next_page = False
        return None
    try:
        modified_time = dateParse(
            meta.css('[property="article:modified_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)
    except Exception:
        modified_time = published_time
    first_paragraph = extract_summary(
        response, "div.clearfix.story-content.read-full")
    category = response.css(
        'meta[property="article:tag"]::attr(content)').extract()
    article_section = response.css(
        'meta[property="article:section"]::attr(content)').extract_first()
    if article_section.lower() == 'pakistan':
        article_section = 'national'
    category.append(article_section)
    newsterItem = NewsterItem(
        _id='tribune' + '-' +
        response.css('.story::attr(id)').extract_first().split('-')[1],
        url=response.request.url,
        published_time=published_time,
        modified_time=modified_time,
        title=response.css(
            'div.story.clearfix h1.title a::text').extract_first(),
        category=list(set(category)),
        content='\n\n'.join(
            response.css(
                'div.clearfix.story-content.read-full p *::text').extract()),
        image_link=response.css(
            'div.story-image-container img::attr(src)').extract_first(),
        summary=first_paragraph)
    return newsterItem
def parse_author(self, response):
    published_time = dateParse(
        response.css(
            'article header.entry-header span.posted-on time[itemprop="datePublished"]::attr(content)'
        ).extract_first()).replace(tzinfo=None)
    todays_date = datetime.now()
    if published_time.date() < todays_date.date():
        return None
    try:
        modified_time = dateParse(
            response.css(
                'article header.entry-header span.posted-on time[itemprop="dateModified"]::attr(content)'
            ).extract_first()).replace(tzinfo=None)
    except Exception:
        modified_time = published_time
    article_title = response.css(
        'div.container h2.entry-title::text').extract_first()
    id_extractor = response.css('article::attr(id)').extract_first().split('-')
    first_paragraph = extract_summary(
        response, "article div.entry-content div.content-body")
    category = []
    if response.request.meta['category'].lower() == 'pakistan':
        category.append('National')
    else:
        category.append(response.request.meta['category'])
    newsterItem = NewsterItem(
        _id='dailypakistan' + '-' + id_extractor[-1],
        url=response.request.url,
        published_time=published_time,
        modified_time=modified_time,
        title=article_title,
        category=list(set(category)),
        content='\n\n'.join(
            response.css(
                'article div.entry-content div.content-body p *::text').
            extract()),
        image_link=response.css(
            'article header.entry-header div[itemprop="image"] img::attr(src)'
        ).extract_first(),
        summary=first_paragraph)
    return newsterItem
def parse_author(self, response):
    global scrape_next_page
    article_section = response.css(
        'meta[property="article:section"]::attr(content)').extract_first()
    if article_section != 'HEADLINES':
        return None
    article_title = response.css(
        'div.td-post-header h1.entry-title::text').extract_first()
    published_time = dateParse(
        response.css('meta[itemprop="datePublished"]::attr(content)').
        extract_first()).replace(tzinfo=None)
    try:
        modified_time = dateParse(
            response.css('meta[itemprop="dateModified"]::attr(content)').
            extract_first()).replace(tzinfo=None)
    except Exception:
        modified_time = published_time
    todays_date = datetime.now()
    if published_time.date() < todays_date.date():
        scrape_next_page = False
        return None
    # Walk the paragraphs until one has real text; assumes the post contains
    # at least one paragraph longer than 7 characters
    child_int = 1
    while True:
        first_para_text = ("div.td-post-content > p:nth-of-type(" +
                           str(child_int) + ") *::text")
        first_paragraph = ''.join(response.css(first_para_text).extract())
        if len(first_paragraph) > 7:
            break
        child_int += 1
    newsterItem = NewsterItem(
        _id='pakistantoday' + '-' +
        response.css('article::attr(id)').extract_first().split('-')[1],
        url=response.request.url,
        published_time=published_time,
        modified_time=modified_time,
        title=article_title,
        content='\n\n'.join(
            response.css('div.td-post-content p *::text').extract()),
        image_link=response.css(
            'div.td-post-featured-image img::attr(src)').extract_first(),
        summary=first_paragraph)
    return newsterItem
def keyevent():
    body = request.get_json()
    requiredFields = ['requestTime', 'pressedKeys']
    if all(field in body for field in requiredFields):
        requestTimeStr = body['requestTime']
        pressedKeysDict = body['pressedKeys']
        pressedKeys = [int(k) for k in pressedKeysDict]
        requestTime = dateParse(requestTimeStr)
        lastRequest = dateParse(state['controller']['lastRequest'])
        if requestTime > lastRequest:
            ALLOWED_KEYS = {32, 37, 38, 39, 40}
            allowedPressedKeys = list(ALLOWED_KEYS & set(pressedKeys))
            state['controller']['pressedKeys'] = allowedPressedKeys
            state['controller']['lastRequest'] = requestTimeStr
    return ''
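# A hypothetical request body for the keyevent endpoint above (field names
# taken from requiredFields); pressedKeys arrives as a mapping of key codes,
# and only codes in ALLOWED_KEYS (space bar and arrow keys) are kept:
#
#   {
#       "requestTime": "2017-06-13T18:00:00+02:00",
#       "pressedKeys": {"37": true, "38": true}
#   }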
def parse_author(self, response): published_time = dateParse( response.css( 'meta[property="article:published_time"]::attr(content)'). extract_first()).astimezone(get_localzone()).replace(tzinfo=None) todays_date = datetime.datetime.now(datetime.timezone.utc).astimezone( get_localzone()) if published_time.date() < todays_date.date(): return None try: modified_time = dateParse( response.css( 'meta[property="article:modified_time"]::attr(content)'). extract_first()).astimezone( get_localzone()).replace(tzinfo=None) except: modified_time = published_time id_extractor = response.css('article::attr(id)').extract_first().split( '-') first_paragraph = extract_summary( response, "article.post-" + str(id_extractor[len(id_extractor) - 1]) + " div.td-post-content") category = response.css( 'article div.td-post-source-tags li a::text').extract() category.append('Tigrosa-Internation') newsterItem = NewsterItem( _id='mettisglobal' + '-' + str(id_extractor[len(id_extractor) - 1]), url=response.request.url, published_time=published_time, modified_time=modified_time, title=response.css( 'meta[property="og:title"]::attr(content)').extract_first(), category=list(set(category)), content='\n\n'.join( response.css('article.post-' + str(id_extractor[len(id_extractor) - 1]) + ' div.td-post-content p *::text').extract()), image_link=response.css( 'meta[property="og:image"]::attr(content)').extract_first(), summary=first_paragraph) return newsterItem
def parse_author(self, response):
    published_time = dateParse(
        response.css('div.container div.category-date::text').
        extract_first()).replace(tzinfo=None)
    modified_time = published_time
    todays_date = datetime.now()
    if published_time.date() < todays_date.date():
        return None
    first_paragraph = extract_summary(response, "div.story-detail")
    article_url_pieces = str(response.request.url).split('/')
    newsterItem = NewsterItem(
        _id='thenews' + '-' + article_url_pieces[-1].split('-')[0],
        url=response.request.url,
        published_time=published_time,
        modified_time=modified_time,
        title=response.css(
            'meta[property="og:title"]::attr(content)').extract_first(),
        category=response.css(
            'body div.detail-content div.category-name h2::text').extract(),
        content='\n\n'.join(
            response.css('div.story-detail p *::text').extract()),
        image_link=response.css(
            'meta[property="og:image"]::attr(content)').extract_first(),
        summary=first_paragraph)
    return newsterItem
def main():
    with open('rates.json', 'r') as infile:
        data = json.load(infile)
    headers = []
    for d in data:
        if d["currency"] not in headers:
            headers.append(d["currency"])
    arranged_data = {}
    l_h = []
    plt.figure()
    for header in headers:
        arranged_data[header] = list(
            filter(lambda d: d["currency"] == header, data))
        dates = []
        rates = []
        for d in arranged_data[header]:
            try:
                rates.append(float(d["rate"]))
                dates.append(dateParse(d["date"]))
            except Exception:
                pass
        h, = plt.plot(dates, rates, label=header)
        l_h.append(h)
    with open('rates_rearranged.json', 'w') as outfile:
        json.dump(arranged_data, outfile)
    plt.legend(handles=l_h)
    plt.show()
def controllerLoop():
    while True:
        if state['done']:
            done()
            break
        now = datetime.now(pytz.utc)
        parsedLastRequest = dateParse(state['controller']['lastRequest'])
        # total_seconds() handles deltas of any length, unlike .seconds
        dormant = (now - parsedLastRequest).total_seconds() >= 2
        if not dormant:
            pressedKeys = state['controller']['pressedKeys']
            SPACEBAR = 32
            LEFT = 37
            UP = 38
            RIGHT = 39
            DOWN = 40
            if SPACEBAR in pressedKeys:
                state['done'] = True
            elif any(key in pressedKeys for key in [LEFT, UP, RIGHT, DOWN]):
                UNIT = 0.05
                targetX, targetY = state['currentCoordinates']
                if LEFT in pressedKeys:
                    targetX -= UNIT
                if UP in pressedKeys:
                    targetY += UNIT
                if RIGHT in pressedKeys:
                    targetX += UNIT
                if DOWN in pressedKeys:
                    targetY -= UNIT
                moved = moveArmTo(targetX, targetY)
                if moved:
                    state['currentCoordinates'] = [targetX, targetY]
        else:
            # Throttle the loop while the controller is dormant
            sleep(0.1)
def get_forecasts(request):
    """
    :type request: Request
    """
    date = request.slot('date').first().value
    if not date:
        return request.ask('date', _('For when do you want the forecast?'))  # pylint: disable=E0602
    location = request.slot('city').first().value
    if not location:
        return request.ask('city', _('For where do you want the forecast?'))  # pylint: disable=E0602
    request.show(_("Well, I'm on it!"))  # pylint: disable=E0602
    time.sleep(3)  # Simulate fetching
    # Do something with the key
    api_key = request.env('WEATHER_API_KEY')  # pylint: disable=W0612
    request.show(
        _("It's kinda sunny!"),  # pylint: disable=E0602
        cards=[{
            "media": b64_icons['sunny'],
            "header": "24°C",
            "subhead": dateParse(date).strftime("%A %d %B"),
            "text": "Looks like it's sunny outside!"
        }],
        terminate=True)
def parse_author(self, response): published_time = dateParse(response.css('main[id="content"] article header div.entry-meta time::attr(datetime)').extract_first()).replace(tzinfo=None) modified_time = published_time todays_date = datetime.now() if published_time.date() < todays_date.date(): return None id_constructor = response.css('div.site-content main[id="content"] article::attr(id)').extract_first().split('-') first_paragraph = extract_first_paragraph(response,'div.site-content main[id="content"] article div.entry-content') if first_paragraph is None: first_paragraph = response.css('div.site-content main[id="content"] article div.single-title .entry-title::text').extract_first() qmfashionItem = QmfashionItem( _id = 'sunday' + '-' + id_constructor[len(id_constructor)-1], published_time = published_time, modified_time = modified_time, url = response.request.url, title = response.css('div.site-content main[id="content"] article div.single-title .entry-title::text').extract_first(), opening_text = first_paragraph, news_source = "Sunday.com.pk", posted = False ) return qmfashionItem
def convertDate(strDate):
    '''
    This function accepts the date string as returned by the IMAP server,
    translates it into the client's local time (zone), and returns it as a
    string formatted as desired in the final output.
    '''
    try:
        from dateutil.parser import parse as dateParse
    except ImportError:
        print("dateutil module missing. Try: pip install python-dateutil")
        import sys
        sys.exit(1)
    # Split on the left parenthesis for the occasional date string that ends
    # with something like (GMT-06:00)
    dt = dateParse(strDate.split('(')[0])
    # Create an instance of the LocalTimezone class defined above
    Local = LocalTimezone()
    try:
        ldt = dt.astimezone(Local)
    except ValueError:
        print('Error - Using .astimezone(local).')
        return ''
    return ldt.strftime('%b %d - %I:%M %P')
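# A minimal usage sketch, assuming the LocalTimezone helper is defined: IMAP
# Date headers sometimes carry a trailing zone comment, which the split('(')
# above discards.
print(convertDate('Tue, 13 Jun 2017 18:00:00 -0600 (GMT-06:00)'))
# Output depends on the client's local timezone, e.g. 'Jun 13 - 08:00 pm'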
def getTopic(topicURL):
    # To get the topic ID we split the URL twice
    topicID = topicURL.split("=")[-1].split(".")[0]
    print(topicID)
    tree = parsedHTML('topic', topicID, '0')
    # Get the timestamp first; if it's not from a year we want, skip it
    try:
        timestamp = tree.xpath('//div[@class="smalltext"]')[1].text_content()
    except IndexError:
        print('NO TIMESTAMP FOUND')
        print(topicURL)
        return False
    todayDate = str(time.strftime('%d %B %Y'))
    timestamp = timestamp.replace('Today', todayDate)
    timestamp = timestamp.replace(' at', ",")
    timestamp = dateParse(timestamp)
    postBody = tree.xpath('//div[@class="post"]')[0].text_content()
    print(postBody)
    authorActivity = tree.xpath(
        '//td[@class="poster_info"]/div[@class="smalltext"]'
    )[0].text_content().split('Activity: ')[-1].split('\n')[0]
    return [postBody, timestamp, authorActivity]
def parse_author(self, response): published_time = dateParse( response.css( 'body article[id="the-post"] p.post-meta span.tie-date::text'). extract_first()).replace(tzinfo=None) modified_time = published_time todays_date = datetime.now() if published_time.date() < todays_date.date(): return None for item in ( response.css('body::attr(class)').extract_first()).split(' '): if 'postid' in item: id_constructor = item.split('-') qmfashionItem = QmfashionItem( _id='trendinginsocial' + '-' + id_constructor[len(id_constructor) - 1], published_time=published_time, modified_time=modified_time, url=response.request.url, title=response.css( 'article[id="the-post"] div.post-inner .post-title span::text' ).extract_first(), opening_text=extract_first_paragraph( response, 'article[id="the-post"] div.post-inner div.entry >'), news_source="Trendinginsocial", posted=False) return qmfashionItem
def parseFlight(_class, string, date): """ General format: Departing flight 123(/456) $0000 12:30AM depart 7:25AM arrive (Non/1/2)stop (Change planes in XXX) [always] [flt1/2] [price] [departure] [arrival] [# stops] [connection] """ # Remove keywords from flight string removeKeywords = [ 'Departing flight', 'depart', 'arrive', 'Change Planes in', 'stop', 'stops', 'Plane Change' ] regex = '|'.join(removeKeywords) # Turn into list and filter out blank [""] elements infoList = filter(lambda el: el != "", re.sub(regex, "", string).split(' ')) # Parse number of layovers stops = int(infoList[4]) if infoList[4] != 'Non' else 0 # Parse connecting airports (if applicable) if (infoList[5] not in SWAFareSpider.cities): # no valid connection connectingArpts = None else: connectingArpts = tuple(infoList[5].split('/')) # Parse departure and arrival times departureDT = dateParse("%s %s" % (date, infoList[2])) arrivalDT = dateParse("%s %s" % (date, infoList[3])) # If your flight goes past midnight, it must arrive the next day if (arrivalDT < departureDT): departureDT += timedelta(days=1) # Build flight info dict flight = { 'flight': tuple(infoList[0].split('/')), 'price': infoList[1], 'depart': departureDT, 'arrive': arrivalDT, 'stops': stops, 'connectingArpts': connectingArpts, 'fareValidityDate': datetime.now() } return flight
def get_article_urls(end_date):
    """Main function."""
    filename = "coindesk_headlines.csv"
    urls, current_page = [], 1
    has_next_page, out_of_range = True, False
    while has_next_page and not out_of_range:
        config = results_config(current_page)
        tree = parse_html(config["coindesk"]["page_url"])
        items = tree.xpath(config["coindesk"]["item_XPATH"])
        for item in items:
            if (config["coindesk"]["date_on_page"]
                    and config["coindesk"]["date_ordered"] and end_date):
                date = dateParse(
                    item.xpath(config["coindesk"]["date_XPATH"])[0]
                    .get("datetime")).strftime("%Y-%m-%d")
                if dateParse(date) <= dateParse(end_date):
                    out_of_range = True
            url = item.xpath(config["coindesk"]["url_XPATH"])[0].get("href")
            if "://" not in url:
                url = results_config(current_page)["coindesk"]["base_url"] + url
            url_filters = [
                "/videos/", "/audio/", "/gadfly/", "/features/",
                "/press-releases/"
            ]
            # `f` avoids shadowing the filter() builtin
            if not any(f in url for f in url_filters):
                urls.append(url)
        if len(items) < config["coindesk"]["results_per_page"]:
            has_next_page = False
        collect_articles(urls, end_date, filename)
        current_page += 1
        urls = []
def parse_author(self, response):
    meta = response.css('head meta')
    category = meta.css(
        '[property="article:section"]::attr(content)').extract()
    valid_article = any(item in ("Business", "National") for item in category)
    if not valid_article:
        return None
    published_time = dateParse(
        meta.css('[property="article:published_time"]::attr(content)').
        extract_first()).replace(tzinfo=None)
    try:
        modified_time = dateParse(
            meta.css('[property="article:modified_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)
    except Exception:
        modified_time = published_time
    article_title = response.css('head title::text').extract_first()
    first_paragraph = extract_summary(response, "article .post-content")
    newsterItem = NewsterItem(
        _id='nation' + '-' +
        hashlib.md5(article_title.encode('utf-8')).hexdigest(),
        url=response.request.url,
        published_time=published_time,
        modified_time=modified_time,
        title=article_title,
        category=list(set(category)),
        content='\n\n'.join(
            response.css('article .post-content p *::text').extract()),
        image_link=meta.css(
            '[property="og:image"]::attr(content)').extract_first(),
        summary=first_paragraph)
    return newsterItem
def parseFlight(_class, string, date): """ General format: Departing flight 123(/456) $0000 12:30AM depart 7:25AM arrive (Non/1/2)stop (Change planes in XXX) [always] [flt1/2] [price] [departure] [arrival] [# stops] [connection] """ # Remove keywords from flight string removeKeywords = ['Departing flight', 'depart', 'arrive', 'Change Planes in', 'stop', 'stops', 'Plane Change'] regex = '|'.join(removeKeywords) # Turn into list and filter out blank [""] elements infoList = filter(lambda el: el!="", re.sub(regex, "", string).split(' ')) # Parse number of layovers stops = int(infoList[4]) if infoList[4] != 'Non' else 0 # Parse connecting airports (if applicable) if ( infoList[5] not in SWAFareSpider.cities ): # no valid connection connectingArpts = None else: connectingArpts = tuple(infoList[5].split('/')) # Parse departure and arrival times departureDT = dateParse("%s %s" % (date, infoList[2]) ) arrivalDT = dateParse("%s %s" % (date, infoList[3]) ) # If your flight goes past midnight, it must arrive the next day if ( arrivalDT < departureDT ): departureDT += timedelta(days=1) # Build flight info dict flight = { 'flight': tuple(infoList[0].split('/')), 'price': infoList[1], 'depart': departureDT, 'arrive': arrivalDT, 'stops': stops, 'connectingArpts': connectingArpts, 'fareValidityDate': datetime.now() } return flight
def test_it_should_handle_instant_time(self):
    r = get_entity_value({
        'kind': 'InstantTime',
        'value': '2017-06-13 18:00:00 +02:00',
        'grain': 'Hour',
        'precision': 'Exact',
    })
    expected = dateParse('2017-06-13 18:00:00 +02:00')
    expect(r).to.be.a(datetime.datetime)
    expect(r).to.equal(expected)
def collect_articles(urls, end_date, filename):
    """Loops over all the URLs collected in the parent function."""
    for url in urls:
        tree = parse_html(url)
        config = page_config(tree)
        try:
            if end_date and dateParse(config["date"]) < dateParse(end_date):
                break
            path = os.path.dirname(os.getcwd()) + "/../data/" + filename
            with open(path, "a") as outfile:
                csv.writer(outfile).writerow(
                    [config["date"], ftfy.fix_text(config["title"]), url])
        except Exception:
            print("\nEXCEPTION OCCURRED\n")
def parse_author(self, response):
    try:
        published_time = dateParse(
            response.css(
                'meta[property="article:published_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)
    except Exception:
        published_time = datetime.now()
    try:
        modified_time = dateParse(
            response.css(
                'meta[property="article:modified_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)
    except Exception:
        modified_time = published_time
    todays_date = datetime.now()
    if published_time.date() < todays_date.date():
        # CloseSpider aborts the crawl here
        raise scrapy.exceptions.CloseSpider('termination condition met')
    for item in (
            response.css('body::attr(class)').extract_first()).split(' '):
        if 'postid' in item:
            id_constructor = item.split('-')
    qmfashionItem = QmfashionItem(
        _id='mangobaaz' + '-' + id_constructor[-1],
        published_time=published_time,
        modified_time=modified_time,
        url=response.request.url,
        title=response.css('title::text').extract_first(),
        opening_text=extract_first_paragraph(
            response, 'article[id="post-' + id_constructor[-1] +
            '"] div.entry-content >'),
        news_source="Mangobaaz",
        posted=False)
    return qmfashionItem
def parse_author(self, response): global scrape_next_page published_time = dateParse(response.css('meta[property="article:published_time"]::attr(content)').extract_first()).replace(tzinfo=None) todays_date = datetime.now() if published_time.date() < todays_date.date(): scrape_next_page = False return None try: modified_time = dateParse(response.css('meta[property="article:modified_time"]::attr(content)').extract_first()).replace(tzinfo=None) except: modified_time = published_time article_title = response.css('div.post-header h1.entry-title::text').extract_first() id_extractor = response.css('body::attr(class)').extract_first().split(' ') for item in id_extractor: if 'postid' in item: article_id = item.split('-')[len(item.split('-'))-1] first_paragraph = extract_summary(response, "main.content div.entry-content") category = response.css('meta[property="article:section"]::attr(content)').extract() category.append(response.request.meta['category']) newsterItem = NewsterItem( _id = 'dailytimes' + '-' + article_id, url = response.request.url, published_time = published_time, modified_time = modified_time, title = article_title, category = list(set(category)), content = '\n\n'.join(response.css('main.content div.entry-content p *::text').extract()), image_link = response.css('meta[property="og:image"]::attr(content)').extract_first(), summary = first_paragraph ) return newsterItem
def parse_author(self, response):
    published_time = dateParse(
        response.css(
            'meta[property="article:published_time"]::attr(content)').
        extract_first()).replace(tzinfo=None)
    try:
        modified_time = dateParse(
            response.css(
                'meta[property="article:modified_time"]::attr(content)').
            extract_first()).replace(tzinfo=None)
    except Exception:
        modified_time = published_time
    todays_date = datetime.now()
    if published_time.date() < todays_date.date():
        return None
    for item in (
            response.css('body::attr(class)').extract_first()).split(' '):
        if 'postid' in item:
            id_constructor = item.split('-')
    qmfashionItem = QmfashionItem(
        _id='edition.pk' + '-' + id_constructor[-1],
        published_time=published_time,
        modified_time=modified_time,
        url=response.request.url,
        title=response.css(
            'body article.post div[id="post-header"] .post-title::text').
        extract_first(),
        opening_text=extract_first_paragraph(
            response,
            'body article.post div[id="post-area"] div[id="content-area"] div.content-main'
        ),
        news_source="Edition.pk",
        posted=False)
    return qmfashionItem
def getMinDate(genName, baseURL='http://159.203.100.177:3000'):
    url = baseURL + "/generators_times"
    querystring = {"id": "eq." + genName, "select": "min_time"}
    headers = {'cache-control': "no-cache"}
    response = requests.get(url, headers=headers, params=querystring)
    result = response.json()
    if result:
        return dateParse(result[0].get('min_time')).date()
def convertDate(d, df):
    result = -1
    # Sometimes dates are just the year or day.
    try:
        if d != "" and not pd.isnull(d):
            # Use a separate local for the output format so the dayfirst
            # parameter `df` is not clobbered before it is used
            fmt = "%Y-%m-%d"
            dt = dateParse(d, dayfirst=df)
            result = dt.strftime(fmt)
        else:
            result = d
    except ValueError:
        result = -1
    return result
def convertDate(d, dayfirst):
    result = -1
    # Sometimes dates are just the year or day, so we try to parse the date;
    # if that fails, just pass the d parameter back, because it is either not
    # a full date or a bad value
    try:
        if d != "" and not pd.isnull(d):
            df = "%Y-%m-%d"
            dt = dateParse(d, dayfirst=dayfirst)
            result = dt.strftime(df)
        else:
            result = d
    except ValueError:
        result = -1
    return result
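# A minimal usage sketch (assuming pandas is imported as pd): dayfirst
# controls how dateutil reads ambiguous dates such as "01/02/2017".
print(convertDate("01/02/2017", dayfirst=True))   # 2017-02-01
print(convertDate("01/02/2017", dayfirst=False))  # 2017-01-02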
def __init__(self, rawDate):
    self.rawTime = rawDate
    self.dateTime = dateParse(rawDate)
    # Unix timestamp with microsecond precision
    self._unixTime = (mktime(self.dateTime.timetuple())
                      + 1e-6 * self.dateTime.microsecond)
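# A minimal sketch (assuming Python 3 and an unambiguous naive local time):
# datetime.timestamp() folds the mktime + microsecond arithmetic above into
# a single call.
from datetime import datetime
from time import mktime
dt = datetime(2017, 6, 13, 18, 0, 0, 250000)
print(dt.timestamp() == mktime(dt.timetuple()) + 1e-6 * dt.microsecond)  # True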
def gritsSearch(self, params):
    folder = self.gritsFolder()
    self.checkAccess()
    limit, offset, sort = self.getPagingParameters(params, 'meta.date')
    sDate = dateParse(params.get('start', '1990-01-01'))
    eDate = dateParse(params.get('end', str(datetime.now())))
    useRegex = 'regex' in params
    query = {
        'folderId': folder['_id'],
        'meta.date': {'$gte': sDate, '$lt': eDate}
    }
    self.addToQuery(query, params, 'country', useRegex)
    self.addToQuery(query, params, 'disease', useRegex)
    self.addToQuery(query, params, 'species', useRegex)
    self.addToQuery(query, params, 'feed', useRegex)
    self.addToQuery(query, params, 'description', useRegex)
    self.addToQuery(
        query, params, 'diagnosis', useRegex,
        'meta.diagnosis.diseases', 'name'
    )
    self.addToQuery(query, params, 'id', useRegex, 'name')
    model = ModelImporter().model('item')
    cursor = model.find(
        query=query,
        fields=None,
        offset=offset,
        limit=limit,
        sort=sort
    )
    result = list(cursor)
    if not self.checkAccess(priv=True, fail=False):
        result = [model.filter(i) for i in result]
    if 'randomSymptoms' in params:
        try:
            filterBySymptom = set(json.loads(params['filterSymptoms']))
        except Exception:
            filterBySymptom = False
        filtered = []
        for r in result:
            r['meta']['symptoms'] = self.getSymptomFromId(r['_id'])
            if filterBySymptom:
                s2 = set(r['meta']['symptoms'])
                if not filterBySymptom.isdisjoint(s2):
                    filtered.append(r)
            else:
                filtered.append(r)
        result = filtered
    if 'geoJSON' in params:
        result = self.togeoJSON(result)
    return result
def handleRequestSuccess(self, workQueueItem, response):
    result = json.load(response)
    if "items" not in result:
        return

    def deep_get(item, *attrs):
        '''Get item, or return fallback value from nested dicts'''
        if item and not isinstance(item, dict):
            return item
        if not item:
            return None
        return deep_get(item.get(attrs[0]), *attrs[1:])

    for item in result['items']:
        # Database mapping
        db_meta = {'id': item['id']}
        # snippet
        snippet = item['snippet']
        db_meta['snippet_publishedAt'] = dateParse(snippet['publishedAt'])
        db_meta['snippet_channel_id'] = snippet['channelId']
        db_meta['snippet_title'] = snippet['title']
        db_meta['snippet_description'] = snippet['description']
        db_meta['snippet_channel_title'] = snippet['channelTitle']
        db_meta['snippet_category_id'] = snippet['categoryId']
        db_meta['snippet_liveBroadcastContent'] = snippet['liveBroadcastContent']
        db_meta['snippet_tags'] = json.dumps(snippet['tags']) if snippet.get('tags') else ''
        # contentDetails
        c_details = item['contentDetails']
        db_meta['contentDetails_duration'] = c_details['duration']
        db_meta['contentDetails_durationAsSeconds'] = self.ISO8601durationToSeconds(c_details['duration'])
        db_meta['contentDetails_dimension'] = c_details['dimension']
        db_meta['contentDetails_definition'] = c_details['definition']
        db_meta['contentDetails_caption'] = c_details['caption']
        db_meta['contentDetails_licensedContent'] = c_details['licensedContent']
        # status
        status = item['status']
        db_meta['status_uploadStatus'] = status['uploadStatus']
        db_meta['status_privacyStatus'] = status['privacyStatus']
        db_meta['status_license'] = status['license']
        db_meta['status_embeddable'] = status['embeddable']
        db_meta['status_publicStatsViewable'] = status['publicStatsViewable']
        # statistics
        stats = item['statistics']
        db_meta['statistics_viewCount'] = stats['viewCount']
        db_meta['statistics_likeCount'] = stats.get('likeCount') or ''
        db_meta['statistics_dislikeCount'] = stats.get('dislikeCount') or ''
        db_meta['statistics_favoriteCount'] = stats['favoriteCount']
        db_meta['statistics_commentCount'] = stats['commentCount']
        # recordingDetails
        if deep_get(item, 'recordingDetails', 'recordingDate'):
            db_meta['recordingDetails_recordingDate'] = dateParse(
                deep_get(item, 'recordingDetails', 'recordingDate'))
        else:
            db_meta['recordingDetails_recordingDate'] = datetime.utcfromtimestamp(0)
        db_meta['recordingDetails_location_latitude'] = deep_get(item, 'recordingDetails', 'location', 'latitude') or 0
        db_meta['recordingDetails_location_longitude'] = deep_get(item, 'recordingDetails', 'location', 'longitude') or 0
        db_meta['recordingDetails_location_altitude'] = deep_get(item, 'recordingDetails', 'location', 'altitude') or 0
        self.resultList[item['id']] = db_meta
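# A standalone copy of the nested deep_get helper above, shown for
# illustration: it walks nested dicts and returns None instead of raising
# KeyError when a level is missing.
def deep_get(item, *attrs):
    if item and not isinstance(item, dict):
        return item
    if not item:
        return None
    return deep_get(item.get(attrs[0]), *attrs[1:])

item = {'recordingDetails': {'location': {'latitude': 33.7}}}
print(deep_get(item, 'recordingDetails', 'location', 'latitude'))  # 33.7
print(deep_get(item, 'recordingDetails', 'location', 'altitude'))  # None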
def toDatetime(self, string):
    # Presumably an instance method: it relies on self.formatInputString
    try:
        return dateParse(self.formatInputString(string))
    except (ValueError, TypeError):
        return None
def main():
    # === Extract options ===
    parser = OptionParser(
        usage="usage: %prog [options] <timeColumn> <timeInterval> <groupByColumn> ...")
    parser.add_option(
        '-p', '--pivot', dest='pivot', action='store_true', default=False,
        help='store the data until the end of the stream and then pivot it into groupByCol groups (SIGNIFICANT MEMORY USAGE)'  # noqa
    )
    parser.add_option('-s', dest='sep', default='|',
                      help='groupByCol separator when pivoting')
    parser.add_option('-m', '--multiplier', dest='multiplier', default=100)
    (options, args) = parser.parse_args()
    if len(args) < 3:
        parser.print_usage()
        exit()
    pivot = options.pivot
    colNameSep = options.sep
    multiplier = int(options.multiplier)
    timeCol = int(args[0])
    interval = int(args[1])
    groupCols = [int(args[i]) for i in range(2, len(args))]
    # Data is a complex data structure with the following layout:
    #   Timestamp (start of interval)
    #     -> {(col1 val, col2 val, col3 val...)}
    #       -> Count
    data = {}
    # Similarly, if we're pivoting we'll keep track of unique columns through
    # time: (col1 val, col2 val, col3 val...)
    uniqueCols = set()
    lineCount = 0
    for line in sys.stdin:
        parts = line.strip().split(' ')
        # Find the agg time; integer division floors to the interval start.
        # Yes, this is horribly inefficient; meh
        ctime = int(dateParse(parts[timeCol]).strftime('%s'))
        ctime = (ctime // interval) * interval
        colVals = tuple(parts[i] for i in groupCols)
        if ctime not in data:
            data[ctime] = {}
        if colVals not in data[ctime]:
            data[ctime][colVals] = 1
        else:
            data[ctime][colVals] += 1
        if not pivot:
            lineCount = (lineCount + 1) % 1000
            if lineCount == 0:
                # Flush the buffers if possible
                for ptime in sorted(data.keys()):
                    if ptime + (2 * interval) < ctime:
                        for dataline in data[ptime]:
                            sys.stdout.write("%s\t%s\t%s\n" % (
                                datetime.fromtimestamp(ptime).strftime('%Y-%m-%d %H:%M:%S'),
                                colNameSep.join(dataline),
                                data[ptime][dataline] * multiplier))
                        del data[ptime]
        else:
            uniqueCols.add(colVals)
    # And here we are at the end...
    if pivot:
        # Must create the BIG table now
        outline = ['time']
        for cols in uniqueCols:
            outline.append(colNameSep.join(cols))
        sys.stdout.write("\t".join(outline))
        sys.stdout.write("\n")
        for ptime in sorted(data.keys()):
            outline = [datetime.fromtimestamp(ptime).strftime('%Y-%m-%d %H:%M:%S')]
            for cols in uniqueCols:
                if cols in data[ptime]:
                    outline.append(str(data[ptime][cols] * multiplier))
                else:
                    outline.append('0')
            sys.stdout.write("\t".join(outline))
            sys.stdout.write("\n")
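# A hypothetical invocation sketch for the aggregator above (script name and
# field indices are examples): bucket a whitespace-separated log from stdin
# into 60-second intervals, grouping by columns 1 and 2, pivoted into one
# column per group:
#
#   cat access.log | python aggregate.py --pivot -s '|' 0 60 1 2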
def __init__(self, fromCity=None, date=None, toCity=None, *args, **kwargs):
    super(SWAFareSpider, self).__init__(**kwargs)
    self.origin = fromCity
    # Guard against the default of None, which dateutil cannot parse
    self.outDate = dateParse(date) if date else None
    self.destination = toCity