def date_parser(date):
    """Split a 'YYYY-MM-DD' style string into a dict of int fields.

    Returns {'month': M, 'day': D, 'year': Y}; raises ValueError if the
    pieces between the dashes are not integers.
    """
    # partition() splits at the first '-' exactly like the original
    # find()-and-slice pair did.
    year_text, _, remainder = date.partition('-')
    month_text, _, day_text = remainder.partition('-')
    return {'month': int(month_text), 'day': int(day_text), 'year': int(year_text)}
def _editDate(self, date):
    """Convert a Facebook-style join-date string into a datetime.

    Handles three formats:
      * 'about N weeks/months ago' ('a' counts as 1) -> now minus the rough
        estimate (7 days per week, 30 days per month);
      * 'Joined on <Weekday>' -> the most recent occurrence of that weekday;
      * anything else -> parsed as '... on <Month> <day>, <Year>'.
    """
    if date.find('about ') >= 0:
        new_date = date[date.find('about ') + 6:]
        # Number of periods to push back from the current day ('a' == one).
        count_token = new_date[:new_date.find(' ')]
        num_day = 1 if count_token == 'a' else int(count_token)
        # Facebook only gives an estimate: weeks -> 7 days each, otherwise
        # assume months -> 30 days each.
        if new_date.find('weeks') >= 0:
            return datetime.now() - timedelta(days=7 * num_day)
        return datetime.now() - timedelta(days=30 * num_day)
    if date.find('Joined on ') >= 0:
        weekday_name = date[date.find('Joined on ') + 10:]
        weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                    'Friday', 'Saturday', 'Sunday']
        try:
            target = weekdays.index(weekday_name)
        except ValueError:
            target = 6  # unrecognized names fell through to the Sunday branch before
        # BUG FIX: the original Sunday/else branch computed
        # `today.weekday() - 6` WITHOUT `% 7`, which is negative on any day
        # but Sunday and produced a join date in the future.
        offset = (datetime.today().weekday() - target) % 7
        return datetime.now() - timedelta(days=offset)
    return datetime.strptime(date[date.find('on ') + 3:], '%B %d, %Y')
def __compareTime(self, date):
    """Return True iff the current local time matches an 'H:M' string.

    *date* is expected to look like '13:05'; only hour and minute are
    compared.
    """
    # Take a single snapshot of "now" — the original called datetime.now()
    # twice, so hour and minute could come from either side of a minute
    # rollover.
    now = datetime.now().time()
    sep = date.find(':')
    hour_param = int(date[:sep])
    minute_param = int(date[sep + 1:])
    # Return the comparison directly instead of if/else -> True/False.
    return now.hour == hour_param and now.minute == minute_param
def get_week_result(url, input_text="USD"):
    """Scrape a ForexFactory-style calendar page and classify each row.

    Returns a list of [index, currency_code, sentiment, date?] entries where
    sentiment is 'bullish' (actual value marked 'worse'), 'bearish'
    ('better') or 'neutral'. The date element is appended only when the row
    carries a date span.

    `input_text` is accepted for interface compatibility but is unused
    (the original only referenced it in a commented-out prompt).
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    final = []
    counter = 0
    for row in soup.find_all('tr', class_='calendar__row'):
        # Only rows that actually report an 'actual' value are of interest.
        if not row.find('td', class_='calendar__cell calendar__actual actual'):
            continue
        # The three original branches were identical except for this label —
        # collapsed into one classification step.
        actual_cell = row.find('td', class_='actual')
        if actual_cell.find('span', class_='worse'):
            sentiment = "bullish"
        elif actual_cell.find('span', class_='better'):
            sentiment = "bearish"
        else:
            sentiment = "neutral"
        currency_cell = row.find('td', class_='calendar__cell calendar__currency currency')
        if not currency_cell:
            continue
        counter += 1
        # Currency text is sliced [1:4] to drop a leading whitespace/marker char.
        entry = [counter, currency_cell.text[1:4], sentiment]
        date_span = row.find('td', class_='calendar__cell calendar__date date').find('span', class_='date')
        if date_span is not None:
            entry.append(date_span.find().text[:])
        final.append(entry)
    return final
def getData(file, cols):
    """Parse a charting-script data series out of an HTML file.

    Takes in an HTML DOM and returns a pandas dataframe: rows are extracted
    from the 6th <script> tag as '[new Date(...), v1, v2, ...]' literals.
    The 'Date' column is parsed to datetime; every other column becomes
    log(value) as float, with the literal "null" mapped to 0.0.
    """
    with open(file) as handle:
        soup = BeautifulSoup(handle, 'html.parser')
    script_text = str(soup.find_all('script')[5])

    # Walk the script text collecting every "[new Date..., ...]" row.
    # The scan past the final match appends one junk row, trimmed below.
    rows = []
    cursor = 0
    while cursor != -1:
        start = script_text.find("[new Date", cursor)
        stop = script_text.find("]", start)
        rows.append(script_text[start + 1:stop].split(","))
        cursor = stop
    rows = rows[:-1]

    # Strip the 'new Date(' wrapper so only the date literal remains.
    for pos, row in enumerate(rows):
        cleaned = row[0][:-1]          # drop the trailing ')'
        row[0] = cleaned[cleaned.find("(") + 1:]
        rows[pos] = row

    df = pd.DataFrame(rows, columns=cols)
    df['Date'] = df['Date'].apply(
        lambda x: datetime.strptime(x, '"%Y/%m/%d"'))
    # convert to float (log scale); "null" -> 0.0
    for num_col in cols[1:]:
        df[num_col] = df[num_col].apply(
            lambda x: float(0) if x == "null" else math.log(float(x)))
    return df
def extract_google(query_terms, startDate, endDate):
    """Crawl Google News for 'India Technology <query>' articles in a range.

    *startDate*/*endDate* are '%Y-%m-%d' strings; empty strings default to
    the last seven days. Returns a list of dicts with source, url, date,
    title, content and img.
    """
    # BUG FIX: the old defaults were produced in '%d/%m/%Y' and then
    # immediately re-parsed with '%Y-%m-%d', which always raised ValueError;
    # they were also reversed (start=today, end=a week ago).
    if len(startDate) == 0:
        startDate = (datetime.datetime.today().date() -
                     datetime.timedelta(days=7)).strftime('%Y-%m-%d')
    if len(endDate) == 0:
        endDate = datetime.datetime.today().strftime('%Y-%m-%d')
    # GoogleNews wants '%d/%m/%y'.
    startDate = datetime.datetime.strptime(startDate, '%Y-%m-%d').strftime('%d/%m/%y')
    endDate = datetime.datetime.strptime(endDate, '%Y-%m-%d').strftime('%d/%m/%y')
    final_articles = []
    print(startDate)
    print(endDate)
    print("Crawling Starting")
    # here extracting news from google news
    googlenews = GoogleNews()
    googlenews.setTimeRange(startDate, endDate)
    for query in query_terms:
        googlenews.clear()
        # forming the search term
        googlenews.search("India Technology " + query)
        result = googlenews.result()
        for n in range(len(result)):
            source = result[n]['media']
            url = result[n]['link']
            try:
                article = Article(url)
                article.download()
                article.parse()
            except Exception:
                print("Trouble downloading so skipping")
                continue
            content = article.text
            # summarize the content (summary is currently unused downstream;
            # kept so `sent_detector` failures surface as before)
            temp_content = re.sub(r'^\s*[\(\[].*?[\)\]]\s*', '', content)
            sentences = sent_detector.tokenize(temp_content)
            summary = (" ".join(sentences[:2]).strip())
            date = result[n]['date']
            if (date.find('ago') != -1):
                # NOTE(review): `current` is defined elsewhere — presumably
                # "now"; confirm against the full module.
                date = current.date()
            title = result[n]['title']
            img = result[n]['img']
            # adding the extracted info in final_articles list
            final_articles.append({
                'source': source,
                'url': url,
                'date': date,
                'title': title,
                'content': content,
                'img': img
            })
    return final_articles
def split_datetime_range(date):
    """Split *date* into a (start, end) pair.

    Accepts a 2-tuple/list, a 'start~end' string, or a single value
    (returned as both endpoints).
    """
    # BUG FIX: the original referenced the bare Python-2 `unicode` name,
    # which raises NameError on Python 3 for every plain-str argument.
    try:
        string_types = (str, unicode)
    except NameError:  # Python 3: str covers all text
        string_types = (str,)
    if isinstance(date, (tuple, list)):
        d0, d1 = date
    elif isinstance(date, string_types) and date.find('~') >= 0:
        d0, d1 = date.split('~')
    else:
        d0, d1 = date, date
    return d0, d1
def ru_months(date, simple=None):
    """Substitute the first Russian month name found in *date*.

    Each entry of the module-level `months` table is expected to hold the
    pattern at index 0, a short form at index 1 and a full form at index 2;
    the short form is used when *simple* is truthy. Returns '' when no
    pattern occurs in *date* (or the table entry is too short).
    """
    try:
        candidates = [
            date.replace(entry[0], (simple and entry[1] or entry[2]))
            for entry in months
            if date.find(entry[0]) != -1
        ]
        return candidates[0]
    except IndexError:
        return ''
def extract_date(date) -> object:
    """Convert a date like 'March 5, 2021' into 'M/D/YYYY'.

    Returns '' when the string has no comma, no space before the comma, or
    an unrecognized month name.
    """
    # BUG FIX: the original did `from datetime import date`, shadowing the
    # *date* parameter, so `date.find(...)` hit the class and raised
    # AttributeError on every call. Import under a different name instead.
    from datetime import date as date_cls
    today = date_cls.today()
    print("Today's date:", today)
    month_numbers = {
        'January': '1', 'February': '2', 'March': '3', 'April': '4',
        'May': '5', 'June': '6', 'July': '7', 'August': '8',
        'September': '9', 'October': '10', 'November': '11', 'December': '12',
    }
    if date.find(",") == -1:
        return ""
    mnth_day, year = date.split(',')
    if mnth_day.find(" ") == -1:
        return ""
    month, day = mnth_day.split(" ")
    number = month_numbers.get(month.strip())
    if number is None:
        return ""
    return number + "/" + day.strip() + "/" + year.strip()
def format_goodwill(listing):
    """Normalize a raw Goodwill listing in place and return it.

    * listing[0]: keep 21 chars into the text between the first '\\n'
      and the first '\\r'.
    * listing[4]: rewrite 'M/D/YYYY ...' as zero-padded 'YYYY-MM-DD'.
    * 'Buy It Now' rows (listing[9]) copy the price into slot 3 and mark
      slot 1 as 'ex'.
    """
    raw_title = listing[0]
    raw_title = raw_title[raw_title.find("\n"):]
    listing[0] = raw_title[21:raw_title.find("\r")]

    raw_date = listing[4]
    raw_date = raw_date[:raw_date.find(' ')]
    first_slash = raw_date.find('/')
    second_slash = raw_date[first_slash + 1:].find("/") + first_slash + 1
    month = raw_date[:first_slash]
    day = raw_date[first_slash + 1:second_slash]
    year = raw_date[second_slash + 1:]
    month = "0" + month if len(month) == 1 else month
    day = "0" + day if len(day) == 1 else day
    listing[4] = year + "-" + month + "-" + day

    if listing[9] == 'Buy It Now':
        listing[3] = listing[1]
        listing[1] = 'ex'
    return listing
def parse_datetime(date):
    """ parse string date to datetime type

    Strings containing the birthday separator are parsed directly; other
    strings and ints go through fix_cn_parse_year_error / dateparser.
    Returns None for falsy input or unsupported types.
    :param date:
    :return:
    """
    if not date:
        return None
    # BUG FIX: the original tested `date.find(birthday_sep)` as a boolean —
    # find() returns -1 (truthy!) when absent and 0 (falsy!) when the
    # separator is at the start, inverting the intended membership check.
    if isinstance(date, str) and birthday_sep in date:
        return dateparser.parse(date)
    elif isinstance(date, str):
        return fix_cn_parse_year_error(dateparser.parse(str(date)))
    elif isinstance(date, int):
        return dateparser.parse(str(date))
def CallFromSQL(x, y, date):
    """Fetch rows from the DailyTable of the DailyDB sqlite database.

    When *date* looks like a date (contains '-'), rows are filtered by
    Symbol == *x* and insertionDay == *date*; otherwise the whole table is
    returned. *y* is unused (kept for interface compatibility). Returns a
    pandas DataFrame with column names taken from the cursor description.
    """
    date = str(date)
    # SECURITY FIX: the query was built by string concatenation from the
    # caller-supplied symbol/date — use sqlite3 '?' placeholders instead.
    conn = sqlite3.connect("DailyDB")
    try:
        cursor = conn.cursor()
        if date.find("-") > -1:
            cursor.execute(
                "select * from DailyTable where Symbol = ? and insertionDay = ?;",
                (x, date),
            )
        else:
            cursor.execute('select * from DailyTable')
        titles = [column[0] for column in cursor.description]
        results = pandas.DataFrame(cursor.fetchall())
        if len(results) > 0:
            results.columns = titles
        cursor.close()
    finally:
        # The original leaked the connection; always release it.
        conn.close()
    return results
def getNews(url):
    # Fetch a news article page and parse its title, section and timestamp.
    # NOTE(review): this block appears truncated in the captured source —
    # the `try:` has no matching `except`/`finally` and nothing is returned;
    # recover the remainder from the original file.
    try:
        res = requests.get(url)
        soup = BeautifulSoup(res.text,"lxml")
        title = soup.title.string
        #print "Title: "+title
        # Article category comes from the <meta name="section"> tag.
        kinds = soup.find(attrs={"name":"section"})['content']
        #print "Kinds: "+kinds
        # Normalize a Chinese date like '2019年1月2日 12:30' to '2019-1-2 12:30'.
        date = soup.find("time").text.strip()
        date = date.replace(u"年","-")
        date = date.replace(u"月","-")
        date = date.replace(u"日","")
        # Pad the timestamp to full '%H:%M:%S' precision.
        # NOTE(review): the collapsed source makes the if/else nesting here
        # ambiguous; as written, pet-section ("寵物動物") dates get no time
        # component and would not match the strptime format below — confirm.
        if kinds != u"寵物動物":
            if date.find(':')!=-1:
                date+=":00"
            else:
                date+=" 00:00:00"
        # NOTE(review): Python-2 style — on Python 3 strptime rejects bytes.
        date = date.encode('utf-8')
        date = datetime.strptime(date,"%Y-%m-%d %H:%M:%S")
def get(self, request, id):
    """Render the detail page for one hike.

    Builds `this_hike` (landmarks, image, per-day entries, participants,
    vacancies) plus comments into the template context and renders
    "hike.html". Russian strings in the output are user-facing and kept
    as-is.
    """
    hike = Hike.objects.get(id=id)
    context = base_context(request, title=hike.name)
    this_hike = {}
    # Insert all the landmarks here
    this_hike['landmarks'] = list(Landmark.objects.filter(is_public=True))
    if hike.image.name is not None and hike.image.name != "":
        this_hike['image'] = hike.image
    else:
        this_hike['image'] = ''
    days = []
    # Russian month names in genitive case, used for display dates.
    months = [
        'января', 'февраля', 'марта', 'апреля', 'мая', 'июня', 'июля',
        'августа', 'сентября', 'октября', 'ноября', 'декабря'
    ]
    for day in Day.objects.filter(hike=hike).order_by('date'):
        # Day names look like "День N" — the ordinal is the second token.
        # NOTE(review): assumed from the commented-out 'День'/'Дни' line
        # below; confirm against the Day model.
        day_id = int(day.name.split()[1])
        if day_id == 1 or day.description != '' or day.caption != '':
            # A "described" day gets its own entry (inserted at the front).
            data = {}
            data['description'] = day.description
            data['header'] = day.caption
            data['date'] = str(
                day.date.day) + ' ' + months[day.date.month - 1]
            data['name'] = day.name
            data['id'] = str(day_id)
            days.insert(0, data)
        else:
            # An empty day extends the most recent entry into a
            # "start - end" range, replacing any previous range end.
            if days[0]['date'].find(' - ') != -1:
                days[0]['date'] = days[0]['date'][:days[0]['date'].
                                                 find(' - ')]
            if days[0]['name'].find(' - ') != -1:
                days[0]['name'] = days[0]['name'][:days[0]['name'].
                                                  find(' - ')]
            # days[0]['name'].replace('День', 'Дни')
            days[0]['date'] += ' - ' + str(
                day.date.day) + ' ' + months[day.date.month - 1]
            days[0]['name'] += ' - ' + str(day_id)
    days = sorted(days, key=lambda x: int(x['id']))
    for day in days:
        # If a range starts and ends in the same month, drop the first
        # (duplicated) month name: "1 мая - 3 мая" -> "1 - 3 мая".
        if day['date'].count(day['date'].split(' ')[-1]) > 1:
            date = day['date']
            month = day['date'].split(' ')[-1]
            day['date'] = date[:date.find(month) - 1] + date[date.find(month) + len(month):]
    this_hike['days'] = days
    participants = []
    usernames = []
    for participant in hike.participants.all():
        usernames.append(participant.username)
        # [full name, username, avatar, profile url]
        props = [full_name(participant), participant.username, '']
        if participant.profile.avatar.name != '':
            props[2] = participant.profile.avatar
        props.append("/account/" + participant.username)
        participants.append(props)
    this_hike['participants'] = participants
    this_hike['usernames'] = usernames
    if hike.limit_of_members - len(participants) > 0:
        this_hike['vacancies'] = hike.limit_of_members - len(participants)
        this_hike['free_plases'] = "Yup"
    else:
        this_hike['vacancies'] = 0
        this_hike['free_plases'] = "Nope"
    context['content'] = hike.__dict__
    context['content'].update(this_hike)
    context['content']['creator'] = hike.creator
    context['rus_date'] = beauty_date_interval(hike.start_date,
                                               hike.end_date, True, True)
    # Russian pluralization depends on the last digit of the count:
    # 1-4 -> 'места', otherwise 'мест'.
    if 0 < int(str(this_hike['vacancies'])[-1:]) < 5:
        context['number_of_free_places'] = str(
            this_hike['vacancies']) + ' места'
    else:
        context['number_of_free_places'] = str(
            this_hike['vacancies']) + ' мест'
    context['author_full_name'] = full_name(hike.creator)
    # Comments
    comments = []
    hike_comments_models = Message.objects.filter(
        hike=hike).order_by('creation_datetime')
    for ct_model in hike_comments_models:
        ct = {}
        ct['author'] = full_name(ct_model.author)
        ct['author_username'] = ct_model.author.username
        ct['comment'] = ct_model.text
        ct['avatar'] = ''
        if ct_model.author.profile.avatar.name != '':
            ct['avatar'] = ct_model.author.profile.avatar
        # Time plus Russian month name, e.g. "12:30, 05 мая".
        published_time = ct_model.creation_datetime.strftime(
            '%H:%M, %d ') + months[ct_model.creation_datetime.month - 1]
        ct['time_published'] = published_time
        comments.append(ct)
    context['comments'] = comments
    return render(request, "hike.html", context)
import time #Using datetime.strptime() datetemp = sys.argv[1] datetemp1 = datetime.strptime(datetemp,"%m/%d/%Y %H:%M:%S") ##datetemp2 = datetemp1.ctime() print datetemp1 myFile = open("logAfterAssignment1.txt","r") lateAuthor = [] submissionDate = [] count = 0 for date in myFile: if(count==0): indexofDate = date.find("Date:") if indexofDate != -1: ds = date.split() x = (str(ds[2])+' '+str(ds[3])+' '+str(ds[5])+' '+str(ds[4])) subDate = datetime.strptime(x,"%b %d %Y %H:%M:%S",) if subDate>datetemp1: submissionDate.append(date) count = 1 else: indexofAuthor = date.find("Author:") if indexofAuthor !=-1: lateAuthor.append(date) count = 0 myFile.close() mydic = dict(zip(lateAuthor, submissionDate))
def _get_time(self, commit):
    """Return the calendar date of *commit* as a datetime.date.

    commit["date"] is expected to be an ISO-8601 timestamp such as
    '2021-03-05T12:00:00'; only the date portion is used.
    """
    stamp = commit["date"]
    # partition() keeps the whole string when no 'T' is present — the old
    # find()-based slice returned -1 and silently chopped the last character
    # off plain 'YYYY-MM-DD' values, breaking the strptime below.
    day_part = stamp.partition("T")[0]
    return datetime.strptime(day_part, '%Y-%m-%d').date()
def parser():
    """Scrape the Spaceflight Now launch schedule into an iCalendar feed.

    Walks the raw page text by tag-marker indexes (DATETAG, MISSIONTAG,
    LAUNCHWINDOWTAG, LOCTAG, DESCTAG, ... are module-level constants) and
    returns the serialized calendar bytes from Calendar.to_ical().
    NOTE(review): Python-2 code (urllib2, str.decode on page bytes).
    """
    req = urllib2.Request('https://spaceflightnow.com/launch-schedule/')
    response = urllib2.urlopen(req)
    the_page = response.read()
    d = datetime.utcnow()
    h = HTMLParser()
    cal = Calendar()
    cal.add('version', 2.0)
    cal.add('prodid', '-//madkat//SpaceX feed//EN')
    # Get all DATETAG indexes
    date_group = [m.start() for m in re.finditer(DATETAG, the_page)]
    # For each date index in date_group, extract the other data
    for _idx in range(len(date_group)):
        date_idx = date_group[_idx]
        # Each event's data lives between its date marker and the next one.
        if _idx + 1 == len(date_group):
            block_end = len(the_page)
        else:
            block_end = date_group[_idx + 1]
        date_start_idx = date_idx + len(DATETAG)
        date_end_idx = the_page[date_start_idx:block_end].find(
            SPANENDTAG) + date_start_idx
        date = the_page[date_start_idx:date_end_idx]
        # Ranges like "Aug 1/2" keep only the part after the slash.
        if '/' in date:
            _idx = date.find('/')
            date = date[_idx + 1:]
        # Match either the short or the full month name to get month index
        # and the day text that follows it.
        found_month = False
        mth_idx = 0
        while not found_month and mth_idx < 12:
            if SH_MTH[mth_idx] in date:
                _idx = date.find(SH_MTH[mth_idx])
                day = date[_idx + len(SH_MTH[mth_idx]) + 1:]
                found_month = True
                break
            if FL_MTH[mth_idx] in date:
                _idx = date.find(FL_MTH[mth_idx])
                day = date[_idx + len(FL_MTH[mth_idx]) + 1:]
                found_month = True
                break
            mth_idx += 1
        # If I find a day, or month, start building datetime object
        # Otherwise, I just skip the event
        if found_month and day != '':
            event = Event()
            # Check if day has '/' in it
            year = d.year
            _idx = day.find('/')
            if _idx != -1:
                day = day[_idx + 1:]
            mth = mth_idx + 1
            # A month earlier than the current one is assumed to be next
            # year (the schedule only lists upcoming launches).
            if mth < d.month:
                year += 1
            # Get event title
            mission_start_idx = the_page[date_end_idx:block_end].find(
                MISSIONTAG) + len(MISSIONTAG) + date_end_idx
            mission_end_idx = the_page[mission_start_idx:block_end].find(
                SPANENDTAG) + mission_start_idx
            mission = the_page[mission_start_idx:mission_end_idx]
            mission = re.sub(r'[^\x00-\x7F]+', '-', mission)
            # Escape all sorts of weird characters
            mission = mission.decode("ascii", errors="ignore").encode()
            # Escape HTML characters & add summary
            event.add('summary', h.unescape(mission))
            # Get launch window
            launch_win_start_idx = the_page[mission_end_idx:block_end].find(
                LAUNCHWINDOWTAG) + len(LAUNCHWINDOWTAG) + mission_end_idx
            launch_win_end_idx = the_page[launch_win_start_idx:block_end].find(
                SPANSTARTTAG) + launch_win_start_idx
            launch_win_raw = the_page[launch_win_start_idx:launch_win_end_idx]
            is_gmt_idx = launch_win_raw.find(GMT)
            # If there is no launch window yet, just make it a 24hr event (all day equivalent?)
            if is_gmt_idx == -1:
                launch_win = "0000-2359"
            else:
                launch_win = re.search(LAUNCHREGEX,
                                       launch_win_raw[:is_gmt_idx]).group(0)
            # Parse launch window: "HHMM-HHMM" gives start and end, a bare
            # "HHMM" gets a one-hour default duration.
            if '-' in launch_win:
                # I have a launch window!
                ev_date = datetime(year, mth, int(day), int(launch_win[:2]),
                                   int(launch_win[2:4]), 0, 0, tzinfo=pytz.utc)
                ev_date_end = datetime(year, mth, int(day),
                                       int(launch_win[5:7]),
                                       int(launch_win[7:]), 0, 0,
                                       tzinfo=pytz.utc)
            else:
                ev_date = datetime(year, mth, int(day), int(launch_win[:2]),
                                   int(launch_win[2:4]), 0, 0, tzinfo=pytz.utc)
                ev_date_end = ev_date + timedelta(hours=1)
            event.add('dtstart', ev_date)
            event.add('dtend', ev_date_end)
            # Get event location
            loc_start_idx = the_page[launch_win_end_idx:block_end].find(
                LOCTAG) + len(LOCTAG) + launch_win_end_idx
            loc_end_idx = the_page[loc_start_idx:block_end].find(
                DIVENDTAG) + loc_start_idx
            location = the_page[loc_start_idx:loc_end_idx]
            event.add('location', location)
            # Get event description
            desc_start_idx = the_page[launch_win_end_idx:block_end].find(
                DESCTAG) + launch_win_end_idx + len(DESCTAG)
            desc_end_idx = the_page[desc_start_idx:block_end].find(
                UPDATETAG) + desc_start_idx
            desc = the_page[desc_start_idx:desc_end_idx].decode(
                "ascii", errors="ignore").encode()
            desc_filtered = h.unescape(desc)
            # If it didn't have a launch window, write a comment in description
            if launch_win == "0000-2359":
                desc_filtered = "Launch window currently unavailable. Please check at a later time. " + desc_filtered
            event.add('description', desc_filtered)
            # Add event to calendar
            cal.add_component(event)
    # Return calendar
    return cal.to_ical()
def wiki_event_extract_2003(month,year,dir):
    #extractor for 2003.1~2006.4 events
    # Parses a saved Wikipedia current-events page named '<month>_<year>' in
    # *dir* and returns (date, weekday, type, title, text, anchors, urls)
    # tuples. NOTE(review): Python-2 code (print statements); the nesting
    # below is reconstructed from a whitespace-collapsed source — confirm
    # against the original file.
    events = []
    print month,year
    sp = BeautifulSoup(open(dir+month+'_'+year))
    # Date headlines are the mw-headline spans that mention the year.
    spans = sp.find_all('span','mw-headline')
    span_dates = [s for s in spans if s.text.find(year)>=0]
    for span_date in span_dates:
        date = span_date.text
        # Pull the weekday name out of the headline text, if present.
        day = ''
        for d in ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']:
            if date.find(d)>=0:
                day = d
                break
        type = ''
        # All events for this date live in the next <ul>'s <li> items.
        lis = span_date.find_next('ul')('li')
        parent = None
        for li in lis:
            title,txt,anchs,urls='','',[],[]
            lis2 = li('li')
            if len(lis2)>0:
                # A nested list: the outer item is a titled group and each
                # inner item is one event.
                parent = li
                try:
                    if li.contents[1].startswith(':'):
                        title = li.a.text
                except:
                    print 'Type err'
                for li2 in lis2:
                    txt = li2.text
                    txt = txt[:txt.rfind('.')]  # drop the trailing sentence period
                    # Internal wiki links as "text=>title" pairs.
                    anchs = '||'.join(link.text+'=>'+link['title'] for link in li2("a") if link['href'].startswith('/wiki/'))
                    # External source links; fall back to footnote references,
                    # then to bare autonumbered links.
                    urls = '||'.join(lk['href'] for lk in li2("a","external text"))
                    if len(urls)==0:
                        sups = li('sup','reference')
                        refs = [sp.find(id=lk.a['href'][1:]) for lk in sups]
                        if len(refs)>0:
                            hrefs = [ref('a','external free')[0]['href'] for ref in refs if len(ref('a','external free'))>0]
                            if len(hrefs)==0:
                                hrefs = [ref('a','external text')[0]['href'] for ref in refs if len(ref('a','external text'))>0]
                            urls = '||'.join(hrefs)
                        if len(urls)==0:
                            urls = '||'.join(lk['href'] for lk in li2('a','external autonumber'))
                    events.append((date,day,type,title,txt,anchs,urls))
            else:
                # A flat item; skip it when it is a child of a group already
                # handled through the nested branch above.
                if li.find_parent().find_parent()==parent:
                    continue
                txt = li.text
                txt = txt[:txt.rfind('.')]
                anchs = '||'.join(link.text+'=>'+link['title'] for link in li("a") if link['href'].startswith('/wiki/'))
                urls = '||'.join(lk['href'] for lk in li("a","external text"))
                if len(urls)==0:
                    sups = li('sup','reference')
                    refs = [sp.find(id=lk.a['href'][1:]) for lk in sups]
                    if len(refs)>0:
                        hrefs = [ref('a','external free')[0]['href'] for ref in refs if len(ref('a','external free'))>0]
                        if len(hrefs)==0:
                            hrefs = [ref('a','external text')[0]['href'] for ref in refs if len(ref('a','external text'))>0]
                        urls = '||'.join(hrefs)
                    if len(urls)==0:
                        urls = '||'.join(lk['href'] for lk in li('a','external autonumber'))
                events.append((date,day,type,title,txt,anchs,urls))
    return events
def filter(filtered_listing, unfiltered_listing):
    """Copy the countdown date out of a raw listing into the filtered one.

    Extracts the text between "data-countdown='" and the closing quote of
    slot 4 of *unfiltered_listing*, stores it in slot 4 of
    *filtered_listing*, and returns *filtered_listing*.
    (Note: this name shadows the builtin `filter`; kept for compatibility
    with existing callers.)
    """
    raw = str(unfiltered_listing[4])
    # 16 == len("data-countdown") + len("='"), skipping to the value itself.
    raw = raw[raw.find('data-countdown') + 16:]
    filtered_listing[4] = raw[:raw.find('>') - 1]
    return filtered_listing
Boroughs = []
# Pull the neighborhood ("hood") label off each search result, keeping the
# list aligned with Results by appending None when it is absent.
for hood in Results:
    hoods = hood.find(class_='result-hood')
    if hoods is not None:
        boro = hoods.get_text()
        # The hood text looks like "(Brooklyn)"; keep only the inner part.
        correct = re.search(r'\((.*?)\)', boro).group(1)
        Boroughs.append(correct)
    else:
        boro = None
        Boroughs.append(boro)
PostDates = []
# Each result exposes its posting timestamp in the 'datetime' attribute.
for date in Results:
    dates = date.find(class_='result-date')['datetime']
    PostDates.append(dates)
Description = []
specs = []
# Fetch each listing page and collect its body text.
for link in Links:
    listing = requests.get(str(link))
    Listing_soup = BeautifulSoup(listing.content, features="lxml")
    Desc = Listing_soup.find('section', {'id': 'postingbody'})
    if Desc is not None:
        stuff = Desc.get_text()
        Description.append(stuff)
    else:
        # NOTE(review): the captured source is truncated here — this `else`
        # branch has no body; recover it from the original file (it most
        # likely appends a placeholder to keep Description aligned).
def format_goodwill_listing(formatted_item, original_item):
    """Copy the countdown date from a raw listing into the formatted one.

    Slot 4 of *original_item* holds markup containing
    "data-countdown='<value>'"; the value is extracted and written into
    slot 4 of *formatted_item*, which is then returned.
    """
    markup = str(original_item[4])
    # Jump past "data-countdown='" (14 chars of attribute name plus "='").
    value_start = markup.find('data-countdown') + 16
    tail = markup[value_start:]
    formatted_item[4] = tail[:tail.find('>') - 1]
    return formatted_item