def scrape_ads_to_bucket(self, destination_bucket_dir, ad_list_url, count=None,
                         min_word_count=None, max_word_count=None, date=None):
    # Ensure arguments are typed correctly
    count = int(count) if count else None
    min_word_count = int(min_word_count) if min_word_count else None
    max_word_count = int(max_word_count) if max_word_count else None
    if date and isinstance(date, str):
        date = convert_to_date(date)

    # TODO: Add abstraction via a generator function to allow counts > one page of ads

    # Grab the HTML source code
    result_page = requests.get(ad_list_url)
    result_soup = BeautifulSoup(result_page.text, 'html.parser')

    # Scrape all ad URLs from the ad list page
    ad_urls = []
    for ad_element in result_soup.find_all('li', {'class': 'result-row'}):
        # Read the posting time so we can filter by date
        datetime_string = ad_element.find('time')['datetime']
        datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%d %H:%M")
        url = ad_element.find('a')['href']

        # If a date filter is set, skip any non-matching dates
        if date and datetime_obj.date() != date.date():
            continue

        # Skip "nearby" results, i.e. ads that are not in the requested location
        if ad_element.find('span', {'class': 'nearby'}):
            continue

        ad_urls.append(url)

    # Scrape the filtered URLs
    successful_uploads = 0
    bucket_paths = []
    for url in ad_urls:
        try:
            bucket_path = self.scrape_ad_to_bucket(
                destination_bucket_dir=destination_bucket_dir,
                ad_url=url,
                min_word_count=min_word_count,
                max_word_count=max_word_count
            )
            bucket_paths.append(bucket_path)
            successful_uploads += 1
            if count and successful_uploads >= count:
                break
        except Exception as e:
            print(e)

    return bucket_paths
def insert_gapfill(_dataset, classroom_id, student_id, abs_filename, update=False, debug=False):
    '''Insert gapfill records into db.records.'''
    status = True
    msg = ""
    try:
        dataset = read_json(abs_filename)
    except json.decoder.JSONDecodeError:
        msg = "[insert_gapfill] {} is corrupted. JSON Decode Error.".format(abs_filename)
        logger.critical(msg)
        return (False, msg, 0)

    nb_values = 0
    records = []
    db = connect()
    student = db.students.find_one({"student": student_id})
    try:
        group = student["group"]
    except KeyError:
        group = None

    for chap, v in dataset.items():
        if len(v["records"]) > 0:
            for record in v["records"]:
                record.update({
                    "classroom": classroom_id,
                    "student": student_id,
                    "group": group,
                    "word_nb": int(chap),
                    "chapter": int(chap),
                    "subject": "letters",
                    "tag": record["target"],
                    "word": record["target"],
                    "value": record["target"],
                    "dataset": _dataset["dataset"],
                    "lesson": None
                })
                record["day"] = convert_to_date(record["unixTime"])
                record["unixTime"] = convert_to_isodate(record["unixTime"])
                record["game"] = str(record["minigameId"])
                record["isClicked"] = int(record["isClicked"] == True)
                record["elapsedTime"] = float(record["elapsedTime"])
                del record["minigameId"]
                del record["targetId"]
                records.append(record)
                nb_values += 1

    # Only call insert_many when there is something to insert; an empty list would raise
    if records:
        try:
            db.records.insert_many(records)
        except Exception as e:
            status = False
            msg = "[insert_gapfill] failed to insert records for student {}. Error: {}".format(student_id, e)

    if nb_values == 0:
        status = False
        msg = "[insert_gapfill] No records found for {}".format(student_id)

    return (status, msg, nb_values)
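# The insert_* functions here all feed record["unixTime"] through
# `convert_to_date` and `convert_to_isodate`, neither of which is shown.
# A minimal sketch of what they might do, offered purely as an assumption
# (it presumes `unixTime` is a unix timestamp in seconds and that both helpers
# return strings that MongoDB can store); the real helpers may differ:
from datetime import datetime, timezone

def convert_to_date(unix_time):
    # Day-level date string derived from a unix timestamp, e.g. "2019-12-25".
    return datetime.fromtimestamp(float(unix_time), tz=timezone.utc).strftime("%Y-%m-%d")

def convert_to_isodate(unix_time):
    # Full ISO-8601 timestamp derived from the same unix timestamp.
    return datetime.fromtimestamp(float(unix_time), tz=timezone.utc).isoformat()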
def put(self):
    data = parser.parse_args()
    try:
        db_note = Note.query.filter_by(title=data['title']).first()
        db_note.description = data['description']
        db_note.is_markdown = data['isMarkdownFile']
        date = convert_to_date(data['date'])
        # Keep the existing date when no date was supplied
        db_note.date = date if date != '' else db_note.date
        db_note.categories = self._get_categories(data['categories'])
        db.session.commit()
        return db_note.json()
    except (ValueError, DatabaseError) as e:
        db.session.rollback()
        return abort(400, message=f"Your request contains wrong data! {e}")
def post(self):
    data = parser.parse_args()
    try:
        date = convert_to_date(data['date'])
        if date != '':
            note = Note(title=data['title'],
                        description=data['description'],
                        is_markdown=data['isMarkdownFile'],
                        date=date)
        else:
            note = Note(title=data['title'],
                        description=data['description'],
                        is_markdown=data['isMarkdownFile'])
        note.categories = self._get_categories(data['categories'])
        note.save()
        return note.json()
    except (ValueError, DatabaseError) as e:
        db.session.rollback()
        return abort(400, message=f"Your request contains wrong data! {e}")
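# The two Note handlers above rely on a module-level `parser` and a
# `_get_categories` helper that are not shown. As a hypothetical sketch only
# (field names taken from the handlers, everything else assumed), the
# Flask-RESTful request parser could be declared like this:
from flask_restful import inputs, reqparse

parser = reqparse.RequestParser()
parser.add_argument('title', type=str, required=True)
parser.add_argument('description', type=str, default='')
parser.add_argument('isMarkdownFile', type=inputs.boolean, default=False)
parser.add_argument('date', type=str, default='')
parser.add_argument('categories', type=str, action='append', default=[])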
def __init__(self, page: int = 1, page_size: int = 5, category: str = "",
             start_date: str = "", end_date: str = ""):
    self.page = page
    self.page_size = page_size
    self.category = category
    self.start_date = convert_to_date(start_date)
    self.end_date = convert_to_date(end_date)
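# In the Note handlers and this query-parameter class, `convert_to_date` is
# given a date string and its result is compared against '' when no date was
# supplied -- a different usage than the unix-timestamp conversion sketched
# earlier, so the two are presumably separate implementations sharing a name.
# A minimal sketch consistent with the string usage, purely as an assumption
# (the "%Y-%m-%d" format is a guess; the real helper may accept other formats
# or return a different type):
from datetime import datetime

def convert_to_date(date_string, fmt="%Y-%m-%d"):
    # Return '' unchanged when no date was supplied, otherwise parse it.
    if not date_string:
        return ''
    return datetime.strptime(date_string, fmt).date()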
def poem_stitcher(cities=None, urls=None, dont_post_if_runtime_under=None,
                  min_length=None, max_length=None, date=None, image_flavor=None,
                  all_of_day=False, no_youtube_upload=False, voice=None,
                  speaking_rate=None, pitch=None, upload_to_bucket_path=None):
    # Toss invalid combinations of args
    if all_of_day and (not date or not cities):
        raise BadOptionsError(
            'Must specify DATE and CITIES when using ALL_OF_DAY flag')

    # Ensure correct typing
    dont_post_if_runtime_under = float(
        dont_post_if_runtime_under) if dont_post_if_runtime_under else None
    date = convert_to_date(date) if isinstance(date, str) else date

    # TODO: Hash args to create the dir
    destination_bucket_dir = f'{cities[0].replace(" ", "").lower()}-{str(date.date())}'
    print(cities, date)

    ##############
    # SCRAPE ADS #
    ##############

    # Form request list for scrapers
    scraper_request_list = []

    # NOTE: Handled cases: (all-of-day, date, city)
    if all_of_day:
        # Scrape each city for ads from DATE
        for city in cities:
            scraper_request_list.append({
                'method': 'POST',
                'url': CRAIGSLIST_SCRAPER_ENDPOINT,
                'json': {
                    'destination_bucket_dir': destination_bucket_dir,
                    'city': city.replace(' ', '').lower(),
                    'date': f'{date.month}-{date.day}-{date.year}',
                    # TODO: Attach min word count and such here
                }
            })
    print(scraper_request_list)

    # Send requests from the list concurrently
    responses = handle_requests(scraper_request_list)
    print(responses)

    # Capture all scraped ad bucket paths
    ad_bucket_paths = []
    for response in responses:
        # Skip TimeoutErrors from workers that responded too slowly;
        # each worker returns a Python list literal of bucket paths
        try:
            ad_bucket_paths += eval(response.decode('utf-8'))
        except Exception:
            print(response)

    if ad_bucket_paths == []:
        raise ValueError('No ads collected')

    ##################
    # GENERATE POEMS #
    ##################

    # Request a poem for each scraped ad's blob
    maker_request_list = []
    if all_of_day:
        for ad_bucket_path in ad_bucket_paths:
            maker_request_list.append({
                'method': 'POST',
                'url': POEM_MAKER_ENDPOINT,
                'json': {
                    'bucket_path': ad_bucket_path,
                    'destination_bucket_dir': destination_bucket_dir,
                    'image_flavor': image_flavor,
                    'voice': voice,
                    'pitch': pitch,
                    'speaking_rate': speaking_rate
                }
            })
    else:
        print('Not yet handling cases other than --all-of-day. Exiting...')
        exit()

    responses = handle_requests(maker_request_list)
    print(responses)

    # Capture all video bucket paths
    video_bucket_paths = []
    for response in responses:
        # Skip TimeoutErrors and error strings from workers
        try:
            response_string = response.decode('utf-8')
            if response_string != '' and 'Exception' not in response_string and 'Rate' not in response_string:
                video_bucket_paths.append(response_string)
        except Exception:
            print(response)
    print(video_bucket_paths)

    if video_bucket_paths == []:
        raise ValueError('No poems successfully made')

    # Grab the blobs
    video_blobs = [
        get_blob('craig-the-poet', bucket_path)
        for bucket_path in video_bucket_paths
    ]

    #########
    # ORDER #
    #########

    # Order the blobs by time of post
    def to_datetime(b):
        # e.g. 2019-12-25T21:27:07-0600
        datetime_string = b.metadata['ad-posted-time']
        date_string, time_string = datetime_string.split('T')
        year, month, day = date_string.split('-')
        hour, minute, second = time_string.split('-')[0].split(':')
        return datetime.strptime(
            f'{year}-{month}-{day}-{hour}-{minute}-{second}',
            "%Y-%m-%d-%H-%M-%S")

    video_blobs = sorted(video_blobs, key=to_datetime)

    ############
    # VALIDATE #
    ############

    # Check the current sum of poem run time
    total_runtime = sum(
        float(blob.metadata['runtime']) for blob in video_blobs)
    if dont_post_if_runtime_under and total_runtime < dont_post_if_runtime_under:
        raise Exception('Minimum runtime length not met. Exiting...')

    ################
    # CONCAT POEMS #
    ################

    # Download all poems that we've selected
    makedir('poems')
    local_poem_filepaths = []
    for i, blob in enumerate(video_blobs):
        local_poem_filepath = f'poems/poem-{i}.mp4'
        blob.download_to_filename(local_poem_filepath)
        local_poem_filepaths.append(local_poem_filepath)

    # Concat all the poems into one video
    concat_videos(local_poem_filepaths, 'out.mp4', **FFMPEG_CONFIG)
    print('Concatenation complete')

    if upload_to_bucket_path:
        upload_file_to_bucket('craig-the-poet', 'out.mp4', upload_to_bucket_path)

    #####################
    # UPLOAD TO YOUTUBE #
    #####################

    # Get information for the description
    video_info_list = []
    current_runtime = 0.
    for i, blob in enumerate(video_blobs):
        runtime = float(blob.metadata['runtime'])
        video_info_list.append({
            'title': blob.metadata['ad-title'],
            'start-time': current_runtime
        })
        current_runtime += runtime

    title = f'Missed Connections {cities[0]} {date.month}-{date.day}-{date.year}'

    def float_to_youtube_time(s):
        minutes = str(int(s) // 60)
        seconds = str(int(s) % 60)
        return f'{minutes}:{"0" + seconds if len(seconds) != 2 else seconds}'

    description = ('My name is Craigothy, and I am a poet. '
                   'These works are all inspired by true events.\n')
    for info in video_info_list:
        description += f'\n{float_to_youtube_time(info["start-time"])} -- {info["title"]}'
    description += '\n\nAll rights reserved for pleasure.'

    keywords = 'love,craigslist,poetry,humanity,craig'

    args = {
        # Default args for google-api-client namespace objects
        'auth_host_name': 'localhost',
        'auth_host_port': [8080, 8090],
        'category': '22',
        'logging_level': 'ERROR',
        'noauth_local_webserver': True,  # Default was False
        'privacyStatus': 'public',
        'file': 'out.mp4',
        'title': title,
        'description': description,
        'keywords': keywords
    }
    print(args)

    if no_youtube_upload:
        return './out.mp4'

    print('Uploading to YouTube...')
    upload_youtube_video(args)

    return 'Success'
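# A hypothetical invocation of the pipeline above, with made-up argument values
# for illustration (the date-string format is whatever convert_to_date accepts
# in this project; endpoints and bucket names come from the deployment):
poem_stitcher(cities=['Austin'],
              date='2019-12-25',
              all_of_day=True,
              dont_post_if_runtime_under=60,
              image_flavor='vintage',
              no_youtube_upload=True)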
def insert_gp(_dataset, classroom_id, student_id, abs_filename, update=False, debug=False):
    '''Insert GP records into Records.'''
    db = connect()
    status = True
    msg = ""
    try:
        dataset = read_json(abs_filename)
    except json.decoder.JSONDecodeError:
        status = False
        msg = "{} is corrupted. End file missing. Skipping insertion".format(abs_filename)
        return (status, msg, 0)

    games = set()
    student = db.students.find_one({"student": student_id})
    try:
        group = student["group"]
    except KeyError:
        status = False
        msg = '[insert_gp] student {} has no group'.format(student_id)
        group = None

    values = 0
    records = []
    for key, v in dataset.items():
        if len(v["records"]) > 0:
            step = db.path.find_one({"visualaudio": key})
            # If no step is found, send to report and call Cassandra
            if step is None:
                msg = "No chapter or lesson found in db.path with visualaudio=`{}`".format(key)
                if "." not in key:
                    step = db.path.find_one({"visual": key.split("-")[0]})
                    if step is None:
                        status = False
                        msg = "No chapter or lesson found in db.path with visual=`{}`".format(key.split("-")[0])
                        step = {"chapter": -1, "lesson": -1, "tag": key, "CV": ""}
                else:
                    # Syllable keys (containing "."): fall back to a placeholder step
                    step = {"chapter": -1, "lesson": -1, "tag": key, "CV": "NA"}

            game_ids = list(set(n["minigameId"] for n in v["records"]))
            games.update(game_ids)

            for record in v["records"]:
                try:
                    record["stimulus_tag"] = GP_TAGS[record["stimulus"]]
                except KeyError:
                    status = False
                    msg = '[insert_gp] No tag found for stimulus {}'.format(record["stimulus"])
                    record["stimulus_tag"] = None
                record["classroom"] = classroom_id
                record["student"] = student_id
                record["group"] = group
                record["chapter"] = int(step["chapter"])
                record["lesson"] = int(step["lesson"])
                record["tag"] = step["tag"]
                record["target_tag"] = step["tag"]
                record["value"] = key
                record["dataset"] = "gp"
                record["subject"] = "letters"
                record["CV"] = step["CV"]
                record["day"] = convert_to_date(record["unixTime"])
                record["unixTime"] = convert_to_isodate(record["unixTime"])
                record["isClicked"] = int(record["isClicked"] == True)
                record["game"] = str(record["minigameId"])
                del record["minigameId"]
                values += 1
                records.append(record)

    # Only call insert_many when there is something to insert; an empty list would raise
    if records:
        try:
            db.records.insert_many(records)
        except Exception as e:
            status = False
            msg = '[insert_gp] failed to insert {} records for student {}. Error: {}'.format(values, student_id, e)

    if values == 0:
        status = False
        msg = "[insert_gp] has no records for student {}".format(student_id)

    return (status, msg, values)
def insert_assessments(_dataset, classroom_id, student_id, abs_filename, update=False, debug=False):
    '''Insert assessment records.'''
    msg = ""
    records = []
    nb_values = 0
    db = connect()
    status = True

    student = db.students.find_one({"student": student_id})
    if student is None:
        status = False
        msg = "[insert_assessments] student not found {}".format(student_id)
    else:
        try:
            group = student["group"]
        except KeyError:
            group = None

        with open(abs_filename, "r") as f:
            try:
                for row in f.readlines():
                    line = json.loads(row)
                    if len(line["records"]) > 0:
                        for record in line["records"]:
                            # Copy the per-line metadata onto each record
                            record.update({k: v for k, v in line.items() if k != "records"})
                            record["tag"] = record["value"]
                            record["chapter"] = int(record["chapterId"])
                            record["lesson"] = None
                            del record["chapterId"]
                            record.update({
                                "classroom": classroom_id,
                                "student": student_id,
                                "group": group,
                                "dataset": _dataset["dataset"],
                                "subject": _dataset["subject"],
                                "game": "fish"
                            })
                            if _dataset["subject"] == "letters":
                                record["word"] = record["value"]
                            record["day"] = convert_to_date(record["unixTime"])
                            record["unixTime"] = convert_to_isodate(record["unixTime"])
                            record["assessmentEndTime"] = convert_to_isodate(record["assessmentEndTime"])
                            records.append(record)
                            nb_values += 1

                # Only insert when at least one record was collected
                if nb_values == 0:
                    status = False
                    msg = "[insert_assessments] No records found for {}".format(abs_filename)
                else:
                    db.records.insert_many(records)
            except json.decoder.JSONDecodeError:
                msg = "{} is corrupted. JSON Decode Error".format(abs_filename)
                return (False, msg, 0)

    return (status, msg, nb_values)
def insert_nb(_dataset, classroom_id, student_id, abs_filename, update=False, debug=False):
    '''Insert numbers records into Records.'''
    try:
        dataset = read_json(abs_filename)
    except json.decoder.JSONDecodeError:
        msg = "[insert_nb] File {} is corrupted. JSON Decode Error. Skipping insertion".format(abs_filename)
        return (False, msg, 0)

    db = connect()
    status = True
    msg = ""
    values = 0
    nb_records = []
    student = db.students.find_one({"student": student_id})
    try:
        group = student["group"]
    except KeyError:
        group = None

    for nb, v in dataset.items():
        records = flatten_records(v)
        if len(records) > 0:
            try:
                step = db.path.find_one({"visual": int(nb)})
            except ValueError:
                step = None
            # If no step is found, send to report and call Cassandra
            if step is None:
                msg = "[insert_nb] No chapter or lesson found in db.path with visual=`{}`".format(nb)
                step = db.path.find_one({"visual": nb.split("-")[0]})
                if step is None:
                    msg = "[insert_nb] No chapter or lesson found in db.path with visual=`{}`".format(nb.split("-")[0])
                    step = {"chapter": -1, "lesson": -1, "tag": int(nb)}

            for record in records:
                if record["stimulus"] == "":
                    record["stimulus"] = None
                    record["stimulus_tag"] = None
                else:
                    record["stimulus"] = int(record["stimulus"])
                    record["stimulus_tag"] = NB_TAGS[int(record["stimulus"])]
                record["tag"] = int(nb)
                record["value"] = int(nb)
                record["classroom"] = classroom_id
                record["student"] = student_id
                record["group"] = group
                record["target"] = int(nb)
                record["target_tag"] = step["tag"]
                record["chapter"] = step["chapter"]
                record["lesson"] = step["lesson"]
                record["dataset"] = "numbers"
                record["subject"] = "numbers"
                record["isClicked"] = int(record["isClicked"] == True)
                record["day"] = convert_to_date(record["unixTime"])
                record["unixTime"] = convert_to_isodate(record["unixTime"])
                record["game"] = str(record["minigameId"])
                record["CV"] = "N"
                del record["minigameId"]
                nb_records.append(record)
                values += 1

    if len(nb_records) > 0 and values > 0:
        db.records.insert_many(nb_records)
    else:
        status, msg = False, "[insert_nb] No records inserted in dataset numbers for student {}".format(student_id)

    return status, msg, values