def extract_info_from_html(html_content: str):
    soup = BeautifulSoup(html_content, "html.parser")

    # collect the speech paragraphs and split them into sentences
    speech_content_div_tag = soup.find("div", class_="field-docs-content")
    paragraphs = []
    for p_tag in speech_content_div_tag.contents:
        if isinstance(p_tag, Tag):
            _string = p_tag.string
            if _string:
                paragraphs.append(_string.strip())
    full_text = " ".join(paragraphs)
    sentences = break_text_into_sentences(full_text)

    # extract the president's name and ordinal number
    president_name_div_tag = soup.find("div", class_="field-title")
    president_name = None
    for i in president_name_div_tag.contents:
        if isinstance(i, Tag):
            president_name = i.contents[0].text
    span_tag = soup.find("span", class_="presidential-ordinal-number")
    president_seq = span_tag.text

    # extract the speech year (last 4 characters of the displayed date)
    speech_year_span_tag = soup.find("span", class_="date-display-single")
    speech_year = speech_year_span_tag.text.strip()
    speech_year = speech_year[-4:]

    logger.info(f"{president_seq} President of the United States: {president_name}")
    return speech_year, president_name, sentences

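# `break_text_into_sentences` is referenced above but not defined in this
# snippet. A minimal sketch, assuming a simple regex-based splitter is enough
# for these speeches (no abbreviation handling):
import re


def break_text_into_sentences(text: str) -> List[str]:
    # split after '.', '!' or '?' followed by whitespace
    sentences = re.split(r"(?<=[.!?])\s+", text)
    return [s.strip() for s in sentences if s.strip()]
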
def get_video_metadata(video_path: str) -> dict:
    # check if the file exists
    video_path = Path(video_path)
    if not video_path.is_file():
        logger.error(f"Invalid video_path: `{video_path}` does not exist.")
        raise Exception("Invalid video_path: file does not exist.")

    # check if it is a video file
    known_video_formats = (".mp4", ".flv", ".mov", ".avi", ".wmv", ".mkv")
    video_path_obs = video_path.resolve()
    head, tail = os.path.split(video_path_obs)
    name, ext = os.path.splitext(tail)
    if ext not in known_video_formats:
        logger.warning(f"Invalid video_path: `{tail}` is not a known video format.")
        raise Exception(f"Invalid video_path: `{tail}` is not a known video format.")

    # probe the first video stream for width, height, frame rate and duration
    command_template = (
        "ffprobe -v error -select_streams v:0 "
        "-show_entries stream=width,height,avg_frame_rate,duration -of json"
    )
    args = shlex.split(command_template)
    args.append(str(video_path))
    proc = subprocess.Popen(args, stdout=subprocess.PIPE)
    out: bytes = proc.communicate()[0]
    json_string: str = out.decode("utf-8").strip()
    # logger.debug(json_string)
    json_obj: dict = json.loads(json_string)

    streams: list = json_obj.get("streams", [])
    if len(streams) == 0:
        raise Exception(f"No video stream found at {video_path}")
    _data: dict = streams[0]
    if len(streams) > 1:
        logger.info(f"More than one stream is found at {video_path}")

    width: int = _data.get("width")
    height: int = _data.get("height")
    ratio = width / height
    # avg_frame_rate comes back as a fraction string such as "30000/1001";
    # parse it with `from fractions import Fraction` instead of eval()
    avg_frame_rate: str = _data.get("avg_frame_rate")
    frame_rate: int = round(Fraction(avg_frame_rate)) if avg_frame_rate else None
    duration: float = round(float(_data.get("duration")), 2)

    video_metadata: dict = {
        "filepath": str(video_path_obs),
        "filename": name,
        "ext": ext,
        "width": width,
        "height": height,
        "ratio": ratio,  # width / height
        "duration": duration,  # in number of seconds
        "fps": frame_rate,  # frames per second
        "avg_frame_rate": avg_frame_rate,
    }
    # logger.debug(json.dumps(video_metadata, indent=4))
    return video_metadata

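# Usage sketch: probe a local file and pretty-print the result.
# ("sample_clip.mp4" is a hypothetical filename; ffprobe must be on PATH.)
metadata = get_video_metadata("sample_clip.mp4")
print(json.dumps(metadata, indent=4))
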
def bump_version(filepath: Path) -> Path:
    head, tail = os.path.split(filepath)
    name, ext = os.path.splitext(tail)
    suffix = 1
    # append an increasing `_v{n}` suffix until the path no longer collides
    while filepath.is_file():
        logger.info(f"{filepath} already exists")
        suffix += 1
        new_name = f"{name}_v{suffix}{ext}"
        filepath = Path(head) / new_name
    return filepath

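# Usage sketch: if "report.txt" already exists, this yields "report_v2.txt"
# (then "report_v3.txt", and so on). The filename is hypothetical.
export_path = bump_version(Path("report.txt"))
print(export_path)
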
def main(): """Start the bot""" logger.info("Getting bot_token from environment") bot_token = os.environ.get("BOT_TOKEN", None) if bot_token == "REPLACE_ME" or bot_token is None: logger.info("Getting bot_token from src/config.conf") config_fp = curr_folder / "config.conf" if not config_fp.is_file(): logger.error("bot_token not found: No Config File is Found.") return with config_fp.open() as f: config = json.load(f) bot_token = config.get("bot_token", None) if bot_token == "REPLACE_ME" or bot_token is None: logger.error( "bot_token not found: Failed getting bot token from environment and 'config.conf'" ) return load_stats() # Create the Updater and pass it your bot's token. # Make sure to set use_context=True to use the new context based callbacks # Post version 12 this will no longer be necessary updater = Updater(bot_token, use_context=True) # Get the dispatcher to register handlers dispatcher = updater.dispatcher # Command Handlers dispatcher.add_handler(CommandHandler("start", start)) dispatcher.add_handler(CommandHandler("help", help)) dispatcher.add_handler(CommandHandler("stats", stats)) dispatcher.add_handler(CommandHandler("friends", friends)) dispatcher.add_handler(CommandHandler("source", source)) # Message Handlers dispatcher.add_handler(MessageHandler(Filters.text, echo)) dispatcher.add_handler(MessageHandler(Filters.document, ics)) # log all errors dispatcher.add_error_handler(error) # Start the Bot updater.start_polling() updater.idle()
def main():
    with open("links.txt") as f:
        links = f.readlines()

    for link in links:
        page_url = link.strip()
        r = requests.get(page_url)
        if r.status_code == 200:
            logger.info(page_url)
            html_content = r.text
            speech_year, president_name, sentences = extract_info_from_html(html_content)
            logger.info(speech_year)
            logger.info(president_name)
            export_path = Path("Inaugural_Addresses") / f"{speech_year} {president_name}.txt"
            with export_path.open("w") as f:
                f.write("\n".join(sentences))
            logger.info(f"Exported: {export_path}")
        else:
            logger.error(page_url)
            logger.error("Unexpected response: non-200 status code")

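# The export folder must exist before `main()` writes into it; a minimal
# guard (assuming the script runs from the repo root):
Path("Inaugural_Addresses").mkdir(exist_ok=True)
main()
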
def migrate_from_unlabelled_to_local():
    base_path = Path("/home/UROP/data_urop/unlabelled")
    destination_folder = Path("/home/UROP/data_urop/all_videos_local")
    assert base_path.is_dir()
    assert destination_folder.is_dir()

    # build the list of [src, dst] pairs to copy
    migration_list = []
    for video in os.listdir(base_path):
        video_path = base_path / video
        if not video_path.is_file():
            logger.info(f"Unexpected non-file entry: {video_path}")
            return
        if video_path.suffix != ".mp4":
            logger.info(f"Skip {video_path}")
            continue
        target_path = destination_folder / video
        if target_path.is_file():
            logger.info(f"{target_path} already exists")
        else:
            logger.debug(f"{video_path} -> {target_path}")
            migration_list.append([str(video_path), str(target_path)])

    # confirm twice before copying
    logger.debug(f"Number of files to copy: {len(migration_list)}")
    proceed = input("Proceed? (y/n) ")
    if proceed != "y":
        logger.warning("Abort")
        return
    logger.debug(json.dumps(migration_list, indent=4))
    proceed = input("Proceed? (y/n) ")
    if proceed != "y":
        logger.warning("Abort")
        return

    pool = multiprocessing.Pool()
    result = pool.map(call_safe_copy, migration_list)

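# `call_safe_copy` is referenced above (and in the drive migration below) but
# not defined in this snippet. A minimal sketch, assuming it unpacks a
# [src, dst] pair and skips existing targets so re-runs never overwrite:
def call_safe_copy(pair: List[str]):
    src, dst = pair
    if Path(dst).is_file():
        logger.info(f"{dst} already exists, skipped")
        return
    shutil.copy2(src, dst)  # requires `import shutil`
    logger.debug(f"Copied {src} -> {dst}")
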
def migrate_from_drive_to_local():
    base_path = Path("/home/UROP/shared_drive/Video_Folders/Trimmed_All_Videos")
    destination_folder = Path("/home/UROP/data_urop/all_videos_local")
    assert base_path.is_dir()
    assert destination_folder.is_dir()

    migration_list = []
    for folder in os.listdir(base_path):
        folder_path = base_path / folder
        if not folder_path.is_dir():
            logger.info(f"Skip {folder_path}")
            continue
        logger.debug(folder)
        if folder.startswith("bilibili_"):
            # folder_num = int(folder[-3:])
            # if folder_num <= 80:
            #     continue
            for file in os.listdir(folder_path):
                if not file.endswith(".mp4"):
                    logger.info(f"Skip {file}")
                    continue
                new_name = "b_" + file
                video_filepath: Path = folder_path / file
                dst_path: Path = destination_folder / new_name
                logger.debug(f"{video_filepath} -> {dst_path}")
                migration_list.append([str(video_filepath), str(dst_path)])
        elif folder.startswith("youtube_"):
            # folder_num = int(folder[-3:])
            # if folder_num <= 10:
            #     continue
            for file in os.listdir(folder_path):
                if not file.endswith(".mp4"):
                    logger.info(f"Skip {file}")
                    continue
                new_name = "y_" + file
                video_filepath: Path = folder_path / file
                dst_path: Path = destination_folder / new_name
                logger.debug(f"{video_filepath} -> {dst_path}")
                migration_list.append([str(video_filepath), str(dst_path)])
        elif folder.endswith("yutian"):
            for file in os.listdir(folder_path):
                if not file.endswith(".mp4"):
                    logger.info(f"Skip {file}")
                    continue
                # normalize the filename: lowercase, "(" -> "_", drop ")"
                new_name = ""
                for i in file.lower():
                    if i == "(":
                        new_name += "_"
                    elif i == ")":
                        pass
                    else:
                        new_name += i
                new_name = "c_" + new_name
                video_filepath: Path = folder_path / file
                dst_path: Path = destination_folder / new_name
                logger.debug(f"{video_filepath} -> {dst_path}")
                migration_list.append([str(video_filepath), str(dst_path)])

    # confirm twice before copying
    logger.debug(f"Number of files to copy: {len(migration_list)}")
    proceed = input("Proceed? (y/n) ")
    if proceed != "y":
        logger.warning("Abort")
        return
    logger.debug(json.dumps(migration_list, indent=4))
    proceed = input("Proceed? (y/n) ")
    if proceed != "y":
        logger.warning("Abort")
        return

    pool = multiprocessing.Pool()
    result = pool.map(call_safe_copy, migration_list)

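# The three branches above repeat the same collect-and-rename walk. A sketch
# of the shared logic factored into one helper (an illustrative refactor, not
# part of the original source):
def collect_mp4_files(folder_path: Path, destination_folder: Path,
                      prefix: str, rename=lambda name: name) -> List[List[str]]:
    pairs = []
    for file in os.listdir(folder_path):
        if not file.endswith(".mp4"):
            logger.info(f"Skip {file}")
            continue
        dst_path = destination_folder / (prefix + rename(file))
        pairs.append([str(folder_path / file), str(dst_path)])
    return pairs
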
try:
    if _USER.endswith("ME"):
        # placeholder credentials detected: fall back to the secrets module
        from .db_secrete import _DB_NAME, _DB_PASS, _DB_USER
    else:
        _DB_USER = _USER
        _DB_PASS = _PASS
        _DB_NAME = _NAME
except ImportError:
    logger.error("No Database Config Found.")

logger.info(
    f"""-----------------------------------
MongoDB config:
    User:          {_DB_USER}
    Database Name: {_DB_NAME}
-----------------------------------"""
)

client = pymongo.MongoClient(
    f"mongodb+srv://{_DB_USER}:{_DB_PASS}@clusteresc.xvunj.mongodb.net/{_DB_NAME}?retryWrites=true&w=majority",
    ssl=True,
)
db = client[f"{_DB_NAME}"]

db_available = False
try:
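# The original snippet is truncated after `try:`. An assumed completion that
# pings the server and flips the `db_available` flag (log messages are
# illustrative):
    client.server_info()  # raises if the server is unreachable
    db_available = True
    logger.info("Database Connected")
except Exception as e:
    logger.error("Database Unavailable")
    logger.error(e)
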
logger.debug(f"Exported: {export_path}") if __name__ == "__main__": src_folder = Path(__file__).parent / "Inaugural_Addresses" result_folder = Path(__file__).parent / "results" word_occurrence_result = {} for filename in os.listdir(src_folder): filepath = src_folder / filename assert filepath.is_file() logger.debug(filename) with filepath.open() as f: lines = f.readlines() logger.info(f"Number of sentences: {len(lines)}") words = get_word_list_from_lines(lines) logger.info(f"Number of words: {len(words)}") word_occurence_map = get_word_occurence_map(words) logger.info( f"Number of unique words: {len(word_occurence_map.keys())}") # filter stopwords, remove words that once appeared once word_occurence_map = { k: v for k, v in sorted(word_occurence_map.items(), key=lambda x: x[1], reverse=True) if v > 1 and k in PRONOUNS } word_occurrence_result[filename[:-4]] = word_occurence_map
origins = [
    "http://localhost",
    "http://*****:*****",
]


@app.get("/api")
async def index():
    return {"Hello": "SUTD Housing Portal"}

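# The `origins` list above is typically fed to FastAPI's CORS middleware; a
# minimal sketch of the wiring (assumed, not shown in the original snippet):
from fastapi.middleware.cors import CORSMiddleware

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
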
def validate_new_application(target_AP_uid: str, student_id: str,
                             stay_period: TimePeriod) -> bool:
    """
    Checks that an incoming Application Form is valid:

    1. The target ApplicationPeriod exists in the DB.
    2. The current time falls within the valid application window.
    3. The student is among the eligible students of this ApplicationPeriod.
    4. The student has not already submitted an application for this ApplicationPeriod.
    5. The stay period specified by the student is a valid option in this ApplicationPeriod.
    """
    _now = datetime.now()
    try:
        ap_dict = application_periods_collection.find_one({"uid": target_AP_uid})
        clean_dict(ap_dict)
    except Exception as e:
        logger.error(MSG.DB_QUERY_ERROR)
        logger.error(e)
        raise HTTPException(status_code=500, detail=MSG.DB_QUERY_ERROR)

    # 1. the ApplicationPeriod must exist
    if not ap_dict:
        logger.info(f"ApplicationPeriod '{target_AP_uid}' Not Found")
        return False

    # 2. the current time must fall within the application window
    window_open_dt = ap_dict.get("application_window_open")
    window_close_dt = ap_dict.get("application_window_close")
    if not window_open_dt <= _now <= window_close_dt:
        logger.info(
            f"Student({student_id}) attempted submitting application to ApplicationPeriod({target_AP_uid})"
        )
        logger.info(f"Failed. Not in Application Window: {target_AP_uid}")
        return False

    # 3. the student must be eligible
    application_forms_map: Dict[str, str] = ap_dict.get("application_forms_map")
    if student_id not in application_forms_map:
        logger.info(
            f"Ineligible Student({student_id}) attempted submitting application to ApplicationPeriod({target_AP_uid})"
        )
        # NOTE: this restriction is temporarily relaxed
        logger.info("Restriction temporarily relaxed")
        # return False

    # 4. no second submission (an empty string means no form submitted yet)
    if application_forms_map.get(student_id) != "":
        logger.info(
            f"Illegal second submission by Student({student_id}) to ApplicationPeriod({target_AP_uid})"
        )
        # NOTE: this restriction is temporarily relaxed
        logger.info("Restriction temporarily relaxed")
        # return False

    # 5. the requested stay period must match one of the applicable periods
    u_start_date = stay_period.start_date
    u_end_date = stay_period.end_date
    applicable_periods: List[Dict[str, datetime]] = ap_dict.get("applicable_periods")
    period_matched = False
    for _period in applicable_periods:
        start_date = convert_datetime_to_date(_period.get("start_date"))
        end_date = convert_datetime_to_date(_period.get("end_date"))
        if u_start_date == start_date and u_end_date == end_date:
            logger.debug("Stay Period Matched!")
            period_matched = True
            break
    if not period_matched:
        logger.info(
            f"Illegal Stay Period by Student({student_id}) to ApplicationPeriod({target_AP_uid})"
        )
        return False

    return True

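# `clean_dict` and `convert_datetime_to_date` are referenced above but not
# defined in this snippet. Minimal sketches of the assumed behavior (requires
# `from typing import Optional` and `from datetime import date`):
def clean_dict(d: Optional[dict]) -> None:
    """Drop MongoDB's internal `_id` field in place, if present."""
    if d and "_id" in d:
        d.pop("_id")


def convert_datetime_to_date(dt: datetime) -> date:
    """Truncate a datetime to its date component."""
    return dt.date()
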
def parse_single_event(event_lines: List[str]) -> Optional[Dict[str, str]]:
    if event_lines[0] == "BEGIN:VEVENT" and event_lines[-1] == "END:VEVENT":
        event_lines = event_lines[1:-1]
    else:
        logger.error("Invalid Event")
        return None

    event = OrderedDict({
        "SUMMARY": None,      # Title
        "DESCRIPTION": None,  # Description
        "LOCATION": None,     # Location
        "DTSTART": None,      # Date Start
        "DTEND": None,        # Date End
        "UID": None,          # Unique Identifier
        "RRULE": None,        # Recurring Rule
        "EXDATE": None,       # Exception Date
    })

    for line in event_lines:
        # Each line looks like 'KEY:LINE_VALUE'.
        # find the index of the colon separating KEY and LINE_VALUE
        colon_index = line.find(":")
        # get the KEY literal
        key = line[:colon_index]
        if key not in event:
            # skip unnecessary keys
            continue
        # get the LINE_VALUE literal
        line_value = line[colon_index + 1:]
        # timezone fix
        if key in ("DTSTART", "DTEND", "EXDATE"):
            event[key + ";TZID=Asia/Singapore"] = line_value
        # recurring rule fix
        elif key == "RRULE":
            # Ref: https://www.kanzaki.com/docs/ical/recur.html
            # TODO: convert UNTIL field to GMT (add Z at the back as well)
            # TODO: recurring rule validation
            event[key] = line_value
        else:
            event[key] = line_value

    # 'SUMMARY' fix: pull the room out into LOCATION and chop the trailing tag
    original_summary = event.get("SUMMARY")
    logger.info(original_summary)
    if original_summary:
        room_index = original_summary.find("ROOM:")
        if room_index != -1:
            location_value = original_summary[room_index + 5:]
            event["LOCATION"] = location_value.strip()
            original_summary = original_summary[:room_index]
        chop_index = original_summary.find(":")
        if chop_index != -1:
            chopped = original_summary[:chop_index - 2]
            event["SUMMARY"] = chopped.strip()

    if event.get("SUMMARY") == event.get("DESCRIPTION"):
        event.pop("DESCRIPTION")

    # remove keys with 'None' or empty values
    unused_keys = [key for key in event.keys() if not event.get(key)]
    for key in unused_keys:
        event.pop(key)
    return event

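# Usage sketch with a hypothetical VEVENT block, showing the SUMMARY/LOCATION
# split and the timezone-suffixed date keys:
sample_lines = [
    "BEGIN:VEVENT",
    "SUMMARY:Computer Vision - CI  : W5 ROOM: Think Tank 13 (1.508)",
    "DTSTART:20210927T090000",
    "DTEND:20210927T120000",
    "UID:abc123@example.com",
    "END:VEVENT",
]
print(json.dumps(parse_single_event(sample_lines), indent=4))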