from vivint.db.core import connect, session_factory
from vivint.db.models import School
from vivint.grab.ratemyprof import get_driver, get_school_url

# route log output to stdout; the handler is pushed application-wide
StreamHandler(sys.stdout).push_application()
logger = Logger(__name__)

# process constants
MAX_WORKERS = 15
TIMEOUT_SECS = 15
CONN_STRING = 'sqlite:///./data/school-data.original.db'
SEARCH_URL = 'https://duckduckgo.com/?q={query}&t=h_&ia=web'

# setup the database
engine = connect(CONN_STRING, echo=True, pool_recycle=3600)
get_session = session_factory(engine)


def find_school_url_on_rmp(row):
    # every call acquires its own driver and its own database session
    driver = get_driver()
    session = get_session()
    # each row is a (name, sid) pair
    name, sid = row

    # no driver available: log the skipped school, release the session and
    # return without doing any work
    if driver is None:
        logger.error('SKIPPING ' + name)
        session.close()
        return

    try:
'Institution_City', 'Institution_State', 'Institution_Zip', 'Institution_Phone', 'Institution_OPEID', 'Institution_IPEDS_UnitID', 'Institution_Web_Address', 'Campus_ID', 'Campus_Name', 'Campus_Address', 'Campus_City', 'Campus_State', 'Campus_Zip', 'Campus_IPEDS_UnitID', 'Accreditation_Type', 'Agency_Name', 'Agency_Status', 'Program_Name', 'Accreditation_Status', 'Accreditation_Date_Type', 'Periods', 'Last Action' ] skip_actions = [ 'Resigned', 'Terminated' ] School = namedtuple('School', ' '.join(map(lambda s: s.replace(' ', '_').lower(), csv_header))) # setup the database engine = connect('sqlite:///./data/school-data.original.db', echo=True, pool_recycle=3600) get_session = session_factory(engine) # grab the raw list of schools from the CSV file def get_schools(from_file=None): with open(from_file) as csv_in: reader = csv.reader(csv_in, delimiter=',', quotechar='"') for idx, row in enumerate(reader): if idx == 0: continue yield School(*row) # get all the database rows where there is no URL set session = get_session()