def hasResearchExperience(notes):
    """Return whether the free-text notes indicate research experience.

    Delegates to QueryUtil.searchKeywords, which weighs phrases that deny
    the experience against phrases that affirm it.
    """
    # Phrases that explicitly rule research out.
    denyPhrases = ['no research', 'no publi']
    # Phrases that suggest research involvement.
    affirmPhrases = [
        'research',
        'publish',
        'publication',
        'paper',
        'academic',
        'author',
        'conference',
    ]
    return QueryUtil.searchKeywords(notes, denyPhrases, affirmPhrases)
def hasWorkExperience(notes):
    """Return whether the free-text notes indicate work experience.

    Delegates to QueryUtil.searchKeywords, which weighs phrases that deny
    the experience against phrases that affirm it.
    """
    # Phrases that explicitly rule work experience out.
    denyPhrases = ['no work', 'no industry']
    # Phrases that suggest industry involvement.
    affirmPhrases = ['work', 'industry']
    return QueryUtil.searchKeywords(notes, denyPhrases, affirmPhrases)
GoHackers.getResults(query, True) if not continueQuery(): break elif input == 5: # Display machine learning results # Get query from user query = getQuery(True, True, True) print "\n" # Get results from GradCafe gradResults = GradCafe.getResults(query, False) QueryUtil.refineQuery(query) # Get results from GoHackers goResults = GoHackers.getResults(query, False) # Predict outcome doExperiment = False predictor = Predictor(gradResults, goResults) if doExperiment: predictor.runExperiment() else: predictor.predict() if not continueQuery():
def getResults(query): school = query[QueryUtil.schoolKey] major = query[QueryUtil.majorKey] # Search using Google queryStr = school + ' ' + major + ' faculty' facultyLink = QueryUtil.google(queryStr) # Start scraping if a link to the faculty page has been found counter = 0 if facultyLink: print '\nSearching "' + facultyLink + '"\n' # Get base URL to deal with relative URLs domain = '{uri.scheme}://{uri.netloc}/'.format(uri=urlparse(facultyLink)) visitedUrls = dict() # Keeps track of visited urls # Get all links within faculty page content = requests.get(str(facultyLink), headers={'User-Agent': 'Mozilla/5.0'}).text soup = BeautifulSoup(content, "lxml") # Remove header and footer since these are definitely not relevant if soup.header: soup.header.decompose() if soup.footer: soup.footer.decompose() anchors = soup.find_all("a", href=True) if anchors: # Visit each link to see if there is information about a faculty member for anchor in anchors: url = anchor['href'] numWords = len(anchor.text.split()) # Minimum qualification for anchor text to match a faculty name if len(anchor.text.strip()) == 0 or numWords < 2 or numWords > 3: continue # Handle relative URL if url.startswith('/'): url = domain + url[1:] if domain.endswith('/') else domain + url # Skip visited sites if url in visitedUrls: continue visitedUrls[url] = 0 # Check obvious bad URLs if validUrl(url): try: # Visit link content = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text soup = BeautifulSoup(content, "lxml") # Remove header and footer since these are definitely not relevant if soup.header: soup.header.decompose() if soup.footer: soup.footer.decompose() # Check if this is a page related to faculty information if validContent(soup): matches = re.finditer(r'>(.+\S+\s+(research|holds|received|area|director|member|fellow|earned)[^<]+)', content) longestStr = None # Hack to avoid getting irrelevant short sentences containing above keywords. 
if matches: for match in matches: matchStr = match.group(1) if longestStr is None or len(longestStr) < len(matchStr): if matchStr[0].isupper(): longestStr = matchStr if longestStr: counter = counter + 1 print str(counter) + '. ' + anchor.text print removeTags(longestStr) print '\n' except: continue if counter == 0: print 'Could not find information...\n'
def preProcess(self, gradResults, goResults):
    """Build the training matrix from scraped admission results.

    gradResults, goResults: iterables of result dicts keyed by the QueryUtil
    field constants.  Records with a missing or unusable GPA/GRE field are
    skipped entirely.

    Returns a row-shuffled numpy array of shape (n_samples, self.dimension);
    each row is [gpa, verbal, quant, writing, workExp, research, status,
    decision].  NOTE(review): the hard-coded indices 0-7 below assume
    self.dimension == 8 — confirm against the class constructor.
    """
    # Seed with a zero-width array so the final hstack works even when every
    # record is skipped (matches the original's empty result).  dtype=float
    # replaces np.float, an alias removed in NumPy 1.24.
    columns = [np.ndarray(shape=(self.dimension, 0), dtype=float)]
    for results in [gradResults, goResults]:
        for result in results:
            vector = np.ndarray(shape=(self.dimension, 1), dtype=float)

            # --- GPA: accept a plain float or an "achieved/max" string. ---
            gpa = result[QueryUtil.gpaScore]
            achievedGpa = None
            maxPossibleGpa = None
            if gpa:
                if isinstance(gpa, float):
                    if float(gpa) > 0:
                        achievedGpa = float(gpa)
                        maxPossibleGpa = 4.0
                elif '/' in gpa:
                    # Fix: the split tokens were previously passed to
                    # normalizeGpa as raw strings, unlike the float branch
                    # above; convert them, skipping records that don't parse.
                    tokens = gpa.split('/')
                    try:
                        achievedGpa = float(tokens[0])
                        maxPossibleGpa = float(tokens[1])
                    except ValueError:
                        achievedGpa = None
                        maxPossibleGpa = None
            if achievedGpa and maxPossibleGpa:
                gpa = QueryUtil.normalizeGpa(achievedGpa, maxPossibleGpa)
                if gpa > 1:
                    continue  # nonsensical ratio, e.g. "5.2/4.0"
                vector[0, 0] = gpa
            else:
                continue

            # --- GRE verbal (already a float, or a positive digit string) ---
            verbal = result[QueryUtil.greVerbal]
            if isinstance(verbal, float) or (verbal and verbal.isdigit() and int(verbal) > 0):
                vector[1, 0] = QueryUtil.normalizeGre(verbal)
            else:
                continue

            # --- GRE quantitative, same acceptance rule as verbal ---
            quant = result[QueryUtil.greQuant]
            if isinstance(quant, float) or (quant and quant.isdigit() and int(quant) > 0):
                vector[2, 0] = QueryUtil.normalizeGre(quant)
            else:
                continue

            # --- GRE analytical writing: stored as-is, no normalization ---
            writing = result[QueryUtil.greWriting]
            try:
                if isinstance(writing, float) or (writing and float(writing) > 0):
                    vector[3, 0] = float(writing)
                else:
                    continue
            except (ValueError, TypeError):
                # Unparseable writing score: skip the record.
                continue

            # --- Remaining fields are assumed numeric-convertible. ---
            vector[4, 0] = float(result[QueryUtil.workExp])
            vector[5, 0] = float(result[QueryUtil.research])
            vector[6, 0] = float(result[QueryUtil.status])
            vector[7, 0] = float(result[QueryUtil.decision])
            columns.append(vector)

    # One hstack instead of one per record (the original re-copied the whole
    # accumulated matrix for every sample, i.e. quadratic in sample count).
    data = np.hstack(columns).transpose()
    np.random.shuffle(data)  # shuffles rows (samples) in place
    return data