Example #1
def hasResearchExperience(notes):
    # Flag notes that mention research activity, unless explicitly negated (e.g. "no research")
    negativeKeywords = ['no research', 'no publi']
    positiveKeywords = ['research', 'publish', 'publication', 'paper', 'academic', 'author', 'conference']

    return QueryUtil.searchKeywords(notes, negativeKeywords, positiveKeywords)
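Both this example and the next delegate to QueryUtil.searchKeywords, which is not shown on this page. A minimal sketch of the contract the callers appear to assume (negative phrases veto the match, any positive keyword confirms it; hypothetical, not the original helper):

# Hypothetical sketch of QueryUtil.searchKeywords; the real helper is not shown here.
def searchKeywords(notes, negativeKeywords, positiveKeywords):
    text = notes.lower() if notes else ''

    # A negative phrase such as "no research" vetoes the match outright
    for keyword in negativeKeywords:
        if keyword in text:
            return False

    # Otherwise any positive keyword counts as a hit
    for keyword in positiveKeywords:
        if keyword in text:
            return True

    return False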
Example #2
def hasWorkExperience(notes):
    # Flag notes that mention work experience, unless explicitly negated (e.g. "no industry")
    negativeKeywords = ['no work', 'no industry']
    positiveKeywords = ['work', 'industry']

    return QueryUtil.searchKeywords(notes, negativeKeywords, positiveKeywords)
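A quick usage sketch (the notes string is invented for illustration, and searchKeywords is assumed to behave like the sketch above):

notes = 'Two years in industry, no research or publications.'

print hasWorkExperience(notes)      # True: 'industry' matches a positive keyword
print hasResearchExperience(notes)  # False: 'no research' vetoes the match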
Example #3
        GoHackers.getResults(query, True)

        if not continueQuery():
            break
    elif input == 5:
        # Display machine learning results

        # Get query from user
        query = getQuery(True, True, True)

        print "\n"

        # Get results from GradCafe
        gradResults = GradCafe.getResults(query, False)

        QueryUtil.refineQuery(query)

        # Get results from GoHackers
        goResults = GoHackers.getResults(query, False)

        # Predict outcome
        doExperiment = False

        predictor = Predictor(gradResults, goResults)

        if doExperiment:
            predictor.runExperiment()
        else:
            predictor.predict()

        if not continueQuery():
            break
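continueQuery is not included in this excerpt; a plausible minimal version (hypothetical, prompt wording assumed) would be:

# Hypothetical sketch of continueQuery; the real prompt is not shown in this excerpt.
def continueQuery():
    answer = raw_input('Run another query? (y/n): ')
    return answer.strip().lower().startswith('y')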
Example #4
import re
import requests
from urlparse import urlparse
from bs4 import BeautifulSoup


def getResults(query):
    school = query[QueryUtil.schoolKey]
    major = query[QueryUtil.majorKey]

    # Search using Google

    queryStr = school + ' ' + major + ' faculty'
    facultyLink = QueryUtil.google(queryStr)

    # Start scraping if a link to the faculty page has been found

    counter = 0

    if facultyLink:
        print '\nSearching "' + facultyLink + '"\n'

        # Get base URL to deal with relative URLs
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=urlparse(facultyLink))

        visitedUrls = dict() # Keeps track of visited urls

        # Get all links within faculty page
        content = requests.get(str(facultyLink), headers={'User-Agent': 'Mozilla/5.0'}).text
        soup = BeautifulSoup(content, "lxml")

        # Remove header and footer since these are definitely not relevant
        if soup.header:
            soup.header.decompose()
        if soup.footer:
            soup.footer.decompose()

        anchors = soup.find_all("a", href=True)

        if anchors:
            # Visit each link to see if there is information about a faculty member
            for anchor in anchors:
                url = anchor['href']

                numWords = len(anchor.text.split())

                # Minimum qualification for anchor text to match a faculty name
                if len(anchor.text.strip()) == 0 or numWords < 2 or numWords > 3:
                    continue

                # Handle relative URL
                if url.startswith('/'):
                    url = domain + url[1:] if domain.endswith('/') else domain + url

                # Skip visited sites
                if url in visitedUrls:
                    continue

                visitedUrls[url] = 0

                # Check obvious bad URLs
                if validUrl(url):
                    try:
                        # Visit link
                        content = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
                        soup = BeautifulSoup(content, "lxml")

                        # Remove header and footer since these are definitely not relevant
                        if soup.header:
                            soup.header.decompose()
                        if soup.footer:
                            soup.footer.decompose()

                        # Check if this is a page related to faculty information
                        if validContent(soup):
                            matches = re.finditer(r'>(.+\S+\s+(research|holds|received|area|director|member|fellow|earned)[^<]+)', content)

                            longestStr = None # Hack to avoid getting irrelevant short sentences containing above keywords.

                            # finditer always returns an iterator (never falsy), so iterate directly
                            for match in matches:
                                matchStr = match.group(1)
                                if longestStr is None or len(longestStr) < len(matchStr):
                                    if matchStr[0].isupper():
                                        longestStr = matchStr
                            if longestStr:
                                counter = counter + 1
                                print str(counter) + '. ' + anchor.text
                                print removeTags(longestStr)
                                print '\n'
                    except Exception:
                        # Skip links that fail to download or parse
                        continue

    if counter == 0:
        print 'Could not find information...\n'
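getResults leans on three helpers that are not part of this example: validUrl, validContent and removeTags. Rough sketches of the behavior the code above seems to assume (hypothetical implementations, not the originals; re is already imported above):

# Hypothetical sketches; the real helpers are not shown on this page.

def validUrl(url):
    # Reject non-HTTP links and obvious non-HTML resources
    if not url.startswith('http'):
        return False
    return not url.lower().endswith(('.pdf', '.jpg', '.png', '.zip'))


def validContent(soup):
    # Treat the page as faculty-related if common keywords appear in its title
    title = soup.title.string.lower() if soup.title and soup.title.string else ''
    return any(word in title for word in ['faculty', 'people', 'professor', 'bio'])


def removeTags(text):
    # Strip any HTML tags left inside the matched sentence
    return re.sub(r'<[^>]+>', '', text)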
Example #5
    # Requires numpy at module level: import numpy as np
    def preProcess(self, gradResults, goResults):
        # One column per usable result; self.dimension rows of features
        data = np.ndarray(shape=(self.dimension, 0), dtype=np.float)

        for results in [gradResults, goResults]:
            for result in results:
                vector = np.ndarray(shape=(self.dimension, 1), dtype=np.float)

                gpa = result[QueryUtil.gpaScore]
                achievedGpa = None
                maxPossibleGpa = None

                if gpa:
                    if isinstance(gpa, float):
                        if float(gpa) > 0:
                            achievedGpa = float(gpa)
                            maxPossibleGpa = 4.0
                    elif '/' in gpa:
                        # GPA reported as a string such as "3.5/4.0"
                        tokens = gpa.split('/')
                        try:
                            achievedGpa = float(tokens[0])
                            maxPossibleGpa = float(tokens[1])
                        except ValueError:
                            pass

                if achievedGpa and maxPossibleGpa:
                    gpa = QueryUtil.normalizeGpa(achievedGpa, maxPossibleGpa)
                    if gpa > 1:
                        continue
                    else:
                        vector[0,0] = gpa
                else:
                    continue

                verbal = result[QueryUtil.greVerbal]

                if isinstance(verbal, float) or (verbal and verbal.isdigit() and int(verbal) > 0):
                    vector[1,0] = QueryUtil.normalizeGre(verbal)
                else:
                    continue

                quant = result[QueryUtil.greQuant]

                if isinstance(quant, float) or (quant and quant.isdigit() and int(quant) > 0):
                    vector[2,0] = QueryUtil.normalizeGre(quant)
                else:
                    continue

                writing = result[QueryUtil.greWriting]

                try:
                    if isinstance(writing, float) or (writing and float(writing) > 0):
                        vector[3,0] = float(writing)
                    else:
                        continue
                except Exception:
                    continue

                # Remaining fields are assumed numeric; decision is presumably the outcome label
                vector[4,0] = float(result[QueryUtil.workExp])
                vector[5,0] = float(result[QueryUtil.research])
                vector[6,0] = float(result[QueryUtil.status])
                vector[7,0] = float(result[QueryUtil.decision])

                data = np.hstack((data, vector))

        # After the transpose each row is one applicant; shuffle rows before training
        data = data.transpose()

        np.random.shuffle(data)

        return data
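preProcess also depends on QueryUtil.normalizeGpa and QueryUtil.normalizeGre, neither of which appears on this page. Assuming both map raw scores into [0, 1], minimal sketches could look like this (the 130-170 GRE scale is an assumption):

# Hypothetical sketches of the normalization helpers.
def normalizeGpa(achievedGpa, maxPossibleGpa):
    # Scale to [0, 1]; the caller above discards anything that ends up > 1
    return float(achievedGpa) / float(maxPossibleGpa)


def normalizeGre(score):
    # Assumes the new-style 130-170 GRE section scale
    return (float(score) - 130.0) / 40.0

Given the column layout built above, a consumer of the returned matrix might split features from the outcome like this (a sketch only; Predictor's real training code is not part of this page, and the scikit-learn classifier is an arbitrary choice):

from sklearn.linear_model import LogisticRegression

data = predictor.preProcess(gradResults, goResults)

X = data[:, :7]  # GPA, GRE verbal/quant/writing, work, research, status
y = data[:, 7]   # decision, treated here as the label to predict

model = LogisticRegression().fit(X, y)
print model.score(X, y)  # training accuracy, as a quick sanity check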