Exemplo n.º 1
0
def scrape_professor(school_name,
                     faculty_url,
                     extracts_title,
                     extracts_name,
                     extracts_cv_url=None,
                     extracts_personal_url=None,
                     extracts_gscholar_url=None,
                     extracts_papers=None):
    """Scrape a single faculty page into a Professor object.

    :param school_name: name of the school, stored on the Professor
    :param faculty_url: URL of the faculty page to scrape
    :param extracts_title: callable(tree) -> job title string or None
    :param extracts_name: callable(tree) -> professor's name
    :param extracts_cv_url: optional callable(faculty_url, tree) -> CV URL
    :param extracts_personal_url: optional callable(faculty_url, tree) -> personal site URL
    :param extracts_gscholar_url: optional callable(faculty_url, tree) -> Google Scholar URL
    :param extracts_papers: optional callable(faculty_url, tree) -> (paper_list_url, papers)
    :return: a Professor object, or None if the page could not be fetched,
             the job title is missing, or it's not a tenure-track position
    """
    tree = get_tree(faculty_url)
    if tree is None:
        return None
    job_title = strip_whitespace(extracts_title(tree))
    if job_title is None:
        print("WARNING: job title not found on " + faculty_url)
        return None
    if not title_is_tenure_track(job_title):
        return None
    name = extracts_name(tree)
    cv_link = None if extracts_cv_url is None else extracts_cv_url(faculty_url, tree)
    personal_url = None if extracts_personal_url is None else extracts_personal_url(faculty_url, tree)
    google_scholar_url = None if extracts_gscholar_url is None else extracts_gscholar_url(faculty_url, tree)
    prof = Professor(name=name, title=job_title, cv_url=cv_link, school=school_name,
                     faculty_directory_url=faculty_url, personal_url=personal_url, google_scholar_url=google_scholar_url)
    if extracts_papers is not None:
        paper_list_url, papers = extracts_papers(faculty_url, tree)
        # save paper list to disk; a truthy `papers` already implies a
        # non-empty list, so the extra len() check was redundant
        if paper_list_url and papers:
            prof.paper_list_url = paper_list_url
            save_paper_list('paper_list', prof, papers)
    return prof
Exemplo n.º 2
0
def get_title(tree):
    """Return the first element text that matches a known job title, or None.

    The title's exact location varies between pages, so every plausible
    element is checked in order until one matches.
    """
    elements = (css_select(tree, 'ul#ctl00_content_titles li i') +
                css_select(tree, 'ul#ctl00_content_areas li'))
    for element in elements:
        text = strip_whitespace(element.text)
        if text is not None and is_job_title(text):
            return text
    return None
Exemplo n.º 3
0
def get_papers(url, tree):
    """Collect journal-article citations from the faculty page.

    :param url: the faculty page URL the tree was parsed from
    :param tree: parsed HTML tree of the faculty page
    :return: tuple of (paper-list URL with '#research' anchor, list of citations)
    """
    papers = []
    # find the list of divs for journal publications
    for heading in css_select(tree, 'h3'):
        if 'Journal articles' in heading.text:
            # keep collecting the publication <p> elements until we reach the
            # "Awards and Honors" section; named `node` to avoid shadowing
            # the builtin next()
            node = heading
            while node is not None:
                if node.name == 'p':
                    papers.append(strip_whitespace(node.text))
                elif node.name == 'div' and 'Awards and Honors' in node.text:
                    break
                node = node.next_element
    return url + '#research', papers
Exemplo n.º 4
0
def get_papers(url, tree):
    """Extract the publications list from the faculty page.

    :param url: the faculty page URL the tree was parsed from
    :param tree: parsed HTML tree of the faculty page
    :return: (url, list of citations) or (None, None) if no list is found
    """
    # find the bulleted list for publications
    for heading in css_select(tree, '#center-col strong'):
        if 'Publications' in heading.text:
            # look for the first <UL> under the publications header;
            # named `node` to avoid shadowing the builtin next()
            node = heading
            while node is not None:
                if node.name == 'ul':
                    return url, [
                        strip_whitespace(li.text.replace(' PDF.', ''))
                        for li in css_select(node, 'li')
                    ]
                node = node.next_element
    return None, None
Exemplo n.º 5
0
def get_papers(faculty_url, tree):
    """Locate this professor's bibliography in the library directory and scrape it.

    Matches the professor's name against the library faculty directory by
    edit distance, then extracts citations from the "Published Works"
    section of the matching bibliography page.

    :param faculty_url: URL of the faculty page (unused here; kept for the
        common extracts_papers(url, tree) signature)
    :param tree: parsed HTML tree of the faculty page
    :return: (bibliography URL, list of citations) or (None, None)
    """
    name = get_name(tree)
    # download faculty directory, if not already downloaded (cached in a module global)
    global library_directory_tree
    if not library_directory_tree:
        library_directory_tree = get_tree(library_directory_url)
    # iterate through faculty names, looking for the best match
    anchors = css_select(library_directory_tree, 'table.table-striped a')
    closest_match = min(
        anchors,
        key=lambda a: editdistance.eval(name, strip_whitespace(a.text)))
    # require that closest match be pretty close to accept it
    # NOTE(review): unlike the min() key above, this compares against the
    # unstripped text — confirm whether strip_whitespace was intended here too
    if editdistance.eval(
            name, closest_match.text
    ) > 3:  # 3 characters would allow for a missing initial and period
        return None, None
    # download bibliography page
    bib_url = closest_match.get('href')
    bib_tree = get_tree(bib_url)
    # find the "Published Works" section
    for heading in css_select(bib_tree, 'div.rich-text h2'):
        if 'Published' in heading.text:
            # keep collecting the publication <p> elements until the next <h2>;
            # named `node` to avoid shadowing the builtin next()
            papers = []
            node = heading.next_element
            while node:
                if node.name == 'p':
                    citation = strip_whitespace(node.text)
                    # drop trailing link
                    citation = citation.split('http://')[0]
                    if len(citation) > 0:
                        papers.append(citation)
                elif node.name == 'h2':
                    break
                node = node.next_element
            return bib_url, papers
    return None, None
Exemplo n.º 6
0
def get_papers(faculty_url, tree):
    """Pull article citations from the research section of the faculty page.

    :param faculty_url: URL of the faculty page the tree was parsed from
    :param tree: parsed HTML tree of the faculty page
    :return: (faculty_url + '#research', list of citations) or (None, None)
    """
    for e in css_select(tree, 'div.leftResearch div.entries'):
        # check that we're in the right section
        # NOTE(review): BeautifulSoup normally returns tag['class'] as a LIST,
        # so comparing it to the string 'tabSubheading' may never be True —
        # confirm against the parser in use. previous_sibling can also be a
        # whitespace NavigableString, which does not support ['class'] access.
        if e.previous_sibling[
                'class'] == 'tabSubheading' and e.previous_sibling.text != 'Articles':
            break
        paper_list = []
        for c in css_select(e, 'div.entry div.copy'):
            citation = c.text
            # some articles have an abstract, which I want to ignore
            for abstract in css_select(c, 'div'):
                citation = citation.replace(abstract.text, '')
            paper_list.append(strip_whitespace(citation))
        # returns after the first entries div, so only one section is scraped
        return faculty_url + '#research', paper_list
    return None, None
Exemplo n.º 7
0
def get_papers(url, tree):
    """Find the Journal Articles listing on a GSB publications page.

    :param url: the faculty page URL the tree was parsed from
    :param tree: parsed HTML tree of the faculty page
    :return: (url, list of citations) or (None, None) if no listing is found
    """
    # find the list of Journal Articles
    for heading in css_select(tree,
                              'div.view-gsb-publications-listing h2.title'):
        if 'Journal Articles' in heading.text:
            # look for the first <div class="view-content"> under the Journal Articles header
            candidate = heading
            while candidate is not None:
                # .get('class') is None for a <div> with no class attribute,
                # and `'view-content' in None` would raise TypeError — guard
                # with an empty list
                if candidate.name == 'div' and 'view-content' in (
                        candidate.get('class') or []):
                    return url, [
                        strip_whitespace(row.get_text())
                        for row in css_select(candidate, 'div.views-row')
                    ]
                candidate = candidate.next_element
    return None, None
Exemplo n.º 8
0
def get_papers(url, tree):
    """Fetch the publications tab for a faculty page and extract article citations.

    :param url: the faculty page URL (the publications tab URL is derived from it)
    :param tree: parsed HTML tree of the faculty page (unused; the tab is re-fetched)
    :return: (publications tab URL, list of citations) or (None, None)
    """
    # add "&facInfo=pub" to the faculty url to get the url for the publications tab
    pub_list_url = url + "&facInfo=pub"
    wait()  # rate-limit before fetching another page
    pub_tree = get_tree(pub_list_url)
    # find the bulleted list for publications
    for heading in css_select(pub_tree, '.tab-content h3'):
        if 'Articles' in heading.text:
            # look for the first <OL> under the publications header;
            # named `node` to avoid shadowing the builtin next()
            node = heading
            while node is not None:
                if node.name == 'ol':
                    return pub_list_url, [
                        strip_whitespace(
                            li.text.replace('View Details',
                                            '').replace('Citation:', ''))
                        for li in css_select(node, 'div.citation')
                    ]
                node = node.next_element
    return None, None
Exemplo n.º 9
0
def papers_in_top_journals(
        professors: List[Professor]) -> Dict[Professor, List[str]]:
    """Return a dict mapping each professor to their citations in top journals.

    Also prints the total paper count since the starting year, and warns
    (listing the papers by year) when a professor has a suspiciously large
    number of top-journal papers — usually a sign of a scraping error.

    Note: the previous annotation `Dict[Professor, AnyStr]` was wrong; each
    value is a list of citation strings.
    """
    top_papers = {}
    # also count the total number of papers since start_date
    total_papers = 0
    for p in professors:
        candidates = load_papers(p)
        total_papers += len(candidates)
        # keep only the papers that appeared in the top journals
        top_papers[p] = [
            citation for (journal, citation) in candidates
            if is_a_top_journal(journal)
        ]
        # detect anomalies
        if len(top_papers[p]) > 30:
            print("\nWARNING: found %d top papers for %s" %
                  (len(top_papers[p]), p.slug()))

            # pass get_year directly — the lambda wrapper was redundant
            for paper in sorted(top_papers[p], key=get_year):
                print("\t" + strip_whitespace(paper))
    print("\nTotal of %d papers since %s\n" % (total_papers, starting_year))
    return top_papers
Exemplo n.º 10
0
def get_title(tag):
    """Return the first non-empty text node inside the cell's first <td><p>, or None."""
    first_paragraph = css_select(tag, 'td p')[0]
    for child in first_paragraph.children:
        if not isinstance(child, NavigableString):
            continue
        text = strip_whitespace(child)
        if text:
            return text
Exemplo n.º 11
0
def get_papers(url, tree):
    """Return the faculty URL and the citation text of every publication bullet."""
    # the publications live in the bulleted list inside the first tab
    citations = []
    for item in css_select(tree, 'div#tabs-1 ul li'):
        citations.append(strip_whitespace(item.text))
    return url, citations
Exemplo n.º 12
0
def norm_str(my_string):
    """Lowercase *my_string*, apply the non_letter_remover translation table,
    and strip the surrounding whitespace."""
    lowered = my_string.lower()
    translated = lowered.translate(non_letter_remover)
    return strip_whitespace(translated)