Пример #1
0
    # Extend every record in course_profs with two trailing fields,
    # department[0] and url, and collect the widened records in mod_profs.
    # NOTE(review): records are presumably tuples (they are copied to a list
    # for appending and converted back with tuple()) -- confirm upstream.
    for prof in course_profs:
      prof = list(prof)
      prof.append(department[0])
      prof.append(url)
      mod_profs.append(tuple(prof))

    # Add this batch to the running list of all records.
    all_prof_info += mod_profs 

# Convert the raw records in all_prof_info into professor dictionaries and
# pickle the resulting list to prof_dicts/mit.dat.
#
# Record layout, as indexed below:
#   prof[0] personal website, prof[1] name, prof[2] email,
#   prof[3] research text, prof[4] department, prof[5] source.
prof_dictionary_list = []
for prof in all_prof_info:
    # Skip any record in which at least one field still contains a '<'.
    if any('<' in p for p in prof):
        continue

    prof_dictionary = {
        'name': prof[1],
        'personal_website': prof[0],
        'email': prof[2],
    }

    # Store the extractor's result as keywords when it is non-empty;
    # otherwise keep the raw research text as the summary.
    research = util.research_interest_extractor(prof[3])
    if research:
        prof_dictionary['keywords'] = research
    else:
        prof_dictionary['research_summary'] = prof[3]

    prof_dictionary['school'] = 'MIT'
    prof_dictionary['department'] = util.prep_department(prof[4])
    prof_dictionary['source'] = prof[5]
    # Single-argument call form prints the same text on Python 2 and 3.
    print(prof[5])
    util.validate_professor(prof_dictionary)
    prof_dictionary_list.append(prof_dictionary)

# Use open() inside a with-block so the handle is flushed and closed
# deterministically instead of relying on garbage collection of file().
# The mode stays 'w' so existing readers of this file see the same bytes.
# NOTE(review): pickling under Python 3 would require mode 'wb'.
with open('prof_dicts/mit.dat', 'w') as out_file:
    pickle.dump(prof_dictionary_list, out_file)
Пример #2
0
  # Build the index-page URL for the character chr(c), fetch it through
  # util.dl_and_prep, and add every match of `pat` in the returned text to
  # results.
  # NOTE(review): presumably runs once per letter of the alphabet -- the
  # loop header is outside this fragment, confirm there.
  url = "http://soe.stanford.edu/research/pers_index_results.php?index=%s" % chr(c)
  doc = util.dl_and_prep(url)
  results += re.findall(pat, doc)

# Build one dictionary per entry in `results`, adding details scraped from
# the professor's own page.
# Entry layout, as indexed below:
#   prof[0] path relative to http://soe.stanford.edu/research/,
#   prof[1] name, prof[2] department markup, prof[3] keyword text.
# Formatted so the output is identical on Python 2 and 3.
print('%d total professors' % len(results))
output = []
for prof in results:
  pd = {}
  # The same page serves as both the lab website and the data source.
  profile_url = 'http://soe.stanford.edu/research/%s' % prof[0]
  pd['lab_website'] = profile_url
  pd['source'] = profile_url
  pd['name'] = prof[1]
  # Extract the primary department from within the <b> tags.
  if '<b>' in prof[2]:
    pd['department'] = re.findall('<b>(.*?)</b>', prof[2])[0]
  else:
    pd['department'] = util.prep_department(util.remove_tags(prof[2]))
  research = prof[3].replace('&nbsp;', '').strip()
  if research:
    pd['keywords'] = util.split_and_clean(research, ',')

  pd['school'] = 'Stanford University'
  personal_page = util.dl_and_prep(profile_url)
  summary = re.findall('<h3>Research Statement</h3><p>(.*?)</p><h3>Degrees</h3>', personal_page)
  if summary:
    pd['research_summary'] = util.html_escape(summary[0].strip())
  # A page without a photo simply gets no 'image' key. The previous code
  # dropped into pdb from a broad `except Exception`, which stalls an
  # unattended run; resuming from that prompt left the key unset as well.
  photos = re.findall('\'(images/photos_faculty_staff/.*?)\'', personal_page)
  if photos:
    pd['image'] = 'http://soe.stanford.edu/research/%s' % photos[0]
  # NOTE(review): a page with no "Title:" cell still raises IndexError here,
  # as before -- confirm whether title is mandatory downstream.
  pd['title'] = re.findall("Title:</td><td class=\"data\">(.*?)</td>", personal_page)[0]
  personal_website = re.findall("URL:</TD><TD class=\"data\"><a href='(.*?)'", personal_page)
Пример #3
0
 # Pull the individual profile fields out of the page text for `prof`.
 # The text is searched for "[key] => value" entries; a field keeps its
 # default when its pattern does not match.
 # NOTE(review): first_name, last_name and title receive no default in the
 # visible lines -- presumably initialised just above; confirm.
 department = ""
 email = ""
 bio = ""
 summary = ""
 interests = ""
 image = ""
 g = util.dl_and_prep(prof)

 # Each pattern is evaluated once and its first hit reused. Raw strings
 # keep the regex escapes (\[, \s, \n) out of Python's own string-escape
 # processing; the patterns match exactly what they matched before.
 hits = re.findall(r"\[firstname\] => (.*?)\s", g)
 if hits:
     first_name = hits[0]
 hits = re.findall(r"\[lastname\] => (.*?)\s", g)
 if hits:
     last_name = hits[0]
 # Values that may contain spaces run up to the newline preceding the
 # next "[" entry.
 hits = re.findall(r"\[title\] => (.*?)\n\s*\[", g)
 if hits:
     title = hits[0]
 hits = re.findall(r"\[department\] => (.*?)\n\s*\[", g)
 if hits:
     department = util.prep_department(hits[0])
     # Single-argument call form prints the same text on Python 2 and 3.
     print(department)
 hits = re.findall(r"\[email\] => (.*?)\s", g)
 if hits:
     email = hits[0]
 hits = re.findall(r"\[bio\] => (.*?)\n\s*\[", g)
 if hits:
     bio = hits[0]
 hits = re.findall(r"\[summary\] => (.*?)\n\s*\[", g)
 if hits:
     summary = hits[0]
 hits = re.findall(r"\[interests\] => (.*?)\n\s*\[", g)
 if hits:
     interests = hits[0]
 # The image src is combined with the site root to form the full URL.
 hits = re.findall('class=rightColumn>.*?<img src="(.*?)"', g)
 if hits:
     image = "http://research.brown.edu" + hits[0]
 # TODO: the "[fresearch]" entry is not extracted; an earlier attempt was
 # left commented out and was not valid Python.
 name = first_name + " " + last_name
 brown_prof_dict = {}
 brown_prof_dict["name"] = name