def select_can_univ(self, uni_list_path): merged_rank_list = np.genfromtxt(os.path.join(uni_list_path, 'university_merged_list.csv'), dtype=str, delimiter=',') print 'read a matrix with shape:', merged_rank_list.shape merged_rank_dict = read_ranking_list(os.path.join(uni_list_path, 'university_merged_list.csv'), dtype=int) yiben_set = set(np.genfromtxt(os.path.join(uni_list_path, 'university_yiben.csv'), dtype=str)) print '#yiben universities:', len(yiben_set) yiben_not_covered = 0 for uni in yiben_set: if uni not in merged_rank_dict.keys(): yiben_not_covered += 1 print uni data = np.genfromtxt('/home/ffl/nus/MM/complementary/chinese_university_ranking/data/features/univ_category.csv', delimiter=',', dtype=str) print 'read a matrix with shape: ', data.shape univ_types = {} for d in data: univ_types[clean_uni_name(d[0])] = d[1] arts_chi_str = '艺术' with open(os.path.join(uni_list_path, 'candidate_universites.csv'), 'w') as fout: for uni in yiben_set: if not uni in univ_types.keys(): fout.write('%s\n' % (uni)) print 'without category:', uni continue if not univ_types[uni] == arts_chi_str: fout.write('%s\n' % (uni))
def post_merged_list(self, uni_list_path): merged_rank_list = np.genfromtxt(os.path.join(uni_list_path, 'university_merged_list.csv'), dtype=str, delimiter=',') print 'read a matrix with shape:', merged_rank_list.shape merged_rank_dict = read_ranking_list(os.path.join(uni_list_path, 'university_merged_list.csv'), dtype=int) yiben_set = set(np.genfromtxt(os.path.join(uni_list_path, 'university_yiben.csv'), dtype=str)) print '#yiben universities:', len(yiben_set) data = np.genfromtxt('/home/ffl/nus/MM/complementary/chinese_university_ranking/data/features/univ_category.csv', delimiter=',', dtype=str) print 'read a matrix with shape: ', data.shape univ_types = {} for d in data: univ_types[clean_uni_name(d[0])] = d[1] arts_chi_str = '艺术' '''here we have a couple of actions to do: filter out military universities filter out art universities filter out universities that are not belongs to the first level ''' print '#initial universities in the merged list:', len(merged_rank_dict) for uni in merged_rank_dict.keys(): if not uni in yiben_set: del merged_rank_dict[uni] print '#yiben universities and not military in the merged list:', len(merged_rank_dict) for uni in merged_rank_dict.keys(): if uni not in univ_types.keys(): print 'category missing:', uni continue if univ_types[uni] == arts_chi_str: print uni del merged_rank_dict[uni] print '#yiben unviersities and not art universities in the merged list:', len(merged_rank_dict) # post-processing rank sorted_merged_rank = sorted(merged_rank_dict.items(), key=operator.itemgetter(1)) prev_rank = 1 ori_pre_rank = 1 # for i in range(len(sorted_merged_rank)): # if not sorted_merged_rank[i][1] == prev_rank: # sorted_merged_rank[i][1] = i + 1 with open('university-selected_merged_list.csv', 'w') as fout: for i in range(len(sorted_merged_rank)): if not sorted_merged_rank[i][1] == ori_pre_rank: fout.write('%s,%d\n' %(sorted_merged_rank[i][0], i + 1)) prev_rank = i + 1 ori_pre_rank = sorted_merged_rank[i][1] else: fout.write('%s,%d\n' % (sorted_merged_rank[i][0], prev_rank))
def __init__(self, uni_line_fname, pro_line_fname, kldm_value_fname): self.uni_line_fname = uni_line_fname self.pro_line_fname = pro_line_fname self.kldm_value_fname = kldm_value_fname # read university lines self.university_lines = {} # keys in the json data are in unicode format, besides, university names are not cleaned, # the names in the candidate university list are in utf-8, to handle this encoding mismatch # we transfer the data read in first. with open(self.uni_line_fname) as uni_file: data = json.load(uni_file) for year_kv in data.iteritems(): if year_kv[0] not in self.university_lines.keys(): self.university_lines[year_kv[0]] = {} for uni_kv in year_kv[1].iteritems(): self.university_lines[year_kv[0]][clean_uni_name( uni_kv[0].encode('utf-8'))] = uni_kv[1] print 'univerisity lines years:', self.university_lines.keys( ), '#years:', len(self.university_lines) if len(self.university_lines) < 1: self.init_flag = False # read province lines self.province_lines = {} with open(self.pro_line_fname) as pro_file: self.province_lines = json.load(pro_file) print 'province lines years:', self.province_lines.keys( ), '#years:', len(self.university_lines) if len(self.province_lines) < 1: self.init_flag = False # selected years to extract features self.years = [ '2013'.encode('utf-8'), '2014'.encode('utf-8'), '2015'.encode('utf-8') ] # self.years = ['2013'.encode('utf-8'), '2014'.encode('utf-8')] # read kldm values self.kldm_values = read_kldm_uni_values(self.kldm_value_fname) print '#years with kldm values:', len(self.kldm_values) # initialize the union university name set self.uni_names = set() self.init_flag = self.init_union_uni_name() print '#union university names:', len(self.uni_names)
def transfer_feature(self, in_path, out_path): for perspective in self.feature_fnames.itervalues(): for types in perspective.itervalues(): for fname in types: print 'transferring feature from:', os.path.join( in_path, fname) data = np.genfromtxt(os.path.join(in_path, fname), delimiter=',', dtype=str) print 'read data with shape:', data.shape feature_matrix = [] missing_line = [] for i in range(1, data.shape[1]): missing_line.append(-1) indexes = {} for ind, line in enumerate(data): if line[0] == 'university': continue indexes[clean_uni_name(line[0])] = ind print '#universities:', len(indexes) without_feature = 0 for name in self.final_list: if name not in indexes.keys(): print 'without:', name feature_matrix.append(missing_line) without_feature += 1 else: ind = indexes[name] feature_matrix.append(data[ind][1:]) print '#universities in the matrix:', len(feature_matrix) print '#universities without feature:', without_feature ofname = os.path.join(out_path, fname) np.savetxt(ofname, feature_matrix, delimiter=',', fmt='%s') print '-------------------------------------------' print '-------------------------------------------' print '-------------------------------------------'
def export_rank_lists(self, of_path, eng_name_fname): db_conn = MySQLdb.connect(host='localhost', db='other_univ_ranking', user='******', passwd='13577531', charset='utf8', use_unicode=True) cursor = db_conn.cursor() for rank_table in self.rank_tables: sql = 'SELECT rank, univ, score FROM %s ORDER BY rank' % (rank_table) try: if rank_table == 'ipin_2016': sql = 'SELECT rank, univ FROM %s ORDER BY rank' % (rank_table) cursor.execute(sql) results = cursor.fetchall() print '#rows:', len(results) ofname = os.path.join(of_path, rank_table + '.csv') rank_list = [] if rank_table == 'cuaa_2016' or rank_table == 'wsl_2017' or rank_table == 'rank_2017' or rank_table == 'arwu_2016': pre_rank = -1 for (rank, univ, score) in results: # print '%d \t %f \t %s' % (rank, score, univ) if rank < pre_rank: print 'unexpected rank %d after rank %d' % (rank, pre_rank) exit() else: pre_rank = rank cur_uni = [clean_uni_name(univ.encode('utf8')), rank] rank_list.append(cur_uni) elif rank_table == 'ipin_2016': pre_rank = -1 for (rank, univ) in results: # print '%d \t %f \t %s' % (rank, score, univ) if rank < pre_rank: print 'unexpected rank %d after rank %d' % (rank, pre_rank) exit() else: pre_rank = rank cur_uni = [clean_uni_name(univ.encode('utf8')), rank] rank_list.append(cur_uni) elif rank_table == 'qs_2016' or rank_table == 'usn_2017' or rank_table == 'the_2017': pre_rank = -1 nama = name_matcher(eng_name_fname) for (rank, univ, score) in results: # print '%d \t \t %s' % (rank, univ) if rank < pre_rank: print 'unexpected rank %d after rank %d' % (rank, pre_rank) exit() else: pre_rank = rank if rank_table == 'usn_2017': if univ.encode('utf8') == 'China University of Mining': clean_name = 'China University of Mining and Technology'.lower() elif univ.encode('utf8') == 'China University of Geosciences': clean_name = 'China University of Geosciences,Wuhan'.lower() elif univ.encode('utf8') == 'China University of Geosciences': clean_name = 'China University of Geosciences,Wuhan'.lower() # if univ.encode('utf8') == 'China University of Petorleum': # clean_name = 'China University of Geosciences,Wuhan'.lower() elif univ.encode('utf8') == 'Northwest University, Xi\'an': clean_name = 'Northwest University'.lower() else: clean_name = clean_uni_name_en(univ) elif rank_table == 'the_2017': if univ.encode('utf8') == 'China University of Mining and Technology': clean_name = 'China University of Mining and Technology,Beijing'.lower() else: clean_name = clean_uni_name_en(univ) else: # clean_name = re.sub('\(.+?\)', '', univ.encode('utf8')).replace('&', ' and ').replace(' ', ' ').strip() clean_name = clean_uni_name_en(univ) # print clean_name chi_name = nama.get_chi_name(clean_name) if chi_name is None: print 'cannot find chinese name of:', univ exit() cur_uni = [chi_name, rank] rank_list.append(cur_uni) np.savetxt(ofname, rank_list, fmt='%s', delimiter=',') except Exception, e: print 'MySQL query error: ', sql print str(e) traceback.print_exc(file=sys.stdout) db_conn.rollback()
def extract_score(self): index = 0 no_results = 0 for fname in self.fnames: # # TEST # print 'working on:', fname parsed = urlparse.urlparse('http://gaokao.do?' + fname) ssdm = int(urlparse.parse_qs(parsed.query)['ssdm'][0]) year = int(urlparse.parse_qs(parsed.query)['year'][0]) kldm = int(urlparse.parse_qs(parsed.query)['kldm'][0]) yxmc = urlparse.parse_qs(parsed.query)['yxmc'][0] # # TEST # print year, yxmc # parse single page soup = BeautifulSoup(open(self.pages_path + fname), 'lxml') # extract university line uni_results = self.extract_university_line(soup, fname) if uni_results == False: print '!!!parse error occurs while parsing university lines!!!' break elif uni_results == True: no_results += 1 elif len(uni_results) > 0: if year not in self.university_lines.keys(): self.university_lines[year] = {} if yxmc not in self.university_lines[year].keys(): self.university_lines[year][yxmc] = {} if ssdm not in self.university_lines[year][yxmc].keys(): self.university_lines[year][yxmc][ssdm] = {} self.university_lines[year][yxmc][ssdm][kldm] = uni_results # # TEST # print self.has_yiben(self.university_lines[year][yxmc]) # pass else: print '!!!unexpected return university line results!!!' break index += 1 if index % 500 == 0: print '#files finished:', index # # TEST # if index > 50: # break print '#file parsed:', index print '#file without result:', no_results for kv_year in self.university_lines.iteritems(): print 'year:', kv_year[0], '#universities:', len(kv_year[1]) print '#picis overall:', len(self.pici) for pici in self.pici: print pici '''traverse university lines to check whether there is any "本科一批" in the lines of every university: we consider two conditions seperately: 1. has "本科一批" in the selected year 2. always has "本科一批" in previous years ''' uni_yiben_year_sel = set() # universities have "本科一批" in the selected year # for kv_year in self.university_lines.iteritems(): for yxmc_kv in self.university_lines[self.year_sel].iteritems(): has_yiben = self.has_yiben(yxmc_kv[1]) if has_yiben: uni_yiben_year_sel.add(yxmc_kv[0]) uni_yiben = set() # universities always have "本科一批" in previous years for uni_name in uni_yiben_year_sel: always_yiben = True for year in self.university_lines.iterkeys(): if not year == self.year_sel: if not (uni_name in self.university_lines[year].keys() and self.has_yiben(self.university_lines[year][uni_name])): always_yiben = False if always_yiben: uni_yiben.add(uni_name) print '#universities have yiben:', len(uni_yiben_year_sel) print '#universities always have yiben:', len(uni_yiben) with open('university_yiben.csv', 'w') as uni_yiben_out: for uni in uni_yiben_year_sel: uni_yiben_out.write('%s\n' % (clean_uni_name(uni))) with open('university_yiben_always.csv', 'w') as uni_yiben_out: for uni in uni_yiben: uni_yiben_out.write('%s\n' % (clean_uni_name(uni))) uni_dif = uni_yiben_year_sel - uni_yiben for uni in uni_dif: print uni