예제 #1
0
 def select_can_univ(self, uni_list_path):
     merged_rank_list = np.genfromtxt(os.path.join(uni_list_path, 'university_merged_list.csv'), dtype=str, delimiter=',')
     print 'read a matrix with shape:', merged_rank_list.shape
     merged_rank_dict = read_ranking_list(os.path.join(uni_list_path, 'university_merged_list.csv'), dtype=int)
     yiben_set = set(np.genfromtxt(os.path.join(uni_list_path, 'university_yiben.csv'), dtype=str))
     print '#yiben universities:', len(yiben_set)
     yiben_not_covered = 0
     for uni in yiben_set:
         if uni not in merged_rank_dict.keys():
             yiben_not_covered += 1
             print uni
     data = np.genfromtxt('/home/ffl/nus/MM/complementary/chinese_university_ranking/data/features/univ_category.csv', delimiter=',',
                          dtype=str)
     print 'read a matrix with shape: ', data.shape
     univ_types = {}
     for d in data:
         univ_types[clean_uni_name(d[0])] = d[1]
     arts_chi_str = '艺术'
     with open(os.path.join(uni_list_path, 'candidate_universites.csv'), 'w') as fout:
         for uni in yiben_set:
             if not uni in univ_types.keys():
                 fout.write('%s\n' % (uni))
                 print 'without category:', uni
                 continue
             if not univ_types[uni] == arts_chi_str:
                 fout.write('%s\n' % (uni))
예제 #2
0
    def post_merged_list(self, uni_list_path):
        merged_rank_list = np.genfromtxt(os.path.join(uni_list_path, 'university_merged_list.csv'), dtype=str, delimiter=',')
        print 'read a matrix with shape:', merged_rank_list.shape
        merged_rank_dict = read_ranking_list(os.path.join(uni_list_path, 'university_merged_list.csv'), dtype=int)
        yiben_set = set(np.genfromtxt(os.path.join(uni_list_path, 'university_yiben.csv'), dtype=str))
        print '#yiben universities:', len(yiben_set)
        data = np.genfromtxt('/home/ffl/nus/MM/complementary/chinese_university_ranking/data/features/univ_category.csv', delimiter=',', dtype=str)
        print 'read a matrix with shape: ', data.shape
        univ_types = {}
        for d in data:
            univ_types[clean_uni_name(d[0])] = d[1]
        arts_chi_str = '艺术'

        '''here we have a couple of actions to do:
            filter out military universities
            filter out art universities
            filter out universities that are not belongs to the first level
        '''
        print '#initial universities in the merged list:', len(merged_rank_dict)
        for uni in merged_rank_dict.keys():
            if not uni in yiben_set:
                del merged_rank_dict[uni]
        print '#yiben universities and not military in the merged list:', len(merged_rank_dict)
        for uni in merged_rank_dict.keys():
            if uni not in univ_types.keys():
                print 'category missing:', uni
                continue
            if univ_types[uni] == arts_chi_str:
                print uni
                del merged_rank_dict[uni]
        print '#yiben unviersities and not art universities in the merged list:', len(merged_rank_dict)

        # post-processing rank
        sorted_merged_rank = sorted(merged_rank_dict.items(), key=operator.itemgetter(1))
        prev_rank = 1
        ori_pre_rank = 1
        # for i in range(len(sorted_merged_rank)):
        #     if not sorted_merged_rank[i][1] == prev_rank:
        #         sorted_merged_rank[i][1] = i + 1
        with open('university-selected_merged_list.csv', 'w') as fout:
            for i in range(len(sorted_merged_rank)):
                if not sorted_merged_rank[i][1] == ori_pre_rank:
                    fout.write('%s,%d\n' %(sorted_merged_rank[i][0], i + 1))
                    prev_rank = i + 1
                    ori_pre_rank = sorted_merged_rank[i][1]
                else:
                    fout.write('%s,%d\n' % (sorted_merged_rank[i][0], prev_rank))
예제 #3
0
 def __init__(self, uni_line_fname, pro_line_fname, kldm_value_fname):
     self.uni_line_fname = uni_line_fname
     self.pro_line_fname = pro_line_fname
     self.kldm_value_fname = kldm_value_fname
     # read university lines
     self.university_lines = {}
     # keys in the json data are in unicode format, besides, university names are not cleaned,
     # the names in the candidate university list are in utf-8, to handle this encoding mismatch
     # we transfer the data read in first.
     with open(self.uni_line_fname) as uni_file:
         data = json.load(uni_file)
         for year_kv in data.iteritems():
             if year_kv[0] not in self.university_lines.keys():
                 self.university_lines[year_kv[0]] = {}
             for uni_kv in year_kv[1].iteritems():
                 self.university_lines[year_kv[0]][clean_uni_name(
                     uni_kv[0].encode('utf-8'))] = uni_kv[1]
     print 'univerisity lines years:', self.university_lines.keys(
     ), '#years:', len(self.university_lines)
     if len(self.university_lines) < 1:
         self.init_flag = False
     # read province lines
     self.province_lines = {}
     with open(self.pro_line_fname) as pro_file:
         self.province_lines = json.load(pro_file)
     print 'province lines years:', self.province_lines.keys(
     ), '#years:', len(self.university_lines)
     if len(self.province_lines) < 1:
         self.init_flag = False
     # selected years to extract features
     self.years = [
         '2013'.encode('utf-8'), '2014'.encode('utf-8'),
         '2015'.encode('utf-8')
     ]
     # self.years = ['2013'.encode('utf-8'), '2014'.encode('utf-8')]
     # read kldm values
     self.kldm_values = read_kldm_uni_values(self.kldm_value_fname)
     print '#years with kldm values:', len(self.kldm_values)
     # initialize the union university name set
     self.uni_names = set()
     self.init_flag = self.init_union_uni_name()
     print '#union university names:', len(self.uni_names)
예제 #4
0
 def transfer_feature(self, in_path, out_path):
     for perspective in self.feature_fnames.itervalues():
         for types in perspective.itervalues():
             for fname in types:
                 print 'transferring feature from:', os.path.join(
                     in_path, fname)
                 data = np.genfromtxt(os.path.join(in_path, fname),
                                      delimiter=',',
                                      dtype=str)
                 print 'read data with shape:', data.shape
                 feature_matrix = []
                 missing_line = []
                 for i in range(1, data.shape[1]):
                     missing_line.append(-1)
                 indexes = {}
                 for ind, line in enumerate(data):
                     if line[0] == 'university':
                         continue
                     indexes[clean_uni_name(line[0])] = ind
                 print '#universities:', len(indexes)
                 without_feature = 0
                 for name in self.final_list:
                     if name not in indexes.keys():
                         print 'without:', name
                         feature_matrix.append(missing_line)
                         without_feature += 1
                     else:
                         ind = indexes[name]
                         feature_matrix.append(data[ind][1:])
                 print '#universities in the matrix:', len(feature_matrix)
                 print '#universities without feature:', without_feature
                 ofname = os.path.join(out_path, fname)
                 np.savetxt(ofname, feature_matrix, delimiter=',', fmt='%s')
                 print '-------------------------------------------'
                 print '-------------------------------------------'
                 print '-------------------------------------------'
예제 #5
0
 def export_rank_lists(self, of_path, eng_name_fname):
     db_conn = MySQLdb.connect(host='localhost', db='other_univ_ranking', user='******', passwd='13577531', charset='utf8', use_unicode=True)
     cursor = db_conn.cursor()
     for rank_table in self.rank_tables:
         sql = 'SELECT rank, univ, score FROM %s ORDER BY rank' % (rank_table)
         try:
             if rank_table == 'ipin_2016':
                 sql = 'SELECT rank, univ FROM %s ORDER BY rank' % (rank_table)
             cursor.execute(sql)
             results = cursor.fetchall()
             print '#rows:', len(results)
             ofname = os.path.join(of_path, rank_table + '.csv')
             rank_list = []
             if rank_table == 'cuaa_2016' or rank_table == 'wsl_2017' or rank_table == 'rank_2017' or rank_table == 'arwu_2016':
                 pre_rank = -1
                 for (rank, univ, score) in results:
                     # print '%d \t %f \t %s' % (rank, score, univ)
                     if rank < pre_rank:
                         print 'unexpected rank %d after rank %d' % (rank, pre_rank)
                         exit()
                     else:
                         pre_rank = rank
                     cur_uni = [clean_uni_name(univ.encode('utf8')), rank]
                     rank_list.append(cur_uni)
             elif rank_table == 'ipin_2016':
                 pre_rank = -1
                 for (rank, univ) in results:
                     # print '%d \t %f \t %s' % (rank, score, univ)
                     if rank < pre_rank:
                         print 'unexpected rank %d after rank %d' % (rank, pre_rank)
                         exit()
                     else:
                         pre_rank = rank
                     cur_uni = [clean_uni_name(univ.encode('utf8')), rank]
                     rank_list.append(cur_uni)
             elif rank_table == 'qs_2016' or rank_table == 'usn_2017' or rank_table == 'the_2017':
                 pre_rank = -1
                 nama = name_matcher(eng_name_fname)
                 for (rank, univ, score) in results:
                     # print '%d \t \t %s' % (rank, univ)
                     if rank < pre_rank:
                         print 'unexpected rank %d after rank %d' % (rank, pre_rank)
                         exit()
                     else:
                         pre_rank = rank
                     if rank_table == 'usn_2017':
                         if univ.encode('utf8') == 'China University of Mining':
                             clean_name = 'China University of Mining and Technology'.lower()
                         elif univ.encode('utf8') == 'China University of Geosciences':
                             clean_name = 'China University of Geosciences,Wuhan'.lower()
                         elif univ.encode('utf8') == 'China University of Geosciences':
                             clean_name = 'China University of Geosciences,Wuhan'.lower()
                         # if univ.encode('utf8') == 'China University of Petorleum':
                         #     clean_name = 'China University of Geosciences,Wuhan'.lower()
                         elif univ.encode('utf8') == 'Northwest University, Xi\'an':
                             clean_name = 'Northwest University'.lower()
                         else:
                             clean_name = clean_uni_name_en(univ)
                     elif rank_table == 'the_2017':
                         if univ.encode('utf8') == 'China University of Mining and Technology':
                             clean_name = 'China University of Mining and Technology,Beijing'.lower()
                         else:
                             clean_name = clean_uni_name_en(univ)
                     else:
                         # clean_name = re.sub('\(.+?\)', '', univ.encode('utf8')).replace('&', ' and ').replace('  ', ' ').strip()
                         clean_name = clean_uni_name_en(univ)
                     # print clean_name
                     chi_name = nama.get_chi_name(clean_name)
                     if chi_name is None:
                         print 'cannot find chinese name of:', univ
                         exit()
                     cur_uni = [chi_name, rank]
                     rank_list.append(cur_uni)
             np.savetxt(ofname, rank_list, fmt='%s', delimiter=',')
         except Exception, e:
             print 'MySQL query error: ', sql
             print str(e)
             traceback.print_exc(file=sys.stdout)
             db_conn.rollback()
예제 #6
0
    def extract_score(self):
        """Parse all saved pages and export the universities with yiben lines.

        Every name in ``self.fnames`` is treated as a URL query string that
        carries ``ssdm``, ``year``, ``kldm`` and ``yxmc``. The page stored
        under ``self.pages_path`` is parsed and the result of
        ``extract_university_line`` is stored in
        ``self.university_lines[year][yxmc][ssdm][kldm]``. Parsing stops at
        the first page whose result is False or is neither True nor a
        non-empty collection.

        Afterwards two files are written to the current working directory:
        ``university_yiben.csv`` lists the universities for which
        ``has_yiben`` holds in ``self.year_sel``, and
        ``university_yiben_always.csv`` lists those for which it also holds
        in every other parsed year. The difference of the two sets is
        printed. When no line was parsed for ``self.year_sel`` both files
        are written empty.
        """
        index = 0
        no_results = 0
        for fname in self.fnames:
            # the file name is the query string of the page URL; parse it once
            query = urlparse.parse_qs(
                urlparse.urlparse('http://gaokao.do?' + fname).query)
            ssdm = int(query['ssdm'][0])
            year = int(query['year'][0])
            kldm = int(query['kldm'][0])
            yxmc = query['yxmc'][0]
            # parse single page; the file is closed as soon as it is parsed
            with open(self.pages_path + fname) as page:
                soup = BeautifulSoup(page, 'lxml')
            # extract university line
            uni_results = self.extract_university_line(soup, fname)
            # NOTE(review): the sentinels are compared with == exactly as
            # before; confirm extract_university_line returns real booleans
            # before switching to identity checks.
            if uni_results == False:
                print '!!!parse error occurs while parsing university lines!!!'
                break
            elif uni_results == True:
                no_results += 1
            elif len(uni_results) > 0:
                by_kldm = self.university_lines.setdefault(
                    year, {}).setdefault(yxmc, {}).setdefault(ssdm, {})
                by_kldm[kldm] = uni_results
            else:
                print '!!!unexpected return university line results!!!'
                break
            index += 1
            if index % 500 == 0:
                print '#files finished:', index
        print '#file parsed:', index
        print '#file without result:', no_results
        for kv_year in self.university_lines.iteritems():
            print 'year:', kv_year[0], '#universities:', len(kv_year[1])
        print '#picis overall:', len(self.pici)
        for pici in self.pici:
            print pici

        # Traverse the university lines to check for a first-batch (yiben)
        # entry, considering two conditions separately:
        #   1. the university has one in the selected year
        #   2. the university also has one in every other parsed year
        # An empty mapping is used when the selected year was never parsed,
        # instead of failing with a KeyError.
        lines_year_sel = self.university_lines.get(self.year_sel, {})
        uni_yiben_year_sel = set()      # condition 1
        for yxmc_kv in lines_year_sel.iteritems():
            if self.has_yiben(yxmc_kv[1]):
                uni_yiben_year_sel.add(yxmc_kv[0])
        uni_yiben = set()       # condition 2
        for uni_name in uni_yiben_year_sel:
            always_yiben = True
            for year in self.university_lines.iterkeys():
                if year == self.year_sel:
                    continue
                year_lines = self.university_lines[year]
                if not (uni_name in year_lines and self.has_yiben(year_lines[uni_name])):
                    always_yiben = False
            if always_yiben:
                uni_yiben.add(uni_name)

        print '#universities have yiben:', len(uni_yiben_year_sel)
        print '#universities always have yiben:', len(uni_yiben)
        with open('university_yiben.csv', 'w') as uni_yiben_out:
            for uni in uni_yiben_year_sel:
                uni_yiben_out.write('%s\n' % (clean_uni_name(uni)))

        with open('university_yiben_always.csv', 'w') as uni_yiben_out:
            for uni in uni_yiben:
                uni_yiben_out.write('%s\n' % (clean_uni_name(uni)))
        # list the universities that satisfy condition 1 but not condition 2
        uni_dif = uni_yiben_year_sel - uni_yiben
        for uni in uni_dif:
            print uni