Example #1
class filter:
    def __init__(self):
        self.db_helper = DBHelper()
    
    def deal_entrys(self,entrys):
        entrys = self.filter_illegal(entrys)
        return entrys
        
    # Entries with any required field empty are illegal; record them in a special table
    def filter_illegal(self,entrys):
        LOG.info("Begin to filter illegal entrys..")
        not_null_fields = [ORIGIN_URL,ORIGIN_WEBSITE_NAME,RECRUIT_TITLE]
        legal_entrys = []
        illegal_entrys = []
        while len(entrys) > 0:
            entry = entrys.pop()
            info_type = entry.get(INFO_TYPE)
            flag = True
            for field in not_null_fields:
                if entry.get(field) is None:
                    illegal_entrys.append(entry)
                    flag = False
                    break
            
            if info_type == 0 and flag:
                if entry.get(RELEASE_DATE) is None:
                    illegal_entrys.append(entry)
                    flag = False
            if info_type == 1 and flag:
                if entry.get(MEETING_TIME) is None:
                    illegal_entrys.append(entry)
                    flag = False
            if flag:
                legal_entrys.append(entry)
        LOG.info("Finish filering entrys.[%s] entrys are illegal"%(len(illegal_entrys)))    
        db_illegal = DB_1
        table_illegal = TB_ILLEGAL
        LOG.info("Insert illegal Entrys into[%s.%s]"%(db_illegal,table_illegal))
        fields = list(INFO_FIELDS)
        fields.remove(GROUP_ID)
        fields.extend(['author','tmp_path'])
        self.db_helper.batch_insert(illegal_entrys,table_illegal,db_illegal,fields)
        return legal_entrys
    
    def rm_college_from_loc(self,entrys):
        LOG.info("Dealing Location field to remove college from loc")
        count = 0
        for entry in entrys:
            info_type = entry.get(INFO_TYPE)
            if info_type == 0:
                continue
            college = entry.get(COLLEGE_NAME)
            loc = entry.get(MEETING_LOCATION)
            # guard against missing fields before the prefix check
            if college and loc and loc.startswith(college):
                count += 1
                entry[MEETING_LOCATION] = loc.replace(college,'')
        LOG.info("Removed college name prefix from [%s] meeting_location fields"%count)
        return entrys
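
A minimal usage sketch of this filter, assuming DB_1 and the field constants above are in scope (the table name 'extracted_info' is borrowed from Example #13):

f = filter()
db_helper = DBHelper()
entrys = db_helper.get_results_by_clms(columns='*',table='extracted_info',database=DB_1,isdict=True)
# drop entrys with missing required fields, then strip college names from locations
entrys = f.deal_entrys(entrys)
entrys = f.rm_college_from_loc(entrys)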
Example #2
 def test_diff_tables(self):
     dc = datachecker()
     db_helper = DBHelper()
     new_table = 'extracted_full_info_today'
     old_table = 'info_tmp'
     old_db = DB_test
     new_db = DB_1
     db_helper.recr_table(old_table,old_db,'info_template',DB_1)
 entrys = dc.diff_tables(table_old=old_table,table_new=new_table,db_old=old_db,db_new=new_db)
     db_helper.batch_insert(entrys,table=old_table,db=old_db)
Example #3
 def load_lasttime_failed_entrys(self):
     LOG.info("Loading lasttime failed entrys")
     lasttime_info_ids = self.load_ids_from_rec()
     if len(lasttime_info_ids) == 0:
         return []
     db_helper = DBHelper()
     entrys = db_helper.get_data_by_infoids(lasttime_info_ids,self.table,
                                            self.db,isdict=True)
     LOG.info("Loaded [%s] lasttime failed entrys" %(len(entrys)))
     return entrys
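
A hypothetical sketch of the load_ids_from_rec helper this method relies on, assuming the rec file stores one info_id per line (REC_FILE_PATH is an assumed constant, not from the source):

 def load_ids_from_rec(self):
     ids = []
     try:
         rec_file = open(REC_FILE_PATH)  # assumed path constant
         for line in rec_file:
             line = line.strip()
             if line:
                 ids.append(line)
         rec_file.close()
     except IOError:
         LOG.info("No rec file found, nothing to reload")
     return ids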
Example #4
class DBTestCase(unittest.TestCase):
    def setUp(self):
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*',table='samples',
                                                   database=DB_test,isdict=True)
    def tearDown(self):
        self.db = None

    def test_get_max_group_id(self):
        res = self.db.get_max_group_id('refined_info_2','xyzb')
        res_2 = self.db.get_max_group_id('info_template','xyzb')
        print res
        print res_2

    def test_batch_insert(self):
        entrys = self.db.get_results_by_clms(columns='*',table='refined_info',
                                             database=DB_1,isdict=True)[:12056]
        table = 'info_tmp1'
        self.db.recr_table(table,DB_1,'info_template',DB_1)
        self.db.batch_insert(entrys,table,DB_1)

    def test_batch_get(self):
        entrys = self.db.get_results_by_clms(columns='*',table='refined_info',
                                             database=DB_1,isdict=True)[:2]
        for key, value in entrys[1].items():
            print key,value,type(value)
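
A minimal sketch for running this test case directly, assuming it lives in its own module:

if __name__ == "__main__":
    unittest.main()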
Example #5
 def setUp(self):
     self.db = DBHelper()
     self.samples = self.db.get_results_by_clms(columns='*',table='samples',
                                                database=DB_test,isdict=True)
     self.segger = segger()
     self.indexer = indexer()
     self.dd = data_drawer()
Example #6
class DMGTestCase(unittest.TestCase):
    def setUp(self):
        self.sb = submitter()
        self.dd = data_drawer()
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*',table='samples',
                                                   database=DB_test,isdict=True)
    def tearDown(self):
        self.sb = None
        self.dd = None
        self.db = None

    def test_submit_samples(self):
        failed_ids = self.sb.submit_to_RB(self.samples)
        self.assertTrue(len(self.samples) > 0)
        print failed_ids

    def test_submit_large_entrys(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                            database=DB_1, isdict=True)[:]
        failed_ids = self.sb.submit_to_RB(entrys)
        self.assertTrue(len(entrys) > 0)
        print failed_ids

    def test_new_added_fields(self):
        for entry in self.samples:
            entry['last_mod_time'] = 10086
            entry['group_id'] = 87217
            entry['collegeName'] = 'ouyangming'
        failed_ids = self.sb.submit_to_RB(self.samples)

    def test_retrive(self):
        entrys = self.dd.get_entrys(2000,is_full=False)
        for entry in entrys:
            #self.assertIn(entry['info_id'],info_ids)
            for key,value in entry.items():
                print key,value
            print "###############################"

    def test_submit_error_handle(self):
        error_url = "http://www.panguso.com"
        self.sb.table = 'samples'
        self.sb.db = DB_test
        self.sb.deal_submit_req(self.samples[:0],error_url)

    def test_post_error(self):
        posturl = P
Example #7
class GroupTestCase(unittest.TestCase):
    def setUp(self):
        self.dc = datachecker()
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*',table='samples',
                                                   database=DB_test,isdict=True)
    def tearDown(self):
        self.dc = None
        self.samples = None

    def test_group_id(self):
        table_new = 'samples_2'
        table_old = 'info'
        entrys = self.dc.diff_tables(table_old,table_new,DB_test,DB_test)
        rm_field(INFO_ID,entrys)
        datastore.update_info_table(entrys,table_old,DB_test)
Example #8
class ExtractTestCase(unittest.TestCase):
    def setUp(self):
        self.db = DBHelper()
        columns = ['recruit_title']
        self.recruit_titles = self.db.get_results_by_clms(columns,'refined_info',DB_1)

    def tearDown(self):
        self.db = None
        

    def test_extract_name(self):
        prefix = '$#@&'
        for recruit_title in self.recruit_titles[:]:
            title = recruit_title[0]
            comp_name = extract.extract_compname(title)
            if comp_name.startswith(prefix):
                title = prefix + " " + title
            print "[Origin]:%s"%title
            print "[CompNM]:%s"%comp_name
Example #9
    def __init__(self):
        self.db = DBHelper()
        self.dc = datachecker()
        self.sb = submitter()
        self.dd = data_drawer()
        self.sg = segger()
        self.id = indexer()
        self.pc = prechecker()

        self.dmg_on = DMG_ON
        self.test_on = TEST_ON
        if len(sys.argv) < 5:  # argv[1..4] supply the table/db names below
            self.table_old = "refined_info"
            self.db_old = DB_1
            self.table_new = "extracted_info"
            self.db_new = DB_1
        else:
            self.table_old = sys.argv[1]
            self.db_old = sys.argv[2]
            self.table_new = sys.argv[3]
            self.db_new = sys.argv[4]
Example #10
import time
import sys
import array

reload(sys)
sys.setdefaultencoding("utf-8")

host = "10.10.211.101"
port = 9090


dw = data_drawer()
#entrys = dw.get_entrys()
mdoc = None
res = []
db_helper = DBHelper()

entrys = db_helper.get_results_by_clms(columns="*",table='refined_info',isdict=True)[:]
city_id = load_city_map(CITY_FILE_PATH)
college_id = load_college_map(COLLEGE_FILE_PATH)
missed_colleges = set()

# use protobuf to serialize entrys
def serilize_entrys(entrys):
    res = []
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        for key,value in entry.items():

            # skip empty values
            if value is None:
Example #11
 def setUp(self):
     self.dc = datachecker()
     self.db = DBHelper()
     self.samples = self.db.get_results_by_clms(columns='*',table='samples',
                                                database=DB_test,isdict=True)
Example #12
 def __init__(self):
     self.filter = filter()
     self.db_helper = DBHelper()
     self.cmp_table = 'refined_list_info'
     self.table = 'extracted_info'
     self.cmp_clms = [COMPANY_NAME,MEETING_TIME,MEETING_LOCATION,ORIGIN_URL,RELEASE_DATE,RECRUIT_TITLE]
Example #13
class prechecker:
    
    def __init__(self):
        self.filter = filter()
        self.db_helper = DBHelper()
        self.cmp_table = 'refined_list_info'
        self.table = 'extracted_info'
        self.cmp_clms = [COMPANY_NAME,MEETING_TIME,MEETING_LOCATION,ORIGIN_URL,RELEASE_DATE,RECRUIT_TITLE]
    
    def rm_dup_list_info(self,tb_new,db_new,tb_old,db_old):
        new_list = self.db_helper.get_results_by_clms("*",tb_new,db_new,True)
        old_list = self.db_helper.get_results_by_clms("*",tb_old,db_old,True)
        old_dict = {}
        
        for entry in old_list:
            url = entry.get(ORIGIN_URL)
            if url is not None:
                old_dict[url] = None
        updates = []
        for entry in new_list:
            url = entry.get(ORIGIN_URL)
            if url in old_dict:
                continue
            updates.append(entry)
            old_dict[url] = None
        if len(updates) > 0:
            # take the fields from the first update instead of a stale loop variable
            fields = updates[0].keys()
            self.db_helper.batch_insert(updates,"refined_list_info",DB_1,fields)
        return updates


    def repair_data(self,entrys,cmp_entrys=None):
        if cmp_entrys is None:
            cmp_entrys = self.cmp_entrys
        LOG.info("Repairing data...")
        LOG.info("Entrys to repair size is [%s],cmp_entrys size is [%s]"%(len(entrys),len(cmp_entrys)))
        cmple_info_dict = collections.defaultdict(dict)
        for entry in cmp_entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url is not None:
                cmple_info_dict[origin_url].update(entry)
        for entry in entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url in cmple_info_dict:
                for clm in cmple_info_dict[origin_url]:
                    if entry.get(clm) is None:
                        entry[clm] = cmple_info_dict[origin_url][clm]
        return entrys

    def pre_process(self):
        
        self.rm_dup_list_info('extracted_list_info',DB_1,self.cmp_table,DB_1)
        cmp_entrys = self.db_helper.get_results_by_clms(self.cmp_clms,self.cmp_table,DB_1,isdict=True)
        entrys = self.db_helper.get_results_by_clms("*",self.table,DB_1,isdict=True)
        entrys = self.repair_data(entrys,cmp_entrys)
        entrys = self.filter.rm_college_from_loc(entrys)
        self.db_helper.exe_sql('delete from %s.%s'%(DB_1,self.table))
        if len(entrys) > 0:
            fields = entrys[0].keys()
            self.db_helper.batch_insert(entrys,self.table,DB_1,fields)
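
A minimal usage sketch, assuming the tables named in __init__ exist under DB_1:

pc = prechecker()
# dedupe the list-info table, backfill missing fields, and rewrite extracted_info
pc.pre_process()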
Example #14
from common.DataBaseHelper import DBHelper
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import re
import codecs

db_helper = DBHelper()
res = db_helper.get_results_by_clms('*','site_domain','xyzb_test')

def get_dict_from_file(path):
    # despite its name, this returns a list of whitespace-split rows
    f = codecs.open(path,'r','utf-8')
    rows = []
    rs = re.compile('[ ]+')
    for line in f.readlines():
        rows.append(rs.split(line))
    f.close()
    return rows

def test_match():
    # site_stats and site_domains are assumed to be loaded elsewhere in this module
    out = open('domain_college','w')
    for line in site_stats:
        domain = line[0].strip()
        domain_pat = re.compile(domain)
        suc = False
        for site_domain in site_domains:
            if domain_pat.search(site_domain):
                out.write("%s\t%s\n"%(domain,site_domains[site_domain]))
                suc = True
                break
Example #15
 def setUp(self):
     self.db = DBHelper()
     columns = ['recruit_title']
     self.recruit_titles = self.db.get_results_by_clms(columns,'refined_info',DB_1)
Example #16
 def __init__(self):
     self.db_helper = DBHelper()
Example #17
class scheduler():
    """ To schedule the data flow
    """
    def __init__(self):
        self.db = DBHelper()
        self.dc = datachecker()
        self.sb = submitter()
        self.dd = data_drawer()
        self.sg = segger()
        self.id = indexer()
        self.pc = prechecker()

        self.dmg_on = DMG_ON
        self.test_on = TEST_ON
        if len(sys.argv) < 5:  # argv[1..4] supply the table/db names below
            self.table_old = "refined_info"
            self.db_old = DB_1
            self.table_new = "extracted_info"
            self.db_new = DB_1
        else:
            self.table_old = sys.argv[1]
            self.db_old = sys.argv[2]
            self.table_new = sys.argv[3]
            self.db_new = sys.argv[4]


    def data_flow(self):

        threads = []
        self.pc.pre_process()
        #remove duplicates
        entrys = self.dc.diff_tables(self.table_old,self.table_new,self.db_old,self.db_new)
        #insert the newly arrived entrys into the database
        self.db.batch_insert(entrys,self.table_old,self.db_old)
        
        if self.test_on:
            t = threading.Thread(target=self.db.batch_insert,args=(entrys,self.table_old,DB_test))
            t.start()
            threads.append(t)

        if self.dmg_on:
            #submit the updates to dmg
            self.sb.deal_submit_req(entrys)
            #get all the data back from dmg
            entrys = self.dd.get_entrys()
            self.db.recr_table(TB_DMG,DB_1,TB_TEMPLATE,DB_1)
            t = threading.Thread(target=self.db.batch_insert,args=(entrys,TB_DMG,DB_1))
            t.start()
            threads.append(t)
        else:
            entrys = self.db.get_results_by_clms(columns='*',table= self.table_old,
                                                 database=self.db_old,isdict=True)
            entrys = sorted(entrys,key=self.sort_func,reverse=True)
        #serialize entrys to protobufs
        entrys = self.sg.serilize_entrys(entrys)
        #remote call word segging
        entrys = self.sg.word_seg_list(entrys)
        #remote call indexing
        self.id.index_list_string(entrys)

        # wait until all threads are over
        for t in threads:
            t.join()

    def sort_func(self,entry):
        info_type = entry.get(INFO_TYPE)
        if info_type == 1:
            time = entry.get(MEETING_TIME)
        else:
            time = entry.get(RELEASE_DATE)
        if time is None:
            return 0
        else:
            return get_timestamp(time)
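
A minimal launch sketch, assuming this class sits in a script that may receive the four optional arguments parsed in __init__:

if __name__ == "__main__":
    # optionally: python scheduler.py <table_old> <db_old> <table_new> <db_new>
    sc = scheduler()
    sc.data_flow()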
Example #18
                entry[key] = value
            entry_json = json.dumps(entry)
            self.dumpjson_to_file(entry_json)
        return res

    def dumpjson_to_file(self, item):
        res = str(item)
        output_file = "json_" + datetime.datetime.now().strftime("%m_%d_%H%M")
        output = os.path.join(self.output_dir, output_file)
        json_file = None  # so the finally block is safe if open() fails
        try:
            json_file = open(output, "a")
            json_file.write(res)
            json_file.write("\n")
        except IOError, e:
            LOG.error(e)
            sys.exit(-1)
        finally:
            if json_file:
                json_file.close()


JD = JsonDumper(output_dir)

if __name__ == "__main__":
    db_helper = DBHelper()
    entrys = db_helper.get_results_by_clms(columns="*", table="info_2", database="xyzb", isdict=True)[:]
    jd = JsonDumper(output_dir)
    jd.dump_entrys(entrys)
Example #19
 def __init__(self):
     self.load_dicts()
     self.db = DBHelper()
Example #20
class statist():
 
    def __init__(self):
        self.load_dicts()
        self.db = DBHelper()
    
    def load_dicts(self):
        #load domain_college_map
        file_path = 'domain_college'
        self.domain_college_dict = self.load_file_as_dict(file_path)
        
        #load sitename_college_map
        file_path = 'sitename_college'
        self.site_college_dict = self.load_file_as_dict(file_path)
        
        #load college_list
        file_path = 'college_list'
        self.college_list = self.load_file_as_list(file_path)
        
    
    def build_crawler_stat(self):
        #load crawler stats
        file_path = 'crawler_stat'
        rows = self.load_file_as_list(file_path)
        heads = ['College','Request','Response','Suc_Rate','Uniq_Url']
        entrys = []
        rq_sum = 0
        rp_sum = 0
        uniq_url_sum = 0
        for row in rows:
            #TODO: guard against rows with fewer than four fields
            entry = []
            dm = row[0]
            rq = row[1]
            rp = row[2]
            uniq_url = row[3]

            rq_sum += int(rq)
            rp_sum += int(rp)
            uniq_url_sum += int(uniq_url)
            
            college = self.domain_college_dict.get(dm)
            entry.append(college)
            entry.append(rq)
            entry.append(rp)
            #avoid division by zero when there were no requests
            if rq == '0':
                suc_rate = str(100.00) + "%"
            else:
                #keep two decimal places
                suc_rate = str(round(float(rp)/float(rq)*100,2)) + "%"
            
            entry.append(suc_rate)
            entry.append(uniq_url)
            entrys.append(entry)
        
        if rq_sum == 0:
            suc_rate_aver = str(100.00) + "%"
        else:
            suc_rate_aver = str(round(float(rp_sum)/float(rq_sum)*100,2)) + "%"
        
        sum_entry = ['SUM',str(rq_sum),str(rp_sum),suc_rate_aver,str(uniq_url_sum)]
        entrys.append(sum_entry)
        return heads,entrys
    
    def build_parse_stat(self):
        # the sql template takes db, table and an optional where clause
        tb_full_today = 'extracted_full_info_today'
        tb_today = 'refined_info_today'
        
        sql_tmp = "select origin_website_name,count(*) from %s.%s %s group by origin_website_name"
        dl = []
        # get sitename and parsenum dict
        sql = sql_tmp%(DB_1,tb_full_today,' ')
        rows = self.db.exe_sql(sql)
        dict_1 = dict(rows)
        dl.append(dict_1)

        #get sitename and error campus info num
        where_clause = 'where info_type = 1 and isnull(meeting_time)'
        sql = sql_tmp%(DB_1,tb_full_today,where_clause)
        rows = self.db.exe_sql(sql)
        dict_2 = dict(rows)
        dl.append(dict_2)

        # get sitename and error recruit info num
        where_clause = 'where info_type = 0 and isnull(release_date)'
        sql = sql_tmp%(DB_1,tb_full_today,where_clause)
        rows = self.db.exe_sql(sql)
        dict_3 = dict(rows)
        dl.append(dict_3)

        #merge all the dicts into one row per sitename
        
        rows = []
        for k in dl[0].keys():
            row = []
            row.append(k)
            for d in dl:
                if d.get(k) is None:
                    row.append(0)
                else:
                    row.append(d.get(k))
            rows.append(row)
        entrys = []
        site_tt_sum = 0
        err_1_sum = 0
        err_2_sum = 0

        for row in rows:
            entry = []
            #TODO: handle rows shorter than expected
            site = row[0]
            site_tt = row[1]
            err_1 = row[2]
            err_2 = row[3]

            site_tt_sum += site_tt
            err_1_sum += err_1
            err_2_sum += err_2

            college = self.site_college_dict.get(site)
            entry.append(college)
            entry.append(site_tt)
            entry.append(err_1)
            entry.append(err_2)
            # site_tt cannot be zero here, so the division is safe
            suc_rate = str(round((1 - float(err_1 + err_2) / float(site_tt)) * 100,2)) + "%"
            entry.append(suc_rate)
            entrys.append(entry)
        # get sum info
        suc_rate_tt = str(round((1 - float(err_1_sum + err_2_sum) / float(site_tt_sum)) * 100,2)) + "%"
        sum_entry = ['SUM',site_tt_sum,err_1_sum,err_2_sum,suc_rate_tt]
        entrys.append(sum_entry)
        heads = ['College','Parse Info','Err Campus Info','Err Recruit Info','Suc_Rate']
        return heads,entrys
       
    def build_duplicates_stat(self):
        # the sql template takes db, table and an optional where clause
        tb_full_today = 'extracted_full_info_today'
        tb_today = 'refined_info_today'
        
        sql_tmp = "select origin_website_name,count(*) from %s.%s %s group by origin_website_name"
        dl = []
        # get sitename and parsenum dict
        sql = sql_tmp%(DB_1,tb_full_today,' ')
        rows = self.db.exe_sql(sql)
        dict_1 = dict(rows)
        dl.append(dict_1)

        #get sitename and refined info num
        sql = sql_tmp%(DB_1,tb_today,' ')
        rows = self.db.exe_sql(sql)
        dict_2 = dict(rows)
        dl.append(dict_2)

        #merge all the dicts into one row per sitename
        
        rows = []
        for k in dl[0].keys():
            row = []
            row.append(k)
            for d in dl:
                if d.get(k) is None:
                    row.append(0)
                else:
                    row.append(d.get(k))
            rows.append(row)
        entrys = []

        full_sum = 0
        refined_sum = 0

        for row in rows:
            entry = []
            #TODO: handle rows shorter than expected
            site = row[0]
            full_tt = row[1]
            refined = row[2]

            full_sum += full_tt
            refined_sum += refined

            college = self.site_college_dict.get(site)
            entry.append(college)
            entry.append(full_tt)
            entry.append(refined)
            # full_tt cannot be zero here, so the division is safe
            suc_rate = str(round(float(refined) / float(full_tt) * 100,2)) + "%"
            entry.append(suc_rate)
            entrys.append(entry)
        # get sum info
        suc_rate_tt = str(round(float(refined_sum) / float(full_sum) * 100,2)) + "%"
        sum_entry = ['SUM',full_sum,refined_sum,suc_rate_tt]
        entrys.append(sum_entry)
        heads = ['College','Full Info','Refined Info','Uniqs_Prop']
        return heads,entrys
    
    def build_total_stat(self):
        # the sql template takes db, table and an optional where clause
        
        tb_refined = 'refined_info'
        sql_tmp = "select origin_website_name,count(*) from %s.%s %s group by origin_website_name"
        dl = []
        # get sitename and parsenum dict
        sql = sql_tmp%(DB_1,tb_refined,' ')
        rows = self.db.exe_sql(sql)
        dict_1 = dict(rows)
        dl.append(dict_1)

        #get sitename and campus info num
        where_clause = "where info_type = 1"
        sql = sql_tmp%(DB_1,tb_refined, where_clause)
        rows = self.db.exe_sql(sql)
        dict_2 = dict(rows)
        dl.append(dict_2)
        
        #get sitename and recruit info num
        where_clause = "where info_type = 0"
        sql = sql_tmp%(DB_1,tb_refined, where_clause)
        rows = self.db.exe_sql(sql)
        dict_3 = dict(rows)
        dl.append(dict_3)

        
        #get sitename and future campus info num
        date_str = datetime.datetime.now().strftime('%Y-%m-%d')
        where_clause = "where info_type = 1 and meeting_time > '%s'"%date_str
        sql = sql_tmp%(DB_1,tb_refined, where_clause)
        rows = self.db.exe_sql(sql)
        dict_4 = dict(rows)
        dl.append(dict_4)

        #merge all the dicts into one row per sitename
        
        rows = []
        for k in dl[0].keys():
            row = []
            row.append(k)
            for d in dl:
                if d.get(k) is None:
                    row.append(0)
                else:
                    row.append(d.get(k))
            rows.append(row)
        entrys = []

        tt_sum = 0
        cam_tt_sum = 0
        rec_tt_sum = 0
        fut_cam_tt_sum = 0

        for row in rows:
            entry = []
            #TODO: handle rows shorter than expected
            site = row[0]
            tt = row[1]
            cam_tt = row[2]
            rec_tt = row[3]
            fut_cam_tt = row[4]

            tt_sum += tt
            cam_tt_sum += cam_tt
            rec_tt_sum += rec_tt
            fut_cam_tt_sum += fut_cam_tt

            college = self.site_college_dict.get(site)
            entry.append(college)
            entry.append(tt)
            entry.append(cam_tt)
            entry.append(rec_tt)
            entry.append(fut_cam_tt)
            entrys.append(entry)
        # get sum info
        sum_entry = ['SUM',tt_sum,cam_tt_sum,rec_tt_sum,fut_cam_tt_sum]
        entrys.append(sum_entry)
        heads = ['College','Total_Info','Campus_Info','Recruit_Info','Future_Info']
        return heads,entrys
    
    def to_dict(self,rows):
        data = [(row[0][0],row[0][1]) for row in rows if len(row[0]) > 1]
        return dict(data)

    def load_file_as_dict(self,file_path):
        # locals renamed to avoid shadowing the map and file builtins
        d = {}
        f = codecs.open(file_path,'r','utf-8')
        for line in f.readlines():
            parts = line.split('\t')
            d[parts[0].strip()] = parts[1].strip()
        f.close()
        return d
    
    def load_file_as_list(self,file_path):
        li = []
        f = codecs.open(file_path,'r','utf-8')
        for line in f.readlines():
            li.append(line.strip().split('\t'))
        f.close()
        return li
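
A minimal usage sketch, assuming the domain_college, sitename_college, college_list and crawler_stat files are present in the working directory:

st = statist()
heads,entrys = st.build_crawler_stat()
print heads
for entry in entrys:
    print entry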
Example #21
class IndexTestCase(unittest.TestCase):
    def setUp(self):
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*',table='samples',
                                                   database=DB_test,isdict=True)
        self.segger = segger()
        self.indexer = indexer()
        self.dd = data_drawer()

    def tearDown(self):
        self.db = None
        self.samples = None
        self.segger = None
        self.indexer = None

    def test_convert_to_protoentry(self):
        self.segger.serilize_entrys(self.samples)

    def test_serilize_entrys(self):
        entrys = self.db.get_results_by_clms(columns='*',table='refined_info',
                                             database=DB_1,isdict=True)
        res = self.segger.serilize_entrys(entrys[:10])
        res = self.segger.word_seg_list(res)
        retdoc = merged_doc_pb2.MergedDocInfo()
        for entry in res:
            retdoc.ParseFromString(entry)
            for i in range(0, len(retdoc.terms_info.term_infos)):
                print 'term_sign:', retdoc.terms_info.term_infos[i].term_sign
                print 'term_weight:',retdoc.terms_info.term_infos[i].term_weight
            print '##########'

    def test_record_protos(self):
        out = open("proto.output",'wb')
        res = self.segger.serilize_entrys(self.samples[3:10])
        res = self.segger.word_seg_list(res)
        print "length after segging res is ",len(res)
        for entry in res:
            # prefix each serialized doc with its 4-byte length
            arr = array.array('I')
            arr.fromlist([len(entry)])
            out.write(arr.tostring())
            out.write(entry)
        out.close()

    def test_serilize_field(self):
        #entrys = self.db.get_results_by_clms(columns='*',table='info_tes',
        #                                     database=DB_2,isdict=True)
        res = self.segger.serilize_entrys(self.samples)
        res = self.segger.word_seg_list(res)



    def test_word_seg_list(self):
        protobufs = self.segger.serilize_entrys(self.samples[:2])
        res = self.segger.word_seg_list(protobufs)
        self.indexer.index_list_string(res)

    def test_index_start_stop(self):
        protobufs = self.segger.serilize_entrys(self.samples[:2])
        #res = self.segger.word_seg_list(protobufs)
        self.indexer.transport.open()
        print self.indexer.start_index()
        print self.indexer.stop_index()
        self.indexer.transport.close()
        #print self.indexer.index_list_string(res)

    def test_index_list_string(self):
        entrys = self.db.get_results_by_clms(columns='*',table='refined_info',
                                             database=DB_1,isdict=True)[:]
        protobufs = self.segger.serilize_entrys(entrys)
        res = self.segger.word_seg_list(protobufs)
        self.indexer.index_list_string(res)

    def test_split_city(self):
        conv = TypeConvertor()
        s = u"北京"  # renamed from str to avoid shadowing the builtin
        print s.strip().split(',')
        print conv.city_id.get(s)
        ids = conv.map_cities(s)
        print ids

    def test_0_entrys_index(self):
        protobufs = self.segger.serilize_entrys([])
        res = self.segger.word_seg_list(protobufs)
        self.indexer.index_list_string(res)
        
    def test_city_id(self):
        conv = TypeConvertor()
        print conv.city_id

    def test_college_id(self):
        conv = TypeConvertor()
        print conv.map_college(u"清华大学")
        self.assertEqual(1,conv.map_college(u"北京大学"))
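
A hypothetical sketch of reading back the length-prefixed records written by test_record_protos, assuming 'I' is a 4-byte unsigned int on this platform:

f = open("proto.output",'rb')
retdoc = merged_doc_pb2.MergedDocInfo()
while True:
    head = f.read(4)
    if len(head) < 4:
        break
    arr = array.array('I')
    arr.fromstring(head)  # Python 2 array API, matching tostring() in the test
    retdoc.ParseFromString(f.read(arr[0]))
    print retdoc
f.close()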