Example #1
0
def load_city_map(filepath):
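    """Load a tab-separated city list (id, city name, aliases) and return a {name: id} map."""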

    city_id_map = {}
    lines = []
    try:
        file = codecs.open(filepath, encoding='utf-8')
        lines = file.readlines()
        file.close()
    except Exception as e:
        LOG.error("Could not read the city list file!")
        LOG.error(e)
    if len(lines) == 0:
        return city_id_map

    for line in lines:
        line = line.strip("\n")
        names = []
        # id, city name, aliases
        (id, name, alias) = line.split("\t")
        id = int(id)
        names.append(name)
        alias = alias.split(";")

        for name in alias:
            names.append(name)
        for name in names:
            name = name.strip('\"')
            if len(name.strip()) == 0:
                continue
            city_id_map.update({name:id})
    return city_id_map
Example #2
0
    def deal_data(self,raw_data):
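        """Build [college, tt, cam_tt, rec_tt, fut_cam_tt] rows from raw_data, mapping each site to its college name and accumulating the totals."""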
        
        entrys = []
        tt_sum = 0
        cam_tt_sum = 0
        rec_tt_sum = 0
        fut_cam_tt_sum = 0

        for row in raw_data:
            try:
                entry = []
                site = row[0]
                tt = row[1]
                cam_tt = row[2]
                rec_tt = row[3]
                fut_cam_tt = row[4]

                tt_sum += tt
                cam_tt_sum += cam_tt
                rec_tt_sum += rec_tt
                fut_cam_tt_sum += fut_cam_tt

                college = self.site_college_dict.get(site)
                if college is None:
                    LOG.error('[%s] name not found!'%site)
                    college = site
                entry.append(college)
                entry.append(tt)
                entry.append(cam_tt)
                entry.append(rec_tt)
                entry.append(fut_cam_tt)
                entrys.append(entry)
            except Exception as e:
                LOG.error(e)
Example #3
0
def analyse_data(entrys):
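    """For each entry, print the info ids of other entries whose company names match by containment or by StrSim similarity above 0.8."""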
    c_name_map = collections.defaultdict(list)
    c_name_infoid_list = []
    for i in range(len(entrys)):
        name_i = str(entrys[i][COMPANY_NAME])
        id_i = str(entrys[i][INFO_ID])
        if name_i.strip() == "" or name_i == "None":
            continue
        if i % 10 == 0:
            LOG.info("[%s] entries have been processed!"%i)
        print "[%s:%s]"%(name_i,id_i),id_i + ",",
        for j in range(i + 1, len(entrys)):
            name_j = str(entrys[j][COMPANY_NAME])
            if name_j.strip() == "" or name_j == "None":
                continue
            id_j = str(entrys[j][INFO_ID])
            if name_j in name_i or \
                    name_i in name_j or \
                        StrSim.get_sim(name_i,name_j) > 0.8:
                #print "(%s:%s)"%(name_j,id_j),'\t',
                print id_j + ",",
        print
Example #4
0
    def deal_data(self,raw_data):
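        """Build [college, full_tt, refined, success rate] rows from raw_data and accumulate the full/refined totals."""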
       
        entrys = []
        full_sum = 0 
        refined_sum = 0 

        for row in raw_data:
            try:
                entry = []
                site = row[0]
                full_tt = row[1]
                refined = row[2]
                
                full_sum += full_tt
                refined_sum += refined

                college = self.site_college_dict.get(site)
                if college is None:
                    LOG.error('[%s] name not found!'%site)
                    college = site
                entry.append(college)
                entry.append(full_tt)
                entry.append(refined)
                # no need to guard against a zero denominator: full_tt cannot be zero here
                suc_rate = self.get_rate(refined,full_tt)
                entry.append(suc_rate)
                entrys.append(entry)
            except Exception as e:
                LOG.error(e)
Example #5
0
    def deal_data(self,raw_data):
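        """Build [college, site_tt, err_1, err_2, success rate] rows, where success rate = (site_tt - err_1 - err_2) / site_tt."""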
        entrys = []
        site_tt_sum = 0 
        err_1_sum = 0 
        err_2_sum = 0 
        for row in raw_data:
            try:
                entry = []
                site = row[0]
                site_tt = row[1]
                err_1 = row[2]
                err_2 = row[3]

                site_tt_sum += site_tt
                err_1_sum += err_1
                err_2_sum += err_2

                college = self.site_college_dict.get(site)
                if college is None:
                    LOG.error('[%s] name not found!'%site)
                    college = site
                entry.append(college)
                entry.append(site_tt)
                entry.append(err_1)
                entry.append(err_2)
                # no need to guard against a zero denominator: site_tt cannot be zero here
                suc_rate = self.get_rate(site_tt - err_1 - err_2,site_tt)
                entry.append(suc_rate)
                entrys.append(entry)
            except Exception as e:
                LOG.error(e)
Example #6
0
 def load_lasttime_failed_entrys(self):
     LOG.info("Loading lasttime failed entrys")
     lasttime_info_ids = self.load_ids_from_rec()
     if len(lasttime_info_ids) == 0:
         return []
     db_helper = DBHelper()
     entrys = db_helper.get_data_by_infoids(lasttime_info_ids,self.table,
                                            self.db,isdict=True)
     LOG.info("Loaded [%s] lasttime failed entrys" %(len(entrys)))
     return entrys
Example #7
0
def make_key(entry, key_fields=None):
    """ based on the key_fields generate key for a entry"""
    if key_fields is None:
        LOG.warning("Compared Fileds Are None!")
        key_fields = [MEETING_TIME, MEETING_LOCATION, RECRUIT_URL]
    key = ""
    for field in key_fields:
        if entry.get(field) is None:
            continue
        key += str(entry[field])
    return key
Example #8
0
def update_info_table(entrys,table=T_info,database=DB_1):
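    """Insert each entry into database.table and log how many inserts succeeded."""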
    if len(entrys) == 0:
        return
    LOG.info("Begin to update to table [%s.%s]"%(database,table))
    count = 0
    for entry in entrys:
        #del entry[INFO_ID]
        id = db_helper.insert_entry(entry,table,database)
        # insert succeeded
        if id is not None:
            count += 1
    LOG.info("Successfully inserted [%s] entries into the table."%count)
Example #9
0
 def load_ids_from_rec(self):
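     """Load the info ids of the last run's failed entries from FAILED_INFOID_REC (a ';'-separated file)."""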
     file = None
     info_ids = []
     try:
         file = open(FAILED_INFOID_REC,'r')
         info_ids_str = file.read()
         info_ids = info_ids_str.split(";")
     except Exception as e:
         LOG.error(e)
     finally:
         if file:
             file.close()
     return info_ids
Example #10
0
 def map_job_type(self, value):
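     """Map a job-type string to its id: "兼职" (part-time) -> 1, "实习" (internship) -> 2, "全职" (full-time) -> 0 (default)."""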
     value = value.strip()
     job_type_id = 0
     if len(value) > 0:
         if value == "兼职":
             job_type_id = 1
         elif value == "实习":
             job_type_id = 2
         elif value == "全职":
             job_type_id = 0
         else:
             LOG.error("The Job Type Value is not valid,[%s]" % (value))
     return job_type_id
Example #11
0
 def dumpjson_to_file(self, item):
     # res = json.dumps(json_list)
     res = str(item)
     output_file = "json_" + datetime.datetime.now().strftime("%m_%d_%H%M")
     output = os.path.join(self.output_dir, output_file)
     try:
         json_file = open(output, "a")
         json_file.write(res)
         json_file.write("\n")
         json_file.close()
     except IOError as e:
         LOG.error(e)
         sys.exit(-1)
Example #12
0
 def send_request(self,retry_time=3):
     count = 0
     res = None
     resp = None
     while count < retry_time:
         LOG.info("Send Request Round: [%s]" %(count))
         try:
             resp = urllib2.urlopen(self.url)
             res = resp.read()
             break
         except Exception as e:
             LOG.error(e)
         time.sleep(1)
         count += 1
Example #13
0
 def deal_old_list(self, cmp_columns, entrys):
     campus_entrys = []
     recruit_entrys = []
     LOG.info("Dealing Old List...")
     entry_idx = 0
     for entry in entrys:
         # campus-talk entries
         if entry[INFO_TYPE] == INFO_TYPE_CAM:
             campus_entrys.append(entry)
         else:
             recruit_entrys.append(entry)
         entry_idx += 1
     LOG.info("Finish Dealing Old List.")
     return campus_entrys, recruit_entrys
Example #14
0
def insert_to_campusTalk(entrys):
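    """Insert campus-talk entries: store the college and company sub-records first, then the campus-talk row that references their ids."""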
    if len(entrys) == 0:
        return
    LOG.info("Startging to Store to multi tables")
    table = T_campusTalk
    id = None
    count = 0
    for entry in entrys:
        if count % 2000 == 0:
            LOG.info("[%s] Has been inserted!"%count)
        collegeID = insert_subtable(entry,T_collegeInfo)
        if collegeID is None:
            LOG.error("INSERT TO [CollegeInfo] FAILS:%s"\
                        %("|".join(entry.values())))
            return None

        companyID = insert_subtable(entry,T_companyInfo)
        if companyID is None:
            LOG.error("INSERT TO [CompanyInfo] FAILS:%s"\
                        %("|".join(entry.values())))
            return None
        new_entry = {}
        new_entry[COLLEGE_ID] = collegeID
        new_entry[COMPANY_ID] = companyID
        for key,value in entry.items():
            if value is None:
                continue
            if ct_field_map[key][0] == table:
                new_entry[ct_field_map[key][1]] = value
        if len(new_entry.keys()) > 0:
            id = db_helper.insert_entry(new_entry,table)
        count += 1
    return id
Example #15
0
    def dmg_to_proto_entry(self, entry):
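        """Convert a DMG entry into a protobuf-ready dict: cast numeric fields to int, map college/city/job/company fields to ids, and turn date fields into timestamps."""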
        new_entry = {}

        convert_int_keys = set([INFO_ID, GROUP_ID, HAS_HUKOU, HAS_EXAM, HAS_RESUME, INFO_TYPE, CLICK_RATE])

        convert_time_keys = set(
            [MEETING_TIME, RELEASE_DATE, RESUME_START_DATE, RESUME_END_DATE, EXAM_TIME, INTERVIEW_TIME, LAST_MOD_TIME]
        )

        for key, value in entry.items():

            if value is None:
                continue
            if isinstance(value, basestring):
                if value.strip() == "":
                    continue

            # convert to  int count:9
            if key in convert_int_keys:
                try:
                    value = int(value)
                except ValueError:
                    LOG.error("ValueError,key is [%s] value is [%s]" % (key, value))

            # convert to id count:2
            elif key == COLLEGE_NAME:
                college_id = self.map_college(value)
                new_entry[COLLEGE_ID] = college_id
            elif key == RECRUIT_CITIES:
                try:
                    value = self.map_cities(value)
                except:
                    LOG.error("In Map cities[%s]:%s" % (key, value))
            # convert to type:
            elif key == JOB_TYPE:
                value = self.map_job_type(value)
            elif key == COMPANY_TYPE:
                value = self.map_company_type(value)

            # convert to list count:1
            elif key == WORK_PLACE:
                value = self.deal_work_place(value)

            # convert time to timestamp  count:6
            elif key in convert_time_keys:
                try:
                    if not isinstance(value, datetime.datetime):
                        value = datetime.datetime.strptime(value, DATEFORMAT)
                    value = get_timestamp(value)
                except Exception as e:
                    LOG.error("[%s]:[%s]" % (key, e))

            # last_mod_time has micro-second scale
            # NOTE: unreachable as written, because LAST_MOD_TIME is already in
            # convert_time_keys and is handled by the branch above
            elif key == LAST_MOD_TIME:
                try:
                    value = datetime.datetime.strptime(value, DATEFORMAT)
                    value = get_timestamp(value)
                except Exception as e:
                    LOG.error("[%s]:[%s]" % (key, e))
Example #16
0
 def rm_college_from_loc(self,entrys):
     LOG.info("Dealing Location field to remove college from loc")
     count = 0
     for entry in entrys:
         info_type = entry.get(INFO_TYPE)
         if info_type == 0:
             continue
         college = entry.get(COLLEGE_NAME)
         loc = entry.get(MEETING_LOCATION)
         if college and loc and loc.startswith(college):
             count += 1
             loc = loc.replace(college,'')
             entry[MEETING_LOCATION] = loc
     LOG.info("Removing [%s] College Name from meeting_location field!"%count)
     return entrys
Example #17
0
 def rec_protobuf(self,res):
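     """Write each protobuf in res to a length-prefixed record file under RECORD_DIR."""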
     if not os.path.isdir(RECORD_DIR):
         os.mkdir(RECORD_DIR)
     rec_file = os.path.join(RECORD_DIR,self.make_file_name())
     file = open(rec_file,'wb')
     LOG.info("Recording the proto to file [%s]" %(rec_file))
     for protobuf in res:
         arr = array.array('I')
         length = len(protobuf)
         li = [length]
         arr.fromlist(li)
         file.write(arr.tostring())
         file.write(protobuf)
     file.close()
     LOG.info("Recording to [%s] successfully!"%(rec_file))
Example #18
0
    def is_similar(self, columns, new, old):
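        """Return True if, for every compared column, the new and old values are equal, one contains the other, or their StrSim similarity reaches THRESHOLD."""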
        res = True
        for key in columns:

            LOG.debug("Comparing[%s],new is [%s],old is [%s]" % (key, new[key], old[key]))
            if new[key] == old[key]:
                continue
            if new[key] is None or old[key] is None:
                res = False
                break
            if new[key] in old[key] or old[key] in new[key]:
                continue
            if StrSim.get_sim(str(new[key]), str(old[key])) < THRESHOLD:
                res = False
                break
        return res
Example #19
0
def load_table(filepath=None):
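    """Parse the table-definition XML file and return a {table_name: [column, ...]} dict."""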
    try:
        domtree = ElementTree.parse(filepath)
        root = domtree.getroot()
        tables = root.findall('table')
        table_info = {}
        for table in tables:
            columns = []
            table_name = table.find('name').text
            columns_list = table.find('columns').findall('column')
            for column in columns_list:
                columns.append(column.text)
            table_info[table_name] = columns
        return table_info
    except Exception as e:
        LOG.error(e)
        sys.exit(-1)
Example #20
0
def write_entrys(entrys,filepath):
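    """Write entries (dicts or lists) to filepath as tab-separated lines, stripping embedded newlines and tabs."""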
    file = open(filepath,'w')
    count = 0
    for entry in entrys:
        if isinstance(entry,dict):
            for key,value in entry.items():
                value = str(value).replace('\n','').replace('\t','')
                file.write("%s\t"%value)
            file.write("\n")
            count += 1
        if isinstance(entry,list):
            for value in entry:
                value = str(value).replace('\n','').replace('\t','')
                file.write("%s***\t"%value)
            file.write("\n")
            count += 1
    file.close()
    LOG.info("write [%s] entrys"%(count))
Example #21
0
    def get_entrys(self,seconds_before=1800,is_full=True):
        """ to get data from content base ,the data which updated
            from seconds_before to now
        """
        LOG.info("Begin To Requset Data From DMG.")
        entrys = []
        # When is_full is True, fetch all the entries from DMG
        if is_full:
            now_str = None
            before_str = None
        else:
            now_str,before_str = self.make_time(seconds_before)

        self.construct_args(before_str,now_str,is_full)
        resp = self.send_request()
        if resp is not None:
            entrys = self.deal_resp(resp)
        return entrys
Example #22
0
def get_term_info(res):
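    """Send res to the Thrift text-processing service for word segmentation, record the length-prefixed responses to output.protobuffer, and return them."""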
    socket = TSocket.TSocket(host, port)
    transport = TTransport.TBufferedTransport(socket)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = TextProcessServer.Client(protocol)
    transport.open()
    file = open("output.protobuffer",'wb')
    resps = client.word_seg_list(res)

    for resp in resps:
        arr = array.array('I')
        length = len(resp)
        li = [length]
        arr.fromlist(li)
        LOG.info("The response length is : [%s]"%length)
        file.write(arr.tostring())
        file.write(resp)
    file.close()
    transport.close()
    return resps
Example #23
0
 def repair_data(self,entrys,cmp_entrys=None):
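     """Fill missing fields of entrys from the cmp_entrys entry that shares the same ORIGIN_URL."""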
     if cmp_entrys is None:
         cmp_entrys = self.cmp_entrys
     LOG.info("repairing Data...")
     LOG.info("Entrys to Repair size is [%s],cmp_entrys size is [%s]"%(len(entrys),len(cmp_entrys)))
     cmple_info_dict = collections.defaultdict(dict)
     for entry in cmp_entrys:
         origin_url = entry.get(ORIGIN_URL)
         if origin_url is not None:
             cmple_info_dict[origin_url].update(entry)
     for entry in entrys:
         origin_url = entry.get(ORIGIN_URL)
         if origin_url in cmple_info_dict:
             for clm in cmple_info_dict[origin_url]:
                 value = entry.get(clm)
                 if value is None:
                     new_value = cmple_info_dict[origin_url][clm]
                     value = new_value
                     entry[clm] = value
     return entrys
Example #24
0
 def filter_illegal(self,entrys):
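     """Split entrys into legal and illegal entries (missing required fields), persist the illegal ones, and return the legal list."""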
     LOG.info("Begin to Filter illegal entrys..")
     not_null_fields = [ORIGIN_URL,ORIGIN_WEBSITE_NAME,RECRUIT_TITLE]
     legal_entrys = []
     illegal_entrys = []
     while len(entrys) > 0:
         entry = entrys.pop()
         info_type = entry.get(INFO_TYPE)
         flag = True
         for field in not_null_fields:
             if entry.get(field) is None:
                 illegal_entrys.append(entry)
                 flag = False
                 break
         
         if info_type == 0 and flag:
             if entry.get(RELEASE_DATE) is None:
                 illegal_entrys.append(entry)
                 flag = False
         if info_type == 1 and flag:
             if entry.get(MEETING_TIME) is None:
                 illegal_entrys.append(entry)
                 flag = False
         if flag:
             legal_entrys.append(entry)
     LOG.info("Finished filtering entries. [%s] entries are illegal"%(len(illegal_entrys)))
     db_illegal = DB_1
     table_illegal = TB_ILLEGAL
     LOG.info("Insert illegal Entrys into[%s.%s]"%(db_illegal,table_illegal))
     fields = list(INFO_FIELDS)
     fields.remove(GROUP_ID)
     fields.extend(['author','tmp_path'])
     self.db_helper.batch_insert(illegal_entrys,table_illegal,db_illegal,fields)
     return legal_entrys
Example #25
0
 def deal_new_list(self, cmp_columns, entrys):
     # campus-talk entries and job-fair entries
     campus_entrys = []
     recruit_entrys = []
     fields_set = set()
     title_set = set()
     LOG.info("Dealing New List...")
     entry_idx = 0
     key_fields = cmp_columns + [ORIGIN_URL]
     for entry in entrys:
         # campus-talk entries
         if entry_idx % 5000 == 0:
             LOG.info("Dealing new_entrys In Progress:[%s]" % entry_idx)
         if entry[INFO_TYPE] == INFO_TYPE_CAM:
             # deduplicate same-time entries via similarity comparison
             key = make_key(entry, key_fields)
             if key not in fields_set:
                 fields_set.add(key)
                 # handle the yingjiesheng (应届生) mass-apply case, where one page maps to multiple records
                 entry = self.deal_special_url(entry)
                 campus_entrys.append(entry)
         # job-fair entries
         else:
             key = entry.get(RECRUIT_TITLE)
             if key not in title_set:
                 title_set.add(key)
                 recruit_entrys.append(entry)
         entry_idx += 1
     LOG.info("Finish Dealing New List.")
     return campus_entrys, recruit_entrys
Example #26
0
 def word_seg_list(self,protobufs):
     res = []
     if not check_list(protobufs):
         LOG.info("Do Not Call Word Segging,For the input list is:%s"%(protobufs))
         return res
     try:
         self.transport.open()
         LOG.info("Begin  RPC Word Segging,[%s] To Be Segged!"
                  %(len(protobufs)))
         res = self.client.word_seg_list(protobufs)
         self.transport.close()
         LOG.info("Finish RPC Word Segging,[%s] Entrys Have Been Segged!"
                  %(len(res)))
         self.rec_protobuf(res)
     except Exception as e:
         LOG.error(e)
     return res
Example #27
0
    def deal_data(self, raw_data):
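        """Build [college, rq, rp, success rate, uniq_url] rows from raw_data and accumulate request/response/unique-url totals."""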
        entrys = []
        rq_sum = 0
        rp_sum = 0
        uniq_url_sum = 0
        rows = self.prepare_data()
        for row in raw_data:
            # TODO try catch the index exceed the row length
            try:
                entry = []
                dm = row[0]
                rq = row[1]
                rp = row[2]
                uniq_url = row[3]

                rq_sum += int(rq)
                rp_sum += int(rp)
                uniq_url_sum += int(uniq_url)

                college = self.domain_college_dict.get(dm)
                if college is None:
                    LOG.error("[%s] domain not found" % dm)
                    college = dm
                entry.append(college)
                entry.append(rq)
                entry.append(rp)
                # zero can not be divided

                suc_rate = self.get_rate(rp, rq)
                entry.append(suc_rate)
                entry.append(uniq_url)
                entrys.append(entry)
            except Exception as e:
                LOG.error(e)

        # compute the overall success rate after all rows have been processed
        suc_rate_aver = self.get_rate(rp_sum, rq_sum)
Example #28
0
def extract(recruit_title):
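    """Extract a company name from a recruit title by trying a cascade of company-name regex patterns, falling back to the cleaned title."""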
    if not recruit_title:
        return recruit_title    
    # keep a copy of the original title
    origin_recruit_title = recruit_title.strip()
    recruit_title = pre_clean(recruit_title)
    if len(recruit_title) < 2:
        # if the cleaned result is empty, fall back to the original title
        recruit_title = origin_recruit_title
    
    recruit_title = recruit_title.strip(u',、-—')
    match_result = pat_company_pre_tail.search(recruit_title) 
    if match_result is not None:
        LOG.info('extract %s by pat_company_pre_tail' %(recruit_title))
        return match_result.group(0)
    match_result = pat_company_tail.search(recruit_title) 
    if match_result is not None:
        #LOG.info('extract %s by pat_company_tail' %(recruit_title))
        return match_result.group(0)
    match_result = pat_company_tail_sec.search(recruit_title)
    if match_result is not None:
        #LOG.info('extract %s by pat_company_tail_sec' %(recruit_title))
        return match_result.group(0)
    match_result = pat_company_tail_third.search(recruit_title)
    if match_result is not None:
        #LOG.info('extract %s by pat_company_tail_third' %(recruit_title))
        return match_result.group(0)
    # matches names ending in "中心" (center)
    match_result = pat_company_tail_forth.search(recruit_title)
    if match_result is not None:
        #LOG.info('extract %s by pat_company_tail_forth' %(recruit_title))
        return match_result.group(0)
    company_name = clean(recruit_title)
    if len(company_name) < 2:
        return recruit_title
    return company_name
Example #29
0
    def index_list_string(self, protobufs):

        if not check_list(protobufs):
            LOG.info("Do Not Call Indexing, For the input list is:%s" % (protobufs))
            return
        count = 0
        try:
            self.transport.open()
            is_ready = self.start_index()
            if not is_ready:
                LOG.error("Index Server Is Not Ready!")
                return 0
            count = self.client.put_list_string(protobufs)
            LOG.info("[%s] Entrys Has Been Successfully Indexed." % (count))
            self.stop_index()
            self.transport.close()
        except Exception as e:
            LOG.error(e)
        return count
Example #30
0
 def serilize_entrys(self,entrys):
     protobufs = []
     LOG.info("Begin Serilizing Entrys,[%s] Entrys To Be Serilized!"
              %(len(entrys)))
     for entry in entrys:
         mdoc = merged_doc_pb2.MergedDocInfo()
         new_entry = self.convertor.dmg_to_proto_entry(entry)
         for key,value in new_entry.items():
             try:
                 if isinstance(value,list):
                     getattr(mdoc,key).extend(value)
                 else:
                     setattr(mdoc,key,value)
             except Exception as e:
                 LOG.error("[%s]:%s" % (key,value))
                 LOG.error(e)
         protobuf = mdoc.SerializeToString()
         protobufs.append(protobuf)
     return protobufs