Example #1
def insert_to_campusTalk(entrys):
    if len(entrys) == 0:
        return
    LOG.info("Startging to Store to multi tables")
    table = T_campusTalk
    id = None
    count = 0
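    # For each entry, insert the college and company sub-table rows first;
    # their generated ids become foreign keys in the main campusTalk row.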
    for entry in entrys:
        if count % 2000 == 0:
            LOG.info("[%s] Has been inserted!"%count)
        collegeID = insert_subtable(entry,T_collegeInfo)
        if collegeID is None:
            LOG.error("INSERT TO [CollegeInfo] FAILS:%s"\
                        %("|".join(entry.values())))
            return None

        companyID = insert_subtable(entry,T_companyInfo)
        if companyID is None:
            LOG.error("INSERT TO [CompanyInfo] FAILS:%s"\
                        %("|".join(entry.values())))
            return None
        new_entry = {}
        new_entry[COLLEGE_ID] = collegeID
        new_entry[COMPANY_ID] = companyID
        for key,value in entry.items():
            if value is None:
                continue
            if ct_field_map[key][0] == table:
                new_entry[ct_field_map[key][1]] = value
        if len(new_entry.keys()) > 0:
            id = db_helper.insert_entry(new_entry,table)
        count += 1
    return id
Example #2
def analyse_data(entrys):
    c_name_map = collections.defaultdict(list)
    c_name_infoid_list = []
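    # Pairwise scan: for each company name, print the ids of all later entries
    # whose name contains it, is contained by it, or is >0.8 similar.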
    for i in range(len(entrys)):
        name_i = str(entrys[i][COMPANY_NAME])
        id_i = str(entrys[i][INFO_ID])
        if name_i.strip() == "" or name_i == "None":
            continue
        if i % 10 == 0:
            LOG.info("[%s] has been dealed!"%i)
        print "[%s:%s]"%(name_i,id_i),id_i + ",",
        for j in range(i + 1, len(entrys)):
            name_j = str(entrys[j][COMPANY_NAME])
            if name_j.strip() == "" or name_j == "None":
                continue
            id_j = str(entrys[j][INFO_ID])
            if name_j in name_i or \
                    name_i in name_j or \
                        StrSim.get_sim(name_i,name_j) > 0.8:
                #print "(%s:%s)"%(name_j,id_j),'\t',
                print id_j + ",",
        print
Example #3
 def load_lasttime_failed_entrys(self):
     LOG.info("Loading lasttime failed entrys")
     lasttime_info_ids = self.load_ids_from_rec()
     if len(lasttime_info_ids) == 0:
         return []
     db_helper = DBHelper()
     entrys = db_helper.get_data_by_infoids(lasttime_info_ids,self.table,
                                            self.db,isdict=True)
     LOG.info("Loaded [%s] lasttime failed entrys" %(len(entrys)))
     return entrys
Example #4
def update_info_table(entrys,table=T_info,database=DB_1):
    if len(entrys) == 0:
        return
    LOG.info("Begin to update to table [%s.%s]"%(database,table))
    count = 0
    for entry in entrys:
        #del entry[INFO_ID]
        id = db_helper.insert_entry(entry,table,database)
        # insert succeeded
        if id is not None:
            count += 1
    LOG.info("Successfully insert  [%s] Entrys To Table."%count)
Example #5
 def send_request(self,retry_time=3):
     count = 0
     res = None
     resp = None
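     # Retry loop: attempt the request up to retry_time times,
     # sleeping 1 second between attempts.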
     while count < retry_time:
         LOG.info("Send Request Round: [%s]" %(count))
         try:
             resp = urllib2.urlopen(self.url)
             res = resp.read()
             break
         except Exception as e:
             LOG.error(e)
         time.sleep(1)
         count += 1
     return res
Example #6
 def deal_old_list(self, cmp_columns, entrys):
     campus_entrys = []
     recruit_entrys = []
     LOG.info("Dealing Old List...")
     entry_idx = 0
     for entry in entrys:
         # campus talk (info session) data
         if entry[INFO_TYPE] == INFO_TYPE_CAM:
             campus_entrys.append(entry)
         else:
             recruit_entrys.append(entry)
         entry_idx += 1
     LOG.info("Finish Dealing Old List.")
     return campus_entrys, recruit_entrys
Example #7
 def rec_protobuf(self,res):
     if not os.path.isdir(RECORD_DIR):
         os.mkdir(RECORD_DIR)
     rec_file = os.path.join(RECORD_DIR,self.make_file_name())
     file = open(rec_file,'wb')
     LOG.info("Recording the proto to file [%s]" %(rec_file))
     for protobuf in res:
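         # Length-prefixed framing: write the record length as a native
         # unsigned int, then the raw protobuf bytes.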
         arr = array.array('I')
         length = len(protobuf)
         li = [length]
         arr.fromlist(li)
         file.write(arr.tostring())
         file.write(protobuf)
     file.close()
     LOG.info("Recording to [%s] successfully!"%(rec_file))
Example #8
 def rm_college_from_loc(self,entrys):
     LOG.info("Dealing Location field to remove college from loc")
     count = 0
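     # For non-zero info_type entries, strip a leading college name
     # from the meeting_location field.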
     for entry in entrys:
         info_type = entry.get(INFO_TYPE)
         if info_type == 0:
             continue
         college = entry.get(COLLEGE_NAME)
         loc = entry.get(MEETING_LOCATION)
         if college and loc and loc.startswith(college):
             count += 1
             # strip only the leading occurrence of the college name
             loc = loc[len(college):]
             entry[MEETING_LOCATION] = loc
     LOG.info("Removing [%s] College Name from meeting_location field!"%count)
     return entrys
Example #9
    def get_entrys(self,seconds_before=1800,is_full=True):
        """ to get data from content base ,the data which updated
            from seconds_before to now
        """
        LOG.info("Begin To Requset Data From DMG.")
        entrys = []
        # When is_full is True, fetch all the entries from dmg
        if is_full:
            now_str = None
            before_str = None
        else:
            now_str,before_str = self.make_time(seconds_before)

        self.construct_args(before_str,now_str,is_full)
        resp = self.send_request()
        if resp is not None:
            entrys = self.deal_resp(resp)
        return entrys
Example #10
 def serialize_entrys(self,entrys):
     protobufs = []
     LOG.info("Begin Serializing Entries,[%s] Entries To Be Serialized!"
              %(len(entrys)))
     for entry in entrys:
         mdoc = merged_doc_pb2.MergedDocInfo()
         new_entry = self.convertor.dmg_to_proto_entry(entry)
         for key,value in new_entry.items():
             try:
                 if isinstance(value,list):
                     getattr(mdoc,key).extend(value)
                 else:
                     setattr(mdoc,key,value)
             except Exception as e:
                 LOG.error("[%s]:%s" % (key,value))
                 LOG.error(e)
         protobuf = mdoc.SerializeToString()
         protobufs.append(protobuf)
     return protobufs
Example #11
 def load_columns(self, filepath=None):
     columns_list = []
     try:
         dom_tree = ElementTree.parse(filepath)
         root = dom_tree.getroot()
         columns = root.findall("field")
         if columns is None or len(columns) == 0:
             LOG.error("No columns found in xml conf [%s]" % (filepath))
             sys.exit(-1)
         columns_count = 0
         for column in columns:
             columns_list.append(column.text)
             columns_count += 1
         LOG.info("Total Load [%d] columns in conf [%s]" % (columns_count, filepath))
     except Exception as e:
         LOG.error(e)
         sys.exit(-1)
     return columns_list
Example #12
def write_entrys(entrys,filepath):
    file = open(filepath,'w')
    count = 0
    for entry in entrys:
        if isinstance(entry,dict):
            for key,value in entry.items():
                value = str(value).replace('\n','').replace('\t','')
                file.write("%s\t"%value)
            file.write("\n")
            count += 1
        elif isinstance(entry,list):
            for value in entry:
                value = str(value).replace('\n','').replace('\t','')
                file.write("%s***\t"%value)
            file.write("\n")
            count += 1
    file.close()
    LOG.info("write [%s] entrys"%(count))
Example #13
    def index_list_string(self, protobufs):

        if not check_list(protobufs):
            LOG.info("Do Not Call Indexing, For the input list is:%s" % (protobufs))
            return
        count = 0
        try:
            self.transport.open()
            is_ready = self.start_index()
            if not is_ready:
                LOG.error("Index Server Is Not Ready!")
                return 0
            count = self.client.put_list_string(protobufs)
            LOG.info("[%s] Entries Have Been Successfully Indexed." % (count))
            self.stop_index()
        except Exception as e:
            LOG.error(e)
        finally:
            # always close the transport, even after an early return or error
            self.transport.close()
Example #14
 def write_ids_to_rec(self):
     LOG.info("Recording Failed IDs Into File [%s]" %(FAILED_INFOID_REC))
     if not self.failed_ids:
         return
     # record the failed_ids to a file;
     # create the record directory first if it does not exist
     rec_dir = os.path.dirname(FAILED_INFOID_REC)
     file = None
     if not os.path.isdir(rec_dir):
         os.mkdir(rec_dir)
     try:
         file = open(FAILED_INFOID_REC,'w')
         file.write(";".join([str(id) for id in self.failed_ids]))
         LOG.info("Record Successfully Totally [%s] entrys"
                   %(len(self.failed_ids)))
     except Exception as e:
         LOG.error(e)
     finally:
         if file: file.close()
Example #15
    def submit_to_RB(self,entrys,posturl=POSTURL):
        """ submit data to dmg
        """
        failed_count = 0
        suc_count = 0
        suc_ids = []
        failed_ids = []
        failed_entrys = []
        count = 0
        state = ""
        res = None

        for entry in entrys[:]:
            # just for testing; should be deleted after testing
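            # POST the entry as JSON; track success/failure counts and
            # keep failed entries for later resubmission.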
            count += 1
            info_id = entry[INFO_ID]
            entry_json = self.construct_json_data(entry)
            resjson = {"resjson":json.dumps(entry_json)}
            try:
                state = self.post(posturl,resjson)
                res = json.loads(state)['result']
                if res == 0:
                    LOG.warning("[%s] Submission Error!" %(info_id))
                    LOG.warning(state)
                    failed_count += 1
                elif res == 1:
                    suc_count += 1
            except Exception as e:
                LOG.error(e)
                failed_count += 1
                LOG.error(state)
                failed_ids.append(info_id)
                failed_entrys.append(entry)

       #     LOG.debug("Post one entry into RB the result is: [%s],Suc_Count=\
        #            [%s],Fail_count=[%s]" % (res,suc_count,failed_count))
            if (suc_count + 1) % 1000 == 0 or (failed_count + 1) % 100 == 0:
                LOG.info("Post Entries To DMG,Suc:[%s] Failed:[%s]"
                        %(suc_count,failed_count))
        self.failed_ids = failed_ids
        self.failed_entrys = failed_entrys
        LOG.info("Successfully Submitted To DMG [%s],Failed:[%s]"
                %(suc_count,failed_count))
Example #16
def get_term_info(res):
    socket = TSocket.TSocket(host, port)
    transport = TTransport.TBufferedTransport(socket)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = TextProcessServer.Client(protocol)
    transport.open()
    file = open("output.protobuffer",'wb')
    resps = client.word_seg_list(res)

    for resp in resps:
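        # Write each segmented response as a length-prefixed record:
        # a native unsigned-int length header, then the payload.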
        arr = array.array('I')
        length = len(resp)
        li = [length]
        arr.fromlist(li)
        LOG.info("The response length is : [%s]"%length)
        file.write(arr.tostring())
        file.write(resp)
    file.close()
    transport.close()
    return resps
Example #17
 def repair_data(self,entrys,cmp_entrys=None):
     if cmp_entrys is None:
         cmp_entrys = self.cmp_entrys
     LOG.info("repairing Data...")
     LOG.info("Entrys to Repair size is [%s],cmp_entrys size is [%s]"%(len(entrys),len(cmp_entrys)))
     cmple_info_dict = collections.defaultdict(dict)
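     # Index the reference entries by origin_url so that missing fields
     # in entrys can be backfilled from them.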
     for entry in cmp_entrys:
         origin_url = entry.get(ORIGIN_URL)
         if origin_url is not None:
             cmple_info_dict[origin_url].update(entry)
     for entry in entrys:
         origin_url = entry.get(ORIGIN_URL)
         if origin_url in cmple_info_dict:
             for clm in cmple_info_dict[origin_url]:
                 # backfill only fields that are missing from the entry
                 if entry.get(clm) is None:
                     entry[clm] = cmple_info_dict[origin_url][clm]
     return entrys
Example #18
 def deal_new_list(self, cmp_columns, entrys):
     # campus talk data and job fair data
     campus_entrys = []
     recruit_entrys = []
     fields_set = set()
     title_set = set()
     LOG.info("Dealing New List...")
     entry_idx = 0
     key_fields = cmp_columns + [ORIGIN_URL]
     for entry in entrys:
         # campus talk data
         if entry_idx % 5000 == 0:
             LOG.info("Dealing new_entrys In Progress:[%s]" % entry_idx)
         if entry[INFO_TYPE] == INFO_TYPE_CAM:
             # deduplicate same-time data via similarity comparison
             key = make_key(entry, key_fields)
             if key not in fields_set:
                 fields_set.add(key)
                 # handle cases like 应届生海投 where one page corresponds to multiple records
                 entry = self.deal_special_url(entry)
                 campus_entrys.append(entry)
         # job fair data
         else:
             key = entry.get(RECRUIT_TITLE)
             if key not in title_set:
                 title_set.add(key)
                 recruit_entrys.append(entry)
         entry_idx += 1
     LOG.info("Finish Dealing New List.")
     return campus_entrys, recruit_entrys
Example #19
 def filter_illegal(self,entrys):
     LOG.info("Begin to Filter illegal entrys..")
     not_null_fields = [ORIGIN_URL,ORIGIN_WEBSITE_NAME,RECRUIT_TITLE]
     legal_entrys = []
     illegal_entrys = []
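     # An entry is illegal if any always-required field is missing, or if the
     # type-specific field (release_date / meeting_time) is missing.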
     while len(entrys) > 0:
         entry = entrys.pop()
         info_type = entry.get(INFO_TYPE)
         flag = True
         for field in not_null_fields:
             if entry.get(field) is None:
                 illegal_entrys.append(entry)
                 flag = False
                 break
         
         if info_type == 0 and flag:
             if entry.get(RELEASE_DATE) is None:
                 illegal_entrys.append(entry)
                 flag = False
         if info_type == 1 and flag:
             if entry.get(MEETING_TIME) is None:
                 illegal_entrys.append(entry)
                 flag = False
         if flag:
             legal_entrys.append(entry)
     LOG.info("Finish filering entrys.[%s] entrys are illegal"%(len(illegal_entrys)))    
     db_illegal = DB_1
     table_illegal = TB_ILLEGAL
     LOG.info("Insert illegal Entrys into[%s.%s]"%(db_illegal,table_illegal))
     fields = list(INFO_FIELDS)
     fields.remove(GROUP_ID)
     fields.extend(['author','tmp_path'])
     self.db_helper.batch_insert(illegal_entrys,table_illegal,db_illegal,fields)
     return legal_entrys
Example #20
 def deal_resp(self,resp):
     entrys = resp.strip('\n').split("\n")
     new_entrys = []
     LOG.info("[%s] Entrys We Totally Get From DMG"%len(entrys))
     for entry in entrys:
         new_entry = {}
         try:
             entry = json.loads(entry)
             specials = entry.get('specials')
             origins = entry.get('contentOrigns')
             if origins is not None:
                 origins = origins[0]
             if specials is not None:
                 new_entry.update(specials)
             if origins is not None:
                 new_entry.update(origins)
             #LOG.debug(new_entry)
             new_entry = self.convertor.dmg_to_db(new_entry)
         except Exception as e:
             LOG.debug(e)
             LOG.debug("broken json is:[%s]"%entry)
         if len(new_entry) != 0:
             new_entrys.append(new_entry)
     return new_entrys
Example #21
def extract(recruit_title):
    if not recruit_title:
        return recruit_title    
    # keep a copy of the original title
    origin_recruit_title = recruit_title.strip()
    recruit_title = pre_clean(recruit_title)
    if len(recruit_title) < 2:
        # if the cleaned result is empty, return the original title
        recruit_title = origin_recruit_title
    
    recruit_title = recruit_title.strip(u',、-—')
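    # Try the company-name tail patterns from most to least specific
    # and return the first match.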
    match_result = pat_company_pre_tail.search(recruit_title) 
    if match_result is not None:
        LOG.info('extract %s by pat_company_pre_tail' %(recruit_title))
        return match_result.group(0)
    match_result = pat_company_tail.search(recruit_title) 
    if match_result is not None:
        #LOG.info('extract %s by pat_company_tail' %(recruit_title))
        return match_result.group(0)
    match_result = pat_company_tail_sec.search(recruit_title)
    if match_result is not None:
        #LOG.info('extract %s by pat_company_tail_sec' %(recruit_title))
        return match_result.group(0)
    match_result = pat_company_tail_third.search(recruit_title)
    if match_result is not None:
        #LOG.info('extract %s by pat_company_tail_third' %(recruit_title))
        return match_result.group(0)
    # matches titles ending in ..xxx中心 ("center")
    match_result = pat_company_tail_forth.search(recruit_title)
    if match_result is not None:
        #LOG.info('extract %s by pat_company_tail_forth' %(recruit_title))
        return match_result.group(0)
    company_name = clean(recruit_title)
    if len(company_name) < 2:
        return recruit_title
    return company_name
Example #22
 def word_seg_list(self,protobufs):
     res = []
     if not check_list(protobufs):
         LOG.info("Do Not Call Word Segging,For the input list is:%s"%(protobufs))
         return res
     try:
         self.transport.open()
         LOG.info("Begin  RPC Word Segging,[%s] To Be Segged!"
                  %(len(protobufs)))
         res = self.client.word_seg_list(protobufs)
         self.transport.close()
         LOG.info("Finish RPC Word Segging,[%s] Entrys Have Been Segged!"
                  %(len(res)))
         self.rec_protobuf(res)
     except Exception as e:
         LOG.error(e)
     return res
Example #23
    def resubmit_failed_data(self,url=POSTURL,retry_time=3):
        """ resubmit the failed data to RB, will try 3 times, before every new
        try system will sleep 1 minute.Finally after 3 trys,the all-the-time
        failed data will be submitted with next program call
        """
        db_helper = DBHelper()
        try_count = 0
        while len(self.failed_ids) > 0 and try_count < retry_time:
            time.sleep(1)
            LOG.info("\nResubmit Retry Round [%s]." %try_count)
            LOG.info("[%s] Entrys Are To Re-Submit"%(len(self.failed_ids)))

            self.submit_to_RB(self.failed_entrys,url)
            try_count += 1
        LOG.info("Finish Resubmitting ,[%s] Remains UnSubmitted."
                  %len(self.failed_ids))
        if len(self.failed_ids) != 0:
            self.write_ids_to_rec()
Example #24
 def rm_field(self, entrys, field):
     LOG.info("Removing Field :[%s]" % (field))
     for entry in entrys:
         # use `in` rather than a truthiness check so fields holding falsy values (0, "") are removed too
         if field in entry:
             del entry[field]
Example #25
    def diff_tables(self, table_old, table_new, db_old, db_new):
        LOG.info("Begin To Remove Duplicates entrys...")
        entrys = []
        group_checker = GroupChecker()
        # group_id_offset = db_helper.get_max_group_id(table_old,db_old)
        group_id_offset = 0
        group_checker.set_group_id_offset(group_id_offset)
        # the key fields loaded here are the basis for duplicate detection
        cmp_columns = self.load_columns(KEY_FIELDS_PATH)
        # published data additionally selects the group_id field
        old_columns = cmp_columns + [ORIGIN_URL, GROUP_ID, RECRUIT_TITLE]
        old_list = self.get_list_fromDB(old_columns, table_old, db=db_old, isdict=True)[:]
        LOG.info("Get Published Data From Table[%s.%s] [%s] " % (db_old, table_old, len(old_list)))

        # newly crawled data additionally selects info_id, which uniquely identifies a record
        new_columns = cmp_columns + [INFO_ID, ORIGIN_URL, RECRUIT_TITLE]
        new_list = self.get_list_fromDB(new_columns, table_new, db=db_new, isdict=True)[:]
        LOG.info("Get New Data From Table[%s.%s] [%s] " % (db_new, table_new, len(new_list)))

        # preprocess the new data: classify and do a first-pass dedup
        cam_entrys_new, rec_entrys_new = group_checker.deal_new_list(cmp_columns, new_list)

        # classify the published data
        cam_entrys_old, rec_entrys_old = group_checker.deal_old_list(cmp_columns, old_list)

        #   LOG.info("[%s] entrys are campus_entrys"%len(cam_entrys_new))
        #  LOG.info("[%s] entrys are recruit_entrys"%len(rec_entrys_new))

        # dedup the campus talk data; returns the deduped info_id list and the matching group_ids
        cam_updates_infoids_dict = group_checker.deal_campus_list(cmp_columns, cam_entrys_new, cam_entrys_old)
        LOG.info("Totally we got [%s] updates campus entry_ids" % len(cam_updates_infoids_dict.keys()))
        # dedup the job fair data; returns its info_id list
        rec_updates_infoids = group_checker.get_rec_updates_ids(rec_entrys_new, rec_entrys_old)
        LOG.info("Totally we got [%s] updates recruit entry_ids" % len(rec_updates_infoids))

        # fetch all attributes of the campus talk records by info_id list
        cam_updates_infoids = cam_updates_infoids_dict.keys()
        cam_full_entrys = self.fetch_data_by_ids(cam_updates_infoids, table_new, db_new, isdict=True)
        LOG.info("Update Campus Entrys NO. Is [%s]" % (len(cam_full_entrys)))

        # add group_id info to the deduped new data
        group_checker.add_cam_groupid_info(cam_full_entrys, cam_updates_infoids_dict)

        # fetch all attributes of the job fair records by info_id list
        rec_full_entrys = self.fetch_data_by_ids(rec_updates_infoids, table_new, db_new, isdict=True)
        group_checker.add_rec_groupid_info(rec_full_entrys)

        LOG.info("Update Recreuit Entrys No. Is [%s]" % (len(rec_full_entrys)))

        # get the final update data
        entrys = cam_full_entrys + rec_full_entrys

        # filter out illegal entries
        entrys = self.filter.deal_entrys(entrys)
        LOG.info("Finish Removing Duplicate Entrys!")
        LOG.info("Totally We got Update Entrys NO.Is:[%s]." % len(entrys))
        return entrys
Example #26
 def start_index(self):
     LOG.info("Prepare To Index")
     res = self.client.send_start_sig()
     if res:
         LOG.info("Server %s Get Ready To Index" % (self.host))
     return res
Example #27
 def stop_index(self):
     LOG.info("Stop Index")
     self.client.send_stop_sig()
Example #28
def serialize_entrys(entrys):
    res = []
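    # Convert each DB entry into a MergedDocInfo protobuf: normalize types,
    # map city/college names to ids, and gbk-encode strings for term weighting.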
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        for key,value in entry.items():

            # skip null values
            if value is None:
                continue

            # convert long values read from the DB to int, except the info_id field
            if isinstance(value,long):
                if key != INFO_ID:
                    value = int(value)

            # convert datetimes to int timestamps
            if isinstance(value,datetime.datetime):
                value = get_timestamp(value)

            # map recruit_cities names to their ids
            if key == RECRUIT_CITIES:
                for city in value.strip().split(","):
                    if len(city) > 0:

                        id = city_id.get(city)
                        if id is None:
                            LOG.error("City Name [%s] Not Found!"%city)
                        else:
                            mdoc.recruit_cities.append(id)
                continue

            # map collegeName to its id
            if key == COLLEGE_NAME:
                value = value.strip()
                if len(value) > 0:
                    id = college_id.get(value)
                    if id is None:
                        LOG.error("College Name [%s] Not Found!"%value)
                        missed_colleges.add(value)

                        # college name not found: fall back to the default id
                        mdoc.college_ID = 1
                    else:
                        # fill the college_ID field of the protobuf
                        mdoc.college_ID = id

            # handle the work_place value
            if key == 'work_place':
                for place in value.strip().split(","):
                    if len(place) > 0:
                        mdoc.work_place.append(place)
                LOG.info(";".join(mdoc.work_place))
                continue

            try:
                # term-weight computation needs gbk-encoded values
                if isinstance(value,str):
                    value = value.decode('utf-8').encode('gbk')
                setattr(mdoc, key, value)

                mdoc.origin_url_sign = 0
                mdoc.recruit_title_sign = 0
                mdoc.info_text_sign = 0
                mdoc.group_id = 1
            except Exception as e:
                LOG.error(e)
                LOG.error("key is [%s] and value is[%s]"%(key,value))
        serialized = mdoc.SerializeToString()
        res.append(serialized)
    return res