Пример #1
0
 def accessSolrByMonth(self, query, month, prows, raw_fq=[], raw_param={}):
     """
     # Search news month by month so that the hits are spread out over time.
     # One query is issued per 30-day window, walking backwards from now,
     # and the results of all successful queries are concatenated.
     # @param query : query handed to self.process
     # @param month : number of months (30-day windows) to search
     # @param prows : number of rows requested for each window
     # @param raw_fq : handed to self.process as fq
     # @param raw_param : handed to self.process as solr_param
     # return : list holding the results of every window whose
     #          self.process call reported success
     """
     seconds_per_day = 24 * 3600
     month_day = 30
     reslist = []
     nowtime = time.time()
     timeHelper = TimeHelper()
     for x in xrange(month):
         # time.time() is in seconds, so the 30-day step has to be converted
         # to seconds; subtracting the bare day count moved the window by
         # only 30 seconds and every "month" ended on the same date.
         end_time = nowtime - x * month_day * seconds_per_day
         current_end_day = timeHelper.getDateTimeFromSeconds(
             end_time, "%Y-%m-%d")
         flag, curreslist = self.process(query,
                                         rows=prows,
                                         last_day=month_day,
                                         end_day=current_end_day,
                                         fq=raw_fq,
                                         solr_param=raw_param)
         if flag:
             reslist.extend(curreslist)
     return reslist
Пример #2
0
 def __init__(self, solr_url, reconnect_maxcnt=2):
     """
     Remember the Solr address and set up the retry bookkeeping.

     solr_url         : Solr select address,
                        eg : http://192.168.201.63:10000/solr/select?
     reconnect_maxcnt : stored as the retry limit (default 2)
     """
     # retry bookkeeping
     self.reconnect_maxcnt = reconnect_maxcnt
     self.reconnect_cnt = 0
     self.timeHelper = TimeHelper()
     # addressIsList() is evaluated once here, after solr_url is stored
     self.solr_url = solr_url
     self.addressType = self.addressIsList()
Пример #3
0
 def __init__(self, solr_url, reconnect_maxcnt=2):
     """
     Remember the Solr address and set up the retry bookkeeping.

     solr_url         : Solr select address,
                        eg : http://192.168.201.63:10000/solr/select?
     reconnect_maxcnt : stored as the retry limit (default 2)
     """
     # retry bookkeeping
     self.reconnect_maxcnt = reconnect_maxcnt
     self.reconnect_cnt = 0
     self.timeHelper = TimeHelper()
     # addressIsList() is evaluated once here, after solr_url is stored
     self.solr_url = solr_url
     self.addressType = self.addressIsList()
Пример #4
0
 def accessSolrByMonth(self, query, month, prows, raw_fq=[], raw_param={}):
     """
     # Search news month by month so that the hits are spread out over time.
     # One query is issued per 30-day window, walking backwards from now,
     # and the results of all successful queries are concatenated.
     # @param query : query handed to self.process
     # @param month : number of months (30-day windows) to search
     # @param prows : number of rows requested for each window
     # @param raw_fq : handed to self.process as fq
     # @param raw_param : handed to self.process as solr_param
     # return : list holding the results of every window whose
     #          self.process call reported success
     """
     seconds_per_day = 24 * 3600
     month_day = 30
     reslist = []
     nowtime = time.time()
     timeHelper = TimeHelper()
     for x in xrange(month):
         # time.time() is in seconds, so the 30-day step has to be converted
         # to seconds; subtracting the bare day count moved the window by
         # only 30 seconds and every "month" ended on the same date.
         end_time = nowtime - x * month_day * seconds_per_day
         current_end_day = timeHelper.getDateTimeFromSeconds(end_time, "%Y-%m-%d")
         flag, curreslist = self.process(query, rows=prows, last_day=month_day,
                                         end_day=current_end_day, fq=raw_fq,
                                         solr_param=raw_param)
         if flag:
             reslist.extend(curreslist)
     return reslist
Пример #5
0
class SolrReaderUtil:
    """
    Helper that builds Solr select URLs, fetches them with retries and
    parses the JSON answer and a few space/colon separated field formats.

    solr_url may be one base URL string or a list of base URLs; with a list,
    accessSolr() tries randomly picked addresses until one of them answers.
    """

    def __init__(self, solr_url, reconnect_maxcnt=2):
        """
        # solr_url : base select URL or a list of them
        #            eg : http://192.168.201.63:10000/solr/select?
        # reconnect_maxcnt : maximum number of retries done by accessSolr
        """
        self.solr_url = solr_url
        self.timeHelper = TimeHelper()
        self.reconnect_cnt = 0
        self.reconnect_maxcnt = reconnect_maxcnt
        self.addressType = self.addressIsList()

    def addressIsList(self):
        """ return True when solr_url is a list of addresses """
        return isinstance(self.solr_url, list)

    def constructSolrQuery(self, query, param_else={}, fq=[]):
        """
        # query : value of the "q" parameter
        # param_else : extra parameters, they win over the defaults
        #              (wt=json, distrib=true, q=query) on a key clash
        # fq : list of filter queries, appended as fq=<item>
        # return : the full URL, or a list of URLs when solr_url is a list;
        #          "" when the parameters cannot be encoded
        """
        param_dict = {"wt": "json", "distrib": "true", "q": query}
        param_dict.update(param_else)
        try:
            param_str = urllib.urlencode(param_dict)
            if fq:
                # NOTE(review): the fq items are appended without URL
                # encoding; presumably callers pass encoded filters - confirm.
                param_str += "&" + "&".join(["fq=%s" % afq for afq in fq])
        except Exception:
            logging.warning("urlencode dict failed!")
            return ""
        if self.addressType:
            return [(address + param_str) for address in self.solr_url]
        return self.solr_url + param_str

    def connect(self, solr_url):
        """
        # Fetch solr_url once.
        # URLs containing "201." are requested through PROXY_ADDRESS.
        # return : the response body, "" when the request failed
        """
        line = ""
        try:
            req = urllib2.Request(solr_url)
            if "201." in solr_url:
                req.set_proxy(PROXY_ADDRESS, "http")
            resp = urllib2.urlopen(req)
            try:
                line = resp.read()
            finally:
                # always release the connection, also when read() fails
                resp.close()
        except Exception:
            # best effort: a failed request is reported as "" to the caller
            logging.warning(
                "solr connect failed ! solr down ? address : %s  !reconnect %d times"
                % (solr_url, self.reconnect_cnt))
        return line

    def reconnect(self, solr_url):
        """ count the retry in reconnect_cnt and fetch solr_url again """
        self.reconnect_cnt += 1
        return self.connect(solr_url)

    def accessSolr(self, solr_url):
        """
        # Fetch a URL built by constructSolrQuery, retrying on failure.
        # list mode   : try randomly picked addresses, each one at most once,
        #               up to reconnect_maxcnt + 1 attempts
        # single mode : one connect plus up to reconnect_maxcnt reconnects
        # return : the response body, "" when every attempt failed
        """
        line = ""
        if self.addressType:
            addresslist = solr_url[:]  # work on a copy, the caller keeps its list
            cnt = 0
            while addresslist and cnt <= self.reconnect_maxcnt:
                address = addresslist.pop(random.randrange(len(addresslist)))
                logging.info("connect address : %s" % address)
                line = self.connect(address)
                if line != "":
                    break
                cnt += 1
        else:
            line = self.connect(solr_url)
            for _ in range(self.reconnect_maxcnt):
                if line != "":
                    break
                line = self.reconnect(solr_url)
        return line

    @staticmethod
    def parserClassification(classiferStr):
        """
        # classiferStr : content of the Classification field
        #                eg : -1:0.979535 1000000:1 1075:0.725315
        # return : dict key : cid ; value : score(string)
        #          items that are not exactly "cid:score" are skipped
        """
        cid2score = {}
        if classiferStr is None:
            logging.info("the Classification is null")
            return cid2score
        for cidinfo in classiferStr.split():
            clist = cidinfo.split(":")
            if len(clist) != 2:
                continue
            cid2score[clist[0]] = clist[1]
        return cid2score

    @staticmethod
    def parserRelatedStock(stockstr):
        """
        # stockstr : content of the RelatedStock field, whitespace separated
        #            pairs "<stock code> <score>:<titleFre>:<contentFre>"
        # return  : dict  key : stock code ;
        #           value : (score , titleFre, contentFre), all strings
        #           {} when the number of tokens is odd; pairs with fewer
        #           than three ":" separated values are skipped
        """
        stock2info = {}
        if stockstr is None:
            logging.info("the RelatedStock is null")
            return stock2info
        stocklist = stockstr.split()
        if len(stocklist) % 2 != 0:
            logging.warning("the RelatedStock format is wrong!")
            return stock2info
        # tokens come in (code, info) pairs, so step over them two at a time
        for x in range(0, len(stocklist), 2):
            fields = stocklist[x + 1].split(":")
            if len(fields) < 3:
                logging.warning(
                    "the RelatedStock format is wrong! float chang wrong")
                continue
            stock2info[stocklist[x]] = (fields[0], fields[1], fields[2])
        return stock2info

    @staticmethod
    def parserRelatedSecurity(stockstr):
        """
        # stockstr : content of the RelatedSecurity field, whitespace
        #            separated pairs "<stock code> <info>"
        # return  : dict  key : stock code ;
        #           value : [score , titleFre, contentFre, stockType], i.e.
        #           the info token split on ":" (strings, length not checked)
        #           {} when the number of tokens is odd
        """
        stock2info = {}
        if stockstr is None:
            logging.info("the RelatedSecurity is null")
            return stock2info
        stocklist = stockstr.split()
        if len(stocklist) % 2 != 0:
            logging.warning("the RelatedSecurity format is wrong!")
            return stock2info
        for x in range(0, len(stocklist), 2):
            stock2info[stocklist[x]] = stocklist[x + 1].split(":")
        return stock2info

    def loadJsonSolr(self, solr_res):
        """
        # solr_res : solr result json_str
        # return : (True, parsed json) or (False, {}) when parsing fails
        """
        try:
            return True, json.loads(solr_res)
        except (ValueError, TypeError):
            # ValueError: malformed JSON; TypeError: input is not a string
            logging.warning("load json result of solr failed!")
            return False, {}

    def getSolrInfo(self, solr_res, param_set=set()):
        """
        # solr_res :  solr result json string
        # param_set : names of the fields to extract, empty means all fields
        # return :  (flag, list) ; list = [dict1,dict2 ...], every dict maps
        #           field name to value (None for a missing field);
        #           (False, []) when response/docs cannot be found
        """
        paramlist = []
        flag, json_format = self.loadJsonSolr(solr_res)
        # a failed parse yields {}, which is reported by the check below
        if not isinstance(json_format, dict) or "response" not in json_format:
            logging.warning("json format wrong! has no response tag!")
            return False, paramlist
        response = json_format["response"]
        if not isinstance(response, dict) or "docs" not in response:
            logging.warning("json format wrong! has no docs tag!")
            return False, paramlist
        reslist = response["docs"]
        if not param_set:
            return True, reslist
        for res in reslist:
            try:
                paramdict = dict((param, res.get(param)) for param in param_set)
            except AttributeError:
                # the doc is not a dict, skip it
                continue
            paramlist.append(paramdict)
        return True, paramlist

    def addTimeToQuery(self, last_day=5, end_day=""):
        """
        # Build a PublishTime range filter covering last_day days.
        # last_day : length of the range in days
        # end_day format :  %Y-%m-%d  eg : 2012-12-04 ; the range then ends
        #   at 00:00:00 of that day
        # if end_day == "" : the range ends at time.time()
        # return : "PublishTime:[<begin seconds> TO <end seconds>]"
        """
        if end_day == "":
            end_seconds = int(time.time())
        else:
            end_seconds = self.timeHelper.getSecondsFromDateTime(
                end_day + " 00:00:00")
        begin_seconds = end_seconds - last_day * 3600 * 24
        return "PublishTime:[%s TO %s]" % (begin_seconds, end_seconds)
Пример #6
0
    def getPos(self):
        """Return the current value of self.offset."""
        return self.offset

    def getAverageTime(self):
        """
        Return the average number of seconds spent per result.

        The result count is self.offset - self.org_offset and the elapsed
        time is measured from self.start_time to now.  Returns 0 when the
        result count is 0, otherwise a float.
        """
        result_cnt = self.offset - self.org_offset
        if result_cnt == 0:
            # nothing counted yet, avoid dividing by zero
            return 0
        pass_time = time.time() - self.start_time
        return pass_time / float(result_cnt)


if __name__ == "__main__":
    # Set the search time range
    time_helper = TimeHelper()
    time_start = time_helper.getSecondsFromDateTime("2011-03-01 00:00")
    # end of the range: the current local time as seconds since the epoch
    time_end = time.mktime(time.localtime())
    config_dict = {"url":"http://192.168.201.87:8888/search", \
                   "channel":"report",  \
                   "other_param":{"sde":"0"}, \
                   "solr":{ "UID":"", "contentsize":"1"},\
                   "desc":{"title":"", "content":""}, \
                   "time_start": int(time_start), \
                   "time_end": int(time_end),\
                   "max_number":500,\
                   "q": "烟台万华" }
    online_news_reader = OnlineTimeReader()
    online_news_reader.open(config_dict)
    result_dict = []  # holds the results; each result is a dict keyed by the fields named under "solr" and "desc"
    next_meta = {}
Пример #7
0
        return self.offset
        pass

    def getAverageTime(self):
        """
        Return the average number of seconds spent per result.

        The result count is self.offset - self.org_offset and the elapsed
        time is measured from self.start_time to now.  Returns 0 when the
        result count is 0, otherwise a float.
        """
        result_cnt = self.offset - self.org_offset
        if result_cnt == 0:
            # nothing counted yet, avoid dividing by zero
            return 0
        pass_time = time.time() - self.start_time
        return pass_time / float(result_cnt)



if __name__ == "__main__":
    # Set the search time range
    time_helper = TimeHelper()
    time_start = time_helper.getSecondsFromDateTime("2011-03-01 00:00")
    # end of the range: the current local time as seconds since the epoch
    time_end = time.mktime(time.localtime()) 
    config_dict = {"url":"http://192.168.201.87:8888/search", \
                   "channel":"report",  \
                   "other_param":{"sde":"0"}, \
                   "solr":{ "UID":"", "contentsize":"1"},\
                   "desc":{"title":"", "content":""}, \
                   "time_start": int(time_start), \
                   "time_end": int(time_end),\
                   "max_number":500,\
                   "q": "烟台万华" }
    online_news_reader = OnlineTimeReader()
    online_news_reader.open(config_dict)
    result_dict = [] # holds the results; each result is a dict keyed by the fields named under "solr" and "desc"
    next_meta = {}
Пример #8
0
class SolrReaderUtil:
    """
    Helper that builds Solr select URLs, fetches them with retries and
    parses the JSON answer and a few space/colon separated field formats.

    solr_url may be one base URL string or a list of base URLs; with a list,
    accessSolr() tries randomly picked addresses until one of them answers.
    """

    def __init__(self, solr_url, reconnect_maxcnt=2):
        """
        # solr_url : base select URL or a list of them
        #            eg : http://192.168.201.63:10000/solr/select?
        # reconnect_maxcnt : maximum number of retries done by accessSolr
        """
        self.solr_url = solr_url
        self.timeHelper = TimeHelper()
        self.reconnect_cnt = 0
        self.reconnect_maxcnt = reconnect_maxcnt
        self.addressType = self.addressIsList()

    def addressIsList(self):
        """ return True when solr_url is a list of addresses """
        return isinstance(self.solr_url, list)

    def constructSolrQuery(self, query, param_else={}, fq=[]):
        """
        # query : value of the "q" parameter
        # param_else : extra parameters, they win over the defaults
        #              (wt=json, distrib=true, q=query) on a key clash
        # fq : list of filter queries, appended as fq=<item>
        # return : the full URL, or a list of URLs when solr_url is a list;
        #          "" when the parameters cannot be encoded
        """
        param_dict = {"wt": "json", "distrib": "true", "q": query}
        param_dict.update(param_else)
        try:
            param_str = urllib.urlencode(param_dict)
            if fq:
                # NOTE(review): the fq items are appended without URL
                # encoding; presumably callers pass encoded filters - confirm.
                param_str += "&" + "&".join(["fq=%s" % afq for afq in fq])
        except Exception:
            logging.warning("urlencode dict failed!")
            return ""
        if self.addressType:
            return [(address + param_str) for address in self.solr_url]
        return self.solr_url + param_str

    def connect(self, solr_url):
        """
        # Fetch solr_url once.
        # URLs containing "201." are requested through PROXY_ADDRESS.
        # return : the response body, "" when the request failed
        """
        line = ""
        try:
            req = urllib2.Request(solr_url)
            if "201." in solr_url:
                req.set_proxy(PROXY_ADDRESS, "http")
            resp = urllib2.urlopen(req)
            try:
                line = resp.read()
            finally:
                # always release the connection, also when read() fails
                resp.close()
        except Exception:
            # best effort: a failed request is reported as "" to the caller
            logging.warning(
                "solr connect failed ! solr down ? address : %s  !reconnect %d times"
                % (solr_url, self.reconnect_cnt))
        return line

    def reconnect(self, solr_url):
        """ count the retry in reconnect_cnt and fetch solr_url again """
        self.reconnect_cnt += 1
        return self.connect(solr_url)

    def accessSolr(self, solr_url):
        """
        # Fetch a URL built by constructSolrQuery, retrying on failure.
        # list mode   : try randomly picked addresses, each one at most once,
        #               up to reconnect_maxcnt + 1 attempts
        # single mode : one connect plus up to reconnect_maxcnt reconnects
        # return : the response body, "" when every attempt failed
        """
        line = ""
        if self.addressType:
            addresslist = solr_url[:]  # work on a copy, the caller keeps its list
            cnt = 0
            while addresslist and cnt <= self.reconnect_maxcnt:
                address = addresslist.pop(random.randrange(len(addresslist)))
                logging.info("connect address : %s" % address)
                line = self.connect(address)
                if line != "":
                    break
                cnt += 1
        else:
            line = self.connect(solr_url)
            for _ in range(self.reconnect_maxcnt):
                if line != "":
                    break
                line = self.reconnect(solr_url)
        return line

    @staticmethod
    def parserClassification(classiferStr):
        """
        # classiferStr : content of the Classification field
        #                eg : -1:0.979535 1000000:1 1075:0.725315
        # return : dict key : cid ; value : score(string)
        #          items that are not exactly "cid:score" are skipped
        """
        cid2score = {}
        if classiferStr is None:
            logging.info("the Classification is null")
            return cid2score
        for cidinfo in classiferStr.split():
            clist = cidinfo.split(":")
            if len(clist) != 2:
                continue
            cid2score[clist[0]] = clist[1]
        return cid2score

    @staticmethod
    def parserRelatedStock(stockstr):
        """
        # stockstr : content of the RelatedStock field, whitespace separated
        #            pairs "<stock code> <score>:<titleFre>:<contentFre>"
        # return  : dict  key : stock code ;
        #           value : (score , titleFre, contentFre), all strings
        #           {} when the number of tokens is odd; pairs with fewer
        #           than three ":" separated values are skipped
        """
        stock2info = {}
        if stockstr is None:
            logging.info("the RelatedStock is null")
            return stock2info
        stocklist = stockstr.split()
        if len(stocklist) % 2 != 0:
            logging.warning("the RelatedStock format is wrong!")
            return stock2info
        # tokens come in (code, info) pairs, so step over them two at a time
        for x in range(0, len(stocklist), 2):
            fields = stocklist[x + 1].split(":")
            if len(fields) < 3:
                logging.warning(
                    "the RelatedStock format is wrong! float chang wrong")
                continue
            stock2info[stocklist[x]] = (fields[0], fields[1], fields[2])
        return stock2info

    @staticmethod
    def parserRelatedSecurity(stockstr):
        """
        # stockstr : content of the RelatedSecurity field, whitespace
        #            separated pairs "<stock code> <info>"
        # return  : dict  key : stock code ;
        #           value : [score , titleFre, contentFre, stockType], i.e.
        #           the info token split on ":" (strings, length not checked)
        #           {} when the number of tokens is odd
        """
        stock2info = {}
        if stockstr is None:
            logging.info("the RelatedSecurity is null")
            return stock2info
        stocklist = stockstr.split()
        if len(stocklist) % 2 != 0:
            logging.warning("the RelatedSecurity format is wrong!")
            return stock2info
        for x in range(0, len(stocklist), 2):
            stock2info[stocklist[x]] = stocklist[x + 1].split(":")
        return stock2info

    def loadJsonSolr(self, solr_res):
        """
        # solr_res : solr result json_str
        # return : (True, parsed json) or (False, {}) when parsing fails
        """
        try:
            return True, json.loads(solr_res)
        except (ValueError, TypeError):
            # ValueError: malformed JSON; TypeError: input is not a string
            logging.warning("load json result of solr failed!")
            return False, {}

    def getSolrInfo(self, solr_res, param_set=set()):
        """
        # solr_res :  solr result json string
        # param_set : names of the fields to extract, empty means all fields
        # return :  (flag, list) ; list = [dict1,dict2 ...], every dict maps
        #           field name to value (None for a missing field);
        #           (False, []) when response/docs cannot be found
        """
        paramlist = []
        flag, json_format = self.loadJsonSolr(solr_res)
        # a failed parse yields {}, which is reported by the check below
        if not isinstance(json_format, dict) or "response" not in json_format:
            logging.warning("json format wrong! has no response tag!")
            return False, paramlist
        response = json_format["response"]
        if not isinstance(response, dict) or "docs" not in response:
            logging.warning("json format wrong! has no docs tag!")
            return False, paramlist
        reslist = response["docs"]
        if not param_set:
            return True, reslist
        for res in reslist:
            try:
                paramdict = dict((param, res.get(param)) for param in param_set)
            except AttributeError:
                # the doc is not a dict, skip it
                continue
            paramlist.append(paramdict)
        return True, paramlist

    def addTimeToQuery(self, last_day=5, end_day=""):
        """
        # Build a PublishTime range filter covering last_day days.
        # last_day : length of the range in days
        # end_day format :  %Y-%m-%d  eg : 2012-12-04 ; the range then ends
        #   at 00:00:00 of that day
        # if end_day == "" : the range ends at time.time()
        # return : "PublishTime:[<begin seconds> TO <end seconds>]"
        """
        if end_day == "":
            end_seconds = int(time.time())
        else:
            end_seconds = self.timeHelper.getSecondsFromDateTime(
                end_day + " 00:00:00")
        begin_seconds = end_seconds - last_day * 3600 * 24
        return "PublishTime:[%s TO %s]" % (begin_seconds, end_seconds)