Пример #1
0
    def querymanylog(self, input='', output=''):
        """Prefix every row of an input sheet with a reason and save the result.

        Rows returned by ``XLSDeal().XlsToList(input)`` are split on tabs;
        rows with fewer than three fields are skipped.  The first three fields
        are treated as entry id, URL and abstract.  The entry id and URL are
        passed to ``self.TIL.query_by_entryidurl``; the abstract and the query
        result are passed to ``self.__getBestReason``, whose return value is
        inserted as a new first column of the row.

        Errors are reported through ``self.errLogger`` instead of being
        raised.  A failure while reading or processing stops the loop, but the
        rows collected up to that point are still written out.

        :param input: source handed to ``XLSDeal().XlsToList``
                      (presumably a spreadsheet path -- confirm with XLSDeal)
        :param output: destination handed to ``XLSDeal().toXlsFile``
        :return: None
        """
        lText = []
        try:
            # dump xls to txt
            for line in XLSDeal().XlsToList(input):
                fs = line.strip().split('\t')
                if len(fs) < 3:
                    continue
                entryid = fs[0].strip()
                url = fs[1].strip()
                abst = fs[2].strip()
                ret = self.TIL.query_by_entryidurl(entryid, url)
                reason = self.__getBestReason(abst, ret)
                fs.insert(0, reason)
                lText.append("\t".join(fs).strip())
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # SystemExit are not swallowed.  The traceback is formatted once
            # and both echoed and logged, as before.
            sTrace = traceback.format_exc()
            # Parenthesised single-argument form prints the same text on
            # Python 2 and Python 3.
            print(sTrace)
            self.errLogger.error("Raise exception: \n%s\n" % sTrace)

        # Best effort: whatever was collected is written even after a failure.
        try:
            XLSDeal().toXlsFile(lText, output)
        except Exception:
            self.errLogger.error("Raise exception: \n%s\n" %
                                 (traceback.format_exc()))
Пример #2
0
 def __init__(self, keyword, infile):
     """Store the keyword, load the input rows and prepare the word index.

     :param keyword: value kept unchanged on ``self.keyword``
     :param infile: source handed to ``XLSDeal().XlsToList``; the returned
                    rows are kept on ``self.lLines``

     An ``esm.Index`` is created, fed every entry of the module-level
     ``noneed_word`` (whitespace-stripped) and then finalised with ``fix()``.
     ``self.dup_list`` starts out empty.
     """
     self.keyword = keyword
     self.lLines = XLSDeal().XlsToList(infile)
     # Keep a local handle so the index is filled without repeated
     # attribute lookups.
     self.esmins = matcher = esm.Index()
     self.dup_list = []
     for entry in noneed_word:
         matcher.enter(entry.strip())
     matcher.fix()
Пример #3
0
 def run(self, raw_data_path, save_path):
     """Read the raw data, apply the rule filter and save the result sheet.

     :param raw_data_path: passed to ``self.read_raw_data``
     :param save_path: passed to ``XLSDeal().toXlsFile`` as the destination

     The list returned by ``self.rule_filter`` gets a UTF-8 encoded,
     tab-separated header line inserted at position 0 before it is written.
     """
     rows = self.read_raw_data(raw_data_path)
     duplicates = self.get_duplicate_list(rows)
     filtered = self.rule_filter(rows, duplicates)

     # Column captions: serial no., category, topic, abstract, author,
     # monitored object, (automobile) result, basis for the judgement.
     header_cells = [
         u'序号', u'分类', u'主题', u'摘要', u'作者', u'监控对象', u'(汽车)结果', u'判断依据'
     ]
     header_line = u'\t'.join(header_cells).encode('utf-8')
     filtered.insert(0, header_line)

     XLSDeal().toXlsFile(filtered, save_path)
Пример #4
0
 def run(self):
     """
     Run the extraction selected by ``self.method``.

     * ``'tfidf'``: calls ``self.tfidf()``, writes the first returned value
       to ``self.outputfile`` through ``XLSDeal().toXlsFile`` and returns
       the second one.
     * ``'feelingword'``: calls ``semevalword`` for ``self.keyword`` and
       ``self.inputfile`` and returns a list of ``"word<TAB>num"`` strings,
       one per (word, num) pair of every group in the result.
     * any other value: nothing is done.

     :return: see above; ``None`` for an unrecognised method
     """
     if self.method == 'tfidf':
         rows, summary = self.tfidf()
         XLSDeal().toXlsFile(rows, self.outputfile)
         return summary

     if self.method != 'feelingword':
         return None

     grouped = semevalword(moniterword=self.keyword, infile=self.inputfile, attr_num=20, word_num=1)
     # Flatten every group into tab-separated "word<TAB>count" lines.
     return [
         '%s\t%s' % (word, num)
         for _attword, group in grouped.items()
         for word, num in group
     ]
Пример #5
0
    def summons_article(self, max_lines=50000):
        """
        Collect lower-cased article text from the input sheet.

        Rows come from ``XLSDeal().XlsToList(self.inputfile)`` and are split
        on tabs.  A row with exactly two fields contributes its second field;
        a row with three or more fields contributes its second and third
        fields formatted as ``' <second> <third>'`` (with the leading space).
        Rows with fewer than two fields contribute nothing but still count
        towards the limit.

        :param max_lines: maximum number of input rows to examine
                          (default 50000)
        :return: list of lower-cased article strings
        """
        lwords = []
        lLines = XLSDeal().XlsToList(self.inputfile)
        # enumerate() starts at 0, so exactly ``max_lines`` rows are read.
        # The previous counter was incremented before the check and therefore
        # stopped one row short of the documented 50000.
        for num, line in enumerate(lLines):
            if num >= max_lines:
                break
            fields = line.strip().split('\t')
            if len(fields) == 2:
                lwords.append(fields[1].lower())
            elif len(fields) >= 3:
                lwords.append((' %s %s' % (fields[1], fields[2])).lower())
        return lwords
Пример #6
0
def semevalword(moniterword,
                industry,
                attr_num,
                infile=False,
                is_marketing=True,
                is_dup=True,
                is_es=False,
                is_sitename=False,
                day=90):
    """Return (word, score) pairs for a monitored word, highest score first.

    Pipeline, as visible in this function:

    1. Raw data comes from ``es_query(moniterword, day)`` when ``is_es`` is
       truthy, otherwise from ``XLSDeal().XlsToList(infile)``.
    2. ``MakeSentence(...).extract_sentence()`` turns it into sentences.
    3. ``PhraseReconize(...).prepare()`` produces the data that
       ``DataStatistic(..., attr_num=attr_num + 1, word_num=1).run()``
       aggregates into a mapping keyed by 2-item keys.
    4. The first element of every group is taken, the elements are sorted by
       their second item in descending order, and each first item of the
       form ``"<a>:<b>"`` (exactly one colon) yields ``(<b>, second item)``.

    :param moniterword: monitored word; also logged
    :param industry: forwarded as ``industry_id`` to ``PhraseReconize``
    :param attr_num: attribute count; ``attr_num + 1`` is given to
                     ``DataStatistic``
    :param infile: input for ``XLSDeal`` when ``is_es`` is falsy
    :param is_marketing: forwarded to ``MakeSentence``
    :param is_dup: forwarded to ``MakeSentence``
    :param is_es: choose the ``es_query`` data source
    :param is_sitename: accepted but not referenced in this function
    :param day: forwarded to ``es_query``
    :return: list of ``(word, score)`` tuples
    """
    logging.info('Punctuation')

    if is_es:
        raw_data = es_query(moniterword, day)
    else:
        raw_data = XLSDeal().XlsToList(infile)

    sentence_maker = MakeSentence(raw_data=raw_data,
                                  is_marketing=is_marketing,
                                  is_dup=is_dup)
    logging.info('find words')
    sentences = sentence_maker.extract_sentence()
    logging.info(moniterword)
    recognizer = PhraseReconize(moniter_word=moniterword,
                                Dsent=sentences,
                                industry_id=industry)
    ddata, _outlist = recognizer.prepare()
    counter = DataStatistic(ddata=ddata, attr_num=attr_num + 1, word_num=1)
    result, _regroup = counter.run()

    # Keys are unpacked as 2-item sequences even though only the group is
    # used, so a malformed key still fails here exactly as before.
    leaders = [group[0] for (_attr, _attnum), group in result.items()]
    leaders.sort(key=lambda item: item[1], reverse=True)

    dealresult = []
    for entry in leaders:
        parts = entry[0].split(':')
        if len(parts) == 2:
            dealresult.append((parts[1], entry[1]))

    return dealresult
Пример #7
0
    def predict(self, projectname='', brands='', input='', output=''):
        '''
            Classify every row of an input sheet and write labelled rows out.

            inputs:
               projectname : project name; key into c_servers and c_labels
               brands : optional brand name(s).  When non-blank, the
                        '#'-separated parts are used to keep only output rows
                        that isContain() reports as containing one of them,
                        and the stripped whole string is looked up in the
                        service's 'nothit' / 'result' data.
               input : input file path, the first three columns must be
                       id, title, abstract (a fourth column is read for
                       projects listed in c_more_project)
               output : output file path

            Each output row is "<label>\\t[<probability>\\t]<original row>".
            Rows that fail are logged with their data and skipped.

            res:
                predict result: True success or False Failure (bad
                parameters, unreadable input or unwritable output)

            raises:
                KeyError when projectname is not a key of c_servers

        '''

        if projectname == '' or input == '' or output == '':
            self.logger.error("parameter error !!!")
            return False

        lProjectinfo = c_servers[projectname.strip()].split('_')
        sProject = lProjectinfo[0]
        sIndustryno = lProjectinfo[1] if len(lProjectinfo) > 1 else ''

        try:
            #dump xls to txt
            lLines = XLSDeal().XlsToList(input)
        except Exception:
            self.logger.error("Raise exception: \n%s\n" %
                              (traceback.format_exc()))
            # Nothing to classify; falling through used to end in a NameError
            # on the unbound lLines.
            return False

        sBrands = brands.strip()
        # Loop-invariant: the brand filter list is the same for every row.
        lFilterBrands = brands.split('#') if sBrands != '' else []

        lText = []
        for sLine in lLines:
            sLine = sLine.strip()
            fs = sLine.split('\t')
            if sLine == '' or len(fs) < 3:
                continue

            try:
                dParam = {
                    'id': fs[0],
                    'title': fs[1].replace(' ', ''),
                    'document': fs[2].replace(' ', '')
                }

                #require more information project deal
                if sProject in c_more_project:
                    # Built inside the try so a row without a fourth column
                    # is logged and skipped instead of aborting the run.
                    dParam['other'] = fs[3].strip()

                if sIndustryno != '':
                    dParam['industry'] = sIndustryno

                jRes = self.__start(sProject, [dParam])
                lRes = json.loads(jRes)
                dItem = lRes[0]
                sLabel = c_labels[projectname][dItem['type']]

                # Default: label only.  Assigning it up front guarantees sOut
                # is always bound for this row; previously a brand found in
                # neither 'nothit' nor 'result' re-used the previous row's
                # output (or raised NameError on the first row).
                sOut = '%s\t%s' % (sLabel, sLine)

                if 'prob' in dItem:
                    sOut = '%s\t%s\t%s' % (sLabel,
                                           _utf_string(dItem['prob']), sLine)
                elif 'nothit' in dItem and sBrands != '':
                    brand = sBrands
                    # Decode byte strings only: calling .decode() on text
                    # that is already unicode fails for non-ASCII brands.
                    if isinstance(brand, bytes):
                        brand = brand.decode('utf-8')
                    if brand in dItem['nothit']:
                        hothitprob = 1 - float(dItem['nothit'][brand])
                        sOut = '%s\t%s\t%s' % (
                            sLabel, _utf_string(str(hothitprob)), sLine)
                    elif 'result' in dItem:
                        prob = 0.5
                        # NOTE(review): the break only leaves the inner loop,
                        # so a match in a later industry overrides an earlier
                        # one.  Kept as-is; confirm this is intended.
                        for industry in dItem['result']:
                            for rec in dItem['result'][industry]:
                                if brand in rec:
                                    prob = rec[brand]
                                    break
                        sOut = '%s\t%s\t%s' % (sLabel, _utf_string(prob),
                                               sLine)

                bKeep = True
                if lFilterBrands:
                    sCompact = sOut.replace(' ', '')
                    bKeep = any(
                        isContain(sCompact, sBrand)
                        for sBrand in lFilterBrands)
                if bKeep:
                    lText.append(sOut.strip())

            except Exception:
                self.logger.error("Raise exception: \n%s\nWith data: \n%s" %
                                  (traceback.format_exc(), sLine))

        try:
            XLSDeal().toXlsFile(lText, output)
        except Exception:
            self.logger.error("Raise exception: \n%s\n" %
                              (traceback.format_exc()))
            # The output was not written, so do not report success.
            return False
        return True
Пример #8
0
    def predict(self, industry='', target='', input='', output=''):
        '''
            Classify every row of an input sheet with the 'semeval' project
            and write the labelled rows out.

            inputs:
               industry : industry name; key into c_industrys
               target : accepted but not referenced by this method; the
                        per-row target is read from the second column
               input : input file path, the first five columns must be
                       id, target, title, abstract, link
               output : output file path

            Each output row is "<label>\\t[<probability>\\t]<original row>".
            Rows with fewer than five columns are skipped; rows that fail
            are logged with their data and skipped.

            res:
                predict result: True success or False Failure (bad
                parameters, unreadable input or unwritable output)

            raises:
                KeyError when industry is not a key of c_industrys

        '''

        if industry == '' or input == '' or output == '':
            self.logger.error("parameter error !!!")
            return False

        sIndustryNo = c_industrys[industry.strip()]
        sProject = 'semeval'

        try:
            #dump xls to txt
            lLines = XLSDeal().XlsToList(input)
        except Exception:
            self.logger.error("Raise exception: \n%s\n" %
                              (traceback.format_exc()))
            # Nothing to classify; falling through used to end in a NameError
            # on the unbound lLines.
            return False

        lText = []
        for sLine in lLines:
            sLine = sLine.strip()
            fs = sLine.split('\t')
            if sLine == '' or len(fs) < 5:
                continue

            sId = fs[0]
            sTarget = fs[1].replace(' ', '').strip()
            sTitle = self.__cut(fs[2].replace(' ', '').strip())
            sDocument = self.__cut(fs[3].replace(' ', '').strip())
            sLink = fs[4].replace(' ', '').strip()

            dParam = {
                'id': sId,
                'title': sTitle,
                'document': sDocument,
                "type": "weibo",
                "brand": sTarget,
                "industry": sIndustryNo,
                'link': sLink
            }

            try:
                lRes = self.__start(sProject, [dParam])
                dItem = lRes[0]
                # Same lookup for both branches.  The no-probability branch
                # used to index c_labels with the undefined name
                # ``projectname``, so every such row raised NameError and
                # was dropped.
                sLabel = c_labels[dItem['type']]
                if 'prob' in dItem:
                    sOut = '%s\t%s\t%s' % (sLabel,
                                           _utf_string(dItem['prob']), sLine)
                else:
                    sOut = '%s\t%s' % (sLabel, sLine)

                lText.append(sOut.strip())

            except Exception:
                self.logger.error("Raise exception: \n%s\nWith data: \n%s" %
                                  (traceback.format_exc(), sLine))

        try:
            XLSDeal().toXlsFile(lText, output)
        except Exception:
            self.logger.error("Raise exception: \n%s\n" %
                              (traceback.format_exc()))
            # The output was not written, so do not report success.
            return False
        return True