def querymanylog(self, input='', output=''):
    """Annotate every usable row of `input` with a reason and save to `output`.

    Rows come from ``XLSDeal().XlsToList(input)`` and are treated as
    tab-separated text.  Rows with fewer than three columns are ignored.  For
    the others, the first two columns (entry id, url) are looked up through
    ``self.TIL.query_by_entryidurl`` and the third column (abstract) is passed
    together with the lookup result to ``self.__getBestReason``; the returned
    reason is prepended to the row as a new first column.

    Any exception raised while reading or annotating stops the loop; the
    traceback is printed and logged, and whatever rows were collected so far
    are still written with ``XLSDeal().toXlsFile``.  A failure while writing
    is only logged.  Nothing is returned.
    """
    annotated_rows = []
    try:
        # dump xls to txt
        for raw_row in XLSDeal().XlsToList(input):
            columns = raw_row.strip().split('\t')
            if len(columns) >= 3:
                entry_id = columns[0].strip()
                link = columns[1].strip()
                abstract = columns[2].strip()
                lookup = self.TIL.query_by_entryidurl(entry_id, link)
                best_reason = self.__getBestReason(abstract, lookup)
                annotated_rows.append("\t".join([best_reason] + columns).strip())
    except:
        # Deliberately broad: the first failure ends the scan, but the rows
        # gathered up to that point are still saved below.
        trace_text = traceback.format_exc()
        print(trace_text)
        self.errLogger.error("Raise exception: \n%s\n" % (trace_text))

    try:
        XLSDeal().toXlsFile(annotated_rows, output)
    except:
        self.errLogger.error("Raise exception: \n%s\n" % (traceback.format_exc()))
def __init__(self, keyword, infile):
    """Remember the keyword, load the rows of `infile` and build the word index.

    Attributes set:
      keyword  -- the value passed in, unchanged.
      lLines   -- result of ``XLSDeal().XlsToList(infile)``.
      dup_list -- starts out as an empty list.
      esmins   -- an ``esm.Index`` holding every entry of the module-level
                  ``noneed_word`` (stripped), finalised with ``fix()``.
    """
    self.keyword = keyword
    self.lLines = XLSDeal().XlsToList(infile)
    self.dup_list = []

    word_index = esm.Index()
    self.esmins = word_index
    for unwanted in noneed_word:
        word_index.enter(unwanted.strip())
    # All words must be entered before fix() is called.
    word_index.fix()
def run(self, raw_data_path, save_path):
    """Read raw data, filter it by rule and save the result with a header row.

    Steps: ``read_raw_data`` -> ``get_duplicate_list`` -> ``rule_filter``.
    A UTF-8 encoded, tab-separated header line is then inserted in front of
    the filtered rows and everything is written to `save_path` through
    ``XLSDeal().toXlsFile``.  Nothing is returned.
    """
    records = self.read_raw_data(raw_data_path)
    repeated = self.get_duplicate_list(records)
    filtered_rows = self.rule_filter(records, repeated)

    # Column captions, roughly: index, category, topic, abstract, author,
    # monitored object, (automobile) result, basis of judgement.
    header_cells = (
        u'序号',
        u'分类',
        u'主题',
        u'摘要',
        u'作者',
        u'监控对象',
        u'(汽车)结果',
        u'判断依据',
    )
    header_line = u'\t'.join(header_cells).encode('utf-8')
    filtered_rows.insert(0, header_line)

    XLSDeal().toXlsFile(filtered_rows, save_path)
def run(self):
    """Run the extraction selected by ``self.method``.

    'tfidf'       -- calls ``self.tfidf()``, writes the first element of its
                     result to ``self.outputfile`` via ``XLSDeal().toXlsFile``
                     and returns the second element.
    'feelingword' -- calls ``semevalword`` for ``self.keyword`` on
                     ``self.inputfile`` and returns a list of
                     ``"<word>\\t<num>"`` strings, one per (word, num) pair in
                     every group of the result.  Nothing is written to disk.

    :return: see above; ``None`` for any other method value.
    """
    if self.method == 'tfidf':
        sheet_rows, tfidf_output = self.tfidf()
        XLSDeal().toXlsFile(sheet_rows, self.outputfile)
        return tfidf_output

    if self.method == 'feelingword':
        grouped = semevalword(moniterword=self.keyword,
                              infile=self.inputfile,
                              attr_num=20,
                              word_num=1)
        formatted = []
        # Only the groups are used; the attribute key is ignored.
        for _attr_word, pairs in grouped.items():
            formatted.extend('%s\t%s' % (word, num) for word, num in pairs)
        return formatted

    return None
def summons_article(self):
    """Collect lower-cased article texts from ``self.inputfile``.

    Rows come from ``XLSDeal().XlsToList`` and are split on tabs:
      * exactly two columns  -> the second column is the article;
      * three or more        -> ``' <col2> <col3>'`` is the article;
      * fewer than two       -> the row is skipped.

    Reading stops as soon as the 50000th row is reached, so at most 49,999
    rows (counting skipped ones) are examined.

    :return: list of article strings, e.g. [article one, article two]
    """
    articles = []
    for position, row in enumerate(XLSDeal().XlsToList(self.inputfile), 1):
        if position >= 50000:
            break
        cells = row.strip().split('\t')
        width = len(cells)
        if width < 2:
            continue
        if width == 2:
            text = cells[1]
        else:
            text = ' %s %s' % (cells[1], cells[2])
        articles.append(text.lower())
    return articles
def semevalword(moniterword, industry, attr_num, infile=False, is_marketing=True,
                is_dup=True, is_es=False, is_sitename=False, day=90):
    """Return the top word of each attribute group, ranked by score.

    Raw data is taken from ``es_query(moniterword, day)`` when `is_es` is
    truthy, otherwise from ``XLSDeal().XlsToList(infile)``.  It is passed
    through ``MakeSentence`` -> ``PhraseReconize`` -> ``DataStatistic``
    (with ``attr_num + 1`` and ``word_num=1``).  From every group of the
    statistic result only the first entry is kept; those entries are sorted
    by their second element in descending order.

    `is_sitename` is accepted but not used by this function.

    :return: list of ``(word, score)`` tuples, where ``word`` is the part
             after the colon of entries shaped like ``"<x>:<word>"``; entries
             whose label does not split into exactly two parts on ':' are
             dropped.
    """
    logging.info('Punctuation')
    if is_es:
        raw_data = es_query(moniterword, day)
    else:
        raw_data = XLSDeal().XlsToList(infile)
    sentence_maker = MakeSentence(raw_data=raw_data,
                                  is_marketing=is_marketing,
                                  is_dup=is_dup)

    logging.info('find words')
    sentences = sentence_maker.extract_sentence()

    logging.info(moniterword)
    recognizer = PhraseReconize(moniter_word=moniterword,
                                Dsent=sentences,
                                industry_id=industry)
    ddata, _outlist = recognizer.prepare()

    statistic = DataStatistic(ddata=ddata, attr_num=attr_num + 1, word_num=1)
    result, _regroup = statistic.run()

    leaders = []
    for attr_key, group in result.items():
        # Keys are unpacked as pairs, exactly as before, so a malformed key
        # still fails here rather than further down.
        _attr, _attr_count = attr_key
        leaders.append(group[0])
    leaders.sort(key=lambda entry: entry[1], reverse=True)

    split_leaders = [(entry[0].split(':'), entry[1]) for entry in leaders]
    return [(parts[1], score) for parts, score in split_leaders if len(parts) == 2]
def predict(self, projectname='', brands='', input='', output=''):
    '''
    Classify every row of an input sheet and write the labelled rows out.

    inputs:
        projectname : project name; key into c_servers and c_labels
        brands      : optional brand names separated by '#'.  When given,
                      only output rows containing at least one of the names
                      (per isContain) are kept.  The whole stripped string is
                      also used as the single brand key when a result
                      carries 'nothit' data.
        input       : input file path, the first three columns must be
                      id, title, abstract (a fourth column is required for
                      projects listed in c_more_project)
        output      : output file path
    res:
        predict result: True success or False Failure.  False is returned
        for missing parameters, when the input cannot be read, or when the
        output cannot be written.  Failures on individual rows are logged
        and the row is skipped.
    raises:
        KeyError if projectname is not a key of c_servers.
    '''
    if projectname == '' or input == '' or output == '':
        self.logger.error("parameter error !!!")
        return False

    lProjectinfo = c_servers[projectname.strip()].split('_')
    sProject = lProjectinfo[0]
    sIndustryno = lProjectinfo[1] if len(lProjectinfo) > 1 else ''

    try:
        # dump xls to txt
        lLines = XLSDeal().XlsToList(input)
    except Exception:
        self.logger.error("Raise exception: \n%s\n" % (traceback.format_exc()))
        # Without input rows there is nothing to predict; previously the
        # code fell through and died on the unbound lLines.
        return False

    sBrands = brands.strip()
    lBrands = brands.split('#')
    bNeedOther = sProject in c_more_project

    lText = []
    for sLine in lLines:
        sLine = sLine.strip()
        fs = sLine.split('\t')
        if sLine == '' or len(fs) < 3:
            continue
        try:
            dParam = {
                'id': fs[0],
                'title': fs[1].replace(' ', ''),
                'document': fs[2].replace(' ', ''),
            }
            # require more information project deal; a row lacking the
            # fourth column is now logged and skipped like any other bad row
            if bNeedOther:
                dParam['other'] = fs[3].strip()
            if sIndustryno != '':
                dParam['industry'] = sIndustryno

            jRes = self.__start(sProject, [dParam])
            dItem = json.loads(jRes)[0]
            sLabel = c_labels[projectname][dItem['type']]

            # Reset per row so a row that yields no output can never reuse
            # the previous row's text.
            sOut = None
            if 'prob' in dItem:
                sOut = '%s\t%s\t%s' % (sLabel, _utf_string(dItem['prob']), sLine)
            elif 'nothit' in dItem and sBrands != '':
                brand = sBrands.decode('utf-8')
                if brand in dItem['nothit']:
                    hothitprob = 1 - float(dItem['nothit'][brand])
                    sOut = '%s\t%s\t%s' % (sLabel, _utf_string(str(hothitprob)), sLine)
                elif 'result' in dItem:
                    prob = 0.5
                    for industry in dItem['result']:
                        for rec in dItem['result'][industry]:
                            if brand in rec:
                                prob = rec[brand]
                                break
                    sOut = '%s\t%s\t%s' % (sLabel, _utf_string(prob), sLine)
            else:
                sOut = '%s\t%s' % (sLabel, sLine)

            if sOut is None:
                self.logger.error("No usable result for brand\nWith data: \n%s" % (sLine))
                continue

            if sBrands != '':
                sCompact = sOut.replace(' ', '')
                if not any(isContain(sCompact, sName) for sName in lBrands):
                    continue
            lText.append(sOut.strip())
        except Exception:
            self.logger.error("Raise exception: \n%s\nWith data: \n%s" %
                              (traceback.format_exc(), sLine))

    try:
        XLSDeal().toXlsFile(lText, output)
    except Exception:
        self.logger.error("Raise exception: \n%s\n" % (traceback.format_exc()))
        return False
    return True
def predict(self, industry='', target='', input='', output=''):
    '''
    Run the 'semeval' project on every row of an input sheet and write the
    labelled rows out.

    inputs:
        industry : industry name; key into c_industrys
        target   : accepted for interface compatibility; not used here (the
                   target of each row is read from its second column)
        input    : input file path, the first five columns must be
                   id, target, title, abstract, link
        output   : output file path
    res:
        predict result: True success or False Failure.  False is returned
        for missing parameters, when the input cannot be read, or when the
        output cannot be written.  Failures on individual rows are logged
        and the row is skipped.
    raises:
        KeyError if industry is not a key of c_industrys.
    '''
    if industry == '' or input == '' or output == '':
        self.logger.error("parameter error !!!")
        return False

    sIndustryNo = c_industrys[industry.strip()]
    sProject = 'semeval'

    try:
        # dump xls to txt
        lLines = XLSDeal().XlsToList(input)
    except Exception:
        self.logger.error("Raise exception: \n%s\n" % (traceback.format_exc()))
        # Without input rows there is nothing to predict; previously the
        # code fell through and died on the unbound lLines.
        return False

    lText = []
    for sLine in lLines:
        sLine = sLine.strip()
        fs = sLine.split('\t')
        if sLine == '' or len(fs) < 5:
            continue
        try:
            # Row preparation sits inside the try so that a failure in
            # __cut only costs this row, not the whole run.
            dParam = {
                'id': fs[0],
                'title': self.__cut(fs[2].replace(' ', '').strip()),
                'document': self.__cut(fs[3].replace(' ', '').strip()),
                "type": "weibo",
                "brand": fs[1].replace(' ', '').strip(),
                "industry": sIndustryNo,
                'link': fs[4].replace(' ', '').strip(),
            }
            lRes = self.__start(sProject, [dParam])
            dItem = lRes[0]
            sLabel = c_labels[dItem['type']]
            if 'prob' in dItem:
                sOut = '%s\t%s\t%s' % (sLabel, _utf_string(dItem['prob']), sLine)
            else:
                # This branch used to index c_labels with an undefined
                # `projectname`, so every row without 'prob' was dropped.
                sOut = '%s\t%s' % (sLabel, sLine)
            lText.append(sOut.strip())
        except Exception:
            self.logger.error("Raise exception: \n%s\nWith data: \n%s" %
                              (traceback.format_exc(), sLine))

    try:
        XLSDeal().toXlsFile(lText, output)
    except Exception:
        self.logger.error("Raise exception: \n%s\n" % (traceback.format_exc()))
        return False
    return True