def __isCommand(self, key, word2):
    """decide if a keyword and next word are of the form
      'command arg, ...'
    which will get translated to
      'command(arg, ...)'
    to allow 'command syntax'
    """
    # this could be in one long test, but we simplify:
    # first test key:
    if (not isValidName(key) or key in self.friends or
            key.startswith('#') or len(key) < 1 or len(word2) < 1):
        return False
    if self._larch is not None:
        comms = self._larch.symtable.get_symbol('_sys.valid_commands',
                                                create=True)
        if key not in comms:
            return False
    # next test word2
    return (isValidName(word2) or isNumber(word2) or
            isLiteralStr(word2))
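# Every snippet in this collection leans on an isNumber helper (variously
# imported as isNumber, util.isNumber or utils.isNumber) whose definition
# is not included here. A minimal sketch of what such a helper typically
# looks like -- an assumption, not the original code:
def isNumber(value):
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False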
def classify(self, data):
    classProbabilities = {}  # Stores our final probabilities
    for className in self.classifierBins:
        probabilityProd = None
        # Calculate product of our probabilities
        for key in data:
            if key != settings.CLASSIFIER_NAME:
                if not util.isNumber(data[key]):
                    probKey = str(data[key]) + " given " + className
                    if probabilityProd is None:
                        probabilityProd = self.probability[probKey]
                    else:
                        probabilityProd *= self.probability[probKey]
                else:
                    prob = util.gaussianDensity(
                        data[key],
                        self.numericBins[key + ' given ' + className + ' mean'],
                        self.numericBins[key + ' given ' + className + ' stdev'])
                    if probabilityProd is None:
                        probabilityProd = prob
                    else:
                        probabilityProd *= prob
        classProbabilities[className] = probabilityProd * self.probability[className]
    maxProb = [0, None]
    for className in classProbabilities:
        if classProbabilities[className] > maxProb[0]:
            maxProb = [classProbabilities[className], className]
    return maxProb[1]
def filter_keyword():
    fname = u'D:/Data/词检索/国家战略.txt'
    with open(fname, 'r') as f:
        files = [line.strip() for line in f.readlines()]
    for f in files:
        with open(f, 'r') as fp:
            text = fp.readlines()
        # join and decode once, so jieba always receives a unicode string
        text = ' '.join(text).decode('utf-8')
        if f.startswith('D:/Data/gov'):
            if text.find(u'国家战略') != -1 and f.decode('gbk').find(u'习近平') != -1:
                print f
        doc = jieba.cut(text)
        doc = [word for word in doc
               if (not word in stop_words) and (not utils.isNumber(word))]
        doc = ' '.join(doc)
        doc = doc.replace('\t', ' ').replace('\n', ' ')
        with open(text_file, 'a') as fp:
            fp.write(doc.encode('utf-8') + '\n')
def get_topic_heat(topic_number, look_back=7):
    fname = '%s/topic_words/topic_%s_twords.xlsx' % (const.TOPIC_DIR, topic_number)
    df = pd.read_excel(fname)
    heat_count = {}
    for word in df['word']:
        if not utils.isNumber(word) and word in word_count:
            for day, value in word_count[word].iteritems():
                if not heat_count.has_key(day):
                    heat_count[day] = 0
                heat_count[day] += value
    df = pd.DataFrame({
        'date': total_word_count.keys(),
        'count': total_word_count.values()
    })
    df.index = pd.to_datetime(df['date'], format="%Y-%m-%d")
    df.sort_index(inplace=True)
    heat_df = pd.DataFrame({
        "date": heat_count.keys(),
        "absolute": heat_count.values()
    })
    heat_df.index = heat_df["date"].map(
        lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
    heat_df.sort_index(inplace=True)
    heat_df['total'] = df['count']
    # rolling k-day sums
    heat_df.loc[:, 'total'] = heat_df['total'].rolling(window=look_back).sum()
    heat_df.loc[:, 'absolute'] = heat_df['absolute'].rolling(window=look_back).sum()
    heat_df['relative'] = heat_df['absolute'] * 100. / heat_df['total']
    heat_df.to_excel("%s/%s.xlsx" % (const.TOPIC_CLASS_DIR, topic_number),
                     index=False)
def update_files(prefix, files):
    with open(const.WORD_CNT_FILE, 'rb') as fp:
        word_count = pickle.load(fp)
    with open(const.WORD_CNT_CHECKED_FILE, 'r') as fp:
        checked_list = set([line.strip().decode('utf-8') for line in fp.readlines()])
    f_check = open(const.WORD_CNT_CHECKED_FILE, 'a')
    for f in files:
        date = f.split('_')[0]
        if not date.startswith('2016') and not date.startswith('2017'):
            continue
        date = date.split(' ')[0]
        f = '%s/%s' % (prefix, f)
        if f in checked_list or not f.endswith('.txt'):
            continue
        f_check.write(f.encode('utf-8') + '\n')
        with open(f, 'r') as fp:
            text = fp.readlines()
        content = ' '.join(text)
        doc = [word for word in jieba.cut(content) if not word in stop_words]
        for word in doc:
            if utils.isNumber(word):
                continue
            if not word_count.has_key(word):
                word_count[word] = {}
            if not word_count[word].has_key(date):
                word_count[word][date] = 0
            word_count[word][date] += 1
    with open(const.WORD_CNT_FILE, 'wb') as fp:
        pickle.dump(word_count, fp)
def get_wallst_text():
    document = ''
    for y in years:
        files = ["%s/%s/%s" % (WALLSTCN_DIR, y, f)
                 for f in os.listdir("%s/%s/" % (WALLSTCN_DIR, y))]
        print y, len(files)
        if len(files) > 0:
            for f in files:
                with open(f, 'r') as fp:
                    text = fp.readlines()
                time = text[0].split('_')[0]
                time = time.strip()
                if time.find('-') != -1:
                    dt = datetime.datetime.strptime(time, "%Y-%m-%d %H:%M")
                else:
                    dt = datetime.datetime.strptime(time, '%Y年%m月%d日 %H:%M:%S')
                date = dt.strftime("%Y-%m-%d")
                content = " ".join(text[1:])
                doc = [word for word in jieba.cut(content)
                       if (not word in stop_words) and (not utils.isNumber(word))]
                document += '\n' + ' '.join(doc)
    with open(TEXT_FILE, 'w') as fp:
        fp.write(document.encode('utf-8'))
def writeMember(self, obj, memberName):
    if isString(obj):
        logging.warning(u"String as object provided! "
                        + self._warningPrefix(obj, memberName))
    if isInteger(memberName) and isList(obj):
        member = obj[memberName]
        memberName = str(memberName)
    else:
        member = getattr(obj, memberName, None)
    if member is None:
        self.log(u"skipped " + self._warningPrefix(obj, memberName)
                 + u"It is empty or does not exist (=None).")
        return
    if isCallable(member) and not hasattr(member, "hdfWrite"):
        member = member()
    if hasattr(member, "hdfWrite"):  # support instances and types
        # store the member in a group of its own
        oldLocation = self.location
        self._location = "/".join((oldLocation.rstrip('/'), memberName))
        member.hdfWrite(self)  # recursion entry, mind the loops!
        self._location = oldLocation
    elif isList(member):
        self.writeDataset(memberName, member)
    elif isString(member) or isNumber(member):
        self.writeAttribute(memberName, member)
    else:
        self.log(u"skipped " + self._warningPrefix(obj, memberName)
                 + "(={}) It is not a compatible value type!".format(
                     classname(member)))
def removeTask(self, task):
    '''Remove task from subtasks.'''
    if not isNumber(task):
        for t in self.get('Subtasks'):
            if t.get('Task') == task:
                task = t.get('ID')
                break
    if not isNumber(task) or len(self.get('Subtasks')[int(task)].get('Subtasks')) > 0:
        # task may be an int ID at this point, so stringify it for the prompt
        if not confirm("Are you sure you wish to remove '" + str(task) + "'?", self.win):
            return
    if isNumber(task):
        self['Subtasks'] = [t for t in self.get('Subtasks') if t.get('ID') != int(task)]
    elif task.lower() == "all" or task.lower() == "-a":
        self['Subtasks'] = []
    for i in range(len(self['Subtasks'])):
        self['Subtasks'][i]['ID'] = i
def vk2xmpp(id):
    """
    Converts a numeric VK ID to a Jabber ID and vice versa

    Args:
        id: a Jabber or VK id

    Returns:
        the numeric id if "id" is an int, str(int) or id@TransportID
        id@TransportID if "id" is a non-numeric name
        TransportID unchanged if "id" equals TransportID
    """
    if not utils.isNumber(id) and "@" in id:
        id = id.split("@")[0]
    if utils.isNumber(id):
        id = int(id)
    elif id != TransportID:
        id = u"%s@%s" % (id, TransportID)
    return id
def setValue(selforcls, newValue, clip=True):
    if newValue is None:
        return  # ignore
    testfor(isNumber(newValue), DefaultValueError,
            u"A value has to be numerical! ({})".format(newValue))
    if clip:
        # clip to min/max values:
        newValue = selforcls.clip(newValue)
    super(ParameterNumerical, selforcls).setValue(newValue)
def setDecimals(selforcls, newDecimals):
    if newDecimals is not None:
        testfor(isNumber(newDecimals) and newDecimals >= 0,
                DecimalsError,
                "Parameter decimals has to be a non-negative number!")
    else:
        start, end = selforcls._valueRange
        newDecimals = round(math_log10(math_fabs(end - start)))
    newDecimals = max(newDecimals, 0)
    newDecimals = min(newDecimals, sys.float_info.max_10_exp)
    selforcls._decimals = int(newDecimals)
def setDisplayValues(selforcls, newDisplayValues):
    if newDisplayValues is None:
        return
    testfor(isMap(newDisplayValues), DisplayValuesError,
            "Expected a display value mapping of numbers to text!")
    testfor(all([isNumber(v) for v in newDisplayValues.keys()]),
            DisplayValuesError, "Display value keys have to be numbers!")
    testfor(all([isString(s) for s in newDisplayValues.values()]),
            DisplayValuesError, "Display values have to be text!")
    # TODO: also add reverse lookup
    selforcls._displayValues = newDisplayValues
def getExploded(self, namebase, atomsMass):
    import itertools

    if "p_" in self.name:
        namebase = "p_" + namebase
    if "o_" in self.name:
        namebase = "o_" + namebase
    if "i_" in self.name:
        namebase = "i_" + namebase
    if "c_" in self.name:
        namebase = "c_" + namebase
    if "l_" in self.name:
        namebase = "l_" + namebase
    if "m_" in self.name:
        namebase = "m_" + namebase

    # copy namebase to replace after
    specName = namebase + ""
    # store keys sorted by inverse length
    atoms = sorted(list(atomsMass.keys()), key=lambda xx: len(xx), reverse=True)
    # produce unique character combinations
    alpha = ["".join(x) for x in list(itertools.product("XYZ", repeat=4))]
    # check to have enough combinations
    if len(atoms) > len(alpha):
        sys.exit("ERROR: in species parser alpha needs to be extended!")
    # replace atoms slash-separated
    for i in range(len(atoms)):
        specName = specName.replace(atoms[i], "/" + alpha[i] + "/")
    # replace double slashes
    while "//" in specName:
        specName = specName.replace("//", "/")
    # split at slashes
    aspec = [x for x in specName.split("/") if x != ""]
    # search for number and when found multiply previous non-number
    exploded = []
    aold = None  # guards against a leading number in the token stream
    for a in aspec:
        if isNumber(a):
            for j in range(int(a) - 1):
                exploded.append(aold)
        else:
            exploded.append(a)
            aold = a
    # store exploded with real atom names
    try:
        exploded = [atoms[alpha.index(x)] for x in exploded]
    except ValueError:
        print("ERROR: wanted to parse ", namebase)
        print(" but something went wrong with ", exploded)
        print(" Available atoms are:", atoms)
        print(" Add to atom list file if needed.")
        sys.exit()
    return sorted(exploded)
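# Illustrative walk-through of getExploded (the atom table is hypothetical):
# with atomsMass = {"H": 1.0, "C": 12.0, "O": 16.0}, a namebase of "CH3OH"
# tokenizes to C, H, 3, O, H; the "3" repeats the preceding H twice more,
# so the function returns sorted(['C', 'H', 'H', 'H', 'O', 'H'])
# == ['C', 'H', 'H', 'H', 'H', 'O'].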
def convDAT(lines, savename):
    """
    Convert .DAT diffraction file format in a standard text file

    :param lines: List of the text lines containing the data
    :param savename: Save the converted data to the file 'savename'
    :return: None if success otherwise return an error message.
    """
    # Remove empty lines
    newlines = [lin for lin in lines if lin]
    # Remove comment lines (starting with #)
    lines = [lin for lin in newlines if not lin.startswith('#')]
    # Remove multiple spaces
    newlines = [" ".join(lin.split()) for lin in lines]

    # The first line should contain 2theta start, step and final value
    params = newlines[0].split()
    errmsg = "Unknown file format"
    if (len(params) >= 3 and isNumber(params[0]) and isNumber(params[1])
            and isNumber(params[2])):
        start = float(params[0])
        step = float(params[1])
        stop = float(params[2])
        if stop > start:
            errmsg = None
    if errmsg is not None:
        return errmsg

    Ilst = [lin.split() for lin in newlines[1:]]
    # Flatten the list of lists
    fIlst = [item for sublist in Ilst for item in sublist]
    lines = []
    for j, I in enumerate(fIlst):
        x = start + j * step
        lin = "{0}\t{1}".format(x, I)
        lines.append(lin)
    # Save the new list in a file
    with open(savename, 'w') as outfile:
        outfile.write("\n".join(lines))
    return errmsg
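# Hypothetical usage sketch for convDAT (the file names are illustrative,
# not from the original project):
#
#     with open("scan.dat") as fp:
#         err = convDAT(fp.read().splitlines(), "scan_converted.txt")
#     if err:
#         print(err)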
def filterNodes(self, x=None, y=None, z=None, t=None):
    l = self.nodes
    if x is not None:
        l = [a for a in l if a.x == x]
    if y is not None:
        l = [a for a in l if a.y == y]
    else:
        # default --> only full states
        l = [a for a in l if isNumber(a.y)]
    if z is not None:
        l = [a for a in l if a.z == z]
    if t is not None:
        l = [a for a in l if a.t == t]
    return l
def addNode(self, node):
    if node.name in self.nodeDict:
        raise Exception(f"node {node.name} already in graph")
    self.nodes.append(node)
    self.nodeDict[node.name] = node
    t = node.t
    y = node.y
    if isNumber(y):
        l = len(self.nodesFiltT)
        if t >= l:
            # pad with independent empty lists up to index t
            self.nodesFiltT += [[] for _ in range(t + 1 - l)]
        self.nodesFiltT[t] += [node]
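# Why the padding above uses a comprehension -- the obvious-looking
# alternatives behave differently:
#
#     [[] * 3]                 -> [[]]           ([] * 3 is [], one element)
#     [[]] * 3                 -> three references to the SAME list
#     [[] for _ in range(3)]   -> three independent empty lists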
def update_wallst_word_count():
    with open(const.WORD_CNT_FILE, 'rb') as fp:
        word_count = pickle.load(fp)
    with open(const.WORD_CNT_CHECKED_FILE, 'r') as fp:
        checked_list = set([line.strip().decode('utf-8') for line in fp.readlines()])
    f_check = open(const.WORD_CNT_CHECKED_FILE, 'a')
    for y in years:
        files = ["%s/%s/%s" % (WALLSTCN_DIR, y, f)
                 for f in os.listdir("%s/%s/" % (WALLSTCN_DIR, y))]
        print y, len(files)
        if len(files) > 0:
            for f in files:
                if f in checked_list:
                    continue
                f_check.write(f.encode('utf-8') + '\n')
                with open(f, 'r') as fp:
                    text = fp.readlines()
                time = text[0].decode('utf-8').split('_')[0]
                time = time.strip()
                if time.find('-') == -1:
                    time = time.replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
                date = time.split(' ')[0]
                content = " ".join(text[1:])
                doc = [word for word in jieba.cut(content) if not word in stop_words]
                for word in doc:
                    if utils.isNumber(word):
                        continue
                    if not word_count.has_key(word):
                        word_count[word] = {}
                    if not word_count[word].has_key(date):
                        word_count[word][date] = 0
                    word_count[word][date] += 1
    with open(const.WORD_CNT_FILE, 'wb') as fp:
        pickle.dump(word_count, fp)
def disAssembly(self, md, inSymbols, code, address, outputPath):
    try:
        # open the output file once instead of once per instruction
        with open(outputPath + "/" + "assembly", "a+") as f:
            for (address, size, mnemonic, op_str) in md.disasm_lite(code, address):
                if utils.isNumber(op_str.lstrip("#")):
                    op_fun = inSymbols.get(op_str.lstrip("#"))
                    if op_fun is not None:
                        op_str = "[" + op_str.lstrip("#") + "->" + op_fun + "]"
                f.write("0x%x:\t%s\t%s" % (address, mnemonic, op_str))
                f.write("\n")
    except CsError as e:
        print("ERROR: %s" % e)
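# Hypothetical driver for disAssembly. Cs, CS_ARCH_ARM and CS_MODE_ARM come
# from the capstone package (whose disasm_lite yields
# (address, size, mnemonic, op_str) tuples, as consumed above); the code
# bytes and symbol table here are made up for illustration:
#
#     from capstone import Cs, CS_ARCH_ARM, CS_MODE_ARM
#     md = Cs(CS_ARCH_ARM, CS_MODE_ARM)
#     obj.disAssembly(md, {"4096": "start"}, b"\x00\xf0\x20\xe3", 0x1000, ".")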
def plot(self, stats):
    xvec = stats["seriesKey"]
    if not all((isNumber(x) for x in xvec)):
        xvecNew = range(len(xvec))
        self._axes.set_xticks(xvecNew)
        self._axes.set_xticklabels(xvec, rotation=15)
        xvec = xvecNew
    self._axes.errorbar(xvec, stats["mean"], stats["meanStd"],
                        label=stats["cfg"])
    self._axes.set_xlabel(stats["seriesKeyName"])
    self._axes.set_ylabel("mean")
    self._axes.set_title(stats["title"])
def editTask(self, task, cbox):
    '''Edit the task name.'''
    t = None
    if isNumber(task):
        if int(task) < len(self.get('Subtasks')) and int(task) >= 0:
            t = self.get('Subtasks')[int(task)]
    else:
        for item in self.get('Subtasks'):
            if item.get('Task') == task:
                t = item
    if t is None:  # no matching subtask found
        return
    cbox.text = t.get('Task')
    cbox.bl = 0
    cbox.br = len(cbox.text)
    while True:
        self.win.erase()
        cbox.draw()
def __isCommand(self, key, word2):
    """decide if a keyword and next word are of the form
      'command arg, ...'
    which will get translated to
      'command(arg, ...)'
    to allow 'command syntax'
    """
    # this could be in one long test, but we simplify:
    # first test key:
    if (not isValidName(key) or key in self.friends or
            key.startswith('#') or len(key) < 1 or len(word2) < 1):
        return False
    # next test word2
    return (isValidName(word2) or isNumber(word2) or
            isLiteralStr(word2))
def setValueRange(selforcls, newRange):
    testfor(isList(newRange), ValueRangeError,
            "A value range is mandatory for a numerical parameter!")
    testfor(len(newRange) == 2, ValueRangeError,
            "A value range has to consist of two values!")
    testfor(all([isNumber(v) for v in newRange]), ValueRangeError,
            "A value range has to consist of numbers only!")
    minVal, maxVal = min(newRange), max(newRange)
    # minVal = max(minVal, -sys.float_info.max)
    # maxVal = min(maxVal, sys.float_info.max)
    # avoid inf/nan showing up somewhere
    # otherwise, inf might be ok if UI elements support it (going in&out)
    minVal = max(minVal, -1e200)  # as good as -inf?...
    maxVal = min(maxVal, 1e200)   # as good as inf?...
    selforcls._valueRange = minVal, maxVal
    # apply limits to value:
    selforcls.setValue(selforcls.clip())
def slotItemChanged(self, item):
    """
    Callback function when the user changes a cell content.

    :param item: the table item that was changed
    :return: nothing
    """
    if not self.init:
        text = item.text()
        if isNumber(text):
            oldval = self.data[item.column()][item.row()]
            newval = float(text)
            if newval != oldval:
                self.data[item.column()][item.row()] = newval
                self.pltw.dirty = True
                self.pltw.updatePlot()
                if self.pltw.dcursor is not None:
                    self.pltw.dcursor.updateLinePos()
def out_to_csv(in_filename, out_filename):
    data = []
    i = -1
    with open(in_filename) as in_f:
        lines = in_f.readlines()
        for line in lines:
            if not isNumber(line):
                # a non-numeric line starts a new row
                i += 1
                data.append([])
            else:
                data[i].append(float(line))
    with open(out_filename, 'w') as out_f:
        for i in range(len(data)):
            for j in range(len(data[i])):
                if j == len(data[i]) - 1:
                    out_f.write("{}\n".format(data[i][j]))
                else:
                    out_f.write("{},".format(data[i][j]))
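# A sketch of an equivalent writer that delegates the separators to the
# standard csv module (Python 3; behaviour mirrors out_to_csv above, and
# the isNumber helper is assumed to be in scope):
import csv

def out_to_csv_alt(in_filename, out_filename):
    data, current = [], None
    with open(in_filename) as in_f:
        for line in in_f:
            if not isNumber(line):
                current = []  # a non-numeric line starts a new row
                data.append(current)
            else:
                current.append(float(line))
    with open(out_filename, "w", newline="") as out_f:
        csv.writer(out_f).writerows(data)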
def convPRF(lines, savename):
    """
    Convert a PRF file generated by Fullprof in a standard text file.
    Save the converted data in the file 'savename'.

    :param lines: list of the text lines containing the PRF data
    :param savename: name of file to which converted data will be saved.
    :return: None if success otherwise return an error message.
    """
    if not any(" 2Theta" in lin for lin in lines) or len(lines) < 7:
        return "Unknown file format"
    nl = len(lines)
    nitems = lines[1].split(' ')
    nitems = [x for x in nitems if x]
    if not isNumber(nitems[1]):
        return "Unknown file format"
    npt = int(nitems[1])
    if npt > nl:
        return "Unknown file format"

    startidx = [idx for idx, s in enumerate(lines) if " 2Theta" in s][0] + 1
    errmsg = None
    newlines = []
    # Add a # at the beginning of the first line
    newlines.append('#' + lines[0])
    newlines.extend(["# 2Theta\tYobs\tYcal\tYobs_Ycal"])
    # For each line keep only the first 4 items
    for l, line in enumerate(lines):
        if l >= startidx and l < npt + startidx:
            items = line.split('\t')
            line = [items[i] for i in range(4)]
            newlines.append("\t".join(line))
    # Save the new list in a file
    with open(savename, 'w') as outfile:
        outfile.write("\n".join(newlines))
    return errmsg
def load_sentences(path, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    with open(path, 'r') as f:
        for line in f.readlines():
            line = isNumber(line.rstrip()) if zeros else line.rstrip()
            if not line:
                if len(sentence) > 0:
                    sentences.append(sentence)
                    sentence = []
            else:
                word = line.split('\t')
                assert len(word) >= 3  # three columns are consumed below
                sentence.append([word[0], word[1], word[2]])
    if len(sentence) > 0:
        sentences.append(sentence)
    return sentences
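# Expected input shape (tab-separated, one token per line, sentences
# separated by blank lines); the tag values here are illustrative:
#
#     EU      NNP     B-ORG
#     rejects VBZ     O
#
#     Peter   NNP     B-PER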
def extract_text(path):
    files = [f for f in os.listdir(path)]
    content = []
    for f in files:
        fname = u'%s/%s' % (path, f)
        with open(fname, 'r') as fp:
            text = fp.readlines()
        text = ' '.join(text).decode('utf-8')
        doc = jieba.cut(text)
        doc = [word for word in doc
               if (not word in stop_words) and (not utils.isNumber(word))]
        doc = ' '.join(doc)
        doc = doc.replace('\t', ' ').replace('\n', ' ')
        content.append(doc)
    with open(text_file, 'w') as f:
        f.write('\n'.join(content).encode('utf-8'))
def getNameLatex(self, name):
    # electrons are special
    if name == "E_gas":
        return "e$^-$"
    latexName = ""
    # loop on name characters
    for char in list(name):
        if char in ["+", "-"]:
            # signs are superscripts
            latexName += "$^" + char + "$"
        elif isNumber(char):
            # numbers are subscripts
            latexName += "$_" + char + "$"
        else:
            # standard characters
            latexName += char
    # replace _gas with dedicated latex command
    latexName = latexName.replace("_gas", "$\\gas$")
    # return latex name
    return latexName
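# Examples of the character-wise translation above:
#     "CH3+"  -> "CH$_3$$^+$"   (digits become subscripts, signs superscripts)
#     "E_gas" -> "e$^-$"        (electrons are special-cased)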
def save_wallst_text():
    years = ['2010', '2011', '2012', '2013', '2014',
             '2015', '2016', '2017', '2018']
    with open(const.WST_WORD_CNT_CHECKED_FILE, 'r') as fp:
        checked_list = set([line.strip() for line in fp.readlines()])
    for y in years:
        files = [f for f in os.listdir('%s/%s' % (const.WALLSTCN_DIR, y))]
        print y, len(files)
        for f in files:
            fname = '%s/%s/%s' % (const.WALLSTCN_DIR, y, f)
            if fname in checked_list:
                with open(fname, 'r') as fp:
                    content = ' '.join([line.strip() for line in fp.readlines()[1:]])
                content = jieba.cut(content)
                content = [w for w in content
                           if (w not in stop_words) and (not utils.isNumber(w))]
                content = ' '.join(content).encode('utf-8')
                with open(const.WST_TOPIC_FILE, 'a') as fp:
                    fp.write(content + '\n')
def get_word_heat(key_word,
                  threshold=0.5,
                  similar_words=1000,
                  start_date="2016-01-01",
                  end_date=datetime.datetime.today().strftime("%Y-%m-%d"),
                  look_back=7):
    print(key_word)
    heat_count = word_count[key_word].copy()
    heat_count_relative = word_count[key_word].copy()
    heat_count_weighted = word_count[key_word].copy()
    similar_df = pd.DataFrame({"word": [key_word], "distance": [1.0]})
    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.datetime.today()
    for word, dis in model.most_similar(key_word, topn=similar_words):
        if dis < threshold:
            break
        if not utils.isNumber(word) and word in word_count:
            similar_df.ix[similar_df.shape[0]] = [dis, word]
            for day, value in word_count[word].iteritems():
                if not heat_count.has_key(day):
                    heat_count[day], heat_count_relative[day], heat_count_weighted[day] = 0, 0, 0
                heat_count[day] += value
                heat_count_relative[day] += value
                heat_count_weighted[day] += value * dis
    # fill in days on which none of the words occurred
    current_date = start_date
    while current_date < end_date:
        key = current_date.strftime("%Y-%m-%d")
        if not heat_count.has_key(key):
            heat_count[key], heat_count_relative[key], heat_count_weighted[key] = 0, 0, 0
        if not total_word_count.has_key(key):
            total_word_count[key] = 0
        current_date = current_date + datetime.timedelta(1)
    df = pd.DataFrame({
        'date': total_word_count.keys(),
        'count': total_word_count.values()
    })
    df.index = pd.to_datetime(df['date'], format="%Y-%m-%d")
    df.sort_index(inplace=True)
    heat_df = pd.DataFrame({
        "date": heat_count.keys(),
        "absolute": heat_count.values(),
        'relative': heat_count_relative.values(),
        'weighted': heat_count_weighted.values()
    })
    heat_df.index = heat_df["date"].map(
        lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
    heat_df.sort_index(inplace=True)
    heat_df['total'] = df['count']
    # rolling k-day sums
    heat_df.loc[:, 'total'] = heat_df['total'].rolling(window=look_back).sum()
    heat_df.loc[:, 'absolute'] = heat_df['absolute'].rolling(window=look_back).sum()
    heat_df.loc[:, 'relative'] = heat_df['relative'].rolling(
        window=look_back).sum() * 100. / heat_df['total']
    heat_df.loc[:, 'weighted'] = heat_df['weighted'].rolling(
        window=look_back).sum() * 100. / heat_df['total']
    similar_df.to_csv("%s/%s_%.1f_words.csv" % (const.ASSET_CLASS_DIR, key_word, threshold),
                      index=False, encoding="utf-8")
    heat_df.to_csv("%s/%s_%.1f.csv" % (const.ASSET_CLASS_DIR, key_word, threshold),
                   index=False)
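# Heat measures produced above, per rolling look_back-day window:
#     absolute = summed counts of the key word and its similar words
#     relative = 100 * absolute / total word count in the window
#     weighted = like relative, but each similar word's count is scaled
#                by its similarity score to the key word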
def readArff(fileSrc):
    # main variables to be returned
    relation = ""        # relation
    attributes = []      # attribute list
    rawData = []         # main data storage
    reverseLookup = {}   # store by value for reverse lookup
    continuousVariables = {}
    categoricalVariables = {}

    dataFile = codecs.open(fileSrc, 'rb', 'utf-8')  # specify utf-8 encoding
    print "Reading file..."
    lines = dataFile.readlines()  # read all lines
    if settings.PROGRESS_BAR == True:
        util.updateProgress(0)  # create a progress bar
    # test every line and extract its relevant information
    for idx, line in enumerate(lines):
        if settings.PROGRESS_BAR == True:
            util.updateProgress(float(idx) / float(len(lines)))
        if line[0] == '%':  # ignore comments
            continue
        elif line[0] == '@':  # if is metadata
            if '@relation' in line:  # if relation
                arrayLine = line.split(" ")
                relation = arrayLine[1]
            elif "@attribute" in line:  # if attribute
                arrayLine = line.split(" ")
                attributes.append([arrayLine[1]])
                if "real" not in arrayLine[2]:  # attribute is categorical
                    attrs = re.search('\{(.*?)\}', line).group()  # select text between brackets
                    attrs = re.sub('[\{\}]', "", attrs)  # remove brackets
                    newAttrs = attrs.split(", ")
                    options = []
                    for attr in newAttrs:
                        options.append(attr)
                    attributes[len(attributes) - 1].append(options)
                else:  # attribute is real
                    attributes[len(attributes) - 1].append('real')
        elif line[0] == " ":
            continue
        else:
            line = line.replace(" ", "")
            line = line.replace("\n", "")
            line = line.split(",")
            newDataEntry = {}  # create a new object to store our row data
            for idx, value in enumerate(line):  # for every column of data
                attribute = attributes[idx]
                if util.isNumber(value):  # convert string to float if it's a number
                    value = float(value)
                # add value to our reverse lookup under the key "attributeName attributeValue"
                rlKey = attribute[0] + " " + str(value)
                if rlKey in reverseLookup:
                    reverseLookup[rlKey].append(len(rawData))  # index of the current row
                else:
                    reverseLookup[rlKey] = [len(rawData)]  # new index list for this key
                # fill our new data entry
                newDataEntry[attribute[0]] = value  # store the value under its proper key
                # add variables to our bins
                if attribute[1] == 'real':  # real attributes go in a continuous bin
                    if attribute[0] in continuousVariables:
                        continuousVariables[attribute[0]].add(value, line[len(line) - 1])
                    else:
                        continuousVariables[attribute[0]] = util.continuousBin(attribute[0])
                        continuousVariables[attribute[0]].add(value, line[len(line) - 1])
                else:  # categorical attributes go in a categorical bin
                    if attribute[0] in categoricalVariables:
                        categoricalVariables[attribute[0]].add(value, line[len(line) - 1])
                    else:
                        categoricalVariables[attribute[0]] = util.categoricalBin(attribute[1])
                        categoricalVariables[attribute[0]].add(value, line[len(line) - 1])
            rawData.append(newDataEntry)  # append data entry to all of our data

    results = {}
    results['data'] = rawData
    results['attributes'] = attributes
    results['relation'] = relation
    results['lookup'] = reverseLookup
    results['continuousVariables'] = continuousVariables
    results['categoricalVariables'] = categoricalVariables
    if settings.PROGRESS_BAR == True:
        util.updateProgress(1)
    print "\nFile read complete \n"
    return results
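# A minimal ARFF fragment of the shape this reader expects (relation,
# attribute names and values are illustrative):
#
#     @relation weather
#     @attribute temperature real
#     @attribute outlook {sunny, overcast, rainy}
#     @data
#     72.5,sunny
#     64.0,rainy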
def __init__(self, trainingData, attributes):
    print "Training Bayesian Classifier with " + str(len(trainingData)) + " data entries.\n"
    # COUNT VARIABLES
    print "Counting all variables:"
    if settings.PROGRESS_BAR == True:
        util.updateProgress(0)
    # Sort the training data into bins by classifier, recording the counts for each variable
    numOfEntries = float(len(trainingData))
    categoricalCounts = {}    # Holds counts of each category
    self.classifierBins = {}  # Holds the data points for each classifier
    self.probability = {}
    self.numericBins = {}
    count = 0.0
    for entry in trainingData:  # for every data row...
        count += 1.0
        if settings.PROGRESS_BAR == True:
            util.updateProgress(count / numOfEntries)
        for attr in entry:  # for each attribute...
            if not util.isNumber(entry[attr]):  # categorical attributes
                if entry[attr] in categoricalCounts:  # if we already have a key for this
                    categoricalCounts[entry[attr]] += 1.0  # increment the key
                else:  # otherwise create a new key and set it to 1
                    categoricalCounts[entry[attr]] = 1.0
                if attr == settings.CLASSIFIER_NAME:  # if we are on the classifier, in this case "class"
                    if entry[attr] in self.classifierBins:  # add the row to the classifier bins
                        self.classifierBins[entry[attr]].append(entry)
                    else:
                        self.classifierBins[entry[attr]] = [entry]
            else:  # numeric attributes
                key = attr + ' given ' + entry[settings.CLASSIFIER_NAME]  # declare a key
                if key in self.numericBins:  # insert the value at its sorted position
                    bisect.insort(self.numericBins[key], entry[attr])
                else:
                    self.numericBins[key] = [entry[attr]]  # create a list for it
    # DEAL WITH CONTINUOUS VARIABLES
    initialKeys = self.numericBins.keys()
    for key in initialKeys:
        self.numericBins[key + " mean"] = np.mean(self.numericBins[key])   # mean of each variable
        self.numericBins[key + " stdev"] = np.std(self.numericBins[key])   # std deviation of each variable
    # if we have not stored values for certain attributes, we do so now, using smoothing techniques
    for attr in attributes:
        if attr[1] != 'real':
            for attrType in attr[1]:
                if attrType not in self.probability:
                    self.probability[attrType] = .5 / numOfEntries
                    for name in self.classifierBins:
                        self.probability[attrType + " given " + name] = .5 / len(self.classifierBins[name])
    # ASSIGN PROBABILITIES
    print "\n\nAssigning probabilities:"
    # Now we have bins holding our classifiers and counts of all our variables
    if settings.PROGRESS_BAR == True:
        util.updateProgress(0)
    for key in categoricalCounts.keys():  # assign categorical counts
        self.probability[key] = self.getProbability(categoricalCounts[key], numOfEntries)
    attrs = categoricalCounts.keys()  # the attrs we will iterate through
    count = 0.0  # counter used to update the status bar
    for key in self.classifierBins.keys():  # for each classifier type...
        count += 1
        if settings.PROGRESS_BAR == True:
            util.updateProgress(count / float(len(self.classifierBins.keys())))
        for row in self.classifierBins[key]:  # for each row in the classifierBins...
            for rowKey in row:  # for each key in the row...
                if not util.isNumber(row[rowKey]):  # categorical variable
                    newKey = row[rowKey] + " given " + key  # create a key variable
                    if newKey in categoricalCounts:  # count items included in that section
                        categoricalCounts[newKey] += 1.0
                    else:
                        categoricalCounts[newKey] = 1.0
        for attrValue in attrs:  # for every attrValue...
            countKey = attrValue + " given " + key  # create a key
            if countKey in categoricalCounts:  # assign conditional probabilities
                self.probability[countKey] = self.getProbability(
                    categoricalCounts[countKey], len(self.classifierBins[key]))
            else:
                self.probability[countKey] = self.getProbability(
                    0, len(self.classifierBins[key]))
    if settings.PROGRESS_BAR == True:
        util.updateProgress(1)
    print "\nModel creation complete\n"
def setStepping(selforcls, newStepping):
    if newStepping is None:
        return
    testfor(isNumber(newStepping), SteppingError,
            "Parameter has to be a number!")
    selforcls._stepping = newStepping
def isDataType(cls, value):
    """ParameterNumerical is a fallback for all numbers that are not float."""
    return isNumber(value) and not isinstance(value, float)
def isDigit(docTerm):
    termCopy = re.sub(",", "", docTerm)
    if util.isNumber(termCopy):
        return True
    return False
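# The test above is equivalent to the one-liner
#     return util.isNumber(re.sub(",", "", docTerm))
# the comma substitution lets grouped numbers such as "1,000" pass.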