def __WriteDict(dic, collection, FieldK, FieldV):
    """Persist a dictionary into a collection of the 'mining' database.

    Every key/value pair of ``dic`` becomes one document of the form
    ``{FieldK: key, FieldV: value}``; all documents are sent with a single
    ``insert`` call.

    Args:
        dic: Mapping whose entries are written out.
        collection: Name of the target collection.
        FieldK: Document field name under which each key is stored.
        FieldV: Document field name under which each value is stored.
    """
    db = MongoDB.getConnection('mining')
    docs = [{FieldK: k, FieldV: v} for k, v in dic.iteritems()]
    if not docs:
        # The legacy PyMongo insert() refuses an empty document list, so an
        # empty dictionary simply results in no write at all.
        return
    db[collection].insert(docs)
def __ReadDict(dic, collection, FieldK, FieldV):
    """Fill ``dic`` from every document of a collection in 'mining'.

    For each document returned by ``find()``, the value stored under
    ``FieldK`` is used as the dictionary key and the value stored under
    ``FieldV`` as the dictionary value. Whichever of the two comes from a
    field literally named "term" is encoded to UTF-8 before being stored.

    Args:
        dic: Dictionary that is updated in place.
        collection: Name of the collection to read.
        FieldK: Document field that supplies the keys.
        FieldV: Document field that supplies the values.
    """
    term_field = "term"
    key_is_term = FieldK == term_field
    value_is_term = FieldV == term_field

    cursor = MongoDB.getConnection('mining')[collection].find()
    for record in cursor:
        key, value = record[FieldK], record[FieldV]
        if key_is_term:
            key = key.encode('utf-8')
        if value_is_term:
            value = value.encode('utf-8')
        dic[key] = value
def __init__(self, config, nodeName, loadFromDB = False):
    """Set up the instance from its configuration node.

    Stores the configuration node and the ``loadFromDB`` flag, initialises
    ``GlobalInfo`` from the "__global__" node, then reads the data source
    settings (database, collection, field) and opens the database
    connection.

    Args:
        config: Configuration root exposing ``GetChild``.
        nodeName: Name of the child node that configures this instance.
        loadFromDB: Stored as ``self.trained`` and forwarded to
            ``GlobalInfo.Init``.
    """
    node = config.GetChild(nodeName)
    self.node = node
    self.trained = loadFromDB
    GlobalInfo.Init(config, "__global__", loadFromDB)

    # Data source settings live under the "data_source" child node.
    source = node.GetChild("data_source")

    def read_setting(name):
        return source.GetChild(name).GetValue()

    database_name = read_setting('db')
    self.collection = read_setting('collection')
    # NOTE: the field name is kept exactly as GetValue() returns it; no
    # UTF-8 encoding is applied.
    self.field = read_setting('field')
    self.db = MongoDB.getConnection(database_name)
def __init__(self, config, nodeName, loadFromFile = False):
    """Set up the instance from its configuration node.

    Reads ``rate``, ``method``, ``model_path``, ``people_tag`` and ``db``
    from the configuration node, opens the database connection and, when
    requested, loads the black list from the model file.

    Args:
        config: Configuration root exposing ``GetChild``.
        nodeName: Name of the child node that configures this instance.
        loadFromFile: When true, ``self.blackList`` is populated from the
            file at ``model_path``. The flag is also stored as
            ``self.trained``.

    Raises:
        IOError: If ``loadFromFile`` is true and the model file cannot be
            opened.
        ValueError: If ``rate`` is not a valid float or a line of the model
            file is not a valid integer.
    """
    self.curNode = config.GetChild(nodeName)
    self.rate = float(self.curNode.GetChild("rate").GetValue())
    self.method = self.curNode.GetChild("method").GetValue()
    self.modelPath = self.curNode.GetChild("model_path").GetValue()
    self.people_tag_collection = self.curNode.GetChild('people_tag').GetValue()
    # Maps each black-listed integer to 1.
    self.blackList = {}
    dbname = self.curNode.GetChild("db").GetValue()
    self.db = MongoDB.getConnection(dbname)
    self.trained = loadFromFile
    if loadFromFile:
        # The context manager closes the model file even if a line fails
        # to parse; the file holds one integer per line.
        with open(self.modelPath, "r") as model_file:
            for line in model_file:
                self.blackList[int(line)] = 1
doc = doc.decode("gbk").encode("utf-8") except: page_id += process_num continue page_id += process_num soup = BeautifulSoup(doc) word = soup.find('h1', "title") if word: #baike.append({'title':word.string, 'url':url, 'html':doc}) #if not db.word_dic.find_one({'word':word.string}): words.append({'word':word.string, 'len':len(word.string)}) matchs = soup.findAll(href=re.compile('^/view/\d+.htm')) for match in matchs: #if match.string: if match.string and not db.word_dic.find_one({'word':match.string}): words.append({'word':match.string, 'len':len(match.string)}) if len(words) >= 10: db.word_dic.insert(words) words = [] #db.baike.insert(baike) #baike = [] if __name__=="__main__": db = MongoDB.getConnection('mining') process_num = 1 startindex = 1 for i in range(startindex, process_num+startindex): p = Process(target=son,args=(process_num, i, db)) p.start()