def handle_int(self, signum, frame):
    if os.getpid() != self.root_pid:
        return
    _logger.info('got signal(%d), will shutdown gracefully' % signum)
    self.shutdown()
    _logger.info('all processes killed, will call exit(0)')
    sys.exit(0)
def test(model, test_file_path):
    total = 0
    correct = 0
    decoder = NaiveDecoder(model)
    outfile = open("predicted.dat", 'w')
    _logger.info("Testing %s" % test_file_path)
    with open(test_file_path) as test_file:
        processed = 1
        for line in test_file:
            line = line.strip().decode('utf-8')
            if not line:
                continue
            total += 1
            sentence, tag = line.split('\t')
            #sentence = extract(sentence)
            result = decoder.decode(sentence)
            predicted, _ = conv.argmax(result.items())
            outfile.write("%s\t%s\t%s\n" % (sentence.encode('utf-8'),
                                            predicted.encode('utf-8'),
                                            tag.encode('utf-8')))
            if predicted == tag:
                correct += 1
            if processed % 1000 == 0:
                _logger.debug("%d lines processed" % processed)
            processed += 1
    outfile.close()
    _logger.info("accuracy: %f" % (float(correct) / total))
def recursive_crawl(url, encoding, selenium, agent, domain, terminate):
    if crawled_as_hub(agent, url, day_limit=3):
        _logger.debug('ignore, recently (3 days) crawled as hub: %s' % url)
        return
    links = pbrowser.get_all_href(url, encoding)
    _logger.debug("processing %d links" % len(links))
    count = 0
    for idx, link in enumerate(links):
        # ignore hrefs to a different domain; accept all hrefs if 'domain' is an empty string
        if urlparse(link['href'].encode('utf-8')).netloc.find(domain) == -1:
            _logger.debug('ignore (%s), different from domain (%s)'
                          % (link['href'].encode('utf-8'), domain))
            continue
        tweet = None
        try:
            tweet = try_crawl_href(link['href'].encode('utf-8').lower(),
                                   link.text.encode('utf-8').strip(),
                                   encoding, agent, selenium)
        except Exception as err:
            _logger.error('crawl href failed: %s, %s'
                          % (err, traceback.format_exc()))
            continue
        if tweet is not None:
            count += 1
            try:
                agent.add_crawled_tweet(url, tweet)
                _logger.info('new tweet added to db, %d total, (%d / %d) processed'
                             % (count, idx, len(links)))
            except Exception as err:
                _logger.error('failed to add crawled tweet to DB: %s' % err)
def __init__(self, user=None):
    self.user = user
    self.selenium = selenium('localhost', 4444, 'chrome', 'http://www.baidu.com')
    _logger.info('starting selenium')
    self.selenium.start()
    self.selenium.set_timeout(120 * 1000)  # timeout: 120 seconds, in ms
def _crawl_thirdary(self, anchor):
    self.output.write(' %s\n' % anchor.text.encode('utf-8'))
    _logger.info('crawling fourth (%s)' % anchor['href'])
    try:
        self._crawl_fourth(anchor['href'])
    except Exception as err:
        _logger.error('fourth(%s) failed: %s' % (anchor['href'], err))
def main(spec):
    et = ElementTree.parse("task.conf")
    cases = et.findall('case')
    for case in cases:
        query = case.attrib['search']
        search_count = int(case.attrib['search-count'])
        compose_count = int(case.attrib['compose-count'])
        _logger.info('processing case, query=[%s]' % query)
        if spec['crawl']:
            # Start crawler
            _logger.info('kicking off crawler, keyword=(%s), count=%d'
                         % (query, search_count))
            gcrawler.start_crawler(keyword=query, count=search_count)
            # Start interpreter
            _logger.info('kicking off interpreter')
            interpreter.interpret('crawler_out', 'interpret_out')
        # Start composer
        if spec['compose']:
            _logger.info('start composing %d articles' % compose_count)
            link_info = {}
            for link in case.findall('link'):
                link_info[link.attrib['anchor']] = list()
                for href in link.findall('href'):
                    link_info[link.attrib['anchor']].append(href.text)
            composer.compose('interpret_out', 'composer_out', compose_count,
                             link_info)
        # Start poster
        if spec['post']:
            _logger.info('start posting')
            post_count = int(case.attrib['post-count'])
            poster.post_spam('composer_out', limit=post_count)
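# For reference, the shape of task.conf that main() above expects, inferred
# from the attribute and element lookups it performs. The root element name
# and all values here are hypothetical; only the attribute/element names are
# taken from the code:
#
#   <tasks>
#     <case search="some keyword" search-count="50"
#           compose-count="10" post-count="5">
#       <link anchor="anchor text">
#         <href>http://example.com/page</href>
#       </link>
#     </case>
#   </tasks>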
def start_spam(self, anchor, href, keyword, count=100, verbose=False,
               fingerprint="will not be published"):
    """
    Start spamming: use 'keyword' to query Google, requesting 'count' results.
    Anchor text and URL are specified with 'anchor' and 'href'.
    """
    # Make directory:
    path = "./%s(%s).%d/" % (anchor, keyword, count)
    if verbose:
        if os.path.exists(path):
            _logger.error('%s exists, I\'ll have to remove it, sorry' % path)
            import shutil
            shutil.rmtree(path)
        os.mkdir(path)
    query = keyword + " " + fingerprint
    self.cur_fingerprint = fingerprint[1:-1]
    lazy_result = []
    urls = pbrowser.ask_google(
        query, count,
        callback=lambda new_url: lazy_result.append(
            self.process_new_url(new_url, anchor, href)),
        sleep_min=15, sleep_max=20,
    )
    _logger.info('ask_google returned %d results, start joining %d targets'
                 % (len(urls), len(lazy_result)))
    success_count = 0
    for result in lazy_result:
        try:
            success, info = result.eval()
        except Exception as err:
            _logger.error("failed extracting lazy result: %s" % err)
        else:
            try:
                output_path = path + (urlparse(info[0]).hostname
                                      + str(random.randint(1, 1000)))
            except Exception:
                _logger.error("can't parse hostname from target url: [%s]"
                              % info[0])
                output_path = path + "info[0]" + str(random.randint(1, 1000))
            if success:
                success_count += 1
                output_path += '.success.html'
            else:
                output_path += '.fail.html'
            if verbose:
                with open(output_path.encode('utf-8'), 'w') as output:
                    output.write(info[0] + '\n')
                    output.write(info[1])
def fill_account(daemon, helper, user):
    sele = daemon.selenium
    daemon.user = user
    _logger.info('start joining groups')
    try:
        daemon.grouping(force=True)
    except Exception as err:
        _logger.error('grouping failed: %s' % err)
def main():
    _logger.info("wee indexer started")
    agent = WeeSQLAgent(DB_NAME, DB_USER, DB_PASSWORD)
    agent.start()
    _logger.info("MySQL agent started")
    indexer = Indexer(agent)
    while True:
        #agent.restart()
        indexer.index_new_wee()
        _logger.debug("Sleep for %d sec" % SLEEP_SEC)
        time.sleep(SLEEP_SEC)
def process_terminal(self, task):
    anchor_text = task['anchor_text']
    anchor_url = task['anchor_url']
    _logger.info('processing terminal link, url: %s' % anchor_url)
    tweet = None
    try:
        tweet = try_crawl_href(anchor_url, anchor_text, task['encoding'],
                               self.agent, self.sele)
    except Exception as err:
        _logger.error('crawl href failed: %s, %s'
                      % (err, traceback.format_exc()))
def shutdown(self):
    self.agent.stop()
    if hasattr(self, 'workers'):
        for worker in self.workers:
            pid = worker.pid
            try:
                self.kill_worker(worker)
                _logger.info('child process %d killed' % pid)
            except Exception as err:
                _logger.error('failed to kill child pid:%d, %s, it will become an orphan'
                              % (pid, err))
def crawl(self, url):
    self.output = open('hao123.crawl%s' % datetime.now().date(), 'w')
    _logger.info('opening hao123 home page: %s' % url)
    html = self.br.open(url).read()
    soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"))
    for top_tier in soup.findAll('table', monkey='cool'):
        anchor = top_tier.find('a')
        _logger.info('crawling top tier category: %s (%s)'
                     % (anchor.text, anchor['href']))
        self.crawl_first(pbrowser.abs_url(url, anchor['href']))
    self.output.close()
def load_data(self):
    _logger.info("Loading training data from %s" % self.train_path)
    self.X = []
    self.y = []
    with open(self.train_path) as train_file:
        for line in train_file:
            line = line.strip().decode('utf-8')
            if not line:
                continue
            terms, domain = line.split('\t')
            self.X.append(terms)
            self.y.append(domain)
def compose(infile, outfile, count, link_info, min_word=200, max_word=500):
    infile = open(infile)
    paragraphs = _parse_paragraphs(infile)
    infile.close()
    articles = []
    link_dropper = _LinkDropper(link_info)
    for i in range(count):
        _logger.info('writing article %d of %d' % (i, count))
        articles.append(link_dropper._drop_links(
            write_one_article(paragraphs, min_word, max_word)) + '\n\n')
    outfile = open(outfile, 'w')
    outfile.write(('\n\n' + doc_sep + '\n\n').join(articles))
    outfile.close()
def process_hub(self, task):
    url = task['anchor_url']
    _logger.info('processing hub page, url: %s' % url)
    last_crawl = self.agent.get_crawl_history(url)
    now = datetime.now()
    if (now - last_crawl).days <= 3:
        _logger.debug('ignore, recently crawled: %s' % str(last_crawl))
        return
    domain = task['domain']
    encoding = task['encoding']
    links = pbrowser.get_all_href(url, encoding)
    _logger.debug("got %d links" % len(links))
    for idx, link in enumerate(links):
        if urlparse(link['href'].encode('utf-8')).netloc.find(domain) == -1:
            _logger.debug('ignore (%s), different from domain (%s)'
                          % (link['href'].encode('utf-8'), domain))
            continue
        # make temporary source
        cur_url = link['href'].encode('utf-8').lower()
        cur_text = link.text.encode('utf-8').strip()
        if crawled_as_hub(self.agent, cur_url, day_limit=3):
            _logger.debug('ignore, recently (3 days) crawled as hub: %s'
                          % cur_url)
            continue
        if crawled_as_terminal(self.agent, cur_url, cur_text, day_limit=30):
            _logger.debug('ignore, recently (30 days) crawled as terminal: %s'
                          % cur_url)
            continue
        if in_task_queue(self.agent, cur_url, cur_text):
            _logger.debug('ignore, already added to task queue: %s' % cur_url)
            continue
        ttl = task['ttl'] - 1
        try:
            self.agent.add_crawler_task(anchor_url=cur_url,
                                        anchor_text=cur_text,
                                        encoding=encoding,
                                        domain=domain,
                                        ttl=ttl)
            _logger.debug('%s added to task in DB' % cur_url)
        except Exception as err:
            _logger.error('failed to add crawler task, url: (%s), %s'
                          % (cur_url, err))
def load_data(train_path):
    _logger.info("Loading data from %s" % train_path)
    X = []
    y = []
    with open(train_path) as train_file:
        for line in train_file:
            line = line.strip().decode("utf-8")
            if not line:
                continue
            terms, domain = line.split("\t")
            X.append(terms)
            y.append(domain)
    return np.array(X), np.array(y)
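# For reference, load_data() above expects one UTF-8 "terms<TAB>domain" pair
# per line. The rows below are hypothetical, purely to illustrate the format:
#
#   set an alarm for 7 am	alarm
#   what's the weather tomorrow	weather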
def _crawl_secondary(self, div):
    tb = div
    self.output.write(' %s\n' % div.text.encode('utf-8'))
    # advance to the sibling <table> that holds this category's links
    while not hasattr(tb, 'name') or tb.name != u"table":
        tb = tb.nextSibling
    for third in tb.findAll('a'):
        _logger.info('crawling thirdary (%s)' % third.text)
        try:
            self._crawl_thirdary(third)
        except Exception as err:
            _logger.error('third(%s) failed: %s\n%s'
                          % (third.text.encode('utf-8'), err,
                             traceback.format_exc()))
def crawl_authors(self, authors, callback):
    for author in authors:
        cur_url = author
        _logger.info("crawling author from %s" % cur_url)
        try:
            self.crawl_one_author(cur_url, callback)
            _logger.debug('sleeping for 5 sec')
            time.sleep(5)
        except Exception as err:
            _logger.error("crawl one author failed, url: (%s), error: %s, %s"
                          % (cur_url, err, traceback.format_exc()))
            continue
def check_proxies(agent):
    config = agent.get_core_config()
    PROXY_TRYOUT_COUNT = int(config['proxy_tryout_count'])
    VALID_PROXY_FAIL_RATE = float(config['valid_proxy_fail_rate'])
    all_proxy = agent.get_all_proxy()
    account_num = agent.get_all_user_count()
    slot_num = int(math.ceil(account_num / 50.0))  # one proxy slot per 50 accounts
    _logger.info("%d accounts, %d proxy slots, fail rate limit: %.2f%%, try out: %d"
                 % (account_num, slot_num, VALID_PROXY_FAIL_RATE * 100,
                    PROXY_TRYOUT_COUNT))
    for slot_id in range(slot_num):
        proxy = agent.get_proxy_by_slot(slot_id)
        if proxy is None:
            _logger.info("proxy slot #%d is empty, try picking a proxy for it"
                         % slot_id)
            pick_proxy_for_slot(agent, slot_id, all_proxy)
        elif bad_proxy(proxy):
            _logger.info("proxy slot #%d is bad with addr: %s, will pick a new one"
                         % (slot_id, proxy['addr']))
            agent.remove_proxy_from_slot(proxy)
            pick_proxy_for_slot(agent, slot_id, all_proxy)
        else:
            _logger.info("proxy slot #%d OK, addr: %s"
                         % (slot_id, proxy['addr']))
def vectorize(tfidf=False, binary=False):
    _logger.info("Loading...")
    trainX = [r[0] for r in tsv.reader(conv.redirect('train.tokenized.dat'))]
    testX = [r[0] for r in tsv.reader(conv.redirect('test.tokenized.dat'))]
    vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
    _logger.info("Fitting and transforming...")
    vectorizer = vectorizer_cls(token_pattern=u'(?u)\\b\\w+\\b', binary=binary,
                                ngram_range=(1, 3))
    trainX = vectorizer.fit_transform(trainX)
    testX = vectorizer.transform(testX)
    _logger.info("Dumping binaries...")
    pickle.dump(vectorizer, open("vectorizer.bin", 'w'))
    pickle.dump(trainX, open("train.vectorized.mat", 'w'))
    pickle.dump(testX, open("test.vectorized.mat", 'w'))
    schema = vectorizer.get_feature_names()
    codecs.open("schema.dat", 'w', encoding='utf-8').write('\n'.join(schema))
    # debug:
    # _logger.info("Dumping inverse transforms...")
    # codecs.open("test.vectorized.dat", 'w', encoding='utf-8').write(
    #     '\n'.join([' '.join(i) for i in vectorizer.inverse_transform(testX)]))
    # codecs.open("train.vectorized.dat", 'w', encoding='utf-8').write(
    #     '\n'.join([' '.join(i) for i in vectorizer.inverse_transform(trainX)]))
    trainX = trainX.tocoo(False)
    testX = testX.tocoo(False)
    _logger.info("Dumping test.vectorized.dat...")
    with codecs.open("test.vectorized.dat", 'w', encoding='utf-8') as fl:
        dc = defaultdict(list)
        for r, c, v in zip(testX.row, testX.col, testX.data):
            dc[r].append("%s(%s)=%s" % (schema[c], c, v))
        for i in sorted(dc.keys()):
            fl.write("%s\t%s\n" % (i, " , ".join(dc[i])))
    _logger.info("Dumping train.vectorized.dat...")
    with codecs.open("train.vectorized.dat", 'w', encoding='utf-8') as fl:
        dc = defaultdict(list)
        for r, c, v in zip(trainX.row, trainX.col, trainX.data):
            dc[r].append("%s(%s)=%s" % (schema[c], c, v))
        for i in sorted(dc.keys()):
            fl.write("%s\t%s\n" % (i, " , ".join(dc[i])))
def fetch_source(self, source):
    _logger.debug("pool stat: %d working %d waiting"
                  % (self.pool.running(), self.pool.waiting()))
    _logger.info("crawling source id=%d url=%s" % (source['id'], source['url']))
    cur_time = int(time.time())
    last_crawl_time = source['last_crawl_time']
    if cur_time - last_crawl_time < HOUR:
        _logger.info("ignore source(%s), last crawled %d minutes ago"
                     % (source['url'], (cur_time - last_crawl_time) / 60))
        return
    try:
        _logger.debug("fetching feed from (%s)" % source['url'])
        p = feedparser.parse(source['url'])
        _logger.debug("fetched from (%s)" % source['url'])
        if 'updated_parsed' in p.feed and p.feed.updated_parsed is not None:
            cur_feed_time = int(time.mktime(p.feed.updated_parsed))
        else:
            # FeedParser doesn't understand the 'updated' field of this feed,
            # neither can we. Probably some CJK chars.
            cur_feed_time = int(time.time())
        db_feed_time = source['last_feed_time']
        if db_feed_time >= cur_feed_time:
            _logger.info("ignore source(%s), no new feed. Last feed: %s, cur feed: %s"
                         % (source['url'],
                            datetime.fromtimestamp(db_feed_time),
                            datetime.fromtimestamp(cur_feed_time)))
            self.agent.update_source_time(source)
        else:
            _logger.info("processing %d entries from %s"
                         % (len(p.entries), source['url']))
            for entry in p.entries:
                self.process_entry(entry, source)
            self.agent.update_source_time(source, cur_feed_time)
            _logger.debug("source(%s) updated: %s"
                          % (source['url'],
                             datetime.fromtimestamp(cur_feed_time)))
        _logger.info("source(id=%d) success" % source['id'])
        _logger.debug("pool stat: %d working %d waiting"
                      % (self.pool.running(), self.pool.waiting()))
    except Exception as err:
        _logger.error("crawling failed for source id=%d, %s: %s"
                      % (source['id'], source['url'], traceback.format_exc()))
def user_timeline(self, count=10):
    # Assumes we are already logged in
    self.selenium.click('id=mblog')
    self._wait_load()
    soup = BeautifulSoup(self.selenium.get_html_source())
    tweet = [i.text for i in soup.findAll('p', 'sms')]
    while len(tweet) < count:
        try:
            self.selenium.click(u'下一页')  # "next page"
            self._wait_load()
        except Exception as err:
            _logger.info('failed to load next page: %s' % err)
            break
        soup = BeautifulSoup(self.selenium.get_html_source())
        tweet.extend([i.text for i in soup.findAll('p', 'sms')])
    # the original snippet never returned the collected tweets; returning
    # them here so the method is usable
    return tweet[:count]
class SQLAgent(object):
    __metaclass__ = MetaAgent

    # Set sscursor to True to keep the result set on the server side;
    # useful for large result sets.
    def __init__(self, db_name, db_user, db_pass, host="localhost",
                 sscursor=False):
        self.db_name = db_name
        self.db_user = db_user
        self.db_pass = db_pass
        self.db_host = host
        self.use_sscursor = sscursor

    def start(self):
        # don't log the password
        _logger.info('connecting DB... host:%s %s@%s'
                     % (self.db_host, self.db_user, self.db_name))
        self.conn = MySQLdb.connect(host=self.db_host,
                                    user=self.db_user,
                                    passwd=self.db_pass,
                                    db=self.db_name)
        if self.use_sscursor:
            # store result set on the server
            self.cursor = self.conn.cursor(MySQLdb.cursors.SSDictCursor)
        else:
            self.cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
        # route every execute() through safe_execute
        self.cursor.old_execute = self.cursor.execute
        self.cursor.execute = self.safe_execute
        self.cursor.execute('set names utf8')
        self.conn.commit()

    def stop(self):
        try:
            self.cursor.close()
            self.conn.close()
        except Exception as err:
            _logger.error('stopping SQLAgent failed: %s, will continue anyway'
                          % err)
        _logger.info('sql agent stopped')
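# A minimal usage sketch for SQLAgent, illustrative only: the database name,
# credentials, and table below are hypothetical, and it assumes a reachable
# MySQL server plus the MySQLdb driver:
#
#   agent = SQLAgent('my_db', 'my_user', 'my_pass', sscursor=True)
#   agent.start()
#   agent.cursor.execute('SELECT id FROM some_table')
#   for row in agent.cursor:   # DictCursor/SSDictCursor yield dict rows
#       print row['id']
#   agent.stop()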
def start(self):
    # don't log the password
    _logger.info('connecting DB... host:%s %s@%s'
                 % (self.db_host, self.db_user, self.db_name))
    self.conn = MySQLdb.connect(host=self.db_host,
                                user=self.db_user,
                                passwd=self.db_pass,
                                db=self.db_name)
    if self.use_sscursor:
        # store result set on the server
        self.cursor = self.conn.cursor(MySQLdb.cursors.SSDictCursor)
    else:
        self.cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
    self.cursor.execute('set names utf8')
    self.conn.commit()
def crawl(self, url):
    self.owned = set()
    self.output = open('265.crawl%s' % datetime.now().date(), 'w')
    _logger.info('opening 265 home page: %s' % url)
    html = self.br.open(url).read()
    soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"),
                         fromEncoding='utf-8')
    for anchor in soup.find('div', id="siteCate").find('div', 'body').findAll('a'):
        _logger.info('crawling top tier category: %s (%s)'
                     % (anchor.text, anchor['href']))
        self.output.write('%s\n' % anchor.text.encode('utf8'))
        self.crawl_layer(pbrowser.abs_url(url, anchor['href']), 1)
    self.output.close()
def _crawl_primary(self, anchor):
    self.output.write(anchor.text.encode('utf-8') + '\n')
    self._randsleep()
    html = self.br.open(anchor['href']).read()
    html = util.convert_to_utf8(html, 'gb2312')
    soup = BeautifulSoup(html)
    seconds = soup.findAll('div', 'dirtit')
    for second in seconds:
        _logger.info('crawling secondary category: (%s)'
                     % second.text.encode('utf-8'))
        try:
            self._crawl_secondary(second)
        except Exception as err:
            _logger.error('secondary(%s) failed: %s'
                          % (second.text.encode('utf-8'), err))
def interpret(inpath, outpath):
    with open(inpath, "rb") as crawled_docs:
        docs = cPickle.load(crawled_docs)
    _logger.info('found %d docs from crawler\'s output' % len(docs))
    output_str = u''
    for doc in docs:
        _logger.info('processing doc from url: ' + doc['url'])
        contents = parse_html(doc)
        output_str += unicode(doc['url'] + '\n\n' + '+' * 100 + '\n\n')
        for paragraph in contents:
            if is_valid_text(paragraph):
                output_str += unicode(paragraph + '\n\n' + '+' * 100 + '\n\n')
        output_str += ('\n\n' + '=' * 100 + '\n')
    with open(outpath, "w") as output:
        output.write(output_str.encode('utf-8'))
def crawl(self, url):
    self.output = open('baike.crawl%s' % datetime.now().date(), 'w')
    _logger.info('opening baike home page: %s' % url)
    html = self.br.open(url).read()
    html = util.convert_to_utf8(html, 'gb2312')
    soup = BeautifulSoup(html)
    for item in soup.find('div', id="classList").findAll('h2'):
        anchor = item.find('a')
        _logger.info('crawling primary category: (%s), %s'
                     % (anchor.text.encode('utf-8'),
                        anchor['href'].encode('utf-8')))
        try:
            self._crawl_primary(anchor)
        except Exception as err:
            _logger.error('primary category(%s) failed: %s'
                          % (anchor.text.encode('utf-8'), err))
def try_crawl_href(anchor_url, anchor_text, encoding, agent, selenium):
    _logger.debug('crawling anchor (%s), URL: %s' % (anchor_text, anchor_url))
    # filters
    # ignore bad-looking anchors
    if util.chinese_charactor_count(anchor_text.decode('utf-8')) < 10:
        _logger.debug('too few chinese chars in anchor text, ignoring')
        return None
    # ignore the same href crawled recently
    if crawled_as_terminal(agent, anchor_url, anchor_text, 30):
        _logger.debug('ignore %s, same href was crawled within the last 30 days'
                      % anchor_url)
        return None
    tweet = crawl_href(anchor_url, anchor_text, encoding, selenium)
    _logger.info('crawl_href finished, anchor-text: (%s)' % anchor_text)
    return tweet
def crawl_one_author(self, url, callback):
    page = 1
    while True:
        _logger.info("opening page URL: %s" % url)
        self.br.open(url, timeout=TIMEOUT)
        soup = BeautifulSoup(self.br.response().read())
        url = self.br.geturl()
        img_div = soup.findAll('div', 'images')
        imgs = list(itertools.chain(
            *[div.findAll('a', target='_blank') for div in img_div]))
        imgs.extend(soup.findAll('a', {'data-location': 'content'}))
        _logger.debug("%d images on this page" % len(imgs))
        for a in imgs:
            img_url = a['href']
            if img_url in self.crawl_history:
                _logger.debug('ignoring crawled URL: %s' % img_url)
                continue
            info = None
            try:
                all_info = self.grab_image_info(img_url)
                self.logfile.write(img_url + '\n')
                self.logfile.flush()
                _logger.debug('image processed %s' % img_url)
            except Exception as err:
                _logger.error('processing one image url failed, url: %s, %s'
                              % (img_url, err))
            else:
                for info in all_info:
                    try:
                        if callback is not None:
                            callback(info=info)
                    except Exception as err:
                        _logger.error('callback failed, image url: %s, %s, %s'
                                      % (img_url, err, traceback.format_exc()))
        _logger.debug('sleeping for 5 sec')
        time.sleep(5)
def test(X, y):
    by_domain = defaultdict(list)
    sz = len(y)
    for i in xrange(sz):
        by_domain[y[i]].append(X[i])
    domains = ['alarm', 'calendar', 'communication', 'note',
               'places', 'reminder', 'weather', 'web']
    for p in domains:
        for q in domains:
            if p < q:
                clf = svms[p, q]
                p_len = len(by_domain[p])
                q_len = len(by_domain[q])
                # renamed from X/y to avoid shadowing the arguments
                X_pair = list(by_domain[p])
                X_pair.extend(by_domain[q])
                y_pair = [p] * p_len
                y_pair.extend([q] * q_len)
                _logger.info("%.4f, %s - %s"
                             % (clf.score(X_pair, y_pair), p, q))
def main():
    listen()
    socket.setdefaulttimeout(120)
    agent = WeeSQLAgent('weDaily', 'junyi', 'admin123')
    agent.start()
    pool = eventlet.GreenPool(2000)
    crawler = FeedCrawler(agent, pool)
    loop_count = 1
    while True:
        agent.restart()
        sources = agent.get_all_sources()
        for source in sources:
            pool.spawn_n(crawler.fetch_source, source)
        pool.waitall()
        _logger.info("loop %d finished, will sleep for %d seconds"
                     % (loop_count, SLEEP_IN_SEC))
        loop_count += 1
        time.sleep(SLEEP_IN_SEC)
def cv(self, fold):
    size = len(self.y)
    kf = cross_validation.KFold(size, fold, shuffle=True)
    iteration = 0
    scores = list()
    for train_idx, test_idx in kf:
        X = [self.X[idx] for idx in train_idx]
        y = [self.y[idx] for idx in train_idx]
        X_test = [self.X[idx] for idx in test_idx]
        y_test = [self.y[idx] for idx in test_idx]
        _logger.debug("Training...")
        self.fit(X, y)
        _logger.debug("Testing...")
        score = self.get_test_accuracy(X_test, y_test)
        scores.append(score)
        iteration += 1
        _logger.info("CV iteration %d: CV accuracy: %f" % (iteration, score))
    scores = np.array(scores)
    return scores.mean(), scores.std()
def train(self):
    _logger.info("reading posterior probabilities from naive bayes model")
    self.words = list()
    self.words_seen = set()
    X = np.array([])
    for term in g_term_count:
        term = term_category(term)
        if term in self.words_seen:
            continue
        self.words_seen.add(term)
        self.words.append(term)
        x = list()
        for domain in self.naive.model.domains:
            val = self.naive.posterior_prob(term, domain)
            x.append(val)
        X = np.append(X, x)
    _logger.info("%d terms need to be clustered" % len(self.words))
    X = np.reshape(X, (len(self.words), len(self.naive.model.domains)))
    kmeans = KMeans(n_clusters=len(self.words) / 10)
    y = kmeans.fit_predict(X)
    with open(OUTFILE_PATH, "w") as outfile:
        for i in xrange(len(y)):
            outfile.write("%s\t%d\n" % (self.words[i].encode('utf-8'), y[i]))
    _logger.info("clustering result written to %s" % OUTFILE_PATH)
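# A minimal decode-side sketch for the clustering dump written above.
# Assumptions, loudly labeled: OUTFILE_PATH lines are "term<TAB>cluster_id"
# exactly as train() writes them, and this loader is illustrative -- it is
# not the project's actual rep.word_clustering.decode module, whose
# get_cluster() is used by ClusteredNaiveBayes further below.
_clusters = {}

def get_cluster(term):
    # lazily load the term -> cluster id map on first call
    if not _clusters:
        with open(OUTFILE_PATH) as infile:
            for line in infile:
                word, cid = line.rstrip('\n').split('\t')
                _clusters[word.decode('utf-8')] = u'cluster_' + cid.decode('utf-8')
    # fall back to the raw term when it was never clustered
    return _clusters.get(term, term)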
def fill_all_accounts(daemon, helper):
    users = daemon.agent.get_all_user()
    good_ids = open('good', 'w')
    bad_ids = open('bad', 'w')
    for user in users:
        try:
            api = daemon.get_api_by_user(user.uname)
            good_ids.write('%s:%s:%s\n' % (user.uname, user.passwd,
                                           api.me().name.encode('utf-8')))
            _logger.info('successfully fetched api for %s' % user.uname)
        except Exception as err:
            _logger.error('failed fetching api for %s: %s' % (user.uname, err))
            bad_ids.write('%s:%s\n' % (user.uname, user.passwd))
        # try:
        #     fill_account(daemon, helper, user)
        # except Exception as err:
        #     _logger.error('fill_account failed for (%s): %s'
        #                   % (user.uname, err))
        time.sleep(1)
def get_all_friend(self, callback=None):
    profile_page = self.selenium.get_location()
    _logger.debug('copy location url: %s' % profile_page)
    _logger.debug('loading attentions page')
    self.selenium.click('id=attentions')
    self._wait_load()
    soup = BeautifulSoup(self.selenium.get_html_source())
    friends = [self._create_user_from_attention_list(i)
               for i in soup.findAll('li', 'MIB_linedot_l')]
    while True:
        try:
            self.selenium.click(u'下一页')  # "next page"
        except Exception as err:
            _logger.info('failed to load next page: %s' % err)
            break  # without this break the loop never terminates
        soup = BeautifulSoup(self.selenium.get_html_source())
        for li in soup.findAll('li', 'MIB_linedot_l'):
            friends.append(self._create_user_from_attention_list(li))
            if callback is not None:
                callback(li)
    return friends
def clean(X, y, k=10):
    _logger.info("cleaning based on %d-fold cross validation" % k)
    size = len(y)
    kf = KFold(size, n_folds=k, shuffle=True)
    fold = 1
    for train_idx, test_idx in kf:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        pipeline = Pipeline([
            ("vert", TfidfVectorizer(min_df=1,
                                     binary=True,
                                     ngram_range=(1, 3),
                                     tokenizer=Tokenizer())),
            ("clf", LinearSVC(loss='l1',
                              penalty="l2",
                              multi_class="ovr",
                              class_weight="auto")),
        ])
        _logger.debug("Training fold %d" % fold)
        pipeline.fit(X_train, y_train)
        _logger.debug("Predicting for fold %d" % fold)
        y_pred = pipeline.predict(X_test)
        _logger.info("fold %d got accuracy: %f"
                     % (fold, accuracy_score(y_test, y_pred)))
        right_f = open("fold%d.right.dat" % fold, "w")
        wrong_f = open("fold%d.wrong.dat" % fold, "w")
        size = len(y_test)
        for i in xrange(size):
            sent = X_test[i].encode('utf-8')
            pred = y_pred[i].encode('utf-8')
            gold = y_test[i].encode('utf-8')
            if pred != gold:
                wrong_f.write("%s\t%s\t%s\n" % (pred, gold, sent))
            else:
                right_f.write("%s\t%s\n" % (sent, gold))
        right_f.close()
        wrong_f.close()
        fold += 1
def test(test_file_path, clf):
    X, y = load_data(test_file_path)
    size = len(y)
    scores = clf.decision_function(X)
    # y_pred = []
    # for i in xrange(size):
    #     score = scores[i]
    #     detail = sorted(zip(clf.named_steps['clf'].classes_, score),
    #                     key=lambda x: -x[1])
    #     if detail[0][1] >= 1.1:
    #         y_pred.append(detail[0][0])
    #     else:
    #         y_pred.append(u'web')
    y_pred = clf.predict(X)
    outfile = open("predicted.dat", 'w')
    for i in range(len(y)):
        sentence, pred, gold = X[i], y_pred[i], y[i]
        outfile.write("%s\t%s\t%s\n" % (sentence.encode('utf-8'),
                                        pred.encode('utf-8'),
                                        gold.encode('utf-8')))
    _logger.info("accuracy: %f, %d records"
                 % (accuracy_score(y, y_pred), len(y)))
def crawl_layer(self, url, level):
    self._randsleep()
    prefix = ' ' * level
    _logger.info('opening layer url: %s' % url)
    html = self.br.open(url).read()
    soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"),
                         fromEncoding="utf-8")
    _logger.info('processing page with title: %s' % soup.title.text)
    # get next level links
    children = {}
    for li in soup.find('div', id='TreeData').findAll('li', 'close'):
        a = li.find('a')
        children[a.text] = a['href']
    # grab links in current page
    for div in soup.find('div', id="BMain").findAll('div', 'subBM'):
        cate = div.find('h3').text
        if cate in self.owned:
            continue
        self.owned.add(cate)
        self.output.write(prefix + '%s\n' % cate.encode('utf8'))
        for li in div.find('ul', 'listUrl').findAll('li'):
            try:
                a = li.find('a')
                self.output.write(prefix * 2 + '%s %s\n'
                                  % (a['href'].encode('utf8'),
                                     a.text.encode('utf8')))
            except Exception as err:
                _logger.error('error processing anchor(%s): %s'
                              % (str(li), err))
        # grab links in next level, if any
        if cate in children:
            self.crawl_layer(children[cate], level + 1)
def test(X, y): _logger.info("Fisrt stage accuracy: %f" % front.score(X, y)) import decode_svm outfile = open("predicted.dat", "w") discfile = open("discriminated.dat", "w") y_pred = list() sz = len(y) domains = front.named_steps["clf"].classes_ for i in xrange(sz): sent = X[i] gold = y[i] front_result = sorted(zip(domains, front.decision_function([sent])[0]), key = lambda x: -x[1]) pred = front_result[0][0] assert pred == front.predict([sent])[0] if front_result[0][1] < 0.0 or front_result[1][1] > 0.0: p = front_result[0][0] q = front_result[1][0] svm_pred = decode_svm.discriminate(p, q, sent)[0] discfile.write("%s\t%s\t%s\t%s\t%s\n" % \ (sent.encode('utf-8'), p.encode('utf-8'), q.encode('utf-8'), svm_pred.encode('utf-8'), gold.encode('utf-8'))) pred = svm_pred y_pred.append(pred) outfile.write("%s\t%s\t%s\n" % (sent.encode('utf-8'), pred.encode('utf-8'), gold.encode('utf-8'))) _logger.info("ensembled accuracy: %f" % accuracy_score(y, y_pred)) outfile.close() discfile.close()
def train_pair(self, p, q):
    if p > q:
        p, q = q, p
    p_len = len(self.by_domain_data[p])
    q_len = len(self.by_domain_data[q])
    _logger.info("Training SVM for %s vs. %s, %d + %d = %d records"
                 % (p, q, p_len, q_len, p_len + q_len))
    X = list(self.by_domain_data[p])
    X.extend(self.by_domain_data[q])
    y = [p] * p_len
    y.extend([q] * q_len)
    pipeline = Pipeline([
        ("vert", TfidfVectorizer(min_df=1,
                                 binary=False,
                                 ngram_range=(1, 1),
                                 tokenizer=Tokenizer())),
        ("svm", LinearSVC(loss='l2', penalty="l1", dual=False, tol=1e-3)),
    ])
    if self.cv > 0:
        _logger.info("Doing grid search with %d-fold CV" % self.cv)
        params = {
            "svm__C": [1, 10, 50, 100, 500, 1000],
        }
        grid = GridSearchCV(pipeline, params, cv=self.cv, verbose=50)
        grid.fit(X, y)
        pipeline = grid.best_estimator_
        _logger.info("Grid search got best score: %f" % grid.best_score_)
        pipeline.accur = grid.best_score_
    else:
        pipeline.fit(X, y)
        _logger.debug("Testing on training data")
        accur = accuracy_score(y, pipeline.predict(X))
        pipeline.accur = accur
        _logger.info("Training accuracy (%s - %s): %f" % (p, q, accur))
    self.svms[p, q] = pipeline
    return pipeline
])
params = {
    "nb__alpha": [0.001, 0.01, 0.1, 0.5],
}

if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--input", help="path of the training data",
                     default=TRAIN_FILE_PATH)
    cmd.add_argument("--cv", help="enable cross validation", type=int,
                     default=0)
    args = cmd.parse_args()
    X, y = load_data(args.input)
    if args.cv > 0:
        _logger.info("Doing %d-fold cross validation" % args.cv)
        gs = GridSearchCV(pipeline, params, cv=args.cv, verbose=5)
        gs.fit(X, y)
        with open("sk_naive.model", "w") as outfile:
            pickle.dump(gs.best_estimator_, outfile)
        _logger.info("Model dumped to sk_naive.model")
        print gs.best_estimator_
        print gs.best_score_
    else:
        _logger.info("Start training")
        pipeline.fit(X, y)
        with open("sk_naive.model", "w") as outfile:
            pickle.dump(pipeline, outfile)
        _logger.info("Model dumped to sk_naive.model")
def __init__(self, naive_model_path):
    _logger.info("loading naive bayes model from %s" % naive_model_path)
    model = pickle.load(open(naive_model_path))
    self.naive = NaiveDecoder(model)
    self.words = dict()
_logger.info("loading model from svms.model")
svms = pickle.load(open('svms.model'))

if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--path", help="path to the test data",
                     default=TEST_FILE_PATH)
    cmd.add_argument("--serv", help="run as server",
                     dest="as_server", action='store_true')
    args = cmd.parse_args()
    X, y = load_data(args.path)
    if args.as_server:
        serv()
    else:
        test(X, y)
if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--path", help="path to the test data",
                     default=TEST_FILE_PATH)
    cmd.add_argument("--serv", help="run as server",
                     dest="as_server", action='store_true')
    cmd.add_argument("--model", help="path to the pickled model",
                     required=True,
                     choices=["%s.model" % algo for algo in CLFs.keys()])
    args = cmd.parse_args()
    _logger.info("loading model from %s" % args.model)
    clf = pickle.load(open(args.model))
    if args.as_server:
        serv(clf)
        sys.exit(0)
    test(args.path, clf)
            self.terms.add(term)
            self.domains.add(domain)
        v = len(self.terms)
        for term in self.terms:
            p = dict()
            for domain in self.domains:
                p[domain] = (1.0 + self.count[term, domain]) / (v + self.count[domain])
            wcp = dict()
            s = sum(p.values())
            for domain in self.domains:
                wcp[domain] = p[domain] / s
            # note: the comprehension variable must not be named 'v' --
            # in Python 2 it would leak and clobber the vocabulary size
            self.gini[term] = sum([val ** 2 for val in wcp.values()])

    def dump(self, out_path):
        with open(out_path, 'w') as outfile:
            for k, v in self.gini.items():
                outfile.write("%s %f\n" % (k, v))

if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--input", help="path of the count data")
    cmd.add_argument("--output", help="path to dump the model",
                     default=MODEL_PATH)
    args = cmd.parse_args()
    gini = GiniCoe(args.input)
    _logger.info("Training Gini coefficient from count file: %s" % args.input)
    gini.train()
    _logger.info("Dumping model to %s" % args.output)
    gini.dump(args.output)
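# Worked example of the Gini computation above (numbers hypothetical): with
# two domains and smoothed probabilities p = {a: 0.8, b: 0.2}, s = 1.0, so
# wcp = {a: 0.8, b: 0.2} and gini = 0.8**2 + 0.2**2 = 0.68. Values closer
# to 1.0 mean the term is concentrated in one domain, hence more
# discriminative; the minimum, 1/len(domains), means a uniform spread.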
from util import *
from util.log import _logger
from model.naive.train import NaiveBayes
from feat.terms.term_categorize import term_category, g_term_count
import rep.word_clustering.decode as word_clustering


class ClusteredNaiveBayes(NaiveBayes):
    def get_category(self, term):
        term = term_category(term)
        return word_clustering.get_cluster(term)


if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--input", help="path of the training data")
    cmd.add_argument("--terms", help="path of the terms file")
    cmd.add_argument("--alpha", help="alpha of discounting", type=float,
                     default=0.5)
    cmd.add_argument("--cv", help="enable cross validation", type=int,
                     default=0)
    args = cmd.parse_args()
    naive = ClusteredNaiveBayes(args.input, args.terms, args.alpha)
    if args.cv > 0:
        _logger.info("CV accuracy: %f +/- %f" % naive.cv(args.cv))
    else:
        _logger.info("Start training")
        naive.train()
        with open("naive.clustered.model", "w") as outfile:
            pickle.dump(naive, outfile)
        _logger.info("Model dumped to naive.clustered.model")
    vert = clf.named_steps['vert']
    terms = list(set(sentence.split()))
    # keep the 7 highest-scoring terms
    terms = sorted([(term, sel.scores_[get_vert_idx(vert, term_category(term))])
                    for term in terms],
                   key=lambda x: -x[1])[:7]
    return ' '.join([term[0] for term in terms])


def extract(X, clf):
    ret = []
    for sentence in X:
        ret.append(slim(sentence, clf))
    return ret


if __name__ == "__main__":
    _logger.info("loading model")
    clf = pickle.load(open('sk_naive.model'))
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--path", help="path to the test data",
                     default=TEST_FILE_PATH)
    cmd.add_argument("--serv", help="run as server",
                     dest="as_server", action='store_true')
    args = cmd.parse_args()
    if args.as_server:
        serv(clf)
    X, y = load_data(args.path)
    # _logger.debug("Extracting merits for long sentences")
    # X = extract(X, clf)
    y_pred = clf.predict(X)
        l = np.array(l)
        l.shape = len(l), 1
        ret = sparse.hstack([ret, l])
        _logger.debug("vectorization transform done")
        return ret


if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--input", help="path of the training data",
                     default=TRAIN_FILE_PATH)
    cmd.add_argument("--algo", help="algorithm to use", required=True,
                     choices=CLFs.keys())
    args = cmd.parse_args()
    X, y = load_data(args.input)
    _logger.info("training using %s" % args.algo)
    pipeline = Pipeline([
        ("vert", TfidfVectorizer(min_df=1,
                                 binary=True,
                                 ngram_range=(1, 3),
                                 tokenizer=Tokenizer())),
        #("vert", Vectorizer()),
        ("clf", CLFs[args.algo]),
    ])
    pipeline.fit(X, y)
    from decode import test
    test(TEST_FILE_PATH, pipeline)
    outpath = "%s.model" % args.algo
    with open(outpath, "w") as outfile:
        pickle.dump(pipeline, outfile)
if __name__ == "__main__": cmd = argparse.ArgumentParser() <<<<<<< HEAD cmd.add_argument("--serv", help = "run as server", dest="as_server", action='store_true') cmd.add_argument("--serv-prob", help = "run as server compare posterior probability of terms under every domain", dest="as_server_prob", action='store_true') cmd.add_argument("--path", help = "path to the test data", default=TEST_FILE_PATH) cmd.add_argument("--model-path", help = "path to the naive bayes model file") ======= cmd.add_argument("--serv", help = "run as server", default=False, dest="as_server", action='store_true') cmd.add_argument("--path", help = "path to the test data", default='test.dat') >>>>>>> bf1b826a908169fa2340477f367736f63a5f7875 args = cmd.parse_args() print args _logger.info("Loading model") <<<<<<< HEAD model = pickle.load(open(args.model_path)) ======= model = pickle.load(open(conv.redirect('naive.model'))) >>>>>>> bf1b826a908169fa2340477f367736f63a5f7875 if args.as_server: serv(model) elif args.as_server_prob: serv_prob(model) else: <<<<<<< HEAD test(model, args.path) =======
from train import Vectorizer


def gen(path, clf):
    X, y = load_data(path)
    scores = clf.decision_function(X)
    sz = len(y)
    with open("web_split.dat", "w") as outfile:
        for i in xrange(sz):
            assert y[i] == "web"
            score = scores[i]
            detail = sorted(zip(clf.named_steps["clf"].classes_, score),
                            key=lambda x: -x[1])
            outfile.write("%s %f\n" % (detail[0][0], detail[0][1]))


if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--path", help="path to only-web training data")
    cmd.add_argument("--serv", help="run as server",
                     dest="as_server", action="store_true")
    cmd.add_argument("--gen", help="generate training data",
                     dest="generate", action="store_true")
    args = cmd.parse_args()
    _logger.info("loading model from %s" % "svm_ovr.model")
    clf = pickle.load(open("svm_ovr.model"))
    if args.generate:
        gen(args.path, clf)
if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--input", help="path of the training data",
                     default=TRAIN_FILE_PATH)
    cmd.add_argument("--classes",
                     help="the pair of classes to train; train all combinations if not specified",
                     nargs=2, default=None)
    cmd.add_argument("--cv", help="folds of cross validation, 0 to disable",
                     default=0, type=int)
    args = cmd.parse_args()
    _logger.info("Loading training data from %s" % args.input)
    X, y = load_data(args.input)
    if args.classes:
        _logger.info("Will train 1v1 SVM between %s and %s"
                     % (args.classes[0], args.classes[1]))
        gp = SVMGroup(cv=args.cv)
        if os.path.isfile("svms.model"):
            gp.svms = pickle.load(open("svms.model"))
        gp.collect_by_domain(X, y)
        gp.train_pair(args.classes[0], args.classes[1])
    else:
        gp = SVMGroup()
        _logger.info("Start training")
        gp.train(X, y)
            terms, domain = line.split('\t')
            term_set = set()
            for term in terms.split(' '):
                term = term_category(term)
                if term not in term_set:
                    term_set.add(term)
                    self.count[(term, domain)] += 1
            c += 1
            if c % 10000 == 0:
                _logger.debug("%d records processed" % c)

    def dump(self, path):
        with open(path, 'w') as outfile:
            for key, val in self.count.items():
                term, domain = key
                outfile.write("%s %s %d\n" % (term.encode('utf-8'),
                                              domain.encode('utf-8'), val))

if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--input", help="path of the training data")
    cmd.add_argument("--output", help="path to dump the model",
                     default=DEFAULT_OUTPATH)
    args = cmd.parse_args()
    counter = Counter(args.input)
    _logger.info("training from %s" % args.input)
    counter.train()
    _logger.info("dumping model to %s" % args.output)
    counter.dump(args.output)
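# For reference, dump() above writes one "term domain count" triple per line,
# e.g. (values hypothetical):
#
#   weather_token weather 42
#   alarm_token alarm 17
#
# This appears to be the "count data" file that the GiniCoe trainer earlier
# in this section reads via its --input flag.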
if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--serv", help="run as server",
                     dest="as_server", action='store_true')
    cmd.add_argument("--path", help="path to the test data",
                     default=TEST_FILE_PATH)
    cmd.add_argument("--front-model-path",
                     help="path to the first stage model")
    args = cmd.parse_args()
    _logger.info("Loading naive bayes model from %s" % args.front_model_path)
    front = pickle.load(open(args.front_model_path))
    X, y = load_data(args.path)
    test(X, y)