# Imports assumed by this snippet (the original import lines are not shown):
# urlopen may come from urllib2 or urllib, BeautifulSoup from bs4 or the older BeautifulSoup package,
# and urlclass / structured_data are the author's own modules defined elsewhere in this project.
from urllib2 import urlopen
from bs4 import BeautifulSoup
import urlclass
import structured_data

def survey(top_url, h = 3, max_state = 10000, similarity = 0.95):
    # control variables
    target_netloc = _get_netloc(top_url)
    elements = []
    result = {}
    history = {}
    cl = urlclass.fisherclassifier(urlclass.GetUrlFeatures)  # classifier instance (not referenced again inside survey)
    # initial element: start from the top URL with step 0
    element = {'url' : top_url, 'step' : 0, 'referer' : '_SearchEngine'}
    elements.append(element)
    history.setdefault(top_url, 1)
    # crawl pages and extract their links
    j = 0
    while len(elements) > 0 and len(result) < max_state:
        e = elements.pop(0)
        if j % 10 == 0:
            print j, e['step'], len(elements), len(result)
        # record the URL together with its step count and referer
        if result.has_key(e['url']) == False:
            try:
                site = urlopen(e['url']).read()
                soup = BeautifulSoup(site)
                links = soup.findAll('a')
                result.setdefault(e['url'], {})
                result[e['url']].setdefault(e['step'], [])
                result[e['url']][e['step']].append(e['referer'])
                for link in links:
                    try:
                        url = link['href']
                        netloc = _get_netloc(url)
                        # if the netloc belongs to the target site, the next step is within the limit,
                        # and the URL is unexplored, add it as a new crawl candidate
                        if netloc.find(target_netloc) != -1 and e['step'] + 1 < h and result.has_key(url) == False:
                            exist, redirect_url = httpExists(url)  # check whether the link actually exists
                            # if it exists and the redirect target has not been seen yet, add it as a candidate
                            if exist == True and result.has_key(redirect_url) == False and history.has_key(redirect_url) == False:
                                elements.append({'url' : redirect_url, 'step' : e['step'] + 1, 'referer' : e['url']})
                                history.setdefault(redirect_url, 1)
                    except:
                        pass
            except:
                print 'Crawling ERROR', e['url']
        else:
            result.setdefault(e['url'], {})
            result[e['url']].setdefault(e['step'], [])
            result[e['url']][e['step']].append(e['referer'])
        j += 1
    result = find_similar(result, similarity = similarity)
    return result
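survey() also depends on helpers defined outside this snippet: _get_netloc (extracts the host part of a URL), httpExists (which, judging from its use, checks that a link resolves and returns the final URL after redirects), and find_similar (which appears to merge near-duplicate pages using the similarity threshold). As a rough illustration only, _get_netloc could be as small as the sketch below; the actual implementation may differ.

# Illustrative sketch only; the original _get_netloc is not shown in this snippet.
from urlparse import urlparse

def _get_netloc(url):
    # return the host portion of a URL, e.g. 'www.example.com'
    return urlparse(url).netloc

With those helpers in place, a crawl of up to three steps from a top page would be started with something like result = survey('http://www.example.com/', h = 3).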
def transition_model(data, uu_volume = 0.01, upper_limit_of_lower_dic = 30, lower_criteria = 0.5, upper_criteria = 0.7, link_criteria = 0.4, img_criteria = 0.2, script_criteria = 0.2):
    # drop junk URLs
    print "## start URL check ##"
    new_data = cleaning_url_in_data(data, cl = urlclass.fisherclassifier(urlclass.GetUrlFeatures))
    print '## end ##'
    # structure the URLs
    print "## start data structuring ##"
    structured = structured_data.get_data(new_data)
    print '## end ##'
    # decide, on the structured URLs, the granularity at which they are adopted as transition states
    print "## start state adopting ##"
    structured = structured_data.set_state(structured, uu_volume = uu_volume, upper_limit_of_lower_dic = upper_limit_of_lower_dic, lower_criteria = lower_criteria, upper_criteria = upper_criteria, link_criteria = link_criteria, img_criteria = img_criteria, script_criteria = script_criteria)
    print '## end ##'
    print '## show structured data ##'
    for h, s in structured.items():
        for state, v in s.items():
            rate = float(v['state_uu']) / float(v['netloc_uu']) if float(v['netloc_uu']) != 0 else 0
            if rate >= uu_volume:
                print state, ' : ', v['netloc_uu'], ',', v['state_uu'], ',', v['uu'], ',', v['is_state']
    print '## end ##'
    # relabel URLs with the adopted-granularity labels
    print "## start relabeling URL ##"
    new_data = change_url(data = new_data, structured_data = structured)
    print '## end ##'
    # build per-UU (unique user) tracking data
    print "## start making tracking data ##"
    uu_base = make_uu_base_data(new_data)
    print '## end ##'
    # build the browsing transition model from the UU tracking data
    print "## start making transition model ##"
    model = make_model(data = uu_base)
    print '## end ##'
    return model
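A hedged example of how transition_model() might be invoked. The record layout is inferred from the fields that change_url() reads ('id', 'url', 'unixtime'); load_access_log() is a hypothetical loader and not part of the original code.

# Hypothetical usage sketch; load_access_log() does not exist in the original code.
def load_access_log(path):
    # each record: user/session id, requested URL, unix timestamp (tab-separated)
    data = []
    for line in open(path):
        uid, url, ts = line.rstrip('\n').split('\t')
        data.append({'id' : uid, 'url' : url, 'unixtime' : int(ts)})
    return data

data = load_access_log('access_log.tsv')
model = transition_model(data, uu_volume = 0.01, lower_criteria = 0.5, upper_criteria = 0.7)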
def change_url(data, structured_data):
    from difflib import SequenceMatcher as SM
    new_data = []
    j = 0
    # collect the adopted states into a list
    states = []
    for h, s in structured_data.items():
        for state, v in s.items():
            if v['is_state']:
                states.append(state)
                if len(v['query_state']) > 0:
                    for qs in v['query_state']:
                        states.append(qs)
    # relabel each log entry
    cl = urlclass.fisherclassifier(urlclass.GetUrlFeatures)
    m = SM()
    for log in data:
        if j % 10000 == 0:
            print 'read: ', j, ', ', len(data)
        j += 1
        # get the predicted label from the classifier
        label = cl.classify(log['url'])
        if label == None:
            m.set_seq1(log['url'])
            match_ratio = {'state' : None, 'match_ratio' : None}  # key fixed from 'match-ratio' so the comparison below works
            for state in states:
                m.set_seq2(state)
                if match_ratio['state'] == None or m.ratio() > match_ratio['match_ratio']:
                    match_ratio['state'] = state
                    match_ratio['match_ratio'] = m.ratio()
            new_data.append({'id' : log['id'], 'url' : match_ratio['state'], 'unixtime' : log['unixtime']})
            cl.train(log['url'], match_ratio['state'])
            cl.setminimum(match_ratio['state'], min = 0.8)
        else:
            new_data.append({'id' : log['id'], 'url' : label, 'unixtime' : log['unixtime']})
    return new_data
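When the classifier returns no label, change_url() falls back on difflib.SequenceMatcher and assigns the state whose string is most similar to the URL. That matching step in isolation looks roughly like the sketch below; the URLs and states are made up for illustration.

# Illustration of the SequenceMatcher fallback used in change_url().
from difflib import SequenceMatcher

def closest_state(url, states):
    # return the state string with the highest similarity ratio to the URL
    m = SequenceMatcher()
    m.set_seq1(url)
    best_state, best_ratio = None, -1.0
    for state in states:
        m.set_seq2(state)
        if m.ratio() > best_ratio:
            best_state, best_ratio = state, m.ratio()
    return best_state, best_ratio

print closest_state('/item/detail?id=123', ['/item/detail', '/item/list', '/cart'])
# picks '/item/detail' as the closest state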