def save_url_articl(self):
    # Persist the crawl results (url_dict, article_dict, simultaneous_dict) to the work directory.
    log_print('Saving data file in: {}'.format(self.work_dir))
    assert self.work_dir != '', 'Data file dir can not be none!'
    if not os.path.exists(self.work_dir):
        os.makedirs(self.work_dir)
    url_article_file = os.path.join(self.work_dir, 'url_article.pkl')
    if os.path.exists(url_article_file):
        logging.warning(
            'Data file: \'{}\' already exists and it will be replaced!'.format(
                url_article_file))
    with open(url_article_file, 'wb') as fout:
        result = {
            'url_dict': self.url_dict,
            'article_dict': self.article_dict,
            # 'word_dict': self.word_dict,
            'simultaneous_dict': self.simultaneous_dict
        }
        pickle.dump(result, fout)
    # Human-readable summary: number of URLs per keyword, then the URLs themselves.
    with open(os.path.join(self.work_dir, 'url.txt'), 'wt', encoding='utf-8') as fout:
        fout.write('# article number ' + '#' * 30 + '\n')
        _urls = set()
        for k, v in self.url_dict.items():
            fout.write('{}: {}\n'.format(k, len(v)))
            _urls = _urls.union(v)
        fout.write('unique article number: {}\n'.format(len(_urls)))
        fout.write('\n')
        for k, v in self.url_dict.items():
            fout.write('# {} '.format(k) + '#' * 30 + '\n')
            for i, _url in enumerate(v):
                fout.write('{}. {}\n'.format(i + 1, _url))
            fout.write('\n')
    # Human-readable dump of every article record.
    with open(os.path.join(self.work_dir, 'article.txt'), 'wt', encoding='utf-8') as fout:
        # Record layout: title, read, like, time, keywords, abstract, text, segmentation.
        for i, (k, v) in enumerate(self.article_dict.items()):
            fout.write('# {} '.format(i + 1) + '#' * 30 + '\n')
            fout.write('url: {}\n'.format(k))
            fout.write('title: {}\n'.format(v[0]))
            fout.write('read: {}\n'.format(v[1]))
            fout.write('like: {}\n'.format(v[2]))
            fout.write('time: {}\n'.format(v[3]))
            fout.write('keywords: {}\n'.format(v[4]))
            fout.write('abstract: {}\n'.format(v[5]))
            fout.write('text: {}\n'.format(v[6]))
            fout.write('segmentation: {}\n\n'.format(v[7]))
def load_url_article(self, keywords):
    data_file = os.path.join(self.work_dir, 'url_article.pkl')
    if self.work_dir != '' and os.path.exists(data_file):
        log_print('Loading data file: {}'.format(data_file))
        with open(data_file, 'rb') as fin:
            result = pickle.load(fin)
            self.url_dict = result['url_dict']
            self.article_dict = result['article_dict']
        missing_keys = [
            keyword for keyword in keywords
            if keyword not in self.url_dict.keys()
        ]
        if len(missing_keys) > 0:
            return False, missing_keys
        else:
            return True, None
    else:
        return False, keywords
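# A brief usage sketch of the load-or-crawl pattern that save_url_articl/load_url_article
# support. `loader` is a hypothetical instance of the class that owns these methods
# (an assumption about the surrounding code, not part of the original excerpt).
keywords = ['机器人故障']
loaded, missing = loader.load_url_article(keywords)
if not loaded:
    # Crawl only the keywords that are not yet in the cached url_dict;
    # get_url_articl() persists the results via save_url_articl() when it finishes.
    loader.get_url_articl(missing)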
def wait_for_elem(xpath, time_out_intervals=[60, 120, 180]):
    # Wait until the element located by `xpath` is present; after each timeout,
    # pause for the next interval and retry. Returns (success, status_code).
    time_out_nums = 0
    while True:
        try:
            data_loader.DataLoader.WAIT.until(
                lambda brows: brows.find_element_by_xpath(xpath))
            time.sleep(random.randint(1, 5))
            return True, 1
        except TimeoutException:
            if time_out_nums >= len(time_out_intervals):
                return False, -1
            else:
                time_out_tips(time_out_intervals[time_out_nums])
                time_out_nums += 1
                continue
        except NoSuchElementException:
            return False, -2
        except Exception as msg:
            log_print(msg)
            return False, -3
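# wait_for_elem calls a time_out_tips helper that is not shown in this excerpt.
# A minimal sketch, assuming it only logs the back-off and sleeps before the next
# retry (hypothetical implementation, not the original one):
import time

def time_out_tips(seconds):
    log_print('Timed out, waiting {} seconds before retrying...'.format(seconds))
    time.sleep(seconds)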
def get_url_articl(self, keywords, pages=15):
    log_print('Getting urls from CSDN...')
    # self.url_dict: keyword -> set of article urls
    base_url = 'https://so.csdn.net/so/search/s.do?p={}&q={}&t=blog&viparticle=&domain=&o=&s=&u=&l=&f=&rbg=0'

    def get_absts_page():
        # Collect URLs, titles and abstracts from the current search-result page.
        urls, titles, absts = [], [], []
        # wait_for_elem returns (ok, code); only the flag matters here.
        if wait_for_elem(
                '//div[@class="search-list-con"]/dl[contains(@class, "search-list")]'
        )[0]:
            try:
                # Article record: title read like time keywords abstract text segmentation.
                node_list = DataLoader.BROWSER.find_elements_by_xpath(
                    '//div[@class="search-list-con"]/dl[contains(@class, "search-list")]'
                )
                for node in node_list:
                    try:
                        tmp1 = node.find_element_by_css_selector(
                            'dt > div.limit_width > a')
                        tmp2 = node.find_element_by_css_selector(
                            'dd.search-detail')
                        if DataLoader.FILTER.filter_when_getting(
                                keyword, tmp1.text, tmp2.text):
                            urls.append(tmp1.get_attribute('href'))
                            titles.append(tmp1.text)
                            absts.append(tmp2.text)
                    except selenium.common.exceptions.NoSuchElementException:
                        continue
            except selenium.common.exceptions.NoSuchElementException:
                log_print('NoSuchElementException!')
                return [], [], []
            except selenium.common.exceptions.TimeoutException:
                log_print('TimeoutException!')
                return [], [], []
            except Exception as msg:
                log_print(msg)
                return [], [], []
        return urls, titles, absts

    def get_article(url):
        # Fetch the article body and metadata for a single URL.
        # article_dict: url -> (title read like time keywords abstract text segmentation)
        DataLoader.BROWSER.get(url)
        log_print('Browsing url:{}...'.format(url))
        result = []
        read_nums_, like_ = 0, 0
        if wait_for_elem('//div[@id="content_views"]')[0]:
            try:
                article = DataLoader.BROWSER.find_element_by_xpath(
                    '//div[@id="content_views"]')
                art_time = DataLoader.BROWSER.find_element_by_xpath(
                    '//div[@class="bar-content"]/span[@class="time"]')
                read_nums = DataLoader.BROWSER.find_element_by_xpath(
                    '//div[@class="bar-content"]/span[@class="read-count"]')
                like = DataLoader.BROWSER.find_element_by_xpath(
                    '//div[@class="bar-content"]/a/span[@class="get-collection"]')
                log_print('article:{}'.format(article.text))
                log_print(art_time.text)
                log_print(read_nums.text)
                log_print(like.text)
            except selenium.common.exceptions.NoSuchElementException:
                log_print('NoSuchElementException!')
                return []
            except selenium.common.exceptions.TimeoutException:
                log_print('TimeoutException!')
                return []
            except Exception as msg:
                log_print(msg)
                return []
            if read_nums.text.replace(' ', '') != '':
                read_nums_ = int(read_nums.text.replace(' ', ''))
            if like.text.replace(' ', '') != '':
                like_ = int(like.text.replace(' ', ''))
            # The title slot is filled in by the caller.
            result = [
                '', read_nums_, like_, art_time.text, [], '', article.text, None
            ]
        return result

    try:
        for i, keyword in enumerate(keywords):
            self.url_dict[keyword] = set()
            for page in range(1, pages + 1):
                log_print('keyword:{}, Searching page:{}...'.format(
                    keyword, page))
                url = base_url.format(page, urllib.parse.quote(keyword))
                # Load the search page.
                DataLoader.BROWSER.get(url)
                # Scroll until the "next page" link appears, the wait times out,
                # or there is no such link at all.
                # driver.execute_script('document.documentElement.scrollTop=5000')
                # Collect the articles listed on the current page.
                urls, titles, absts = get_absts_page()
                for url, title, abst in zip(urls, titles, absts):
                    if url not in self.article_dict.keys():
                        log_print(title)
                        result = get_article(url)
                        if len(result) > 0:
                            result[0] = title
                            self.article_dict[url] = result
                    if (url in self.article_dict) and (
                            url not in self.url_dict[keyword]):
                        self.url_dict[keyword].add(url)
                if len(urls) < 5:
                    # Fewer results than a full page: assume this was the last page.
                    break
    except (KeyboardInterrupt, InterruptedError):
        log_print('Stop manually!')
    finally:
        self.save_url_articl()
import os

from knowledge_graph.concept_management.concept_manager import ConceptManager
from knowledge_graph.relation_management.relation_manager import RelationManager
from knowledge_graph.utils.log_utils import log_print, log_close
from knowledge_graph.utils.path_utils import create_work_dir
# DataManager import assumed to follow the same package layout as the imports above.
from knowledge_graph.data_management.data_manager import DataManager

os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../'))

if __name__ == '__main__':
    # Create the workspace.
    work_dir = create_work_dir('data/knowledge_graph/ex1/')

    # 1 Collect the corpus.
    data_manager = DataManager(work_dir)
    # 1.1 Collect URLs and articles.
    url_dict, article_dict = data_manager.get_url_article(['机器人故障'])  # keyword: "robot failure"
    # 1.2 Count word frequencies; build the vocabulary and the word co-occurrence statistics.
    word_dict, simultaneous_dict = data_manager.get_word_frequency()

    # # 2 Extract concepts.
    # concept_manager = ConceptManager(work_dir)
    # concept_dict = concept_manager.get_concept(article_dict, word_dict, simultaneous_dict)

    # # 3 Extract relations.
    # relation_manager = RelationManager(work_dir)
    # relation_result = relation_manager.get_relation(concept_dict, article_dict)

    # log_print('Finish!')
    log_close()
def get_url_articl(self, keywords, pages=100):
    log_print('Getting urls from JianShu...')
    # self.url_dict: keyword -> set of article urls
    base_url = 'https://www.jianshu.com/search?q={}&page={}&type=note'

    def get_absts_page():
        # Collect URLs, titles and abstracts from the current search-result page.
        urls, titles, absts = [], [], []
        # wait_for_elem returns (ok, code); only the flag matters here.
        if wait_for_elem(
                '//ul[@class="note-list"]/li/div[@class="content"]')[0]:
            try:
                # Article record: title read like time keywords abstract text segmentation.
                node_list = DataLoader.BROWSER.find_elements_by_xpath(
                    '//ul[@class="note-list"]/li/div[@class="content"]')
                for node in node_list:
                    tmp1 = node.find_element_by_css_selector('a.title')
                    tmp2 = node.find_element_by_css_selector('p.abstract')
                    if DataLoader.FILTER.filter_when_getting(
                            keyword, tmp1.text, tmp2.text):
                        urls.append(tmp1.get_attribute('href'))
                        titles.append(tmp1.text)
                        absts.append(tmp2.text)
            except selenium.common.exceptions.NoSuchElementException:
                log_print('NoSuchElementException!')
                return [], [], []
            except selenium.common.exceptions.TimeoutException:
                log_print('TimeoutException!')
                return [], [], []
            except Exception as msg:
                log_print(msg)
                return [], [], []
        return urls, titles, absts

    def get_article(url):
        # Fetch the article body and metadata for a single URL.
        # article_dict: url -> (title read like time keywords abstract text segmentation)
        DataLoader.BROWSER.get(url)
        log_print('Browsing url:{}...'.format(url))
        result = []
        if wait_for_elem('//article[@class="_2rhmJa"]')[0]:
            try:
                title = DataLoader.BROWSER.find_element_by_xpath(
                    '//h1[@class="_1RuRku"]')
                article = DataLoader.BROWSER.find_element_by_xpath(
                    '//article[@class="_2rhmJa"]')
                dsoj = DataLoader.BROWSER.find_element_by_xpath(
                    '//div[@class="s-dsoj"]')
                # Relative XPaths so the lookup stays inside the metadata block.
                art_time = dsoj.find_element_by_xpath('.//time')
                read_nums = dsoj.find_element_by_xpath(
                    './/span[contains(text(), "阅读")]')
                like = DataLoader.BROWSER.find_element_by_xpath(
                    '//span[@class="_1LOh_5"]')
                # log_print('title:{}'.format(title.text))
                # log_print(art_time.text)
                # log_print(read_nums.text)
                # log_print(like.text)
            except selenium.common.exceptions.NoSuchElementException:
                log_print('NoSuchElementException!')
                return []
            except selenium.common.exceptions.TimeoutException:
                log_print('TimeoutException!')
                return []
            except Exception as msg:
                log_print(msg)
                return []
            # Strip thousands separators; drop the trailing label characters of the like count.
            result = [
                title.text,
                int(read_nums.text.replace(',', '').split(' ')[-1]),
                int(like.text.replace(',', '')[:-3]),
                art_time.text, [], '', article.text, None
            ]
        return result

    for i, keyword in enumerate(keywords):
        self.url_dict[keyword] = set()
        for page in range(1, pages):
            log_print('keyword:{}, Searching page:{}...'.format(keyword, page))
            url = base_url.format(urllib.parse.quote(keyword), page)
            # Load the search page.
            DataLoader.BROWSER.get(url)
            # Scroll until the "next page" link appears, the wait times out,
            # or there is no such link at all.
            # driver.execute_script('document.documentElement.scrollTop=5000')
            # Collect the articles listed on the current page.
            urls, titles, absts = get_absts_page()
            for url, title, abst in zip(urls, titles, absts):
                if url not in self.article_dict.keys():
                    result = get_article(url)
                    if len(result) > 0:
                        # result[-3] = abst
                        self.article_dict[url] = result
                if (url in self.article_dict) and (
                        url not in self.url_dict[keyword]):
                    self.url_dict[keyword].add(url)
            if len(urls) < 10:
                # Fewer results than a full page: assume this was the last page.
                break
    self.save_url_articl()
def get_word_frequency(self):
    # word_dict: word -> (sk-keyword sk-tfidf time idf paper_keyword)
    # simultaneous_dict: word1_2 -> (sentimes subsentimes)
    assert hasattr(self, 'url_dict') and hasattr(
        self, 'article_dict'
    ), 'Please get (and merge) url and article before getting word frequency!'

    # Segment the texts and build the full vocabulary ################################
    self.word_dict = OrderedDict()  # every word and its occurrence count in the corpus
    self.simultaneous_dict = OrderedDict()  # number of articles containing a given word
    log_print('Article nums:{}'.format(len(self.article_dict)))
    self.segmentation, self.segmentation_pos = [], []
    all_word_times = 0
    for url, article in tqdm(self.article_dict.items(), 'Splitting texts'):
        # str:title, int:read_nums, int:like_nums, str:time, list:keywords, str:abstract,
        # str:article.text, list:segmentation
        tmp = article[0] + '\n' + article[6]
        cut_result = self.cut_lac(tmp)
        tmp_ws = []
        tmp_pos = []
        for i in range(len(cut_result[0])):
            # cut_result[0][i] = cut_result[0][i].replace(' ', '')
            if cut_result[0][i] != '':
                if cut_result[0][i] in ',.!?;,。!?;\n':
                    # Print only verbs and nouns:
                    # for j in range(1, len(tmp_pos) - 1):
                    #     if tmp_pos[j] == 'v':
                    #         for k in range(j - 1, 0, -1):
                    #             if tmp_pos[k] == 'v':
                    #                 start = k
                    #                 break
                    #         for _ in range(k + 1, j):
                    #             print(tmp_ws[_], end='-')
                    #         print('[[{}]]'.format(tmp_ws[j]), end='-')
                    #         for k in range(j + 1, len(tmp_pos)):
                    #             if tmp_pos[k] != 'v':
                    #                 print(tmp_ws[k], end='-')
                    #             else:
                    #                 break
                    #         print()
                    # Print every word, marking verbs with double brackets.
                    for j in range(len(tmp_pos)):
                        if tmp_pos[j] in ['v', 'vd']:
                            print('-[[{}]]-'.format(tmp_ws[j]), end='')
                        else:
                            print('-{}-'.format(tmp_ws[j]), end='')
                    print()
                    # if len(tmp_ns) > 1:
                    #     for j in range(len(tmp_ns) - 1):
                    #         for k in range(j + 1, len(tmp_ns)):
                    #             if tmp_ns[j] == tmp_ns[k]:
                    #                 continue
                    #             self.simultaneous_dict[
                    #                 '{}-{}'.format(tmp_ns[j], tmp_ns[k])] = self.simultaneous_dict.get(
                    #                     '{}-{}'.format(tmp_ns[j], tmp_ns[k]), 0) + 1
                    tmp_ws.clear()
                    tmp_pos.clear()
                else:
                    # if cut_result[1][i] in ['n', 'f', 's', 'nw', 'nz', 'v']:
                    tmp_ws.append(cut_result[0][i])
                    tmp_pos.append(cut_result[1][i])
                    if cut_result[0][i] not in self.word_dict:
                        self.word_dict[cut_result[0][i]] = [0] * 5
                    self.word_dict[cut_result[0][i]][2] += 1  # index 2: occurrence count ('time')
        self.segmentation.append(cut_result[0])
        self.segmentation_pos.append(cut_result[1])
        all_word_times += len(cut_result[0])
        cut_result_set = set(cut_result[0])
        for word in cut_result_set:
            # Skip punctuation and empty tokens, which were never added to word_dict.
            if word in self.word_dict:
                self.word_dict[word][3] += 1  # index 3: number of articles containing the word ('idf')
        keywords = article[4]
        for keyword in keywords:
            if keyword not in self.word_dict:
                self.word_dict[keyword] = [0] * 5
            self.word_dict[keyword][4] = 1  # index 4: listed as an article keyword

    # Convert the segmented texts into a term-frequency matrix ################################
    vectorizer = CountVectorizer()
    # Count the occurrences of each word.
    log_print('Counting words by sklearn...')
    corpus = [' '.join(text) for text in self.segmentation]
    X = vectorizer.fit_transform(corpus)
    # Get all keywords in the bag-of-words vocabulary
    # (get_feature_names_out() in newer scikit-learn versions).
    words = vectorizer.get_feature_names()
    log_print('keyword nums:{}'.format(len(words)))
    # Inspect per-article keyword counts.
    # count_array = X.toarray()
    # print(X.toarray())
    # Mark the keywords that appear in the corpus vocabulary.
    for word in words:
        if word not in self.word_dict:
            self.word_dict[word] = [0] * 5
        self.word_dict[word][0] = 1  # index 0: recognised as a keyword by sklearn
    # TF-IDF is no longer computed.
    # transformer = TfidfTransformer()
    # # Convert the term-frequency matrix X into TF-IDF values.
    # log_print('Calculating TF-IDF by sklearn...')
    # tfidf = transformer.fit_transform(X)
    # # tfidf[i][j] is the TF-IDF weight of keyword j in document i.
    # tfidf_array = tfidf.toarray()
    # Save the full vocabulary locally.
    # self.save_word_info()
    return self.word_dict, self.simultaneous_dict
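# The simultaneous_dict returned above is documented in the header but never filled
# here; the counting block is commented out in the original. A minimal sketch of
# sentence-level co-occurrence counting, assuming the same 'word1-word2' key format
# as the commented code (illustration only, not the author's implementation):
from collections import OrderedDict

def count_cooccurrence(sentences, simultaneous_dict=None):
    # sentences: a list of token lists, one per sentence (e.g. the tmp_ws buffers
    # collected between punctuation marks above).
    if simultaneous_dict is None:
        simultaneous_dict = OrderedDict()
    for tokens in sentences:
        uniq = list(dict.fromkeys(tokens))  # drop duplicates, keep order
        for j in range(len(uniq) - 1):
            for k in range(j + 1, len(uniq)):
                key = '{}-{}'.format(uniq[j], uniq[k])
                simultaneous_dict[key] = simultaneous_dict.get(key, 0) + 1
    return simultaneous_dict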