def extract_sentences(word): folder_name = word.encode("GBK") list_dirs = os.walk(folder_name) i = 0 for root, dirs, files in list_dirs: outfile = open('Extraction/' + folder_name + '.txt', 'w') for f in files: file_path = os.path.join(root, f) infile = codecs.open(file_path, 'r', 'utf-8') lines = infile.readlines() full_text = '' for line in lines: full_text += line.strip() infile.close() bs = BeautifulSoup(full_text, "lxml") for tag in bs(['script', 'img', 'a', 'head', 'li', 'style']): tag.extract() full_text = bs.get_text() cut_list = '\s\t\f\r\n。!? '.decode('utf-8') sentences = functions.cut(cut_list, full_text) for s in sentences: if string.find(s, word) != -1: i += 1 print 's:' + str(i) s = s.encode('utf-8') outfile.write(s + '\n') outfile.close() return
def search(word): word_url_encoded = urllib.quote(word.encode('utf-8')) request_url = "http://www.baidu.com/s?wd=" + word_url_encoded out_file_name = (macro.BDNEWS_DOCS_ORG_DIR + '/' + word + '.txt').encode('GBK') # 文件已经存在,说明已经爬取过,直接跳过 if os.path.isfile(out_file_name): return 0 print word outfile = codecs.open(out_file_name, 'w', 'utf-8') try: response = requests.get(request_url) except requests.exceptions.RequestException: outfile.close() print -1 return -1 soup = BeautifulSoup(response.text, 'html.parser') divs = soup.find_all('div',class_='c-abstract') if len(divs)>0: for div in divs: text = div.text cutlist = '\s\t\f\r\n。!? '.decode('utf-8') sens = functions.cut(cutlist,text) for s in sens: if s.find(word)!=-1: outfile.write(s+'\r\n') else: outfile.close() print -2 return -2 outfile.close()
def write_data(word_list_file, num, outfilename): infile = codecs.open(word_list_file, 'r', 'utf-8') outfile = codecs.open(outfilename, 'w', 'utf-8') line = infile.readline() outfile.write( 'ID\tWord1\tWord2\tweb-jaccard\tweb-overlap\tweb-dice\tweb-pmi\tIsSubString\tSharedCharNum\r\n' ) i = 0 while i < num: infile.readline() i += 1 while line != '\n': line = infile.readline() cutlist = ',\t\r'.decode('utf-8') i += 1 words = functions.cut(cutlist, line) if len(words) == 0: break word1 = words[0].strip() word2 = words[1].strip() features = sk_LR.get_all_features(word1, word2) if i % 100 == 0: time.sleep(1) if len(features) == 0: print i break outfile.write(word1 + '\t' + word2 + '\t') for f in features: outfile.write(str(f) + '\t') outfile.write('\r\n') outfile.close() infile.close()
def extract(word): out_file_name = (macro.BDNEWS_DOCS_ORG_DIR + '/' + word + '.txt').encode('GBK') # 文件已经存在,说明已经爬取过,直接跳过 if os.path.isfile(out_file_name): return 0 outfile = codecs.open(out_file_name, 'w', 'utf-8') word_url_encoded = urllib.quote(word.encode('utf-8')) # 向百度新闻发起搜索请求的URL request_url = "http://news.baidu.com/ns?word=" + word_url_encoded + "&tn=news&from=news&cl=2&rn=50&ct=1" try: response = requests.get(request_url) except requests.exceptions.RequestException: outfile.close() return -1 soup = BeautifulSoup(response.text, 'html.parser') # 找到所有h3标签,读取内容 result_heads = soup.find_all('h3') for head in result_heads: # 有时因为网络问题会导致返回的结果页面异常 if head['class'] == 'norsTitle': return -1 a_temp = head.contents[0] link = '' try: link = a_temp.attrs['href'] except AttributeError: continue else: pass try: article = requests.get(link, timeout=3) except requests.exceptions.RequestException: print 'failed:' + link continue else: pass # 检测并转换编码 if article.encoding == 'ISO-8859-1': encodings = requests.utils.get_encodings_from_content( article.content) if len(encodings) == 0: article.encoding = 'utf-8' else: article.encoding = encodings[0] article = article.text.encode('utf-8').decode('utf-8') full_text = functions.remove_useless_tags(article) # 分句 cut_list = '\s\t\f\r\n。!? '.decode('utf-8') sentences = functions.cut(cut_list, full_text) for s in sentences: # 含有关键词则写文件 if string.find(s, word) != -1: outfile.write(s + '\r\n') outfile.close() return 1
def extract_from_html(word, html, pagenum):
    """Return the sentences in `html` that mention `word`.

    The page is stripped of useless tags, split into sentences on the
    delimiters below, and the first matching sentence (typically the
    page title) is discarded.
    """
    plain_text = functions.remove_useless_tags(html)
    delimiters = '!?.\t\r\n。!?'.decode('utf-8')
    # Collect every sentence that contains the keyword.
    hits = [sen for sen in functions.cut(delimiters, plain_text)
            if sen.find(word) != -1]
    # Drop the leading match; slicing an empty list is a harmless no-op,
    # which covers the "no matches" case.
    return hits[1:]
itime = getfitskeywords(filename, 'ITIME', HEADER='SCI') # print('target, itime', target,itime) qphi = np.nan_to_num(qphi) vu = np.quantile(qphi, 0.99) vl = np.quantile(qphi, 0.01) # print("upper =",vu, " lower=",vl) ### HYPERBOLIC FUNCTION # qphi = hyperbolic(qphi, 10, vu, vl) ### PLOTTING IMAGE AND BEST FIT ELLIPSE # qphi = deproject(qphi, 31) qphi = cut(35, 0, 0, 141, 141, 0, qphi) plt.figure(figsize=(12, 12)) plt.imshow(qphi, cmap='seismic', origin='lower', vmin=vl, vmax=vu) # t = test_map(55.37, 24.36, 23.32, 98.48, 140.2, 141.2, 9.954, 0, 282) # t = test_map_mie(55, 24, 23, 97, 141, 142, 7, 5, 35, 0, 282) # t = hg_map(50, 20, 45, 45, 141, 141, 0.5, 20, 0, 282) # t = hg_map(54.41, 23.87, 18.19, -517.70, 140.49, 142.39, -0.2524, 149.09, -2.03275376e-06, 282) # t = add_noise(t, 1) # plt.imshow(t, cmap='seismic', origin='lower') plt.colorbar(shrink=0.8) ### ELLIPSE FITTING # e = e_best(55, 65, 30, 35, 90, 110, 141, 142, 141, 142, qphi)
# Set header of table. ws_cut = functions.set_cutted_header(ws_raw, ws_cut) ws_cor = functions.set_common_header(ws_raw, ws_cor) ws_cor_temp = functions.set_cutted_header(ws_raw, ws_cor_temp) ws_rearr = functions.set_cutted_header(ws_raw, ws_rearr) ws_rearr_temp = functions.set_cutted_header(ws_raw, ws_rearr_temp) ws_fit = functions.set_cutted_header(ws_raw, ws_fit) ws_fit_uc = functions.set_cutted_header(ws_raw, ws_fit_uc) # Calculate resistance. resistance_res = functions.resistor(ws_raw) resistance = resistance_res[0][0] peaks = resistance_res[1] # Cut the CV curve. cut = functions.cut(ws_raw, ws_cut) ws_cut = cut[0] voltage_range_cut = cut[1] # Calculate the result vol-cur after corrected. ws_cor_res = functions.correct(ws_raw, ws_cor, resistance) ws_cor_temp_res = functions.correct(ws_cut, ws_cor_temp, resistance) ws_cor = ws_cor_res[0] ws_cor_temp = ws_cor_temp_res[0] # Set voltage range. voltage_range = [max(ws_cor_temp_res[2]), min(ws_cor_temp_res[1])] # Rearrangements of data. ws_rearr = functions.rearrangement(ws_cor_temp, ws_rearr, voltage_range) ws_rearr_temp = functions.rearrangement(ws_cut, ws_rearr_temp,
# NOTE(review): this fragment starts mid-block; `change`, `run`, `tried`,
# `last`, `mode` and `random` are defined outside this view, and the
# relative indentation of the first two statements is reconstructed.
change = False
print(Fore.YELLOW + "> WANT SOME RANDOM ? :)")
#endregion

if run:
    # Render the internal game matrix and show the debug frame.
    game.draw_matrix()
    im2 = game.draw_image()
    cv2.imshow("frame2", im2)
    game.init_matrix()
    start = time.time()
    frame = capture.force_update()  # uh oh
    # Crop the captured frame down to the region the bot analyses.
    im = functions.cut(frame)
    # tried == -1 acts as a "detection disabled" sentinel.
    if tried != -1:
        tried, last = functions.test(im, p_bot, game, last, mode, change,
                                     tried, random)
    random = False
    # After 15 failed attempts, give up and fall back to the default
    # gathering activity for the current mode.
    if tried >= 15:
        change = False
        if mode == "tree":
            print("> I'M SORRY, BACK TO CHOPPING")
        else:
            print("> I'M SORRY, BACK TO MINING")
        tried = 0