Example #1
0
def extract_sentences(word):
    """Collect every sentence containing *word* from all files under the
    GBK-named folder and write them to Extraction/<word>.txt.

    Fix: the output file used to be re-opened with mode 'w' for every
    directory visited by os.walk, truncating the sentences gathered from
    earlier directories; it is now opened exactly once.
    """
    folder_name = word.encode("GBK")
    # Open once so results from all sub-directories accumulate.
    outfile = open('Extraction/' + folder_name + '.txt', 'w')
    i = 0
    try:
        for root, dirs, files in os.walk(folder_name):
            for f in files:
                file_path = os.path.join(root, f)
                infile = codecs.open(file_path, 'r', 'utf-8')
                try:
                    # Concatenate the stripped lines into one string.
                    full_text = ''.join(line.strip() for line in infile)
                finally:
                    infile.close()
                bs = BeautifulSoup(full_text, "lxml")
                # Drop markup that carries no sentence text.
                for tag in bs(['script', 'img', 'a', 'head', 'li', 'style']):
                    tag.extract()
                full_text = bs.get_text()
                # Sentence delimiters: whitespace plus CJK punctuation.
                cut_list = u'\s\t\f\r\n。!?  '
                sentences = functions.cut(cut_list, full_text)
                for s in sentences:
                    # Keep only sentences mentioning the keyword.
                    if s.find(word) != -1:
                        i += 1
                        print('s:' + str(i))
                        outfile.write(s.encode('utf-8') + '\n')
    finally:
        outfile.close()
    return
Example #2
0
def search(word):
    """Search Baidu for *word* and save every result-abstract sentence
    containing the word to <BDNEWS_DOCS_ORG_DIR>/<word>.txt.

    Returns 0 if the file already exists (crawled before), -1 on a
    network error, -2 when the result page has no abstracts.

    Fix: the output file used to be created *before* the HTTP request,
    so a network failure left an empty file behind and the
    os.path.isfile guard then skipped the word forever.
    """
    word_url_encoded = urllib.quote(word.encode('utf-8'))
    request_url = "http://www.baidu.com/s?wd=" + word_url_encoded
    out_file_name = (macro.BDNEWS_DOCS_ORG_DIR + '/' + word + '.txt').encode('GBK')

    # File already exists: the word was crawled before, skip it.
    if os.path.isfile(out_file_name):
        return 0
    print(word)
    # Issue the request first; only create the file once we have data.
    try:
        response = requests.get(request_url)
    except requests.exceptions.RequestException:
        print(-1)
        return -1
    soup = BeautifulSoup(response.text, 'html.parser')
    divs = soup.find_all('div', class_='c-abstract')
    if len(divs) == 0:
        print(-2)
        return -2
    outfile = codecs.open(out_file_name, 'w', 'utf-8')
    try:
        for div in divs:
            # Split each abstract on whitespace and CJK punctuation.
            cutlist = u'\s\t\f\r\n。!?  '
            sens = functions.cut(cutlist, div.text)
            for s in sens:
                if s.find(word) != -1:
                    outfile.write(s + '\r\n')
    finally:
        outfile.close()
def write_data(word_list_file, num, outfilename):
    """Read word pairs from *word_list_file* (skipping its first line and
    then the next *num* data lines), compute similarity features for each
    pair via sk_LR, and write a tab-separated table to *outfilename*.

    Fixes: the old loop condition `while line != '\\n'` never matched at
    EOF because readline() returns '' there, so termination relied solely
    on the inner break; a malformed line with fewer than two words raised
    IndexError; file handles leaked if feature extraction raised.
    """
    infile = codecs.open(word_list_file, 'r', 'utf-8')
    outfile = codecs.open(outfilename, 'w', 'utf-8')
    try:
        # Discard the input header line.
        infile.readline()
        outfile.write(
            'ID\tWord1\tWord2\tweb-jaccard\tweb-overlap\tweb-dice\tweb-pmi\tIsSubString\tSharedCharNum\r\n'
        )
        # Skip the first `num` data lines (resume support).
        i = 0
        while i < num:
            infile.readline()
            i += 1
        while True:
            line = infile.readline()
            # '' means EOF; a lone newline ends the data section.
            if line == '' or line == '\n':
                break
            i += 1
            words = functions.cut(u',\t\r', line)
            # Stop on a malformed or empty record.
            if len(words) < 2:
                break
            word1 = words[0].strip()
            word2 = words[1].strip()
            features = sk_LR.get_all_features(word1, word2)
            # Throttle every 100 pairs to be polite to the feature source.
            if i % 100 == 0:
                time.sleep(1)
            if len(features) == 0:
                print(i)
                break
            outfile.write(word1 + '\t' + word2 + '\t')
            for f in features:
                outfile.write(str(f) + '\t')
            outfile.write('\r\n')
    finally:
        outfile.close()
        infile.close()
Example #4
0
def extract(word):
    """Search Baidu News for *word*, fetch each result article, and save
    the sentences that contain the word to <BDNEWS_DOCS_ORG_DIR>/<word>.txt.

    Returns 0 if the file already exists (crawled before), -1 on a
    request/page error, 1 on success.

    Fixes: BeautifulSoup returns the `class` attribute as a *list*, so
    the old equality test against the string 'norsTitle' could never
    match; the early `return -1` inside the loop leaked the open file.
    """
    out_file_name = (macro.BDNEWS_DOCS_ORG_DIR + '/' + word +
                     '.txt').encode('GBK')

    # File already exists: the word was crawled before, skip it.
    if os.path.isfile(out_file_name):
        return 0

    outfile = codecs.open(out_file_name, 'w', 'utf-8')
    word_url_encoded = urllib.quote(word.encode('utf-8'))
    # Baidu News search URL, 50 results per page.
    request_url = "http://news.baidu.com/ns?word=" + word_url_encoded + "&tn=news&from=news&cl=2&rn=50&ct=1"
    try:
        response = requests.get(request_url)
    except requests.exceptions.RequestException:
        outfile.close()
        return -1
    soup = BeautifulSoup(response.text, 'html.parser')
    # Every result headline sits in an <h3> tag.
    result_heads = soup.find_all('h3')
    for head in result_heads:
        # Network hiccups sometimes yield an error page whose <h3>
        # carries the 'norsTitle' class (class attribute is a list).
        if 'norsTitle' in (head.get('class') or []):
            outfile.close()
            return -1
        a_temp = head.contents[0]
        try:
            link = a_temp.attrs['href']
        except AttributeError:
            # Headline without a link tag: skip it.
            continue
        try:
            article = requests.get(link, timeout=3)
        except requests.exceptions.RequestException:
            print('failed:' + link)
            continue
        # Detect the real encoding when requests falls back to latin-1.
        if article.encoding == 'ISO-8859-1':
            encodings = requests.utils.get_encodings_from_content(
                article.content)
            if len(encodings) == 0:
                article.encoding = 'utf-8'
            else:
                article.encoding = encodings[0]
        article = article.text.encode('utf-8').decode('utf-8')
        full_text = functions.remove_useless_tags(article)
        # Split into sentences on whitespace and CJK punctuation.
        cut_list = u'\s\t\f\r\n。!?  '
        sentences = functions.cut(cut_list, full_text)
        for s in sentences:
            # Write the sentence if it contains the keyword.
            if s.find(word) != -1:
                outfile.write(s + '\r\n')
    outfile.close()
    return 1
Example #5
0
def extract_from_html(word, html, pagenum):
    """Return the sentences of *html* that contain *word*, minus the
    first hit (typically the page title)."""
    plain = functions.remove_useless_tags(html)
    delimiters = '!?.\t\r\n。!?'.decode('utf-8')
    # Keep only the sentences mentioning the keyword.
    hits = [sen for sen in functions.cut(delimiters, plain)
            if sen.find(word) != -1]
    # Drop the leading match, if any.
    return hits[1:] if hits else hits
Example #6
0
# Per-frame integration time read from the FITS SCI-extension header.
itime = getfitskeywords(filename, 'ITIME', HEADER='SCI')
# print('target, itime', target,itime)

# Replace NaNs with zeros so the quantile/display steps below work.
qphi = np.nan_to_num(qphi)

# Display stretch limits: clip at the 1st/99th percentile of the image.
vu = np.quantile(qphi, 0.99)
vl = np.quantile(qphi, 0.01)
# print("upper =",vu, " lower=",vl)

### HYPERBOLIC FUNCTION
# qphi = hyperbolic(qphi, 10, vu, vl)

### PLOTTING IMAGE AND BEST FIT ELLIPSE
# qphi = deproject(qphi, 31)

# Cut/mask the image; `cut` is project-defined — presumably the numeric
# arguments are a radius plus centre coordinates (TODO confirm).
qphi = cut(35, 0, 0, 141, 141, 0, qphi)

plt.figure(figsize=(12, 12))
# Diverging colormap; display limits come from the quantiles above.
plt.imshow(qphi, cmap='seismic', origin='lower', vmin=vl, vmax=vu)

# t = test_map(55.37, 24.36, 23.32, 98.48, 140.2, 141.2, 9.954, 0, 282)
# t = test_map_mie(55, 24, 23, 97, 141, 142, 7, 5, 35, 0, 282)
# t = hg_map(50, 20, 45, 45, 141, 141, 0.5, 20, 0, 282)
# t = hg_map(54.41, 23.87, 18.19, -517.70, 140.49, 142.39, -0.2524, 149.09, -2.03275376e-06, 282)
# t = add_noise(t, 1)
# plt.imshow(t, cmap='seismic', origin='lower')

plt.colorbar(shrink=0.8)

### ELLIPSE FITTING
# e = e_best(55, 65, 30, 35, 90, 110, 141, 142, 141, 142, qphi)
# Set header of table.
# Copy/derive worksheet headers from the raw sheet into each output
# sheet; the set_*_header helpers are project-defined in `functions`.
ws_cut = functions.set_cutted_header(ws_raw, ws_cut)
ws_cor = functions.set_common_header(ws_raw, ws_cor)
ws_cor_temp = functions.set_cutted_header(ws_raw, ws_cor_temp)
ws_rearr = functions.set_cutted_header(ws_raw, ws_rearr)
ws_rearr_temp = functions.set_cutted_header(ws_raw, ws_rearr_temp)
ws_fit = functions.set_cutted_header(ws_raw, ws_fit)
ws_fit_uc = functions.set_cutted_header(ws_raw, ws_fit_uc)

# Calculate resistance.
# resistor() returns a nested result: [0][0] is the resistance value,
# [1] the detected peaks — TODO confirm against functions.resistor.
resistance_res = functions.resistor(ws_raw)
resistance = resistance_res[0][0]
peaks = resistance_res[1]

# Cut the CV curve.
# NOTE(review): `cut` shadows any earlier name `cut` at module level.
cut = functions.cut(ws_raw, ws_cut)
ws_cut = cut[0]
voltage_range_cut = cut[1]

# Calculate the result vol-cur after corrected.
ws_cor_res = functions.correct(ws_raw, ws_cor, resistance)
ws_cor_temp_res = functions.correct(ws_cut, ws_cor_temp, resistance)
ws_cor = ws_cor_res[0]
ws_cor_temp = ws_cor_temp_res[0]

# Set voltage range.
voltage_range = [max(ws_cor_temp_res[2]), min(ws_cor_temp_res[1])]

# Rearrangements of data.
ws_rearr = functions.rearrangement(ws_cor_temp, ws_rearr, voltage_range)
# NOTE(review): the statement below is truncated in this excerpt — its
# remaining arguments are not visible here.
ws_rearr_temp = functions.rearrangement(ws_cut, ws_rearr_temp,
Example #8
0
        # Reset the change flag and prompt for the random mode again.
        change = False
        print(Fore.YELLOW + "> WANT SOME RANDOM ? :)")

    #endregion

    if run:

        # Refresh the on-screen game matrix and its rendered image.
        game.draw_matrix()
        im2 = game.draw_image()
        cv2.imshow("frame2", im2)

        game.init_matrix()
        start = time.time()

        # Grab a fresh frame from the capture source and crop the region
        # of interest (functions.cut is project-defined).
        frame = capture.force_update()  # uh oh
        im = functions.cut(frame)

        # tried == -1 appears to disable testing for this iteration —
        # TODO confirm against functions.test.
        if tried != -1:
            tried, last = functions.test(im, p_bot, game, last, mode, change,
                                         tried, random)

        random = False

        # After 15 failed attempts, give up and fall back to the default
        # activity for the current mode.
        if tried >= 15:
            change = False
            if mode == "tree":
                print("> I'M SORRY, BACK TO CHOPPING")
            else:
                print("> I'M SORRY, BACK TO MINING")
            tried = 0