Example #1
import os
import shutil


def yz():
    # Rename *.temp system outputs to *.txt, strip spaces from zh/ja files,
    # and flatten any "systems" subdirectory back into the language folder.
    result_path = "./zyz"
    for runs in os.listdir(result_path):
        run_path = result_path + "/" + runs
        for lang in os.listdir(run_path):
            lang_path = run_path + "/" + lang + "/systems"
            if not os.path.exists(lang_path):
                lang_path = run_path + "/" + lang
            ori_lang_path = run_path + "/" + lang
            for file in os.listdir(lang_path):
                file_path = lang_path + "/"
                print(file_path + file)
                if file.endswith(".temp"):
                    shutil.move(file_path + file,
                                file_path + file.replace(".temp", ".txt"))
                    file = file.replace(".temp", ".txt")
                if lang in ["zh", "ja"]:
                    content = read_file(file_path + file)
                    new_content = []
                    for sentence in content:
                        new_content.append(sentence.replace(" ", ""))
                    write_file(new_content, ori_lang_path + "/" + file, False)
                else:
                    content = read_file(file_path + file)
                    write_file(content, ori_lang_path + "/" + file, False)
            if os.path.exists(run_path + "/" + lang + "/systems"):
                shutil.rmtree(lang_path)
Example #2
import os
import shutil


def cjq():
    # Same cleanup as Example #1, minus the "systems" subdirectory handling:
    # rename *.temp to *.txt and strip spaces from zh/ja outputs in place.
    result_path = "./multiling2017_summarization"
    for runs in os.listdir(result_path):
        run_path = result_path + "/" + runs
        for lang in os.listdir(run_path):
            lang_path = run_path + "/" + lang
            for file in os.listdir(lang_path):
                file_path = lang_path + "/"
                print(file_path + file)
                if file.endswith(".temp"):
                    shutil.move(file_path + file,
                                file_path + file.replace(".temp", ".txt"))
                    file = file.replace(".temp", ".txt")
                if lang in ["zh", "ja"]:
                    content = read_file(file_path + file)
                    new_content = []
                    for sentence in content:
                        new_content.append(sentence.replace(" ", ""))
                    write_file(new_content, file_path + file, False)
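
Both examples above call read_file and write_file without showing them; a minimal sketch of line-based helpers matching the (content, path, append_flag) signature used in these two examples (the actual originals are not part of this listing):

def read_file(path):
    # Return the file as a list of lines with trailing newlines stripped.
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]


def write_file(content, path, append):
    # Write the given lines to `path`, appending instead of overwriting when requested.
    mode = 'a' if append else 'w'
    with open(path, mode, encoding='utf-8') as f:
        for line in content:
            f.write(line + '\n')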
Example #3
# `keywords` and `keywords_origin` are loaded earlier in the original script.
for q in range(len(keywords)):
    # print(q, keywords[q])
    keywords[q] = keywords[q].lower().strip()

# drop keyword phrases that contain any of the original seed keywords
for l in keywords_origin:
    l = l.lower()
    for m in keywords[:]:  # iterate over a copy so removing items is safe
        if l in m:
            keywords.remove(m)
print(len(keywords))

un_relevant = ['algorithm','learning','data','design','calculation','neural network',
               'model','simulation','structure','cluster','regression','system','prediction',
               'throughput','theory','analysis','monte carlo','function','pca','comput','equation',
               'lead','feature extraction','technique','loop','interface','software','matrix',
               'network','drying','thermodynamics','monte-carlo','method','popcorn failure',
               'statistics','coefficient','classification','estimation','sampling',
               'modul','search','k-points','probability','probabilistic','dft','software','matlab',
               'eulerian','first-principles','gga','first principles','experiments','approach',
               'mbj','lsda','strategy','rbfnns','lda','gw','lmto','aim','dna','gpu','pbe',
               'bte','fea','test','rdf','cpa','grain','program','cpu','measurement','newton','negf']
for v in un_relevant:
    for n in keywords[:]:  # iterate over a copy so removing items is safe
        if v in n:
            keywords.remove(n)
print(len(keywords))
filename = 'Doc_processing/keywords_filter.txt'
write_file(filename, keywords)
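
The same blacklist filter can be expressed without removing items from the list being scanned; a short equivalent using the keywords and un_relevant names from above:

# Keep only keywords that contain none of the blacklisted substrings.
keywords = [k for k in keywords if not any(v in k for v in un_relevant)]
print(len(keywords))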
from crawler import extract_html_code
from file_operation import read_file, write_file

website = "https://www.nature.com/search?journal=npjcompumats"
order = "//div//span[@class='text-gray-light']/text()"
page_num = extract_html_code(website, order)
links = []
if page_num:
    page_num = int(page_num[0].split()[-1])
    page_num = page_num // 50 + 1
    for j in range(page_num):
        u = website + "&page=" + str(j + 1)
        links.append(u)

article = []
for i in range(len(links)):
    titles = extract_html_code(links[i], "//h2/a")
    for j in range(len(titles)):
        title = titles[j].xpath('string(.)').strip()
        article.append(title)

print(len(article))
write_file('Doc_processing_npj/articels_npj.txt', article)
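
extract_html_code comes from the local crawler module and is not included in this listing; a minimal sketch of such a helper, assuming it just fetches a page and evaluates an XPath expression against it (urllib + lxml here are an assumption, not necessarily what the original uses):

import urllib.request

from lxml import html


def extract_html_code(url, xpath_expr):
    # Fetch the page and return the result of evaluating the XPath expression:
    # a list of strings for text() queries, or element nodes otherwise.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as resp:
        tree = html.fromstring(resp.read())
    return tree.xpath(xpath_expr)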
import urllib.request

import file_operation  # local helper module, also imported above
from bs4 import BeautifulSoup

# `url`, `url2`, and the `ks` list are defined earlier in the original
# script and are not shown in this excerpt.

header = {
    'User-Agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15'
}


def page_code(url):
    req = urllib.request.Request(url, headers=header)
    webpage = urllib.request.urlopen(req)
    html = webpage.read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup


soup1 = page_code(url)
for i in soup1.find_all('div', class_='li'):
    key = i.get_text().strip()
    if key not in ks:
        ks.append(key)

keywords = ks[27:]  # drop the first 27 scraped entries (presumably page navigation rather than keywords)
soup2 = page_code(url2)

for j in soup2.find_all('dt'):  # optionally restrict with string='更多' ("More")
    keyword = j.get_text().strip()
    if keyword not in keywords:
        keywords.append(keyword)

filename = 'Doc_processing/' + 'keywords.txt'
file_operation.write_file(filename, keywords)
# `new_keywords` is built earlier in the original script.
print(len(new_keywords))
un_relevant = ['algorithm','learning','data','design','calculation','neural network',
               'model','simulation','structure','cluster','regression','system','prediction',
               'throughput','theory','analysis','monte carlo','function','pca','comput','equation',
               'lead','feature extraction','technique','loop','interface','software','matrix',
               'network','drying','thermodynamics','monte-carlo','method','popcorn failure',
               'statistics','coefficient','classification','estimation','sampling',
               'modul','search','k-points','probability','probabilistic','dft','software','matlab',
               'eulerian','first-principles','gga','first principles','experiments','approach',
               'mbj','lsda','strategy','rbfnns','lda','gw','lmto','aim','dna','gpu','pbe',
               'bte','fea','test','rdf','cpa','grain','program','cpu','measurement','newton','negf']
for v in un_relevant:
    for n in new_keywords[:]:  # iterate over a copy so removing items is safe
        if v in n:
            new_keywords.remove(n)
print(len(new_keywords))
write_file('Doc_processing/additional_keywords.txt',new_keywords)
articles12 = read_file('Doc_processing/articles.txt')+read_file('Doc_processing/additional articles.txt')
links12 = read_file('Doc_processing/articles_link.txt')+read_file('Doc_processing/additional_articles_links.txt')

print(len(articles12),len(links12))
n_articles,n_links = get_articles(generate_pages(generate_research_url(new_keywords)))
new_articles = []
new_articles_links = []
for n in range(len(n_links)):
    if n_links[n] not in links12:  # keep only articles whose link is not already collected
        new_articles.append(n_articles[n])
        new_articles_links.append(n_links[n])
print(len(new_articles))
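
With many collected links, the membership test above is linear for every lookup; the same dedup step with a set (same variable names as above) would be:

# Deduplicate new results against the already-collected links with O(1) lookups.
seen = set(links12)
new_articles = []
new_articles_links = []
for title, link in zip(n_articles, n_links):
    if link not in seen:
        new_articles.append(title)
        new_articles_links.append(link)
        seen.add(link)
print(len(new_articles))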

Example #7
            # Excerpt from the body of contract_list(): each `item` row is read
            # field by field with the key_err helper (sketched after this example).
            Quantity = key_err(item, 'GCL')
            ForeignCode = key_err(item, 'ForeignCode')
            Character = key_err(item, 'Character')
            UnitPrice = key_err(item, 'ComUnitPrice')

        coloum_list.append((
            DataSysNo,
            DataType,
            RowType,
            Quantity,
            DataName,
            DataCateName,
            QuotedBasis,
            Model,
            ForeignCode,
            Character,
            UnitName,
            JobContent,
            Remark,
            UnitPrice,
            TotalPrice,
        ))
    coloum_list.sort()
    return coloum_list


if __name__ == '__main__':
    my_list = contract_list()
    write_file('122121', my_list)
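
key_err is not defined in this excerpt; judging from the calls above it looks like a small guard around dictionary access, roughly along these lines (the empty-string fallback is an assumption):

def key_err(item, key):
    # Return item[key], or an empty string when the field is missing,
    # so one absent field does not abort the whole row.
    try:
        return item[key]
    except KeyError:
        return ''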