Example #1
def get_extraction_outcome(xml_path, save_path, config_path):
    table_extractor_m = TableExtractorModifiedtoalloy(xml_path, save_path, config_path)
    all_error_file = []
    xml_name = os.listdir(xml_path)
    log_wp = LogWp()
    for file_i in range(len(xml_name)):
        tables = None
        all_tables = []
        doi = xml_name[file_i].replace(
            ".xml", "")  # choose target file according to doi
        doi = doi.replace("-", "/", 1)
        xml_n = xml_name[file_i]
        file = xml_path + '/' + str(xml_n)
        try:
            tables, captions = table_extractor_m.get_xml_tables(doi, file)
        except Exception as e:
            print(e)
            all_error_file.append(doi)
            tables = None
            captions = None
        if tables:
            cols, rows, col_inds, row_inds = table_extractor_m.get_headers(
                tables, doi)
            tab = []
            for table, row_ind, col_ind in zip(tables, row_inds, col_inds):
                curr, error_file = (table_extractor_m.construct_table_object(
                    doi, table, row_ind, col_ind))
                if curr:
                    tab.append(curr)
                if error_file:
                    all_error_file.append(str(doi))
            for i, (t, caption) in enumerate(zip(tab, captions)):
                if t is not None:
                    t['order'] = i
                    t['_id'] = ObjectId()
                    t['caption'] = caption
                    t['paper_doi'] = doi
                    all_tables.append(t)
                    log_wp.print_log('Success: Extracted Tables from %s', doi)
            xls = openpyxl.Workbook()
            sheet_id = 1
            if all_tables:
                for table in all_tables:
                    sht_new = xls.create_sheet(str(sheet_id))
                    act_table = table['act_table']
                    caption = table['caption']
                    row_len = len(act_table[0])
                    doi = table['paper_doi']
                    sht_new.cell(1, 1, str(doi))
                    sht_new.cell(2, 1, str(caption))
                    start_row = 3
                    for row in act_table:
                        len_row = len(row)
                        for index in range(len_row):
                            sht_new.cell(start_row, index + 1, row[index])
                        start_row += 1
                    sheet_id += 1
                del xls['Sheet']
                xls.save(save_path + '/' + str(file_i) + ".xlsx")
    return all_error_file, len(all_error_file)
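# Hedged usage sketch (not part of the original source; paths are placeholders):
#   errors, n_errors = get_extraction_outcome('./xml_papers', './extracted', './dictionary.ini')
#   print('%d files failed:' % n_errors, errors)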
class GetTableHtml:
    def __init__(self, doi_path, output_path):
        self.doi_path = doi_path
        self.output_path = output_path
        self.log_wp = LogWp()

    def get_all_url(self, url):
        """
        return all url on the page
        :param url:url of one page
        :return: all url as list
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                                 'Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=url, headers=headers)
        html = urllib.request.urlopen(req).read().decode("utf-8")
        soup = BeautifulSoup(html, features='html.parser')
        tags = soup.find_all('a')
        all_url = []
        for tag in tags:
            all_url.append(str(tag.get('href')).strip())
        return all_url

    def get_table_url(self, doi_info):
        """
        return table url on the page for Springer and Nature Publishing Group
        :param doi_info:doi_info of article
        :return: table url as list
        """
        all_url = self.get_all_url(doi_info[0])
        table_url = []
        for i in all_url:
            if "article" and "table" in i:
                if "%" and "#" and "?" not in i:
                    if len(i) <= 150:
                        print(str(i))
                        if doi_info[1] in "Springer":
                            table_url.append('https://link.springer.com' + i)
                        else:
                            table_url.append('https://www.nature.com' + i)
        if len(table_url) == 0:
            print("There is no table url in this article!")
        print(str(table_url))
        return table_url

    def doi_info(self, doi_str):
        """
        get url and database of doi
        :param doi_str: doi as str
        :return: doi_info=[doi_url,doi_database]
        """
        doi_url = "other URL"  # fallback if no branch below matches
        doi_info = []
        if doi_str[0:7] == "10.1016":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "Elsevier"
        elif doi_str[0:7] in ["10.1007", "10.1361", "10.1023"]:
            doi_url = "https://link.springer.com/article/" + doi_str
            doi_database = "Springer"
        elif doi_str[0:7] in "10.1080":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "Taylor & Francis Online"
        elif doi_str[0:7] in ["10.1002", "10.1111"]:
            doi_url = "https://onlinelibrary.wiley.com/doi/" + doi_str
            doi_database = "Wiley Blackwell"
        elif doi_str[0:7] in "10.1115":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "ASME International"
        elif doi_str[0:7] in "10.3390":
            all_url = self.get_all_url("https://doi.org/" + doi_str)
            for url_str in all_url:
                if "htm" in url_str:
                    doi_url = "https://www.mdpi.com/" + url_str
                    break
            doi_database = "MDPI"
        elif doi_str[0:7] == "10.1038":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "Nature Publishing Group"
        else:
            doi_url = "other URL"
            doi_database = "other database"
        doi_info.append(doi_url)
        doi_info.append(doi_database)
        doi_info.append(doi_str)
        return doi_info
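    # Illustrative mapping (hedged; follows the branches above, DOI suffix is a placeholder):
    #   doi_info('10.1007/<suffix>') ->
    #       ['https://link.springer.com/article/10.1007/<suffix>', 'Springer', '10.1007/<suffix>']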

    def file_name(self, name):
        st = '\\|/:?*<>;'
        for s in st:
            if s in name:
                name = name.replace(s, '-')
        return name
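    # Example (hedged): characters invalid in file names are replaced with '-':
    #   file_name('10.1016/j.jallcom') -> '10.1016-j.jallcom'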

    def get_table(self, doi_info, path=r'table.xlsx'):
        """
        get all table name from the page,
        :param doi_info: [doi_url,1doi_info_name]str
        :param path: requests.get(url).text
        :return:
        """
        table_name = []
        if doi_info[1] in ['Springer', 'Nature Publishing Group']:
            table_url = self.get_table_url(doi_info)
            if len(table_url) != 0:
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(table_url)):
                        time.sleep(1)
                        print("Start crawling the page")
                        r = requests.get(table_url[p])
                        rt = r.text
                        try:
                            df = pd.read_html(rt)
                            print("complete!")
                        except Exception as e:
                            print(e)
                            print('format of table ' + str(p) + ' is PDF')
                            data_df = pd.DataFrame()
                            self.log_wp.excel_writer(data_df, writer)
                            continue
                        start = rt.find("<h1")
                        end = rt.rfind("</h1>")
                        title_str = ''
                        for i in range(start, end + 5):
                            title_str += rt[i]
                        title_start = title_str.find("Table")
                        title_end = title_str.find("</h1>")
                        title = ''
                        for j in range(title_start, title_end):
                            title += title_str[j]
                        table_name.append(title)
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[0].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(title)
                        for j in range(len(df[0].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[0].columns))
                        for i in range(len(df[0])):
                            table_te.append(list(df[0].iloc[i]))
                        df[0] = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        self.log_wp.excel_writer(df[0], writer, sheet_name)
            else:
                self.log_wp.print_log(" Cannot find table in this page:%s", doi_info[0])
        elif doi_info[1] == 'Taylor & Francis Online':
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('b'):
                name = page1.text
                if 'Table' in name:
                    table_name.append(name)
            del table_name[int(len(table_name) / 2):len(table_name)]
            count = 0
            for t in table_name:
                if 'Table 1' in t:
                    count += 1
            if count > 1:
                del table_name[1:(len(table_name)):2]
            if len(table_name) != 0:
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(table_name)):
                        df = pd.read_html(rt)
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        df = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        self.log_wp.excel_writer(df, writer, sheet_name)
            else:
                self.log_wp.print_log(" Cannot find table in this page: %s", doi_info[0])
        elif doi_info[1] == 'MDPI':
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('caption'):
                name = page1.text
                name = name.replace('\n', '')
                table_name.append(name)
            if len(table_name) != 0:
                with pd.ExcelWriter(path) as writer:
                    time.sleep(1)
                    print("Start crawling the page")
                    r = requests.get(doi_info[0])
                    rt = r.text
                    df = pd.read_html(rt)
                    print("complete!")
                    for p in range(len(table_name)):
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        self.log_wp.excel_writer(fa, writer, sheet_name)
            else:
                self.log_wp.print_log(" Cannot find table in this page:%s", doi_info[0])
        elif doi_info[1] in "ASME International":
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('div'):
                name = page1.text
                if 'Table' in name[0:5]:
                    if ' ' in name[-1]:
                        table_name.append(name)
            if len(table_name) != 0:
                df = pd.read_html(rt)
                print("complete!")
                del df[0:len(df):2]
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(df)):
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        self.log_wp.excel_writer(fa, writer, sheet_name)
            else:
                self.log_wp.print_log(" Cannot find table in this page:%s", doi_info[0])
        elif doi_info[1] in "Wiley Blackwell":
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('header'):
                name = page1.text
                if 'Table' in name:
                    name = ' '.join(name.split())
                    table_name.append(name.replace('\n', ''))
            if len(table_name) != 0:
                df = pd.read_html(rt)
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(table_name)):
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        self.log_wp.excel_writer(fa, writer, sheet_name)
            else:
                self.log_wp.print_log(" Cannot find table in this page:%s", doi_info[0])
        else:
            print("Please try other function!")
        return table_name

    def get_rt(self, url):
        """
        :param url:str
        :return:
        '"""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        time.sleep(1)
        print("Start crawling the page")
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        rt = r.text
        print("complete!")
        return rt

    def load_doi(self, path):
        import xlrd
        data = xlrd.open_workbook(path)
        table = data.sheet_by_index(0)
        nrows = table.nrows
        doi_li = []
        for row in range(nrows):
            table_doi = table.row_values(row, start_colx=0, end_colx=None)[0]
            doi_li.append(table_doi)
        return doi_li

    def get_html(self, url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=url, headers=headers)
        html = urllib.request.urlopen(req).read()
        return html

    def save_html(self, file_name, file_content):
        self.log_wp.write_tohtml_log(path=file_name + ".html", content=file_content)

    def down_html(self, doi_li, path=''):
        for s in range(len(doi_li)):
            name = self.file_name(doi_li[s])
            url_te = self.doi_info(doi_li[s])[0]
            html = self.get_html(url_te)
            self.save_html(path + name, html)
            print('html_' + str(s + 1) + " download completed!")

    def run(self):
        xls = xlrd.open_workbook(self.doi_path)
        sht = xls.sheet_by_index(0)
        doi_li = sht.col_values(0)
        doi_error = []
        for i in range(len(doi_li)):
            print('*****************  text' + str(i + 1) + ' start!  *****************')
            doi_ls = self.doi_info(doi_li[i])
            name = self.file_name(doi_ls[2])
            try:
                table_name = self.get_table(doi_ls, self.output_path + '/' + str(name) + '.xlsx')
                print('*****************  text' + str(i + 1) + ' finished!  ********************\n')
            except Exception as e:
                print(e)
                doi_error.append(str(i + 1))
                print("\033[1;31;40m*****************  text " + str(i + 1) 
                      + " is error!  ********************\n\033[0m")
                print('*****************  text' + str(i + 1) + ' is error!  ********************\n')
        print('*' * 100)
        print(str(doi_error))
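# Hedged usage sketch (not part of the original source; paths are placeholders):
#   crawler = GetTableHtml('doi_list.xls', './output_tables')
#   crawler.run()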
class TableExtractorModifiedtoalloy(object):
    def __init__(self, xml_path, save_path, config_path):
        self.xml_path = xml_path
        self.save_path = save_path
        self.dict_info = Dictionary(config_path)
        self.list_of_units = self.dict_info.table_units
        self.log_wp = LogWp()

    def get_caption(self, doi, table, format):
        if format == 'html':
            if '10.1016' in doi:
                up = table.parent
                table_root = up.parent
                caption = table_root.find('div', 'caption')
                caption = caption.find('p')
                caption, ref = self._search_for_reference(caption, format)
                caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip()
                return caption, ref

            elif '10.1039' in doi:
                check = table.parent
                check = check.parent
                if check.get('class') == ['rtable__wrapper']:
                    up = table.parent
                    up = up.parent
                    caption = up.previous_sibling
                    if caption is None:
                        return '', []
                    else:
                        caption = caption.previous_sibling
                        if caption is None:
                            return '', []
                        else:
                            caption = caption.find('span')
                            caption, ref = self._search_for_reference(caption, format)
                            caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip()
                            return caption, ref
                else:
                    return '', []
            elif '10.1002' in doi:
                up = table.parent
                caption = up.previous_sibling
                caption = caption.previous_sibling
                if caption is not None:
                    caption.span.decompose()
                    caption, ref = self._search_for_reference(caption, format)
                    caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip()
                    return caption, ref
                else:
                    self.log_wp.print_log('No caption')
                    return '', []
            elif '10.1021' in doi:
                up = table.parent
                if up.get('class') == ['NLM_table-wrap']:
                    caption = up.find('div', 'NLM_caption')
                else:
                    caption = up.previous_sibling
                if caption == ' ':
                    caption = caption.previous_sibling
                if caption is not None:
                    caption, ref = self._search_for_reference(caption, format)
                    caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip()
                    return caption, ref
                else:
                    return '', None
            elif '10.1007' in doi:
                up = table.parent
                caption = up.previous_sibling
                caption = caption.find('p')
                caption, ref = self._search_for_reference(caption, format)
                caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip()
                return caption, ref
            else:
                return '', []
        elif format == 'xml':
            if '10.1016' in doi:
                caption, ref = '', []
                try:
                    caption = table.find('caption')
                    caption, ref = self._search_for_reference(caption, format)
                    caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip()
                except Exception as e:
                    self.log_wp.print_log(str(e))
                return caption, ref
            elif '10.1021' in doi:
                caption = table.find('title')
                if caption is None:
                    up = table.parent
                    caption = up.find('title')
                    if caption is None:
                        caption = up.find('caption')
                caption, ref = self._search_for_reference(caption, format)
                caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip()
                return caption, ref
        return '', []

    def get_xml_tables(self, doi, xml):
        all_tables = []
        all_captions = []
        soup = BeautifulSoup(open(xml, 'r', encoding='utf-8'), 'xml')
        tables = soup.find_all('table')
        if len(tables) == 0:
            soup = BeautifulSoup(open(xml, 'r', encoding='utf-8'), 'lxml')
            tables = soup.find_all('table-wrap')
        for w, table in enumerate(tables):
            try:
                caption = ''
                try:
                    caption, ref = self.get_caption(doi, table, format='xml')
                except Exception as e:
                    self.log_wp.print_log(str(e))
                all_captions.append(caption)
                tab = []
                sup_tab = []
                for t in range(150):
                    tab.append([None] * 150)
                    sup_tab.append([None] * 150)
                rows = table.find_all('row')
                if len(rows) == 0:
                    rows = table.find_all('oasis:row')
                num_rows = len(rows)
                for i, row in enumerate(rows):
                    counter = 0
                    for ent in row:
                        curr_col = 0
                        beg = 0
                        end = 0
                        more_row = 0
                        if type(ent) == type(row):
                            if ent.has_attr('colname'):
                                try:
                                    curr_col = int(ent['colname'])
                                except:
                                    curr = list(ent['colname'])
                                    for c in curr:
                                        try:
                                            curr_col = int(c)
                                        except:
                                            continue
                            if ent.has_attr('namest'):
                                try:
                                    beg = int(ent['namest'])
                                except:
                                    curr = list(ent['namest'])
                                    for c in curr:
                                        try:
                                            beg = int(c)
                                        except:
                                            continue
                            if ent.has_attr('nameend'):
                                try:
                                    end = int(ent['nameend'])
                                except:
                                    curr = list(ent['nameend'])
                                    for c in curr:
                                        try:
                                            end = int(c)
                                        except:
                                            continue
                            if ent.has_attr('morerows'):
                                try:
                                    more_row = int(ent['morerows'])
                                except:
                                    curr = list(ent['morerows'])
                                    for c in curr:
                                        try:
                                            more_row = int(c)
                                        except:
                                            continue
                            ent, curr_ref = self._search_for_reference(ent, 'xml')
                            if beg != 0 and end != 0 and more_row != 0:
                                for j in range(beg, end + 1):
                                    for k in range(more_row + 1):
                                        tab[i + k][j - 1] = unidecode.unidecode(
                                            HTMLParser().unescape(ent.get_text())).strip().replace('\n', ' ')
                                        sup_tab[i + k][j - 1] = curr_ref
                            elif beg != 0 and end != 0:
                                for j in range(beg, end + 1):
                                    tab[i][j - 1] = unidecode.unidecode(
                                        HTMLParser().unescape(ent.get_text())).strip().replace('\n', ' ')
                                    sup_tab[i][j - 1] = curr_ref
                            elif more_row != 0:
                                for j in range(more_row + 1):
                                    tab[i + j][counter] = unidecode.unidecode(
                                        HTMLParser().unescape(ent.get_text())).strip().replace('\n', ' ')
                                    sup_tab[i + j][counter] = curr_ref
                            elif curr_col != 0:
                                tab[i][curr_col - 1] = unidecode.unidecode(
                                    HTMLParser().unescape(ent.get_text())).strip().replace('\n', ' ')
                                sup_tab[i][curr_col - 1] = curr_ref
                            else:
                                counter_ent = counter
                                found = False
                                while not found:
                                    if tab[i][counter_ent] is None:
                                        tab[i][counter_ent] = unidecode.unidecode(
                                            HTMLParser().unescape(ent.get_text())).strip().replace('\n', ' ')
                                        sup_tab[i][counter_ent] = curr_ref
                                        found = True
                                    else:
                                        counter_ent += 1
                                counter = counter_ent
                            counter = counter + 1 + (end - beg)
                for t, s in zip(tab, sup_tab):
                    for j, k in zip(reversed(t), reversed(s)):
                        if j is None:
                            t.remove(j)
                            s.remove(k)
                for t, s in zip(reversed(tab), reversed(sup_tab)):
                    if len(t) == 0:
                        tab.remove(t)
                        sup_tab.remove(s)
                lens = []
                for t in tab:
                    lens.append(len(t))
                size = stats.mode(lens)[0][0]
                for t, s in zip(tab, sup_tab):
                    if len(t) != size:
                        for j in range(len(t), size):
                            t.append('')
                            s.append([])
                all_tables.append(tab)
            except Exception as e:
                self.log_wp.print_log('Failed to extract XML table')
                table = [[0]]
                self.log_wp.print_log(str(e))
                sup_table = [[None]]
                all_tables.append(table)
                tb = sys.exc_info()[-1]
                self.log_wp.print_log(str(traceback.extract_tb(tb, limit=1)[-1][1]))
        return all_tables, all_captions
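    # Hedged illustration of the return shape (values are made up):
    #   all_tables   -> [[['Alloy', 'Cr', 'Ni'], ['A', '18.0', '8.0']], ...]
    #   all_captions -> ['Table 1 Chemical composition (wt.%)', ...]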

    def get_headers(self, tables, doi):
        all_col_headers = []
        all_row_headers = []
        all_col_indexes = []
        all_row_indexes = []
        for num, table in enumerate(tables):
            try:
                curr = table[0]
                col_index = 0
                for i in range(len(table) - 1):
                    next = table[i + 1]
                    count_curr = 0
                    count_next = 0
                    for cell in curr:
                        try:
                            cell, _ = self.value_extractor(cell)
                            fixed = float(cell)
                        except:
                            if cell != '':
                                count_curr += 1
                    for cell in next:
                        try:
                            cell, _ = self.value_extractor(cell)
                            fixed = float(cell)
                        except:
                            if cell != '':
                                count_next += 1
                    if count_next > count_curr:
                        curr = next
                    else:
                        col_index = 0
                        break
                trans_table = list(map(list, zip(*table)))
                curr_row = trans_table[0]
                row_index = 0
                for i in range(len(trans_table) - 1):
                    next = trans_table[i + 1]
                    count_curr = 0
                    count_next = 0
                    for cell in curr_row:
                        try:
                            cell, _ = self.value_extractor(cell)
                            fixed = float(cell)
                        except:
                            if cell != '':
                                count_curr += 1
                    for cell in next:
                        try:
                            cell, _ = self.value_extractor(cell)
                            fixed = float(cell)
                        except:
                            if cell != '':
                                count_next += 1
                    if count_next > count_curr:
                        curr_row = next
                    else:
                        row_index = 0
                        break
                row_header = []
                col_header = []
                for i in range(col_index + 1):
                    col_header.extend(table[i])
                for i in range(row_index + 1):
                    row_header.extend(trans_table[i])
                indexes = []
                curr = col_header[0]
                for i in range(len(col_header) - 1):
                    next = col_header[i + 1]
                    if curr == next:
                        indexes.append(i)
                        curr = next
                    else:
                        curr = next
                for i in reversed(indexes):
                    col_header.pop(i)
                indexes = []
                curr = row_header[0]
                for i in range(len(row_header) - 1):
                    next = row_header[i + 1]
                    if curr == next:
                        indexes.append(i)
                        curr = next
                    else:
                        curr = next
                for i in reversed(indexes):
                    row_header.pop(i)
                all_col_headers.append(col_header)
                all_row_headers.append(row_header)
                all_col_indexes.append(col_index)
                all_row_indexes.append(row_index)
            except IndexError as e:
                self.log_wp.print_log("FAILURE: Index self.get_headers table #" + str(num) + " from paper " + str(doi))
                self.log_wp.print_log('IndexError in get headers')
                self.log_wp.print_log(str(e))
                tb = sys.exc_info()[-1]
                self.log_wp.print_log(str(traceback.extract_tb(tb, limit=1)[-1][1]))
        return all_col_headers, all_row_headers, all_col_indexes, all_row_indexes

    def load_embeddings(self, file_loc=None):
        if file_loc is None:
            self.log_wp.print_log('Need to specify path to word embedding model')
            self.log_wp.print_log('Materials science training word2vec and fasttext are available for download')
            self.log_wp.print_log('Check the read-me')
        else:
            embeddings = keyedvectors.KeyedVectors.load(file_loc)
            # embeddings.bucket = 2000000
            emb_vocab_ft = dict([('<null>', 0), ('<oov>', 1)] +
                                [(k, v.index + 2) for k, v in embeddings.vocab.items()])
            emb_weights_ft = np.vstack([np.zeros((1, 100)), np.ones((1, 100)), np.array(embeddings.syn0)])

    def _normalize_string(self, string):
        ret_string = ''
        for char in string:
            if re.match(u'[Α-Ωα-ωÅ]', char) is not None:
                ret_string += str(char)
            else:
                ret_string += str(unidecode_expect_nonascii(str(char)))
        return ret_string
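    # Example (hedged): Greek letters and the angstrom sign survive, everything
    # else is transliterated to ASCII, e.g. _normalize_string('α–phase') -> 'α-phase'.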

    def construct_table_object(self, doi, table, row_ind, col_ind):
        new_table = Table()
        new_table['act_table'] = table
        mat_trans_table = np.array(table).T.tolist()
        mat_table = np.array(table).tolist()
        error_file = []
        try:
            for i, c in enumerate(mat_table[col_ind][(row_ind + 1):]):
                entity = Entity()
                entity['name'] = str(c)
                entity['descriptor'] = str(mat_table[col_ind][row_ind])
                if col_ind > 0:
                    for j in range(col_ind):
                        link = Link()
                        link['name'] = str(mat_table[col_ind - j - 1][i + 1])
                        if link['name'] != entity['name']:
                            entity['links'].append(link)
                for j, r in enumerate(mat_trans_table[row_ind][(col_ind + 1):]):
                    attr = Attribute()
                    try:
                        # unit_regex is assumed to be a module-level pattern such as re.compile(r'\[\S+\]')
                        potential_units = unit_regex.search(r).group(0)[1:-1]
                        found_units = [u for u in self.list_of_units if u in potential_units]
                        if len(found_units) > 0:
                            attr['unit'] = found_units[0]
                    except:
                        pass
                    attr['name'] = str(r)
                    if row_ind > 0:
                        for k in range(row_ind):
                            link = Link()
                            link['name'] = str(mat_trans_table[row_ind - k - 1][j + 1])
                            if link['name'] != attr['name']:
                                attr['links'].append(link)
                    val, unit = self.value_extractor(str(mat_table[row_ind + j + 1][i + 1]))
                    if type(val) == float:
                        attr['value'] = val
                    else:
                        attr['string_value'] = val
                    if unit is not None:  # overwrites previous unit
                        attr['unit'] = unit
                    entity['attributes'].append(attr)
                new_table['entities'].append(entity)
            return new_table, set(error_file)
        except IndexError as e:
            self.log_wp.print_log("FAILURE: Index construct_table table from paper " + str(doi))
            self.log_wp.print_log('IndexError in construct object')
            self.log_wp.print_log(str(e))
            error_file.append(str(doi))
            return new_table, set(error_file)

    def print_table_object(self, table):
        for ent in table['entities']:
            self.log_wp.print_log('Ent:', ent['name'])
            self.log_wp.print_log('Links:')
            for link in ent['links']:
                self.log_wp.print_log(link['name'])
            self.log_wp.print_log('Attr:')
            for att in ent['attributes']:
                self.log_wp.print_log(att['name'])
                self.log_wp.print_log(att['value'])
                for link in att['links']:
                    self.log_wp.print_log(link['name'])
            self.log_wp.print_log('-------')
        self.log_wp.print_log('--------------')

    def value_extractor(self, string):
        original_string = string[:]
        extracted_unit = None
        balance_syn = ['balance', 'bal', 'bal.', 'other.', 'other']
        if string.lower() in balance_syn:
            return 'balance', extracted_unit

        units = [u for u in self.list_of_units if u in string]
        if units:
            extracted_unit = max(units)
            string = string.replace(extracted_unit, '')

        # e.g. already in int or float form: 12.5 -> 12.5
        try:
            return float(string), extracted_unit
        except:
            pass

        # e.g. 12.5 - 13.5 -> 13.0
        range_regex = re.compile(r'\d+\.?\d*\s*-\s*\d+\.?\d*')
        try:
            ranges = range_regex.search(string).group().split('-')
            average = (float(ranges[0]) + float(ranges[1])) / 2.0
            return average, extracted_unit
        except:
            pass

        # e.g. 12.2 (5.2) -> 12.2
        bracket_regex = re.compile(r'(\d+\.?\d*)\s*\(\d*\.?\d*\)')
        try:
            extracted_value = float(bracket_regex.search(string).group(1))
            return float(extracted_value), extracted_unit
        except:
            pass

        # e.g. 12.3 ± 0.5 -> 12.3
        plusmin_regex = re.compile(r'(\d+\.?\d*)(\s*[±+-]+\s*\d+\.?\d*)')
        try:
            extracted_value = float(plusmin_regex.search(string).group(1))
            return extracted_value, extracted_unit
        except AttributeError:
            pass

        # e.g. <0.05 -> 0.05  |  >72.0 -> 72.0    | ~12 -> 12
        lessthan_roughly_regex = re.compile(r'([<]|[~]|[>])=?\s*\d+\.*\d*')
        try:
            extracted_value = lessthan_roughly_regex.search(string).group()
            num_regex = re.compile(r'\d+\.*\d*')
            extracted_value = num_regex.search(extracted_value).group()
            return float(extracted_value), extracted_unit
        except:
            pass

        # e.g. 0.4:0.6 (ratios)
        if ':' in string:
            split = string.split(":")
            try:
                extracted_value = round(float(split[0]) / float(split[1]), 3)
                return extracted_value, extracted_unit
            except:
                pass
        return original_string, None
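    # Hedged examples (assuming 'wt.%' is in self.list_of_units):
    #   value_extractor('12.5 wt.%')    -> (12.5, 'wt.%')
    #   value_extractor('12.5 - 13.5')  -> (13.0, None)
    #   value_extractor('12.2 (5.2)')   -> (12.2, None)
    #   value_extractor('12.3 ± 0.5')   -> (12.3, None)
    #   value_extractor('balance')      -> ('balance', None)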

    def load_composition_elements(self, domain=None):
        # Compositional elements to help in correctly identifying the orientation of tables in specific domains
        if domain == 'geopolymers':
            material_constituents = ['Al2O3', 'SiO2']
            constituent_threshold = 2
            remaining = None
        elif domain == 'steel':
            material_constituents = ['Fe', 'Cr', 'Cu', 'C', 'Ti', 'Ni', 'Mo', 'Mn']
            constituent_threshold = 4
            remaining = ['Fe']
        elif domain == 'titanium':
            material_constituents = ['Ti', 'Fe', 'C']
            constituent_threshold = 2
            remaining = ['Fe']
        elif domain == 'zeolites':
            material_constituents = (
                ['Si/Ge', 'DMAP/T', 'HF/T', 'H2O/T', '(Si + Ge)/Al', 'SiO2', 'GeO2', 'SDA', 'HF', 'H2O', 'Ge', 'Si',
                 'SiO2/Al2O3', 'Si/Al',
                 'R(OH)2/Si', 'F-/Si', '(Si + Ge)/Zr', 'Al', 'SDA/Si', 'H2O/Si', 'OH/Si', 'Si/H2O', 'Si/OH', 'Ge/Si',
                 'Si/Ti',
                 'MeO',
                 'SiO2/GeO2', 'TMHDA', 'TMEDA', 'TEOS', 'NH4F', 'Al/T', 'N,N-Diethylethylenediamine', 'NaGaGeO4',
                 'NaGaO2',
                 'Na2GeO3*H2O',
                 'SOD', 'NaNO2', 'NaOH'])
            constituent_threshold = 2
            remaining = None
        elif domain == 'aluminum':
            material_constituents = ['Al', 'Cu', 'Mn', 'Si', 'O', 'Mg']
            constituent_threshold = 2
            remaining = None
        elif domain == 'alloys':
            material_constituents = ['Ag', 'Al', 'Ar', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Br', 'C', 'Ca', 'Cd', 'Ce',
                                     'Cl', 'Co', 'Cr', 'Cs', 'Cu', 'Dy',
                                     'Er', 'Eu', 'F', 'Fe', 'Ga', 'Gd', 'Ge', 'H', 'Hf', 'Hg', 'Ho', 'I', 'In', 'Ir',
                                     'K', 'La', 'Li', 'Lu', 'Md', 'Mg',
                                     'Mn', 'Mo', 'N', 'Na', 'Nb', 'Nd', 'Ni', 'O', 'Os', 'P', 'Pb', 'Pd', 'Pr', 'Pt',
                                     'Rb', 'Re', 'Rh', 'Ru', 'S', 'Sb',
                                     'Sc', 'Se', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U',
                                     'V', 'W', 'Y', 'Yb', 'Zn', 'Zr']
            constituent_threshold = 2
            remaining = ['Fe', 'Al', 'Ti']

    def set_balance(self, entity, balance_pos, cumsum):
        if cumsum < 1:
            entity['attributes'][balance_pos]['value'] = 1.0 - cumsum
        else:
            entity['attributes'][balance_pos]['value'] = 100.0 - cumsum
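    # Example (hedged): with compositions given as fractions, cumsum=0.97 fills
    # the balance cell with 0.03; given as percentages, cumsum=97.5 fills 2.5.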

    def get_links(self, entity):
        list_of_names = []
        for attr in entity['attributes']:
            list_of_names.append(attr['name'])
        if len(set(list_of_names)) < 3:
            for attr in entity['attributes']:
                if len(attr['links']) > 0:
                    swapped = attr['name']
                    attr['name'] = attr['links'][0]['name']
                    attr['links'][0]['name'] = swapped

    def check_if_balanced(self, cumsum):
        if cumsum > 1:
            return 100 - cumsum < 1.5
        return 1 - cumsum < 0.015
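    # Example (hedged): check_if_balanced(99.2) -> True  (100 - 99.2 = 0.8 < 1.5)
    #                   check_if_balanced(0.9)  -> False (1 - 0.9 = 0.1 >= 0.015)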

    def _search_for_reference(self, soup, format):
        if format == 'html':
            ref = soup.find_all('a')
            tags = []
            if len(ref) == 0:
                text = soup.text
                refs = re.findall(r'\[\D\]', text)
                if len(refs) == 0:
                    return soup, tags
                else:
                    text = re.split(r'\[\D\]', text)
                    text = ''.join(text)
                    soup.string = text
                    return soup, refs
            else:
                for r in ref:
                    tag = soup.a.extract()
                    tags.append(tag.text)
                return soup, tags
        elif format == 'xml':
            ref = soup.find_all('xref')
            tags = []
            if len(ref) == 0:
                if soup.name == 'caption':
                    return soup, tags
                ref = soup.find_all('sup')
                for r in ref:
                    text = r.text.split(',')
                    for t in text:
                        if len(t) == 1 and t.isalpha():
                            tags.append(t)
                            soup.sup.decompose()
                return soup, tags
            else:
                for r in ref:
                    if len(r.text) < 4:
                        tag = soup.xref.extract()
                        tags.append(tag.text)
                return soup, tags
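# Hedged usage sketch (not part of the original source; paths and DOI are placeholders):
#   extractor = TableExtractorModifiedtoalloy('./xml_papers', './extracted', './dictionary.ini')
#   tables, captions = extractor.get_xml_tables('10.1016/<suffix>', './xml_papers/10.1016-<suffix>.xml')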
Example #4
class TableExtraction:
    def __init__(self, excels_path, c_path, prop_name='solvus'):
        self.excels_path = excels_path
        self.c_path = c_path
        self.prop_name = prop_name
        self.dict_info = Dictionary(self.c_path)
        self.ele_list = self.dict_info.ele_list
        self.e_pattern = self.dict_info.table_e_pattern
        self.ratio_pattern = self.dict_info.table_ratio_pattern
        self.prop_pattern = self.dict_info.table_prop_pattern
        self.unit_pattern = self.dict_info.unit_pattern_table
        self.number_pattern = self.dict_info.table_number_pattern
        self.ele_to_abr = self.dict_info.ele_to_abr
        self.prop_pattern_words = self.dict_info.table_prop_pattern_words
        self.log_wp = LogWp()

    def composition_triple_extraction(self):
        file_list = os.listdir(self.excels_path)
        composition_all = {}
        for excel_path in file_list:
            try:
                file = xlrd.open_workbook(self.excels_path + '/' + excel_path)
                all_material = []
                for sheet_i in range(len(file.sheets())):
                    try:
                        sheet = file.sheet_by_index(sheet_i)
                        topic = sheet.row_values(1)[0]
                        if 'composition' in topic.lower():
                            target_ele_row = []
                            target_ele_col = []
                            search_outcome = []
                            ele_loc = None
                            for line_index in range(2, len(sheet.col_values(0))):
                                search_line = sheet.row_values(line_index)
                                unit_i = 0
                                for unit in search_line:
                                    outcome = re.findall(self.e_pattern, str(unit))
                                    if outcome and str(unit) in self.ele_list:
                                        target_ele_row.append(line_index)
                                        target_ele_col.append(unit_i)
                                        search_outcome.append(unit)
                                    unit_i += 1
                                if search_outcome:
                                    ele_loc = line_index
                                    break
                            if ele_loc:
                                dict_info = Dictionary(self.c_path)
                                alloy_replace = dict_info.table_alloy_to_replace
                                alloy_common_type = dict_info.alloy_writing_type
                                alloy_blank_type = dict_info.alloy_blank_type
                                for alloy_model, replace in alloy_replace.items():
                                    alloy_part = re.findall(alloy_model, str(topic))
                                    for alloy in alloy_part:
                                        find_part = re.findall(replace[0], str(alloy))
                                        alloy_out = alloy.replace(find_part[0], replace[1])
                                        topic = topic.replace(alloy, alloy_out)
                                    outcome_name = list()
                                    topic_tokenize = nltk.word_tokenize(topic)
                                    for word in topic_tokenize:
                                        for pattern_1 in alloy_common_type:
                                            outcome_common = re.findall(pattern_1, str(word))
                                            if outcome_common:
                                                outcome_name.append(word)
                                                break
                                    for pattern_2 in alloy_blank_type:
                                        outcome_blank = re.findall(pattern_2, str(topic))
                                        if outcome_blank and outcome_blank[0] not in outcome_name:
                                            outcome_name.append(outcome_blank[0])
                                            break
                                len_col = len(sheet.row_values(3))
                                alloy_name_col = None
                                alloy_name_search = []
                                if len_col <= 3:
                                    for col_i in range(len_col):
                                        col_info = sheet.col_values(col_i)
                                        if col_i == 0:
                                            col_info = sheet.col_values(col_i)[2:]
                                        if col_info:
                                            for cell in col_info:
                                                for pattern_1 in alloy_common_type:
                                                    outcome_common = re.findall(pattern_1, str(cell))
                                                    if outcome_common:
                                                        alloy_name_col = col_i
                                                        alloy_name_search.append(col_i)
                                                for pattern_2 in alloy_blank_type:
                                                    outcome_blank = re.findall(pattern_2, str(cell))
                                                    if outcome_blank:
                                                        alloy_name_col = col_i
                                                        alloy_name_search.append(col_i)
                                else:
                                    for col_i in range(3):
                                        col_info = sheet.col_values(col_i)
                                        if col_i == 0:
                                            col_info = sheet.col_values(col_i)[2:]
                                        if col_info:
                                            for cell in col_info:
                                                for pattern_1 in alloy_common_type:
                                                    outcome_common = re.findall(pattern_1, str(cell))
                                                    if outcome_common:
                                                        alloy_name_col = col_i
                                                        alloy_name_search.append(col_i)
                                                for pattern_2 in alloy_blank_type:
                                                    outcome_blank = re.findall(pattern_2, str(cell))
                                                    if outcome_blank:
                                                        alloy_name_col = col_i
                                                        alloy_name_search.append(col_i)
                                if not alloy_name_search:
                                    alloy_name_col = 0
                                else:
                                    alloy_name_col = alloy_name_search[0]
                                first_col = sheet.col_values(0)
                                ele_first = []
                                for unit in first_col:
                                    firstcol_search = re.findall(self.e_pattern, str(unit))
                                    if firstcol_search:
                                        ele_first.append(unit)
                                if len(ele_first) <= 2:
                                    if len(first_col) > 4:
                                        e_search = re.findall(self.e_pattern, str(sheet.col_values(0)[ele_loc]))
                                        if e_search and outcome_name and len(outcome_name) == 1:
                                            for index_row in range(ele_loc + 1, len(first_col)):
                                                composition_single = {}
                                                composition_single['material'] = outcome_name[0].replace('~', ' ')
                                                composition_single['doi'] = first_col[0]
                                                ratio_find_topic = re.findall(self.ratio_pattern, str(topic))
                                                ratio_find_col = re.findall(self.ratio_pattern,
                                                                            str(first_col[index_row]))
                                                for table_head in sheet.row_values(2):
                                                    ratio_find_head = re.findall(self.ratio_pattern, str(table_head))
                                                    if ratio_find_head:
                                                        composition_single['percentage'] = ratio_find_head[0]
                                                        break
                                                if ratio_find_topic:
                                                    composition_single['percentage'] = ratio_find_topic[0]
                                                elif ratio_find_col:
                                                    composition_single['percentage'] = ratio_find_col[0]
                                                for ele_index in range(len(sheet.row_values(2))):
                                                    ele_name = sheet.row_values(ele_loc)[ele_index]
                                                    if ele_name in tuple(self.ele_to_abr.keys()):
                                                        ele_name = self.ele_to_abr[ele_name]
                                                    number = sheet.row_values(index_row)[ele_index]
                                                    composition_single[ele_name] = number
                                                all_material.append(composition_single)
                                        if not e_search:
                                            for index_row in range(ele_loc + 1, len(first_col)):
                                                if first_col[index_row]:
                                                    composition_single = {}
                                                    name_col = sheet.col_values(alloy_name_col)
                                                    if outcome_name and len(outcome_name) == 1 and not alloy_name_search:
                                                        composition_single['material'] = outcome_name[0].replace('~', ' ')
                                                    else:
                                                        composition_single['material'] = name_col[index_row]
                                                    composition_single['doi'] = first_col[0]
                                                    ratio_find_topic = re.findall(self.ratio_pattern, str(topic))
                                                    ratio_find_col = re.findall(self.ratio_pattern,
                                                                                str(first_col[index_row]))
                                                    for table_head in sheet.row_values(2):
                                                        ratio_find_head = re.findall(self.ratio_pattern,
                                                                                     str(table_head))
                                                        if ratio_find_head:
                                                            composition_single['percentage'] = ratio_find_head[0]
                                                            break
                                                    if ratio_find_topic:
                                                        composition_single['percentage'] = ratio_find_topic[0]
                                                    elif ratio_find_col:
                                                        composition_single['percentage'] = ratio_find_col[0]
                                                    ratio_find_unit = re.findall(self.ratio_pattern,
                                                                                 str(first_col[index_row]))
                                                    if ratio_find_unit:
                                                        composition_single['percentage'] = ratio_find_unit[0]
                                                    for ele_index in range(len(sheet.row_values(ele_loc)[1:])):
                                                        ele_name = sheet.row_values(ele_loc)[1:][ele_index]
                                                        if ele_name in tuple(self.ele_to_abr.keys()):
                                                            ele_name = self.ele_to_abr[ele_name]
                                                        number = sheet.row_values(index_row)[ele_index + 1]
                                                        composition_single[ele_name] = number
                                                    all_material.append(composition_single)
                                    else:
                                        composition_single = {}
                                        first_col_1 = sheet.row_values(3)[0]
                                        e_search = re.findall(self.e_pattern, str(sheet.col_values(0)[ele_loc]))
                                        ratio_find_col = re.findall(self.ratio_pattern, str(first_col_1))
                                        for table_head in sheet.row_values(2):
                                            ratio_find_head = re.findall(self.ratio_pattern, str(table_head))
                                            if ratio_find_head:
                                                composition_single['percentage'] = ratio_find_head[0]
                                                break
                                        if ratio_find_col:
                                            composition_single['percentage'] = ratio_find_col[0]
                                        ratio_find_topic = re.findall(self.ratio_pattern, str(topic))
                                        if ratio_find_topic:
                                            composition_single['percentage'] = ratio_find_topic[0]
                                        if outcome_name and e_search:
                                            composition_single['material'] = outcome_name[0].replace('~', ' ')
                                            composition_single['doi'] = first_col[0]
                                            for ele_index in range(len(sheet.row_values(2))):
                                                ele_name = sheet.row_values(ele_loc)[ele_index]
                                                number = sheet.row_values(3)[ele_index]
                                                if ele_name in tuple(self.ele_to_abr.keys()):
                                                    ele_name = self.ele_to_abr[ele_name]
                                                composition_single[ele_name] = number
                                            all_material.append(composition_single)
                                        elif outcome_name and not e_search:
                                            if len(outcome_name) == 1:
                                                composition_single['material'] = outcome_name[0].replace('~', ' ')
                                            else:
                                                composition_single['material'] = sheet.row_values(ele_loc + 1)[
                                                    alloy_name_col]
                                            composition_single['doi'] = first_col[0]
                                            for ele_index in range(len(sheet.row_values(2)[1:])):
                                                ele_name = sheet.row_values(ele_loc)[1:][ele_index]
                                                number = sheet.row_values(3)[1:][ele_index]
                                                if ele_name in tuple(self.ele_to_abr.keys()):
                                                    ele_name = self.ele_to_abr[ele_name]
                                                composition_single[ele_name] = number
                                            all_material.append(composition_single)
                                        elif not outcome_name and not e_search:
                                            composition_single['material'] = sheet.row_values(ele_loc + 1)[
                                                alloy_name_col]
                                            composition_single['doi'] = first_col[0]
                                            m_name = sheet.row_values(ele_loc)[0]
                                            composition_single[m_name] = first_col[3]
                                            for ele_index in range(len(sheet.row_values(2)[1:])):
                                                ele_name = sheet.row_values(ele_loc)[1:][ele_index]
                                                number = sheet.row_values(3)[1:][ele_index]
                                                if ele_name in tuple(self.ele_to_abr.keys()):
                                                    ele_name = self.ele_to_abr[ele_name]
                                                composition_single[ele_name] = number
                                            all_material.append(composition_single)
                                        elif not outcome_name and e_search:
                                            composition_single['material'] = None
                                            composition_single['doi'] = first_col[0]
                                            for ele_index in range(len(sheet.row_values(2))):
                                                ele_name = sheet.row_values(ele_loc)[ele_index]
                                                number = sheet.row_values(3)[ele_index]
                                                if ele_name in tuple(self.ele_to_abr.keys()):
                                                    ele_name = self.ele_to_abr[ele_name]
                                                composition_single[ele_name] = number
                                            all_material.append(composition_single)
                                else:
                                    ele_row = sheet.row_values(ele_loc - 1)
                                    len_elerow = len(ele_row)
                                    for index_col in range(1, len_elerow):
                                        if ele_row[index_col]:
                                            composition_single = {}
                                            if outcome_name and len(outcome_name) == 1 and len_elerow <= 2:
                                                material_name = outcome_name[0].replace('~', ' ')
                                            else:
                                                material_name = ele_row[index_col]
                                            composition_single['material'] = material_name
                                            composition_single['doi'] = first_col[0]
                                            ratio_find_topic = re.findall(self.ratio_pattern, str(topic))
                                            ratio_find_col = re.findall(self.ratio_pattern, str(material_name))
                                            if ratio_find_topic:
                                                composition_single['percentage'] = ratio_find_topic[0]
                                            elif ratio_find_col:
                                                composition_single['percentage'] = ratio_find_col[0]
                                            for ele_index in range(len(sheet.col_values(0)[ele_loc:])):
                                                ele_name = sheet.col_values(0)[ele_loc:][ele_index]
                                                number = sheet.col_values(index_col)[ele_loc + ele_index]
                                                if ele_name in tuple(self.ele_to_abr.keys()):
                                                    ele_name = self.ele_to_abr[ele_name]
                                                composition_single[ele_name] = number
                                            all_material.append(composition_single)
                        if all_material:
                            break
                    except Exception as e:
                        self.log_wp.print_log("%s", str(e))
                        self.log_wp.print_log("An error in the %s of %s!", sheet_i, excel_path)
                if all_material:
                    composition_all[excel_path] = all_material
            except Exception as e:
                self.log_wp.print_log("can't open this file, name of file is %s", str(excel_path))
                self.log_wp.print_log("Error is %s", str(e))
                self.log_wp.print_log("%s", "--" * 25)
        return composition_all
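
    # property_info_extraction (below) mirrors the composition pass above: it walks
    # every sheet of every workbook under self.excels_path, flags cells that match
    # the property patterns (self.prop_pattern / self.prop_pattern_words), and pairs
    # each hit with a material name, a unit, and the source DOI read from the sheet.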

    def property_info_extraction(self):
        file_list = os.listdir(self.excels_path)
        property_all = {}
        number_prop = 0
        K_path = []
        for excel_path in file_list:
            try:
                file = xlrd.open_workbook(self.excels_path + '/' + excel_path)
                all_material = []
                for sheet_i in range(len(file.sheets())):
                    try:
                        sheet = file.sheet_by_index(sheet_i)
                        topic = sheet.row_values(1)[0]
                        search_outcome = []
                        target_prop_row = []
                        target_prop_col = []
                        for line_index in range(2, len(sheet.col_values(0))):
                            search_line = sheet.row_values(line_index)[1:]
                            unit_i = 1
                            for unit in search_line:
                                outcome_words = None
                                for pattern in self.prop_pattern[self.prop_name]:
                                    outcome = re.findall(pattern, str(unit))
                                    if all(word in str(unit) for word in self.prop_pattern_words[self.prop_name]):
                                        outcome_words = unit
                                    if outcome:
                                        break
                                if outcome or outcome_words:
                                    target_prop_row.append(line_index)

                                    target_prop_col.append(unit_i)
                                    search_outcome.append(unit)
                                unit_i += 1
                        if any(search_outcome):
                            first_col = sheet.col_values(0)
                            alloy_replace = Dictionary(self.c_path).table_alloy_to_replace
                            for alloy_model, replace in alloy_replace.items():
                                alloy_part = re.findall(alloy_model, str(topic))
                                for alloy in alloy_part:
                                    find_part = re.findall(replace[0], str(alloy))
                                    alloy_out = alloy.replace(find_part[0], replace[1])
                                    topic = topic.replace(alloy, alloy_out)
                            alloy_common_type = Dictionary(self.c_path).alloy_writing_type
                            alloy_blank_type = Dictionary(self.c_path).alloy_blank_type
                            outcome_name = []
                            topic_tokenize = nltk.word_tokenize(topic)
                            for word in topic_tokenize:
                                for pattern_1 in alloy_common_type:
                                    outcome_common = re.findall(pattern_1, str(word))
                                    if outcome_common:
                                        outcome_name.append(word)
                                        break
                            for pattern_2 in alloy_blank_type:
                                outcome_blank = re.findall(pattern_2, str(topic))
                                if outcome_blank:
                                    outcome_name.append(outcome_blank[0])
                                    break
                            fc_ns = []
                            for cell in sheet.col_values(0)[1:]:
                                fc_n = re.findall(self.number_pattern[self.prop_name], str(cell))
                                alphabet_search = re.findall("[A-Za-z]", str(cell))
                                if fc_n and not alphabet_search:
                                    fc_ns.append(cell)
                            len_col = len(sheet.row_values(3))
                            alloy_name_col = None
                            alloy_name_search = []
                            if len_col <= 3:
                                for col_i in range(len_col):
                                    col_info = sheet.col_values(col_i)
                                    if col_i == 0:
                                        col_info = sheet.col_values(col_i)[2:]
                                    if col_info:
                                        for cell in col_info:
                                            for pattern_1 in alloy_common_type:
                                                outcome_common = re.findall(pattern_1, str(cell))
                                                if outcome_common:
                                                    alloy_name_col = col_i
                                                    alloy_name_search.append(col_i)
                                            for pattern_2 in alloy_blank_type:
                                                outcome_blank = re.findall(pattern_2, str(cell))
                                                if outcome_blank:
                                                    alloy_name_col = col_i
                                                    alloy_name_search.append(col_i)
                            else:
                                for col_i in range(3):
                                    col_info = sheet.col_values(col_i)
                                    if col_i == 0:
                                        col_info = sheet.col_values(col_i)[2:]
                                    if col_info:
                                        for cell in col_info:
                                            for pattern_1 in alloy_common_type:
                                                outcome_common = re.findall(pattern_1, str(cell))
                                                if outcome_common:
                                                    alloy_name_col = col_i
                                                    alloy_name_search.append(col_i)
                                            for pattern_2 in alloy_blank_type:
                                                outcome_blank = re.findall(pattern_2, str(cell))
                                                if outcome_blank:
                                                    alloy_name_col = col_i
                                                    alloy_name_search.append(col_i)
                            if not alloy_name_search:
                                alloy_name_col = 0
                            else:
                                alloy_name_col = alloy_name_search[0]
                            if len(first_col) > 4:
                                for prop_i in range(len(target_prop_row)):
                                    sub_label = []
                                    curr_col = []
                                    for index_row in range(target_prop_row[prop_i] + 1, len(first_col)):
                                        unit_search_parts = []
                                        unit_search_parts.append(topic)
                                        if len(fc_ns) == 0:
                                            name_col = sheet.col_values(alloy_name_col)
                                            material_name = name_col[index_row]
                                            property_single = {}
                                            number = sheet.row_values(index_row)[target_prop_col[prop_i]]
                                            number_inspect = re.findall(self.number_pattern[self.prop_name],
                                                                        str(number))
                                            prop_name = sheet.row_values(target_prop_row[prop_i])[
                                                target_prop_col[prop_i]]
                                            unit_search_parts.append(first_col[index_row])
                                            unit_search_parts.append(number)
                                            for unit in sheet.row_values(target_prop_row[0]):
                                                unit_search_parts.append(unit)
                                            for row_s in range(2, target_prop_row[prop_i] + 1):
                                                unit_search_parts.append(
                                                    sheet.row_values(row_s)[target_prop_col[prop_i]])
                                            if number_inspect:
                                                one_info = {}
                                                for prop_index in range(len(sheet.row_values(target_prop_row[prop_i]))):
                                                    prop_name_line = sheet.row_values(target_prop_row[prop_i])[
                                                        prop_index]
                                                    number_line_line = sheet.row_values(index_row)[prop_index]
                                                    one_info[prop_name_line] = number_line_line
                                                curr_col.append(number)
                                                property_single[prop_name] = number
                                                property_single['other_info'] = one_info
                                                property_single['material'] = material_name
                                                property_single['doi'] = first_col[0]
                                                if sub_label:
                                                    property_single['child_tag'] = sub_label
                                                for item in unit_search_parts:
                                                    unit_find = re.findall(self.unit_pattern[self.prop_name], str(item))
                                                    if unit_find:
                                                        property_single['unit'] = unit_find[0].replace('degC', '°C')
                                                        K_path.append(excel_path)
                                                        break
                                                if 'unit' not in property_single.keys():
                                                    property_single['unit'] = 'not mentioned'
                                            elif not number_inspect and len(curr_col) != 0:
                                                property_single['material'] = material_name
                                                property_single['doi'] = first_col[0]
                                                property_single[prop_name] = number
                                                if sub_label:
                                                    property_single['child_tag'] = sub_label
                                                for item in unit_search_parts:
                                                    unit_find = re.findall(self.unit_pattern[self.prop_name], str(item))
                                                    if unit_find:
                                                        property_single['unit'] = unit_find[0].replace('degC', '°C')
                                                        K_path.append(excel_path)
                                                        break
                                                if 'unit' not in property_single.keys():
                                                    property_single['unit'] = 'not mentioned'
                                            elif not number_inspect and len(curr_col) == 0:
                                                if number and not property_single:
                                                    if number != '-' and number != '--':
                                                        sub_label.append(number)
                                            if property_single:
                                                property_single['table_topic'] = first_col[1]
                                                all_material.append(property_single)
                                        if first_col[index_row] and len(fc_ns) != 0 and len(outcome_name) == 1:
                                            material_name = outcome_name[0].replace('~', ' ')
                                            property_single = {}
                                            unit_search_parts.append(first_col[index_row])
                                            for row_s in range(2, target_prop_row[prop_i] + 1):
                                                unit_search_parts.append(
                                                    sheet.row_values(row_s)[target_prop_col[prop_i]])
                                            prop_name = sheet.row_values(target_prop_row[prop_i])[
                                                target_prop_col[prop_i]]
                                            number = sheet.row_values(index_row)[target_prop_col[prop_i]]
                                            number_inspect = re.findall(self.number_pattern[self.prop_name],
                                                                        str(number))
                                            unit_search_parts.append(number)
                                            if number_inspect:
                                                property_single[prop_name] = number
                                                property_single['material'] = material_name
                                                property_single['doi'] = first_col[0]
                                                for item in unit_search_parts:
                                                    unit_find = re.findall(self.unit_pattern[self.prop_name], str(item))
                                                    if unit_find:
                                                        property_single['unit'] = unit_find[0].replace('degC', '°C')
                                                        K_path.append(excel_path)
                                                        break
                                                if 'unit' not in property_single.keys():
                                                    property_single['unit'] = 'not mentioned'
                                            elif not number_inspect and len(curr_col) != 0:
                                                property_single['material'] = material_name
                                                property_single['doi'] = first_col[0]
                                                property_single[prop_name] = number
                                                if sub_label:
                                                    property_single['child_tag'] = sub_label
                                                for item in unit_search_parts:
                                                    unit_find = re.findall(self.unit_pattern[self.prop_name], str(item))
                                                    if unit_find:
                                                        property_single['unit'] = unit_find[0].replace('degC', '°C')
                                                        K_path.append(excel_path)
                                                        break
                                                if 'unit' not in property_single.keys():
                                                    property_single['unit'] = 'not mentioned'
                                            elif not number_inspect and len(curr_col) == 0:
                                                if number and not property_single:
                                                    sub_label.append(number)
                                            if property_single:
                                                property_single['table_topic'] = first_col[1]
                                                all_material.append(property_single)
                            else:
                                unit_search_parts = []
                                property_single = {}
                                property_single['table_topic'] = first_col[1]
                                alloy_replace = Dictionary(self.c_path).table_alloy_to_replace
                                for alloy_model, replace in alloy_replace.items():
                                    alloy_part = re.findall(alloy_model, str(topic))
                                    for alloy in alloy_part:
                                        find_part = re.findall(replace[0], str(alloy))
                                        alloy_out = alloy.replace(find_part[0], replace[1])
                                        topic = topic.replace(alloy, alloy_out)
                                alloy_common_type = Dictionary(self.c_path).alloy_writing_type
                                alloy_blank_type = Dictionary(self.c_path).alloy_blank_type
                                outcome_name = []
                                topic_tokenize = nltk.word_tokenize(topic)
                                for word in topic_tokenize:
                                    for pattern_1 in alloy_common_type:
                                        outcome_common = re.findall(pattern_1, str(word))
                                        if outcome_common:
                                            outcome_name.append(word)
                                            break
                                for pattern_2 in alloy_blank_type:
                                    outcome_blank = re.findall(pattern_2, str(topic))
                                    if outcome_blank and outcome_blank[0] not in outcome_name:
                                        outcome_name.append(outcome_blank[0])
                                        break
                                unit_search_parts.append(first_col[3])
                                unit_search_parts.append(topic)
                                for row_s in range(2, 4):
                                    for prop_i in range(len(target_prop_row)):
                                        unit_search_parts.append(sheet.row_values(row_s)[target_prop_col[prop_i]])
                                number_search = re.findall(self.number_pattern[self.prop_name],
                                                           str(sheet.col_values(0)[2]))
                                if outcome_name and number_search:
                                    for prop_i in range(len(target_prop_row)):
                                        property_single['material'] = outcome_name[0].replace('~', ' ')
                                        property_single['doi'] = first_col[0]
                                        number = sheet.row_values(3)[target_prop_col[prop_i]]
                                        unit_search_parts.append(number)
                                        for item in unit_search_parts:
                                            unit_find = re.findall(self.unit_pattern[self.prop_name], str(item))
                                            if unit_find:
                                                property_single['unit'] = unit_find[0].replace('degC', '°C')
                                                K_path.append(excel_path)
                                                break
                                        if 'unit' not in property_single.keys():
                                            property_single['unit'] = 'not mentioned'
                                        prop_name = sheet.row_values(target_prop_row[prop_i])[target_prop_col[prop_i]]
                                        property_single[prop_name] = number
                                        all_material.append(property_single)
                                elif not outcome_name and not number_search:
                                    for prop_i in range(len(target_prop_row)):
                                        property_single[sheet.col_values(2)[0]] = first_col[3]
                                        property_single['doi'] = first_col[0]
                                        number = sheet.row_values(3)[target_prop_col[prop_i]]
                                        unit_search_parts.append(number)
                                        for item in unit_search_parts:
                                            unit_find = re.findall(self.unit_pattern[self.prop_name], str(item))
                                            if unit_find:
                                                property_single['unit'] = unit_find[0].replace('degC', '°C')
                                                K_path.append(excel_path)
                                                break
                                        if 'unit' not in property_single.keys():
                                            property_single['unit'] = 'not mentioned'
                                        prop_name = sheet.row_values(target_prop_row[prop_i])[target_prop_col[prop_i]]
                                        property_single[prop_name] = number
                                        all_material.append(property_single)
                                elif not outcome_name and number_search:
                                    for prop_i in range(len(target_prop_row)):
                                        property_single['material'] = 'not mentioned'
                                        property_single['doi'] = first_col[0]
                                        number = sheet.row_values(3)[target_prop_col[prop_i]]
                                        unit_search_parts.append(number)
                                        for item in unit_search_parts:
                                            unit_find = re.findall(self.unit_pattern[self.prop_name], str(item))
                                            if unit_find:
                                                property_single['unit'] = unit_find[0].replace('degC', '°C')
                                                K_path.append(excel_path)
                                                break
                                        if 'unit' not in property_single.keys():
                                            property_single['unit'] = 'not mentioned'
                                        prop_name = sheet.row_values(target_prop_row[prop_i])[target_prop_col[prop_i]]
                                        property_single[prop_name] = number
                                        all_material.append(property_single)
                    except Exception as e:
                        self.log_wp.print_log("An error in file:%s-sheet:%s---%s!", excel_path, sheet_i, e)
                if all_material:
                    number_prop += 1
                    property_all[excel_path] = all_material
            except Exception as e:
                self.log_wp.print_log("can't open %s ", excel_path)
                self.log_wp.print_log("%s", str(e))
                self.log_wp.print_log("%s", "--" * 25)
        return property_all
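
# A minimal driver sketch for the extractor above (hedged: the class name
# `TableInfoExtraction` and the constructor arguments are assumptions, since
# __init__ sits outside this excerpt; only the attributes excels_path, c_path
# and prop_name are referenced by the methods). property_info_extraction()
# takes no arguments and returns {excel_file_name: [property_record, ...]}.
extractor = TableInfoExtraction(excels_path='xlsx_tables',    # hypothetical path
                                c_path='dictionary_config',   # hypothetical config path
                                prop_name='density')          # hypothetical property key
property_all = extractor.property_info_extraction()
for excel_file, records in property_all.items():
    for record in records:
        print(excel_file, record.get('material'), record.get('unit'))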
Example #5
0
class GetTInfoFromHtml:
    def __init__(self, html_path, output_path):
        self.html_path = html_path
        self.output_path = output_path
        self.log_wp = LogWp()

    def get_all_url(self, url):
        import urllib.request
        from bs4 import BeautifulSoup
        # Masquerade as a browser so the request is not blocked
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0)\
                    Gecko/20100101 Firefox/23.0'
        }
        req = urllib.request.Request(url=url, headers=headers)
        html = urllib.request.urlopen(req).read().decode("utf-8")
        soup = BeautifulSoup(html, features='html.parser')
        tags = soup.find_all('a')
        all_url = []
        for tag in tags:
            all_url.append(str(tag.get('href')).strip())

        return all_url

    def get_table_url(self, doi_info):
        all_url = self.get_all_url(doi_info[0])
        table_url = []
        for i in all_url:
            if "table" in i:
                if 'article' in i:
                    self.log_wp.print_log(str(i))
                    if doi_info[1] in "Springer":
                        table_url.append('https://link.springer.com' + i)
                    else:
                        table_url.append('https://www.nature.com' + i)
        if len(table_url) == 0:
            self.log_wp.print_log("There is no table url in this article!")
        self.log_wp.print_log(str(table_url))

        return table_url

    def doi_info(self, doi_str):
        doi_info = []
        if doi_str[0:7] in "10.1016":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "Elsevier"
        elif doi_str[0:7] in ["10.1007", "10.1361", "10.1023"]:
            doi_url = "https://link.springer.com/article/" + doi_str
            doi_database = "Springer"
        elif doi_str[0:7] in "10.1080":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "Taylor & Francis Online"
        elif doi_str[0:7] in ["10.1002", "10.1111"]:
            doi_url = "https://onlinelibrary.wiley.com/doi/" + doi_str
            doi_database = "Wiley Blackwell"
        elif doi_str[0:7] in "10.1115":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "ASME International"
        elif doi_str[0:7] in "10.3390":
            # resolve the MDPI landing-page redirect
            all_url = self.get_all_url("https://doi.org/" + doi_str)
            for url_str in all_url:
                if "htm" in url_str:
                    doi_url = "https://www.mdpi.com/" + url_str
                    break
            doi_database = "MDPI"
        elif doi_str[0:7] == "10.1038":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "Nature Publishing Group"
        else:
            doi_url = "other URL"
            doi_database = "other database"
        doi_info.append(doi_url)
        doi_info.append(doi_database)
        doi_info.append(doi_str)
        return doi_info
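        # e.g. a hypothetical Springer DOI '10.1007/abc-123' maps to
        # ['https://link.springer.com/article/10.1007/abc-123', 'Springer', '10.1007/abc-123']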

    def file_name(self, name):
        st = '\|/:?*<>;'
        for s in st:
            if s in name:
                name = name.replace(s, '-')

        return name
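        # e.g. file_name('10.1007/abc?') -> '10.1007-abc-'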

    def get_table(self, doi_info, path=r'table.xlsx'):
        table_name = []
        if doi_info[1] in ['Springer', 'Nature Publishing Group']:
            table_url = self.get_table_url(doi_info)
            if len(table_url) != 0:
                with pd.ExcelWriter(path) as writer:
                    # save each table into its own sheet
                    for p in range(len(table_url)):
                        time.sleep(1)
                        self.log_wp.print_log("Start crawling the page")
                        r = requests.get(table_url[p])
                        rt = r.text
                        # convert the HTML tables to dataframes (header=None, index_col=None)
                        try:
                            df = pd.read_html(rt)
                            self.log_wp.print_log("complete!")
                        except Exception as e:
                            self.log_wp.print_log('format of table ' + str(p) +
                                                  ' is PDF')
                            continue
                        # parse the table title
                        start = rt.find("<h1")
                        end = rt.rfind("</h1>")

                        title_str = ''
                        for i in range(start, end + 5):
                            title_str += rt[i]
                        title_start = title_str.find("Table")
                        title_end = title_str.find("</h1>")
                        title = ''
                        for j in range(title_start, title_end):
                            self.log_wp.print_log(str(title_str[j]))
                            title += title_str[j]
                        table_name.append(title)
                        # read the table title and write it into the first rows of the dataframe
                        table_te = []
                        # add the article's DOI
                        row_doi = [doi_info[2]]
                        for j in range(len(df[0].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        # add the title to the table
                        row_title = [title]
                        for j in range(len(df[0].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[0].columns))
                        for i in range(len(df[0])):
                            table_te.append(list(df[0].iloc[i]))
                        df[0] = pd.DataFrame(data=table_te)
                        # write to Excel
                        sheet_name = 'table' + str(p + 1)
                        # write to Excel without the index or column headers
                        df[0].to_excel(writer,
                                       sheet_name=sheet_name,
                                       header=None,
                                       index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this page: " +
                                      self.file_name(doi_info[0]))

        elif doi_info[1] in 'Taylor & Francis Online':

            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')  # parse the page with BeautifulSoup
            # scan the page and build the table_name list
            table_name = []
            for page1 in page.find_all('b'):
                name = page1.text
                if 'Table' in name:
                    table_name.append(name)
            # drop the duplicated tables
            del table_name[int(len(table_name) / 2):len(table_name)]

            # drop duplicated titles among the tables
            count = 0
            for t in table_name:
                if 'Table 1' in t:
                    count += 1
            if count > 1:
                del table_name[1:(len(table_name)):2]

            if len(table_name) != 0:
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(table_name)):
                        df = pd.read_html(rt)
                        # read the table title and write it into the first rows of the dataframe
                        table_te = []
                        # write the article's DOI into the first row of the table
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        # add the title to the table
                        row_title = [table_name[p]]
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        df = pd.DataFrame(data=table_te)
                        # write to Excel
                        sheet_name = 'table' + str(p + 1)
                        # write to Excel without the index or column headers
                        df.to_excel(writer,
                                    sheet_name=sheet_name,
                                    header=None,
                                    index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this page: " +
                                      self.file_name(doi_info[0]))

        elif doi_info[1] == 'MDPI':

            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')  # parse the page with BeautifulSoup
            table_name = []
            for page1 in page.find_all('caption'):
                name = page1.text
                name = name.replace('\n', '')  # strip redundant line breaks from the title
                table_name.append(name)
            self.log_wp.print_log(str(table_name))
            if len(table_name) != 0:
                with pd.ExcelWriter(path) as writer:
                    # crawl the HTML content
                    time.sleep(1)
                    self.log_wp.print_log("Start crawling the page")
                    r = requests.get(doi_info[0])
                    rt = r.text
                    # convert the HTML tables to dataframes (header=None, index_col=None)
                    df = pd.read_html(rt)
                    self.log_wp.print_log("complete!")
                    for p in range(len(table_name)):
                        # read the table title and write it into the first rows of the dataframe
                        table_te = []
                        # write the article's DOI into the first row of the table
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        # add the title to the table
                        row_title = [table_name[p]]
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        # write to Excel
                        sheet_name = 'table' + str(p + 1)
                        # write to Excel without the index or column headers
                        fa.to_excel(writer,
                                    sheet_name=sheet_name,
                                    header=None,
                                    index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this page: " +
                                      self.file_name(doi_info[0]))
        elif doi_info[1] in "ASME International":
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')  # parse the page with BeautifulSoup
            # convert the HTML tables to dataframes (header=None, index_col=None)
            table_name = []
            for page1 in page.find_all('div'):
                name = page1.text
                if 'Table' in name[0:5]:
                    if ' ' in name[-1]:
                        table_name.append(name)
            if len(table_name) != 0:
                df = pd.read_html(rt)
                self.log_wp.print_log("complete!")
                del df[0:len(df):2]
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(df)):
                        # read the table title and write it into the first rows of the dataframe
                        table_te = []
                        # write the article's DOI into the first row of the table
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        # add the title to the table
                        row_title = [table_name[p]]
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        # write to Excel
                        sheet_name = 'table' + str(p + 1)
                        # write to Excel without the index or column headers
                        fa.to_excel(writer,
                                    sheet_name=sheet_name,
                                    header=None,
                                    index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this page: " +
                                      doi_info[0])
        elif doi_info[1] in "Wiley Blackwell":

            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')  # parse the page with BeautifulSoup
            # convert the HTML tables to dataframes (header=None, index_col=None)
            table_name = []
            for page1 in page.find_all('header'):
                name = page1.text
                if 'Table' in name:
                    name = ' '.join(name.split())
                    table_name.append(name.replace('\n', ''))
            try:
                if len(table_name) != 0:
                    df = pd.read_html(rt)
                    with pd.ExcelWriter(path) as writer:
                        # crawl the HTML content
                        for p in range(len(df)):
                            # read the table title and write it into the first rows of the dataframe
                            table_te = []
                            # write the article's DOI into the first row of the table
                            row_doi = [doi_info[2]]
                            for j in range(len(df[p].columns) - 1):
                                row_doi.append('')
                            table_te.append(row_doi)
                            # add the title to the table
                            row_title = [table_name[p]]
                            for j in range(len(df[p].columns) - 1):
                                row_title.append('')
                            table_te.append(row_title)
                            table_te.append(list(df[p].columns))
                            for i in range(len(df[p])):
                                table_te.append(list(df[p].iloc[i]))
                            fa = pd.DataFrame(data=table_te)

                            # write to Excel
                            sheet_name = 'table' + str(p + 1)
                            # write to Excel without the index or column headers
                            fa.to_excel(writer,
                                        sheet_name=sheet_name,
                                        header=None,
                                        index=False)
                else:
                    self.log_wp.print_log(" Cannot find table in this page: " +
                                          doi_info[0])
            except Exception as e:
                self.log_wp.print_log(" Cannot find table in this page: " +
                                      doi_info[0])
        else:
            self.log_wp.print_log("Please try other function!")

        return table_name

    def get_rt(self, url):
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
        }
        time.sleep(1)
        self.log_wp.print_log("Start crawling the page")
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        rt = r.text
        self.log_wp.print_log("complete!")

        return rt

    def load_doi(self, path):
        import xlrd
        data = xlrd.open_workbook(path)
        table = data.sheet_by_index(0)
        # get the total number of rows in the sheet
        nrows = table.nrows
        doi_li = []
        # DOIs sit in the first column of the Excel file, with no header row
        for row in range(nrows):
            table_doi = table.row_values(row, start_colx=0, end_colx=None)[0]
            doi_li.append(table_doi)

        return doi_li

    def get_html(self, url):
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
        }
        req = urllib.request.Request(url=url, headers=headers)
        html = urllib.request.urlopen(req).read()
        return html

    def save_html(self, file_name, file_content):
        # note: Windows forbids certain characters in file names, e.g. '/'
        with open(file_name + ".html", "wb") as f:
            # the file is written as bytes rather than str, hence mode "wb"
            f.write(file_content)

    def down_html(self, doi_li, path=''):
        for s in range(len(doi_li)):
            name = self.file_name(doi_li[s])
            url_te = self.doi_info(doi_li[s])[0]
            html = self.get_html(url_te)
            self.save_html(path + name, html)
            self.log_wp.print_log('html_' + str(s + 1) +
                                  " download completed!")

    def doi_renamed(self, html_name):
        # strip the '.html' suffix
        name = html_name[0:-5]
        if len(html_name) > 7:
            # convert the '-' in the doi back to '/'
            name = self.str_sub(name, 7, '/')
        else:
            self.log_wp.print_log('Your file name is wrong!')
        return name

    # replace the character at position p of string with c
    def str_sub(self, string, p, c):
        new = []
        for s in string:
            new.append(s)
        new[p] = c
        return ''.join(new)
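        # e.g. str_sub('10.1007-abc', 7, '/') -> '10.1007/abc'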

    def load_html(self, html_input_path):
        # load the html file from the given input path
        html_name = self.doi_renamed(os.path.basename(html_input_path))
        with open(html_input_path, "r", encoding="utf-8") as f:  # open the file
            ft = f.read()  # read the content into a string
        return ft, html_name

    def get_table_html(self, doi_info, rt, path=r'table.xlsx'):
        if doi_info[1] in 'Taylor & Francis Online':
            page = BeautifulSoup(rt, 'lxml')
            # scan the page and build the table_name list
            table_name = []
            for page1 in page.find_all('b'):
                name = page1.text
                if 'Table' in name:
                    table_name.append(name)
            # drop the duplicated tables
            del table_name[int(len(table_name) / 2):len(table_name)]
            # drop duplicated titles among the tables
            count = 0
            for t in table_name:
                if 'Table 1' in t:
                    count += 1
            if count > 1:
                del table_name[1:(len(table_name)):2]

            if len(table_name) != 0:
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(table_name)):
                        df = pd.read_html(rt)
                        # read the table title and write it into the first rows of the dataframe
                        table_te = []
                        # write the article's DOI into the first row of the table
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        # add the title to the table
                        row_title = [table_name[p]]
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        df = pd.DataFrame(data=table_te)
                        # write to Excel
                        sheet_name = 'table' + str(p + 1)
                        # write to Excel without the index or column headers
                        df.to_excel(writer,
                                    sheet_name=sheet_name,
                                    header=None,
                                    index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this file: " +
                                      self.file_name(doi_info[2]) + '.html')

        elif doi_info[1] == 'MDPI':
            page = BeautifulSoup(rt, 'lxml')  # parse the page with BeautifulSoup
            table_name = []
            for page1 in page.find_all('caption'):
                name = page1.text
                name = name.replace('\n', '')  # strip redundant line breaks from the title
                table_name.append(name)
            self.log_wp.print_log(str(table_name))
            if len(table_name) != 0:
                with pd.ExcelWriter(path) as writer:
                    df = pd.read_html(rt)
                    for p in range(len(table_name)):
                        # read the table title and write it into the first rows of the dataframe
                        table_te = []
                        # write the article's DOI into the first row of the table
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        # add the title to the table
                        row_title = [table_name[p]]
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        # write to Excel
                        sheet_name = 'table' + str(p + 1)
                        # write to Excel without the index or column headers
                        fa.to_excel(writer,
                                    sheet_name=sheet_name,
                                    header=None,
                                    index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this file: " +
                                      self.file_name(doi_info[2]) + '.html')
        elif doi_info[1] in "ASME International":
            page = BeautifulSoup(rt, 'lxml')  # parse the page with BeautifulSoup
            # convert the HTML tables to dataframes (header=None, index_col=None)
            try:
                df = pd.read_html(rt)
            except Exception as e:
                df = None
            if df is not None:
                table_name = []
                for page1 in page.find_all('div'):
                    name = page1.text
                    if 'Table' in name[0:5]:
                        if ' ' in name[-1]:
                            table_name.append(name)
                if len(table_name) != 0:
                    self.log_wp.print_log("complete!")
                    del df[0:len(df):2]
                    with pd.ExcelWriter(path) as writer:
                        # crawl the HTML content
                        for p in range(len(df)):
                            # read the table title and write it into the first rows of the dataframe
                            table_te = []
                            # write the article's DOI into the first row of the table
                            row_doi = [doi_info[2]]
                            for j in range(len(df[p].columns) - 1):
                                row_doi.append('')
                            table_te.append(row_doi)
                            # add the title to the table
                            row_title = []
                            try:
                                row_title.append(table_name[p])
                                for j in range(len(df[p].columns) - 1):
                                    row_title.append('')
                                table_te.append(row_title)
                                table_te.append(list(df[p].columns))
                                for i in range(len(df[p])):
                                    table_te.append(list(df[p].iloc[i]))
                                fa = pd.DataFrame(data=table_te)
                                # write to Excel
                                sheet_name = 'table' + str(p + 1)
                                # write to Excel without the index or column headers
                                fa.to_excel(writer,
                                            sheet_name=sheet_name,
                                            header=None,
                                            index=False)
                            except Exception as e:
                                print("An error exist:", e)
            else:
                table_name = []
                self.log_wp.print_log(" Cannot find table in this file: " +
                                      self.file_name(doi_info[2]) + '.html')

        elif doi_info[1] in [
                "Springer", "Nature Publishing Group", "Wiley Blackwell"
        ]:
            page = BeautifulSoup(rt, 'lxml')  # parse the page with BeautifulSoup
            # convert the HTML to DataFrames (header=None, index_col=None)
            table_name = []
            for page1 in page.find_all('header'):
                name = page1.text
                if 'Table' in name:
                    name = ' '.join(name.split())
                    table_name.append(name.replace('\n', ''))
            try:
                if len(table_name) != 0:
                    df = pd.read_html(rt)
                    with pd.ExcelWriter(path) as writer:
                        # crawl the HTML content
                        for p in range(len(df)):
                            # read the table title and write it into the first row of the dataframe
                            table_te = []
                            # read the article DOI and write it into the first row of the table
                            row_doi = [doi_info[2]]
                            for j in range(len(df[p].columns) - 1):
                                row_doi.append('')
                            table_te.append(row_doi)
                            # add the title to the table
                            row_title = [table_name[p]]
                            for j in range(len(df[p].columns) - 1):
                                row_title.append('')
                            table_te.append(row_title)
                            table_te.append(list(df[p].columns))
                            for i in range(len(df[p])):
                                # te.append()
                                table_te.append(list(df[p].iloc[i]))
                            fa = pd.DataFrame(data=table_te)
                            # write to Excel
                            sheet_name = 'table' + str(p + 1)
                            # write to Excel without adding the index or column labels
                            fa.to_excel(writer,
                                        sheet_name=sheet_name,
                                        header=None,
                                        index=False)
                else:
                    self.log_wp.print_log(" Cannot find table in this file: " +
                                          self.file_name(doi_info[2]) +
                                          '.html')
            except Exception as e:
                self.log_wp.print_log(" Cannot find table in this file: " +
                                      self.file_name(doi_info[2]) + '.html')
        else:
            table_name = []
            self.log_wp.print_log("This doi belongs to other databases!")

        return table_name

    def run(self):
        html_list = os.listdir(self.html_path)
        for html_file in html_list:
            path = self.html_path + '/' + html_file
            rt, html_name = self.load_html(path)
            excel_name = self.file_name(html_name)
            output_path = self.output_path + '/' + '%s.xlsx'
            table_name = self.get_table_html(self.doi_info(html_name), rt,
                                             output_path % excel_name)
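
# Every publisher branch above writes each parsed table to its own sheet with
# the same layout: row 1 carries the article DOI, row 2 the table title, row 3
# the original column headers, and the remaining rows hold the table body. A
# minimal standalone sketch of that shared pattern (the function name and
# arguments are illustrative, not part of the original class):
import pandas as pd

def write_table_sheet(writer, df, doi, title, sheet_name):
    """Write one parsed table to `writer` using the DOI/title/header layout."""
    width = len(df.columns)
    rows = [[doi] + [''] * (width - 1),    # row 1: article DOI
            [title] + [''] * (width - 1),  # row 2: table title
            list(df.columns)]              # row 3: original column headers
    rows.extend(list(df.iloc[i]) for i in range(len(df)))  # table body
    pd.DataFrame(rows).to_excel(writer, sheet_name=sheet_name,
                                header=None, index=False)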
Example #6
0
class GetTableInfoFromHtml:
    def __init__(self, html_path, output_path):
        self.html_path = html_path
        self.output_path = output_path
        self.log_wp = LogWp()

    def get_all_url(self, url):
        """
        return all url on the page
        :param url:url of one page
        :return: all url as list
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0)\
                    Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=url, headers=headers)
        html = urllib.request.urlopen(req).read().decode("utf-8")
        soup = BeautifulSoup(html, features='html.parser')
        tags = soup.find_all('a')
        all_url = []
        for tag in tags:
            all_url.append(str(tag.get('href')).strip())
        return all_url

    def get_table_url(self, doi_info):
        """
        return table url on the page for Springer and Nature Publishing Group
        :param doi_info: doi_info of article
        :return: table url as list
        """
        all_url = self.get_all_url(doi_info[0])
        table_url = []
        for i in all_url:
            if "table" in i:
                if 'article' in i:
                    self.log_wp.print_log(str(i))
                    if doi_info[1] in "Springer":
                        table_url.append('https://link.springer.com' + i)
                    else:
                        table_url.append('https://www.nature.com' + i)
        if len(table_url) == 0:
            self.log_wp.print_log("There is no table url in this article!")
        self.log_wp.print_log(str(table_url))
        return table_url

    def doi_info(self, doi_str):
        """
        get url and database of doi
        param doi_str: doi as str
        return: doi_info=[doi_url,doi_database]
        doi_url: str
        """
        doi_info = []
        if doi_str[0:7] == "10.1016":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "Elsevier"
        elif doi_str[0:7] in ["10.1007", "10.1361", "10.1023"]:
            doi_url = "https://link.springer.com/article/" + doi_str
            doi_database = "Springer"
        elif doi_str[0:7] == "10.1080":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "Taylor & Francis Online"
        elif doi_str[0:7] in ["10.1002", "10.1111"]:
            doi_url = "https://onlinelibrary.wiley.com/doi/" + doi_str
            doi_database = "Wiley Blackwell"
        elif doi_str[0:7] == "10.1115":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "ASME International"
        elif doi_str[0:7] == "10.3390":
            doi_url = "https://doi.org/" + doi_str  # fallback if no "htm" link is found
            all_url = self.get_all_url("https://doi.org/" + doi_str)
            for url_str in all_url:
                if "htm" in url_str:
                    doi_url = "https://www.mdpi.com/" + url_str
                    break
            doi_database = "MDPI"
        elif doi_str[0:7] == "10.1038":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "Nature Publishing Group"
        else:
            doi_url = "other URL"
            doi_database = "other database"
        doi_info.append(doi_url)
        doi_info.append(doi_database)
        doi_info.append(doi_str)
        return doi_info
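
    # Illustrative mapping (follows directly from the branches above; the DOI
    # is a placeholder):
    #   doi_info('10.1007/s11665-017-xxxx')  ->
    #   ['https://link.springer.com/article/10.1007/s11665-017-xxxx',
    #    'Springer', '10.1007/s11665-017-xxxx']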

    def filename(self, name):
        st = '\\|/:?*<>;'  # characters unsafe in Windows file names
        for s in st:
            if s in name:
                name = name.replace(s, '-')
        return name
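
    # Illustrative: filename('10.1016/j.jallcom.2020.01.001') returns
    # '10.1016-j.jallcom.2020.01.001'; every character in `st` is replaced
    # with '-' so the DOI can be used as a file name.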

    def get_table(self, doi_info, path=r'table.xlsx'):
        """
        get all table name from the page,
        param doi_info: [doi_url,1doi_info_name]str
        param path: r""str
        return
        """
        table_name = []
        if doi_info[1] in ['Springer', 'Nature Publishing Group']:
            table_url = self.get_table_url(doi_info)
            if len(table_url) != 0:
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(table_url)):
                        time.sleep(1)
                        self.log_wp.print_log("Start crawling the page")
                        r = requests.get(table_url[p])
                        rt = r.text
                        try:
                            df = pd.read_html(rt)
                            self.log_wp.print_log("complete!")
                        except Exception as e:
                            print(e)
                            self.log_wp.print_log('format of table ' + str(p) + ' is PDF')
                            continue
                        start = rt.find("<h1")
                        end = rt.rfind("</h1>")
                        title_str = rt[start:end + 5]
                        title_start = title_str.find("Table")
                        title_end = title_str.find("</h1>")
                        title = title_str[title_start:title_end]
                        self.log_wp.print_log(str(title))
                        table_name.append(title)
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[0].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(title)
                        for j in range(len(df[0].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[0].columns))
                        for i in range(len(df[0])):
                            table_te.append(list(df[0].iloc[i]))
                        df[0] = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        df[0].to_excel(writer, sheet_name=sheet_name, header=None, index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this page: " + self.filename(doi_info[0]))
        elif doi_info[1] == 'Taylor & Francis Online':
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('b'):
                name = page1.text
                if 'Table' in name:
                    table_name.append(name)
            del table_name[int(len(table_name) / 2):len(table_name)]
            count = 0
            for t in table_name:
                if 'Table 1' in t:
                    count += 1
            if count > 1:
                del table_name[1:(len(table_name)):2]
            if len(table_name) != 0:
                with pd.ExcelWriter(path) as writer:
                    df = pd.read_html(rt)  # parse the tables once, not per sheet
                    for p in range(len(table_name)):
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        df = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        df.to_excel(writer, sheet_name=sheet_name, header=None, index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this page: " + self.filename(doi_info[0]))
        elif doi_info[1] == 'MDPI':
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('caption'):
                name = page1.text
                name = name.replace('\n', '')
                table_name.append(name)
            self.log_wp.print_log(str(table_name))
            if len(table_name) != 0:
                with pd.ExcelWriter(path) as writer:
                    time.sleep(1)
                    self.log_wp.print_log("Start crawling the page")
                    r = requests.get(doi_info[0])
                    rt = r.text
                    df = pd.read_html(rt)
                    self.log_wp.print_log("complete!")
                    for p in range(len(table_name)):
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this page: " + self.filename(doi_info[0]))
        elif doi_info[1] in "ASME International":
            import re
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('div'):
                name = page1.text
                if 'Table' in name[0:5]:
                    if ' ' in name[-1]:
                        table_name.append(name)
            if len(table_name) != 0:
                df = pd.read_html(rt)
                self.log_wp.print_log("complete!")
                del df[0:len(df):2]
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(df)):
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this page: " + doi_info[0])
        elif doi_info[1] in "Wiley Blackwell":
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('header'):
                name = page1.text
                if 'Table' in name:
                    name = ' '.join(name.split())
                    table_name.append(name.replace('\n', ''))
            try:
                if len(table_name) != 0:
                    df = pd.read_html(rt)
                    with pd.ExcelWriter(path) as writer:
                        for p in range(len(df)):
                            table_te = []
                            row_doi = [doi_info[2]]
                            for j in range(len(df[p].columns) - 1):
                                row_doi.append('')
                            table_te.append(row_doi)
                            row_title = list()
                            row_title.append(table_name[p])
                            for j in range(len(df[p].columns) - 1):
                                row_title.append('')
                            table_te.append(row_title)
                            table_te.append(list(df[p].columns))
                            for i in range(len(df[p])):
                                table_te.append(list(df[p].iloc[i]))
                            fa = pd.DataFrame(data=table_te)
                            sheet_name = 'table' + str(p + 1)
                            fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False)
                else:
                    self.log_wp.print_log(" Cannot find table in this page: " + doi_info[0])
            except Exception as e:
                print(e)
                self.log_wp.print_log(" Cannot find table in this page: " + doi_info[0])
        else:
            self.log_wp.print_log("Please try other function!")
        return table_name

    def get_rt(self, url):
        """
        param url:str
        return
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        time.sleep(1)
        self.log_wp.print_log("Start crawling the page")
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        rt = r.text
        self.log_wp.print_log("complete!")
        return rt

    def load_doi(self, path):
        import xlrd
        data = xlrd.open_workbook(path)
        table = data.sheet_by_index(0)
        doi_li = []
        for row in range(table.nrows):
            table_doi = table.row_values(row, start_colx=0, end_colx=None)[0]
            doi_li.append(table_doi)
        return doi_li
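
    # Note: xlrd 2.0+ reads only legacy .xls workbooks. For .xlsx DOI lists an
    # equivalent sketch with openpyxl (already used elsewhere in this project)
    # would be:
    #   wb = openpyxl.load_workbook(path)
    #   doi_li = [row[0].value for row in wb.worksheets[0].iter_rows()]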

    def get_html(self, url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=url, headers=headers)
        html = urllib.request.urlopen(req).read()
        return html

    def save_html(self, file_name, file_content):
        with open(file_name + ".html", "wb") as f:
            f.write(file_content)

    def down_html(self, doi_li, path=''):
        for s in range(len(doi_li)):
            name = self.filename(doi_li[s])
            url_te = self.doi_info(doi_li[s])[0]
            html = self.get_html(url_te)
            self.save_html(path + name, html)
            self.log_wp.print_log('html_' + str(s + 1) + " download completed!")
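
    # Typical call sequence for the downloader (paths are illustrative):
    #   g = GetTableInfoFromHtml('html', 'output')
    #   dois = g.load_doi('doi_list.xls')
    #   g.down_html(dois, path='html/')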

    def doi_renamed(self, html_name):
        name = html_name[0:-5]
        if len(name) > 7:
            name = self.str_sub(name, 7, '/')
        else:
            self.log_wp.print_log('Your file name is wrong!')
        return name

    def str_sub(self, string, p, c):
        new = []
        for s in string:
            new.append(s)
        new[p] = c
        return ''.join(new)
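
    # str_sub replaces the single character at index p, e.g.
    #   str_sub('10.1007-s11665', 7, '/') -> '10.1007/s11665'
    # which is how doi_renamed turns a file-safe name back into a DOI.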

    def load_html(self, html_input_path):
        html_name = self.doi_renamed(os.path.basename(html_input_path))
        f = open(html_input_path, "r", encoding="utf-8")
        ft = f.read()
        return ft, html_name

    def get_table_html(self, doi_info, rt, path=r'table.xlsx'):
        """
        get all table name from the page,
        param doi_info: [doi_url,1doi_info_name]str
        param rt: requests.get(url).text
        param path: str
        return
        """
        if doi_info[1] == 'Taylor & Francis Online':
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('b'):
                name = page1.text
                if 'Table' in name:
                    table_name.append(name)
            del table_name[int(len(table_name) / 2):len(table_name)]
            count = 0
            for t in table_name:
                if 'Table 1' in t:
                    count += 1
            if count > 1:
                del table_name[1:(len(table_name)):2]
            if len(table_name) != 0:
                with pd.ExcelWriter(path) as writer:
                    df = pd.read_html(rt)  # parse the tables once, not per sheet
                    for p in range(len(table_name)):
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        df = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        df.to_excel(writer, sheet_name=sheet_name, header=None, index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this file: " + self.filename(doi_info[2]) + '.html')
        elif doi_info[1] == 'MDPI':
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('caption'):
                name = page1.text
                name = name.replace('\n', '')
                table_name.append(name)
            self.log_wp.print_log(str(table_name))
            if len(table_name) != 0:
                with pd.ExcelWriter(path) as writer:
                    df = pd.read_html(rt)
                    for p in range(len(table_name)):
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this file: " + self.filename(doi_info[2]) + '.html')
        elif doi_info[1] in "ASME International":
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('div'):
                name = page1.text
                if 'Table' in name[0:5]:
                    if ' ' in name[-1]:
                        table_name.append(name)
            if len(table_name) != 0:
                df = pd.read_html(rt)
                self.log_wp.print_log("complete!")
                del df[0:len(df):2]
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(df)):
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False)
            else:
                self.log_wp.print_log(" Cannot find table in this file: " + self.filename(doi_info[2]) + '.html')
        elif doi_info[1] in ["Springer", "Nature Publishing Group", "Wiley Blackwell"]:
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('header'):
                name = page1.text
                if 'Table' in name:
                    name = ' '.join(name.split())
                    table_name.append(name.replace('\n', ''))
            try:
                if len(table_name) != 0:
                    df = pd.read_html(rt)
                    with pd.ExcelWriter(path) as writer:
                        for p in range(len(df)):
                            table_te = []
                            row_doi = [doi_info[2]]
                            for j in range(len(df[p].columns) - 1):
                                row_doi.append('')
                            table_te.append(row_doi)
                            row_title = list()
                            row_title.append(table_name[p])
                            for j in range(len(df[p].columns) - 1):
                                row_title.append('')
                            table_te.append(row_title)
                            table_te.append(list(df[p].columns))
                            for i in range(len(df[p])):
                                table_te.append(list(df[p].iloc[i]))
                            fa = pd.DataFrame(data=table_te)
                            sheet_name = 'table' + str(p + 1)
                            fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False)
                else:
                    self.log_wp.print_log(" Cannot find table in this file: " + self.filename(doi_info[2]) + '.html')
            except Exception as e:
                print(e)
                self.log_wp.print_log(" Cannot find table in this file: " + self.filename(doi_info[2]) + '.html')
        else:
            table_name = []
            self.log_wp.print_log("This doi belongs to other databases!")
        return table_name

    def run(self):
        html_list = os.listdir(self.html_path)
        for html_file in html_list:
            path = os.path.join(self.html_path, html_file)
            rt, html_name = self.load_html(path)
            excel_name = self.filename(html_name)
            output_path = self.output_path + '/' + '%s.xlsx'
            table_name = self.get_table_html(self.doi_info(html_name), rt, output_path % excel_name)
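
# A minimal end-to-end sketch for this class (directory and file names are
# assumptions, not from the source):
if __name__ == '__main__':
    extractor = GetTableInfoFromHtml(html_path='html', output_path='tables')
    dois = extractor.load_doi('doi_list.xls')  # one DOI per row, first column
    extractor.down_html(dois, path='html/')    # save each article page locally
    extractor.run()                            # parse saved pages into .xlsx files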
Example #7
0
class AcquireTargetInfo:
    def __init__(self, c_path, origin_text_path, prop_list, excels_path,
                 out_path):
        self.c_path = c_path
        self.prop_list = prop_list
        self.origin_text_path = origin_text_path
        self.excels_path = excels_path
        self.dict_info = Dictionary(self.c_path)
        self.out_path = out_path
        self.log_wp = LogWp()

    def mkdir(self, file_name):
        pathd = os.path.join(os.getcwd(), file_name)
        if os.path.exists(pathd):
            for root, dirs, files in os.walk(pathd, topdown=False):
                for name in files:
                    os.remove(os.path.join(root, name))
                for name in dirs:
                    os.rmdir(os.path.join(root, name))
            os.rmdir(pathd)
        os.mkdir(pathd)
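
    # Note: mkdir() is destructive -- it deletes an existing directory of the
    # same name (including all of its contents) before recreating it.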

    def get_doi_fromtxt(self, txt_path):
        text_name = txt_path.replace(".txt", "")
        doi = text_name.replace("-", "/", 1)
        return doi
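
    # Illustrative: get_doi_fromtxt('10.1016-j.jallcom.2020.01.001.txt')
    # returns '10.1016/j.jallcom.2020.01.001' (only the first '-' is restored).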

    def get_abrre(self, text, prop_name):
        processor = TPreProcessor(text, prop_name, self.c_path)
        text = processor.processor()
        sentences = nltk.sent_tokenize(text)
        sentences_split = text.split(" ")
        alloy_write_type = self.dict_info.alloy_writing_type
        len_type = len(alloy_write_type)
        abbre_to_alloy = {}
        for sent in sentences:
            processor = TPreProcessor(sent, prop_name, self.c_path)
            filter_data = processor.processor()
            words = nltk.word_tokenize(filter_data)
            for word in words:
                for type_i in range(0, len_type):
                    outcome = re.findall(alloy_write_type[type_i], word)
                    outcome_alloy = None
                    if outcome:
                        abbre = "(" + word + ")"
                        if abbre in sentences_split:
                            index_alloy = sentences_split.index(abbre) - 1
                            alloy = sentences_split[index_alloy]
                            for type_j in range(0, len_type):
                                outcome_alloy = re.findall(
                                    alloy_write_type[type_j], alloy)
                                if outcome_alloy:
                                    abbre_to_alloy[word] = alloy
                                    break
                    if outcome_alloy:
                        break
        return abbre_to_alloy
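
    # Sketch of the behavior (assuming both tokens match the configured
    # alloy_writing_type patterns): for a sentence like
    #   "The alloy CM247LC (CM247) was examined ..."
    # the word in parentheses is looked up and mapped to the token directly
    # before it, so the returned dict would be {'CM247': 'CM247LC'}.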

    def get_text_triple(self, prop_name):
        self.mkdir('output_tt')
        text_path = r"output_tt\full_text"
        self.mkdir(text_path)
        ft = FilterText(self.origin_text_path, text_path)
        txt_name = ft.process()
        length = len(os.listdir(self.origin_text_path))
        all_txt_info = []
        for i in range(0, length):
            n_path = os.listdir(self.origin_text_path)[i]
            doi = self.get_doi_fromtxt(n_path)
            file = open(text_path + '/' + n_path, 'r', encoding='utf-8')
            data = file.read()
            pre_processor = PreProcessor(data, self.c_path)
            filter_txt = pre_processor.pre_processor()
            file_origin = open(self.origin_text_path + '/' + n_path,
                               'r',
                               encoding='utf-8')
            data_origin = file_origin.read()
            abbre_pairs = self.get_abrre(data_origin, prop_name)
            positioner = SentencePositioner(filter_txt, prop_name, self.c_path)
            target_sents = positioner.target_sent()
            for index, sent in target_sents.items():
                processor = TPreProcessor(sent, prop_name, self.c_path)
                filter_data = processor.processor()
                parse = PhraseParse(filter_data, prop_name, self.c_path)
                sub_order, sub_id, object_list = parse.alloy_sub_search()
                ree = RelationExtraciton(prop_name, filter_data, sub_order,
                                         sub_id, object_list, self.c_path,
                                         abbre_pairs)
                all_outcome = ree.triple_extraction()
                if all_outcome:
                    for id_m, info in all_outcome.items():
                        sole_info = dict()
                        sole_info['doi'] = doi
                        sole_info['material'] = info[0]
                        sole_info['prop_name'] = info[1]
                        sole_info['prop_value'] = info[2]
                        all_txt_info.append(sole_info)
        return all_txt_info
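
    # Each extracted triple is returned as a flat dict (values illustrative):
    #   {'doi': '10.1016/...', 'material': 'CM247LC',
    #    'prop_name': 'density', 'prop_value': '8.54 g/cm3'}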

    def gather_tableinfo_textinfo(self, all_txt_info, table_info, prop_name,
                                  prop_pattern, unit_pattern_text):
        gather_outcome = list()
        for file_name, t_info in table_info.items():
            sole_info = dict()
            all_triple_info = list()
            sole_doi = list()
            for sole_m_info in t_info:
                triple_info = dict()
                triple_info['source'] = 'table'
                if 'doi' in sole_m_info.keys():
                    plus_doi = sole_m_info['doi']
                    sole_doi.append(plus_doi)
                    sole_m_info.pop('doi')
                if 'material' in sole_m_info.keys():
                    sole_material = sole_m_info['material']
                    noisy = re.findall(r'\s*\[.+\]', str(sole_material))
                    if noisy:
                        for puc in noisy:
                            sole_material = str(sole_material).replace(puc, '')
                    triple_info['material'] = sole_material
                    sole_m_info.pop('material')
                if 'unit' in sole_m_info.keys():
                    sole_unit = sole_m_info['unit']
                    triple_info['unit'] = sole_unit
                    sole_m_info.pop('unit')
                if 'other_info' in sole_m_info.keys():
                    sole_other_info = sole_m_info['other_info']
                    triple_info['other_prop_info'] = sole_other_info
                    sole_m_info.pop('other_info')
                if 'child_tag' in sole_m_info.keys():
                    sole_tag_info = sole_m_info['child_tag']
                    triple_info['child_tag'] = sole_tag_info
                    sole_m_info.pop('child_tag')
                if 'table_topic' in sole_m_info.keys():
                    sole_tag_info = sole_m_info['table_topic']
                    triple_info['table_topic'] = sole_tag_info
                    sole_m_info.pop('table_topic')
                if len(sole_m_info) == 1:
                    for prop_name_t, value in sole_m_info.items():
                        sole_propname = str(prop_name_t)
                        triple_info['prop_name'] = sole_propname
                        sole_value = str(value)
                        triple_info['value'] = sole_value
                elif len(sole_m_info) > 1:
                    get_prop = None
                    for prop_name_t, value in sole_m_info.items():
                        for pattern in prop_pattern[prop_name]:
                            prop_search = re.findall(pattern, str(prop_name_t))
                            if prop_search:
                                sole_propname = str(prop_name_t)
                                triple_info['prop_name'] = sole_propname
                                sole_value = str(value)
                                triple_info['value'] = sole_value
                                get_prop = True
                                break
                        if get_prop:
                            break
                all_triple_info.append(triple_info)
            if list(set(sole_doi)):
                sole_info[list(set(sole_doi))[0]] = all_triple_info
                gather_outcome.append(sole_info)
        gather = 0
        for q in gather_outcome:
            k = tuple(q.keys())[0]
            i = q[k]
            for n in i:
                for w, v in n.items():
                    if w == 'value':
                        gather += 1
        self.log_wp.print_log("gather number :%s", gather)
        copy_all_txt_info = copy.copy(all_txt_info)
        if copy_all_txt_info:
            all_text = 0
            all_gather_doi = []
            for info_one in gather_outcome:
                all_gather_doi.append(tuple(info_one.keys())[0])
            for triple_info_sole in copy_all_txt_info:
                if triple_info_sole['doi'] in all_gather_doi:
                    all_text += 1
                    plus_info = dict()
                    plus_info['source'] = 'text'
                    plus_info['prop_name'] = triple_info_sole['prop_name']
                    prop_value = triple_info_sole['prop_value']
                    plus_info['material'] = triple_info_sole['material']
                    unit_search = re.findall(unit_pattern_text[prop_name],
                                             str(prop_value))
                    if unit_search:
                        plus_info['unit'] = unit_search[0]
                        prop_value = prop_value.replace(unit_search[0], '')
                        plus_info['value'] = prop_value
                    else:
                        plus_info['unit'] = ""
                        plus_info['value'] = prop_value
                    for get_info in gather_outcome:
                        if tuple(
                                get_info.keys())[0] == triple_info_sole['doi']:
                            get_info[triple_info_sole['doi']].append(plus_info)
                if triple_info_sole['doi'] not in all_gather_doi:
                    all_text += 1
                    plus_info = {}
                    full_info = {}
                    sole_triple = []
                    plus_info['source'] = 'text'
                    plus_info['prop_name'] = triple_info_sole['prop_name']
                    prop_value = triple_info_sole['prop_value']
                    plus_info['material'] = triple_info_sole['material']
                    unit_search = re.findall(unit_pattern_text[prop_name],
                                             str(prop_value))
                    if unit_search:
                        plus_info['unit'] = unit_search[0]
                        prop_value = prop_value.replace(unit_search[0], '')
                        plus_info['value'] = prop_value
                    else:
                        plus_info['unit'] = ""
                        plus_info['value'] = prop_value
                    if plus_info:
                        sole_triple.append(plus_info)
                        full_info[triple_info_sole['doi']] = sole_triple
                        gather_outcome.append(full_info)
                        all_gather_doi.append(triple_info_sole['doi'])
            self.log_wp.print_log("all_text number :%s", all_text)
        return gather_outcome

    def transform_comp_outcome(self, all_composition):
        ele_list = self.dict_info.ele_list
        gather_outcome = []
        for file_name, t_info in all_composition.items():
            sole_info = {}
            all_triple_info = []
            for sole_m_info in t_info:
                triple_info = {}
                sole_doi = sole_m_info['doi']
                sole_m_info.pop('doi')
                if 'material' in sole_m_info.keys():
                    sole_material = sole_m_info['material']
                    noisy = re.findall(r'\[.+\]', str(sole_material))
                    if noisy:
                        for puc in noisy:
                            sole_material = str(sole_material).replace(puc, '')
                    triple_info['material'] = sole_material
                    sole_m_info.pop('material')
                for element in ele_list:
                    if element in sole_m_info.keys():
                        triple_info[element] = sole_m_info[element]
                        sole_m_info.pop(element)
                if sole_m_info:
                    triple_info["other_eleinfo"] = sole_m_info
                all_triple_info.append(triple_info)
            sole_info[sole_doi] = all_triple_info
            gather_outcome.append(sole_info)
        return gather_outcome
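
    # Shape transformation performed above (values illustrative):
    #   input : {'file1': [{'doi': d, 'material': m, 'Cr': 8.1, 'trace': '<0.1'}]}
    #   output: [{d: [{'material': m, 'Cr': 8.1, 'other_eleinfo': {'trace': '<0.1'}}]}]
    # i.e. entries are regrouped per DOI, known elements are promoted to keys,
    # and anything left over is kept under 'other_eleinfo'.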

    def allinfo_dependencyparse(self, comp_info, prop_info):
        all_ele_doi = []
        all_prop_doi = []
        outcome = []
        for doi_info_ele in comp_info:
            ele_doi = tuple(doi_info_ele.keys())[0]
            all_ele_doi.append(ele_doi)
        for doi_info_prop in prop_info:
            prop_doi = tuple(doi_info_prop.keys())[0]
            all_prop_doi.append(prop_doi)
        prop_info_modified = copy.copy(prop_info)
        for doi_info_ele in comp_info:
            ele_doi = tuple(doi_info_ele.keys())[0]
            if ele_doi in all_prop_doi:
                for doi_info_prop in prop_info:
                    prop_doi = tuple(doi_info_prop.keys())[0]
                    plus_info = {}
                    all_doi_info = []
                    if ele_doi == prop_doi:
                        if doi_info_prop in prop_info_modified:
                            prop_info_modified.remove(doi_info_prop)
                        ele_doi_fullinfo = doi_info_ele[ele_doi]
                        ele_allname = []
                        prop_allname = []
                        pop_name = []
                        for one_material_ele in ele_doi_fullinfo:
                            if 'material' in one_material_ele.keys():
                                ele_m_name = one_material_ele['material']
                                ele_allname.append(ele_m_name)
                        modified_ele_allname = []
                        for name in ele_allname:
                            space_search = re.findall(r'\s', str(name))
                            if space_search:
                                name_list = str(name).split()
                                modified_ele_allname.append(str(name))
                                for name_sepe in name_list:
                                    modified_ele_allname.append(name_sepe)
                            else:
                                modified_ele_allname.append(name)
                        for one_material_prop in doi_info_prop[prop_doi]:
                            if 'material' in one_material_prop.keys():
                                prop_m_name = one_material_prop['material']
                                prop_allname.append(prop_m_name)
                                if prop_m_name not in modified_ele_allname and len(ele_doi_fullinfo) == 1:
                                    if one_material_prop['source'] == 'table':
                                        combine_info = {}
                                        for prop_name, prop_value in one_material_prop.items():
                                            combine_info[prop_name] = prop_value
                                        for ele_name, ele_value in ele_doi_fullinfo[0].items():
                                            combine_info[ele_name] = ele_value
                                        all_doi_info.append(combine_info)
                                    else:
                                        all_doi_info.append(one_material_prop)
                                if prop_m_name not in modified_ele_allname and len(ele_doi_fullinfo) != 1:
                                    all_doi_info.append(one_material_prop)
                                if prop_m_name in modified_ele_allname:
                                    for one_material_ele in ele_doi_fullinfo:
                                        if 'material' in one_material_ele.keys():
                                            ele_m_name = one_material_ele['material']
                                            space_search = re.findall(r'\s', str(ele_m_name))
                                            if space_search:
                                                ele_m_name_split = ele_m_name.split()
                                                if prop_m_name in ele_m_name_split or prop_m_name == ele_m_name:
                                                    pop_name.append(ele_m_name)
                                                    combine_info = {}
                                                    for prop_name, prop_value in one_material_prop.items():
                                                        combine_info[prop_name] = prop_value
                                                    for ele_name, ele_value in one_material_ele.items():
                                                        combine_info[ele_name] = ele_value
                                                    all_doi_info.append(combine_info)
                                            else:
                                                if prop_m_name == ele_m_name:
                                                    combine_info = {}
                                                    for prop_name, prop_value in one_material_prop.items():
                                                        combine_info[prop_name] = prop_value
                                                    for ele_name, ele_value in one_material_ele.items():
                                                        combine_info[ele_name] = ele_value
                                                    all_doi_info.append(combine_info)
                        for one_material_ele in ele_doi_fullinfo:
                            if 'material' in one_material_ele.keys():
                                ele_m_name = one_material_ele['material']
                                if ele_m_name not in pop_name:
                                    if ele_m_name not in prop_allname:
                                        all_doi_info.append(one_material_ele)
                    if all_doi_info:
                        plus_info[ele_doi] = all_doi_info
                        outcome.append(plus_info)
            else:
                outcome.append(doi_info_ele)
        for extra_prop in prop_info_modified:
            outcome.append(extra_prop)
        return outcome

    def structureinfo_toexcel(self, all_structureinfo, out_path):
        ele_list = self.dict_info.ele_list
        xls = openpyxl.Workbook()
        sht = xls.create_sheet("0")
        sht = xls.create_sheet(index=0)
        sht.cell(1, 1, "Source")
        sht.cell(1, 2, "DOIs")
        sht.cell(1, 3, "table_topic")
        sht.cell(1, 4, "material")
        sht.cell(1, 5, "Property_name")
        sht.cell(1, 6, "Property_value")
        sht.cell(1, 7, "Unit")
        col_n = 8
        row_now = 2
        sht.cell(1, col_n, str("other_element_info"))
        col_n += 1
        sht.cell(1, col_n, str("other_property_info"))
        col_n += 1
        sht.cell(1, col_n, str("child_tag"))
        col_n += 1
        for ele in ele_list:
            sht.cell(1, col_n, ele)
            col_n += 1
        for m_info in all_structureinfo:
            doi = tuple(m_info.keys())[0]
            length_m_info = m_info[doi]
            for index_m in range(len(length_m_info)):
                sht.cell(row_now, 2, doi)
                material_now = length_m_info[index_m]
                if 'source' in material_now.keys():
                    sht.cell(row_now, 1, str(material_now['source']))
                if 'table_topic' in material_now.keys():
                    sht.cell(row_now, 3, str(material_now['table_topic']))
                if 'material' in material_now.keys():
                    sht.cell(row_now, 4, str(material_now['material']))
                if 'prop_name' in material_now.keys():
                    sht.cell(row_now, 5, str(material_now['prop_name']))
                if 'value' in material_now.keys():
                    sht.cell(row_now, 6, str(material_now['value']))
                if 'unit' in material_now.keys():
                    sht.cell(row_now, 7, str(material_now['unit']))
                if "other_eleinfo" in material_now.keys():
                    sht.cell(row_now, 8, str(material_now['other_eleinfo']))
                if "other_prop_info" in material_now.keys():
                    sht.cell(row_now, 9, str(material_now['other_prop_info']))
                if "child_tag" in material_now.keys():
                    sht.cell(row_now, 10, str(material_now["child_tag"]))
                col_ele = 11
                for ele in ele_list:
                    if ele in material_now.keys():
                        sht.cell(row_now, col_ele, material_now[ele])
                    col_ele += 1
                row_now += 1
        del xls['Sheet']
        self.log_wp.excel_save(xls, out_path)

    def run(self):
        prop_pattern = self.dict_info.table_prop_pattern
        unit_pattern_text = self.dict_info.table_unit_pattern_text
        for prop_name in self.prop_list:
            self.mkdir('output_tt')
            text_path = r"output_tt\full_text"
            self.mkdir(text_path)
            all_txt_info = self.get_text_triple(prop_name)
            target_property = prop_name  # 'density' 'liquidus'  'solidus'  'solvus'
            te = TableExtraction(self.excels_path,
                                 self.c_path,
                                 prop_name=target_property)
            info_all = te.property_info_extraction()
            i_l = 0
            for k, v in info_all.items():
                i_l += len(v)
            all_composition = te.composition_triple_extraction()
            gather_outcome = self.gather_tableinfo_textinfo(
                all_txt_info, info_all, prop_name, prop_pattern,
                unit_pattern_text)
            gather = 0
            for q in gather_outcome:
                k = tuple(q.keys())[0]
                i = q[k]
                gather += len(i)
            ele_transform = self.transform_comp_outcome(all_composition)
            all_structureinfo = self.allinfo_dependencyparse(
                ele_transform, gather_outcome)
            b = 0
            for a in all_structureinfo:
                k = tuple(a.keys())[0]
                i = a[k]
                for n in i:
                    for w, v in n.items():
                        if w == 'value':
                            b += 1
            out_path = self.out_path + '/' + str(prop_name) + '.xlsx'
            self.structureinfo_toexcel(all_structureinfo, out_path)
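
# A hedged usage sketch for the whole pipeline (all paths and the property
# list are illustrative placeholders, not from the source):
if __name__ == '__main__':
    acquirer = AcquireTargetInfo(c_path='dict_config.json',
                                 origin_text_path='texts',
                                 prop_list=['density', 'solidus'],
                                 excels_path='tables',
                                 out_path='output')
    acquirer.run()  # writes one <property>.xlsx per entry in prop_list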