Example #1
def extract_sentences(paper_path, para_yes):
    """extracts sentences from a paper into two lists, given that para_yes contains
    a list of document element numbers corresponding to paragraphs manually identified
    as those containing synthesis information"""

    f = open(paper_path, 'rb')
    doc = Document.from_file(f, readers=[HtmlReader()])

    sen_yes_arr = list()
    sen_no_arr = list()

    elem_all = np.arange(0, len(doc))
    para_no = np.delete(elem_all, para_yes)

    for i in para_no:
        if type(doc.elements[i]) == chemdataextractor.doc.text.Paragraph:
            for sentence in doc.elements[i]:
                sen_no_arr.append(sentence)

    for i in para_yes:
        if type(doc.elements[i]) == chemdataextractor.doc.text.Paragraph:
            for sentence in doc.elements[i]:
                sen_yes_arr.append(sentence)

    return sen_yes_arr, sen_no_arr
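
A minimal usage sketch; the file path and the para_yes indices below are hypothetical:

# Hypothetical inputs: a saved HTML paper and the element indices of paragraphs
# manually marked as describing synthesis.
yes_sentences, no_sentences = extract_sentences('paper.html', [12, 14, 20])
print(len(yes_sentences), len(no_sentences))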
Example #2
def read_html_paper(paper_path):
    """Opens a HTML paper and stores it as a chemdataextractor Document"""

    f = open(paper_path, 'rb')
    doc = Document.from_file(f, readers=[HtmlReader()])

    return doc
Example #3
    def get_img(self, doc):
        """Get images from doc using chemdataextractor"""

        # Load document image data from file
        tem_images = []
        cde_doc = Document.from_file(open(doc[1], "rb"))
        log.info('This article is : %s' % doc[0])
        imgs = cde_doc.figures
        del cde_doc

        # Identify relevant images from records
        for img in imgs:
            detected = False  # Used to avoid processing images twice
            records = img.records
            caption = img.caption
            for record in records:
                if detected is True:
                    break

                rec = record.serialize()
                if [self.img_type] in rec.values():
                    detected = True
                    log.info('%s instance found!' % self.img_type)
                    tem_images.append((doc[0], img.id, img.url, caption.text.replace('\n', ' ')))

        if len(tem_images) != 0:
            return tem_images
        else:
            return None
Example #4
def extract():
    """Extract melting points from patents."""
    Paragraph.parsers = [CompoundParser(), ChemicalLabelParser(), MpParser()]
    Table.parsers = []
    patents = []
    for root, dirs, files in os.walk('../examples/mp/grants'):
        for filename in files:
            if not filename.endswith('.xml'):
                continue
            path = os.path.abspath(os.path.join(root, filename))
            size = os.path.getsize(path)
            patents.append((path, filename, size))

    patents = sorted(patents, key=lambda p: p[2])

    for path, filename, size in patents:
        print(path)
        shutil.copyfile(path, '../examples/mp/used/%s' % filename)
        with open(path) as f:
            d = Document.from_file(f)
        if os.path.isfile('../examples/mp/results/%s.json' % filename):
            continue
        records = [
            r.serialize() for r in d.records if len(r.melting_points) == 1
        ]
        with open('../examples/mp/results/%s.json' % filename, 'wb') as fout:
            fout.write(
                json.dumps(records, ensure_ascii=False,
                           indent=2).encode('utf8'))
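
For context, a hedged sketch of the same parser setup applied to an in-memory document instead of a patent file; the sentence is made up and the exact records returned depend on the MpParser grammar:

# Minimal in-memory check: with Paragraph.parsers set as above, a melting point
# mentioned in plain text should be picked up in the serialized records.
d = Document(Paragraph(u'2,4,6-Trinitrotoluene was isolated as a pale solid, m.p. 80.1 °C.'))
print(d.records.serialize())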
Example #5
 def test_document_usage(self):
     """Test RscHtmlReader used via Document.from_file."""
     fname = '1752-153X-5-55.html'
     f = io.open(
         os.path.join(os.path.dirname(__file__), 'data', 'springer', fname),
         'rb')
     d = Document.from_file(f, readers=[SpringerHtmlReader()])
     self.assertEqual(len(d.elements), 97)
Example #6
 def test_document_usage(self):
     """Test ElsevierHtmlReader used via Document.from_file."""
     fname = 'S0143720816310816.html'
     f = io.open(
         os.path.join(os.path.dirname(__file__), 'data', 'elsevier', fname),
         'rb')
     d = Document.from_file(f, readers=[ElsevierHtmlReader()])
     self.assertEqual(len(d.elements), 246)
Example #7
 def test_document_usage(self):
     """Test RscHtmlReader used via Document.from_file."""
     fname = '10.1039_C6OB02074G.html'
     f = io.open(
         os.path.join(os.path.dirname(__file__), 'data', 'rsc', fname),
         'rb')
     d = Document.from_file(f, readers=[RscHtmlReader()])
     self.assertEqual(len(d.elements), 61)
Example #8
 def test_document_usage(self):
     """Test UsptoXmlReader used via Document.from_file."""
     fname = 'US06840965B2.xml'
     f = io.open(
         os.path.join(os.path.dirname(__file__), 'data', 'uspto', fname),
         'rb')
     d = Document.from_file(f, readers=[UsptoXmlReader()])
     self.assertEqual(len(d.elements), 112)
Example #9
 def test_document_usage(self):
     """Test AcsHtmlReader used via Document.from_file."""
     fname = 'acs.jmedchem.6b00723.html'
     f = io.open(
         os.path.join(os.path.dirname(__file__), 'data', 'acs', fname),
         'rb')
     d = Document.from_file(f, readers=[AcsHtmlReader()])
     self.assertEqual(len(d.elements), 198)
Example #10
def extract_document(filename,
                     extract_all=True,
                     allow_wildcards=False,
                     output=os.path.join(os.path.dirname(os.getcwd()), 'csd')):
    """ Extracts chemical records from a document and identifies chemical schematic diagrams.
    Then substitutes in if the label was found in a record

    :param filename: Location of document to be extracted
    :param extract_all : Boolean to determine whether output is combined with chemical records
    :param allow_wildcards: Bool to indicate whether results containing wildcards are permitted
    :param output: Directory to store extracted images

    :return : Dictionary of chemical records with diagram SMILES strings, or List of label candidates and smiles
    """

    log.info('Extracting from %s ...' % filename)

    # Extract the raw records from CDE
    doc = Document.from_file(filename)
    figs = doc.figures

    # Identify image candidates
    csds = find_image_candidates(figs, filename)

    # Download figures locally
    fig_paths = download_figs(csds, output)
    log.info("All relevant figures from %s downloaded successfully" % filename)

    # When diagrams are not found, return results without CSR extraction
    if extract_all and not fig_paths:
        log.info('No chemical diagrams detected. Returning chemical records.')
        return doc.records.serialize()
    elif not extract_all and not fig_paths:
        log.info('No chemical diagrams detected. Returning empty list.')
        return []

    log.info('Chemical diagram(s) detected. Running ChemSchematicResolver...')
    # Run CSR
    results = []
    for path in fig_paths:
        try:
            results.append(extract_image(path,
                                         allow_wildcards=allow_wildcards))
        except Exception:
            log.error('Could not extract image at %s' % path)

    if not extract_all:
        return results

    records = doc.records.serialize()

    # Substitute smiles for labels
    combined_results = substitute_labels(records, results)
    log.info(
        'All diagram results extracted and combined with chemical records.')

    return combined_results
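
A hedged usage sketch; the document path is hypothetical:

# Hypothetical call: extract chemical records from one paper and substitute in
# any SMILES resolved from its schematic diagrams.
combined = extract_document('paper.html')
print(combined)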
Example #11
def list_chemicals(foin):
    fchem = open(foin, 'rb')
    docchem = Document.from_file(fchem)
    ct = 0
    t = PrettyTable(['Filename', 'Entity_count', 'Start', 'End', 'Entity'])
    for i in docchem.cems:
        ct = ct + 1
        t.add_row([foin, ct, i.start, i.end, i.text])
    t.align = 'l'
    t.border = False
    return t
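
A hedged usage sketch; the file name is an assumption:

# Print the table of chemical entity mentions found in one document.
print(list_chemicals('paper.html'))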
Example #12
File: tasks.py Project: ti250/cdeweb
def get_result(f, fname):
    try:
        document = Document.from_file(f, fname=fname)
    except Exception:
        return {}
    records = document.records.serialize()
    records = natsort.natsorted(
        records,
        lambda x: x.get('labels', ['ZZZ%s' %
                                   (99 - len(x.get('names', [])))])[0])
    result = {
        'records': records,
        'abbreviations': document.abbreviation_definitions
    }
    return result
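
A hedged usage sketch; the file name is hypothetical:

# Hypothetical call: run extraction on one file and inspect the result dict.
with open('paper.html', 'rb') as f:
    result = get_result(f, 'paper.html')
print(len(result.get('records', [])), 'records')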
Example #13
def cde_read_pdfs(argv, pdf_path="./pdfs"):
    try:
        pdf_path = argv[0]
    except IndexError:
        print("missing arguments" + "\n -string pdf files path")
        return
    pdf_dir = Path(pdf_path)
    files_list = get_files_list(pdf_dir)
    print(files_list)
    for a_file in files_list:
        file_name = a_file.name
        pdf_f = open(a_file, 'rb')
        doc = Document.from_file(pdf_f)
        uniques = get_uniques(doc)
        max_lbl, max_val = get_max(uniques)
        print(file_name, "Unique entities:", len(uniques),
              "Most common entity:", max_lbl, max_val)
Example #14
def extract_all_chem_names(read_path="", write_in_file=False):
    start_time = time.time()
    doc = None

    # Always open files in binary mode
    print(f"\nLoading the information from: {read_path}.....")
    with open(read_path, "rb") as f:
        doc = Document.from_file(f)
    print("Document is created")


    print("\nExtracting all chemical names in the document.....")
    #records ==> records of all mentions and abbreviations, properties and spectra
    #Concatenate all the chemical names in the list
    all_chemicals = []
    for compound in doc.records:
        comp = compound.serialize()
        if "names" in comp:
            all_chemicals += comp["names"]

    all_chemicals.sort()
    total_chems = len(all_chemicals)
    print(f"The total number of chemical names extracted are {total_chems}")

    if write_in_file:
        write_path = read_path.split(".")[0] + "_chem_list.txt"
        print(f"\nWriting the information to file {write_path}")
        with open(write_path, "w+") as f:
            for chemical in all_chemicals:
                f.write(chemical + "\n")

    tot_time = round(time.time() - start_time, 3)
    print(f"\nTime taken to extract all the chemical names from the doc is {tot_time} seconds")

    return all_chemicals
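
A hedged usage sketch; the path is hypothetical:

# Hypothetical call: list every chemical name found in one paper and also write
# the sorted list to <paper>_chem_list.txt next to the source file.
chems = extract_all_chem_names("paper.html", write_in_file=True)
print(chems[:10])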
Example #15
def ligandfinder(docpath, index):  #update to handle list of synonyms
    found_chems = []
    # import the document
    f = open(docpath, 'rb')
    doc = Document.from_file(f)
    no_elements = len(doc.elements)
    for i in range(0, no_elements):
        check = False
        if isinstance(doc.elements[i],
                      cde.doc.text.Paragraph):  #will do tables later
            for sentence in doc.elements[i].raw_sentences:
                # scrape for sentences/paragraphs containing ligand synonyms
                tokens = cwt.tokenize(sentence)
                if any(word in tokens for word in
                       ('ligand', 'ligands', 'coordinating', 'surfactant', 'surfactants')):
                    check = True
            if check:
                if doc.elements[i].cems:
                    # scrape those sentences/paragraphs for ligand names
                    for chemical in doc.elements[i].cems:
                        already_found = any(chemical.text == name for name, _ in found_chems)
                        if not already_found and chemical.text not in cf.element_filter:
                            # add ligand names to chem_records_df
                            found_chems.append((chemical.text, index))
    return found_chems
Example #16
import csv
import os
import sys

from chemdataextractor import Document

if len(sys.argv) == 1:
    print('No file path provided. Terminating program.')
    sys.exit()

file_path = sys.argv[1]
if not os.path.isfile(file_path):
    print('Path does not refer to a valid file. Terminating program.')
    sys.exit()

with open(file_path, 'rb') as f:
    #Convert to Document
    full_text_doc = Document.from_file(f)

#Extract lists of records from Documents
print("Extracting... (This may take a minute or two)")
doc_records = full_text_doc.records.serialize()
print("Extracted")
print(doc_records)

concentration_matrix = []
cleaned_doc_records = [
    record for record in doc_records if 'measured_concentrations' in record
]
for record in cleaned_doc_records:
    name = record['names'][0]
    for concentration in record['measured_concentrations']:
        concentration_matrix.append([name, *concentration.values()])
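
The csv module imported above is not used in this excerpt; a hedged sketch of how the collected concentration_matrix could be written out (the output file name and the header row are assumptions):

# Hypothetical continuation: persist the name/concentration rows to a CSV file.
with open('concentrations.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['name', 'value', 'units'])  # assumed column order
    writer.writerows(concentration_matrix)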
Example #17
req_head = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
}
# get the page content
html_response = requests.get(article_url, headers=req_head)
# save the content as a temporary file on the local disk
f = open("temp.html", "w+")
f.write(html_response.text)
f.close()

# open the temporary file and read it in binary mode
f = open("temp.html", 'rb')

# create a document object from the file
doc = Document.from_file(f)

for element in doc.elements:
    print(element)

para = doc.elements[14]

print(para)
print("Sentences:", len(para.sentences))
print("Tokens:", para.tokens)
print("Tokens:", len(para.tokens))
print("Tokens:", len(para.tokens[0]))

# list of unique occurrences
uniques = []
for chement in doc.cems:
    # (assumed completion of the truncated excerpt) record each mention once
    if chement.text not in uniques:
        uniques.append(chement.text)
Example #18
class SoParser(BaseParser):
    root = so

    def interpret(self, result, start, end):
        compound = Compound(solubility=[
            Solubility(value=first(result.xpath('./value/text()')),
                       units=first(result.xpath('./units/text()')))
        ])
        yield compound


# In[40]:

Paragraph.parsers = [SoParser()]

# In[42]:

d = Document(
    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),
    Paragraph(
        u'The procedure was followed to yield a pale yellow solid (solubility is 28 mg/mL)'
    ))
d.records.serialize()

# In[12]:

d = Document.from_file("../../test1.htm")
d.records.serialize()

# In[ ]:
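
The snippet above assumes a grammar element so and a Solubility model defined in earlier notebook cells. A minimal sketch of how they might look, following the ChemDataExtractor custom-parser tutorial; the regex, the token pattern for the units, and the field names are assumptions, and whether this grammar matches the example sentence depends on tokenization:

from chemdataextractor.model import BaseModel, Compound, StringType, ListType, ModelType
from chemdataextractor.parse import R, I, W, Optional, merge

class Solubility(BaseModel):
    value = StringType()
    units = StringType()

# Attach the new property to Compound so SoParser.interpret can populate it.
Compound.solubility = ListType(ModelType(Solubility))

# Hypothetical grammar: "solubility is <number> mg/mL"
prefix = (I(u'solubility') + Optional(I(u'is'))).hide()
units = (W(u'mg') + W(u'/') + W(u'mL'))(u'units').add_action(merge)
value = R(r'^\d+(\.\d+)?$')(u'value')
so = (prefix + value + units)(u'so')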
Example #19
print('Chemical entities(cems):\n', doc.cems)

print('abbreviation definitions:\n', doc.abbreviation_definitions)

print('Records:\n', doc.records)

print('Record 0:\n', doc.records[0].serialize())

print('Record 1:\n', doc.records[1].serialize())

filepath = "pdfs/c8cp05975f.pdf"

f = open(filepath, 'rb')

doc = Document.from_file(f)  #, readers=[PdfReader()])

for element in doc.elements:
    print(element)

para = doc.elements[14]

print(para)
print("Sentences:", len(para.sentences))
print("Tokens:", para.tokens)
print("Tokens:", len(para.tokens))
print("Tokens:", len(para.tokens[0]))

# list of unique occurrences
uniques = []
for chement in doc.cems:
    # (assumed completion of the truncated excerpt) record each mention once
    if chement.text not in uniques:
        uniques.append(chement.text)
Example #20
    def paragraph_extract(self):
        no_pargas = []
        if self.journal == "Springer" or self.journal == "NaturePublishingGroup":
            html_file = os.listdir(self.html_path)
            number_file = len(os.listdir(self.html_path))
            no_pargas = []
            success_extracted = []
            content_exist = []
            content_label = "Sec\d+\S*"
            for file_i in range(0, number_file):
                sole_file = html_file[file_i]
                file = open(self.html_path + '/' + sole_file, 'rb')
                doc = Document.from_file(file)
                paragraphs = doc.paragraphs
                all_parag = ''
                content_find = None
                for parag in paragraphs:
                    if "Abs" in str(parag.id):
                        text = parag.text
                        if text[0].isupper() and len(text) > 300:
                            all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "

                    re_d = re.findall(content_label, str(parag.id))
                    if re_d:
                        text = parag.text
                        if text[0].isupper() and len(text) > 300:
                            all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
                        content_find = True
                if content_find:
                    content_exist.append(sole_file)
                # if no paragraph label was found anywhere in the document
                if not all_parag:
                    self.log_wp.print_log("No paragraph label:%s", sole_file)
                    no_pargas.append(sole_file)
                    for parag in paragraphs:
                        text = parag.text
                        if text[0].isupper() and len(text) > 300:
                            all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "

                else:
                    success_extracted.append(sole_file)
                txt_name = str(sole_file).replace(".html", ".txt")
                path = self.out_path + '/' + txt_name
                self.log_wp.write_totxt_log(path, str(all_parag))

        if self.journal == "Tandfonline":
            html_file = os.listdir(self.html_path)
            number_file = len(os.listdir(self.html_path))
            no_pargas = []
            success_extracted = []
            content_exist = []
            content_label = "[Ss]\d{3}\S*"

            for file_i in range(0, number_file):
                sole_file = html_file[file_i]
                file = open(self.html_path + '/' + sole_file, 'rb')
                doc = Document.from_file(file)
                elements = doc.elements
                parags = doc.paragraphs
                all_parag = ''
                abs_search = None
                content_find = None
                for ele in elements:
                    if str(ele.id) == 'abstract':
                        abs_search = 1
                    if abs_search == 1:
                        abstract = ele.text
                        if abstract[0].isupper() and len(abstract) > 300:
                            all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", abstract)
                        for ref in refs:
                            abstract = abstract.replace(ref, '')
                        abstract = abstract.replace("\n", '')
                        all_parag += abstract
                        all_parag += " "
                        break
                for parag in parags:
                    re_d = re.findall(content_label, str(parag.id))
                    if re_d:
                        text = parag.text
                        if text[0].isupper() and len(text) > 300:
                            all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
                        content_find = True
                if content_find:
                    content_exist.append(sole_file)

                # if no paragraph label was found anywhere in the document
                if not all_parag:
                    self.log_wp.print_log("No paragraph label:%s", sole_file)
                    no_pargas.append(sole_file)
                    for parag in parags:
                        text = parag.text
                        if text[0].isupper() and len(text) > 300:
                            all_parag += '\n'
                        text = text.replace("\n", '')
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        all_parag += text
                        all_parag += " "
                else:
                    success_extracted.append(sole_file)

                txt_name = str(sole_file).replace(".html", ".txt")
                path = self.out_path + '/' + txt_name
                self.log_wp.write_totxt_log(path, str(all_parag))

        if self.journal == "WileyBlackwell":
            html_file = os.listdir(self.html_path)
            number_file = len(os.listdir(self.html_path))
            no_pargas = []
            success_extracted = []
            content_exist = []
            content_label = "sec"

            for file_i in range(0, number_file):
                sole_file = html_file[file_i]
                file = open(self.html_path + '/' + sole_file, 'rb')
                doc = Document.from_file(file)
                parags = doc.paragraphs
                all_parag = ''
                content_find = None
                for parag in parags:
                    if content_label in str(
                            parag.id) and "reference" not in str(
                                parag.id):  # avoid adding reference-section content to the results
                        text = parag.text
                        if text[0].isupper() and len(text) > 300:
                            all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
                        content_find = True
                if content_find:
                    content_exist.append(sole_file)

                # if no paragraph label was found anywhere in the document
                if not all_parag:
                    self.log_wp.print_log("No paragraph label:%s", sole_file)
                    no_pargas.append(sole_file)
                    for parag in parags:
                        text = parag.text
                        if text[0].isupper() and len(text) > 300:
                            all_parag += '\n'
                        text = text.replace("\n", '')
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        all_parag += text
                        all_parag += " "

                else:
                    success_extracted.append(sole_file)

                txt_name = str(sole_file).replace(".html", ".txt")
                path = self.out_path + '/' + txt_name
                self.log_wp.write_totxt_log(path, str(all_parag))

        if self.journal == "ASME":
            html_file = os.listdir(self.html_path)
            number_file = len(os.listdir(self.html_path))
            no_pargas = []
            success_extracted = []
            content_exist = []
            content_label = "ContentTab"

            for file_i in range(0, number_file):
                sole_file = html_file[file_i]
                file = open(self.html_path + '/' + sole_file, 'rb')
                doc = Document.from_file(file)
                parags = doc.paragraphs
                all_parag = ''
                content_find = None
                for parag in parags:
                    if content_label == str(parag.id):
                        text = parag.text
                        if text == "References":
                            break
                        if text[0].isupper() and len(text) > 300:
                            all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
                        content_find = True
                if content_find:
                    content_exist.append(sole_file)

                # if no paragraph label was found anywhere in the document
                if not all_parag:
                    self.log_wp.print_log("No paragraph label:%s", sole_file)
                    no_pargas.append(sole_file)
                    for parag in parags:
                        text = parag.text
                        if text[0].isupper() and len(text) > 300:
                            all_parag += '\n'
                        text = text.replace("\n", '')
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        all_parag += text
                        all_parag += " "
                else:
                    success_extracted.append(sole_file)

                txt_name = str(sole_file).replace(".html", ".txt")
                path = self.out_path + '/' + txt_name
                self.log_wp.write_totxt_log(path, str(all_parag))

        if self.journal == "MDPI":
            html_file = os.listdir(self.html_path)
            number_file = len(os.listdir(self.html_path))
            no_pargas = []
            success_extracted = []
            content_exist = []
            content_label = "sec\d+\S*"
            content_label_2 = "^\d+[A-Z]\S*"
            for file_i in range(0, number_file):
                sole_file = html_file[file_i]
                file = open(self.html_path + '/' + sole_file, 'rb')
                doc = Document.from_file(file)
                paragraphs = doc.paragraphs
                all_parag = ''
                content_find = None
                for parag in paragraphs:
                    if "Abs" in str(parag.id) or "abs" in str(parag.id):
                        text = parag.text
                        if text[0].isupper() and len(text) > 300:
                            all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "

                    re_d = re.findall(content_label, str(parag.id))
                    re_d_2 = re.findall(content_label_2, str(parag.id))
                    if re_d or re_d_2:
                        text = parag.text
                        if text[0].isupper() and len(text) > 300:
                            all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
                        content_find = True
                if content_find:
                    content_exist.append(sole_file)
                # if no paragraph label was found anywhere in the document
                if not all_parag:
                    self.log_wp.print_log("No paragraph label:%s", sole_file)
                    no_pargas.append(sole_file)
                    for parag in paragraphs:
                        text = parag.text
                        if text[0].isupper() and len(text) > 300:
                            all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "

                else:
                    success_extracted.append(sole_file)
                txt_name = str(sole_file).replace(".html", ".txt")
                path = self.out_path + '/' + txt_name
                self.log_wp.write_totxt_log(path, str(all_parag))

        return no_pargas, success_extracted, content_exist
Example #21
"""
Script to run the ChemDataExtractor on the
200 pmids.

http://chemdataextractor.org/docs/cem
https://pubs.acs.org/doi/abs/10.1021/acs.jcim.6b00207
"""

import os
from chemdataextractor import Document
from chemdataextractor.reader import PlainTextReader

cde_annotations = open("tool_annotations/ChemDataExtractor_annotations.txt",
                       "w",
                       encoding="utf8")

for d in os.listdir("../citations"):
    with open("../citations/{}".format(d), "rb") as f:
        citation = Document.from_file(f, readers=[PlainTextReader()])
        pmid = d.split(".")[0]
        cde_annotations.write("\n{}\n".format(pmid))
        for ann in citation.cems:
            cde_annotations.write("{}\n".format(ann))

cde_annotations.close()