def process_token_list(self, token_list):
    """
    Process the output of extract_entity_tokens() into one flat token list.

    String items are chem-tokenized (with number/unit splitting applied via
    self.split_token); already-tokenized entity items are appended as-is.
    The index of each entity's final token is recorded alongside its name.

    Parameters:
        token_list (list, required): list of texts to be tokenized and
            already tokenized entities

    Returns:
        tokens (list): ordered list of tokens for the whole text
        entity_idxs (list): list of items and their token idx for each list
    """
    tokens = []
    entity_idxs = []
    tokenizer = ChemWordTokenizer()
    for item in token_list:
        if isinstance(item, str):
            # Plain text: tokenize, then split numbers from common units.
            for raw_token in tokenizer.tokenize(item):
                tokens.extend(self.split_token(raw_token))
        else:
            # Pre-tokenized entity: append tokens and remember where it ends.
            tokens.extend(item)
            entity_idxs.append([item[0], len(tokens) - 1])
    return tokens, entity_idxs
def chem_tokenize(text):
    """
    Tokenize *text* with ChemWordTokenizer and return position-annotated tokens.

    Returns a list of (token, start, end) tuples where start/end are
    character offsets into *text* and end is inclusive (span stop - 1).
    """
    tokenizer = ChemWordTokenizer()
    spans = tokenizer.span_tokenize(text)
    words = tokenizer.tokenize(text)
    return [
        (word, start, stop - 1)
        for (start, stop), word in zip(spans, words)
    ]
# Builds on top of ChemParserBatch_v7
# Takes all paragraphs, from the paper. No random selection
# v1 -> Increases speed by just appending to the csv file (instead of writing the whole dataframe)
from chemdataextractor.nlp.tokenize import ChemWordTokenizer
import chemdataextractor as cde
import pandas as pd
import random
import tqdm
import sys
import os

cwt = ChemWordTokenizer()


def read_file_list(file_list_path):
    """
    Read a tab-separated file list and return (file_names, DOIs).

    Each line is expected to look like ``<dir>/<dir>/<file_name>\\t<DOI>``;
    the file name is the third '/'-separated segment of the first field.
    Raises IndexError on lines that do not match this shape (same as before).

    Parameters:
        file_list_path (str): path to the tab-separated listing file.

    Returns:
        tuple: (file_list, DOI_list) — parallel lists of file names and DOIs.
    """
    file_list = []
    DOI_list = []
    # `with` guarantees the handle is closed (the original leaked it),
    # and iterating the handle avoids loading the whole file at once.
    with open(file_list_path, 'r') as open_file:
        for item in open_file:
            # Parse each line once instead of re-splitting per field.
            fields = item.replace('\n', '').split('\t')
            file_list.append(fields[0].split('/')[2])
            DOI_list.append(fields[1])
    return (file_list, DOI_list)


def clean_paragraph(paragraph):
    """
    Normalize a paragraph element's text for downstream tokenization.

    Collapses newlines to spaces, pads '/' with spaces so it tokenizes as a
    separate token, and normalizes hyphenation.

    Parameters:
        paragraph: object with a ``.text`` attribute (e.g. a
            chemdataextractor paragraph element).

    Returns:
        str: the cleaned paragraph text.
    """
    par_clean = paragraph.text.replace('\n', ' ')
    par_clean = par_clean.replace('/', ' / ')
    # Non-breaking hyphen (U+2011) -> plain ASCII hyphen.
    par_clean = par_clean.replace('‑', '-')
    # Re-join words broken as "foo- bar" -> "foo-bar".
    par_clean = par_clean.replace('- ', '-')
    return (par_clean)
def __init__(self):
    # Cache one ChemWordTokenizer instance on the object so repeated
    # tokenization calls reuse it instead of re-constructing the tokenizer.
    self.cwt = ChemWordTokenizer()