def load_final_pipe():
    import string
    import en_core_sci_lg
    from collections import Counter
    from tqdm import tqdm

    # Load the scispacy model (result unused here) before restoring the saved pipeline.
    en_core_sci_lg.load()
    return load_object(FINAL_PIPE_FILE_PATH)
def dependency_parser_visualizer(text):
    import en_core_sci_lg

    nlp = en_core_sci_lg.load()
    doc = nlp(text)
    print(list(doc.sents))
    # Examine the entities extracted by the mention detector.
    print(doc.ents)

    from spacy import displacy
    displacy.render(next(doc.sents), style='dep', jupyter=True)
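# Hedged usage sketch (not in the original): the function renders with
# displacy's jupyter=True flag, so it is intended to be called from a notebook
# cell. The sentence below is illustrative only.
dependency_parser_visualizer(
    "Spinal and bulbar muscular atrophy is an inherited motor neuron disease."
)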
def __init__(self, data_dir: str):
    '''Initializes a CORD-19 data preprocessing class

    Args:
        data_dir: Raw data directory
    '''
    self.data_dir = data_dir

    # Initialize NLP model
    self.nlp = en_core_sci_lg.load(disable=["tagger", "ner"])
    self.nlp.max_length = 2000000
    self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    self.nlp_words_to_check = 100
def similarize_tokens(words, df, feature):
    nlp = en_core_sci_lg.load()
    processed_text = nlp(df[feature])
    word_tokens = nlp(' '.join(words))
    similarities = []
    for token1 in word_tokens:
        for token2 in processed_text:
            if token1 != token2:
                similarity = token1.similarity(token2)
                if (similarity is not None and similarity > 0.5
                        and similarity < 1):
                    similarities.append({
                        'token1': token1.text,
                        'token2': token2.text,
                        'similarity': similarity,
                    })
    return similarities
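# Hedged usage sketch (not in the original): builds a one-row pandas Series so
# that df[feature] yields a single string, which is what similarize_tokens
# expects. The column name 'abstract' and the example text are illustrative only.
import pandas as pd

example_row = pd.Series({"abstract": "Patients presented with fever and a persistent cough."})
pairs = similarize_tokens(["fever", "influenza"], example_row, "abstract")
for p in pairs:
    print(p["token1"], p["token2"], round(p["similarity"], 3))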
def __init__(self):
    self.gbif_source_path = (
        "/Users/chloesekkat/Documents/batch8_ceebios/data/simplified_taxon_gbif.csv"
    )
    self.papers_data_dir = (
        "/Users/chloesekkat/Documents/batch8_ceebios/data_open_source"
    )
    self.to_keep = [
        "id",
        "title",
        "paperAbstract",
        "authors",
        "year",
        "fieldsOfStudy",
        "journalName",
        "doiUrl",
    ]
    self.keyword_processor = get_gbif_keyprocessor(self.gbif_source_path)
    self.nlp = en_core_sci_lg.load()
import xx_sent_ud_sm
import en_core_sci_lg

nlp_uni = xx_sent_ud_sm.load()
nlp_sci = en_core_sci_lg.load()


# UNIVERSAL
def is_token_allowed_uni(token):
    '''
    Only allow valid tokens which are not stop words or punctuation symbols.
    '''
    if not token or not token.text.strip() or token.is_stop or token.is_punct:
        return False
    return True


def preprocesstoken_uni(token):
    # Reduce token to its lowercase lemma form
    return token.lemma_.strip().lower()


def tokenize_uni(x):
    try:
        return str([
            preprocesstoken_uni(token) for token in nlp_uni(x)
            if is_token_allowed_uni(token)
        ])
    except Exception:
        return str([])
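# Hedged usage sketch (not in the original excerpt): tokenize_uni serializes
# the filtered token list with str(), so callers get a string back. The sample
# sentence is illustrative only.
sample = "Severe acute respiratory syndrome spreads between humans."
print(tokenize_uni(sample))
print(tokenize_uni(None))  # non-string input falls back to the empty-list string "[]"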
""" This script converts the jsonlines data format to the csv format for annotation usage: python convert_from_jsonl.py <inpath> """ import plac import jsonlines from pathlib import Path import csv import jsonlines from helperutilz import * from ekphrasis_preprocess import text_processor import plac import pandas as pd import en_core_sci_lg nlp = en_core_sci_lg.load() # plac.annotations(inpath=("inpath for ", "positional", "i", Path), # outpath=("outpath for jsonlines for prodigy", "positional", "o", Path), # process=("boolean", "option", "p", bool), # label=("string ", "option", "l", str), # ) def convert(inpath, outpath, process=True, label='fullname'): print(f"reading in {inpath}") Path(outpath).parent.mkdir(parents=True, exist_ok=True) unique_set = set() cnt = 0 kept = 0
import scispacy
import spacy
import en_core_sci_lg
from scipy.spatial.distance import cosine
import joblib
from IPython.display import HTML, display
from ipywidgets import interact, Layout, HBox, VBox, Box
import ipywidgets as widgets
from IPython.display import clear_output
from tqdm import tqdm
from os.path import isfile
import seaborn as sb
import matplotlib.pyplot as plt
from joblib import dump, load

nlp = en_core_sci_lg.load(disable=["tagger", "parser", "ner"])
nlp.max_length = 2000000


def spacy_tokenizer(sentence):
    return [
        word.lemma_ for word in nlp(sentence)
        if not (word.like_num or word.is_stop or word.is_punct
                or word.is_space or len(word) == 1)
    ]


def print_top_words(model, vectorizer, n_top_words):
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
        # assumed completion (the excerpt is truncated here): join the
        # top-weighted terms for this topic, as in the standard scikit-learn pattern
        message += " ".join(
            feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1])
        print(message)
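# Hedged usage sketch (assumed, not from the source): wires spacy_tokenizer
# into a CountVectorizer and prints topics from a small LDA fit. The corpus and
# parameter values are illustrative only. Note that newer scikit-learn versions
# expose get_feature_names_out() instead of get_feature_names(), which
# print_top_words above relies on.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = [
    "The spike protein mediates viral entry into host cells.",
    "Antibody titers declined over several months after infection.",
]
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)
counts = vectorizer.fit_transform(docs)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(counts)
print_top_words(lda, vectorizer, n_top_words=5)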
def preprocess(self, raw_data=DEFAULT_ROOT_PATH, output_file=DEFAULT_OUTPUT_FILE):
    metadata_path = f'{raw_data}/metadata.csv'
    meta_df = pd.read_csv(metadata_path, dtype={
        'pubmed_id': str,
        'Microsoft Academic Paper ID': str,
        'doi': str
    })
    #print(meta_df.head())
    all_json = glob.glob(f'{raw_data}/pdf_json/**/*.json', recursive=True)
    #print(len(all_json))

    class FileReader:
        def __init__(self, file_path):
            with open(file_path) as file:
                content = json.load(file)
                self.paper_id = content['paper_id']
                self.abstract = []
                self.body_text = []
                # Abstract
                for entry in content['abstract']:
                    self.abstract.append(entry['text'])
                # Body text
                for entry in content['body_text']:
                    self.body_text.append(entry['text'])
                self.abstract = '\n'.join(self.abstract)
                self.body_text = '\n'.join(self.body_text)

        def __repr__(self):
            return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

    first_row = FileReader(all_json[0])

    def get_breaks(content, length):
        data = ""
        words = content.split(' ')
        total_chars = 0
        # add a break every `length` characters
        for i in range(len(words)):
            total_chars += len(words[i])
            if total_chars > length:
                data = data + "<br>" + words[i]
                total_chars = 0
            else:
                data = data + " " + words[i]
        return data

    dict_ = {'paper_id': [], 'doi': [], 'abstract': [], 'body_text': [],
             'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}

    for idx, entry in enumerate(all_json):
        if idx % (len(all_json) // 10) == 0:
            print(f'Processing index: {idx} of {len(all_json)}')
        try:
            content = FileReader(entry)
        except Exception as e:
            continue  # invalid paper format, skip

        # get metadata information
        meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
        # no metadata, skip this paper
        if len(meta_data) == 0:
            continue

        dict_['abstract'].append(content.abstract)
        dict_['paper_id'].append(content.paper_id)
        dict_['body_text'].append(content.body_text)

        # also create a column for the summary of the abstract to be used in a plot
        if len(content.abstract) == 0:
            # no abstract provided
            dict_['abstract_summary'].append("Not provided.")
        elif len(content.abstract.split(' ')) > 100:
            # abstract is too long for the plot; take the first 100 words and append "..."
            info = content.abstract.split(' ')[:100]
            summary = get_breaks(' '.join(info), 40)
            dict_['abstract_summary'].append(summary + "...")
        else:
            # abstract is short enough
            summary = get_breaks(content.abstract, 40)
            dict_['abstract_summary'].append(summary)

        # get metadata information
        meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]

        try:
            # if more than one author
            authors = meta_data['authors'].values[0].split(';')
            if len(authors) > 2:
                # if more than 2 authors, take them all with html tag breaks in between
                dict_['authors'].append(get_breaks('. '.join(authors), 40))
            else:
                # authors will fit in the plot
                dict_['authors'].append(". ".join(authors))
        except Exception as e:
            # if only one author - or null value
            dict_['authors'].append(meta_data['authors'].values[0])

        # add the title information, add breaks when needed
        try:
            title = get_breaks(meta_data['title'].values[0], 40)
            dict_['title'].append(title)
        # if title was not provided
        except Exception as e:
            dict_['title'].append(meta_data['title'].values[0])

        # add the journal information
        dict_['journal'].append(meta_data['journal'].values[0])
        # add doi
        dict_['doi'].append(meta_data['doi'].values[0])

    df_covid = pd.DataFrame(dict_, columns=['paper_id', 'doi', 'abstract', 'body_text',
                                            'authors', 'title', 'journal', 'abstract_summary'])
    df_covid.head()
    #df_covid.to_csv("/data/jilin/4225proj/df_covid.csv", index=False)

    df_covid['abstract_word_count'] = df_covid['abstract'].apply(lambda x: len(x.strip().split()))  # word count in abstract
    df_covid['body_word_count'] = df_covid['body_text'].apply(lambda x: len(x.strip().split()))  # word count in body
    df_covid['body_unique_words'] = df_covid['body_text'].apply(lambda x: len(set(str(x).split())))  # number of unique words in body
    df_covid.head()

    #df_covid['abstract'].describe(include='all')
    df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True)
    #df_covid['abstract'].describe(include='all')
    #df_covid['body_text'].describe(include='all')
    #print(df_covid.describe())

    df = df_covid
    df.dropna(inplace=True)

    from tqdm import tqdm
    from langdetect import detect
    from langdetect import DetectorFactory

    # set seed
    DetectorFactory.seed = 0

    # hold label - language
    languages = []

    # go through each text
    for ii in tqdm(range(0, len(df))):
        # split by space into a list, take the first x entries, join with space
        text = df.iloc[ii]['body_text'].split(" ")
        lang = "en"
        try:
            if len(text) > 50:
                lang = detect(" ".join(text[:50]))
            elif len(text) > 0:
                lang = detect(" ".join(text[:len(text)]))
        # ugh... the beginning of the document was not in a good format
        except Exception as e:
            all_words = set(text)
            try:
                lang = detect(" ".join(all_words))
            # what!! :( let's see if we can find any text in the abstract...
            except Exception as e:
                try:
                    # let's try to label it through the abstract then
                    lang = detect(df.iloc[ii]['abstract_summary'])
                except Exception as e:
                    lang = "unknown"
                    pass

        # record the language
        languages.append(lang)

    from pprint import pprint

    languages_dict = {}
    for lang in set(languages):
        languages_dict[lang] = languages.count(lang)

    #print("Total: {}\n".format(len(languages)))
    pprint(languages_dict)

    df['language'] = languages
    df = df[df['language'] == 'en']
    #print(df.info())

    import string

    punctuations = string.punctuation
    stopwords = list(STOP_WORDS)
    stopwords[:10]

    custom_stop_words = [
        'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https',
        'et', 'al', 'author', 'figure', 'rights', 'reserved', 'permission',
        'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
        'al.', 'Elsevier', 'PMC', 'CZI', 'www'
    ]

    for w in custom_stop_words:
        if w not in stopwords:
            stopwords.append(w)

    # Parser
    parser = en_core_sci_lg.load(disable=["tagger", "ner"])
    parser.max_length = 7000000

    def spacy_tokenizer(sentence):
        mytokens = parser(sentence)
        mytokens = [
            word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
            for word in mytokens
        ]
        mytokens = [
            word for word in mytokens
            if word not in stopwords and word not in punctuations
        ]
        mytokens = " ".join([i for i in mytokens])
        return mytokens

    tqdm.pandas()
    df["processed_text"] = df["body_text"].progress_apply(spacy_tokenizer)

    #df['body_word_count'].describe()
    #df['body_unique_words'].describe()

    from sklearn.feature_extraction.text import TfidfVectorizer

    def vectorize(text, maxx_features):
        vectorizer = TfidfVectorizer(max_features=maxx_features)
        X = vectorizer.fit_transform(text)
        return X

    text = df['processed_text'].values
    X = vectorize(text, 2 ** 12)
    X.shape

    from sklearn.decomposition import PCA

    pca = PCA(n_components=0.8, random_state=42)
    X_reduced = pca.fit_transform(X.toarray())
    #print(X_reduced.shape)

    np.savetxt(output_file, X_reduced, delimiter=",")
# import scispacy
# import spacy
import en_core_sci_lg
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nlp_engine = en_core_sci_lg.load()
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
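# Hedged sketch (assumed, not from the source): one plausible way these pieces
# combine, tokenizing with the scispacy pipeline, dropping NLTK stop words and
# lemmatizing with WordNet. Requires the NLTK 'stopwords' and 'wordnet' data.
def clean_text(text):
    doc = nlp_engine(text)
    tokens = [
        lemmatizer.lemmatize(tok.text.lower())
        for tok in doc
        if tok.is_alpha and tok.text.lower() not in stop_words
    ]
    return " ".join(tokens)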
import en_core_sci_lg
import de_core_news_lg
import os
import datetime
from pandarallel import pandarallel
import warnings

warnings.filterwarnings('ignore')
pandarallel.initialize(use_memory_fs=False)
# client = Elasticsearch([{'host': 'localhost'}, {'port': 9200}])

nlp_german = de_core_news_lg.load(
    exclude=["parser", "ner", "tok2vec", "textcat"])
nlp_sci = en_core_sci_lg.load(exclude=["parser", "ner", "tok2vec", "textcat"])


# UNIVERSAL
def is_token_allowed_german(token):
    '''
    Only allow valid tokens which are not stop words or punctuation symbols.
    '''
    if not token or not token.text.strip() or token.is_stop or token.is_punct:
        return False
    return True


def preprocesstoken_german(token):
    # Reduce token to its lowercase lemma form (the excerpt is truncated here;
    # completed to mirror preprocesstoken_uni above)
    return token.lemma_.strip().lower()
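# Hedged sketch (assumed from the parallel tokenize_uni above; not present in
# this excerpt): the German counterpart, e.g. for use with
# df['text'].parallel_apply(tokenize_german) after pandarallel.initialize().
def tokenize_german(x):
    try:
        return str([
            preprocesstoken_german(token) for token in nlp_german(x)
            if is_token_allowed_german(token)
        ])
    except Exception:
        return str([])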
# Part of the following code was obtained from here:
# https://github.com/allenai/scispacy

# Getting the annotations from each tweet's text
df_scispacy_annotations = pd.DataFrame(columns=[
    'Tweet_id', 'Text_section', 'Span_start', 'Span_end', 'Annotation_type',
    'Extras'
])
df_scispacy_tweets_tagged = pd.DataFrame(
    columns=['Tweet_id', 'Tweet_full_text'])

print("Configuring the scispacy tagger. Please wait...")
nlp = {}

print("Configuring the UMLS linker. Please wait...")
# We set up the scispacy tagger using the UMLS linker first
nlp['umls'] = en_core_sci_lg.load()
linker = EntityLinker(resolve_abbreviations=True, name="umls")
nlp['umls'].add_pipe(linker)
linker_umls = nlp['umls'].get_pipe("EntityLinker")

print("Starting the tagging process. Please wait...")
for index, row in df_filtered.iterrows():
    annotation_umls = nlp['umls'](str(row['tweet_text']))  # UMLS Linker
    count = 0
    if len(annotation_umls.ents) > 0:
        df_scispacy_tweets_tagged.loc[len(df_scispacy_tweets_tagged.index)] = [
            row['tweet_id'], row['tweet_text']
        ]
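# Hedged sketch (not part of the excerpt): inside the tagging loop, the linked
# UMLS concepts attached by scispacy's EntityLinker can be read from
# ent._.kb_ents and resolved through linker_umls.kb.cui_to_entity. Assumes
# `from scispacy.linking import EntityLinker` at the top of the script.
for ent in annotation_umls.ents:
    for cui, score in ent._.kb_ents:
        concept = linker_umls.kb.cui_to_entity[cui]
        print(ent.text, cui, round(score, 3), concept.canonical_name)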