def get_content(DOI, format="json", refresh=True, *args, **kwds):
    """
    Helper function to read file content as xml.

    Args:
        DOI (str): DOI of article
        *args:
        **kwds:

    Returns:
        Content of returned XML file
    """
    if not refresh:
        db = AtlasConnection().db
        elsevier = db.elsevier
        entries = list(elsevier.find({"doi": DOI}))
        if len(entries):
            if len(entries) > 1:
                print("More than one entry for given DOI! Only using first entry.")
            entry = entries[0]
            if entry["collected"]:
                content = entry["xml"]
                return content
    else:
        if format == "xml":
            content = download(*args, **kwds).text
            return content
        elif format == "json":
            content = download(*args, **kwds)
            return content
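# Usage sketch, not part of the original source. Assumptions: AtlasConnection and
# `download` are importable in this module's context, and the DOI below is a
# hypothetical article that has already been collected into the `elsevier` collection.
if __name__ == "__main__":
    cached_xml = get_content("10.1016/j.example.2018.01.001", format="xml", refresh=False)
    if cached_xml is not None:
        print(cached_xml[:200])  # print the first 200 characters of the stored XML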
def __init__(self):
    # Similar mats
    self.sm = SimilarMaterials()
    # Connect to db
    self.db = AtlasConnection().db
    # Mat parser
    self.parser = SimpleParser()
def random_abstract():
    # locations for relevant/not-relevant classifier and vectorizers
    models_location = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../nlp/')
    classifier_location = os.path.join(models_location, 'r_nr_classifier.p')
    cv_location = os.path.join(models_location, 'cv.p')
    tfidf_location = os.path.join(models_location, 'tfidf.p')

    # load in relevant/not-relevant classifier and vectorizers
    r_nr_clf = pickle.load(open(classifier_location, 'rb'))
    cv = pickle.load(open(cv_location, 'rb'))
    tfidf = pickle.load(open(tfidf_location, 'rb'))

    no_abstract = True
    random_abs = None
    db = AtlasConnection(local=True, db="production").db
    while no_abstract:
        random_document = list(db.abstracts.aggregate([{"$sample": {"size": 1}}]))[0]
        random_abs = random_document['abstract']
        vectorized = cv.transform([random_abs])
        transformed = tfidf.transform(vectorized)
        if r_nr_clf.predict(transformed):
            no_abstract = False
    return random_abs
class Abstracts(Resource):
    """
    The Abstracts resource. Allows the user to request abstracts using their ids
    and specify the fields to be returned.
    """
    input_schema = EntryRequestSchema()
    DB = AtlasConnection()
    get_type = {'id': DB.get_documents_by_id, 'doi': DB.get_documents_by_doi}
    abstract_schema = AbstractSchema()
    abstracts_schema = AbstractSchema(many=True)

    # def abort_if_abstract_doesnt_exist(self, ids):
    #     # If id not found in MongoDB
    #     missing = []
    #     for id in ids:
    #         if id not in atlas:
    #             missing.append(id)
    #     if len(missing):
    #         abort(status.HTTP_404_NOT_FOUND,
    #               message="Cannot find Abstract for id(s) {}".format(missing))

    def _prepare_response(self, id, fields, id_type):
        id = id.split(',')
        if id_type == 'id' or id_type == 'doi':
            print(id)
            response = {
                "valid_response": True,
                "response": self.get_type[id_type](id)
            }
            code = status.HTTP_200_OK
        else:
            response = {
                "valid_response": False,
                "error": "Invalid id_type. Please use either 'doi' or 'id'.",
            }
            code = status.HTTP_400_BAD_REQUEST
        response = jsonify(response)
        response.status_code = code
        return response

    @require_api_key
    def get(self, id, fields=None, id_type='doi'):
        if fields is None:
            fields = ['title', 'authors', 'journal', 'doi', 'abstract']
        return self._prepare_response(id, fields, id_type)

    @require_api_key
    def post(self):
        try:
            json_data = request.get_json()
            # Validate input
            data = self.input_schema.load(json_data).data
            id = data['id']
            fields = data['fields']
            id_type = data.get('id_type', 'doi')
            return self._prepare_response(id, fields, id_type)
        except ValidationError as err:
            return jsonify(err.messages), status.HTTP_400_BAD_REQUEST
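# Minimal registration sketch, not part of the original source. Assumptions: the
# resource runs inside a Flask app using flask_restful; the app object, module layout,
# and URL paths below are illustrative only.
from flask import Flask
from flask_restful import Api

app = Flask(__name__)
api = Api(app)
# GET /abstracts/<id> hits Abstracts.get; POST /abstracts hits Abstracts.post
api.add_resource(Abstracts, "/abstracts/<string:id>", "/abstracts")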
def suggest_phrase(n_clicks, phrase, notes):
    db = AtlasConnection(access="admin", db="test").db
    if n_clicks is not None:
        if phrase:
            doc = {
                "phrase": phrase,
                "notes": notes,
                "date": datetime.now()
            }
            print(doc)
            db.suggestedPhrases.insert(doc)
            return "submitted!"
    return "Suggest a phrase!"
def get_keywords(material):
    db = AtlasConnection(db="test").db
    print(db.info)
    parser = SimpleParser()
    material = parser.matgen_parser(material)
    print("number of materials is", db.keywords.count())
    keywords = db.keywords.find_one({'material': material})
    if keywords is not None:
        tf = keywords['keywords_tf']
        tf_arranged = arrange_keywords(tf)
        tfidf = keywords['keywords_tfidf']
        tfidf_arranged = arrange_keywords(tfidf)
        df = pd.DataFrame()
        df['tf'] = tf_arranged
        df['tfidf'] = tfidf_arranged
        return generate_table(df)
    else:
        return "No keywords for the specified material"
def __init__(self, db_name="matstract_db", local=True):
    db = "production" if db_name == "matstract_db" else "testing"
    self._db = AtlasConnection(local=local, db=db).db
    self.parser = parsing.MaterialParser()
    self.simple_parser = parsing.SimpleParser()
    self.mat_list = []
    self.elem_name_dict = dict()
    for i, elem in enumerate(self.ELEMENTS):
        self.elem_name_dict[self.ELEMENT_NAMES[i]] = elem

    models_location = os.path.join(os.path.dirname(os.path.abspath(__file__)), "")
    classifier_location = os.path.join(models_location, 'r_nr_classifier.p')
    cv_location = os.path.join(models_location, 'cv.p')
    tfidf_location = os.path.join(models_location, 'tfidf.p')

    # load in relevant/not-relevant classifier and vectorizers
    self.clf = pickle.load(open(classifier_location, 'rb'))
    self.cv = pickle.load(open(cv_location, 'rb'))
    self.tfidf = pickle.load(open(tfidf_location, 'rb'))
def check_scopus_collection(year, issn):
    """
    Checks the scopus_log collection on MongoDB to see whether the data for a given
    year/journal combination has been collected.

    Args:
        year (str): year
        issn (str): issn of journal

    Returns:
        (bool) True if the status of the year/journal pair is "complete"
    """
    db = AtlasConnection(access='admin', db="test").db
    log = db.build_log
    entry = log.find({"year": year, "issn": issn})[0]
    if entry["status"] == "complete":
        return True
    elif entry["status"] == "incomplete":
        return False
    else:
        raise KeyError("Entry has no status!")
def __init__(self):
    self._ac = AtlasConnection(db="production")
    self._ec = ElasticConnection()
    self.filters = []
class MatstractSearch:
    """The class running all search queries."""

    def __init__(self):
        self._ac = AtlasConnection(db="production")
        self._ec = ElasticConnection()
        self.filters = []

    def search(self, text='', materials=(), max_results=1000):
        if materials is not None:
            max_results = 10000
        print("searching for {} and {}".format(text, materials))
        pipeline = list()
        if materials:
            self.material_filter = MaterialFilter(materials)
            for cond in self.material_filter.conditions:
                pipeline.append(cond)
            pipeline.append({
                "$lookup": {
                    "from": "abstracts",
                    "localField": "doi",
                    "foreignField": "doi",
                    "as": "abstracts"
                }
            })
            pipeline.append({"$match": {"abstracts": {"$ne": []}}})
            pipeline.append({"$unwind": "$abstracts"})
            pipeline.append({
                "$project": {
                    "_id": "$abstracts._id",
                    "doi": 1,
                    "abstract": "$abstracts.abstract",
                    "year": "$abstracts.year",
                    "authors": "$abstracts.authors",
                    "title": "$abstracts.title",
                    "journal": "$abstracts.journal",
                    "link": "$abstracts.link",
                    "chem_mentions": "$unique_mats"
                }
            })
            pipeline.append({"$project": {"abstracts": 0}})
            pipeline.append({"$limit": max_results})
        if text:
            ids = self._ec.query(text, max_results=max_results)
            self.document_filter = DocumentFilter(ids)
            if not materials or not len(materials):
                return self._ac.get_documents_by_id(ids)
            for cond in self.document_filter.conditions:
                pipeline.append(cond)
        return self._ac.db.mats_.aggregate(pipeline)

    def more_like_this(self, text='', materials=(), max_results=100):
        if text is None or text == '':
            return None
        query = {
            "query": {
                "more_like_this": {
                    "fields": ['title', 'abstract'],
                    "like": text
                }
            }
        }
        hits = self._ec.search(index="tri_abstracts", body=query, size=max_results,
                               request_timeout=60)["hits"]["hits"]
        ids = [ObjectId(h["_id"]) for h in hits]
        return self._ac.get_documents_by_id(ids)
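# Usage sketch, not part of the original source. Assumptions: a reachable Atlas and
# Elasticsearch backend is configured; the query text and material below are
# illustrative values only.
search_engine = MatstractSearch()
docs = search_engine.search(text="thermoelectric", materials=["PbTe"])
for doc in docs:
    # projected fields include doi, title, abstract, year, authors, journal, link
    print(doc.get("doi"), doc.get("title"))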
def generate_trends_graph(search=None, material=None, layout=None):
    sp = SimpleParser()
    if material is not None:
        material = sp.matgen_parser(material)
    db = AtlasConnection(db="production").db
    pipeline = list()
    pipeline.append({"$match": {"MAT": material}})
    pipeline.append({
        "$lookup": {
            "from": "abstracts",
            "localField": "doi",
            "foreignField": "doi",
            "as": "abstracts"
        }
    })
    pipeline.append({"$match": {"abstracts": {"$ne": []}}})
    pipeline.append({"$unwind": "$abstracts"})
    pipeline.append({"$project": {"year": "$abstracts.year"}})
    pipeline.append({"$project": {"abstracts": 0}})
    pipeline.append({"$group": {"_id": "$year", "count": {"$sum": 1}}})
    res = db.ne_071018.aggregate(pipeline)
    if res is not None:
        results = list(db.ne_071018.aggregate(pipeline))
    else:
        results = []
    results_dict = dict()
    for res in results:
        if int(res["_id"]) in results_dict:
            results_dict[int(res["_id"])] += res["count"]
        else:
            results_dict[int(res["_id"])] = res["count"]
    results = sorted(results_dict.items(), key=lambda x: x[0])
    print(results)
    # results = list(MS.search(text=search, filters=filters, max_results=10000))
    hist = dict()
    if len(results) > 0:
        # histdata = {}
        # years = [int(r["year"]) for r in results]
        # for year in years:
        #     if year in histdata.keys():
        #         histdata[year] += 1
        #     else:
        #         histdata[year] = 1
        # for year in range(min(2000, min(histdata.keys())), 2017):
        #     if not year in histdata.keys():
        #         histdata[year] = 0
        # if 2018 in histdata:
        #     del(histdata[2018])  # TODO remove after demo
        # histdata = sorted(histdata.items(), key=operator.itemgetter(0))
        hist["data"] = [{
            'x': [x[0] for x in results],
            'y': [x[1] for x in results],
            'line': {"width": 2, "color": 'rgb(0, 0, 0)'}
        }]
    else:
        hist["data"] = [{
            'x': [],
            'y': [],
            'line': {"width": 2, "color": 'rgb(0, 0, 0)'}
        }]
    if layout is not None:
        hist["layout"] = layout
    return hist
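# Callback sketch, not part of the original source. Assumptions: this function feeds a
# Dash graph; `app`, `Input`, and `Output` are available as in the app module, and the
# component ids below are illustrative only.
@app.callback(Output("trends-graph", "figure"), [Input("material-input", "value")])
def update_trends_graph(material):
    # returns the {"data": [...], "layout": ...} dict expected by dcc.Graph
    return generate_trends_graph(material=material)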
import dash
import dash_html_components as html
import dash_core_components as dcc
import dash_materialsintelligence as dmi
from flask import send_from_directory
from dash.dependencies import Input, Output

from matstract.web.view import new_search_app, summary_app, matsearch_app
from matstract.web.callbacks import new_search_callbacks, summary_callbacks, matsearch_callbacks
from matstract.models.database import AtlasConnection

# app config
app = dash.Dash()
app.css.config.serve_locally = True
app.scripts.config.serve_locally = True
app.config.suppress_callback_exceptions = True
app.title = "Matstract - Rediscovering Materials"
db = AtlasConnection().db

# loading css files
css_files = [
    "dash_extra.css", "skeleton.min.css", "webstract.css",
    "googleapis.raleway.css", "googleapis.dosis.css"
]
stylesheets_links = [
    html.Link(rel='stylesheet', href='/static/css/' + css) for css in css_files
]

header = html.Div([
    dcc.Location(id="url", refresh=False),
    html.Div(dmi.DropdownCreatable(), style={"display": "none"}),
    html.Img(src=
def get_entities(mat, class_name="three columns"):
    # Normalize the material
    parser = SimpleParser()
    material = parser.matgen_parser(mat)

    # Open connection and get NEs associated with the material
    db = AtlasConnection(db="test").db
    entities = list(db.ne_norm.find({'MAT': material}))
    # entities = list(db.ne_norm.find({'doi': {'$in': dois}}))
    num_entities = len(entities)

    # Extract the entities
    if entities is not None:
        apl, pro, spl, smt, cmt, dsc = [], [], [], [], [], []
        for doc in entities:
            # Get the properties
            pro.append(doc['PRO'])
            # Get the application
            apl.append(doc['APL'])
            # Get the SPL
            spl.append(doc['SPL'])
            # Get the synthesis method
            smt.append(doc['SMT'])
            # Get the characterization method
            cmt.append(doc['CMT'])
            # Get the sample descriptor
            dsc.append(doc['DSC'])

        pro = [p for pp in pro for p in pp if len(p) > 2]
        pro = nltk.FreqDist(pro).most_common(40)
        apl = [p for pp in apl for p in pp if len(p) > 2]
        apl = nltk.FreqDist(apl).most_common(20)
        apl = [(a, score) for a, score in apl if a not in ['coating', 'electrode']]
        spl = [p for pp in spl for p in pp if len(p) > 2]
        spl = nltk.FreqDist(spl).most_common(3)
        smt = [p for pp in smt for p in pp if len(p) > 2]
        smt = nltk.FreqDist(smt).most_common(20)
        cmt = [p for pp in cmt for p in pp if len(p) > 2]
        cmt = nltk.FreqDist(cmt).most_common(20)
        dsc = [p for pp in dsc for p in pp if len(p) > 2]
        dsc = nltk.FreqDist(dsc).most_common(20)

        if class_name == "three columns":
            return html.Div([
                html.Div([
                    html.Div(trends_app.display_trends_graph(material), className="six columns"),
                    gen_output(pro, num_entities, 'Property', material, class_name),
                    gen_output(apl, num_entities, 'Application', material, class_name)
                ], className="row"),
                html.Div([
                    gen_output(cmt, num_entities, 'Characterization', material, class_name),
                    gen_output(smt, num_entities, 'Synthesis', material, class_name),
                    gen_output(dsc, num_entities, 'Sample descriptor', material, class_name),
                    gen_output(spl, num_entities, 'Phase', material, class_name)
                ], className="row"),
            ])
        else:
            return html.Div([
                html.Div([
                    gen_output(pro, num_entities, 'Property', material, class_name),
                    gen_output(apl, num_entities, 'Application', material, class_name),
                    gen_output(cmt, num_entities, 'Characterization', material, class_name)
                ], className="row"),
                html.Div([
                    gen_output(smt, num_entities, 'Synthesis', material, class_name),
                    gen_output(dsc, num_entities, 'Sample descriptor', material, class_name),
                    gen_output(spl, num_entities, 'Phase', material, class_name)
                ], className="row"),
            ])
    else:
        return "No entities for the specified material"
import dash_html_components as html
import dash_core_components as dcc
import pandas as pd

from matstract.models.database import AtlasConnection, ElasticConnection
from matstract.extract import parsing
from matstract.models.search import MatstractSearch
import re

db = AtlasConnection(db="production").db
client = ElasticConnection()


def highlight_material(body, material):
    highlighted_phrase = html.Mark(material)
    if len(material) > 0 and material in body:
        chopped = body.split(material)
        newtext = []
        for piece in chopped[:-1]:
            newtext.append(piece)
            newtext.append(highlighted_phrase)
        newtext.append(chopped[-1])
        return newtext
    return body


def highlight_multiple_materials(body, materials):
    if len(materials) > 0 and any([material in body for material in materials]):
        newtext = []
        for material in materials:
            highlighted_phrase = html.Mark(material)
            if len(newtext) > 0:
    if n % 1000 == 0:
        print(n)
    return entity_dict


def worker(docs, upper, lower):
    with open('{}.out'.format(upper), 'w+') as f:
        print('beginning...', file=f)
    nes = [ne for doc in docs for ne in doc[upper]]
    ne_dict = clean_to_dict(nes)
    return ne_dict


if __name__ == '__main__':
    db = AtlasConnection(db='test').db
    ne = db.ne_071918
    docs = list(ne.find(projection={'doi': 1, 'PRO': 1, 'SMT': 1, 'CMT': 1,
                                    'SPL': 1, 'APL': 1, 'DSC': 1})[:1000])
    smt = [smt for doc in docs for smt in doc['SMT']]
    smt_dict = clean_to_dict(smt)
    print(smt_dict)
    pickle.dump(smt_dict, open('apl_dict.p', 'wb'))
class MatstractSearch:
    """The class running all search queries."""

    VALID_FILTERS = [
        "material", "property", "application", "descriptor",
        "characterization", "synthesis", "phase"
    ]
    FILTER_DICT = {
        "material": "MAT",
        "property": "PRO",
        "application": "APL",
        "descriptor": "DSC",
        "characterization": "CMT",
        "synthesis": "SMT",
        "phase": "SPL",
    }

    def __init__(self, local=False):
        self._ac = AtlasConnection(db="production", local=local)
        self._ec = ElasticConnection()
        self.filters = []

    def search(self, text=None, materials=None, max_results=1000, filters=None):
        print("searching for '{}' and {}".format(text, filters))
        pipeline = list()
        if filters:
            for f in filters:
                if f is not None:
                    search_filter = SearchFilter(filter_type=self.FILTER_DICT[f[0]],
                                                 values=f[1].split(","))
                    for cond in search_filter.conditions:
                        pipeline.append(cond)
            pipeline.append({
                "$lookup": {
                    "from": "abstracts",
                    "localField": "doi",
                    "foreignField": "doi",
                    "as": "abstracts"
                }
            })
            pipeline.append({"$match": {"abstracts": {"$ne": []}}})
            pipeline.append({"$unwind": "$abstracts"})
            pipeline.append({
                "$project": {
                    "_id": "$abstracts._id",
                    "doi": 1,
                    "abstract": "$abstracts.abstract",
                    "year": "$abstracts.year",
                    "authors": "$abstracts.authors",
                    "title": "$abstracts.title",
                    "journal": "$abstracts.journal",
                    "link": "$abstracts.link",
                    "chem_mentions": "$unique_mats"
                }
            })
            pipeline.append({"$project": {"abstracts": 0}})
        elif materials:  # if filters are supplied don't look at materials
            for material in materials:
                if material is not None:
                    material_filter = MaterialFilter(material.split(","))
                    for cond in material_filter.conditions:
                        pipeline.append(cond)
            pipeline.append({
                "$lookup": {
                    "from": "abstracts",
                    "localField": "doi",
                    "foreignField": "doi",
                    "as": "abstracts"
                }
            })
            pipeline.append({"$match": {"abstracts": {"$ne": []}}})
            pipeline.append({"$unwind": "$abstracts"})
            pipeline.append({
                "$project": {
                    "_id": "$abstracts._id",
                    "doi": 1,
                    "abstract": "$abstracts.abstract",
                    "year": "$abstracts.year",
                    "authors": "$abstracts.authors",
                    "title": "$abstracts.title",
                    "journal": "$abstracts.journal",
                    "link": "$abstracts.link",
                    "chem_mentions": "$unique_mats"
                }
            })
            pipeline.append({"$project": {"abstracts": 0}})
        if len(pipeline) > 0:
            results = self._ac.db.ne_071018.aggregate(pipeline)
            ids = [str(entry["_id"]) for entry in results]
        else:
            ids = None
        if text and (ids is None or len(ids) > 0):
            ids = self._ec.query(text, ids=ids, max_results=max_results)
        return self._ac.get_documents_by_id(ids)

    def more_like_this(self, text='', materials=(), max_results=100):
        if text is None or text == '':
            return None
        query = {
            "query": {
                "more_like_this": {
                    "fields": ['title', 'abstract'],
                    "like": text
                }
            }
        }
        hits = self._ec.search(index="tri_abstracts", body=query, size=max_results,
                               request_timeout=60)["hits"]["hits"]
        ids = [ObjectId(h["_id"]) for h in hits]
        return self._ac.get_documents_by_id(ids)
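# Usage sketch for the filter-based search, not part of the original source. Assumptions:
# a configured backend; the filter values below are illustrative. `filters` is a list of
# (filter_type, values) pairs, where filter_type is one of VALID_FILTERS and values is a
# comma-separated string of entity names.
search = MatstractSearch()
matching_docs = search.search(text="band gap",
                              filters=[("material", "GaN"), ("property", "mobility")])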
def __init__(self, local=False):
    self._db = AtlasConnection(access="annotator", local=local, db="production").db
def __init__(self, local=False):
    self._ac = AtlasConnection(db="production", local=local)
    self._ec = ElasticConnection()
    self.filters = []
def contribute(user_creds="matstract/config/db_creds.json", max_block_size=100,
               num_blocks=1, apikey=None):
    """
    Gets an incomplete year/journal combination from elsevier_log, queries for the
    corresponding dois, and downloads the corresponding xmls for each to the
    elsevier collection.

    Args:
        user_creds ((:obj:`str`, optional)): path to contributing user's
            write-permitted credential file.
        max_block_size ((:obj:`int`, optional)): maximum number of articles in block
            (~1s/article). Defaults to 100.
        num_blocks ((:obj:`int`, optional)): maximum number of blocks to run in
            session. Defaults to 1.
    """
    user = json.load(open(user_creds, 'r'))["scopus"]["name"]
    db = AtlasConnection(access="admin", db="test").db
    log = db.build_log
    build = db.build
    for i in range(num_blocks):
        # to make sure we don't send more than 6 requests / second on 16 cores (3 > 16/6)
        time.sleep(3)
        # Verify access at start of each block to detect dropped VPN sessions.
        verify_access()

        # Get list of all available blocks sorted from largest to smallest.
        available_blocks = log.find(
            {"status": "incomplete", "num_articles": {"$lt": max_block_size}},
            ["year", "issn", "journal"]).limit(1).sort("num_articles", -1)

        # Break if no remaining blocks smaller than max_block_size
        if available_blocks.count() == 0:
            print("No remaining blocks with size <= {}.".format(max_block_size))
            break
        else:
            print("Blocks remaining = {}".format(min(num_blocks - i, available_blocks.count())))

        target = available_blocks[0]
        date = datetime.datetime.now().isoformat()
        log.update_one({"_id": target["_id"]},
                       {"$set": {"status": "in progress", "updated_by": user, "updated_on": date}})

        # Collect scopus for block
        if "journal" in target:
            print("Collecting entries for {}, {} (Block ID {})...".format(
                target.get("journal"), target.get("year"), target.get("_id")))
        else:
            print("Collecting entries for {}, {} (Block ID {})...".format(
                target.get("issn"), target.get("year"), target.get("_id")))
        dois = find_articles(year=target["year"], issn=target["issn"], get_all=True,
                             apikey=apikey)
        new_entries = collect_entries_by_doi_search(dois, user, apikey=apikey)

        # Update log with number of articles for block
        num_articles = len(new_entries)
        log.update_one({"_id": target["_id"]}, {"$set": {"num_articles": num_articles}})

        # Insert entries into Matstract database
        print("Inserting entries into Matstract database...")
        for entry in tqdm(new_entries):
            build.replace_one({"doi": entry["doi"]}, entry, upsert=True)

        # Mark block as completed in log
        date = datetime.datetime.now().isoformat()
        log.update_one({"_id": target["_id"]},
                       {"$set": {"status": "complete", "completed_by": user,
                                 "completed_on": date, "updated_by": user,
                                 "updated_on": date}})
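# Invocation sketch, not part of the original source. Assumptions: a write-permitted
# credentials file exists at the default path and a valid Scopus API key is available;
# the key string below is a placeholder, not a real value.
if __name__ == "__main__":
    contribute(max_block_size=50, num_blocks=2, apikey="YOUR_SCOPUS_API_KEY")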
def collect_entries_by_doi_search(dois, user, apikey=None):
    """
    Collects the scopus entry for each DOI in dois and processes them for insertion
    into the Matstract database.

    Args:
        dois (list(str)): List of DOIs
        user (dict): Credentials of user

    Returns:
        entries (list(dict)): List of entries to be inserted into database
    """
    db = AtlasConnection(db="test").db
    entries = []
    miniblocks = [dois[x:x + 25] for x in range(0, len(dois), 25)]
    for miniblock in tqdm(miniblocks):
        if db.build.find({"doi": {"$in": miniblock}}).count() == len(miniblock):
            continue
        query = " OR ".join(["DOI({})".format(doi) for doi in miniblock])
        search = ElsSearch(query=query, index="scopus")
        search._uri = search.uri + "&view=COMPLETE"
        if apikey:
            CLIENT = ElsClient(apikey, num_res=10000)
        search.execute(els_client=CLIENT, get_all=True)
        results = search.results
        for result in results:
            date = datetime.datetime.now().isoformat()
            doi = result['prism:doi']
            try:
                article = MiniAbstract(result)
                abstract = article.abstract
                raw_abstract = article.raw_abstract
                if abstract is None or raw_abstract is None:
                    entries.append({"doi": doi,
                                    "completed": False,
                                    "error": "No Abstract!",
                                    "pulled_on": date,
                                    "pulled_by": user})
                else:
                    entries.append({"doi": doi,
                                    "title": article.title,
                                    "abstract": abstract,
                                    "raw_abstract": raw_abstract,
                                    "authors": article.authors,
                                    "url": article.url,
                                    "subjects": [],
                                    "journal": article.journal,
                                    "date": article.cover_date,
                                    "citations": article.citations,
                                    "completed": True,
                                    "pulled_on": date,
                                    "pulled_by": user})
            except HTTPError as e:
                entries.append({"doi": doi,
                                "completed": False,
                                "error": str(e),
                                "pulled_on": date,
                                "pulled_by": user})
    return entries
def get_entities(material):
    # Normalize the material
    parser = SimpleParser()
    material = parser.matgen_parser(material)

    # Open connection and get NEs associated with the material
    db = AtlasConnection(db="test").db
    dois = db.mats_.find({'unique_mats': material}).distinct('doi')
    entities = list(db.ne.find({'doi': {'$in': dois}}))
    num_entities = len(entities)

    # Extract the entities
    if entities is not None:
        apl, pro, spl, smt, cmt, dsc = [], [], [], [], [], []
        for doc in entities:
            # Get the properties
            pro.append(doc['PRO'])
            # Get the application
            apl.append(doc['APL'])
            # Get the phase label
            spl.append(doc['SPL'])
            # Get the synthesis method
            smt.append(doc['SMT'])
            # Get the characterization method
            cmt.append(doc['CMT'])
            # Get the sample descriptor
            dsc.append(doc['DSC'])

        pro = [pro_dict[p] for pp in pro for p in pp
               if len(p) > 2 and p in pro_dict.keys()]
        pro = nltk.FreqDist(pro).most_common(20)
        apl = [apl_dict[p] for pp in apl for p in pp
               if len(p) > 2 and p in apl_dict.keys()]
        apl = nltk.FreqDist(apl).most_common(10)
        spl = [p for pp in spl for p in pp if len(p) > 2]
        spl = nltk.FreqDist(spl).most_common(3)
        smt = [smt_dict[p] for pp in smt for p in pp
               if len(p) > 2 and p in smt_dict.keys()]
        smt = nltk.FreqDist(smt).most_common(10)
        cmt = [cmt_dict[p] for pp in cmt for p in pp
               if len(p) > 2 and p in cmt_dict.keys()]
        cmt = nltk.FreqDist(cmt).most_common(10)
        dsc = [dsc_dict[p] for pp in dsc for p in pp
               if len(p) > 2 and p in dsc_dict.keys()]
        dsc = nltk.FreqDist(dsc).most_common(10)

        return html.Div([
            gen_output(pro, num_entities, 'Property', material),
            gen_output(cmt, num_entities, 'Characterization', material),
            gen_output(smt, num_entities, 'Synthesis', material),
            gen_output(spl, num_entities, 'Phase', material),
            gen_output(apl, num_entities, 'Application', material),
            gen_output(dsc, num_entities, 'Sample descriptor', material),
        ])
    else:
        return "No entities for the specified material"