def __init__(self, materials):
    """Build Mongo ``$match`` stages filtering ``unique_mats`` by materials.

    :param materials: whitespace-separated material tokens, or None.
        A trailing comma on a token is stripped; a leading ``-`` marks
        the token for exclusion. Tokens are normalized via
        ``parsing.SimpleParser`` when parseable, kept verbatim otherwise.
    """
    parser = parsing.SimpleParser()
    conditions = []
    if materials is not None:
        wanted = []
        unwanted = []
        for token in materials.split():
            # Tolerate comma-separated lists ("Fe2O3, NaCl").
            token = token.rstrip(",") if token.endswith(",") else token
            if token.startswith("-"):
                # Leading "-" means "exclude this material".
                token = token[1:]
                normalized = parser.parse(token)
                unwanted.append(normalized if normalized else token)
            else:
                normalized = parser.parse(token)
                wanted.append(normalized if normalized else token)
        if wanted:
            conditions.append({"$match": {"unique_mats": {"$in": wanted}}})
        if unwanted:
            conditions.append({"$match": {"unique_mats": {"$nin": unwanted}}})
    super().__init__(conditions)
def get_search_results(search="", material="", max_results=10000):
    """Query abstracts by free-text search, material, or both.

    :param search: free-text query (None is treated as empty)
    :param material: material string (None is treated as empty)
    :param max_results: cap passed to the similarity search
    :return: list of matching documents, or None when both inputs are empty
    """
    results = None
    if material is None:
        material = ''
    else:
        # Parser only needed on the material-bearing paths below.
        parser = parsing.SimpleParser()
    if search is None:
        search = ''
    # Nothing to search for.
    if search == '' and material == '':
        return None

    if material and not search:
        # Material-only: direct lookup on the normalized chemical name.
        results = db.abstracts_leigh.find(
            {"normalized_cems": parser.matgen_parser(material)})
    elif search and not material:
        # Text-only: similarity search, then re-sort the fetched docs.
        doc_ids = find_similar(search, max_results)
        matched = db.abstracts.find({"_id": {"$in": doc_ids[0:1000]}})
        results = sort_results(matched, doc_ids)
    elif search and material:
        # Both: intersect the similarity hits with the material match.
        doc_ids = find_similar(search, max_results)[0:1000]
        pipeline = [
            {"$match": {"_id": {"$in": doc_ids}}},
            {"$match": {"normalized_cems": parser.matgen_parser(material)}},
        ]
        results = db.abstracts_leigh.aggregate(pipeline)
    return list(results)
def to_highlight(names_list, material):
    """Return the first name in ``names_list`` matching ``material``.

    Each entry is expected to be a dict; entries with a ``'names'`` key are
    compared by normalizing ``names[0]`` and ``material`` through
    ``parsing.SimpleParser.matgen_parser``.

    :param names_list: iterable of dicts, each possibly holding a 'names' list
    :param material: material string to match against
    :return: the matching display name, or None if no entry matches
    """
    parser = parsing.SimpleParser()
    # Hoisted: the target normalization is loop-invariant (was re-parsed
    # on every iteration in the original).
    target = parser.matgen_parser(material)
    for name in names_list:
        # 'in name' instead of 'in name.keys()'; dropped the unused
        # 'names = []' accumulator the original never read.
        if 'names' in name and parser.matgen_parser(name['names'][0]) == target:
            return name['names'][0]
def is_material_features(self, word, chems_in_sent, sent, idx, NE_tagged=True):
    """Build material-indicator features for the token at ``idx``.

    :param word: the token itself
    :param chems_in_sent: chemical mentions ChemDataExtractor found in the sentence
    :param sent: the tokenized sentence
    :param idx: position of ``word`` within ``sent``
    :param NE_tagged: whether ``get_feature`` returns an NE tag alongside (word, pos)
    :return: [matgen_parsed, cde_parsed, prev_matgen_parsed, next_matgen_parsed]
    """
    parser = parsing.SimpleParser()

    # 1 if pymatgen's parser accepts the token as a material, else 0.
    can_parse = lambda tok: 1 if parser.matgen_parser(tok) else 0
    matgen_parsed = can_parse(word)

    # Neighbor tokens; the tuple shape from get_feature depends on NE_tagged.
    if NE_tagged:
        (prev_word, _prev_pos), _prev_ne = self.get_feature(sent, idx - 1, NE_tagged)
        (next_word, _next_pos), _next_ne = self.get_feature(sent, idx + 1, NE_tagged)
    else:
        prev_word, _prev_pos = self.get_feature(sent, idx - 1, NE_tagged)
        next_word, _next_pos = self.get_feature(sent, idx + 1, NE_tagged)

    prev_matgen_parsed = can_parse(prev_word)
    next_matgen_parsed = can_parse(next_word)

    # 1 if ChemDataExtractor flagged this token as part of a chemical mention.
    cde_parsed = 1 if word in chems_in_sent else 0

    # NOTE(review): earlier experiments (MaterialParser "olga_parsed",
    # neighbor CDE flags) were disabled upstream; the feature order below
    # must stay fixed for any trained model consuming it.
    return [
        matgen_parsed,
        cde_parsed,
        prev_matgen_parsed,
        next_matgen_parsed,
    ]
def __init__(self, db_name="matstract_db", local=True):
    """Connect to the Atlas database and load the relevance-classifier models.

    :param db_name: "matstract_db" selects the production DB, anything else testing
    :param local: passed through to AtlasConnection
    :raises FileNotFoundError: if any of the pickled model files is missing
    """
    db = "production" if db_name == "matstract_db" else "testing"
    self._db = AtlasConnection(local=local, db=db).db
    self.parser = parsing.MaterialParser()
    self.simple_parser = parsing.SimpleParser()
    self.mat_list = []
    # Map element names ("Iron") to symbols ("Fe"); assumes ELEMENT_NAMES
    # and ELEMENTS are index-aligned class attributes.
    self.elem_name_dict = dict()
    for i, elem in enumerate(self.ELEMENTS):
        self.elem_name_dict[self.ELEMENT_NAMES[i]] = elem

    # Model pickles live next to this module.
    models_location = os.path.join(os.path.dirname(os.path.abspath(__file__)), "")
    classifier_location = os.path.join(models_location, 'r_nr_classifier.p')
    cv_location = os.path.join(models_location, 'cv.p')
    tfidf_location = os.path.join(models_location, 'tfidf.p')

    # Load relevant/not-relevant classifier and vectorizers.
    # Fixed: the original used pickle.load(open(...)) which leaks the
    # file handles; context managers guarantee they are closed.
    with open(classifier_location, 'rb') as f:
        self.clf = pickle.load(f)
    with open(cv_location, 'rb') as f:
        self.cv = pickle.load(f)
    with open(tfidf_location, 'rb') as f:
        self.tfidf = pickle.load(f)
def __init__(self, filter_type, values):
    """Build a Mongo ``$match`` stage filtering ``filter_type`` by ``values``.

    :param filter_type: document field to filter on; "MAT" values are
        normalized through ``parsing.SimpleParser`` first
    :param values: iterable of filter tokens, or None for no filtering.
        A leading ``-`` marks a token for exclusion.
    """
    parser = parsing.SimpleParser()
    conditions = []
    if values is not None:
        include = set()
        exclude = set()
        for val in values:
            if val[0] == "-":
                # Leading "-" means "exclude this value".
                val = val[1:]
                parsed = parser.parse(val) if filter_type == "MAT" else val
                exclude.add(parsed if parsed else val)
            else:
                parsed = parser.parse(val) if filter_type == "MAT" else val
                include.add(parsed if parsed else val)
        # Fixed: the original combined branch built {"$or": {{...}, {...}}}
        # -- a set literal of dicts, which raises TypeError (dicts are
        # unhashable) as soon as both include and exclude were non-empty,
        # and "$or" is not valid inside a field expression anyway.
        # MongoDB accepts $in and $nin together in one operator document,
        # which is the intended AND semantics.
        field_query = {}
        if include:
            field_query["$in"] = list(include)
        if exclude:
            field_query["$nin"] = list(exclude)
        if field_query:
            conditions.append({"$match": {filter_type: field_query}})
    super().__init__(conditions)
def syntactical_features(self, word):
    '''
    Create syntax-based features for a token

    :param word: string containing the word for which syntactical features
        are generated
    :return: list of syntactical features (fixed order/length; see below)
    '''
    # Prefixes and suffixes of length 1-3.
    pre1 = word[:1]
    pre2 = word[:2]
    pre3 = word[:3]
    suf1 = word[-1:]
    suf2 = word[-2:]
    # Fixed: was word[-2:] (copy-paste), which duplicated suf2.
    suf3 = word[-3:]
    length = len(word)

    # Case / character-class flags.
    is_lower = 1 if word.islower() else 0
    # Fixed: was `word.upper()`, which is truthy for every non-empty word;
    # isupper() is the intended all-uppercase test.
    is_upper = 1 if word.isupper() else 0
    is_title = 1 if word.istitle() else 0
    is_digit = 1 if word.isdigit() else 0
    is_alnum = 1 if word.isalnum() else 0

    # Check if word is a number (handles floats, unlike isdigit).
    try:
        float(word)
        is_number = 1
    except ValueError:
        is_number = 0
    # NOTE(review): is_number is computed but deliberately NOT appended to
    # the feature vector, matching the original -- adding it would change
    # the vector length consumed by any trained model. Confirm intent.

    # Check if word is a chemical formula per pymatgen's parser.
    parser = parsing.SimpleParser()
    is_formula = 1 if parser.matgen_parser(word) else 0

    # Check if token is punctuation produced by tokenization.
    is_punct = 1 if word in string.punctuation else 0

    # Combine the features (order must stay stable for downstream models).
    features = [
        word, pre1, pre2, pre3, suf1, suf2, suf3, length, is_lower,
        is_upper, is_title, is_digit, is_alnum, is_formula, is_punct
    ]
    return features
def to_highlight(names_list, material):
    """Return ``material`` if any entry of ``names_list`` normalizes to it.

    Both sides are normalized with ``parsing.SimpleParser.matgen_parser``
    before comparison; None is returned when nothing matches.
    """
    parser = parsing.SimpleParser()
    for candidate in names_list:
        matched = parser.matgen_parser(candidate) == parser.matgen_parser(material)
        if matched:
            return material