示例#1
0
 def __init__(self, materials):
     """Build the ``$match`` stages for a material filter.

     :param materials: whitespace-separated material tokens, or ``None`` for
         no filtering. One trailing comma per token is dropped; a leading
         ``-`` puts the material on the exclusion list (``$nin``), otherwise
         it goes on the inclusion list (``$in``).
     """
     parser = parsing.SimpleParser()
     include = []
     exclude = []
     if materials is not None:
         for material in materials.split():
             if material.endswith(","):
                 material = material[:-1]
             negated = material.startswith("-")
             if negated:
                 material = material[1:]
             if not material:
                 # A bare "," or "-" token (e.g. from "Fe , O") carries no
                 # material name; indexing it used to raise IndexError.
                 continue
             # Fall back to the raw token when the parser yields nothing.
             target = exclude if negated else include
             target.append(parser.parse(material) or material)

     conditions = []
     if include:
         conditions.append({"$match": {"unique_mats": {"$in": include}}})
     if exclude:
         conditions.append({"$match": {"unique_mats": {"$nin": exclude}}})
     super().__init__(conditions)
示例#2
0
def get_search_results(search="", material="", max_results=10000):
    """Query abstracts by free-text search, by material, or by both.

    :param search: free-text query passed to ``find_similar``; ``None`` is
        treated as an empty string
    :param material: material string normalised with ``matgen_parser`` and
        matched against ``normalized_cems``; ``None`` is treated as empty
    :param max_results: forwarded to ``find_similar``
    :return: ``None`` when both inputs are empty, otherwise a list of the
        matching documents
    """
    # The parser is only instantiated when a material argument was supplied.
    parser = parsing.SimpleParser() if material is not None else None
    if material is None:
        material = ''
    if search is None:
        search = ''

    if search == '' and material == '':
        return None

    results = None
    if search and material:
        # Restrict the material match to the first 1000 similar documents.
        ids = find_similar(search, max_results)[0:1000]
        by_id = {"$match": {"_id": {"$in": ids}}}
        by_material = {
            "$match": {"normalized_cems": parser.matgen_parser(material)}
        }
        results = db.abstracts_leigh.aggregate([by_id, by_material])
    elif search:
        ids = find_similar(search, max_results)
        cursor = db.abstracts.find({"_id": {"$in": ids[0:1000]}})
        results = sort_results(cursor, ids)
    elif material:
        query = {"normalized_cems": parser.matgen_parser(material)}
        results = db.abstracts_leigh.find(query)
    return list(results)
示例#3
0
def to_highlight(names_list, material):
    """Return the first listed name that parses to the same result as *material*.

    :param names_list: iterable of mappings; only entries with a ``'names'``
        key are considered, and only the first element of that value is
        compared
    :param material: material string to compare against
    :return: the matching ``entry['names'][0]``, or ``None`` if nothing matches
    """
    parser = parsing.SimpleParser()
    for entry in names_list:
        if 'names' not in entry.keys():
            continue
        if parser.matgen_parser(
                entry['names'][0]) == parser.matgen_parser(material):
            return entry['names'][0]
    return None
示例#4
0
    def is_material_features(self,
                             word,
                             chems_in_sent,
                             sent,
                             idx,
                             NE_tagged=True):
        '''
        Build binary "looks like a material" features for a token

        :param word: the token under consideration
        :param chems_in_sent: container of chemical mentions; membership of
            word is tested with ``in``
        :param sent: the sentence, passed through to self.get_feature
        :param idx: position of word in sent; the neighbours at idx - 1 and
            idx + 1 are looked up
        :param NE_tagged: forwarded to self.get_feature; when true each
            feature is unpacked as ((word, pos), ne), otherwise as (word, pos)
        :return: [matgen_parsed, cde_parsed, prev_matgen_parsed,
            next_matgen_parsed], each 0 or 1
        '''
        parser = parsing.SimpleParser()

        def parses(token):
            # 1 when the pymatgen-based parser returns something truthy
            return 1 if parser.matgen_parser(token) else 0

        def neighbour_word(offset):
            # Fetch the feature at idx + offset and keep only its word part
            feature = self.get_feature(sent, idx + offset, NE_tagged)
            if NE_tagged:
                (token, _pos), _ne = feature
            else:
                token, _pos = feature
            return token

        # The word itself
        matgen_parsed = parses(word)

        # The words immediately before and after
        previous_word = neighbour_word(-1)
        next_word = neighbour_word(1)
        prev_matgen_parsed = parses(previous_word)
        next_matgen_parsed = parses(next_word)

        # Is the word one of the chemical mentions supplied by the caller?
        cde_parsed = 1 if word in chems_in_sent else 0

        # Disabled features (not part of the vector): a check with a second
        # material parser ("olga_parsed") and the chems_in_sent membership
        # of the previous/next words.
        return [
            matgen_parsed,
            cde_parsed,
            prev_matgen_parsed,
            next_matgen_parsed,
        ]
    def __init__(self, db_name="matstract_db", local=True):
        '''
        Connect to the database, create the parsers and load the pickled models

        :param db_name: "matstract_db" selects the "production" database; any
            other value selects "testing"
        :param local: forwarded to AtlasConnection
        '''
        db = "production" if db_name == "matstract_db" else "testing"
        self._db = AtlasConnection(local=local, db=db).db
        self.parser = parsing.MaterialParser()
        self.simple_parser = parsing.SimpleParser()
        self.mat_list = []

        # Map each element name to the entry at the same index in ELEMENTS
        self.elem_name_dict = dict()
        for i, elem in enumerate(self.ELEMENTS):
            self.elem_name_dict[self.ELEMENT_NAMES[i]] = elem

        # The model files live in the same directory as this module
        models_location = os.path.dirname(os.path.abspath(__file__))

        def load_model(filename):
            # Use a context manager so the file handle is closed right after
            # unpickling instead of being left to the garbage collector.
            # NOTE: pickle.load can execute arbitrary code; these files must
            # only ever come from a trusted source.
            with open(os.path.join(models_location, filename), 'rb') as handle:
                return pickle.load(handle)

        # load in relevant/not-relevant classifier and vectorizers
        self.clf = load_model('r_nr_classifier.p')
        self.cv = load_model('cv.p')
        self.tfidf = load_model('tfidf.p')
示例#6
0
 def __init__(self, filter_type, values):
     """Build the ``$match`` stage for an include/exclude filter on one field.

     :param filter_type: name of the document field to filter on; when it is
         ``"MAT"`` each value is first passed through the parser
     :param values: iterable of values, or ``None`` for no filtering. A
         leading ``-`` puts the value on the exclusion list; empty values
         are skipped.
     """
     parser = parsing.SimpleParser()
     include = set()
     exclude = set()
     if values is not None:
         for val in values:
             if not val:
                 # Nothing to filter on; indexing it would raise IndexError.
                 continue
             negated = val[0] == "-"
             if negated:
                 val = val[1:]
             parsed = parser.parse(val) if filter_type == "MAT" else val
             # Fall back to the raw value when the parser yields nothing.
             (exclude if negated else include).add(parsed if parsed else val)

     conditions = []
     if include and exclude:
         # "$or" is a top-level operator taking a list of full expressions.
         # The previous form nested it under the field and built a set of
         # dicts, which raises TypeError (dicts are unhashable).
         # NOTE(review): "$or" is kept as originally written; confirm that
         # "include AND NOT exclude" was not the intended semantics.
         conditions.append({
             "$match": {
                 "$or": [
                     {filter_type: {"$in": list(include)}},
                     {filter_type: {"$nin": list(exclude)}},
                 ]
             }
         })
     elif include:
         conditions.append(
             {"$match": {filter_type: {"$in": list(include)}}})
     elif exclude:
         conditions.append(
             {"$match": {filter_type: {"$nin": list(exclude)}}})
     super().__init__(conditions)
示例#7
0
    def syntactical_features(self, word):
        '''
        Create syntax-based features for a token

        :param word: string containing the word for which syntactical
            features are generated
        :return: list of syntactical features, in the order [word, pre1,
            pre2, pre3, suf1, suf2, suf3, length, is_lower, is_upper,
            is_title, is_digit, is_alnum, is_formula, is_punct]
        '''
        # Prefixes and suffixes of up to three characters
        pre1 = word[:1]
        pre2 = word[:2]
        pre3 = word[:3]
        suf1 = word[-1:]
        suf2 = word[-2:]
        # Was word[-2:], which merely duplicated suf2
        suf3 = word[-3:]
        length = len(word)

        # Character-class flags
        is_lower = 1 if word.islower() else 0
        # Was word.upper(), which is truthy for every non-empty word
        is_upper = 1 if word.isupper() else 0
        is_title = 1 if word.istitle() else 0
        is_digit = 1 if word.isdigit() else 0
        is_alnum = 1 if word.isalnum() else 0

        # NOTE: a float()-based "is_number" flag used to be computed here but
        # was never added to the feature list; it is left out so the length
        # of the returned vector stays the same.

        # Check if word is a chemical formula
        parser = parsing.SimpleParser()
        is_formula = 1 if parser.matgen_parser(word) else 0

        # Check if the token is punctuation split off by tokenization
        is_punct = 1 if word in string.punctuation else 0

        # Combine the features
        features = [
            word, pre1, pre2, pre3, suf1, suf2, suf3, length, is_lower,
            is_upper, is_title, is_digit, is_alnum, is_formula, is_punct
        ]
        return features
示例#8
0
def to_highlight(names_list, material):
    """Return *material* if any listed name parses to the same result as it.

    :param names_list: iterable of name strings to compare
    :param material: material string to compare against
    :return: ``material`` when at least one name matches, otherwise ``None``
    """
    parser = parsing.SimpleParser()
    has_match = any(
        parser.matgen_parser(candidate) == parser.matgen_parser(material)
        for candidate in names_list)
    return material if has_match else None