Example #1
def get_analogy(n_clicks, pos_1, neg_1, pos_2):
    # require a button click and three non-empty inputs
    if n_clicks is not None and all([pos_1, neg_1, pos_2]):
        ee = EmbeddingEngine()
        # process and phrase each input the same way the corpus was processed
        pos_1 = ee.phraser[ee.dp.process_sentence(pos_1.split())[0]]
        neg_1 = ee.phraser[ee.dp.process_sentence(neg_1.split())[0]]
        pos_2 = ee.phraser[ee.dp.process_sentence(pos_2.split())[0]]
        pos_1_vec = ee.get_word_vector(pos_1[0])
        neg_1_vec = ee.get_word_vector(neg_1[0])
        pos_2_vec = ee.get_word_vector(pos_2[0])
        if pos_1_vec is not None and neg_1_vec is not None and pos_2_vec is not None:
            # analogy arithmetic: pos_2 + pos_1 - neg_1, scaled to unit length
            diff_vec = pos_2_vec + pos_1_vec - neg_1_vec
            norm_diff = diff_vec / np.linalg.norm(diff_vec)
            close_words = ee.close_words(norm_diff, exclude_self=False)[0]
            # return the closest word that is not one of the inputs
            for close_word in close_words:
                if close_word not in [pos_1[0], neg_1[0], pos_2[0]]:
                    return close_word.replace("_", " ")
    return "?"
Example #2
    def get_relevant_materials(_, search_text, n_search_text, plus_elems, minus_elems):
        if search_text is not None and search_text != "":
            ee = EmbeddingEngine()

            # the positive word vectors
            sentence = ee.phraser[ee.dp.process_sentence(search_text.split())]

            # the negative word vectors
            n_sentence = ee.phraser[ee.dp.process_sentence(n_search_text.split())] \
                if n_search_text is not None and len(n_search_text) > 0 else None

            # finding materials sorted by similarity
            most_similar = ee.find_similar_materials(
                sentence=sentence,
                n_sentence=n_sentence,
                min_count=15,
                use_output_emb=True)

            # filtering the results by elements and returning top 50
            elem_filtered = ee.filter_by_elements(most_similar, plus_elems, minus_elems, max=50)

            # display the top 50 results in their most common written form
            matlist = ee.most_common_form(elem_filtered[:50])
            material_names, material_scores, material_counts, _ = zip(*matlist)
            return matlist_figure(
                [number_to_substring(name) for name in material_names],
                material_scores,
                material_counts)
        else:
            return ""
Example #3
def get_similar_words(_, word):
    if word is not None and word != "":
        ee = EmbeddingEngine()
        close_words, scores = ee.close_words(word)
        # one line per similar word: "(score) word"
        return [
            html.Span([
                "({:.2f}) {}".format(scores[i],
                                     close_word.replace("_", " ")),
                html.Br()
            ]) for i, close_word in enumerate(close_words)
        ]
    else:
        return ""
Example #4
class MatSearch(Resource):
    EE = EmbeddingEngine()

    @require_api_key
    def get(self, wordphrase, top_k=100):
        try:
            response = {
                "valid_response": True,
                "response": {
                    'original_wordphrase': wordphrase,
                    'materials': self.EE.find_similar_materials(
                        wordphrase, min_count=10)[0:top_k]
                }
            }
            status_code = status.HTTP_200_OK
        except Exception:
            response = {
                "valid_response": False,
                "error": "Could not find similar materials."
            }
            status_code = status.HTTP_400_BAD_REQUEST
        response = jsonify(response)
        response.status_code = status_code
        return response
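
Calling the resource from a client depends on how MatSearch is registered with flask_restful's Api and on how require_api_key reads the key; the route and header name below are assumptions for illustration only:

import requests

# hypothetical usage: the route and the API-key header name are assumptions
resp = requests.get(
    "http://localhost:5000/matsearch/thermoelectric",
    headers={"X-Api-Key": "<your-key>"},
)
data = resp.json()
if data["valid_response"]:
    print(data["response"]["materials"][:5])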
Example #5
    def phrase_tokens(self):
        def ungroup_tokens(toks):
            new_toks = []
            for t_r in toks:
                new_toks.append([])
                for t in t_r:
                    for ii, elem in enumerate(t["text"]):
                        new_toks[-1].append({"text": elem,
                                             "pos": t["pos"][ii],
                                             "annotation": t["annotation"]})
            return new_toks

        grouped_toks = self.group_and_process()
        ee = EmbeddingEngine()
        for row_idx, tokenRow in enumerate(grouped_toks):
            for idx, token in enumerate(tokenRow):
                grouped_toks[row_idx][idx]["text"] = ee.phraser[ee.dp.process_sentence(
                    token["text"] if type(token["text"]) is list else [token["text"]])]
                # merge pos tags for words the phraser joined with "_"
                new_pos_tags = []
                consumed = 0  # number of original tags consumed so far
                for tok in grouped_toks[row_idx][idx]["text"]:
                    n_merged = len(tok.split("_"))
                    new_pos_tags.append("_".join(
                        grouped_toks[row_idx][idx]["pos"][consumed:consumed + n_merged]))
                    consumed += n_merged
                grouped_toks[row_idx][idx]["pos"] = new_pos_tags
        return ungroup_tokens(grouped_toks)
Example #6
class EmbeddingResource(Resource):
    EE = EmbeddingEngine()
    embedding_schema = EmbeddingSchema()
    embeddings_schema = EmbeddingSchema(many=True)

    def _prepare_response(self, wordphrases):
        try:
            embeddings = []
            for wp in wordphrases:
                # first try the whole wordphrase as a single vocabulary entry
                embedding = Embedding(wp,
                                      '',
                                      self.EE.get_word_vector(wp),
                                      compound=False)
                if " " in wp and embedding.embedding is None:
                    # multi-word phrase missing from the vocabulary:
                    # retry as a compound
                    embeddings.append(
                        Embedding(wp,
                                  '',
                                  self.EE.get_word_vector(wp),
                                  compound=True))
                else:
                    embeddings.append(embedding)
            response = {
                "valid_response": True,
                "response": self.embeddings_schema.dump(embeddings)
            }
            status_code = status.HTTP_200_OK
        except Exception:
            response = {
                "valid_response": False,
                "error": "Could not compute embeddings."
            }
            status_code = status.HTTP_400_BAD_REQUEST
        response = jsonify(response)
        response.status_code = status_code
        return response

    @require_api_key
    def get(self, wordphrase):
        wps = wordphrase.split(',')
        return self._prepare_response(wps)

    @require_api_key
    def post(self):
        json_data = request.get_json(force=True)
        try:
            wordphrases = json_data["wordphrase"]
            wps = wordphrases.split(',')
            return self._prepare_response(wps)

        except KeyError:
            response = {
                "valid_response": False,
                "error": 'Provided JSON does not contain a "wordphrase" key.'
            }
            response = jsonify(response)
            response.status_code = status.HTTP_400_BAD_REQUEST
            return response
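
The POST handler expects a JSON body whose "wordphrase" field holds one or more comma-separated phrases. A hypothetical client call (the route and API-key header name are assumptions; the body shape comes from the handler above):

import requests

# hypothetical usage: route and API-key header are assumptions
resp = requests.post(
    "http://localhost:5000/embeddings",
    json={"wordphrase": "band gap,thermoelectric"},
    headers={"X-Api-Key": "<your-key>"},
)
print(resp.json())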
Example #7
def get_similar_words(_, word):
    if word is not None and word != "":
        ee = EmbeddingEngine()
        close_words, scores = ee.close_words(word, top_k=8)
        rows = [{
            "#": i + 1,
            'Words and phrases similar to "{}"'.format(word):
                w.replace("_", " "),
            # truncate (not round) the score to three decimal places
            "Cosine similarity": int(scores[i] * 1000) / 1000
        } for i, w in enumerate(close_words)]
        return dt.DataTable(rows=rows,
                            row_selectable=False,
                            filterable=False,
                            editable=False,
                            sortable=False,
                            column_widths=[25, None, 140],
                            id='analogies_table')
    else:
        return ""
Example #8
    def __init__(self):
        """
        The constructor for the Cluster Plot object
        :param entity_type: 'all' or 'materials'
        :param limit: number of most common entities to plot
        :param heatphrase: color according to similarity to this phrase
        :param wordphrases: filter to show only the specified phrases
        """

        ds = np.DataSource()
        material_coords_url = "https://s3-us-west-1.amazonaws.com/materialsintelligence/final_material_map_atl10_30_ee12_lr200.npy"

        ds.open(material_coords_url)  # download and cache the coordinate file

        self.ee = EmbeddingEngine()
        self.embs = self.ee.embeddings / self.ee.norm

        self.materials_tsne_data = np.load(ds.abspath(material_coords_url))
        # total mention count per formula, summed over the inner count dict
        formula_counts = dict()
        for formula in self.ee.formulas_full:
            formula_counts[formula] = sum(self.ee.formulas_full[formula].values())

        # keep formulas mentioned at least 10 times, most frequent first
        mat_counts = sorted(formula_counts.items(), key=lambda x: x[1], reverse=True)
        mat_counts = [mat_count for mat_count in mat_counts if mat_count[1] >= 10]

        self.norm_matnames = [m[0] for m in mat_counts]
        self.matname2index = dict()
        for i, label in enumerate(self.norm_matnames):
            self.matname2index[label] = i
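
The counting and filtering logic can be exercised on its own. A minimal sketch with a toy formulas_full mapping (the real inner keys come from EmbeddingEngine and are not shown above; the toy data is made up):

# toy stand-in for self.ee.formulas_full: formula -> {key: count}
formulas_full = {
    "LiFePO4": {"a": 8, "b": 5},
    "NaCl": {"a": 4},
}

# total mentions per formula
formula_counts = {f: sum(v.values()) for f, v in formulas_full.items()}

# keep formulas with at least 10 mentions, most frequent first
mat_counts = sorted(formula_counts.items(), key=lambda x: x[1], reverse=True)
mat_counts = [mc for mc in mat_counts if mc[1] >= 10]

norm_matnames = [m[0] for m in mat_counts]
matname2index = {label: i for i, label in enumerate(norm_matnames)}
print(norm_matnames, matname2index)  # ['LiFePO4'] {'LiFePO4': 0}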
Example #9
    def phrase_tokens(self):
        def ungroup_tokens(toks):
            new_toks = []
            for t_r in toks:
                new_toks.append([])
                for t in t_r:
                    for ii, elem in enumerate(t["text"]):
                        new_toks[-1].append({"text": elem,
                                             "pos": t["pos"][ii],
                                             "annotation": t["annotation"]})
            return new_toks

        grouped_toks = self.group_and_process()
        ee = EmbeddingEngine()
        for row_idx, tokenRow in enumerate(grouped_toks):
            for idx, token in enumerate(tokenRow):
                # processing the sentence
                processed_sentence, split_indices = ee.dp.process_sentence(
                    token["text"] if isinstance(token["text"], list) else [token["text"]])
                grouped_toks[row_idx][idx]["text"] = ee.phraser[processed_sentence]

                # some tokens are split during processing so need to update pos tags
                processed_pos = []
                for ii, pos in enumerate(grouped_toks[row_idx][idx]["pos"]):
                    processed_pos += [pos] if ii not in split_indices else [pos, pos]
                grouped_toks[row_idx][idx]["pos"] = processed_pos

                # merge pos tags for words the phraser joined with "_"
                new_pos_tags = []
                consumed = 0  # number of original tags consumed so far
                for tok in grouped_toks[row_idx][idx]["text"]:
                    n_merged = len(tok.split("_"))
                    new_pos_tags.append("_".join(
                        grouped_toks[row_idx][idx]["pos"][consumed:consumed + n_merged]))
                    consumed += n_merged
                grouped_toks[row_idx][idx]["pos"] = new_pos_tags
        return ungroup_tokens(grouped_toks)
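
The subtle part is the pos-tag bookkeeping: when the phraser joins n tokens with underscores, the n corresponding tags must be joined the same way, and the read position into the original tag list must advance by n. The merging step in isolation, with toy tokens and tags:

phrased = ["the", "band_gap", "increases"]
pos = ["DT", "NN", "NN", "VBZ"]  # one tag per pre-phrasing token

new_pos_tags = []
consumed = 0  # original tags consumed so far
for tok in phrased:
    n = len(tok.split("_"))
    new_pos_tags.append("_".join(pos[consumed:consumed + n]))
    consumed += n

print(new_pos_tags)  # ['DT', 'NN_NN', 'VBZ']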
Example #10
class Synonyms(Resource):
    EE = EmbeddingEngine()

    @require_api_key
    def get(self, wordphrase, top_k=8):
        try:
            response = {
                "valid_response": True,
                "response": {
                    'original_wordphrase': wordphrase,
                    'synonyms': self.EE.close_words(wordphrase, top_k)
                }
            }
            status_code = status.HTTP_200_OK
        except Exception:
            response = {
                "valid_response": False,
                "error": "Could not get synonyms."
            }
            status_code = status.HTTP_400_BAD_REQUEST
        response = jsonify(response)
        response.status_code = status_code
        return response
Example #11
from flask import Flask, request, jsonify
from flask_restful import Api
from stract.api.models import *
from matstract.models.database import AtlasConnection
from matstract.models.word_embeddings import EmbeddingEngine
from matstract.models.cluster_plot import ClusterPlot
from matstract.models.search import Search
from matstract.models.similar_materials import SimilarMaterials
from matstract.models.errors import *
import json
db = AtlasConnection()
ee = EmbeddingEngine()

app = Flask(__name__)
api = Api(app)

cp = ClusterPlot()


# test endpoint
@app.route('/api/test/<message>', methods=["GET"])
def test_api(message):
    messages = message.split(',')
    test = [APITest(m) for m in messages]
    return TestSchema(many=True).jsonify(test)


# abstract retrieval endpoint
@app.route('/api/abstracts/<abstract_id>', methods=["GET"])
def retrieve_abstracts(abstract_id):
    abstract_ids = abstract_id.split(',')