Example No. 1
def getCodebookActivation():
    """
            Plot the codebook activation
            Get the SOM model id and return a PNG of the codebook activations
            ---
            consumes:
              - application/json
            produces:
              - application/json
            parameters:
              - in: query
                name: som_model_id
                type: string
                required: true
                description: id of SOM model
            responses:
              200:
                description: png of codebook activations
                schema:
                    type: object
                    properties:
                        picture:
                         type: string
              500:
                description: Internal Server Error
                schema:
                    type: object
                    properties:
                        error:
                         type: string
              299:
                description: Model is still training or not trained
                schema:
                    type: object
                    properties:
                        warning:
                         type: string
    """
    log.info("/analytics-backend/getCodebookActivation")
    som_model_id = request.args.get('som_model_id')

    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    png = som_ms.getCodebookActivation(som_model)

    # base64-encode the PNG so it can be shipped inside the JSON response

    import base64
    with open(png, "rb") as imageFile:
        s = base64.b64encode(imageFile.read())

    return json.dumps({'picture': s})
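A minimal client-side sketch of how this endpoint could be called; the host, port and model id are placeholder assumptions, and the route is taken from the log message above.

# Hypothetical client call; host, port and model id are placeholders.
import base64
import json
import requests

resp = requests.get(
    "http://localhost:5000/analytics-backend/getCodebookActivation",
    params={"som_model_id": "42"})
if resp.status_code == 200:
    # the PNG comes back base64-encoded under the "picture" key
    with open("codebook_activation.png", "wb") as f:
        f.write(base64.b64decode(json.loads(resp.text)["picture"]))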
Example No. 2
def trainCodebookClustering():
    """
            Train the codebook clustering model
            Get the id of the SOM model, return the id of the clustering model that will be trained, and start the training in a new thread (the training process can take hours)
            ---
            parameters:
              - in: body
                name: body
                schema:
                  type: object
                  properties:
                    som_model_id:
                      type: string
                      description: id of the SOM model
                required: true
            responses:
              200:
                description: id of the clustering model being trained
                schema:
                    type: object
                    properties:
                      codebook_cluster_model_id:
                        type: string
              500:
                description: Internal Server Error
                schema:
                    type: object
                    properties:
                        error:
                         type: string
              299:
                description: Model is still training or not trained
                schema:
                    type: object
                    properties:
                        warning:
                         type: string
    """
    log.info("/analytics-backend/trainCodebookClustering")
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)

    som_model_id = data_json["som_model_id"]
    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    identifier = core.utility.utilities.getUniqueIdentifier()
    thread.start_new_thread(som_ms.trainNewModelCodebookCluster,
                            (som_model, identifier))

    response = jsonify({"codebook_cluster_model_id": identifier})
    return response
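A sketch of kicking off the clustering job from a client; the URL and SOM model id are placeholder assumptions. Because the training runs in a background thread, the returned codebook_cluster_model_id is the handle later endpoints expect.

# Hypothetical client call; URL and ids are placeholders.
import requests

resp = requests.post(
    "http://localhost:5000/analytics-backend/trainCodebookClustering",
    json={"som_model_id": "42"})
if resp.status_code == 299:
    # the SOM itself is still training, so clustering cannot start yet
    print(resp.json().get("warning"))
else:
    cluster_id = resp.json()["codebook_cluster_model_id"]
    print("training started, codebook cluster model id: " + cluster_id)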
Example No. 3
def getCostOfSom():
    """
            Get the cost of the SOM
            Get the SOM model id and return the cost of the model
            ---
            parameters:
              - in: query
                name: som_model_id
                type: string
                required: true
                description: id of SOM model
            responses:
              200:
                description: cost of som
                schema:
                    type: object
                    properties:
                      cost of model:
                        type: string
              500:
                description: Internal Server Error
                schema:
                    type: object
                    properties:
                        error:
                         type: string
              299:
                description: Model is still training or not trained
                schema:
                    type: object
                    properties:
                        warning:
                         type: string
    """
    log.info("/analytics-backend/getCostOfSom")
    som_model_id = request.args.get('som_model_id')

    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    cost = som_ms.getCostOfSom(som_model)
    response = jsonify({"cost of model": cost})
    return response
Example No. 4
def keywordsExtraction():
    """
        Keywords Extraction
        Get the id of the bigram model and either the list of tweet messages or the url of a csv with tweet messages, and return the lists of keywords
        ---
        parameters:
          - in: body
            name: body
            schema:
              type: object
              properties:
                url_input:
                  type: string
                  description: url of csv with tweet messages
                bigram_model_id:
                  type: string
                  description: id of bigram model
                tweets:
                  type: array
                  items:
                    type: object
                    properties:
                      message:
                        type: string
                        description: tweet message
            required: true
        responses:
          200:
            description: lists of keywords
            schema:
              type: array
              items:
                type: array
                items:
                  type: string
          500:
            description: Internal Server Error
            schema:
                type: object
                properties:
                    error:
                     type: string
          299:
            description: Model is still training or not trained
            schema:
                type: object
                properties:
                    warning:
                     type: string
    """
    log.info("/analytics-backend/keywordsExtraction")
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)

    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)

        # DEBUGGING
        debugging = conf.get('MAIN', 'debugging')
        if (debugging == 'True'):
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
            df = df.head()

        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    bigram_model_id = data_json["bigram_model_id"]

    filename = conf.get('MAIN', 'path_pickle_bigram_model_incr_fold'
                        ) + "bigram_" + str(bigram_model_id) + ".pickle"
    try:
        bigram_model = som_ms.load_obj(filename)
    except:
        filename = conf.get(
            'MAIN', 'path_pickle_bigram_model_incr_fold') + "bigram_" + str(
                bigram_model_id) + "_training.txt"
        return returnModelStatus(filename, bigram_model_id)

    keywds = text_ranking_ms.extractKeywords(input_list, bigram_model)

    response = jsonify(keywds)
    return response
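A sketch of the two alternative payloads this endpoint accepts, either a csv url or an inline tweets array; the endpoint URL, the bigram model id and the messages are placeholder assumptions.

# Hypothetical client call; URL, id and messages are placeholders.
import requests

endpoint = "http://localhost:5000/analytics-backend/keywordsExtraction"

# variant 1: point the service at a csv that has a "message" column
payload_csv = {"bigram_model_id": "7",
               "url_input": "http://example.com/tweets.csv"}

# variant 2: send the tweet messages inline
payload_inline = {"bigram_model_id": "7",
                  "tweets": [{"message": "first tweet"},
                             {"message": "second tweet"}]}

keywords = requests.post(endpoint, json=payload_inline).json()
print(keywords)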
Example No. 5
def getCodebookWords():
    """
        Get all words associated with codebooks
        Get the id of the word2vec model, the id of the SOM model and the list of tweet messages, and return the lists of associated words
        ---
        parameters:
          - in: body
            name: body
            schema:
              type: object
              properties:
                w2v_model_id:
                  type: string
                  description: id of model to use for word embedding
                som_model_id:
                  type: string
                  description: id of SOM model
                tweets:
                  type: array
                  items:
                    type: object
                    properties:
                      message:
                        type: string
                        description: tweet message
            required: true
        responses:
          200:
            description: list of codebook words
            schema:
              type: object
              properties:
                0:
                  type: array
                  items:
                    type: string
                  description: codebook word
                1:
                  type: array
                  items:
                    type: string
                  description: codebook word
                ...:
                  type: array
                  items:
                    type: string
                  description: codebook word
                n:
                  type: array
                  items:
                    type: string
                  description: codebook word
          500:
            description: Internal Server Error
            schema:
                type: object
                properties:
                    error:
                     type: string
          299:
            description: Model is still training or not trained
            schema:
                type: object
                properties:
                    warning:
                     type: string
    """
    log.info("/analytics-backend/getCodebookWords")
    # reading json input
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)

    w2v_model_id = data_json["w2v_model_id"]
    som_model_id = data_json["som_model_id"]
    input_list = json.dumps(data_json["tweets"])
    input_list = pd.read_json(input_list, encoding='utf8')['message'].tolist()

    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        w2v_model = word2vec_ms.Word2Vec.load(filename)
    except:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
                w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    codebook2words = som_ms.getCodebookWords(input_list, w2v_model, som_model)
    response = jsonify(codebook2words)
    return response
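A sketch of calling the endpoint and walking the returned mapping of codebook cells to their words; the host, model ids and messages are placeholder assumptions.

# Hypothetical client call; ids and messages are placeholders.
import requests

resp = requests.post(
    "http://localhost:5000/analytics-backend/getCodebookWords",
    json={"w2v_model_id": "3",
          "som_model_id": "42",
          "tweets": [{"message": "first tweet"},
                     {"message": "second tweet"}]})

# the response maps each codebook cell index to the words assigned to it
for cell, words in resp.json().items():
    print(str(cell) + ": " + ", ".join(words))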
Example No. 6
def getCellFrequencyDistribution():
    """
        Plot frequency distribution
        Get the word2vec model id, the SOM model id, either the list of tweet messages or the url of a csv with messages, and the type of result ("bubble" or "bar"), and return the result graph
        ---
        consumes:
          - application/json
        produces:
          - application/json
        parameters:
          - in: body
            name: body
            schema:
              type: object
              properties:
                w2v_model_id:
                  type: string
                  description: id of model to use for word embedding
                som_model_id:
                  type: string
                  description: id of SOM model
                num:
                  type: string
                  description: number of variables sorted from biggest
                type_chart:
                  type: string
                  description: type of result "bubble" or "bar"
                url_input:
                  type: string
                  description: url of the csv with messages
                tweets:
                  type: array
                  items:
                    type: object
                    properties:
                      message:
                        type: string
                        description: tweet message
            required: true
        responses:
          200:
            description: graph of frequencies
            schema:
                type: object
                properties:
                    html:
                     type: string
          500:
            description: Internal Server Error
            schema:
                type: object
                properties:
                    error:
                     type: string
          299:
            description: Model is still training or not trained
            schema:
                type: object
                properties:
                    warning:
                     type: string
    """
    log.info("/analytics-backend/getCellFrequencyDistribution")
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)

    type_chart = data_json["type_chart"]
    num = data_json["num"]
    w2v_model_id = data_json["w2v_model_id"]
    som_model_id = data_json["som_model_id"]

    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)

        # DEBUGGING
        debugging = conf.get('MAIN', 'debugging')
        if (debugging == 'True'):
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
            df = df.head()

        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        w2v_model = word2vec_ms.Word2Vec.load(filename)
    except:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
                w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    html = som_ms.getCellFrequencyDistribution(input_list, w2v_model,
                                               som_model, num, type_chart)
    return json.dumps({'html': html})
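A sketch of requesting a bubble chart and writing the returned HTML fragment to disk so it can be opened in a browser; the ids, num and messages are placeholder assumptions.

# Hypothetical client call; ids, num and messages are placeholders.
import requests

resp = requests.post(
    "http://localhost:5000/analytics-backend/getCellFrequencyDistribution",
    json={"w2v_model_id": "3", "som_model_id": "42",
          "num": "20", "type_chart": "bubble",
          "tweets": [{"message": "first tweet"},
                     {"message": "second tweet"}]})
with open("frequency_distribution.html", "w") as f:
    f.write(resp.json()["html"])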
Example No. 7
def computeTopics():
    """
        Extracts topics from a list of tweets
        Get the word2vec model id, the SOM model id, the codebook cluster model id and the list of tweet messages, and return the topic graphs
        ---
        parameters:
          - in: body
            name: body
            schema:
              type: object
              properties:
                w2v_model_id:
                  type: string
                  description: id of model to use for word embedding
                som_model_id:
                  type: string
                  description: id of SOM model
                codebook_cluster_model_id:
                  type: string
                  description: id of codebook cluster model
                tweets:
                  type: array
                  items:
                    type: object
                    properties:
                      message:
                        type: string
                        description: tweet message
            required: true
        responses:
          200:
            description: graphs of topics
            schema:
              type: array
              items:
                type: object
                properties:
                  directed:
                    type: boolean
                  graph:
                    type: object
                  links:
                    type: array
                    items:
                      type: object
                      properties:
                        source:
                          type: integer
                        target:
                          type: integer
                  multigraph:
                    type: boolean
                  nodes:
                    type: array
                    items:
                      type: object
                      properties:
                        id:
                          type: integer
                        name:
                          type: string
                        pos:
                          type: array
                          items:
                            type: integer
          500:
            description: Internal Server Error
            schema:
                type: object
                properties:
                    error:
                     type: string
          299:
            description: Model is still training or not trained
            schema:
                type: object
                properties:
                    warning:
                     type: string
    """
    log.info("/analytics-backend/computeTopics")

    # remove old html topics
    import glob
    import os
    for fl in glob.glob("./templates/dried_*.html"):
        os.remove(fl)

    # reading json input
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)

    w2v_model_id = data_json["w2v_model_id"]
    som_model_id = data_json["som_model_id"]
    cluster_model_id = data_json["codebook_cluster_model_id"]
    input_list = json.dumps(data_json["tweets"])
    input_list = pd.read_json(input_list, encoding='utf8')['message'].tolist()

    # load models
    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        w2v_model = word2vec_ms.Word2Vec.load(filename)
    except:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
                w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    filename = conf.get(
        'MAIN', 'path_pickle_codebook_cluster_model_incr_fold'
    ) + "codebook_cluster_" + str(cluster_model_id) + ".pickle"
    try:
        cluster_model = som_ms.load_obj(filename)
    except:
        filename = conf.get(
            'MAIN', 'path_pickle_codebook_cluster_model_incr_fold'
        ) + "codebook_cluster_" + str(cluster_model_id) + "_training.txt"
        return returnModelStatus(filename, cluster_model_id)

    dried_topics = Topics.doSomAndDryTopics(input_list, w2v_model, som_model,
                                            cluster_model)
    graphs = Topics.predictTopics(input_list,
                                  w2v_model,
                                  som_model,
                                  cluster_model,
                                  dried_topics,
                                  type_chart="json")

    response = jsonify(graphs)
    return response
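A sketch of requesting the topic graphs and handling the 299 "still training" case; the three model ids are placeholders that would come from the corresponding training endpoints.

# Hypothetical client call; model ids and messages are placeholders.
import requests

resp = requests.post(
    "http://localhost:5000/analytics-backend/computeTopics",
    json={"w2v_model_id": "3",
          "som_model_id": "42",
          "codebook_cluster_model_id": "99",
          "tweets": [{"message": "first tweet"},
                     {"message": "second tweet"}]})

if resp.status_code == 299:
    # at least one of the three models is still training
    print(resp.json().get("warning"))
else:
    for graph in resp.json():  # one node-link graph per dried topic
        print("topic graph with " + str(len(graph["nodes"])) + " nodes")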
Example No. 8
def doSomAndPlot1():
    """
        Get entities: apply the SOM and plot the codebook MST
        Get the word2vec model id, the SOM model id, either the list of tweet messages or the url of a csv with messages, and the type of result, and return the result graph
        ---
        parameters:
          - in: body
            name: body
            schema:
              type: object
              properties:
                w2v_model_id:
                  type: string
                  description: id of model to use for word embedding
                som_model_id:
                  type: string
                  description: id of SOM model
                type_chart:
                  type: string
                  description: type of result, "d3" (html) or "json"
                url_input:
                  type: string
                  description: url of the csv with messages
                tweets:
                  type: array
                  items:
                    type: object
                    properties:
                      message:
                        type: string
                        description: tweet message
            required: true
        responses:
          200:
            description: graph of entities
            schema:
              type: object
              properties:
                directed:
                  type: boolean
                graph:
                  type: object
                links:
                  type: array
                  items:
                    type: object
                    properties:
                      source:
                        type: integer
                      target:
                        type: integer
                multigraph:
                  type: boolean
                nodes:
                  type: array
                  items:
                    type: object
                    properties:
                      id:
                        type: integer
                      name:
                        type: string
                      pos:
                        type: array
                        items:
                          type: integer
          500:
            description: Internal Server Error
            schema:
                type: object
                properties:
                    error:
                     type: string
          299:
            description: Model is still training or not trained
            schema:
                type: object
                properties:
                    warning:
                     type: string
    """
    log.info("/analytics-backend/doSomAndPlot")

    # reading json input
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)
    type_chart = data_json["type_chart"]
    w2v_model_id = data_json["w2v_model_id"]
    som_model_id = data_json["som_model_id"]

    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)

        # DEBUGGING
        debugging = conf.get('MAIN', 'debugging')
        if (debugging == 'True'):
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
            df = df.head()

        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        model = word2vec_ms.Word2Vec.load(filename)
    except:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
                w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    # get embedded words from input
    cleaned_tweet_list = clean_text_ms.cleanText(input_list)
    embedded_words, dict_index2word, dict_word2index = word2vec_ms.getEmbeddedWords(
        cleaned_tweet_list, model)

    filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                        ) + "som_" + str(som_model_id) + ".pickle"
    try:
        som_model = som_ms.load_obj(filename)
    except:
        filename = conf.get('MAIN', 'path_pickle_som_model_incr_fold'
                            ) + "som_" + str(som_model_id) + "_training.txt"
        return returnModelStatus(filename, som_model_id)

    file_name = conf.get('MAIN', 'MST_html_d3_output_file')
    response = som_ms.doSomAndPlot(som_model, embedded_words, dict_index2word,
                                   file_name, type_chart)

    if (type_chart == "d3"):
        return render_template('MST_d3.html')
    elif (type_chart == "json"):
        return jsonify(response)
    else:
        return internalServerError(500)
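A sketch of requesting the MST as a node-link JSON graph (type_chart "json"); with "d3" the service renders an HTML page instead. The host and model ids are placeholder assumptions, and the route is taken from the log message above.

# Hypothetical client call; ids and messages are placeholders.
import requests

graph = requests.post(
    "http://localhost:5000/analytics-backend/doSomAndPlot",
    json={"w2v_model_id": "3", "som_model_id": "42",
          "type_chart": "json",
          "tweets": [{"message": "first tweet"},
                     {"message": "second tweet"}]}).json()

print("nodes: " + str(len(graph["nodes"])) + ", links: " + str(len(graph["links"])))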
Example No. 9
def textRanking():
    """
        Text Ranking
        Get the ids of the bigram and w2v models and either the list of tweet messages or the url of a csv with tweet messages, and return the text ranking
        ---
        parameters:
          - in: body
            name: body
            schema:
              type: object
              properties:
                url_input:
                  type: string
                  description: url of csv with tweet messages
                bigram_model_id:
                  type: string
                  description: id of bigram model
                w2v_model_id:
                  type: string
                  description: id of word2vec model
                tweets:
                  type: array
                  items:
                    type: object
                    properties:
                      message:
                        type: string
                        description: tweet message
            required: true
        responses:
          200:
            description: text ranking
            schema:
              type: array
              items:
                type: array
                items:
                  type: string
          500:
            description: Internal Server Error
            schema:
                type: object
                properties:
                    error:
                     type: string
    """
    log.info("/analytics-backend/textRanking")
    data_json = json.dumps(request.get_json(silent=True))
    data_json = json.loads(data_json)

    if 'url_input' in data_json:
        url_input = data_json["url_input"]
        df = pd.read_csv(url_input)

        # DEBUGGING
        debugging = conf.get('MAIN', 'debugging')
        if (debugging == 'True'):
            document_path_file = conf.get('MAIN', 'path_document')
            df = pd.read_csv(document_path_file,
                             encoding='utf-8',
                             error_bad_lines=False)
            df = df.head(100)

        input_list = df['message'].tolist()
    else:
        input_list = json.dumps(data_json["tweets"])
        input_list = pd.read_json(input_list,
                                  encoding='utf8')['message'].tolist()

    w2v_model_id = data_json["w2v_model_id"]
    filename = conf.get('MAIN', 'path_pickle_w2v_model_incr_fold'
                        ) + "word2vec_" + str(w2v_model_id) + ".pickle"
    try:
        #w2v_model = som_ms.load_obj(filename)
        w2v_model = Word2Vec.load(filename)
    except:
        filename = conf.get(
            'MAIN', 'path_pickle_w2v_model_incr_fold') + "word2vec_" + str(
                w2v_model_id) + "_training.txt"
        return returnModelStatus(filename, w2v_model_id)

    bigram_model_id = data_json["bigram_model_id"]
    filename = conf.get('MAIN', 'path_pickle_bigram_model_incr_fold'
                        ) + "bigram_" + str(bigram_model_id) + ".pickle"
    try:
        bigram_model = som_ms.load_obj(filename)
    except:
        filename = conf.get(
            'MAIN', 'path_pickle_bigram_model_incr_fold') + "bigram_" + str(
                bigram_model_id) + "_training.txt"
        return returnModelStatus(filename, bigram_model_id)
    print "input list len" + str(len(input_list))
    keywds = text_ranking_ms.extractKeywords(input_list, bigram_model)
    print "keywds list len" + str(len(keywds))
    topics = TopicLabeler.textRanking(keywds, w2v_model)

    response = jsonify(topics)
    return response