示例#1
0
def saveDictation():
    '''Endpoint that saves a sound file for later acoustic adaptation.

        Body:
            cookie: Cookie of current user.
        Files:
            url: Sound file.

        '''
    # Resolve the logged-in user from the connection cookie.
    cookie = request.form['cookie']
    connection = database.find_one('connections', {'_id': cookie})
    email_name = connection['email_name']
    text = request.form['text']
    url = request.files['url']

    # Ensure the per-user output tree ./data/<email>/wav exists.
    user_dir = os.path.join('./data', email_name)
    wav_dir = os.path.join(user_dir, 'wav')
    for directory in (user_dir, wav_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Track how many dictations this user has saved so far.
    record = database.find_one('savedDictations', {'_id': email_name})
    if record is None:
        counter = 0
        database.insert_one('savedDictations', {
            '_id': email_name,
            'num': counter
        })
    else:
        counter = record['num'] + 1
        database.update_one('savedDictations', {'_id': email_name},
                            {"$set": {
                                'num': counter
                            }})

    # Append this dictation's id to the per-user id list.
    with open(os.path.join(user_dir, 'ids'), 'a') as ids_file:
        ids_file.write(str(counter) + '\n')
    # Append the transcription in the <s> ... </s> (id) format.
    clean_text = text.strip('\n')
    with open(os.path.join(user_dir, 'transcriptions'), 'a') as trans_file:
        trans_file.write('<s> ' + clean_text + ' </s> ' + ' (' + str(counter) +
                         ')' + '\n')

    # Persist the uploaded audio as <counter>.wav.
    url.save(os.path.join(wav_dir, str(counter) + '.wav'))
    return {'message': 'OK'}
示例#2
0
 def record(self, entries):
     """
     Insert each ('txhash', 'start', 'end') triple into the db.

     :param entries: iterable of indexable triples
     :return: result of the last db.insert_one call, or None if *entries* is empty
     """
     result = None
     for entry in entries:
         document = {
             'txhash': entry[0],
             'start': entry[1],
             'end': entry[2],
         }
         result = db.insert_one(document)
     return result
示例#3
0
def s_auth():
    """Handle a signup post: register the user or render the failure page."""
    form = request.form
    username = form.get('username')
    password = form.get('password')
    password2 = form.get('password2')

    is_successful, user = authentication.signup(username, password, password2)
    app.logger.info('%s', is_successful)

    # Guard clause: failed signup short-circuits to the failure template.
    if not is_successful:
        return render_template('registerfail.html')

    session["user"] = user
    db.insert_one(user)
    return redirect('/')
示例#4
0
def save_data(links, browser):
    """Save every article in *links*, then keep paging until end_page.

    Each article is persisted to the DB and/or a file depending on the
    save_db / save_file flags; pagination advances the global start_page.
    """
    global PAGE_TOTAL
    global start_page
    global db
    global end_page
    global save_db
    global save_file

    # Iterative form of the original page-by-page traversal.
    while True:
        for index, link in enumerate(links, start=1):
            infor = get_article_infor(browser, link)
            article_number = PAGE_TOTAL * (start_page - 1) + index
            file_name = 'Article-' + str(article_number)
            if save_db == 'yes':
                insert_one(db, COLLECTION, infor)
            if save_file == 'yes':
                save_to_file(infor, file_name)
            print('%d. Saved: %s' % (article_number, infor['title']))

        start_page = start_page + 1
        if start_page > int(end_page):
            break
        links = get_article_links(browser)
示例#5
0
def getEmails():
    '''Endpoint that returns the sent emails of a Gmail user.

        Registers a new connection for the given cookie, then either
        returns the user's cached messages from the database or fetches,
        cleans and caches the SENT mailbox via the Gmail REST API.

        Body:
            cookie: Cookie of current user.
            token: Authentication token from Gmail API.
            keep: If true, keep current user's data after log out.
        '''

    data = request.form
    token = data['token']
    cookie = data['cookie']
    keep = data['keep']
    # Get user's info and save a new connection in the database.
    email_name, name, picture = getInfo(token)
    database.insert_one('connections', {
        '_id': cookie,
        'email_name': email_name,
        'keep': keep
    })

    # If user's data exist, return the cached emails instead of re-fetching.
    res = database.find_one('users', {'_id': email_name})
    if res is not None:
        res = database.find_one('messages', {'_id': email_name})
        if res is not None:
            messages = res['messages']
            return jsonify(messages)
        else:
            # In case data are not synchronized correctly: a 'users' record
            # without a matching 'messages' record is stale, so drop it and
            # fall through to a fresh fetch.
            database.delete_one('users', {'_id': email_name})

    database.insert_one('users', {
        '_id': email_name,
        'name': name,
        'picture': picture
    })
    # Send get request in gmail api to list the ids of the user's SENT emails.
    read_endpoint = "https://www.googleapis.com/gmail/v1/users/userId/messages"
    headers = {
        'Authorization': 'Bearer ' + token,
        'Accept': 'application/json'
    }
    read_response = requests.get(read_endpoint,
                                 headers=headers,
                                 params={
                                     'userId': 'me',
                                     'labelIds': ['SENT']
                                 })
    # NOTE(review): .get('messages') may return None on an empty mailbox or
    # API error, which would make the loop below raise — verify upstream.
    messages = read_response.json().get('messages')
    clean_messages = []
    for idx, message in enumerate(messages):
        # Fetch the full message based on its id, in raw (base64url) format.
        get_endpoint = "https://www.googleapis.com/gmail/v1/users/userId/messages/id"
        get_response = requests.get(get_endpoint,
                                    headers=headers,
                                    params={
                                        'userId': 'me',
                                        'id': message['id'],
                                        'format': 'raw'
                                    })
        raw_msg = get_response.json().get("raw")
        # Decode using ISO-8859-7 (Greek); presumably the expected mailbox
        # language — confirm against the deployment's user base.
        string_message = str(base64.urlsafe_b64decode(raw_msg), "ISO-8859-7")
        # Convert current message to mime format.
        mime_msg = email.message_from_string(string_message)
        # Convert current message from mime to string.
        body, msg_headers = mime2str(mime_msg)
        # Clean current message.
        proccesed_body = process_text(body)
        size = len(msg_headers)
        # msg_headers[0] is treated as sender and msg_headers[2] as subject;
        # missing headers are replaced with a single-space placeholder.
        clean_messages.append({
            'body': body,
            'processed_body': proccesed_body,
            'sender': (msg_headers[0] if size > 0 else " "),
            'subject': (msg_headers[2] if size > 2 else " ")
        })

    # Save user's emails in database and return them.
    database.insert_one('messages', {
        '_id': email_name,
        'messages': clean_messages
    })
    return jsonify(clean_messages)
示例#6
0
def getClusters():
    '''Endpoint that clusters the emails.

        Vectorizes the user's (optionally sentence-split) emails, runs
        k-means (choosing k automatically when n_clusters is empty),
        persists the clustering, extracts per-cluster keywords, and builds
        per-cluster SRILM language models on disk.

        Body:
            cookie: Cookie of current user.
            metric: Metric to be used for closest point calculation.
            n_clusters: Number of clusters.
            method: Method of selecting number of clusters to be used (knee, silhouette).
            min_cl: Min number of clusters.
            max_cl: Max number of clusters.
            level: Level of clustering (per sentence or per email).
        '''
    data = request.form
    cookie = data['cookie']
    metric = data['metric']
    n_clusters = data['n_clusters']
    method = data['method']
    min_cl = int(data['min_cl'])
    max_cl = int(data['max_cl'])
    level = data['level']

    # Get current user.
    res = database.find_one('connections', {'_id': cookie})
    email_name = res['email_name']

    # Get messages of current user.
    res = database.find_one('messages', {'_id': email_name})
    messages_col = res['messages']
    # Keep them as individual sentences if asked to; otherwise join each
    # message's processed sentences into one document per email.
    emails = []
    for msg in messages_col:
        if level == "sentence":
            emails.extend(msg['processed_body'])
        else:
            emails.append(" ".join(msg['processed_body']))

    # Represent them as vectors.
    X = get_spacy(emails, nlp)

    # An empty n_clusters means "choose k automatically".
    if n_clusters == "":
        # Get metrics in different number of clusters (range [min_cl, max_cl]).
        sse, silhouette = get_metrics(X, min_cl, max_cl)
        if method == 'elbow':
            n_clusters = find_knee(sse, min_cl)
        else:
            n_clusters = silhouette_analysis(silhouette, min_cl)
    # Run k-means with given number of clusters.
    n_clusters = int(n_clusters)
    labels, centers = run_kmeans(X, n_clusters)

    # Save computed clusters in filesystem under ./data/<email>/clusters.
    out = os.path.join('./data', os.path.join(email_name, 'clusters'))
    save_clusters(emails, labels, os.path.join(email_name, 'clusters'))
    cluster2text(out, n_clusters)

    # Get a representative sample (closest email to the center) per cluster.
    samples = []
    for i in range(n_clusters):
        samples.append(emails[closest_point(centers[i], X, metric)])

    # We want to keep some representative words for each cluster
    # in order to identify the topic it represents. So we take
    # the words with the highest tf-idf metric in each cluster.
    cv = CountVectorizer(stop_words=STOP_WORDS)
    tfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
    keywords_total = []
    for i in range(n_clusters):
        emails_cluster = [
            emails[j] for j in range(len(emails)) if labels[j] == i
        ]
        word_count_vector = cv.fit_transform(emails_cluster)
        tfidf.fit(word_count_vector)
        feature_names = cv.get_feature_names()
        tf_idf_vector = tfidf.transform(cv.transform(emails_cluster))
        sorted_items = sort_coo(tf_idf_vector.tocoo())
        # Keep the top-10 keywords per cluster.
        keywords = extract_topn_from_vector(feature_names, sorted_items, 10)
        keywords_total.append(keywords)

    # Delete previous user's clustering.
    database.delete_one('clusters', {'_id': email_name})
    # Insert computed clusters in database.
    database.insert_one(
        'clusters', {
            '_id': email_name,
            'centers': centers.tolist(),
            'labels': labels.tolist(),
            'samples': samples,
            'keywords': keywords_total,
            'metric': metric
        })

    # Group the emails by their assigned cluster label for the response.
    clusters = [[] for i in range(n_clusters)]
    for idx, email in enumerate(emails):
        clusters[labels[idx]].append(email)

    # Interpolation weight for mixing the cluster LM with the base LM.
    weight = '0.5'
    # Create language models using srilm.
    # NOTE(review): subprocess.call(..., shell=True) builds shell commands by
    # string concatenation from filesystem paths derived from the user's
    # email — confirm these are sanitized, or switch to shell=False with an
    # argument list.
    for cluster in os.listdir(out):
        cluster_path = os.path.join(out, cluster)
        if os.path.isdir(cluster_path):
            if subprocess.call([
                    'ngram-count -kndiscount -interpolate -text ' +
                    os.path.join(cluster_path, 'corpus') +
                    ' -wbdiscount1 -wbdiscount2 -wbdiscount3 -lm ' +
                    os.path.join(cluster_path, 'model.lm')
            ],
                               shell=True):
                print('Error in subprocess')
            if subprocess.call([
                    'ngram -lm ' + lmPath + ' -mix-lm ' +
                    os.path.join(cluster_path, 'model.lm') + ' -lambda ' +
                    weight + ' -write-lm ' +
                    os.path.join(cluster_path, 'merged.lm')
            ],
                               shell=True):
                print('Error in subprocess')

    return jsonify({
        'samples': samples,
        'keywords': keywords_total,
        'clusters': clusters
    })
示例#7
0
 def insert_data(self, table, values):
     """Insert *values* into *table* using this object's connection."""
     conn = self.conn
     database.insert_one(conn, table, values)