Example #1
def main(args):

    movie_name = input("Movie name: ")

    ##### IMPORT DATA ######
    movie_words, unique_words = import_words("context_analysis/data/" + movie_name + "_words_warped.txt")
    google_args = ["google/data/" + movie_name + "_gc_full_labels_1.json",
                   "google/data/" + movie_name + "_gc_full_labels_2.json",
                   "google/data/" + movie_name + "_gc_full_labels_3.json"]

    gc_labels, unique_gc_labels = import_google_labels(google_args)
    aws_labels, unique_aws_labels = import_aws_labels("context_analysis/data/" + movie_name + "_aws_labels.json")

    ##### INITIALISE MODELS ######
    liteClient = retinasdk.LiteClient("557d9940-40ab-11e8-9172-3ff24e827f76")
    semantic_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, limit=500000)

    ##### VECTORIZE WORDS ######
    # returns embedding vectors for every word in the w2v vocabulary
#   word2vec_whole_vectors = w2vec_vocab(semantic_model)

    word2vec_word_vectors, unknown_words = word2vec_generate_vectors(semantic_model, unique_words)
    word2vec_gc_vectors, unknown_gc_labels = word2vec_generate_vectors(semantic_model, unique_gc_labels)
    word2vec_aws_vectors, unknown_aws_labels = word2vec_generate_vectors(semantic_model, unique_aws_labels)

    ##### MAKE SURE NUMBERS ADD UP ######
    print("Word vectors:", len(word2vec_word_vectors))
    print("GC vectors:", len(word2vec_gc_vectors))
    print("AWS vectors:", len(word2vec_aws_vectors))
    all_labels_vectors = np.concatenate((word2vec_word_vectors, word2vec_gc_vectors, word2vec_aws_vectors))
    print("All vectors:", len(all_labels_vectors))

#   retina_word_vectors = retina_generate_vectors(liteClient, words)

    ##### CLUSTER ESTIMATION #######
#   estimate_elbow_cluster_num(all_labels_vectors)
#   for i in range(30, 36):
#       print("Num of clusters: " + str(i))
#       estimate_silhou_cluster_num(all_labels_vectors, i)

    ##### PCA #######
#   do_PCA(all_labels_vectors)

    ##### CLUSTERING METHODS #######
#   cluster_kmeans(all_labels_vectors, semantic_model)
#   cluster_agglom(word2vec_word_vectors, semantic_model)
    cluster_components = cluster_agglom(all_labels_vectors, semantic_model)

    clusters_across_time(cluster_components, movie_words, gc_labels, aws_labels)

    print("Done")
Example #2
def predict(claim, source='All'):
    lite_client = retinasdk.LiteClient("2bc45a70-3a85-11e8-9172-3ff24e827f76")

    def get_news_titles(claim, keywords):
        # Prefix each claim word that matches a keyword with "+"
        # (required-term query syntax) before hitting the news API.
        keywords_str = " ".join(keywords)
        new_claim = ""
        for word in claim.split():
            if word in keywords_str:
                new_claim = new_claim + " +" + word
            else:
                new_claim = new_claim + " " + word
        news_titles = []
        news_api = search_news_api(new_claim)
        if news_api is not None:
            news_titles.append(news_api)
        return news_titles

    keywords = lite_client.getKeywords(claim)
    news = get_news_titles(claim.lower(), keywords)
    if len(news) == 0:
        return -1
    # flatten nested lists of titles into a flat list of strings
    if not isinstance(news[0], str):
        news = [title for group in news for title in group]
    count_agree = 0
    count_disagree = 0
    for title in news:
        test_sim = paralleldots.similarity(claim, title)
        score = test_sim["actual_score"]
        if score > 0.5:
            count_agree += 1
        else:
            count_disagree += 1
    if count_agree + count_disagree == 0:
        return -1
    probability = (count_agree / (count_agree + count_disagree)) * 100
    return probability
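A hypothetical driver for predict (assumes search_news_api and a paralleldots API key are configured elsewhere):

if __name__ == "__main__":
    prob = predict("NASA confirms water on the moon")
    print("no matching news found" if prob == -1 else "{:.1f}% agreement".format(prob))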
Example #3
def hammingCompare(outtweets, innerTwitter):
    client = retinasdk.FullClient(apiKey.retina_token,
                                  apiServer="http://api.cortical.io/rest",
                                  retinaName="en_associative")
    liteClient = retinasdk.LiteClient(apiKey.retina_token)
    res = []

    for index, outtweet in enumerate(outtweets):
        # get the simhash pair for this outgoing tweet vs. the inner Twitter text
        simhash_pair = getSimHash(outtweet[2], innerTwitter, client)
        if len(simhash_pair) > 1:
            diff_bits = simhash.num_differing_bits(simhash_pair['out_hash'],
                                                   simhash_pair['in_hash'])
            hashes = [simhash_pair['out_hash'], simhash_pair['in_hash']]
            blocks = 4  # Number of blocks to use
            distance = 3  # Number of bits that may differ in matching pairs
            matches = simhash.find_all(hashes, blocks, distance)
            res.append([index, outtweet[2], matches])
    return res
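A hedged sketch of the getSimHash helper the example relies on but does not show, assuming it hashes each text's Cortical.io keywords into a 64-bit simhash via the simhash-py API; getKeywordsForText and the per-term hashing are assumptions, not the original implementation:

def getSimHash(out_text, in_text, client):
    pair = {}
    for key, text in (("out_hash", out_text), ("in_hash", in_text)):
        terms = client.getKeywordsForText(text)
        if not terms:
            return pair  # fewer than two entries tells the caller to skip
        hashes = [simhash.unsigned_hash(t.encode("utf-8")) for t in terms]
        pair[key] = simhash.compute(hashes)
    return pair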
Example #4
import argparse
import urllib2
from bs4 import BeautifulSoup
import itertools
import retinasdk
import credentials
liteClient = retinasdk.LiteClient(credentials.API_KEY)


def init():
    parser = argparse.ArgumentParser()
    parser.add_argument('document_url',
                        metavar='url',
                        type=str,
                        help='Document url to check for on snopes.com')
    parser.add_argument(
        '--w',
        metavar='words_cutoff',
        type=int,
        default=7,
        help='Cut off lines with fewer than n words for keyword extraction (default=7)'
    )
    parser.add_argument(
        '--k',
        metavar='keywords_cutoff',
        type=int,
        default=4,
        help='Consider only the first n relevant keywords (default=4)')
    parser.add_argument('--o',
                        metavar='no_single_keyword',
                        # the remainder of this call (and of this example) is
                        # truncated in the source; closed minimally so it parses
                        default=None)
Example #5

import os          # used by the cleanup checks below; missing from the extracted snippet
import retinasdk   # used by the client connection below

# The enclosing def was lost when this example was extracted. A plausible
# signature, assuming the block strips blank lines from a file in place:
def remove_blank_lines(filename):
    with open(filename) as filehandle:
        lines = filehandle.readlines()

    with open(filename, 'w') as filehandle:
        lines = filter(lambda x: x.strip(), lines)
        filehandle.writelines(lines)


# string-to-list converter
def Convert(string):
    return string.split("-")


#retinasdk client connection
liteClient = retinasdk.LiteClient("1d5b1cc0-aa65-11e9-8f72-af685da1b20e")

#################################################################################

#make sure directory is cleaned from last run
#if os.path.isfile("keywords.txt"):
#	os.remove("keywords.txt")

if os.path.isfile("clean.txt"):
    os.remove("clean.txt")
if os.path.isfile("keywords.txt"):
    os.remove("keywords.txt")
if os.path.isfile("temp.txt"):
    os.remove("temp.txt")
if os.path.isfile("outfile.txt"):
    os.remove("outfile.txt")
import retinasdk
liteClient = retinasdk.LiteClient("")


def get_topic(titles):
    paragraph = ""
    for title in titles:
        paragraph += title
    keywords = liteClient.getKeywords(paragraph.encode('utf-8'))
    for index, title in enumerate(titles):
        if keywords and keywords[0] in title.lower():
            return index
Example #7
def keyword_classifier(self, sentence):
    liteClient = retinasdk.LiteClient(
        "ba4c1950-95ec-11e8-917d-b5028d671452")
    return liteClient.getKeywords(sentence)
Example #8

# This example begins mid-file: imports (uuid4, md5, python-docx's Document and
# Pt, SQLAlchemy's create_engine/MetaData/Table, retinasdk and its
# CorticalioException) and constants (RETINA_API_KEY, DOCUMENT_HEADING,
# RESULTS_FOLDER, FILES_EXTENTION, SME_SQLALCHEMY_DATABASE_URI) are assumed to
# be defined earlier in the source file.
def match_texts(text_1, text_2):
    """Match text."""
    text_1_raw = text_1
    text_2_raw = text_2
    text_1 = uni_to_ascii(text_1)
    text_2 = uni_to_ascii(text_2)
    uuid_str = str(uuid4())
    liteClient = retinasdk.LiteClient(RETINA_API_KEY)
    text_1_md5 = md5.new(text_1).hexdigest()
    text_2_md5 = md5.new(text_2).hexdigest()
    sim, text_1_fp, text_2_fp = "", "", ""
    text_1_keywords, text_2_keywords = "", ""
    try:
        text_1_fp = liteClient.getFingerprint(text_1)
    except CorticalioException as exc:
        print(str(exc) + "\nText:" + text_1)
    try:
        text_2_fp = liteClient.getFingerprint(text_2)
    except CorticalioException as exc:
        print(str(exc) + "\nText:" + text_2)
    try:
        sim = str(liteClient.compare(text_1_fp, text_2_fp))
    except CorticalioException as exc:
        print(exc)
    try:
        text_1_keywords = liteClient.getKeywords(text_1)
    except CorticalioException as exc:
        print(exc)
    try:
        text_2_keywords = liteClient.getKeywords(text_2)
    except CorticalioException as exc:
        print(exc)

    if len(text_1_keywords) == 0:
        text_1_keywords = ["[None]"]
    if len(text_2_keywords) == 0:
        text_2_keywords = ["[None]"]
    if text_1_fp == "":
        text_1_fp = ["[None]"]
    if text_2_fp == "":
        text_2_fp = ["[None]"]
    if sim == "" or text_1_fp == "[None]" or text_2_fp == "[None]":
        sim = "[None]"
    document = Document()
    style = document.styles['Normal']
    font = style.font
    font.name = 'Calibri'
    font.size = Pt(12)
    document.add_heading(DOCUMENT_HEADING, 0)
    document.add_paragraph('Text 1 and Text 2 similarity',
                           style='Intense Quote')
    document.add_paragraph(sim)
    document.add_paragraph('Matching details', style='Intense Quote')
    table = document.add_table(rows=4, cols=3)
    table.style = 'Table Grid'
    font.bold = True
    table.cell(0, 1).text = "Text 1"
    table.cell(0, 2).text = "Text 2"
    table.cell(1, 0).text = "md5"
    table.cell(2, 0).text = "Keywords"
    table.cell(3, 0).text = "Fingerprint"
    font.bold = False
    table.cell(1, 1).text = text_1_md5
    table.cell(1, 2).text = text_2_md5
    table.cell(2, 1).text = ", ".join(text_1_keywords)
    table.cell(2, 2).text = ", ".join(text_2_keywords)
    table.cell(3, 1).text = ", ".join([str(el) for el in text_1_fp])
    table.cell(3, 2).text = ", ".join([str(el) for el in text_2_fp])
    document.add_page_break()
    document.add_heading('Original Text 1', 0)
    document.add_paragraph(get_clean_text(text_1_raw))
    document.add_page_break()
    document.add_heading('Original Text 2', 0)
    document.add_paragraph(get_clean_text(text_2_raw))
    document.save(RESULTS_FOLDER + "/" + uuid_str + FILES_EXTENTION)

    db = create_engine(SME_SQLALCHEMY_DATABASE_URI)
    db.echo = False
    metadata = MetaData(db)
    text_table = Table('text', metadata, autoload=True)
    text_table.insert().execute(text_md5=text_1_md5,
                                text_keywords=", ".join(text_1_keywords),
                                text_fingerprint=", ".join(
                                    [str(el) for el in text_1_fp]),
                                text=text_1)
    text_table.insert().execute(text_md5=text_2_md5,
                                text_keywords=", ".join(text_2_keywords),
                                text_fingerprint=", ".join(
                                    [str(el) for el in text_2_fp]),
                                text=text_2)

    return uuid_str
Example #9
# %%
from typing import List
import retinasdk
import matplotlib.pyplot as plt
import numpy as np
import json

from preprocessing.learned_lemmatization_experiments.past_tense_from_w2v import past_tense

# %%
with open(
        "./preprocessing/learned_lemmatization_experiments/cortico_api_key.json",
        "r") as key_file:
    api_key = json.load(key_file).get("key")
liteClient = retinasdk.LiteClient(api_key)


# %%
def embed(word: str) -> List[int]:
    fingerprint = liteClient.getFingerprint(word)
    return fingerprint


def to_array(fingerprint: List[int]):
    dim = 128
    x = np.zeros(dim**2)
    x[fingerprint] = 1
    x = x.reshape((dim, dim))
    return x
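
# %%
# A usage sketch (assumes a valid Cortical.io key): render a word's 128x128
# semantic fingerprint as a binary image; the word "apple" is an arbitrary
# illustration.
fp = embed("apple")
plt.imshow(to_array(fp), cmap="gray")
plt.title("Semantic fingerprint")
plt.show()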

Example #10
import json
import retinasdk

from apiStorage import apiKey

# helper functions

# reusable client for handling API calls
sFunctionFullClient = retinasdk.FullClient(
    apiKey,
    apiServer="http://api.cortical.io/rest",
    retinaName="en_synonymous")

aFunctionFullClient = retinasdk.FullClient(
    apiKey,
    apiServer="http://api.cortical.io/rest",
    retinaName="en_associative")

FunctionLiteClient = retinasdk.LiteClient(apiKey)


# input: category - a fingerprint of the category filter
#        term     - the term you want to add to the category
# output: the resulting fingerprint of assimilating given term
def assimilateTermInCategory(category, term):
    orExpression = {"or": [{"positions": category}, {"term": term}]}
    return sFunctionFullClient.getFingerprintForExpression(
        json.dumps(orExpression)).positions


#input: FP1 - fingerprint 1 to be merged with FP2
#       FP2 - fingerprint 2
# output: the resulting fingerprint of merging FP1 and FP2
def mergeFingerprints(FP1, FP2):
    # the original example is truncated here; a plausible body, mirroring
    # assimilateTermInCategory above:
    orExpression = {"or": [{"positions": FP1}, {"positions": FP2}]}
    return sFunctionFullClient.getFingerprintForExpression(
        json.dumps(orExpression)).positions
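A hypothetical use of the two helpers above, where category_fp stands in for an existing category fingerprint (a list of positions):

merged = mergeFingerprints(assimilateTermInCategory(category_fp, "apple"),
                           assimilateTermInCategory(category_fp, "pear"))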
Example #11
import math
from gcc_phat import gcc_phat   #generalized cross correlation phase transform
import numpy as np
import json
import simpleaudio as sa
import sys
import speech_recognition as sr
import pyaudio

#api key for Cortical - keyword parser
#use retina_sdk_florida.txt for florida
import retinasdk
liteClient = None
with open("api_keys/retina_sdk.txt") as f:
    key = f.read().strip()
    liteClient = retinasdk.LiteClient(key)

#for Watson sentiment analysis
from watson_developer_cloud.natural_language_understanding_v1 \
    import Features, EntitiesOptions, KeywordsOptions, SentimentOptions
from watson_developer_cloud import NaturalLanguageUnderstandingV1

naturalLanguageUnderstanding = None
with open("api_keys/watson.txt") as f:
    key = f.read().strip()
    naturalLanguageUnderstanding = NaturalLanguageUnderstandingV1(
        version='2018-11-16', iam_apikey=key)
### use watson_florida.txt for florida

#for communication with other programs
import time
Example #12
import requests
import math
import json
import argparse
import urllib.request
import itertools
import retinasdk
import credentials
from bs4 import BeautifulSoup
from selenium import webdriver

# set parameters
liteClient = retinasdk.LiteClient("2fe758f0-8670-11e8-917d-b5028d671452")

DANDELION_APP_ID = '159581bc091046b28e91a97ca4d5032f'
DANDELION_APP_KEY = '159581bc091046b28e91a97ca4d5032f'

ENTITY_URL1 = 'https://api.dandelion.eu/datatxt/nex/v1'
ENTITY_URL2 = 'https://api.dandelion.eu/datatxt/sent/v1'


#==============================
# task1 form check
#==============================
def get_entities(text, confidence=0.5, lang='en'):
    payload = {
        '$app_id': DANDELION_APP_ID,
        '$app_key': DANDELION_APP_KEY,
        'text': text,
        'confidence': confidence,
        'lang': lang,
    }
    # the original example is truncated here; a plausible completion that
    # queries the Dandelion entity-extraction endpoint:
    return requests.get(ENTITY_URL1, params=payload).json()
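A hypothetical call to the helper above (the completion queries Dandelion's entity endpoint, so a valid key is assumed):

entities = get_entities("Berlin is the capital of Germany.")
print(json.dumps(entities, indent=2))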
Example #13
import subprocess, re, json, random
import retinasdk
from sentence_selection_v2 import compute_document_score
"""
[{
    original_sentence: "original sentence",
    gap_sentence: "sentence with gap",
    distractors: [],
    answer: 0
}]
"""

liteClient = retinasdk.LiteClient("f12af3f0-3a0d-11e8-9172-3ff24e827f76")


def distractor_selection():
    pass


def gapify(sentences):

    #get the keywords for every sentence
    keywords = []
    for sentence in sentences:
        keywords.append(liteClient.getKeywords(sentence))

    #get rid of gap and find selector
    gap_questions = []
    for i, sentence in enumerate(sentences):

        for k, key in enumerate(keywords[i]):
            # The original example is truncated here. A plausible completion,
            # following the schema in the module docstring above:
            if key in sentence:
                gap_questions.append({
                    "original_sentence": sentence,
                    "gap_sentence": sentence.replace(key, "_____", 1),
                    "distractors": [],
                    "answer": k,
                })
                break

    return gap_questions
Example #14
#!/usr/bin/env python3
import retinasdk
liteClient = retinasdk.LiteClient("6ea5d540-4fb8-11ea-8f72-af685da1b20e")


def get_key_words(text):
    return liteClient.getKeywords(text)
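A minimal usage sketch, assuming the key above is still valid:

print(get_key_words("Semantic fingerprints encode text meaning as sparse binary vectors."))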
Example #15
import retinasdk
from misc import bcolors, testfiles
import json
import pandas as pd
import numpy as np
from sys import stdout

liteClient = retinasdk.LiteClient("e29fcfe0")
fullClient = retinasdk.FullClient("your_api_key",
                                  apiServer="http://api.cortical.io/rest",
                                  retinaName="en_associative")


def compare_texts(texts1, texts2):
    print(bcolors.HEADER +
          "Compute similarity between sentences in dataframes:" + bcolors.ENDC)

    cosines = []
    i = 0
    l = len(texts1)

    for s1, s2 in zip(texts1, texts2):
        percent = i / l * 100
        stdout.write("\r{0:.3f} %".format(percent))
        stdout.flush()
        cosines.append(
            fullClient.compare(json.dumps([{
                "text": s1
            }, {
                "text": s2
            }])).cosineSimilarity)
        i += 1  # advance the progress counter (missing in the original)

    return cosines  # the original is truncated here; returning the scores is a plausible ending
Example #16
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sid
#from random import *
import simpleaudio as sa
import json
#import client
#import socket
#import json
#import time
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 \
    import Features, EntitiesOptions, KeywordsOptions, SentimentOptions
import retinasdk
#apiKey = "69ba0c10-5e17-11e9-8f72-af685da1b20e"
apiKey = "f09d0fe0-3223-11e9-bb65-69ed2d3c7927"  #FOR DEMO DAY ONLY
liteClient = retinasdk.LiteClient(apiKey)

import threading
from threading import Lock, Thread
lock = Lock()

naturalLanguageUnderstanding = NaturalLanguageUnderstandingV1(
    version='2018-11-16',
    iam_apikey='_wxBEgRMBJ_WzXRWYzlTLYrNp3A0mmYEjKp-UQsdhvap')

setup_bool = False
confirmation_final = 1000
no_clue_final = 999
wakeup_final = 998
sleep_final = 997
move_final = 996
Example #17
import os
import retinasdk


def setupCio():
    """Set up Cortical.io clients."""
    apiKey = os.environ.get("CORTICAL_API_KEY")
    cioFullClient = retinasdk.FullClient(apiKey)
    cioLiteClient = retinasdk.LiteClient(apiKey)
    return cioFullClient, cioLiteClient
Example #18
def txt_comp(answer_txt, key_txt):
    liteClient = retinasdk.LiteClient("4e5305c0-50e8-11ea-8f72-af685da1b20e")
    return liteClient.compare(answer_txt, key_txt)
Example #19

# This example begins mid-file; plausible missing imports for the code below:
import pandas as pd
import nltk
import retinasdk
from nltk.stem import WordNetLemmatizer

_wnl = WordNetLemmatizer()  # assumed lemmatizer used by lem() and normalize_word()
from keras.models import Model
from keras.layers import Input
from sklearn.feature_extraction.text import TfidfVectorizer
from math import isnan
from textdistance import cosine
pth = "your--path/my_stance_data/"

vectorizer = TfidfVectorizer(max_features=2000)
dfb = pd.read_csv(pth + "train_bodies.csv")
dfh = pd.read_csv(pth + "train_stances.csv")

lb = dfb.values.tolist()
lh = dfh.values.tolist()
voch, vocb = [], []

liteClient = retinasdk.LiteClient("d2690680-f10c-11e8-bb65-69ed2d3c7927")


def lem(l):
    a = l.split()
    b = []
    for w in a:
        b.append(_wnl.lemmatize(w))
    return " ".join(b)


def normalize_word(w):
    return _wnl.lemmatize(w).lower()


def get_tokenized_lemmas(s):
    # the original example is truncated here; a plausible body consistent
    # with the helpers above:
    return [normalize_word(t) for t in nltk.word_tokenize(s)]