Example No. 1
def groupSuject(array):
    # Merge consecutive 'subject' words into a single underscore-joined Word,
    # copying every other word through unchanged.
    old_token = ''
    text = ''
    new_array = []
    size = len(array)
    count = 1
    for a in array:
        if a.token == 'subject':
            if a.token == old_token:
                text += "_" + a.text
            else:
                text = a.text

            # Flush the pending subject group if it ends the sentence.
            if count == size:
                new_array.append(Word(text, 'subject'))
                text = ''
        else:
            # A non-subject word closes any subject group being built.
            if old_token == 'subject':
                new_array.append(Word(text, 'subject'))
                text = ''
            new_array.append(a)

        old_token = a.token
        count += 1

    return new_array
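A minimal usage sketch, assuming Word is a simple container with text and token attributes (the project's real Word class may differ):

# Illustrative stand-in for the project's Word class.
class Word:
    def __init__(self, text, token):
        self.text = text
        self.token = token

words = [Word('Socrates', 'subject'), Word('o', 'subject'),
         Word('homem', 'subject'), Word('e', 'conjuntion')]
grouped = groupSuject(words)
print([(w.text, w.token) for w in grouped])
# [('Socrates_o_homem', 'subject'), ('e', 'conjuntion')]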
def load_data(data, category):
    # Collect a Word (with its MFCC feature matrix) for every .wav file
    # found in the given category directory.
    os.chdir(category)
    for filename in os.listdir(os.getcwd()):
        if filename.endswith(".wav"):
            W = Word(category, filename)
            W.set_mfcc_matrix()
            data.append(W)
    os.chdir("..")
def reverse_poss_dict(poss_dict_path):
    # Invert a tab-separated possibility dictionary (lemma \t POS \t form ...):
    # every form in columns 3+ ends up mapping to a list of (lemma, POS) Words.
    out = dict()
    with open(poss_dict_path, encoding='utf-8') as f:
        lines = (l.strip().split('\t') for l in f)
        for c in lines:
            for fun in c[2:]:
                if fun in out:
                    out[fun].append(Word(c[0].lower(), c[1].lower()))
                else:
                    out[fun] = [Word(c[0].lower(), c[1].lower())]
    return out
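A small inversion sketch, assuming the same tab-separated layout documented in read_poss_dict further below; the file written here is purely illustrative:

# Illustrative only: invert a single dictionary line.
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False, encoding='utf-8') as tmp:
    tmp.write("columnist\tNOUN\tcolumnistFem_N\tcolumnistMasc_N\n")
    path = tmp.name

inverted = reverse_poss_dict(path)
# 'columnistFem_N' and 'columnistMasc_N' now each map to [Word('columnist', 'noun')]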
Example No. 4
def relatedness(context1, context2, cocs, key_sets):
	# Keep only non-empty, relevant words from each context and lemmatise them.
	context1 = filter(lambda x: x.relevant(), [Word(w) for w in context1.lower().split(" ") if w != ""])
	context1 = [w.lemma() for w in context1]
	context2 = filter(lambda x: x.relevant(), [Word(w) for w in context2.lower().split(" ") if w != ""])
	context2 = [w.lemma() for w in context2]

	# Replace each lemma list by its co-occurrence representation.
	context1 = get_coc(context1)
	context2 = get_coc(context2)

	# Average, over all key sets, the product of the two contexts' similarities.
	total = 0.0
	for i in xrange(len(key_sets)):
		s1 = dic_similarity(cocs[i], key_sets[i], context1)
		s2 = dic_similarity(cocs[i], key_sets[i], context2)
		total += s1 * s2
	total /= len(key_sets)
	return total
Example No. 5
    def import_wordlist(self):
        # Read each word pair off the page and store it in self.wordList.
        word_list_elmnt = self.driver.find_elements_by_class_name('thing')
        for word_elmnt in word_list_elmnt:
            word_a = word_elmnt.find_element(By.XPATH,
                                             './/div[3]/div').text  # Word A
            word_b = word_elmnt.find_element(By.XPATH,
                                             './/div[4]/div').text  # Word B
            self.wordList.append(Word(word_a, word_b))
Example No. 6
def switchDisjuntion(array):
    # Replace each disjunction word with the connective 'v'; the three helpers
    # below do the same for implication, negation and conjunction.
    new_array = []
    for a in array:
        if a.token == 'disjuntion':
            new_array.append(Word('v', 'disjuntion'))
        else:
            new_array.append(a)

    return new_array
Example No. 7
def switchImpilies(array):
    new_array = []
    for a in array:
        if a.token == 'implies':
            new_array.append(Word('->', 'implies'))
        else:
            new_array.append(a)

    return new_array
Example No. 8
def switchNegative(array):
    new_array = []
    for a in array:
        if a.token == 'negative':
            new_array.append(Word('~', 'negative'))
        else:
            new_array.append(a)

    return new_array
Example No. 9
def switchConjuntion(array):
    new_array = []
    for a in array:
        if a.token == 'conjuntion':
            new_array.append(Word('^', 'conjuntion'))
        else:
            new_array.append(a)

    return new_array
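The four switch helpers above all follow the same replace-by-token pattern; here is a hedged consolidation sketch (switchToken and CONNECTIVES are hypothetical names, not from the source):

# Hypothetical refactor of the switch* family: one table-driven pass.
CONNECTIVES = {
    'disjuntion': 'v',
    'implies': '->',
    'negative': '~',
    'conjuntion': '^',
}

def switchToken(array, token):
    # Replace every word carrying `token` with its propositional connective symbol.
    return [Word(CONNECTIVES[token], token) if a.token == token else a for a in array]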
def read_poss_dict(path):
    with open(path, encoding='utf-8') as f:
        # format:
        #    columnist \t NOUN \t columnistFem_N \t columnistMasc_N
        lines = (l.strip().split('\t') for l in f)
        return defaultdict(
            lambda: [],
            {Word(l[0].lower(), l[1].lower()): l[2:]
             for l in lines})
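Because read_poss_dict wraps the mapping in a defaultdict, unknown words come back as an empty list; the lookups below assume Word instances hash and compare by their (text, POS) pair:

poss = read_poss_dict('poss_dict.tsv')      # illustrative path
known = poss[Word('columnist', 'noun')]     # e.g. ['columnistFem_N', 'columnistMasc_N']
missing = poss[Word('unseen', 'noun')]      # [] rather than a KeyError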
Example No. 11
    def toPL(cls, lp):
        # Convert each natural-language sentence in `lp` into a propositional
        # logic string, mapping every distinct subject to an atom P1, P2, ...
        atomic_map = {}
        atomics = []
        atomic_name = 'P'
        atomic_num = 1
        results = []
        for text in lp:
            array = []
            words = text.split(' ')
            new_array = []
            for word in words:
                token = tokenize(word)
                array.append(Word(word, token))

            # Normalise the sentence: strip ligations, reposition negations,
            # group subjects and swap connective words for symbols.
            array = removeLigations(array)
            array = positionNegative(array)
            array = groupSuject(array)
            array = switchImpilies(array)
            array = switchDisjuntion(array)
            array = switchConjuntion(array)
            array = switchNegative(array)

            # Replace each subject with its propositional atom.
            for a in array:
                if a.token == 'subject':
                    if a.text not in atomics:
                        atomic = atomic_name + str(atomic_num)
                        atomic_map[a.text] = atomic
                        atomic_num += 1
                        atomics.append(a.text)
                        new_array.append(Word(atomic, 'subject'))
                    else:
                        atomic = atomic_map.get(a.text)
                        new_array.append(Word(atomic, 'subject'))
                else:
                    new_array.append(a)
            results.append(" ".join(a.text for a in new_array))

        # The last sentence is the conclusion; the rest are premises.
        conclusion = results[-1]
        results = results[:-1]
        return results, conclusion, atomic_map
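A hedged call sketch; Parser is a stand-in for whatever class carries toPL, and tokenize, removeLigations and positionNegative are assumed from the rest of this project:

# Illustrative only: the last sentence is treated as the conclusion.
premises, conclusion, atoms = Parser.toPL(list_of_sentences)
# premises   -> every sentence but the last, rewritten with P1, P2, ... and ->, v, ^, ~
# conclusion -> the last sentence, rewritten the same way
# atoms      -> mapping from each grouped subject text to its atom, e.g. {'rua_molhada': 'P1'}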
Example No. 12
data_rating = data_rating_train.append(data_rating_test)

print('Range of userId is [{}, {}]'.format(data_rating.userId.min(), data_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(data_rating.itemId.min(), data_rating.itemId.max()))

# Read the grouping information
if args.pretrain_grouping:
    data_grouping = pd.read_csv(args.data_grouping, sep=",", header=0, names=['friendId', 'tagId', 'score'], engine='python')
    config['num_friends_pretrain'] = int(data_grouping.friendId.max() + 1)
    config['num_items_pretrain'] = int(data_grouping.tagId.max() + 1)
    del data_grouping
    print ("group data reading finished!")

# Process the tweet
vocab = Word()
tweet = vocab.load_tweets(data_tweet, args.max_seq_len)
pad_word = vocab.pad
tweet_pad = np.full(shape=(1, args.max_seq_len), fill_value=pad_word, dtype=np.int64)
tweet = np.vstack([tweet, tweet_pad])

# config
config['num_users'], config['num_items'] = int(data_rating.userId.max() + 1), int(data_rating.itemId.max() + 1)
config['user_friends'], config['user_tweets'], config['num_friends'] = load_friends_tweets(args.data_profile)
args.tweet = tweet
config['args'] = args
config['vocab'] = vocab

# Specify the exact model
model = sys.argv[1] if len(sys.argv) == 2 else "gmf"
if args.model.lower() == "gmf":
Example No. 13
print('Range of userId is [{}, {}]'.format(data_rating.userId.min(), data_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(data_rating.itemId.min(), data_rating.itemId.max()))
print('Range of tweetId is [{}, {}]'.format(data_rating.tweetId.min(), data_rating.tweetId.max()))

# Read the grouping information
if args.pretrain_grouping:
    data_grouping = pd.read_csv(args.data_grouping, sep=",", header=0, names=['friendId', 'tagId', 'score'], engine='python')
    config['num_friends_pretrain'] = int(data_grouping.friendId.max() + 1)
    config['num_items_pretrain'] = int(data_grouping.tagId.max() + 1)
    del data_grouping

args.item_num = int(data_rating.itemId.max() + 1)

# Process the tweet
vocab = Word()
tweet = vocab.load_tweets(data_tweet, max_len=200)

# Read the grouping information
data_grouping = pd.read_csv(args.data_grouping, sep=",", header=0, names=['friendId', 'tagId', 'score'], engine='python')
config['num_friends_pretrain'] = int(data_grouping.friendId.max() + 1)
config['num_items_pretrain'] = int(data_grouping.tagId.max() + 1)
del data_grouping

# config
config['num_users'], config['num_items'] = int(data_rating.userId.max() + 1), int(data_rating.itemId.max() + 1)
config['user_friends'], config['num_friends'] = load_friends(args.data_friends)
args.tweet = tweet
config['args'] = args
config['vocab'] = vocab
Example No. 14
    print "Starting..."
    # initiate empty ratings
    methodsRating = []
    humanRating = []
    questions = task.values()

    jointVocCache = dict()
    partVoc = set(vectors.keys())

    print len(disambiguatedWords), "disambiguated words"

    done = 0
    for i in xrange(len(questions)):
        question = questions[i]

        word1 = Word(question['word1']).lemma()
        word2 = Word(question['word2']).lemma()
        context1 = [
            Word(x).lemma() for x in question['context1'].lower().split(' ')
        ]
        context2 = [
            Word(x).lemma() for x in question['context2'].lower().split(' ')
        ]

        # so we are not using disambiguated words in the context..?
        context1 = filter(lambda x: x in partVoc, context1)
        context2 = filter(lambda x: x in partVoc, context2)

        # set finders to false
        w1 = False
        w2 = False
def get_bigrams(tree):
    # Yield (dependent, head, relation) triples for every token in a dependency tree.
    for w in tree:
        dep = Word(w.lemma, w.upostag)
        head = Word(tree[w.head].lemma, tree[w.head].upostag)
        yield (dep, head, w.deprel)
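A minimal sketch of the tree shape get_bigrams expects: tokens exposing lemma, upostag, head and deprel, in a container where tree[w.head] resolves the head (the namedtuple and dummy root below are illustrative; real trees likely come from a CoNLL-U parse):

from collections import namedtuple

Tok = namedtuple('Tok', 'lemma upostag head deprel')
# Index 0 acts as a dummy root so every head id resolves inside the list.
tree = [
    Tok('root', 'ROOT', 0, 'root'),
    Tok('dog', 'NOUN', 2, 'nsubj'),
    Tok('bark', 'VERB', 0, 'root'),
]
bigrams = list(get_bigrams(tree))
# yields (Word(dep_lemma, dep_pos), Word(head_lemma, head_pos), deprel) triples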
Example No. 16
	task, _ = load_task(taskFilename)
	questions = task.values()

	methodsRating = []
	humanRating = []

	print "Answering", len(task), "questions..."

	for i in xrange(len(questions)):
		if i % 100 == 0 and not i == 0:
			print "\tIteration", i, ": ", spearman(methodsRating, humanRating)

		question = questions[i]

		word1 = Word(question['word1']).lemma()
		word2 = Word(question['word2']).lemma()
		context1 = question['context1'] + " " + question['word1'] + " " + question['word1']
		context2 = question['context2'] + " " + question['word2'] + " " + question['word2']

		r_s = relatedness(context1, context2, newD, key_sets)
		v_s = vector_similarity(r_s, word1, word2, vectors)
		score = r_s * v_s**2
		methodsRating.append( score )
		humanRating.append(question['rating'])

	print
	print spearman(methodsRating, humanRating)