Example #1
0
File: ex2.py  Project: Chinmay26/rltk
# -*- coding: utf-8 -*-

import rltk

# Two sample records to compare.
# NOTE(review): j2's 'id' is a string while j1's is an int -- presumably
# deliberate (ids are labels, not compared fields); confirm with rltk docs.
j1 = {'id': 1, 'name': 'abc', 'gender': 'male'}
j2 = {'id': '2', 'name': 'bcd', 'gender': 'male'}

# Custom edit-distance cost table: inserting the character 'c' costs 50,
# every other insert/delete/substitute operation costs 100.
edit_distance_cost = {
    'insert': {
        'c': 50
    },
    'insert_default': 100,
    'delete_default': 100,
    'substitute_default': 100
}

# Demo 1: compute a feature vector from a plain feature configuration.
# print() with a single argument behaves identically under Python 2's
# print statement and Python 3's print function.
tk1 = rltk.init()
tk1.load_feature_configuration('C1', 'feature_config_1.json')
print(tk1.compute_feature_vector(j1, j2, name='C1'))

# Demo 2: same records, but a custom edit-distance table is registered
# before loading the (second) feature configuration.
tk2 = rltk.init()
tk2.load_edit_distance_table('A1', edit_distance_cost)
tk2.load_feature_configuration('C1', 'feature_config_2.json')
print(tk2.compute_feature_vector(j1, j2, name='C1'))
Example #2
0
# output: a new json line file containing only the pages relevant to dow 30
#         new tags containing company and product names will be added to html
#         body to make it convenient for inferlink

import json
import os
import nltk
import rltk
import sys
from bs4 import BeautifulSoup
from nltk.classify import NaiveBayesClassifier

# Python 2 hack: site.py deletes sys.setdefaultencoding at startup, so the
# sys module must be reloaded to re-expose it before forcing UTF-8 as the
# process-wide default str<->unicode codec.
reload(sys)
sys.setdefaultencoding('utf-8')

fsm = rltk.init()  # rltk toolkit instance, used here for fuzzy string match


# ######## Train sentiment analysis model #########
def format_sentence(sent):
    """Tokenize *sent* with NLTK and map each token to True.

    Produces the bag-of-words feature dict expected by
    nltk.classify.NaiveBayesClassifier.
    """
    return dict.fromkeys(nltk.word_tokenize(sent), True)


# Positive training examples: one tweet per line, each converted into a
# [feature-dict, label] pair for the classifier.
pos = []
with open("./pos_tweets.txt") as f:
    pos.extend([format_sentence(line), 'positive'] for line in f)

# Negative training examples, loaded the same way as the positive set above.
# NOTE(review): the loop body is truncated in this excerpt.
neg = []
with open("./neg_tweets.txt") as f:
    for i in f:
Example #3
0
import rltk

tk = rltk.init()

iter = tk.get_file_iterator('file_iter_test.txt', type='text')
for id, value in iter:
    print id, value

iter = tk.get_file_iterator('file_iter_test.jsonl',
                            type='json_line',
                            id_path='id_str',
                            value_path='content')
for id, value in iter:
    print id, value

iter = tk.get_file_iterator('file_iter_test.csv',
                            type='csv',
                            id_column='id',
                            value_columns=['content'
                                           ])  # field_names=['xx', 'yy']
for id, value in iter:
    print id, value

print '----'
iter1 = tk.get_file_iterator('file_iter_test.txt', type='text')
for id, value in iter1:
    print id, value
    break
iter2 = iter1.copy()
for id, value in iter1:
    print id, value
import sys
import json
import os
import sys

# Python 2 hack: reload sys to re-expose setdefaultencoding (removed by
# site.py) and force UTF-8 as the default str<->unicode codec.
reload(sys)
sys.setdefaultencoding('utf8')
# Local checkout of the rltk package and the input-JSON directory.
packagePath='/Users/zihaozhai/Desktop/Trojan/Internship/ISI/rltk'
inputPath='HitiJsonInput/'
sys.path.append(packagePath)
import rltk
tk = rltk.init()
# topicWords accumulates entries grouped by 'type'; thr is the Jaccard
# similarity threshold used below when merging near-duplicate names.
topicWords=[]
thr=0.8
def pickTopicWords(data,tp,file):
	if data['_source'].has_key(tp):
		for d in data['_source'][tp]:
			haveName=0
			for ele in range(len(topicWords)):
				if topicWords[ele]['type']==tp:
					for n in range(len(topicWords[ele]['originalNames'])):
						if topicWords[ele]['originalNames'][n]['name']==d:
							topicWords[ele]['originalNames'][n]['docIds'].append(file)
							haveName=1
					for n in range(len(topicWords[ele]['originalNames'])):
						if haveName==0:
							similarity=tk.jaccard_index_similarity(set(topicWords[ele]['originalNames'][n]['name'].split(' ')), set(d.split(' ')))
							if similarity>=thr:
								haveName=1
								obj={
									'name':d,