# -*- coding: utf-8 -*- import rltk j1 = {'id': 1, 'name': 'abc', 'gender': 'male'} j2 = {'id': '2', 'name': 'bcd', 'gender': 'male'} edit_distance_cost = { 'insert': { 'c': 50 }, 'insert_default': 100, 'delete_default': 100, 'substitute_default': 100 } tk1 = rltk.init() tk1.load_feature_configuration('C1', 'feature_config_1.json') print tk1.compute_feature_vector(j1, j2, name='C1') tk2 = rltk.init() tk2.load_edit_distance_table('A1', edit_distance_cost) tk2.load_feature_configuration('C1', 'feature_config_2.json') print tk2.compute_feature_vector(j1, j2, name='C1')
# NOTE(review): this line is a whitespace-collapsed Python 2 script
# (reload(sys)/setdefaultencoding). From what is visible it imports
# nltk/rltk/BeautifulSoup, initializes an rltk toolkit (fsm), defines
# format_sentence() to tokenize a sentence into a bag-of-words feature
# dict, and starts building pos/neg training lists from tweet files for
# a NaiveBayesClassifier. The script is TRUNCATED here (the negative-
# tweets loop has no body in view), so it is left byte-identical rather
# than reformatted — reconstruct from the original file before editing.
# output: a new json line file containing only the pages relevant to dow 30 # new tags containing company and product names will be added to html # body to make it convenient for inferlink import json import os import nltk import rltk import sys from bs4 import BeautifulSoup from nltk.classify import NaiveBayesClassifier reload(sys) sys.setdefaultencoding('utf-8') fsm = rltk.init() # fuzzy string match # ######## Train sentiment analysis model ######### def format_sentence(sent): return {word: True for word in nltk.word_tokenize(sent)} pos = [] with open("./pos_tweets.txt") as f: for i in f: pos.append([format_sentence(i), 'positive']) neg = [] with open("./neg_tweets.txt") as f: for i in f:
import rltk tk = rltk.init() iter = tk.get_file_iterator('file_iter_test.txt', type='text') for id, value in iter: print id, value iter = tk.get_file_iterator('file_iter_test.jsonl', type='json_line', id_path='id_str', value_path='content') for id, value in iter: print id, value iter = tk.get_file_iterator('file_iter_test.csv', type='csv', id_column='id', value_columns=['content' ]) # field_names=['xx', 'yy'] for id, value in iter: print id, value print '----' iter1 = tk.get_file_iterator('file_iter_test.txt', type='text') for id, value in iter1: print id, value break iter2 = iter1.copy() for id, value in iter1: print id, value
# NOTE(review): this line is a whitespace-collapsed Python 2 script
# (reload(sys)/setdefaultencoding, dict.has_key). From what is visible it
# appends a hard-coded local rltk checkout to sys.path, initializes an
# rltk toolkit, and defines pickTopicWords(data, tp, file), which scans
# data['_source'][tp] and matches each entry against the module-level
# topicWords list — exact name match appends the doc id; otherwise a
# Jaccard similarity over space-split tokens is compared against thr=0.8.
# The script is TRUNCATED mid-function (the obj dict literal is cut off),
# so it is left byte-identical rather than reformatted — reconstruct from
# the original file before editing. Also note: `sys` is imported twice and
# `file` shadows the Python 2 builtin.
import sys import json import os import sys reload(sys) sys.setdefaultencoding('utf8') packagePath='/Users/zihaozhai/Desktop/Trojan/Internship/ISI/rltk' inputPath='HitiJsonInput/' sys.path.append(packagePath) import rltk tk = rltk.init() topicWords=[] thr=0.8 def pickTopicWords(data,tp,file): if data['_source'].has_key(tp): for d in data['_source'][tp]: haveName=0 for ele in range(len(topicWords)): if topicWords[ele]['type']==tp: for n in range(len(topicWords[ele]['originalNames'])): if topicWords[ele]['originalNames'][n]['name']==d: topicWords[ele]['originalNames'][n]['docIds'].append(file) haveName=1 for n in range(len(topicWords[ele]['originalNames'])): if haveName==0: similarity=tk.jaccard_index_similarity(set(topicWords[ele]['originalNames'][n]['name'].split(' ')), set(d.split(' '))) if similarity>=thr: haveName=1 obj={ 'name':d,