def social_reconstruction(keyword_list_file, relations_file):
    keyword_list = csv_io.read_csv(keyword_list_file)
    relations = json_io.read_json(relations_file)

    # Build one node per keyword, then symmetric links for every relation
    relation_graph = {'nodes': [], 'links': []}
    node_index = {}
    index = 0
    for keyword in keyword_list:
        if keyword not in node_index:
            relation_graph['nodes'].append({'name': keyword, 'group': index, 'ID': index})
            node_index[keyword] = index
            index += 1
    for name, relation in relations.iteritems():
        #total = sum(relation.values())
        for person in relation:
            #if total != 0 and (float(relation[person]) / total > (1.0 / len(relation)) - 0.03):
            relation_graph['links'].append({'source': node_index[name],
                                            'target': node_index[person],
                                            'value': relation[person],
                                            'label': person})
            relation_graph['links'].append({'source': node_index[person],
                                            'target': node_index[name],
                                            'value': relation[person],
                                            'label': name})
    print relation_graph
    json_io.write_json('output/result/relation_graph.json', relation_graph)
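# A minimal sketch of the inputs social_reconstruction() expects, inferred from
# the loops above; the file names and values are illustrative assumptions, not
# files shipped with the project.
#
#   keyword_list.csv -> one keyword (node name) per row, e.g. Alice
#   relations.json   -> {"Alice": {"Bob": 3, "Carol": 1}, ...}
#
#   social_reconstruction('input/keyword_list.csv', 'input/relations.json')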
def face_recongnition(position_merge_file):
    frame_list = json_io.read_json(position_merge_file)

    # Attach the cropped face image to every frame record
    for frame in frame_list:
        img_name = frame_list[frame]['keyword'].encode('utf8') \
            + str(frame_list[frame]['frame_position']) + '.jpg'
        frame_list[frame]['img'] = cv2.imread(OUTPUT_PATH + "img/" + img_name, 0)

    # Re-index the flat frame list as keyword -> face_id -> [faces]
    keyword_list = {}
    for frame in frame_list:
        keyword = frame_list[frame]['keyword']
        face_id = frame_list[frame]['face_id']
        if keyword not in keyword_list:
            keyword_list[keyword] = {}
        if face_id not in keyword_list[keyword]:
            keyword_list[keyword][face_id] = []
        keyword_list[keyword][face_id].append(frame_list[frame])

    for keyword, frame_list in keyword_list.iteritems():
        print keyword
        for frame in frame_list:
            for face in frame_list[frame]:
                print face['ID'],
            print

    global detector
    global matcher
    detector, matcher = cv_face.init_feature('orb')

    # Run the matching for each keyword in its own thread
    threadLock = threading.Lock()
    thread_count = 0
    threads = []
    for keyword, frame_list in keyword_list.iteritems():
        thread = Pthread(thread_count, 'Thread-' + str(thread_count), frame_list, threadLock)
        thread.start()
        threads.append(thread)
        thread_count += 1

    # Wait until all threads complete
    for thread in threads:
        thread.join()

    # Drop the image buffers before serializing the result to JSON
    for keyword, frame_list in keyword_list.iteritems():
        print keyword
        for frame in frame_list:
            for face in frame_list[frame]:
                if 'img' in face:
                    del face['img']
                print face['ID'],
            print

    json_io.write_json('output/face_recongnition.json', keyword_list)
def build_bipartite_graph(keyword_dic_file):
    keyword_dic = json_io.read_json(keyword_dic_file)
    keyword_dic = weight_normalize(keyword_dic)
    pair_bipartite_graph = to_pair(keyword_dic)
    json_io.write_json(OUTPUT_PATH + 'pair_graph.json', pair_bipartite_graph)
    json_io.write_json(OUTPUT_PATH + 'single_graph.json', keyword_dic)
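# Usage sketch; the input file name is an assumption, and weight_normalize()
# and to_pair() are helpers defined elsewhere in this module.
#
#   build_bipartite_graph('scripts/output/keyword_dic.json')
#
# Afterwards OUTPUT_PATH holds single_graph.json (normalized keyword weights)
# and pair_graph.json (the pairwise bipartite projection consumed by
# relationship_minig()).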
def to_db(mydb, term_id, document_list, doc_hash, input_dir):
    for doc in document_list:
        terms_tf = json_io.read_json(input_dir + doc)
        for term, tf in terms_tf.iteritems():
            # Strip quotes and clamp overlong terms so they fit the column
            term = term.replace("'", "")
            if len(term) > 255:
                term = term[:254]
            # doc[:-5] drops the '.json' extension to recover the title
            sql = "INSERT INTO doc_lookups (doc_id,title,tf,term_id) VALUES (" \
                + "'" + str(doc_hash[doc[:-5]]) + "','" + doc[:-5] + "','" \
                + str(tf) + "','" + str(term_id[term]) + "');"
            mydb.exe_sql(sql)
def idf(input_dir):
    # Relies on the module-level document_list built in __main__
    term_idf = {}
    for doc in document_list:
        terms = json_io.read_json(input_dir + doc)
        for term in terms:
            if term not in term_idf:
                term_idf[term] = []
            # Record the document, so len(term_idf[term]) is the document frequency
            term_idf[term].append(doc)
    return term_idf
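# A sketch of how the document lists collected by idf() would feed the usual
# idf weighting; the log form is a standard choice and an assumption here,
# since the snippet does not show the project's exact formula.
#
#   import math
#   term_doc_list = idf(input_dir)
#   doc_number = len(document_list)
#   idf_score = dict((term, math.log(float(doc_number) / len(docs)))
#                    for term, docs in term_doc_list.iteritems())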
def movie_prosessing(movie_file, two_entity_file, search_result_file):
    two_entity_set = json_io.read_json(two_entity_file)
    keyword_search_result = csv_io.read_csv(search_result_file)

    # Load the video
    videoInput = cv2.VideoCapture(movie_file)

    # Create a start_frame -> end_frame dictionary for two_entity_set lookups;
    # duplicated keys are nudged by 0.001 so every interval stays addressable
    start_end = {}
    for row in keyword_search_result:
        start_frame, end_frame = time_format.to_frame(row)
        while start_frame in start_end:
            start_frame = start_frame + 0.001
        while end_frame in start_end:
            end_frame = end_frame + 0.001
        start_end[start_frame] = end_frame

    frame = {}
    face_count = 0
    for keyword in two_entity_set:
        for start_frame in two_entity_set[keyword]:
            # Scan from 10 seconds (at 24 fps) before the match to 10 seconds after it
            frame_position = int(start_frame) - 24 * 10
            finish_frame = start_end[start_frame] + 24 * 10
            while frame_position <= finish_frame:
                print keyword
                videoInput.set(cv2.cv.CV_CAP_PROP_POS_FRAMES, frame_position)
                flag, img = videoInput.read()
                # Skip frames the decoder could not read
                if not flag:
                    frame_position += FRAME_INTERVAL
                    continue
                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                gray = cv2.equalizeHist(gray)
                face_position_list, rects = cv_image.face_detect(gray, frame_position, (85, 85))
                #face_position_list, rects = faceDetection(gray, frame_position)
                if 0xFF & cv2.waitKey(5) == 27:
                    cv2.destroyAllWindows()
                    sys.exit(1)
                # Keep only frames where exactly one face was detected
                if len(face_position_list) == 1:
                    print 'detected'
                    image_name = keyword + str(frame_position) + '.jpg'
                    cv_image.output_image(rects, img, OUTPUT_PATH + '/img/' + image_name)
                    for face_position in face_position_list:
                        face_count += 1
                        print face_count
                        frame[face_count] = {'keyword': keyword,
                                             'face_position': face_position.tolist(),
                                             'ID': face_count,
                                             'frame_position': frame_position,
                                             'face_id': face_count}
                frame_position += FRAME_INTERVAL

    # Close the video
    videoInput.release()
    json_io.write_json(OUTPUT_PATH + 'frame.json', frame)
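# movie_prosessing() assumes a few module-level names defined elsewhere in this
# script; the values sketched here are illustrative assumptions, not the
# project's configuration.
#
#   OUTPUT_PATH = 'scripts/output/'  # destination for cropped faces and frame.json
#   FRAME_INTERVAL = 24              # sample one frame per second of 24 fps video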
def position_merge(frame_file):
    frame = json_io.read_json(frame_file)
    keys = frame.keys()
    # Pairwise pass: faces detected near the same position share one face_id
    for i in range(0, len(frame)):
        for j in range(i + 1, len(frame)):
            if is_near(frame[keys[i]], frame[keys[j]]):
                frame[keys[j]]['face_id'] = frame[keys[i]]['face_id']
    json_io.write_json(OUTPUT_PATH + 'merge_position.json', frame)
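# is_near() is used above but not defined in this file. A minimal sketch of one
# plausible implementation, assuming face_position is an [x1, y1, x2, y2]
# rectangle and that a 50-pixel tolerance is acceptable; both the field layout
# and the threshold are assumptions, not the project's actual values.
def is_near(face_a, face_b, tolerance=50):
    ax, ay = face_a['face_position'][:2]
    bx, by = face_b['face_position'][:2]
    return abs(ax - bx) <= tolerance and abs(ay - by) <= tolerance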
def neo4j_db(neo4j_url, data_path):
    gdb = GraphDatabase(neo4j_url)
    data = json_io.read_json(data_path)
    if 'nodes' in data and 'links' in data:
        # One Neo4j node per graph node, one 'Knows' relationship per link
        nodes = []
        for node in data['nodes']:
            nodes.append(gdb.nodes.create(name=node['name']))
        for edge in data['links']:
            source = nodes[edge['source']]
            target = nodes[edge['target']]
            source.relationships.create('Knows', target)
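# Usage sketch, assuming GraphDatabase comes from neo4jrestclient and a local
# server; the URL below is that client's conventional default, and the JSON
# path matches the node-link file written by relationship_minig().
#
#   from neo4jrestclient.client import GraphDatabase
#   neo4j_db('http://localhost:7474/db/data/', 'result/social_graph.json')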
def file_to_db(self, data_path):
    data = json_io.read_json(data_path)
    with self.db.transaction:
        for source_name, targets in data.iteritems():
            # Reuse cached nodes so each name maps to a single db node
            if source_name in self.nodes:
                source = self.nodes[source_name]
            else:
                source = self.db.node(name=source_name)
                self.nodes[source_name] = source
            for target_name in targets:
                if target_name in self.nodes:
                    target = self.nodes[target_name]
                else:
                    target = self.db.node(name=target_name)
                    self.nodes[target_name] = target
                #for attr, val in targets[target_name].iteritems():
                self.rels.append(source.knows(target))
    return self.nodes
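# Shape of the JSON that file_to_db() walks, inferred from the loops above;
# the names and the empty attribute dicts are illustrative assumptions.
#
#   {"Alice": {"Bob": {}, "Carol": {}},
#    "Bob":   {"Carol": {}}}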
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
from modules import json_io
from db.MyDB import MyDB

if __name__ == '__main__':
    codebook_path = 'modules/codebook/'
    config = json_io.read_json('config.json')
    db_config = config[u'database']
    mydb = MyDB(db_config[u'dbtype'], db_config[u'host'], db_config[u'dbname'],
                db_config[u'username'], db_config[u'password'], db_config[u'encoding'], "")
    tables = config[u'table']
    # Create each table from its codebook (one codebook JSON per table prefix)
    for category, table_info in tables.iteritems():
        for name, years in table_info.iteritems():
            f = codebook_path + 'codebook_' + name[:6] + '.json'
            table_format = json_io.read_json(f)
            mydb.create_table(name, table_format.keys(), table_format.values())
    mydb.close()
if __name__ == '__main__':
    if len(sys.argv) > 1:
        input_dir = sys.argv[1]
    else:
        input_dir = 'output/en_tf/'
    config = json_io.read_json('config.json')[u'database']
    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(input_dir)
    mydb = DataDB(config[u'dbtype'], config[u'host'], config[u'dbname'],
                  config[u'username'], config[u'password'], config[u'encoding'], "")
    # Continue numbering from the largest term id already in the table
    try:
        term_id = mydb.select('SELECT id FROM terms order by id desc limit 1;')[0][0]
    except IndexError:
        term_id = 1

    # Get idf
    term_doc_list = idf(input_dir)
    term_hash = {}
    doc_number = len(document_list)
def load_pattern(self, dir_file, clip_file):
    self.dir_patterns = json_io.read_json(dir_file)
    self.clip_patterns = json_io.read_json(clip_file)
    if match:
        target = match.groups()[0]
    return source, target


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print >> sys.stderr, "Usage: <file>"
        exit(-1)

    # Accept either an edge-list text file or a JSON graph
    if sys.argv[1][-4:] != 'json':
        f = open(sys.argv[1])
        graph = read_graph(f)
    else:
        graph = json_io.read_json(sys.argv[1])

    if len(sys.argv) == 3:
        movie_dic = json_io.read_json(sys.argv[2])
        nodes = movie_dic.keys()
        for i in xrange(len(nodes)):
            nodes[i] = str(nodes[i])
    else:
        nodes = []
        for i in range(1, 26):
            nodes.append(str(i))

    graph = Graph(graph, nodes)
    s_rank = datetime.datetime.now()
    rank = page_rank(graph, 20, 0.85)  # presumably 20 iterations, 0.85 damping
def split_text(text):
    # Peel off the next 10-character chunk for fixed-width column display
    if len(text) >= 10:
        line = text[:10]
        text = text[10:]
    else:
        line = text
        text = None
    return text, line


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print >> sys.stderr, "Usage: <query string>"
        exit(-1)

    config = json_io.read_json('config.json')[u'database']
    mydb = IServDB(config[u'dbtype'], config[u'host'], config[u'dbname'],
                   config[u'username'], config[u'password'], config[u'encoding'], "")
    sql_query = ("SELECT * FROM \"twitter\" WHERE q = '%s' ORDER BY user_id ASC"
                 % (sys.argv[1]))
    response = mydb.select(sql_query)
    response = sorted(response, key=lambda row: row[3], reverse=True)

    if len(response) == 0:
        print "There is no data matching the query string."
    else:
        print '---------------------------------------------'
        print ("%10s | %10s | %10s" % ("text", "user_name", "user_id"))
        for row in response:
def reconstruct_role(recongition_merge_file, keword_list_file):
    keyword_to_frame = json_io.read_json(recongition_merge_file)
    keword_list = csv_io.read_csv(keword_list_file)
    leading_keyword = keword_list[0]

    # Attach the cropped face image to every detected face
    for keyword, frame_list in keyword_to_frame.iteritems():
        for frame in frame_list:
            for face in frame_list[frame]:
                name = keyword + str(face['frame_position']) + '.jpg'
                face['img'] = cv2.imread(OUTPUT_PATH + '/img/' + name)

    detector, matcher = cv_face.init_feature('orb')

    # Find other characters: group faces by face_id, rank the groups by size,
    # and keep the largest group as the keyword's main character
    face_list = {}
    character_list = {}
    for keyword, frame_list in keyword_to_frame.iteritems():
        print keyword
        for frame in frame_list:
            for face in frame_list[frame]:
                if face and face['face_id'] not in face_list:
                    face_list[face['face_id']] = []
                if face:
                    face_list[face['face_id']].append(face)
        rank = sorted(face_list, key=lambda k: len(face_list[k]), reverse=True)
        character_list[keyword] = [face_list[rank[0]]]
        i = 0
        #for face in face_list[rank[5]]:
        #    i += 1
        for j in rank:
            face = face_list[j][0]
            cv2.imwrite(OUTPUT_PATH + '/result2/' + keyword + str(i) + '.jpg', face['img'])
            i += 1
        # A hyphenated keyword names a pair, so keep a second face group too
        if len(rank) > 1 and '-' in keyword:
            character_list[keyword].append(face_list[rank[1]])
        '''
        for i in range(1, len(rank)):
            if cv_face.list_match(MIN_MATCH, character_list[keyword][0],
                                  face_list[rank[i]], detector, matcher):
                continue
            else:
                character_list[keyword].append(face_list[rank[i]])
                break
        if len(character_list[keyword]) == 1:
            character_list[keyword].append(face_list[rank[1]])'''
        face_list = {}
        print

    role_list = {}
    # Use the leading-role images to decide which candidate face group in a
    # 'lead-other' pair actually belongs to the other person
    lead_role_list = character_list[leading_keyword]
    for keyword, characters in character_list.iteritems():
        if keyword == leading_keyword or len(characters) < 2:
            continue
        if leading_keyword in keyword:
            print keyword, '---'
            match_count1 = 0
            match_count2 = 0
            for face in character_list[leading_keyword][0]:
                match_count1 += cv_face.get_match_rate(face['img'], characters[0][0]['img'])
            cv2.imwrite(OUTPUT_PATH + '/result/' + '000' + keyword + '.jpg', characters[0][0]['img'])
            for face in character_list[leading_keyword][0]:
                match_count2 += cv_face.get_match_rate(face['img'], characters[1][0]['img'])
            cv2.imwrite(OUTPUT_PATH + '/result/' + '001' + keyword + '.jpg', characters[1][0]['img'])
            # The group that matches the lead better is the lead; drop it and
            # keep the leftover group as the co-star
            if match_count1 > match_count2:
                print 'characters1', match_count1, match_count2
                del characters[0]
            else:
                print 'characters2', match_count1, match_count2
                del characters[1]
            role_list[keyword.split('-')[0]] = characters[0]

    for keyword, characters in character_list.iteritems():
        if leading_keyword in keyword or len(characters) < 2:
            continue
        important_person = keyword.split('-')[1]
        if important_person in role_list:
            print keyword, important_person, '---'
            match_count1 = 0
            match_count2 = 0
            for face in role_list[important_person]:
                match_count1 += cv_face.get_match_rate(face['img'], characters[0][0]['img'])
            #cv2.imwrite(OUTPUT_PATH + '/result/' + '000' + keyword + '.jpg', characters[0][0]['img'])
            for face in role_list[important_person]:
                match_count2 += cv_face.get_match_rate(face['img'], characters[1][0]['img'])
            #cv2.imwrite(OUTPUT_PATH + '/result/' + '001' + keyword + '.jpg', characters[1][0]['img'])
            if match_count1 > match_count2:
                print 'characters1', match_count1, match_count2
                del characters[0]
            else:
                print 'characters2', match_count1, match_count2
                del characters[1]
        else:
            del characters[1]

    # Output one representative image per character
    for keyword, characters in character_list.iteritems():
        for character in characters:
            if '-' in keyword:
                keyword = keyword.split('-')[0]
            cv2.imwrite(OUTPUT_PATH + '/result/' + keyword + '.jpg', character[0]['img'])
# -*- coding: utf-8 -*-
import sys

from my_class.DataDB import DataDB
from my_class.Document import Document
from modules import json_io
from doc_preprocessing import get_docs_list

if __name__ == '__main__':
    if len(sys.argv) >= 2:
        data_dir = sys.argv[1]
    else:
        data_dir = 'output/processed_data/'
    config = json_io.read_json('config.json')[u'database']
    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(data_dir)
    mydb = DataDB(config[u'dbtype'], config[u'host'], config[u'dbname'],
                  config[u'username'], config[u'password'], config[u'encoding'], "")

    table_name = "documents"
    key_list = ['doc_id', 'content']
    for doc in document_list:
        doc_obj = Document(doc_hash[doc], doc, data_dir)
        # Swap single quotes for double quotes so the content survives SQL quoting
        content = doc_obj.read().replace("'", '"')
        data_list = [str(doc_hash[doc]), content]
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2

from modules import json_io

pages = json_io.read_json('urls.json')
for category, urls in pages.iteritems():
    page_count = 0
    for url in urls:
        page_count += 1
        file_name = category + str(page_count)
        # Fetch each page and store it under pages/<category><n>
        with open('pages/' + file_name, 'w') as output_file:
            page = urllib2.urlopen(url).read()
            output_file.write(page)
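# Shape of urls.json as consumed above; the category name and URLs are
# illustrative assumptions.
#
#   {"news": ["http://example.com/page1", "http://example.com/page2"]}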
import os

from modules import json_io
from modules.CodeBook import CodeBook
from db.MyDB import MyDB


def get_table_name(table_dic, encoding, year):
    category = table_dic[encoding.decode('utf-8')]
    for table_name, years in category.iteritems():
        if year >= years[0] and year <= years[1]:
            return table_name


if __name__ == '__main__':
    ''' Read NHI .dat format data, decode it, and insert it into the db '''
    config = json_io.read_json('config.json')
    codebook = CodeBook(config[u'codebook'][u'path'])
    db_config = config[u'database']
    mydb = MyDB(db_config[u'dbtype'], db_config[u'host'], db_config[u'dbname'],
                db_config[u'username'], db_config[u'password'], db_config[u'encoding'], None)
    table_dic = config[u'table']
    for root, _, files in os.walk(config[u'data'][u'folder_path']):
        for f in files:
            print f
            # File name layout: characters 5-6 hold the encoding, 7-10 the year
            encoding = f[5:7]
            year = int(f[7:11])
            data = codebook.decode_file(os.path.join(root, f), encoding, year)
            table_name = get_table_name(table_dic, encoding, year)
def relationship_minig(min_votes, iter_stop):
    single_graph_file = 'scripts/output/single_graph.json'
    pair_graph_file = 'scripts/output/pair_graph.json'
    social_graph_file = 'scripts/output/single_graph.json'
    dir_file = 'scripts/input/dir_rel.json'
    clip_file = 'scripts/input/clip_rel.json'
    single_graph = json_io.read_json(single_graph_file)
    pair_graph = json_io.read_json(pair_graph_file)
    bi_graph, social_graph = graph_init(single_graph, pair_graph,
                                        social_graph_file, dir_file, clip_file)
    output_graph = {'nodes': [], 'links': []}
    node_dic = {}
    itr = 0  # iteration counter for Algorithm 1

    while True:
        # Pick the role pair whose dominant keyword currently has the most votes
        role_pair, dominant_keyword, votes = bi_graph.dominant_pair()
        if role_pair is None:
            break
        source, target, dir_prob = bi_graph.get_direction(role_pair, dominant_keyword)
        valid_tag = valid_checking(social_graph, source, target, dominant_keyword)

        if source not in node_dic:
            node_dic[source] = len(node_dic)
            output_graph['nodes'].append({'group': node_dic[source],
                                          'name': source,
                                          'ID': node_dic[source]})
        if target not in node_dic:
            node_dic[target] = len(node_dic)
            output_graph['nodes'].append({'group': node_dic[target],
                                          'name': target,
                                          'ID': node_dic[target]})

        if valid_tag is not False and votes >= int(min_votes):
            # valid_checking() may return True or a corrected relationship tag
            if not isinstance(valid_tag, unicode):
                print source, '-->', dominant_keyword, '-->', target
                add_links(output_graph, source, target, dominant_keyword, votes, node_dic)
                social_graph.relationship_tagging(source, target, dominant_keyword, votes)
            else:
                print source, '-->', valid_tag, '-->', target
                add_links(output_graph, source, target, valid_tag, votes, node_dic)
                social_graph.relationship_tagging(source, target, valid_tag, votes)
            print votes, dir_prob

        bi_graph.update_weighting(role_pair, dominant_keyword)
        if valid_tag:
            bi_graph.remove_keyword(role_pair, dominant_keyword)
        else:
            bi_graph.remove_edges(role_pair, dominant_keyword)

        itr += 1
        if itr == int(iter_stop):
            break

    json_io.write_json('result/social_graph.json', output_graph)
    social_graph.clear()
    social_graph.shutdown()
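# Usage sketch: mine relationships that have at least 3 supporting votes,
# capped at 100 iterations. Both thresholds are illustrative; the surrounding
# script normally passes them in from the command line.
#
#   relationship_minig(3, 100)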