def get_data(fields_to_retrieve, collection, config_fn, dataset_filename, filter_query=None):
    if dataset_filename and os.path.isfile(dataset_filename):
        df = pd.read_csv(dataset_filename)
    else:
        # NOTE: the config file and collection are hard-coded here, overriding
        # the values passed as arguments
        config_fn = 'config_mongo_inb.json'
        collection = 'rc_all'
        dbm = DBManager(collection=collection, config_fn=config_fn)
        if not filter_query:
            filter_query = {}
        data = dbm.get_tweets_reduced(filter_query, fields_to_retrieve)
        df = pd.DataFrame(data)
        data = None  # free some memory
        df.to_csv(dataset_filename, index=False)
    return df
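# --- Usage sketch (not part of the original code) ---
# A minimal, hypothetical call to get_data: the field list and CSV path are
# assumptions for illustration. When the CSV already exists it is read back;
# otherwise the tweets are fetched through DBManager and cached to the CSV.
fields = ['id_str', 'complete_text', 'lang']  # hypothetical projection
df = get_data(fields_to_retrieve=fields,
              collection='rc_all',
              config_fn='config_mongo_inb.json',
              dataset_filename='rc_all_reduced.csv')
print(df.shape)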
def setup_database_data():
    db_obj = DBManager(databases['books'])
    cleanup_query = [
        "delete from author_book",
        "delete from book",
        "delete from author"
    ]
    for query in cleanup_query:
        db_obj.processquery(query=query)
    setup_query = [
        "INSERT INTO `author` VALUES (2,'John Doe','2019-06-28 18:50:11'),(3,'Martin','2019-06-28 19:33:14'),(4,'Jeo','2019-06-29 00:37:02'),(5,'kelin','2019-06-29 01:37:16')",
        "INSERT INTO `book` VALUES (10,'Jungle book','123456',400,'No Books','UK States','2019-01-20','2019-06-29 01:24:16','2019-06-29 01:50:13')",
        "INSERT INTO `author_book` VALUES (14,10,2),(15,10,3),(16,10,4),(17,10,5)"
    ]
    for query in setup_query:
        db_obj.processquery(query=query)
    db_obj.commit()
    db_obj.close()
def get_mids_by_surface():
    surface = request.args.get('surface').strip()
    print '[get_mid_by_surface]'
    mids = DBManager.get_candidate_entities(surface, 0.1)
    print mids
    res = {
        'candidates': '<br>'.join('%s %s' % (m[0], m[1]) for m in mids)
    }
    return json.dumps(res)
def export_sentiment_scores_from_ids(file_tweet_ids, collection, config_fn):
    dbm = DBManager(collection=collection, config_fn=config_fn)
    tweets_to_export = []
    with open(file_tweet_ids, 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            tweet_id = row['id']
            print('Processing tweet: {}'.format(tweet_id))
            tweet_obj = dbm.find_record({'id_str': str(tweet_id)})
            if tweet_obj is None:
                print('Missing tweet...')
                continue
            tweet_to_export = {
                'id': tweet_id,
                'text': tweet_obj['complete_text']
            }
            sentiment_obj = tweet_obj['sentiment']
            if 'sentiment_score_polyglot' in sentiment_obj:
                tweet_to_export['score_polyglot'] = \
                    sentiment_obj['sentiment_score_polyglot']
            if 'sentiment_score_sentipy' in sentiment_obj:
                tweet_to_export['score_sentipy'] = \
                    sentiment_obj['sentiment_score_sentipy']
            if 'sentiment_score_affin' in sentiment_obj:
                tweet_to_export['score_affin'] = \
                    sentiment_obj['sentiment_score_affin']
            if 'sentiment_score_vader' in sentiment_obj:
                tweet_to_export['score_vader'] = \
                    sentiment_obj['sentiment_score_vader']
            tweet_to_export['sentiment_score'] = sentiment_obj['score']
            tweet_to_export['human_label'] = row['label']
            tweets_to_export.append(tweet_to_export)
    output_file = '../data/bsc/processing_outputs/sentiment_scores_from_ids.csv'
    print('Saving tweets to the CSV {}'.format(output_file))
    with open(output_file, 'w') as csv_file:
        headers = tweets_to_export[0].keys()
        csv_writer = csv.DictWriter(csv_file, fieldnames=headers)
        csv_writer.writeheader()
        for tweet_to_export in tweets_to_export:
            csv_writer.writerow(tweet_to_export)
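# --- Usage sketch (not part of the original code) ---
# Hypothetical call to export_sentiment_scores_from_ids: the input CSV path,
# collection, and config file name are assumptions. The input CSV is expected
# to have 'id' and 'label' columns, as read by the function above.
export_sentiment_scores_from_ids(file_tweet_ids='../data/bsc/labeled_tweet_ids.csv',
                                 collection='tweets_esp_hpai',
                                 config_fn='config_mongo_hpai.json')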
def upload_tweet_sentiment():
    print('Process started, it can take some time. Follow updates through '
          'the log...')
    dbm_local = DBManager(collection='tweets_esp_hpai')
    dbm_remote = DBManager(collection='bsc-ls', config_fn='config_mongo_hpai.json')
    query = {'$and': [{'lang': {'$in': SPAIN_LANGUAGES}},
                      {'$or': [{'place.country': 'Spain'},
                               {'user.location': {'$in': get_spain_places_regex()}}]}]}
    query.update({'retweeted_status': {'$exists': 0}})
    query.update({'sentiment_score': {'$exists': 1}})
    tweets = dbm_local.search(query)
    total_tweets = tweets.count()
    processing_counter = total_segs = modified_records = found_tweets = 0
    logging.info('Going to upload {0:,} tweets'.format(total_tweets))
    for tweet in tweets:
        start_time = time.time()
        processing_counter += 1
        logging.info('[{0}/{1}] Processing tweet:\n{2}'.
                     format(processing_counter, total_tweets, tweet['id']))
        sentiment_dict = {
            'sentiment': {
                'score': tweet['sentiment_score'] if tweet['sentiment_score_polyglot'] else None,
            }
        }
        if tweet['sentiment_score_polyglot']:
            sentiment_dict['sentiment']['raw_score_polyglot'] = \
                tweet['sentiment_score_polyglot']
        if 'sentiment_score_sentipy' in tweet:
            sentiment_dict['sentiment']['raw_score_sentipy'] = \
                tweet['sentiment_score_sentipy']
        if 'sentiment_score_affin' in tweet:
            sentiment_dict['sentiment']['raw_score_affin'] = \
                tweet['sentiment_score_affin']
        ret_update = dbm_remote.update_record({'id': int(tweet['id'])}, sentiment_dict)
        if ret_update.matched_count == 0:
            logging.info('Could not find in the remote server a tweet with '
                         'the id {}'.format(tweet['id']))
        elif ret_update.matched_count == 1:
            found_tweets += 1
            if ret_update.modified_count == 0:
                logging.info('Found tweet but did not update it.')
            elif ret_update.modified_count == 1:
                modified_records += 1
                logging.info('Remote tweet updated with sentiment info!')
        total_segs += calculate_remaining_execution_time(
            start_time, total_segs, processing_counter, total_tweets)
    logging.info('Total processed tweets: {0:,}\n'
                 'Total found tweets in remote server: {1:,}\n'
                 'Total updated tweets in remote server: {2:,}\n'.
                 format(total_tweets, found_tweets, modified_records))
    print('Process finished!')
def export_user_sample(sample_size, collection, config_file=None, output_filename=None):
    project_dir = pathlib.Path(__file__).parents[1].resolve()
    if not output_filename:
        output_filename = 'user_sample.jsonl'
    output = os.path.join(project_dir, 'data', output_filename)
    dbm = DBManager(collection=collection, config_fn=config_file)
    query_filter = {'lang': 'es'}
    projection = {'_id': 0, 'user': 1}
    logging.info('Getting sample of users, please wait...')
    tweets = dbm.get_sample(int(sample_size), query_filter, projection)
    total_tweets = len(tweets)
    logging.info('Found {} users'.format(total_tweets))
    saved_tweets = 0
    with open(output, 'w') as f:
        for i in range(total_tweets):
            user_obj = tweets[i]['user']
            if exists_user(user_obj):
                saved_tweets += 1
                logging.info('[{0}] Saving user: {1}'.format(
                    saved_tweets, user_obj['screen_name']))
                f.write("{}\n".format(json.dumps(user_obj)))
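# --- Usage sketch (not part of the original code) ---
# Hypothetical call to export_user_sample: sample size, collection, and file
# names are assumptions. It writes one JSON user object per line to
# data/user_sample.jsonl under the project directory.
export_user_sample(sample_size=500,
                   collection='tweets_esp_hpai',
                   config_file='config_mongo_hpai.json',
                   output_filename='user_sample.jsonl')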
def do_collection_merging(master_collection, collections_to_merge, config_fn=None):
    dbm_master = DBManager(collection=master_collection, config_fn=config_fn)
    for collection in collections_to_merge:
        logging.info('Merging collection {0} into {1}'.format(
            collection, master_collection))
        dbm_collection_to_merge = DBManager(collection=collection, config_fn=config_fn)
        tweets = dbm_collection_to_merge.find_all()
        logging.info('Trying to insert {0:,} tweets'.format(tweets.count()))
        try:
            ret_insertions = dbm_master.insert_many_tweets(tweets, ordered=False)
            # inserted_ids is a list; count the insertions before logging
            insertion_counter = len(ret_insertions.inserted_ids)
            logging.info('{0:,} new tweets were inserted into the collection {1}'.
                         format(insertion_counter, master_collection))
        except Exception as e:
            logging.error('Error when merging {}'.format(e))
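# --- Usage sketch (not part of the original code) ---
# Hypothetical call to do_collection_merging: collection and config names are
# assumptions. Insertion errors (e.g., duplicate ids) are caught and logged by
# the try/except around insert_many_tweets.
do_collection_merging(master_collection='tweets_all',
                      collections_to_merge=['tweets_week_01', 'tweets_week_02'],
                      config_fn='config_mongo_inb.json')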
def __init__(self):
    self.db_obj = DBManager(databases['books'])
    self.dao_obj = BooksDAO(self.db_obj)
class InternalBooks(object):
    """
    This is the class for managing the internal book store
    """

    def __init__(self):
        self.db_obj = DBManager(databases['books'])
        self.dao_obj = BooksDAO(self.db_obj)

    def __get_books_by_filter(self, key, value):
        """
        :param key:
        :param value:
        :return:
        """
        search_func = {
            "all": self.dao_obj.get_all_books,
            "id": self.dao_obj.get_books_by_id,
            "name": self.dao_obj.get_books_by_name,
            "publisher": self.dao_obj.get_books_by_publisher,
            "country": self.dao_obj.get_books_by_country,
            "release date": self.dao_obj.get_books_by_release_date,
        }
        result = search_func[key](value)
        for row in result:
            res_author = self.dao_obj.get_author_by_book_id(row['id'])
            author_list = [val['name'] for val in res_author]
            row.update({"authors": author_list})
            row["release_date"] = str(row["release_date"])
        return result

    @staticmethod
    def format_get_response(book_info):
        """
        :param book_info:
        :return:
        """
        response = {"status_code": 200, "status": "success", "data": []}
        if book_info:
            response.update({"data": book_info})
        return response

    def get_books(self, key, value):
        """
        :param key:
        :param value:
        :return:
        """
        try:
            logger.info("Entering get_books")
            result = self.__get_books_by_filter(key, value)
            if result and key == "id":
                result = result[0]
            logger.info("Exiting get_books")
            return self.format_get_response(result)
        except Exception as err:
            logger.exception(err)
            raise
        finally:
            self.db_obj.close()

    @staticmethod
    def format_insert_response(book_info):
        """
        :return:
        """
        response = {"status_code": 201, "status": "success", "data": []}
        book_info.pop("id")
        response["data"].append({"book": book_info})
        return response

    def __link_author_book(self, book_id, author_list):
        """
        :param book_id:
        :param author_list:
        :return:
        """
        for row in author_list:
            res_author = self.dao_obj.get_author(row)
            if res_author:
                author_id = res_author["author_id"]
            else:
                author_id = self.dao_obj.insert_into_author(row)
            self.dao_obj.insert_into_author_book(author_id, book_id)

    def insert_book(self, new_book):
        """
        This method is used to insert a new book
        :return:
        """
        try:
            logger.info("Entering insert_book")
            book_info = self.__get_books_by_filter("name", new_book["name"])
            if book_info:
                return self.format_insert_response(book_info[0])
            book_entity_obj = Book(new_book)
            book_id = self.dao_obj.insert_into_book(book_entity_obj)
            self.__link_author_book(book_id, book_entity_obj.authors)
            book_info = self.__get_books_by_filter("id", book_id)
            response = self.format_insert_response(book_info[0])
            self.db_obj.commit()
            logger.info("Exiting insert_book")
            return response
        except Exception as err:
            logger.exception(err)
            self.db_obj.rollback()
            raise
        finally:
            self.db_obj.close()

    @staticmethod
    def format_update_reponse(book_info):
        """
        :param book_info:
        :return:
        """
        response = {
            "status_code": 200,
            "status": "success",
            "message": "The book {} was updated successfully".format(book_info["name"])
        }
        response.update({"data": book_info})
        return response

    def patch_book(self, book_id, patch_book):
        """
        :return:
        """
        try:
            logger.info("Entering patch_book")
            book_info = self.__get_books_by_filter("id", book_id)
            if not book_info:
                raise ResourceNotAvailable(
                    "Given resource {} is not available".format(book_id))
            book_entity = Book(book_info[0])
            author_to_update = False
            for key, value in patch_book.items():
                setattr(book_entity, key, value)
                if key == "authors":
                    author_to_update = True
            self.dao_obj.update_into_book(book_id, book_entity)
            if author_to_update:
                self.dao_obj.delete_author_book(book_id)
                self.__link_author_book(book_id, patch_book["authors"])
            book_info = self.__get_books_by_filter("id", book_id)
            response = self.format_update_reponse(book_info[0])
            self.db_obj.commit()
            logger.info("Exiting patch_book")
            return response
        except Exception as err:
            self.db_obj.rollback()
            logger.exception(err)
            raise
        finally:
            self.db_obj.close()

    @staticmethod
    def format_delete_reponse(book_name):
        """
        :param book_name:
        :return:
        """
        response = {
            "status_code": 200,
            "status": "success",
            "message": "The book {} was deleted successfully".format(book_name),
            "data": []
        }
        return response

    def delete_book(self, book_id):
        """
        :param book_id:
        :return:
        """
        try:
            logger.info("Entering delete_book")
            book_info = self.__get_books_by_filter("id", book_id)
            if not book_info:
                raise ResourceNotAvailable(
                    "Given resource {} is not available".format(book_id))
            self.dao_obj.delete_author_book(book_id)
            self.dao_obj.delete_from_book(book_id)
            response = self.format_delete_reponse(book_info[0]["name"])
            self.db_obj.commit()
            logger.info("Exiting delete_book")
            return response
        except Exception as err:
            self.db_obj.rollback()
            logger.exception(err)
            raise
        finally:
            self.db_obj.close()
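# --- Usage sketch (not part of the original code) ---
# Hypothetical use of InternalBooks: the book name is an assumption. Note that
# every public method closes the database connection in its finally block, so a
# fresh instance is typically created per request/call.
books = InternalBooks()
response = books.get_books("name", "Jungle book")
print(response["status_code"], response["data"])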
def get_subgraph():
    mid = request.args.get('mid').strip()
    print '[get_subgraph]', mid
    subgraph = self.freebase.client.get_subgraph(mid)
    print "subgraph", subgraph
    links = []
    nodes_ = {}
    for path in subgraph:
        if len(path) == 1:
            p1 = path[0]
            if p1[0] not in nodes_:
                nodes_[p1[0]] = {'category': 0, 'name': p1[0], 'value': 10}
            if p1[2] not in nodes_:
                nodes_[p1[2]] = {'category': 2, 'name': p1[2], 'value': 4}
        else:
            p1 = path[0]
            if p1[0] not in nodes_:
                nodes_[p1[0]] = {'category': 0, 'name': p1[0], 'value': 10}
            if p1[2] not in nodes_:
                nodes_[p1[2]] = {'category': 1, 'name': p1[2], 'value': 4}
            p2 = path[1]
            if p2[2] not in nodes_:
                nodes_[p2[2]] = {'category': 2, 'name': p2[2], 'value': 4}
    for m in nodes_.keys():
        name, name_info = DBManager.get_name(m)
        nodes_[m]['label'] = name
    nodes = nodes_.values()
    for path in subgraph:
        if len(path) == 1:
            t = path[0]
            links.append({
                'source': nodes_[t[0]]['name'],
                'target': nodes_[t[2]]['name'],
                'weight': 2,
                'name': t[1]
            })
        else:
            t = path[0]
            links.append({
                'source': nodes_[t[0]]['name'],
                'target': nodes_[t[2]]['name'],
                'weight': 2,
                'name': t[1]
            })
            t = path[1]
            links.append({
                'source': nodes_[t[0]]['name'],
                'target': nodes_[t[2]]['name'],
                'weight': 2,
                'name': t[1]
            })
    print 'node', nodes
    print 'links', links
    return json.dumps({'nodes': nodes, 'links': links})
def __init__(self, colletion=None):
    # the original condition 'if not None' was always true; guard on the argument instead
    if colletion is not None:
        self.__dbm_tweets = DBManager(colletion)
        self.__dbm_users = DBManager('users')
        self.__dbm_networks = DBManager('networks')
        self.__network = []
class NetworkAnalyzer:
    __dbm_tweets = None
    __dbm_users = None
    __dbm_networks = None
    __network = None
    __graph = None
    __nodes = set()
    __unknown_users = set()
    __node_sizes = None

    def __init__(self, colletion=None):
        # the original condition 'if not None' was always true; guard on the argument instead
        if colletion is not None:
            self.__dbm_tweets = DBManager(colletion)
            self.__dbm_users = DBManager('users')
            self.__dbm_networks = DBManager('networks')
            self.__network = []

    def __computer_ff_ratio(self, friends, followers):
        if followers > 0 and friends > 0:
            return friends / followers
        else:
            return 0

    # Get in-interactions of a given user
    def get_in_interactions(self, user_screen_name):
        # compute in interactions, meaning, interactions in which the user
        # was mentioned, retweeted, quoted, replied
        in_inter_query = {
            'interactions.' + user_screen_name: {'$exists': 1},
            'screen_name': {'$ne': user_screen_name}
        }
        n_users = self.__dbm_users.search(in_inter_query)
        in_interactions_dict, in_rts, in_rps = {}, {}, {}
        in_qts, in_mts = {}, {}
        total_in_interactions = 0
        total_in_retweets, total_in_replies = 0, 0
        total_in_mentions, total_in_quotes = 0, 0
        for n_user in n_users:
            n_user_interactions = n_user['interactions']
            for i_user, interactions in n_user_interactions.items():
                if i_user == user_screen_name:
                    in_interactions_dict[n_user['screen_name']] = interactions['total']
                    total_in_interactions += interactions['total']
                    if 'retweets' in interactions.keys():
                        total_in_retweets += interactions['retweets']
                        in_rts[n_user['screen_name']] = interactions['retweets']
                    if 'replies' in interactions.keys():
                        total_in_replies += interactions['replies']
                        in_rps[n_user['screen_name']] = interactions['replies']
                    if 'mentions' in interactions.keys():
                        total_in_mentions += interactions['mentions']
                        in_mts[n_user['screen_name']] = interactions['mentions']
                    if 'quotes' in interactions.keys():
                        total_in_quotes += interactions['quotes']
                        in_qts[n_user['screen_name']] = interactions['quotes']
        in_interactions_obj = {
            'total': {'count': total_in_interactions, 'details': in_interactions_dict},
            'replies': {'count': total_in_replies, 'details': in_rps},
            'retweets': {'count': total_in_retweets, 'details': in_rts},
            'mentions': {'count': total_in_mentions, 'details': in_mts},
            'quotes': {'count': total_in_quotes, 'details': in_qts}
        }
        user_dict = {'in_interactions': in_interactions_obj}
        return user_dict

    # Get out-interactions of a given user
    def get_out_interactions(self, user_screen_name):
        user = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        # compute out interactions, meaning, interactions originated by
        # the user
        user_interactions = user['interactions']
        out_interactions_dict, out_rts = {}, {}
        out_rps, out_qts, out_mts = {}, {}, {}
        total_out_interactions, total_out_retweets = 0, 0
        total_out_mentions, total_out_replies = 0, 0
        total_out_quotes = 0
        for recipient, interactions in user_interactions.items():
            out_interactions_dict[recipient] = interactions['total']
            total_out_interactions += interactions['total']
            if 'retweets' in interactions:
                total_out_retweets += interactions['retweets']
                out_rts[recipient] = interactions['retweets']
            if 'replies' in interactions:
                total_out_replies += interactions['replies']
                out_rps[recipient] = interactions['replies']
            if 'mentions' in interactions:
                total_out_mentions += interactions['mentions']
                out_mts[recipient] = interactions['mentions']
            if 'quotes' in interactions:
                total_out_quotes += interactions['quotes']
                out_qts[recipient] = interactions['quotes']
        out_interactions_obj = {
            'total': {'count': total_out_interactions, 'details': out_interactions_dict},
            'replies': {'count': total_out_replies, 'details': out_rps},
            'retweets': {'count': total_out_retweets, 'details': out_rts},
            'mentions': {'count': total_out_mentions, 'details': out_mts},
            'quotes': {'count': total_out_quotes, 'details': out_qts}
        }
        # compile all information in a dictionary
        user_dict = {'out_interactions': out_interactions_obj}
        return user_dict

    def create_users_db(self, clear_collection=False):
        logging.info('::. Network Analyzer: Creating database of users, '
                     'it can take several minutes, please wait...')
        if clear_collection:
            self.__dbm_users.clear_collection()
        users = self.__dbm_tweets.get_unique_users()
        users_count = len(users)
        logging.info('::. Network Analyzer: Extracted {0} unique users '
                     'from the database...'.format(users_count))
        progress = 1
        for user in users:
            db_user = {
                'screen_name': user['screen_name'],
                'friends': user['friends'],
                'followers': user['followers'],
                'ff_ratio': self.__computer_ff_ratio(user['friends'], user['followers']),
                'interactions': user['interactions'],
                'tweets': user['tweets_count'],
                'original_tweets': user['original_count'],
                'rts': user['retweets_count'],
                'qts': user['quotes_count'],
                'rps': user['replies_count'],
                'verified': user['verified']
            }
            filter_query = {'screen_name': user['screen_name']}
            logging.debug('::. Network Analyzer: Updating/creating user {0} ({1}/{2})...'
                          .format(user['screen_name'], progress, users_count))
            progress += 1
            self.__dbm_users.update_record(filter_query, db_user,
                                           create_if_doesnt_exist=True)

    def generate_network(self, subnet_query={}, depth=1, file_name='network',
                         override_net=False):
        net_query = subnet_query.copy()
        net_query.update({'depth': depth})
        ret_net = self.__dbm_networks.search(net_query)
        # the net doesn't exist yet, let's create it
        if ret_net.count() == 0 or override_net:
            logging.info('Generating the network, it can take several minutes, '
                         'please wait...')
            users = self.__dbm_users.search(subnet_query)
            # for each user generate his/her edges
            for user in users:
                if 'ff_ratio' in user.keys():
                    u_ff_ratio = user['ff_ratio']
                else:
                    u_ff_ratio = self.__computer_ff_ratio(user['friends'],
                                                          user['followers'])
                exists = user['exists'] if 'exists' in user.keys() else ''
                self.__nodes.add(tuple({
                    'screen_name': user['screen_name'],
                    'ff_ratio': u_ff_ratio,
                    'exists': exists
                }.items()))
                for interacted_user, interactions in user['interactions'].items():
                    iuser = self.__dbm_users.find_record({'screen_name': interacted_user})
                    if not iuser:
                        if depth > 1:
                            iuser_ffratio = self.__get_ffratio(interacted_user)
                            if not iuser_ffratio:
                                self.__unknown_users.add(interacted_user)
                                continue
                        else:
                            self.__unknown_users.add(interacted_user)
                            continue
                    else:
                        if 'ff_ratio' in iuser.keys():
                            i_ff_ratio = iuser['ff_ratio']
                        else:
                            i_ff_ratio = self.__computer_ff_ratio(iuser['friends'],
                                                                  iuser['followers'])
                        exists_iuser = iuser['exists'] if 'exists' in iuser.keys() else ''
                        self.__nodes.add(tuple({
                            'screen_name': iuser['screen_name'],
                            'ff_ratio': i_ff_ratio
                        }.items()))
                    edge = {
                        'nodeA': {
                            'screen_name': user['screen_name'],
                            'ff_ratio': u_ff_ratio,
                            'exists': exists
                        },
                        'nodeB': {
                            'screen_name': interacted_user,
                            'ff_ratio': i_ff_ratio,
                            'exists': exists_iuser
                        },
                        'weight': interactions['total']
                    }
                    self.__network.append(edge)
            logging.info('Created a network of {0} nodes and {1} edges'.format(
                len(self.__nodes), len(self.__network)))
            logging.info('Unknown users {0}'.format(len(self.__unknown_users)))
            # save the net in a gefx file for posterior usage
            f_name = self.save_network_in_gexf_format(file_name)
            logging.info('Saved the network in the file {0}'.format(f_name))
            db_net = {'file_name': str(f_name)}
            db_net.update(net_query)
            self.__dbm_networks.save_record(db_net)
        else:
            f_net = ret_net[0]
            logging.info('The network was already generated, please find it at {0}'.
                         format(f_net['file_name']))

    def create_graph(self):
        logging.info('Creating the graph, please wait...')
        self.__graph = net.DiGraph()
        ff_ratio = defaultdict(lambda: 0.0)
        # create a directed graph from the edge data and populate a dictionary
        # with the friends/followers ratio
        for edge in self.__network:
            user = edge['nodeA']['screen_name']
            interacted_with = edge['nodeB']['screen_name']
            num_interactions = edge['weight']
            u_ff_ratio = edge['nodeA']['ff_ratio']
            self.__graph.add_edge(user, interacted_with, weight=int(num_interactions))
            ff_ratio[user] = float(u_ff_ratio)
        # obtain central node
        # degrees = net.degree(self.__graph)
        # central_node, max_degree = sorted(degrees, key=itemgetter(1))[-1]
        # center the graph around the central node
        # ego_graph = net.DiGraph(net.ego_graph(self.__graph, central_node))
        return

    def get_graph_nodes(self):
        return len(self.__nodes)

    def get_graph_edges(self):
        return len(self.__network)

    def get_graph(self):
        return self.__graph

    def get_node_sizes(self):
        return self.__node_sizes

    def __get_ffratio(self, screen_name):
        query = {
            '$or': [
                {'user.screen_name': screen_name},
                {'retweeted_status.user.screen_name': screen_name},
                {'quoted_status.user.screen_name': screen_name}
            ]
        }
        tweet_obj = self.__dbm_tweets.find_record(query)
        if tweet_obj:
            tweet = tweet_obj['tweet_obj']
            if 'retweeted_status' in tweet.keys():
                return self.__computer_ff_ratio(
                    tweet['retweeted_status']['user']['friends_count'],
                    tweet['retweeted_status']['user']['followers_count'])
            elif 'quoted_status' in tweet.keys():
                return self.__computer_ff_ratio(
                    tweet['quoted_status']['user']['friends_count'],
                    tweet['quoted_status']['user']['followers_count'])
            else:
                return self.__computer_ff_ratio(
                    tweet['user']['friends_count'],
                    tweet['user']['followers_count'])
        else:
            return None

    def save_network_in_gexf_format(self, file_name):
        today = datetime.strftime(datetime.now(), '%m/%d/%y')
        f_name = pathlib.Path(__file__).parents[1].joinpath(
            'sna', 'gefx', file_name + '_' + today + '.gexf')
        with open(str(f_name), 'w', encoding='utf-8') as f:
            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            f.write('<gexf xmlns="http://www.gexf.net/1.2draft" '
                    'xmlns:viz="http://www.gexf.net/1.1draft/viz" '
                    'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
                    'xsi:schemaLocation="http://www.gexf.net/1.2draft '
                    'http://www.gexf.net/1.2draft/gexf.xsd" '
                    'version="1.2">\n')
            f.write('<meta lastmodifieddate="{0}">\n'.format(today))
            f.write('<creator>NetworkAnalysis</creator>\n')
            f.write('<description>{0}</description>\n'.format(file_name))
            f.write('</meta>\n')
            f.write('<graph mode="static" defaultedgetype="directed">\n')
            # add data attributes
            f.write('<attributes class="node">\n')
            f.write('<attribute id="2" title="ff_ratio" type="float"/>\n')
            f.write('<attribute id="5" title="exists" type="float"/>\n')
            f.write('</attributes>\n')
            # add nodes
            f.write('<nodes>\n')
            node_id = 0
            list_nodes = []
            for node_tup in self.__nodes:
                node = dict(node_tup)
                f.write('<node id="{0}" label="{1}">\n'.format(
                    node_id, node['screen_name']))
                f.write('<attvalues>\n')
                f.write('<attvalue for="2" value="{0}"/>\n'.format(node['ff_ratio']))
                f.write('</attvalues>\n')
                # f.write('<viz:size value="{0}"/>\n'.format(node['ff_ratio']))
                f.write('</node>\n')
                node_id += 1
                list_nodes.append(node['screen_name'])
            f.write('</nodes>\n')
            # add edges
            f.write('<edges>\n')
            edge_id = 0
            for edge in list(self.__network):
                id_vertexA = list_nodes.index(edge['nodeA']['screen_name'])
                id_vertexB = list_nodes.index(edge['nodeB']['screen_name'])
                weight = edge['weight']
                f.write('<edge id="{0}" source="{1}" target="{2}" weight="{3}"/>\n'
                        .format(edge_id, id_vertexA, id_vertexB, weight))
                edge_id += 1
            f.write('</edges>\n')
            f.write('</graph>\n')
            f.write('</gexf>\n')
        return f_name
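# --- Usage sketch (not part of the original code) ---
# Hypothetical use of NetworkAnalyzer: the collection name is an assumption.
# Typical flow: build the users collection from tweets, generate the interaction
# network (saved as a GEXF file), then build the directed graph in memory.
na = NetworkAnalyzer(colletion='tweets')
na.create_users_db(clear_collection=False)
na.generate_network(subnet_query={}, depth=1, file_name='network')
na.create_graph()
print(na.get_graph_nodes(), na.get_graph_edges())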
class FaceRecognition(QMainWindow):
    def __init__(self):
        super(FaceRecognition, self).__init__()
        loadUi('ui/main.ui', self)
        self.webcam = WebCam(self.lblCamera, QTimer(self), self.recognize_each_frame)
        self.webcam.start()
        self.db_manager = DBManager()
        self.recognizer = Recognizer(self.db_manager, transform)
        self.btRecognize.clicked.connect(self.recognize)
        self.btAddToDB.clicked.connect(self.add_to_db)
        self.lblres = ImageWidget(self.lblResult)
        self.img = None
        self.clear_result_fields()
        self.btReset.clicked.connect(self.clear_result_fields)

    '''
    Args:
        img(array): current frame

    Called by the WebCam object on each frame. Updates the img field so that
    we can access it in other methods of the class.

    Returns:
        If the display box is checked, recognizes faces and draws a box and the
        person's data on the frame. If not, returns img without any transforms.
    '''
    def recognize_each_frame(self, img):
        self.img = img
        if self.img is None:
            self.clear_result_fields()
            return
        if self.btDisplay.isChecked():
            clients, locations = self.recognizer.recognize(self.img)
            clients = list(clients)
            img = self.recognizer.draw_results(img, locations, clients)
        return img

    '''
    Runs recognition on the frame at a specific moment in time.
    Updates the result fields.
    '''
    def recognize(self):
        # clear fields
        self.clear_result_fields()
        # do recognition
        clients, _ = self.recognizer.recognize(self.img)
        clients = list(clients)
        self.btReset.setEnabled(True)
        # if there is no match, the person is Unknown
        if len(clients) == 0:
            self.fill_result_fields('Unknown', noname_img)
            return
        # get the first found result (expected to have only one person on frame)
        self.fill_result_fields(self.db_manager.get_fullname(clients[0]),
                                clients[0]['img_path'])

    def add_to_db(self):
        self.widget = DataDialog(self, self.img, self.db_manager)
        self.widget.show()

    def clear_result_fields(self):
        self.btReset.setEnabled(False)
        self.fill_result_fields('', noname_img)

    def fill_result_fields(self, text, path):
        # type: (str, str) -> None
        self.lblName.setText(text)
        img = cv2.imread(path)
        self.lblres.displayImage(img)
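# --- Usage sketch (not part of the original code) ---
# Hypothetical application entry point for FaceRecognition, assuming a PyQt5
# environment (the QApplication import and sys usage below are assumptions).
import sys
from PyQt5.QtWidgets import QApplication

app = QApplication(sys.argv)
window = FaceRecognition()
window.show()
sys.exit(app.exec_())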
def do_update_collection(collection_name, source_collection, end_date,
                         start_date=None, config_fn=None):
    dbm_weekly_collection = DBManager(config_fn=config_fn)
    # Create the collection if it does not exist
    created_collection = dbm_weekly_collection.create_collection(collection_name)
    if created_collection:
        logging.info('Creating collection: {}...'.format(collection_name))
        dbm_weekly_collection.create_index('id', 'asc', unique=True)
        logging.info('Creating index: id...')
    else:
        logging.info('Setting collection of database to {}'.format(collection_name))
        dbm_weekly_collection.set_collection(collection_name)
    dbm_source = DBManager(collection=source_collection, config_fn=config_fn)
    # If no start date is passed, then use today's date
    if not start_date:
        start_date = datetime.today().strftime('%Y-%m-%d')
    query = {'created_at_date': {'$gte': start_date, '$lte': end_date}}
    logging.info('Searching for tweets between {0} and {1}...'.
                 format(start_date, end_date))
    tweets_to_copy = dbm_source.search(query)
    logging.info('Going to insert {0:,} tweets into the collection {1}'.
                 format(tweets_to_copy.count(), collection_name))
    try:
        ret_insertions = dbm_weekly_collection.insert_many_tweets(tweets_to_copy,
                                                                  ordered=False)
        # inserted_ids is a list; count the insertions before logging
        insertion_counter = len(ret_insertions.inserted_ids)
        logging.info('{0:,} new tweets were inserted into the collection {1}'.
                     format(insertion_counter, collection_name))
    except Exception as e:
        logging.error('Error when inserting tweets: {}'.format(e))
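# --- Usage sketch (not part of the original code) ---
# Hypothetical call to do_update_collection: collection names, dates, and config
# file are assumptions; dates are passed as '%Y-%m-%d' strings, matching the
# format the function uses for its default start_date.
do_update_collection(collection_name='tweets_week_2020_04_06',
                     source_collection='tweets_esp_hpai',
                     end_date='2020-04-12',
                     start_date='2020-04-06',
                     config_fn='config_mongo_hpai.json')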
def gen_unsolved_sentence(fn_in, fn_out):
    avg_candidate = 0
    num = 0
    with open(fn_in) as fin, open(fn_out, 'w') as fout:
        for line in fin:
            data = json.loads(line, encoding='utf8')
            gold_entity = data['entity']
            surfaces = data['predict'].split("\t")
            candidates = dict()
            for surface in surfaces:
                surface = surface.lower().replace(' ', '')
                res = DBManager.get_candidate_entities(surface, 0.1)
                for e in res:
                    if e[0] not in candidates or e[1] > candidates[e[0]]:
                        candidates[e[0]] = e[1]
            if len(candidates) == 0:
                sentence = [w.split('|')[0] for w in data['tag_res'].split()][1:-1]
                if 'pos' in data:
                    all_pos = data['pos'][1:-1]
                else:
                    all_pos = None
                # use ngrams of the surface
                for surface in surfaces:
                    surface = surface.lower().split()
                    if len(surface) == 0:
                        continue
                    start = find_word(sentence, surface)
                    if start == -1:
                        continue
                    l = len(surface)
                    found = False
                    for j in range(l, 0, -1):
                        # if found:
                        #     break
                        for i in range(l - j + 1):
                            if 'pos' not in data or is_entity_occurrence(
                                    all_pos, sentence, start + i, start + i + j):
                                s = ''.join(surface[i:i + j])
                                res = DBManager.get_candidate_entities(s, 0.1)
                                for e in res:
                                    if e[1] < 1.1 and (e[0] not in candidates
                                                       or e[1] > candidates[e[0]]):
                                        candidates[e[0]] = e[1]
                                found = len(res) > 0
            # candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)[:20]
            candidates = candidates.items()
            correct = False
            for e, _ in candidates:
                if e == gold_entity:
                    avg_candidate += len(candidates)
                    num += 1
                    correct = True
                    break
            print >> fout, ("%s\t%s" % (gold_entity, ' '.join(
                [c for c, _ in candidates]))).encode('utf8')
            if not correct:
                # print >> fout, line.strip(), candidates
                print surfaces, data['gold'].split('\t'), gold_entity
            # else:
            #     print line.strip()
            #     print candidates
    print "found the correct topic entity for %s sentences" % num
    print "average number of candidate entities: %s" % (avg_candidate * 1.0 / num)
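# --- Usage sketch (not part of the original code) ---
# Hypothetical call to gen_unsolved_sentence (Python 2, like the function above):
# the input JSONL of predictions and the output path are assumptions.
gen_unsolved_sentence('unsolved_sentences.json', 'unsolved_candidates.txt')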