# Imports assumed by the snippets below (the original extract omits them).
# MySQLDatabase and the DATABASE_*/TMP/HOME/SSD_HOME constants come from the
# project's wsd.database and conf modules; helpers such as read_pickle,
# write_pickle, resultTableLine, map_to_hyp_indicies, norm and create_eprop
# are defined elsewhere in the project (illustrative sketches of some of them
# follow the functions that use them).
import MySQLdb
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.stats import pearsonr, spearmanr, kendalltau
from graph_tool.all import *
from wsd.database import MySQLDatabase
from conf import *


def build_links_position_table():
    """creates the basic database structure"""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE `redirects_candidates` ('
                   '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
                   '`source_article_id` BIGINT UNSIGNED NOT NULL,'
                   '`target_article_id` BIGINT UNSIGNED NULL,'
                   '`target_article_name` VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                   ' target_position_in_text INT UNSIGNED NOT NULL,'
                   ' target_position_in_text_only INT UNSIGNED,'
                   ' target_position_in_section INT UNSIGNED,'
                   ' target_position_in_section_in_text_only INT UNSIGNED,'
                   ' section_name VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                   ' section_number INT UNSIGNED,'
                   ' target_position_in_table INT UNSIGNED,'
                   ' table_number INT UNSIGNED,'
                   ' table_css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
                   ' table_css_style VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
                   ' target_x_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
                   ' target_y_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
                   'INDEX(`target_article_id`),'
                   'INDEX(`source_article_id`)'
                   ') ENGINE=InnoDB;')
    connection.close()
def pickle_vis_data_pandas():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    df = pd.read_sql('select source_article_id, target_article_id, target_y_coord_1920_1080, '
                     'target_x_coord_1920_1080, visual_region from link_features', conn)
    print len(df)
    # keep one row per (source, target) pair, preferring the top-most / left-most occurrence
    no_dup = df.sort(['source_article_id', 'target_y_coord_1920_1080', 'target_x_coord_1920_1080']).groupby(
        ["source_article_id", "target_article_id"]).first()
    print len(no_dup)

    feature = no_dup.loc[no_dup['visual_region'] == 'lead']
    print len(feature)
    feature.reset_index(inplace=True)
    # export the lead links as well (read back later as lead.tsv by weighted_pagerank_hyp_engineering)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/lead.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'infobox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/infobox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'navbox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/navbox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'left-body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/left-body.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/body.tsv', sep='\t', index=False)
def build_links_position_table():
    """creates the basic database structure"""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE `links` ('
                   '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
                   '`source_article_id` BIGINT UNSIGNED NOT NULL,'
                   '`target_article_id` BIGINT UNSIGNED NOT NULL,'
                   ' target_position_in_text INT UNSIGNED NOT NULL,'
                   ' target_position_in_text_only INT UNSIGNED,'
                   ' target_position_in_section INT UNSIGNED,'
                   ' target_position_in_section_in_text_only INT UNSIGNED,'
                   ' section_name VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                   ' section_number INT UNSIGNED,'
                   ' target_position_in_table INT UNSIGNED,'
                   ' table_number INT UNSIGNED,'
                   ' table_css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
                   ' table_css_style VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
                   ' target_x_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
                   ' target_y_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
                   'INDEX(`target_article_id`),'
                   'INDEX(`source_article_id`)'
                   ') ENGINE=InnoDB;')
    connection.close()
def pickle_correlations_zeros():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id, IFNULL(counts, 0) as counts '
                     'from link_features group by source_article_id, target_article_id', conn)
    print 'group'
    article_counts = df.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    print 'write to file'
    article_counts[["target_article_id", "counts"]].to_csv(TMP + 'article_counts.tsv', sep='\t', index=False)
def build_page_length_table():
    """creates the basic database structure"""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE `page_length` ('
                   '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY,'
                   ' page_length_1920_1080 INT UNSIGNED DEFAULT NULL'
                   ') ENGINE=InnoDB;')
    connection.close()
def build_page_length_table():
    """creates the basic database structure"""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE `redirects_candidates_page_length` ('
                   '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY,'
                   ' page_length_1920_1080 INT UNSIGNED DEFAULT NULL'
                   ') ENGINE=InnoDB;')
    connection.close()
def plot_degree_filtered_sql():
    print 'before select'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()

    cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences where source_article_id in '
                   ' (select distinct prev_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)
    for i, link in enumerate(result):
        if i % 1000000 == 0:
            print i, len(result)
        network.add_edge(link[0], link[1])
    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
    print 'before save'
    network.save("output/wikipedianetworkfilteredwithtransitions_prev_id.xml.gz")
    print 'done'

    cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences where target_article_id in '
                   ' (select distinct curr_id from clickstream_derived_internal_links);')
    result = cursor.fetchall()
    network = Graph()
    print 'after select'
    print 'result len'
    print len(result)
    for i, link in enumerate(result):
        if i % 1000000 == 0:
            print i, len(result)
        network.add_edge(link[0], link[1])
    # filter all nodes that have no edges
    print 'filter nodes with degree zero graph tool specific code'
    network = GraphView(network, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
    print 'before save'
    network.save("output/wikipedianetworkfilteredwithtransitions_curr_id.xml.gz")
    print 'done'
def build_table():
    """creates the basic database structure"""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE `table_css_class` ('
                   '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
                   '`source_article_id` BIGINT UNSIGNED NOT NULL,'
                   ' css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                   'INDEX(`source_article_id`)'
                   ') ENGINE=InnoDB;')
    connection.close()
def correlations(network_name):
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    # wikipedia graph structural statistics
    results = None
    try:
        results = cursor.execute('select c.curr_id, sum(c.counts) as counts from clickstream_derived c '
                                 'where c.link_type_derived= %s group by c.curr_id;', ("internal-link",))
        results = cursor.fetchall()
    except MySQLdb.Error, e:
        print ('error retrieving clickstream counts %s (%d)' % (e.args[1], e.args[0]))
def pickle_correlations_zeros_january():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
    print 'loaded links'
    df2 = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501 '
                      'where link_type_derived= "internal-link";', conn)
    print 'loaded counts'
    result = pd.merge(df, df2, how='left',
                      left_on=['source_article_id', 'target_article_id'],
                      right_on=['prev_id', 'curr_id'])
    print 'merged counts'
    print result
    article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    article_counts['counts'].fillna(0.0, inplace=True)
    print article_counts
    print 'write to file'
    article_counts[["target_article_id", "counts"]].to_csv(TMP + 'january_article_counts.tsv', sep='\t', index=False)
def rbo():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    sm = []
    try:
        cursor.execute('select curr_id, sum(counts) as counts_sum, curr_title from clickstream_derived '
                       'where link_type_derived=%s group by curr_id order by counts_sum desc limit 10000;',
                       ("entry-sm",))
        result = cursor.fetchall()
        for row in result:
            record = {}
            record['curr_id'] = row[0]
            record['counts_sum'] = row[1]
            record['curr_title'] = row[2]
            sm.append(row[0])
    except MySQLdb.Error, e:
        print e
def get_redirecsfromXML(self, dump_date):
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    df = pd.read_sql('select * from redirects', conn)
    return df.set_index('source_article_name')['target_article_name'].to_dict()
def print_table():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    df = pd.read_sql('select source_article_id, target_article_id, rel_degree, rel_in_degree, rel_out_degree, '
                     'rel_page_rank, rel_kcore, target_x_coord_1920_1080, target_y_coord_1920_1080, visual_region, '
                     'IFNULL(counts, 0) as counts from link_features order by source_article_id, '
                     'target_y_coord_1920_1080, target_x_coord_1920_1080', conn)
    print "dup"
    #no_dup = df.sort(['source_article_id','target_y_coord_1920_1080','target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    no_dup = df.groupby(["source_article_id", "target_article_id"]).first()
    no_dup = no_dup.reset_index()
    print "no dup"
    del df
    #print no_dup
    df_top = pd.read_sql("select source_article_id, target_article_id, sim as topic_similarity from topic_similarity", conn)
    print "no up"
    topDF = df_top.groupby("source_article_id", as_index=False)["topic_similarity"].median()
    #print topDF
    print "no up1"
    topDF.columns = ["source_article_id", "topic_similarity_article_median"]
    #print topDF
    print "no up2"
    df_top = df_top.merge(topDF, on="source_article_id")
    #print df_top[(df_top['topic_similarity_article_median'] >0)]
    print "no up3"
    df_sem = pd.read_sql("select source_article_id, target_article_id, sim as sem_similarity from semantic_similarity", conn)
    print "no up4"
    semDF = df_sem.groupby("source_article_id", as_index=False)["sem_similarity"].median()
    #rename
    print "no up5"
    semDF.columns = ["source_article_id", "sem_similarity_article_median"]
    print "no up6"
    #print df_top
    df_sem = df_sem.merge(semDF, on="source_article_id")
    #print len(df_sem)
    print "no up7"
    df1 = no_dup.merge(df_sem[['source_article_id', 'sem_similarity', 'sem_similarity_article_median']],
                       on="source_article_id")
    #print no_dup
    del df_sem, semDF
    df = no_dup.merge(df_top[['source_article_id', 'topic_similarity', 'topic_similarity_article_median']],
                      on="source_article_id")
    print "no up9"
    del no_dup
    del df_top, topDF

    table = ""
    table += resultTableLine(df, "src_degr > target_degr", "df.rel_degree > 0")
    table += resultTableLine(df, "src_degr <= target_degr", "df.rel_degree <= 0")
    table += resultTableLine(df, "src_in_degr > target_in_degr", "df.rel_in_degree > 0")
    table += resultTableLine(df, "src_in_degr <= target_in_degr", "df.rel_in_degree <= 0")
    table += resultTableLine(df, "src_out_degr > target_out_degr", "df.rel_out_degree > 0")
    table += resultTableLine(df, "src_out_degr <= target_out_degr", "df.rel_out_degree <= 0")
    table += resultTableLine(df, "src_kcore > target_kcore", "df.rel_kcore > 0")
    table += resultTableLine(df, "src_kcore <= target_kcore", "df.rel_kcore <= 0")
    table += resultTableLine(df, "src_page_rank > target_page_rank", "df.rel_page_rank > 0")
    table += resultTableLine(df, "src_page_rank <= target_page_rank", "df.rel_page_rank <= 0")
    table += resultTableLine(df1, "text_sim > median(text_sim) of page", "df.sem_similarity > df.sem_similarity_article_median")
    table += resultTableLine(df1, "text_sim <= median(text_sim) of page", "df.sem_similarity <= df.sem_similarity_article_median")
    table += resultTableLine(df, "topic_sim > median(topic_sim) of page", "df.topic_similarity > df.topic_similarity_article_median")
    table += resultTableLine(df, "topic_sim <= median(topic_sim) of page", "df.topic_similarity <= df.topic_similarity_article_median")
    table += resultTableLine(df, "left third of screen", "df.target_x_coord_1920_1080 <= 360")
    table += resultTableLine(df, "middle third of screen", "(df.target_x_coord_1920_1080 > 360) & (df.target_x_coord_1920_1080 <= 720)")
    table += resultTableLine(df, "right third of screen", "df.target_x_coord_1920_1080 > 720")
    table += resultTableLine(df, "position = lead", "df.visual_region == 'lead'")
    table += resultTableLine(df, "position = body", "(df.visual_region == 'body') | (df.visual_region == 'left-body')")
    table += resultTableLine(df, "position = navbox", "df.visual_region == 'navbox'")
    #table += resultTableLine (df, "position = left-body", "df.visual_region == 'left-body'")
    table += resultTableLine(df, "position = infobox", "df.visual_region == 'infobox'")
    print table
def weighted_pagerank():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()

    cursor.execute('SELECT source_article_id, target_article_id, occ FROM link_occurences;')
    result = cursor.fetchall()
    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("int")
    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]
    # filter all nodes that have no edges
    wikipedia = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9, 0.95]:
        print damping
        key = "page_rank_weighted" + str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop, damping=damping)
    print "page_rank"
    for damping in [0.8, 0.85, 0.9, 0.95]:
        print damping
        key = "page_rank" + str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)
    wikipedia.save("output/weightedpagerank/wikipedianetwork_link_occ.xml.gz")
    print 'link_occ done'

    cursor.execute('SELECT source_article_id, target_article_id, sim FROM semantic_similarity group by '
                   'source_article_id, target_article_id;')
    result = cursor.fetchall()
    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("double")
    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]
    # filter all nodes that have no edges
    print 'filter nodes graph tool specific code'
    wikipedia = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9, 0.95]:
        print damping
        key = "page_rank_weighted" + str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop, damping=damping)
    print "page_rank"
    for damping in [0.8, 0.85, 0.9, 0.95]:
        print damping
        key = "page_rank" + str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)
    wikipedia.save("output/weightedpagerank/wikipedianetwork_sem_sim_distinct_links.xml.gz")
    print 'sem sim distinct links done'

    cursor.execute('SELECT source_article_id, target_article_id, sim FROM semantic_similarity;')
    result = cursor.fetchall()
    wikipedia = Graph()
    eprop = wikipedia.new_edge_property("double")
    for link in result:
        e = wikipedia.add_edge(link[0], link[1])
        eprop[e] = link[2]
    # filter all nodes that have no edges
    wikipedia = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)
    print "page_rank_weighted"
    for damping in [0.8, 0.85, 0.9, 0.95]:
        print damping
        key = "page_rank_weighted" + str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop, damping=damping)
    print "page_rank"
    for damping in [0.8, 0.85, 0.9, 0.95]:
        print damping
        key = "page_rank" + str(damping)
        wikipedia.vertex_properties[key] = pagerank(wikipedia, damping=damping)
    wikipedia.save("output/weightedpagerank/wikipedianetwork_sem_sim.xml.gz")
    print 'sem_sim done'
def weighted_pagerank_hyp_engineering(labels):
    # read vocab, graph
    graph = read_pickle(SSD_HOME + "pickle/graph")
    print "loaded graph"
    values = read_pickle(SSD_HOME + "pickle/values")
    values_kcore = read_pickle(SSD_HOME + "pickle/values_kcore")
    # transform kcore values to model going out of the kcore
    values_kcore = [1. / np.sqrt(float(x)) for x in values_kcore]
    print 'kcore values transformation'

    #sem_sim_hyp = read_pickle(SSD_HOME+"pickle/sem_sim_hyp")
    #print "sem_sim_hyp values"
    #lead_hyp = read_pickle(SSD_HOME+"pickle/lead_hyp")
    #infobox_hyp = read_pickle(SSD_HOME+"pickle/infobox_hyp")
    #left_body_hyp = read_pickle(SSD_HOME+"pickle/left-body_hyp")
    #print "gamma values"

    vocab = read_pickle(SSD_HOME + "pickle/vocab")
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)

    hyp_structural = csr_matrix((values, (graph[0], graph[1])), shape=shape, dtype=np.float)
    hyp_kcore = csr_matrix((values_kcore, (graph[0], graph[1])), shape=shape, dtype=np.float)
    print "hyp_kcore"
    del graph
    del values_kcore
    print "after delete"

    # read sem sim from db and create hyp
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id, sim from semantic_similarity', conn)
    print 'map sem sim'
    sem_sim_hyp_i = map_to_hyp_indicies(vocab, df['source_article_id'])
    sem_sim_hyp_j = map_to_hyp_indicies(vocab, df['target_article_id'])
    hyp_sem_sim = csr_matrix((df['sim'].values, (sem_sim_hyp_i, sem_sim_hyp_j)), shape=shape, dtype=np.float)
    print 'done map sem sim'
    print hyp_sem_sim.shape
    del sem_sim_hyp_i
    del sem_sim_hyp_j
    del df

    # read vis from csv and create hyp
    lead = pd.read_csv(TMP + 'lead.tsv', sep='\t')
    lead_i = map_to_hyp_indicies(vocab, lead['source_article_id'])
    lead_j = map_to_hyp_indicies(vocab, lead['target_article_id'])
    lead_v = np.ones(len(lead_i), dtype=np.float)
    hyp_lead = csr_matrix((lead_v, (lead_i, lead_j)), shape=shape, dtype=np.float)
    print 'done map lead'
    print hyp_lead.shape
    del lead
    del lead_i
    del lead_j
    del lead_v

    infobox = pd.read_csv(TMP + 'infobox.tsv', sep='\t')
    infobox_i = map_to_hyp_indicies(vocab, infobox['source_article_id'])
    infobox_j = map_to_hyp_indicies(vocab, infobox['target_article_id'])
    infobox_v = np.ones(len(infobox_i), dtype=np.float)
    hyp_infobox = csr_matrix((infobox_v, (infobox_i, infobox_j)), shape=shape, dtype=np.float)
    print 'done map infobox'
    print hyp_infobox.shape
    del infobox
    del infobox_i
    del infobox_j
    del infobox_v

    left_body = pd.read_csv(TMP + 'left-body.tsv', sep='\t')
    left_body_i = map_to_hyp_indicies(vocab, left_body['source_article_id'])
    left_body_j = map_to_hyp_indicies(vocab, left_body['target_article_id'])
    left_body_v = np.ones(len(left_body_i), dtype=np.float)
    hyp_left_body = csr_matrix((left_body_v, (left_body_i, left_body_j)), shape=shape, dtype=np.float)
    print 'done map left-body'
    print hyp_left_body.shape
    del left_body
    del left_body_i
    del left_body_j
    del left_body_v

    # add the visual hyps to one matrix and set all non zero fields to 1.0
    print 'before gamma'
    hyp_gamma = hyp_left_body + hyp_infobox + hyp_lead
    hyp_gamma.data = np.ones_like(hyp_gamma.data, dtype=np.float)
    print 'after gamma'
    del hyp_left_body
    del hyp_infobox
    del hyp_lead

    # norm
    print "in norm each"
    hyp_structural = norm(hyp_structural)
    hyp_kcore = norm(hyp_kcore)
    hyp_sem_sim = norm(hyp_sem_sim)
    hyp_gamma = norm(hyp_gamma)

    # engineering of hypos and norm again
    hyp_kcore_struct = norm(hyp_structural + hyp_kcore)
    hyp_visual_struct = norm(hyp_structural + hyp_gamma)
    hyp_sem_sim_struct = norm(hyp_structural + hyp_sem_sim)
    hyp_mix_semsim_kcore = norm(hyp_kcore + hyp_sem_sim)
    hyp_mix_semsim_visual = norm(hyp_sem_sim + hyp_gamma)
    hyp_mix_kcore_visual = norm(hyp_kcore + hyp_gamma)
    hyp_all = norm(hyp_kcore + hyp_sem_sim + hyp_gamma)
    hyp_all_struct = norm(hyp_kcore + hyp_sem_sim + hyp_gamma + hyp_structural)
    # note: identical to hyp_kcore_struct and not added to the hypos dict below
    hyp_semsim_struct = norm(hyp_structural + hyp_kcore)

    print 'test hypos'
    hypos = {}
    hypos['hyp_kcore'] = hyp_kcore
    hypos['hyp_sem_sim'] = hyp_sem_sim
    hypos['hyp_visual'] = hyp_gamma
    hypos['hyp_kcore_struct'] = hyp_kcore_struct
    hypos['hyp_visual_struct'] = hyp_visual_struct
    hypos['hyp_sem_sim_struct'] = hyp_sem_sim_struct
    hypos['hyp_mix_semsim_kcore'] = hyp_mix_semsim_kcore
    hypos['hyp_mix_semsim_visual'] = hyp_mix_semsim_visual
    hypos['hyp_mix_kcore_visual'] = hyp_mix_kcore_visual
    hypos['hyp_all'] = hyp_all
    hypos['hyp_all_struct'] = hyp_all_struct

    # load network
    print "weighted page rank engineering"
    wikipedia = load_graph("output/wikipedianetwork.xml.gz")
    #for label, hyp in hypos.iteritems():
    name = '_'.join(labels)
    for label in labels:
        print label
        eprop = create_eprop(wikipedia, hypos[label], vocab)
        wikipedia.edge_properties[label] = eprop
        #for damping in [0.8, 0.85, 0.9 ,0.95]:
        for damping in [0.85]:
            key = label + "_page_rank_weighted_" + str(damping)
            print key
            wikipedia.vertex_properties[key] = pagerank(wikipedia, weight=eprop, damping=damping)
    print 'save network'
    wikipedia.save("output/weightedpagerank/wikipedianetwork_hyp_engineering_" + name + ".xml.gz")
    print 'done'
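# weighted_pagerank_hyp_engineering() calls three helpers that are not part of
# this extract: map_to_hyp_indicies, norm and create_eprop. The following is a
# sketch of plausible implementations, assuming that `vocab` maps article ids
# to matrix indices and that graph vertices are indexed by article id; it is
# not the project's original code.
import numpy as np
from scipy.sparse import csr_matrix

def map_to_hyp_indicies(vocab, ids):
    # map a pandas Series of article ids to their row/column indices
    # in the hypothesis matrices (assumes every id is present in vocab)
    return [vocab[int(i)] for i in ids]

def norm(hyp):
    # row-normalize a sparse hypothesis matrix so each non-empty row sums to 1
    row_sums = np.asarray(hyp.sum(axis=1)).flatten()
    row_sums[row_sums == 0.0] = 1.0
    return csr_matrix(hyp.multiply(1.0 / row_sums[:, np.newaxis]))

def create_eprop(graph, hyp, vocab):
    # copy the hypothesis weight of each (source, target) article pair onto
    # the corresponding graph-tool edge; pairs without a weight stay 0.0
    eprop = graph.new_edge_property("double")
    for e in graph.edges():
        i = vocab.get(int(e.source()))
        j = vocab.get(int(e.target()))
        if i is not None and j is not None:
            eprop[e] = hyp[i, j]
    return eprop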
def correlations_zeros(labels, consider_zeros=True, clickstream_data='', struct=False):
    # load network
    print struct
    name = '_'.join(labels)
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_" + name + ".xml.gz")
    # read counts with zeros
    if consider_zeros:
        article_counts = pd.read_csv(TMP + clickstream_data + 'article_counts.tsv', sep='\t')
        print TMP + clickstream_data + 'article_counts.tsv'
        correlations_weighted_pagerank = {}
        for label in labels:
            if struct:
                label = label[7:]
            for damping in [0.8, 0.85, 0.9]:
                key = label + "_page_rank_weighted_" + str(damping)
                pagerank = wikipedia.vertex_properties[key]
                page_rank_values = list()
                counts = list()
                correlations_values = {}
                for index, row in article_counts.iterrows():
                    counts.append(float(row['counts']))
                    page_rank_values.append(pagerank[wikipedia.vertex(int(row['target_article_id']))])
                print 'pearson'
                p = pearsonr(page_rank_values, counts)
                print p
                correlations_values['pearson'] = p
                print 'spearmanr'
                s = spearmanr(page_rank_values, counts)
                print s
                correlations_values['spearmanr'] = s
                print 'kendalltau'
                k = kendalltau(page_rank_values, counts)
                print k
                correlations_values['kendalltau'] = k
                correlations_weighted_pagerank[key] = correlations_values
        write_pickle(HOME + 'output/correlations/' + clickstream_data + 'correlations_pagerank_' + name + '.obj',
                     correlations_weighted_pagerank)
    else:
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        conn = db._create_connection()
        cursor = conn.cursor()
        # wikipedia graph structural statistics
        results = None
        try:
            if clickstream_data != '':
                results = cursor.execute('select c.curr_id, sum(c.counts) as counts from clickstream_derived c '
                                         'where c.link_type_derived= %s group by c.curr_id;', ("internal-link",))
                results = cursor.fetchall()
            else:
                results = cursor.execute('select c.curr_id, sum(c.counts) as counts from clickstream_derived_en_201501 c '
                                         'where c.link_type_derived= %s group by c.curr_id;', ("internal-link",))
                results = cursor.fetchall()
        except MySQLdb.Error, e:
            print ('error retrieving clickstream counts %s (%d)' % (e.args[1], e.args[0]))
        print 'after sql load'

        correlations_weighted_pagerank = {}
        for label in labels:
            if struct:
                label = label[7:]
            for damping in [0.8, 0.85, 0.9]:
                key = label + "_page_rank_weighted_" + str(damping)
                pagerank = wikipedia.vertex_properties[key]
                correlations = {}
                counts = []
                page_rank_values = []
                for row in results:
                    counts.append(float(row[1]))
                    page_rank_values.append(pagerank[wikipedia.vertex(int(row[0]))])
                print 'pearson'
                p = pearsonr(page_rank_values, counts)
                print p
                correlations['pearson'] = p
                print 'spearmanr'
                s = spearmanr(page_rank_values, counts)
                print s
                correlations['spearmanr'] = s
                print 'kendalltau'
                k = kendalltau(page_rank_values, counts)
                print k
                correlations['kendalltau'] = k
                correlations_weighted_pagerank[key] = correlations
        write_pickle(HOME + 'output/correlations/' + clickstream_data + 'correlations_pagerank_without_zeros' + name + '.obj',
                     correlations_weighted_pagerank)
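# A possible driver for the correlation pipeline above, assuming the visual
# hypothesis TSVs and the pickled graph/vocab/values already exist; the label
# list is only illustrative. Note that correlations_zeros() looks up weighted
# page-rank properties for dampings 0.8, 0.85 and 0.9, while
# weighted_pagerank_hyp_engineering() as written only computes 0.85, so the
# damping lists have to be aligned before running this end to end.
if __name__ == '__main__':
    labels = ['hyp_kcore', 'hyp_sem_sim', 'hyp_visual']
    weighted_pagerank_hyp_engineering(labels)
    pickle_correlations_zeros()
    correlations_zeros(labels)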
from wsd.database import MySQLDatabase
from graph_tool.all import *
from conf import *

db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
conn = db._create_connection()
cursor = conn.cursor()
cursor.execute('SELECT source_article_id, target_article_id FROM link_occurences;')
result = cursor.fetchall()

wikipedia = Graph()
for link in result:
    wikipedia.add_edge(link[0], link[1])

# filter all nodes that have no edges
wikipedia = GraphView(wikipedia, vfilt=lambda v: v.out_degree() + v.in_degree() > 0)

print "clust"
wikipedia.vertex_properties["local_clust"] = local_clustering(wikipedia)
print "page_rank"
wikipedia.vertex_properties["page_rank"] = pagerank(wikipedia)
print "eigenvector_centr"
eigenvalue, eigenvectorcentr = eigenvector(wikipedia)
wikipedia.vertex_properties["eigenvector_centr"] = eigenvectorcentr
print "kcore"
wikipedia.vertex_properties["kcore"] = kcore_decomposition(wikipedia)
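# The script above stops after computing the structural measures; presumably it
# continues by persisting the annotated network. The save path below is an
# assumption, taken from where weighted_pagerank_hyp_engineering() later loads
# the network.
wikipedia.save("output/wikipedianetwork.xml.gz")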