def create_pagerank_features():
    """Build PageRank-derived feature frames for the train and test sets.

    :return: tuple (X_train, X_test) of pandas objects produced by applying
        get_pagerank_value row-wise to the train and test dataframes
    """
    df_train, df_test = load_dataset()

    # Adjacency list keyed by an MD5 hash of each question's text.
    qid_graph = {}

    def add_question_edges(row):
        """Record the row's question pair as an undirected edge in qid_graph."""
        key1 = hashlib.md5(row["question1"].encode('utf-8')).hexdigest()
        key2 = hashlib.md5(row["question2"].encode('utf-8')).hexdigest()
        qid_graph.setdefault(key1, []).append(key2)
        qid_graph.setdefault(key2, []).append(key1)

    # apply() is used purely for its side effect of populating qid_graph.
    df_train.apply(add_question_edges, axis=1)
    df_test.apply(add_question_edges, axis=1)

    pagerank_dict = get_pagerank(qid_graph)

    X_train = df_train.apply(lambda row: get_pagerank_value(row, pagerank_dict), axis=1)

    # Drop the training frame before scoring the test set to lower peak memory.
    del df_train
    gc.collect()

    X_test = df_test.apply(lambda row: get_pagerank_value(row, pagerank_dict), axis=1)
    return X_train, X_test
def process_item(self, item, spider): if not isinstance(item, TaseItem): return item if item['url']: rank = pagerank.get_pagerank(item['url']) try: pagerank = float(rank) self.cur.execute(\ "insert into pagerank (sessionid, date_, symbol, pagerank) " "values (%s, %s, %s, %s)", ( global_time, global_date.isoformat(), item['symbol'], pagerank ) ) except ValueError: pass except MySQLdb.IntegrityError, e: #print 'SQL integrity error: %s' % e log.msg('SQL integrity error: %s' % e)
def execute_lookup(key):
    """Look up the PageRank for *key*.

    :param key: URL (or other key) passed to pagerank.get_pagerank
    :return: the rank on success, or '' when the lookup returned None
    """
    rank = pagerank.get_pagerank(key)
    if rank is None:
        # Lazy %-style logging args: the message is only formatted when
        # debug logging is actually enabled.
        logger.debug('Error looking up pagerank for %s', key)
        return ''
    return rank
#!/usr/bin/env python
# Thanks to Corey Goldberg (http://code.google.com/p/corey-projects/) for his work on pagerank.py
import pagerank

# Look up the PageRank of every URL in urlList.txt (one per line) and dump
# the results into PRdump.txt. 'with' guarantees both files are closed and
# the output is flushed even if a lookup raises.
with open('urlList.txt', 'r') as sourceFile, open('PRdump.txt', 'w') as listDump:
    for line in sourceFile:
        # BUGFIX: lines read from a file keep their trailing newline; without
        # strip() the '\n' was sent to the lookup as part of the URL.
        url = line.strip()
        if not url:
            continue  # ignore blank lines
        rank = pagerank.get_pagerank(url)
        print >>listDump, rank, "\n",
#!/usr/bin/python # -*- coding: utf-8 -*- import pagerank print pagerank.get_pagerank("http://www.fsf.org")
def printSortedPageRankList( pages ): for page in sorted(pages ): url = pages_dict[page] rank = pagerank.get_pagerank(url) print page + "'s pagerank is: " + rank
def get_pageranks(urls=list()): prs = {} for url in urls: print "Processing url: %s" % url prs[url] = pagerank.get_pagerank(url) return prs
#!/usr/bin/env python import pagerank rank = pagerank.get_pagerank('http://www.google.com') print 'google:', rank print "baidu:", pagerank.get_pagerank('http://www.baidu.com') print pagerank.get_pagerank('http://www.csdn.net') print pagerank.get_pagerank('http://www.codeproject.com')
import csv import pprint import pagerank f = csv.writer(open('articles_pr_2.csv', 'wb')) for item in csv.reader(open('articles.csv')): rank = pagerank.get_pagerank(item[1]) item.append(rank) f.writerow(item) print item
#!/usr/bin/env python import pagerank rank = pagerank.get_pagerank('http://www.google.com') print 'google:',rank print "baidu:", pagerank.get_pagerank('http://www.baidu.com') print pagerank.get_pagerank('http://www.csdn.net') print pagerank.get_pagerank('http://www.codeproject.com')