Example #1
File: lexrank.py Project: rayeya/summbug
def lexrank(sentences, build_matrix, target_wc_perc, tokenize_fn, min_score,
            include_quot, damping_factor=0.85, error=0.001,
            title=None):
    assert isinstance(min_score, (int, float)), 'min_score needs to be float, not %s' % type(min_score)
    assert isinstance(include_quot, bool)

    include_sent = [(include_quot or not is_quotation(s.text)) and \
                    not extractive_summary.exclude_sentence(s.text) for s in sentences]

    tokenized_sents = [s.map(text=lambda t:list(tokenize_fn(t)),
                             length=lambda t:len(space_split(s.text)))
                             for s in sentences]
    tokenized_title = tokenize_fn(title) if title is not None else None

    M = build_matrix(sentences, tokenized_sents, include_sent, 
                     tokenized_title=tokenized_title)

    scores = pageRank(M, s=damping_factor, maxerr=error)
    assert len(scores) == len(sentences)
                
    ## normalize scores: 1 is equal to average page rank
    ## round to avoid float precision issues
    sentence_scores = [round(s*len(tokenized_sents), 5) for s in scores]
    sentence_wc = extractive_summary.count_words(tokenized_sents)
    summary = extractive_summary.build_summary(sentences, sentence_scores, sentence_wc, 
                                               target_wc_perc, min_score)
    return summary, sentence_wc
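
For context, the `pageRank` call above takes a (weighted) link matrix plus a damping factor `s` and a convergence tolerance `maxerr`. A minimal power-iteration sketch with that signature, loosely modelled on the diogojc gist cited in the final example below (the exact module each project imports may differ in details such as matrix orientation and dangling-node handling), could look like this:

import numpy as np

def pageRank(G, s=0.85, maxerr=0.001):
    """Sketch only: G[i, j] is the weight of the link i -> j."""
    A = np.asarray(G, dtype=float).T
    n = A.shape[0]

    # Column-stochastic transition matrix: column j holds node j's out-link
    # distribution; nodes with no out-links teleport uniformly instead.
    col_sums = A.sum(axis=0)
    A = np.where(col_sums > 0, A / np.where(col_sums == 0, 1, col_sums), 1.0 / n)

    # Power iteration: r <- s * A @ r + (1 - s) / n until the update is tiny.
    r = np.full(n, 1.0 / n)
    while True:
        r_next = s * A.dot(r) + (1.0 - s) / n
        if np.abs(r_next - r).sum() < maxerr:
            return r_next
        r = r_next

# e.g. pageRank(np.array([[0, 1, 1], [1, 0, 0], [0, 1, 0]])) returns scores summing to ~1
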
Example #2
def testDocMatrix(self):
    links = [
        [0, 2, 2, 3],
        [0],
        [3, 2],
        [0],
    ]
    result = pagerank.pageRank(links)
    self.assertEqual(str(result), '[ 0.36723503  0.0375      0.33665007  0.25861487]')
Example #3
def testOneCircle(self):
    links = [
        [1, 1, 1, 1],
        [2],
        [3],
        [4],
        [0]
    ]
    result = pagerank.pageRank(links, alpha=1.0)
    self.assertEqual(str(result), '[ 0.2  0.2  0.2  0.2  0.2]')
Example #4
def testTwoCircles(self):
    links = [
        [1, 2],
        [2],
        [3],
        [4],
        [0]
    ]
    result = pagerank.pageRank(links)
    self.assertEqual(str(result), '[ 0.2116109   0.12411822  0.2296187   0.22099231  0.21365988]')
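
The three tests above pass an outgoing-links list rather than a matrix: `links[i]` lists the nodes that node `i` points to, apparently with repeated targets (as in `[1, 1, 1, 1]`) counting as multiple links. Purely as an illustration (this `pagerank` module may normalize the list differently internally), such a list maps onto the weighted-matrix form used by the other examples like so:

import numpy as np

def links_to_matrix(links):
    """Hypothetical helper: convert an outgoing-links list to a weight matrix."""
    n = len(links)
    G = np.zeros((n, n))
    for i, targets in enumerate(links):
        for j in targets:
            G[i, j] += 1  # every listed occurrence adds one unit of weight i -> j
    return G

# For testDocMatrix, links = [[0, 2, 2, 3], [0], [3, 2], [0]] becomes
# [[1. 0. 2. 1.]
#  [1. 0. 0. 0.]
#  [0. 0. 1. 1.]
#  [1. 0. 0. 0.]]
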
Example #5
    def format_data(self, dependencygroup):
        header = 'Filename', 'PageRank', 'PageID', 'Outgoing Links'
        converter = pagerank.DependenciesToLinkMatrix(dependencygroup.dependencies)

        matrix = converter.create_matrix()
        ranking = pagerank.pageRank(matrix)
        ids = [idx for idx in range(len(matrix))]
        filenames = [utils.prettify_path(converter.id_to_node_map[nid]) for nid in ids]

        # sort by ranking, highest first
        rowinfos = sorted(zip(filenames, ranking, ids, matrix),
                          key=lambda item: item[1], reverse=True)
        rows = []
        for rowi in rowinfos:
            row = (rowi[0], self._fmt_rank(rowi[1]), str(rowi[2]), str(rowi[3]))
            rows.append(row)
        tbl = tableprint.Table(header, rows)
        tbl.write(self._outstream)
Example #7
def group_info():
    
    algo_type = request.args.get('algo_type')
    cur = get_db().cursor()

    sec_since_start_of_day = time.localtime().tm_sec + \
            time.localtime().tm_min*60 + time.localtime().tm_hour*3600
    start_of_day_epoch = time.time() - sec_since_start_of_day
    week_ago_epoch = start_of_day_epoch - 7*24*3600

    ids = get_db_identifications(cur)

    interact_dict = {}
    for (full_name, uniqname, id) in ids:
        interact_dict[id] = {}
        start_time = week_ago_epoch
        while start_time < time.time():
            end_time = start_time + 24*3600
            interactions = get_db_interactions(cur, id, start_time, end_time)
            interact_dict[id][time.localtime(start_time).tm_wday] = interactions
            start_time = end_time

    data = []

    print(str(ids))

    # pagerank points allocation
    if algo_type == 'pagerank':

        # calculate pagerank on a day-by-day basis
        results = {}
        for day in range(7):
            # create pagerank graph
            pr_graph = []
            for (_, _, my_id) in ids:

                # find interactions with all other ids
                pr_row = []
                for (_, _, their_id) in ids:
                    # sum interactions with this id
                    interactions = 0
                    if their_id in interact_dict[my_id][day]:
                        interactions += interact_dict[my_id][day][their_id]

                    # insert into row
                    pr_row.append(interactions)

                # insert into pagerank graph
                pr_graph.append(pr_row)

            # run pagerank algorithm
            if 0:
                pr_graph = np.array([[0,0,1,0,0,0,0,0],
                                     [0,1,1,0,0,0,0,0],
                                     [1,0,1,1,0,0,0,0],
                                     [0,0,0,1,1,0,0,0],
                                     [0,0,0,0,0,0,1,0],
                                     [0,0,0,0,0,1,1,0],
                                     [0,0,0,1,1,0,1,0],
                                     [0,0,0,0,0,0,0,0]])
 
            print("\nAt time: " + str(time.time()))
            print(str(pr_graph))
            results[day] = pagerank.pageRank(np.array(pr_graph), maxerr = 0.1)
            print(str(results[day]))
            for value in results[day]:
                if math.isnan(value):
                    results[day] = [1/float(len(ids))]*len(ids)
                    break

        # create data for graphing
        id_index = 0
        for (full_name, uniqname, id) in ids:
            total_points = 0
            point_dict = {}
            point_dict['full_name'] = full_name
            point_dict['uniqname'] = uniqname
            point_counts = []
            for day_num in range(7):
                day_points = {}
                day_points['day'] = days[day_num]
                day_points['points'] = round(results[day_num][id_index]*100000)
                total_points += day_points['points']
                point_counts.append(day_points)

            # point_counts needs to be rotated so that the current day is last
            curr_day_num = time.localtime().tm_wday
            point_counts = point_counts[curr_day_num+1:] + point_counts[:curr_day_num+1]

            point_dict['point_counts'] = point_counts
            point_dict['total_points'] = total_points
            data.append(point_dict)

            id_index += 1

    # default is linear points
    else:
        for (full_name, uniqname, id) in ids:
            total_points = 0
            point_dict = {}
            point_dict['full_name'] = full_name
            point_dict['uniqname'] = uniqname
            point_counts = []
            for day_num in range(7):
                day_points = {}
                day_points['day'] = days[day_num]
                points = 0
                for pings in interact_dict[id][day_num].values():
                    points += pings
                    total_points += pings
                day_points['points'] = points
                point_counts.append(day_points)

            # point_counts needs to be rotated so that the current day is last
            curr_day_num = time.localtime().tm_wday
            point_counts = point_counts[curr_day_num+1:] + point_counts[:curr_day_num+1]

            point_dict['point_counts'] = point_counts
            point_dict['total_points'] = total_points
            data.append(point_dict)

    # Sort data by total points
    data = sorted(data, key=lambda record: record['total_points'], reverse=True)
    
    return json.dumps(data)
Example #8
print(fileN)
intrain = extractFailureIntrain(fileintrain, outtrain)

LOC = len(intrain[0])
numTest = len(intrain)

# transpose intrain so that cover has LOC rows (code lines) and numTest columns (tests)
cover = []
for i in range(0, LOC):
    temp = []
    for j in range(0, numTest):
        temp.append(intrain[j][i])
    cover.append(temp)

print("start pagerank")
pr = pagerank.pageRank(cover, MM)
pr.calCompute()
pr.calNormalized()

save = 'C:/Users/Jeongsu Jang/Desktop/2018-1/paper/RBF+pageRank/experiment/tcas/Testv' + str(
    version) + '.txt'
f = open(save, 'w+')
for i in range(0, numTest):
    f.write(str(pr.norTest[i][0]) + "\n")

f.close()

save = 'C:/Users/Jeongsu Jang/Desktop/2018-1/paper/RBF+pageRank/experiment/tcas/Xv' + str(
    version) + '.txt'
f = open(save, 'w+')
for i in range(0, LOC):
	print "---------------------"
	print "## computing histgrams"
	all_word_histgrams = {}
	for imagefname in all_features.keys():
		word_histgram = computeHistograms(fcodebook, all_features[imagefname])
		all_word_histgrams[imagefname] = word_histgram

	print "---------------------"
	print "## build matrix"
	matrix, matrixOrder = buildMatrix(all_word_histgrams)

	# randomWorkVector = [] : consider tfidf score of word in each photo
	randomWorkVector = buildRWVector(matrixOrder, photoTfIdfScore)

	print "---------------------"
	print "## computing pagerank"


    # refer https://gist.github.com/diogojc/1338222/download
	# also refer https://github.com/timothyasp/PageRank
	rank = pagerank.pageRank(matrix, s=.86, rwVector=randomWorkVector)
	rankIndex = rank.argsort()[::-1]
	for i in range(0,30):
		print( str(i) + ": " + str(rank[rankIndex[i]]) + ' - ' + matrixOrder[rankIndex[i]])
		print(photoTfIdfScore[matrixOrder[rankIndex[i]]])
		for title, tfidf in (photoTfIdfScore[matrixOrder[rankIndex[i]]]).iteritems() :
			print( title + ' - ' + str(tfidf))

		# + ' - ' + all_files[rankIndex[i]])
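
The last example passes a `rwVector` keyword, i.e. a personalization (random-walk restart) vector built from per-photo tf-idf scores, presumably so that teleport jumps favour highly weighted photos instead of being uniform. The full signature of that project's `pagerank.pageRank` is not shown here, but as a rough sketch of the idea, the restart distribution simply replaces the uniform `(1 - s) / n` term of standard PageRank:

import numpy as np

def personalized_pagerank(G, restart, s=0.86, maxerr=0.001):
    """Sketch of PageRank with a restart vector (assumed semantics, not the project's code).

    G[i, j] is the weight of the link i -> j; `restart` says where the random
    surfer jumps on teleport steps (e.g. a tf-idf weight per photo).
    """
    A = np.asarray(G, dtype=float).T
    n = A.shape[0]

    # Normalize the restart distribution; dangling columns fall back to it too.
    v = np.asarray(restart, dtype=float)
    v = v / v.sum()
    col_sums = A.sum(axis=0)
    A = np.where(col_sums > 0, A / np.where(col_sums == 0, 1, col_sums), v[:, None])

    # Power iteration with a biased teleport term.
    r = np.full(n, 1.0 / n)
    while True:
        r_next = s * A.dot(r) + (1.0 - s) * v
        if np.abs(r_next - r).sum() < maxerr:
            return r_next
        r = r_next
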