示例#1
0
    def __init__(self,
                 lang,
                 xmlsize=None,
                 inputfilename=None,
                 forcedistrust=False,
                 threshold=0,
                 verbose=False):
        """Initialise the SAX content handler state for one language.

        Args:
            lang: key used to look up the parse parameters in the
                module-level ``i18n`` table.
            xmlsize: stored as ``self.xmlsize``; presumably the size of the
                XML input, used for progress reporting -- confirm with the
                rest of the class.
            inputfilename: stored as ``self.inputfilename``. When given, it
                must contain the substring ``'current'``.
            forcedistrust: accepted but not used in the visible part of
                this constructor.
            threshold: stored as ``self.threshold``.
            verbose: stored as ``self.verbose``.

        Raises:
            KeyError: if ``lang`` is not a key of a dict-like ``i18n``.
            AssertionError: if ``inputfilename`` is given and does not
                contain ``'current'`` (only when assertions are enabled).
        """
        sax.handler.ContentHandler.__init__(self)

        # Configuration handed in by the caller.
        self.lang = lang  # language of the wikipedia network
        self.xmlsize = xmlsize
        self.inputfilename = inputfilename
        self.threshold = threshold
        self.verbose = verbose

        # Mutable parsing state.
        self.read = False
        self.validdisc = False  # True while inside a valid discussion
        self.count = 0
        self.last_perc_print = ''

        # Parse parameters for this language; the first three entries are
        # lowercased so that later comparisons are case insensitive.
        lang_params = i18n[self.lang]
        self.i18n = tuple(lang_params[idx].lower() for idx in (0, 1, 2))

        self.allusers = set()

        # The network is kept in three forms: a Network object (an XDiGraph
        # according to the original author's note) plus separate edge and
        # node lists (lists of tuples, per the same note).
        self.network = Network()
        self.edges = []
        self.nodes = []

        # A supplied input file name must contain 'current'.
        assert not inputfilename or 'current' in inputfilename
示例#2
0
    def getNetwork(self):
        """Build and return a Network from ``self.pages``.

        ``self.pages`` is iterated as ``(user, authors)`` pairs, where
        ``authors`` maps each editing author to a number of edits. Every
        user and every author becomes a node, and each author gets an edge
        pointing to the user whose page they edited, carrying the edit
        count (as a string) under the ``'value'`` key.
        """
        graph = Network()

        for recipient, editors in self.pages:
            graph.add_node(node(recipient))

            for editor, edit_count in editors.iteritems():
                # The editing author is a node as well.
                graph.add_node(node(editor))

                # Edge runs from the author who made the edits to the user
                # who received them.
                source = node(editor)
                target = node(recipient)
                weight = pool({'value': str(edit_count)})
                graph.add_edge(source, target, weight)

        return graph
示例#3
0
def _read_users(path):
    """Return ``{user_id: label}`` parsed from a '::'-separated users file.

    Each non-blank line must have at least four fields. The integer value
    of the first field is the key; the label is built from the first three
    fields plus the occupation name looked up from the fourth field.

    Raises:
        IOError/OSError: if ``path`` cannot be opened.
        KeyError: if the occupation code is not in the table below.
    """
    occupation = {
        "0": "other",
        "1": "academic/educator",
        "2": "artist",
        "3": "clerical/admin",
        "4": "college/grad student",
        "5": "customer service",
        "6": "doctor/health care",
        "7": "executive/managerial",
        "8": "farmer",
        "9": "homemaker",
        "10": "K-12 student",
        "11": "lawyer",
        "12": "programmer",
        "13": "retired",
        "14": "sales/marketing",
        "15": "scientist",
        "16": "self-employed",
        "17": "technician/engineer",
        "18": "tradesman/craftsman",
        "19": "unemployed",
        "20": "writer",
    }

    users = {}
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing empty lines
            fields = line.split('::')
            # NOTE(review): there is no separator between the third field
            # and the occupation name; kept as-is because the label is part
            # of the generated graph.
            users[int(fields[0])] = (fields[0] + ' ' + fields[1] + ' ' +
                                     fields[2] + occupation[fields[3]])
    return users


def _read_ratings_by_user(path):
    """Return ``{user_id: {movie_id: rating}}`` from a '::'-separated file.

    The last field of every line is dropped and the remaining fields are
    converted to ``int``; the first three are used as
    (user id, movie id, rating). If a user rated the same movie more than
    once, the last line wins.

    Raises:
        IOError/OSError: if ``path`` cannot be opened.
        ValueError: if a kept field is not an integer.
    """
    by_user = {}
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing empty lines
            fields = [int(field) for field in line.split('::')[:-1]]
            by_user.setdefault(fields[0], {})[fields[1]] = fields[2]
    return by_user


def _agreement(ratings_u, ratings_v):
    """Return the agreement score of two ``{movie: rating}`` dicts.

    The score is ``number of common movies / sum of absolute rating
    differences`` (the inverse of the average difference). Perfect
    agreement would divide by zero, so it is pinned to ``1.0``.
    Returns ``None`` when the two users share no movie.
    """
    common = [movie for movie in ratings_u if movie in ratings_v]
    if not common:
        return None

    total_diff = sum(abs(ratings_u[movie] - ratings_v[movie])
                     for movie in common)
    if not total_diff:
        return 1.0  # users completely agree
    return 1.0 * len(common) / total_diff  # 1 / average difference


def main():
    """Build a user-agreement graph from the rating files and write it out.

    Reads ``users.dat`` and ``ratings.dat`` from the current directory,
    adds one node per user label, connects every pair of users that rated
    at least one common movie with two opposite edges carrying the
    agreement score as ``{'value': str(score)}``, prints a progress
    percentage per user, and writes the result to ``graph.dot``.
    """
    # NOTE: 'users.dat' is not in the repository, only the .tar.gz archive
    # is; the archive has to be unpacked before running this script.
    # Remove this note if/when that is fixed.
    users = _read_users('users.dat')

    # Ratings are grouped per user once, instead of re-scanning the whole
    # rating list for every pair of users. This also replaces the former
    # ``x[0] is u`` identity test, which only matched small cached integers
    # and therefore silently dropped the ratings of most users.
    ratings_by_user = _read_ratings_by_user('ratings.dat')

    # build graph
    m = Network()

    for u in users:
        m.add_node(users[u])

    total_users = len(users)
    for position, u in enumerate(users):
        print("%d %%" % int(100.0 * (position + 1) / total_users))
        ratings_u = ratings_by_user.get(u, {})

        for v in users:
            if not u < v:
                continue  # handle every unordered pair exactly once

            value = _agreement(ratings_u, ratings_by_user.get(v, {}))
            if value is None:
                continue  # no common movies, no edge

            # NOTE(review): nodes were added under their label strings
            # (users[u]) while edges use the integer ids; confirm whether
            # this mismatch is intended before changing it.
            m.add_edge(u, v, {'value': str(value)})
            m.add_edge(v, u, {'value': str(value)})

    write_dot(m, 'graph.dot')