def main():
    """
    See usage message in module header block
    """
    get_subgraph = False  # if True discard nodes without attribute data
    try:
        opts, args = getopt.getopt(sys.argv[1:], "d")
    except getopt.GetoptError:
        usage(sys.argv[0])
    for opt, arg in opts:
        if opt == "-d":
            get_subgraph = True
        else:
            usage(sys.argv[0])

    if len(args) != 1:
        usage(sys.argv[0])

    data_dir = args[0]

    outputdir = '.'

    sys.stdout.write('loading data from ' + data_dir + '...')
    start = time.time()
    datazipfile = data_dir + os.path.sep + 'physician-shared-patient-patterns-2014-days30.zip'
    G = load_physician_referral_data(datazipfile)
    print time.time() - start, 's'

    snap.PrintInfo(G)

    # Remove loops (self-edges).
    # G is a PNGraph, so multiple edges are not allowed in this graph type anyway.
    snap.DelSelfEdges(G)
    snap.PrintInfo(G)

    # Build an ordered nodelist so that sequential node numbers map
    # consistently to original node ids (the index of a nodeid in nodelist
    # is used as its sequential node number in the output files).
    nodelist = [node.GetId() for node in G.Nodes()]

    graph_filename = outputdir + os.path.sep + "physician_referall_arclist" + os.path.extsep + "txt"
    nodeid_filename = outputdir + os.path.sep + "nodeid" + os.path.extsep + "txt"
    write_graph_file(graph_filename, G, nodelist)
    write_subgraph_nodeids(nodeid_filename, nodelist)
def main():
    """
    See usage message in module header block
    """
    directed = True
    try:
        opts, args = getopt.getopt(sys.argv[1:], "")
    except getopt.GetoptError:
        usage(sys.argv[0])
    for opt, arg in opts:
        usage(sys.argv[0])

    if len(args) != 5:
        usage(sys.argv[0])

    data_dir = args[0]
    num_samples = int(args[1])
    num_seeds = int(args[2])
    num_waves = int(args[3]) - 1  # -1 for consistency with SPNet
    outputdir = args[4]

    print "directed:", directed
    print "number of samples:", num_samples
    print "number of seeds:", num_seeds
    print "number of waves:", num_waves
    print "output directory:", outputdir

    if not os.path.exists(outputdir):
        os.mkdir(outputdir)

    sys.stdout.write('loading data from ' + data_dir + '...')
    start = time.time()
    datazipfile = data_dir + os.path.sep + 'physician-shared-patient-patterns-2014-days30.zip'
    G = load_physician_referral_data(datazipfile)
    print time.time() - start, 's'

    snap.PrintInfo(G)

    # get num_samples * num_seeds distinct random seed nodes (sample without replacement)
    # and convert to list of lists where each list is seed set for one sample
    allseeds = random.sample([node.GetId() for node in G.Nodes()],
                             num_samples * num_seeds)
    seedsets = [
        allseeds[i:i + num_seeds] for i in range(0, len(allseeds), num_seeds)
    ]
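    # For example (illustrative values only): with num_seeds = 2 and
    # allseeds = [101, 205, 307, 410], seedsets == [[101, 205], [307, 410]],
    # i.e. one disjoint seed set per sample.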

    sampledesc_filename = outputdir + os.path.sep + "sampledesc" + os.path.extsep + "txt"
    sampledesc_f = open(sampledesc_filename, 'w')

    for i in range(num_samples):
        sys.stdout.write('generating snowball sample ' + str(i + 1) + '... ')
        start = time.time()
        # have to convert seedset to TIntV for SNAP
        seedsVec = snap.TIntV()
        for nodeid in seedsets[i]:
            seedsVec.Add(nodeid)
        Gsample0 = snowball_sample(G, num_waves, seedsVec)
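        # snowball_sample() is expected to annotate each sampled node with an
        # integer "zone" attribute (presumably the wave at which the node was
        # reached); it is read back below via GetIntAttrDatN().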
        # We would like to renumber nodes so they are numbered 0..N-1, but
        # we can't do that here as it loses the node attributes (zone), so
        # instead we build a dictionary mapping nodeid : zone so that zones
        # can be written to the zone file in the correct order.
        # Note that the index in nodelist of a nodeid can then be used as
        # the sequential node number of each node.
        ##Gsample = snap.ConvertGraph(snap.PNEANet, Gsample0, True)
        Gsample = Gsample0
        nodelist = list()  # keep this iteration in list so we always use same order in future
        zonedict = dict()  # map nodeid : zone
        for node in Gsample.Nodes():
            nodelist.append(node.GetId())
            zonedict[node.GetId()] = Gsample.GetIntAttrDatN(
                node.GetId(), "zone")
        print time.time() - start, 's'

        snap.PrintInfo(Gsample)
        subgraph_filename = (outputdir + os.path.sep + "subgraph" + str(i) +
                             os.path.extsep + "txt")
        write_graph_file(subgraph_filename, Gsample, nodelist)
        subzone_filename = (outputdir + os.path.sep + "subzone" + str(i) +
                            os.path.extsep + "txt")
        write_zone_file(subzone_filename, Gsample, nodelist, zonedict)
        subactor_filename = (outputdir + os.path.sep + "subactor" + str(i) +
                             os.path.extsep + "txt")
        # TODO get actor attributes
        #write_subactors_file(subactor_filename, Gsample, nodelist)

        # format of sampledesc file is:
        # N subzone_filename subgraph_filename subactor_filename
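        # e.g. a line might look like (illustrative values):
        # 2500 <outputdir>/subzone0.txt <outputdir>/subgraph0.txt <outputdir>/subactor0.txt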
        sampledesc_f.write("%d %s %s %s\n" %
                           (Gsample.GetNodes(), subzone_filename,
                            subgraph_filename, subactor_filename))

    sampledesc_f.close()
def main():
    """
    See usage message in module header block
    """
    get_subgraph = False  # if True discard nodes without attribute data
    try:
        opts, args = getopt.getopt(sys.argv[1:], "d")
    except getopt.GetoptError:
        usage(sys.argv[0])
    for opt, arg in opts:
        if opt == "-d":
            get_subgraph = True
        else:
            usage(sys.argv[0])

    if len(args) != 1:
        usage(sys.argv[0])

    data_dir = args[0]

    outputdir = '.'

    sys.stdout.write('loading data from ' + data_dir + '...')
    start = time.time()
    (G, patdata, colnames) = load_nber_patent_data(data_dir)
    print time.time() - start, 's'

    snap.PrintInfo(G)

    # Remove loops (self-edges).
    # For some reason there is actually one self-loop (patent id 5489070).
    # G is a PNGraph, so multiple edges are not allowed in this graph type anyway.
    snap.DelSelfEdges(G)
    snap.PrintInfo(G)

    # We do not add attributes to nodes as SNAP node attributes, as
    # these seem to get lost by various operations (including the subgraph
    # operations we need to use). Instead we maintain them only in the
    # dictionary mapping the original node ids to the attributes -
    # fortunately the original node ids are maintained by
    # GetSubGraph(), so we can use these to index the patdata
    # dictionary in the subgraphs.
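    # e.g. patdata[patid][colnames['COUNTRY']] is how the COUNTRY attribute of
    # the node with original id patid is looked up below.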

    # Cannot do this:
    #patdata[:][colnames['COUNTRY']] = convert_to_int_cat(patdata[:][colnames['COUNTRY']]) # like factor in R
    # as we get "TypeError: unhashable type", so we have to do this instead:
    id_countries = [(k, p[colnames['COUNTRY']])
                    for (k, p) in patdata.iteritems()]
    id_countries_int = convert_to_int_cat([x[1] for x in id_countries])
    for i in xrange(len(id_countries)):
        patdata[id_countries[i][0]][colnames['COUNTRY']] = id_countries_int[i]
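    # Note: the NA counts below rely on convert_to_int_cat() leaving 'NA'
    # values unchanged (presumably only the non-NA categories are recoded
    # to integers).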
    for attr in ['COUNTRY']:
        sys.stdout.write('There are %d NA for %s\n' %
                         ([p[colnames[attr]]
                           for p in patdata.itervalues()].count('NA'), attr))

    id_states = [(k, p[colnames['POSTATE']]) for (k, p) in patdata.iteritems()]
    id_states_int = convert_to_int_cat([x[1] for x in id_states])
    for i in xrange(len(id_states)):
        patdata[id_states[i][0]][colnames['POSTATE']] = id_states_int[i]
    for attr in ['POSTATE']:
        sys.stdout.write('There are %d NA for %s\n' %
                         ([p[colnames[attr]]
                           for p in patdata.itervalues()].count('NA'), attr))

    # There are 3774768 unique patent identifiers in the citation data but
    # only 2923922 unique patent identifiers in the patent data (patdata).
    # The size of the set intersection of these patent ids is 2755865
    # i.e. there is patent data for 73% of the patents in the citation network.
    # Presumably this is because the patdata (pat63_99.txt) contains all
    # utility patents in the period 1963 to 1999 but the citation data
    # cit75_99.txt contains all US patent citations for utility patents
    # granted in the period 1975 to 1999, so there are patent ids in here
    # from earlier periods that are cited by patents in that period,
    # for which therefore we don't have the patent data (prior to 1963).
    # So we set the data for all patents in the network that we have it
    # for, and the rest (27%) to NA.

    nodelist = list()  # keep the iteration below in list so we always use same order in future

    if get_subgraph:
        # get subgraph induced by nodes that have patent data in the
        # pat63_99.txt file
        nodeVec = snap.TIntV()  # nodelist in TIntV format for use in SNAP
        for node in G.Nodes():
            patid = node.GetId()
            if patid in patdata:
                nodelist.append(patid)
                nodeVec.Add(patid)
        G = snap.GetSubGraph(G, nodeVec)
        print 'Subgraph with only nodes with patent attribute data:'
        snap.PrintInfo(G)
    else:
        # keep all the graph and just put NA for all data attributes
        citepatent_count = 0
        patentdata_count = 0
        for node in G.Nodes():
            citepatent_count += 1
            patid = node.GetId()
            nodelist.append(patid)
            #print citepatent_count, patentdata_count, patid  #XXX
            if patid not in patdata:
                #print 'NA for ', patid #XXX
                patdata[patid] = len(colnames) * ["NA"]
                patdata[patid][colnames['HASDATA']] = 0  # no data on this patent
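                # HASDATA presumably flags whether real attribute data exists
                # for the patent; here it is set to 0 for NA-filled patents.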
            else:
                patentdata_count += 1
        sys.stdout.write(
            "There are %d unique cited/citing patents of which %d (%f%%) have patent data\n"
            % (citepatent_count, patentdata_count,
               100 * float(patentdata_count) / citepatent_count))

    graph_filename = outputdir + os.path.sep + "patent_citations" + os.path.extsep + "txt"
    write_graph_file(graph_filename, G, nodelist)
    attributes_binary_filename = outputdir + os.path.sep + "patent_binattr" + os.path.extsep + "txt"
    attributes_categorical_filename = outputdir + os.path.sep + "patent_catattr" + os.path.extsep + "txt"
    attributes_continuous_filename = outputdir + os.path.sep + "patent_contattr" + os.path.extsep + "txt"

    write_attributes_file_binary(attributes_binary_filename, G, nodelist,
                                 patdata, colnames)
    write_attributes_file_categorical(attributes_categorical_filename, G,
                                      nodelist, patdata, colnames)
    write_attributes_file_continuous(attributes_continuous_filename, G,
                                     nodelist, patdata, colnames)

    nodeid_filename = outputdir + os.path.sep + "nodeid" + os.path.extsep + "txt"
    write_subgraph_nodeids(nodeid_filename, nodelist)
def main():
    """
    See usage message in module header block
    """
    get_subgraph = False # if True discard nodes without attribute data
    try:
        opts,args = getopt.getopt(sys.argv[1:], "d")
    except getopt.GetoptError:
        usage(sys.argv[0])
    for opt,arg in opts:
        if opt == "-d":
            get_subgraph = True
        else:
            usage(sys.argv[0])

    if len(args) != 1:
        usage(sys.argv[0])

    data_dir = args[0]

    outputdir = '.'

    sys.stdout.write('loading data from ' + data_dir + '...')
    start = time.time()
    (G, patdata, colnames) = load_epo_patent_data(data_dir)
    print time.time() - start, 's'

    snap.PrintInfo(G)

    # Remove loops (self-edges).
    # For some reason there are actually 92 nodes with self-loops,
    # e.g. EP0021443.
    # G is a PNGraph, so multiple edges are not allowed in this graph type anyway.
    snap.DelSelfEdges(G)
    snap.PrintInfo(G)

    # We do not add attributes to nodes as SNAP node attributes, as
    # these seem to get lost by various operations (including the subgraph
    # operations we need to use). Instead we maintain them only in the
    # dictionary mapping the original node ids to the attributes -
    # fortunately the original node ids are maintained by
    # GetSubGraph(), so we can use these to index the patdata
    # dictionary in the subgraphs.


    # convert categorical attribute values to integers like factor in R
    for cat_colname in ['Language','Country']:
        catvalues = [(k, p[colnames[cat_colname]]) for (k,p) in patdata.iteritems()]
        catvalues_int = convert_to_int_cat([x[1] for x in catvalues])
        for i in xrange(len(catvalues)):
            patdata[catvalues[i][0]][colnames[cat_colname]] = catvalues_int[i]
        sys.stdout.write('There are %d NA for %s\n' % ([p[colnames[cat_colname]] for p in patdata.itervalues()].count('NA'), cat_colname))


    # convert categorical set attribute values to integers like factor in R
    for set_colname in ['Classes','Sections']:
        setvalues = [(k, p[colnames[set_colname]]) for (k,p) in patdata.iteritems()]
        setvalues_int = convert_to_int_set([x[1].split(',') for x in setvalues])
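        # e.g. a Classes value such as 'A61K,C07D' (codes illustrative) is
        # split on ',' into ['A61K', 'C07D'] before being recoded to integers
        # by convert_to_int_set().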
        for i in xrange(len(setvalues)):
            patdata[setvalues[i][0]][colnames[set_colname]] = setvalues_int[i]
        sys.stdout.write('There are %d NA for %s\n' % ([p[colnames[set_colname]] for p in patdata.itervalues()].count('NA'), set_colname))

    nodelist = list()  # keep the iteration below in list so we always use same order in future

    if get_subgraph:
        # get subgraph induced by nodes that have patent data in the
        # EPO patent attribute data
        nodeVec = snap.TIntV() # nodelist in TIntV format for use in SNAP
        for node in G.Nodes():
            patid = node.GetId()
            if patid in patdata:
                nodelist.append(patid)
                nodeVec.Add(patid)
        G = snap.GetSubGraph(G, nodeVec)
        print 'Subgraph with only nodes with patent attribute data:'
        snap.PrintInfo(G)
    else:
        # keep all the graph and just put NA for all data attributes
        citepatent_count = 0
        patentdata_count = 0
        for node in G.Nodes():
            citepatent_count += 1
            patid = node.GetId()
            nodelist.append(patid)
            #print citepatent_count, patentdata_count, patid  #XXX
            if patid not in patdata:
                #print 'NA for ', patid #XXX
                patdata[patid] = len(colnames)*["NA"]
            else:
                patentdata_count += 1
        sys.stdout.write("There are %d unique cited/citing patents of which %d (%f%%) have patent data\n" % (citepatent_count, patentdata_count, 100*float(patentdata_count)/citepatent_count))


    graph_filename = outputdir + os.path.sep + "patent_citations" + os.path.extsep + "txt"
    write_graph_file(graph_filename, G, nodelist)
    attributes_binary_filename = outputdir + os.path.sep + "patent_binattr"  + os.path.extsep + "txt"
    attributes_categorical_filename = outputdir + os.path.sep + "patent_catattr"  + os.path.extsep + "txt"
    attributes_continuous_filename = outputdir + os.path.sep + "patent_contattr" + os.path.extsep + "txt"
    attributes_set_filename = outputdir + os.path.sep + "patent_setattr" + os.path.extsep + "txt"

    write_attributes_file_binary(attributes_binary_filename, G, nodelist, patdata, colnames)
    write_attributes_file_categorical(attributes_categorical_filename, G, nodelist, patdata, colnames)
    write_attributes_file_continuous(attributes_continuous_filename, G, nodelist, patdata, colnames)
    write_attributes_file_set(attributes_set_filename, G, nodelist, patdata, colnames)

    nodeid_filename = outputdir + os.path.sep + "nodeid" + os.path.extsep + "txt"
    write_subgraph_nodeids(nodeid_filename, nodelist)

    # Write the patent sections etc. as the original letter codes (not the
    # integer-converted values). This cannot be used by EstimNetDirected but
    # is useful to read into R and factor there so that the original names
    # are preserved.
    sections_filename = outputdir + os.path.sep + "patent_string_categories" + os.path.extsep + "txt"
    attrnames = ['CPCsections','LanguageCode','CountryCode']
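    # The file written below is space-separated with one header line
    # (CPCsections LanguageCode CountryCode) followed by one row per node,
    # writing 'NA' for missing or 'XX' values.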
    with open(sections_filename, 'w') as f:
        f.write(' '.join(attrnames) + '\n')
        for i in nodelist:
            for attrname in attrnames:
                val = patdata[i][colnames[attrname]]
                val = 'NA' if (val == 'NA' or val == 'XX') else val
                f.write(val)
                if attrname == attrnames[-1]:
                    f.write('\n')
                else:
                    f.write(' ')
def main():
    """
    See usage message in module header block
    """
    directed = True
    try:
        opts, args = getopt.getopt(sys.argv[1:], "")
    except getopt.GetoptError:
        usage(sys.argv[0])
    for opt, arg in opts:
        usage(sys.argv[0])

    if len(args) != 5:
        usage(sys.argv[0])

    data_dir = args[0]
    num_samples = int(args[1])
    num_seeds = int(args[2])
    num_waves = int(args[3]) - 1  # -1 for consistency with SPNet
    outputdir = args[4]

    print "directed:", directed
    print "number of samples:", num_samples
    print "number of seeds:", num_seeds
    print "number of waves:", num_waves
    print "output directory:", outputdir

    if not os.path.exists(outputdir):
        os.mkdir(outputdir)

    sys.stdout.write('loading data from ' + data_dir + '...')
    start = time.time()
    (G, profile, colnames) = load_pokec_data(data_dir)
    print time.time() - start, 's'

    snap.PrintInfo(G)

    # We do not add attributes to nodes as SNAP node attributes, as
    # these seem to get lost by various operations (including the subgraph
    # operations we need to use). Instead we maintain them only in the
    # dictionary mapping the original node ids to the attributes -
    # fortunately the original node ids are maintained by
    # GetSubGraph(), so we can use these to index the profile
    # dictionary in the subgraphs.

    ## https://snap.stanford.edu/data/soc-pokec-readme.txt
    ## region:
    ##   string, mostly regions in Slovakia (example: "zilinsky kraj,
    ##   kysucke nove mesto" means county Zilina, town Kysucke Nove Mesto,
    ##   Slovakia), some foreign countries (example: "zahranicie,
    ##   zahranicie - nemecko" means foreign country Germany (nemecko)),
    ##   some Czech regions (example: "ceska republika, cz - ostravsky
    ##   kraj" means Czech Republic, county Ostrava (ostravsky kraj))
    ## We just make this a factor; looking at the output written by the print
    ## below, it looks reasonable, but it is only a categorical variable
    ## allowing us to tell whether two users are in the same region or not.
    ## TODO we could recode this so that we can have different variables
    ## for being in a different country, major city, etc.
    # Cannot do this:
    #profile[:][colnames['region']] = convert_to_int_cat(profile[:][colnames['region']]) # like factor in R
    # as we get "TypeError: unhashable type", so we have to do this instead:
    id_regions = [(k, p[colnames['region']]) for (k, p) in profile.iteritems()]
    id_regions_int = convert_to_int_cat([x[1] for x in id_regions])
    for i in xrange(len(id_regions)):
        profile[id_regions[i][0]][colnames['region']] = id_regions_int[i]
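    # After this conversion, two users with identical region strings get the
    # same integer code, so region can only be used as a categorical
    # (match / no-match) attribute, as noted above.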

    for attr in ['region']:
        sys.stdout.write('There are %d NA for %s\n' %
                         ([p[colnames[attr]]
                           for p in profile.itervalues()].count('NA'), attr))

    # get num_samples * num_seeds distinct random seed nodes (sample without replacement)
    # and convert to list of lists where each list is seed set for one sample
    allseeds = random.sample([node.GetId() for node in G.Nodes()],
                             num_samples * num_seeds)
    seedsets = [
        allseeds[i:i + num_seeds] for i in range(0, len(allseeds), num_seeds)
    ]

    sampledesc_filename = outputdir + os.path.sep + "sampledesc" + os.path.extsep + "txt"
    sampledesc_f = open(sampledesc_filename, 'w')

    for i in range(num_samples):
        sys.stdout.write('generating snowball sample ' + str(i + 1) + '... ')
        start = time.time()
        # have to convert seedset to TIntV for SNAP
        seedsVec = snap.TIntV()
        for nodeid in seedsets[i]:
            seedsVec.Add(nodeid)
        Gsample = snowball_sample(G, num_waves, seedsVec)
        nodelist = list()  # keep this iteration in list so we always use same order in future
        zonedict = dict()  # map nodeid : zone
        for node in Gsample.Nodes():
            nodelist.append(node.GetId())
            zonedict[node.GetId()] = Gsample.GetIntAttrDatN(
                node.GetId(), "zone")
        print time.time() - start, 's'

        snap.PrintInfo(Gsample)
        subgraph_filename = (outputdir + os.path.sep + "subgraph" + str(i) +
                             os.path.extsep + "txt")
        write_graph_file(subgraph_filename, Gsample, nodelist)
        subzone_filename = (outputdir + os.path.sep + "subzone" + str(i) +
                            os.path.extsep + "txt")
        write_zone_file(subzone_filename, Gsample, nodelist, zonedict)
        subactor_binary_filename = (outputdir + os.path.sep + "subactorbin" +
                                    str(i) + os.path.extsep + "txt")
        subactor_categorical_filename = (outputdir + os.path.sep +
                                         "subactorcat" + str(i) +
                                         os.path.extsep + "txt")
        subactor_continuous_filename = (outputdir + os.path.sep +
                                        "subactorcont" + str(i) +
                                        os.path.extsep + "txt")

        write_subactors_file_binary(subactor_binary_filename, Gsample,
                                    nodelist, profile, colnames)
        write_subactors_file_categorical(subactor_categorical_filename,
                                         Gsample, nodelist, profile, colnames)
        write_subactors_file_continuous(subactor_continuous_filename, Gsample,
                                        nodelist, profile, colnames)

        nodeid_filename = (outputdir + os.path.sep + "subnodeid" + str(i) +
                           os.path.extsep + "txt")
        write_subgraph_nodeids(nodeid_filename, nodelist)

        # format of sampledesc file is:
        # N subzone_filename subgraph_filename binary_filename cat_filename cont_filename
        sampledesc_f.write(
            "%d %s %s %s %s %s\n" %
            (Gsample.GetNodes(), subzone_filename, subgraph_filename,
             subactor_binary_filename, subactor_categorical_filename,
             subactor_continuous_filename))

    sampledesc_f.close()