def writeRdf(self):
    """Publish the ParticipaBR snapshot: translation graph plus metadata graph.

    Serializes the translation graph and a small metadata graph (group,
    platform and provenance flags) as Turtle and RDF/XML under
    ./participabr_snapshot/.
    """
    pub_dir = './participabr_snapshot/'
    if not os.path.isdir(pub_dir):
        os.mkdir(pub_dir)
    translation = P.context(self.translation_graph)
    translation.serialize(pub_dir + 'participabr.ttl', 'turtle')
    c('participation ttl serialized')
    translation.serialize(pub_dir + 'participabr.rdf', 'xml')
    c('participation xml serialized')
    # snapshot metadata: type, id and group/platform flags
    metadata_triples = [
        (self.snapshoturi, a, po.Snapshot),
        # (self.snapshoturi, a, po.ParticipabrSnapshot),
        (self.snapshoturi, po.snapshotID, self.snapshotid),
        (self.snapshoturi, po.isEgo, False),
        (self.snapshoturi, po.isGroup, True),
        (self.snapshoturi, po.isFriendship, True),
        (self.snapshoturi, po.isInteraction, True),
        (self.snapshoturi, po.isPost, True),
        (self.snapshoturi, po.socialProtocol, 'ParticipaBR'),
        (self.snapshoturi, po.dateObtained, datetime.date(2012, 6, 28)),
    ]
    P.add(metadata_triples, self.meta_graph)
    meta = P.context(self.meta_graph)
    meta.serialize(pub_dir + 'participabrMeta.ttl', 'turtle')
    c('participation meta ttl serialized')
    meta.serialize(pub_dir + 'participabrMeta.rdf', 'xml')
    c('participation meta xml serialized')
def rdfTweets(self):
    """Render all tweets as RDF into self.tweet_graph, chunk by chunk.

    Tweets come from one or two legacy pickle files; chunks of up to
    10000 tweets are serialized through writeTweets() and the per-chunk
    contexts are removed afterwards to free memory.
    """
    tweets = []
    if self.pickle_filename1:
        tweets += readPickleTweetFile(
            self.data_path + self.pickle_filename1)[0]
    if self.pickle_filename2:
        # limit chunk to 10k tweets
        tweets, fopen = readPickleTweetChunk(
            self.data_path + self.pickle_filename2, tweets, None, 10000)
    chunk_count = 0
    self.tweets = tweets  # for probing only, remove to release memory
    while tweets:
        c("rendering tweets, chunk:", chunk_count, "ntweets:", len(tweets),
          "snapshotid", self.snapshotid)
        for tweet in tweets:
            tweeturi, triples = self.tweetTriples(tweet)
            if "retweeted_status" in tweet.keys():
                self.nretweets += 1
                # BUGFIX: triplify the original (retweeted) status, not the
                # retweet itself, so po.retweetOf points at the source tweet
                # instead of linking the retweet to itself
                tweeturi0, triples0 = self.tweetTriples(tweet["retweeted_status"])
                triples += triples0
                triples += [(tweeturi, po.retweetOf, tweeturi0)]
            self.ntriples += len(triples)
            P.set_(triples, context=self.tweet_graph)
        c("rendered", self.ntweets, "tweets")
        c("end of chunk:", chunk_count, "ntriples:", self.ntriples)
        self.writeTweets(chunk_count)
        c("chunk has been written")
        chunk_count += 1
        if chunk_count == 2:
            break
        if self.pickle_filename2:
            tweets, fopen = readPickleTweetChunk(None, [], fopen, 10000)
        else:
            tweets = []
    for i in range(chunk_count):  # free memory
        P.context(self.tweet_graph[:-1] + str(i), "remove")
def __init__(self, snapshoturi, snapshotid, filename_friendships="foo.gml",
             data_path="../data/facebook/", final_path="./facebook_snapshots/",
             umbrella_dir="facebook_snapshots/"):
    """Triplify an ego Facebook friendship network given as a GML file.

    Reads data_path+filename_friendships, renders the friendship network
    as RDF (rdfFriendshipNetwork), builds metadata and writes the
    snapshot tree (writeAllFB).
    """
    self.friendship_graph = "social_facebook_friendships"
    self.meta_graph = "social_facebook_meta"
    self.social_graph = "social_facebook"
    # start from clean named graphs
    P.context(self.friendship_graph, "remove")
    P.context(self.meta_graph, "remove")
    self.snapshotid = snapshotid
    self.snapshoturi = snapshoturi
    self.online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(umbrella_dir, self.snapshotid)
    # snapshot type flags
    self.isego = True
    self.isgroup = False
    self.isfriendship = True
    self.isinteraction = False
    self.hastext = False
    self.friendships_anonymized = True
    # friendship_network=x.read_gml(data_path+filename_friendships)
    with open(data_path + filename_friendships) as f:
        lines = f.readlines()
    # parse GML keeping the "id" attribute as node label
    friendship_network = x.readwrite.gml.parse_gml_lines(lines, "id", None)
    # mirror every local variable as an instance attribute
    # NOTE(review): exec on locals; setattr(self, i, locals_[i]) would be safer
    locals_ = locals().copy()
    for i in locals_:
        if i != "self":
            exec("self.{}={}".format(i, i))
    self.rdfFriendshipNetwork(friendship_network)
    self.makeMetadata()
    self.writeAllFB()
def startSession(context="session"):
    """Start a percolation session, creating a default participant if needed.

    Looks up the current user; when none exists, registers a randomly
    nicknamed DefaultParticipant. Then mints a session URI, links it to
    the user in the given context and loads the minimal test ontology,
    data and RDFS inferences.
    """
    current_user_uri = P.get(NS.per.currentUser)  # from rdf.rdflib OK
    now = datetime.now()
    P.context("session", "remove")
    if not current_user_uri:
        nick = randomNick()  # OK
        current_user_uri = P.rdf.timestampedURI(NS.per.Participant, nick, now)  # rdf.rdflib OK
        triples = [
            (current_user_uri, a, NS.per.DefaultParticipant),
            (current_user_uri, NS.per.nick, nick),
            (current_user_uri, NS.per.registered, now),
        ]
        c("Please create a user with P.utils.createUser() ASAP. Registered for now as {} with URI: {}".format(nick, current_user_uri))
    else:
        # BUGFIX: nick was unbound on this branch, so the timestampedURI()
        # call below raised NameError for every returning user. Derive a
        # session label from the tail of the user URI.
        # TODO(review): fetch the persisted NS.per.nick value instead.
        nick = str(current_user_uri).split("/")[-1].split("#")[-1]
        triples = []
    session_uri = P.rdf.timestampedURI(NS.per.Session, nick, now)  # from rdf.rdflib OK
    current_status_uri = NS.per.CurrentStatus  # class in per: ontology OK
    triples += [
        (current_status_uri, NS.per.currentSession, session_uri),
        (session_uri, NS.per.started, now),
        (session_uri, NS.per.user, current_user_uri),
        (current_status_uri, NS.per.currentUser, current_user_uri),
    ]
    P.set_(triples, context=context)  # from rdf.rdflib OK
    # P.rdf.minimumOntology()  # from rdf.ontology
    P.rdf.ontology.minimumTestOntology()  # from rdf.ontology
    # P.legacy.triples.datasets.datasets()  # from legacy.triples
    P.legacy.triples.datasets.minimalTestData()  # from legacy.triples
    P.rdf.inference.performRdfsInference("void", "minimum_ontology", "session_legacy_metadata")  # from rdf.inference
def parseLegacyFiles(data_dir=DATADIR+"twitter/"):
    """Parse legacy pickle files with Twitter tweets"""
    filenames = os.listdir(data_dir)
    # skip editor/session leftovers
    filenames = [i for i in filenames if i != "ipython_log.py" and not i.endswith(".swp")]
    snapshots = set()
    triples = []
    for filename in filenames:
        snapshotid = "twitter-legacy-"+filename.replace("_", "")
        snapshoturi = po.TwitterSnapshot+"#"+snapshotid
        expressed_classes = [po.Participant, po.Tweet]
        expressed_reference = filename.replace("_", "").replace(".pickle", "")
        name_humanized = "Twitter"+expressed_reference
        filesize = os.path.getsize(data_dir+filename)/10**6  # MB
        fileformat = "pickle"
        fileuri = po.File+"#twitter-file-"+filename
        # snapshot- and file-level metadata triples
        triples += [
            (snapshoturi, a, po.Snapshot),
            (snapshoturi, a, po.TwitterSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, False),
            (snapshoturi, po.isGroup, True),
            (snapshoturi, po.isFriendship, False),
            (snapshoturi, po.isInteraction, True),
            (snapshoturi, po.isPost, True),
            (snapshoturi, po.humanizedName, name_humanized),
            (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            (fileuri, po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            (fileuri, po.fileFormat, fileformat),
        ]+[
            (fileuri, po.expressedClass, expressed_class)
            for expressed_class in expressed_classes
        ]
        snapshots.add(snapshoturi)
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    P.context("social_twitter", "remove")
    platformuri = P.rdf.ic(po.Platform, "Twitter", context="social_twitter")
    # NOTE(review): nIRCParsedFiles/nIRCSnapshots look copy-pasted from the
    # IRC parser below — confirm whether twitter-specific predicates were intended
    triples += [
        (NS.social.Session, NS.social.nIRCParsedFiles, nfiles),
        (NS.social.Session, NS.social.nIRCSnapshots, nsnapshots),
        (platformuri, po.dataDir, data_dir),
    ]
    P.add(triples, context="social_twitter")
    c("parsed {} twitter files ({} snapshots) are in percolation graph and 'social_twitter' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_twitter context)".format(len(P.percolation_graph), len(P.context("social_twitter"))))
    # per-flag snapshot counts via SPARQL over the metadata context
    negos = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isEgo true } } ")
    ngroups = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isGroup true } } ")
    nfriendships = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isFriendship true } } ")
    ninteractions = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isInteraction true } } ")
    nposts = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isPost true } } ")
    totalsize = sum(P.query(r" SELECT ?size WHERE { GRAPH <social_twitter> { ?s po:fileSize ?size } } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network.
{} have post texts and reaction counts
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships, ninteractions, nposts, totalsize))
    return snapshots
def __init__(self, snapshoturi, snapshotid, directory="somedir/",
             data_path="../data/", final_path="./gmane_snapshots/",
             umbrella_dir="gmane_snapshotsX/"):
    """Triplify a Gmane mailing-list snapshot from an mbox directory.

    Scans data_path+directory, renders messages and participants as RDF
    (rdfMbox), writes the email graph by 100k-triple chunks and then the
    metadata graph (writeAllGmane).
    """
    c(snapshoturi, snapshotid, directory)
    # snapshot type flags
    isego = False
    isgroup = True
    isfriendship = False
    isinteraction = True
    hastext = True
    interactions_anonymized = False
    translation_graph = "translation"
    meta_graph = "translation_meta"
    gmane_graph = "gmane"
    # start from clean named graphs
    P.context(translation_graph, "remove")
    P.context(meta_graph, "remove")
    final_path_ = "{}{}/".format(final_path, snapshotid)
    online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(
        umbrella_dir, snapshotid)
    # counters filled by rdfMbox (NOTE(review): nurls appears twice in this chain)
    ncc = nto = nlines = nremoved_lines = nurls = nlost_messages = nparticipants = nreferences = totalchars = nurls = nreplies = nmessages = nempty = 0
    dates = []
    nchars_all = []
    ntokens_all = []
    nsentences_all = []
    participantvars = ["emailAddress", "name"]
    messagevars = [
        "author", "createdAt", "replyTo", "messageText", "cleanMessageText",
        "nCharsClean", "nTokensClean", "nSentencesClean", "hasUrl", "nChars",
        "nTokens", "nSentences", "emptyMessage", "gmaneID", "subject", "cc",
        "to", "hasReference", "contentType", "organization", "unparsedCC",
        "unparsedTo", "emailList"
    ]
    messagevars.sort()
    files = os.listdir(data_path + directory)
    if not files:
        # nothing to triplify for this snapshot id
        self.comment = "no files on the snapshot id"
        return
    files.sort()
    nchars_all = []
    ntokens_all = []
    nsentences_all = []
    nchars_clean_all = []
    ntokens_clean_all = []
    nsentences_clean_all = []
    # mirror every local variable as an instance attribute
    # NOTE(review): exec on locals; setattr(self, i, locals_[i]) would be safer
    locals_ = locals().copy()
    del locals_["self"]
    for i in locals_:
        exec("self.{}={}".format(i, i))
    self.rdfMbox()
    # write only when at least one non-empty message was found
    if len(self.files) > self.nempty:
        if not os.path.isdir(final_path_):
            os.mkdir(final_path_)
        self.email_xml, self.size_xml, self.email_ttl, self.size_ttl = P.rdf.writeByChunks(
            self.final_path_ + self.snapshotid + "Email",
            context=self.translation_graph, ntriples=100000)
        self.makeMetadata()
        self.writeAllGmane()
def parseLegacyFiles(data_dir=DATADIR+"irc/"):
    """Parse legacy txt files with irc logs"""
    filenames = os.listdir(data_dir)
    # skip editor/session leftovers
    filenames = [i for i in filenames if i != "ipython_log.py" and not i.endswith(".swp")]
    snapshots = set()
    triples = []
    for filename in filenames:
        snapshotid = "irc-legacy-"+filename.replace("#", "")
        # BUGFIX: snapshot URIs were minted under po.TwitterSnapshot
        # (copy-paste from the twitter parser); use po.IRCSnapshot, which is
        # also what the rdf:type triple below asserts
        snapshoturi = po.IRCSnapshot+"#"+snapshotid
        expressed_classes = [po.Participant, po.IRCMessage]
        expressed_reference = filename.replace("#", "").replace(".txt", "").replace(".log", "")
        name_humanized = "IRC log of channel "+expressed_reference
        filesize = os.path.getsize(data_dir+filename)/10**6  # MB
        fileformat = "txt"
        fileuri = po.File+"#Irc-log-"+filename.replace("#", "")
        # snapshot- and file-level metadata triples
        triples += [
            (snapshoturi, a, po.Snapshot),
            (snapshoturi, a, po.IRCSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, False),
            (snapshoturi, po.isGroup, True),
            (snapshoturi, po.isFriendship, False),
            (snapshoturi, po.isInteraction, True),
            (snapshoturi, po.isPost, True),
            (snapshoturi, po.humanizedName, name_humanized),
            (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            (fileuri, po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            (fileuri, po.fileFormat, fileformat),
        ]+[
            (fileuri, po.expressedClass, expressed_class)
            for expressed_class in expressed_classes
        ]
        snapshots.add(snapshoturi)
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    P.context("social_irc", "remove")
    platformuri = P.rdf.ic(po.Platform, "IRC", context="social_irc")
    triples += [
        (NS.social.Session, NS.social.nIRCParsedFiles, nfiles),
        (NS.social.Session, NS.social.nIRCSnapshots, nsnapshots),
        (platformuri, po.dataDir, data_dir),
    ]
    P.add(triples, context="social_irc")
    # BUGFIX: the message named a nonexistent 'irc_twitter' context
    c("parsed {} irc logs files ({} snapshots) are in percolation graph and 'social_irc' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_irc context)".format(
        len(P.percolation_graph), len(P.context("social_irc"))))
    # per-flag snapshot counts via SPARQL over the metadata context
    negos = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isEgo true } } ")
    ngroups = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isGroup true } } ")
    nfriendships = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isFriendship true } } ")
    ninteractions = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isInteraction true } } ")
    nposts = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isPost true } } ")
    totalsize = sum(P.query(r" SELECT ?size WHERE { GRAPH <social_irc> { ?s po:fileSize ?size } } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship structures.
{} have an interaction structures. {} have texts
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships, ninteractions, nposts, totalsize))
    return snapshots
def __init__(self, snapshoturi, snapshotid, directory="somedir/",
             data_path="../data/", final_path="./gmane_snapshots/",
             umbrella_dir="gmane_snapshotsX/"):
    """Triplify a Gmane mailing-list snapshot from an mbox directory.

    Scans data_path+directory, renders messages and participants as RDF
    (rdfMbox), writes the email graph by 100k-triple chunks and then the
    metadata graph (writeAllGmane).
    """
    c(snapshoturi, snapshotid, directory)
    # snapshot type flags
    isego = False
    isgroup = True
    isfriendship = False
    isinteraction = True
    hastext = True
    interactions_anonymized = False
    translation_graph = "translation"
    meta_graph = "translation_meta"
    gmane_graph = "gmane"
    # start from clean named graphs
    P.context(translation_graph, "remove")
    P.context(meta_graph, "remove")
    final_path_ = "{}{}/".format(final_path, snapshotid)
    online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(umbrella_dir, snapshotid)
    # counters filled by rdfMbox (NOTE(review): nurls appears twice in this chain)
    ncc = nto = nlines = nremoved_lines = nurls = nlost_messages = nparticipants = nreferences = totalchars = nurls = nreplies = nmessages = nempty = 0
    dates = []
    nchars_all = []
    ntokens_all = []
    nsentences_all = []
    participantvars = ["emailAddress", "name"]
    messagevars = ["author", "createdAt", "replyTo", "messageText",
                   "cleanMessageText", "nCharsClean", "nTokensClean",
                   "nSentencesClean", "hasUrl", "nChars", "nTokens",
                   "nSentences", "emptyMessage", "gmaneID", "subject", "cc",
                   "to", "hasReference", "contentType", "organization",
                   "unparsedCC", "unparsedTo", "emailList"]
    messagevars.sort()
    files = os.listdir(data_path+directory)
    if not files:
        # nothing to triplify for this snapshot id
        self.comment = "no files on the snapshot id"
        return
    files.sort()
    nchars_all = []
    ntokens_all = []
    nsentences_all = []
    nchars_clean_all = []
    ntokens_clean_all = []
    nsentences_clean_all = []
    # mirror every local variable as an instance attribute
    # NOTE(review): exec on locals; setattr(self, i, locals_[i]) would be safer
    locals_ = locals().copy()
    del locals_["self"]
    for i in locals_:
        exec("self.{}={}".format(i, i))
    self.rdfMbox()
    # write only when at least one non-empty message was found
    if len(self.files) > self.nempty:
        if not os.path.isdir(final_path_):
            os.mkdir(final_path_)
        self.email_xml, self.size_xml, self.email_ttl, self.size_ttl = P.rdf.writeByChunks(
            self.final_path_+self.snapshotid+"Email",
            context=self.translation_graph, ntriples=100000)
        self.makeMetadata()
        self.writeAllGmane()
def __init__(self, snapshoturi, snapshotid, filenames=("foo.pickle",),
             data_path="../data/twitter/", final_path="./twitter_snapshots/",
             umbrella_dir="twitter_snapshots/"):
    """Triplify a Twitter snapshot stored as one or two legacy pickles.

    Renders tweets (rdfTweets), builds metadata and writes the snapshot
    tree (writeAllTW).
    """
    # decide which pickle(s) hold the tweets
    # NOTE(review): selection by counting "_" in the filename — confirm
    # the naming convention of the legacy dumps
    if len(filenames) == 2:
        pickle_filename1 = filenames[0]
        pickle_filename2 = filenames[1]
    elif filenames[0].count("_") == 1:
        pickle_filename1 = filenames[0]
        pickle_filename2 = ""
    elif filenames[0].count("_") == 2:
        pickle_filename1 = ""
        pickle_filename2 = filenames[0]
    else:
        raise ValueError("Filenames not understood")
    participantvars = ["stringID", "numericID", "screenName", "favouritesCount", "followersCount", "friendsCount",
                      "language", "listedCount", "name", "statusesCount", "createdAt", "utfOffset", "snapshot"]
    participantvars.sort()
    # NOTE(review): "stringID" and "retweetOf" appear twice in this list
    tweetvars = [
        "author", "nChars", "nTokens", "stringID", "createdAt", "message",
        "retweetCount", "language", "inReplyToTweet", "retweetOf",
        "expandedURL", "hashtag", "snapshot", "stringID", "retweetOf",
        "userMention", "media"
    ]
    # snapshot type flags
    isego = False
    isgroup = True
    isfriendship = False
    isinteraction = True
    hastext = True
    interactions_anonymized = False
    tweet_graph = "social_tweets0"  # chunked; trailing digit advanced per chunk
    meta_graph = "social_twitter_meta"
    social_graph = "social_twitter"
    # start from clean named graphs
    P.context(tweet_graph, "remove")
    P.context(meta_graph, "remove")
    final_path_ = "{}{}/".format(final_path, snapshotid)
    online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(
        umbrella_dir, snapshotid)
    # accumulators filled while rendering tweets
    dates = []
    size_rdf = []
    size_ttl = []
    tweet_rdf = []
    tweet_ttl = []
    nchars_all = []
    ntokens_all = []
    ntriples = nhashtags = nmedia = nlinks = nuser_mentions = nparticipants = nretweets = ntweets = nreplies = anonymous_user_count = anonymous_tweet_count = 0
    # mirror every local variable as an instance attribute
    # NOTE(review): exec on locals; setattr(self, i, locals_[i]) would be safer
    locals_ = locals().copy()
    del locals_["self"]
    for i in locals_:
        exec("self.{}={}".format(i, i))
    self.rdfTweets()
    self.makeMetadata()
    self.writeAllTW()
def __init__(self, snapshoturi, snapshotid, filename_friendships=None,
             filename_interactions=None, filename_posts=None,
             data_path="../data/facebook/", final_path="./facebook_snapshots/",
             umbrella_dir="facebook_snapshots/"):
    """Triplify a group Facebook snapshot: friendships, interactions, posts.

    Each optional filename toggles the corresponding section of the
    snapshot (isfriendship / isinteraction / hastext); metadata and the
    linked-data tree are written at the end.
    """
    self.friendship_graph = "social_facebook_friendships"
    self.interaction_graph = "social_facebook_interactions"
    self.meta_graph = "social_facebook_meta"
    self.posts_graph = "social_facebook_posts"
    self.social_graph = "social_facebook"
    # start from clean named graphs
    P.context(self.friendship_graph, "remove")
    P.context(self.interaction_graph, "remove")
    P.context(self.meta_graph, "remove")
    P.context(self.posts_graph, "remove")
    self.snapshotid = snapshotid
    self.snapshoturi = snapshoturi
    # BUGFIX: the "Open\ LinkedSocialData" string continuation embedded the
    # continuation line's leading whitespace inside the URL; keep the URL on
    # one line (matches the sibling constructor in this file)
    self.online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(umbrella_dir, self.snapshotid)
    self.isfriendship = bool(filename_friendships)
    self.isinteraction = bool(filename_interactions)
    self.hastext = bool(filename_posts)
    self.nfriends = self.nfriendships = self.ninteracted = \
        self.ninteractions = self.nposts = 0
    if self.isfriendship:
        # return networkx graph
        fnet = readGDF(data_path + filename_friendships)
        # writes to self.friendship_graph
        fnet_ = self.rdfFriendshipNetwork(fnet)
    if self.isinteraction:
        inet = readGDF(data_path + filename_interactions)  # to networkx
        self.rdfInteractionNetwork(inet)  # to self.interaction_graph
    else:
        self.groupid2 = 0
    if self.hastext:
        self.rdfGroupPosts(data_path + filename_posts)  # to self.posts_graph
    self.observation_count = 0
    # mirror every local variable as an instance attribute.
    # BUGFIX: exec("self.x='{}'") broke on strings containing quotes;
    # setattr is equivalent for every value and injection-safe
    locals_ = locals().copy()
    for i in locals_:
        if i != "self":
            setattr(self, i, locals_[i])
    self.makeMetadata()  # rdflib graph with metadata
    self.writeAllFB()  # write linked data tree
def __init__(self, snapshoturi, snapshotid, filename="foo.txt",
             data_path="../data/irc/", final_path="./irc_snapshots/",
             umbrella_dir="irc_snapshots/"):
    """Triplify an IRC log snapshot from a single text file.

    Renders the log (rdfLog), builds metadata and writes the snapshot
    tree (writeAllIRC).
    """
    c(snapshoturi, snapshotid, filename)
    # snapshot type flags
    isego = False
    isgroup = True
    isfriendship = False
    isinteraction = True
    hastext = True
    interactions_anonymized = False
    irc_graph = "social_log"
    meta_graph = "social_irc_meta"
    social_graph = "social_irc"
    # start from clean named graphs
    P.context(irc_graph, "remove")
    P.context(meta_graph, "remove")
    final_path_ = "{}{}/".format(final_path, snapshotid)
    online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(
        umbrella_dir, snapshotid)
    # counters filled by rdfLog
    naamessages = nurls = ndirect = nmention = 0
    dates = []
    nchars_all = []
    ntokens_all = []
    nsentences_all = []
    participantvars = ["nick"]
    messagevars = [
        "author", "createdAt", "mentions", "directedTo", "systemMessage",
        "text", "cleanMessageText", "nChars", "nTokens", "nSentences",
        "url", "emptyMessage"
    ]
    messagevars.sort()
    # mirror every local variable as an instance attribute.
    # IMPROVED: setattr instead of exec-built assignment strings —
    # equivalent, and robust for arbitrary values
    locals_ = locals().copy()
    del locals_["self"]
    for i in locals_:
        setattr(self, i, locals_[i])
    self.rdfLog()
    self.makeMetadata()
    self.writeAllIRC()
def writeTweets(self, chunk_count):
    """Serialize the current tweet-chunk graph to Turtle and RDF/XML.

    Writes <snapshotid>TweetNNNNN.{ttl,rdf} under final_path_, records
    file names and sizes (MB) on self, then removes the chunk context.
    """
    # make sure the output directories exist (parent first)
    for directory in (self.final_path, self.final_path_):
        if not os.path.isdir(directory):
            os.mkdir(directory)
    stem = self.snapshotid + "Tweet{:05d}".format(chunk_count)
    graph = P.context(self.tweet_graph)
    graph.namespace_manager.bind("po", po)
    ttl_name = stem + ".ttl"
    rdf_name = stem + ".rdf"
    graph.serialize(self.final_path_ + ttl_name, "turtle")
    c("ttl")
    graph.serialize(self.final_path_ + rdf_name, "xml")
    # bookkeeping: file names and sizes in MB
    self.tweet_ttl += [ttl_name]
    self.size_ttl += [os.path.getsize(self.final_path_ + ttl_name) / (10**6)]
    self.tweet_rdf += [rdf_name]
    self.size_rdf += [os.path.getsize(self.final_path_ + rdf_name) / (10**6)]
    # self.tweet_graph = self.tweet_graph[:-1]+str(chunk_count+1)
    # free the chunk graph
    P.context(self.tweet_graph, 'remove')
def __init__(self, snapshoturi, snapshotid, filenames=("foo.pickle",),
             data_path="../data/twitter/", final_path="./twitter_snapshots/",
             umbrella_dir="twitter_snapshots/"):
    """Triplify a Twitter snapshot stored as one or two legacy pickles.

    Renders tweets (rdfTweets), builds metadata and writes the snapshot
    tree (writeAllTW).
    """
    # decide which pickle(s) hold the tweets
    # NOTE(review): selection by counting "_" in the filename — confirm
    # the naming convention of the legacy dumps
    if len(filenames) == 2:
        pickle_filename1 = filenames[0]
        pickle_filename2 = filenames[1]
    elif filenames[0].count("_") == 1:
        pickle_filename1 = filenames[0]
        pickle_filename2 = ""
    elif filenames[0].count("_") == 2:
        pickle_filename1 = ""
        pickle_filename2 = filenames[0]
    else:
        raise ValueError("Filenames not understood")
    participantvars = ["stringID", "numericID", "screenName", "favouritesCount", "followersCount", "friendsCount",
                      "language", "listedCount", "name", "statusesCount", "createdAt", "utfOffset", "snapshot"]
    participantvars.sort()
    # NOTE(review): "stringID" and "retweetOf" appear twice in this list
    tweetvars = ["author", "nChars", "nTokens", "stringID", "createdAt", "message", "retweetCount", "language", "inReplyToTweet", "retweetOf", "expandedURL", "hashtag", "snapshot", "stringID", "retweetOf", "userMention", "media"]
    # snapshot type flags
    isego = False
    isgroup = True
    isfriendship = False
    isinteraction = True
    hastext = True
    interactions_anonymized = False
    tweet_graph = "social_tweets0"  # chunked; trailing digit advanced per chunk
    meta_graph = "social_twitter_meta"
    social_graph = "social_twitter"
    # start from clean named graphs
    P.context(tweet_graph, "remove")
    P.context(meta_graph, "remove")
    final_path_ = "{}{}/".format(final_path, snapshotid)
    online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(umbrella_dir, snapshotid)
    # accumulators filled while rendering tweets
    dates = []
    size_rdf = []
    size_ttl = []
    tweet_rdf = []
    tweet_ttl = []
    nchars_all = []
    ntokens_all = []
    ntriples = nhashtags = nmedia = nlinks = nuser_mentions = nparticipants = nretweets = ntweets = nreplies = anonymous_user_count = anonymous_tweet_count = 0
    # mirror every local variable as an instance attribute
    # NOTE(review): exec on locals; setattr(self, i, locals_[i]) would be safer
    locals_ = locals().copy()
    del locals_["self"]
    for i in locals_:
        exec("self.{}={}".format(i, i))
    self.rdfTweets()
    self.makeMetadata()
    self.writeAllTW()
def writeAllTW(self):
    """Write the snapshot metadata graph as Turtle and RDF/XML files."""
    graph = P.context(self.meta_graph)
    graph.namespace_manager.bind("po", po)
    base = self.final_path_ + self.snapshotid + "Meta"
    graph.serialize(base + ".ttl", "turtle")
    c("ttl")
    graph.serialize(base + ".rdf", "xml")
    c("serialized meta")
def writeAllGmane(self):
    """Write the snapshot metadata graph as Turtle and RDF/XML files."""
    g = P.context(self.meta_graph)
    # FIX: the "po" prefix was bound twice on this graph; once suffices
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_+self.snapshotid+"Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_+self.snapshotid+"Meta.rdf", "xml")
    c("serialized meta")
def writeAllIRC(self):
    """Write the snapshot metadata graph as Turtle and RDF/XML files."""
    graph = P.context(self.meta_graph)
    graph.namespace_manager.bind("po", po)
    base = self.final_path_ + self.snapshotid + "Meta"
    graph.serialize(base + ".ttl", "turtle")
    c("ttl")
    graph.serialize(base + ".rdf", "xml")
    c("serialized meta")
def writeAll(self):
    """Write metadata graph, triple count, triplify script and README.

    Serializes the metadata graph (with its own triple count added),
    copies the triplification script into scripts/ and renders a README
    describing the snapshot.
    """
    g = P.context(self.meta_graph)
    ntriples = len(g)
    triples = [
        (self.snapshoturi, po.nMetaTriples, ntriples),
    ]
    P.add(triples, context=self.meta_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_+self.snapshotid+"Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_+self.snapshotid+"Meta.rdf", "xml")
    c("serialized meta")
    if not os.path.isdir(self.final_path_+"scripts"):
        os.mkdir(self.final_path_+"scripts")
    shutil.copy(PACKAGEDIR+"/../tests/triplify.py",
                self.final_path_+"scripts/triplify.py")
    # copia do base data
    self.dates = [i.isoformat() for i in self.dates]
    date1 = min(self.dates)
    date2 = max(self.dates)
    # BUGFIX: tinteraction/tposts were referenced in the format() call below
    # but never defined, raising NameError. Default to empty sections.
    # TODO(review): build real interaction/posts summaries as in writeAllTW.
    tinteraction = ""
    tposts = ""
    with open(self.final_path_+"README", "w") as f:
        f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the IRC snapshot {snapid} with tweets from {date1} to {date2} (total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file: {mrdf}
\nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::""".format(
            snapid=self.snapshotid, date1=date1, date2=date2, ntrip=self.ntriples,
            tinteraction=tinteraction,
            tposts=tposts,
            mrdf=self.translation_xml,
            mttl=self.translation_ttl,
            ise=self.isego,
            isg=self.isgroup,
            isf=self.isfriendship,
            isi=self.isinteraction,
            ist=self.hastext,
            ava=self.online_prefix,
            desc=self.desc
        ))
def __init__(self, snapshoturi, snapshotid, filename_friendships="foo.gml",
             data_path="../data/facebook/", final_path="./facebook_snapshots/",
             umbrella_dir="facebook_snapshots/"):
    """Triplify an ego Facebook friendship network given as a GML file.

    Reads data_path+filename_friendships, renders the friendship network
    as RDF (rdfFriendshipNetwork), builds metadata and writes the
    snapshot tree (writeAllFB).
    """
    self.friendship_graph = "social_facebook_friendships"
    self.meta_graph = "social_facebook_meta"
    self.social_graph = "social_facebook"
    # start from clean named graphs
    P.context(self.friendship_graph, "remove")
    P.context(self.meta_graph, "remove")
    self.snapshotid = snapshotid
    self.snapshoturi = snapshoturi
    # BUGFIX: the "…/\ OpenLinkedSocialData" string continuation embedded the
    # continuation line's leading whitespace inside the URL; keep the URL on
    # one line (matches the sibling constructor in this file)
    self.online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(umbrella_dir, self.snapshotid)
    # participant_uri = P.rdf.ic(po.Snapshot, self.snapshotid,
    #                            self.friendship_graph)
    self.isego = True
    self.isgroup = False
    self.isfriendship = True
    self.isinteraction = False
    self.hastext = False
    self.friendships_anonymized = True
    # friendship_network=x.read_gml(data_path+filename_friendships)
    with open(data_path + filename_friendships) as f:
        lines = f.readlines()
    # parse GML keeping the "id" attribute as node label
    friendship_network = x.readwrite.gml.parse_gml_lines(lines, "id", None)
    # mirror every local variable as an instance attribute
    locals_ = locals().copy()
    for i in locals_:
        if i != "self":
            exec("self.{}={}".format(i, i))
    self.rdfFriendshipNetwork(friendship_network)
    self.makeMetadata()
    self.writeAllFB()
def rdfTweets(self):
    """Render all tweets as RDF into self.tweet_graph, chunk by chunk.

    Tweets come from one or two legacy pickle files; chunks of up to
    10000 tweets are serialized through writeTweets() and the per-chunk
    contexts are removed afterwards to free memory.
    """
    tweets = []
    if self.pickle_filename1:
        tweets += readPickleTweetFile(self.data_path + self.pickle_filename1)[0]
    if self.pickle_filename2:
        # limit chunk to 10k tweets
        tweets, fopen = readPickleTweetChunk(
            self.data_path + self.pickle_filename2, tweets, None, 10000)
    chunk_count = 0
    self.tweets = tweets  # for probing only, remove to release memory
    while tweets:
        c("rendering tweets, chunk:", chunk_count, "ntweets:",
          len(tweets), "snapshotid", self.snapshotid)
        for tweet in tweets:
            tweeturi, triples = self.tweetTriples(tweet)
            if "retweeted_status" in tweet.keys():
                self.nretweets += 1
                # BUGFIX: triplify the original (retweeted) status, not the
                # retweet itself, so po.retweetOf points at the source tweet
                # instead of linking the retweet to itself
                tweeturi0, triples0 = self.tweetTriples(tweet["retweeted_status"])
                triples += triples0
                triples += [(tweeturi, po.retweetOf, tweeturi0)]
            self.ntriples += len(triples)
            P.set_(triples, context=self.tweet_graph)
        c("rendered", self.ntweets, "tweets")
        c("end of chunk:", chunk_count, "ntriples:", self.ntriples)
        self.writeTweets(chunk_count)
        c("chunk has been written")
        chunk_count += 1
        if chunk_count == 2:
            break
        if self.pickle_filename2:
            tweets, fopen = readPickleTweetChunk(None, [], fopen, 10000)
        else:
            tweets = []
    for i in range(chunk_count):  # free memory
        P.context(self.tweet_graph[:-1] + str(i), "remove")
def writeTranslates(self, mode="full"):
    """Serialize the translation graph; only "full" mode is implemented.

    In "full" mode writes <snapshotid>Translation.{ttl,rdf} under
    final_path_ and records file names, sizes (MB) and triple count.
    """
    c("mode full or chunk or multigraph write:", mode)
    if mode == "full":
        graph = P.context(self.translation_graph)
        self.translation_ttl = self.snapshotid + "Translation.ttl"
        self.translation_xml = self.snapshotid + "Translation.rdf"
        graph.serialize(self.final_path_ + self.translation_ttl, "turtle")
        c("ttl")
        graph.serialize(self.final_path_ + self.translation_xml, "xml")
        # bookkeeping: sizes in MB and number of triples written
        self.translation_size_ttl = os.path.getsize(
            self.final_path_ + self.translation_ttl) / 10**6
        self.translation_size_xml = os.path.getsize(
            self.final_path_ + self.translation_xml) / 10**6
        self.ntranslation_triples = len(graph)
    elif mode == "chunk":  # writeByChunks
        raise NotImplementedError("Perform P.utils.writeByChunks on self.translation_graph")
    elif mode == "multigraph":
        raise NotImplementedError("Perform serialize(write) on each of the self.translation_graphs")
def writeTranslates(self, mode="full"):
    """Serialize the translation graph; only "full" mode is implemented.

    In "full" mode writes <snapshotid>Translation.{ttl,rdf} under
    final_path_ and records file names, sizes (MB) and triple count.
    """
    c("mode full or chunk or multigraph write:", mode)
    if mode == "full":
        graph = P.context(self.translation_graph)
        self.translation_ttl = self.snapshotid + "Translation.ttl"
        self.translation_xml = self.snapshotid + "Translation.rdf"
        graph.serialize(self.final_path_ + self.translation_ttl, "turtle")
        c("ttl")
        graph.serialize(self.final_path_ + self.translation_xml, "xml")
        # bookkeeping: sizes in MB and number of triples written
        self.size_ttl = os.path.getsize(
            self.final_path_ + self.translation_ttl) / 10**6
        self.size_xml = os.path.getsize(
            self.final_path_ + self.translation_xml) / 10**6
        self.ntranslation_triples = len(graph)
    elif mode == "chunk":  # writeByChunks
        raise NotImplementedError("Perform P.utils.writeByChunks on self.translation_graph")
    elif mode == "multigraph":
        raise NotImplementedError("Perform serialize(write) on each of the self.translation_graphs")
def performRdfsInference(data_context=None, ontology_context=None,
                         inferred_context=None, clean_inferred_context=True):
    """Run RDFS inference to a fixed point.

    A first pass derives triples from data_context into inferred_context;
    further passes re-run the rules over inferred_context itself until the
    inferred graph stops growing.
    """
    if clean_inferred_context:
        # start from an empty inference graph
        P.context(inferred_context, "remove")
    previous_count = len(P.context(inferred_context))
    rdfsInferenceIterate(data_context, ontology_context, inferred_context)
    new_count = len(P.context(inferred_context))
    while previous_count != new_count:
        previous_count = new_count
        rdfsInferenceIterate(inferred_context, ontology_context, inferred_context)
        new_count = len(P.context(inferred_context))
    c("should have all triples resulting from a rdfs subclass subproperty range and domain assertions")
def writeRdf(self):
    """Publish the Cidade Democratica snapshot: chunked translation graph plus metadata."""
    pub_dir = './cidadedemocratica_snapshot/'
    if not os.path.isdir(pub_dir):
        os.mkdir(pub_dir)
    # the translation graph is large: write it split in 100k-triple chunks
    P.rdf.writeByChunks(pub_dir + 'cidadedemocratica',
                        context=self.translation_graph, ntriples=100000)
    # metadata: group, platform,
    meta = P.context(self.meta_graph)
    meta.serialize(pub_dir + 'cidadedemocraticaMeta.ttl', 'turtle')
    c('participation meta ttl serialized')
    meta.serialize(pub_dir + 'cidadedemocraticaMeta.rdf', 'xml')
    c('participation meta xml serialized')
def writeTweets(self, chunk_count):
    """Serialize the current tweet-chunk graph and advance to the next chunk graph.

    Writes <snapshotid>TweetNNNNN.{ttl,rdf} under final_path_ and records
    file names and sizes (MB). Unlike the variant that removes the chunk
    context here, this version only renames self.tweet_graph; the chunk
    contexts are removed later by rdfTweets().
    """
    if not os.path.isdir(self.final_path):
        os.mkdir(self.final_path)
    if not os.path.isdir(self.final_path_):
        os.mkdir(self.final_path_)
    filename = self.snapshotid+"Tweet{:05d}".format(chunk_count)
    g = P.context(self.tweet_graph)
    g.namespace_manager.bind("po", po)
    tttl = filename+".ttl"
    trdf = filename+".rdf"
    g.serialize(self.final_path_+tttl, "turtle")
    c("ttl")
    g.serialize(self.final_path_+trdf, "xml")
    # file sizes in MB
    filesizettl = os.path.getsize(self.final_path_+tttl)/(10**6)
    filesizerdf = os.path.getsize(self.final_path_+trdf)/(10**6)
    self.tweet_ttl += [tttl]
    self.size_ttl += [filesizettl]
    self.tweet_rdf += [trdf]
    self.size_rdf += [filesizerdf]
    # point at the next chunk's context, e.g. social_tweets0 -> social_tweets1
    # NOTE(review): assumes the graph name ends in a single character slot
    self.tweet_graph = self.tweet_graph[:-1]+str(chunk_count+1)
def __init__(self, snapshoturi, snapshotid, filename_friendships=None,
             filename_interactions=None, filename_posts=None,
             data_path="../data/facebook/", final_path="./facebook_snapshots/",
             umbrella_dir="facebook_snapshots/"):
    """Triplify a group Facebook snapshot: friendships, interactions, posts.

    Each optional filename toggles the corresponding section of the
    snapshot (isfriendship / isinteraction / hastext); metadata and the
    linked-data tree are written at the end.
    """
    self.friendship_graph = "social_facebook_friendships"
    self.interaction_graph = "social_facebook_interactions"
    self.meta_graph = "social_facebook_meta"
    self.posts_graph = "social_facebook_posts"
    self.social_graph = "social_facebook"
    # start from clean named graphs
    P.context(self.friendship_graph, "remove")
    P.context(self.interaction_graph, "remove")
    P.context(self.meta_graph, "remove")
    P.context(self.posts_graph, "remove")
    self.snapshotid = snapshotid
    self.snapshoturi = snapshoturi
    self.online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(umbrella_dir, self.snapshotid)
    self.isfriendship = bool(filename_friendships)
    self.isinteraction = bool(filename_interactions)
    self.hastext = bool(filename_posts)
    self.nfriends = self.nfriendships = self.ninteracted = self.ninteractions = self.nposts = 0
    if self.isfriendship:
        fnet = readGDF(data_path+filename_friendships)  # return networkx graph
        fnet_ = self.rdfFriendshipNetwork(fnet)  # writes to self.friendship_graph
    if self.isinteraction:
        inet = readGDF(data_path+filename_interactions)  # return networkx graph
        self.rdfInteractionNetwork(inet)  # writes to self.interaction_graph
    else:
        self.groupid2 = 0
    if self.hastext:
        self.rdfGroupPosts(data_path+filename_posts)  # writes to self.posts_graph
    # mirror every local variable as an instance attribute
    # NOTE(review): exec with "'{}'" breaks on strings containing quotes;
    # setattr(self, i, locals_[i]) would be safer
    locals_ = locals().copy()
    for i in locals_:
        if i != "self":
            if isinstance(locals_[i], str):
                exec("self.{}='{}'".format(i, locals_[i]))
            else:
                exec("self.{}={}".format(i, locals_[i]))
    self.makeMetadata()  # return rdflib graph with metadata about the structure
    self.writeAllFB()  # write linked data tree
def publishAll(mysqldb=None, mongoshouts=None, irclogs=None, oreshouts=None):
    """express aa shouts as RDF for publishing

    Each non-empty argument selects a source (MySQL, Mongo, IRC logs,
    ORE shouts); its translation and metadata graphs are serialized as
    Turtle and RDF/XML under ./aa_snapshots/.
    """
    pub_dir = './aa_snapshots/'
    if not os.path.isdir(pub_dir):
        os.mkdir(pub_dir)
    # BUGFIX: irclog was only bound inside the irclogs loop, so the return
    # statement raised NameError whenever irclogs was empty or None
    irclog = None
    if mysqldb:
        c("before mysql publishing")
        mysqldb = MysqlPublishing(mysqldb)
        g = P.context(mysqldb.translation_graph)
        g.serialize(pub_dir+"aamysql.ttl", "turtle")
        c("mysql ttl ok")
        g.serialize(pub_dir+"aamysql.rdf", "xml")
        c("mysql ok")
        g = P.context(mysqldb.meta_graph)
        g.serialize(pub_dir+"aamysqlMeta.ttl", "turtle")
        c("mysql ttl ok")
        g.serialize(pub_dir+"aamysqlMeta.rdf", "xml")
        c("mysql ok")
    if mongoshouts:
        mongoshouts = MongoPublishing(mongoshouts)
        g = P.context(mongoshouts.translation_graph)
        g.serialize(pub_dir+"aamongo.ttl", "turtle")
        c("mongo ttl ok")
        g.serialize(pub_dir+"aamongo.rdf", "xml")
        c("mongo ok")
        g = P.context(mongoshouts.meta_graph)
        g.serialize(pub_dir+"aamongoMeta.ttl", "turtle")
        c("mongo ttl ok")
        g.serialize(pub_dir+"aamongoMeta.rdf", "xml")
        c("mongo ok")
    if irclogs:
        # accumulate all logs into single translation/metadata graphs
        g = r.Graph()
        gm = r.Graph()
        for irclog in irclogs:  # filenames
            irclog = LogPublishing(irclog)
            g += P.context(irclog.translation_graph)
            gm += P.context(irclog.meta_graph)
        g.serialize(pub_dir+"aairc.ttl", "turtle")
        c("irc ttl ok")
        g.serialize(pub_dir+"aairc.rdf", "xml")
        c("irc ok")
        gm.serialize(pub_dir+"aaircMeta.ttl", "turtle")
        c("irc ttl ok")
        gm.serialize(pub_dir+"aaircMeta.rdf", "xml")
        c("irc ok")
    if oreshouts:
        oreshouts = OrePublishing(oreshouts)
        c("ore ok")
    return mysqldb, mongoshouts, irclog, oreshouts
def rdfsInferenceIterate(data_context=None, ontology_context=None, inferred_context=None):
    """Run one pass of naive RDFS forward-chaining inference.

    Reads rdfs:subClassOf, rdfs:subPropertyOf, rdfs:domain and rdfs:range
    statements from ontology_context, applies them to the triples found in
    data_context, and writes the derived triples into inferred_context of
    P.percolation_graph. Returns nothing.

    NOTE(review): a single call performs one iteration only; reaching the
    transitive closure would require repeating until no new triples appear —
    confirm callers expect that.
    """
    # sanity check (lowercased) that the requested contexts already exist
    contexts = [i.identifier.lower() for i in P.context()]
    if data_context not in contexts:
        c("no data context")
    if ontology_context not in contexts:
        c("no ontology context")
    if inferred_context not in contexts:
        c("inferred context to be created context:", inferred_context)
    # rdfs:subClassOf — instances of a subclass are typed with the superclass
    for subject, foo, object_ in P.percolation_graph.triples(
            (None, NS.rdfs.subClassOf, None), context=ontology_context):
        for individual, footype, foosubject in P.percolation_graph.triples(
                (None, a, subject), context=data_context):
            P.add((individual, a, object_), context=inferred_context)
        # NOTE(review): this also restates any triple whose *object* is the
        # subclass with the superclass as object — stronger than standard
        # RDFS entailment; confirm it is intentional
        for foosubject, fooproperty, subject in P.percolation_graph.triples(
                (None, None, subject), context=data_context):
            P.add((foosubject, fooproperty, object_), context=inferred_context)
    c("finished subclass reasoning")
    # rdfs:subPropertyOf — statements made with a subproperty are restated
    # with the superproperty
    for subject, foo, object_ in P.percolation_graph.triples(
            (None, NS.rdfs.subPropertyOf, None), context=ontology_context):
        c(subject, foo, object_)
        for subject2, propertyfoo, object2 in P.percolation_graph.triples(
                (None, subject, None), context=data_context):
            c(subject2, propertyfoo, object2)
            P.add((subject2, object_, object2), context=inferred_context)
    c("finished subproperty reasoning")
    # rdfs:domain — the subject of a statement using the property is typed
    # with the domain class
    for subject, foo, object_ in P.percolation_graph.triples(
            (None, NS.rdfs.domain, None), context=ontology_context):
        for subject2, predicatefoo, objectfoo in P.percolation_graph.triples(
                (None, subject, None), context=data_context):
            P.add((subject2, a, object_), context=inferred_context)
    c("finished domain reasoning")
    # rdfs:range — the object of a statement using the property is typed
    # with the range class
    for subject, foo, object_ in P.percolation_graph.triples(
            (None, NS.rdfs.range, None), context=ontology_context):
        for subjectfoo, predicatefoo, object2 in P.percolation_graph.triples(
                (None, subject, None), context=data_context):
            P.add((object2, a, object_), context=inferred_context)
    c("finished range reasoning")
def writeAllTW(self):
    # write meta and readme with self.desc, finished.
    """Serialize the Twitter snapshot meta graph, copy the rendering script
    and write a human-readable README into self.final_path_."""
    g = P.context(self.meta_graph)
    ntriples = len(g)
    # record the meta-graph size (counted before this triple is added)
    triples = [
        (self.snapshoturi, po.nMetaTriples, ntriples),
    ]
    P.add(triples, context=self.meta_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")
    # copy the script that generates this publication
    if not os.path.isdir(self.final_path_ + "scripts"):
        os.mkdir(self.final_path_ + "scripts")
    shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                self.final_path_ + "scripts/triplify.py")
    # README paragraph summarizing the interaction network
    tinteraction = """\n\n{} individuals with metadata {} and {} interactions (retweets: {}, replies: {}, user_mentions: {}) constitute the interaction network in the RDF/XML file(s): {} and the Turtle file(s): {} (anonymized: {}).""".format(
        self.nparticipants, str(self.participantvars),
        self.nretweets + self.nreplies + self.nuser_mentions,
        self.nretweets, self.nreplies, self.nuser_mentions,
        self.tweet_rdf, self.tweet_ttl, self.interactions_anonymized)
    # README paragraph summarizing the tweet texts
    tposts = """\n\nThe dataset consists of {} tweets with metadata {} {:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {} {:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}""".format(
        self.ntweets, str(self.tweetvars),
        self.mcharstweets, self.dcharstweets, self.totalchars,
        self.mtokenstweets, self.dtokenstweets, self.totaltokens,
    )
    # snapshot time span from the collected message dates
    self.dates = [i.isoformat() for i in self.dates]
    date1 = min(self.dates)
    date2 = max(self.dates)
    with open(self.final_path_ + "README", "w") as f:
        f.write("""::: Open Linked Social Data publication \nThis repository is a RDF data expression of the twitter snapshot {snapid} with tweets from {date1} to {date2} (total of {ntrip} triples).{tinteraction}{tposts} \nMetadata for discovery in the RDF/XML file: {mrdf} \nor in the Turtle file:\n{mttl} \nEgo network: {ise} Group network: {isg} Friendship network: {isf} Interaction network: {isi} Has text/posts: {ist} \nAll files should be available at the git repository: {ava} \n{desc} The script that rendered this data publication is on the script/ directory.\n:::""".format(
            snapid=self.snapshotid, date1=date1, date2=date2,
            ntrip=self.ntriples,
            tinteraction=tinteraction, tposts=tposts,
            mrdf=self.mrdf, mttl=self.mttl,
            ise=self.isego, isg=self.isgroup, isf=self.isfriendship,
            isi=self.isinteraction, ist=self.hastext,
            ava=self.online_prefix, desc=self.desc))
def writeAllFB(self):
    """Serialize the friendship/interaction/posts graphs of the facebook
    snapshot, record per-graph file sizes and triple counts in the meta
    graph, copy the base data and the rendering script, and write a README.
    """
    c("started rendering of the snapshot publication. snapshotID:", self.snapshotid)
    self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
    if not os.path.isdir(self.final_path_):
        os.mkdir(self.final_path_)
    # fnet,inet,mnet
    triples = []
    if self.isfriendship:
        g = P.context(self.friendship_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Friendship.ttl", "turtle"); c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Friendship.rdf", "xml")
        c("serialized friendships")
        # get filesize (MB) and ntriples for the meta graph
        filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid + "Friendship.rdf") / (10**6)
        filesizettl = os.path.getsize(self.final_path_ + self.snapshotid + "Friendship.ttl") / (10**6)
        ntriples = len(g)
        triples += [
            (self.snapshoturi, po.friendshipXMLFileSizeMB, filesizerdf),
            (self.snapshoturi, po.friendshipTTLFileSizeMB, filesizettl),
            (self.snapshoturi, po.nFriendshipTriples, ntriples),
        ]
    if self.isinteraction:
        g = P.context(self.interaction_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Interaction.ttl", "turtle"); c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Interaction.rdf", "xml")
        c("serialized interaction")
        filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid + "Interaction.rdf") / (10**6)
        filesizettl = os.path.getsize(self.final_path_ + self.snapshotid + "Interaction.ttl") / (10**6)
        ntriples = len(g)
        triples += [
            (self.snapshoturi, po.interactionXMLFileSizeMB, filesizerdf),
            (self.snapshoturi, po.interactionTTLFileSizeMB, filesizettl),
            (self.snapshoturi, po.nInteractionTriples, ntriples),
        ]
    if self.hastext:
        g = P.context(self.posts_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Posts.ttl", "turtle"); c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Posts.rdf", "xml")
        c("serialized posts")
        filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid + "Posts.rdf") / (10**6)
        filesizettl = os.path.getsize(self.final_path_ + self.snapshotid + "Posts.ttl") / (10**6)
        ntriples = len(g)
        triples += [
            (self.snapshoturi, po.postsXMLFileSizeMB, filesizerdf),
            (self.snapshoturi, po.postsTTLFileSizeMB, filesizettl),
            (self.snapshoturi, po.nPostsTriples, ntriples),
        ]
    g = P.context(self.meta_graph)
    # meta-graph size counted before the nMetaTriples triple itself is added
    ntriples = len(g)
    triples += [
        (self.snapshoturi, po.nMetaTriples, ntriples),
    ]
    P.add(triples, context=self.meta_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle"); c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")
    # copy the script that generates this publication
    if not os.path.isdir(self.final_path_ + "scripts"):
        os.mkdir(self.final_path_ + "scripts")
    shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py", self.final_path_ + "scripts/triplify.py")
    # copy of the base data
    if not os.path.isdir(self.final_path_ + "base"):
        os.mkdir(self.final_path_ + "base")
    originals = ""
    if self.isfriendship:
        shutil.copy(self.data_path + self.filename_friendships, self.final_path_ + "base/")
        originals += "base/{}".format(self.filename_friendships)
        # README paragraph summarizing the friendship network
        tfriendship = """\n\n{nf} individuals with metadata {fvars} and {nfs} friendships constitute the friendship network in the RDF/XML file: {frdf} \nor in the Turtle file: \n{fttl} (anonymized: {fan}).""".format(
            nf=self.nfriends, fvars=str(self.friendsvars),
            nfs=self.nfriendships,
            frdf=self.frdf, fttl=self.fttl,
            fan=self.friendships_anonymized,
        )
    else:
        tfriendship = ""
    if self.isinteraction:
        shutil.copy(self.data_path + self.filename_interactions, self.final_path_ + "base/")
        # README paragraph summarizing the interaction network
        tinteraction = """\n\n{} individuals with metadata {} and {} interactions with metadata {} constitute the interaction network in the RDF/XML file: {} or in the Turtle file: {} (anonymized: {}).""".format(
            self.ninteracted, str(self.varsfriendsinteraction),
            self.ninteractions, str(self.interactionsvars),
            self.irdf,
            self.ittl,
            self.interactions_anonymized)
        originals += "\nbase/{}".format(self.filename_interactions)
    else:
        tinteraction = ""
    if self.hastext:
        shutil.copy(self.data_path + self.filename_posts, self.final_path_ + "base/")
        # README paragraph summarizing the posts
        tposts = """\n\n{} posts with {:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {} {:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {} posts data in the RDF/XML file: {} or in the Turtle file: {}""".format(
            self.nposts, self.mcharsposts, self.dcharsposts, self.totalchars,
            self.mtokensposts, self.dtokensposts, self.totaltokens,
            self.prdf,
            self.pttl)
        originals += "\nbase/{}".format(self.filename_posts)
    else:
        tposts = ""
    # write a README; collection date is read back from the social graph
    datetime_string = P.get(r.URIRef(self.snapshoturi), po.dateObtained, None, context="social_facebook")[2]
    with open(self.final_path_ + "README", "w") as f:
        f.write("""::: Open Linked Social Data publication \nThis repository is a RDF data expression of the facebook snapshot {snapid} collected around {date}.{tfriendship}{tinteraction}{tposts} \nMetadata for discovery in the RDF/XML file: {mrdf} \nor in the Turtle file:\n{mttl} \nOriginal file(s): {origs} \nEgo network: {ise} Group network: {isg} Friendship network: {isf} Interaction network: {isi} Has text/posts: {ist} \nAll files should be available at the git repository: {ava} \n{desc} The script that rendered this data publication is on the script/ directory.\n:::""".format(
            snapid=self.snapshotid, date=datetime_string,
            tfriendship=tfriendship,
            tinteraction=tinteraction,
            tposts=tposts,
            mrdf=self.mrdf,
            mttl=self.mttl,
            origs=originals,
            ise=self.isego, isg=self.isgroup, isf=self.isfriendship,
            isi=self.isinteraction, ist=self.hastext,
            ava=self.online_prefix, desc=self.desc
        ))
def parseLegacyFiles(data_dir=DATADIR + "twitter/"):
    """Parse legacy pickle files with Twitter tweets.

    Scans data_dir for pickled tweet files, registers one po:Snapshot per
    file in the 'social_twitter' context (removing any previous content of
    that context), logs summary counts queried back from the store, and
    returns the set of snapshot URIs.
    """
    filenames = os.listdir(data_dir)
    filenames = [
        i for i in filenames if i != "ipython_log.py" and not i.endswith(".swp")
    ]
    snapshots = set()
    triples = []
    for filename in filenames:
        snapshotid = "twitter-legacy-" + filename.replace("_", "").replace(
            'tw.pickle', '')
        snapshoturi = po.Snapshot + "#" + snapshotid
        expressed_classes = [po.Participant, po.Tweet]
        expressed_reference = filename.replace("_", "").replace(".pickle", "")
        name_humanized = "Twitter " + expressed_reference
        filesize = os.path.getsize(data_dir + filename) / 10**6  # MB
        fileformat = "pickle"
        fileuri = po.File + "#twitter-file-" + filename
        triples += [
            (snapshoturi, a, po.Snapshot),
            # (snapshoturi, a, po.TwitterSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, False),
            (snapshoturi, po.isGroup, True),
            (snapshoturi, po.isFriendship, False),
            (snapshoturi, po.isInteraction, True),
            (snapshoturi, po.isPost, True),
            # (snapshoturi, po.humanizedName, name_humanized),
            # (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            # (fileuri, po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            # (fileuri, po.fileFormat, fileformat),
        ] + [
            # (fileuri, po.expressedClass, expressed_class) for
            # expressed_class in expressed_classes
        ]
        snapshots.add(snapshoturi)
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    # rebuild the context from scratch on every parse
    P.context("social_twitter", "remove")
    platformuri = P.rdf.ic(po.Platform, "Twitter", context="social_twitter")
    triples += [
        (NS.social.Session, NS.social.nTwitterParsedFiles, nfiles),
        (NS.social.Session, NS.social.nTwitterSnapshots, nsnapshots),
        (platformuri, po.dataDir, data_dir),
    ]
    P.add(triples, context="social_twitter")
    c("parsed {} twitter files ({} snapshots) are in percolation graph \
and 'social_twitter' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_twitter context)".
      format(len(P.percolation_graph), len(P.context("social_twitter"))))
    # summary counts queried back from the freshly written context
    negos = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isEgo true } } "
    )
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isGroup true } } "
    )
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isFriendship true } } "
    )
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isInteraction true } } "
    )
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isPost true } } "
    )
    # NOTE(review): the po:fileSize triples are commented out above, so this
    # sum is presumably always 0 — confirm
    totalsize = sum(
        P.query(
            r" SELECT ?size WHERE { GRAPH <social_twitter> { ?s po:fileSize ?size } } "
        ))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. \
{} have post texts and reaction counts.
Total raw data size is {:.2f}MB"""
      .format(negos, ngroups, nfriendships, ninteractions, nposts, totalsize))
    return snapshots
def rdfLog(self):
    """Translate one IRC log file into RDF.

    Parses user and system messages with regexes, creates Participant,
    Observation and IRCMessage resources in self.irc_graph, detects
    directed/mentioned nicks and URLs in message texts, accumulates
    per-message statistics on self, and finally writes all triples in
    chunked RDF/XML and Turtle files via P.rdf.writeByChunks.
    """
    # logs are expected to be latin-1 on disk; fall back to default text mode
    try:
        with codecs.open(self.data_path + self.filename,
                         "rb", "iso-8859-1") as f:
            logtext = textFix(f.read())
        c('opened log {} as iso-8859-1'.format(self.snapshotid))
    except OSError:
        # NOTE(review): a decode failure raises UnicodeDecodeError, not
        # OSError — confirm which failure this fallback is meant to catch
        with open(self.data_path + self.filename, "r") as f:
            logtext = textFix(f.read())
        c('opened log {} as utf8'.format(self.snapshotid))
    # system message: "YYYY-MM-DDTHH:MM:SS *** nick text"
    rsysmsg = r"(\d{4})\-(\d{2})\-(\d{2})T(\d{2}):(\d{2}):(\d{2}) \*\*\* (\S+) (.*)"
    # user message: "YYYY-MM-DDTHH:MM:SS <nick> text"
    rmsg = r"(\d{4})\-(\d{2})\-(\d{2})T(\d{2}):(\d{2}):(\d{2}) \<(.*?)\> (.*)"
    # URL detector applied to the cleaned message text
    rurl = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    messages = re.findall(rmsg, logtext)
    system_messages = re.findall(rsysmsg, logtext)
    # all nicks seen in either message kind (Q() sanitizes for URI use)
    self.NICKS = set([Q(i[-2]) for i in messages] +
                     [Q(i[-2]) for i in system_messages])
    triples = []
    # one Participant and one Observation resource per nick
    for nick in self.NICKS:
        useruri = P.rdf.ic(po.Participant,
                           "{}-{}".format(self.provenance_prefix, nick),
                           self.irc_graph, self.snapshoturi)
        obs = P.rdf.ic(po.Observation,
                       "{}-{}".format(self.snapshotid, nick),
                       self.irc_graph, self.snapshoturi)
        triples.extend([
            (useruri, po.observation, obs),
            (obs, po.nick, nick),
        ])
    messageids = set()
    msgcount = 0
    c("starting translation of log with",
      len(messages) + len(system_messages), "messages")
    for message in messages:
        year, month, day, hour, minute, second, nick, text = message
        nick = Q(nick)
        datetime_ = datetime.datetime(
            *[int(i) for i in (year, month, day, hour, minute, second)])
        self.dates += [datetime_]
        timestamp = datetime_.isoformat()
        messageid = "{}-{}-{}".format(self.snapshotid, nick, timestamp)
        # disambiguate id collisions with a random hex suffix
        while messageid in messageids:
            messageid += '_r_%05x' % random.randrange(16**5)
        messageids.add(messageid)
        messageuri = P.rdf.ic(po.IRCMessage, messageid, self.irc_graph,
                              self.snapshoturi)
        tokens = k.word_tokenize(text)
        tokens = [i for i in tokens if i not in set(string.punctuation)]
        direct_nicks = []  # for directed messages at
        mention_nicks = []  # for mentioned fellows
        # leading tokens that are known nicks address the message;
        # known nicks appearing after any other token are mere mentions
        direct = 1
        for token in tokens:
            if token not in self.NICKS:
                direct = 0
            else:
                if direct:
                    direct_nicks.append(token)
                else:
                    mention_nicks.append(token)
        for nick in direct_nicks:
            useruri2 = po.Participant + "#{}-{}".format(
                self.snapshotid, nick)
            triples.append((messageuri, po.directedTo, useruri2))
        if direct_nicks:
            self.ndirect += 1
            # strip the addressing prefix to obtain the clean text
            text_ = text[text.index(direct_nicks[-1]) +
                         len(direct_nicks[-1]) + 1:].lstrip()
        else:
            text_ = text
        for nick in mention_nicks:
            useruri2 = po.Participant + "#{}-{}".format(
                self.snapshotid, nick)
            triples.append((messageuri, po.mentions, useruri2))
        self.nmention += len(mention_nicks)
        # NOTE(review): `nick` was rebound by the loops above, so when the
        # message addresses/mentions someone the author URI below points at
        # the last directed/mentioned nick, not the parsed author — confirm
        useruri = po.Participant + "#{}-{}".format(self.snapshotid, nick)
        triples.extend((
            (messageuri, po.author, useruri),
            (messageuri, po.systemMessage, False),
            (messageuri, po.createdAt, datetime_),
        ))
        if text:
            triples.append((messageuri, po.text, text))
        if text_:
            nchars = len(text_)
            ntokens = len(k.word_tokenize(text_))
            nsentences = len(k.sent_tokenize(text_))
            triples += [
                (messageuri, po.cleanText, text_),
                # (messageuri, po.nChars, nchars),
                # (messageuri, po.nTokens, ntokens),
                # (messageuri, po.nSentences, nsentences),
            ]
            urls = re.findall(rurl, text_)
            for url in urls:
                triples += [
                    (messageuri, po.hasUrl, url),
                ]
            self.nchars_all += [nchars]
            self.ntokens_all += [ntokens]
            self.nsentences_all += [nsentences]
            self.nurls += len(urls)
        else:
            triples += [
                (messageuri, po.emptyMessage, True),
            ]
        # messages addressed to the AA bot
        if text.startswith(";aa ") or text.startswith(
                "lalenia, aa ") or text.startswith("lalenia: aa "):
            self.naamessages += 1
            # triples.append((messageuri, a, po.AAIRCMessage))
            triples.append((messageuri, po.aaMessage, True))
        msgcount += 1
        if msgcount % 1000 == 0:
            c("finished user message", msgcount)
    # restart the progress counter for system messages
    msgcount = 0
    for message in system_messages:
        year, month, day, hour, minute, second, nick, text = message
        nick = Q(nick)
        useruri = po.Participant + "#{}-{}".format(self.provenance_prefix, nick)
        datetime_ = datetime.datetime(
            *[int(i) for i in (year, month, day, hour, minute, second)])
        self.dates += [datetime_]
        timestamp = datetime_.isoformat()
        messageid = "{}-{}".format(self.snapshotid, timestamp)
        while messageid in messageids:
            messageid += '_r_%05x' % random.randrange(16**5)
        messageids.add(messageid)
        messageuri = P.rdf.ic(po.IRCMessage, messageid, self.irc_graph,
                              self.snapshoturi)
        triples += [(messageuri, po.impliedUser, useruri),
                    (messageuri, po.createdAt, datetime_),
                    (messageuri, po.systemMessage, True)]
        if text:
            triples += [(messageuri, po.text, text)]
        msgcount += 1
        if msgcount % 1000 == 0:
            c("Total system messages:", msgcount)
    self.messageids = messageids
    if not os.path.isdir(self.final_path):
        os.mkdir(self.final_path)
    if not os.path.isdir(self.final_path_):
        os.mkdir(self.final_path_)
    # merge triples already present in the irc graph, then write in chunks
    g = P.context(self.irc_graph)
    triples_ = [tr for tr in g]
    triples.extend(triples_)
    self.log_xml, self.size_xml, self.log_ttl, self.size_ttl = P.rdf.writeByChunks(
        self.final_path_ + self.snapshotid + "Log",
        ntriples=100000, triples=triples, bind=[('po', po)])
def parseLegacyFiles(data_dir=DATADIR):
    """Parse legacy mbox files with emails from the Gmane database.

    Each subdirectory of data_dir holding numeric message files becomes one
    po:Snapshot registered in the 'gmane' context (any previous content of
    that context is removed). Logs summary counts queried back from the
    store and returns the set of snapshot URIs.
    """
    data_dir = os.path.expanduser(data_dir)
    directories = os.listdir(data_dir)
    directories = [i for i in directories if os.path.isdir(data_dir + i)]
    snapshots = set()
    triples = []
    for directory in directories:
        # message files are named by their ascending numeric id
        all_files = [i for i in os.listdir(data_dir + directory) if i.isdigit()]
        if all_files:
            all_files.sort()
            foo = all_files[0].lstrip("0")
            if not foo:
                foo = "0"
            # snapshot id: sanitized list name + first and last message ids
            snapshotid = re.sub(r'^gmane\.', 'email-legacy-',
                                directory.replace('+', 'p')) + foo + "-" + all_files[-1].lstrip("0")
            snapshoturi = po.Snapshot + "#" + snapshotid
            expressed_classes = [po.GmaneParticipant, po.EmailPeer, po.EmailMessage]
            expressed_reference = directory
            name_humanized = "Gmane email list with id " + expressed_reference
            # total size of the directory in MB
            directorysize = sum(os.path.getsize(data_dir + directory + "/" + filename)
                                for filename in os.listdir(data_dir + directory)) / 10**6
            fileformat = "mbox"
            directoryuri = po.Directory + "#gmane-" + directory
            triples.extend([
                (snapshoturi, a, po.Snapshot),
                # (snapshoturi, a, po.GmaneSnapshot),
                (snapshoturi, po.dataDir, data_dir),
                (snapshoturi, po.snapshotID, snapshotid),
                (snapshoturi, po.isEgo, False),
                (snapshoturi, po.isGroup, True),
                (snapshoturi, po.isFriendship, False),
                (snapshoturi, po.isInteraction, True),
                (snapshoturi, po.isPost, True),
                # (snapshoturi, po.humanizedName, name_humanized),
                # (snapshoturi, po.expressedReference, expressed_reference),
                (snapshoturi, po.rawDirectory, directoryuri),
                # (directoryuri, po.directorySize, directorysize),
                (directoryuri, po.directoryName, directory),
                # (directoryuri, po.fileFormat, fileformat),
            ] + [
                # (directoryuri, po.expressedClass, expressed_class) for expressed_class in expressed_classes
            ])
            snapshots.add(snapshoturi)
    # NOTE(review): counts every directory, including those skipped above for
    # having no numeric files, so nsnapshots may exceed len(snapshots) — confirm
    nsnapshots = ndirectories = len(directories)
    # rebuild the context from scratch on every parse
    P.context("gmane", "remove")
    platformuri = P.rdf.ic(po.Platform, "Gmane", context="gmane")
    triples.extend([
        (NS.social.Session, NS.social.nGmaneParsedDirectories, ndirectories),
        (NS.social.Session, NS.social.nGmaneSnapshots, nsnapshots),
        (NS.social.Session, po.platform, platformuri),
    ])
    P.add(triples, context="gmane")
    c("parsed {} gmane data directories (=={} snapshots) are in percolation graph and 'gmane' context".format(ndirectories, nsnapshots))
    c("percolation graph have {} triples ({} in gmane context)".format(len(P.percolation_graph), len(P.context("gmane"))))
    # summary counts queried back from the freshly written context
    negos = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isEgo true } } ")
    ngroups = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isGroup true } } ")
    nfriendships = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isFriendship true } } ")
    ninteractions = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isInteraction true } } ")
    nposts = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isPost true } } ")
    # NOTE(review): the po:directorySize triples are commented out above, so
    # this sum is presumably always 0 — confirm
    totalsize = sum(P.query(r" SELECT ?size WHERE { GRAPH <gmane> { ?s po:directorySize ?size } } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship structures. {} have an interaction structures. {} have texts
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships,
                                          ninteractions, nposts, totalsize))
    return snapshots
def writeAll(self):
    """Serialize the meta graph, copy the triplify script and write a README
    for the shout/IRC snapshot.

    Side effects: writes Meta.ttl/Meta.rdf and README under
    self.final_path_, copies tests/triplify.py into scripts/, and adds a
    po:nMetaTriples triple to the meta graph.
    """
    g = P.context(self.meta_graph)
    # meta-graph size counted before the nMetaTriples triple itself is added
    ntriples = len(g)
    triples = [
        (self.snapshoturi, po.nMetaTriples, ntriples),
    ]
    P.add(triples, context=self.meta_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")
    # copy the script that generates this publication
    if not os.path.isdir(self.final_path_ + "scripts"):
        os.mkdir(self.final_path_ + "scripts")
    shutil.copy(PACKAGEDIR + "/../tests/triplify.py",
                self.final_path_ + "scripts/triplify.py")
    # copy of the base data
    # NOTE(review): `text` is never used below and its template has fewer
    # placeholders (2) than arguments (7); extra format() args are silently
    # ignored — confirm the intended README interaction paragraph
    text = """structure in the RDF/XML file(s): {} and the Turtle file(s): {} (anonymized: False "nicks inteface").""".format(
        self.nparticipants, str(self.participantvars),
        self.nchecks, self.ndirect, self.nmention,
        self.translation_xml,
        self.translation_ttl)
    # README paragraph summarizing the shout messages
    tposts = """\n\nThe dataset consists of {} shout messages with metadata {} {:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {} {:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {} {:.3f} sentences in average (std: {:.3f}) and total sentences in snapshot: {}""".format(
        self.nmessages, str(self.messagevars),
        self.mcharsmessages, self.dcharsmessages, self.totalchars,
        self.mtokensmessages, self.dtokensmessages, self.totaltokens,
        self.msentencesmessages, self.dsentencesmessages, self.totalsentences,
    )
    # BUGFIX: the SPARQL query was missing the closing brace of the outer
    # group pattern ("{ GRAPH <...> { ... } " had two '{' and one '}'),
    # making the query syntactically invalid
    self.dates = P.get(r"SELECT ?date WHERE { GRAPH <%s> { ?fooshout po:createdAt ?date } }" % (self.translation_graph,))
    self.dates = [i.isoformat() for i in self.dates]
    date1 = min(self.dates)
    date2 = max(self.dates)
    with open(self.final_path_ + "README", "w") as f:
        f.write("""::: Open Linked Social Data publication \nThis repository is a RDF data expression of the IRC snapshot {snapid} with tweets from {date1} to {date2} (total of {ntrip} triples).{tinteraction}{tposts} \nMetadata for discovery in the RDF/XML file: {mrdf} \nor in the Turtle file:\n{mttl} \nEgo network: {ise} Group network: {isg} Friendship network: {isf} Interaction network: {isi} Has text/posts: {ist} \nAll files should be available at the git repository: {ava} \n{desc} The script that rendered this data publication is on the script/ directory.\n:::""".format(
            snapid=self.snapshotid, date1=date1, date2=date2,
            ntrip=self.ntriples,
            # NOTE(review): the posts paragraph fills both slots, so it is
            # rendered twice in the README; `text` above was presumably
            # meant for tinteraction — confirm before changing output
            tinteraction=tposts,
            tposts=tposts,
            mrdf=self.translation_xml,
            mttl=self.translation_ttl,
            ise=self.isego, isg=self.isgroup, isf=self.isfriendship,
            isi=self.isinteraction, ist=self.hastext,
            ava=self.online_prefix, desc=self.desc
        ))
def writeAllFB(self):
    """Serialize the friendship graph and meta graph of the facebook
    snapshot, record file sizes and triple counts, copy the base data and
    the rendering script, and write a human-readable README."""
    c("started rendering of the snapshot publication. snapshotID:", self.snapshotid)
    self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
    if not os.path.isdir(self.final_path_):
        os.mkdir(self.final_path_)
    # fnet,inet,mnet
    g = P.context(self.friendship_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_ + self.snapshotid + "Friendship.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Friendship.rdf", "xml")
    c("serialized friendships")
    # get filesize (MB) and ntriples for the meta graph
    filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid + "Friendship.rdf") / (10**6)
    filesizettl = os.path.getsize(self.final_path_ + self.snapshotid + "Friendship.ttl") / (10**6)
    ntriples = len(g)
    triples = [
        (self.snapshoturi, po.friendshipXMLFileSizeMB, filesizerdf),
        (self.snapshoturi, po.friendshipTTLFileSizeMB, filesizettl),
        (self.snapshoturi, po.nFriendshipTriples, ntriples),
    ]
    P.add(triples, context=self.meta_graph)
    g = P.context(self.meta_graph)
    ntriples = len(g)
    # +1 presumably accounts for the nMetaTriples triple added below —
    # confirm; note `triples` still holds the friendship triples, which are
    # re-added (a no-op under RDF set semantics)
    triples += [
        (self.snapshoturi, po.nMetaTriples, ntriples + 1),
    ]
    P.add(triples, context=self.meta_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")
    # copy the script that generates this publication
    if not os.path.isdir(self.final_path_ + "scripts"):
        os.mkdir(self.final_path_ + "scripts")
    shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                self.final_path_ + "scripts/triplify.py")
    # copy of the base data
    if not os.path.isdir(self.final_path_ + "base"):
        os.mkdir(self.final_path_ + "base")
    shutil.copy(self.data_path + self.filename_friendships, self.final_path_ + "base/")
    originals = "base/{}".format(self.filename_friendships)
    # BUGFIX: the template contained "\in" — an invalid escape that leaked a
    # literal backslash into the published README; the sibling writeAllFB
    # uses a newline here, so it is now "\nin".
    tfriendship = """\n\n{nf} individuals with metadata {fvars} and {nfs} friendships constitute the friendship network in the RDF/XML file: {frdf} \nin the Turtle file: \n{fttl} (anonymized {fan}).""".format(
        nf=self.nfriends, fvars=str(self.friendsvars),
        nfs=self.nfriendships,
        frdf=self.frdf, fttl=self.fttl,
        fan=self.friendships_anonymized,
    )
    # collection date is read back from the social graph
    datetime_string = P.get(self.snapshoturi, po.dateObtained, None, context="social_facebook")[2]
    with open(self.final_path_ + "README", "w") as f:
        f.write("""::: Open Linked Social Data publication \nThis repository is a RDF data expression of the facebook snapshot {snapid} collected around {date}.{tfriendship} \nMetadata for discovery in the RDF/XML file: {mrdf} \nor in the Turtle file:\n{mttl} \nOriginal file(s): {origs} \nEgo network: {ise} Group network: {isg} Friendship network: {isf} Interaction network: {isi} Has text/posts: {ist} \nAll files should be available at the git repository: {ava} \n{desc} The script that rendered this data publication is on the script/ directory.\n:::""".format(
            snapid=self.snapshotid, date=datetime_string,
            tfriendship=tfriendship,
            mrdf=self.mrdf,
            mttl=self.mttl,
            origs=originals,
            ise=self.isego, isg=self.isgroup, isf=self.isfriendship,
            isi=self.isinteraction, ist=self.hastext,
            ava=self.online_prefix, desc=self.desc
        ))
def parseLegacyFiles(datadir=DATADIR + "facebook/"):
    """Parse legacy gdf, gml and tab files of facebook structures.

    Syntax of facebook filenames is:
        <prefix><name><date><suffix><extension>
    where:
    <prefix> used are:
        *) avlab_ for files obtained with participants at AVLAB
        *) posavlab_ for files obtained from participants
        *) page_ for files about facebook pages
        *) ego_ for ego networks; omitted for gml files and gdf group files.
    <name> is any string name associated with the user or group delimiting
        the structure in the file, e.g. FernandValfro. It gets split with
        spaces before uppercase letter chunks for po:humanizedName:
        REM splits to REM. RFabbri to RFabbri.
    <date> daymonthyear in 2/2/4 digits, e.g. 20032014 for 20/March/2014.
    <suffix> is omitted for friendship .gml/.gdf networks; .tab are text and
        activity files. _interaction is used if interaction network.
    <extension> is either:
        .gml for gml files, all are ego friendship network data
        .gdf for gdf files with group and ego, interaction and friendship
            network data
        .tab for tab files with post data, such as text

    These render snapshots of two classes:
    po:FacebookEgoFriendshipSnapshot from .gml files and gdf files with
        prefix avlab_ posavlab_ or ego_
    po:FacebookGroupFriendshipInteractionSnapshot from .gdf files without
        prefix with and without _interaction suffix and the .tab files.
        They form sets of files, all with friendship and interaction
        networks and some with a .tab file.

    ToDo:
    *) Implement parsing of page files.
    *) Implement parsing of new group files."""
    platformuri = P.rdf.ic(po.Platform, "Facebook", context="social_facebook")
    triples = [
        (platformuri, po.dataDir, datadir),
    ]
    filenames = os.listdir(datadir)
    filenames = [
        i for i in filenames
        if not i.endswith("swp") and "ipython_log.py" != i
    ]
    snapshots = set()
    # <prefix><name><date 8 digits>[_interactions|_comments].<gdf|tab|gml>
    regex = re.compile(
        r"^(avlab_|ego_|posavlab_|page_)*(.*?)(\d{8})(_interactions|_comments){0,1}\.(gdf|tab|gml)$"
    )
    # splits camel-case names into chunks for the humanized name
    regex2 = re.compile(r'([A-Z]{2,}(?=[A-Z]|$)|[A-Z][a-z]*)')
    for filename in filenames:
        prefix, name, date, sufix, format_ = regex.findall(filename)[0]
        if prefix == "page_":
            c("page data currently not supported. Jumping", filename)
            continue
        # size in megabytes
        filesize = os.path.getsize(datadir + filename) / (10**6)
        snapshotid = 'facebook-legacy-' + filename.replace(
            "_interactions.gdf", "").replace(".tab", "").replace(
                '.gml', '').replace('.gdf', '')
        snapshoturi = po.Snapshot + "#" + snapshotid
        # <date> is DDMMYYYY
        date_obtained = datetime.date(int(date[4:]), int(date[2:4]),
                                      int(date[:2]))
        name_humanized = " ".join(regex2.findall(name))
        # registered (numericID, stringID[, url]) for the base .gdf file
        metadata = S.legacy.facebook.files.files_dict[filename.replace(
            "_interactions.gdf", ".gdf").replace(".tab", ".gdf")]
        if metadata[0]:
            triples += [(snapshoturi, po.numericID, metadata[0])]
        if metadata[1]:
            triples += [(snapshoturi, po.stringID, metadata[1])]
        if len(metadata) == 3:
            if not metadata[2]:
                c("group data without a publishing link: ", filename)
            else:
                triples += [(snapshoturi, po.url, metadata[2])]
        if filename.endswith(".gml") or any(
                filename.startswith(i)
                for i in ("ego_", "avlab_", "posavlab_")):
            # ego friendship snapshot
            isego = True
            isgroup = False
            isfriendship = True
            isinteraction = False
            isposts = False
            expressed_classes = (po.Friendship, po.Participant)
            if metadata[0]:
                expressed_reference = po.Participant+"#" + \
                    snapshotid+"-"+metadata[0]
            else:
                if "Mirtes" in filename:
                    # known anonymized participant without a numeric id
                    expressed_reference = po.Participant+"#" + \
                        snapshotid+"-anon_mirtes"
                else:
                    raise ValueError(
                        "Numeric ID is needed for friendship networks")
            triples += [(expressed_reference, a, po.FacebookParticipant)]
        else:  # group snapshot
            isego = False
            isgroup = True
            # sibling files of the same group and date determine which
            # structures the snapshot has
            ffilename = prefix + name + date + ".gdf"
            ifilename = prefix + name + date + "_interactions.gdf"
            tfilename = prefix + name + date + ".tab"
            isfriendship = ffilename in filenames
            isinteraction = ifilename in filenames
            isposts = tfilename in filenames
            if metadata[0]:
                expressed_reference = po.FacebookGroup+"#" +\
                    snapshotid+"-"+metadata[0]
            else:
                if metadata[1]:
                    expressed_reference = po.FacebookGroup+"#" +\
                        snapshotid+"-"+metadata[1]
                else:
                    raise ValueError("Numeric or string ID is needed\
 for group networks")
            triples += [(expressed_reference, a, po.FacebookGroup)]
            if filename == ffilename:
                expressed_classes = (po.Friendship, po.Participant)
            elif filename == ifilename:
                expressed_classes = (po.Interaction, po.Participant)
            elif format_ == "tab":
                expressed_classes = (po.Post, )
            else:
                raise NameError("filename structure not understood")
        fileuri = NS.po.File + "#" + snapshotid + "-_file_-" + filename
        triples += [
            (snapshoturi, a, po.Snapshot),
            # (snapshoturi, a, po.FacebookSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, isego),
            (snapshoturi, po.isGroup, isgroup),
            (snapshoturi, po.isFriendship, isfriendship),
            (snapshoturi, po.isInteraction, isinteraction),
            (snapshoturi, po.isPost, isposts),
            (snapshoturi, po.name, name_humanized),
            (snapshoturi, po.dateObtained, date_obtained),
            # (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            # (fileuri, po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            (fileuri, po.fileFormat, format_),
        ]
        triples += [(fileuri, po.expressedClass, expressed_class)
                    for expressed_class in expressed_classes]
        note = theNote(filename)  # for avlab and posavlab
        if note:
            triples += [
                (snapshoturi, NS.rdfs.comment, note),
            ]
        snapshots.add(snapshoturi)
    # data about the overall data in percolation graph
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    triples += [
        (NS.social.Session, NS.social.nFacebookParsedFiles, nfiles),
        (NS.social.Session, NS.social.nFacebookSnapshots, nsnapshots),
    ]
    # rebuild the context from scratch on every parse
    P.context("social_facebook", "remove")
    P.add(triples, context="social_facebook")
    c("parsed {} facebook files ({} snapshots) are in percolation \
graph and 'social_facebook' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_facebook context\
)".format(len(P.percolation_graph), len(P.context("social_facebook"))))
    # summary counts queried back from the store (unscoped — whole graph)
    negos = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isEgo true } ")
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isGroup true } ")
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isFriendship true } ")
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isInteraction true } ")
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isPost true } ")
    # NOTE(review): the po:fileSize triples are commented out above, so this
    # sum is presumably always 0 — confirm
    totalsize = sum(P.query(r" SELECT ?size WHERE { ?s po:fileSize ?size } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. \
{} have post texts and reaction counts
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships,
                                          ninteractions, nposts, totalsize))
    return snapshots
def probeOntology(endpoint_url, graph_urns, final_dir, one_datatype=True):
    """Probe a SPARQL endpoint and derive an OWL ontology from its data.

    Queries the graphs in graph_urns at endpoint_url to discover classes,
    properties, existential/universal restrictions, functionality, domains
    and ranges; draws a diagram of the structure with graphviz and writes
    ontology.owl/ontology.ttl plus draw.* files into final_dir.

    Parameters:
        endpoint_url: SPARQL endpoint to query.
        graph_urns: iterable of graph URNs used in the FROM clauses.
        final_dir: output directory (created if missing).
        one_datatype: if True, merge all consequent datatype nodes of the
            same datatype into a single drawn node.

    Returns locals() so callers can inspect every intermediate structure.
    """
    if not os.path.isdir(final_dir):
        os.makedirs(final_dir)
    client = P.rdf.sparql.classes.LegacyClient(endpoint_url)
    from_ = ''
    for graph_urn in graph_urns:
        from_ += '\nFROM <%s>' % (graph_urn, )

    def mkQuery(query, plain=True):
        # Inject the FROM clauses right before WHERE; return plain values
        # (pl) or the raw SPARQL JSON bindings.
        query_ = query.split('WHERE')
        query__ = (query_[0], from_, '\nWHERE ' + query_[1])
        query___ = ''.join(query__)
        result = client.retrieveQuery(query___)
        if plain:
            return pl(result)
        else:
            return result['results']['bindings']
    c('find all classes')
    q = "SELECT DISTINCT ?class WHERE { ?s a ?class . }"
    classes = mkQuery(q)
    c('antecedents, consequents and restrictions of each class')
    neighbors = {}
    triples = []
    existential_restrictions = {}
    universal_restrictions = {}
    for aclass in classes:
        # classes/properties that point INTO instances of aclass
        q = "SELECT DISTINCT ?cs ?p WHERE { ?i a <%s> . ?s ?p ?i . OPTIONAL { ?s a ?cs . } }" % (
            aclass, )
        antecedent_property = mkQuery(q)
        # properties leaving instances of aclass, with the object's class
        # (?co) or datatype (?do) when available
        q = "SELECT DISTINCT ?ap ?co (datatype(?o) as ?do) WHERE { ?i a <%s> . ?i ?ap ?o . OPTIONAL { ?o a ?co . } }" % (
            aclass, )
        consequent_property__ = mkQuery(q, 0)
        consequent_property = [[i['ap']['value'], i['do']['value']]
                               for i in consequent_property__ if 'do' in i]
        consequent_property_ = [[i['ap']['value'], i['co']['value']]
                                for i in consequent_property__ if 'co' in i]
        neighbors[aclass] = (antecedent_property,
                             consequent_property + consequent_property_)
        # class restrictions
        q = "SELECT DISTINCT ?p WHERE {?s a <%s>. ?s ?p ?o .}" % (aclass, )
        props_c = mkQuery(q)
        q = "SELECT (COUNT(DISTINCT ?s) as ?cs) WHERE {?s a <%s>}" % (aclass, )
        ninds = pl(client.retrieveQuery(q))[0]
        for pc in props_c:
            if '22-rdf-syntax' in pc:  # skip rdf:type itself
                continue
            q = "SELECT DISTINCT ?co (datatype(?o) as ?do) WHERE {?s a <%s>. ?s <%s> ?o . OPTIONAL {?o a ?co . }}" % (
                aclass, pc)
            inds2 = mkQuery(q, 0)
            objs = set([i["co"]["value"] for i in inds2 if "co" in i.keys()])
            vals = set([i["do"]["value"] for i in inds2 if "do" in i.keys()])
            q = "SELECT (COUNT(DISTINCT ?s) as ?cs) WHERE {?s a <%s>. ?s <%s> ?o . }" % (
                aclass, pc)
            ninds2 = pl(client.retrieveQuery(q))[0]
            if ninds == ninds2:  # every instance has pc -> existential
                if len(vals):
                    ob = list(vals)[0]
                else:
                    if len(objs):
                        ob = list(objs)[0]
                    else:
                        ob = 0
                if ob:
                    B = r.BNode()
                    triples += [(aclass, rdfs.subClassOf, B),
                                (B, a, owl.Restriction),
                                (B, owl.onProperty, pc),
                                (B, owl.someValuesFrom, ob)]
                    if aclass in existential_restrictions.keys():
                        existential_restrictions[aclass].append((pc, ob))
                    else:
                        existential_restrictions[aclass] = [(pc, ob)]
            q = "SELECT (COUNT(DISTINCT ?s) as ?cs) WHERE { ?s <%s> ?o . ?s a ?ca . FILTER(str(?ca) != '%s') }" % (
                pc, aclass)
            ninds3 = pl(client.retrieveQuery(q))[0]
            if ninds3 == 0:  # only this class uses pc -> universal
                if len(vals):
                    ob = list(vals)[0]
                else:
                    if len(objs):
                        ob = list(objs)[0]
                    else:
                        ob = 0
                if ob:
                    B = r.BNode()
                    triples += [(aclass, rdfs.subClassOf, B),
                                (B, a, owl.Restriction),
                                (B, owl.onProperty, pc),
                                (B, owl.allValuesFrom, ob)]
                    if aclass in universal_restrictions.keys():
                        universal_restrictions[aclass].append((pc, ob))
                    else:
                        universal_restrictions[aclass] = [(pc, ob)]
    del q, aclass, antecedent_property, consequent_property
    c('find properties')
    q = "SELECT DISTINCT ?p WHERE {?s ?p ?o}"
    properties = mkQuery(q)
    c('check if property is functional and get range and domain')
    functional_properties = set()
    for prop in properties:
        # a property is functional when every subject has at most one value
        q = 'SELECT DISTINCT (COUNT(?o) as ?co) WHERE { ?s <%s> ?o } GROUP BY ?s' % (
            prop, )
        is_functional = mkQuery(q)
        if len(is_functional) == 1 and is_functional[0] == 1:
            triples.append((prop, a, owl.FunctionalProperty))
            functional_properties.add(prop)
        # datatype or object property: subject classes and object
        # classes/datatypes
        suj = mkQuery("SELECT DISTINCT ?cs WHERE { ?s <%s> ?o . ?s a ?cs . }"
                      % (prop, ))
        obj1 = mkQuery(
            "SELECT DISTINCT ?co WHERE { ?s <%s> ?o . ?o a ?co . }" % (prop, ))
        obj2 = mkQuery(
            "SELECT DISTINCT (datatype(?o) as ?do) WHERE { ?s <%s> ?o . }" % (
                prop, ))
        obj = obj1 + obj2
        if len(obj) and ("XMLS" in obj[0]):
            # FIX: owl:DatatypeProperty is the OWL vocabulary term;
            # owl.DataProperty generated a non-existent URI.
            triples.append((prop, a, owl.DatatypeProperty))
        else:
            triples.append((prop, a, owl.ObjectProperty))
        if len(suj) > 1:
            B = r.BNode()
            triples.append((prop, rdfs.domain, B))
            for ss in suj:
                triples.append((B, owl.unionOf, ss))
        elif suj:
            triples.append((prop, rdfs.domain, suj[0]))
        if len(obj) > 1:
            B = r.BNode()
            triples.append((prop, rdfs.range, B))
            # FIX: the range union must list the object classes (obj);
            # the original iterated suj, duplicating the domain classes.
            for oo in obj:
                triples.append((B, owl.unionOf, oo))
        elif obj:
            triples.append((prop, rdfs.range, obj[0]))
    # Drawing
    c('started drawing')
    A = gv.AGraph(directed=True, strict=False)
    q = """PREFIX po: <http://purl.org/socialparticipation/po/>
    SELECT DISTINCT ?snap WHERE { { ?i po:snapshot ?snap } UNION { ?snap po:snapshotID ?idfoo } }"""
    snap = mkQuery(q)[0]
    q = """PREFIX po: <http://purl.org/socialparticipation/po/>
    SELECT ?provenance WHERE { <%s> po:socialProtocol ?provenance }""" % (snap)
    provenance = pl(client.retrieveQuery(q))[0]
    edge_counter = 1
    node_counter = 1
    data_nodes = {}
    for aclass in classes:
        aclass_ = aclass.split('/')[-1]
        if aclass_ not in A.nodes():
            A.add_node(aclass_, style="filled")
            n = A.get_node(aclass_)
            n.attr['color'] = "#A2F3D1"
        neigh = neighbors[aclass]
        for i in range(len(neigh[1])):  # consequents
            label = neigh[1][i][1].split("/")[-1]
            elabel = neigh[1][i][0]
            elabel_ = elabel.split('/')[-1]
            if "XMLS" in label:  # datatype node
                color = "#FFE4AA"
                if one_datatype:
                    # reuse one drawn node per datatype
                    if label in data_nodes:
                        label_ = data_nodes[label]
                    else:
                        label_ = node_counter
                        node_counter += 1
                        data_nodes[label] = label_
                else:
                    label_ = node_counter
                    node_counter += 1
            else:
                label_ = label
                color = "#A2F3D1"
            if label_ not in A.nodes():
                A.add_node(label_, style="filled")
                n = A.get_node(label_)
                n.attr['label'] = label.split("#")[-1]
                n.attr['color'] = color
            ekey = '{}-{}-{}'.format(aclass_, label_, edge_counter)
            edge_counter += 1
            A.add_edge(aclass_, label_, ekey)
            e = A.get_edge(aclass_, label_, key=ekey)
            e.attr["label"] = elabel_
            e.attr["color"] = color
            e.attr["penwidth"] = 2
            # dashed edge == non-functional property
            if r.URIRef(elabel) not in functional_properties:
                e.attr["style"] = "dashed"
            # green edge == existential restriction
            if aclass in existential_restrictions.keys():
                restrictions = existential_restrictions[aclass]
                prop = [iii[0] for iii in restrictions]
                if r.URIRef(elabel) in prop:
                    e.attr["color"] = "#A0E0A0"
            # inverted arrowhead == universal restriction
            if aclass in universal_restrictions.keys():
                restrictions = universal_restrictions[aclass]
                prop = [iii[0] for iii in restrictions]
                if r.URIRef(elabel) in prop:
                    e.attr["arrowhead"] = "inv"
                    e.attr["arrowsize"] = 2.
    A.draw(os.path.join(final_dir, "draw.png"), prog="dot")
    try:
        A.draw(os.path.join(final_dir, "draw_circo.png"), prog="circo")
    except Exception:  # circo layout is best-effort only
        pass
    A.draw(os.path.join(final_dir, "draw_twopi.png"), prog="twopi",
           args="-Granksep=4")
    A.write(os.path.join(final_dir, "draw.dot"))
    P.start(False)
    P.context('ontology', 'remove')
    P.add(triples, 'ontology')
    g = P.context('ontology')
    g.serialize(os.path.join(final_dir, 'ontology.owl'))
    g.serialize(os.path.join(final_dir, 'ontology.ttl'), 'turtle')
    return locals()
def writeAllFB(self):
    """Publish the facebook snapshot.

    Serializes the friendship, interaction and posts graphs (when
    present) to TTL and RDF/XML under final_path/<snapshotid>/, records
    file sizes and triple counts in the meta graph, serializes the meta
    graph, copies the triplification script and the raw base files, and
    writes a human-readable README for the publication.
    """
    c("started rendering of the snapshot publication. snapshotID:",
      self.snapshotid)
    self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
    if not os.path.isdir(self.final_path_):
        os.mkdir(self.final_path_)
    # fnet,inet,mnet
    triples = []
    if self.isfriendship:
        g = P.context(self.friendship_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Friendship.ttl",
                    "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Friendship.rdf",
                    "xml")
        c("serialized friendships")
        # get filesize (in MB) and ntriples for the meta graph
        filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid +
                                      "Friendship.rdf") / (10**6)
        filesizettl = os.path.getsize(self.final_path_ + self.snapshotid +
                                      "Friendship.ttl") / (10**6)
        ntriples = len(g)
        triples += [
            (self.snapshoturi, po.friendshipXMLFileSizeMB, filesizerdf),
            (self.snapshoturi, po.friendshipTTLFileSizeMB, filesizettl),
            (self.snapshoturi, po.nFriendshipTriples, ntriples),
        ]
    if self.isinteraction:
        g = P.context(self.interaction_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Interaction.ttl",
                    "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Interaction.rdf",
                    "xml")
        c("serialized interaction")
        filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid +
                                      "Interaction.rdf") / (10**6)
        filesizettl = os.path.getsize(self.final_path_ + self.snapshotid +
                                      "Interaction.ttl") / (10**6)
        ntriples = len(g)
        triples += [
            (self.snapshoturi, po.interactionXMLFileSizeMB, filesizerdf),
            (self.snapshoturi, po.interactionTTLFileSizeMB, filesizettl),
            (self.snapshoturi, po.nInteractionTriples, ntriples),
        ]
    if self.hastext:
        g = P.context(self.posts_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Posts.ttl",
                    "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Posts.rdf", "xml")
        c("serialized posts")
        filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid +
                                      "Posts.rdf") / (10**6)
        filesizettl = os.path.getsize(self.final_path_ + self.snapshotid +
                                      "Posts.ttl") / (10**6)
        ntriples = len(g)
        triples += [
            (self.snapshoturi, po.postsXMLFileSizeMB, filesizerdf),
            (self.snapshoturi, po.postsTTLFileSizeMB, filesizettl),
            (self.snapshoturi, po.nPostsTriples, ntriples),
        ]
    # serialize the meta graph itself, with its own triple count added
    g = P.context(self.meta_graph)
    ntriples = len(g)
    triples += [
        (self.snapshoturi, po.nMetaTriples, ntriples),
    ]
    P.add(triples, context=self.meta_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")
    # copy the script that generates this code
    if not os.path.isdir(self.final_path_ + "scripts"):
        os.mkdir(self.final_path_ + "scripts")
    shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                self.final_path_ + "scripts/triplify.py")
    # copy of the base data
    if not os.path.isdir(self.final_path_ + "base"):
        os.mkdir(self.final_path_ + "base")
    originals = ""
    if self.isfriendship:
        shutil.copy(self.data_path + self.filename_friendships,
                    self.final_path_ + "base/")
        originals += "base/{}".format(self.filename_friendships)
        tfriendship = """\n\n{nf} individuals with metadata {fvars}
and {nfs} friendships constitute the friendship network in the RDF/XML file:
{frdf}
\nor in the Turtle file: \n{fttl}
(anonymized: {fan}).""".format(
            nf=self.nfriends,
            fvars=str(self.friendsvars),
            nfs=self.nfriendships,
            frdf=self.frdf,
            fttl=self.fttl,
            fan=self.friendships_anonymized,
        )
    else:
        tfriendship = ""
    if self.isinteraction:
        shutil.copy(self.data_path + self.filename_interactions,
                    self.final_path_ + "base/")
        tinteraction = """\n\n{} individuals with metadata {}
and {} interactions with metadata {}
constitute the interaction network in the RDF/XML file:
{}
or in the Turtle file:
{}
(anonymized: {}).""".format(self.ninteracted,
                            str(self.varsfriendsinteraction),
                            self.ninteractions, str(self.interactionsvars),
                            self.irdf, self.ittl,
                            self.interactions_anonymized)
        originals += "\nbase/{}".format(self.filename_interactions)
    else:
        tinteraction = ""
    if self.hastext:
        shutil.copy(self.data_path + self.filename_posts,
                    self.final_path_ + "base/")
        tposts = """\n\n{} posts with {:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}
posts data in the RDF/XML file:
{}
or in the Turtle file:
{}""".format(self.nposts, self.mcharsposts, self.dcharsposts,
             self.totalchars, self.mtokensposts, self.dtokensposts,
             self.totaltokens, self.prdf, self.pttl)
        originals += "\nbase/{}".format(self.filename_posts)
    else:
        tposts = ""
    # write a README; the acquisition date comes from the social graph
    datetime_string = P.get(r.URIRef(self.snapshoturi), po.dateObtained,
                            None, context="social_facebook")[2]
    with open(self.final_path_ + "README", "w") as f:
        f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the facebook
snapshot {snapid} collected around {date}.{tfriendship}{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nOriginal file(s): {origs}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}
The script that rendered this data publication is on the script/ directory.\n:::"""
                .format(snapid=self.snapshotid, date=datetime_string,
                        tfriendship=tfriendship,
                        tinteraction=tinteraction, tposts=tposts,
                        mrdf=self.mrdf, mttl=self.mttl, origs=originals,
                        ise=self.isego, isg=self.isgroup,
                        isf=self.isfriendship, isi=self.isinteraction,
                        ist=self.hastext, ava=self.online_prefix,
                        desc=self.desc))
def makeMetadata(self):
    """Build the meta graph for an IRC snapshot.

    Copies the snapshot's triples (and raw-file triples) from the social
    graph, computes corpus statistics (chars/tokens/sentences per
    message), records them as po:* triples, and assembles self.desc,
    self.mrdf and self.mttl for the publication step.
    """
    # carry over the snapshot description from the social graph,
    # including the triples about each raw file
    triples = P.get(self.snapshoturi, None, None, self.social_graph)
    for rawfile in P.get(self.snapshoturi, po.rawFile, None,
                         self.social_graph, strict=True, minimized=True):
        triples += P.get(rawfile, None, None, self.social_graph)
    P.add(triples, context=self.meta_graph)
    # corpus statistics: totals, means and standard deviations
    self.totalchars = sum(self.nchars_all)
    self.mcharsmessages = n.mean(self.nchars_all)
    self.dcharsmessages = n.std(self.nchars_all)
    self.totaltokens = sum(self.ntokens_all)
    self.mtokensmessages = n.mean(self.ntokens_all)
    self.dtokensmessages = n.std(self.ntokens_all)
    self.totalsentences = sum(self.nsentences_all)
    self.msentencesmessages = n.mean(self.nsentences_all)
    self.dsentencesmessages = n.std(self.nsentences_all)
    self.nparticipants = len(self.NICKS)
    self.nmessages = len(self.messageids)
    self.ntriples = len(P.context(self.irc_graph))
    triples = [
        (self.snapshoturi, po.nParticipants, self.nparticipants),
        (self.snapshoturi, po.nMessages, self.nmessages),
        (self.snapshoturi, po.nDirectMessages, self.ndirect),
        (self.snapshoturi, po.nUserMentions, self.nmention),
        (self.snapshoturi, po.nCharsOverall, self.totalchars),
        (self.snapshoturi, po.mCharsOverall, self.mcharsmessages),
        (self.snapshoturi, po.dCharsOverall, self.dcharsmessages),
        (self.snapshoturi, po.nTokensOverall, self.totaltokens),
        (self.snapshoturi, po.mTokensOverall, self.mtokensmessages),
        (self.snapshoturi, po.dTokensOverall, self.dtokensmessages),
        (self.snapshoturi, po.nSentencesOverall, self.totalsentences),
        (self.snapshoturi, po.mSentencesOverall, self.msentencesmessages),
        (self.snapshoturi, po.dSentencesOverall, self.dsentencesmessages),
    ]
    P.add(triples, context=self.meta_graph)
    # one triple per participant variable / log file name / online URL
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.ircParticipantAttribute] *
                             len(self.participantvars),
                             self.participantvars,
                             context=self.meta_graph)
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.logXMLFilename] * len(self.log_xml) +
                             [po.logTTLFilename] * len(self.log_ttl),
                             self.log_xml + self.log_ttl,
                             context=self.meta_graph)
    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [po.onlineLogXMLFile] * len(self.log_xml) +
        [po.onlineLogTTLFile] * len(self.log_ttl),
        [self.online_prefix + i for i in self.log_xml + self.log_ttl],
        context=self.meta_graph)
    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    # human-readable description used in the README and rdfs:comment
    self.desc = "irc dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
        self.snapshotid, self.snapshoturi, self.isego, self.isgroup, )
    self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
    self.desc += "isInteraction: {}.".format(self.isinteraction)
    self.desc += "\nnParticipants: {}; nInteractions: {} (directed messages+user mentions).".format(
        self.nparticipants, self.ndirect + self.nmention)
    self.desc += "\nisPost: {} (alias hasText: {})".format(
        self.hastext, self.hastext)
    self.desc += "\nnMessages: {}; ".format(self.nmessages)
    self.desc += "nDirectedMessages: {}; nUserMentions: {};".format(
        self.ndirect, self.nmention)
    self.desc += "\nnCharsOverall: {}; mCharsOverall: {}; dCharsOverall: {}.".format(
        self.totalchars, self.mcharsmessages, self.dcharsmessages)
    self.desc += "\nnTokensOverall: {}; mTokensOverall: {}; dTokensOverall: {};".format(
        self.totaltokens, self.mtokensmessages, self.dtokensmessages)
    self.desc += "\nnSentencesOverall: {}; mSentencesOverall: {}; dSentencesOverall: {};".format(
        self.totalsentences, self.msentencesmessages,
        self.dsentencesmessages)
    self.desc += "\nnURLs: {}; nAAMessages {}.".format(
        self.nurls, self.naamessages)
    # provenance / publication metadata
    triples = [
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, po.triplifiedBy, "scripts/"),
        # NOTE(review): snapshotid[:-4] presumably strips a fixed-width
        # suffix from the id to get the donor name — confirm upstream.
        (self.snapshoturi, po.donatedBy, self.snapshotid[:-4]),
        (self.snapshoturi, po.availableAt, self.online_prefix),
        (self.snapshoturi, po.onlineMetaXMLFile,
         self.online_prefix + self.mrdf),
        (self.snapshoturi, po.onlineMetaTTLFile,
         self.online_prefix + self.mttl),
        (self.snapshoturi, po.metaXMLFileName, self.mrdf),
        (self.snapshoturi, po.metaTTLFileName, self.mttl),
        (self.snapshoturi, po.totalXMLFileSizeMB, sum(self.size_xml)),
        (self.snapshoturi, po.totalTTLFileSizeMB, sum(self.size_ttl)),
        (self.snapshoturi, po.acquiredThrough, "channel text log"),
        (self.snapshoturi, po.socialProtocolTag, "IRC"),
        (self.snapshoturi, po.socialProtocol,
         P.rdf.ic(po.Platform, "IRC", self.meta_graph, self.snapshoturi)),
        (self.snapshoturi, po.nTriples, self.ntriples),
        (self.snapshoturi, NS.rdfs.comment, self.desc),
    ]
    P.add(triples, self.meta_graph)
def writeAllTW(self):
    """Publish the twitter snapshot: serialize the meta graph and write
    a README built from self.desc and the corpus statistics."""
    # write meta and readme with self.desc, finished.
    g=P.context(self.meta_graph)
    ntriples=len(g)
    triples=[
             (self.snapshoturi,po.nMetaTriples,ntriples),
             ]
    P.add(triples,context=self.meta_graph)
    g.namespace_manager.bind("po",po)
    g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
    g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
    c("serialized meta")
    # copy the script that generates this code
    if not os.path.isdir(self.final_path_+"scripts"):
        os.mkdir(self.final_path_+"scripts")
    shutil.copy(S.PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
    # copy of the base data
    tinteraction="""\n\n{} individuals with metadata {}
and {} interactions (retweets: {}, replies: {}, user_mentions: {})
constitute the interaction network in the RDF/XML file(s):
{} and the Turtle file(s):
{}
(anonymized: {}).""".format(
            self.nparticipants,str(self.participantvars),
            self.nretweets+self.nreplies+self.nuser_mentions,self.nretweets,self.nreplies,self.nuser_mentions,
            self.tweet_rdf,
            self.tweet_ttl,
            self.interactions_anonymized)
    tposts="""\n\nThe dataset consists of {} tweets with metadata {}
{:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}""".format(
            self.ntweets,str(self.tweetvars),
            self.mcharstweets,self.dcharstweets,self.totalchars,
            self.mtokenstweets,self.dtokenstweets,self.totaltokens,
            )
    # the tweet date range bounds reported in the README
    self.dates=[i.isoformat() for i in self.dates]
    date1=min(self.dates)
    date2=max(self.dates)
    with open(self.final_path_+"README","w") as f:
        f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the twitter
snapshot {snapid} with tweets from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}
The script that rendered this data publication is on the script/ directory.\n:::""".format(
                snapid=self.snapshotid,date1=date1,date2=date2,ntrip=self.ntriples,
                tinteraction=tinteraction,
                tposts=tposts,
                mrdf=self.mrdf,
                mttl=self.mttl,
                ise=self.isego,
                isg=self.isgroup,
                isf=self.isfriendship,
                isi=self.isinteraction,
                ist=self.hastext,
                ava=self.online_prefix,
                desc=self.desc
                ))
def parseLegacyFiles(data_dir=DATADIR):
    """Parse legacy Gmane email-list directories into RDF triples.

    Each subdirectory of data_dir whose files are named by message
    number becomes one po:GmaneSnapshot; the resulting triples are
    written to the percolation graph's 'gmane' context and summary
    counts are reported.

    Parameters:
        data_dir: root directory holding one subdirectory per list
            (``~`` is expanded).

    Returns:
        set of snapshot URIs, one per parsed directory.
    """
    data_dir = os.path.expanduser(data_dir)
    directories = os.listdir(data_dir)
    directories = [i for i in directories if os.path.isdir(data_dir + i)]
    snapshots = set()
    triples = []
    for directory in directories:
        # message files are named by their (possibly zero-padded) number
        all_files = [
            i for i in os.listdir(data_dir + directory) if i.isdigit()
        ]
        if all_files:
            # FIX: sort numerically; plain sort() is lexicographic and
            # misorders non-zero-padded names (e.g. "100" before "99"),
            # corrupting the first-last range in the snapshot id.
            all_files.sort(key=int)
            foo = all_files[0].lstrip("0")
            if not foo:  # the first message may be number 0
                foo = "0"
            snapshotid = "legacy-" + directory + "-" + foo + "-" + \
                all_files[-1].lstrip("0")
            snapshoturi = po.GmaneSnapshot + "#" + snapshotid
            expressed_classes = [
                po.GmaneParticipant, po.EmailPeer, po.EmailMessage
            ]
            expressed_reference = directory
            name_humanized = "Gmane email list with id " + \
                expressed_reference
            # total size (MB) of all files in the directory
            directorysize = sum(
                os.path.getsize(data_dir + directory + "/" + filename)
                for filename in os.listdir(data_dir + directory)) / 10**6
            fileformat = "mbox"
            directoryuri = po.Directory + "#gmane-" + directory
            # FIX: removed a duplicated (snapshoturi, a, po.Snapshot)
            # entry present in the original triple list.
            triples += [
                (snapshoturi, a, po.Snapshot),
                (snapshoturi, po.dataDir, data_dir),
                (snapshoturi, a, po.GmaneSnapshot),
                (snapshoturi, po.snapshotID, snapshotid),
                (snapshoturi, po.isEgo, False),
                (snapshoturi, po.isGroup, True),
                (snapshoturi, po.isFriendship, False),
                (snapshoturi, po.isInteraction, True),
                (snapshoturi, po.isPost, True),
                (snapshoturi, po.humanizedName, name_humanized),
                (snapshoturi, po.expressedReference, expressed_reference),
                (snapshoturi, po.rawDirectory, directoryuri),
                (directoryuri, po.directorySize, directorysize),
                (directoryuri, po.directoryName, directory),
                (directoryuri, po.fileFormat, fileformat),
            ] + [(directoryuri, po.expressedClass, expressed_class)
                 for expressed_class in expressed_classes]
            snapshots.add(snapshoturi)
    nsnapshots = ndirectories = len(directories)
    platformuri = P.rdf.ic(po.Platform, "Gmane", context="gmane")
    triples += [
        (NS.social.Session, NS.social.nGmaneParsedDirectories,
         ndirectories),
        (NS.social.Session, NS.social.nGmaneSnapshots, nsnapshots),
        (NS.social.Session, po.platform, platformuri),
    ]
    P.add(triples, context="gmane")
    c("parsed {} gmane data directories (=={} snapshots) are in percolation graph and 'gmane' context"
      .format(ndirectories, nsnapshots))
    c("percolation graph have {} triples ({} in gmane context)".format(
        len(P.percolation_graph), len(P.context("gmane"))))
    # summary counts over the freshly added triples
    negos = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isEgo true } } "
    )
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isGroup true } } "
    )
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isFriendship true } } "
    )
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isInteraction true } } "
    )
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isPost true } } "
    )
    totalsize = sum(
        P.query(
            r" SELECT ?size WHERE { GRAPH <gmane> { ?s po:directorySize ?size } } "
        ))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship structures. {} have an interaction structures. {} have texts
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships,
                                          ninteractions, nposts,
                                          totalsize))
    return snapshots
def parseLegacyFiles(datadir=DATADIR+"facebook/"):
    """Parse legacy gdf, gml and tab files of facebook structures.

    Filename syntax is <prefix><name><date><suffix><extension>, where:
    <prefix> is one of: avlab_ (obtained with participants at AVLAB),
        posavlab_ (obtained from participants), page_ (facebook pages),
        ego_ (ego networks); omitted for gml files and gdf group files.
    <name> is any string associated with the user or group delimiting
        the structure in the file, e.g. FernandValfro. It gets split
        with spaces before uppercase chunks for po:humanizedName:
        REM splits to REM, RFabbri to RFabbri.
    <date> is daymonthyear in 2/2/4 digits, e.g. 20032014 for
        20/March/2014.
    <suffix> is _interactions for interaction networks; omitted for
        friendship .gml/.gdf networks and .tab text/activity files.
    <extension> is .gml (ego friendship data), .gdf (group/ego,
        interaction and friendship network data) or .tab (post data,
        such as text).

    These render snapshots of two classes:
    po:FacebookEgoFriendshipSnapshot from .gml files and .gdf files
        with prefix avlab_, posavlab_ or ego_;
    po:FacebookGroupFriendshipInteractionSnapshot from .gdf files
        without prefix, with and without the _interaction suffix, and
        the .tab files; they form sets of files, all with friendship
        and interaction networks and some with a .tab file.

    ToDo:
    *) Implement parsing of page files.
    *) Implement parsing of new group files."""
    platformuri=P.rdf.ic(po.Platform,"Facebook",context="social_facebook")
    triples=[
             (platformuri, po.dataDir,datadir),
             ]
    filenames=os.listdir(datadir)
    filenames=[i for i in filenames if not i.endswith("swp")]
    # clean filenames: if they are equal except for extension, keep gml file
    snapshots=set()
    # groups: prefix, name, 8-digit date, optional suffix, extension
    regex=re.compile(r"^(avlab_|ego_|posavlab_|page_)*(.*?)(\d{8})(_interactions|_comments){0,1}\.(gdf|tab|gml)$")
    # splits camel-case names into chunks for the humanized name
    regex2=re.compile(r'([A-Z]{2,}(?=[A-Z]|$)|[A-Z][a-z]*)')
    for filename in filenames:
        prefix,name,date,sufix,format_=regex.findall(filename)[0]
        if prefix=="page_":
            c("page data currently not supported. Jumping", filename)
            continue
        filesize=os.path.getsize(datadir+filename)/(10**6) # size in megabytes
        # all files of one snapshot share the id of its plain .gdf file
        snapshotid=filename.replace("_interactions.gdf",".gdf").replace(".tab",".gdf")+"_fb"
        snapshoturi=po.FacebookSnapshot+"#"+snapshotid
        # <date> is ddmmyyyy, datetime.date wants (year, month, day)
        date_obtained=datetime.date(int(date[4:]),int(date[2:4]),int(date[:2]))
        name_humanized=" ".join(regex2.findall(name))
        # metadata tuple: (numeric id, string id[, published url])
        metadata=S.legacy.facebook.files.files_dict[filename.replace("_interactions.gdf",".gdf").replace(".tab",".gdf")]
        if metadata[0]:
            triples+=[(snapshoturi,po.numericID,metadata[0])]
        if metadata[1]:
            triples+=[(snapshoturi,po.stringID,metadata[1])]
        if len(metadata)==3:
            if not metadata[2]:
                c("group data without a publishing link: ",filename)
            else:
                triples+=[(snapshoturi,po.publishedURL,metadata[2])]
        if filename.endswith(".gml") or any(filename.startswith(i) for i in ("ego_","avlab_","posavlab_")):
            # ego snapshot: a single friendship network around one user
            isego=True
            isgroup=False
            isfriendship=True
            isinteraction=False
            isposts=False
            expressed_classes=(po.Friendship,po.Participant)
            if metadata[0]:
                expressed_reference=po.FacebookParticipant+"#"+snapshotid+"-"+metadata[0]
            else:
                if "Mirtes" in filename:
                    # known anonymized donor without a numeric id
                    expressed_reference=po.FacebookParticipant+"#"+"anon_mirtes"
                else:
                    raise ValueError("Numeric ID is needed for friendship networks")
            triples+=[(expressed_reference,a,po.FacebookParticipant),]
        else: # group snapshot
            isego=False
            isgroup=True
            # sibling files of the same snapshot determine which
            # networks are present
            ffilename=prefix+name+date+".gdf"
            ifilename=prefix+name+date+"_interactions.gdf"
            tfilename=prefix+name+date+".tab"
            isfriendship=ffilename in filenames
            isinteraction=ifilename in filenames
            isposts=tfilename in filenames
            if metadata[0]:
                expressed_reference=po.FacebookGroup+"#"+metadata[0]
            else:
                if metadata[1]:
                    expressed_reference=po.FacebookGroup+"#"+metadata[1]
                else:
                    raise ValueError("Numeric or string ID is needed for group networks")
            triples+=[(expressed_reference,a,po.FacebookGroup)]
            if filename==ffilename:
                expressed_classes=(po.Friendship,po.Participant)
            elif filename==ifilename:
                expressed_classes=(po.Interaction,po.Participant)
            elif format_=="tab":
                expressed_classes=(po.Post,)
            else:
                raise NameError("filename structure not understood")
        fileuri=NS.po.File+"#"+snapshotid+"-_file_-"+filename
        triples+=[
                  (snapshoturi, a, po.Snapshot),
                  (snapshoturi, a, po.FacebookSnapshot),
                  (snapshoturi, po.snapshotID, snapshotid),
                  (snapshoturi, po.isEgo, isego),
                  (snapshoturi, po.isGroup, isgroup),
                  (snapshoturi, po.isFriendship, isfriendship),
                  (snapshoturi, po.isInteraction, isinteraction),
                  (snapshoturi, po.isPost, isposts),
                  (snapshoturi, po.humanizedName, name_humanized),
                  (snapshoturi, po.dateObtained, date_obtained),
                  (snapshoturi, po.expressedReference, expressed_reference),
                  (snapshoturi, po.rawFile, fileuri),
                  (fileuri, po.fileSize, filesize),
                  (fileuri, po.fileName, filename),
                  (fileuri, po.fileFormat, format_),
                  ]+[
                  (fileuri, po.expressedClass, expressed_class) for expressed_class in expressed_classes
                  ]
        note=theNote(filename) # for avlab and posavlab
        if note:
            triples+=[
                      (snapshoturi,NS.rdfs.comment,note),
                      ]
        snapshots.add(snapshoturi)
    # data about the overall data in percolation graph
    nfiles=len(filenames)
    nsnapshots=len(snapshots)
    triples+=[
              (NS.social.Session,NS.social.nFacebookParsedFiles,nfiles),
              (NS.social.Session,NS.social.nFacebookSnapshots,nsnapshots),
              ]
    P.context("social_facebook","remove")
    P.add(triples,context="social_facebook")
    c("parsed {} facebook files ({} snapshots) are in percolation graph and 'social_facebook' context".format(nfiles,nsnapshots))
    c("percolation graph have {} triples ({} in social_facebook context)".format(len(P.percolation_graph),len(P.context("social_facebook"))))
    # summary counts over the freshly added triples
    negos=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isEgo true } ")
    ngroups=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isGroup true } ")
    nfriendships=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isFriendship true } ")
    ninteractions=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isInteraction true } ")
    nposts=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isPost true } ")
    totalsize=sum(P.query(r" SELECT ?size WHERE { ?s po:fileSize ?size } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. {} have post texts and reaction counts
Total raw data size is {:.2f}MB""".format(negos,ngroups,nfriendships,ninteractions,nposts,totalsize))
    return snapshots
def makeMetadata(self):
    """Summarize the triplified gmane snapshot and write its metadata triples.

    Aggregates the per-message character/token/sentence counts (raw and
    "clean" variants) into totals, means and standard deviations, adds
    one triple per statistic to the meta graph, links the snapshot to
    its attribute names and rendered files, builds the human-readable
    description ``self.desc``, and finally records provenance triples
    (renderer, timestamp, file sizes, protocol) for the snapshot URI.
    """
    # Totals, means and standard deviations over the raw message bodies.
    # NOTE(review): ``n`` is presumably numpy (n.mean/n.std) -- confirm import.
    self.totalchars = sum(self.nchars_all)
    self.mchars_messages = n.mean(self.nchars_all)
    self.dchars_messages = n.std(self.nchars_all)
    self.totaltokens = sum(self.ntokens_all)
    self.mtokens_messages = n.mean(self.ntokens_all)
    self.dtokens_messages = n.std(self.ntokens_all)
    self.totalsentences = sum(self.nsentences_all)
    self.msentences_messages = n.mean(self.nsentences_all)
    self.dsentences_messages = n.std(self.nsentences_all)
    # The same statistics over the cleaned message bodies.
    # NOTE(review): "clean" presumably means bodies after line removal
    # (see fremoved_lines below) -- confirm against the parsing code.
    self.totalchars_clean = sum(self.nchars_clean_all)
    self.mchars_messages_clean = n.mean(self.nchars_clean_all)
    self.dchars_messages_clean = n.std(self.nchars_clean_all)
    self.totaltokens_clean = sum(self.ntokens_clean_all)
    self.mtokens_messages_clean = n.mean(self.ntokens_clean_all)
    self.dtokens_messages_clean = n.std(self.ntokens_clean_all)
    self.totalsentences_clean = sum(self.nsentences_clean_all)
    self.msentences_messages_clean = n.mean(self.nsentences_clean_all)
    self.dsentences_messages_clean = n.std(self.nsentences_clean_all)
    # Fraction of lines discarded during cleaning.
    # NOTE(review): raises ZeroDivisionError when self.nlines == 0 -- confirm
    # callers guarantee at least one line.
    fremoved_lines = self.nremoved_lines / self.nlines
    # One triple per scalar statistic, all attached to the snapshot URI.
    triples = [
        (self.snapshoturi, po.nParticipants, self.nparticipants),
        (self.snapshoturi, po.nMessages, self.nmessages),
        (self.snapshoturi, po.nEmptyMessages, self.nempty),
        (self.snapshoturi, po.nReplies, self.nreplies),
        (self.snapshoturi, po.nCC, self.ncc),
        (self.snapshoturi, po.nTo, self.nto),
        (self.snapshoturi, po.nReferences, self.nreferences),
        (self.snapshoturi, po.nUrls, self.nurls),
        (self.snapshoturi, po.nCharsOverall, self.totalchars),
        (self.snapshoturi, po.mCharsOverall, self.mchars_messages),
        (self.snapshoturi, po.dCharsOverall, self.dchars_messages),
        (self.snapshoturi, po.nTokensOverall, self.totaltokens),
        (self.snapshoturi, po.mTokensOverall, self.mtokens_messages),
        (self.snapshoturi, po.dTokensOverall, self.dtokens_messages),
        (self.snapshoturi, po.nSentencesOverall, self.totalsentences),
        (self.snapshoturi, po.mSentencesOverall, self.msentences_messages),
        (self.snapshoturi, po.dSentencesOverall, self.dsentences_messages),
        (self.snapshoturi, po.nCharsOverallClean, self.totalchars_clean),
        (self.snapshoturi, po.mCharsOverallClean, self.mchars_messages_clean),
        (self.snapshoturi, po.dCharsOverallClean, self.dchars_messages_clean),
        (self.snapshoturi, po.nTokensOverallClean, self.totaltokens_clean),
        (self.snapshoturi, po.mTokensOverallClean, self.mtokens_messages_clean),
        (self.snapshoturi, po.dTokensOverallClean, self.dtokens_messages_clean),
        (self.snapshoturi, po.nSentencesOverallClean, self.totalsentences_clean),
        (self.snapshoturi, po.mSentencesOverallClean, self.msentences_messages_clean),
        (self.snapshoturi, po.dSentencesOverallClean, self.dsentences_messages_clean),
        (self.snapshoturi, po.fRemovedLines, fremoved_lines),
    ]
    P.add(triples, context=self.meta_graph)
    # Scaffolding triples: which participant/message attributes were kept.
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.gmaneParticipantAttribute] * len(self.participantvars),
                             self.participantvars, context=self.meta_graph)
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.gmaneMessageAttribute] * len(self.messagevars),
                             self.messagevars, context=self.meta_graph)
    # Scaffolding triples: local file names of the rendered XML/TTL files...
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.emailXMLFilename] * len(self.email_xml) + [po.emailTTLFilename] * len(self.email_ttl),
                             self.email_xml + self.email_ttl, context=self.meta_graph)
    # ...and their online counterparts under the publication prefix.
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.onlineEmailXMLFile] * len(self.email_xml) + [po.onlineEmailTTLFile] * len(self.email_ttl),
                             [self.online_prefix + i for i in self.email_xml + self.email_ttl],
                             context=self.meta_graph)
    # Names of the metadata files rendered later from the meta graph.
    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    # Human-readable description of the dataset, also stored as an
    # rdfs:comment triple below.
    self.desc = "gmane public email list dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
        self.snapshotid, self.snapshoturi, self.isego, self.isgroup,)
    self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
    self.desc += "isInteraction: {}.".format(self.isinteraction)
    self.desc += "\nnParticipants: {}; nInteractions: {} (replies+references+cc+to).".format(
        self.nparticipants, self.nreplies + self.nreferences + self.ncc + self.nto)
    self.desc += "\nisPost: {} (alias hasText: {})".format(self.hastext, self.hastext)
    self.desc += "\nnMessages: {} (+ empty: {}); ".format(self.nmessages, self.nempty)
    self.desc += "nReplies: {}; nReferences: {}; nTo {}; nCC: {}.".format(
        self.nreplies, self.nreferences, self.ncc, self.nto)
    self.desc += "\nnChars: {}; mChars: {}; dChars: {}.".format(
        self.totalchars, self.mchars_messages, self.dchars_messages)
    self.desc += "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
        self.totaltokens, self.mtokens_messages, self.dtokens_messages)
    self.desc += "\nnSentences: {}; mSentences: {}; dSentences: {}.".format(
        self.totalsentences, self.msentences_messages, self.dsentences_messages)
    self.desc += "\nnCharsClean: {}; mCharsClean: {}; dCharsClean: {}.".format(
        self.totalchars_clean, self.mchars_messages_clean, self.dchars_messages_clean)
    self.desc += "\nnTokensClean: {}; mTokensClean: {}; dTokensClean: {};".format(
        self.totaltokens_clean, self.mtokens_messages_clean, self.dtokens_messages_clean)
    self.desc += "\nnSentencesClean: {}; mSentencesClean: {}; dSentencesClean: {}.".format(
        self.totalsentences_clean, self.msentences_messages_clean, self.dsentences_messages_clean)
    self.desc += "\nnUrls: {}; fRemovedLines {};.".format(self.nurls, fremoved_lines)
    # Size (in triples) of the already-rendered translation graph.
    self.ntriples = len(P.context(self.translation_graph))
    # Provenance: who rendered the snapshot, when, and where it lives.
    triples = [
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, po.triplifiedBy, "scripts/"),
        (self.snapshoturi, po.donatedBy, self.snapshotid),
        (self.snapshoturi, po.availableAt, self.online_prefix),
        (self.snapshoturi, po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
        (self.snapshoturi, po.onlineMetaTTLFile, self.online_prefix + self.mttl),
        (self.snapshoturi, po.metaXMLFileName, self.mrdf),
        (self.snapshoturi, po.metaTTLFileName, self.mttl),
        (self.snapshoturi, po.totalXMLFileSizeMB, sum(self.size_xml)),
        (self.snapshoturi, po.totalTTLFileSizeMB, sum(self.size_ttl)),
        (self.snapshoturi, po.acquiredThrough, "Gmane public mailing list archive RSS feed"),
        (self.snapshoturi, po.socialProtocolTag, "Gmane"),
        # P.rdf.ic instantiates the Platform individual and returns its URI.
        (self.snapshoturi, po.socialProtocol, P.rdf.ic(po.Platform, "Gmane", self.meta_graph, self.snapshoturi)),
        (self.snapshoturi, po.nTriples, self.ntriples),
        (self.snapshoturi, NS.rdfs.comment, self.desc),
        (self.snapshoturi, po.gmaneID, self.directory),
    ]
    P.add(triples, context=self.meta_graph)
def makeMetadata(self):
    """Compute snapshot statistics and publish gmane metadata triples.

    Totals, means and standard deviations of the character, token and
    sentence counts (raw and clean) are stored on ``self``, added to the
    meta graph together with scaffolding triples for attribute and file
    names, and summarized in the textual description ``self.desc``.
    """
    uri = self.snapshoturi
    graph = self.meta_graph
    # Derive total/mean/std attributes for every (kind, cleanliness)
    # combination, e.g. totalchars, mchars_messages, dchars_messages and
    # their *_clean counterparts, from the corresponding n<kind>*_all lists.
    for kind in ("chars", "tokens", "sentences"):
        for variant in ("", "_clean"):
            counts = getattr(self, "n{}{}_all".format(kind, variant))
            setattr(self, "total{}{}".format(kind, variant), sum(counts))
            setattr(self, "m{}_messages{}".format(kind, variant), n.mean(counts))
            setattr(self, "d{}_messages{}".format(kind, variant), n.std(counts))
    removed_fraction = self.nremoved_lines / self.nlines
    # One triple per scalar statistic, all attached to the snapshot URI.
    stat_pairs = (
        (po.nParticipants, self.nparticipants),
        (po.nMessages, self.nmessages),
        (po.nEmptyMessages, self.nempty),
        (po.nReplies, self.nreplies),
        (po.nCC, self.ncc),
        (po.nTo, self.nto),
        (po.nReferences, self.nreferences),
        (po.nUrls, self.nurls),
        (po.nCharsOverall, self.totalchars),
        (po.mCharsOverall, self.mchars_messages),
        (po.dCharsOverall, self.dchars_messages),
        (po.nTokensOverall, self.totaltokens),
        (po.mTokensOverall, self.mtokens_messages),
        (po.dTokensOverall, self.dtokens_messages),
        (po.nSentencesOverall, self.totalsentences),
        (po.mSentencesOverall, self.msentences_messages),
        (po.dSentencesOverall, self.dsentences_messages),
        (po.nCharsOverallClean, self.totalchars_clean),
        (po.mCharsOverallClean, self.mchars_messages_clean),
        (po.dCharsOverallClean, self.dchars_messages_clean),
        (po.nTokensOverallClean, self.totaltokens_clean),
        (po.mTokensOverallClean, self.mtokens_messages_clean),
        (po.dTokensOverallClean, self.dtokens_messages_clean),
        (po.nSentencesOverallClean, self.totalsentences_clean),
        (po.mSentencesOverallClean, self.msentences_messages_clean),
        (po.dSentencesOverallClean, self.dsentences_messages_clean),
        (po.fRemovedLines, removed_fraction),
    )
    P.add([(uri, pred, obj) for pred, obj in stat_pairs], context=graph)
    # Scaffolding triples: which participant/message attributes were kept.
    P.rdf.triplesScaffolding(
        uri, [po.gmaneParticipantAttribute] * len(self.participantvars),
        self.participantvars, context=graph)
    P.rdf.triplesScaffolding(
        uri, [po.gmaneMessageAttribute] * len(self.messagevars),
        self.messagevars, context=graph)
    # Scaffolding triples: local and online names of the rendered files.
    file_predicates = ([po.emailXMLFilename] * len(self.email_xml)
                       + [po.emailTTLFilename] * len(self.email_ttl))
    file_names = self.email_xml + self.email_ttl
    P.rdf.triplesScaffolding(uri, file_predicates, file_names, context=graph)
    online_predicates = ([po.onlineEmailXMLFile] * len(self.email_xml)
                         + [po.onlineEmailTTLFile] * len(self.email_ttl))
    P.rdf.triplesScaffolding(
        uri, online_predicates,
        [self.online_prefix + name for name in file_names], context=graph)
    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    # Human-readable description, assembled from independent segments and
    # stored below as an rdfs:comment triple.
    segments = [
        "gmane public email list dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid, self.snapshoturi, self.isego, self.isgroup),
        "\nisFriendship: {}; ".format(self.isfriendship),
        "isInteraction: {}.".format(self.isinteraction),
        "\nnParticipants: {}; nInteractions: {} (replies+references+cc+to).".format(
            self.nparticipants,
            self.nreplies + self.nreferences + self.ncc + self.nto),
        "\nisPost: {} (alias hasText: {})".format(self.hastext, self.hastext),
        "\nnMessages: {} (+ empty: {}); ".format(self.nmessages, self.nempty),
        "nReplies: {}; nReferences: {}; nTo {}; nCC: {}.".format(
            self.nreplies, self.nreferences, self.ncc, self.nto),
        "\nnChars: {}; mChars: {}; dChars: {}.".format(
            self.totalchars, self.mchars_messages, self.dchars_messages),
        "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
            self.totaltokens, self.mtokens_messages, self.dtokens_messages),
        "\nnSentences: {}; mSentences: {}; dSentences: {}.".format(
            self.totalsentences, self.msentences_messages,
            self.dsentences_messages),
        "\nnCharsClean: {}; mCharsClean: {}; dCharsClean: {}.".format(
            self.totalchars_clean, self.mchars_messages_clean,
            self.dchars_messages_clean),
        "\nnTokensClean: {}; mTokensClean: {}; dTokensClean: {};".format(
            self.totaltokens_clean, self.mtokens_messages_clean,
            self.dtokens_messages_clean),
        "\nnSentencesClean: {}; mSentencesClean: {}; dSentencesClean: {}.".format(
            self.totalsentences_clean, self.msentences_messages_clean,
            self.dsentences_messages_clean),
        "\nnUrls: {}; fRemovedLines {};.".format(self.nurls, removed_fraction),
    ]
    self.desc = "".join(segments)
    # Size (in triples) of the already-rendered translation graph.
    self.ntriples = len(P.context(self.translation_graph))
    # Provenance: who rendered the snapshot, when, and where it lives.
    provenance = [
        (uri, po.triplifiedIn, datetime.datetime.now()),
        (uri, po.triplifiedBy, "scripts/"),
        (uri, po.donatedBy, self.snapshotid),
        (uri, po.availableAt, self.online_prefix),
        (uri, po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
        (uri, po.onlineMetaTTLFile, self.online_prefix + self.mttl),
        (uri, po.metaXMLFileName, self.mrdf),
        (uri, po.metaTTLFileName, self.mttl),
        (uri, po.totalXMLFileSizeMB, sum(self.size_xml)),
        (uri, po.totalTTLFileSizeMB, sum(self.size_ttl)),
        (uri, po.acquiredThrough,
         "Gmane public mailing list archive RSS feed"),
        (uri, po.socialProtocolTag, "Gmane"),
        # P.rdf.ic instantiates the Platform individual and returns its URI.
        (uri, po.socialProtocol, P.rdf.ic(po.Platform, "Gmane", graph, uri)),
        (uri, po.nTriples, self.ntriples),
        (uri, NS.rdfs.comment, self.desc),
        (uri, po.gmaneID, self.directory),
    ]
    P.add(provenance, context=graph)
def writeAllFB(self):
    """Render the facebook snapshot publication directory.

    Serializes the friendship and meta graphs to Turtle and RDF/XML,
    copies the triplification script and the original base data file
    into the publication tree, and writes a README describing the
    snapshot.  All output goes under ``self.final_path_``.
    """
    c("started rendering of the snapshot publication. snapshotID:",
      self.snapshotid)
    self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
    if not os.path.isdir(self.final_path_):
        os.mkdir(self.final_path_)
    # Friendship network graph -> Turtle and RDF/XML files.
    g = P.context(self.friendship_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_ + self.snapshotid + "Friendship.ttl",
                "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Friendship.rdf",
                "xml")
    c("serialized friendships")
    # Meta graph -> Turtle and RDF/XML files.  (Previously-disabled
    # file-size/triple-count bookkeeping removed; recover it from version
    # control if those metadata triples are needed again.)
    g = P.context(self.meta_graph)
    g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")
    # Ship the rendering script alongside the published data.
    if not os.path.isdir(self.final_path_ + "scripts"):
        os.mkdir(self.final_path_ + "scripts")
    shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                self.final_path_ + "scripts/triplify.py")
    # Copy of the original (base) data file.
    if not os.path.isdir(self.final_path_ + "base"):
        os.mkdir(self.final_path_ + "base")
    shutil.copy(self.data_path + self.filename_friendships,
                self.final_path_ + "base/")
    originals = "base/{}".format(self.filename_friendships)
    # BUG FIX: the template previously contained "\in" -- an invalid
    # escape sequence that rendered literally as "\in" (and warns on
    # modern CPython); "\n" followed by the word "in" was intended.
    tfriendship = ("\n\n{nf} individuals with metadata {fvars} and {nfs} "
                   "friendships constitute the friendship network in the "
                   "RDF/XML file: {frdf} \nin the Turtle file: \n{fttl} "
                   "(anonymized {fan}).").format(
        nf=self.nfriends,
        fvars=str(self.friendsvars),
        nfs=self.nfriendships,
        frdf=self.frdf,
        fttl=self.fttl,
        fan=self.friendships_anonymized,
    )
    # dateObtained triple: [2] is the object (the stored date value).
    datetime_string = P.get(self.snapshoturi, po.dateObtained, None,
                            context=self.social_graph)[2]
    readme = ("::: Open Linked Social Data publication "
              "\nThis repository is a RDF data expression of the facebook "
              "snapshot {snapid} collected around {date}.{tfriendship} "
              "\nMetadata for discovery in the RDF/XML file: {mrdf} "
              "\nor in the Turtle file:\n{mttl} "
              "\nOriginal file(s): {origs} "
              "\nEgo network: {ise} Group network: {isg} "
              "Friendship network: {isf} "
              "Interaction network: {isi} Has text/posts: {ist} "
              "\nAll files should be available at the git repository: "
              "{ava} \n{desc} "
              "The script that rendered this data publication is on the "
              "script/ directory.\n:::").format(
        snapid=self.snapshotid, date=datetime_string,
        tfriendship=tfriendship, mrdf=self.mrdf, mttl=self.mttl,
        origs=originals, ise=self.isego, isg=self.isgroup,
        isf=self.isfriendship, isi=self.isinteraction,
        ist=self.hastext, ava=self.online_prefix, desc=self.desc)
    # utf-8 explicitly: self.desc may carry non-ASCII participant data.
    with open(self.final_path_ + "README", "w", encoding="utf-8") as f:
        f.write(readme)