# Import script for a users.xml dump: streams every <row> element and maps
# its attributes onto a document for the repository opened under the name
# "users" (presumably the target collection — confirm against mongoRep).
from lxml import etree
from datetime import datetime
import settings
from mongoRepository import mongoRep
import argparse

parser = argparse.ArgumentParser()
# --path is mandatory: without it filePath would be None and the script would
# only fail later, inside etree.iterparse, after the repository was created.
parser.add_argument('--path', required=True,
                    help='absolute path for users.xml')
args = parser.parse_args()
filePath = args.path

settingsData = settings.get()
rep = mongoRep(settingsData["connectionString"], "users")

# Layout of the CreationDate attribute (fractional seconds included).
dateFormat = "%Y-%m-%dT%H:%M:%S.%f"

# Only 'end' events for <row> tags are requested, i.e. the loop sees each row
# once it has been parsed completely.
context = etree.iterparse(filePath, events=('end', ), tag='row')
for event, elem in context:
    attrs = elem.attrib
    doc = {"_id": int(attrs["Id"])}
    # Age is the only attribute treated as optional; every other lookup
    # raises KeyError when the attribute is missing.
    if "Age" in attrs:
        doc["age"] = int(attrs["Age"])
    doc["name"] = str(attrs["DisplayName"])
    doc["up"] = int(attrs["UpVotes"])
    doc["down"] = int(attrs["DownVotes"])
    doc["rep"] = int(attrs["Reputation"])
    doc["accId"] = int(attrs["AccountId"])
    # Sub-second precision is dropped before the value is stored.
    doc["date"] = datetime.strptime(
        str(attrs["CreationDate"]), dateFormat).replace(microsecond=0)
    # NOTE(review): nothing in this section persists `doc` or releases `elem`
    # (no rep.insert / elem.clear call is visible) — confirm the remainder of
    # the loop body does both.
# Import script for a comments.xml dump: streams every <row> element, maps
# its attributes onto a document and collects the documents in `buffer` for
# the repository opened under the name "comments".
from lxml import etree  # required: etree.iterparse is used below
from datetime import datetime
import settings
from mongoRepository import mongoRep
import argparse

parser = argparse.ArgumentParser()
# --path is mandatory: without it filePath would be None and the script would
# only fail later, inside etree.iterparse, after the repository was created.
parser.add_argument('--path', required=True,
                    help='absolute path for comments.xml')
args = parser.parse_args()
filePath = args.path

settingsData = settings.get()
rep = mongoRep(settingsData["connectionString"], "comments")

# Batching state. Only `buffer` is used in the visible part of the script;
# bufferLength and i are presumably consumed by flush logic further down.
buffer = []
bufferLength = 1000
i = 1

# Layout of the CreationDate attribute (fractional seconds included).
dateFormat = "%Y-%m-%dT%H:%M:%S.%f"

# Only 'end' events for <row> tags are requested, i.e. the loop sees each row
# once it has been parsed completely.
context = etree.iterparse(filePath, events=('end',), tag='row')
for event, elem in context:
    attrs = elem.attrib
    doc = {"_id": int(attrs["Id"])}
    # UserId is the only attribute treated as optional; every other lookup
    # raises KeyError when the attribute is missing.
    if "UserId" in attrs:
        doc["user"] = int(attrs["UserId"])
    doc["post"] = int(attrs["PostId"])
    # Sub-second precision is dropped before the value is stored.
    doc["date"] = datetime.strptime(
        str(attrs["CreationDate"]), dateFormat).replace(microsecond=0)
    doc["text"] = str(attrs["Text"])
    doc["score"] = int(attrs["Score"])
    buffer.append(doc)
    # NOTE(review): no flush of `buffer` and no release of `elem` is visible
    # in this section — confirm the remainder of the loop body does both,
    # otherwise memory grows with the size of the input file.
# Import script for a votes.xml dump: streams every <row> element, maps its
# attributes onto a document and inserts it through the repository opened
# under the name "votes", one row at a time.
from lxml import etree
from datetime import datetime
import settings
from mongoRepository import mongoRep
import argparse

parser = argparse.ArgumentParser()
# --path is mandatory: without it filePath would be None and the script would
# only fail later, inside etree.iterparse, after the repository was created.
parser.add_argument('--path', required=True,
                    help='absolute path for votes.xml')
args = parser.parse_args()
filePath = args.path

settingsData = settings.get()
rep = mongoRep(settingsData["connectionString"], "votes")

# Layout of the CreationDate attribute (fractional seconds included).
dateFormat = "%Y-%m-%dT%H:%M:%S.%f"

# Only 'end' events for <row> tags are requested, i.e. the loop sees each row
# once it has been parsed completely.
context = etree.iterparse(filePath, events=('end', ), tag='row')
for event, elem in context:
    attrs = elem.attrib
    doc = {"_id": int(attrs["Id"])}
    doc["type"] = int(attrs["VoteTypeId"])
    doc["post"] = int(attrs["PostId"])
    # UserId is the only attribute treated as optional; every other lookup
    # raises KeyError when the attribute is missing.
    if "UserId" in attrs:
        doc["user"] = int(attrs["UserId"])
    # Sub-second precision is dropped before the value is stored.
    doc["date"] = datetime.strptime(
        str(attrs["CreationDate"]), dateFormat).replace(microsecond=0)
    # The value returned by rep.insert is echoed for every row.
    print(rep.insert(doc))

    # iterparse keeps building the tree while it streams. Release the row
    # that was just stored and drop the already-processed siblings in front
    # of it, so memory use does not grow with the size of the input file.
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]
# Import script for a PostHistory.xml dump: streams every <row> element and
# maps its attributes onto a document for the repository opened under the
# name "history".
from lxml import etree  # required: etree.iterparse is used below
from datetime import datetime
import settings
from mongoRepository import mongoRep
import uuid
import argparse
from parse import findall

parser = argparse.ArgumentParser()
# --path is mandatory: without it filePath would be None and the script would
# only fail later, inside etree.iterparse, after the repository was created.
parser.add_argument('--path', required=True,
                    help='absolute path for PostHistory.xml')
args = parser.parse_args()
filePath = args.path

settingsData = settings.get()
# `parse`-style pattern for one `<...>` token; presumably applied with
# findall() to a row's Tags attribute further down — not visible in this
# section.
tagsSearchPattern = "<{}>"
rep = mongoRep(settingsData["connectionString"], "history")

# Only 'end' events for <row> tags are requested, i.e. the loop sees each row
# once it has been parsed completely.
context = etree.iterparse(filePath, events=('end',), tag='row')

# Batching state; none of it is used in the visible part of the loop, so it
# is presumably consumed by flush logic further down.
buffer = []
bufferLength = 1000
i = 1

for event, elem in context:
    attrs = elem.attrib
    # Every lookup below raises KeyError when the attribute is missing.
    doc = {"_id": int(attrs["Id"])}
    doc["history"] = int(attrs["PostHistoryTypeId"])
    doc["post"] = int(attrs["PostId"])
    doc["rev"] = str(attrs["RevisionGUID"])
# Import script for a links.xml dump: streams every <row> element, maps its
# attributes onto a document and collects the documents in `buffer` for the
# repository opened under the name "links".
from lxml import etree
from datetime import datetime
import settings
from mongoRepository import mongoRep
import argparse

parser = argparse.ArgumentParser()
# --path is mandatory: without it filePath would be None and the script would
# only fail later, inside etree.iterparse, after the repository was created.
parser.add_argument('--path', required=True,
                    help='absolute path for links.xml')
args = parser.parse_args()
filePath = args.path

settingsData = settings.get()
rep = mongoRep(settingsData["connectionString"], "links")

# Batching state. Only `buffer` is used in the visible part of the script;
# bufferLength and i are presumably consumed by flush logic further down.
buffer = []
bufferLength = 1000
i = 1

# Layout of the CreationDate attribute (fractional seconds included).
dateFormat = "%Y-%m-%dT%H:%M:%S.%f"

# Only 'end' events for <row> tags are requested, i.e. the loop sees each row
# once it has been parsed completely.
context = etree.iterparse(filePath, events=('end', ), tag='row')
for event, elem in context:
    attrs = elem.attrib
    # Every lookup below raises KeyError when the attribute is missing.
    doc = {"_id": int(attrs["Id"])}
    doc["post"] = int(attrs["PostId"])
    doc["rel"] = int(attrs["RelatedPostId"])
    # Sub-second precision is dropped before the value is stored.
    doc["date"] = datetime.strptime(
        str(attrs["CreationDate"]), dateFormat).replace(microsecond=0)
    doc["link"] = int(attrs["LinkTypeId"])
    buffer.append(doc)
    # NOTE(review): no flush of `buffer` and no release of `elem` is visible
    # in this section — confirm the remainder of the loop body does both,
    # otherwise memory grows with the size of the input file.
# Import script for a Posts.xml dump: streams every <row> element and maps
# its attributes onto a document for the repository opened under the name
# "posts".
from lxml import etree
from datetime import datetime
import settings
from mongoRepository import mongoRep
import argparse
from parse import findall

parser = argparse.ArgumentParser()
# --path is mandatory: without it filePath would be None and the script would
# only fail later, inside etree.iterparse, after the repository was created.
parser.add_argument('--path', required=True,
                    help='absolute path for Posts.xml')
args = parser.parse_args()
filePath = args.path

settingsData = settings.get()
# `parse`-style pattern for one `<...>` token; presumably applied with
# findall() to a row's Tags attribute further down — not visible in this
# section.
tagsSearchPattern = "<{}>"
rep = mongoRep(settingsData["connectionString"], "posts")

# Only 'end' events for <row> tags are requested, i.e. the loop sees each row
# once it has been parsed completely.
context = etree.iterparse(filePath, events=('end', ), tag='row')

# Batching state; none of it is used in the visible part of the loop, so it
# is presumably consumed by flush logic further down.
buffer = []
bufferLength = 1000
i = 1

for event, elem in context:
    attrs = elem.attrib
    doc = {"_id": int(attrs["Id"])}
    doc["type"] = int(attrs["PostTypeId"])
    # AcceptedAnswerId is treated as optional; Id and PostTypeId raise
    # KeyError when missing.
    if "AcceptedAnswerId" in attrs:
        doc["accepted"] = int(attrs["AcceptedAnswerId"])