Пример #1
0
from lxml import etree
from datetime import datetime
import settings
from mongoRepository import mongoRep

import argparse

# Command-line interface: the XML dump location is the script's only input.
parser = argparse.ArgumentParser()
# --path is mandatory. Left optional, a missing flag yields filePath = None
# and the failure only surfaces later inside etree.iterparse; argparse now
# reports the missing argument up front with a usage message.
parser.add_argument('--path', required=True,
                    help='absolute path for users.xml')
args = parser.parse_args()

filePath = args.path

settingsData = settings.get()

# Repository built from the configured connection string; "users" is
# presumably the target collection name (confirm against mongoRepository).
rep = mongoRep(settingsData["connectionString"], "users")

# Incremental parse of the dump: only 'end' events for <row> elements are
# produced, so each row is handed over once it has been read completely.
context = etree.iterparse(filePath, events=('end', ), tag='row')

# Turn every <row> element into a typed document. Keys are added in a fixed
# order; "age" is only present when the row carries an Age attribute.
for event, elem in context:
    attrs = elem.attrib
    doc = {"_id": int(attrs["Id"])}
    if "Age" in attrs:
        doc["age"] = int(attrs["Age"])
    doc.update({
        "name": str(attrs["DisplayName"]),
        "up": int(attrs["UpVotes"]),
        "down": int(attrs["DownVotes"]),
        "rep": int(attrs["Reputation"]),
        "accId": int(attrs["AccountId"]),
        # Timestamps are stored with the fractional seconds dropped.
        "date": datetime.strptime(
            str(attrs["CreationDate"]),
            "%Y-%m-%dT%H:%M:%S.%f",
        ).replace(microsecond=0),
    })
Пример #2
0
from datetime import datetime
import settings
from mongoRepository import mongoRep

import argparse


# etree.iterparse is used below, but this script's import section does not
# bring lxml into scope; without this import the script stops with a
# NameError before reading a single row.
from lxml import etree

# Command-line interface: the XML dump location is the script's only input.
parser = argparse.ArgumentParser()
# --path is mandatory. Left optional, a missing flag yields filePath = None
# and the failure only surfaces later inside etree.iterparse; argparse now
# reports the missing argument up front with a usage message.
parser.add_argument('--path', required=True,
                    help='absolute path for comments.xml')
args = parser.parse_args()

filePath = args.path

settingsData = settings.get()

# Repository built from the configured connection string; "comments" is
# presumably the target collection name (confirm against mongoRepository).
rep = mongoRep(settingsData["connectionString"], "comments")

# Documents are collected in `buffer`; bufferLength is presumably the batch
# size at which the buffer gets flushed, and `i` a row counter -- the code
# consuming them is not part of this excerpt.
buffer = []
bufferLength = 1000
i = 1
# Incremental parse of the dump: only 'end' events for <row> elements are
# produced, so each row is handed over once it has been read completely.
context = etree.iterparse(filePath, events=('end',), tag='row')
 
# Turn every <row> element into a typed document and queue it in `buffer`.
# "user" is only present when the row carries a UserId attribute.
for event, elem in context:
    attrs = elem.attrib
    doc = {"_id": int(attrs["Id"])}
    if "UserId" in attrs:
        doc["user"] = int(attrs["UserId"])
    doc.update({
        "post": int(attrs["PostId"]),
        # Timestamps are stored with the fractional seconds dropped.
        "date": datetime.strptime(
            str(attrs["CreationDate"]),
            "%Y-%m-%dT%H:%M:%S.%f",
        ).replace(microsecond=0),
        "text": str(attrs["Text"]),
        "score": int(attrs["Score"]),
    })
    buffer.append(doc)
Пример #3
0
from lxml import etree
from datetime import datetime
import settings
from mongoRepository import mongoRep

import argparse

# Command-line interface: the XML dump location is the script's only input.
parser = argparse.ArgumentParser()
# --path is mandatory. Left optional, a missing flag yields filePath = None
# and the failure only surfaces later inside etree.iterparse; argparse now
# reports the missing argument up front with a usage message.
parser.add_argument('--path', required=True,
                    help='absolute path for votes.xml')
args = parser.parse_args()

filePath = args.path

settingsData = settings.get()

# Repository built from the configured connection string; "votes" is
# presumably the target collection name (confirm against mongoRepository).
rep = mongoRep(settingsData["connectionString"], "votes")

# Incremental parse of the dump: only 'end' events for <row> elements are
# produced, so each row is handed over once it has been read completely.
context = etree.iterparse(filePath, events=('end', ), tag='row')

# Turn every <row> element into a typed document, insert it through the
# repository and print whatever the insert call returns.
for event, elem in context:
    attrs = elem.attrib
    doc = {
        "_id": int(attrs["Id"]),
        "type": int(attrs["VoteTypeId"]),
        "post": int(attrs["PostId"]),
    }
    # "user" is only present when the row carries a UserId attribute.
    if "UserId" in attrs:
        doc["user"] = int(attrs["UserId"])
    created = datetime.strptime(
        str(attrs["CreationDate"]), "%Y-%m-%dT%H:%M:%S.%f")
    # Timestamps are stored with the fractional seconds dropped.
    doc["date"] = created.replace(microsecond=0)
    outcome = rep.insert(doc)
    print(outcome)
Пример #4
0
from datetime import datetime
import settings
from mongoRepository import mongoRep
import uuid

import argparse
from parse import findall
# etree.iterparse is used below, but this script's import section does not
# bring lxml into scope; without this import the script stops with a
# NameError before reading a single row.
from lxml import etree

# Command-line interface: the XML dump location is the script's only input.
parser = argparse.ArgumentParser()
# --path is mandatory. Left optional, a missing flag yields filePath = None
# and the failure only surfaces later inside etree.iterparse; argparse now
# reports the missing argument up front with a usage message.
parser.add_argument('--path', required=True,
                    help='absolute path for PostHistory.xml')
args = parser.parse_args()

filePath = args.path

settingsData = settings.get()
# Pattern for parse.findall that captures the text between '<' and '>';
# in the visible code it is referenced only by the disabled debugging
# snippet inside the row loop.
tagsSearchPattern = "<{}>"
# Repository built from the configured connection string; "history" is
# presumably the target collection name (confirm against mongoRepository).
rep = mongoRep(settingsData["connectionString"], "history")
# Incremental parse of the dump: only 'end' events for <row> elements are
# produced, so each row is handed over once it has been read completely.
context = etree.iterparse(filePath, events=('end',), tag='row')
# Documents are collected in `buffer`; bufferLength is presumably the batch
# size at which the buffer gets flushed, and `i` a row counter -- the code
# consuming them is not part of this excerpt.
buffer = []
bufferLength = 1000
i = 1
# Turn every <row> element into a typed document.
for event, elem in context:
    # Disabled debugging aid: dump the raw attributes, print the tags
    # extracted from "Tags", and stop after ten rows.
    # print (elem.attrib)
    # if("Tags" in elem.attrib):
    #     print([r.fixed[0] for r in findall(tagsSearchPattern, str(elem.attrib["Tags"]))])
    # if (i == 10):
    #     break
    # i = i + 1
    attrs = elem.attrib
    doc = {
        "_id": int(attrs["Id"]),
        "history": int(attrs["PostHistoryTypeId"]),
        "post": int(attrs["PostId"]),
        "rev": str(attrs["RevisionGUID"]),
    }
Пример #5
0
from lxml import etree
from datetime import datetime
import settings
from mongoRepository import mongoRep

import argparse

# Command-line interface: the XML dump location is the script's only input.
parser = argparse.ArgumentParser()
# --path is mandatory. Left optional, a missing flag yields filePath = None
# and the failure only surfaces later inside etree.iterparse; argparse now
# reports the missing argument up front with a usage message.
parser.add_argument('--path', required=True,
                    help='absolute path for links.xml')
args = parser.parse_args()

filePath = args.path

settingsData = settings.get()

# Repository built from the configured connection string; "links" is
# presumably the target collection name (confirm against mongoRepository).
rep = mongoRep(settingsData["connectionString"], "links")

# Documents are collected in `buffer`; bufferLength is presumably the batch
# size at which the buffer gets flushed, and `i` a row counter -- the code
# consuming them is not part of this excerpt.
buffer = []
bufferLength = 1000
i = 1
# Incremental parse of the dump: only 'end' events for <row> elements are
# produced, so each row is handed over once it has been read completely.
context = etree.iterparse(filePath, events=('end', ), tag='row')

# Turn every <row> element into a typed document and queue it in `buffer`.
for event, elem in context:
    attrs = elem.attrib
    doc = {
        "_id": int(attrs["Id"]),
        "post": int(attrs["PostId"]),
        "rel": int(attrs["RelatedPostId"]),
        # Timestamps are stored with the fractional seconds dropped.
        "date": datetime.strptime(
            str(attrs["CreationDate"]),
            "%Y-%m-%dT%H:%M:%S.%f",
        ).replace(microsecond=0),
        "link": int(attrs["LinkTypeId"]),
    }
    buffer.append(doc)
Пример #6
0
from lxml import etree
from datetime import datetime
import settings
from mongoRepository import mongoRep

import argparse
from parse import findall
# Command-line interface: the XML dump location is the script's only input.
parser = argparse.ArgumentParser()
# --path is mandatory. Left optional, a missing flag yields filePath = None
# and the failure only surfaces later inside etree.iterparse; argparse now
# reports the missing argument up front with a usage message.
parser.add_argument('--path', required=True,
                    help='absolute path for Posts.xml')
args = parser.parse_args()

filePath = args.path

settingsData = settings.get()
# Pattern for parse.findall that captures the text between '<' and '>';
# in the visible code it is referenced only by the disabled debugging
# snippet inside the row loop.
tagsSearchPattern = "<{}>"
# Repository built from the configured connection string; "posts" is
# presumably the target collection name (confirm against mongoRepository).
rep = mongoRep(settingsData["connectionString"], "posts")
# Incremental parse of the dump: only 'end' events for <row> elements are
# produced, so each row is handed over once it has been read completely.
context = etree.iterparse(filePath, events=('end', ), tag='row')
# Documents are collected in `buffer`; bufferLength is presumably the batch
# size at which the buffer gets flushed, and `i` a row counter -- the code
# consuming them is not part of this excerpt.
buffer = []
bufferLength = 1000
i = 1
# Build one document per <row> element yielded by the incremental parser.
# NOTE(review): the loop body may continue beyond this excerpt.
for event, elem in context:
    # Disabled debugging aid: dump the raw attributes, print the tags
    # extracted from "Tags", and stop after five rows.
    # print (elem.attrib)
    # if("Tags" in elem.attrib):
    #     print([r.fixed[0] for r in findall(tagsSearchPattern, str(elem.attrib["Tags"]))])
    # if (i == 5):
    #     break
    # i = i + 1
    # "_id" and "type" are read from every row and converted to int.
    doc = {"_id": int(elem.attrib["Id"])}
    doc["type"] = int(elem.attrib["PostTypeId"])
    # "accepted" is only set when the row carries an AcceptedAnswerId.
    if ("AcceptedAnswerId" in elem.attrib):
        doc["accepted"] = int(elem.attrib["AcceptedAnswerId"])