Exemplo n.º 1
0
    def append(self, data):
        """Append a new node holding ``data`` to the end of the list.

        Walks from ``self.head`` to the last node and links the new node
        after it.  When the list is empty (``self.head`` is ``None``) the new
        node becomes the head instead of failing on ``None.next``.

        Args:
            data: Value handed to ``Node`` to build the new element.
        """
        node = Node(data)

        if self.head is None:
            # Nothing to walk: the new node is the first element.
            self.head = node
        else:
            temp = self.head
            while temp.next is not None:
                temp = temp.next
            temp.next = node
        self.__count += 1
Exemplo n.º 2
0
    def insert(self, pos, data):
        """Insert a new node holding ``data`` so that it ends up at index ``pos``.

        ``pos == 0`` replaces the head; ``pos`` equal to the current number of
        nodes appends after the last node.

        Args:
            pos: Zero-based index the new node should occupy.
            data: Value handed to ``Node`` to build the new element.

        Raises:
            IndexError: If ``pos`` cannot be reached by walking ``pos`` links
                from the head (negative, or past the end of the list).
        """
        node = Node(data)

        if pos == 0:
            node.next = self.head
            self.head = node
        else:
            # Walk forward until ``temp`` is the node currently at ``pos``
            # (or None when inserting right after the last node); ``prev``
            # trails one step behind.
            prev = None
            temp = self.head
            i = 0
            while i < pos and temp is not None:
                i += 1
                prev = temp
                temp = temp.next

            if i != pos:
                # Ran off the end (or pos was negative) before reaching pos.
                raise IndexError("Overflow!")
            prev.next = node
            node.next = temp
        self.__count += 1
def _addCategoryChild(parent, value):
    """Create a category node for ``value`` under ``parent`` and track it as a leaf.

    Returns the node the caller should continue from: the new node, or
    ``parent`` when an equal node is already a key of ``leafObjects``.
    """
    # The new node is not appended to parent.childList here; presumably the
    # Node constructor does that itself -- TODO confirm against Node.
    newNodeCreated = Node(parent,
                          value,
                          iscategory=1,
                          wikiId=None,
                          inLinks=None,
                          outLinks=None,
                          ispage=None)

    if newNodeCreated in leafObjects:
        # NOTE(review): the original code skipped to the next path element
        # without descending in this case; that behaviour is preserved.
        return parent

    leafObjects[newNodeCreated] = value
    # The parent now has a child, so it is no longer a leaf.
    leafObjects.pop(parent, None)
    return newNodeCreated


def insertToTree(key_words_path):
    """Insert a root-to-leaf category path into the global tree.

    Starting at the global ``root``, each following element of
    ``key_words_path`` is looked up among the children of the current node.
    An existing child is reused; a missing one is created as a category node
    and recorded in the global ``leafObjects``.

    Args:
        key_words_path: Sequence of category values whose first element must
            equal ``root.value``.

    Returns:
        1 when the path was processed, 0 when the path is empty or does not
        start at the root (a message is printed in that case).
    """
    if not key_words_path or key_words_path[0] != root.value:
        print("Root of the tree and the root node you passed are not same!!")
        return 0

    current = root
    for i in range(1, len(key_words_path)):
        for child in current.childList:
            if key_words_path[i] == child.value:
                current = child
                break
        else:
            # No child carries this value (this also covers an empty child
            # list).  The for/else decides "missed" directly, instead of
            # comparing a miss counter against the child list of the node we
            # may just have descended into, which created duplicate nodes.
            current = _addCategoryChild(current, key_words_path[i])

    return 1
def ProcessArticleCategoryLinking(file_name, replace_punctuation):
    """Read article records from ``file_name`` and attach them to the tree.

    The file is opened as UTF-8 and consumed in records of five lines:

        0. wiki id            -> ``int``
        1. article title      -> UTF-8 encoded, lower-cased, then translated
                                 with ``replace_punctuation``
        2. in-link count      -> ``int``
        3. categories         -> "|"-separated; each entry translated with
                                 ``replace_punctuation``
        4. (content ignored)  -> ``outLinks`` is set to ""

    After the five lines one more line is read and discarded before the next
    record starts.

    When the wiki id or the in-link count cannot be parsed, a message with the
    running line counter is printed, the global ``breaking`` is set to 1 and
    the rest of the record is abandoned.

    For a record that parsed (``breaking == 0``) with more than 5 in-links,
    every category found in the global ``leafsWithValue`` gets one page
    ``Node`` per parent node stored under that category; categories that are
    not found increment the global ``pagesCouldntAssign``.

    Args:
        file_name: Path of the article file to read.
        replace_punctuation: Table passed to ``translate`` on the title and
            on each category.

    Side effects:
        Updates the globals ``linenumberOfArticles``, ``breaking`` and
        ``pagesCouldntAssign``; constructs ``Node`` objects; prints to stdout.

    NOTE(review): ``dict.has_key`` only exists on Python 2, which matches the
    message printed at the end of this function.
    """
    global root
    global leafsWithValue
    global pagesCouldntAssign
    global linenumberOfArticles
    global breaking
    with codecs.open(file_name, 'r', 'utf-8') as inf:
        line = ""
        line = inf.readline()
        linenumberOfArticles = linenumberOfArticles + 1
        while line != "":
            # One pass of this loop consumes the five lines of a record; ``i``
            # selects which field the current line holds.
            for i in range(5):
                if line != "":
                    # Counts of occurrences are not needed for the Wiki data:
                    # each subcategory or page represents exactly one case.
                    line = line.strip('\n')
                    if i == 0:
                        try:
                            wikiId = int(line)
                            breaking = 0
                        except ValueError:
                            # Not a wiki id: report the line counter and give
                            # up on this record.
                            print("Probelm with this line : " +
                                  str(linenumberOfArticles))
                            breaking = 1
                            break

                    elif i == 1:
                        # Normalised article title.
                        value = line.encode('utf-8').lower().translate(
                            replace_punctuation)
                        breaking = 0

                    elif i == 2:
                        try:
                            inLinks = 0
                            inLinks = int(line)
                            breaking = 0

                        except ValueError:
                            # Not an in-link count: report the line counter
                            # and give up on this record.
                            print("Probelm with this line : " +
                                  str(linenumberOfArticles))
                            breaking = 1
                            break

                    elif i == 3:
                        # Normalised category names of the article.
                        categoriesList = line.encode(
                            'utf-8').lower().strip().split("|")
                        # NOTE(review): this inner loop reuses the name ``i``;
                        # the outer ``for`` still advances from its own
                        # iterator, so the record position is not affected.
                        for i in range(len(categoriesList)):
                            categoriesList[i] = categoriesList[i].translate(
                                replace_punctuation)
                        breaking = 0

                    elif i == 4:
                        # The out-links line is read but its content is not
                        # used; a placeholder is stored instead.
                        outLinks = ""
                        breaking = 0

                    elif i == 5:
                        # Never taken: range(5) stops at 4.
                        pass

                    else:
                        print("got some wrong number in case selection")
                else:
                    # The file ended in the middle of a record.
                    print("Some problem in lines processing")
                line = inf.readline()
                linenumberOfArticles = linenumberOfArticles + 1
            # Create the page node under every parent node registered for each
            # of the article's categories in leafsWithValue.

            if breaking == 0 and inLinks > 5:
                for category in categoriesList:

                    if leafsWithValue.has_key(category):
                        NodeObj = None
                        NodeObj = leafsWithValue[category]
                        for i in range(len(NodeObj)):
                            # The created node is not kept here; presumably
                            # the Node constructor links it to its parent --
                            # TODO confirm against Node.
                            Node(NodeObj[i], value, None, wikiId, inLinks,
                                 outLinks, 1)
                    else:
                        pagesCouldntAssign = pagesCouldntAssign + 1

            # Reset for the next record: the in-link count is cleared and the
            # failure flag is re-armed until the next record parses.
            inLinks = 0
            line = inf.readline()
            linenumberOfArticles = linenumberOfArticles + 1
            breaking = 1
    print(
        "This code is running on the correct version of python it supports.. 2.7"
    )

#importlib.reload(sys)
#import sys
#sys.setdefaultencoding('utf8')

# The tree root is known up front, so it is built once here and shared by
# every per-file processing call.
root = Node(None, "applied sciences", iscategory=1, wikiId=None,
            inLinks=None, outLinks=None, ispage=None)

# Shared module-level state.
leafObjects = {}  # node -> category value, filled by insertToTree
leafsWithValue = {}  # category value -> parent nodes, read when linking articles
pagesCouldntAssign = 0  # article categories that had no entry in leafsWithValue
stack = []
writingFailed = 0
#levels = [1,2] # the level value and the leaf value will be saved
#fo = open("results.txt", "rw+")
Level1 = {}
# Eight hard-coded levels to save for now; this could be automated.
levelVal = [2, 5, 7, 10, 13, 15, 18, 20]
# Be very careful with this parameter; think before changing it.
minLengthOfCategories = 0
size = 55000
Exemplo n.º 6
0
def ProcessArticleCategoryLinking(file_name, replace_punctuation):
    """Read article records from ``file_name`` and attach them to the tree.

    The file is opened as UTF-8 and consumed in records of five lines
    (wiki id, title, in-link count, "|"-separated categories,
    "|"-separated out-links).  After the five lines one more line is read and
    discarded before the next record starts.

    For a complete record with more than 5 in-links, every category found in
    the global ``leafsWithValue`` gets one page ``Node`` per parent node stored
    under that category; categories that are not found increment the global
    ``pagesCouldntAssign``.

    A record cut short by the end of the file is reported and skipped.  It is
    never linked using fields left over from the previous record.

    Args:
        file_name: Path of the article file to read.
        replace_punctuation: Table passed to ``translate`` on the title and
            on each category.

    Raises:
        ValueError: If a wiki id or in-link line is not an integer.
    """
    global pagesCouldntAssign
    with codecs.open(file_name, 'r', 'utf-8') as inf:
        line = inf.readline()
        while line != "":
            recordComplete = True
            # ``i`` selects which of the five record fields ``line`` holds.
            for i in range(5):
                if line != "":
                    line = line.strip('\n')
                    if i == 0:
                        wikiId = int(line)
                    elif i == 1:
                        value = unidecode(line).lower().translate(
                            replace_punctuation)
                    elif i == 2:
                        inLinks = int(line)
                    elif i == 3:
                        rawCategories = unidecode(line).lower().strip().split(
                            "|")
                        categoriesList = [
                            rawCategory.translate(replace_punctuation)
                            for rawCategory in rawCategories
                        ]
                    else:
                        # Only the number of out-links is kept.
                        outLinksList = unidecode(line).lower().strip().split(
                            "|")
                        outLinks = len(outLinksList)
                else:
                    # The file ended in the middle of a record.
                    recordComplete = False
                    print("Some problem in lines processing")
                line = inf.readline()

            # Create the page node under every parent node registered for
            # each of the article's categories.
            if recordComplete and inLinks > 5:
                for category in categoriesList:
                    if category in leafsWithValue:
                        for parentNode in leafsWithValue[category]:
                            # The created node is not kept here; presumably
                            # the Node constructor links it to its parent --
                            # TODO confirm against Node.
                            Node(parentNode, value, None, wikiId, inLinks,
                                 outLinks, 1)
                    else:
                        pagesCouldntAssign = pagesCouldntAssign + 1

            # Skip the line that follows each record.
            line = inf.readline()
def NodeCreation():
    """Build a small fixed sample tree and return its root node.

    Nodes are created in this order, each one receiving its parent as the
    first constructor argument::

        1
        |-- 2
        |   |-- 6
        |   |   |-- 18 (page)
        |   |   |-- 19 (page)
        |   |   `-- 20 (page)
        |   |-- 7
        |   |   `-- 13
        |   `-- 8
        |       `-- 14
        `-- 3
            `-- 10
                `-- 15 (page)

    Category nodes use ``iscategory=1`` with every other field ``None``;
    page nodes use ``ispage=1`` with ``wikiId``, ``inLinks`` and ``outLinks``
    all set to 10.
    """

    def makeCategory(parent, value):
        # Node signature: (parent, value, iscategory, wikiId, inLinks,
        # outLinks, ispage)
        return Node(parent, value, iscategory=1, wikiId=None, inLinks=None,
                    outLinks=None, ispage=None)

    def makePage(parent, value):
        return Node(parent, value, iscategory=None, wikiId=10, inLinks=10,
                    outLinks=10, ispage=1)

    top = makeCategory(None, 1)
    cat2 = makeCategory(top, 2)
    cat3 = makeCategory(top, 3)

    cat6 = makeCategory(cat2, 6)
    cat7 = makeCategory(cat2, 7)
    cat8 = makeCategory(cat2, 8)

    for pageValue in (18, 19, 20):
        makePage(cat6, pageValue)

    makeCategory(cat7, 13)
    makeCategory(cat8, 14)

    cat10 = makeCategory(cat3, 10)
    makePage(cat10, 15)

    return top
Exemplo n.º 8
0
 def __init__(self, data):
     """Start the list with a single node built from ``data`` as its head."""
     self.head = Node(data)
Level1 = {}
levelVal = [5, 9, 11, 14, 17, 20, 22, 25]
inLinksLimit = 0

# ---------------------------------------------------------------------------
# Tree root and shared module-level state
# ---------------------------------------------------------------------------

# The root value is known up front (RootNodeValue), so the root node is built
# once here and shared by every per-file processing call.  An earlier version
# hard-coded the value "applied sciences".
root = Node(None, RootNodeValue, iscategory=1, wikiId=None, inLinks=None,
            outLinks=None, ispage=None)

missingCategories = {}
#missingCategories[RootNodeValue] = [root]
leafObjects = {}
leafsWithValue = {}
pagesCouldntAssign = 0
stack = []
SplitFileArticleName = ""

# Be very careful with this parameter; think before changing it.
minLengthOfCategories = 0
size = 55000
Level2 = {}
Level3 = {}