def append(self, data):
    """Append a new node holding ``data`` at the tail of the list.

    Args:
        data: Payload handed to ``Node`` for the new tail node.

    The list is walked from ``self.head`` to the last node (the one whose
    ``next`` is ``None``) and the new node is linked after it.  When the
    list has no head at all the new node becomes the head instead of
    raising ``AttributeError`` on ``None.next``.
    """
    node = Node(data)
    if self.head is None:
        # Empty list: nothing to walk, the new node is the whole list.
        self.head = node
    else:
        tail = self.head
        while tail.next is not None:
            tail = tail.next
        tail.next = node
    self.__count += 1
def insert(self, pos, data):
    """Insert a new node holding ``data`` so that it sits at index ``pos``.

    Args:
        pos: Zero-based position of the new node; ``0`` replaces the head.
        data: Payload handed to ``Node`` for the new node.

    Raises:
        IndexError: If the list ends before ``pos`` links could be walked.
    """
    fresh = Node(data)
    cursor = self.head

    if pos == 0:
        # New head: point it at the old head.
        self.head = fresh
        fresh.next = cursor
        self.__count += 1
        return

    # Walk forward up to ``pos`` links, remembering the node we came from.
    walked = 0
    while walked < pos and cursor is not None:
        previous = cursor
        cursor = cursor.next
        walked += 1

    if walked != pos:
        raise IndexError("Overflow!")

    # Splice the new node between ``previous`` and ``cursor``.
    previous.next = fresh
    fresh.next = cursor
    self.__count += 1
def insertToTree(key_words_path):
    """Insert a path of category names into the tree hanging off ``root``.

    ``key_words_path[0]`` must equal ``root.value``.  Every following
    element is looked up among the children of the current node: the walk
    descends into the first child whose ``value`` matches, or creates a new
    category ``Node`` under the current node when no child matches.

    Each newly created node is recorded in the module-level ``leafObjects``
    dict (node -> category name) and its parent is dropped from that dict.

    Args:
        key_words_path: Sequence of category names, root first.

    Returns:
        1 when the path was processed, 0 when the path is empty or its
        first element is not the root's value.
    """
    if not key_words_path or key_words_path[0] != root.value:
        print("Root of the tree and the root node you passed are not same!!")
        return 0

    current = root
    for keyword in key_words_path[1:]:
        # Look for an existing child carrying this category name.
        # The previous implementation counted misses and compared the count
        # with len(current.childList) *after* ``current`` had already been
        # moved to the matched child, so a successful match could still
        # create a duplicate node below it.
        matched = None
        for child in current.childList:
            if keyword == child.value:
                matched = child
                break

        if matched is not None:
            current = matched
            continue

        # No such child yet: create the category node under ``current``.
        new_node = Node(current,
                        keyword,
                        iscategory=1,
                        wikiId=None,
                        inLinks=None,
                        outLinks=None,
                        ispage=None)
        if new_node in leafObjects:
            # Kept from the original: an already-registered node leaves
            # ``current`` untouched and moves on to the next path element.
            continue

        leafObjects[new_node] = keyword
        if current in leafObjects:
            # The parent now has a child, so it is no longer tracked here.
            del leafObjects[current]
        current = new_node

    return 1
def ProcessArticleCategoryLinking(file_name, replace_punctuation):
    """Read article records from ``file_name`` and attach them to categories.

    The file is opened as UTF-8 and consumed in groups of five lines, after
    which one more line is read and discarded:

        0. wiki id            -- must parse as ``int``
        1. article title      -- encoded, lower-cased, ``translate``d
        2. in-link count      -- must parse as ``int``
        3. categories         -- ``|``-separated, normalised like the title
        4. (ignored)          -- ``outLinks`` is always set to ``""``

    When a record parsed cleanly and has more than 5 in-links, a page
    ``Node`` is created under every node listed in
    ``leafsWithValue[category]`` for each of its categories.  Categories
    missing from ``leafsWithValue`` increment ``pagesCouldntAssign``.

    A line that fails integer parsing prints a message with the running
    line counter, sets the global ``breaking`` flag and abandons the
    record.  A record cut short by end-of-file is now flagged the same way;
    previously it could be linked using the categories of the record
    before it.

    Args:
        file_name: Path of the article file.
        replace_punctuation: Table passed to ``translate`` for the title
            and for every category name.
    """
    global root
    global leafsWithValue
    global pagesCouldntAssign
    global linenumberOfArticles
    global breaking

    with codecs.open(file_name, 'r', 'utf-8') as inf:
        line = inf.readline()
        linenumberOfArticles = linenumberOfArticles + 1
        while line != "":
            for field in range(5):
                if line != "":
                    line = line.strip('\n')
                    if field == 0:
                        try:
                            wikiId = int(line)
                            breaking = 0
                        except ValueError:
                            print("Probelm with this line : " +
                                  str(linenumberOfArticles))
                            breaking = 1
                            break
                    elif field == 1:
                        value = line.encode('utf-8').lower().translate(
                            replace_punctuation)
                        breaking = 0
                    elif field == 2:
                        try:
                            # Reset first so a failed parse leaves 0 behind.
                            inLinks = 0
                            inLinks = int(line)
                            breaking = 0
                        except ValueError:
                            print("Probelm with this line : " +
                                  str(linenumberOfArticles))
                            breaking = 1
                            break
                    elif field == 3:
                        raw_categories = line.encode(
                            'utf-8').lower().strip().split("|")
                        categoriesList = [
                            name.translate(replace_punctuation)
                            for name in raw_categories
                        ]
                        breaking = 0
                    elif field == 4:
                        # The out-links line is consumed but not parsed.
                        outLinks = ""
                        breaking = 0
                else:
                    # End of file in the middle of a record: mark the record
                    # as unusable so stale fields are never linked.
                    print("Some problem in lines processing")
                    breaking = 1
                line = inf.readline()
                linenumberOfArticles = linenumberOfArticles + 1

            # Attach the page to every node registered for its categories.
            if breaking == 0 and inLinks > 5:
                for category in categoriesList:
                    if category in leafsWithValue:
                        # Snapshot: the original iterated over a length
                        # captured before any Node was created.
                        for parent in tuple(leafsWithValue[category]):
                            Node(parent, value, None, wikiId, inLinks,
                                 outLinks, 1)
                    else:
                        pagesCouldntAssign = pagesCouldntAssign + 1

            inLinks = 0
            # One extra line is read after every record before the next one.
            line = inf.readline()
            linenumberOfArticles = linenumberOfArticles + 1
            breaking = 1
# Startup banner.
print(
    "This code is running on the correct version of python it supports.. 2.7"
)

# The root category is known up front, so it is created once here and used
# by every file-processing call.
root = Node(None,
            "applied sciences",
            iscategory=1,
            wikiId=None,
            inLinks=None,
            outLinks=None,
            ispage=None)

# Lookup tables filled while the tree is built.
leafsWithValue = {}
leafObjects = {}
Level1 = {}

# Work list.
stack = []

# Counters.
writingFailed = 0
pagesCouldntAssign = 0

# Tunables.
# Eight hard-coded levels to save for now; this could be automated.
levelVal = [2, 5, 7, 10, 13, 15, 18, 20]
# Be very careful with this parameter; think before changing it.
minLengthOfCategories = 0
size = 55000
def ProcessArticleCategoryLinking(file_name, replace_punctuation):
    """Read article records from ``file_name`` and attach them to categories.

    The file is opened as UTF-8 and consumed in groups of five lines, after
    which one more line is read and discarded:

        0. wiki id        -- ``int``
        1. article title  -- ``unidecode``d, lower-cased, ``translate``d
        2. in-link count  -- ``int``
        3. categories     -- ``|``-separated, normalised like the title
        4. out links      -- ``|``-separated; only their count is kept

    A complete record with more than 5 in-links gets one page ``Node``
    under every node listed in ``leafsWithValue[category]`` for each of its
    categories.  Categories missing from ``leafsWithValue`` increment
    ``pagesCouldntAssign``.

    A record cut short by end-of-file is reported and skipped.  Previously
    it was linked with fields left over from the preceding record, or
    raised ``UnboundLocalError`` when it was the first record.

    Args:
        file_name: Path of the article file.
        replace_punctuation: Table passed to ``translate`` for the title
            and for every category name.

    Raises:
        ValueError: If the wiki id or in-link line is not an integer.
    """
    global root
    global leafsWithValue
    global pagesCouldntAssign

    with codecs.open(file_name, 'r', 'utf-8') as inf:
        line = inf.readline()
        while line != "":
            complete = True
            for field in range(5):
                if line != "":
                    line = line.strip('\n')
                    if field == 0:
                        wikiId = int(line)
                    elif field == 1:
                        value = unidecode(line).lower().translate(
                            replace_punctuation)
                    elif field == 2:
                        inLinks = int(line)
                    elif field == 3:
                        raw_categories = unidecode(
                            line).lower().strip().split("|")
                        categoriesList = [
                            name.translate(replace_punctuation)
                            for name in raw_categories
                        ]
                    elif field == 4:
                        outLinksList = unidecode(
                            line).lower().strip().split("|")
                        outLinks = len(outLinksList)
                else:
                    # End of file in the middle of a record.
                    print("Some problem in lines processing")
                    complete = False
                line = inf.readline()

            # Attach the page to every node registered for its categories.
            if complete and inLinks > 5:
                for category in categoriesList:
                    if category in leafsWithValue:
                        # Snapshot: the original iterated over a length
                        # captured before any Node was created.
                        for parent in tuple(leafsWithValue[category]):
                            Node(parent, value, None, wikiId, inLinks,
                                 outLinks, 1)
                    else:
                        pagesCouldntAssign = pagesCouldntAssign + 1

            # One extra line is read after every record before the next one.
            line = inf.readline()
def NodeCreation():
    """Build a small fixed sample tree and return its root node.

    Nodes are created in a fixed order, each one receiving its parent as
    the first ``Node`` argument (value <- parent value):

        1 (root)
        2 <- 1,  3 <- 1
        6 <- 2,  7 <- 2,  8 <- 2
        18 <- 6, 19 <- 6, 20 <- 6   (pages)
        13 <- 7, 14 <- 8, 10 <- 3
        15 <- 10                    (page)

    Returns:
        The root ``Node`` (value ``1``).
    """
    category = dict(iscategory=1,
                    wikiId=None,
                    inLinks=None,
                    outLinks=None,
                    ispage=None)
    page = dict(iscategory=None, wikiId=10, inLinks=10, outLinks=10, ispage=1)

    # (node value, parent value, keyword arguments), in creation order.
    layout = (
        (2, 1, category),
        (3, 1, category),
        (6, 2, category),
        (7, 2, category),
        (8, 2, category),
        (18, 6, page),
        (19, 6, page),
        (20, 6, page),
        (13, 7, category),
        (14, 8, category),
        (10, 3, category),
        (15, 10, page),
    )

    root = Node(None, 1, **category)
    by_value = {1: root}
    for node_value, parent_value, flags in layout:
        by_value[node_value] = Node(by_value[parent_value], node_value,
                                    **flags)
    return root
def __init__(self, data):
    """Start the list with a single head node built from ``data``.

    Args:
        data: Payload handed to ``Node`` for the head node.
    """
    self.head = Node(data)
# Per-level storage and the level numbers of interest.
levelVal = [5, 9, 11, 14, 17, 20, 22, 25]
Level1 = {}
inLinksLimit = 0

# The root category is known up front (RootNodeValue), so it is created once
# here and used by every file-processing call.
root = Node(None,
            RootNodeValue,
            iscategory=1,
            wikiId=None,
            inLinks=None,
            outLinks=None,
            ispage=None)

# Lookup tables filled while the tree is built.
leafsWithValue = {}
leafObjects = {}
missingCategories = {}
Level2 = {}
Level3 = {}

# Work list.
stack = []

# Counters and scratch values.
pagesCouldntAssign = 0
SplitFileArticleName = ""

# Tunables.
# Be very careful with this parameter; think before changing it.
minLengthOfCategories = 0
size = 55000