def read_ids(idfilename):
    with gzip.open(idfilename) as idfile:
        for line in idfile:
            line = line.strip()
            regex = r"(\d+)\,\s+(.*)"
            identry = re.findall(regex, line)
            if not identry:  # skip lines that do not match the "<id>, <name>" pattern
                continue
            identry = identry[0]
            if len(identry) < 2:
                continue
            myid = identry[0]
            myentry = identry[1]
            id_to_name[myid] = myentry
            name_to_id[myentry] = myid
    p("Read all ids", "info")
    idfile.close()
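# Hedged usage sketch (not part of the original script): read_ids() fills the global
# id_to_name / name_to_id dictionaries from "<id>, <name>" lines in a gzipped file.
# The file name "id-mapper.txt.gz" is taken from the other scripts in this repo;
# the category name "science" below is only an illustrative example.
id_to_name = dict()
name_to_id = dict()
read_ids("id-mapper.txt.gz")
exampleid = name_to_id.get("science")  # illustrative lookup, may be None
p("Example: 'science' maps to id %s" % exampleid, "info")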
try:
    categoryinputfilename = sys.argv[1]
    hiddencategoryinputfilename = sys.argv[2]
except:
    print "\n[RUN]: \n"\
          "python Is_hidden_there.py \n"\
          "\t [Subcat-links.txt.gz]\n"\
          "\t [All_hidden_categories.txt.gz]\n"
    exit(0)

hidden_cat = dict()      #Dictionary to check all hidden category names
hiddencnt = 0            #Counter to see how many hidden categories are read
hiddencat_found = False  #Boolean to determine if a hidden category is found within the graph

# Reads all hidden categories from the file containing the names of all hidden categories
p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename) as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()
        if line in hidden_cat:
            hidden_cat[line] += 1
        else:
            hiddencnt += 1
            hidden_cat[line] = 1
p("All hidden categories read", "info")

# Reads the category graph
p("Read all category links", "info")
with gzip.open(categoryinputfilename, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
def find_grades(categoryinfofilename, categoriesoutputfilename):
    global grades
    global C_in
    global C_out
    global avg_in
    global avg_out
    starttime = time.time()
    begintime = starttime
    p("Reading all category info", "info")
    parent = child = ""
    graph = dict()     # Dictionary to keep track of the children of each parent cat
    subgraph = dict()  # Dictionary to keep track of the parents of each cat

    # Creating category graph
    with open(categoryinfofilename) as categorygraph:
        for line in categorygraph:
            line = line.strip()
            if line.startswith("*"):  #children
                child = name_to_id[line[2:]]
                if parent == "":
                    continue
                if parent in graph:
                    if child not in graph[parent]:
                        graph[parent].append(child)
                else:
                    graph[parent] = [child]
                if child in subgraph:
                    if parent not in subgraph[child]:
                        subgraph[child].append(parent)
                else:
                    subgraph[child] = [parent]
            else:
                line = line.replace("_", " ")
                parent = name_to_id[line]
                #idmapper.insert_name(parent)

    p("Finished reading all info [Time: %s sec]" % (time.time() - starttime), "info")
    maxparent = maxchildren = outlinks = inlinks = 0
    mparent = mchildren = ""
    C_in = len(subgraph)  # number of categories with at least one parent
    C_out = len(graph)    # number of categories with at least one child
    for category in graph:
        if len(graph[category]) > maxchildren:
            maxchildren = len(graph[category])
            mchildren = category
        outlinks += len(graph[category])
        grades[category] = [len(graph[category])]
        if category in subgraph:
            grades[category].append(len(subgraph[category]))
            inlinks += len(subgraph[category])
            if len(subgraph[category]) > maxparent:
                maxparent = len(subgraph[category])
                mparent = category
            """
            if (len(graph[category]) > 10) and (len(subgraph[category]) > 10):
                cnt10 += 1
            if (len(graph[category]) > 20) and (len(subgraph[category]) > 20):
                cnt20 += 1
            if (len(graph[category]) > 30) and (len(subgraph[category]) > 30):
                cnt30 += 1
            if (len(graph[category]) > 40) and (len(subgraph[category]) > 40):
                cnt40 += 1
            if (len(graph[category]) > 50) and (len(subgraph[category]) > 50):
                cnt50 += 1
            if (len(graph[category]) > 60) and (len(subgraph[category]) > 60):
                cnt60 += 1
            if (len(graph[category]) > 70) and (len(subgraph[category]) > 70):
                cnt70 += 1
            if (len(graph[category]) > 80) and (len(subgraph[category]) > 80):
                cnt80 += 1
            if (len(graph[category]) > 90) and (len(subgraph[category]) > 90):
                cnt90 += 1
            if (len(graph[category]) > 100) and (len(subgraph[category]) > 100):
                cnt100 += 1
            print "category: %s, number: %d\n" %(category, len(graph[category]))
            """
            #print "%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category]))
            #outputfile.write("%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category])))
            subgraph.pop(category, None)
        else:
            grades[category].append(0)
            #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category])))
    for category in subgraph:
        grades[category] = [0, len(subgraph[category])]
        inlinks += len(subgraph[category])
    avg_in = inlinks / C_in
    avg_out = outlinks / C_out
#print "%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category])) #outputfile.write("%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category]))) subgraph.pop(category, None) else: grades[category].append(0) #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category]))) for category in subgraph: grades[category] = [0, len(subgraph[category])] inlinks += len(subgraph[category]) avg_in = inlinks / C_in avb_out = outlinks / C_out #outputfile.write("Only parents: %s: %d\n" %(category, len(subgraph[category]))) #return create_grades() #outputfile.close() """ thresholds = [10, 20, 30, 40, 50] p("Number of categories with %d parent categories and subcategories: %d" %(10, cnt10), "info") p("Number of categories with %d parent categories and subcategories: %d" %(20, cnt20), "info") p("Number of categories with %d parent categories and subcategories: %d" %(30, cnt30), "info") p("Number of categories with %d parent categories and subcategories: %d" %(40, cnt40), "info") p("Number of categories with %d parent categories and subcategories: %d" %(50, cnt50), "info") p("Number of categories with %d parent categories and subcategories: %d" %(60, cnt60), "info") p("Number of categories with %d parent categories and subcategories: %d" %(70, cnt70), "info") p("Number of categories with %d parent categories and subcategories: %d" %(80, cnt80), "info") p("Number of categories with %d parent categories and subcategories: %d" %(90, cnt90), "info") p("Number of categories with %d parent categories and subcategories: %d" %(100, cnt100), "info") p("Maxparent: %d (%s)" %(maxparent, mparent), "info") p("Maxchildren: %d (%s)" %(maxchildren, mchildren), "info") #subcats = graph["2015"]
import gzip, json, yaml, io
from myprint import myprint as p

"""
Program for creating a file containing all dictionary entries and what page ids they are based on.
Sorts out all relevant dictionary entries based on the latest version of the igg-iabtaxonomy.
Why? Because the file containing all possible entries and their page ids is extremely large.

Input: A file containing all processed Wikipedia article pages and their corresponding ids
Output: A file containing the relevant dictionary entries and what page ids they are based on
"""

# Reading the English page id to page title mapping
enmappingfilename = "pageid-pagetitle-en.txt.gz"
enpagetitle_to_id = dict()
p("Reading %s" % (enmappingfilename), "info")
with gzip.open(enmappingfilename, "rb") as enmappingfile:
    for line in enmappingfile:
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        pageid = splittet[0]
        pagetitle = splittet[1]
        if pagetitle in enpagetitle_to_id:
            enpagetitle_to_id[pagetitle].append(pageid)
        else:
            enpagetitle_to_id[pagetitle] = [pageid]
enmappingfile.close()

latestversion = 5  # Latest version, update for later versions
"\t [category-info.txt]\n"\ "\t [article-info.txt.gz]\n"\ "\t [article-output.txt.gz] \n\n"\ "[FUNC:]\n"\ "Create the complete paths of the articles. \n" exit(0) startcategory = "Main topic classifications" #"Fundamental Categories" startcategory = startcategory.lower() parent = "" graph = dict() #Dictionary to keep track of the children to each parent cat subgraph = dict() #Dictionary to keep track of the parents to each subcategory starttime = time.time() begintime = starttime p("Reading all category info", "info") parent = child = "" with open(categoryinfofilename) as categorygraph: for line in categorygraph: line = line.strip() if line.startswith("*"): #children child_name = line[2:] idmapper.insert_name(child_name) child = idmapper.name_to_id(child_name) if parent == "": continue if parent in graph: # if child not in graph[parent]: graph[parent].append(child) else: graph[parent] = [child]
"python Hidden_categories.py \n"\ "\t [enwiki-latest-page_props.sql.gz]\n"\ "\t [enwiki-latest-page.sql.gz\n"\ "\t [All_hidden_categories.txt.gz]\n\n"\ "[FUNC:] Find all hidden categories in the categorylinks file and then in the page_props file and combine all of these to one big file. \n" exit(0) hiddencat_id = dict( ) #Dictionary to keep track of the ids of all hidden categories hidden_cat = dict( ) #Dictionary to keep track of the name of all hidden categories hiddencnt = 0 start_time = time.time() # Looping through page props to find the ids of all hidden categories p("Finding all hidden category ids from page props...", "info") with gzip.open(pagepropsinputfilename) as inputfile: for line in inputfile: if (line.startswith("INSERT")): line = line.split("VALUES (")[1] line.decode('utf-8', 'ignore') insertions = line.split("),(") for insertion in insertions: insertion = insertion.lower() words = insertion.split(",") if "hiddencat" in words[1].lower(): # Hidden category is found, id i added hiddencat_id[words[0]] = 1 hiddencnt += 1 # Counter to keep track of number of hidden categories mytime = time.time() - start_time p(
    titlefilename = sys.argv[1]
    outputtitlesfilename = sys.argv[2]
except:
    print "\n[RUN]: \n"\
          "python Redirecting.py \n"\
          "\t [enwiki-latest-redirects.sql.gz]\n"\
          "\t [output-redirect-titles.txt.gz]\n\n"\
          "[FUNC:]\n"\
          "Find all titles that redirect \n"
    exit(0)

redirects = dict()  #Dictionary for keeping all redirect pages
starttime = time.time()

# Reads the redirect file
p("Reading all redirect titles", "info")
with gzip.open(titlefilename) as titlefile:
    for line in titlefile:
        if line.startswith("INSERT"):
            line.decode('utf-8', 'ignore')
            line_split = line[30:]  #.split("VALUES (")[1]
            insertions = line_split.split("),(")
            for insertion in insertions:
                # Code for encoding
                try:
                    insertion = insertion.decode('unicode-escape')
                except SyntaxError:
                    insertion = insertion.decode('ascii')
                except Exception, e:
                    a = 0
""" Program for finding the pageid for an entry. Needs to do the same process as the mapper so that the entries are identical. """ categoryinputfilename = "enwiki-latest-categorylinks.sql.gz" redirectinputfilename = "output-redirect-titles.txt.gz" pagefilename = "enwiki-latest-page.sql.gz" redirects = dict() pageid_to_title = dict() pagetitle_to_id = dict() """ p("Reading %s" %(inputfilename), "info") with gzip.open(inputfilename, "rb") as inputfile: for line in inputfile: line = line.strip() line = line.lower() splittet = line.split("\t") if len(splittet) < 2: #print "< 2: " + line continue nopages[splittet[0]] = splittet[1] inputfile.close() p("Finished reading %s" %(inputfilename), "info") """ p("Reading %s" %(redirectinputfilename), "info")
"python Remove_hiddencats.py \n"\ "\t [Sub-categories-new.txt.gz]\n"\ "\t [All_hidden_categories.txt.gz]\n"\ "\t [Subcat-links.txt.gz\n\n"\ "[FUNC:] Split the categorylink-file to the categories concerning pages and those concerning sub categories. Skip all hidden categories to remove number of relevant category links.\n" exit(0) allcategories = dict() #Dictionary to keep track of all categories hidden_cat = dict() #Dictionary to keep track of all hidden categories links = dict() #Dictionary to keep track of all links in the graph artskip = catskip = hiddencnt = 0 starttime = time.time() # Reads all the hidden categories from the file p("Reading all hidden categories", "info") with gzip.open(hiddencategoryinputfilename, "rb") as hiddencategories: for line in hiddencategories: line = line.strip().lower() # Code for representing the category names in same encoding try: line = line.decode('unicode-escape') except SyntaxError: line = line.decode('ascii') except Exception,e: a = 0 try: line = unidecode(line) except UnicodeEncodeError, e:
import gzip, re
from myprint import myprint as p
from unidecode import unidecode

"""
Program for finding the pageid for an entry.
Needs to do the same process as the mapper so that the entries are identical.
"""

categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

redirects = dict()
pageid_to_title = dict()
pagetitle_to_id = dict()

"""
p("Reading %s" %(inputfilename), "info")
with gzip.open(inputfilename, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2:
            #print "< 2: " + line
            continue
        nopages[splittet[0]] = splittet[1]
inputfile.close()
p("Finished reading %s" %(inputfilename), "info")
"""

p("Reading %s" % (redirectinputfilename), "info")
print "\n[RUN]: \n"\ "python Hidden_categories.py \n"\ "\t [enwiki-latest-page_props.sql.gz]\n"\ "\t [enwiki-latest-page.sql.gz\n"\ "\t [All_hidden_categories.txt.gz]\n\n"\ "[FUNC:] Find all hidden categories in the categorylinks file and then in the page_props file and combine all of these to one big file. \n" exit(0) hiddencat_id = dict() #Dictionary to keep track of the ids of all hidden categories hidden_cat = dict() #Dictionary to keep track of the name of all hidden categories hiddencnt = 0 start_time = time.time() # Looping through page props to find the ids of all hidden categories p("Finding all hidden category ids from page props...", "info") with gzip.open(pagepropsinputfilename) as inputfile: for line in inputfile: if (line.startswith("INSERT")): line = line.split("VALUES (")[1] line.decode('utf-8', 'ignore') insertions = line.split("),(") for insertion in insertions: insertion = insertion.lower() words = insertion.split(",") if "hiddencat" in words[1].lower(): # Hidden category is found, id i added hiddencat_id[words[0]] = 1 hiddencnt += 1 # Counter to keep track of number of hidden categories mytime = time.time() - start_time p("Found %d hidden category ids (%s min, %s min) ---" %(hiddencnt, mytime/60, mytime%60), "info")
class Path(object):
    def __init__(self, score, path):
        self.score = score
        self.path = path
        return

    def __cmp__(self, other):
        return cmp(other.score, self.score)


letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", \
           "t", "u", "v", "w", "x", "y", "z", "restfile"]

gradesfilename = "category-grade.txt.gz"
idfilename = "id-mapper.txt.gz"

p("Reading all grades", "info")
allgrades = dict()
grades = dict()
with gzip.open(gradesfilename, "rb") as gradefile:
    for line in gradefile:
        line = line.strip()
        splittet = line.split("\t")
        category = splittet[0]
        grade = float(splittet[1])
        grades[category] = grade

id_to_name = dict()
p("Reading all ids for all categories", "info")
with gzip.open(idfilename, "rb") as idfile:
    for line in idfile:
        line = line.strip()
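# Hedged usage sketch (not from the original script): because __cmp__ reverses the
# comparison, a heap or a sort over Path objects yields the highest-scoring path first.
# The scores and category paths below are made-up illustration values.
import heapq

queue = []
heapq.heappush(queue, Path(0.3, ["science", "physics"]))
heapq.heappush(queue, Path(0.9, ["sports", "football"]))
heapq.heappush(queue, Path(0.5, ["arts", "music"]))
best = heapq.heappop(queue)  # the Path with score 0.9 comes out first
print "Best path: %s (score %.1f)" % (best.path, best.score)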
    # Method for printing the entry, only for debugging
    def entryprint(self):
        print "Old: %s, new: %s, tierones: %s\n" % (self.oldentry, self.newentry, self.tierone)

    # Method for comparing this object to another
    def myequal(self, obj):
        for category in obj.tierone:
            if category not in self.tierone:
                return False
        return True


version = 4  # Version of the dictionary to be cleaned

# Reading the dictionary
inputfilename = "igg-dictionary-" + str(version) + ".json"
p("Reading json file", "info")
with open(inputfilename, "rb") as inputfile:
    iggdictionary = yaml.load(inputfile)  # Loading the dictionary from file
p("finished reading python json", "info")

iggiabtaxonomy = "igg-iabtaxonomy" + str(version)
iggiabtaxonomynew = "igg-iabtaxonomy" + str(version + 1)
dictionary = iggdictionary[iggiabtaxonomy]

newdictionary = dict()          # Dictionary for storing the final results
paraentries = dict()            # Dictionary for keeping track of the changes made to the entries
disambiguation = dict()         # Dictionary for all disambiguation titles
disambiguationentries = dict()  # Dictionary for keeping track of all disambiguation entries

yearregex = r"(\d\d\d\d)"        # Regex for recognizing years in the title
parenthesisregex = r"(\(.*\))"   # Regex for recognizing parentheses in the title
import gzip, re
from myprint import myprint as p
from unidecode import unidecode

inputfilename = "no-mapping.txt.gz"
categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

nopages = dict()
redirects = dict()
pages = dict()

p("Reading %s" % (inputfilename), "info")
with gzip.open(inputfilename, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        nopages[splittet[0]] = splittet[1]
inputfile.close()
p("Finished reading %s" % (inputfilename), "info")

p("Reading %s" % (redirectinputfilename), "info")
with gzip.open(redirectinputfilename, "rb") as redirectfile:
    for line in redirectfile:
        #line = unicode(line, "utf-8")
from myprint import myprint as p

"""
Program for mapping all keywords to IAB categories.
"""

inputfilename = "articlemapping-all.txt.gz"
outputcategoriesfilename = "Outputcategories"
letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
           "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "restfile"]

outputcategories = dict()
tiertwo = ""
tierone = ""
p("Reading output categories", "info")
with open(outputcategoriesfilename, "r") as outputcatfile:
    for line in outputcatfile:
        line = line.strip()
        if line.startswith("*"):  # tier-one category
            tierone = line[1:]
            tierone = tierone.lower()
        else:                     # tier-two category, mapped to the current tier-one
            tiertwo = line
            tiertwo = tiertwo.lower()
            outputcategories[tiertwo] = tierone

idfilename = "id-mapper.txt.gz"
id_to_name = dict()
name_to_id = dict()
p("Reading id-mapper", "info")
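# Hedged illustration (the real "Outputcategories" file is not shown here): the loop
# above reads tier-one lines prefixed with "*" followed by their tier-two lines and
# builds a tier-two -> tier-one mapping. With made-up input such as
#
#   *Sports
#   Football
#   Tennis
#   *Technology
#   Smartphones
#
# outputcategories would end up as
#   {"football": "sports", "tennis": "sports", "smartphones": "technology"}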
def is_number(input):
    try:
        int(input)
        return True
    except:
        return False


start_time = time.time()
categorycnt = pagecnt = lines = 0
hidden_cat = dict()
artskip = catskip = hiddencnt = 0

p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename) as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()
        if line in hidden_cat:
            hidden_cat[line] += 1
        else:
            hidden_cat[line] = 1
p("All hidden categories read", "info")

redirects = dict()
p("Reading all redirects", "info")
with gzip.open(redirectinputfilename, "rb") as redirectfile:
    for line in redirectfile:
        line.decode('utf-8', 'ignore')
        line = line.lower()
"python Remove_hiddencats.py \n"\ "\t [Sub-categories-new.txt.gz]\n"\ "\t [All_hidden_categories.txt.gz]\n"\ "\t [Subcat-links.txt.gz\n\n"\ "[FUNC:] Split the categorylink-file to the categories concerning pages and those concerning sub categories. Skip all hidden categories to remove number of relevant category links.\n" exit(0) allcategories = dict() #Dictionary to keep track of all categories hidden_cat = dict() #Dictionary to keep track of all hidden categories links = dict() #Dictionary to keep track of all links in the graph artskip = catskip = hiddencnt = 0 starttime = time.time() # Reads all the hidden categories from the file p("Reading all hidden categories", "info") with gzip.open(hiddencategoryinputfilename, "rb") as hiddencategories: for line in hiddencategories: line = line.strip().lower() # Code for representing the category names in same encoding try: line = line.decode('unicode-escape') except SyntaxError: line = line.decode('ascii') except Exception, e: a = 0 try: line = unidecode(line) except UnicodeEncodeError, e:
"economics", "education", "environment", "form", "geography", "government", "health", "history", "humanities", "humans", "industry", "information", "intellectual works", "knowledge", "language", "law", "leisure", "life", "mathematics", "matter", "medicine", "mind", "nature", "people", "politics", "professional studies", "science", "scientific disciplines", "society", "sports", "structure", "systems", "technology", "thought", "tools", "transport", "universe", "world" ] startcategory = startcategory.lower() parent = "" graph = dict() #Dictionary to keep track of the children to each parent cat subgraph = dict() #Dictionary to keep track of the parents to each subcategory starttime = time.time() begintime = starttime p("Reading all category info", "info") parent = child = "" with open(categoryinfofilename) as categorygraph: for line in categorygraph: line = line.strip() if line.startswith("*"): #children child = line[2:] idmapper.insert_name(child) if parent == "": continue if parent in graph: # if child not in graph[parent]: graph[parent].append(child) else: graph[parent] = [child] else:
"\t [Page-categories.txt.gz]\n"\ "\t [article-info.txt.gz]\n\n"\ "[FUNCTION]: \n"\ "Store all articles with their immidiate subcategories\n" exit(0) reload(sys) sys.setdefaultencoding('utf-8') articles = dict( ) #Dictionary to keep track of all categories and their articles artcnt = teller = articlecnt = 0 starttime = time.time() # Reads the file file containg links between categories and articles p("Reading all article content...", "info") with gzip.open(articleinputfilename) as articleinfo: for line in articleinfo: line = line.strip() lines = line.split("\t") if len(lines) < 2: continue category = lines[0].lower() page = lines[1].lower() if "" == page or " " == page: continue if page in articles: # page is already in the dictionary if category not in articles[page]: # Add the category if not present
# -*- coding: utf-8 -*-
import gzip, json, yaml, io
from myprint import myprint as p

"""
Program for creating a dictionary for a dictionary-based classifier for another language
(based on the English dictionary)
"""

nomappingfilename = "no-mapping.txt.gz"
nomapping = dict()
p("Reading %s" % (nomappingfilename), "info")
with gzip.open(nomappingfilename, "rb") as nomappingfile:
    for line in nomappingfile:
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        nomapping[splittet[0]] = splittet[1]
nomappingfile.close()

entrytoidfilename = "en-entry-to-pageid.json"
p("Reading %s" % (entrytoidfilename), "info")
with open(entrytoidfilename, "rb") as inputfile:
    entrypageid = yaml.load(inputfile)
p("finished reading python json", "info")

norwegianstopwords = []
stopwordfile = "norwegian_stop_words.txt"
with open(stopwordfile, "r") as inputfile:
print "Old: %s, new: %s, tierones: %s\n" % ( self.oldentry, self.newentry, self.tierone) # Method for comparing this object to another def myequal(self, obj): for category in obj.tierone: if category not in self.tierone: return False return True version = 4 # Version of the dictionary to be cleaned # Reading the dictionary inputfilename = "igg-dictionary-" + str(version) + ".json" p("Reading json file", "info") with open(inputfilename, "rb") as inputfile: iggdictionary = yaml.load(inputfile) # Loading the dictionary from file p("finished reading python json", "info") iggiabtaxonomy = "igg-iabtaxonomy" + str(version) iggiabtaxonomynew = "igg-iabtaxonomy" + str(version + 1) dictionary = iggdictionary[iggiabtaxonomy] newdictionary = dict() # Dictionary for storing the final results paraentries = dict( ) # Dictionary for keeping track on the changes made to the entries disambiguation = dict() # Dictionary for all disambiguation titles disambiguationentries = dict( ) # Dictionary for keeping track on all disambiguation entries