Example #1
import gzip, re
from myprint import myprint as p

id_to_name = dict()
name_to_id = dict()

def read_ids(idfilename):
    with gzip.open(idfilename) as idfile:
        for line in idfile:
            line = line.strip()

            regex = r"(\d+),\s+(.*)"
            identry = re.findall(regex, line)

            # Skip lines that do not match the "<id>, <name>" pattern
            # (indexing an empty result would raise IndexError)
            if not identry:
                continue
            myid, myentry = identry[0]

            id_to_name[myid] = myentry
            name_to_id[myentry] = myid
    p("Read all ids", "info")
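For reference, a minimal sketch of the line format read_ids expects; the sample line is made up:

import re

# One made-up line in the "<id>, <name>" format parsed above.
sample = "42, main topic classifications"
match = re.findall(r"(\d+),\s+(.*)", sample)
if match:
    myid, myentry = match[0]
    print myid + " -> " + myentry  # 42 -> main topic classifications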
Example #3
import sys, gzip, time
from myprint import myprint as p

try:
    categoryinputfilename = sys.argv[1]
    hiddencategoryinputfilename = sys.argv[2]
except IndexError:
    print "\n[RUN]: \n"\
    "python Is_hidden_there.py \n"\
    "\t [Subcat-links.txt.gz]\n"\
    "\t [All_hidden_categories.txt.gz]\n"
    exit(0)

hidden_cat = dict()         #Dictionary to check all hidden category names
hiddencnt = 0               #Counter to see how many hidden categories are read
hiddencat_found = False     #Boolean to determine if a hidden category is found within the graph

# Reads all hidden categories from the file containing the names of all hidden categories
p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename) as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()
        if line in hidden_cat:
            hidden_cat[line] += 1
        else:
            hiddencnt += 1
            hidden_cat[line] = 1
p("All hidden categories read", "info")

# Reads the category graph
p("Read all category links", "info")
with gzip.open(categoryinputfilename, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
def find_grades(categoryinfofilename, categoriesoutputfilename): 
    global grades
    global C_in
    global C_out
    global avg_in
    global avg_out
    starttime = time.time()
    begintime = starttime
    p("Reading all category info", "info")
    parent = child = ""
    graph = dict()    # Dictionary to keep track of the children to each parent cat
    subgraph = dict() # Dictionary to keep track of the parents of each cat
    
    # Creating category graph
    
    with open(categoryinfofilename) as categorygraph:
        for line in categorygraph:
            line = line.strip()
            if line.startswith("*"): #children
                child = name_to_id[line[2:]]
                if parent == "":
                    continue
                if parent in graph:
                    if child not in graph[parent]: 
                        graph[parent].append(child)
                else:
                    graph[parent] = [child]

                if child in subgraph: 
                    if parent in subgraph[child]: 
                        a = 0
                    else: 
                        subgraph[child].append(parent)
                else: 
                    subgraph[child] = [parent]

            else:
                line = line.replace("_", " ")
                parent = name_to_id[line]
            #idmapper.insert_name(parent)
                
    p("Finished reading all info [Time: %s sec]" %(time.time()-starttime), "info")
    maxparent = maxchildren = outlinks = inlinks = 0
    mparent = mchildren = ""
    
    C_in = len(subgraph)  # number of categories with incoming links
    C_out = len(graph)    # number of categories with outgoing links (assuming len(graph) was intended here)
    for category in graph:
        if len(graph[category]) > maxchildren:
            maxchildren = len(graph[category])
            mchildren = category
        outlinks += len(graph[category])

        grades[category] = [len(graph[category])]

        if category in subgraph:
            
            grades[category].append(len(subgraph[category]))
            
            inlinks += len(subgraph[category])
            if len(subgraph[category]) > maxparent: 
                maxparent = len(subgraph[category])
                mparent = category
            """
                if (len(graph[category]) > 10) and (len(subgraph[category]) > 10): 
                cnt10+= 1
            if (len(graph[category]) > 20) and (len(subgraph[category]) > 20): 
                cnt20+= 1
            if (len(graph[category]) > 30) and (len(subgraph[category]) > 30): 
                cnt30+= 1
            if (len(graph[category]) > 40) and (len(subgraph[category]) > 40): 
                cnt40+= 1
            if (len(graph[category]) > 50) and (len(subgraph[category]) > 50): 
                cnt50+= 1
            if (len(graph[category]) > 60) and (len(subgraph[category]) > 60): 
                cnt60+= 1
            if (len(graph[category]) > 70) and (len(subgraph[category]) > 70): 
                cnt70+= 1
            if (len(graph[category]) > 80) and (len(subgraph[category]) > 80): 
                cnt80+= 1
            if (len(graph[category]) > 90) and (len(subgraph[category]) > 90): 
                cnt90+= 1
            if (len(graph[category]) > 100) and (len(subgraph[category]) > 100): 
                cnt100+= 1
                print "category: %s, number: %d\n" %(category, len(graph[category]))
                """
                #print "%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category]))
            #outputfile.write("%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category])))
            subgraph.pop(category, None)
        else: 
            grades[category].append(0)
            #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category])))
    # Remaining entries in subgraph are categories that have parents but no children
    for category in subgraph:
        grades[category] = [0, len(subgraph[category])]
        inlinks += len(subgraph[category])

    avg_in = float(inlinks) / C_in    # average in-degree (float to avoid integer truncation in Python 2)
    avg_out = float(outlinks) / C_out # average out-degree
                #print "%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category]))
            #outputfile.write("%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category])))
            subgraph.pop(category, None)
        else: 
            grades[category].append(0)
            #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category])))
    for category in subgraph: 
        grades[category] = [0, len(subgraph[category])]
        inlinks += len(subgraph[category])

    avg_in = inlinks / C_in
    avb_out = outlinks / C_out
        #outputfile.write("Only parents: %s: %d\n" %(category, len(subgraph[category])))
    #return create_grades()
#outputfile.close()
"""
thresholds = [10, 20, 30, 40, 50]
p("Number of categories with %d parent categories and subcategories: %d" %(10, cnt10), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(20, cnt20), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(30, cnt30), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(40, cnt40), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(50, cnt50), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(60, cnt60), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(70, cnt70), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(80, cnt80), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(90, cnt90), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(100, cnt100), "info")

p("Maxparent: %d (%s)" %(maxparent, mparent), "info")
p("Maxchildren: %d (%s)" %(maxchildren, mchildren), "info")
#subcats = graph["2015"]
Example #6
import gzip, json, yaml, io
from myprint import myprint as p
"""
Program for a file containing all dictionary entries and what page ids they are based on.
Sorts out all relevant dictionary entries based on the latest version of the igg-iabtaxonomy

Why? Because the file containing all possible entries and their page ids is extremely large.

Input: A file containing all processed Wikipedia article pages and their corresponding ids
Output: A file continaing the relevant dictionary entries and what page ids they are based on
"""

# Reading the English pageid-to-pagetitle mapping
enmappingfilename = "pageid-pagetitle-en.txt.gz"
enpagetitle_to_id = dict()
p("Reading %s" % (enmappingfilename), "info")
with gzip.open(enmappingfilename, "rb") as enmappingfile:
    for line in enmappingfile:
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        pageid = splittet[0]
        pagetitle = splittet[1]
        if pagetitle in enpagetitle_to_id:
            enpagetitle_to_id[pagetitle].append(pageid)
        else:
            enpagetitle_to_id[pagetitle] = [pageid]
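A sketch of how the loaded mapping would be queried; the title below is made up, and a title can map to several page ids, which is why the values are lists:

for pageid in enpagetitle_to_id.get("python (programming language)", []):
    print pageid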

latestversion = 5  # Latest version, update for later versions
    "\t [category-info.txt]\n"\
    "\t [article-info.txt.gz]\n"\
    "\t [article-output.txt.gz] \n\n"\
    "[FUNC:]\n"\
    "Create the complete paths of the articles. \n"
    exit(0)

startcategory = "Main topic classifications" #"Fundamental Categories"
startcategory = startcategory.lower()
parent = ""
graph = dict()    #Dictionary to keep track of the children to each parent cat
subgraph = dict() #Dictionary to keep track of the parents to each subcategory

starttime = time.time()
begintime = starttime
p("Reading all category info", "info")
parent = child = ""
with open(categoryinfofilename) as categorygraph:
    for line in categorygraph:
        line = line.strip()
        if line.startswith("*"): #children
            child_name = line[2:]
            idmapper.insert_name(child_name)
            child = idmapper.name_to_id(child_name)
            if parent == "":
                continue
            if parent in graph:
             #   if child not in graph[parent]:
                graph[parent].append(child)
            else:
                graph[parent] = [child]
Example #8
    "python Hidden_categories.py \n"\
    "\t [enwiki-latest-page_props.sql.gz]\n"\
    "\t [enwiki-latest-page.sql.gz\n"\
    "\t [All_hidden_categories.txt.gz]\n\n"\
    "[FUNC:] Find all hidden categories in the categorylinks file and then in the page_props file and combine all of these to one big file. \n"
    exit(0)

hiddencat_id = dict()  #Dictionary to keep track of the ids of all hidden categories
hidden_cat = dict()    #Dictionary to keep track of the names of all hidden categories
hiddencnt = 0
start_time = time.time()

# Looping through page props to find the ids of all hidden categories
p("Finding all hidden category ids from page props...", "info")
with gzip.open(pagepropsinputfilename) as inputfile:
    for line in inputfile:
        if (line.startswith("INSERT")):
            line = line.split("VALUES (")[1]
            line.decode('utf-8', 'ignore')
            insertions = line.split("),(")
            for insertion in insertions:
                insertion = insertion.lower()
                words = insertion.split(",")
                if "hiddencat" in words[1].lower():
                    # Hidden category is found, id i added
                    hiddencat_id[words[0]] = 1
                    hiddencnt += 1  # Counter to keep track of number of hidden categories
mytime = time.time() - start_time
p("Found %d hidden category ids (%d min, %d sec)" % (hiddencnt, mytime / 60, mytime % 60), "info")
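The snippets above parse MySQL dumps by splitting each INSERT statement on "),("; a minimal sketch on a made-up line (note that this simple split can misfire if a quoted value itself contains "),("):

# A made-up INSERT line in the page_props dump format.
line = "INSERT INTO `page_props` VALUES (123,'hiddencat','1',1),(456,'displaytitle','Foo',0);"

hiddencat_id = {}
for insertion in line.split("VALUES (")[1].split("),("):
    words = insertion.lower().split(",")
    if "hiddencat" in words[1]:
        hiddencat_id[words[0]] = 1

print hiddencat_id  # {'123': 1}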
Example #9
import sys, gzip, time
from myprint import myprint as p

try:
    titlefilename = sys.argv[1]
    outputtitlesfilename = sys.argv[2]
except IndexError:
    print "\n[RUN]: \n"\
    "python Redirecting.py \n"\
    "\t [enwiki-latest-redirects.sql.gz]\n"\
    "\t [output-redirect-titles.txt.gz]\n\n"\
    "[FUNC:]\n"\
    "Find all titles that redirect \n"
    exit(0)

redirects = dict()  #Dictionary for keeping all redirect pages
starttime = time.time()

# Reads the redirect file
p("Reading all redirect titles", "info")
with gzip.open(titlefilename) as titlefile:
    for line in titlefile:
        if line.startswith("INSERT"):
            line.decode('utf-8', 'ignore')
            line_split = line[30:] #.split("VALUES (")[1]
            insertions = line_split.split("),(")
            for insertion in insertions:

                # Code for encoding
                try:
                    insertion = insertion.decode('unicode-escape')
                except SyntaxError:
                    insertion = insertion.decode('ascii')
                except Exception,e:
                    a = 0
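The try/except ladder above turns escaped byte sequences from the dump into readable titles; a quick Python 2 illustration with a made-up title:

# A made-up escaped title as it would appear in the dump (a byte string in Python 2).
raw = "Malm\\xf6_FF"

try:
    title = raw.decode('unicode-escape')
except SyntaxError:
    title = raw.decode('ascii')

print title  # Malmö_FF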
"""
Program for finding the pageid for an entry.
Needs to do the same process as the mapper so that the entries are identical.
"""


categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

redirects = dict()
pageid_to_title = dict()
pagetitle_to_id = dict()

"""
p("Reading %s" %(inputfilename), "info")
with gzip.open(inputfilename, "rb") as inputfile: 
    for line in inputfile: 
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2: 
            #print "< 2: " + line
            continue
        nopages[splittet[0]] = splittet[1]

inputfile.close()
p("Finished reading %s" %(inputfilename), "info")
"""
p("Reading %s" %(redirectinputfilename), "info")
Example #11
    "python Remove_hiddencats.py \n"\
    "\t [Sub-categories-new.txt.gz]\n"\
    "\t [All_hidden_categories.txt.gz]\n"\
    "\t [Subcat-links.txt.gz\n\n"\
    "[FUNC:] Split the categorylink-file to the categories concerning pages and those concerning sub categories. Skip all hidden categories to remove number of relevant category links.\n"
    exit(0)

allcategories = dict()  #Dictionary to keep track of all categories
hidden_cat = dict()     #Dictionary to keep track of all hidden categories
links = dict()          #Dictionary to keep track of all links in the graph
artskip = catskip = hiddencnt = 0

starttime = time.time()

# Reads all the hidden categories from the file
p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename, "rb") as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()

        # Code for representing the category names in the same encoding
        try:
            line = line.decode('unicode-escape')
        except SyntaxError:
            line = line.decode('ascii')
        except Exception:
            pass

        try:
            line = unidecode(line)
        except UnicodeEncodeError:
            pass
Example #14
import gzip
from myprint import myprint as p

class Path(object):
    def __init__(self, score, path):
        self.score = score
        self.path = path

    def __cmp__(self, other):
        # Reversed comparison (Python 2): a higher score compares as "smaller",
        # so sorting or a heapq of Path objects yields the highest score first.
        return cmp(other.score, self.score)
        
letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", \
               "t", "u", "v", "w", "x", "y", "z", "restfile"]

gradesfilename = "category-grade.txt.gz"
idfilename = "id-mapper.txt.gz"

p("Reading all grades", "info")
allgrades = dict()
grades = dict()
with gzip.open(gradesfilename, "rb") as gradefile: 
    for line in gradefile: 
        line = line.strip()
        splittet = line.split("\t")
        category = splittet[0]
        grade = float(splittet[1])
        grades[category] = grade

id_to_name = dict()
p("Reading all ids for all categories", "info")
with gzip.open(idfilename, "rb") as idfile: 
    for line in idfile: 
        line = line.strip()
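A sketch of how the reversed __cmp__ plays out with heapq (Python 2 semantics); the scores and paths are made up:

import heapq

heap = []
heapq.heappush(heap, Path(0.3, ["a", "b"]))
heapq.heappush(heap, Path(0.9, ["a", "c"]))
heapq.heappush(heap, Path(0.5, ["a"]))

best = heapq.heappop(heap)
print best.score, best.path  # 0.9 ['a', 'c'] -- the highest score pops first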
Example #16
    # Method for printing the entry, only for debugging
    def entryprint(self):
        print "Old: %s, new: %s, tierones: %s\n" %(self.oldentry, self.newentry, self.tierone) 

    # Method for comparing this object to another: they are equal when every
    # tier-one category of the other entry is also present in this one
    def myequal(self, obj):
        for category in obj.tierone:
            if category not in self.tierone:
                return False
        return True

version = 4                                     # Version of the dictionary to be cleaned

# Reading the dictionary
inputfilename = "igg-dictionary-" + str(version) + ".json"
p("Reading json file", "info")
with open(inputfilename, "rb") as inputfile:
    iggdictionary = yaml.load(inputfile)        # Loading the dictionary from file
p("finished reading python json", "info")

iggiabtaxonomy = "igg-iabtaxonomy" + str(version)
iggiabtaxonomynew = "igg-iabtaxonomy" + str(version+1)
dictionary = iggdictionary[iggiabtaxonomy]

newdictionary = dict()                # Dictionary for storing the final results
paraentries = dict()                  # Dictionary for keeping track of the changes made to the entries
disambiguation = dict()               # Dictionary for all disambiguation titles
disambiguationentries = dict()        # Dictionary for keeping track of all disambiguation entries

yearregex = "(\d\d\d\d)"              # Regex for recognizing years in the title
parenthesisregex = "(\(.*\))"         # Regex for recognizing parenthesis in the title
Example #17
import gzip, re
from myprint import myprint as p
from unidecode import unidecode

inputfilename = "no-mapping.txt.gz"
categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

nopages = dict()
redirects = dict()
pages = dict()

p("Reading %s" %(inputfilename), "info")
with gzip.open(inputfilename, "rb") as inputfile: 
    for line in inputfile: 
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        nopages[splittet[0]] = splittet[1]

p("Finished reading %s" %(inputfilename), "info")

p("Reading %s" %(redirectinputfilename), "info")
with gzip.open(redirectinputfilename, "rb") as redirectfile:
    for line in redirectfile:     
        #line = unicode(line, "utf-8")
Example #19
from myprint import myprint as p

"""
Program for mapping all keywords to IAB categories. 
"""

inputfilename = "articlemapping-all.txt.gz"
outputcategoriesfilename = "Outputcategories"

letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "restfile"]

outputcategories = dict()

tiertwo = ""
tierone = ""
p("Reading output categories", "info")
with open(outputcategoriesfilename, "r") as outputcatfile: 
    for line in outputcatfile: 
        line = line.strip()
        if line.startswith("*"): 
            tierone = line[1:]
            tierone = line.lower()
        else: 
            tiertwo = line
            tiertwo = line.lower()
            outputcategories[tiertwo] = tierone
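A quick illustration of the mapping this loop builds, with made-up file contents:

# Made-up file contents: one tier-one line marked with "*", then tier-two lines.
sample = ["*Sports", "Basketball", "Tennis"]

outputcategories = {}
tierone = ""
for line in sample:
    if line.startswith("*"):
        tierone = line[1:].lower()
    else:
        outputcategories[line.lower()] = tierone

print outputcategories  # {'tennis': 'sports', 'basketball': 'sports'}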


idfilename = "id-mapper.txt.gz"
id_to_name = dict()
name_to_id = dict()
Example #22
def is_number(input):
    try:
        int(input)
        return True
    except (ValueError, TypeError):
        return False


start_time = time.time()
categorycnt = pagecnt = lines = 0

hidden_cat = dict()
artskip = catskip = hiddencnt = 0

p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename) as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()
        if line in hidden_cat:
            hidden_cat[line] += 1
        else:
            hidden_cat[line] = 1
p("All hidden categories read", "info")

redirects = dict()
p("Reading all redirects", "info")
with gzip.open(redirectinputfilename, "rb") as redirectfile:
    for line in redirectfile:
        line = line.decode('utf-8', 'ignore')
        line = line.lower()
    "python Remove_hiddencats.py \n"\
    "\t [Sub-categories-new.txt.gz]\n"\
    "\t [All_hidden_categories.txt.gz]\n"\
    "\t [Subcat-links.txt.gz\n\n"\
    "[FUNC:] Split the categorylink-file to the categories concerning pages and those concerning sub categories. Skip all hidden categories to remove number of relevant category links.\n"
    exit(0)

allcategories = dict()  #Dictionary to keep track of all categories
hidden_cat = dict()  #Dictionary to keep track of all hidden categories
links = dict()  #Dictionary to keep track of all links in the graph
artskip = catskip = hiddencnt = 0

starttime = time.time()

# Reads all the hidden categories from the file
p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename, "rb") as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()

        # Code for representing the category names in same encoding
        try:
            line = line.decode('unicode-escape')
        except SyntaxError:
            line = line.decode('ascii')
        except Exception, e:
            a = 0

        try:
            line = unidecode(line)
        except UnicodeEncodeError, e:
Example #24
    "economics", "education", "environment", "form", "geography", "government",
    "health", "history", "humanities", "humans", "industry", "information",
    "intellectual works", "knowledge", "language", "law", "leisure", "life",
    "mathematics", "matter", "medicine", "mind", "nature", "people",
    "politics", "professional studies", "science", "scientific disciplines",
    "society", "sports", "structure", "systems", "technology", "thought",
    "tools", "transport", "universe", "world"
]
startcategory = startcategory.lower()
parent = ""
graph = dict()  #Dictionary to keep track of the children to each parent cat
subgraph = dict()  #Dictionary to keep track of the parents to each subcategory

starttime = time.time()
begintime = starttime
p("Reading all category info", "info")
parent = child = ""
with open(categoryinfofilename) as categorygraph:
    for line in categorygraph:
        line = line.strip()
        if line.startswith("*"):  #children
            child = line[2:]
            idmapper.insert_name(child)
            if parent == "":
                continue
            if parent in graph:
                #   if child not in graph[parent]:
                graph[parent].append(child)
            else:
                graph[parent] = [child]
        else:
        "\t [Page-categories.txt.gz]\n"\
        "\t [article-info.txt.gz]\n\n"\
    "[FUNCTION]: \n"\
    "Store all articles with their immidiate subcategories\n"
    exit(0)

reload(sys)
sys.setdefaultencoding('utf-8')

articles = dict()  #Dictionary to keep track of all categories and their articles
artcnt = teller = articlecnt = 0
starttime = time.time()

# Reads the file containing links between categories and articles
p("Reading all article content...", "info")
with gzip.open(articleinputfilename) as articleinfo:
    for line in articleinfo:
        line = line.strip()
        lines = line.split("\t")
        if len(lines) < 2:
            continue
        category = lines[0].lower()
        page = lines[1].lower()
        if "" == page or " " == page:
            continue

        if page in articles:
            # page is already in the dictionary
            if category not in articles[page]:
                # Add the category if not present
                articles[page].append(category)
        else:
            articles[page] = [category]
Example #27
# -*- coding: utf-8 -*-
import gzip, json, yaml, io
from myprint import myprint as p

"""
Program for creating a dictionary for a dictionary-based classifier
for another language (based on the English dictionary)
"""

nomappingfilename = "no-mapping.txt.gz"

nomapping = dict()
p("Reading %s" %(nomappingfilename), "info")
with gzip.open(nomappingfilename, "rb") as nomappingfile: 
    for line in nomappingfile: 
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2: 
            continue
        nomapping[splittet[0]] = splittet[1]

entrytoidfilename = "en-entry-to-pageid.json"
p("Reading %s" %(entrytoidfilename), "info")
with open(entrytoidfilename, "rb") as inputfile:
    entrypageid = yaml.load(inputfile)
p("finished reading python json", "info")

norwegianstopwords = []
stopwordfile = "norwegian_stop_words.txt"
with open(stopwordfile, "r") as inputfile: 
        print "Old: %s, new: %s, tierones: %s\n" % (
            self.oldentry, self.newentry, self.tierone)

    # Method for comparing this object to another
    def myequal(self, obj):
        for category in obj.tierone:
            if category not in self.tierone:
                return False
        return True


version = 4  # Version of the dictionary to be cleaned

# Reading the dictionary
inputfilename = "igg-dictionary-" + str(version) + ".json"
p("Reading json file", "info")
with open(inputfilename, "rb") as inputfile:
    iggdictionary = yaml.load(inputfile)  # Loading the dictionary from file
p("finished reading python json", "info")

iggiabtaxonomy = "igg-iabtaxonomy" + str(version)
iggiabtaxonomynew = "igg-iabtaxonomy" + str(version + 1)
dictionary = iggdictionary[iggiabtaxonomy]

newdictionary = dict()  # Dictionary for storing the final results
paraentries = dict(
)  # Dictionary for keeping track on the changes made to the entries
disambiguation = dict()  # Dictionary for all disambiguation titles
disambiguationentries = dict(
)  # Dictionary for keeping track on all disambiguation entries
# -*- coding: utf-8 -*-
import gzip, json, yaml, io
from myprint import myprint as p
"""
Program for creating a dictionary for a dictionary-based classifier
for another language (based on the English dictionary)
"""

nomappingfilename = "no-mapping.txt.gz"

nomapping = dict()
p("Reading %s" % (nomappingfilename), "info")
with gzip.open(nomappingfilename, "rb") as nomappingfile:
    for line in nomappingfile:
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        nomapping[splittet[0]] = splittet[1]
nomappingfile.close()

entrytoidfilename = "en-entry-to-pageid.json"
p("Reading %s" % (entrytoidfilename), "info")
with open(entrytoidfilename, "rb") as inputfile:
    entrypageid = yaml.load(inputfile)
p("finished reading python json", "info")

norwegianstopwords = []
stopwordfile = "norwegian_stop_words.txt"
with open(stopwordfile, "r") as inputfile:
    for line in inputfile: