Example #1
def read_ids(idfilename):
    with gzip.open(idfilename) as idfile:
        for line in idfile:
            line = line.strip()

            # Each line is expected to look like "<id>, <name>"
            regex = r"(\d+),\s+(.*)"
            identry = re.findall(regex, line)
            if not identry:
                continue

            myid = identry[0][0]
            myentry = identry[0][1]

            id_to_name[myid] = myentry
            name_to_id[myentry] = myid
    p("Read all ids", "info")
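# A minimal usage sketch (assumptions: id_to_name/name_to_id exist as
# module-level dicts, and the id file holds "<id>, <name>" lines, which is
# what the regex above expects; the file name is taken from later examples):
#
#   id_to_name = dict()
#   name_to_id = dict()
#   read_ids("id-mapper.txt.gz")
#   print id_to_name.get("42")   # -> the name stored for id "42", if any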
try:
    categoryinputfilename = sys.argv[1]
    hiddencategoryinputfilename = sys.argv[2]
except IndexError:
    print "\n[RUN]: \n"\
    "python Is_hidden_there.py \n"\
    "\t [Subcat-links.txt.gz]\n"\
    "\t [All_hidden_categories.txt.gz]\n"
    exit(0)

hidden_cat = dict()         #Dictionary to check all hidden category names
hiddencnt = 0               #Counter to see how many hidden categories are read
hiddencat_found = False     #Boolean to determine if a hidden category is found within the graph

# Reads all hidden categories from the file containing the names of all hidden categories
p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename) as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()
        if line in hidden_cat:
            hidden_cat[line] += 1
        else:
            hiddencnt += 1
            hidden_cat[line]=1
p("All hidden categories read", "info")

# Reads the category graph
p("Read all category links", "info")
with gzip.open(categoryinputfilename, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
def find_grades(categoryinfofilename, categoriesoutputfilename): 
    global grades
    global C_in
    global C_out
    global avg_in
    global avg_out
    starttime = time.time()
    begintime = starttime
    p("Reading all category info", "info")
    parent = child = ""
    graph = dict()    # Dictionary to keep track of the children to each parent cat
    subgraph = dict() # Dictionary to keep track of the parents of each cat
    
    # Creating category graph
    
    with open(categoryinfofilename) as categorygraph:
        for line in categorygraph:
            line = line.strip()
            if line.startswith("*"): #children
                child = name_to_id[line[2:]]
                if parent == "":
                    continue
                if parent in graph:
                    if child not in graph[parent]: 
                        graph[parent].append(child)
                else:
                    graph[parent] = [child]

                if child in subgraph:
                    if parent not in subgraph[child]:
                        subgraph[child].append(parent)
                else:
                    subgraph[child] = [parent]

            else:
                line = line.replace("_", " ")
                parent = name_to_id[line]
            #idmapper.insert_name(parent)
                
    p("Finished reading all info [Time: %s sec]" %(time.time()-starttime), "info")
    maxparent = maxchildren = outlinks = inlinks = 0
    mparent = mchildren = ""
    
    C_in = len(subgraph)   # number of categories with at least one parent (incoming links)
    C_out = len(graph)     # number of categories with at least one child (outgoing links)
    for category in graph: 
        if len(graph[category]) > maxchildren:
            maxchildren = len(graph[category])
            mchildren = category
        outlinks += len(graph[category])
        
        grades[category] = [len(graph[category])]

        if category in subgraph:
            
            grades[category].append(len(subgraph[category]))
            
            inlinks += len(subgraph[category])
            if len(subgraph[category]) > maxparent: 
                maxparent = len(subgraph[category])
                mparent = category
            """
                if (len(graph[category]) > 10) and (len(subgraph[category]) > 10): 
                cnt10+= 1
            if (len(graph[category]) > 20) and (len(subgraph[category]) > 20): 
                cnt20+= 1
            if (len(graph[category]) > 30) and (len(subgraph[category]) > 30): 
                cnt30+= 1
            if (len(graph[category]) > 40) and (len(subgraph[category]) > 40): 
                cnt40+= 1
            if (len(graph[category]) > 50) and (len(subgraph[category]) > 50): 
                cnt50+= 1
            if (len(graph[category]) > 60) and (len(subgraph[category]) > 60): 
                cnt60+= 1
            if (len(graph[category]) > 70) and (len(subgraph[category]) > 70): 
                cnt70+= 1
            if (len(graph[category]) > 80) and (len(subgraph[category]) > 80): 
                cnt80+= 1
            if (len(graph[category]) > 90) and (len(subgraph[category]) > 90): 
                cnt90+= 1
            if (len(graph[category]) > 100) and (len(subgraph[category]) > 100): 
                cnt100+= 1
                print "category: %s, number: %d\n" %(category, len(graph[category]))
                """
                #print "%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category]))
            #outputfile.write("%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category])))
            subgraph.pop(category, None)
        else: 
            grades[category].append(0)
            #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category])))
    for category in subgraph: 
        grades[category] = [0, len(subgraph[category])]
        inlinks += len(subgraph[category])

    # Note: "/" between two ints truncates in Python 2; use float(inlinks) / C_in
    # if a fractional average is wanted
    avg_in = inlinks / C_in
    avg_out = outlinks / C_out
                #print "%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category]))
            #outputfile.write("%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category])))
            subgraph.pop(category, None)
        else: 
            grades[category].append(0)
            #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category])))
    for category in subgraph: 
        grades[category] = [0, len(subgraph[category])]
        inlinks += len(subgraph[category])

    avg_in = inlinks / C_in
    avb_out = outlinks / C_out
        #outputfile.write("Only parents: %s: %d\n" %(category, len(subgraph[category])))
    #return create_grades()
#outputfile.close()
"""
thresholds = [10, 20, 30, 40, 50]
p("Number of categories with %d parent categories and subcategories: %d" %(10, cnt10), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(20, cnt20), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(30, cnt30), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(40, cnt40), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(50, cnt50), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(60, cnt60), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(70, cnt70), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(80, cnt80), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(90, cnt90), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(100, cnt100), "info")

p("Maxparent: %d (%s)" %(maxparent, mparent), "info")
p("Maxchildren: %d (%s)" %(maxchildren, mchildren), "info")
#subcats = graph["2015"]
Example #6
import gzip, json, yaml, io
from myprint import myprint as p
"""
Program for a file containing all dictionary entries and what page ids they are based on.
Sorts out all relevant dictionary entries based on the latest version of the igg-iabtaxonomy

Why? Because the file containing all possible entries and their page ids is extremely large.

Input: A file containing all processed Wikipedia article pages and their corresponding ids
Output: A file continaing the relevant dictionary entries and what page ids they are based on
"""

# Reading the English pageid-to-pagetitle mapping
enmappingfilename = "pageid-pagetitle-en.txt.gz"
enpagetitle_to_id = dict()
p("Reading %s" % (enmappingfilename), "info")
with gzip.open(enmappingfilename, "rb") as enmappingfile:
    for line in enmappingfile:
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        pageid = splittet[0]
        pagetitle = splittet[1]
        if pagetitle in enpagetitle_to_id:
            enpagetitle_to_id[pagetitle].append(pageid)
        else:
            enpagetitle_to_id[pagetitle] = [pageid]

latestversion = 5  # Latest version, update for later versions
    "\t [category-info.txt]\n"\
    "\t [article-info.txt.gz]\n"\
    "\t [article-output.txt.gz] \n\n"\
    "[FUNC:]\n"\
    "Create the complete paths of the articles. \n"
    exit(0)

startcategory = "Main topic classifications" #"Fundamental Categories"
startcategory = startcategory.lower()
parent = ""
graph = dict()    #Dictionary to keep track of the children to each parent cat
subgraph = dict() #Dictionary to keep track of the parents to each subcategory

starttime = time.time()
begintime = starttime
p("Reading all category info", "info")
parent = child = ""
with open(categoryinfofilename) as categorygraph:
    for line in categorygraph:
        line = line.strip()
        if line.startswith("*"): #children
            child_name = line[2:]
            idmapper.insert_name(child_name)
            child = idmapper.name_to_id(child_name)
            if parent == "":
                continue
            if parent in graph:
             #   if child not in graph[parent]:
                graph[parent].append(child)
            else:
                graph[parent] = [child]
Example #8
    "python Hidden_categories.py \n"\
    "\t [enwiki-latest-page_props.sql.gz]\n"\
    "\t [enwiki-latest-page.sql.gz\n"\
    "\t [All_hidden_categories.txt.gz]\n\n"\
    "[FUNC:] Find all hidden categories in the categorylinks file and then in the page_props file and combine all of these to one big file. \n"
    exit(0)

hiddencat_id = dict()  #Dictionary to keep track of the ids of all hidden categories
hidden_cat = dict()    #Dictionary to keep track of the names of all hidden categories
hiddencnt = 0
start_time = time.time()

# Looping through page props to find the ids of all hidden categories
p("Finding all hidden category ids from page props...", "info")
with gzip.open(pagepropsinputfilename) as inputfile:
    for line in inputfile:
        if line.startswith("INSERT"):
            line = line.split("VALUES (")[1]
            line = line.decode('utf-8', 'ignore')
            insertions = line.split("),(")
            for insertion in insertions:
                insertion = insertion.lower()
                words = insertion.split(",")
                if "hiddencat" in words[1].lower():
                    # Hidden category found; its id is added
                    hiddencat_id[words[0]] = 1
                    hiddencnt += 1  # Counter to keep track of number of hidden categories
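# Illustrative only (hypothetical line, not copied from a real dump): an
# INSERT statement in the page_props dump looks roughly like
#   INSERT INTO `page_props` VALUES (12345,'hiddencat','',NULL),(67890,...);
# so after the splits above, words[0] == "12345" (the page id) and
# words[1] == "'hiddencat'" (the property name).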
mytime = time.time() - start_time
p(
Example #9
    titlefilename = sys.argv[1]
    outputtitlesfilename = sys.argv[2]
except IndexError:
    print "\n[RUN]: \n"\
    "python Redirecting.py \n"\
    "\t [enwiki-latest-redirects.sql.gz]\n"\
    "\t [output-redirect-titles.txt.gz]\n\n"\
    "[FUNC:]\n"\
    "Find all titles that redirecs \n"
    exit(0)

redirects = dict()  #Dictionary for keeping all redirect pages
starttime = time.time()

# Reads the redirect file
p("Reading all redirect titles", "info")
with gzip.open(titlefilename) as titlefile:
    for line in titlefile:
        if line.startswith("INSERT"):
            line = line.decode('utf-8', 'ignore')
            line_split = line[30:] #.split("VALUES (")[1]
            insertions = line_split.split("),(")
            for insertion in insertions:

                # Normalize the encoding of the insertion
                try:
                    insertion = insertion.decode('unicode-escape')
                except SyntaxError:
                    insertion = insertion.decode('ascii')
                except Exception:
                    pass  # leave the insertion as-is if decoding fails
"""
Program for finding the pageid for an entry.
Needs to do the same process as the mapper so that the entries are identical.
"""


categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

redirects = dict()
pageid_to_title = dict()
pagetitle_to_id = dict()

"""
p("Reading %s" %(inputfilename), "info")
with gzip.open(inputfilename, "rb") as inputfile: 
    for line in inputfile: 
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2: 
            #print "< 2: " + line
            continue
        nopages[splittet[0]] = splittet[1]

inputfile.close()
p("Finished reading %s" %(inputfilename), "info")
"""
p("Reading %s" %(redirectinputfilename), "info")
    "python Remove_hiddencats.py \n"\
    "\t [Sub-categories-new.txt.gz]\n"\
    "\t [All_hidden_categories.txt.gz]\n"\
    "\t [Subcat-links.txt.gz\n\n"\
    "[FUNC:] Split the categorylink-file to the categories concerning pages and those concerning sub categories. Skip all hidden categories to remove number of relevant category links.\n"
    exit(0)

allcategories = dict()  #Dictionary to keep track of all categories
hidden_cat = dict()     #Dictionary to keep track of all hidden categories
links = dict()          #Dictionary to keep track of all links in the graph
artskip = catskip = hiddencnt = 0

starttime = time.time()

# Reads all the hidden categories from the file
p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename, "rb") as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()

        # Normalize the encoding of the category names
        try:
            line = line.decode('unicode-escape')
        except SyntaxError:
            line = line.decode('ascii')
        except Exception:
            pass  # leave the line as-is if decoding fails

        try:
            line = unidecode(line)
        except UnicodeEncodeError, e:
Example #12
import gzip, re
from myprint import myprint as p
from unidecode import unidecode
"""
Program for finding the pageid for an entry.
Needs to do the same process as the mapper so that the entries are identical.
"""

categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

redirects = dict()
pageid_to_title = dict()
pagetitle_to_id = dict()
"""
p("Reading %s" %(inputfilename), "info")
with gzip.open(inputfilename, "rb") as inputfile: 
    for line in inputfile: 
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2: 
            #print "< 2: " + line
            continue
        nopages[splittet[0]] = splittet[1]

inputfile.close()
p("Finished reading %s" %(inputfilename), "info")
"""
p("Reading %s" % (redirectinputfilename), "info")
    print "\n[RUN]: \n"\
    "python Hidden_categories.py \n"\
    "\t [enwiki-latest-page_props.sql.gz]\n"\
    "\t [enwiki-latest-page.sql.gz\n"\
    "\t [All_hidden_categories.txt.gz]\n\n"\
    "[FUNC:] Find all hidden categories in the categorylinks file and then in the page_props file and combine all of these to one big file. \n"
    exit(0)


hiddencat_id = dict()   #Dictionary to keep track of the ids of all hidden categories
hidden_cat = dict()     #Dictionary to keep track of the names of all hidden categories
hiddencnt = 0
start_time = time.time()

# Looping through page props to find the ids of all hidden categories
p("Finding all hidden category ids from page props...", "info")
with gzip.open(pagepropsinputfilename) as inputfile:
    for line in inputfile:
        if line.startswith("INSERT"):
            line = line.split("VALUES (")[1]
            line = line.decode('utf-8', 'ignore')
            insertions = line.split("),(")
            for insertion in insertions:
                insertion = insertion.lower()
                words = insertion.split(",")
                if "hiddencat" in words[1].lower():
                    # Hidden category found; its id is added
                    hiddencat_id[words[0]] = 1
                    hiddencnt += 1  # Counter to keep track of number of hidden categories
mytime = time.time() - start_time
p("Found %d hidden category ids (%s min, %s min) ---" %(hiddencnt,  mytime/60, mytime%60), "info")
Example #14
class Path(object): 
    def __init__(self, score, path): 
        self.score = score
        self.path = path
        return 
    def __cmp__(self, other): 
        return cmp(other.score, self.score)
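# A minimal sketch of why the comparison is reversed (assumption, not shown
# in this snippet: Path objects are pushed onto a heapq, and heapq pops the
# "smallest" item, so reversing cmp turns it into a max-heap on score):
#
#   import heapq
#   heap = []
#   heapq.heappush(heap, Path(0.2, ["a"]))
#   heapq.heappush(heap, Path(0.9, ["b"]))
#   best = heapq.heappop(heap)   # best.score == 0.9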
        
letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", \
               "t", "u", "v", "w", "x", "y", "z", "restfile"]

gradesfilename = "category-grade.txt.gz"
idfilename = "id-mapper.txt.gz"

p("Reading all grades", "info")
allgrades = dict()
grades = dict()
with gzip.open(gradesfilename, "rb") as gradefile: 
    for line in gradefile: 
        line = line.strip()
        splittet = line.split("\t")
        category = splittet[0]
        grade = float(splittet[1])
        grades[category] = grade
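# Illustrative only (assumed format, inferred from the parsing above): each
# line of category-grade.txt.gz is "category<TAB>grade", e.g.
#   music<TAB>0.75   ->   grades["music"] == 0.75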

id_to_name = dict()
p("Reading all ids for all categories", "info")
with gzip.open(idfilename, "rb") as idfile: 
    for line in idfile: 
        line = line.strip()
Example #15
def find_grades(categoryinfofilename, categoriesoutputfilename):
    global grades
    global C_in
    global C_out
    global avg_in
    global avg_out
    starttime = time.time()
    begintime = starttime
    p("Reading all category info", "info")
    parent = child = ""
    graph = dict()     # Dictionary to keep track of the children to each parent cat
    subgraph = dict()  # Dictionary to keep track of the parents of each cat

    # Creating category graph

    with open(categoryinfofilename) as categorygraph:
        for line in categorygraph:
            line = line.strip()
            if line.startswith("*"):  #children
                child = name_to_id[line[2:]]
                if parent == "":
                    continue
                if parent in graph:
                    if child not in graph[parent]:
                        graph[parent].append(child)
                else:
                    graph[parent] = [child]

                if child in subgraph:
                    if parent not in subgraph[child]:
                        subgraph[child].append(parent)
                else:
                    subgraph[child] = [parent]

            else:
                line = line.replace("_", " ")
                parent = name_to_id[line]
            #idmapper.insert_name(parent)

    p("Finished reading all info [Time: %s sec]" % (time.time() - starttime),
      "info")
    maxparent = maxchildren = outlinks = inlinks = 0
    mparent = mchildren = ""

    C_in = len(subgraph)   # number of categories with at least one parent (incoming links)
    C_out = len(graph)     # number of categories with at least one child (outgoing links)
    for category in graph:
        if len(graph[category]) > maxchildren:
            maxchildren = len(graph[category])
            mchildren = category
        outlinks += len(graph[category])

        grades[category] = [len(graph[category])]

        if category in subgraph:

            grades[category].append(len(subgraph[category]))

            inlinks += len(subgraph[category])
            if len(subgraph[category]) > maxparent:
                maxparent = len(subgraph[category])
                mparent = category
            """
                if (len(graph[category]) > 10) and (len(subgraph[category]) > 10): 
                cnt10+= 1
            if (len(graph[category]) > 20) and (len(subgraph[category]) > 20): 
                cnt20+= 1
            if (len(graph[category]) > 30) and (len(subgraph[category]) > 30): 
                cnt30+= 1
            if (len(graph[category]) > 40) and (len(subgraph[category]) > 40): 
                cnt40+= 1
            if (len(graph[category]) > 50) and (len(subgraph[category]) > 50): 
                cnt50+= 1
            if (len(graph[category]) > 60) and (len(subgraph[category]) > 60): 
                cnt60+= 1
            if (len(graph[category]) > 70) and (len(subgraph[category]) > 70): 
                cnt70+= 1
            if (len(graph[category]) > 80) and (len(subgraph[category]) > 80): 
                cnt80+= 1
            if (len(graph[category]) > 90) and (len(subgraph[category]) > 90): 
                cnt90+= 1
            if (len(graph[category]) > 100) and (len(subgraph[category]) > 100): 
                cnt100+= 1
                print "category: %s, number: %d\n" %(category, len(graph[category]))
                """
            #print "%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category]))
            #outputfile.write("%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category])))
            subgraph.pop(category, None)
        else:
            grades[category].append(0)
            #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category])))
    for category in subgraph:
        grades[category] = [0, len(subgraph[category])]
        inlinks += len(subgraph[category])

    avg_in = inlinks / C_in
    avg_out = outlinks / C_out
    # Method for printing the entry, only for debugging
    def entryprint(self):
        print "Old: %s, new: %s, tierones: %s\n" %(self.oldentry, self.newentry, self.tierone) 

    # Method for comparing this object to another
    def myequal(self, obj):
        for category in obj.tierone:
            if category not in self.tierone:
                return False
        return True
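    # Note: despite its name, myequal() is a one-directional subset test
    # (every category in obj.tierone must also be in self.tierone, not vice
    # versa); a symmetric equality check would also need obj.myequal(self).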

version = 4                                     # Version of the dictionary to be cleaned

# Reading the dictionary
inputfilename = "igg-dictionary-" + str(version) + ".json"
p("Reading json file", "info")
with open(inputfilename, "rb") as inputfile:
    # yaml.load also parses JSON (JSON is a subset of YAML); yaml.safe_load
    # would be the safer choice for untrusted input
    iggdictionary = yaml.load(inputfile)        # Loading the dictionary from file
p("finished reading python json", "info")

iggiabtaxonomy = "igg-iabtaxonomy" + str(version)
iggiabtaxonomynew = "igg-iabtaxonomy" + str(version+1)
dictionary = iggdictionary[iggiabtaxonomy]

newdictionary = dict()                # Dictionary for storing the final results
paraentries = dict()                  # Dictionary for keeping track of the changes made to the entries
disambiguation = dict()               # Dictionary for all disambiguation titles
disambiguationentries = dict()        # Dictionary for keeping track of all disambiguation entries

yearregex = "(\d\d\d\d)"              # Regex for recognizing years in the title
parenthesisregex = "(\(.*\))"         # Regex for recognizing parentheses in the title
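# Illustrative only: on a title like "Boston (2015 film)",
#   re.findall(yearregex, title)          -> ["2015"]
#   re.findall(parenthesisregex, title)   -> ["(2015 film)"]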
Example #17
import gzip, re
from myprint import myprint as p
from unidecode import unidecode

inputfilename = "no-mapping.txt.gz"
categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

nopages = dict()
redirects = dict()
pages = dict()

p("Reading %s" %(inputfilename), "info")
with gzip.open(inputfilename, "rb") as inputfile: 
    for line in inputfile: 
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        nopages[splittet[0]] = splittet[1]

p("Finished reading %s" %(inputfilename), "info")

p("Reading %s" %(redirectinputfilename), "info")
with gzip.open(redirectinputfilename, "rb") as redirectfile:
    for line in redirectfile:     
        #line = unicode(line, "utf-8")
from myprint import myprint as p

"""
Program for a file containing all dictionary entries and what page ids they are based on.
Sorts out all relevant dictionary entries based on the latest version of the igg-iabtaxonomy

Why? Because the file containing all possible entries and their page ids is extremely large.

Input: A file containing all processed Wikipedia article pages and their corresponding ids
Output: A file continaing the relevant dictionary entries and what page ids they are based on
"""

# Reading the English pageid-to-pagetitle mapping
enmappingfilename = "pageid-pagetitle-en.txt.gz"
enpagetitle_to_id = dict()
p("Reading %s" %(enmappingfilename), "info")
with gzip.open(enmappingfilename, "rb") as enmappingfile: 
    for line in enmappingfile: 
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2: 
            continue
        pageid = splittet[0]
        pagetitle = splittet[1]
        if pagetitle in enpagetitle_to_id:
            enpagetitle_to_id[pagetitle].append(pageid)
        else: 
            enpagetitle_to_id[pagetitle] = [pageid]

Example #19
from myprint import myprint as p

"""
Program for mapping all keywords to IAB categories. 
"""

inputfilename = "articlemapping-all.txt.gz"
outputcategoriesfilename = "Outputcategories"

letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "restfile"]
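# Assumption (not stated in this snippet): the letters list is used to shard
# the output by the first letter of each keyword, with "restfile" as the
# catch-all bucket for keywords that start with a non-letter.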

outputcategories = dict()

tiertwo = ""
tierone = ""
p("Reading output categories", "info")
with open(outputcategoriesfilename, "r") as outputcatfile: 
    for line in outputcatfile: 
        line = line.strip()
        if line.startswith("*"):
            tierone = line[1:].lower()
        else:
            tiertwo = line.lower()
            outputcategories[tiertwo] = tierone
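# Illustrative only (assumed format, inferred from the parsing above): the
# Outputcategories file marks tier-one IAB categories with a leading "*" and
# lists their tier-two categories on the following lines, e.g.
#   *Arts & Entertainment
#   Music
# which yields outputcategories["music"] == "arts & entertainment".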


idfilename = "id-mapper.txt.gz"
id_to_name = dict()
name_to_id = dict()
Example #20
Program for mapping all keywords to IAB categories. 
"""

inputfilename = "articlemapping-all.txt.gz"
outputcategoriesfilename = "Outputcategories"

letters = [
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
    "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "restfile"
]

outputcategories = dict()

tiertwo = ""
tierone = ""
p("Reading output categories", "info")
with open(outputcategoriesfilename, "r") as outputcatfile:
    for line in outputcatfile:
        line = line.strip()
        if line.startswith("*"):
            tierone = line[1:].lower()
        else:
            tiertwo = line.lower()
            outputcategories[tiertwo] = tierone

idfilename = "id-mapper.txt.gz"
id_to_name = dict()
name_to_id = dict()
p("Reading id-mapper", "info")
Example #21
class Path(object):
    def __init__(self, score, path):
        self.score = score
        self.path = path
        return

    def __cmp__(self, other):
        return cmp(other.score, self.score)
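# Note: __cmp__ and the cmp() built-in exist only in Python 2. A Python 3
# port (a sketch) would define a rich comparison instead:
#
#   class Path(object):
#       def __init__(self, score, path):
#           self.score = score
#           self.path = path
#       def __lt__(self, other):
#           # reversed on purpose: a higher score sorts "smaller", so a
#           # heapq of Paths pops the highest-scoring path first
#           return other.score < self.score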

letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", \
               "t", "u", "v", "w", "x", "y", "z", "restfile"]

gradesfilename = "category-grade.txt.gz"
idfilename = "id-mapper.txt.gz"

p("Reading all grades", "info")
allgrades = dict()
grades = dict()
with gzip.open(gradesfilename, "rb") as gradefile:
    for line in gradefile:
        line = line.strip()
        splittet = line.split("\t")
        category = splittet[0]
        grade = float(splittet[1])
        grades[category] = grade

id_to_name = dict()
p("Reading all ids for all categories", "info")
with gzip.open(idfilename, "rb") as idfile:
    for line in idfile:
        line = line.strip()
Example #22
def is_number(value):
    # True if the string can be parsed as an integer
    try:
        int(value)
        return True
    except (ValueError, TypeError):
        return False


start_time = time.time()
categorycnt = pagecnt = lines = 0

hidden_cat = dict()
artskip = catskip = hiddencnt = 0

p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename) as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()
        if line in hidden_cat:
            hidden_cat[line] += 1
        else:
            hidden_cat[line] = 1
p("All hidden categories read", "info")

redirects = dict()
p("Reading all redirects", "info")
with gzip.open(redirectinputfilename, "rb") as redirectfile:
    for line in redirectfile:
        line = line.decode('utf-8', 'ignore')
        line = line.lower()
    "python Remove_hiddencats.py \n"\
    "\t [Sub-categories-new.txt.gz]\n"\
    "\t [All_hidden_categories.txt.gz]\n"\
    "\t [Subcat-links.txt.gz\n\n"\
    "[FUNC:] Split the categorylink-file to the categories concerning pages and those concerning sub categories. Skip all hidden categories to remove number of relevant category links.\n"
    exit(0)

allcategories = dict()  #Dictionary to keep track of all categories
hidden_cat = dict()  #Dictionary to keep track of all hidden categories
links = dict()  #Dictionary to keep track of all links in the graph
artskip = catskip = hiddencnt = 0

starttime = time.time()

# Reads all the hidden categories from the file
p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename, "rb") as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()

        # Normalize the encoding of the category names
        try:
            line = line.decode('unicode-escape')
        except SyntaxError:
            line = line.decode('ascii')
        except Exception:
            pass  # leave the line as-is if decoding fails

        try:
            line = unidecode(line)
        except UnicodeEncodeError, e:
Example #24
    "economics", "education", "environment", "form", "geography", "government",
    "health", "history", "humanities", "humans", "industry", "information",
    "intellectual works", "knowledge", "language", "law", "leisure", "life",
    "mathematics", "matter", "medicine", "mind", "nature", "people",
    "politics", "professional studies", "science", "scientific disciplines",
    "society", "sports", "structure", "systems", "technology", "thought",
    "tools", "transport", "universe", "world"
]
startcategory = startcategory.lower()
parent = ""
graph = dict()  #Dictionary to keep track of the children to each parent cat
subgraph = dict()  #Dictionary to keep track of the parents to each subcategory

starttime = time.time()
begintime = starttime
p("Reading all category info", "info")
parent = child = ""
with open(categoryinfofilename) as categorygraph:
    for line in categorygraph:
        line = line.strip()
        if line.startswith("*"):  #children
            child = line[2:]
            idmapper.insert_name(child)
            if parent == "":
                continue
            if parent in graph:
                #   if child not in graph[parent]:
                graph[parent].append(child)
            else:
                graph[parent] = [child]
        else:
        "\t [Page-categories.txt.gz]\n"\
        "\t [article-info.txt.gz]\n\n"\
    "[FUNCTION]: \n"\
    "Store all articles with their immidiate subcategories\n"
    exit(0)

# Python 2 hack: sys.setdefaultencoding() is removed at interpreter startup
# and only reappears after reload(sys); this forces utf-8 as the default codec
reload(sys)
sys.setdefaultencoding('utf-8')

articles = dict()  #Dictionary to keep track of all categories and their articles
artcnt = teller = articlecnt = 0
starttime = time.time()

# Reads the file containing the links between categories and articles
p("Reading all article content...", "info")
with gzip.open(articleinputfilename) as articleinfo:
    for line in articleinfo:
        line = line.strip()
        lines = line.split("\t")
        if len(lines) < 2:
            continue
        category = lines[0].lower()
        page = lines[1].lower()
        if "" == page or " " == page:
            continue

        if page in articles:
            # page is already in the dictionary
            if category not in articles[page]:
                # Add the category if not present
Example #26
try:
    categoryinputfilename = sys.argv[1]
    hiddencategoryinputfilename = sys.argv[2]
except IndexError:
    print "\n[RUN]: \n"\
    "python Is_hidden_there.py \n"\
    "\t [Subcat-links.txt.gz]\n"\
    "\t [All_hidden_categories.txt.gz]\n"
    exit(0)

hidden_cat = dict()  #Dictionary to check all hidden category names
hiddencnt = 0  #Counter to see how many hidden categories are read
hiddencat_found = False  #Boolean to determine if a hidden category is found within the graph

# Reads all hidden categories from the file containing the names of all hidden categories
p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename) as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()
        if line in hidden_cat:
            hidden_cat[line] += 1
        else:
            hiddencnt += 1
            hidden_cat[line] = 1
p("All hidden categories read", "info")

# Reads the category graph
p("Read all category links", "info")
with gzip.open(categoryinputfilename, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
# -*- coding: utf-8 -*-
import gzip, json, yaml, io
from myprint import myprint as p

"""
Program for creating a dictionary for a dictionary-based classifier
for another language (based on the English dictionary)
"""

nomappingfilename = "no-mapping.txt.gz"

nomapping = dict()
p("Reading %s" %(nomappingfilename), "info")
with gzip.open(nomappingfilename, "rb") as nomappingfile: 
    for line in nomappingfile: 
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2: 
            continue
        nomapping[splittet[0]] = splittet[1]

entrytoidfilename = "en-entry-to-pageid.json"
p("Reading %s" %(entrytoidfilename), "info")
with open(entrytoidfilename, "rb") as inputfile:
    entrypageid = yaml.load(inputfile)
p("finished reading python json", "info")

norwegianstopwords = []
stopwordfile = "norwegian_stop_words.txt"
with open(stopwordfile, "r") as inputfile: 
        print "Old: %s, new: %s, tierones: %s\n" % (
            self.oldentry, self.newentry, self.tierone)

    # Method for comparing this object to another
    def myequal(self, obj):
        for category in obj.tierone:
            if category not in self.tierone:
                return False
        return True


version = 4  # Version of the dictionary to be cleaned

# Reading the dictionary
inputfilename = "igg-dictionary-" + str(version) + ".json"
p("Reading json file", "info")
with open(inputfilename, "rb") as inputfile:
    iggdictionary = yaml.load(inputfile)  # Loading the dictionary from file
p("finished reading python json", "info")

iggiabtaxonomy = "igg-iabtaxonomy" + str(version)
iggiabtaxonomynew = "igg-iabtaxonomy" + str(version + 1)
dictionary = iggdictionary[iggiabtaxonomy]

newdictionary = dict()  # Dictionary for storing the final results
paraentries = dict()  # Dictionary for keeping track of the changes made to the entries
disambiguation = dict()  # Dictionary for all disambiguation titles
disambiguationentries = dict()  # Dictionary for keeping track of all disambiguation entries
# -*- coding: utf-8 -*-
import gzip, json, yaml, io
from myprint import myprint as p
"""
Program for creating a dictionary for a dictionary-based classifier
for another language (based on the English dictionary)
"""

nomappingfilename = "no-mapping.txt.gz"

nomapping = dict()
p("Reading %s" % (nomappingfilename), "info")
with gzip.open(nomappingfilename, "rb") as nomappingfile:
    for line in nomappingfile:
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        nomapping[splittet[0]] = splittet[1]

entrytoidfilename = "en-entry-to-pageid.json"
p("Reading %s" % (entrytoidfilename), "info")
with open(entrytoidfilename, "rb") as inputfile:
    entrypageid = yaml.load(inputfile)
p("finished reading python json", "info")

norwegianstopwords = []
stopwordfile = "norwegian_stop_words.txt"
with open(stopwordfile, "r") as inputfile:
    for line in inputfile:
Example #30
import gzip, re
from myprint import myprint as p
from unidecode import unidecode

inputfilename = "no-mapping.txt.gz"
categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

nopages = dict()
redirects = dict()
pages = dict()

p("Reading %s" % (inputfilename), "info")
with gzip.open(inputfilename, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        nopages[splittet[0]] = splittet[1]

p("Finished reading %s" % (inputfilename), "info")

p("Reading %s" % (redirectinputfilename), "info")
with gzip.open(redirectinputfilename, "rb") as redirectfile:
    for line in redirectfile:
        #line = unicode(line, "utf-8")
Example #31
            subgraph.pop(category, None)
        else:
            grades[category].append(0)
            #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category])))
    for category in subgraph:
        grades[category] = [0, len(subgraph[category])]
        inlinks += len(subgraph[category])

    avg_in = inlinks / C_in
    avg_out = outlinks / C_out
    #outputfile.write("Only parents: %s: %d\n" %(category, len(subgraph[category])))
    #return create_grades()


#outputfile.close()
"""
thresholds = [10, 20, 30, 40, 50]
p("Number of categories with %d parent categories and subcategories: %d" %(10, cnt10), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(20, cnt20), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(30, cnt30), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(40, cnt40), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(50, cnt50), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(60, cnt60), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(70, cnt70), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(80, cnt80), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(90, cnt90), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(100, cnt100), "info")

p("Maxparent: %d (%s)" %(maxparent, mparent), "info")
p("Maxchildren: %d (%s)" %(maxchildren, mchildren), "info")
#subcats = graph["2015"]