def read_ids(idfilename):
    with gzip.open(idfilename) as idfile:
        for line in idfile:
            line = line.strip()
            regex = r"(\d+)\,\s+(.*)"
            identry = re.findall(regex, line)
            if not identry:  # skip lines that do not match the "<id>, <name>" pattern
                continue
            identry = identry[0]
            if len(identry) < 2:
                continue
            myid = identry[0]
            myentry = identry[1]
            id_to_name[myid] = myentry
            name_to_id[myentry] = myid
    p("Read all ids", "info")
    idfile.close()
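# Hedged usage sketch (not part of the original script): read_ids() fills the global
# id_to_name / name_to_id dictionaries from "<id>, <name>" lines in a gzipped file.
# The file name "id-mapper.txt.gz" is taken from the other scripts in this repo;
# the category name "science" below is only an illustrative example.
id_to_name = dict()
name_to_id = dict()
read_ids("id-mapper.txt.gz")
exampleid = name_to_id.get("science")  # illustrative lookup, may be None
p("Example: 'science' maps to id %s" % exampleid, "info")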
try:
    categoryinputfilename = sys.argv[1]
    hiddencategoryinputfilename = sys.argv[2]
except:
    print "\n[RUN]: \n"\
          "python Is_hidden_there.py \n"\
          "\t [Subcat-links.txt.gz]\n"\
          "\t [All_hidden_categories.txt.gz]\n"
    exit(0)

hidden_cat = dict()      #Dictionary to check all hidden category names
hiddencnt = 0            #Counter to see how many hidden categories are read
hiddencat_found = False  #Boolean to determine if a hidden category is found within the graph

# Reads all hidden categories from the file containing the names of all hidden categories
p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename) as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()
        if line in hidden_cat:
            hidden_cat[line] += 1
        else:
            hiddencnt += 1
            hidden_cat[line] = 1
p("All hidden categories read", "info")

# Reads the category graph
p("Read all category links", "info")
with gzip.open(categoryinputfilename, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
def find_grades(categoryinfofilename, categoriesoutputfilename):
    global grades
    global C_in
    global C_out
    global avg_in
    global avg_out
    starttime = time.time()
    begintime = starttime
    p("Reading all category info", "info")
    parent = child = ""
    graph = dict()     # Dictionary to keep track of the children of each parent cat
    subgraph = dict()  # Dictionary to keep track of the parents of each cat

    # Creating category graph
    with open(categoryinfofilename) as categorygraph:
        for line in categorygraph:
            line = line.strip()
            if line.startswith("*"):  #children
                child = name_to_id[line[2:]]
                if parent == "":
                    continue
                if parent in graph:
                    if child not in graph[parent]:
                        graph[parent].append(child)
                else:
                    graph[parent] = [child]
                if child in subgraph:
                    if parent not in subgraph[child]:
                        subgraph[child].append(parent)
                else:
                    subgraph[child] = [parent]
            else:
                line = line.replace("_", " ")
                parent = name_to_id[line]
                #idmapper.insert_name(parent)

    p("Finished reading all info [Time: %s sec]" % (time.time() - starttime), "info")
    maxparent = maxchildren = outlinks = inlinks = 0
    mparent = mchildren = ""
    C_in = len(subgraph)  # number of categories with at least one parent
    C_out = len(graph)    # number of categories with at least one child
    for category in graph:
        if len(graph[category]) > maxchildren:
            maxchildren = len(graph[category])
            mchildren = category
        outlinks += len(graph[category])
        grades[category] = [len(graph[category])]
        if category in subgraph:
            grades[category].append(len(subgraph[category]))
            inlinks += len(subgraph[category])
            if len(subgraph[category]) > maxparent:
                maxparent = len(subgraph[category])
                mparent = category
            """
            if (len(graph[category]) > 10) and (len(subgraph[category]) > 10):
                cnt10 += 1
            if (len(graph[category]) > 20) and (len(subgraph[category]) > 20):
                cnt20 += 1
            if (len(graph[category]) > 30) and (len(subgraph[category]) > 30):
                cnt30 += 1
            if (len(graph[category]) > 40) and (len(subgraph[category]) > 40):
                cnt40 += 1
            if (len(graph[category]) > 50) and (len(subgraph[category]) > 50):
                cnt50 += 1
            if (len(graph[category]) > 60) and (len(subgraph[category]) > 60):
                cnt60 += 1
            if (len(graph[category]) > 70) and (len(subgraph[category]) > 70):
                cnt70 += 1
            if (len(graph[category]) > 80) and (len(subgraph[category]) > 80):
                cnt80 += 1
            if (len(graph[category]) > 90) and (len(subgraph[category]) > 90):
                cnt90 += 1
            if (len(graph[category]) > 100) and (len(subgraph[category]) > 100):
                cnt100 += 1
            print "category: %s, number: %d\n" %(category, len(graph[category]))
            """
            #print "%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category]))
            #outputfile.write("%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category])))
            subgraph.pop(category, None)
        else:
            grades[category].append(0)
            #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category])))
    for category in subgraph:
        grades[category] = [0, len(subgraph[category])]
        inlinks += len(subgraph[category])
    avg_in = inlinks / C_in
    avg_out = outlinks / C_out
#print "%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category])) #outputfile.write("%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category]))) subgraph.pop(category, None) else: grades[category].append(0) #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category]))) for category in subgraph: grades[category] = [0, len(subgraph[category])] inlinks += len(subgraph[category]) avg_in = inlinks / C_in avb_out = outlinks / C_out #outputfile.write("Only parents: %s: %d\n" %(category, len(subgraph[category]))) #return create_grades() #outputfile.close() """ thresholds = [10, 20, 30, 40, 50] p("Number of categories with %d parent categories and subcategories: %d" %(10, cnt10), "info") p("Number of categories with %d parent categories and subcategories: %d" %(20, cnt20), "info") p("Number of categories with %d parent categories and subcategories: %d" %(30, cnt30), "info") p("Number of categories with %d parent categories and subcategories: %d" %(40, cnt40), "info") p("Number of categories with %d parent categories and subcategories: %d" %(50, cnt50), "info") p("Number of categories with %d parent categories and subcategories: %d" %(60, cnt60), "info") p("Number of categories with %d parent categories and subcategories: %d" %(70, cnt70), "info") p("Number of categories with %d parent categories and subcategories: %d" %(80, cnt80), "info") p("Number of categories with %d parent categories and subcategories: %d" %(90, cnt90), "info") p("Number of categories with %d parent categories and subcategories: %d" %(100, cnt100), "info") p("Maxparent: %d (%s)" %(maxparent, mparent), "info") p("Maxchildren: %d (%s)" %(maxchildren, mchildren), "info") #subcats = graph["2015"]
import gzip, json, yaml, io
from myprint import myprint as p

"""
Program for creating a file containing all dictionary entries and what page ids they are based on.
Sorts out all relevant dictionary entries based on the latest version of the igg-iabtaxonomy.
Why? Because the file containing all possible entries and their page ids is extremely large.

Input: A file containing all processed Wikipedia article pages and their corresponding ids
Output: A file containing the relevant dictionary entries and what page ids they are based on
"""

# Reading the English page id to page title mapping
enmappingfilename = "pageid-pagetitle-en.txt.gz"
enpagetitle_to_id = dict()
p("Reading %s" % (enmappingfilename), "info")
with gzip.open(enmappingfilename, "rb") as enmappingfile:
    for line in enmappingfile:
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        pageid = splittet[0]
        pagetitle = splittet[1]
        if pagetitle in enpagetitle_to_id:
            enpagetitle_to_id[pagetitle].append(pageid)
        else:
            enpagetitle_to_id[pagetitle] = [pageid]
enmappingfile.close()

latestversion = 5  # Latest version, update for later versions
"\t [category-info.txt]\n"\ "\t [article-info.txt.gz]\n"\ "\t [article-output.txt.gz] \n\n"\ "[FUNC:]\n"\ "Create the complete paths of the articles. \n" exit(0) startcategory = "Main topic classifications" #"Fundamental Categories" startcategory = startcategory.lower() parent = "" graph = dict() #Dictionary to keep track of the children to each parent cat subgraph = dict() #Dictionary to keep track of the parents to each subcategory starttime = time.time() begintime = starttime p("Reading all category info", "info") parent = child = "" with open(categoryinfofilename) as categorygraph: for line in categorygraph: line = line.strip() if line.startswith("*"): #children child_name = line[2:] idmapper.insert_name(child_name) child = idmapper.name_to_id(child_name) if parent == "": continue if parent in graph: # if child not in graph[parent]: graph[parent].append(child) else: graph[parent] = [child]
"python Hidden_categories.py \n"\ "\t [enwiki-latest-page_props.sql.gz]\n"\ "\t [enwiki-latest-page.sql.gz\n"\ "\t [All_hidden_categories.txt.gz]\n\n"\ "[FUNC:] Find all hidden categories in the categorylinks file and then in the page_props file and combine all of these to one big file. \n" exit(0) hiddencat_id = dict( ) #Dictionary to keep track of the ids of all hidden categories hidden_cat = dict( ) #Dictionary to keep track of the name of all hidden categories hiddencnt = 0 start_time = time.time() # Looping through page props to find the ids of all hidden categories p("Finding all hidden category ids from page props...", "info") with gzip.open(pagepropsinputfilename) as inputfile: for line in inputfile: if (line.startswith("INSERT")): line = line.split("VALUES (")[1] line.decode('utf-8', 'ignore') insertions = line.split("),(") for insertion in insertions: insertion = insertion.lower() words = insertion.split(",") if "hiddencat" in words[1].lower(): # Hidden category is found, id i added hiddencat_id[words[0]] = 1 hiddencnt += 1 # Counter to keep track of number of hidden categories mytime = time.time() - start_time p(
    titlefilename = sys.argv[1]
    outputtitlesfilename = sys.argv[2]
except:
    print "\n[RUN]: \n"\
          "python Redirecting.py \n"\
          "\t [enwiki-latest-redirects.sql.gz]\n"\
          "\t [output-redirect-titles.txt.gz]\n\n"\
          "[FUNC:]\n"\
          "Find all titles that redirect \n"
    exit(0)

redirects = dict()  #Dictionary for keeping all redirect pages
starttime = time.time()

# Reads the redirect file
p("Reading all redirect titles", "info")
with gzip.open(titlefilename) as titlefile:
    for line in titlefile:
        if line.startswith("INSERT"):
            line.decode('utf-8', 'ignore')
            line_split = line[30:]  #.split("VALUES (")[1]
            insertions = line_split.split("),(")
            for insertion in insertions:
                # Code for encoding
                try:
                    insertion = insertion.decode('unicode-escape')
                except SyntaxError:
                    insertion = insertion.decode('ascii')
                except Exception, e:
                    a = 0
""" Program for finding the pageid for an entry. Needs to do the same process as the mapper so that the entries are identical. """ categoryinputfilename = "enwiki-latest-categorylinks.sql.gz" redirectinputfilename = "output-redirect-titles.txt.gz" pagefilename = "enwiki-latest-page.sql.gz" redirects = dict() pageid_to_title = dict() pagetitle_to_id = dict() """ p("Reading %s" %(inputfilename), "info") with gzip.open(inputfilename, "rb") as inputfile: for line in inputfile: line = line.strip() line = line.lower() splittet = line.split("\t") if len(splittet) < 2: #print "< 2: " + line continue nopages[splittet[0]] = splittet[1] inputfile.close() p("Finished reading %s" %(inputfilename), "info") """ p("Reading %s" %(redirectinputfilename), "info")
"python Remove_hiddencats.py \n"\ "\t [Sub-categories-new.txt.gz]\n"\ "\t [All_hidden_categories.txt.gz]\n"\ "\t [Subcat-links.txt.gz\n\n"\ "[FUNC:] Split the categorylink-file to the categories concerning pages and those concerning sub categories. Skip all hidden categories to remove number of relevant category links.\n" exit(0) allcategories = dict() #Dictionary to keep track of all categories hidden_cat = dict() #Dictionary to keep track of all hidden categories links = dict() #Dictionary to keep track of all links in the graph artskip = catskip = hiddencnt = 0 starttime = time.time() # Reads all the hidden categories from the file p("Reading all hidden categories", "info") with gzip.open(hiddencategoryinputfilename, "rb") as hiddencategories: for line in hiddencategories: line = line.strip().lower() # Code for representing the category names in same encoding try: line = line.decode('unicode-escape') except SyntaxError: line = line.decode('ascii') except Exception,e: a = 0 try: line = unidecode(line) except UnicodeEncodeError, e:
import gzip, re
from myprint import myprint as p
from unidecode import unidecode

"""
Program for finding the pageid for an entry.
Needs to do the same process as the mapper so that the entries are identical.
"""

categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

redirects = dict()
pageid_to_title = dict()
pagetitle_to_id = dict()

"""
p("Reading %s" %(inputfilename), "info")
with gzip.open(inputfilename, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2:
            #print "< 2: " + line
            continue
        nopages[splittet[0]] = splittet[1]
inputfile.close()
p("Finished reading %s" %(inputfilename), "info")
"""

p("Reading %s" % (redirectinputfilename), "info")
print "\n[RUN]: \n"\ "python Hidden_categories.py \n"\ "\t [enwiki-latest-page_props.sql.gz]\n"\ "\t [enwiki-latest-page.sql.gz\n"\ "\t [All_hidden_categories.txt.gz]\n\n"\ "[FUNC:] Find all hidden categories in the categorylinks file and then in the page_props file and combine all of these to one big file. \n" exit(0) hiddencat_id = dict() #Dictionary to keep track of the ids of all hidden categories hidden_cat = dict() #Dictionary to keep track of the name of all hidden categories hiddencnt = 0 start_time = time.time() # Looping through page props to find the ids of all hidden categories p("Finding all hidden category ids from page props...", "info") with gzip.open(pagepropsinputfilename) as inputfile: for line in inputfile: if (line.startswith("INSERT")): line = line.split("VALUES (")[1] line.decode('utf-8', 'ignore') insertions = line.split("),(") for insertion in insertions: insertion = insertion.lower() words = insertion.split(",") if "hiddencat" in words[1].lower(): # Hidden category is found, id i added hiddencat_id[words[0]] = 1 hiddencnt += 1 # Counter to keep track of number of hidden categories mytime = time.time() - start_time p("Found %d hidden category ids (%s min, %s min) ---" %(hiddencnt, mytime/60, mytime%60), "info")
class Path(object):
    def __init__(self, score, path):
        self.score = score
        self.path = path
        return

    def __cmp__(self, other):
        return cmp(other.score, self.score)


letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", \
           "t", "u", "v", "w", "x", "y", "z", "restfile"]

gradesfilename = "category-grade.txt.gz"
idfilename = "id-mapper.txt.gz"

p("Reading all grades", "info")
allgrades = dict()
grades = dict()
with gzip.open(gradesfilename, "rb") as gradefile:
    for line in gradefile:
        line = line.strip()
        splittet = line.split("\t")
        category = splittet[0]
        grade = float(splittet[1])
        grades[category] = grade

id_to_name = dict()
p("Reading all ids for all categories", "info")
with gzip.open(idfilename, "rb") as idfile:
    for line in idfile:
        line = line.strip()
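# Hedged usage sketch (not from the original script): because __cmp__ reverses the
# comparison, a heap or a sort over Path objects yields the highest-scoring path first.
# The scores and category paths below are made-up illustration values.
import heapq

queue = []
heapq.heappush(queue, Path(0.3, ["science", "physics"]))
heapq.heappush(queue, Path(0.9, ["sports", "football"]))
heapq.heappush(queue, Path(0.5, ["arts", "music"]))
best = heapq.heappop(queue)  # the Path with score 0.9 comes out first
print "Best path: %s (score %.1f)" % (best.path, best.score)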
    # Method for printing the entry, only for debugging
    def entryprint(self):
        print "Old: %s, new: %s, tierones: %s\n" % (self.oldentry, self.newentry, self.tierone)

    # Method for comparing this object to another
    def myequal(self, obj):
        for category in obj.tierone:
            if category not in self.tierone:
                return False
        return True


version = 4  # Version of the dictionary to be cleaned

# Reading the dictionary
inputfilename = "igg-dictionary-" + str(version) + ".json"
p("Reading json file", "info")
with open(inputfilename, "rb") as inputfile:
    iggdictionary = yaml.load(inputfile)  # Loading the dictionary from file
p("finished reading python json", "info")

iggiabtaxonomy = "igg-iabtaxonomy" + str(version)
iggiabtaxonomynew = "igg-iabtaxonomy" + str(version + 1)
dictionary = iggdictionary[iggiabtaxonomy]

newdictionary = dict()          # Dictionary for storing the final results
paraentries = dict()            # Dictionary for keeping track of the changes made to the entries
disambiguation = dict()         # Dictionary for all disambiguation titles
disambiguationentries = dict()  # Dictionary for keeping track of all disambiguation entries

yearregex = r"(\d\d\d\d)"        # Regex for recognizing years in the title
parenthesisregex = r"(\(.*\))"   # Regex for recognizing parentheses in the title
import gzip, re
from myprint import myprint as p
from unidecode import unidecode

inputfilename = "no-mapping.txt.gz"
categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

nopages = dict()
redirects = dict()
pages = dict()

p("Reading %s" % (inputfilename), "info")
with gzip.open(inputfilename, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        nopages[splittet[0]] = splittet[1]
inputfile.close()
p("Finished reading %s" % (inputfilename), "info")

p("Reading %s" % (redirectinputfilename), "info")
with gzip.open(redirectinputfilename, "rb") as redirectfile:
    for line in redirectfile:
        #line = unicode(line, "utf-8")
from myprint import myprint as p

"""
Program for mapping all keywords to IAB categories.
"""

inputfilename = "articlemapping-all.txt.gz"
outputcategoriesfilename = "Outputcategories"
letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
           "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "restfile"]

outputcategories = dict()
tiertwo = ""
tierone = ""
p("Reading output categories", "info")
with open(outputcategoriesfilename, "r") as outputcatfile:
    for line in outputcatfile:
        line = line.strip()
        if line.startswith("*"):  # tier-one category
            tierone = line[1:]
            tierone = tierone.lower()
        else:                     # tier-two category, mapped to the current tier-one
            tiertwo = line
            tiertwo = tiertwo.lower()
            outputcategories[tiertwo] = tierone

idfilename = "id-mapper.txt.gz"
id_to_name = dict()
name_to_id = dict()
p("Reading id-mapper", "info")
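# Hedged illustration (the real "Outputcategories" file is not shown here): the loop
# above reads tier-one lines prefixed with "*" followed by their tier-two lines and
# builds a tier-two -> tier-one mapping. With made-up input such as
#
#   *Sports
#   Football
#   Tennis
#   *Technology
#   Smartphones
#
# outputcategories would end up as
#   {"football": "sports", "tennis": "sports", "smartphones": "technology"}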
def is_number(input):
    try:
        int(input)
        return True
    except:
        return False


start_time = time.time()
categorycnt = pagecnt = lines = 0
hidden_cat = dict()
artskip = catskip = hiddencnt = 0

p("Reading all hidden categories", "info")
with gzip.open(hiddencategoryinputfilename) as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()
        if line in hidden_cat:
            hidden_cat[line] += 1
        else:
            hidden_cat[line] = 1
p("All hidden categories read", "info")

redirects = dict()
p("Reading all redirects", "info")
with gzip.open(redirectinputfilename, "rb") as redirectfile:
    for line in redirectfile:
        line.decode('utf-8', 'ignore')
        line = line.lower()
"python Remove_hiddencats.py \n"\ "\t [Sub-categories-new.txt.gz]\n"\ "\t [All_hidden_categories.txt.gz]\n"\ "\t [Subcat-links.txt.gz\n\n"\ "[FUNC:] Split the categorylink-file to the categories concerning pages and those concerning sub categories. Skip all hidden categories to remove number of relevant category links.\n" exit(0) allcategories = dict() #Dictionary to keep track of all categories hidden_cat = dict() #Dictionary to keep track of all hidden categories links = dict() #Dictionary to keep track of all links in the graph artskip = catskip = hiddencnt = 0 starttime = time.time() # Reads all the hidden categories from the file p("Reading all hidden categories", "info") with gzip.open(hiddencategoryinputfilename, "rb") as hiddencategories: for line in hiddencategories: line = line.strip().lower() # Code for representing the category names in same encoding try: line = line.decode('unicode-escape') except SyntaxError: line = line.decode('ascii') except Exception, e: a = 0 try: line = unidecode(line) except UnicodeEncodeError, e:
"economics", "education", "environment", "form", "geography", "government", "health", "history", "humanities", "humans", "industry", "information", "intellectual works", "knowledge", "language", "law", "leisure", "life", "mathematics", "matter", "medicine", "mind", "nature", "people", "politics", "professional studies", "science", "scientific disciplines", "society", "sports", "structure", "systems", "technology", "thought", "tools", "transport", "universe", "world" ] startcategory = startcategory.lower() parent = "" graph = dict() #Dictionary to keep track of the children to each parent cat subgraph = dict() #Dictionary to keep track of the parents to each subcategory starttime = time.time() begintime = starttime p("Reading all category info", "info") parent = child = "" with open(categoryinfofilename) as categorygraph: for line in categorygraph: line = line.strip() if line.startswith("*"): #children child = line[2:] idmapper.insert_name(child) if parent == "": continue if parent in graph: # if child not in graph[parent]: graph[parent].append(child) else: graph[parent] = [child] else:
"\t [Page-categories.txt.gz]\n"\ "\t [article-info.txt.gz]\n\n"\ "[FUNCTION]: \n"\ "Store all articles with their immidiate subcategories\n" exit(0) reload(sys) sys.setdefaultencoding('utf-8') articles = dict( ) #Dictionary to keep track of all categories and their articles artcnt = teller = articlecnt = 0 starttime = time.time() # Reads the file file containg links between categories and articles p("Reading all article content...", "info") with gzip.open(articleinputfilename) as articleinfo: for line in articleinfo: line = line.strip() lines = line.split("\t") if len(lines) < 2: continue category = lines[0].lower() page = lines[1].lower() if "" == page or " " == page: continue if page in articles: # page is already in the dictionary if category not in articles[page]: # Add the category if not present
# -*- coding: utf-8 -*-
import gzip, json, yaml, io
from myprint import myprint as p

"""
Program for creating a dictionary for a dictionary-based classifier for another language
(based on the English dictionary)
"""

nomappingfilename = "no-mapping.txt.gz"
nomapping = dict()
p("Reading %s" % (nomappingfilename), "info")
with gzip.open(nomappingfilename, "rb") as nomappingfile:
    for line in nomappingfile:
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2:
            continue
        nomapping[splittet[0]] = splittet[1]
nomappingfile.close()

entrytoidfilename = "en-entry-to-pageid.json"
p("Reading %s" % (entrytoidfilename), "info")
with open(entrytoidfilename, "rb") as inputfile:
    entrypageid = yaml.load(inputfile)
p("finished reading python json", "info")

norwegianstopwords = []
stopwordfile = "norwegian_stop_words.txt"
with open(stopwordfile, "r") as inputfile:
print "Old: %s, new: %s, tierones: %s\n" % ( self.oldentry, self.newentry, self.tierone) # Method for comparing this object to another def myequal(self, obj): for category in obj.tierone: if category not in self.tierone: return False return True version = 4 # Version of the dictionary to be cleaned # Reading the dictionary inputfilename = "igg-dictionary-" + str(version) + ".json" p("Reading json file", "info") with open(inputfilename, "rb") as inputfile: iggdictionary = yaml.load(inputfile) # Loading the dictionary from file p("finished reading python json", "info") iggiabtaxonomy = "igg-iabtaxonomy" + str(version) iggiabtaxonomynew = "igg-iabtaxonomy" + str(version + 1) dictionary = iggdictionary[iggiabtaxonomy] newdictionary = dict() # Dictionary for storing the final results paraentries = dict( ) # Dictionary for keeping track on the changes made to the entries disambiguation = dict() # Dictionary for all disambiguation titles disambiguationentries = dict( ) # Dictionary for keeping track on all disambiguation entries