Пример #1
0
 def translateInstanceType(self):
     """Keep only the instance-type triples whose resource has a mapping.

     Reads ``instance_types_<inputLangaugeTag>.ttl`` below
     ``self.dataDirectory`` line by line and writes the result to
     ``instance_types_<outputLangaugeTag>.ttl``.  A line is emitted only
     when it matches the triple pattern and its resource name is a key of
     the "en_zh" mapping; the resource name itself is written untranslated.

     The mapping is loaded with ``fio.loadPickle``; ``self.saveMapping`` is
     called first when ``en_zh.pickle`` is not present in the working
     directory.
     """
     picklename = "en_zh"
     if not os.path.isfile(picklename + '.pickle'):
         self.saveMapping(picklename)

     # NOTE(review): presumably English title -> Chinese title; confirm
     # against saveMapping.
     mapping = fio.loadPickle(picklename)

     in_tag = self.inputLangaugeTag
     out_tag = self.outputLangaugeTag
     source_path = "/".join(
         [self.dataDirectory, in_tag, "instance_types_" + in_tag + ".ttl"])
     target_path = "/".join(
         [self.dataDirectory, out_tag, "instance_types_" + out_tag + ".ttl"])

     # Compiled once instead of being looked up again for every line.
     triple = re.compile(
         r"(<http://dbpedia\.org/resource/)(.*)(>\s*<.*>\s*<.*>\s*.)")

     # Write to the file handle directly instead of swapping sys.stdout:
     # the handle is closed (and flushed) on every exit path and stdout can
     # no longer be left redirected when reading the input fails.
     with codecs.open(target_path, 'wb', 'utf8') as out:
         with open(source_path, 'r') as f:
             for line in f:
                 g = triple.search(line)
                 if g is None:
                     continue
                 try:
                     if g.group(2) not in mapping:
                         continue
                     out.write('<http://dbpedia.org/resource/'
                               + g.group(2) + g.group(3) + '\n')
                 except Exception:
                     # Best effort, as before: a line that cannot be
                     # written (e.g. an encoding failure) is skipped
                     # rather than aborting the whole conversion.
                     continue
Пример #2
0
 def translateLabels(self):
     """Rewrite the zh label triples onto ``dbpedia.org`` resource URIs.

     Reads ``labels_<outputLangaugeTag>.ttl.old`` below
     ``self.dataDirectory`` and writes ``labels_<outputLangaugeTag>.ttl``.
     For every line matching the label pattern, the subject becomes
     ``<http://dbpedia.org/resource/NAME`` where NAME is the "zh_en"
     mapping of the original resource name when one exists, and the
     original name otherwise.  The label literal is always the original
     resource name followed by ``"@zh .`` (the label text found in the
     input line is not reused).

     The mapping is loaded with ``fio.loadPickle``; ``self.saveMapping`` is
     called first when ``zh_en.pickle`` is not present in the working
     directory.
     """
     picklename = "zh_en"
     if not os.path.isfile(picklename + '.pickle'):
         self.saveMapping(picklename)

     # NOTE(review): presumably Chinese title -> English title; confirm
     # against saveMapping.
     mapping = fio.loadPickle(picklename)

     tag = self.outputLangaugeTag
     source_path = "/".join(
         [self.dataDirectory, tag, "labels_" + tag + ".ttl.old"])
     target_path = "/".join(
         [self.dataDirectory, tag, "labels_" + tag + ".ttl"])

     # The dot after "zh" is now escaped so it only matches a literal dot.
     label = re.compile(
         r'(<http://zh\.dbpedia\.org/resource/)(.*)(>\s*<.*>\s*")(.*"@zh\s*.)')

     # Write to the file handle directly instead of swapping sys.stdout:
     # the handle is closed (and flushed) on every exit path and stdout can
     # no longer be left redirected when reading the input fails.
     with codecs.open(target_path, 'wb', 'utf8') as out:
         with open(source_path, 'r') as f:
             for line in f:
                 g = label.search(line)
                 if g is None:
                     continue
                 try:
                     name = g.group(2)
                     subject = mapping[name] if name in mapping else name
                     out.write('<http://dbpedia.org/resource/' + subject
                               + g.group(3) + name + "\"@zh ." + '\n')
                 except Exception:
                     # Best effort, as before: a line that cannot be
                     # written (e.g. an encoding failure) is skipped
                     # rather than aborting the whole conversion.
                     continue
Пример #3
0
   def translateRedirectsTransitive(self):
       """Rewrite the zh transitive-redirect triples onto ``dbpedia.org``.

       Reads ``redirects_transitive_<outputLangaugeTag>.ttl.old`` below
       ``self.dataDirectory`` and writes
       ``redirects_transitive_<outputLangaugeTag>.ttl``.  For every line
       matching the redirect pattern, both URIs are moved from
       ``zh.dbpedia.org`` to ``dbpedia.org``; the subject name is kept as
       is, while the redirect target is replaced by its "zh_en" mapping
       when one exists.

       The mapping is loaded with ``fio.loadPickle``; ``self.saveMapping``
       is called first when ``zh_en.pickle`` is not present in the working
       directory.
       """
       tag = self.outputLangaugeTag
       source_path = "/".join(
           [self.dataDirectory, tag, "redirects_transitive_" + tag + ".ttl.old"])
       target_path = "/".join(
           [self.dataDirectory, tag, "redirects_transitive_" + tag + ".ttl"])

       picklename = "zh_en"
       if not os.path.isfile(picklename + '.pickle'):
           self.saveMapping(picklename)

       # NOTE(review): presumably Chinese title -> English title; confirm
       # against saveMapping.
       mapping = fio.loadPickle(picklename)

       # Every host-name dot is now escaped so it only matches a literal dot.
       redirect = re.compile(
           r"(<http://zh\.dbpedia\.org/resource/)(.*)(>\s*<)"
           r"(http://zh\.dbpedia\.org/resource/)(.*)(>\s*.)")

       # Write to the file handle directly instead of swapping sys.stdout:
       # the handle is closed (and flushed) on every exit path and stdout
       # can no longer be left redirected when reading the input fails.
       with codecs.open(target_path, 'wb', 'utf8') as out:
           with open(source_path, 'r') as f:
               for line in f:
                   g = redirect.search(line)
                   if g is None:
                       continue
                   try:
                       target = g.group(5)
                       if target in mapping:
                           target = mapping[target]
                       out.write('<http://dbpedia.org/resource/' + g.group(2)
                                 + g.group(3) + "http://dbpedia.org/resource/"
                                 + target + g.group(6) + '\n')
                   except Exception:
                       # Best effort, as before: a line that cannot be
                       # written (e.g. an encoding failure) is skipped
                       # rather than aborting the whole conversion.
                       continue
Пример #4
0
    def translateInstanceType(self):
        """Keep only the instance-type triples whose resource has a mapping.

        Reads ``instance_types_<inputLangaugeTag>.ttl`` below
        ``self.dataDirectory`` line by line and writes the result to
        ``instance_types_<outputLangaugeTag>.ttl``.  A line is emitted only
        when it matches the triple pattern and its resource name is a key of
        the "en_zh" mapping; the resource name itself is written
        untranslated.

        The mapping is loaded with ``fio.loadPickle``; ``self.saveMapping``
        is called first when ``en_zh.pickle`` is not present in the working
        directory.
        """
        picklename = "en_zh"
        if not os.path.isfile(picklename + '.pickle'):
            self.saveMapping(picklename)

        # NOTE(review): presumably English title -> Chinese title; confirm
        # against saveMapping.
        mapping = fio.loadPickle(picklename)

        in_tag = self.inputLangaugeTag
        out_tag = self.outputLangaugeTag
        source_path = "/".join(
            [self.dataDirectory, in_tag, "instance_types_" + in_tag + ".ttl"])
        target_path = "/".join(
            [self.dataDirectory, out_tag, "instance_types_" + out_tag + ".ttl"])

        # Compiled once instead of being looked up again for every line.
        triple = re.compile(
            r"(<http://dbpedia\.org/resource/)(.*)(>\s*<.*>\s*<.*>\s*.)")

        # Write to the file handle directly instead of swapping sys.stdout:
        # the handle is closed (and flushed) on every exit path and stdout
        # can no longer be left redirected when reading the input fails.
        with codecs.open(target_path, 'wb', 'utf8') as out:
            with open(source_path, 'r') as f:
                for line in f:
                    g = triple.search(line)
                    if g is None:
                        continue
                    try:
                        if g.group(2) not in mapping:
                            continue
                        out.write('<http://dbpedia.org/resource/'
                                  + g.group(2) + g.group(3) + '\n')
                    except Exception:
                        # Best effort, as before: a line that cannot be
                        # written (e.g. an encoding failure) is skipped
                        # rather than aborting the whole conversion.
                        continue
Пример #5
0
    def translateLabels(self):
        """Rewrite the zh label triples onto ``dbpedia.org`` resource URIs.

        Reads ``labels_<outputLangaugeTag>.ttl.old`` below
        ``self.dataDirectory`` and writes ``labels_<outputLangaugeTag>.ttl``.
        For every line matching the label pattern, the subject becomes
        ``<http://dbpedia.org/resource/NAME`` where NAME is the "zh_en"
        mapping of the original resource name when one exists, and the
        original name otherwise.  The label literal is always the original
        resource name followed by ``"@zh .`` (the label text found in the
        input line is not reused).

        The mapping is loaded with ``fio.loadPickle``; ``self.saveMapping``
        is called first when ``zh_en.pickle`` is not present in the working
        directory.
        """
        picklename = "zh_en"
        if not os.path.isfile(picklename + '.pickle'):
            self.saveMapping(picklename)

        # NOTE(review): presumably Chinese title -> English title; confirm
        # against saveMapping.
        mapping = fio.loadPickle(picklename)

        tag = self.outputLangaugeTag
        source_path = "/".join(
            [self.dataDirectory, tag, "labels_" + tag + ".ttl.old"])
        target_path = "/".join(
            [self.dataDirectory, tag, "labels_" + tag + ".ttl"])

        # The dot after "zh" is now escaped so it only matches a literal dot.
        label = re.compile(
            r'(<http://zh\.dbpedia\.org/resource/)(.*)(>\s*<.*>\s*")(.*"@zh\s*.)')

        # Write to the file handle directly instead of swapping sys.stdout:
        # the handle is closed (and flushed) on every exit path and stdout
        # can no longer be left redirected when reading the input fails.
        with codecs.open(target_path, 'wb', 'utf8') as out:
            with open(source_path, 'r') as f:
                for line in f:
                    g = label.search(line)
                    if g is None:
                        continue
                    try:
                        name = g.group(2)
                        subject = mapping[name] if name in mapping else name
                        out.write('<http://dbpedia.org/resource/' + subject
                                  + g.group(3) + name + "\"@zh ." + '\n')
                    except Exception:
                        # Best effort, as before: a line that cannot be
                        # written (e.g. an encoding failure) is skipped
                        # rather than aborting the whole conversion.
                        continue
Пример #6
0
def getNELCorrectLabel(input, output):
    reload(sys)
    sys.setdefaultencoding('utf8')

    picklename = "en_zh"
    dict = fio.loadPickle(picklename)

    parser = ET.XMLParser(encoding="utf-8")
    tree = ET.parse(input, parser=parser)
    root = tree.getroot()

    for child in root:
        print child.tag, child.attrib
        #         for str in child.findall('string'):
        #             print str.tag, str.attrib, str.text
        #         for keywords in child.findall('keywords'):
        #             print keywords.tag, keywords.attrib, keywords.text
        #         for query in child.findall('query'):
        #             print query.text
        for query in child.findall('query'):
            #             print query.tex
            links = GetLink(query.text)
            if len(links) > 0:
                for k, v in links.items():
                    node = ET.Element(tag='link',
                                      attrib={
                                          'lang': 'en',
                                          'entity': k
                                      })
                    node.text = v
                    node.tail = '\n'
                    child.insert(0, node)

            if len(links) > 0:
                for k, v in links.items():
                    if k in dict:
                        k = dict[k]
                        v = 'http://dbpedia.org/resource/' + k
                    else:
                        v = ' '
                    node = ET.Element(tag='link',
                                      attrib={
                                          'lang': 'zh',
                                          'entity': k
                                      })
                    node.text = v
                    node.tail = '\n'
                    child.insert(0, node)

    tree.write(output, encoding='utf8')
Пример #7
0
def getNELCorrectLabel(input, output):
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    picklename = "en_zh"
    dict = fio.loadPickle(picklename)
    
    parser = ET.XMLParser(encoding="utf-8")
    tree = ET.parse(input, parser = parser)
    root = tree.getroot()
    
    for child in root:
        print child.tag, child.attrib
#         for str in child.findall('string'):
#             print str.tag, str.attrib, str.text
#         for keywords in child.findall('keywords'):
#             print keywords.tag, keywords.attrib, keywords.text
#         for query in child.findall('query'):
#             print query.text
        for query in child.findall('query'):
#             print query.tex
            links = GetLink(query.text)
            if len(links) > 0:
                for k, v in links.items():
                    node = ET.Element(tag='link', attrib={'lang':'en', 'entity':k})
                    node.text = v
                    node.tail = '\n'
                    child.insert(0, node)
            
            if len(links) > 0:
                for k, v in links.items():
                    if k in dict:
                        k = dict[k]
                        v = 'http://dbpedia.org/resource/' + k
                    else:
                        v = ' '
                    node = ET.Element(tag='link', attrib={'lang':'zh', 'entity':k})
                    node.text = v
                    node.tail = '\n'
                    child.insert(0, node)
            
    tree.write(output, encoding = 'utf8')
Пример #8
0
    def translateRedirectsTransitive(self):
        """Rewrite the zh transitive-redirect triples onto ``dbpedia.org``.

        Reads ``redirects_transitive_<outputLangaugeTag>.ttl.old`` below
        ``self.dataDirectory`` and writes
        ``redirects_transitive_<outputLangaugeTag>.ttl``.  For every line
        matching the redirect pattern, both URIs are moved from
        ``zh.dbpedia.org`` to ``dbpedia.org``; the subject name is kept as
        is, while the redirect target is replaced by its "zh_en" mapping
        when one exists.

        The mapping is loaded with ``fio.loadPickle``; ``self.saveMapping``
        is called first when ``zh_en.pickle`` is not present in the working
        directory.
        """
        tag = self.outputLangaugeTag
        source_path = "/".join(
            [self.dataDirectory, tag, "redirects_transitive_" + tag + ".ttl.old"])
        target_path = "/".join(
            [self.dataDirectory, tag, "redirects_transitive_" + tag + ".ttl"])

        picklename = "zh_en"
        if not os.path.isfile(picklename + '.pickle'):
            self.saveMapping(picklename)

        # NOTE(review): presumably Chinese title -> English title; confirm
        # against saveMapping.
        mapping = fio.loadPickle(picklename)

        # Every host-name dot is now escaped so it only matches a literal dot.
        redirect = re.compile(
            r"(<http://zh\.dbpedia\.org/resource/)(.*)(>\s*<)"
            r"(http://zh\.dbpedia\.org/resource/)(.*)(>\s*.)")

        # Write to the file handle directly instead of swapping sys.stdout:
        # the handle is closed (and flushed) on every exit path and stdout
        # can no longer be left redirected when reading the input fails.
        with codecs.open(target_path, 'wb', 'utf8') as out:
            with open(source_path, 'r') as f:
                for line in f:
                    g = redirect.search(line)
                    if g is None:
                        continue
                    try:
                        target = g.group(5)
                        if target in mapping:
                            target = mapping[target]
                        out.write('<http://dbpedia.org/resource/' + g.group(2)
                                  + g.group(3) + "http://dbpedia.org/resource/"
                                  + target + g.group(6) + '\n')
                    except Exception:
                        # Best effort, as before: a line that cannot be
                        # written (e.g. an encoding failure) is skipped
                        # rather than aborting the whole conversion.
                        continue