def process_ttriples():
    """Deduplicate, decode and normalize the triples in t_triples.nt.

    Reads every non-comment line from each input file (duplicates are
    dropped via a set), then writes sorted, tab-separated
    subject/predicate/object rows to t_triples_preprocessed.tab.
    Every term whose rewritten form differs from the original URI is
    recorded in the corresponding s_/p_/o_dictionary set so the original
    can be recovered later.
    """
    def decode(term):
        # Percent-decode, then try to collapse unicode escapes to UTF-8
        # bytes; terms that fail to decode stay percent-decoded only.
        term = urllib.unquote(term)
        try:
            term = unicode(term, 'unicode-escape').encode('utf-8')
        except UnicodeDecodeError:
            pass
        return term

    lines = set()
    for f_name in ['t_triples.nt']:
        with open(i_dir + f_name, 'r') as i_file:
            for line in i_file:
                if line[0] != '#':  # skip comment lines
                    lines.add(line)

    with open(i_dir + 't_triples_preprocessed.tab', 'w+') as o_file:
        for line in sorted(lines):
            # rstrip('\n') (not line[0:-1]) so a final line without a
            # trailing newline does not lose its last character.
            spo = ts.get_spo(line.rstrip('\n'))
            if len(spo) < 3:
                continue
            s = normalize(detach_sprefix(decode(spo[0])))
            p = normalize(detach_pprefix(decode(spo[1])))
            o = normalize(detach_oprefix(decode(spo[2])))
            o_file.write(s + '\t' + p + '\t' + o + '\n')
            if s != spo[0]:
                s_dictionary.add((s, spo[0]))
            if p != spo[1]:
                p_dictionary.add((p, spo[1]))
            if o != spo[2]:
                o_dictionary.add((o, spo[2]))
def process_cskos():
    """Extract skos:broader pairs from c_skos.nt into a tab file.

    For each triple whose predicate is skos:broader, writes a
    narrower-category \t broader-category row (both terms decoded,
    prefix-stripped and normalized) to c_skos_preprocessed.tab.
    Rewritten category names are recorded in c_dictionary together with
    their original URIs.
    """
    broader_uri = 'http://www.w3.org/2004/02/skos/core#broader'

    def decode(term):
        # Percent-decode, then try to collapse unicode escapes to UTF-8
        # bytes; terms that fail to decode stay percent-decoded only.
        term = urllib.unquote(term)
        try:
            term = unicode(term, 'unicode-escape').encode('utf-8')
        except UnicodeDecodeError:
            pass
        return term

    with open(i_dir + 'c_skos.nt', 'r') as i_file, \
         open(i_dir + 'c_skos_preprocessed.tab', 'w+') as o_file:
        for line in i_file:
            if line[0] == '#':  # skip comment lines
                continue
            # rstrip('\n') (not line[0:-1]) so a final line without a
            # trailing newline does not lose its last character.
            spo = ts.get_spo(line.rstrip('\n'))
            if len(spo) < 3 or spo[1] != broader_uri:
                continue
            l_c = normalize(detach_cprefix(decode(spo[0])))
            r_c = normalize(detach_cprefix(decode(spo[2])))
            o_file.write(l_c + '\t' + r_c + '\n')
            if l_c != spo[0]:
                c_dictionary.add((l_c, spo[0]))
            if r_c != spo[2]:
                c_dictionary.add((r_c, spo[2]))
def process_ctriples():
    """Rewrite c_triples.nt as subject/categorizedIn/category rows.

    For each triple, writes subject \t categorizedIn \t category (subject
    and category decoded, prefix-stripped and normalized) to
    c_triples_preprocessed.tab.  Rewritten subjects are recorded in
    s_dictionary and rewritten categories in c_dictionary, each paired
    with its original URI.
    """
    def decode(term):
        # Percent-decode, then try to collapse unicode escapes to UTF-8
        # bytes; terms that fail to decode stay percent-decoded only.
        term = urllib.unquote(term)
        try:
            term = unicode(term, 'unicode-escape').encode('utf-8')
        except UnicodeDecodeError:
            pass
        return term

    with open(i_dir + 'c_triples.nt', 'r') as i_file, \
         open(i_dir + 'c_triples_preprocessed.tab', 'w+') as o_file:
        for line in i_file:
            if line[0] == '#':  # skip comment lines
                continue
            # rstrip('\n') (not line[0:-1]) so a final line without a
            # trailing newline does not lose its last character.
            spo = ts.get_spo(line.rstrip('\n'))
            if len(spo) < 3:
                continue
            s = normalize(detach_sprefix(decode(spo[0])))
            c = normalize(detach_cprefix(decode(spo[2])))
            o_file.write(s + '\t' + 'categorizedIn' + '\t' + c + '\n')
            if s != spo[0]:
                s_dictionary.add((s, spo[0]))
            if c != spo[2]:
                c_dictionary.add((c, spo[2]))