import operator
from collections import OrderedDict

import fileio
import ngram


def tf_analyze(path):
    # Read the document and count words, case-folded.
    text = fileio.read_file(path)
    dictionary = ngram.wordgram_analyze(text.lower())
    # Compute a term-frequency score for every distinct word.
    tf_dict = OrderedDict()
    for k in dictionary:
        tf_dict[str(k)] = tf_value(dictionary, str(k))
    # Return the words sorted by descending TF score.
    return OrderedDict(sorted(tf_dict.items(), key=operator.itemgetter(1), reverse=True))
def idf_analyze(dirpath, path):
    # Build per-file word dictionaries for the whole corpus directory.
    dict_map = ngram.wordgram_map(dirpath)
    # Count words in the target document, case-folded.
    contents = fileio.read_file(path)
    dict_file = ngram.wordgram_analyze(contents.lower())
    # Compute an inverse-document-frequency score for every distinct word.
    dict_idf = OrderedDict()
    for dict_elm in dict_file:
        dict_idf[dict_elm] = idf_value(dirpath, dict_map, dict_elm)
    # Return the words sorted by descending IDF score.
    return OrderedDict(sorted(dict_idf.items(), key=operator.itemgetter(1), reverse=True))
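# A minimal sketch of combining the two analyses above into TF-IDF scores.
# tfidf_analyze is hypothetical (not part of the module above); it assumes
# tf_analyze and idf_analyze return dictionaries keyed by the same word
# strings, as their loops suggest.
def tfidf_analyze(dirpath, path):
    tf = tf_analyze(path)
    idf = idf_analyze(dirpath, path)
    # Multiply per-word TF by IDF; words with no IDF entry score 0.
    tfidf = {word: tf[word] * idf.get(word, 0.0) for word in tf}
    return OrderedDict(sorted(tfidf.items(), key=operator.itemgetter(1), reverse=True))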
import os

import fileio as io


def get_api_key(api):
    # API.conf holds one "name: key" pair per line; '#' starts a comment.
    api_file = io.read_file(os.path.join(DIRS["project"], "API.conf"))
    for line in api_file:
        # Skip empty lines and comment lines.
        if not line or line.startswith("#"):
            continue
        line_split = line.split(":")
        if line_split[0].lower() == api.lower():
            return line_split[1].strip()
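# Usage sketch for get_api_key; the config format below is the one the
# parser above implies, and the service name and key are illustrative only:
#
#   # API.conf
#   yelp: abc123
#
# get_api_key("yelp")  # -> "abc123"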
from os import listdir, path

import fileio
import ngram


def wordgram_map(dirpath):
    # Build one word dictionary per .txt file in the directory.
    file_list = listdir(dirpath)
    dict_map = []
    for f in file_list:
        extension = path.splitext(f)[1]
        if extension != ".txt":
            continue
        full_path = path.join(dirpath, f)  # join, not string concatenation
        file_contents = fileio.read_file(full_path)
        file_dict = ngram.wordgram_analyze(file_contents.lower())
        dict_map.append(file_dict)
    return dict_map
def wordgram_analyze(dirpath):
    # Build one morpheme dictionary per .txt file in the directory.
    dict_map = []
    file_list = listdir(dirpath)
    for fname in file_list:
        extension = path.splitext(fname)[1]
        if extension != '.txt':
            continue
        full_path = path.join(dirpath, fname)
        file_contents = fileio.read_file(full_path)           # read the file contents
        file_contents = remove_puc_marks(file_contents)       # strip punctuation marks
        file_dict = hannanum_analyze_22(file_contents)        # bucket the text into a dictionary by morpheme
        # file_dict = hannanum_analyze_22_key(file_contents)  # alternative: keyed morpheme analysis
        dict_map.append(file_dict)                            # append to the corpus-wide list
    return dict_map
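# Usage sketch (directory name illustrative): both walkers above return one
# dictionary per .txt file, the per-file corpus map that idf_analyze consumes.
# corpus_dicts = wordgram_map("corpus/")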
import sys

import fileio
import lexical
import preprocessor


def main():
    # Parse command-line options.
    try:
        o = Options()
    except InvalidArgsException as e:
        print(str(e), file=sys.stderr)
        sys.exit(-1)
    if o.help:
        print(HELP)
        return
    if o.input_file is None:
        print("No input file specified", file=sys.stderr)
        sys.exit(-1)
    if o.output_file is None:
        print("No output file specified", file=sys.stderr)
        sys.exit(-1)
    # Read the source file.
    try:
        file_data = fileio.read_file(o.input_file)
    except IOError:
        print(f"Error reading file: {o.input_file}", file=sys.stderr)
        sys.exit(-1)
    # Run the preprocessor over the raw file contents.
    try:
        pre = preprocessor.pre_process(file_data, None)
    except preprocessor.PreProcessException as e:
        print(str(e), file=sys.stderr)
        sys.exit(-1)
    print(pre, end="\n\n\n")
    # Tokenize the preprocessed text.
    try:
        toks = lexical.tokenize(pre + "\n\n")
    except lexical.TokenizerException as e:
        print(str(e), file=sys.stderr)
        sys.exit(-1)
    except IndexError:
        print("Unexpected end of file in tokenizing", file=sys.stderr)
        sys.exit(-1)
    for t in toks:
        print(str(t))
# Fragment: labels_mapping, path, num_processes, k_fold, and process_uri
# are defined earlier in the surrounding script.
ml_cons = label_ml_cons(labels_mapping)
cl_cons = label_cl_cons(labels_mapping)
fileio.write_labels_mapping(labels_mapping, path + "\\labels.csv")

output_file_test = path + "\\test"
output_file_train = path + "\\train"
filelist = []
for i in range(num_processes):
    filelist.append(path + "\\execution_" + str(i) + "_.rdf")
id_to_uri, graph_labels_train, graph_labels_test = fileio.create_graph(
    filelist, output_file_train, output_file_test, labels_mapping,
    k_fold, RDF.type, process_uri
)
database_train = fileio.read_file(output_file_train + str(0) + ".txt")
statistics = gspan.database_statistics(database_train)
print(statistics)

len_ml_cons = len(ml_cons)
len_cl_cons = len(cl_cons)
len_combined = 2500  # len_ml_cons + len_cl_cons
step_size = len_combined // 4  # integer step so range() accepts it
with open("D:\\Dissertation\\Data Sets\\Manufacturing\\classifiers_break_cons.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(["num_features", "accuracy", "model", "runtime", "classifier"])
    for c in [True]:
        for num_constraints in range(1, len_combined, step_size):
            # Use only must-link constraints while fewer than len_ml_cons are requested.
            if num_constraints < len_ml_cons:
                cons = (ml_cons[:num_constraints], [])
import os

import fileio as io
import pandas as pd

# Collect ACS variable codes and their readable names from the census
# client's variable map (c is the census API client, configured elsewhere).
acs_vars = []
var_names = []
for key, value in c.acs_variables.items():
    print(key)
    for k, v in value.items():
        acs_vars.append(v["variable"])
        var_names.append(k)

yelp_zips = io.read_file(os.path.join("..", "data", "yelp_zipcodes.csv"))
print(len(acs_vars))

# Query the variables in contiguous chunks of at most 50, so no variable
# is dropped between batches.
vars_1 = acs_vars[0:50]
var_names_1 = var_names[0:50]
vars_2 = acs_vars[50:100]
var_names_2 = var_names[50:100]
vars_3 = acs_vars[100:]
var_names_3 = var_names[100:]

zip_query_1 = c.get(vars_1, year=2013, survey="acs5", zip_codes=yelp_zips)
zip_query_2 = c.get(vars_2, year=2013, survey="acs5", zip_codes=yelp_zips)
zip_query_3 = c.get(vars_3, year=2013, survey="acs5", zip_codes=yelp_zips)
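# A possible continuation (hypothetical: assumes each zip_query_* is a list
# of per-ZIP records, which the snippet above does not show). pandas is
# already imported, so the three chunked queries can be stitched back into
# one table, one row per ZIP code.
zip_df = pd.concat(
    [pd.DataFrame(q) for q in (zip_query_1, zip_query_2, zip_query_3)],
    axis=1,
)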
import sys

import fileio
import gspan

if __name__ == '__main__':
    print('Database:', sys.argv[1])
    database = fileio.read_file(sys.argv[1])
    print('Number Graphs Read:', len(database))
    # Convert the relative support (argv[2], a fraction) into an absolute
    # count of graphs.
    print('Support:', sys.argv[2], end=' ')
    minsup = int(float(sys.argv[2]) * len(database))
    print(minsup)
    # Drop infrequent node labels, then re-read the database keeping only
    # the frequent ones before mining.
    database, freq, trimmed, flabels = gspan.trim_infrequent_nodes(database, minsup)
    database = fileio.read_file(sys.argv[1], frequent=freq)
    print('Trimmed', len(trimmed), 'labels from the database')
    print(flabels)
    gspan.project(database, freq, minsup, flabels)
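# Example invocation (script name, database file, and support value are all
# illustrative): mine with a 20% relative support threshold.
#   python mine.py graphs.txt 0.2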
def Gspan(support):
    # Same pipeline as the entry point above, against a fixed database.txt;
    # support is a fraction of the database size.
    database = fileio.read_file(r"database.txt")
    minsup = int(float(support) * len(database))
    database, freq, trimmed, flabels = gspan.trim_infrequent_nodes(database, minsup)
    database = fileio.read_file(r"database.txt", frequent=freq)
    gspan.project(database, freq, minsup, flabels)
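# Usage sketch (the 0.5 threshold is illustrative):
# Gspan(0.5)  # mines subgraphs occurring in at least half of the graphs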