Example No. 1
def tf_analyze(path):
    # Read the document and build a word-frequency dictionary for it.
    text = fileio.read_file(path)
    dictionary = ngram.wordgram_analyze(text.lower())

    # Compute the term-frequency value for every word in the document.
    tf_dict = OrderedDict()
    for k in dictionary:
        tf_dict[str(k)] = tf_value(dictionary, str(k))

    # Sort terms by descending tf value.
    sorted_tf_dict = OrderedDict(sorted(tf_dict.items(), key=operator.itemgetter(1), reverse=True))

    return sorted_tf_dict
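A minimal usage sketch, assuming the function lives in a module alongside tf_value and that "sample.txt" (a hypothetical file name) is a plain-text document in the working directory:

if __name__ == "__main__":
    scores = tf_analyze("sample.txt")
    # Print the ten highest-scoring terms.
    for term, tf in list(scores.items())[:10]:
        print(term, tf)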
Example No. 2
def idf_analyze(dirpath, path):
    # Build word-frequency dictionaries for every document in the corpus directory,
    # then one for the document of interest.
    dict_map = ngram.wordgram_map(dirpath)
    contents = fileio.read_file(path)
    dict_file = ngram.wordgram_analyze(contents.lower())

    # Compute the inverse-document-frequency value for every word in the document.
    dict_idf = OrderedDict()
    for dict_elm in dict_file:
        dict_idf[dict_elm] = idf_value(dirpath, dict_map, dict_elm)

    # Sort terms by descending idf value.
    sorted_idf_dict = OrderedDict(sorted(dict_idf.items(), key=operator.itemgetter(1), reverse=True))

    return sorted_idf_dict
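The two functions are natural building blocks for a tf-idf score. A sketch of one way to combine them, assuming it sits in the same module as the two functions above (so OrderedDict and operator are already imported); the combination itself is not part of the original examples:

def tfidf_analyze(dirpath, path):
    tf_scores = tf_analyze(path)
    idf_scores = idf_analyze(dirpath, path)
    # tf-idf is the per-term product of the two scores.
    tfidf = {term: tf * idf_scores.get(term, 0.0) for term, tf in tf_scores.items()}
    return OrderedDict(sorted(tfidf.items(), key=operator.itemgetter(1), reverse=True))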
Example No. 3
def get_api_key(api):
    # The config file holds one "service: key" pair per line; "#" starts a comment.
    api_file = io.read_file(os.path.join(DIRS["project"], "API.conf"))

    for line in api_file:
        line = line.strip()
        # Skip empty lines and comment lines.
        if not line or line.startswith("#"):
            continue
        name, _, key = line.partition(":")
        if name.strip().lower() == api.lower():
            return key.strip()
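For illustration, a configuration file in the format this parser expects might look like the lines below (the service names and keys are made up):

# API.conf - one "service: key" pair per line
twitter: abc123def456
yelp: 0123456789abcdef

A call such as get_api_key("yelp") would then return "0123456789abcdef".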
Example No. 4
def wordgram_map(dirpath):
    # Build a word-frequency dictionary for every .txt file in the directory.
    file_list = listdir(dirpath)
    dict_map = []

    for f in file_list:
        extension = path.splitext(f)[1]
        if extension != ".txt":
            continue
        # path.join handles a dirpath with or without a trailing separator.
        full_path = path.join(dirpath, f)
        file_contents = fileio.read_file(full_path)
        file_dict = ngram.wordgram_analyze(file_contents.lower())
        dict_map.append(file_dict)

    return dict_map
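A quick usage sketch, assuming "corpus" (a hypothetical directory name) contains plain-text .txt documents:

corpus_dicts = wordgram_map("corpus/")
print(len(corpus_dicts), "documents analyzed")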
Example No. 5
def wordgram_analyze(dirpath):
    dict_map = []
    file_list = listdir(dirpath)
    for fname in file_list:
        extension = os.path.splitext(fname)[1]
        if extension != '.txt':
            continue
        full_path = os.path.join(dirpath, fname)
        file_contents = fileio.read_file(full_path)           # Read the file contents.
        file_contents = remove_puc_marks(file_contents)       # Strip punctuation marks.
        file_dict = hannanum_analyze_22(file_contents)        # Classify into a dictionary by morpheme.
        # file_dict = hannanum_analyze_22_key(file_contents)  # Alternative key-based morpheme analysis.
        dict_map.append(file_dict)                            # Add to the overall dictionary list.
    return dict_map
Example No. 6
def main():
    o = None
    try:
        o = Options()
    except InvalidArgsException as e:
        print(str(e), file=sys.stderr)
        exit(-1)

    if o.help:
        print(HELP)
        return
    if o.input_file is None:
        print("No input file specified", file=sys.stderr)
        exit(-1)
    if o.output_file is None:
        print("No output file specified", file=sys.stderr)
        exit(-1)

    file_data = None
    try:
        file_data = fileio.read_file(o.input_file)
    except IOError:
        print(f"Error reading file: {o.input_file}", file=sys.stderr)
        exit(-1)

    pre = None
    try:
        pre = preprocessor.pre_process(file_data, None)
    except preprocessor.PreProcessException as e:
        print(str(e), file=sys.stderr)
        exit(-1)
    print(pre, end="\n\n\n")

    toks = None
    try:
        toks = lexical.tokenize(pre + "\n\n")
    except lexical.TokenizerException as e:
        print(str(e), file=sys.stderr)
        exit(-1)
    except IndexError:
        print("Unexpected end of file in tokenizing", file=sys.stderr)
        exit(-1)
    for t in toks:
        print(str(t))
Example No. 7
    # Excerpt from a longer function: labels_mapping, path, num_processes, k_fold,
    # process_uri and RDF are defined earlier in the source file.
    ml_cons = label_ml_cons(labels_mapping)
    cl_cons = label_cl_cons(labels_mapping)

    fileio.write_labels_mapping(labels_mapping, path + "\\labels.csv")

    output_file_test = path + "\\test"
    output_file_train = path + "\\train"
    filelist = []
    for i in range(0, num_processes):
        filelist.append(path + "\\execution_" + str(i) + "_.rdf")
    id_to_uri, graph_labels_train, graph_labels_test = fileio.create_graph(
        filelist, output_file_train, output_file_test, labels_mapping, k_fold, RDF.type, process_uri
    )

    database_train = fileio.read_file(output_file_train + str(0) + ".txt")
    statistics = gspan.database_statistics(database_train)
    print(statistics)

    len_ml_cons = len(ml_cons)
    len_cl_cons = len(cl_cons)
    len_combined = 2500  # len_ml_cons + len_cl_cons
    step_size = len_combined // 4  # integer division so range() accepts the step

    with open("D:\\Dissertation\\Data Sets\\Manufacturing\\classifiers_break_cons.csv", "w") as f:
        writer = csv.writer(f)
        writer.writerow(["num_features", "accuracy", "model", "runtime", "classifier"])
        for c in [True]:
            for num_constraints in range(1, len_combined, step_size):
                if num_constraints < len_ml_cons:
                    cons = (ml_cons[:num_constraints], [])
Example No. 8
import os

import fileio as io
import pandas as pd

# "c" is assumed to be an ACS/Census API client defined elsewhere in this script,
# exposing the acs_variables mapping and the get() query method used below.
vars = []
var_names = []
for key, value in c.acs_variables.items():
    print(key)
    for k, v in value.items():
        vars.append(v["variable"])
        var_names.append(k)

yelp_zips = io.read_file(os.path.join("..", "data", "yelp_zipcodes.csv"))

print(len(vars))

# Split the variable list into consecutive, non-overlapping batches of 50 for querying.
vars_1 = vars[0:50]
var_names_1 = var_names[0:50]
vars_2 = vars[50:100]
var_names_2 = var_names[50:100]
vars_3 = vars[100:]
var_names_3 = var_names[100:]

zip_query_1 = c.get(vars_1, year=2013, survey="acs5", zip_codes=yelp_zips)
zip_query_2 = c.get(vars_2, year=2013, survey="acs5", zip_codes=yelp_zips)
zip_query_3 = c.get(vars_3, year=2013, survey="acs5", zip_codes=yelp_zips)
Example No. 9
import sys
import os

import fileio
import gspan

if __name__ == '__main__':
    # Usage: python <script> <database file> <minimum support fraction>
    print('Database: ', sys.argv[1])
    database = fileio.read_file(sys.argv[1])
    print('Number Graphs Read: ', len(database))
    minsup = int(float(sys.argv[2]) * len(database))
    print('Support: ', sys.argv[2], minsup)
    # Drop infrequent node labels, then re-read the database keeping only frequent ones.
    database, freq, trimmed, flabels = gspan.trim_infrequent_nodes(
        database, minsup)
    database = fileio.read_file(sys.argv[1], frequent=freq)
    print('Trimmed ', len(trimmed), ' labels from the database')
    print(flabels)
    gspan.project(database, freq, minsup, flabels)
Example No. 10
def Gspan(support):
    # Convert the relative support threshold into an absolute graph count.
    database = fileio.read_file(r"database.txt")
    minsup = int(float(support) * len(database))
    # Trim infrequent node labels, re-read the database keeping only frequent labels,
    # then mine frequent subgraphs.
    database, freq, trimmed, flabels = gspan.trim_infrequent_nodes(database, minsup)
    database = fileio.read_file(r"database.txt", frequent=freq)
    gspan.project(database, freq, minsup, flabels)
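A minimal call sketch, assuming database.txt is a gSpan graph database in the working directory; the support value 0.5 is only an example:

if __name__ == "__main__":
    # Mine subgraphs that appear in at least half of the graphs.
    Gspan(0.5)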