Example #1
File: server.py  Project: junk13/joy
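# Note: this is a route-handler excerpt; it assumes server.py already imports
# os, time, tempfile, operator, OrderedDict, and the web framework's
# request/template helpers, and defines DataParser, get_files_by_time,
# classify_samples, get_color, out_dir, count_flocap, and the globals below.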
def results():
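    """Collect flow records from an uploaded or recently generated joy JSON
    file, classify them, and render the 'results' template."""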
    global flows
    global data
    global metadata
    global count_flocap
    global classifiers_to_display
    global classifier_names
        
    classifiers_to_display = []
    classifier_names = []
    display_fields = OrderedDict({})
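    # Read display-field and classifier definitions from the laui config file.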
    config_file = 'laui.cfg'
    with open(config_file, 'r') as fp:
        for line in fp:
            if line.startswith('display_field'):
                tokens = line.split()
                display_fields[int(tokens[3])] = (tokens[1], tokens[2].replace('_', ' '))
                continue
            elif line.strip() == '' or line.startswith('#') or not line.startswith('classifier'):
                continue
            tokens = line.split()
            if tokens[2] == 'logreg':
                classifiers_to_display.append((tokens[1], tokens[2], tokens[3], tokens[4]))
                classifier_names.append(tokens[1])
            elif tokens[2] == 'mapping':
                tmp_map = {}
                with open(tokens[4], 'r') as fp2:
                    for line2 in fp2:
                        tokens2 = line2.split()
                        tmp_map[tokens2[0]] = float(tokens2[1])
                classifiers_to_display.append((tokens[1], tokens[2], tmp_map, int(tokens[3])))
                classifier_names.append(tokens[1])

    file_names = []
    is_upload = False
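    # Use an uploaded JSON file if one was posted; otherwise fall back to the
    # most recent files in out_dir.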
    upload = request.files.get('upload')
    if upload is not None:
        dir_name = tempfile.mkdtemp()
        upload_path = os.path.join(dir_name, 'temp.json')
        upload.save(upload_path)

        file_names.append(upload_path)
        is_upload = True
    else:
        tmp_files = get_files_by_time(out_dir)
        tmp_files.reverse()
        # Analyze up to the six most recent output files.
        file_names.extend(out_dir + f for f in tmp_files[:6])

    start_time = time.time()
    data = []
    metadata = []
    total_flows = 0
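    # Build one feature vector per flow: metadata, packet lengths,
    # inter-packet times, and byte distribution.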
    for f in file_names:

        try:  # robustness check: skip files that fail to parse
            parser = DataParser(f)
            tmpBD = parser.getByteDistribution()
            tmpIPT = parser.getIndividualFlowIPTs()
            tmpPL = parser.getIndividualFlowPacketLengths()
            tmp, tmp_m = parser.getIndividualFlowMetadata()
        except Exception:
            continue
        if parser.advancedInfo is None:
            continue
        for k in parser.advancedInfo:
            flows[k] = parser.advancedInfo[k]

        if tmp is not None and tmpPL is not None and tmpIPT is not None:
            for idx in reversed(range(len(tmp))):
                tmp_data = []
                tmp_data.extend(tmp[idx])
                tmp_data.extend(tmpPL[idx])
                tmp_data.extend(tmpIPT[idx])
                tmp_data.extend(tmpBD[idx])

                # nga issue, will fix when pcaps start flowing again
                if tmp_data[2] == 0 and tmp_data[4] > 0:
                    continue
                if tmp_data[3] == 0 and tmp_data[5] > 0:
                    continue

                data.append(tmp_data)
                metadata.append(tmp_m[idx])
                total_flows += 1
                total_flows += 1

                if total_flows == count_flocap*2 and not is_upload:
                    break
        if total_flows == count_flocap*2 and not is_upload:
            break

    if is_upload:
        # Remove the uploaded temp file and its directory.
        os.remove(upload_path)
        os.rmdir(dir_name)

    # Score every collected flow with each configured classifier.
    results = classify_samples(data, metadata)

    # Guess the local host: the address seen most often as a flow source.
    lhost = {}
    for i in range(len(metadata)):
        if metadata[i][0] not in lhost:
            lhost[metadata[i][0]] = 1
        else:
            lhost[metadata[i][0]] += 1
    sorted_lhost = sorted(lhost.items(), key=operator.itemgetter(1))
    sorted_lhost.reverse()
    if len(sorted_lhost) > 0:
        (lh,_) = sorted_lhost[0]
    else:
        lh = None

    tmp = []
    to_display = []
    to_display_names = []
    for key in display_fields:
        to_display_names.append(display_fields[key])
    for i in range(len(results)):
        color = []
        for j in range(len(results[i])):
            color.append(get_color(results[i][j]))

        s_orgName = ''
        d_orgName = ''
        if metadata[i][0] == lh:
            s_orgName = 'localhost'
        if metadata[i][1] == lh:
            d_orgName = 'localhost'

        tmp_to_display = []
        for key in display_fields:
            tmp_to_display.append(metadata[i][key])

        tmp.append((results[i], metadata[i][0], metadata[i][1], metadata[i][2],
                    metadata[i][3], metadata[i][4], metadata[i][5], metadata[i][6],
                    metadata[i][7], color, s_orgName, d_orgName, metadata[i][8],
                    tmp_to_display))
    end_time = time.time()-start_time
    tmp = sorted(tmp,key=lambda x: x[0])
    tmp.reverse()

    return template('results',results=tmp,num_flows=len(results),t=end_time,classifier_names=classifier_names,
                    to_display_names=to_display_names)
Example #2
import googlemaps
from data_parser import DataParser
from kmeans import KMeans

# A Google Maps client is used for geocoding; substitute your own API key.
gmaps = googlemaps.Client(key='AIzaSyC543HM-TAYHQW25N6QQ81RbcFTwv6gyUY')

# Load the addresses to cluster from a CSV file via the project's DataParser.
dp = DataParser(gmaps)
dp.open_file_path(
    "C:/Users/Jelle/Documents/GitHub/address-clustering/testadressen.csv")
dp.name_address_from_csv()

# Cluster the parsed addresses into three groups.
kmeans = KMeans()
labeled_data, cluster_centers = kmeans.do_kmeans(dp.address_list, 3)

print(labeled_data)

for c in cluster_centers:
    # Reverse-geocode each cluster center once and print its street name and
    # house number from the returned address components.
    components = gmaps.reverse_geocode(c)[0]['address_components']
    street_name = components[1]['long_name']
    house_no = components[0]['long_name']
    print(street_name + " " + house_no)