Example #1
def get_infinite(tag_id, get_time):
    new_time = []
    new_weeks = 0
    allmonday_list = []
    non_monday_weeks = []
    summed_month = []
    year_list = []
    dframe = load_data(tag_id)
    df1 = datetime.datetime.strptime(get_time + ' 19:00:00',
                                     "%Y-%m-%d %H:%M:%S")
    k = 1
    # aggregate month by month over the 12 months of the year in get_time
    while (k != 13):
        weeks_list = []
        df1 = time_tango(str(df1.year) + "-" + str(k) + "-" + "01")
        b = 0
        non_monday_weeks = json.loads(new_week(tag_id, str(df1.date())))
        b = [item[1] for item in non_monday_weeks]
        weeks_list.append(sum(b))
        for i in allmondays(df1.year, df1.month):
            w = json.loads(new_week(tag_id, str(i)))
            lst2 = [item[1] for item in w]
            weeks_list.append(sum(lst2))
        summed_month.append(sum(weeks_list))
        k += 1
    year_list.append(sum(summed_month))
    return json.dumps(util.parse_data(year_list, tag_id))
Example #2
def get_aggregated_day(tag_id, get_time):
    new_time = []
    df = datetime.datetime.strptime(get_time + ' 00:00:00',
                                    "%Y-%m-%d %H:%M:%S")
    # build 25 hourly epoch-millisecond timestamps covering the requested day
    for i in range(25):
        new_time.append(
            int(time.mktime(utc_return_nowtime(df).timetuple()) * 1000))
        df += datetime.timedelta(hours=1)
    dt = util.parse_data_reon(
        Parallel(n_jobs=15, backend="threading")(
            delayed(qb.query_aggregated_func)(tag_id, new_time[j], new_time[j + 1], 's', 15, 'avg')
            for j in range(24)))
    dt = ast.literal_eval(dt)
    dft = []
    dt1 = []
    dt_final = []
    for i, v in dt.iteritems():
        dft = v['results']
    dft = [x for x in dft if x != []]
    for i in range(0, len(dft)):
        dt2 = (dft[i][0])
        # print(dt2)
        dt1.append(
            (datetime.datetime.fromtimestamp(dt2 / 1000)).strftime('%M'))
        if (dt1[i] == '00'):
            dt_final.append([dft[i][0], dft[i][1]])

    return json.dumps(util.parse_data(dt_final, tag_id))
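The function above slices one day into 25 hourly epoch-millisecond boundaries, queries each hour in parallel, and then keeps only the points that land exactly on the top of an hour. A minimal, self-contained sketch of that bucketing-and-filtering idea, with made-up sample points and ignoring qb, Parallel and the UTC conversion done by utc_return_nowtime:

import datetime
import time

def hourly_boundaries_ms(day_str):
    # 25 epoch-millisecond timestamps covering day_str, hour by hour
    start = datetime.datetime.strptime(day_str + ' 00:00:00', "%Y-%m-%d %H:%M:%S")
    return [int(time.mktime((start + datetime.timedelta(hours=h)).timetuple()) * 1000)
            for h in range(25)]

def keep_top_of_hour(points):
    # keep (ts_ms, value) pairs whose minute component is '00'
    return [(ts, val) for ts, val in points
            if datetime.datetime.fromtimestamp(ts / 1000.0).strftime('%M') == '00']

boundaries = hourly_boundaries_ms('2020-01-01')
sample_points = [(boundaries[0], 1.5), (boundaries[0] + 90000, 2.0)]  # second point is at 00:01:30
print(keep_top_of_hour(sample_points))  # only the first pair survives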
Example #3
def validate(data, trace=False):
    items = {}

    for (li, tvdb_id, sep, alias_list, line) in parse_data(data):
        # Check line ends with a ','
        if not line.strip().endswith(','):
            if trace:
                print "line does not end with ','"
                print "=================="
                print_error(li, line)
            return False

        # Check tvdb_id is a valid integer
        if not validate_tvdb_id(tvdb_id, trace=trace):
            if trace:
                print_error(li, line)
            return False

        # Check for duplicates
        if tvdb_id in items:
            if trace:
                print 'item already exists'
                print 'items[' + str(tvdb_id) + '] = ' + str(alias_list)
                print '=================='
                print_error(li, line)
            return False

        items[tvdb_id] = alias_list

    if trace:
        print 'valid'
    return True
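validate relies on parse_data yielding (li, tvdb_id, sep, alias_list, line) tuples. As a rough, standalone sketch of the same three checks (trailing comma, integer id, duplicate id), here is a toy version over a made-up "id: alias|alias," line format; parse_line below is illustrative and is not the project's parse_data:

def parse_line(line):
    ident, _, aliases = line.partition(':')
    return ident.strip(), [a.strip() for a in aliases.rstrip().rstrip(',').split('|')]

def validate_lines(lines):
    seen = {}
    for li, line in enumerate(lines):
        if not line.strip().endswith(','):
            print("line %d does not end with ','" % li)
            return False
        ident, aliases = parse_line(line)
        if not ident.isdigit():
            print("line %d has a non-integer id" % li)
            return False
        if ident in seen:
            print("line %d duplicates id %s" % (li, ident))
            return False
        seen[ident] = aliases
    return True

print(validate_lines(["71663: The Simpsons|Simpsons,", "81189: Breaking Bad,"]))  # True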
Example #4
def get_aggregated_year(tag_id, get_time):
    new_time = []
    new_weeks = 0
    allmonday_list = []

    non_monday_weeks = []
    week_tuple = []
    summed_month = []
    b = 0
    dframe = load_data(tag_id)
    for k in range(1, 13):
        weeks_list = []
        df1 = datetime.datetime.strptime(get_time + ' 19:00:00',
                                         "%Y-%m-%d %H:%M:%S")
        df1 = time_tango(str(df1.year) + "-" + str(k) + "-" + "01")

        non_monday_weeks = json.loads(new_week(tag_id, str(df1.date())))
        b = [item[1] for item in non_monday_weeks]
        weeks_list.append(sum(b))
        for i in allmondays(df1.year, df1.month):
            week = json.loads(new_week(tag_id, str(i)))
            lst2 = [item[1] for item in week]
            weeks_list.append(sum(lst2))
        summed_month.append(sum(weeks_list))
    return json.dumps(util.parse_data(summed_month, tag_id))
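Examples #1 and #4 share the same aggregation shape: each month contributes the sum of its weekly totals, and the yearly figure in Example #1 is just the sum of the monthly figures. A simplified sketch with made-up weekly numbers:

# weekly sums keyed by month; months 3-12 would follow the same shape
weekly_totals_by_month = {
    1: [10, 12, 9, 11],
    2: [8, 7, 10, 9],
}

summed_month = [sum(weeks) for month, weeks in sorted(weekly_totals_by_month.items())]
year_total = sum(summed_month)
print(summed_month, year_total)  # [42, 34] 76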
Example #5
def main():  
    # read in the data set
    subdir = 'data/original_tags/'
    fname = 'dataset.csv'
    data = util.parse_data(subdir,fname,extract_features=True)
    
    # randomize the data cases
    random.shuffle(data)
    
    # split into training and testing data
    slice = math.trunc(len(data)*(.8)) # 80% train, 20% test
    train_data = data[:slice]
    test_data = data[slice:]
    
    # instantiating classifier
    ovr = OneVsRestClassifier()
    a = Analyzer(subdir,fname)
    
    # printing dataset statistics
    print 'Dataset Statistics\n---'
    print 'Total Tokens:',a.total_tokens()
    print 'Total Types:',a.total_types()
    print 'Total Label Types:',a.total_label_types()
    print 'Average number of tags per sample:',a.mean_tag_set_size()
    print 
    
    # printing OVR training specific statistics
    print 'Training Statistics\n---'
    ovr.fit(train_data,threshold=200,print_stats=True)   
    
    total_hamming_error = ovr.total_hamming_error(test_data)
    total_recall_error = ovr.total_recall_error(test_data)
    total_precision_error = ovr.total_precision_error(test_data)
    test_size = len(test_data)
    
    print
    print 'Model Accuracy\n---'
    print 'Total Hamming Error:',total_hamming_error
    print 'Mean Hamming Error:',total_hamming_error / test_size
    print 'Total Recall Error:',total_recall_error
    print 'Mean Recall Error:',total_recall_error / test_size
    print 'Total Precision Error:',total_precision_error
    print 'Mean Precision Error:',total_precision_error / test_size   
    print
    
    # An example
    sample_str = 'How many numbers less than 70 are relatively prime to it?'
    sample = util.features(sample_str)
    gold_y = ovr.transform(['combinatorics','number-theory'])
    
    print 'An Example\n---'
    print sample_str
    pred_y = ovr.predict(sample)
    print 'Prediction:',ovr.inverse_transform(pred_y)
    print 'Actual:',ovr.inverse_transform(gold_y)
    print 'Hamming Error:',util.hamming_error(gold_y,pred_y)
    print 'Recall Error:',util.recall_error(gold_y,pred_y)
    print 'Precision Error:',util.precision_error(gold_y,pred_y)
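The three error figures printed above are assumed here to follow the usual multi-label definitions over binary indicator vectors (the project's util module may define them differently): Hamming error counts positions where prediction and gold disagree, recall error is the fraction of gold labels that were missed, and precision error is the fraction of predicted labels that are wrong. A hedged sketch:

def hamming_error(gold, pred):
    # number of label positions where the two vectors disagree
    return sum(g != p for g, p in zip(gold, pred))

def recall_error(gold, pred):
    relevant = sum(gold)
    missed = sum(1 for g, p in zip(gold, pred) if g == 1 and p == 0)
    return missed / float(relevant) if relevant else 0.0

def precision_error(gold, pred):
    predicted = sum(pred)
    wrong = sum(1 for g, p in zip(gold, pred) if p == 1 and g == 0)
    return wrong / float(predicted) if predicted else 0.0

gold = [1, 0, 1, 0]   # e.g. combinatorics and number-theory present
pred = [1, 1, 0, 0]
print(hamming_error(gold, pred), recall_error(gold, pred), precision_error(gold, pred))  # 2 0.5 0.5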
Example #6
    def __init__(self, subdir, fname):
        # read in the stopwords file
        stop_words_file = open(os.path.join('data/', 'stop_words'))
        self.stop_words = [line.rstrip('\n') for line in stop_words_file]  # stripping the trailing newline from each stop word
        self.data = util.parse_data(subdir, fname)

        # extract the unique labels present in the data
        self.label_set = self.get_label_set()
Example #7
def main():
    # read in the data set
    subdir = 'data/original_tags/'
    fname = 'dataset.csv'
    data = util.parse_data(subdir, fname, extract_features=True)

    # grab the number of trials from the cmd line input
    num_trials = int(sys.argv[1])

    # set up dictionary to hold results
    trial_results = {}
    trial_results['sum_total_hamming'] = 0
    trial_results['sum_total_precision'] = 0
    trial_results['sum_total_recall'] = 0

    trial_results['sum_mean_hamming'] = 0
    trial_results['sum_mean_precision'] = 0
    trial_results['sum_mean_recall'] = 0

    print 'Trial ',
    # run the trials
    for i in range(num_trials):
        print(i + 1),
        # randomize the data cases
        random.shuffle(data)

        # split into training and testing data
        slice = math.trunc(len(data) * (.8))  # 80% train, 20% test
        train_set = data[:slice]
        test_set = data[slice:]

        # train a new classifier
        ovr = OneVsRestClassifier()
        ovr.fit(train_set)

        # determine total error for each metric
        total_hamming_error = ovr.total_hamming_error(test_set)
        total_precision_error = ovr.total_precision_error(test_set)
        total_recall_error = ovr.total_recall_error(test_set)
        n = len(test_set)

        # update relevant error entries in the dictionary
        trial_results['sum_total_hamming'] += total_hamming_error
        trial_results['sum_total_precision'] += total_precision_error
        trial_results['sum_total_recall'] += total_recall_error

        trial_results['sum_mean_hamming'] += (total_hamming_error / n)
        trial_results['sum_mean_precision'] += (total_precision_error / n)
        trial_results['sum_mean_recall'] += (total_recall_error / n)

    # print the results
    print '\n---'
    print 'Number of trials:', num_trials
    for metric, value in trial_results.items():
        print metric, value
        print 'Average:', value / num_trials
        print
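The trial loop above is a repeated random-split (Monte Carlo) evaluation: shuffle, take an 80/20 split, train, score, and average across trials. A bare-bones sketch of that loop, with a dummy dataset and a placeholder error function standing in for the real classifier and metrics:

import math
import random

def dummy_error(train, test):
    return float(len(test))  # placeholder for a real error metric

data = list(range(100))
num_trials = 5
running_total = 0.0

for _ in range(num_trials):
    random.shuffle(data)
    cut = math.trunc(len(data) * 0.8)   # 80% train, 20% test
    train, test = data[:cut], data[cut:]
    running_total += dummy_error(train, test) / len(test)

print('mean error over', num_trials, 'trials:', running_total / num_trials)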
Example #9
def get_parsed_data(features):
    parsed_features = []
    # parsing line by line
    for feature in features:
        parsed_data = parse_data(str(feature))

        # If list is not empty
        if parsed_data is not None:
            parsed_features.append(parsed_data)

    final_features = remove_inconsistencies(parsed_features)

    return final_features
Example #11
def get_features(folderpath):
    features = []
    fileData = read_file(folderpath)
    # For each line
    for dataVar in fileData:
        parsed_data = parse_data(dataVar)
        # If list is not empty
        if parsed_data is not None:
            features.append(parsed_data)

    # Imputing missing data
    final_features = remove_inconsistencies(features)

    return final_features
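get_parsed_data and get_features share the same parse -> drop-None -> clean pipeline. A toy illustration with stand-in helpers (parse_line and drop_incomplete below are illustrative, not the project's parse_data and remove_inconsistencies):

def parse_line(line):
    parts = line.strip().split(',')
    return parts if len(parts) == 3 else None   # reject malformed rows

def drop_incomplete(rows):
    # stand-in for remove_inconsistencies: drop rows with empty fields
    return [row for row in rows if all(field != '' for field in row)]

raw = ["1,foo,2.5", "bad line", "2,,3.1", "3,bar,0.9"]
parsed = [p for p in (parse_line(l) for l in raw) if p is not None]
print(drop_incomplete(parsed))   # [['1', 'foo', '2.5'], ['3', 'bar', '0.9']]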
Example #12
def merge(localPath, remotePath):
    items = {}
    key_order = []

    changes = {}

    for path in [localPath, remotePath]:
        for (li, tvdb_id, sep, alias_list, line) in parse_data(get_text(path)):
            if not validate_tvdb_id(tvdb_id):
                continue

            if not items.has_key(tvdb_id):
                items[tvdb_id] = []
                key_order.append(tvdb_id)

            for alias in alias_list:
                alias = alias.strip().replace("'", "\\'")

                if not find_match(alias, items[tvdb_id]):
                    items[tvdb_id].append(alias)

                    # track remote changes
                    if path == remotePath:
                        if not changes.has_key(tvdb_id):
                            changes[tvdb_id] = []
                        changes[tvdb_id].append(alias)

    print "----------------------------------------------------------"
    print "New Shows"
    print "----------------------------------------------------------"
    for ck, added in changes.items():
        if items[ck] == added:
            print str(ck) + '\tnew\t\t' + str(added)

    print "----------------------------------------------------------"
    print "New Aliases"
    print "----------------------------------------------------------"
    for ck, added in changes.items():
        if items[ck] != added:
            print str(ck) + '\tadd\t\t' + str(added)
            print '=============\t', items[ck]
            print

    return dict_to_data(items, key_order)
Example #13
def remove_duplicates(path):
    items = {}
    key_order = []

    changes = {}

    for (li, tvdb_id, sep, alias_list, line) in parse_data(get_text(path)):
        if not validate_tvdb_id(tvdb_id):
            continue

        if not items.has_key(tvdb_id):
            items[tvdb_id] = []
            key_order.append(tvdb_id)

        for alias in alias_list:
            alias = alias.strip().replace("'", "\\'")

            if not find_match(alias, items[tvdb_id]):
                items[tvdb_id].append(alias)

    return dict_to_data(items, key_order)
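remove_duplicates keeps the first occurrence of each alias per id while preserving insertion order. Ignoring the fuzzy find_match comparison and the separate key_order bookkeeping, the core de-duplication can be sketched with OrderedDict.fromkeys on made-up data:

from collections import OrderedDict

aliases_by_id = {
    71663: ['The Simpsons', 'Simpsons', 'The Simpsons'],
    81189: ['Breaking Bad', 'Breaking Bad'],
}
deduped = dict((k, list(OrderedDict.fromkeys(v))) for k, v in aliases_by_id.items())
print(deduped)  # inner lists keep first-seen order: ['The Simpsons', 'Simpsons'] and ['Breaking Bad']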
Example #14
def get_aggregated_week(tag_id, get_time):
    new_time = []
    new_list = []
    dframe = load_data(tag_id)
    pars = dateutil.parser.parse(get_time).date()
    df = time_tango(pars)
    week_day = df.weekday()

    # step back to the Monday of the week containing get_time
    for k in range(week_day):
        df -= datetime.timedelta(days=1)

    # for each of the 8 days (Monday through the next Monday), keep the last non-zero reading
    for i in range(8):
        try:
            if (df.date() <= datetime.datetime.now().date()):
                datevalues = dframe.Values[dframe.Date == df.date()].tolist()
                datevalues = [x for x in datevalues if x != 0]
                datevalues = datevalues[::-1]
                #values1=datevalues[0]
                new_list.append(datevalues[0])
            else:
                new_list.append(0)
        except:
            new_list.append(0)
        df += datetime.timedelta(days=1)
    df = datetime.datetime.strptime(get_time + ' 19:00:00',
                                    "%Y-%m-%d %H:%M:%S")
    df = df - datetime.timedelta(days=df.weekday())
    # per-day usage is the difference between consecutive days' cumulative readings
    for i in range(len(new_list) - 1):
        mid = new_list[i + 1] - new_list[i]
        try:
            if (df.date() == datetime.datetime.now().date()):
                mid = datevalues[-1] - datevalues[0]
        except:
            mid = 0
        if (mid < 0 or new_list[i] == 0):
            new_time.append(0)
        else:
            new_time.append(mid)
        df += datetime.timedelta(days=1)
    return json.dumps(util.parse_data(new_time, tag_id))
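The differencing pass at the end of get_aggregated_week turns cumulative daily readings into per-day usage, clamping negative jumps (counter resets) and days without a baseline to zero. A stripped-down sketch with made-up readings, where 0 stands for a day with no data:

daily_readings = [100, 130, 0, 180, 175, 220, 220, 260]

daily_usage = []
for prev, curr in zip(daily_readings, daily_readings[1:]):
    diff = curr - prev
    daily_usage.append(diff if diff > 0 and prev != 0 else 0)

print(daily_usage)  # [30, 0, 0, 0, 45, 0, 40]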
Example #15
        #R_gt_44 = np.matmul(R_tgt,np.linalg.inv(R_src))
        R_gt_44 = np.matmul(R_tgt_first, np.linalg.inv(R_src_first))
        R_gt = R_gt_44[:3,:3]
    else:
        R_src_first = R_src
        R_tgt_first = R_tgt

        #R_gt_44 = np.matmul(R_tgt,np.linalg.inv(R_src))
        R_gt_44 = np.matmul(R_tgt_first, np.linalg.inv(R_src_first))
        #R_gt_44 = np.matmul(R_tgt, np.linalg.inv(R_src))
        R_gt = R_gt_44[:3,:3]
    
    # generate source/target scans, point cloud

    depth_src, depth_tgt, _, _, color_src, color_tgt, pc_src, pc_tgt = util.parse_data(depth, rgb, norm, args.dataset, args.method)

    if len(pc_src) == 0 or len(pc_tgt) == 0:
        print("this point cloud file contains no points")
        continue

    if args.old_scannet:
        overlap_val, cam_dist_this, pc_dist_this, pc_nn = util.point_cloud_overlap(pc_src, pc_tgt, R_gt_44)

    overlap = '0-0.1' if overlap_val <= 0.1 else '0.1-0.5' if overlap_val <= 0.5 else '0.5-1.0'

    data_s = {'rgb':   rgb[0,0,:,:,:].transpose(1,2,0),
Example #16
            # use original-size scans for baselines on the scannet dataset
            if 'scannet' in args.dataList and 'ours' not in args.method:
                rgb, depth = data['rgb_full'], data['depth_full']
            R = torch_op.npy(R)
            rgb = torch_op.npy(rgb * 255).clip(0, 255).astype('uint8')
            norm = torch_op.npy(norm)
            depth = torch_op.npy(depth)
            segm = torch_op.npy(segm)

            R_src = R[0, 0, :, :]
            R_tgt = R[0, 1, :, :]
            R_gt_44 = np.matmul(R_tgt, np.linalg.inv(R_src))
            R_gt = R_gt_44[:3, :3]

            # generate source/target scans, point cloud
            depth_src, depth_tgt, normal_src, normal_tgt, color_src, color_tgt, pc_src, pc_tgt = util.parse_data(
                depth, rgb, norm, args.dataList, args.method)

            if len(pc_src) == 0 or len(pc_tgt) == 0:
                print(f"this point cloud file contain no point")
                continue

            # compute overlap and other stats
            overlap_val, cam_dist_this, pc_dist_this, pc_nn = util.point_cloud_overlap(
                pc_src, pc_tgt, R_gt_44)
            overlap = '0-0.1' if overlap_val <= 0.1 else '0.1-0.5' if overlap_val <= 0.5 else '0.5-1.0'

            # do not test non-overlapping pairs with traditional methods, since that makes no sense
            if args.method in ['fgs', 'gs', 'super4pcs', 'cgs'] and overlap_val < 0.1:
                continue
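The ground-truth transform above is the relative pose R_gt_44 = np.matmul(R_tgt, np.linalg.inv(R_src)), whose top-left 3x3 block is the relative rotation, and the overlap ratio is bucketed into three coarse ranges. A small numpy illustration with two arbitrary z-axis rotations standing in for the real camera poses:

import numpy as np

def rot_z_44(theta):
    # 4x4 homogeneous transform that rotates by theta around z
    c, s = np.cos(theta), np.sin(theta)
    T = np.eye(4)
    T[:3, :3] = [[c, -s, 0], [s, c, 0], [0, 0, 1]]
    return T

R_src = rot_z_44(0.3)
R_tgt = rot_z_44(1.0)
R_gt_44 = np.matmul(R_tgt, np.linalg.inv(R_src))
R_gt = R_gt_44[:3, :3]          # relative rotation, here 0.7 rad around z

overlap_val = 0.37              # made-up overlap ratio
overlap = '0-0.1' if overlap_val <= 0.1 else '0.1-0.5' if overlap_val <= 0.5 else '0.5-1.0'
print(np.round(R_gt, 3), overlap)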
Example #17
import sys
import os
import math
import random
import util

from keyword_frequency_classifier import KeywordFrequencyClassifier
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
from nltk import NaiveBayesClassifier

# analysis of the single label classification models
if __name__ == "__main__":
    # read in the data set
    subdir = 'data/single_tags/'
    fname = 'dataset.csv'
    data = util.parse_data(subdir,
                           fname,
                           single_label=True,
                           extract_features=True)

    # randomize the data cases
    random.shuffle(data)

    # split into training and testing data
    slice = math.trunc(len(data) * (.8))  # 80% train, 20% test
    train_set = data[:slice]
    test_set = data[slice:]

    # train classification models
    print 'Training models on', len(train_set), 'data samples...'
    nb = NaiveBayesClassifier.train(train_set)
    lr = SklearnClassifier(LogisticRegression()).train(train_set)
    kwfc = KeywordFrequencyClassifier()
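The snippet above is truncated after the classifiers are constructed. As a hedged, self-contained illustration of the nltk training and evaluation workflow it relies on, here is a toy NaiveBayesClassifier trained and scored on hand-made feature dicts unrelated to the project's dataset:

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy

train_set = [({'prime': True, 'count': True}, 'number-theory'),
             ({'choose': True, 'count': True}, 'combinatorics'),
             ({'prime': True, 'modulo': True}, 'number-theory'),
             ({'choose': True, 'arrange': True}, 'combinatorics')]
test_set = [({'prime': True}, 'number-theory'),
            ({'choose': True}, 'combinatorics')]

nb = NaiveBayesClassifier.train(train_set)
print(accuracy(nb, test_set))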