Example #1
    def export_to_csv(
            self,
            results: Collection[RepoDiffMetrics],
    ) -> None:
        header: Tuple[str, ...] = ('role id', 'v1', 'v2')
        change_cats = sorted(c.__name__ for c in get_diff_category_leafs())
        header = tuple(chain(header, change_cats))

        csv_lines: List[Tuple[Union[str, int, None], ...]] = []
        for repo_metrics in results:
            role_id = repo_metrics.id
            for metrics in repo_metrics.metric_map.values():
                v1 = metrics.v1
                v2 = metrics.v2
                summ = metrics.metric_summary
                csv_line: Tuple[Union[str, int, None], ...] = (role_id, v1, v2)
                if summ is None:
                    csv_line = tuple(chain(
                            csv_line, [None] * len(change_cats)))
                else:
                    csv_line = tuple(chain(
                            csv_line, (summ[cat] for cat in change_cats)))
                csv_lines.append(csv_line)

        write_csv(self.out / 'diff_metrics.csv', header, csv_lines)
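The write_csv helpers used throughout these examples are project-specific and have different shapes. For the (path, header, rows) call used above, a minimal sketch built on the standard csv module might look like the following (the name and signature are assumptions for illustration, not the project's actual implementation):

import csv


def write_csv(path, header, rows):
    # Write one header row followed by the data rows; csv.writer stringifies
    # values and renders None as an empty field.
    with open(path, 'w', newline='') as fh:
        writer = csv.writer(fh)
        writer.writerow(header)
        writer.writerows(rows)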
Example #2
def run():

    print("Performing Base MLP")
    print("-------------------")

    # Train with dataset (one classifier per dataset: fit() returns the estimator
    # itself, so reusing a single instance would overwrite the first model)
    mlp_clfr1 = MLPClassifier(activation="logistic",
                              solver="sgd")  # 100 neurons by default
    mlp_clfr2 = MLPClassifier(activation="logistic",
                              solver="sgd")
    mlp_model1 = mlp_clfr1.fit(train1_x, train1_y)
    mlp_model2 = mlp_clfr2.fit(train2_x, train2_y)

    # Predict trained model with dataset
    y_predict1 = mlp_model1.predict(test1_x)
    y_predict2 = mlp_model2.predict(test2_x)

    # Evaluate score on dataset
    eval_dataset(1, mlp_model1)

    # Plot confusion matrix
    c_matrix1 = plot_confusion_matrix(mlp_model1, test1_y, y_predict1)

    # Print precision, recall, f1-score, accuracy, macro-avg f1, weighted-avg f1
    print_model_details(test1_y, y_predict1)

    # Repeat steps for dataset 2
    eval_dataset(2, mlp_model2)
    test2_y_predict = mlp_model2.predict(test2_x)
    c_matrix2 = plot_confusion_matrix(mlp_model2, test2_y, y_predict2)
    print_model_details(test2_y, y_predict2)

    # Output results into file
    util.write_csv("./output/Base-MLP-DS1.csv", test1_y, y_predict1, c_matrix1)
    util.write_csv("./output/Base-MLP-DS2.csv", test2_y, y_predict2, c_matrix2)
Example #3
def main(args):
    # Set revcomp parameter.
    if args.r != 1:
        args.r = False
    elif args.r == 1 and args.alphabet != 'DNA':
        print("Error, the -r parameter can only be used in DNA.")
    elif args.r == 1 and args.alphabet == 'DNA':
        args.r = True

    # Set alphabet parameter.
    if args.alphabet == 'DNA':
        args.alphabet = index_list.DNA
    elif args.alphabet == 'RNA':
        args.alphabet = index_list.RNA
    elif args.alphabet == 'Protein':
        args.alphabet = index_list.PROTEIN

    res = make_kmer_vector(k=args.k, alphabet=args.alphabet, filename=args.inputfile, revcomp=args.r)

    # Write correspond res file.
    if args.f == 'svm':
        from util import write_libsvm
        write_libsvm(res, [args.l] * len(res), args.outputfile)
    elif args.f == 'tab':
        from util import write_tab
        write_tab(res, args.outputfile)
    elif args.f == 'csv':
        from util import write_csv
        write_csv(res, args.outputfile)
Example #4
def run():
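    # Build simple metadata features (comment, group, view and fave counts), fit a
    # random forest regressor on the first 80% of the samples, and write the held-out
    # predictions next to the true labels as CSV.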
    f = open('/home/kirayue/final/metadata', 'r')
    metadata = json.load(f)
    label = open('dataset/t1_train_label.txt', 'r').readlines()

    X = []
    y = []
    for ind, v in enumerate(label):
        if str(ind) in metadata:
            feature = []
            cur_metadata = metadata[str(ind)]
            feature.append(cur_metadata['commentCount'])
            feature.append(cur_metadata['num_groups'])
            feature.append(cur_metadata['viewCount'])
            feature.append(cur_metadata['faveCount'])
            X.append(feature)
            y.append(float(label[ind].replace('\n', '')))

    train_num = int(len(X) * 0.8)
    reg = RandomForestRegressor(n_estimators=400, n_jobs=50)
    print("Training ...")
    reg.fit(X[:train_num], y[:train_num])
    y_t = reg.predict(X[train_num:])
    print("MAE: {}".format(mean_absolute_error(y[train_num:], y_t)))
    print("MSE: {}".format(mean_squared_error(y[train_num:], y_t)))
    util.write_csv(y_t, y[train_num:], 'result/meta_predict.csv')
Example #5
def main(args):
    #TODO:args.method will be finished
    #TODO:args.inputfile, name

    if args.alphabet == "RNA":

        if args.method.upper() == 'TRIPLET':
            res = get_triplet_matrix(args.inputfile)
        elif args.method.upper() == 'PSESSC':
            if args.k is None:
                print("parameter k is required. The default value of k is 2.")
                args.k = 2
            if args.r is None:
                print("parameter r is required. The default value of r is 2.")
                args.r = 2
            if args.w is None:
                print("parameter w is required. The default value of w is 0.1.")
                args.w = 0.1
            res = get_psessc_matrix(args.inputfile, args.k, args.r, args.w)
        elif args.method.upper() == 'PSEDPC':
            if args.n is None:
                print("parameter n is required. The default value of n is 0.")
                args.n = 0
            if args.r is None:
                print("parameter r is required. The default value of r is 2.")
                args.r = 2
            if args.w is None:
                print("parameter w is required. The default value of w is 0.1.")
                args.w = 0.1
            res = get_psedpc_matrix(args.inputfile, args.n, args.r, args.w)
        else:
            print("Method error!")
    else:
        print("sequence type error!")
    # Write correspond res file.

    if args.f == 'tab':
        from util import write_tab

        write_tab(res, args.outputfile)
    elif args.f == 'svm':
        if args.multi == 0 and args.l is None:
            args.l = '+1'
        elif args.multi == 0 and (args.l != '+1' and args.l != '-1'):
            print("For binary classification, the label should be either '+1' or '-1'.")
            return False
        elif args.multi == 1 and args.l is None:
            args.l = '0'
        elif args.multi == 1 and args.l is not None:
            try:
                label = int(args.l)
            except ValueError:
                print('The labels should be integers.')
                return False
        from util import write_libsvm
        write_libsvm(res, [args.l] * len(res), args.outputfile)
    elif args.f == 'csv':
        from util import write_csv
        write_csv(res, args.outputfile)
Example #6
def run():
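    # Evaluate the fine-tuned VGG19 model on the slice of the HDF5 dataset from index
    # 300000 onward and save its predictions alongside the true values.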
    data = h5py.File("image_arr_N_224_224_3.processed.hdf5", "r")
    test_X = data['X'][300000:]
    test_y = data['y'][300000:]
    model = load_model('model/fine_tune_VGG19.train_conv.train_fc.model')
    y_t = model.predict(test_X)
    print("MAE: {}".format(mean_absolute_error(test_y, y_t)))
    print("MSE: {}".format(mean_squared_error(test_y, y_t)))
    util.write_csv(y_t, test_y, 'result/cnn.csv')
Example #7
def build_author_key_csv(author_key_list, authors):
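    # Emit one CSV row per (author, key) pair; the author's id, name and email are
    # repeated for each of that author's keys.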
    csv = []
    for author in authors:
        row = [author['id'], author['first_name'], author['last_name'], author['email']]
        for key in author['keys']:
            csv += [row + [key, 'x', make_author_link(key)]]

    write_csv(author_key_list, ['id', 'first_name', 'last_name', 'email', 'key',
                                'valid', 'key_link'], csv)
Example #8
def main(args):
    with open(args.inputfile) as f:
        k = read_k(args.alphabet, args.method, 0)

        # Get index_list.
        if args.i is not None:
            from pse import read_index
            ind_list = read_index(args.i)
        else:
            ind_list = []

        default_e = []
        # Set Pse default index_list.
        if args.alphabet == 'DNA':
            args.alphabet = index_list.DNA
            if k == 2:
                default_e = const.DI_INDS_6_DNA
            elif k == 3:
                default_e = const.TRI_INDS_DNA
        elif args.alphabet == 'RNA':
            args.alphabet = index_list.RNA
            default_e = const.DI_INDS_RNA
        elif args.alphabet == 'Protein':
            args.alphabet = index_list.PROTEIN
            default_e = const.INDS_3_PROTEIN

        theta_type = 1
        if args.method in const.METHODS_AC:
            theta_type = 1
        elif args.method in const.METHODS_CC:
            theta_type = 2
        elif args.method in const.METHODS_ACC:
            theta_type = 3
        else:
            print("Method error!")

        # ACC.
        if args.e is None and len(ind_list) == 0 and args.a is False:
            # Default Pse.
            res = acc(f, k, args.lag, default_e, args.alphabet,
                      extra_index_file=args.e, all_prop=args.a, theta_type=theta_type)
        else:
            res = acc(f, k, args.lag, ind_list, args.alphabet,
                      extra_index_file=args.e, all_prop=args.a, theta_type=theta_type)

    # Write correspond res file.
    if args.f == 'tab':
        from util import write_tab
        write_tab(res, args.outputfile)
    elif args.f == 'svm':
        from util import write_libsvm
        write_libsvm(res, [args.l] * len(res), args.outputfile)
    elif args.f == 'csv':
        from util import write_csv
        write_csv(res, args.outputfile)
Example #9
def download(project_name, url, storyPointKey):
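    # Page through the project's resolved issues in blocks of `block_size`, extract each
    # issue's change history, and dump the collected records to CSV.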
    try:
        jira = JIRA(url, basic_auth=(credential.username, credential.password))
    except Exception:
        jira = JIRA(url)

    status = "Resolved, Done, Closed"
    # jql = 'project=' + project_name + \
    #       ' AND status in (' + status + ')' + \
    #       ' AND "' + storyPointKey + '" > 0'  # AND "Actual Story Points"  > 0'
    jql = 'project=' + project_name + \
          ' AND status in (' + status + ')' + \
          ' AND "' + storyPointKey + '" > 0 AND "Actual Story Points"  > 0'

    block_size = 100
    block_num = 0
    header_fields = None
    data_list = []
    while True:
        start_idx = block_num * block_size

        # just duplicate it, deepcopy sucks
        original_issues = jira.search_issues(jql,
                                             start_idx,
                                             block_size,
                                             expand="changelog")
        latest_issues = jira.search_issues(jql,
                                           start_idx,
                                           block_size,
                                           expand="changelog")

        issue_field_name_id, issue_field_id_name = get_field_name_id_list(jira)
        if len(original_issues) == 0:
            # Retrieve issues until there are no more to come
            break
        block_num += 1
        print("BLOCK = " + block_num.__str__())

        for x in range(len(original_issues)):
            original_issue = original_issues[x]
            latest_issue = latest_issues[x]
            # field_names, history = original_issue_extractor.run(original_issue, latest_issue, issue_field_name_id, issue_field_id_name)
            field_names, history = issue_extractor.run(original_issue,
                                                       latest_issue,
                                                       issue_field_name_id,
                                                       issue_field_id_name)
            for item in history:
                data_list.append(item)
            header_fields = field_names  # lowercase only
            # print('%s: %s' % (issue.key, issue.fields.summary))  # import csv

    util.write_csv(filename=project_name,
                   field_names=header_fields,
                   data_records=data_list)
Example #10
def build_paper_csv(pub_list, authors, whitelist):
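    # Emit one CSV row per (author, publication) pair; the author columns are repeated
    # on every row for that author.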
    schema = ['id', 'first_name', 'last_name', 'email', 'keys',
              'valid', 'pub_key', 'pub_title', 'pub_year', 'pub_authors']
    csv = []
    for k, author in authors.items():
        row = [k, author['first_name'], author['last_name'], author['email'],
               ";".join(author['keys']), 'x']
        for pub in author['pubs']:
            csv += [row + [pub['key'], pub['title'], pub['year'],
                           ';'.join(pub['authors'])]]
    write_csv(pub_list, schema, csv)
Example #11
def test1():
    fname = "./data/temperature.csv"
    city = "Denver"
    #city = "New York"
    #city = "Kansas City"
    #city = "Seattle"
    data = load(fname, city)
    util.write_csv(data, "./data/denver.csv")

    dat = [(city, data)]
    util.plot_figs(dat, "temperature")
    return
Example #12
def write_mw_prefixed_roots(prefixed_roots, unprefixed_roots, prefix_groups,
                            sandhi_rules, out_path):
    """Parse the prefixes in a prefix root and write the parsed roots."""

    with util.read_csv(prefix_groups) as reader:
        prefix_groups = {x['group']: x['prefixes'] for x in reader}
    with util.read_csv(unprefixed_roots) as reader:
        root_set = {(x['root'], x['hom']) for x in reader}

    candidate_homs = [None] + [str(i) for i in range(1, 10)]
    sandhi = make_sandhi_object(sandhi_rules)

    rows = []
    for row in util.read_csv_rows(prefixed_roots):
        for group in sandhi.split_off(row['prefixed_root'],
                                      row['unprefixed_root']):
            if group in prefix_groups:
                basis, hom = row['unprefixed_root'], row['hom']
                if (basis, hom) not in root_set:
                    for x in candidate_homs:
                        if (basis, x) in root_set:
                            hom = x
                            break
                    if (basis, hom) not in root_set:
                        continue

                rows.append((row['prefixed_root'], prefix_groups[group],
                             row['unprefixed_root'], hom))
                break

    labels = ['prefixed_root', 'prefixes', 'unprefixed_root', 'hom']
    with util.write_csv(out_path, labels) as write_row:
        for row in rows:
            write_row(dict(zip(labels, row)))
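The example above (and several of the later ones) calls util.write_csv as a context manager that yields a write_row callable taking one dict per row. A rough sketch of a helper with that behavior, assuming DictWriter-style output with a header line (an illustration, not the project's actual code):

import csv
from contextlib import contextmanager


@contextmanager
def write_csv(path, labels):
    # Yield a row writer that accepts dicts keyed by `labels`.
    with open(path, 'w', newline='') as fh:
        writer = csv.DictWriter(fh, fieldnames=labels)
        writer.writeheader()
        yield writer.writerow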
Example #13
def write_shs_verbal_indeclinables(adverbs_path, final_path, root_converter,
                                   out_path):
    """Write SHS verbal indeclinables."""
    labels = None
    clean_rows = []
    with util.read_csv(adverbs_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            row['root'] = root_pair[0]
            row['hom'] = root_pair[1]
            clean_rows.append(row)

    with util.read_csv(final_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue

            row['root'] = root_pair[0]
            row['hom'] = root_pair[1]
            # TODO: handle 'ya' gerunds
            if not row['form'].endswith('um'):
                continue
            clean_rows.append(row)

        labels = reader.fieldnames
        labels.insert(labels.index('root') + 1, 'hom')

    with util.write_csv(out_path, labels) as write_row:
        for row in clean_rows:
            write_row(row)
Example #14
def write_shs_verbal_data(data_path, root_converter, out_path):
    """Write Sanskrit Heritage Site data after converting its roots.

    :param data_path: path to the actual verb data
    :param root_converter: maps SHS roots to (MW root, homonym) pairs; rows
                           whose root is not in this map are skipped
    :param out_path: path to the output CSV
    """
    labels = None
    clean_rows = []
    with util.read_csv(data_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            root, hom = root_pair
            row['root'] = root
            row['hom'] = hom
            clean_rows.append(row)
        labels = reader.fieldnames
        labels.insert(labels.index('root') + 1, 'hom')

    with util.write_csv(out_path, labels) as write_row:
        for row in clean_rows:
            write_row(row)
Example #15
def run():
    # ========= DATASET 1 ========= #
    filepath = "./output/Base-DT-DS1.csv"

    X_train, Y_train = util.load_csv(util.train_1_filepath)
    X_test, Y_test = util.load_csv(util.test_with_label_1_filepath)

    clf = tree.DecisionTreeClassifier(criterion="entropy")
    # Train
    clf = clf.fit(X_train, Y_train)
    # Test/Predict
    Y_pred = clf.predict(X_test)
    # Confusion Matrix
    confusion_matrix = metrics.confusion_matrix(Y_test, Y_pred)
    metrics.plot_confusion_matrix(clf, X_test, Y_test)
    # Evaluation
    classification_report = metrics.classification_report(Y_test, Y_pred)
    # Debug print
    print_debug(1, clf, Y_pred, confusion_matrix, classification_report)
    # Save
    util.write_csv(filepath, Y_test, Y_pred, confusion_matrix)

    # ========= DATASET 2 ========= #
    filepath = "./output/Base-DT-DS2.csv"

    X_train, Y_train = util.load_csv(util.train_2_filepath)
    X_test, Y_test = util.load_csv(util.test_with_label_2_filepath)

    clf = tree.DecisionTreeClassifier(criterion="entropy")
    # Train
    clf = clf.fit(X_train, Y_train)
    # Test/Predict
    Y_pred = clf.predict(X_test)
    # Confusion Matrix
    confusion_matrix = metrics.confusion_matrix(Y_test, Y_pred)
    metrics.plot_confusion_matrix(clf, X_test, Y_test)
    # Evaluation
    classification_report = metrics.classification_report(Y_test, Y_pred)
    # Debug print
    print_debug(2, clf, Y_pred, confusion_matrix, classification_report)
    # Save
    util.write_csv(filepath, Y_test, Y_pred, confusion_matrix)


# DEBUG--------------------------------------------------------------------
#run()
Example #16
def main():
    data = util.read_csv('C:/test/csv/test.csv')

    new_data = []
    row_idx = 0
    col_idx = 0
    for row in data:
        row_idx = row_idx + 1
        new_row = []
        col_idx = 0
        for col in row:
            col_idx = col_idx + 1
            s = str(row_idx) + ':' + str(col_idx) + '=' + col
            print(s)
            new_row.append(col + '_NEW')
        new_data.append(new_row)

    util.write_csv('C:/tmp/newcsv.csv', new_data)
Example #17
def main(args):
    X = pd.read_csv(args.data)
    y = pd.read_csv(args.labels)

    X, y = SMOTE().fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    svclassifier = SVC(kernel='linear')
    svclassifier.fit(X_train, y_train)

    y_pred = svclassifier.predict(X_test)

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    X_test = pd.read_csv(args.test_set)
    y_pred = svclassifier.predict(X_test)
    util.write_csv('predicted_svm.csv', np.transpose(np.array(y_pred,
                                                              ndmin=2)))
Example #18
def run():
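    # Load precomputed image features, reuse a cached random forest model when one exists
    # (otherwise train on the first 80% of the data), then report MAE/MSE for samples from
    # index 300000 onward.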
    f1 = h5py.File('image_features_4096.hdf5', 'r')
    f2 = h5py.File('image_arr_N_224_224_3.hdf5', 'r')
    X = f1['X'][()]
    y = f2['y'][:len(X)]

    model_path = 'model/rf_8000.joblib.pkl'
    reg = None
    if os.path.exists(model_path):
        reg = joblib.load(model_path)
    else:
        train_num = int(len(X) * 0.8)
        reg = RandomForestRegressor(n_estimators=400, n_jobs=50)
        print("Training ...")
        reg.fit(X[:train_num], y[:train_num])
        _ = joblib.dump(reg, model_path)

    y_t = reg.predict(X[300000:])
    print("MAE: {}".format(mean_absolute_error(y[300000:], y_t)))
    print("MSE: {}".format(mean_squared_error(y[300000:], y_t)))
    util.write_csv(y_t, y[300000:], 'result/rf_all.csv')
Example #19
    def export_to_csv(self, results: Collection[RepoVersionDiffs],
                      repos: ResultMap[GitRepoPath],
                      roles: ResultMap[GalaxyRole]) -> None:
        header_files = ('role id', 'role name', 'owner', 'repo',
                        'touched file', 'insertions', 'deletions', 'v1..v2')
        header_lines = ('role id', 'role name', 'owner', 'repo', 'insertions',
                        'deletions', 'v1..v2')
        header_commits = ('role id', 'role name', 'owner', 'repo',
                          'commit sha1', 'author name', 'author email', 'date',
                          'v1..v2')

        files: List[Tuple[str, str, str, str, str, int, int, str]] = []
        lines: List[Tuple[str, str, str, str, int, int, str]] = []
        commits: List[Tuple[str, str, str, str, str, str, str, int, str]] = []
        for diffs in results:
            role = roles[diffs.id]
            for bump_diff in diffs.bumps:
                diff_id = bump_diff.id
                files.extend(
                    ((role.id, role.name, role.github_user, role.github_repo,
                      str(f.file_path), f.insertions, f.deletions, diff_id)
                     for f in bump_diff.touched_files))
                lines.append(
                    (role.id, role.name, role.github_user, role.github_repo,
                     bump_diff.insertions, bump_diff.deletions, diff_id))
                commits.extend(
                    ((role.id, role.name, role.github_user, role.github_repo,
                      commit.sha1, commit.author_name, commit.author_email,
                      commit.authored_date, diff_id)
                     for commit in bump_diff.commits))

        self.out.mkdir(exist_ok=True, parents=True)
        write_csv(self.out / 'commits.csv', header_commits, commits)
        write_csv(self.out / 'touched_files.csv', header_files, files)
        write_csv(self.out / 'touched_lines.csv', header_lines, lines)
Example #20
def run_dataset(filepath_train, filepath_test, filepath_output):
    
    x_train, y_train = util.load_csv(filepath_train)
    x_test, y_test = util.load_csv(filepath_test)
    
    clf = Perceptron()
    y_pred = clf.fit(x_train,y_train).predict(x_test)
    
    train_accuracy = clf.score(x_train, y_train)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)
    
    #confusion matrix
    cmatrix = metrics.confusion_matrix(y_test, y_pred)
    metrics.plot_confusion_matrix(clf, x_test, y_test)
    
    #evaluation
    classification_report = metrics.classification_report(y_test, y_pred)
    
    #print to output file
    util.write_csv(filepath_output, y_test, y_pred, cmatrix)
    
    #print to console for debug purposes
    print_result(clf, train_accuracy, test_accuracy, y_pred, cmatrix, classification_report, filepath_output)
Example #21
def run_dataset(filepath_train, filepath_test, filepath_output):

    x_train, y_train = util.load_csv(filepath_train)
    x_test, y_test = util.load_csv(filepath_test)

    gnb = GaussianNB()
    y_pred = gnb.fit(x_train, y_train).predict(x_test)

    train_accuracy = gnb.score(x_train, y_train)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)

    #confusion matrix
    cmatrix = metrics.confusion_matrix(y_test, y_pred)
    metrics.plot_confusion_matrix(gnb, x_test, y_test)

    #evaluation
    classification_report = metrics.classification_report(y_test, y_pred)

    #output file
    util.write_csv(filepath_output, y_test, y_pred, cmatrix)

    #Print result to console
    print_result(gnb, train_accuracy, test_accuracy, y_pred, cmatrix,
                 classification_report, filepath_output)
Example #22
def write_verb_prefixes(upasargas, other, out_path):
    with util.read_csv(upasargas) as reader:
        upasargas = list(reader)

    with util.read_csv(other) as reader:
        other = list(reader)
        labels = reader.fieldnames

    assert 'prefix_type' in labels
    for x in upasargas:
        assert 'prefix_type' not in x
        x['prefix_type'] = 'upasarga'

    rows = sorted(upasargas + other, key=lambda x: util.key_fn(x['name']))
    with util.write_csv(out_path, labels) as write_row:
        for row in rows:
            write_row(row)
Example #23
def write_prefixed_shs_verbal_indeclinables(final_path, sandhi_rules,
                                            prefixed_roots, root_converter,
                                            out_path):
    """Write prefixed SHS verbal indeclinables."""
    sandhi = make_sandhi_object(sandhi_rules)

    root_to_prefixed = {}
    with util.read_csv(prefixed_roots) as reader:
        for row in reader:
            root_to_prefixed.setdefault(row['unprefixed_root'], []).append(row)

    labels = None
    clean_rows = []
    with util.read_csv(final_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            root, hom = root_pair

            row['root'] = root
            for result in root_to_prefixed.get(root, []):
                new_row = row.copy()
                for field in ['form', 'stem']:
                    if field in row:
                        new_row[field] = sandhi.join(
                            result['prefixes'].split('-') + [new_row[field]])
                new_row['root'] = result['prefixed_root']
                new_row['hom'] = result['hom']
                clean_rows.append(new_row)

        labels = reader.fieldnames

    labels += ['hom']
    old_rows = list(util.read_csv_rows(out_path))
    clean_rows.sort(key=lambda x: util.key_fn(x['root']))
    with util.write_csv(out_path, labels) as write_row:
        for row in old_rows:
            write_row(row)
        for row in clean_rows:
            write_row(row)
Example #24
def write_prefixed_shs_verbal_data(data_path, prefixed_roots, root_converter,
                                   sandhi_rules, out_path):
    """Write Sanskrit Heritage Site data after converting its roots.

    :param data_path: path to the actual verb data
    :param out_path: path to the output CSV
    """
    sandhi = make_sandhi_object(sandhi_rules)

    root_to_prefixed = {}
    with util.read_csv(prefixed_roots) as reader:
        for row in reader:
            root_to_prefixed.setdefault(row['unprefixed_root'], []).append(row)

    labels = None
    clean_rows = []
    with util.read_csv(data_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            root, hom = root_pair

            for result in root_to_prefixed.get(root, []):
                new_row = row.copy()
                for field in ['form', 'stem']:
                    if field in row:
                        new_row[field] = sandhi.join(
                            result['prefixes'].split('-') + [new_row[field]])
                new_row['root'] = result['prefixed_root']
                new_row['hom'] = hom
                clean_rows.append(new_row)
        labels = reader.fieldnames + ['hom']

    old_rows = list(util.read_csv_rows(out_path))
    clean_rows.sort(key=lambda x: util.key_fn(x['root']))
    with util.write_csv(out_path, labels) as write_row:
        for row in old_rows:
            write_row(row)
        for row in clean_rows:
            write_row(row)
Example #25
def main():
    global queue_size

    # Initialize Myo and create a Hub and our listener.
    myo.init(sdk_path='./myo_sdk/')
    hub = myo.Hub()
    listener = Listener(queue_size)

    def get_data():
        global count
        global flg_get_data
        global all_data

        emgs = np.array([x[1] for x in listener.get_emg_data()]).T
        # print('emgs')
        # print(emgs)
        if emgs.shape == (8, queue_size):
            if count > DATANUM_TOTAL - 2:
                flg_get_data = False

            label_index = int(count / DATANUM_EACH) % LEBELS_NUM
            label = LABELS[label_index]
            label = np.array(label).astype('int32')
            print(
                f' label_index: {label_index} {LABELS[label_index]}  {count+1}/{DATANUM_TOTAL}'
            )
            count += 1

            # print(type(emgs))
            # print(emgs.shape)
            # print(emgs[0])
            f = emgs
            m = np.mean(f, axis=1)
            v = np.var(f, axis=1)
            # m_norm = normalize(m)
            # print(m_norm)
            # print(m)
            # print(normalize(v))
            # print(v)
            # print(sigmoid(v - np.mean(v)))
            # v_chg = sigmoid(v - np.mean(v))
            m = np.mean(np.abs(f), axis=1)
            # print(np.mean(np.abs(f), axis=1))
            F = np.fft.fft(f)
            Amp = np.abs(F)
            first_Amp = Amp[:, 0:int(queue_size / 2)]

            # size: 8*queue_size/2
            flat_Amp = np.reshape(first_Amp, (1, int(8 * queue_size / 2)))[0]
            flat_Amp_norm = normalize(flat_Amp)
            # print(flat_Amp_norm)
            # size: len(label) + 8*queue_size/2
            # save_data = np.hstack((label, flat_Amp))
            save_data = np.hstack((label, m, flat_Amp_norm))

            save_data = list(save_data)
            # print('save_data', save_data)
            # print(save_data)
            all_data = data_append(all_data, save_data)

        else:
            print("buffering")
            # print(emgs)

    try:
        threading.Thread(
            target=lambda: hub.run_forever(listener.on_event)).start()

        while flg_get_data:
            get_data()
            time.sleep(1)

        # save file
        print('saving data...')
        print('Do not remove Myo')
        print(np.array(all_data).shape)
        write_csv(all_data, SAVE_DATA_PATH)
        print('finish', SAVE_DATA_PATH)
    finally:
        hub.stop()
Example #26
def write_output_csv(rows):
    if output_csv is not None:
        ut.write_csv(rows, output_csv)
Example #27
x_range = [-100.0, 100.0]

N_train = int(args.Ntrain)
N_test = int(args.Ntest)
N = N_train + N_test

# choose model parameters from the command line
if args.beta == 0.0:
    beta = np.array(args.beta)
    beta = np.reshape(beta, (1, beta.shape[0]))
else:
    # generate model parameters
    beta = s.get_model_parameters(beta_range, args.dim+1)

# generate data
X = s.generate_data(x_range, N, args.dim)
# split data into train and test and normalize
X_train, X_test = s.split_data(X, N_train)
X_train, X_test = s.normalize_sets(X_train, X_test)
# add bias
X_train, X_test = s.add_bias(X_train, X_test)

Y_train = s.get_labels(X_train, beta)
Y_test = s.get_labels(X_test, beta)

util.write_csv('./beta_{0}.csv'.format(args.dim), 'beta', beta, args.precision)
util.write_csv('./X_train.csv', 'x', X_train, args.precision)
util.write_csv('./Y_train.csv', 'y', Y_train, args.precision)
util.write_csv('./X_test.csv', 'x', X_test, args.precision)
util.write_csv('./Y_test.csv', 'y', Y_test, args.precision)
Example #28
def main(args):
    # Set revcomp parameter.
    if args.r != 1:
        args.r = False
    elif args.r == 1 and args.alphabet != 'DNA':
        print("Error, the -r parameter can only be used in DNA.")
    elif args.r == 1 and args.alphabet == 'DNA':
        args.r = True

    # Set alphabet parameter.
    if args.alphabet == 'DNA':
        args.alphabet = index_list.DNA
    elif args.alphabet == 'RNA':
        args.alphabet = index_list.RNA
    elif args.alphabet == 'Protein':
        args.alphabet = index_list.PROTEIN

    if args.method.upper() == 'KMER':
        if args.k is None:
            print("parameter k is required. The default value of k is 2.")
            args.k = 2
        if args.r is None:
            print("parameter r is required. The default value of r is 0.")
            args.r = 0
        res = make_kmer_vector(k=args.k, alphabet=args.alphabet, filename=args.inputfile, revcomp=args.r)
    elif args.method.upper() == 'IDKMER':
        if args.k is None:
            print("parameter k is required. The default value of k is 6.")
            args.k = 6
        if args.ps is None or args.ns is None:
            print('The positive and the negative source files are required.')
            return False
        res = idkmer(k=args.k, filename=args.inputfile, pos_src_name=args.ps, neg_src_name=args.ns)
    elif args.method.upper() == "MISMATCH":
        if args.k is None:
            print("parameter k is required. The default value of k is 3.")
            args.k = 3
        if args.m is None:
            print("parameter m is required. The default value of m is 1.")
            args.m = 1
        if args.m >= args.k:
            print("parameter m should be less than parameter k.")
        else:
            res = getMismatchProfileMatrix(args.inputfile, args.alphabet, args.k, args.m)
    elif args.method.upper() == "SUBSEQUENCE":
        if args.delta is None:
            print("parameter delta is required. The default value of delta is 1.")
            args.delta = 1
        elif args.delta > 1 or args.delta < 0:
            print("delta should be greater than or equal to 0 and less than or equal to 1.")
        if args.k is None:
            print("parameter k is required. The default value of k is 3.")
            args.k = 3
        res = getSubsequenceProfileByParallel(filename=args.inputfile, alphabet=args.alphabet, k=args.k, delta=args.delta)
    elif args.method.upper() == 'DR':
        if args.alphabet != index_list.PROTEIN:
            print('DR method is only available for Protein.')
            return False
        elif args.max_dis < 0 or args.max_dis > 10:
            print('The max distance cannot be a negative integer and should be smaller than 11.')
            return False
        else:
            res = dr_method(inputfile=args.inputfile, max_dis=args.max_dis)
            print(res)
    elif args.method.upper() == 'DP':
        if args.alphabet != index_list.PROTEIN:
            print('Distance Pair method is only available for Protein.')
            return False
        elif args.max_dis < 0 or args.max_dis > 10:
            print('The max distance cannot be a negative integer and should be smaller than 11.')
            return False
        else:
            if args.cp == 'cp_13':
                reduce_alphabet_scheme = const.cp_13
            elif args.cp == 'cp_14':
                reduce_alphabet_scheme = const.cp_14
            elif args.cp == 'cp_19':
                reduce_alphabet_scheme = const.cp_19
            elif args.cp == 'cp_20':
                reduce_alphabet_scheme = const.cp_20
            res = get_pseaacdis_matrix(filename=args.inputfile, reduce_alphabet_scheme=reduce_alphabet_scheme,
                                       max_distance=args.max_dis, alphabet=args.alphabet)

    else:
        print("Method error!")

    # Write correspond res file.
    if args.f == 'svm':
        if args.multi == 0 and args.l is None:
            args.l = '+1'
        elif args.multi == 0 and (args.l != '+1' and args.l != '-1'):
            print("For binary classification, the label should be either '+1' or '-1'.")
            return False
        elif args.multi == 1 and args.l is None:
            args.l = '0'
        elif args.multi == 1 and args.l is not None:
            try:
                label = int(args.l)
            except ValueError:
                print('The labels should be integers.')
                return False
        from util import write_libsvm
        write_libsvm(res, [args.l] * len(res), args.outputfile)
    elif args.f == 'tab':
        from util import write_tab
        write_tab(res, args.outputfile)
    elif args.f == 'csv':
        from util import write_csv
        write_csv(res, args.outputfile)
Example #29
def write_prefix_groups(prefixed_roots, unprefixed_roots, upasargas, other,
                        sandhi_rules, out_path):
    """Parse the prefixes in a prefix root and write out the prefix groups.

    The procedure is roughly as follows:

        for each prefixed root in `prefixed_roots`:
            find (p_1, ..., p_n, r), where p_x is a prefix and r is a root
            write the prefix group (p_1, ..., p_n) to file.

    We find (p_1, ..., p_n) by using the rules in `sandhi_rules` and verify
    that `p_x` is a prefix by checking for membership in `upasargas` and
    `other`.
    """

    # Loading prefixes
    all_prefixes = set()
    with util.read_csv(upasargas) as reader:
        all_prefixes.update([x['name'] for x in reader])
    with util.read_csv(other) as reader:
        all_prefixes.update([x['name'] for x in reader])

    # The 's' prefix is used in roots like 'saMskf' and 'parizkf'. Although it
    # is prefixed to a verb, it is not semantically the same as the other verb
    # prefixes. Here, though, we treat it as a verb prefix.
    all_prefixes.add('s')

    # Some prefixes have alternate forms.
    prefix_alternates = {
        'pi': 'api',
        'ut': 'ud',
        'Ri': 'ni',
        'niz': 'nis',
        'iz': 'nis',
        'palA': 'parA',
        'pali': 'pari',
        'z': 's',
    }
    all_prefixes.update(prefix_alternates.keys())

    # Loading sandhi rules
    sandhi = make_sandhi_object(sandhi_rules)

    with util.read_csv(prefixed_roots) as reader:
        rows = []
        for row in reader:
            # Nibble away at `prefixed_root` until we have all prefixes for the
            # given root.
            prefixes = []
            prefixed_root = row['prefixed_root']
            unprefixed_root = row['unprefixed_root']
            last_letter = None

            q = Queue.PriorityQueue()
            for remainder in sandhi.split_off(prefixed_root, unprefixed_root):
                q.put_nowait((0, (), remainder))

            while not q.empty():
                _, cur_prefixes, remainder = q.get_nowait()

                # `remainder` is something we recognize: we're done!
                if remainder in all_prefixes:
                    prefixes = list(cur_prefixes)
                    if remainder:
                        prefixes.append(remainder)
                        last_letter = remainder[-1]
                    break

                for before, after in sandhi.splits(remainder):
                    # Prevent recursion. As of this comment, the `splits` method
                    # returns the non-split of some term X as (X, ''). In other
                    # words, this conditional will *never* be true. But since the
                    # behavior of various functions is still unsettled, this check
                    # will stay here for the time being.
                    if after == remainder:
                        continue

                    if before in all_prefixes:
                        state = (cur_prefixes + (before, ), after)
                        cost = len(after)

                        # Incentivize short vowels. This avoids errors with roots
                        # like "upodgrah" ("upa-ud-grah"). Without the incentive,
                        # we could have "upa-A-ud-grah" instead.
                        if before and before[-1] in 'aiufx':
                            cost -= 1
                        q.put_nowait((cost, ) + state)

            # Convert 'alternate' prefixes back to their original forms.
            prefixes = [prefix_alternates.get(x, x) for x in prefixes]
            if not prefixes:
                # Occurs if the root's prefix is unrecognized
                continue

            # We still don't know the prefix group. We can find it by splitting
            # off the root and keeping whatever matches `last_letter`.
            for group in sandhi.split_off(prefixed_root, unprefixed_root):
                if group[-1] == last_letter:
                    break
            prefix_string = '-'.join(prefixes)
            rows.append((group, prefix_string))

    labels = ['group', 'prefixes']
    with util.write_csv(out_path, labels) as write_row:
        for row in util.unique(rows):
            datum = dict(zip(labels, row))
            write_row(datum)
Example #30
#! /usr/bin/env python3

import re

import util

URL = "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes?action=raw"
CACHE = "article.wiki"
COLUMNS = ["Code", "Message"]


def scrape():
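    # Yield (status code, message) pairs parsed from definition-list lines of the
    # cached wiki article source.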
    for line in open(util.get_cache_file(CACHE, URL)):
        m = re.match(r"^;\{\{.*?\}\}(\d{3}) (.*?)\s*$", line)
        if m:
            yield m.group(1), m.group(2).replace("[[", "").replace("]]", "")


if __name__ == "__main__":
    util.write_csv(COLUMNS, scrape())