Example #1
def verify_negatives(lat: list,
                     node_motivation=False,
                     composite=False,
                     verbose=False) -> bool:
    """
    :lat     list of directory names
    :returns True iff the runs were made with the same negatives
    """
    if node_motivation:
        # node-motivation runs write two node-level negatives files
        negfilenames = ['negatives-nodes.csv', 'negatives-nodes-ignoreadj.csv']
    elif composite:
        # skip the check for composite runs
        return True
    else:
        negfilenames = ['negatives.csv']

    toReturn = {}
    for negfilename in negfilenames:
        #get one of the lists of negatives as a dataframe
        df = utils.read_df(lat[0], negfilename)

        if verbose:
            for j in lat[1:]:
                print(os.path.join(j, negfilename),
                      df.equals(utils.read_df(j, negfilename)))
        # exploiting transitivity of equality, compare the first run against all the others
        toReturn[negfilename] = all(
            df.equals(utils.read_df(j, negfilename)) for j in lat[1:])
    return all(toReturn.values())
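Note: utils.read_df is called throughout this example with a directory and a file name. A minimal sketch of such a helper, assuming it simply joins the path and delegates to pandas.read_csv (hypothetical; the project's actual helper may differ):

import os

import pandas as pd


def read_df(directory: str, filename: str) -> pd.DataFrame:
    # hypothetical helper: load <directory>/<filename> as a DataFrame
    return pd.read_csv(os.path.join(directory, filename))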
Example #2
File: settings.py Project: sg-s/pyvocab
def build_distractor_db():
    bar = st.progress(0)

    words = read_df("words.csv")
    words = words["word"]

    distractors = []

    for i, word in enumerate(words):

        percent_complete = int(i / len(words) * 100)

        bar.progress(percent_complete)
        distractors.append(find_n_closest_words(word))
    bar.progress(100)

    distractors_db = dict()
    distractors_db["word"] = words
    distractors_db["distractors"] = distractors

    distractors_db = pd.DataFrame(distractors_db)

    st.write(distractors_db)

    save_df(distractors_db, "distractors.csv")
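read_df and save_df in this project take bare file names. A minimal sketch, assuming they are thin wrappers around pandas CSV I/O (the actual pyvocab utils module may resolve paths differently):

import pandas as pd


def read_df(filename: str) -> pd.DataFrame:
    # hypothetical wrapper around pandas.read_csv
    return pd.read_csv(filename)


def save_df(df: pd.DataFrame, filename: str) -> None:
    # hypothetical wrapper around DataFrame.to_csv
    df.to_csv(filename, index=False)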
Example #3
File: settings.py Project: sg-s/pyvocab
def check_db():

    words = read_df("words.csv")

    bar = st.progress(0)

    bad_words = []

    for index, row in words.iterrows():

        percent_complete = int(index / len(words) * 100)

        bar.progress(percent_complete)

        if isinstance(words["definition"][index], str):
            continue

        if np.isnan(row["definition"]):
            bad_words.append(row["word"])

    bar.progress(100)

    if len(bad_words) > 0:
        st.write(bad_words)
    else:
        st.success("database looks good!")
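The row-by-row NaN check above can also be written as a single vectorized mask. A self-contained sketch with toy data in place of words.csv:

import pandas as pd

words = pd.DataFrame({"word": ["apple", "banana"],
                      "definition": ["a fruit", None]})
# rows whose definition is missing, found without iterating
bad_words = words.loc[words["definition"].isna(), "word"].tolist()
print(bad_words)  # ['banana']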
Example #4
    def __init__(self,
                 imdir,
                 metafile,
                 train=True,
                 transform=None,
                 filter=None):
        super().__init__()
        self.imdir = imdir
        self.metafile = metafile
        self.train = train
        self.transform = transform
        self.filter = filter

        self.cols = ['wealth', 'water_src', 'toilet_type', 'roof']

        metainfo = read_df(metafile)
        # TODO: Only consider RURAL AREAS
        metainfo = metainfo[metainfo.uor == 'R']
        metainfo = metainfo[['cluster'] + self.cols]

        # if self.filter:
        #     metainfo = metainfo[metainfo[target] == filter]

        self.targets = dict()
        for col in self.cols:
            unique_classes = sorted(metainfo[col].unique())
            self.targets[col] = {
                'classes': unique_classes,
                'o2i': {o: i
                        for i, o in enumerate(unique_classes)}
            }

        self.data = self.split(metainfo, 0.1, 42)
Example #5
def pr(name: str, edges=True) -> (list, list):
    """
    :name    name of directory
    :edges   (trivalent) boolean as to whether we are plotting edges, nodes, or both
    :returns recall, precision
    """
    #fetch precision and recall
    if edges == True:
        df = utils.read_df(name, 'pr-edges.csv')
    elif edges == False:
        df = utils.read_df(name, 'pr-nodes.csv')
    df = df.sort_values(by=['recall', 'precision'], ascending=[True, False])
    #df = df.sort_values('precision',ascending=False)
    recall = list(df['recall'])
    precision = list(df['precision'])
    return recall, precision
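The returned lists are sorted by recall, so they can be plotted directly. A minimal usage sketch with matplotlib; the directory name is hypothetical:

import matplotlib.pyplot as plt

recall, precision = pr('results/run1')  # hypothetical run directory
plt.plot(recall, precision)
plt.xlabel('recall')
plt.ylabel('precision')
plt.show()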
Example #6
def main():
    data = read_df(data_dir/'sentinel_main.csv')
    image_list = read_images(image_dir)

    for ((state, region), group) in data.groupby(['state', 'region']):
        with ThreadPoolExecutor(max_workers=5) as executor:
            rows = [(Coordinate(row['lat'], row['lng']), row['cluster'])
                    for _, row in group.iterrows() if str(row['cluster']) not in image_list]

            results = list(tqdm(executor.map(process, rows), desc=f'State: {state}, Region: {region}', total=len(rows)))
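The pattern above wraps executor.map in tqdm so the progress bar advances as results arrive. A self-contained sketch of the same pattern with a dummy process function:

from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm


def process(item):
    # stand-in for the real per-row download/processing step
    return item * 2


rows = list(range(100))
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(tqdm(executor.map(process, rows), total=len(rows)))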
Example #7
def main():
    # run once!
    dfs_grouped_company = utils.read_df(file_path, tsv_header)

    split_train_test_file(dfs_grouped_company)

    dump_ks_test_data(dfs_grouped_company)

    dump_scaling_factor(dfs_grouped_company)

    dump_ticker_names(dfs_grouped_company)
Example #8
def match_file2image(imdir, metafile):
    ims = set(
        int(p.stem) for p in imdir.glob('*.png')
        if not p.stem.endswith('mask'))
    metainfo = read_df(metafile)
    missing = [
        row['cluster'] for (_, row) in metainfo.iterrows()
        if row['cluster'] not in ims
    ]

    metainfo[~metainfo.cluster.isin(missing)].to_csv(metafile, index=False)
Example #9
    def __init__(self,
                 imdir,
                 metafile,
                 train=True,
                 transform=None,
                 filter=None):
        super().__init__()
        self.imdir = imdir
        self.metafile = metafile
        self.train = train
        self.transform = transform
        self.filter = filter

        self.cols = [
            'wealth', 'water_src', 'toilet_type', 'roof', 'cooking_fuel',
            'drought', 'pop_density', 'livestock_bin', 'agriculture_land_bin'
        ]

        metainfo = read_df(metafile)
        # TODO: Only consider RURAL AREAS
        # metainfo = metainfo[metainfo.uor == 'R']
        metainfo = metainfo[['cluster'] + self.cols]

        # if self.filter:
        #     metainfo = metainfo[metainfo[target] == filter]

        self.targets = dict()
        for col in self.cols:
            unique_classes = sorted(metainfo[col].unique())
            weights = {
                k: 1 / v
                for k, v in metainfo[col].value_counts().to_dict().items()
            }

            self.targets[col] = {
                'classes': unique_classes,
                'o2i': {o: i
                        for i, o in enumerate(unique_classes)},
                'i2p': {i: weights[o]
                        for i, o in enumerate(unique_classes)}
            }

        self.data = self.split(metainfo, 0.1, 42)
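The o2i/i2p dictionaries map each class label to an index and to an inverse-frequency weight. The same construction on a toy column:

import pandas as pd

col = pd.Series(['low', 'high', 'low', 'mid', 'low'])

unique_classes = sorted(col.unique())
weights = {k: 1 / v for k, v in col.value_counts().to_dict().items()}

o2i = {o: i for i, o in enumerate(unique_classes)}           # {'high': 0, 'low': 1, 'mid': 2}
i2p = {i: weights[o] for i, o in enumerate(unique_classes)}  # {0: 1.0, 1: 0.33..., 2: 1.0}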
Example #10
    def __init__(self,
                 metric,
                 imdir,
                 metafile,
                 train=True,
                 transform=None,
                 filter=None):
        super().__init__()
        self.imdir = imdir
        self.metafile = metafile
        self.train = train
        self.transform = transform
        self.filter = filter

        self.cols = [metric]

        metainfo = read_df(metafile)
        # TODO: Only consider RURAL AREAS
        # metainfo = metainfo[metainfo.uor == 'R']
        metainfo = metainfo[['cluster'] + self.cols]

        # if self.filter:
        #     metainfo = metainfo[metainfo[target] == filter]

        self.targets = dict()
        for col in self.cols:
            # print(col)
            unique_classes = sorted(metainfo[col].unique())
            weights = {
                k: 1 / v
                for k, v in metainfo[col].value_counts().to_dict().items()
            }

            self.targets[col] = {
                'classes': unique_classes,
                'o2i': {o: i
                        for i, o in enumerate(unique_classes)},
                'i2p': {i: weights[o]
                        for i, o in enumerate(unique_classes)}
            }

        self.data = self.split(metainfo, 0.1, 42)
Example #11
File: settings.py Project: sg-s/pyvocab
def find_n_closest_words(target_word, n=5):
    """find  n closest words to word using the Levenstein distance"""

    words = read_df("words.csv")
    words = list(words["word"])

    # make a list of possible distractor words
    distractor_words = words

    # find distances to all possible distractors
    distances = []
    for word in distractor_words:
        distances.append(distance(word, target_word))

    distances = np.array(distances)

    # reorder distractors by distance
    distractor_words = list(pd.Series(distractor_words)[np.argsort(distances)])

    distractor_words = distractor_words[1 : n + 1]
    return distractor_words
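distance here is presumably a Levenshtein edit-distance function (e.g. Levenshtein.distance from the python-Levenshtein package), and the [1 : n + 1] slice drops the target word itself, which has distance 0. A quick check under that assumption:

from Levenshtein import distance  # assumption: this is the metric the project imports

print(distance("apple", "ample"))  # 1 (one substitution)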
Example #12
    def __init__(self,
                 imdir,
                 metafile,
                 imid,
                 target,
                 train=True,
                 transform=None,
                 filter=None):
        super().__init__()
        self.imdir = imdir
        self.metafile = metafile
        self.imid = imid
        self.target = target
        self.train = train
        self.transform = transform
        self.filter = filter

        metainfo = read_df(metafile)[[imid, target]]
        if self.filter:
            metainfo = metainfo[metainfo[target] == filter]

        self.classes = sorted(metainfo[target].unique())
        self.o2i = {o: i for i, o in enumerate(self.classes)}
        self.data = self.split(metainfo, 0.2, 42)
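self.split is not shown in these excerpts. A plausible minimal sketch, assuming it does a seeded train/validation partition and returns the slice selected by self.train:

    def split(self, metainfo, val_frac, seed):
        # hypothetical implementation: seeded sample for validation,
        # the remainder for training, selected by self.train
        val = metainfo.sample(frac=val_frac, random_state=seed)
        train = metainfo.drop(val.index)
        return train if self.train else val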
Example #13
File: 9rda.py Project: cfriedline/burt
pandas2ri.activate()
from IPython.display import display

# %%
analysis_dir = "/gpfs_fs/home/eckertlab/projects/burt/seq/dedupe/work/samtools1.3"

# %%
spp = ["E", "G", "P", "T"]

# %%
z12_df = {}
for s in spp:
    d = os.path.join(analysis_dir, s)
    print(d)
    if not "beagle" in d:
        z12_df[d] = read_df(d, "z12_swapped")

# %%
test_key = '/gpfs_fs/home/eckertlab/projects/burt/seq/dedupe/work/samtools1.3/E'

# %%
z12_df[test_key].head()

# %%
for k in z12_df:
    v = z12_df[k]
    v['population'] = v.apply(lambda x: "-".join(x.name.split("-")[0:-1]), axis=1)
    v = v.replace(-1, 9)
    z12_df[k] = v

# %%
Example #14
        parser.print_help()
        #sys.exit(1)
    if not os.path.exists(args.filename):
        print("Sorry, file ", args.filename, "does not exists")
        #sys.exit(1)

    inputfile = args.filename

if 1:
    inputfile = 'wdir/hic_to_scaff_fortraining.als'
    min_size = 0

    try:
        df.head()
    except NameError:
        df = utils.read_df(inputfile, min_size, 1)
        df = df.rename(
            columns={
                0: 'target',
                1: 'scaff1',
                2: 'scaff2',
                3: 'lscaff1',
                4: 'lscaff2',
                5: 'nlinks'
            })  # only true for train sample!

    try:
        df0
    except NameError:
        df1 = df.groupby(['target']).get_group(1)
        df0 = df.groupby(['target']).get_group(0)
Example #15
    argparser.add_argument("--df_corpus", type=str, default="")
    argparser.add_argument("--tags_file", type=str, default="")
    argparser.add_argument("--model", type=str)
    argparser.add_argument("--layer", type=str, default="lstm")
    argparser.add_argument("--max_seq_len", type=int, default=100)
    argparser.add_argument("--mlp_dim", type=int, default=50)  # to write in
    argparser.add_argument("--out_dir", type=str)
    argparser.add_argument("--full_results_file", type=str,
                           default="")  # to write in
    argparser.add_argument("--results_file", type=str,
                           default="")  # to write in

    args = argparser.parse_args()
    print('\n', args, '\n')

    df = read_df(args.df_corpus)
    df = df.fillna(u'')

    label_tags = pickle.load(open(args.tags_file, 'rb'))

    raw_corpus = myio.read_corpus(args.corpus_w_tags, with_tags=True)

    embedding_layer = create_embedding_layer(n_d=10,
                                             embs=load_embedding_iterator(
                                                 args.embeddings),
                                             only_words=False)

    with tf.Session() as sess:

        myqrapi = TPAPI(args.model, embedding_layer, sess, len(label_tags),
                        args.layer)
Example #16
File: game.py Project: sg-s/pyvocab
import random

import streamlit as st
from utils import read_df

st.set_page_config(page_title="pyvocab", layout="wide")

if "correct_streak" not in st.session_state:
    st.session_state.correct_streak = 0

if "words" not in st.session_state:
    st.session_state.words = read_df("words.csv")

if "distractors" not in st.session_state:
    st.session_state.distractors = read_df("distractors.csv")

words = st.session_state.words
distractors = st.session_state.distractors

this_word = words.sample()

st.write("#")
st.write("#")

st.write(" ## *" + this_word["definition"].iloc[0].strip() + "*")

st.write("#")
st.write("#")

other_words = words.sample(n=5)
Example #17
    return data_frame


def remake_training_file_for_question_ranking(new_train_rows, out_file):
    with open(out_file, 'w') as f:
        for x in new_train_rows:
            q_ids_similar = " ".join([str(q) for q in x[1]])
            q_ids_candidates = " ".join([str(q) for q in x[2]])
            f.write('{}\t{}\t{}\n'.format(str(x[0]), q_ids_similar,
                                          q_ids_candidates))


if __name__ == '__main__':
    df = read_df(
        '/home/christina/Documents/Thesis/data/askubuntu/additional/data_frame_corpus_str.csv'
    )
    print('total ids:', df.shape[0])

    E = read_eval_rows(
        '/home/christina/Documents/Thesis/data/askubuntu/test.txt')
    test_ids = get_eval_ids(E)
    E = read_eval_rows(
        '/home/christina/Documents/Thesis/data/askubuntu/dev.txt')
    dev_ids = get_eval_ids(E)

    eval_ids = test_ids | dev_ids
    print('total eval ids:', len(eval_ids))

    T = list(
        read_eval_rows(