def verify_negatives(lat: list, node_motivation=False, composite=False, verbose=False) -> bool:
    """
    :lat list of directory names
    :returns True iff the runs were made with the same negatives
    """
    if node_motivation:
        ## skip this for now if node_motivation is specified.
        negfilenames = ['negatives-nodes.csv', 'negatives-nodes-ignoreadj.csv']
    elif composite:
        return True
    else:
        negfilenames = ['negatives.csv']
    toReturn = {}
    for negfilename in negfilenames:
        # get one of the lists of negatives as a dataframe
        df = utils.read_df(lat[0], negfilename)
        if verbose:
            for j in lat[1:]:
                print(os.path.join(j, negfilename),
                      df.equals(utils.read_df(j, negfilename)))
        # exploiting transitivity of identity, compare the first against all others
        toReturn[negfilename] = all(
            df.equals(i) for i in [utils.read_df(j, negfilename) for j in lat[1:]])
    return all(toReturn.values())
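# Hedged usage sketch: the run directories below are hypothetical, and the
# on-disk layout (each directory holding a negatives.csv) is assumed from the
# utils.read_df calls above. verify_negatives guards against comparing runs
# that were evaluated against different negative sets.
run_dirs = ['runs/method-a', 'runs/method-b', 'runs/method-c']  # hypothetical
if not verify_negatives(run_dirs, verbose=True):
    raise ValueError('runs used different negative sets; comparison is invalid')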
def build_distractor_db():
    bar = st.progress(0)
    words = read_df("words.csv")
    words = words["word"]
    distractors = []
    for i, word in enumerate(words):
        percent_complete = int(i / len(words) * 100)
        bar.progress(percent_complete)
        distractors.append(find_n_closest_words(word))
    bar.progress(100)
    distractors_db = dict()
    distractors_db["word"] = words
    distractors_db["distractors"] = distractors
    distractors_db = pd.DataFrame(distractors_db)
    st.write(distractors_db)
    save_df(distractors_db, "distractors.csv")
def check_db():
    words = read_df("words.csv")
    bar = st.progress(0)
    bad_words = []
    for index, row in words.iterrows():
        # multiply before int() truncates; int(index / len(words)) * 100 would
        # pin the bar at 0 for the whole loop
        percent_complete = int(index / len(words) * 100)
        bar.progress(percent_complete)
        if isinstance(row["definition"], str):
            continue
        if np.isnan(row["definition"]):
            bad_words.append(row["word"])
    bar.progress(100)
    if len(bad_words) > 0:
        st.write(bad_words)
    else:
        st.success("database looks good!")
def __init__(self, imdir, metafile, train=True, transform=None, filter=None):
    super().__init__()
    self.imdir = imdir
    self.metafile = metafile
    self.train = train
    self.transform = transform
    self.filter = filter
    self.cols = ['wealth', 'water_src', 'toilet_type', 'roof']

    metainfo = read_df(metafile)
    # TODO: Only consider RURAL AREAS
    metainfo = metainfo[metainfo.uor == 'R']
    metainfo = metainfo[['cluster'] + self.cols]
    # if self.filter:
    #     metainfo = metainfo[metainfo[target] == filter]

    self.targets = dict()
    for col in self.cols:
        unique_classes = sorted(metainfo[col].unique())
        self.targets[col] = {
            'classes': unique_classes,
            'o2i': {o: i for i, o in enumerate(unique_classes)}
        }
    self.data = self.split(metainfo, 0.1, 42)
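# The split() method that this and the other dataset classes below call is not
# shown anywhere in this collection. A minimal sketch of what it presumably
# does, assuming the arguments are (dataframe, test_fraction, random_seed) and
# that self.train selects which partition to keep -- an assumption, not the
# project's actual code:
def split(self, metainfo, test_frac, seed):
    # shuffle rows deterministically, then carve off the first test_frac
    shuffled = metainfo.sample(frac=1, random_state=seed).reset_index(drop=True)
    n_test = int(len(shuffled) * test_frac)
    return shuffled[n_test:] if self.train else shuffled[:n_test]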
def pr(name: str, edges=True) -> (list, list):
    """
    :name name of directory
    :edges (trivalent) boolean as to whether we are plotting edges, nodes or both
    :returns recall, precision
    """
    # fetch precision and recall
    if edges is True:
        df = utils.read_df(name, 'pr-edges.csv')
    elif edges is False:
        df = utils.read_df(name, 'pr-nodes.csv')
    else:
        # the docstring's third ("both") case was never implemented; fail
        # loudly instead of leaving df unbound
        raise ValueError(f'unsupported edges value: {edges!r}')
    df = df.sort_values(by=['recall', 'precision'], ascending=[True, False])
    # df = df.sort_values('precision', ascending=False)
    recall = list(df['recall'])
    precision = list(df['precision'])
    return recall, precision
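# Hedged usage sketch: 'runs/method-a' is a hypothetical directory. pr()
# returns the curve already sorted by recall, so it can be plotted directly.
import matplotlib.pyplot as plt

recall, precision = pr('runs/method-a', edges=True)
plt.plot(recall, precision)
plt.xlabel('recall')
plt.ylabel('precision')
plt.show()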
def main():
    data = read_df(data_dir / 'sentinel_main.csv')
    image_list = read_images(image_dir)
    for ((state, region), group) in data.groupby(['state', 'region']):
        with ThreadPoolExecutor(max_workers=5) as executor:
            # only fetch clusters whose image has not been downloaded yet
            rows = [(Coordinate(row['lat'], row['lng']), row['cluster'])
                    for _, row in group.iterrows()
                    if str(row['cluster']) not in image_list]
            results = list(
                tqdm(executor.map(process, rows),
                     desc=f'State: {state}, Region: {region}',
                     total=len(rows)))
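# Coordinate and process() are defined elsewhere in this project; a minimal
# sketch of plausible shapes, assuming process() fetches one image per cluster
# and is therefore safe to fan out across threads. Everything below, including
# download_tile, is a hypothetical stand-in, not the project's actual code.
from collections import namedtuple

Coordinate = namedtuple('Coordinate', ['lat', 'lng'])

def process(row):
    coord, cluster = row
    # hypothetical helper: fetch the Sentinel tile for this coordinate and
    # save it under the cluster id so read_images() skips it on the next run
    img = download_tile(coord.lat, coord.lng)
    img.save(image_dir / f'{cluster}.png')
    return cluster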
def main():
    # run once!
    dfs_grouped_company = utils.read_df(file_path, tsv_header)
    split_train_test_file(dfs_grouped_company)
    dump_ks_test_data(dfs_grouped_company)
    dump_scaling_factor(dfs_grouped_company)
    dump_ticker_names(dfs_grouped_company)
def match_file2image(imdir, metafile):
    # cluster ids that actually have a downloaded image (masks excluded)
    ims = set(
        int(p.stem) for p in imdir.glob('*.png') if not p.stem.endswith('mask'))
    metainfo = read_df(metafile)
    missing = [
        row['cluster'] for (_, row) in metainfo.iterrows()
        if row['cluster'] not in ims
    ]
    metainfo[~metainfo.cluster.isin(missing)].to_csv(metafile, index=False)
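# Hedged usage sketch: the paths are hypothetical. match_file2image drops
# metadata rows whose cluster has no downloaded .png and rewrites the metafile
# in place, so it is worth running on a copy first.
from pathlib import Path

match_file2image(Path('data/images'), Path('data/meta.csv'))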
def __init__(self, imdir, metafile, train=True, transform=None, filter=None):
    super().__init__()
    self.imdir = imdir
    self.metafile = metafile
    self.train = train
    self.transform = transform
    self.filter = filter
    self.cols = [
        'wealth', 'water_src', 'toilet_type', 'roof', 'cooking_fuel',
        'drought', 'pop_density', 'livestock_bin', 'agriculture_land_bin'
    ]

    metainfo = read_df(metafile)
    # TODO: Only consider RURAL AREAS
    # metainfo = metainfo[metainfo.uor == 'R']
    metainfo = metainfo[['cluster'] + self.cols]
    # if self.filter:
    #     metainfo = metainfo[metainfo[target] == filter]

    self.targets = dict()
    for col in self.cols:
        unique_classes = sorted(metainfo[col].unique())
        # inverse-frequency weights for class balancing
        weights = {
            k: 1 / v
            for k, v in metainfo[col].value_counts().to_dict().items()
        }
        self.targets[col] = {
            'classes': unique_classes,
            'o2i': {o: i for i, o in enumerate(unique_classes)},
            'i2p': {i: weights[o] for i, o in enumerate(unique_classes)}
        }
    self.data = self.split(metainfo, 0.1, 42)
def __init__(self, metric, imdir, metafile, train=True, transform=None, filter=None):
    super().__init__()
    self.imdir = imdir
    self.metafile = metafile
    self.train = train
    self.transform = transform
    self.filter = filter
    self.cols = [metric]

    metainfo = read_df(metafile)
    # TODO: Only consider RURAL AREAS
    # metainfo = metainfo[metainfo.uor == 'R']
    metainfo = metainfo[['cluster'] + self.cols]
    # if self.filter:
    #     metainfo = metainfo[metainfo[target] == filter]

    self.targets = dict()
    for col in self.cols:
        # print(col)
        unique_classes = sorted(metainfo[col].unique())
        # inverse-frequency weights for class balancing
        weights = {
            k: 1 / v
            for k, v in metainfo[col].value_counts().to_dict().items()
        }
        self.targets[col] = {
            'classes': unique_classes,
            'o2i': {o: i for i, o in enumerate(unique_classes)},
            'i2p': {i: weights[o] for i, o in enumerate(unique_classes)}
        }
    self.data = self.split(metainfo, 0.1, 42)
def find_n_closest_words(target_word, n=5):
    """Find the n closest words to target_word using the Levenshtein distance."""
    words = read_df("words.csv")
    words = list(words["word"])
    # make a list of possible distractor words
    distractor_words = words
    # find distances to all possible distractors
    distances = []
    for word in distractor_words:
        distances.append(distance(word, target_word))
    distances = np.array(distances)
    # reorder distractors by distance; skip index 0, which is target_word
    # itself at distance 0
    distractor_words = list(pd.Series(distractor_words)[np.argsort(distances)])
    distractor_words = distractor_words[1 : n + 1]
    return distractor_words
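# Hedged usage sketch: distance() is assumed to come from the Levenshtein
# package (from Levenshtein import distance), and the vocabulary below is made
# up. With words.csv containing e.g. "cat", "cot", "coat", "car", "dog":
distractors = find_n_closest_words("cat", n=3)
print(distractors)  # e.g. ['cot', 'car', 'coat'] -- nearest by edit distance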
def __init__(self, imdir, metafile, imid, target, train=True, transform=None, filter=None):
    super().__init__()
    self.imdir = imdir
    self.metafile = metafile
    self.imid = imid
    self.target = target
    self.train = train
    self.transform = transform
    self.filter = filter

    metainfo = read_df(metafile)[[imid, target]]
    if self.filter:
        metainfo = metainfo[metainfo[target] == filter]
    self.classes = sorted(metainfo[target].unique())
    self.o2i = {o: i for i, o in enumerate(self.classes)}
    self.data = self.split(metainfo, 0.2, 42)
pandas2ri.activate()

from IPython.display import display

# %%
analysis_dir = "/gpfs_fs/home/eckertlab/projects/burt/seq/dedupe/work/samtools1.3"

# %%
spp = ["E", "G", "P", "T"]

# %%
z12_df = {}
for s in spp:
    d = os.path.join(analysis_dir, s)
    print(d)
    if "beagle" not in d:
        z12_df[d] = read_df(d, "z12_swapped")

# %%
test_key = '/gpfs_fs/home/eckertlab/projects/burt/seq/dedupe/work/samtools1.3/E'

# %%
z12_df[test_key].head()

# %%
for k in z12_df:
    v = z12_df[k]
    # population id is the sample name minus its trailing segment
    v['population'] = v.apply(lambda x: "-".join(x.name.split("-")[0:-1]), axis=1)
    v = v.replace(-1, 9)
    z12_df[k] = v

# %%
parser.print_help()
# sys.exit(1)

if not os.path.exists(args.filename):
    print("Sorry, file", args.filename, "does not exist")
    # sys.exit(1)

inputfile = args.filename
if 1:  # hard-coded override left in for training runs
    inputfile = 'wdir/hic_to_scaff_fortraining.als'
min_size = 0

try:
    df.head()
except NameError:
    df = utils.read_df(inputfile, min_size, 1)
    df = df.rename(
        columns={
            0: 'target',
            1: 'scaff1',
            2: 'scaff2',
            3: 'lscaff1',
            4: 'lscaff2',
            5: 'nlinks'
        })

# only true for train sample!
try:
    df0
except NameError:
    df1 = df.groupby(['target']).get_group(1)
    df0 = df.groupby(['target']).get_group(0)
argparser.add_argument("--df_corpus", type=str, default="") argparser.add_argument("--tags_file", type=str, default="") argparser.add_argument("--model", type=str) argparser.add_argument("--layer", type=str, default="lstm") argparser.add_argument("--max_seq_len", type=int, default=100) argparser.add_argument("--mlp_dim", type=int, default=50) # to write in argparser.add_argument("--out_dir", type=str) argparser.add_argument("--full_results_file", type=str, default="") # to write in argparser.add_argument("--results_file", type=str, default="") # to write in args = argparser.parse_args() print '\n', args, '\n' df = read_df(args.df_corpus) df = df.fillna(u'') label_tags = pickle.load(open(args.tags_file, 'rb')) raw_corpus = myio.read_corpus(args.corpus_w_tags, with_tags=True) embedding_layer = create_embedding_layer(n_d=10, embs=load_embedding_iterator( args.embeddings), only_words=False) with tf.Session() as sess: myqrapi = TPAPI(args.model, embedding_layer, sess, len(label_tags), args.layer)
import random

import streamlit as st

from utils import read_df

st.set_page_config(page_title="pyvocab", layout="wide")

if "correct_streak" not in st.session_state:
    st.session_state.correct_streak = 0
if "words" not in st.session_state:
    st.session_state.words = read_df("words.csv")
if "distractors" not in st.session_state:
    st.session_state.distractors = read_df("distractors.csv")

words = st.session_state.words
distractors = st.session_state.distractors

this_word = words.sample()

st.write("#")
st.write("#")
st.write(" ## *" + this_word["definition"].iloc[0].strip() + "*")
st.write("#")
st.write("#")

other_words = words.sample(n=5)
    return data_frame


def remake_training_file_for_question_ranking(new_train_rows, out_file):
    with open(out_file, 'w') as f:
        for x in new_train_rows:
            q_ids_similar = " ".join([str(q) for q in x[1]])
            q_ids_candidates = " ".join([str(q) for q in x[2]])
            f.write('{}\t{}\t{}\n'.format(str(x[0]), q_ids_similar, q_ids_candidates))


if __name__ == '__main__':
    df = read_df(
        '/home/christina/Documents/Thesis/data/askubuntu/additional/data_frame_corpus_str.csv'
    )
    print('total ids: ', df.shape[0])

    E = read_eval_rows(
        '/home/christina/Documents/Thesis/data/askubuntu/test.txt')
    test_ids = get_eval_ids(E)
    E = read_eval_rows(
        '/home/christina/Documents/Thesis/data/askubuntu/dev.txt')
    dev_ids = get_eval_ids(E)
    eval_ids = test_ids | dev_ids
    print('total eval ids: ', len(eval_ids))

    T = list(
        read_eval_rows(