def make_score_hold(factory, seasons, score_name, compare=(lambda x, y: x != y),
                    modelFactory=linear_model.LogisticRegression):
    train_vector, train_ids, targets, test_vector, test_ids = factory()
    projects = utils.read_csv('projects.csv')[['projectid', 'date_posted']]
    train_idx = {pid: i for i, pid in enumerate(train_ids)}
    projects['idx'] = projects.projectid.map(train_idx)
    # Bucket each posting date into a "season" of `seasons` months
    projects.date_posted = ['%s-%02d-01' % (d[:4], (int(d[5:7]) - 1) / seasons + 1)
                            for d in projects.date_posted]
    projects.date_posted[projects.date_posted < '2008-01-01'] = '2007-12-01'
    projects.date_posted[projects.date_posted >= '2014-01-01'] = '2014-01-01'
    training = projects[projects.idx.notnull()]
    outcomes = utils.read_csv('outcomes.csv')[['projectid', 'is_exciting']]
    training = pd.merge(training, outcomes)

    def handle(x):
        day = x.date_posted.iloc[0]
        logging.info('handling %s of shape %s' % (day, x.shape))
        idx = compare(training.date_posted, day)
        if idx.sum() == 0:
            x[score_name] = -1
            return x
        train_x = train_vector[training.idx[idx]]
        train_y = training.is_exciting[idx]
        if train_y.sum() == 0:
            x[score_name] = -1
            return x
        model = modelFactory()
        model.fit(train_x, train_y)
        pred_x = test_vector if day == '2014-01-01' \
            else train_vector[list(training.idx[training.date_posted == day])]
        x[score_name] = model.predict_proba(pred_x)[:, 1]
        return x

    scores = projects.groupby('date_posted').apply(handle).reset_index()[['projectid', score_name]]
    return scores

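# Hedged usage sketch for make_score_hold. It assumes the to_tfidf_vector helper
# shown later in this collection as the factory (it returns the 5-tuple unpacked
# above); the field name, season width, score name and compare lambda below are
# illustrative, not values taken from the original pipeline.
from functools import partial

essay_scores = make_score_hold(
    factory=partial(to_tfidf_vector, 'essay'),   # assumed text column
    seasons=3,                                   # 3-month date_posted buckets
    score_name='essay_tfidf_score',
    compare=(lambda dates, day: dates < day),    # train only on strictly earlier buckets
)
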
def load_data(lines_file, conversations_file):
    # build a dict: lineID -> {characterid, charactername, movieid, text}
    lines_dico = {}
    for line in utils.read_csv(lines_file, ' +++$+++ '):
        lines_dico[line[0]] = {'characterid': line[1], 'charactername': line[3],
                               'movieid': line[2], 'text': line[4]}
    # build the film dict: id -> [dialogues = [text1, text2, ...]]
    films = {}
    for dialogue in utils.read_csv(conversations_file, ' +++$+++ '):
        filmid = dialogue[2]
        if filmid not in films:
            films[filmid] = []
        # lineids = json.loads(dialogue[3])  # issue with json string ?
        lineids = utils.parse_json_array(dialogue[3])
        films[filmid].append([lines_dico[lineid]['text'] for lineid in lineids])
    return films

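# Hedged usage sketch: the ' +++$+++ ' separator and the field layout used above
# match the Cornell Movie-Dialogs Corpus, so those corpus files are assumed here;
# the paths are placeholders for wherever the data actually lives.
films = load_data('cornell/movie_lines.txt', 'cornell/movie_conversations.txt')
some_film_id = next(iter(films))
print(len(films), 'films,', len(films[some_film_id]), 'dialogues in', some_film_id)
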
def convert_metadata(f, t):
    res = []
    for line in utils.read_csv(f, ' +++$+++ '):
        filmid = line[0]
        filmtitle = line[1]
        filmyear = line[2][:4]
        filmrating = line[3]
        filmratingnum = line[4]
        for filmtype in utils.parse_json_array(line[5]):
            res.append([filmid, filmtitle, filmyear, filmrating, filmratingnum, filmtype])
    utils.export_csv(res, t, ';')

def _feature_020():
    '''
    basic info from resources.csv
    @return id => [resouceid_cnt, vendor_cnt, project_resource_type_cnt,
                   item_quantity_sum, item_quantity_max,
                   item_unit_price_mean, item_unit_price_max]
    '''
    resources = utils.read_csv('resources.csv')
    gp = resources.groupby('projectid')

    def mapper(x):
        return pd.Series([
            x.shape[0],
            x.vendorid.unique().shape[0],
            x.project_resource_type.unique().shape[0],
            x.item_quantity.sum(), x.item_quantity.max(),
            x.item_unit_price.mean(), x.item_unit_price.max(),
        ])

    data = gp.apply(mapper)
    columns = ["resouceid_cnt", "vendor_cnt", "project_resource_type_cnt",
               "item_quantity_sum", "item_quantity_max",
               "item_unit_price_mean", "item_unit_price_max"]
    data.columns = columns
    data = data.reset_index()
    return data, columns

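# Hedged sketch of how a per-project feature block like _feature_020 might be
# attached to the projects table; reading 'projects.csv' and left-merging on
# projectid are assumptions about the surrounding pipeline, not confirmed here.
resource_data, resource_columns = _feature_020()
projects = utils.read_csv('projects.csv')
projects = pd.merge(projects, resource_data, on='projectid', how='left')
projects[resource_columns] = projects[resource_columns].fillna(0)
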
def dedupe_licenses(licenses_by_country):
    all_licenses = reduce(set.union,
                          ((n for n, c in v if c > 0) for *_, v in licenses_by_country),
                          set())
    all_licenses = ((i, i.lower()) for i in sorted(all_licenses, key=str.lower))
    csv_path = str(Path(__file__).parent / 'data' / 'license_mappings.csv')
    existing_keys = {k for k, _ in read_csv(csv_path)}
    with open(csv_path, 'a') as file:
        csv_writer(file).writerows((a, b) for a, b in all_licenses if a not in existing_keys)
    with open(str(Path(__file__).parent / 'data' / 'license_details.yaml'), 'w') as file:
        yaml.safe_dump({c: l for c, l, _ in licenses_by_country}, file,
                       allow_unicode=True, default_flow_style=False)
    input('Press any key to continue')  # Pause before reloading the CSV
    return dict(read_csv(csv_path))

def feature_008d(feature):
    '''
    @return id => sparse encoded text lengths in essays.csv
    '''
    essays = utils.read_csv('essays.csv')  # pd.read_csv(os.path.join(utils.DATA_DIR,'essays.csv'))
    data = pd.DataFrame(feature.projectid)
    columns = ['title', 'short_description', 'need_statement', 'essay']
    for c in columns:
        data['length_%s' % (c)] = essays[c].fillna('').map(len)
    dimensions = {'length_%s' % (c): 1 for c in columns}
    return data, dimensions

def feature_008(feature, dim=40, step=0.2):
    '''
    @return id => sparse encoded text lengths in essays.csv
    '''
    essays = utils.read_csv('essays.csv')  # pd.read_csv(os.path.join(utils.DATA_DIR,'essays.csv'))
    data = pd.DataFrame(feature.projectid)
    columns = ['title', 'short_description', 'need_statement', 'essay']
    for c in columns:
        data['length_%s' % (c)] = essays[c].fillna('').map(len)
    columns = ['length_%s' % (c) for c in columns]
    return sparse_encoder_002(data, columns, dim, step)

def diff_grade(self, diff_grade, class_name):
    """returns the difference between the current class grade and the last one"""
    for name, grade, date in reversed(utils.read_csv('data.csv')):
        if name == class_name:
            return float(grade) - float(diff_grade)
    return 0.0

def to_tfidf_vector(field, max_df=0.5, min_df=2, max_features=5000):
    '''
    @return train_vectors,train_ids,targets,test_vectors,test_ids
    '''
    df = utils.read_csv('essays.csv')
    outcomes = utils.read_csv('outcomes.csv')[['projectid', 'is_exciting']]
    df = pd.merge(df, outcomes, how='left')
    texts = df[field].fillna('')
    model = TfidfVectorizer(max_df=max_df, min_df=min_df, max_features=max_features)
    model.fit(texts)
    train_idx, test_idx = df.is_exciting.notnull(), df.is_exciting.isnull()
    train_texts = texts[train_idx]
    test_texts = texts[test_idx]
    train_ids = df.projectid[train_idx]
    test_ids = df.projectid[test_idx]
    targets = df.is_exciting[train_idx]
    train_vector = model.transform(train_texts)
    test_vector = model.transform(test_texts)
    return train_vector, train_ids, targets, test_vector, test_ids

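# Hedged usage sketch for to_tfidf_vector; 'essay' is one of the text columns
# used elsewhere in this collection, and the logistic-regression scorer is
# illustrative rather than the project's chosen model.
train_vector, train_ids, targets, test_vector, test_ids = to_tfidf_vector('essay')
clf = linear_model.LogisticRegression()
clf.fit(train_vector, targets)
test_scores = clf.predict_proba(test_vector)[:, 1]
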
def main(file_in, file_out):
    own_rows = [r for r in read_csv('portals.csv', has_header=True)[1]
                if r['url'] and r['url'] != 'N/A']
    fields, rows = read_csv(file_in, has_header=True)
    rows = sorted(chain(rows,
                        ({'name': format_name(r['url']),
                          'title': r['title'],
                          'url': r['url'],
                          'publisher': format_publisher(r['presiding_body']),
                          'publisher_classification': 'Government',
                          'tags': ' '.join(format_tags(r)),
                          'country': r['country_code'],
                          'generator': r['software_platform'],
                          'api_endpoint': r['metadata_api_endpoint']}
                         for r in own_rows)),
                  key=lambda r: r['name'])
    with open(file_out, 'w') as file:
        writer = csv_dict_writer(file, fields)
        writer.writeheader()
        writer.writerows(rows)

def prepare_data(filename, columns, target_columns=['is_exciting'], fillna=0, since=None):
    feature = utils.read_csv(filename)
    if filename != 'projects.csv':
        pids = utils.read_csv('projects.csv')[['projectid', 'date_posted']]
        feature = pd.merge(pids, feature, how='left')
    data_target_columns = [c for c in target_columns if c in feature.columns]
    outcome_target_columns = [c for c in target_columns if c not in feature.columns]
    data = feature[['projectid', 'date_posted'] + columns + data_target_columns]
    if since is not None:
        data = data[data.date_posted >= since]
    logging.info('prepare_data beginning, data shape=%s' % (data.shape,))
    if len(outcome_target_columns) > 0:
        outcomes = utils.read_csv('outcomes.csv')
        outcomes = outcomes[['projectid'] + outcome_target_columns]
        for c in outcomes.columns:
            if c != 'projectid':
                outcomes[c] = map(int, outcomes[c].fillna(0))
        data = pd.merge(data, outcomes, how='left')
    if fillna is not None:
        data = data.fillna(fillna)
    return data

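# Hedged usage sketch for prepare_data; the feature column names below are
# assumptions about what projects.csv contains (not confirmed by this file),
# while the target column is merged in from outcomes.csv as handled above.
data = prepare_data('projects.csv',
                    columns=['school_state', 'poverty_level'],   # assumed columns
                    target_columns=['is_exciting'],
                    fillna=0,
                    since='2010-01-01')
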
def diff_grade_custom(self, comp_grade, class_name, comp_date):
    """returns the difference between the current class grade and the grade
    from the provided timedelta
    """
    for entry in reversed(utils.read_csv('data.csv')):
        if entry[0] == class_name and entry[2] == str(comp_date):
            diff = float(entry[1]) - float(comp_grade)
            if diff < 0:
                return '+' + str(diff)
            else:
                return diff
    return 0.0

def tfidf_encoder(filename, columns, max_df, min_df, max_features):
    df = utils.read_csv('essays.csv')
    data = pd.DataFrame(df.projectid)
    dimensions = {}
    for c in columns:
        texts = df[c].fillna('')
        model = TfidfVectorizer(max_df=max_df, min_df=min_df, max_features=max_features)
        model.fit(texts)
        vector = model.transform(texts)
        c_name = '%s:tfidf_%s' % (filename, c)
        data[c_name] = map(lambda x: {k: x[0, k] for k in x.nonzero()[1]}, vector)
        dimensions[c_name] = vector.shape[1]
    return data, dimensions

def tfidf_encoder_001(filename, columns, max_df, min_df, max_features):
    '''
    make tfidf vectors and stats for gbdt
    return id => { num_words,sum_tfidf } for every column
    '''
    df = utils.read_csv(filename)
    data = pd.DataFrame(df.projectid)
    dimensions = {}
    for c in columns:
        texts = df[c].fillna('')
        model = TfidfVectorizer(max_df=max_df, min_df=min_df, max_features=max_features)
        model.fit(texts)
        vector = model.transform(texts)
        stats = ['%s@%s' % (s, c) for s in ['#words', 'sum_tfidf']]
        data[stats[0]] = [vector[i].nonzero()[0].shape[0] for i in range(vector.shape[0])]
        data[stats[1]] = vector.sum(1)
        dimensions.update({s: 1 for s in stats})
    return data, dimensions

def load_filmmetadata(f):
    res = {}
    # years = set()
    years = {}
    types = set()
    for line in utils.read_csv(f, ' +++$+++ '):
        filmid = line[0]
        res[filmid] = {'title': line[1], 'year': line[2]}
        if line[2] not in years:
            years[line[2]] = 1
        else:
            years[line[2]] = years[line[2]] + 1
        # get film types
        filmtypes = utils.parse_json_array(line[5])
        for t in filmtypes:
            types.add(t)
    # print(years)
    print(types)
    return res

def read_csv(filepath):
    '''
    TODO: This function needs to be completed.
    Read the events.csv, mortality_events.csv and event_feature_map.csv files
    into events, mortality and feature_map.

    Return events, mortality and feature_map
    '''
    # Columns in events.csv - patient_id,event_id,event_description,timestamp,value
    # Columns in mortality_event.csv - patient_id,timestamp,label
    # Columns in event_feature_map.csv - idx,event_id
    events, mortality, feature_map = utils.read_csv(filepath)
    return events, mortality, feature_map

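# A minimal sketch of what the TODO above might expand to without the utils
# wrapper -- assuming `filepath` is a directory path ending in '/' that holds
# the three files named in the docstring. The helper name is hypothetical.
import pandas as pd

def read_csv_direct(filepath):
    events = pd.read_csv(filepath + 'events.csv')
    mortality = pd.read_csv(filepath + 'mortality_events.csv')
    feature_map = pd.read_csv(filepath + 'event_feature_map.csv')
    return events, mortality, feature_map
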
"-image_field", type=str, default="Input.Image_url", help="(Only used if compute_distances = True) Name of CSV field containing image URL", ) parser.add_argument( "-draw_events_field", type=str, default="Answer.WritingTexts", help="(Only used if compute_distances = True) Name of CSV field containing drawing task events", ) args = parser.parse_args() input_file = args.csv data_with_distance = [] file_header = [] if args.compute_distances: if not args.image_dir: parser.print_help() raise ValueError("-image_dir parameter is required if -compute_distances is true.") generator = BitmapMaker(args.bitmap_dim, args.bitmap_dim) image_url_field = args.image_field actions_field = args.draw_events_field data_with_distance, file_header = evaluate.get_hamming_distances( args.csv, args.image_dir, args.output, generator, image_url_field, actions_field ) else: data_with_distance, file_header = utils.read_csv(input_file) filter(data_with_distance, file_header, args.cutoff, args.output)
def main():
    fields, rows = read_csv('portals.csv', has_header=True)
    country_stats = gather_country_stats(asyncio.get_event_loop(), rows)
    create_licenses_csv(tuple((c, d, l) for c, _, d, l in country_stats))
    update_portals_csv(fields, rows, {c: t for c, t, *_ in country_stats})

def aff2city(aff):
    city_df = read_csv(DATA_PATH, 'city_name.csv')
    city_names = [city.lower() for city in city_df['City'].values]
    for i in range(len(city_names)):
        if city_names[i] in aff.lower() and city_df['level'][i] == 1:
            return city_names[i]

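# Hedged usage sketch for aff2city; the affiliation string is illustrative and
# the printed result assumes 'city_name.csv' lists "beijing" with level == 1.
print(aff2city('Department of Computer Science, Tsinghua University, Beijing, China'))
# -> 'beijing' (or None if no level-1 city name occurs in the string)
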
# from spacySim import spacySim, spacyPhraseSim
from gloveSim import gloveSim

with open('theme_dict_set.p') as f:
    theme_dict_set = pickle.load(f)
with open('prop_dict_set.p') as f:
    prop_dict_set = pickle.load(f)

app = Flask(__name__)

# --- Init vocab (TODO: really hacky, fix this later)
# --- Assume 1 word per line csv
topics = {}
for k, v in TOPICS.iteritems():
    topics[k] = [(w[0], w[1].lower()) for w in read_csv(v)]

# --- This is also hacky
TOPIC_STAT = {
    'weddingTheme': {
        'sim_mean': 0.111250152359,
        'sim_sd': 0.123640928544
    },
    'weddingProp': {
        'sim_mean': 0.141468741736,
        'sim_sd': 0.129488421166
    }
}

def load_atomic2020(args):
    random.seed(args.random_seed)

    atomic2020_v1_file = args.data_folder + "atomic_original_tuples.tsv"
    atomic2020_addl_file = args.data_folder + "atomic_additional_tuples.tsv"
    atomic2020_cn_file = args.data_folder + "atomic_conceptnet_tuples.tsv"

    v1_data = read_csv(atomic2020_v1_file, delimiter="\t", skip_header=True)
    addl_data = read_csv(atomic2020_addl_file, delimiter="\t", skip_header=True)
    cn_data_with_id = read_csv(atomic2020_cn_file, delimiter="\t", skip_header=True)
    cn_data = [l[1:] for l in cn_data_with_id]

    # Atomic split
    atomic_train = read_csv(args.atomic_split + "train.tsv", delimiter="\t", skip_header=False)
    atomic_dev = read_csv(args.atomic_split + "dev.tsv", delimiter="\t", skip_header=False)
    atomic_test = read_csv(args.atomic_split + "test.tsv", delimiter="\t", skip_header=False)

    atomic_train_events = get_head_set(atomic_train)
    atomic_dev_events = get_head_set(atomic_dev)
    atomic_test_events = get_head_set(atomic_test)

    v1_data_train = [l for l in v1_data if l[0] in atomic_train_events]
    v1_data_dev = [l for l in v1_data if l[0] in atomic_dev_events]
    v1_data_test = [l for l in v1_data if l[0] in atomic_test_events]

    assert len(v1_data) == len(v1_data_train) + len(v1_data_dev) + len(v1_data_test)

    # CN split
    cn_train = read_csv(args.conceptnet_split + "train.tsv", delimiter="\t", skip_header=False)
    cn_dev = read_csv(args.conceptnet_split + "dev.tsv", delimiter="\t", skip_header=False)
    cn_test = read_csv(args.conceptnet_split + "test.tsv", delimiter="\t", skip_header=False)

    cn_train_heads = get_head_set(cn_train)
    cn_dev_heads = get_head_set(cn_dev)
    cn_test_heads = get_head_set(cn_test)

    cn_data_train = [l for l in cn_data if l[0] in cn_train_heads]
    cn_data_dev = [l for l in cn_data if l[0] in cn_dev_heads]
    cn_data_test = [l for l in cn_data if l[0] in cn_test_heads]

    # Additional tuples split
    (addl_train, addl_dev, addl_test) = head_based_split(
        addl_data,
        dev_size=args.dev_size,
        test_size=args.test_size,
        head_size_threshold=args.head_size_threshold,
        dev_heads=atomic_dev_events,
        test_heads=atomic_test_events)

    new_addl_train = []
    new_addl_dev = []
    new_addl_test = addl_test
    for l in addl_train:
        h = l[0]
        if h in cn_dev_heads:
            new_addl_dev.append(l)
        else:
            if h in cn_test_heads:
                new_addl_test.append(l)
            else:
                new_addl_train.append(l)
    for l in addl_dev:
        h = l[0]
        if h in cn_test_heads:
            new_addl_test.append(l)
        else:
            new_addl_dev.append(l)

    train = v1_data_train + cn_data_train + new_addl_train
    dev = v1_data_dev + cn_data_dev + new_addl_dev
    test = v1_data_test + cn_data_test + new_addl_test

    return train, dev, test

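# Hedged usage sketch for load_atomic2020. The attribute names are the ones the
# function body above reads from `args`; the folder paths and split sizes are
# placeholder values, not values taken from the original project.
from argparse import Namespace

args = Namespace(
    random_seed=0,
    data_folder="data/atomic2020/",            # assumed layout
    atomic_split="data/atomic_split/",         # must contain train/dev/test.tsv
    conceptnet_split="data/conceptnet_split/", # must contain train/dev/test.tsv
    dev_size=0.1, test_size=0.1, head_size_threshold=500,  # illustrative values
)
# train, dev, test = load_atomic2020(args)
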
def start(self):
    self.status.set("Estado: -")
    self.leng.set("Iteración: -/- y Número: -/-")
    self.totaltime.set("Tiempo total: -")
    self.ones.set("Total de unos: -")
    self.types.set("Progreso: -")
    self.startButton.config(state='disabled')
    self.browseButton.config(state='disabled')
    self.cancelButton.config(state='normal')
    self.maxnumberSpinbox.config(state='disabled')
    self.complexSpinbox.config(state='disabled')
    if int(self.complexSpinbox.get()) in (1, 2, 3, 4, 5) \
            and int(self.maxnumberSpinbox.get()) in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                                     13, 14, 15, 16, 17, 18, 19, 20, 21) \
            and self.completeName != "":
        start_time = time.time()
        if self.name.get().split('.')[1] == 'csv':
            self.g = generator.Generator(self.maxnumberSpinbox.get(),
                                         self.complexSpinbox.get(),
                                         utils.read_csv(self.completeName),
                                         self.cancelButton, self.types)
        else:
            self.g = generator.Generator(self.maxnumberSpinbox.get(),
                                         self.complexSpinbox.get(),
                                         utils.read_json(self.completeName),
                                         self.cancelButton, self.types)
        self.g.count_one()
        self.ones.set("Total de unos: {0}".format(len(self.g.table_uno)))
        i = 0
        while self.g.maxim > 1:
            i += 1
            self.leng.set("Iteración: {0}/{1} y Número: {2}".format(i, self.complexSpinbox.get(), self.g.maxim))
            self.status.set("Estado: Generando puzzle...")
            self.g.step_one()
            tim = utils.sec_to(int(time.time() - start_time))
            self.totaltime.set("Tiempo total: {0}h:{1}m:{2}s".format(tim[0], tim[1], tim[2]))
            self.status.set("Estado: Aplicando condición uno...")
            self.g.cond_dos(1)
            tim = utils.sec_to(int(time.time() - start_time))
            self.totaltime.set("Tiempo total: {0}h:{1}m:{2}s".format(tim[0], tim[1], tim[2]))
            self.status.set("Estado: Aplicando condición dos...")
            self.g.cond_dos(2)
            tim = utils.sec_to(int(time.time() - start_time))
            self.totaltime.set("Tiempo total: {0}h:{1}m:{2}s".format(tim[0], tim[1], tim[2]))
            if self.g.maxim >= 4:
                self.status.set("Estado: Aplicando condición tres...")
                self.g.cond_dos(3)
                tim = utils.sec_to(int(time.time() - start_time))
                self.totaltime.set("Tiempo total: {0}h:{1}m:{2}s".format(tim[0], tim[1], tim[2]))
            self.g.count_one()
            self.ones.set("Total de unos: {0}".format(len(self.g.table_uno)))
            if i == self.g.iters:
                self.g.maxim -= 1
                i = 0
        if self.name.get().split('.')[1] == 'csv':
            utils.write_csv(self.g.table_all)
        else:
            utils.write_json(self.g.table_all)
        if self.g.cancel:
            self.status.set("Estado: Cancelado")
        else:
            self.status.set("Estado: Completado")
        self.g = None
    self.startButton.config(state='normal')
    self.browseButton.config(state='normal')
    self.cancelButton.config(state='disabled')
    self.maxnumberSpinbox.config(state='normal')
    self.complexSpinbox.config(state='normal')

import utils
import sys

'''
This file accepts command line arguments, processes the contents of the input
file, and generates the report in CSV format.

Command Line Arguments
Input Path  : String : Path to the input file
Output Path : String : Path to which the report has to be written
'''

if len(sys.argv) != 3:
    raise ValueError("Please enter input file and output file to be created as arguments")

input_path = sys.argv[1]
output_path = sys.argv[2]

data_dict = utils.read_csv(input_path)
data_dict = dict(sorted(data_dict.items(), key=lambda x: x[0]))
utils.to_csv(data_dict, output_path)

def process(wav_file, outfile, csv_file=None, bpm=None, tol=0.35,
            ssm_read_pk=False, read_pk=False, rho=2, is_ismir=False,
            tonnetz=False, sonify=False):
    """Main process to find the patterns in a polyphonic audio file.

    Parameters
    ----------
    wav_file : str
        Path to the wav file to be analyzed.
    outfile : str
        Path to file to save the results.
    csv_file : str
        Path to the csv containing the midi_score of the input audio file
        (needed to produce a result that can be read for the JKU dataset).
    bpm : int
        Beats per minute of the piece. If None, bpms are read from the JKU.
    tol : float
        Tolerance to find the segments in the SSM.
    ssm_read_pk : bool
        Whether to read the SSM from a pickle file.
    read_pk : bool
        Whether to read the segments from a pickle file.
    rho : int
        Positive integer to compute the score of the segments.
    is_ismir : bool
        Produce the plots that appear on the ISMIR paper.
    tonnetz : bool
        Whether to use Tonnetz or Chromas.
    sonify : bool
        Whether to sonify the patterns or not.
    """
    # Get the correct bpm if needed
    if bpm is None:
        bpm = get_bpm(wav_file)

    # Algorithm parameters
    min_notes = 8
    max_diff_notes = 4
    h = bpm / 60. / 8.  # Hop size /8 works better than /4, but it takes longer
                        # to process

    # Obtain the Self Similarity Matrix
    X = compute_ssm(wav_file, h, ssm_read_pk, is_ismir, tonnetz)

    # Read CSV file
    if csv_file is not None:
        logging.info("Reading the CSV file for MIDI pitches...")
        midi_score = utils.read_csv(csv_file)

    patterns = []
    csv_patterns = []
    while patterns == [] or csv_patterns == []:
        # Find the segments inside the self similarity matrix
        logging.info("Finding segments in the self-similarity matrix...")
        max_diff = int(max_diff_notes / float(h))
        min_dur = int(np.ceil(min_notes / float(h)))
        # print min_dur, min_notes, h, max_diff
        if not read_pk:
            segments = []
            while segments == []:
                logging.info("\ttrying tolerance %.2f" % tol)
                segments = utils.find_segments(X, min_dur, th=tol, rho=rho)
                tol -= 0.05
            utils.write_cPickle(wav_file + "-audio.pk", segments)
        else:
            segments = utils.read_cPickle(wav_file + "-audio.pk")

        # Obtain the patterns from the segments and split them if needed
        logging.info("Obtaining the patterns from the segments...")
        patterns = obtain_patterns(segments, max_diff)

        # Decrease tolerance in case we couldn't find any patterns
        tol -= 0.05

        # Get the csv patterns if they exist
        if csv_file is not None:
            csv_patterns = patterns_to_csv(patterns, midi_score, h)
        else:
            csv_patterns = [0]

    # Sonify patterns if needed
    if sonify:
        logging.info("Sonifying Patterns...")
        utils.sonify_patterns(wav_file, patterns, h)

    # Format csv patterns and save results
    if csv_file is not None:
        logging.info("Writing results into %s" % outfile)
        utils.save_results(csv_patterns, outfile=outfile)
    else:
        # If not csv, just print the results on the screen
        print_patterns(patterns, h)

    if is_ismir:
        ismir.plot_segments(X, segments)

    # Alright, we're done :D
    logging.info("Algorithm finished.")

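# Hedged usage sketch for process(); the file names are placeholders and the
# keyword values simply restate the defaults documented above.
process('my_piece.wav', 'my_piece_patterns.txt',
        csv_file=None,   # no JKU midi_score available in this example
        bpm=None,        # let get_bpm() estimate it
        tol=0.35, rho=2,
        sonify=False)
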
import os.path

import numpy as np

from classic_algorithms import FirstFit
from classic_algorithms import NextFit
from classic_algorithms import BestFit
from environment import BinEnvironment
from environment import ItemProvider
from agent import Agent
from utils import plot_learning
from utils import read_csv

max_simultaneously_bins = 5
load_checkpoint = True
filename = "flat"

data = read_csv(file="training_data/%s.csv" % filename)
print(data)

agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4,
              eps_end=0.01, input_dims=[2], lr=0.001)
agent.train()
if load_checkpoint and os.path.isfile("%s_checkpoint.pth.tar" % filename):
    agent.load_checkpoint(source_file="%s_checkpoint.pth.tar" % filename)

item_provider = ItemProvider(sample_size=100, data=data, randomize=True)
env = BinEnvironment(max_simultaneously_bins, item_provider=item_provider)

                'Prénom du candidat', 'N° tour', 'Nom de la commune',
                'N° de dépôt de la liste', 'Prénom du candidat tête de liste',
                u"Date de l'export", 'Libellé du département', 'Libellé de la commune',
                '% Abs/Ins', '% Vot/Ins', '% BlNuls/Ins', '% BlNuls/Vot',
                '% Exp/Ins', '% Exp/Vot', 'Sexe', 'Prénom', 'Liste', 'Sieges',
                u'% Voix/Ins', u'% Voix/Exp', 'Abstentions', 'Blancs et nuls']

# FIXME: if the available data were more uniform, much more of this could be
# automated.

# Export the data as several dataframes:
with pd.HDFStore(os.path.join(var.path['out'], 'elections.h5'), 'w') as store:
    # European elections, 2014:
    fname = get("http://www.regardscitoyens.org/telechargement/donnees/elections/2014_europeennes/europ%C3%A9ennes-2014-r%C3%A9sultats-bureaux_vote-tour1.csv")
    store.put('europeennes2014',
              fmt_elections(read_csv(fname[0], cols_mapper, cols_to_drop, dtype_mapper)),
              format='fixed')

    # Municipal elections, second turn, 2014:
    fname = get("http://www.regardscitoyens.org/telechargement/donnees/elections/2014_municipales/municipales-2014-r%C3%A9sultats-bureaux_vote-tour2.csv")
    store.put('municipales2014_tour2',
              fmt_elections(read_csv(fname[0], cols_mapper, cols_to_drop, dtype_mapper)),
              format='fixed')

    # Municipal elections, first turn, 2014:
    fname = get("http://www.regardscitoyens.org/telechargement/donnees/elections/2014_municipales/municipales-2014-r%C3%A9sultats-bureaux_vote-tour1.csv")
    store.put('municipales2014_tour1',
              fmt_elections(read_csv(fname[0], cols_mapper, cols_to_drop, dtype_mapper)),
              format='fixed')

    # Legislative elections, 2012:
    fname = get("http://www.nosdonnees.fr/storage/f/2013-03-05T184148/LG12_BV_T1T2.zip")
    store.put('legislatives2012_tour2',
              fmt_elections(read_csv(fname[1], cols_mapper, cols_to_drop, dtype_mapper,
                                     encoding='cp1252')),

if os.path.exists(MEDIANS_PLOT_PATH):
    shutil.rmtree(MEDIANS_PLOT_PATH)

for index, ecg_file in enumerate(median_files):
    print(f'Generating plots: { index + 1 } / { len(median_files) }', end='\r')

    ecg_id = os.path.splitext(ecg_file)[0]
    file_path = os.path.join(MEDIANS_PLOT_PATH, f'{ ecg_id }.{ PLOT_FORMAT }')

    # Read ECG data from .asc file
    ecg_data = read_csv(os.path.join(MEDIANS_PATH, ecg_file), delimiter=' ',
                        transpose=True, skip_header=False, dtype=np.int)

    # Normalize ECG data between -1 and 1
    ecg_normalized = normalize(ecg_data)

    # Scale normalized ECG data between -Y_MAX and Y_MAX
    ecg_scaled = np.array(ecg_normalized * Y_MAX, dtype=int)

    # Reduce the length of ECG data by dropping every other element
    ecg_reduced = shorten(ecg_scaled)

    # Remove the first n values to fit within the X_MAX limit
    if len(ecg_reduced[0]) > X_MAX:
        ecg_reduced = [

def mining(self):
    """Mining frequent sequences by prefixSpan"""
    S = utils.read_csv(self.datafile)
    self.patternSet = utils.prefixSpan(S, Sequence([], sys.maxint), self.support)

parser.add_argument("-output_json_file", type=str, required=True, help="Output json file") parser.add_argument("-rseed", type=int, default=0, help="Random seed") parser.add_argument("-train_ratio", type=float, default=0.67, help="Ratio (between 0 and 1) of examples to use for training") parser.add_argument("-test_ratio", type=float, default=0.17, help="Ratio (between 0 and 1) of examples to use for testing") parser.add_argument("-val_ratio", type=float, default=0.16, help="Ratio (between 0 and 1) of examples to use for validation") parser.add_argument("-commands_field", type=str, default="Input.commands", help="Name of CSV field containing descriptions to arrange blocks") parser.add_argument("-image_field", type=str, default="Input.Image_url", help="Name of CSV field containing image URL") parser.add_argument("-retrieve_text_bitmaps", type=bool, default=True, help="Retrieve and augment data with bitmaps located adjacent to image URL's") parser.add_argument("-draw_events_field", type=str, default="Answer.WritingTexts", help="Name of CSV field containing drawing task events") args = parser.parse_args() random.seed(args.rseed) header, rows = read_csv(args.csv) commands_idx = header.index(args.commands_field) actions_idx = header.index(args.draw_events_field) image_idx = header.index(args.image_field) key_attrs = split_keys(rows, image_idx, args.train_ratio, args.test_ratio, args.val_ratio) output_file = args.output_json_file d = os.path.dirname(output_file) if not os.path.exists(d) and len(d)>0: os.makedirs(d) print "Processing {} rows".format(len(rows)) write_data(rows, key_attrs, image_idx, commands_idx, actions_idx, output_file, include_bitmaps=args.retrieve_text_bitmaps)
from glob import glob

from nltk.tokenize import word_tokenize, sent_tokenize
import re

from utils import read_file, get_name_from_filepath, read_csv
import pandas as pd
from textstat import textstat

# nltk.download('punkt')

litigious_words = read_csv(r'C:\Users\Krista\hansell\lm_litigious.csv')
files = glob(r'C:\Users\Krista\DocumentsRE _Call_re_potential_matter\out\\*.txt')

info_dicts = []
for file in files:
    info_dict = {}
    name = get_name_from_filepath(file)
    text = read_file(file)
    words = word_tokenize(text)
    words_in_list = [
        word for word in words
        if any(lit in word for lit in litigious_words.index)
    ]
    info_dict['Name'] = name
    info_dict['Total litigious words'] = len(words_in_list)
    info_dict['Total words'] = len(words)
    info_dicts.append(info_dict)

df = pd.DataFrame(info_dicts)
df.to_csv(